diff options
-rw-r--r-- | gn3/computations/partial_correlations_optimised.py | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/gn3/computations/partial_correlations_optimised.py b/gn3/computations/partial_correlations_optimised.py new file mode 100644 index 0000000..601289c --- /dev/null +++ b/gn3/computations/partial_correlations_optimised.py @@ -0,0 +1,244 @@ +""" +This contains an optimised version of the + `gn3.computations.partial_correlations.partial_correlations_entry` +function. +""" +from functools import partial +from typing import Any, Tuple + +from gn3.settings import TEXTDIR +from gn3.function_helpers import compose +from gn3.db.partial_correlations import traits_info, traits_data +from gn3.db.species import species_name, translate_to_mouse_gene_id +from gn3.db.traits import export_informative, retrieve_trait_dataset +from gn3.db.correlations import ( + get_filename, + check_for_literature_info, + check_symbol_for_tissue_correlation) +from gn3.computations.partial_correlations import ( + fix_samples, + partial_corrs, + control_samples, + trait_for_output, + find_identical_traits, + tissue_correlation_by_list, + literature_correlation_by_list) + +def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] + conn: Any, primary_trait_name: str, + control_trait_names: Tuple[str, ...], method: str, + criteria: int, target_db_name: str) -> dict: + """ + This is the 'ochestration' function for the partial-correlation feature. + + This function will dispatch the functions doing data fetches from the + database (and various other places) and feed that data to the functions + doing the conversions and computations. It will then return the results of + all of that work. + + This function is doing way too much. Look into splitting out the + functionality into smaller functions that do fewer things. + """ + threshold = 0 + corr_min_informative = 4 + + all_traits = traits_info( + conn, threshold, (primary_trait_name,) + control_trait_names) + all_traits_data = traits_data(conn, all_traits) + + # primary_trait = retrieve_trait_info(threshold, primary_trait_name, conn) + primary_trait = tuple( + trait for trait in all_traits + if trait["trait_fullname"] == primary_trait_name)[0] + group = primary_trait["db"]["group"] + # primary_trait_data = retrieve_trait_data(primary_trait, conn) + primary_trait_data = all_traits_data[primary_trait["trait_name"]] + primary_samples, primary_values, _primary_variances = export_informative( + primary_trait_data) + + # cntrl_traits = tuple( + # retrieve_trait_info(threshold, trait_full_name, conn) + # for trait_full_name in control_trait_names) + # cntrl_traits_data = tuple( + # retrieve_trait_data(cntrl_trait, conn) + # for cntrl_trait in cntrl_traits) + cntrl_traits = tuple( + trait for trait in all_traits + if trait["trait_fullname"] != primary_trait_name) + cntrl_traits_data = tuple( + data for trait_name, data in all_traits_data.items() + if trait_name != primary_trait["trait_name"]) + species = species_name(conn, group) + + (cntrl_samples, + cntrl_values, + _cntrl_variances, + _cntrl_ns) = control_samples(cntrl_traits_data, primary_samples) + + common_primary_control_samples = primary_samples + fixed_primary_vals = primary_values + fixed_control_vals = cntrl_values + if not all(cnt_smp == primary_samples for cnt_smp in cntrl_samples): + (common_primary_control_samples, + fixed_primary_vals, + fixed_control_vals, + _primary_variances, + _cntrl_variances) = fix_samples(primary_trait, cntrl_traits) + + if len(common_primary_control_samples) < corr_min_informative: + return { + "status": "error", + "message": ( + f"Fewer than {corr_min_informative} samples data entered for " + f"{group} dataset. No calculation of correlation has been " + "attempted."), + "error_type": "Inadequate Samples"} + + identical_traits_names = find_identical_traits( + primary_trait_name, primary_values, control_trait_names, cntrl_values) + if len(identical_traits_names) > 0: + return { + "status": "error", + "message": ( + f"{identical_traits_names[0]} and {identical_traits_names[1]} " + "have the same values for the {len(fixed_primary_vals)} " + "samples that will be used to compute the partial correlation " + "(common for all primary and control traits). In such cases, " + "partial correlation cannot be computed. Please re-select your " + "traits."), + "error_type": "Identical Traits"} + + input_trait_geneid = primary_trait.get("geneid", 0) + input_trait_symbol = primary_trait.get("symbol", "") + input_trait_mouse_geneid = translate_to_mouse_gene_id( + species, input_trait_geneid, conn) + + tissue_probeset_freeze_id = 1 + db_type = primary_trait["db"]["dataset_type"] + + if db_type == "ProbeSet" and method.lower() in ( + "sgo literature correlation", + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + return { + "status": "error", + "message": ( + "Wrong correlation type: It is not possible to compute the " + f"{method} between your trait and data in the {target_db_name} " + "database. Please try again after selecting another type of " + "correlation."), + "error_type": "Correlation Type"} + + if (method.lower() == "sgo literature correlation" and ( + bool(input_trait_geneid) is False or + check_for_literature_info(conn, input_trait_mouse_geneid))): + return { + "status": "error", + "message": ( + "No Literature Information: This gene does not have any " + "associated Literature Information."), + "error_type": "Literature Correlation"} + + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and bool(input_trait_symbol) is False): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and check_symbol_for_tissue_correlation( + conn, tissue_probeset_freeze_id, input_trait_symbol)): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + target_dataset = retrieve_trait_dataset( + ("Temp" if "Temp" in target_db_name else + ("Publish" if "Publish" in target_db_name else + "Geno" if "Geno" in target_db_name else "ProbeSet")), + {"db": {"dataset_name": target_db_name}, "trait_name": "_"}, + threshold, + conn) + + database_filename = get_filename(conn, target_db_name, TEXTDIR) + _total_traits, all_correlations = partial_corrs( + conn, common_primary_control_samples, fixed_primary_vals, + fixed_control_vals, len(fixed_primary_vals), species, + input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, + method, {**target_dataset, "dataset_type": target_dataset["type"]}, database_filename) + + + def __make_sorter__(method): + def __sort_6__(row): + return row[6] + + def __sort_3__(row): + return row[3] + + if "literature" in method.lower(): + return __sort_6__ + + if "tissue" in method.lower(): + return __sort_6__ + + return __sort_3__ + + # sorted_correlations = sorted( + # all_correlations, key=__make_sorter__(method)) + + add_lit_corr_and_tiss_corr = compose( + partial(literature_correlation_by_list, conn, species), + partial( + tissue_correlation_by_list, conn, input_trait_symbol, + tissue_probeset_freeze_id, method)) + + selected_results = sorted( + all_correlations, + key=__make_sorter__(method))[:min(criteria, len(all_correlations))] + traits_list_corr_info = { + "{target_dataset['dataset_name']}::{item[0]}": { + "noverlap": item[1], + "partial_corr": item[2], + "partial_corr_p_value": item[3], + "corr": item[4], + "corr_p_value": item[5], + "rank_order": (1 if "spearman" in method.lower() else 0), + **({ + "tissue_corr": item[6], + "tissue_p_value": item[7]} + if len(item) == 8 else {}), + **({"l_corr": item[6]} + if len(item) == 7 else {}) + } for item in selected_results} + + trait_list = add_lit_corr_and_tiss_corr(tuple( + {**trait, **traits_list_corr_info.get(trait["trait_fullname"], {})} + for trait in traits_info( + conn, threshold, + tuple( + f"{target_dataset['dataset_name']}::{item[0]}" + for item in selected_results)))) + + return { + "status": "success", + "results": { + "primary_trait": trait_for_output(primary_trait), + "control_traits": tuple( + trait_for_output(trait) for trait in cntrl_traits), + "correlations": tuple( + trait_for_output(trait) for trait in trait_list), + "dataset_type": target_dataset["type"], + "method": "spearman" if "spearman" in method.lower() else "pearson" + }} |