Diffstat (limited to 'gn3/computations')
-rw-r--r--   gn3/computations/correlations.py           |   5
-rw-r--r--   gn3/computations/partial_correlations.py   | 137
2 files changed, 117 insertions(+), 25 deletions(-)
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 37c70e9..09288c5 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -8,6 +8,7 @@ from typing import List
 from typing import Tuple
 from typing import Optional
 from typing import Callable
+from typing import Generator

 import scipy.stats
 import pingouin as pg
@@ -80,7 +81,7 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
             zip(*list(normalize_values(trait_vals, target_samples_vals))))
         num_overlap = len(normalized_traits_vals)
     except ValueError:
-        return
+        return None

     if num_overlap > 5:

@@ -107,7 +108,7 @@ package :not packaged in guix


 def filter_shared_sample_keys(this_samplelist,
-                              target_samplelist) -> Tuple[List, List]:
+                              target_samplelist) -> Generator:
     """Given primary and target sample-list for two base and target trait select
     filter the values using the shared keys

diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py
index 719c605..984c15a 100644
--- a/gn3/computations/partial_correlations.py
+++ b/gn3/computations/partial_correlations.py
@@ -18,6 +18,7 @@ from gn3.random import random_string
 from gn3.function_helpers import compose
 from gn3.data_helpers import parse_csv_line
 from gn3.db.traits import export_informative
+from gn3.db.datasets import retrieve_trait_dataset
 from gn3.db.traits import retrieve_trait_info, retrieve_trait_data
 from gn3.db.species import species_name, translate_to_mouse_gene_id
 from gn3.db.correlations import (
@@ -216,7 +217,7 @@ def good_dataset_samples_indexes(

 def partial_correlations_fast(# pylint: disable=[R0913, R0914]
         samples, primary_vals, control_vals, database_filename,
         fetched_correlations, method: str, correlation_type: str) -> Tuple[
-            float, Tuple[float, ...]]:
+            int, Tuple[float, ...]]:
     """
     Computes partial correlation coefficients using data from a CSV file.
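Note on the filter_shared_sample_keys change above: the function now returns a lazy generator instead of a pair of lists, so callers must materialise it exactly once (the existing zip(*list(...)) pattern in compute_sample_r_correlation already does this). A minimal, self-contained sketch of that consumption pattern -- the filter_shared_keys helper and the sample dicts below are hypothetical, not the gn3 implementation:

from typing import Generator


def filter_shared_keys(primary: dict, target: dict) -> Generator:
    """Yield (primary_value, target_value) pairs for sample keys present in
    both dicts -- a lazy analogue of filter_shared_sample_keys."""
    for key in primary:
        if key in target:
            yield primary[key], target[key]


# A generator can only be consumed once, so unpack both columns in one pass:
primary_samples = {"BXD1": 7.1, "BXD2": 6.8, "BXD5": 7.4}
target_samples = {"BXD2": 5.9, "BXD5": 6.2, "BXD9": 6.0}
primary_vals, target_vals = zip(*filter_shared_keys(primary_samples, target_samples))
print(primary_vals, target_vals)  # (6.8, 7.4) (5.9, 6.2)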
@@ -257,8 +258,9 @@ def partial_correlations_fast(# pylint: disable=[R0913, R0914]
     ## `correlation_type` parameter
     return len(all_correlations), tuple(
         corr + (
-            (fetched_correlations[corr[0]],) if correlation_type == "literature"
-            else fetched_correlations[corr[0]][0:2])
+            (fetched_correlations[corr[0]],)  # type: ignore[index]
+            if correlation_type == "literature"
+            else fetched_correlations[corr[0]][0:2])  # type: ignore[index]
         for idx, corr in enumerate(all_correlations))

 def build_data_frame(
@@ -305,11 +307,19 @@ def compute_partial(
             prim for targ, prim in zip(targ_vals, primary_vals)
             if targ is not None]

+        if len(primary) < 3:
+            return None
+
+        def __remove_controls_for_target_nones(cont_targ):
+            return tuple(cont for cont, targ in cont_targ if targ is not None)
+
+        conts_targs = tuple(tuple(
+            zip(control, targ_vals)) for control in control_vals)
         datafrm = build_data_frame(
             primary,
-            tuple(targ for targ in targ_vals if targ is not None),
-            tuple(cont for i, cont in enumerate(control_vals)
-                  if target[0][i] is not None))
+            [targ for targ in targ_vals if targ is not None],
+            [__remove_controls_for_target_nones(cont_targ)
+             for cont_targ in conts_targs])
         covariates = "z" if datafrm.shape[1] == 3 else [
             col for col in datafrm.columns if col not in ("x", "y")]
         ppc = pingouin.partial_corr(
@@ -332,13 +342,17 @@ def compute_partial(
             zero_order_corr["r"][0], zero_order_corr["p-val"][0])
     return tuple(
-        __compute_trait_info__(target)
-        for target in zip(target_vals, target_names))
+        result for result in (
+            __compute_trait_info__(target)
+            for target in zip(target_vals, target_names))
+        if result is not None)


 def partial_correlations_normal(# pylint: disable=R0913
         primary_vals, control_vals, input_trait_gene_id, trait_database,
         data_start_pos: int, db_type: str, method: str) -> Tuple[
-            float, Tuple[float, ...]]:
+            int, Tuple[Union[
+                Tuple[str, int, float, float, float, float], None],
+                ...]]:#Tuple[float, ...]
     """
     Computes the correlation coefficients.

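Note on the compute_partial changes above: samples whose target value is None are now dropped from the primary, target and control vectors, and the computation returns None when fewer than three values remain, before the data frame is handed to pingouin.partial_corr. A minimal sketch of that flow, using made-up trait values and the same x/y/z column layout that build_data_frame produces:

import pandas as pd
import pingouin

primary = [7.1, 6.8, 7.4, 7.0, 6.5, 7.2]   # hypothetical primary-trait values
target = [5.9, None, 6.2, 6.0, 5.7, 6.1]   # target trait with a missing sample
control = [1.2, 1.1, 1.4, 1.3, 1.0, 1.5]   # one control (covariate) trait

# Keep only positions where the target has a value, as compute_partial now does.
keep = [i for i, val in enumerate(target) if val is not None]
if len(keep) < 3:
    result = None  # too few overlapping samples for a partial correlation
else:
    datafrm = pd.DataFrame({
        "x": [primary[i] for i in keep],
        "y": [target[i] for i in keep],
        "z": [control[i] for i in keep]})
    result = pingouin.partial_corr(
        data=datafrm, x="x", y="y", covar="z", method="pearson")

print(result)  # a one-row data frame with "r" and "p-val" columns, or None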
@@ -360,7 +374,7 @@ def partial_correlations_normal(# pylint: disable=R0913
             return tuple(item) + (trait_database[1], trait_database[2])
         return item

-    target_trait_names, target_trait_vals = reduce(
+    target_trait_names, target_trait_vals = reduce(# type: ignore[var-annotated]
         lambda acc, item: (acc[0]+(item[0],), acc[1]+(item[data_start_pos:],)),
         trait_database, (tuple(), tuple()))

@@ -413,7 +427,7 @@
         data_start_pos, dataset, method)

 def literature_correlation_by_list(
-        conn: Any, species: str, trait_list: Tuple[dict]) -> Tuple[dict]:
+        conn: Any, species: str, trait_list: Tuple[dict]) -> Tuple[dict, ...]:
     """
     This is a migration of the
     `web.webqtl.correlation.CorrelationPage.getLiteratureCorrelationByList`
@@ -473,7 +487,7 @@ def literature_correlation_by_list(

 def tissue_correlation_by_list(
         conn: Any, primary_trait_symbol: str, tissue_probeset_freeze_id: int,
-        method: str, trait_list: Tuple[dict]) -> Tuple[dict]:
+        method: str, trait_list: Tuple[dict]) -> Tuple[dict, ...]:
     """
     This is a migration of the
     `web.webqtl.correlation.CorrelationPage.getTissueCorrelationByList`
@@ -496,7 +510,7 @@
     primary_trait_value = prim_trait_symbol_value_dict[
         primary_trait_symbol.lower()]
     gene_symbol_list = tuple(
-        trait for trait in trait_list if "symbol" in trait.keys())
+        trait["symbol"] for trait in trait_list if "symbol" in trait.keys())
     symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait(
         gene_symbol_list, tissue_probeset_freeze_id, conn)
     return tuple(
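Note on the annotation and gene_symbol_list fixes above: Tuple[dict] describes a tuple of exactly one dict, while Tuple[dict, ...] is a homogeneous tuple of any length, which is what these functions actually return; the comprehension fix collects the symbol strings rather than whole trait dicts. A standalone illustration, with invented trait symbols rather than real GeneNetwork data:

from typing import Tuple

one_dict: Tuple[dict] = ({"symbol": "Shh"},)        # exactly one element
many_dicts: Tuple[dict, ...] = (                    # any number of elements
    {"symbol": "Shh"}, {"symbol": "Pax6"}, {"symbol": "Bmp4"})

# The fixed comprehension collects the symbol strings, not the trait dicts:
gene_symbol_list = tuple(
    trait["symbol"] for trait in many_dicts if "symbol" in trait.keys())
print(gene_symbol_list)  # ('Shh', 'Pax6', 'Bmp4')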
+ """ + trait = { + "trait_type": trait["trait_type"], + "dataset_name": trait["db"]["dataset_name"], + "dataset_type": trait["db"]["dataset_type"], + "group": trait["db"]["group"], + "trait_fullname": trait["trait_fullname"], + "trait_name": trait["trait_name"], + "symbol": trait.get("symbol"), + "description": trait.get("description"), + "pre_publication_description": trait.get( + "pre_publication_description"), + "post_publication_description": trait.get( + "post_publication_description"), + "original_description": trait.get( + "original_description"), + "authors": trait.get("authors"), + "year": trait.get("year"), + "probe_target_description": trait.get( + "probe_target_description"), + "chr": trait.get("chr"), + "mb": trait.get("mb"), + "geneid": trait.get("geneid"), + "homologeneid": trait.get("homologeneid"), + "noverlap": trait.get("noverlap"), + "partial_corr": trait.get("partial_corr"), + "partial_corr_p_value": trait.get("partial_corr_p_value"), + "corr": trait.get("corr"), + "corr_p_value": trait.get("corr_p_value"), + "rank_order": trait.get("rank_order"), + "delta": ( + None if trait.get("partial_corr") is None + else (trait.get("partial_corr") - trait.get("corr"))), + "l_corr": trait.get("l_corr"), + "tissue_corr": trait.get("tissue_corr"), + "tissue_p_value": trait.get("tissue_p_value") + } + return {key: val for key, val in trait.items() if val is not None} + def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] conn: Any, primary_trait_name: str, control_trait_names: Tuple[str, ...], method: str, @@ -640,28 +702,47 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] "any associated Tissue Correlation Information."), "error_type": "Tissue Correlation"} + target_dataset = retrieve_trait_dataset( + ("Temp" if "Temp" in target_db_name else + ("Publish" if "Publish" in target_db_name else + "Geno" if "Geno" in target_db_name else "ProbeSet")), + {"db": {"dataset_name": target_db_name}, "trait_name": "_"}, + threshold, + conn) + database_filename = get_filename(conn, target_db_name, TEXTDIR) _total_traits, all_correlations = partial_corrs( conn, common_primary_control_samples, fixed_primary_vals, fixed_control_vals, len(fixed_primary_vals), species, input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, - method, primary_trait["db"], database_filename) + method, {**target_dataset, "dataset_type": target_dataset["type"]}, database_filename) def __make_sorter__(method): - def __sort_6__(row): - return row[6] - - def __sort_3__(row): + def __compare_lit_or_tiss_correlation_values_(row): + # Index Content + # 0 trait name + # 1 N + # 2 partial correlation coefficient + # 3 p value of partial correlation + # 6 literature/tissue correlation value + return (row[6], row[3]) + + def __compare_partial_correlation_p_values__(row): + # Index Content + # 0 trait name + # 1 partial correlation coefficient + # 2 N + # 3 p value of partial correlation return row[3] if "literature" in method.lower(): - return __sort_6__ + return __compare_lit_or_tiss_correlation_values_ if "tissue" in method.lower(): - return __sort_6__ + return __compare_lit_or_tiss_correlation_values_ - return __sort_3__ + return __compare_partial_correlation_p_values__ sorted_correlations = sorted( all_correlations, key=__make_sorter__(method)) @@ -676,7 +757,7 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] { **retrieve_trait_info( threshold, - f"{primary_trait['db']['dataset_name']}::{item[0]}", + f"{target_dataset['dataset_name']}::{item[0]}", 
conn), "noverlap": item[1], "partial_corr": item[2], @@ -694,4 +775,14 @@ def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] for item in sorted_correlations[:min(criteria, len(all_correlations))])) - return trait_list + return { + "status": "success", + "results": { + "primary_trait": trait_for_output(primary_trait), + "control_traits": tuple( + trait_for_output(trait) for trait in cntrl_traits), + "correlations": tuple( + trait_for_output(trait) for trait in trait_list), + "dataset_type": target_dataset["type"], + "method": "spearman" if "spearman" in method.lower() else "pearson" + }} |