From 05740a60d6616f28751f96ca30adeb524f4369ad Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 1 Nov 2021 10:49:35 +0300
Subject: Implement `compute_partial_correlations_fast`

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi

* Implement `compute_partial_correlations_fast`, a partial migration of
  `web.webqtl.correlation.PartialCorrDBPage.getPartialCorrelationsFast` in
  GN1. This function will probably be reworked once the dependencies are
  fully migrated; it also needs tests.
---
 gn3/computations/partial_correlations.py | 49 ++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py
index ba4de9e..1a6868a 100644
--- a/gn3/computations/partial_correlations.py
+++ b/gn3/computations/partial_correlations.py
@@ -9,6 +9,9 @@ from functools import reduce
 from typing import Any, Tuple, Sequence
 from scipy.stats import pearsonr, spearmanr
 
+from gn3.settings import TEXTDIR
+from gn3.data_helpers import parse_csv_line
+
 def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]):
     """
     Fetches data for the control traits.
@@ -192,3 +195,49 @@ def good_dataset_samples_indexes(
     return tuple(sorted(
         samples_from_file.index(good) for good in
         set(samples).intersection(set(samples_from_file))))
+
+def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914]
+        samples, primary_vals, control_vals, database_filename,
+        fetched_correlations, method: str, correlation_type: str) -> Tuple[
+            float, Tuple[float, ...]]:
+    """
+    This is a partial migration of the
+    `web.webqtl.correlation.PartialCorrDBPage.getPartialCorrelationsFast`
+    function in GeneNetwork1.
+    """
+    assert method in ("spearman", "pearson")
+    with open(f"{TEXTDIR}/{database_filename}", "r") as dataset_file:
+        dataset = tuple(dataset_file.readlines())
+
+    good_dataset_samples = good_dataset_samples_indexes(
+        samples, parse_csv_line(dataset[0])[1:])
+
+    def __process_trait_names_and_values__(acc, line):
+        trait_line = parse_csv_line(line)
+        trait_name = trait_line[0]
+        trait_data = trait_line[1:]
+        if trait_name in fetched_correlations.keys():
+            return (
+                acc[0] + (trait_name,),
+                acc[1] + tuple(
+                    trait_data[i] if i in good_dataset_samples else None
+                    for i in range(len(trait_data))))
+        return acc
+
+    processed_trait_names_values: tuple = reduce(
+        __process_trait_names_and_values__, dataset[1:], (tuple(), tuple()))
+    all_target_trait_names: Tuple[str, ...] = processed_trait_names_values[0]
+    all_target_trait_values: Tuple[float, ...] = processed_trait_names_values[1]
+
+    all_correlations = determine_partials(
+        primary_vals, control_vals, all_target_trait_names,
+        all_target_trait_values, method)
+    ## Line 772 to 779 in GN1 are the cause of the weird complexity in the
+    ## return below. Once the surrounding code is successfully migrated and
+    ## reworked, this complexity might go away, by getting rid of the
+    ## `correlation_type` parameter
+    return len(all_correlations), tuple(
+        corr + (
+            (fetched_correlations[corr[0]],) if correlation_type == "literature"
+            else fetched_correlations[corr[0]][0:2])
+        for idx, corr in enumerate(all_correlations))
-- 
cgit v1.2.3
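
The `__process_trait_names_and_values__` helper in the hunk above folds the
dataset's lines into two parallel tuples: the trait names that appear in
`fetched_correlations`, and a single flat tuple of sample values for those
traits, with `None` standing in at positions not listed in
`good_dataset_samples`. Below is a self-contained sketch of that fold with toy
CSV lines in place of the real database file; the trait names, values and the
index set are invented for illustration, and a naive `str.split` stands in for
`gn3.data_helpers.parse_csv_line`:

    from functools import reduce

    fetched = {"traitA": (0.9, 0.01)}   # hypothetical pre-fetched correlations
    good_indexes = (0, 2)               # columns whose samples matched the header

    def fold_line(acc, line):
        # Mirrors __process_trait_names_and_values__ from the patch, with a
        # naive CSV split standing in for parse_csv_line.
        fields = line.split(",")
        name, data = fields[0], fields[1:]
        if name in fetched:
            return (
                acc[0] + (name,),
                acc[1] + tuple(
                    data[i] if i in good_indexes else None
                    for i in range(len(data))))
        return acc

    names, values = reduce(
        fold_line, ("traitA,1.1,2.2,3.3", "traitB,4.4,5.5,6.6"),
        (tuple(), tuple()))
    # names  == ("traitA",)
    # values == ("1.1", None, "3.3"); values stay flat across traits

Note that the accumulated values are concatenated into one flat tuple rather
than nested per trait, which matches how the patch builds
`all_target_trait_values`.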
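
For orientation, a minimal usage sketch of the new function, assuming a
hypothetical CSV file under `TEXTDIR` whose first line names the samples and
whose remaining lines each start with a trait name. The sample names, trait
values, filename and the `fetched_correlations` mapping are all invented, and
the shape of `control_vals` is an assumption based on how `determine_partials`
is called:

    from gn3.computations.partial_correlations import (
        compute_partial_correlations_fast)

    num_correlations, correlations = compute_partial_correlations_fast(
        samples=("BXD1", "BXD2", "BXD5"),       # hypothetical sample names
        primary_vals=(7.51, 7.77, 8.39),        # primary trait's values
        control_vals=((6.12, 6.34, 6.91),),     # one hypothetical control trait
        database_filename="ProbeSetData.csv",   # hypothetical file in TEXTDIR
        fetched_correlations={"1427571_at": (0.85, 0.001)},
        method="pearson",
        correlation_type="tissue")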