diff options
author | Frederick Muriuki Muriithi | 2021-11-01 10:49:35 +0300 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2021-11-01 10:49:35 +0300 |
commit | 05740a60d6616f28751f96ca30adeb524f4369ad (patch) | |
tree | dead5dd6321e47cd0cab04942cf4a600c1ee6ca5 | |
parent | b56341f9144623cc41bc815b337490ace04b2495 (diff) | |
download | genenetwork3-05740a60d6616f28751f96ca30adeb524f4369ad.tar.gz |
Implement `compute_partial_correlations_fast`
Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi
* Implement `compute_partial_correlations_fast`, a partial migration of
`web.webqtl.correlation.PartialCorrDBPage.getPartialCorrelationsFast` from
GN1 (GeneNetwork1).
This function will probably be reworked once the code it depends on is fully
migrated.
Tests for it still need to be added.
-rw-r--r-- | gn3/computations/partial_correlations.py | 49 | ||||
-rw-r--r-- | gn3/settings.py | 3 |
2 files changed, 52 insertions, 0 deletions
diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index ba4de9e..1a6868a 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -9,6 +9,9 @@ from functools import reduce from typing import Any, Tuple, Sequence from scipy.stats import pearsonr, spearmanr +from gn3.settings import TEXTDIR +from gn3.data_helpers import parse_csv_line + def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): """ Fetches data for the control traits. @@ -192,3 +195,49 @@ def good_dataset_samples_indexes( return tuple(sorted( samples_from_file.index(good) for good in set(samples).intersection(set(samples_from_file)))) + +def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914] + samples, primary_vals, control_vals, database_filename, + fetched_correlations, method: str, correlation_type: str) -> Tuple[ + float, Tuple[float, ...]]: + """ + This is a partial migration of the + `web.webqtl.correlation.PartialCorrDBPage.getPartialCorrelationsFast` + function in GeneNetwork1. + """ + assert method in ("spearman", "pearson") + with open(f"{TEXTDIR}/{database_filename}", "r") as dataset_file: + dataset = tuple(dataset_file.readlines()) + + good_dataset_samples = good_dataset_samples_indexes( + samples, parse_csv_line(dataset[0])[1:]) + + def __process_trait_names_and_values__(acc, line): + trait_line = parse_csv_line(line) + trait_name = trait_line[0] + trait_data = trait_line[1:] + if trait_name in fetched_correlations.keys(): + return ( + acc[0] + (trait_name,), + acc[1] + tuple( + trait_data[i] if i in good_dataset_samples else None + for i in range(len(trait_data)))) + return acc + + processed_trait_names_values: tuple = reduce( + __process_trait_names_and_values__, dataset[1:], (tuple(), tuple())) + all_target_trait_names: Tuple[str, ...] = processed_trait_names_values[0] + all_target_trait_values: Tuple[float, ...] 
= processed_trait_names_values[1] + + all_correlations = determine_partials( + primary_vals, control_vals, all_target_trait_names, + all_target_trait_values, method) + ## Line 772 to 779 in GN1 are the cause of the weird complexity in the + ## return below. Once the surrounding code is successfully migrated and + ## reworked, this complexity might go away, by getting rid of the + ## `correlation_type` parameter + return len(all_correlations), tuple( + corr + ( + (fetched_correlations[corr[0]],) if correlation_type == "literature" + else fetched_correlations[corr[0]][0:2]) + for idx, corr in enumerate(all_correlations)) diff --git a/gn3/settings.py b/gn3/settings.py index e85eeff..57c63df 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -50,3 +50,6 @@ CORS_HEADERS = [ "Authorization", "Access-Control-Allow-Credentials" ] + +GNSHARE = os.environ.get("GNSHARE", "/gnshare/gn/") +TEXTDIR = f"{GNSHARE}/web/ProbeSetFreeze_DataMatrix" |