From 8f036415975d6e224e5e94277997329c0f1fa159 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Fri, 29 Oct 2021 09:49:28 +0300 Subject: Feature/biweight reimplementation (#47) * add biweight reimplementation with pingouin * delete biweight scripts and tests * add python-pingouin to guix file * delete biweight paths * mypy fix:pingouin mising imports * pep8 formatting && pylint fixes--- gn3/computations/correlations.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'gn3/computations/correlations.py') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index bb13ff1..c930df0 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -8,7 +8,7 @@ from typing import Optional from typing import Callable import scipy.stats -from gn3.computations.biweight import calculate_biweight_corr +import pingouin as pg def map_shared_keys_to_values(target_sample_keys: List, @@ -102,11 +102,10 @@ package :not packaged in guix """ - try: - results = calculate_biweight_corr(x_val, y_val) - return results - except Exception as error: - raise error + results = pg.corr(x_val, y_val, method="bicor") + corr_coeff = results["r"].values[0] + p_val = results["p-val"].values[0] + return (corr_coeff, p_val) def filter_shared_sample_keys(this_samplelist, -- cgit v1.2.3 From 905626a2a27332f2fab74195bbcf615bf5c5b6bf Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 9 Nov 2021 16:41:48 +0300 Subject: replace list with generators --- gn3/computations/correlations.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'gn3/computations/correlations.py') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index c930df0..8eaa523 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -49,13 +49,9 @@ def normalize_values(a_values: List, ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3) """ - a_new = [] - b_new = [] for a_val, b_val in zip(a_values, b_values): if (a_val and b_val is not None): - a_new.append(a_val) - b_new.append(b_val) - return a_new, b_new, len(a_new) + yield a_val, b_val def compute_corr_coeff_p_value(primary_values: List, target_values: List, @@ -81,8 +77,10 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals, correlation coeff and p value """ - (sanitized_traits_vals, sanitized_target_vals, - num_overlap) = normalize_values(trait_vals, target_samples_vals) + + sanitized_traits_vals, sanitized_target_vals = list( + zip(*list(normalize_values(trait_vals, target_samples_vals)))) + num_overlap = len(sanitized_traits_vals) if num_overlap > 5: @@ -114,13 +112,9 @@ def filter_shared_sample_keys(this_samplelist, filter the values using the shared keys """ - this_vals = [] - target_vals = [] for key, value in target_samplelist.items(): if key in this_samplelist: - target_vals.append(value) - this_vals.append(this_samplelist[key]) - return (this_vals, target_vals) + yield value, this_samplelist[key] def fast_compute_all_sample_correlation(this_trait, @@ -139,9 +133,10 @@ def fast_compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - processed_values.append((trait_name, corr_method, *filter_shared_sample_keys( - this_trait_samples, target_trait_data))) - with multiprocessing.Pool(4) as pool: + processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))) + )) + with multiprocessing.Pool() as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) for sample_correlation in results: @@ -172,8 +167,10 @@ def compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - this_vals, target_vals = filter_shared_sample_keys( - this_trait_samples, target_trait_data) + this_vals, target_vals = list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))) + # this_vals, target_vals = filter_shared_sample_keys( + # this_trait_samples, target_trait_data) sample_correlation = compute_sample_r_correlation( trait_name=trait_name, -- cgit v1.2.3 From 01ddb7300b451108983327ae11f69e265a2ec2e0 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Nov 2021 11:38:35 +0300 Subject: fix:spawned processes memory issues --- gn3/computations/correlations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gn3/computations/correlations.py') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 8eaa523..8302afc 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -1,6 +1,7 @@ """module contains code for correlations""" import math import multiprocessing +from contextlib import closing from typing import List from typing import Tuple @@ -136,7 +137,7 @@ def fast_compute_all_sample_correlation(this_trait, processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( this_trait_samples, target_trait_data)))) )) - with multiprocessing.Pool() as pool: + with closing(multiprocessing.Pool()) as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) for sample_correlation in results: -- cgit v1.2.3 From e9fb78b5bc43bd8c63b8b790f0f3fe826051fbe7 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 00:23:55 +0300 Subject: fix target and base sample data order --- gn3/computations/correlations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gn3/computations/correlations.py') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 8302afc..4987571 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -115,7 +115,7 @@ def filter_shared_sample_keys(this_samplelist, """ for key, value in target_samplelist.items(): if key in this_samplelist: - yield value, this_samplelist[key] + yield this_samplelist[key], value def fast_compute_all_sample_correlation(this_trait, -- cgit v1.2.3 From fa1af0daa093e80a2c235f0294d7fe61a5b65b4b Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 00:31:48 +0300 Subject: pylint fixes and pep8 formatting --- gn3/computations/correlations.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'gn3/computations/correlations.py') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 4987571..c5c56db 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -134,9 +134,9 @@ def fast_compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( - this_trait_samples, target_trait_data)))) - )) + processed_values.append((trait_name, corr_method, + list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))))) with closing(multiprocessing.Pool()) as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) @@ -170,8 +170,6 @@ def compute_all_sample_correlation(this_trait, target_trait_data = target_trait["trait_sample_data"] this_vals, target_vals = list(zip(*list(filter_shared_sample_keys( this_trait_samples, target_trait_data)))) - # this_vals, target_vals = filter_shared_sample_keys( - # this_trait_samples, target_trait_data) sample_correlation = compute_sample_r_correlation( trait_name=trait_name, -- cgit v1.2.3