From 905626a2a27332f2fab74195bbcf615bf5c5b6bf Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Tue, 9 Nov 2021 16:41:48 +0300 Subject: replace list with generators --- gn3/computations/correlations.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'gn3/computations') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index c930df0..8eaa523 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -49,13 +49,9 @@ def normalize_values(a_values: List, ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3) """ - a_new = [] - b_new = [] for a_val, b_val in zip(a_values, b_values): if (a_val and b_val is not None): - a_new.append(a_val) - b_new.append(b_val) - return a_new, b_new, len(a_new) + yield a_val, b_val def compute_corr_coeff_p_value(primary_values: List, target_values: List, @@ -81,8 +77,10 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals, correlation coeff and p value """ - (sanitized_traits_vals, sanitized_target_vals, - num_overlap) = normalize_values(trait_vals, target_samples_vals) + + sanitized_traits_vals, sanitized_target_vals = list( + zip(*list(normalize_values(trait_vals, target_samples_vals)))) + num_overlap = len(sanitized_traits_vals) if num_overlap > 5: @@ -114,13 +112,9 @@ def filter_shared_sample_keys(this_samplelist, filter the values using the shared keys """ - this_vals = [] - target_vals = [] for key, value in target_samplelist.items(): if key in this_samplelist: - target_vals.append(value) - this_vals.append(this_samplelist[key]) - return (this_vals, target_vals) + yield value, this_samplelist[key] def fast_compute_all_sample_correlation(this_trait, @@ -139,9 +133,10 @@ def fast_compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - processed_values.append((trait_name, corr_method, *filter_shared_sample_keys( - this_trait_samples, target_trait_data))) - with multiprocessing.Pool(4) as pool: + processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))) + )) + with multiprocessing.Pool() as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) for sample_correlation in results: @@ -172,8 +167,10 @@ def compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - this_vals, target_vals = filter_shared_sample_keys( - this_trait_samples, target_trait_data) + this_vals, target_vals = list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))) + # this_vals, target_vals = filter_shared_sample_keys( + # this_trait_samples, target_trait_data) sample_correlation = compute_sample_r_correlation( trait_name=trait_name, -- cgit v1.2.3 From 01ddb7300b451108983327ae11f69e265a2ec2e0 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Wed, 10 Nov 2021 11:38:35 +0300 Subject: fix:spawned processes memory issues --- gn3/computations/correlations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gn3/computations') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 8eaa523..8302afc 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -1,6 +1,7 @@ """module contains code for correlations""" import math import multiprocessing +from contextlib import closing from typing import List from typing import Tuple @@ -136,7 +137,7 @@ def fast_compute_all_sample_correlation(this_trait, processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( this_trait_samples, target_trait_data)))) )) - with multiprocessing.Pool() as pool: + with closing(multiprocessing.Pool()) as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) for sample_correlation in results: -- cgit v1.2.3 From e9fb78b5bc43bd8c63b8b790f0f3fe826051fbe7 Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 00:23:55 +0300 Subject: fix target and base sample data order --- gn3/computations/correlations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gn3/computations') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 8302afc..4987571 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -115,7 +115,7 @@ def filter_shared_sample_keys(this_samplelist, """ for key, value in target_samplelist.items(): if key in this_samplelist: - yield value, this_samplelist[key] + yield this_samplelist[key], value def fast_compute_all_sample_correlation(this_trait, -- cgit v1.2.3 From fa1af0daa093e80a2c235f0294d7fe61a5b65b4b Mon Sep 17 00:00:00 2001 From: Alexander Kabui Date: Thu, 11 Nov 2021 00:31:48 +0300 Subject: pylint fixes and pep8 formatting --- gn3/computations/correlations.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'gn3/computations') diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index 4987571..c5c56db 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -134,9 +134,9 @@ def fast_compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys( - this_trait_samples, target_trait_data)))) - )) + processed_values.append((trait_name, corr_method, + list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))))) with closing(multiprocessing.Pool()) as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) @@ -170,8 +170,6 @@ def compute_all_sample_correlation(this_trait, target_trait_data = target_trait["trait_sample_data"] this_vals, target_vals = list(zip(*list(filter_shared_sample_keys( this_trait_samples, target_trait_data)))) - # this_vals, target_vals = filter_shared_sample_keys( - # this_trait_samples, target_trait_data) sample_correlation = compute_sample_r_correlation( trait_name=trait_name, -- cgit v1.2.3 From 4e790f08000825931cb5edec1738d2b7d073f73e Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 15:45:22 +0530 Subject: Reimplement __items_with_values using list comprehension. * gn3/computations/correlations2.py: Remove import of reduce from functools. (__items_with_values): Reimplement using list comprehension. --- gn3/computations/correlations2.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'gn3/computations') diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py index 93db3fa..69921b1 100644 --- a/gn3/computations/correlations2.py +++ b/gn3/computations/correlations2.py @@ -7,24 +7,13 @@ compute_correlation: TODO: Describe what the function does...""" from math import sqrt -from functools import reduce ## From GN1: mostly for clustering and heatmap generation def __items_with_values(dbdata, userdata): """Retains only corresponding items in the data items that are not `None` values. This should probably be renamed to something sensible""" - def both_not_none(item1, item2): - """Check that both items are not the value `None`.""" - if (item1 is not None) and (item2 is not None): - return (item1, item2) - return None - def split_lists(accumulator, item): - """Separate the 'x' and 'y' items.""" - return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]] - return reduce( - split_lists, - filter(lambda x: x is not None, map(both_not_none, dbdata, userdata)), - [[], []]) + filtered = [x for x in zip(dbdata, userdata) if x[0] is not None and x[1] is not None] + return tuple(zip(*filtered)) if filtered else ([], []) def compute_correlation(dbdata, userdata): """Compute some form of correlation. -- cgit v1.2.3 From ec1d2180d99e0cde1dc181ee9ed79e86cf1a5675 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 11 Nov 2021 16:10:35 +0530 Subject: Reimplement correlations2.compute_correlation using pearsonr. correlations2.compute_correlation computes the Pearson correlation coefficient. Outsource this computation to scipy.stats.pearsonr. When the inputs are constant, the Pearson correlation coefficient does not exist and is represented by NaN. Update the tests to reflect this. * gn3/computations/correlations2.py: Remove import of sqrt from math. (compute_correlation): Reimplement using scipy.stats.pearsonr. * tests/unit/computations/test_correlation.py: Import math. (TestCorrelation.test_compute_correlation): When inputs are constant, set expected correlation coefficient to NaN. --- gn3/computations/correlations2.py | 21 ++++----------------- tests/unit/computations/test_correlation.py | 5 +++-- 2 files changed, 7 insertions(+), 19 deletions(-) (limited to 'gn3/computations') diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py index 69921b1..d0222ae 100644 --- a/gn3/computations/correlations2.py +++ b/gn3/computations/correlations2.py @@ -6,7 +6,7 @@ FUNCTIONS: compute_correlation: TODO: Describe what the function does...""" -from math import sqrt +from scipy import stats ## From GN1: mostly for clustering and heatmap generation def __items_with_values(dbdata, userdata): @@ -16,24 +16,11 @@ def __items_with_values(dbdata, userdata): return tuple(zip(*filtered)) if filtered else ([], []) def compute_correlation(dbdata, userdata): - """Compute some form of correlation. + """Compute the Pearson correlation coefficient. This is extracted from https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647 """ x_items, y_items = __items_with_values(dbdata, userdata) - if len(x_items) < 6: - return (0.0, len(x_items)) - meanx = sum(x_items)/len(x_items) - meany = sum(y_items)/len(y_items) - def cal_corr_vals(acc, item): - xitem, yitem = item - return [ - acc[0] + ((xitem - meanx) * (yitem - meany)), - acc[1] + ((xitem - meanx) * (xitem - meanx)), - acc[2] + ((yitem - meany) * (yitem - meany))] - xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0]) - try: - return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items)) - except ZeroDivisionError: - return(0, len(x_items)) + correlation = stats.pearsonr(x_items, y_items)[0] if len(x_items) >= 6 else 0 + return (correlation, len(x_items)) diff --git a/tests/unit/computations/test_correlation.py b/tests/unit/computations/test_correlation.py index e6cf198..d60dd62 100644 --- a/tests/unit/computations/test_correlation.py +++ b/tests/unit/computations/test_correlation.py @@ -4,6 +4,7 @@ from unittest import mock import unittest from collections import namedtuple +import math from numpy.testing import assert_almost_equal from gn3.computations.correlations import normalize_values @@ -471,10 +472,10 @@ class TestCorrelation(TestCase): [None, None, None, None, None, None, None, None, None, 0], (0.0, 1)], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - (0, 10)], + (math.nan, 10)], [[9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87], [9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87], - (0.9999999999999998, 10)], + (math.nan, 10)], [[9.3, 2.2, 5.4, 7.2, 6.4, 7.6, 3.8, 1.8, 8.4, 0.2], [0.6, 3.97, 5.82, 8.21, 1.65, 4.55, 6.72, 9.5, 7.33, 2.34], (-0.12720361919462056, 10)], -- cgit v1.2.3