From 4e790f08000825931cb5edec1738d2b7d073f73e Mon Sep 17 00:00:00 2001
From: Arun Isaac
Date: Thu, 11 Nov 2021 15:45:22 +0530
Subject: Reimplement __items_with_values using list comprehension.

* gn3/computations/correlations2.py: Remove import of reduce from functools.
(__items_with_values): Reimplement using list comprehension.
---
 gn3/computations/correlations2.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'gn3/computations/correlations2.py')

diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py
index 93db3fa..69921b1 100644
--- a/gn3/computations/correlations2.py
+++ b/gn3/computations/correlations2.py
@@ -7,24 +7,13 @@ compute_correlation:
     TODO: Describe what the function does..."""
 
 from math import sqrt
-from functools import reduce
 ## From GN1: mostly for clustering and heatmap generation
 
 def __items_with_values(dbdata, userdata):
     """Retains only corresponding items in the data items that are not `None` values.
     This should probably be renamed to something sensible"""
-    def both_not_none(item1, item2):
-        """Check that both items are not the value `None`."""
-        if (item1 is not None) and (item2 is not None):
-            return (item1, item2)
-        return None
-    def split_lists(accumulator, item):
-        """Separate the 'x' and 'y' items."""
-        return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]]
-    return reduce(
-        split_lists,
-        filter(lambda x: x is not None, map(both_not_none, dbdata, userdata)),
-        [[], []])
+    filtered = [x for x in zip(dbdata, userdata) if x[0] is not None and x[1] is not None]
+    return tuple(zip(*filtered)) if filtered else ([], [])
 
 def compute_correlation(dbdata, userdata):
     """Compute some form of correlation.
-- 
cgit 1.4.1


From ec1d2180d99e0cde1dc181ee9ed79e86cf1a5675 Mon Sep 17 00:00:00 2001
From: Arun Isaac
Date: Thu, 11 Nov 2021 16:10:35 +0530
Subject: Reimplement correlations2.compute_correlation using pearsonr.

correlations2.compute_correlation computes the Pearson correlation
coefficient. Outsource this computation to scipy.stats.pearsonr. When the
inputs are constant, the Pearson correlation coefficient does not exist and is
represented by NaN. Update the tests to reflect this.

* gn3/computations/correlations2.py: Remove import of sqrt from math.
(compute_correlation): Reimplement using scipy.stats.pearsonr.
* tests/unit/computations/test_correlation.py: Import math.
(TestCorrelation.test_compute_correlation): When inputs are constant, set
expected correlation coefficient to NaN.
---
 gn3/computations/correlations2.py           | 21 ++++-----------------
 tests/unit/computations/test_correlation.py |  5 +++--
 2 files changed, 7 insertions(+), 19 deletions(-)

(limited to 'gn3/computations/correlations2.py')

diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py
index 69921b1..d0222ae 100644
--- a/gn3/computations/correlations2.py
+++ b/gn3/computations/correlations2.py
@@ -6,7 +6,7 @@ FUNCTIONS:
 compute_correlation:
     TODO: Describe what the function does..."""
 
-from math import sqrt
+from scipy import stats
 ## From GN1: mostly for clustering and heatmap generation
 
 def __items_with_values(dbdata, userdata):
@@ -16,24 +16,11 @@ def __items_with_values(dbdata, userdata):
     return tuple(zip(*filtered)) if filtered else ([], [])
 
 def compute_correlation(dbdata, userdata):
-    """Compute some form of correlation.
+    """Compute the Pearson correlation coefficient.
 
     This is extracted from
     https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647
     """
     x_items, y_items = __items_with_values(dbdata, userdata)
-    if len(x_items) < 6:
-        return (0.0, len(x_items))
-    meanx = sum(x_items)/len(x_items)
-    meany = sum(y_items)/len(y_items)
-    def cal_corr_vals(acc, item):
-        xitem, yitem = item
-        return [
-            acc[0] + ((xitem - meanx) * (yitem - meany)),
-            acc[1] + ((xitem - meanx) * (xitem - meanx)),
-            acc[2] + ((yitem - meany) * (yitem - meany))]
-    xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0])
-    try:
-        return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items))
-    except ZeroDivisionError:
-        return(0, len(x_items))
+    correlation = stats.pearsonr(x_items, y_items)[0] if len(x_items) >= 6 else 0
+    return (correlation, len(x_items))
diff --git a/tests/unit/computations/test_correlation.py b/tests/unit/computations/test_correlation.py
index e6cf198..d60dd62 100644
--- a/tests/unit/computations/test_correlation.py
+++ b/tests/unit/computations/test_correlation.py
@@ -4,6 +4,7 @@ from unittest import mock
 import unittest
 
 from collections import namedtuple
+import math
 from numpy.testing import assert_almost_equal
 
 from gn3.computations.correlations import normalize_values
@@ -471,10 +472,10 @@ class TestCorrelation(TestCase):
                  [None, None, None, None, None, None, None, None, None, 0],
                  (0.0, 1)],
                 [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                 (0, 10)],
+                 (math.nan, 10)],
                 [[9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87],
                  [9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87, 9.87],
-                 (0.9999999999999998, 10)],
+                 (math.nan, 10)],
                 [[9.3, 2.2, 5.4, 7.2, 6.4, 7.6, 3.8, 1.8, 8.4, 0.2],
                  [0.6, 3.97, 5.82, 8.21, 1.65, 4.55, 6.72, 9.5, 7.33, 2.34],
                  (-0.12720361919462056, 10)],
-- 
cgit 1.4.1