From 8f036415975d6e224e5e94277997329c0f1fa159 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Fri, 29 Oct 2021 09:49:28 +0300
Subject: Feature/biweight reimplementation (#47)

* add biweight reimplementation with pingouin

* delete biweight scripts and tests

* add python-pingouin to guix file

* delete biweight paths

* mypy fix: pingouin missing imports

* pep8 formatting && pylint fixes
---
 gn3/computations/correlations.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'gn3/computations/correlations.py')

diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index bb13ff1..c930df0 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -8,7 +8,7 @@ from typing import Optional
 from typing import Callable
 
 import scipy.stats
-from gn3.computations.biweight import calculate_biweight_corr
+import pingouin as pg
 
 
 def map_shared_keys_to_values(target_sample_keys: List,
@@ -102,11 +102,10 @@ package :not packaged in guix
 
     """
 
-    try:
-        results = calculate_biweight_corr(x_val, y_val)
-        return results
-    except Exception as error:
-        raise error
+    results = pg.corr(x_val, y_val, method="bicor")
+    corr_coeff = results["r"].values[0]
+    p_val = results["p-val"].values[0]
+    return (corr_coeff, p_val)
 
 
 def filter_shared_sample_keys(this_samplelist,
-- 
cgit v1.2.3


From 905626a2a27332f2fab74195bbcf615bf5c5b6bf Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Tue, 9 Nov 2021 16:41:48 +0300
Subject: replace list with generators

---
 gn3/computations/correlations.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'gn3/computations/correlations.py')

diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index c930df0..8eaa523 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -49,13 +49,9 @@ def normalize_values(a_values: List,
     ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3)
 
     """
-    a_new = []
-    b_new = []
     for a_val, b_val in zip(a_values, b_values):
         if (a_val and b_val is not None):
-            a_new.append(a_val)
-            b_new.append(b_val)
-    return a_new, b_new, len(a_new)
+            yield a_val, b_val
 
 
 def compute_corr_coeff_p_value(primary_values: List, target_values: List,
@@ -81,8 +77,10 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals,
     correlation coeff and p value
 
     """
-    (sanitized_traits_vals, sanitized_target_vals,
-     num_overlap) = normalize_values(trait_vals, target_samples_vals)
+
+    sanitized_traits_vals, sanitized_target_vals = list(
+        zip(*list(normalize_values(trait_vals, target_samples_vals))))
+    num_overlap = len(sanitized_traits_vals)
 
     if num_overlap > 5:
 
@@ -114,13 +112,9 @@ def filter_shared_sample_keys(this_samplelist,
     filter the values using the shared keys
 
     """
-    this_vals = []
-    target_vals = []
     for key, value in target_samplelist.items():
         if key in this_samplelist:
-            target_vals.append(value)
-            this_vals.append(this_samplelist[key])
-    return (this_vals, target_vals)
+            yield value, this_samplelist[key]
 
 
 def fast_compute_all_sample_correlation(this_trait,
@@ -139,9 +133,10 @@ def fast_compute_all_sample_correlation(this_trait,
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
-        processed_values.append((trait_name, corr_method, *filter_shared_sample_keys(
-            this_trait_samples, target_trait_data)))
-    with multiprocessing.Pool(4) as pool:
+        processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys(
+            this_trait_samples, target_trait_data))))
+        ))
+    with multiprocessing.Pool() as pool:
         results = pool.starmap(compute_sample_r_correlation, processed_values)
 
         for sample_correlation in results:
@@ -172,8 +167,10 @@ def compute_all_sample_correlation(this_trait,
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
-        this_vals, target_vals = filter_shared_sample_keys(
-            this_trait_samples, target_trait_data)
+        this_vals, target_vals = list(zip(*list(filter_shared_sample_keys(
+            this_trait_samples, target_trait_data))))
+        # this_vals, target_vals = filter_shared_sample_keys(
+        #     this_trait_samples, target_trait_data)
 
         sample_correlation = compute_sample_r_correlation(
             trait_name=trait_name,
-- 
cgit v1.2.3


From 01ddb7300b451108983327ae11f69e265a2ec2e0 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Wed, 10 Nov 2021 11:38:35 +0300
Subject: fix: spawned processes memory issues

---
 gn3/computations/correlations.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'gn3/computations/correlations.py')

diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 8eaa523..8302afc 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -1,6 +1,7 @@
 """module contains code for correlations"""
 import math
 import multiprocessing
+from contextlib import closing
 
 from typing import List
 from typing import Tuple
@@ -136,7 +137,7 @@ def fast_compute_all_sample_correlation(this_trait,
         processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys(
             this_trait_samples, target_trait_data))))
         ))
-    with multiprocessing.Pool() as pool:
+    with closing(multiprocessing.Pool()) as pool:
         results = pool.starmap(compute_sample_r_correlation, processed_values)
 
         for sample_correlation in results:
-- 
cgit v1.2.3


From e9fb78b5bc43bd8c63b8b790f0f3fe826051fbe7 Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 00:23:55 +0300
Subject: fix target and base sample data order

---
 gn3/computations/correlations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'gn3/computations/correlations.py')

diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 8302afc..4987571 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -115,7 +115,7 @@ def filter_shared_sample_keys(this_samplelist,
     """
     for key, value in target_samplelist.items():
         if key in this_samplelist:
-            yield value, this_samplelist[key]
+            yield this_samplelist[key], value
 
 
 def fast_compute_all_sample_correlation(this_trait,
-- 
cgit v1.2.3


From fa1af0daa093e80a2c235f0294d7fe61a5b65b4b Mon Sep 17 00:00:00 2001
From: Alexander Kabui
Date: Thu, 11 Nov 2021 00:31:48 +0300
Subject: pylint fixes and pep8 formatting

---
 gn3/computations/correlations.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'gn3/computations/correlations.py')

diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 4987571..c5c56db 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -134,9 +134,9 @@ def fast_compute_all_sample_correlation(this_trait,
     for target_trait in target_dataset:
         trait_name = target_trait.get("trait_id")
         target_trait_data = target_trait["trait_sample_data"]
-        processed_values.append((trait_name, corr_method, *list(zip(*list(filter_shared_sample_keys(
-            this_trait_samples, target_trait_data))))
-        ))
+        processed_values.append((trait_name, corr_method,
+                                 list(zip(*list(filter_shared_sample_keys(
+                                     this_trait_samples, target_trait_data))))))
     with closing(multiprocessing.Pool()) as pool:
         results = pool.starmap(compute_sample_r_correlation, processed_values)
 
@@ -170,8 +170,6 @@ def compute_all_sample_correlation(this_trait,
         target_trait_data = target_trait["trait_sample_data"]
         this_vals, target_vals = list(zip(*list(filter_shared_sample_keys(
             this_trait_samples, target_trait_data))))
-        # this_vals, target_vals = filter_shared_sample_keys(
-        #     this_trait_samples, target_trait_data)
 
         sample_correlation = compute_sample_r_correlation(
             trait_name=trait_name,
-- 
cgit v1.2.3