aboutsummaryrefslogtreecommitdiff
path: root/gn3
diff options
context:
space:
mode:
Diffstat (limited to 'gn3')
-rw-r--r--gn3/computations/correlations.py59
1 files changed, 47 insertions, 12 deletions
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 24e9871..cc166fa 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -9,6 +9,7 @@ from typing import Optional
from typing import Callable
import scipy.stats
+import numpy as np
import pingouin as pg
@@ -424,24 +425,58 @@ def fast_compute_tissue_correlation(primary_tissue_dict: dict,
key=lambda trait_name: -abs(list(trait_name.values())[0]["tissue_corr"]))
-def _mp_calculate(x_val, dataset, n_jobs, chunk_size):
+def compute_correlation_2(corr_inputs):
+
+ (this_trait_samples, target_trait) = corr_inputs
+
+ trait_name = target_trait.get("trait_id")
+ target_trait_data = target_trait["trait_sample_data"]
+
+ try:
+ (x_vals, y_vals) = list(zip(*list(filter_shared_sample_keys(
+ this_trait_samples, target_trait_data))))
+
+ x_vals = np.array(x_vals, dtype=float)
+ y_vals = np.array(y_vals, dtype=float)
+
+ if len(x_vals) > 5:
+ # remove nan values
+ nans_values = np.logical_or(np.isnan(x_vals), np.isnan(y_vals))
+
+ (corr_coeff, p_val) = scipy.stats.pearsonr(
+ x_vals[~nans_values], y_vals[~nans_values])
+
+ # print(corr_coeff, p_val)
+ return{trait_name: {
+ "corr_coefficient": corr_coeff,
+ "p_value": p_val,
+ "num_overlap": len(x_vals)
+ }}
+
+ except ValueError:
+ return
+
+
+def mp_calculate(this_trait, dataset, n_jobs: int = -1, chunk_size: int = 500):
"""corr mp reimplementation
credit:https://github.com/bukson/nancorrmp
- """
- def _compute_correlation_2(x_val, y_val, trait_name):
- """function to compute correlation"""
-
- return (trait_name, scipy.stats.pearsonr(x_val, y_val))
+ """
+ this_trait_samples = this_trait["trait_sample_data"]
- arguments = ((x_val, y_val, trait_name) for (trait_name, y_val) in dataset)
+ arguments = [(this_trait_samples, target_trait)
+ for target_trait in dataset]
processes = n_jobs if n_jobs > 0 else None
- worker_function = _compute_correlation_2
+ chunksize = len(arguments)//processes
+
+ worker_function = compute_correlation_2
- with multiprocessing.pool(processes=processes) as pool:
- results = list(pool.imap_unordered(
- worker_function, arguments, chunksize=chunks))
+ with multiprocessing.Pool(processes=processes) as pool:
+ corr_results = list(pool.imap_unordered(
+ worker_function, arguments, chunksize=chunksize))
- return results
+ return sorted(
+ corr_results,
+ key=lambda trait_name: -abs(list(trait_name.values())[0]["corr_coefficient"]))