Merge pull request #56 from genenetwork/partial-correlations

Partial correlations
author: Muriithi Frederick Muriuki 2021-11-20 18:06:32 +0300
committer: GitHub 2021-11-20 18:06:32 +0300
commit: 92d5766f5514181cd6aa82fc0d0f225666e892cb (patch)
tree: 74840e8e2118e24e1f49eb780ca0bbf24704e510 /gn3
parent: abc0d36f39c691652fee81bce808d625fc368e72 (diff)
parent: 08c81b8892060353bb7fb15555875f03bbdcb46e (diff)
download: genenetwork3-92d5766f5514181cd6aa82fc0d0f225666e892cb.tar.gz
3 files changed, 86 insertions, 26 deletions
diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py
index 07dc16d..4f45159 100644
--- a/gn3/computations/partial_correlations.py
+++ b/gn3/computations/partial_correlations.py
@@ -5,10 +5,14 @@ It is an attempt to migrate over the partial correlations feature from
 GeneNetwork1.
 """
 
+import math
 from functools import reduce
-from typing import Any, Tuple, Sequence
+from typing import Any, Tuple, Union, Sequence
 from scipy.stats import pearsonr, spearmanr
 
+import pandas
+import pingouin
+
 from gn3.settings import TEXTDIR
 from gn3.data_helpers import parse_csv_line
 
@@ -152,7 +156,7 @@ def tissue_correlation(
         "Method must be one of: {}".format(",".join(method_fns.keys())))
 
     corr, pvalue = method_fns[method](primary_trait_values, target_trait_values)
-    return (round(corr, 10), round(pvalue, 10))
+    return (corr, pvalue)
 
 def batch_computed_tissue_correlation(
         primary_trait_values: Tuple[float, ...], target_traits_dict: dict,
@@ -245,7 +249,7 @@ def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914]
     all_target_trait_names: Tuple[str, ...] = processed_trait_names_values[0]
     all_target_trait_values: Tuple[float, ...] = processed_trait_names_values[1]
 
-    all_correlations = determine_partials(
+    all_correlations = compute_partial(
         primary_vals, control_vals, all_target_trait_names,
         all_target_trait_values, method)
     ## Line 772 to 779 in GN1 are the cause of the weird complexity in the
@@ -258,32 +262,71 @@ def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914]
             else fetched_correlations[corr[0]][0:2])
         for idx, corr in enumerate(all_correlations))
 
-def partial_correlation_matrix(
+def build_data_frame(
         xdata: Tuple[float, ...], ydata: Tuple[float, ...],
-        zdata: Tuple[float, ...], method: str = "pearsons",
-        omit_nones: bool = True) -> float:
+        zdata: Union[
+            Tuple[float, ...],
+            Tuple[Tuple[float, ...], ...]]) -> pandas.DataFrame:
     """
-    Computes the partial correlation coefficient using the
-    'variance-covariance matrix' method
-
-    This is a partial migration of the
-    `web.webqtl.correlation.correlationFunction.determinPartialsByR` function in
-    GeneNetwork1, specifically the `pcor.mat` function written in the R
-    programming language.
+    Build a pandas DataFrame object from xdata, ydata and zdata
     """
-    return 0
-
-def partial_correlation_recursive(
-        xdata: Tuple[float, ...], ydata: Tuple[float, ...],
-        zdata: Tuple[float, ...], method: str = "pearsons",
-        omit_nones: bool = True) -> float:
+    x_y_df = pandas.DataFrame({"x": xdata, "y": ydata})
+    if isinstance(zdata[0], float):
+        return x_y_df.join(pandas.DataFrame({"z": zdata}))
+    interm_df = x_y_df.join(pandas.DataFrame(
+        {"z{}".format(i): val for i, val in enumerate(zdata)}))
+    if interm_df.shape[1] == 3:
+        return interm_df.rename(columns={"z0": "z"})
+    return interm_df
+
+def compute_partial(
+        primary_vals, control_vals, target_vals, target_names,
+        method: str) -> Tuple[
+            Union[
+                Tuple[str, int, float, float, float, float], None],
+            ...]:
     """
-    Computes the partial correlation coefficient using the 'recursive formula'
-    method
+    Compute the partial correlations.
 
-    This is a partial migration of the
-    `web.webqtl.correlation.correlationFunction.determinPartialsByR` function in
-    GeneNetwork1, specifically the `pcor.rec` function written in the R
-    programming language.
+    This is a re-implementation of the
+    `web.webqtl.correlation.correlationFunction.determinePartialsByR` function
+    in GeneNetwork1.
+
+    This implementation reworks the child function `compute_partial` which will
+    then be used in the place of `determinPartialsByR`.
+
+    TODO: moving forward, we might need to use the multiprocessing library to
+    speed up the computations, in case they are found to be slow.
     """
-    return 0
+    # replace the R code with `pingouin.partial_corr`
+    def __compute_trait_info__(target):
+        primary = [
+            prim for targ, prim in zip(target, primary_vals)
+            if targ is not None]
+        datafrm = build_data_frame(
+            primary,
+            [targ for targ in target if targ is not None],
+            [cont for i, cont in enumerate(control_vals)
+             if target[i] is not None])
+        covariates = "z" if datafrm.shape[1] == 3 else [
+            col for col in datafrm.columns if col not in ("x", "y")]
+        ppc = pingouin.partial_corr(
+            data=datafrm, x="x", y="y", covar=covariates, method=method)
+        pc_coeff = ppc["r"]
+
+        zero_order_corr = pingouin.corr(
+            datafrm["x"], datafrm["y"], method=method)
+
+        if math.isnan(pc_coeff):
+            return (
+                target[1], len(primary), pc_coeff, 1, zero_order_corr["r"],
+                zero_order_corr["p-val"])
+        return (
+            target[1], len(primary), pc_coeff,
+            (ppc["p-val"] if not math.isnan(ppc["p-val"]) else (
+                0 if (abs(pc_coeff - 1) < 0.0000001) else 1)),
+            zero_order_corr["r"], zero_order_corr["p-val"])
+
+    return tuple(
+        __compute_trait_info__(target)
+        for target in zip(target_vals, target_names))
diff --git a/gn3/data_helpers.py b/gn3/data_helpers.py
index d3f942b..b72fbc5 100644
--- a/gn3/data_helpers.py
+++ b/gn3/data_helpers.py
@@ -24,6 +24,21 @@ def partition_all(num: int, items: Sequence[Any]) -> Tuple[Tuple[Any, ...], ...]
         in reduce(
             __compute_start_stop__, iterations, tuple())])
 
+def partition_by(partition_fn, items):
+    """
+    Given a sequence `items`, return a tuple of tuples, each of which contain
+    the values in `items` partitioned such that the first item in each internal
+    tuple, when passed to `partition_function` returns True.
+
+    This is an approximation of Clojure's `partition-by` function.
+    """
+    def __partitioner__(accumulator, item):
+        if partition_fn(item):
+            return accumulator + ((item,),)
+        return accumulator[:-1] + (accumulator[-1] + (item,),)
+
+    return reduce(__partitioner__, items, tuple())
+
 def parse_csv_line(
         line: str, delimiter: str = ",",
         quoting: Optional[str] = '"') -> Tuple[str, ...]:
diff --git a/gn3/settings.py b/gn3/settings.py
index 0a6d03b..0ac6698 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -52,3 +52,5 @@ CORS_HEADERS = [
 
 GNSHARE = os.environ.get("GNSHARE", "/gnshare/gn/")
 TEXTDIR = f"{GNSHARE}/web/ProbeSetFreeze_DataMatrix"
+
+ROUND_TO = 10
author	Muriithi Frederick Muriuki	2021-11-20 18:06:32 +0300
committer	GitHub	2021-11-20 18:06:32 +0300
commit	92d5766f5514181cd6aa82fc0d0f225666e892cb (patch)
tree	74840e8e2118e24e1f49eb780ca0bbf24704e510 /gn3
parent	abc0d36f39c691652fee81bce808d625fc368e72 (diff)
parent	08c81b8892060353bb7fb15555875f03bbdcb46e (diff)
download	genenetwork3-92d5766f5514181cd6aa82fc0d0f225666e892cb.tar.gz