3 files changed, 95 insertions, 7 deletions
diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py
index 07dc16d..07a67be 100644
--- a/gn3/computations/partial_correlations.py
+++ b/gn3/computations/partial_correlations.py
@@ -5,11 +5,14 @@ It is an attempt to migrate over the partial correlations feature from
 GeneNetwork1.
 """
 
+import math
 from functools import reduce
-from typing import Any, Tuple, Sequence
+from typing import Any, Tuple, Union, Sequence
 from scipy.stats import pearsonr, spearmanr
 
-from gn3.settings import TEXTDIR
+import pandas
+
+from gn3.settings import TEXTDIR, ROUND_TO
 from gn3.data_helpers import parse_csv_line
 
 def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]):
@@ -258,10 +261,27 @@ def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914]
             else fetched_correlations[corr[0]][0:2])
         for idx, corr in enumerate(all_correlations))
 
+def build_data_frame(
+        xdata: Tuple[float, ...], ydata: Tuple[float, ...],
+        zdata: Union[
+            Tuple[float, ...],
+            Tuple[Tuple[float, ...], ...]]) -> pandas.DataFrame:
+    """
+    Build a pandas DataFrame object from xdata, ydata and zdata
+    """
+    x_y_df = pandas.DataFrame({"x": xdata, "y": ydata})
+    if isinstance(zdata[0], float):
+        return x_y_df.join(pandas.DataFrame({"z": zdata}))
+    interm_df = x_y_df.join(pandas.DataFrame(
+        {"z{}".format(i): val for i, val in enumerate(row)} for row in zdata))
+    if interm_df.shape[1] == 3:
+        return interm_df.rename(columns={"z0": "z"})
+    return interm_df
+
 def partial_correlation_matrix(
         xdata: Tuple[float, ...], ydata: Tuple[float, ...],
-        zdata: Tuple[float, ...], method: str = "pearsons",
-        omit_nones: bool = True) -> float:
+        zdata: Union[Tuple[float, ...], Tuple[Tuple[float, ...], ...]],
+        method: str = "pearson", omit_nones: bool = True) -> float:
     """
     Computes the partial correlation coefficient using the
     'variance-covariance matrix' method
@@ -275,8 +295,8 @@ def partial_correlation_matrix(
 
 def partial_correlation_recursive(
         xdata: Tuple[float, ...], ydata: Tuple[float, ...],
-        zdata: Tuple[float, ...], method: str = "pearsons",
-        omit_nones: bool = True) -> float:
+        zdata: Union[Tuple[float, ...], Tuple[Tuple[float, ...], ...]],
+        method: str = "pearson", omit_nones: bool = True) -> float:
     """
     Computes the partial correlation coefficient using the 'recursive formula'
     method
@@ -286,4 +306,55 @@ def partial_correlation_recursive(
     GeneNetwork1, specifically the `pcor.rec` function written in the R
     programming language.
     """
-    return 0
+    assert method in ("pearson", "spearman", "kendall")
+    data = (
+        build_data_frame(xdata, ydata, zdata).dropna(axis=0)
+        if omit_nones else
+        build_data_frame(xdata, ydata, zdata))
+
+    if data.shape[1] == 3: # z is a vector, not matrix
+        fields = {
+            "rxy": ("x", "y"),
+            "rxz": ("x", "z"),
+            "ryz": ("y", "z")}
+        tdata = {
+            corr_type: pandas.DataFrame(
+                {cols[0]: data[cols[0]],
+                 cols[1]: data[cols[1]]}).dropna(axis=0)
+            for corr_type, cols in fields.items()
+        }
+        corrs = {
+            corr_type: tdata[corr_type][cols[0]].corr(
+                tdata[corr_type][cols[1]], method=method)
+            for corr_type, cols in fields.items()
+        }
+        return round((
+            (corrs["rxy"] - corrs["rxz"] * corrs["ryz"]) /
+            (math.sqrt(1 - corrs["rxz"]**2) *
+             math.sqrt(1 - corrs["ryz"]**2))), ROUND_TO)
+
+    remaining_cols = [
+        colname for colname, series in data.items()
+        if colname not in ("x", "y", "z0")
+    ]
+
+    new_xdata = tuple(data["x"])
+    new_ydata = tuple(data["y"])
+    zc = tuple(
+        tuple(row_series[1])
+        for row_series in data[remaining_cols].iterrows())
+
+    rxy_zc = partial_correlation_recursive(
+        new_xdata, new_ydata, zc, method=method,
+        omit_nones=omit_nones)
+    rxz0_zc = partial_correlation_recursive(
+        new_xdata, tuple(data["z0"]), zc, method=method,
+        omit_nones=omit_nones)
+    ryz0_zc = partial_correlation_recursive(
+        new_ydata, tuple(data["z0"]), zc, method=method,
+        omit_nones=omit_nones)
+
+    return round(
+        ((rxy_zc - rxz0_zc * ryz0_zc) /(
+            math.sqrt(1 - rxz0_zc**2) * math.sqrt(1 - ryz0_zc**2))),
+        ROUND_TO)
diff --git a/gn3/data_helpers.py b/gn3/data_helpers.py
index d3f942b..b72fbc5 100644
--- a/gn3/data_helpers.py
+++ b/gn3/data_helpers.py
@@ -24,6 +24,21 @@ def partition_all(num: int, items: Sequence[Any]) -> Tuple[Tuple[Any, ...], ...]
         in reduce(
             __compute_start_stop__, iterations, tuple())])
 
+def partition_by(partition_fn, items):
+    """
+    Given a sequence `items`, return a tuple of tuples, each of which contain
+    the values in `items` partitioned such that the first item in each internal
+    tuple, when passed to `partition_function` returns True.
+
+    This is an approximation of Clojure's `partition-by` function.
+    """
+    def __partitioner__(accumulator, item):
+        if partition_fn(item):
+            return accumulator + ((item,),)
+        return accumulator[:-1] + (accumulator[-1] + (item,),)
+
+    return reduce(__partitioner__, items, tuple())
+
 def parse_csv_line(
         line: str, delimiter: str = ",",
         quoting: Optional[str] = '"') -> Tuple[str, ...]:
diff --git a/gn3/settings.py b/gn3/settings.py
index 57c63df..eaf8f23 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -53,3 +53,5 @@ CORS_HEADERS = [
 
 GNSHARE = os.environ.get("GNSHARE", "/gnshare/gn/")
 TEXTDIR = f"{GNSHARE}/web/ProbeSetFreeze_DataMatrix"
+
+ROUND_TO = 10