From 0357f5c5e6eeb146eb259337019c87079363a256 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Thu, 4 Nov 2021 12:38:27 +0300
Subject: Implement `build_data_frame`

Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi

* gn3/computations/partial_correlations.py: new function
  (`build_data_frame`)
* tests/unit/computations/test_partial_correlations.py: Add tests for new
  function

Add a new function to build a pandas DataFrame object from the provided
values:
- x: a vector of floats (represented with a tuple of floats)
- y: a vector of floats (represented with a tuple of floats)
- z: a vector OR matrix of floats (represented with a tuple of floats or a
  tuple of tuples of floats)
---
 gn3/computations/partial_correlations.py             | 16 ++++++++++++++++
 .../unit/computations/test_partial_correlations.py   | 22 ++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py
index 07dc16d..ffdf0c5 100644
--- a/gn3/computations/partial_correlations.py
+++ b/gn3/computations/partial_correlations.py
@@ -10,6 +10,8 @@ from typing import Any, Tuple, Sequence
 from scipy.stats import pearsonr, spearmanr
 
 from gn3.settings import TEXTDIR
+import pandas
+
 from gn3.data_helpers import parse_csv_line
 
 def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]):
@@ -258,6 +260,20 @@ def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914]
         else fetched_correlations[corr[0]][0:2])
         for idx, corr in enumerate(all_correlations))
 
+def build_data_frame(
+        xdata: Tuple[float, ...], ydata: Tuple[float, ...],
+        zdata: Union[
+            Tuple[float, ...],
+            Tuple[Tuple[float, ...], ...]]) -> pandas.DataFrame:
+    """
+    Build a pandas DataFrame object from xdata, ydata and zdata
+    """
+    x_y_df = pandas.DataFrame({"x": xdata, "y": ydata})
+    if isinstance(zdata[0], float):
+        return x_y_df.join(pandas.DataFrame({"z": zdata}))
+    return x_y_df.join(pandas.DataFrame(
+        {"z{}".format(i): val for i, val in enumerate(row)} for row in zdata))
+
 def partial_correlation_matrix(
         xdata: Tuple[float, ...], ydata: Tuple[float, ...],
         zdata: Tuple[float, ...], method: str = "pearsons",
diff --git a/tests/unit/computations/test_partial_correlations.py b/tests/unit/computations/test_partial_correlations.py
index c5c35d1..b22bc62 100644
--- a/tests/unit/computations/test_partial_correlations.py
+++ b/tests/unit/computations/test_partial_correlations.py
@@ -2,9 +2,12 @@
 
 import csv
 from unittest import TestCase
+
+import pandas
 from gn3.computations.partial_correlations import (
     fix_samples,
     control_samples,
+    build_data_frame,
     dictify_by_samples,
     tissue_correlation,
     find_identical_traits,
@@ -297,6 +300,25 @@ class TestPartialCorrelations(TestCase):
                  ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l")),
                 (0, 4, 8, 10))
 
+    def test_build_data_frame(self):
+        """
+        Check that the function builds the correct data frame.
+        """
+        for xdata, ydata, zdata, expected in (
+                ((0.1, 1.1, 2.1), (2.1, 3.1, 4.1), (5.1, 6.1 ,7.1),
+                 pandas.DataFrame({
+                     "x": (0.1, 1.1, 2.1), "y": (2.1, 3.1, 4.1),
+                     "z": (5.1, 6.1 ,7.1)})),
+                ((0.1, 1.1, 2.1), (2.1, 3.1, 4.1),
+                 ((5.1, 6.1 ,7.1), (5.2, 6.2, 7.2), (5.3, 6.3, 7.3)),
+                 pandas.DataFrame({
+                     "x": (0.1, 1.1, 2.1), "y": (2.1, 3.1, 4.1),
+                     "z0": (5.1, 5.2 ,5.3), "z1": (6.1, 6.2 ,6.3),
+                     "z2": (7.1, 7.2 ,7.3)}))):
+            with self.subTest(xdata=xdata, ydata=ydata, zdata=zdata):
+                self.assertTrue(
+                    build_data_frame(xdata, ydata, zdata).equals(expected))
+
     def test_partial_correlation_matrix(self):
         """
         Test that `partial_correlation_matrix` computes the appropriate
-- 
cgit v1.2.3