From 27cca4c118cba6a5f8e8b03d152070f83a44a9e5 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 18 Oct 2021 05:47:45 +0300 Subject: Migrate `export_informative` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/partial_correlations.py: Implement a mostly bug-compatible `export_informative` function as part of migrating the partial correlations feature over to GN3 from GN1 * tests/unit/test_partial_correlations.py: Implement tests to ensure the code works in a similar manner to the one in GN1. --- gn3/partial_correlations.py | 32 ++++++++++++ tests/unit/test_partial_correlations.py | 92 +++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 gn3/partial_correlations.py create mode 100644 tests/unit/test_partial_correlations.py diff --git a/gn3/partial_correlations.py b/gn3/partial_correlations.py new file mode 100644 index 0000000..8c37886 --- /dev/null +++ b/gn3/partial_correlations.py @@ -0,0 +1,32 @@ +""" +This module deals with partial correlations. + +It is an attempt to migrate over the partial correlations feature from +GeneNetwork1. +""" + +from functools import reduce + +def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: + """ + Export informative strain + + This is a migration of the `exportInformative` function in + web/webqtl/base/webqtlTrait.py module in GeneNetwork1. + + There is a chance that the original implementation has a bug, especially + dealing with the `inc_var` value. It the `inc_var` value is meant to control + the inclusion of the `variance` value, then the current implementation, and + that one in GN1 have a bug. 
+ """ + def __exporter__(acc, data_item): + if not inc_var or data_item["variance"] is not None: + return ( + acc[0] + (data_item["sample_name"],), + acc[1] + (data_item["value"],), + acc[2] + (data_item["variance"],)) + return acc + return reduce( + __exporter__, + filter(lambda td: td["value"] is not None, trait_data["data"].values()), + (tuple(), tuple(), tuple())) diff --git a/tests/unit/test_partial_correlations.py b/tests/unit/test_partial_correlations.py new file mode 100644 index 0000000..6eea078 --- /dev/null +++ b/tests/unit/test_partial_correlations.py @@ -0,0 +1,92 @@ +"""Module contains tests for gn3.partial_correlations""" + +from unittest import TestCase +from gn3.partial_correlations import export_informative + +class TestPartialCorrelations(TestCase): + """Class for testing partial correlations computation functions""" + + def test_export_informative(self): + """Test that the function exports appropriate data.""" + for trait_data, inc_var, expected in [ + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), + (None, None, None, None))], + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": None, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample4"), (9, 8, 6), + (None, None, None))], + [{"data": { + "sample1": { + "sample_name": "sample1", 
"value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, True, (tuple(), tuple(), tuple())], + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": 0.657, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), + (None, 0.657, None, None))]]: + with self.subTest(trait_data=trait_data): + self.assertEqual( + export_informative(trait_data, inc_var), expected) -- cgit v1.2.3 From 157df453cdb84591cb44af9f1d2677cd0b2c0380 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 18 Oct 2021 12:17:11 +0300 Subject: Move 'export_trait_data' to 'gn3.db.traits' module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/db/traits.py: Move function `export_trait_data` here * gn3/heatmaps.py: Remove function `export_trait_data` * tests/unit/db/test_traits.py: Move function `export_trait_data` tests here * tests/unit/test_heatmaps.py: Remove function `export_trait_data` here Function `export_trait_data` more closely corresponds to the traits and is used in more than just the `gn3.heatmaps` module. This commit moves the relevant code over to the `gn3.db.traits` module and also moves the tests to the corresponding tests modules. 
--- gn3/db/traits.py | 69 ++++++++++++++++++++++++++++++++++ gn3/heatmaps.py | 67 +-------------------------------- tests/unit/db/test_traits.py | 89 ++++++++++++++++++++++++++++++++++++++++++++ tests/unit/test_heatmaps.py | 87 ------------------------------------------- 4 files changed, 159 insertions(+), 153 deletions(-) diff --git a/gn3/db/traits.py b/gn3/db/traits.py index f2673c8..1e29aff 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,12 +1,81 @@ """This class contains functions relating to trait data manipulation""" import os +from functools import reduce from typing import Any, Dict, Union, Sequence + from gn3.settings import TMPDIR from gn3.random import random_string from gn3.function_helpers import compose from gn3.db.datasets import retrieve_trait_dataset +def export_trait_data( + trait_data: dict, samplelist: Sequence[str], dtype: str = "val", + var_exists: bool = False, n_exists: bool = False): + """ + Export data according to `samplelist`. Mostly used in calculating + correlations. + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 + + PARAMETERS + trait: (dict) + The dictionary of key-value pairs representing a trait + samplelist: (list) + A list of sample names + dtype: (str) + ... verify what this is ... 
+ var_exists: (bool) + A flag indicating existence of variance + n_exists: (bool) + A flag indicating existence of ndata + """ + def __export_all_types(tdata, sample): + sample_data = [] + if tdata[sample]["value"]: + sample_data.append(tdata[sample]["value"]) + if var_exists: + if tdata[sample]["variance"]: + sample_data.append(tdata[sample]["variance"]) + else: + sample_data.append(None) + if n_exists: + if tdata[sample]["ndata"]: + sample_data.append(tdata[sample]["ndata"]) + else: + sample_data.append(None) + else: + if var_exists and n_exists: + sample_data += [None, None, None] + elif var_exists or n_exists: + sample_data += [None, None] + else: + sample_data.append(None) + + return tuple(sample_data) + + def __exporter(accumulator, sample): + # pylint: disable=[R0911] + if sample in trait_data["data"]: + if dtype == "val": + return accumulator + (trait_data["data"][sample]["value"], ) + if dtype == "var": + return accumulator + (trait_data["data"][sample]["variance"], ) + if dtype == "N": + return accumulator + (trait_data["data"][sample]["ndata"], ) + if dtype == "all": + return accumulator + __export_all_types(trait_data["data"], sample) + raise KeyError("Type `%s` is incorrect" % dtype) + if var_exists and n_exists: + return accumulator + (None, None, None) + if var_exists or n_exists: + return accumulator + (None, None) + return accumulator + (None,) + + return reduce(__exporter, samplelist, tuple()) + def get_trait_csv_sample_data(conn: Any, trait_name: int, phenotype_id: int): """Fetch a trait and return it as a csv string""" diff --git a/gn3/heatmaps.py b/gn3/heatmaps.py index adbfbc6..3b94e88 100644 --- a/gn3/heatmaps.py +++ b/gn3/heatmaps.py @@ -14,6 +14,7 @@ from plotly.subplots import make_subplots # type: ignore from gn3.settings import TMPDIR from gn3.random import random_string from gn3.computations.slink import slink +from gn3.db.traits import export_trait_data from gn3.computations.correlations2 import compute_correlation from 
gn3.db.genotypes import ( build_genotype_file, load_genotype_samples) @@ -26,72 +27,6 @@ from gn3.computations.qtlreaper import ( parse_reaper_main_results, organise_reaper_main_results) -def export_trait_data( - trait_data: dict, samplelist: Sequence[str], dtype: str = "val", - var_exists: bool = False, n_exists: bool = False): - """ - Export data according to `samplelist`. Mostly used in calculating - correlations. - - DESCRIPTION: - Migrated from - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 - - PARAMETERS - trait: (dict) - The dictionary of key-value pairs representing a trait - samplelist: (list) - A list of sample names - dtype: (str) - ... verify what this is ... - var_exists: (bool) - A flag indicating existence of variance - n_exists: (bool) - A flag indicating existence of ndata - """ - def __export_all_types(tdata, sample): - sample_data = [] - if tdata[sample]["value"]: - sample_data.append(tdata[sample]["value"]) - if var_exists: - if tdata[sample]["variance"]: - sample_data.append(tdata[sample]["variance"]) - else: - sample_data.append(None) - if n_exists: - if tdata[sample]["ndata"]: - sample_data.append(tdata[sample]["ndata"]) - else: - sample_data.append(None) - else: - if var_exists and n_exists: - sample_data += [None, None, None] - elif var_exists or n_exists: - sample_data += [None, None] - else: - sample_data.append(None) - - return tuple(sample_data) - - def __exporter(accumulator, sample): - # pylint: disable=[R0911] - if sample in trait_data["data"]: - if dtype == "val": - return accumulator + (trait_data["data"][sample]["value"], ) - if dtype == "var": - return accumulator + (trait_data["data"][sample]["variance"], ) - if dtype == "N": - return accumulator + (trait_data["data"][sample]["ndata"], ) - if dtype == "all": - return accumulator + __export_all_types(trait_data["data"], sample) - raise KeyError("Type `%s` is incorrect" % dtype) - if var_exists and n_exists: - return accumulator 
+ (None, None, None) - if var_exists or n_exists: - return accumulator + (None, None) - return accumulator + (None,) - - return reduce(__exporter, samplelist, tuple()) def trait_display_name(trait: Dict): """ diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 8af8e82..0c4ef78 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -2,6 +2,7 @@ from unittest import mock, TestCase from gn3.db.traits import ( build_trait_name, + export_trait_data, set_haveinfo_field, update_sample_data, retrieve_trait_info, @@ -12,6 +13,38 @@ from gn3.db.traits import ( retrieve_publish_trait_info, retrieve_probeset_trait_info) +samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +trait_data = { + "mysqlid": 36688172, + "data": { + "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, + "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, + "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, + "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, + "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, + "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, + "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, + "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, + "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, + "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, + "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": 
None, "ndata": None}, + "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, + "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, + "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, + "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, + "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, + "BXH4": {"sample_name": "BXH4", "value": 7.96336, "variance": None, "ndata": None}, + "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, + "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, + "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, + "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, + "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, + "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, + "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} + class TestTraitsDBFunctions(TestCase): "Test cases for traits functions" @@ -226,3 +259,59 @@ class TestTraitsDBFunctions(TestCase): with self.subTest(trait_info=trait_info, expected=expected): self.assertEqual( set_confidential_field(trait_type, trait_info), expected) + + def test_export_trait_data_dtype(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument + """ + for dtype, expected in [ + ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", (None, None, None, None, None, None)], + ["N", (None, None, None, None, None, None)], + ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: + with self.subTest(dtype=dtype): + self.assertEqual( + export_trait_data(trait_data, samplelist, dtype=dtype), + expected) + + def 
test_export_trait_data_dtype_all_flags(self): + """ + Test `export_trait_data` with different values for the `dtype` keyword + argument and the different flags set up + """ + for dtype, vflag, nflag, expected in [ + ["val", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", False, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["val", True, True, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["var", False, False, (None, None, None, None, None, None)], + ["var", False, True, (None, None, None, None, None, None)], + ["var", True, False, (None, None, None, None, None, None)], + ["var", True, True, (None, None, None, None, None, None)], + ["N", False, False, (None, None, None, None, None, None)], + ["N", False, True, (None, None, None, None, None, None)], + ["N", True, False, (None, None, None, None, None, None)], + ["N", True, True, (None, None, None, None, None, None)], + ["all", False, False, + (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], + ["all", False, True, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, False, + (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, + 8.30401, None, 7.80944, None)], + ["all", True, True, + (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, + 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] + ]: + with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): + self.assertEqual( + export_trait_data( + trait_data, samplelist, dtype=dtype, var_exists=vflag, + n_exists=nflag), + expected) diff --git a/tests/unit/test_heatmaps.py b/tests/unit/test_heatmaps.py index 7b66688..03fd4a6 100644 --- a/tests/unit/test_heatmaps.py +++ b/tests/unit/test_heatmaps.py @@ -4,43 +4,12 @@ from gn3.heatmaps import ( cluster_traits, get_loci_names, get_lrs_from_chr, - export_trait_data, 
compute_traits_order, retrieve_samples_and_values, process_traits_data_for_heatmap) from tests.unit.sample_test_data import organised_trait_1, organised_trait_2 samplelist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] -trait_data = { - "mysqlid": 36688172, - "data": { - "B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None, "ndata": None}, - "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None, "ndata": None}, - "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None, "ndata": None}, - "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None, "ndata": None}, - "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None, "ndata": None}, - "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}, - "BXD21": {"sample_name": "BXD21", "value": 8.93809, "variance": None, "ndata": None}, - "BXD24": {"sample_name": "BXD24", "value": 7.99415, "variance": None, "ndata": None}, - "BXD27": {"sample_name": "BXD27", "value": 8.12177, "variance": None, "ndata": None}, - "BXD28": {"sample_name": "BXD28", "value": 7.67688, "variance": None, "ndata": None}, - "BXD32": {"sample_name": "BXD32", "value": 7.79062, "variance": None, "ndata": None}, - "BXD39": {"sample_name": "BXD39", "value": 8.27641, "variance": None, "ndata": None}, - "BXD40": {"sample_name": "BXD40", "value": 8.18012, "variance": None, "ndata": None}, - "BXD42": {"sample_name": "BXD42", "value": 7.82433, "variance": None, "ndata": None}, - "BXD6": {"sample_name": "BXD6", "value": 8.09718, "variance": None, "ndata": None}, - "BXH14": {"sample_name": "BXH14", "value": 7.97475, "variance": None, "ndata": None}, - "BXH19": {"sample_name": "BXH19", "value": 7.67223, "variance": None, "ndata": None}, - "BXH2": {"sample_name": "BXH2", "value": 7.93622, "variance": None, "ndata": None}, - "BXH22": {"sample_name": "BXH22", "value": 7.43692, "variance": None, "ndata": None}, - "BXH4": {"sample_name": "BXH4", "value": 7.96336, 
"variance": None, "ndata": None}, - "BXH6": {"sample_name": "BXH6", "value": 7.75132, "variance": None, "ndata": None}, - "BXH7": {"sample_name": "BXH7", "value": 8.12927, "variance": None, "ndata": None}, - "BXH8": {"sample_name": "BXH8", "value": 6.77338, "variance": None, "ndata": None}, - "BXH9": {"sample_name": "BXH9", "value": 8.03836, "variance": None, "ndata": None}, - "C3H/HeJ": {"sample_name": "C3H/HeJ", "value": 7.42795, "variance": None, "ndata": None}, - "C57BL/6J": {"sample_name": "C57BL/6J", "value": 7.50606, "variance": None, "ndata": None}, - "DBA/2J": {"sample_name": "DBA/2J", "value": 7.72588, "variance": None, "ndata": None}}} slinked = ( (((0, 2, 0.16381088984330505), @@ -55,62 +24,6 @@ slinked = ( class TestHeatmap(TestCase): """Class for testing heatmap computation functions""" - def test_export_trait_data_dtype(self): - """ - Test `export_trait_data` with different values for the `dtype` keyword - argument - """ - for dtype, expected in [ - ["val", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["var", (None, None, None, None, None, None)], - ["N", (None, None, None, None, None, None)], - ["all", (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)]]: - with self.subTest(dtype=dtype): - self.assertEqual( - export_trait_data(trait_data, samplelist, dtype=dtype), - expected) - - def test_export_trait_data_dtype_all_flags(self): - """ - Test `export_trait_data` with different values for the `dtype` keyword - argument and the different flags set up - """ - for dtype, vflag, nflag, expected in [ - ["val", False, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", False, True, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["val", True, True, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["var", False, False, (None, None, None, None, None, None)], - ["var", False, True, (None, None, None, None, None, 
None)], - ["var", True, False, (None, None, None, None, None, None)], - ["var", True, True, (None, None, None, None, None, None)], - ["N", False, False, (None, None, None, None, None, None)], - ["N", False, True, (None, None, None, None, None, None)], - ["N", True, False, (None, None, None, None, None, None)], - ["N", True, True, (None, None, None, None, None, None)], - ["all", False, False, - (7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944)], - ["all", False, True, - (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, - 8.30401, None, 7.80944, None)], - ["all", True, False, - (7.51879, None, 7.77141, None, 8.39265, None, 8.17443, None, - 8.30401, None, 7.80944, None)], - ["all", True, True, - (7.51879, None, None, 7.77141, None, None, 8.39265, None, None, - 8.17443, None, None, 8.30401, None, None, 7.80944, None, None)] - ]: - with self.subTest(dtype=dtype, vflag=vflag, nflag=nflag): - self.assertEqual( - export_trait_data( - trait_data, samplelist, dtype=dtype, var_exists=vflag, - n_exists=nflag), - expected) - def test_cluster_traits(self): """ Test that the clustering is working as expected. -- cgit v1.2.3 From 94ca79045baf978d6aab964c7c70b84911c1124f Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 18 Oct 2021 12:27:32 +0300 Subject: Move `export_informative` function to `gn3.db.traits` module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/db/traits.py: Move `export_informative` function here * gn3/partial_correlations.py: Remove `export_informative` function * tests/unit/db/test_traits.py: Move `export_informative` function tests here * tests/unit/test_partial_correlations.py: Remove `export_informative` function tests The `export_informative` function relates more to the traits than to the partial correlations, and could find use in more than just the partial correlations stuff. 
This commit moves the function to the more traits-specific `gn3.db.traits` module. --- gn3/db/traits.py | 24 +++++++++ gn3/partial_correlations.py | 24 --------- tests/unit/db/test_traits.py | 86 ++++++++++++++++++++++++++++++++ tests/unit/test_partial_correlations.py | 87 +-------------------------------- 4 files changed, 111 insertions(+), 110 deletions(-) diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 1e29aff..1c6aaa7 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -743,3 +743,27 @@ def generate_traits_filename(base_path: str = TMPDIR): """Generate a unique filename for use with generated traits files.""" return "{}/traits_test_file_{}.txt".format( os.path.abspath(base_path), random_string(10)) + +def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: + """ + Export informative strain + + This is a migration of the `exportInformative` function in + web/webqtl/base/webqtlTrait.py module in GeneNetwork1. + + There is a chance that the original implementation has a bug, especially + dealing with the `inc_var` value. It the `inc_var` value is meant to control + the inclusion of the `variance` value, then the current implementation, and + that one in GN1 have a bug. + """ + def __exporter__(acc, data_item): + if not inc_var or data_item["variance"] is not None: + return ( + acc[0] + (data_item["sample_name"],), + acc[1] + (data_item["value"],), + acc[2] + (data_item["variance"],)) + return acc + return reduce( + __exporter__, + filter(lambda td: td["value"] is not None, trait_data["data"].values()), + (tuple(), tuple(), tuple())) diff --git a/gn3/partial_correlations.py b/gn3/partial_correlations.py index 8c37886..df390ed 100644 --- a/gn3/partial_correlations.py +++ b/gn3/partial_correlations.py @@ -6,27 +6,3 @@ GeneNetwork1. 
""" from functools import reduce - -def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: - """ - Export informative strain - - This is a migration of the `exportInformative` function in - web/webqtl/base/webqtlTrait.py module in GeneNetwork1. - - There is a chance that the original implementation has a bug, especially - dealing with the `inc_var` value. It the `inc_var` value is meant to control - the inclusion of the `variance` value, then the current implementation, and - that one in GN1 have a bug. - """ - def __exporter__(acc, data_item): - if not inc_var or data_item["variance"] is not None: - return ( - acc[0] + (data_item["sample_name"],), - acc[1] + (data_item["value"],), - acc[2] + (data_item["variance"],)) - return acc - return reduce( - __exporter__, - filter(lambda td: td["value"] is not None, trait_data["data"].values()), - (tuple(), tuple(), tuple())) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 0c4ef78..67f0c6f 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -3,6 +3,7 @@ from unittest import mock, TestCase from gn3.db.traits import ( build_trait_name, export_trait_data, + export_informative, set_haveinfo_field, update_sample_data, retrieve_trait_info, @@ -315,3 +316,88 @@ class TestTraitsDBFunctions(TestCase): trait_data, samplelist, dtype=dtype, var_exists=vflag, n_exists=nflag), expected) + + def test_export_informative(self): + """Test that the function exports appropriate data.""" + for trait_data, inc_var, expected in [ + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample3", "sample4"), (9, 8, 
7, 6), + (None, None, None, None))], + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": None, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample4"), (9, 8, 6), + (None, None, None))], + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, True, (tuple(), tuple(), tuple())], + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": 0.657, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), + (None, 0.657, None, None))]]: + with self.subTest(trait_data=trait_data): + self.assertEqual( + export_informative(trait_data, inc_var), expected) diff --git a/tests/unit/test_partial_correlations.py b/tests/unit/test_partial_correlations.py index 6eea078..f204d4f 100644 --- a/tests/unit/test_partial_correlations.py +++ b/tests/unit/test_partial_correlations.py @@ -1,92 +1,7 @@ """Module contains tests for gn3.partial_correlations""" from unittest import TestCase -from gn3.partial_correlations import export_informative + class TestPartialCorrelations(TestCase): """Class for testing partial correlations 
computation functions""" - - def test_export_informative(self): - """Test that the function exports appropriate data.""" - for trait_data, inc_var, expected in [ - [{"data": { - "sample1": { - "sample_name": "sample1", "value": 9, "variance": None, - "ndata": 13 - }, - "sample2": { - "sample_name": "sample2", "value": 8, "variance": None, - "ndata": 13 - }, - "sample3": { - "sample_name": "sample3", "value": 7, "variance": None, - "ndata": 13 - }, - "sample4": { - "sample_name": "sample4", "value": 6, "variance": None, - "ndata": 13 - }, - }}, 0, ( - ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), - (None, None, None, None))], - [{"data": { - "sample1": { - "sample_name": "sample1", "value": 9, "variance": None, - "ndata": 13 - }, - "sample2": { - "sample_name": "sample2", "value": 8, "variance": None, - "ndata": 13 - }, - "sample3": { - "sample_name": "sample3", "value": None, "variance": None, - "ndata": 13 - }, - "sample4": { - "sample_name": "sample4", "value": 6, "variance": None, - "ndata": 13 - }, - }}, 0, ( - ("sample1", "sample2", "sample4"), (9, 8, 6), - (None, None, None))], - [{"data": { - "sample1": { - "sample_name": "sample1", "value": 9, "variance": None, - "ndata": 13 - }, - "sample2": { - "sample_name": "sample2", "value": 8, "variance": None, - "ndata": 13 - }, - "sample3": { - "sample_name": "sample3", "value": 7, "variance": None, - "ndata": 13 - }, - "sample4": { - "sample_name": "sample4", "value": 6, "variance": None, - "ndata": 13 - }, - }}, True, (tuple(), tuple(), tuple())], - [{"data": { - "sample1": { - "sample_name": "sample1", "value": 9, "variance": None, - "ndata": 13 - }, - "sample2": { - "sample_name": "sample2", "value": 8, "variance": 0.657, - "ndata": 13 - }, - "sample3": { - "sample_name": "sample3", "value": 7, "variance": None, - "ndata": 13 - }, - "sample4": { - "sample_name": "sample4", "value": 6, "variance": None, - "ndata": 13 - }, - }}, 0, ( - ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), - 
(None, 0.657, None, None))]]: - with self.subTest(trait_data=trait_data): - self.assertEqual( - export_informative(trait_data, inc_var), expected) -- cgit v1.2.3 From 1544776b072d7240773cf14d423078841e4c1a07 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 18 Oct 2021 14:14:04 +0300 Subject: Implement `control_samples` function as is in GN1 Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/partial_correlations.py: Implement `control_samples` function * tests/unit/test_partial_correlations.py: add tests for `control_samples` function Implement the function `control_samples` and make it mostly bug-compatible with the `web/webqtl/correlation/correlationFunction.controlStrain` function in GN1. This implementation in GN3 does not do any calls to the database. It will rely on other functions to provide the data from the database to it. --- gn3/partial_correlations.py | 38 ++++++++++++++++ tests/unit/test_partial_correlations.py | 80 +++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/gn3/partial_correlations.py b/gn3/partial_correlations.py index df390ed..99521c6 100644 --- a/gn3/partial_correlations.py +++ b/gn3/partial_correlations.py @@ -5,4 +5,42 @@ It is an attempt to migrate over the partial correlations feature from GeneNetwork1. """ +from typing import Sequence from functools import reduce + +def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): + """ + Fetches data for the control traits. + + This migrates `web/webqtl/correlation/correlationFunction.controlStrain` in + GN1, with a few modifications to the arguments passed in. + + PARAMETERS: + controls: A map of sample names to trait data. Equivalent to the `cvals` + value in the corresponding source function in GN1. + sampleslist: A list of samples. 
Equivalent to `strainlst` in the + corresponding source function in GN1 + """ + def __process_control__(trait_data): + def __process_sample__(acc, sample): + if sample in trait_data["data"].keys(): + sample_item = trait_data["data"][sample] + val = sample_item["value"] + if val is not None: + return ( + acc[0] + (sample,), + acc[1] + (val,), + acc[2] + (sample_item["variance"],)) + return acc + return reduce( + __process_sample__, sampleslist, (tuple(), tuple(), tuple())) + + return reduce( + lambda acc, item: ( + acc[0] + (item[0],), + acc[1] + (item[1],), + acc[2] + (item[2],), + acc[3] + (len(item[0]),), + ), + [__process_control__(trait_data) for trait_data in controls], + (tuple(), tuple(), tuple(), tuple())) diff --git a/tests/unit/test_partial_correlations.py b/tests/unit/test_partial_correlations.py index f204d4f..0083ef7 100644 --- a/tests/unit/test_partial_correlations.py +++ b/tests/unit/test_partial_correlations.py @@ -1,7 +1,87 @@ """Module contains tests for gn3.partial_correlations""" from unittest import TestCase +from gn3.partial_correlations import control_samples +sampleslist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] +control_traits = ( + { + "mysqlid": 36688172, + "data": { + "B6cC3-1": { + "sample_name": "B6cC3-1", "value": 7.51879, "variance": None, + "ndata": None}, + "BXD1": { + "sample_name": "BXD1", "value": 7.77141, "variance": None, + "ndata": None}, + "BXD12": { + "sample_name": "BXD12", "value": 8.39265, "variance": None, + "ndata": None}, + "BXD16": { + "sample_name": "BXD16", "value": 8.17443, "variance": None, + "ndata": None}, + "BXD19": { + "sample_name": "BXD19", "value": 8.30401, "variance": None, + "ndata": None}, + "BXD2": { + "sample_name": "BXD2", "value": 7.80944, "variance": None, + "ndata": None}}}, + { + "mysqlid": 36688172, + "data": { + "B6cC3-21": { + "sample_name": "B6cC3-1", "value": 7.51879, "variance": None, + "ndata": None}, + "BXD21": { + "sample_name": "BXD1", "value": 7.77141, "variance": None, 
+ "ndata": None}, + "BXD12": { + "sample_name": "BXD12", "value": 8.39265, "variance": None, + "ndata": None}, + "BXD16": { + "sample_name": "BXD16", "value": 8.17443, "variance": None, + "ndata": None}, + "BXD19": { + "sample_name": "BXD19", "value": 8.30401, "variance": None, + "ndata": None}, + "BXD2": { + "sample_name": "BXD2", "value": 7.80944, "variance": None, + "ndata": None}}}, + { + "mysqlid": 36688172, + "data": { + "B6cC3-1": { + "sample_name": "B6cC3-1", "value": 7.51879, "variance": None, + "ndata": None}, + "BXD1": { + "sample_name": "BXD1", "value": 7.77141, "variance": None, + "ndata": None}, + "BXD12": { + "sample_name": "BXD12", "value": None, "variance": None, + "ndata": None}, + "BXD16": { + "sample_name": "BXD16", "value": None, "variance": None, + "ndata": None}, + "BXD19": { + "sample_name": "BXD19", "value": None, "variance": None, + "ndata": None}, + "BXD2": { + "sample_name": "BXD2", "value": 7.80944, "variance": None, + "ndata": None}}}) class TestPartialCorrelations(TestCase): """Class for testing partial correlations computation functions""" + + def test_control_samples(self): + """Test that the control_samples works as expected.""" + self.assertEqual( + control_samples(control_traits, sampleslist), + ((("B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"), + ("BXD12", "BXD16", "BXD19", "BXD2"), + ("B6cC3-1", "BXD1", "BXD2")), + ((7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), + (8.39265, 8.17443, 8.30401, 7.80944), + (7.51879, 7.77141, 7.80944)), + ((None, None, None, None, None, None), (None, None, None, None), + (None, None, None)), + (6, 4, 3))) -- cgit v1.2.3 From c5355c5db72fdec9e7e360ceec19d5d50d15ce00 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 18 Oct 2021 14:31:51 +0300 Subject: Disable pylint issue * Disable minor pylint issue. 
--- tests/unit/db/test_traits.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 67f0c6f..4aa9389 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -319,6 +319,7 @@ class TestTraitsDBFunctions(TestCase): def test_export_informative(self): """Test that the function exports appropriate data.""" + # pylint: disable=W0621 for trait_data, inc_var, expected in [ [{"data": { "sample1": { -- cgit v1.2.3 From 3304fa682924b8f6bff5126ecf2fb58f4201b968 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 19 Oct 2021 09:16:38 +0300 Subject: Implement `dictify_by_samples` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/partial_correlations.py: implement `dictify_by_samples` function * tests/unit/test_partial_correlations.py: implement tests for `dictify_by_samples` function Implement the `dictify_by_samples` function as a partial migration of the `web.webqtl.correlation.correlationFunction.fixStrains` function from GN1. --- gn3/partial_correlations.py | 16 +++++++++++++ tests/unit/test_partial_correlations.py | 42 ++++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/gn3/partial_correlations.py b/gn3/partial_correlations.py index 99521c6..4db4807 100644 --- a/gn3/partial_correlations.py +++ b/gn3/partial_correlations.py @@ -44,3 +44,19 @@ def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): ), [__process_control__(trait_data) for trait_data in controls], (tuple(), tuple(), tuple(), tuple())) + +def dictify_by_samples(samples_vals_vars: Sequence[Sequence]) -> dict: + """ + Build a sequence of dictionaries from a sequence of separate sequences of + samples, values and variances. + + This is a partial migration of + `web.webqtl.correlation.correlationFunction.fixStrains` function in GN1. 
+ This implementation extracts code that will find common use, and that will + find use in more than one place. + """ + return tuple( + { + sample: {"sample_name": sample, "value": val, "variance": var} + for sample, val, var in zip(*trait_line) + } for trait_line in zip(*(samples_vals_vars[0:3]))) diff --git a/tests/unit/test_partial_correlations.py b/tests/unit/test_partial_correlations.py index 0083ef7..6302f74 100644 --- a/tests/unit/test_partial_correlations.py +++ b/tests/unit/test_partial_correlations.py @@ -1,7 +1,7 @@ """Module contains tests for gn3.partial_correlations""" from unittest import TestCase -from gn3.partial_correlations import control_samples +from gn3.partial_correlations import control_samples, dictify_by_samples sampleslist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] control_traits = ( @@ -85,3 +85,43 @@ class TestPartialCorrelations(TestCase): ((None, None, None, None, None, None), (None, None, None, None), (None, None, None)), (6, 4, 3))) + + def test_dictify_by_samples(self): + """ + Given: + a sequence of sequences with sample names, values and variances, as + in the output of `gn3.partial_correlations.control_samples` or + the output of `gn3.db.traits.export_informative` + When: + the sequence is passed as an argument into the + `gn3.partial_correlations.dictify_by_sample` + Then: + return a sequence of dicts with keys being the values of the sample + names, and each of who's values being sub-dicts with the keys + 'sample_name', 'value' and 'variance' whose values correspond to the + values passed in. 
+ """ + self.assertEqual( + dictify_by_samples( + ((("B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"), + ("BXD12", "BXD16", "BXD19", "BXD2"), + ("B6cC3-1", "BXD1", "BXD2")), + ((7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944), + (8.39265, 8.17443, 8.30401, 7.80944), + (7.51879, 7.77141, 7.80944)), + ((None, None, None, None, None, None), (None, None, None, None), + (None, None, None)), + (6, 4, 3))), + ({"B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None}, + "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None}, + "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}}, + {"BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None}, + "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}}, + {"B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}})) -- cgit v1.2.3 From efb9896464f969de4fe8fcaee21a19ac1d881fa2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 19 Oct 2021 10:31:24 +0300 Subject: Implement remaining `fix_samples` functionality Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/partial_correlations.py: implement `fix_samples` function * tests/unit/test_partial_correlations.py: implement tests for `fix_samples` function Implement the remaining partial migration for the `web.webqtl.correlation.correlationFunction.fixStrain` function in GN1. 
--- gn3/partial_correlations.py | 30 +++++++++++++++++-- tests/unit/test_partial_correlations.py | 52 ++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/gn3/partial_correlations.py b/gn3/partial_correlations.py index 4db4807..c556d10 100644 --- a/gn3/partial_correlations.py +++ b/gn3/partial_correlations.py @@ -5,8 +5,8 @@ It is an attempt to migrate over the partial correlations feature from GeneNetwork1. """ -from typing import Sequence from functools import reduce +from typing import Any, Sequence def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): """ @@ -45,7 +45,7 @@ def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): [__process_control__(trait_data) for trait_data in controls], (tuple(), tuple(), tuple(), tuple())) -def dictify_by_samples(samples_vals_vars: Sequence[Sequence]) -> dict: +def dictify_by_samples(samples_vals_vars: Sequence[Sequence]) -> Sequence[dict]: """ Build a sequence of dictionaries from a sequence of separate sequences of samples, values and variances. @@ -60,3 +60,29 @@ def dictify_by_samples(samples_vals_vars: Sequence[Sequence]) -> dict: sample: {"sample_name": sample, "value": val, "variance": var} for sample, val, var in zip(*trait_line) } for trait_line in zip(*(samples_vals_vars[0:3]))) + +def fix_samples(primary_trait: dict, control_traits: Sequence[dict]) -> Sequence[Sequence[Any]]: + """ + Corrects sample_names, values and variance such that they all contain only + those samples that are common to the reference trait and all control traits. + + This is a partial migration of the + `web.webqtl.correlation.correlationFunction.fixStrain` function in GN1. 
+ """ + primary_samples = tuple( + present[0] for present in + ((sample, all(sample in control.keys() for control in control_traits)) + for sample in primary_trait.keys()) + if present[1]) + control_vals_vars: tuple = reduce( + lambda acc, x: (acc[0] + (x[0],), acc[1] + (x[1],)), + ((item["value"], item["variance"]) + for sublist in [tuple(control.values()) for control in control_traits] + for item in sublist), + (tuple(), tuple())) + return ( + primary_samples, + tuple(primary_trait[sample]["value"] for sample in primary_samples), + control_vals_vars[0], + tuple(primary_trait[sample]["variance"] for sample in primary_samples), + control_vals_vars[1]) diff --git a/tests/unit/test_partial_correlations.py b/tests/unit/test_partial_correlations.py index 6302f74..7631a71 100644 --- a/tests/unit/test_partial_correlations.py +++ b/tests/unit/test_partial_correlations.py @@ -1,7 +1,10 @@ """Module contains tests for gn3.partial_correlations""" from unittest import TestCase -from gn3.partial_correlations import control_samples, dictify_by_samples +from gn3.partial_correlations import ( + fix_samples, + control_samples, + dictify_by_samples) sampleslist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] control_traits = ( @@ -69,6 +72,21 @@ control_traits = ( "sample_name": "BXD2", "value": 7.80944, "variance": None, "ndata": None}}}) +dictified_control_samples = ( + {"B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None}, + "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None}, + "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}}, + {"BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None}, + "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None}, + "BXD19": {"sample_name": 
"BXD19", "value": 8.30401, "variance": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}}, + {"B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}}) + class TestPartialCorrelations(TestCase): """Class for testing partial correlations computation functions""" @@ -112,16 +130,22 @@ class TestPartialCorrelations(TestCase): ((None, None, None, None, None, None), (None, None, None, None), (None, None, None)), (6, 4, 3))), - ({"B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None}, - "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None}, - "BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None}, - "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None}, - "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None}, - "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}}, - {"BXD12": {"sample_name": "BXD12", "value": 8.39265, "variance": None}, - "BXD16": {"sample_name": "BXD16", "value": 8.17443, "variance": None}, - "BXD19": {"sample_name": "BXD19", "value": 8.30401, "variance": None}, - "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}}, - {"B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, "variance": None}, - "BXD1": {"sample_name": "BXD1", "value": 7.77141, "variance": None}, - "BXD2": {"sample_name": "BXD2", "value": 7.80944, "variance": None}})) + dictified_control_samples) + + def test_fix_samples(self): + """Test that fix_samples fixes the values""" + self.assertEqual( + fix_samples( + {"B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, + "variance": None}, + "BXD1": {"sample_name": "BXD1", "value": 7.77141, + "variance": None}, + "BXD2": {"sample_name": "BXD2", "value": 7.80944, + "variance": None}}, + dictified_control_samples), + (("BXD2",), (7.80944,), + 
(7.51879, 7.77141, 8.39265, 8.17443, 8.30401, 7.80944, 8.39265, + 8.17443, 8.30401, 7.80944, 7.51879, 7.77141, 7.80944), + (None,), + (None, None, None, None, None, None, None, None, None, None, None, + None, None))) -- cgit v1.2.3 From 6818670686de86c86b6c1aa372135ab6c22af156 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 21 Oct 2021 07:11:41 +0300 Subject: Document tests better Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Document the issues better to help with understanding what each test checks for. --- tests/unit/test_partial_correlations.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_partial_correlations.py b/tests/unit/test_partial_correlations.py index 7631a71..c591c8f 100644 --- a/tests/unit/test_partial_correlations.py +++ b/tests/unit/test_partial_correlations.py @@ -106,6 +106,8 @@ class TestPartialCorrelations(TestCase): def test_dictify_by_samples(self): """ + Test that `dictify_by_samples` generates the appropriate dict + Given: a sequence of sequences with sample names, values and variances, as in the output of `gn3.partial_correlations.control_samples` or @@ -133,7 +135,34 @@ class TestPartialCorrelations(TestCase): dictified_control_samples) def test_fix_samples(self): - """Test that fix_samples fixes the values""" + """ + Test that `fix_samples` returns only the common samples + + Given: + - A primary trait + - A sequence of control samples + When: + - The two arguments are passed to `fix_samples` + Then: + - Only the names of the samples present in the primary trait that + are also present in ALL the control traits are present in the + return value + - Only the values of the samples present in the primary trait that + are also present in ALL the control traits are present in the + return value + - ALL the values for ALL the control traits are present in the + return value + - 
Only the variances of the samples present in the primary trait + that are also present in ALL the control traits are present in the + return value + - ALL the variances for ALL the control traits are present in the + return value + - The return value is a tuple of the above items, in the following + order: + ((sample_names, ...), (primary_trait_values, ...), + (control_traits_values, ...), (primary_trait_variances, ...) + (control_traits_variances, ...)) + """ self.assertEqual( fix_samples( {"B6cC3-1": {"sample_name": "B6cC3-1", "value": 7.51879, -- cgit v1.2.3 From cad4649d19001f62ef592dedf09f3ac53744962a Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 21 Oct 2021 09:00:16 +0300 Subject: Implement `find_identical_traits` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/partial_correlations.py: implement function `find_identical_traits` * tests/unit/test_partial_correlations.py: implement tests for function `find_identical_traits` Migrate `web.webqtl.correlation.correlationFunction.findIdenticalTraits` function in GN1 to here, adding in tests to ensure the migration works in a bug-compatible version with the original. --- gn3/partial_correlations.py | 38 ++++++++++++++++++++++++++++++++- tests/unit/test_partial_correlations.py | 33 +++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/gn3/partial_correlations.py b/gn3/partial_correlations.py index c556d10..1fb0ccc 100644 --- a/gn3/partial_correlations.py +++ b/gn3/partial_correlations.py @@ -6,7 +6,7 @@ GeneNetwork1. 
""" from functools import reduce -from typing import Any, Sequence +from typing import Any, Tuple, Sequence def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): """ @@ -86,3 +86,39 @@ def fix_samples(primary_trait: dict, control_traits: Sequence[dict]) -> Sequence control_vals_vars[0], tuple(primary_trait[sample]["variance"] for sample in primary_samples), control_vals_vars[1]) + +def find_identical_traits( + primary_name: str, primary_value: float, control_names: Tuple[str, ...], + control_values: Tuple[float, ...]) -> Tuple[str, ...]: + """ + Find traits that have the same value when the values are considered to + 3 decimal places. + + This is a migration of the + `web.webqtl.correlation.correlationFunction.findIdenticalTraits` function in + GN1. + """ + def __merge_identicals__( + acc: Tuple[str, ...], + ident: Tuple[str, Tuple[str, ...]]) -> Tuple[str, ...]: + return acc + ident[1] + + def __dictify_controls__(acc, control_item): + ckey = "{:.3f}".format(control_item[0]) + return {**acc, ckey: acc.get(ckey, tuple()) + (control_item[1],)} + + return (reduce(## for identical control traits + __merge_identicals__, + (item for item in reduce(# type: ignore[var-annotated] + __dictify_controls__, zip(control_values, control_names), + {}).items() if len(item[1]) > 1), + tuple()) + or + reduce(## If no identical control traits, try primary and controls + __merge_identicals__, + (item for item in reduce(# type: ignore[var-annotated] + __dictify_controls__, + zip((primary_value,) + control_values, + (primary_name,) + control_names), {}).items() + if len(item[1]) > 1), + tuple())) diff --git a/tests/unit/test_partial_correlations.py b/tests/unit/test_partial_correlations.py index c591c8f..60e54c1 100644 --- a/tests/unit/test_partial_correlations.py +++ b/tests/unit/test_partial_correlations.py @@ -4,7 +4,8 @@ from unittest import TestCase from gn3.partial_correlations import ( fix_samples, control_samples, - dictify_by_samples) + dictify_by_samples, 
+ find_identical_traits) sampleslist = ["B6cC3-1", "BXD1", "BXD12", "BXD16", "BXD19", "BXD2"] control_traits = ( @@ -178,3 +179,33 @@ class TestPartialCorrelations(TestCase): (None,), (None, None, None, None, None, None, None, None, None, None, None, None, None))) + + def test_find_identical_traits(self): + """ + Test `gn3.partial_correlations.find_identical_traits`. + + Given: + - the name of a primary trait + - the value of a primary trait + - a sequence of names of control traits + - a sequence of values of control traits + When: + - the arguments above are passed to the `find_identical_traits` + function + Then: + - Return ALL trait names that have the same value when up to three + decimal places are considered + """ + for primn, primv, contn, contv, expected in ( + ("pt", 12.98395, ("ct0", "ct1", "ct2"), + (0.1234, 2.3456, 3.4567), tuple()), + ("pt", 12.98395, ("ct0", "ct1", "ct2"), + (12.98354, 2.3456, 3.4567), ("pt", "ct0")), + ("pt", 12.98395, ("ct0", "ct1", "ct2", "ct3"), + (0.1234, 2.3456, 0.1233, 4.5678), ("ct0", "ct2")) + ): + with self.subTest( + primary_name=primn, primary_value=primv, + control_names=contn, control_values=contv): + self.assertEqual( + find_identical_traits(primn, primv, contn, contv), expected) -- cgit v1.2.3 From 41936d0a486ef54bf4fc049c2b4d85dca43ab761 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 21 Oct 2021 09:36:36 +0300 Subject: Implement `translate_to_mouse_gene_id` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate the `web.webqtl.correlation/CorrelationPage.translateToMouseGeneID` function in GN1 to GN3. This is a function that retrieves data from the database, and therefore uses a system outside of our code, therefore, the function does not have a corresponding unit test. 
This kind of function will probably need to be tested at the integration or system tests level, where we test that our code interacts correcly with any and all external systems that it should. --- gn3/db/species.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/gn3/db/species.py b/gn3/db/species.py index 0deae4e..1e5015f 100644 --- a/gn3/db/species.py +++ b/gn3/db/species.py @@ -30,3 +30,34 @@ def get_chromosome(name: str, is_species: bool, conn: Any) -> Optional[Tuple]: with conn.cursor() as cursor: cursor.execute(_sql) return cursor.fetchall() + +def translate_to_mouse_gene_id(species: str, geneid: int, conn: Any) -> int: + """ + Translate rat or human geneid to mouse geneid + + This is a migration of the + `web.webqtl.correlation/CorrelationPage.translateToMouseGeneID` function in + GN1 + """ + assert species in ("rat", "mouse", "human"), "Invalid species" + if geneid is None: + return 0 + + if species == "mouse": + return geneid + + with conn.cursor as cursor: + if species == "rat": + cursor.execute( + "SELECT mouse FROM GeneIDXRef WHERE rat = %s", geneid) + rat_geneid = cursor.fetchone() + if rat_geneid: + return rat_geneid[0] + + cursor.execute( + "SELECT mouse FROM GeneIDXRef WHERE human = %s", geneid) + human_geneid = cursor.fetchone() + if human_geneid: + return human_geneid[0] + + return 0 # default if all else fails -- cgit v1.2.3 From df8185078a52c89cc5a75ff9be413a236da29a6e Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 09:31:58 +0300 Subject: Implement `get_filename` for correlations Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Implement `get_filename` for the correlations, to be used to determine whether to do fast or normal correlations. 
This is a migration of the `web.webqtl.correlation.CorrelationPage.getFileName` function in GN1 --- gn3/db/correlations.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 gn3/db/correlations.py diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py new file mode 100644 index 0000000..fa8e7ca --- /dev/null +++ b/gn3/db/correlations.py @@ -0,0 +1,26 @@ +""" +This module will hold functions that are used in the (partial) correlations +feature to access the database to retrieve data needed for computations. +""" + +from typing import Any +def get_filename(target_db_name: str, conn: Any) -> str: + """ + Retrieve the name of the reference database file with which correlations are + computed. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getFileName` function in + GeneNetwork1. + """ + with conn.cursor() as cursor: + cursor.execute( + "SELECT Id, FullName from ProbeSetFreeze WHERE Name-%s", + target_db_name) + result = cursor.fetchone() + if result: + return "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( + tid=result[0], + fname=result[1].replace(' ', '_').replace('/', '_')) + + return "" -- cgit v1.2.3 From 0814eea6b57e45d4337424e63c164d204d03b64d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 12:38:24 +0300 Subject: Implement `fetch_literature_correlations` and depedencies Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate: * `web.webqtl.correlation.CorrelationPage.getTempLiteratureTable` * `web.webqtl.correlation.CorrelationPage.fetchLitCorrelations` from GeneNetwork1. The first function creates and populates a temporary table with the literature correlations data. The second function uses the data in the newly created temporary table to link the trait with the correlation value. 
--- gn3/db/correlations.py | 113 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index fa8e7ca..67cfef9 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -4,6 +4,10 @@ feature to access the database to retrieve data needed for computations. """ from typing import Any + +from gn3.random import random_string +from gn3.db.species import translate_to_mouse_gene_id + def get_filename(target_db_name: str, conn: Any) -> str: """ Retrieve the name of the reference database file with which correlations are @@ -24,3 +28,112 @@ def get_filename(target_db_name: str, conn: Any) -> str: fname=result[1].replace(' ', '_').replace('/', '_')) return "" + +def build_temporary_literature_table( + species: str, gene_id: int, return_number: int, conn: Any) -> str: + """ + Build and populate a temporary table to hold the literature correlation data + to be used in computations. + + "This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTempLiteratureTable` function in + GeneNetwork1. 
+ """ + def __translated_species_id(row, cursor): + if species == "mouse": + return row[1] + query = { + "rat": "SELECT rat FROM GeneIDXRef WHERE mouse=%s", + "human": "SELECT human FROM GeneIDXRef WHERE mouse=%d"} + if species in query.keys(): + cursor.execute(query[species], row[1]) + record = cursor.fetchone() + if record: + return record[0] + return None + return None + + temp_table_name = f"TOPLITERATURE{random_string(8)}" + with conn.cursor as cursor: + mouse_geneid = translate_to_mouse_gene_id(species, gene_id, conn) + data_query = ( + "SELECT GeneId1, GeneId2, value FROM LCorrRamin3 " + "WHERE GeneId1 = %(mouse_gene_id)s " + "UNION ALL " + "SELECT GeneId2, GeneId1, value FROM LCorrRamin3 " + "WHERE GeneId2 = %(mouse_gene_id)s " + "AND GeneId1 != %(mouse_gene_id)s") + cursor.execute( + (f"CREATE TEMPORARY TABLE {temp_table_name} (" + "GeneId1 int(12) unsigned, " + "GeneId2 int(12) unsigned PRIMARY KEY, " + "value double)")) + cursor.execute(data_query, mouse_gene_id=mouse_geneid) + literature_data = [ + {"GeneId1": row[0], "GeneId2": row[1], "value": row[2]} + for row in cursor.fetchall() + if __translated_species_id(row, cursor)] + + cursor.execute( + (f"INSERT INTO {temp_table_name} " + "VALUES (%(GeneId1)s, %(GeneId2)s, %(value)s)"), + literature_data[0:(2 * return_number)]) + + return temp_table_name + +def fetch_geno_literature_correlations(temp_table: str) -> str: + """ + Helper function for `fetch_literature_correlations` below, to build query + for `Geno*` tables. 
+ """ + return ( + f"SELECT Geno.Name, {temp_table}.value " + "FROM Geno, GenoXRef, GenoFreeze " + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + "WHERE ProbeSet.GeneId IS NOT NULL " + f"AND {temp_table}.value IS NOT NULL " + "AND GenoXRef.GenoFreezeId = GenoFreeze.Id " + "AND GenoFreeze.Name = %(db_name)s " + "AND Geno.Id=GenoXRef.GenoId " + "ORDER BY Geno.Id") + +def fetch_probeset_literature_correlations(temp_table: str) -> str: + """ + Helper function for `fetch_literature_correlations` below, to build query + for `ProbeSet*` tables. + """ + return ( + f"SELECT ProbeSet.Name, {temp_table}.value " + "FROM ProbeSet, ProbeSetXRef, ProbeSetFreeze " + "LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + "WHERE ProbeSet.GeneId IS NOT NULL " + "AND {temp_table}.value IS NOT NULL " + "AND ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetFreeze.Name = %(db_name)s " + "AND ProbeSet.Id=ProbeSetXRef.ProbeSetId " + "ORDER BY ProbeSet.Id") + +def fetch_literature_correlations( + species: str, gene_id: int, dataset: dict, return_number: int, + conn: Any) -> dict: + """ + Gather the literature correlation data and pair it with trait id string(s). + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.fetchLitCorrelations` function in + GeneNetwork1. 
+ """ + temp_table = build_temporary_literature_table( + species, gene_id, return_number, conn) + query_fns = { + "Geno": fetch_geno_literature_correlations, + # "Temp": fetch_temp_literature_correlations, + # "Publish": fetch_publish_literature_correlations, + "ProbeSet": fetch_probeset_literature_correlations} + with conn.cursor as cursor: + cursor.execute( + query_fns[dataset["dataset_type"]](temp_table), + db_name=dataset["dataset_name"]) + results = cursor.fetchall() + cursor.execute("DROP TEMPORARY TABLE %s", temp_table) + return dict(results) # {trait_name: lit_corr for trait_name, lit_corr in results} -- cgit v1.2.3 From 783f302c5d4729eb0b5fb6ba79180b7cd97764a5 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 19:12:24 +0300 Subject: Implement `partition_all` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/data_helpers.py: new function (partition_all) * tests/unit/test_data_helpers.py: tests for function `gn3.data_helpers.partition_all` As part of migrating some functions that access the database, this commit extracts generic processes that can be accomplished on data, and implements the `partition_all` function, that is equivalent to Clojure's `partition-all` function. --- gn3/data_helpers.py | 25 +++++++++++++++++++++++++ tests/unit/test_data_helpers.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 gn3/data_helpers.py create mode 100644 tests/unit/test_data_helpers.py diff --git a/gn3/data_helpers.py b/gn3/data_helpers.py new file mode 100644 index 0000000..f0d971e --- /dev/null +++ b/gn3/data_helpers.py @@ -0,0 +1,25 @@ +""" +This module will hold generic functions that can operate on a wide-array of +data structures. 
+""" + +from math import ceil +from functools import reduce +from typing import Any, Tuple, Sequence + +def partition_all(num: int, items: Sequence[Any]) -> Tuple[Tuple[Any, ...], ...]: + """ + Given a sequence `items`, return a new sequence of the same type as `items` + with the data partitioned into sections of `n` items per partition. + + This is an approximation of clojure's `partition-all` function. + """ + def __compute_start_stop__(acc, iteration): + start = iteration * num + return acc + ((start, start + num),) + + iterations = range(ceil(len(items) / num)) + return tuple([# type: ignore[misc] + tuple(items[start:stop]) for start, stop # type: ignore[has-type] + in reduce( + __compute_start_stop__, iterations, tuple())]) diff --git a/tests/unit/test_data_helpers.py b/tests/unit/test_data_helpers.py new file mode 100644 index 0000000..1eec3cc --- /dev/null +++ b/tests/unit/test_data_helpers.py @@ -0,0 +1,37 @@ +""" +Test functions in gn3.data_helpers +""" + +from unittest import TestCase + +from gn3.data_helpers import partition_all + +class TestDataHelpers(TestCase): + """ + Test functions in gn3.data_helpers + """ + + def test_partition_all(self): + """ + Test that `gn3.data_helpers.partition_all` partitions sequences as expected. + + Given: + - `num`: The number of items per partition + - `items`: A sequence of items + When: + - The arguments above are passed to the `gn3.data_helpers.partition_all` + Then: + - Return a new sequence with partitions, each of which has `num` + items in the same order as those in `items`, save for the last + partition which might have fewer items than `num`. 
+ """ + for count, items, expected in ( + (1, [0, 1, 2, 3], ((0,), (1,), (2,), (3,))), + (3, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9), + ((0, 1, 2), (3, 4, 5), (6, 7, 8), (9, ))), + (4, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ((0, 1, 2, 3), (4, 5, 6, 7), (8, 9))), + (13, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ((0, 1, 2, 3, 4, 5, 6, 7, 8, 9), ))): + with self.subTest(n=count, items=items): + self.assertEqual(partition_all(count, items), expected) -- cgit v1.2.3 From c13afb3af166d2b01e4f9fd9b09bb231f0a63cb1 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 19:19:54 +0300 Subject: Start implementation of `fetch_tissue_correlations` and dependencies * compare_tissue_correlation_absolute_values: New function. Complete. Used for sorting of tissue correlation values * fetch_symbol_value_pair_dict: New function. Complete. Maps gene symbols to tissue expression data * fetch_gene_symbol_tissue_value_dict: New function. Complete. Wrapper for `gn3.db.correlations.fetch_symbol_value_pair_dict` function * fetch_tissue_probeset_xref_info: New function. Complete. Retrieves the Probeset XRef information for tissues from the database. * correlations_of_all_tissue_traits: Stub. Dependencies not completed yet. * build_temporary_tissue_correlations_table: Stub. Dependencies not completed yet. * fetch_tissue_correlations: New function. Incomplete. This function calls (a) stub(s) function(s) which is/are under development still. --- gn3/db/correlations.py | 183 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 181 insertions(+), 2 deletions(-) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 67cfef9..87ab082 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -3,9 +3,11 @@ This module will hold functions that are used in the (partial) correlations feature to access the database to retrieve data needed for computations. 
""" -from typing import Any +from functools import reduce +from typing import Any, Dict, Tuple from gn3.random import random_string +from gn3.data_helpers import partition_all from gn3.db.species import translate_to_mouse_gene_id def get_filename(target_db_name: str, conn: Any) -> str: @@ -136,4 +138,181 @@ def fetch_literature_correlations( db_name=dataset["dataset_name"]) results = cursor.fetchall() cursor.execute("DROP TEMPORARY TABLE %s", temp_table) - return dict(results) # {trait_name: lit_corr for trait_name, lit_corr in results} + return dict(results) + +def compare_tissue_correlation_absolute_values(val1, val2): + """ + Comparison function for use when sorting tissue correlation values. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in + GeneNetwork1.""" + try: + if abs(val1) < abs(val2): + return 1 + if abs(val1) == abs(val2): + return 0 + return -1 + except TypeError: + return 0 + +def fetch_symbol_value_pair_dict( + symbol_list: Tuple[str, ...], data_id_dict: dict, + conn: Any) -> Dict[str, Tuple[float, ...]]: + """ + Map each gene symbols to the corresponding tissue expression data. + + This is a migration of the + `web.webqtl.correlation.correlationFunction.getSymbolValuePairDict` function + in GeneNetwork1. 
+ """ + data_ids = { + symbol: data_id_dict.get(symbol) for symbol in symbol_list + if data_id_dict.get(symbol) is not None + } + query = "SELECT Id, value FROM TissueProbeSetData WHERE Id IN %(data_ids)s" + with conn.cursor() as cursor: + cursor.execute( + query, + data_ids=tuple(data_ids.values())) + value_results = cursor.fetchall() + return { + key: tuple(row[1] for row in value_results if row[0] == key) + for key in data_ids.keys() + } + + return {} + +def fetch_gene_symbol_tissue_value_dict( + symbol_list: Tuple[str, ...], data_id_dict: dict, conn: Any, + limit_num: int = 1000) -> dict:#getGeneSymbolTissueValueDict + """ + Wrapper function for `gn3.db.correlations.fetch_symbol_value_pair_dict`. + + This is a migrations of the + `web.webqtl.correlation.correlationFunction.getGeneSymbolTissueValueDict` in + GeneNetwork1. + """ + count = len(symbol_list) + if count != 0 and count <= limit_num: + return fetch_symbol_value_pair_dict(symbol_list, data_id_dict, conn) + + if count > limit_num: + return { + key: value for dct in [ + fetch_symbol_value_pair_dict(sl, data_id_dict, conn) + for sl in partition_all(limit_num, symbol_list)] + for key, value in dct.items() + } + + return {} + +def fetch_tissue_probeset_xref_info( + gene_name_list: Tuple[str, ...], probeset_freeze_id: int, + conn: Any) -> Tuple[tuple, dict, dict, dict, dict, dict, dict]: + """ + Retrieve the ProbeSet XRef information for tissues. 
+ + This is a migration of the + `web.webqtl.correlation.correlationFunction.getTissueProbeSetXRefInfo` + function in GeneNetwork1.""" + with conn.cursor() as cursor: + if len(gene_name_list) == 0: + query = ( + "SELECT t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, " + "t.description, t.Probe_Target_Description " + "FROM " + "(" + " SELECT Symbol, max(Mean) AS maxmean " + " FROM TissueProbeSetXRef " + " WHERE TissueProbeSetFreezeId=%(probeset_freeze_id)s " + " AND Symbol != '' " + " AND Symbol IS NOT NULL " + " GROUP BY Symbol" + ") AS x " + "INNER JOIN TissueProbeSetXRef AS t ON t.Symbol = x.Symbol " + "AND t.Mean = x.maxmean") + cursor.execute(query, probeset_freeze_id=probeset_freeze_id) + else: + query = ( + "SELECT t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, " + "t.description, t.Probe_Target_Description " + "FROM " + "(" + " SELECT Symbol, max(Mean) AS maxmean " + " FROM TissueProbeSetXRef " + " WHERE TissueProbeSetFreezeId=%(probeset_freeze_id)s " + " AND Symbol in %(symbols)s " + " GROUP BY Symbol" + ") AS x " + "INNER JOIN TissueProbeSetXRef AS t ON t.Symbol = x.Symbol " + "AND t.Mean = x.maxmean") + cursor.execute( + query, probeset_freeze_id=probeset_freeze_id, + symbols=tuple(gene_name_list)) + + results = cursor.fetchall() + + return reduce( + lambda acc, item: ( + acc[0] + (item[0],), + {**acc[1], item[0].lower(): item[1]}, + {**acc[1], item[0].lower(): item[2]}, + {**acc[1], item[0].lower(): item[3]}, + {**acc[1], item[0].lower(): item[4]}, + {**acc[1], item[0].lower(): item[5]}, + {**acc[1], item[0].lower(): item[6]}), + results or tuple(), + (tuple(), {}, {}, {}, {}, {}, {})) + +def correlations_of_all_tissue_traits() -> Tuple[dict, dict]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.calculateCorrOfAllTissueTrait` + function in GeneNetwork1. 
+ """ + raise Exception("Unimplemented!!!") + return ({}, {}) + +def build_temporary_tissue_correlations_table( + trait_symbol: str, probeset_freeze_id: int, method: str, + return_number: int, conn: Any) -> str: + """ + Build a temporary table to hold the tissue correlations data. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in + GeneNetwork1.""" + raise Exception("Unimplemented!!!") + return "" + +def fetch_tissue_correlations( + dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str, + return_number: int, conn: Any) -> dict: + """ + Pair tissue correlations data with a trait id string. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.fetchTissueCorrelations` function in + GeneNetwork1. + """ + temp_table = build_temporary_tissue_correlations_table( + trait_symbol, probeset_freeze_id, method, return_number, conn) + with conn.cursor() as cursor: + cursor.execute( + ( + f"SELECT ProbeSet.Name, {temp_table}.Correlation, " + f"{temp_table}.PValue " + "FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) " + "LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + "WHERE ProbeSetFreeze.Name = %(db_name) " + "AND ProbeSetFreeze.Id=ProbeSetXRef.ProbeSetFreezeId " + "AND ProbeSet.Id = ProbeSetXRef.ProbeSetId " + "AND ProbeSet.Symbol IS NOT NULL " + "AND %s.Correlation IS NOT NULL"), + db_name=dataset["dataset_name"]) + results = cursor.fetchall() + cursor.execute("DROP TEMPORARY TABLE %s", temp_table) + return { + trait_name: (tiss_corr, tiss_p_val) + for trait_name, tiss_corr, tiss_p_val in results} -- cgit v1.2.3