From 157df453cdb84591cb44af9f1d2677cd0b2c0380 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 18 Oct 2021 12:17:11 +0300 Subject: Move 'export_trait_data' to 'gn3.db.traits' module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/db/traits.py: Move function `export_trait_data` here * gn3/heatmaps.py: Remove function `export_trait_data` * tests/unit/db/test_traits.py: Move function `export_trait_data` tests here * tests/unit/test_heatmaps.py: Remove function `export_trait_data` here Function `export_trait_data` more closely corresponds to the traits and is used in more than just the `gn3.heatmaps` module. This commit moves the relevant code over to the `gn3.db.traits` module and also moves the tests to the corresponding tests modules. --- gn3/db/traits.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index f2673c8..1e29aff 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,12 +1,81 @@ """This class contains functions relating to trait data manipulation""" import os +from functools import reduce from typing import Any, Dict, Union, Sequence + from gn3.settings import TMPDIR from gn3.random import random_string from gn3.function_helpers import compose from gn3.db.datasets import retrieve_trait_dataset +def export_trait_data( + trait_data: dict, samplelist: Sequence[str], dtype: str = "val", + var_exists: bool = False, n_exists: bool = False): + """ + Export data according to `samplelist`. Mostly used in calculating + correlations. + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 + + PARAMETERS + trait: (dict) + The dictionary of key-value pairs representing a trait + samplelist: (list) + A list of sample names + dtype: (str) + ... verify what this is ... 
+ var_exists: (bool) + A flag indicating existence of variance + n_exists: (bool) + A flag indicating existence of ndata + """ + def __export_all_types(tdata, sample): + sample_data = [] + if tdata[sample]["value"]: + sample_data.append(tdata[sample]["value"]) + if var_exists: + if tdata[sample]["variance"]: + sample_data.append(tdata[sample]["variance"]) + else: + sample_data.append(None) + if n_exists: + if tdata[sample]["ndata"]: + sample_data.append(tdata[sample]["ndata"]) + else: + sample_data.append(None) + else: + if var_exists and n_exists: + sample_data += [None, None, None] + elif var_exists or n_exists: + sample_data += [None, None] + else: + sample_data.append(None) + + return tuple(sample_data) + + def __exporter(accumulator, sample): + # pylint: disable=[R0911] + if sample in trait_data["data"]: + if dtype == "val": + return accumulator + (trait_data["data"][sample]["value"], ) + if dtype == "var": + return accumulator + (trait_data["data"][sample]["variance"], ) + if dtype == "N": + return accumulator + (trait_data["data"][sample]["ndata"], ) + if dtype == "all": + return accumulator + __export_all_types(trait_data["data"], sample) + raise KeyError("Type `%s` is incorrect" % dtype) + if var_exists and n_exists: + return accumulator + (None, None, None) + if var_exists or n_exists: + return accumulator + (None, None) + return accumulator + (None,) + + return reduce(__exporter, samplelist, tuple()) + def get_trait_csv_sample_data(conn: Any, trait_name: int, phenotype_id: int): """Fetch a trait and return it as a csv string""" -- cgit v1.2.3 From 94ca79045baf978d6aab964c7c70b84911c1124f Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 18 Oct 2021 12:27:32 +0300 Subject: Move `export_informative` function to `gn3.db.traits` module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/db/traits.py: Move `export_informative` function here * 
gn3/partial_correlations.py: Remove `export_informative` function * tests/unit/db/test_traits.py: Move `export_informative` function tests here * tests/unit/test_partial_correlations.py: Remove `export_informative` function tests The `export_informative` function relates more to the traits than to the partial correlations, and could find use in more than just the partial correlations stuff. This commit moves the function to the more traits-specific `gn3.db.traits` module. --- gn3/db/traits.py | 24 +++++++++ gn3/partial_correlations.py | 24 --------- tests/unit/db/test_traits.py | 86 ++++++++++++++++++++++++++++++++ tests/unit/test_partial_correlations.py | 87 +-------------------------------- 4 files changed, 111 insertions(+), 110 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 1e29aff..1c6aaa7 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -743,3 +743,27 @@ def generate_traits_filename(base_path: str = TMPDIR): """Generate a unique filename for use with generated traits files.""" return "{}/traits_test_file_{}.txt".format( os.path.abspath(base_path), random_string(10)) + +def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: + """ + Export informative strain + + This is a migration of the `exportInformative` function in + web/webqtl/base/webqtlTrait.py module in GeneNetwork1. + + There is a chance that the original implementation has a bug, especially + dealing with the `inc_var` value. It the `inc_var` value is meant to control + the inclusion of the `variance` value, then the current implementation, and + that one in GN1 have a bug. 
+ """ + def __exporter__(acc, data_item): + if not inc_var or data_item["variance"] is not None: + return ( + acc[0] + (data_item["sample_name"],), + acc[1] + (data_item["value"],), + acc[2] + (data_item["variance"],)) + return acc + return reduce( + __exporter__, + filter(lambda td: td["value"] is not None, trait_data["data"].values()), + (tuple(), tuple(), tuple())) diff --git a/gn3/partial_correlations.py b/gn3/partial_correlations.py index 8c37886..df390ed 100644 --- a/gn3/partial_correlations.py +++ b/gn3/partial_correlations.py @@ -6,27 +6,3 @@ GeneNetwork1. """ from functools import reduce - -def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: - """ - Export informative strain - - This is a migration of the `exportInformative` function in - web/webqtl/base/webqtlTrait.py module in GeneNetwork1. - - There is a chance that the original implementation has a bug, especially - dealing with the `inc_var` value. It the `inc_var` value is meant to control - the inclusion of the `variance` value, then the current implementation, and - that one in GN1 have a bug. 
- """ - def __exporter__(acc, data_item): - if not inc_var or data_item["variance"] is not None: - return ( - acc[0] + (data_item["sample_name"],), - acc[1] + (data_item["value"],), - acc[2] + (data_item["variance"],)) - return acc - return reduce( - __exporter__, - filter(lambda td: td["value"] is not None, trait_data["data"].values()), - (tuple(), tuple(), tuple())) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 0c4ef78..67f0c6f 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -3,6 +3,7 @@ from unittest import mock, TestCase from gn3.db.traits import ( build_trait_name, export_trait_data, + export_informative, set_haveinfo_field, update_sample_data, retrieve_trait_info, @@ -315,3 +316,88 @@ class TestTraitsDBFunctions(TestCase): trait_data, samplelist, dtype=dtype, var_exists=vflag, n_exists=nflag), expected) + + def test_export_informative(self): + """Test that the function exports appropriate data.""" + for trait_data, inc_var, expected in [ + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), + (None, None, None, None))], + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": None, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample4"), (9, 8, 6), + (None, None, None))], + [{"data": { + 
"sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": None, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, True, (tuple(), tuple(), tuple())], + [{"data": { + "sample1": { + "sample_name": "sample1", "value": 9, "variance": None, + "ndata": 13 + }, + "sample2": { + "sample_name": "sample2", "value": 8, "variance": 0.657, + "ndata": 13 + }, + "sample3": { + "sample_name": "sample3", "value": 7, "variance": None, + "ndata": 13 + }, + "sample4": { + "sample_name": "sample4", "value": 6, "variance": None, + "ndata": 13 + }, + }}, 0, ( + ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), + (None, 0.657, None, None))]]: + with self.subTest(trait_data=trait_data): + self.assertEqual( + export_informative(trait_data, inc_var), expected) diff --git a/tests/unit/test_partial_correlations.py b/tests/unit/test_partial_correlations.py index 6eea078..f204d4f 100644 --- a/tests/unit/test_partial_correlations.py +++ b/tests/unit/test_partial_correlations.py @@ -1,92 +1,7 @@ """Module contains tests for gn3.partial_correlations""" from unittest import TestCase -from gn3.partial_correlations import export_informative + class TestPartialCorrelations(TestCase): """Class for testing partial correlations computation functions""" - - def test_export_informative(self): - """Test that the function exports appropriate data.""" - for trait_data, inc_var, expected in [ - [{"data": { - "sample1": { - "sample_name": "sample1", "value": 9, "variance": None, - "ndata": 13 - }, - "sample2": { - "sample_name": "sample2", "value": 8, "variance": None, - "ndata": 13 - }, - "sample3": { - "sample_name": "sample3", "value": 7, "variance": None, - "ndata": 13 - }, - "sample4": { - "sample_name": "sample4", "value": 6, "variance": 
None, - "ndata": 13 - }, - }}, 0, ( - ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), - (None, None, None, None))], - [{"data": { - "sample1": { - "sample_name": "sample1", "value": 9, "variance": None, - "ndata": 13 - }, - "sample2": { - "sample_name": "sample2", "value": 8, "variance": None, - "ndata": 13 - }, - "sample3": { - "sample_name": "sample3", "value": None, "variance": None, - "ndata": 13 - }, - "sample4": { - "sample_name": "sample4", "value": 6, "variance": None, - "ndata": 13 - }, - }}, 0, ( - ("sample1", "sample2", "sample4"), (9, 8, 6), - (None, None, None))], - [{"data": { - "sample1": { - "sample_name": "sample1", "value": 9, "variance": None, - "ndata": 13 - }, - "sample2": { - "sample_name": "sample2", "value": 8, "variance": None, - "ndata": 13 - }, - "sample3": { - "sample_name": "sample3", "value": 7, "variance": None, - "ndata": 13 - }, - "sample4": { - "sample_name": "sample4", "value": 6, "variance": None, - "ndata": 13 - }, - }}, True, (tuple(), tuple(), tuple())], - [{"data": { - "sample1": { - "sample_name": "sample1", "value": 9, "variance": None, - "ndata": 13 - }, - "sample2": { - "sample_name": "sample2", "value": 8, "variance": 0.657, - "ndata": 13 - }, - "sample3": { - "sample_name": "sample3", "value": 7, "variance": None, - "ndata": 13 - }, - "sample4": { - "sample_name": "sample4", "value": 6, "variance": None, - "ndata": 13 - }, - }}, 0, ( - ("sample1", "sample2", "sample3", "sample4"), (9, 8, 7, 6), - (None, 0.657, None, None))]]: - with self.subTest(trait_data=trait_data): - self.assertEqual( - export_informative(trait_data, inc_var), expected) -- cgit v1.2.3 From 41936d0a486ef54bf4fc049c2b4d85dca43ab761 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 21 Oct 2021 09:36:36 +0300 Subject: Implement `translate_to_mouse_gene_id` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate the 
`web.webqtl.correlation/CorrelationPage.translateToMouseGeneID` function in GN1 to GN3. This is a function that retrieves data from the database, and therefore uses a system outside of our code, therefore, the function does not have a corresponding unit test. This kind of function will probably need to be tested at the integration or system tests level, where we test that our code interacts correctly with any and all external systems that it should. --- gn3/db/species.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/species.py b/gn3/db/species.py index 0deae4e..1e5015f 100644 --- a/gn3/db/species.py +++ b/gn3/db/species.py @@ -30,3 +30,34 @@ def get_chromosome(name: str, is_species: bool, conn: Any) -> Optional[Tuple]: with conn.cursor() as cursor: cursor.execute(_sql) return cursor.fetchall() + +def translate_to_mouse_gene_id(species: str, geneid: int, conn: Any) -> int: + """ + Translate rat or human geneid to mouse geneid + + This is a migration of the + `web.webqtl.correlation/CorrelationPage.translateToMouseGeneID` function in + GN1 + """ + assert species in ("rat", "mouse", "human"), "Invalid species" + if geneid is None: + return 0 + + if species == "mouse": + return geneid + + with conn.cursor as cursor: + if species == "rat": + cursor.execute( + "SELECT mouse FROM GeneIDXRef WHERE rat = %s", geneid) + rat_geneid = cursor.fetchone() + if rat_geneid: + return rat_geneid[0] + + cursor.execute( + "SELECT mouse FROM GeneIDXRef WHERE human = %s", geneid) + human_geneid = cursor.fetchone() + if human_geneid: + return human_geneid[0] + + return 0 # default if all else fails -- cgit v1.2.3 From df8185078a52c89cc5a75ff9be413a236da29a6e Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 09:31:58 +0300 Subject: Implement `get_filename` for correlations Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * 
Implement `get_filename` for the correlations, to be used to determine whether to do fast or normal correlations. This is a migration of the `web.webqtl.correlation.CorrelationPage.getFileName` function in GN1 --- gn3/db/correlations.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 gn3/db/correlations.py (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py new file mode 100644 index 0000000..fa8e7ca --- /dev/null +++ b/gn3/db/correlations.py @@ -0,0 +1,26 @@ +""" +This module will hold functions that are used in the (partial) correlations +feature to access the database to retrieve data needed for computations. +""" + +from typing import Any +def get_filename(target_db_name: str, conn: Any) -> str: + """ + Retrieve the name of the reference database file with which correlations are + computed. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getFileName` function in + GeneNetwork1. + """ + with conn.cursor() as cursor: + cursor.execute( + "SELECT Id, FullName from ProbeSetFreeze WHERE Name-%s", + target_db_name) + result = cursor.fetchone() + if result: + return "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( + tid=result[0], + fname=result[1].replace(' ', '_').replace('/', '_')) + + return "" -- cgit v1.2.3 From 0814eea6b57e45d4337424e63c164d204d03b64d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 12:38:24 +0300 Subject: Implement `fetch_literature_correlations` and dependencies Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate: * `web.webqtl.correlation.CorrelationPage.getTempLiteratureTable` * `web.webqtl.correlation.CorrelationPage.fetchLitCorrelations` from GeneNetwork1. The first function creates and populates a temporary table with the literature correlations data. 
The second function uses the data in the newly created temporary table to link the trait with the correlation value. --- gn3/db/correlations.py | 113 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index fa8e7ca..67cfef9 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -4,6 +4,10 @@ feature to access the database to retrieve data needed for computations. """ from typing import Any + +from gn3.random import random_string +from gn3.db.species import translate_to_mouse_gene_id + def get_filename(target_db_name: str, conn: Any) -> str: """ Retrieve the name of the reference database file with which correlations are @@ -24,3 +28,112 @@ def get_filename(target_db_name: str, conn: Any) -> str: fname=result[1].replace(' ', '_').replace('/', '_')) return "" + +def build_temporary_literature_table( + species: str, gene_id: int, return_number: int, conn: Any) -> str: + """ + Build and populate a temporary table to hold the literature correlation data + to be used in computations. + + "This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTempLiteratureTable` function in + GeneNetwork1. 
+ """ + def __translated_species_id(row, cursor): + if species == "mouse": + return row[1] + query = { + "rat": "SELECT rat FROM GeneIDXRef WHERE mouse=%s", + "human": "SELECT human FROM GeneIDXRef WHERE mouse=%d"} + if species in query.keys(): + cursor.execute(query[species], row[1]) + record = cursor.fetchone() + if record: + return record[0] + return None + return None + + temp_table_name = f"TOPLITERATURE{random_string(8)}" + with conn.cursor as cursor: + mouse_geneid = translate_to_mouse_gene_id(species, gene_id, conn) + data_query = ( + "SELECT GeneId1, GeneId2, value FROM LCorrRamin3 " + "WHERE GeneId1 = %(mouse_gene_id)s " + "UNION ALL " + "SELECT GeneId2, GeneId1, value FROM LCorrRamin3 " + "WHERE GeneId2 = %(mouse_gene_id)s " + "AND GeneId1 != %(mouse_gene_id)s") + cursor.execute( + (f"CREATE TEMPORARY TABLE {temp_table_name} (" + "GeneId1 int(12) unsigned, " + "GeneId2 int(12) unsigned PRIMARY KEY, " + "value double)")) + cursor.execute(data_query, mouse_gene_id=mouse_geneid) + literature_data = [ + {"GeneId1": row[0], "GeneId2": row[1], "value": row[2]} + for row in cursor.fetchall() + if __translated_species_id(row, cursor)] + + cursor.execute( + (f"INSERT INTO {temp_table_name} " + "VALUES (%(GeneId1)s, %(GeneId2)s, %(value)s)"), + literature_data[0:(2 * return_number)]) + + return temp_table_name + +def fetch_geno_literature_correlations(temp_table: str) -> str: + """ + Helper function for `fetch_literature_correlations` below, to build query + for `Geno*` tables. 
+ """ + return ( + f"SELECT Geno.Name, {temp_table}.value " + "FROM Geno, GenoXRef, GenoFreeze " + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + "WHERE ProbeSet.GeneId IS NOT NULL " + f"AND {temp_table}.value IS NOT NULL " + "AND GenoXRef.GenoFreezeId = GenoFreeze.Id " + "AND GenoFreeze.Name = %(db_name)s " + "AND Geno.Id=GenoXRef.GenoId " + "ORDER BY Geno.Id") + +def fetch_probeset_literature_correlations(temp_table: str) -> str: + """ + Helper function for `fetch_literature_correlations` below, to build query + for `ProbeSet*` tables. + """ + return ( + f"SELECT ProbeSet.Name, {temp_table}.value " + "FROM ProbeSet, ProbeSetXRef, ProbeSetFreeze " + "LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + "WHERE ProbeSet.GeneId IS NOT NULL " + "AND {temp_table}.value IS NOT NULL " + "AND ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetFreeze.Name = %(db_name)s " + "AND ProbeSet.Id=ProbeSetXRef.ProbeSetId " + "ORDER BY ProbeSet.Id") + +def fetch_literature_correlations( + species: str, gene_id: int, dataset: dict, return_number: int, + conn: Any) -> dict: + """ + Gather the literature correlation data and pair it with trait id string(s). + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.fetchLitCorrelations` function in + GeneNetwork1. 
+ """ + temp_table = build_temporary_literature_table( + species, gene_id, return_number, conn) + query_fns = { + "Geno": fetch_geno_literature_correlations, + # "Temp": fetch_temp_literature_correlations, + # "Publish": fetch_publish_literature_correlations, + "ProbeSet": fetch_probeset_literature_correlations} + with conn.cursor as cursor: + cursor.execute( + query_fns[dataset["dataset_type"]](temp_table), + db_name=dataset["dataset_name"]) + results = cursor.fetchall() + cursor.execute("DROP TEMPORARY TABLE %s", temp_table) + return dict(results) # {trait_name: lit_corr for trait_name, lit_corr in results} -- cgit v1.2.3 From c13afb3af166d2b01e4f9fd9b09bb231f0a63cb1 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 19:19:54 +0300 Subject: Start implementation of `fetch_tissue_correlations` and dependencies * compare_tissue_correlation_absolute_values: New function. Complete. Used for sorting of tissue correlation values * fetch_symbol_value_pair_dict: New function. Complete. Maps gene symbols to tissue expression data * fetch_gene_symbol_tissue_value_dict: New function. Complete. Wrapper for `gn3.db.correlations.fetch_symbol_value_pair_dict` function * fetch_tissue_probeset_xref_info: New function. Complete. Retrieves the Probeset XRef information for tissues from the database. * correlations_of_all_tissue_traits: Stub. Dependencies not completed yet. * build_temporary_tissue_correlations_table: Stub. Dependencies not completed yet. * fetch_tissue_correlations: New function. Incomplete. This function calls (a) stub(s) function(s) which is/are under development still. 
--- gn3/db/correlations.py | 183 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 181 insertions(+), 2 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 67cfef9..87ab082 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -3,9 +3,11 @@ This module will hold functions that are used in the (partial) correlations feature to access the database to retrieve data needed for computations. """ -from typing import Any +from functools import reduce +from typing import Any, Dict, Tuple from gn3.random import random_string +from gn3.data_helpers import partition_all from gn3.db.species import translate_to_mouse_gene_id def get_filename(target_db_name: str, conn: Any) -> str: @@ -136,4 +138,181 @@ def fetch_literature_correlations( db_name=dataset["dataset_name"]) results = cursor.fetchall() cursor.execute("DROP TEMPORARY TABLE %s", temp_table) - return dict(results) # {trait_name: lit_corr for trait_name, lit_corr in results} + return dict(results) + +def compare_tissue_correlation_absolute_values(val1, val2): + """ + Comparison function for use when sorting tissue correlation values. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in + GeneNetwork1.""" + try: + if abs(val1) < abs(val2): + return 1 + if abs(val1) == abs(val2): + return 0 + return -1 + except TypeError: + return 0 + +def fetch_symbol_value_pair_dict( + symbol_list: Tuple[str, ...], data_id_dict: dict, + conn: Any) -> Dict[str, Tuple[float, ...]]: + """ + Map each gene symbols to the corresponding tissue expression data. + + This is a migration of the + `web.webqtl.correlation.correlationFunction.getSymbolValuePairDict` function + in GeneNetwork1. 
+ """ + data_ids = { + symbol: data_id_dict.get(symbol) for symbol in symbol_list + if data_id_dict.get(symbol) is not None + } + query = "SELECT Id, value FROM TissueProbeSetData WHERE Id IN %(data_ids)s" + with conn.cursor() as cursor: + cursor.execute( + query, + data_ids=tuple(data_ids.values())) + value_results = cursor.fetchall() + return { + key: tuple(row[1] for row in value_results if row[0] == key) + for key in data_ids.keys() + } + + return {} + +def fetch_gene_symbol_tissue_value_dict( + symbol_list: Tuple[str, ...], data_id_dict: dict, conn: Any, + limit_num: int = 1000) -> dict:#getGeneSymbolTissueValueDict + """ + Wrapper function for `gn3.db.correlations.fetch_symbol_value_pair_dict`. + + This is a migrations of the + `web.webqtl.correlation.correlationFunction.getGeneSymbolTissueValueDict` in + GeneNetwork1. + """ + count = len(symbol_list) + if count != 0 and count <= limit_num: + return fetch_symbol_value_pair_dict(symbol_list, data_id_dict, conn) + + if count > limit_num: + return { + key: value for dct in [ + fetch_symbol_value_pair_dict(sl, data_id_dict, conn) + for sl in partition_all(limit_num, symbol_list)] + for key, value in dct.items() + } + + return {} + +def fetch_tissue_probeset_xref_info( + gene_name_list: Tuple[str, ...], probeset_freeze_id: int, + conn: Any) -> Tuple[tuple, dict, dict, dict, dict, dict, dict]: + """ + Retrieve the ProbeSet XRef information for tissues. 
+ + This is a migration of the + `web.webqtl.correlation.correlationFunction.getTissueProbeSetXRefInfo` + function in GeneNetwork1.""" + with conn.cursor() as cursor: + if len(gene_name_list) == 0: + query = ( + "SELECT t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, " + "t.description, t.Probe_Target_Description " + "FROM " + "(" + " SELECT Symbol, max(Mean) AS maxmean " + " FROM TissueProbeSetXRef " + " WHERE TissueProbeSetFreezeId=%(probeset_freeze_id)s " + " AND Symbol != '' " + " AND Symbol IS NOT NULL " + " GROUP BY Symbol" + ") AS x " + "INNER JOIN TissueProbeSetXRef AS t ON t.Symbol = x.Symbol " + "AND t.Mean = x.maxmean") + cursor.execute(query, probeset_freeze_id=probeset_freeze_id) + else: + query = ( + "SELECT t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, " + "t.description, t.Probe_Target_Description " + "FROM " + "(" + " SELECT Symbol, max(Mean) AS maxmean " + " FROM TissueProbeSetXRef " + " WHERE TissueProbeSetFreezeId=%(probeset_freeze_id)s " + " AND Symbol in %(symbols)s " + " GROUP BY Symbol" + ") AS x " + "INNER JOIN TissueProbeSetXRef AS t ON t.Symbol = x.Symbol " + "AND t.Mean = x.maxmean") + cursor.execute( + query, probeset_freeze_id=probeset_freeze_id, + symbols=tuple(gene_name_list)) + + results = cursor.fetchall() + + return reduce( + lambda acc, item: ( + acc[0] + (item[0],), + {**acc[1], item[0].lower(): item[1]}, + {**acc[1], item[0].lower(): item[2]}, + {**acc[1], item[0].lower(): item[3]}, + {**acc[1], item[0].lower(): item[4]}, + {**acc[1], item[0].lower(): item[5]}, + {**acc[1], item[0].lower(): item[6]}), + results or tuple(), + (tuple(), {}, {}, {}, {}, {}, {})) + +def correlations_of_all_tissue_traits() -> Tuple[dict, dict]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.calculateCorrOfAllTissueTrait` + function in GeneNetwork1. 
+ """ + raise Exception("Unimplemented!!!") + return ({}, {}) + +def build_temporary_tissue_correlations_table( + trait_symbol: str, probeset_freeze_id: int, method: str, + return_number: int, conn: Any) -> str: + """ + Build a temporary table to hold the tissue correlations data. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in + GeneNetwork1.""" + raise Exception("Unimplemented!!!") + return "" + +def fetch_tissue_correlations( + dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str, + return_number: int, conn: Any) -> dict: + """ + Pair tissue correlations data with a trait id string. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.fetchTissueCorrelations` function in + GeneNetwork1. + """ + temp_table = build_temporary_tissue_correlations_table( + trait_symbol, probeset_freeze_id, method, return_number, conn) + with conn.cursor() as cursor: + cursor.execute( + ( + f"SELECT ProbeSet.Name, {temp_table}.Correlation, " + f"{temp_table}.PValue " + "FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) " + "LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + "WHERE ProbeSetFreeze.Name = %(db_name) " + "AND ProbeSetFreeze.Id=ProbeSetXRef.ProbeSetFreezeId " + "AND ProbeSet.Id = ProbeSetXRef.ProbeSetId " + "AND ProbeSet.Symbol IS NOT NULL " + "AND %s.Correlation IS NOT NULL"), + db_name=dataset["dataset_name"]) + results = cursor.fetchall() + cursor.execute("DROP TEMPORARY TABLE %s", temp_table) + return { + trait_name: (tiss_corr, tiss_p_val) + for trait_name, tiss_corr, tiss_p_val in results} -- cgit v1.2.3 From 42dee16ec8a7d7620367dd31481999bfca9313db Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 26 Oct 2021 08:59:30 +0300 Subject: Implement `fetch_gene_symbol_tissue_value_dict_for_trait` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Implement 
`fetch_gene_symbol_tissue_value_dict_for_trait` function which is a migration of the `web.webqtl.correlation.correlationFunction.getGeneSymbolTissueValueDictForTrait` function in GeneNetwork1. --- gn3/db/correlations.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 87ab082..cae8080 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -266,7 +266,22 @@ def fetch_tissue_probeset_xref_info( (tuple(), {}, {}, {}, {}, {}, {})) def correlations_of_all_tissue_traits() -> Tuple[dict, dict]: +def fetch_gene_symbol_tissue_value_dict_for_trait( + gene_name_list: Tuple[str, ...], probeset_freeze_id: int, + conn: Any) -> dict: + """ + Fetches a map of the gene symbols to the tissue values. + + This is a migration of the + `web.webqtl.correlation.correlationFunction.getGeneSymbolTissueValueDictForTrait` + function in GeneNetwork1. """ + xref_info = fetch_tissue_probeset_xref_info( + gene_name_list, probeset_freeze_id, conn) + if xref_info[0]: + return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) + return {} + This is a migration of the `web.webqtl.correlation.CorrelationPage.calculateCorrOfAllTissueTrait` function in GeneNetwork1. -- cgit v1.2.3 From d6e392c2488421ae04b4ffd5de26be40ed86a9b3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 26 Oct 2021 09:17:52 +0300 Subject: Complete `correlations_of_all_tissue_traits` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Complete the implementation of the `correlations_of_all_tissue_traits` function by providing a call to a non-implemented function. 
--- gn3/db/correlations.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index cae8080..f43b8a5 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -265,7 +265,6 @@ def fetch_tissue_probeset_xref_info( results or tuple(), (tuple(), {}, {}, {}, {}, {}, {})) -def correlations_of_all_tissue_traits() -> Tuple[dict, dict]: def fetch_gene_symbol_tissue_value_dict_for_trait( gene_name_list: Tuple[str, ...], probeset_freeze_id: int, conn: Any) -> dict: @@ -282,12 +281,25 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) return {} +def correlations_of_all_tissue_traits( + trait_symbol: str, probeset_freeze_id: int, + method: str, conn: Any) -> Tuple[dict, dict]: + """ + Computes and returns the correlation of all tissue traits. + This is a migration of the - `web.webqtl.correlation.CorrelationPage.calculateCorrOfAllTissueTrait` + `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` function in GeneNetwork1. 
""" - raise Exception("Unimplemented!!!") - return ({}, {}) + primary_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + (trait_symbol,), probeset_freeze_id, conn) + primary_trait_value = primary_trait_symbol_value_dict.vlaues()[0] + symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + tuple(), probeset_freeze_id, conn) + if method == "1": + return batch_computed_tissue_correlation( + primaryTraitValue,SymbolValueDict,method='spearman') + return batch_computed_tissue_correlation(primaryTraitValue,SymbolValueDict) def build_temporary_tissue_correlations_table( trait_symbol: str, probeset_freeze_id: int, method: str, @@ -298,6 +310,8 @@ def build_temporary_tissue_correlations_table( This is a migration of the `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in GeneNetwork1.""" + symbol_corr_dict, symbol_p_value_dict = correlations_of_all_tissue_traits( + trait_symbol, probeset_freeze_id, method, conn) raise Exception("Unimplemented!!!") return "" -- cgit v1.2.3 From 5079e5077adafdbfd0b7e7c0ef12431e9aed443d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 26 Oct 2021 09:23:48 +0300 Subject: Stub out `batch_computed_tissue_correlation` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Stub out `batch_computed_tissue_correlation` function to be used in implementing the function down the line. 
--- gn3/db/correlations.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index f43b8a5..54d3079 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -281,6 +281,14 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) return {} +def batch_computed_tissue_correlation( + trait_value: str, symbol_value_dict: dict, + method: str = "pearson") -> Tuple[dict, dict]: + """ + `web.webqtl.correlation.correlationFunction.batchCalTissueCorr`""" + raise Exception("Not implemented!") + return ({}, {}) + def correlations_of_all_tissue_traits( trait_symbol: str, probeset_freeze_id: int, method: str, conn: Any) -> Tuple[dict, dict]: -- cgit v1.2.3 From 84aaf880f32f5293e5e4f1c74a3f284e3c95df2f Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 27 Oct 2021 10:24:28 +0300 Subject: Remove if clauses: replace with dict Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Remove the if clauses to simplify the code flow: use a dictionary of queries and select the appropriate query from the dictionary instead. 
--- gn3/db/species.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/species.py b/gn3/db/species.py index 1e5015f..abcbf64 100644 --- a/gn3/db/species.py +++ b/gn3/db/species.py @@ -47,17 +47,13 @@ def translate_to_mouse_gene_id(species: str, geneid: int, conn: Any) -> int: return geneid with conn.cursor as cursor: - if species == "rat": - cursor.execute( - "SELECT mouse FROM GeneIDXRef WHERE rat = %s", geneid) - rat_geneid = cursor.fetchone() - if rat_geneid: - return rat_geneid[0] - - cursor.execute( - "SELECT mouse FROM GeneIDXRef WHERE human = %s", geneid) - human_geneid = cursor.fetchone() - if human_geneid: - return human_geneid[0] + query = { + "rat": "SELECT mouse FROM GeneIDXRef WHERE rat = %s" + "human": "SELECT mouse FROM GeneIDXRef WHERE human = %s" + } + cursor.execute(query[species], geneid) + translated_gene_id = cursor.fetchone() + if translated_gene_id: + return translated_gene_id[0] return 0 # default if all else fails -- cgit v1.2.3 From 28b0ced4ec13451c5c7323ed5135d126f296836a Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 04:55:30 +0300 Subject: Move the function to computations module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * The function `batch_computed_tissue_correlation` is a pure computations function with no expressions accessing the database, as far as I can tell, therefore, this commit moves the function over to the gn3.computations.partial_correlations module that holds the pure computation functions. 
--- gn3/computations/partial_correlations.py | 8 ++++++++ gn3/db/correlations.py | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'gn3/db') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 1fb0ccc..b3de31c 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -122,3 +122,11 @@ def find_identical_traits( (primary_name,) + control_names), {}).items() if len(item[1]) > 1), tuple())) + +def batch_computed_tissue_correlation( + trait_value: str, symbol_value_dict: dict, + method: str = "pearson") -> Tuple[dict, dict]: + """ + `web.webqtl.correlation.correlationFunction.batchCalTissueCorr`""" + raise Exception("Not implemented!") + return ({}, {}) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 54d3079..f43b8a5 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -281,14 +281,6 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) return {} -def batch_computed_tissue_correlation( - trait_value: str, symbol_value_dict: dict, - method: str = "pearson") -> Tuple[dict, dict]: - """ - `web.webqtl.correlation.correlationFunction.batchCalTissueCorr`""" - raise Exception("Not implemented!") - return ({}, {}) - def correlations_of_all_tissue_traits( trait_symbol: str, probeset_freeze_id: int, method: str, conn: Any) -> Tuple[dict, dict]: -- cgit v1.2.3 From a85db849660a63b09e5c40f7753d861f47eaaaeb Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 06:37:24 +0300 Subject: Add missing comma Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi --- gn3/db/species.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gn3/db') diff --git a/gn3/db/species.py b/gn3/db/species.py index abcbf64..702a9a8 100644 --- a/gn3/db/species.py +++ 
b/gn3/db/species.py @@ -48,7 +48,7 @@ def translate_to_mouse_gene_id(species: str, geneid: int, conn: Any) -> int: with conn.cursor as cursor: query = { - "rat": "SELECT mouse FROM GeneIDXRef WHERE rat = %s" + "rat": "SELECT mouse FROM GeneIDXRef WHERE rat = %s", "human": "SELECT mouse FROM GeneIDXRef WHERE human = %s" } cursor.execute(query[species], geneid) -- cgit v1.2.3 From 5a9db2162a0a694a76a256996bb296ff06c75126 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 06:59:57 +0300 Subject: Move `correlations_of_all_tissue_traits` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/computations/partial_correlations.py: new function (`correlations_of_all_tissue_traits`). * gn3/db/correlations.py: delete function (`correlations_of_all_tissue_traits`). Move the function to `gn3.computations.partial_correlations` module and comment out the db-access code. Rework it to receive, as arguments, the data it previously fetched from the database, and add comments on future rework to get the function working again. --- gn3/computations/partial_correlations.py | 27 +++++++++++++++++++++++++++ gn3/db/correlations.py | 20 -------------------- 2 files changed, 27 insertions(+), 20 deletions(-) (limited to 'gn3/db') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index e73edfd..4ba2ba4 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -157,3 +157,30 @@ def batch_computed_tissue_correlation( `web.webqtl.correlation.correlationFunction.batchCalTissueCorr`""" raise Exception("Not implemented!") return ({}, {}) + +def correlations_of_all_tissue_traits( + primary_trait_symbol_value_dict: dict, symbol_value_dict: dict, + method: str) -> Tuple[dict, dict]: + """ + Computes and returns the correlation of all tissue traits. 
+ + This is a migration of the + `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` + function in GeneNetwork1. + """ + # The section below existed in the original function, but with the migration + # and the proposed rework (in the near future), the values from the database + # should be passed into this function, rather than have the function fetch + # the data for itself. + # --------------------------------------------------- + # primary_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + # (trait_symbol,), probeset_freeze_id, conn) + # primary_trait_values = primary_trait_symbol_value_dict.vlaues()[0] + # symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + # tuple(), probeset_freeze_id, conn) + # --------------------------------------------------- + # We might end up actually getting rid of this function all together as the + # rework is done. + primary_trait_values = primary_trait_symbol_value_dict.values()[0] + return batch_computed_tissue_correlation( + primary_trait_values, symbol_value_dict, method) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index f43b8a5..39ed499 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -281,26 +281,6 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) return {} -def correlations_of_all_tissue_traits( - trait_symbol: str, probeset_freeze_id: int, - method: str, conn: Any) -> Tuple[dict, dict]: - """ - Computes and returns the correlation of all tissue traits. - - This is a migration of the - `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` - function in GeneNetwork1. 
- """ - primary_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( - (trait_symbol,), probeset_freeze_id, conn) - primary_trait_value = primary_trait_symbol_value_dict.vlaues()[0] - symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( - tuple(), probeset_freeze_id, conn) - if method == "1": - return batch_computed_tissue_correlation( - primaryTraitValue,SymbolValueDict,method='spearman') - return batch_computed_tissue_correlation(primaryTraitValue,SymbolValueDict) - def build_temporary_tissue_correlations_table( trait_symbol: str, probeset_freeze_id: int, method: str, return_number: int, conn: Any) -> str: -- cgit v1.2.3 From 773c0896ccbed12170be2b5aed4554ab86d923b5 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 08:00:27 +0300 Subject: Complete `build_temporary_tissue_correlations_table` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/computations/partial_correlations.py: Remove comments after updating usage of the function at call point * gn3/db/correlations.py: Complete the implementation of the `build_temporary_tissue_correlations_table` function --- gn3/computations/partial_correlations.py | 13 ------------ gn3/db/correlations.py | 36 +++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'gn3/db') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index d095185..5777a0b 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -173,19 +173,6 @@ def correlations_of_all_tissue_traits( `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` function in GeneNetwork1. 
""" - # The section below existed in the original function, but with the migration - # and the proposed rework (in the near future), the values from the database - # should be passed into this function, rather than have the function fetch - # the data for itself. - # --------------------------------------------------- - # primary_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( - # (trait_symbol,), probeset_freeze_id, conn) - # primary_trait_values = primary_trait_symbol_value_dict.vlaues()[0] - # symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( - # tuple(), probeset_freeze_id, conn) - # --------------------------------------------------- - # We might end up actually getting rid of this function all together as the - # rework is done. primary_trait_values = primary_trait_symbol_value_dict.values()[0] return batch_computed_tissue_correlation( primary_trait_values, symbol_value_dict, method) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 39ed499..28f050a 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -290,10 +290,40 @@ def build_temporary_tissue_correlations_table( This is a migration of the `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in GeneNetwork1.""" + # We should probably pass the `correlations_of_all_tissue_traits` function + # as an argument to this function and get rid of the two lines immediately + # following this comment. 
+ from gn3.computations.partial_correlations import correlations_of_all_tissue_traits symbol_corr_dict, symbol_p_value_dict = correlations_of_all_tissue_traits( - trait_symbol, probeset_freeze_id, method, conn) - raise Exception("Unimplemented!!!") - return "" + fetch_gene_symbol_tissue_value_dict_for_trait( + (trait_symbol,), probeset_freeze_id, conn), + fetch_gene_symbol_tissue_value_dict_for_trait( + tuple(), probeset_freeze_id, conn), + method) + + symbol_corr_list = sorted( + symbol_corr_dict.items(), + key=compare_tissue_correlation_absolute_values) + + temp_table_name = f"TOPTISSUE{random_string(8)}" + create_query = ( + "CREATE TEMPORARY TABLE {temp_table_name}" + "(Symbol varchar(100) PRIMARY KEY, Correlation float, PValue float)") + insert_query = ( + f"INSERT INTO {temp_table_name}(Symbol, Correlation, PValue) " + " VALUES (%(symbol)s, %(correlation)s, %(pvalue)s)") + + with conn.cursor() as cursor: + cursor.execute(create_query) + cursor.execute( + insert_query, + tuple({ + "symbol": symbol, + "correlation": corr, + "pvalue": symbol_p_value_dict[symbol] + } for symbol, corr in symbol_corr_list[0: 2 * return_number])) + + return temp_table_name def fetch_tissue_correlations( dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str, -- cgit v1.2.3 From 307a83b897b9ece7c9dd1af49bdedc9e1320eb61 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 08:25:13 +0300 Subject: Rework sorting: remove `compare_tissue_correlation_absolute_values` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/db/correlations.py: Remove the `compare_tissue_correlation_absolute_values` function which is no longer needed. 
--- gn3/db/correlations.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 28f050a..d7954e5 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -140,22 +140,6 @@ def fetch_literature_correlations( cursor.execute("DROP TEMPORARY TABLE %s", temp_table) return dict(results) -def compare_tissue_correlation_absolute_values(val1, val2): - """ - Comparison function for use when sorting tissue correlation values. - - This is a partial migration of the - `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in - GeneNetwork1.""" - try: - if abs(val1) < abs(val2): - return 1 - if abs(val1) == abs(val2): - return 0 - return -1 - except TypeError: - return 0 - def fetch_symbol_value_pair_dict( symbol_list: Tuple[str, ...], data_id_dict: dict, conn: Any) -> Dict[str, Tuple[float, ...]]: @@ -302,8 +286,7 @@ def build_temporary_tissue_correlations_table( method) symbol_corr_list = sorted( - symbol_corr_dict.items(), - key=compare_tissue_correlation_absolute_values) + symbol_corr_dict.items(), key=lambda key_val: key_val[1]) temp_table_name = f"TOPTISSUE{random_string(8)}" create_query = ( -- cgit v1.2.3 From 9ceb958273b8d86d220fa0d2f040fcb4a8233586 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 08:28:19 +0300 Subject: Fix linting and typing errors Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi --- gn3/computations/partial_correlations.py | 2 +- gn3/db/correlations.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'gn3/db') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 5777a0b..fce6ad2 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -173,6 +173,6 @@ def correlations_of_all_tissue_traits( 
`web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` function in GeneNetwork1. """ - primary_trait_values = primary_trait_symbol_value_dict.values()[0] + primary_trait_values = tuple(primary_trait_symbol_value_dict.values())[0] return batch_computed_tissue_correlation( primary_trait_values, symbol_value_dict, method) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index d7954e5..d94759a 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -10,6 +10,8 @@ from gn3.random import random_string from gn3.data_helpers import partition_all from gn3.db.species import translate_to_mouse_gene_id +from gn3.computations.partial_correlations import correlations_of_all_tissue_traits + def get_filename(target_db_name: str, conn: Any) -> str: """ Retrieve the name of the reference database file with which correlations are @@ -275,9 +277,8 @@ def build_temporary_tissue_correlations_table( `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in GeneNetwork1.""" # We should probably pass the `correlations_of_all_tissue_traits` function - # as an argument to this function and get rid of the two lines immediately + # as an argument to this function and get rid of the one call immediately # following this comment. 
- from gn3.computations.partial_correlations import correlations_of_all_tissue_traits symbol_corr_dict, symbol_p_value_dict = correlations_of_all_tissue_traits( fetch_gene_symbol_tissue_value_dict_for_trait( (trait_symbol,), probeset_freeze_id, conn), @@ -308,7 +309,7 @@ def build_temporary_tissue_correlations_table( return temp_table_name -def fetch_tissue_correlations( +def fetch_tissue_correlations(# pylint: disable=R0913 dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str, return_number: int, conn: Any) -> dict: """ -- cgit v1.2.3 From 4a6be7e1b6514f3c7db8c672970b27e27ecde305 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 1 Nov 2021 06:01:58 +0300 Subject: Add some condition checking functions Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Add the `check_for_literature_info` and `check_symbol_for_tissue_correlation` functions to check for the presence of specific data. --- gn3/db/correlations.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index d94759a..06b3310 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -339,3 +339,43 @@ def fetch_tissue_correlations(# pylint: disable=R0913 return { trait_name: (tiss_corr, tiss_p_val) for trait_name, tiss_corr, tiss_p_val in results} + +def check_for_literature_info(conn: Any, geneid: int) -> bool: + """ + Checks the database to find out whether the trait with `geneid` has any + associated literature. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.checkForLitInfo` function in + GeneNetwork1. 
+ """ + query = "SELECT 1 FROM LCorrRamin3 WHERE GeneId1=%s LIMIT 1" + with conn.cursor() as cursor: + cursor.execute(query, geneid) + result = cursor.fetchone() + if result: + return True + + return False + +def check_symbol_for_tissue_correlation( + conn: Any, tissue_probeset_freeze_id: int, symbol: str = "") -> bool: + """ + Checks whether a symbol has any associated tissue correlations. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.checkSymbolForTissueCorr` function + in GeneNetwork1. + """ + query = ( + "SELECT 1 FROM TissueProbeSetXRef " + "WHERE TissueProbeSetFreezeId=%(probeset_freeze_id)s " + "AND Symbol=%(symbol)s LIMIT 1") + with conn.cursor() as cursor: + cursor.execute( + query, probeset_freeze_id=tissue_probeset_freeze_id, symbol=symbol) + result = cursor.fetchone() + if result: + return True + + return False -- cgit v1.2.3 From 854416ba850b7793aa9aa95528b89bc69df26888 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 22 Nov 2021 09:06:52 +0300 Subject: Make the DB connection argument the first Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * To make the code more composable down the line, make the database connection argument the first argument for functions that access the database, since they will always require the connection. 
--- gn3/db/correlations.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 06b3310..f327dc3 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -34,7 +34,7 @@ def get_filename(target_db_name: str, conn: Any) -> str: return "" def build_temporary_literature_table( - species: str, gene_id: int, return_number: int, conn: Any) -> str: + conn: Any, species: str, gene_id: int, return_number: int) -> str: """ Build and populate a temporary table to hold the literature correlation data to be used in computations. @@ -128,7 +128,7 @@ def fetch_literature_correlations( GeneNetwork1. """ temp_table = build_temporary_literature_table( - species, gene_id, return_number, conn) + conn, species, gene_id, return_number) query_fns = { "Geno": fetch_geno_literature_correlations, # "Temp": fetch_temp_literature_correlations, @@ -268,8 +268,8 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return {} def build_temporary_tissue_correlations_table( - trait_symbol: str, probeset_freeze_id: int, method: str, - return_number: int, conn: Any) -> str: + conn: Any, trait_symbol: str, probeset_freeze_id: int, method: str, + return_number: int) -> str: """ Build a temporary table to hold the tissue correlations data. @@ -320,7 +320,7 @@ def fetch_tissue_correlations(# pylint: disable=R0913 GeneNetwork1. 
""" temp_table = build_temporary_tissue_correlations_table( - trait_symbol, probeset_freeze_id, method, return_number, conn) + conn, trait_symbol, probeset_freeze_id, method, return_number) with conn.cursor() as cursor: cursor.execute( ( -- cgit v1.2.3 From 55d698b1fb07afe74bf1dd570f9f495aefea1086 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 22 Nov 2021 12:03:21 +0300 Subject: Migrate `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate the `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function from GN1 to GN3. --- gn3/db/correlations.py | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index f327dc3..ff570b4 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -379,3 +379,150 @@ def check_symbol_for_tissue_correlation( return True return False + +def fetch_sample_ids( + conn: Any, sample_names: Tuple[str, ...], species_name: str) -> Tuple[ + int, ...]: + """ + Given a sequence of sample names, and a species name, return the sample ids + that correspond to both. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in + GeneNetwork1. 
+ """ + query = ( + "SELECT Strain.Id FROM Strain, Species " + "WHERE Strain.Name IN %(samples_names)s " + "AND Strain.SpeciesId=Species.Id " + "AND Species.name=%(species_name)s") + with conn.cursor() as cursor: + cursor.execute( + query, samples_names=tuple(samples), + species_name=species) + return cursor.fetchall() + +def fetch_all_database_data( + conn: Any, species: str, gene_id: int, gene_symbol: str, + samples: Tuple[str, ...], db_type: str, db_name: str, method: str, + returnNumber: int, tissueProbeSetFreezeId: int) -> Tuple[Any, Any]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in + GeneNetwork1. + """ + def __build_query_sgo_lit__(temp_table, sample_id_columns, joins): + return ( + (f"SELECT {db_type}.Name, {temp_table}.value " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + + " ".join(joins) + + f" WHERE ProbeSet.GeneId IS NOT NULL " + + f"AND {temp_table}.value IS NOT NULL " + + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + + f"ORDER BY {db_type}.Id"), + 2) + + def __build_query_tissue_corr__(temp_table, sample_id_columns, joins): + return ( + (f"SELECT {db_type}.Name, {temp_table}.Correlation, " + + f"{temp_table}.PValue, " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + + " ".join(joins) + + f" WHERE ProbeSet.Symbol IS NOT NULL " + + f"AND {temp_table}.Correlation IS NOT NULL " + + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.%sId " + f"ORDER BY {db_type}.Id"), + 3) + + def __build_query__(sample_ids, temp_table): + sample_id_columns = ", ".join(f"T{smpl}.value" for 
smpl in samples_ids) + if db_type == "Publish": + joins = tuple( + ("LEFT JOIN PublishData AS T{item} " + "ON T{item}.Id = PublishXRef.DataId " + "AND T{item}.StrainId = %(T{item}_sample_id)s") + for item in sample_ids) + return ( + ("SELECT PublishXRef.Id, " + + sample_id_columns + + "FROM (PublishXRef, PublishFreeze) " + + " ".join(joins) + + " WHERE PublishXRef.InbredSetId = PublishFreeze.InbredSetId " + "AND PublishFreeze.Name = %(db_name)s"), + 1) + if temp_table is not None: + joins = tuple( + ("LEFT JOIN {db_type}Data AS T{item} " + "ON T{item}.Id = {db_type}XRef.DataId " + "AND T{item}.StrainId=%(T{item}_sample_id)s") + for item in sample_ids) + if method.lower() == "sgo literature correlation": + return __build_query_sgo_lit__( + sample_ids, temp_table, sample_id_columns) + if method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + return __build_query_tissue_corr__( + sample_ids, temp_table, sample_id_columns) + joins = tuple( + (f"LEFT JOIN {db_type}Data AS T{item} " + f"ON T{item}.Id = {db_type}XRef.DataId " + f"AND T{item}.StrainId = %(T{item}_sample_id)s") + for item in sample_ids) + return ( + ( + f"SELECT {db_type}.Name, " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + " ".join(joins) + + f" WHERE {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + + f"ORDER BY {db_type}.Id"), + 1) + + def __fetch_data__(sample_ids, temp_table): + query, data_start_pos = __build_query__(sample_ids, temp_table) + with conn.cursor() as cursor: + cursor.execute( + query, db_name=db_name, + **{f"T{item}_sample_id": item for item in sample_ids}) + return cursor.fetchall() + + sample_ids = tuple( + # look into graduating this to an argument and removing the `samples` + # and `species` argument: function currying and compositions might help + # with this + f"{sample_id}" for sample_id in + 
fetch_sample_ids(conn, samples, species)) + + temp_table = None + if gene_id and db_type == "probeset": + if method.lower() == "sgo literature correlation": + temp_table = build_temporary_literature_table( + conn, species, gene_id, return_number) + if method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + temp_table = build_temporary_tissue_correlations_table( + conn, trait_symbol, probeset_freeze_id, method, return_number) + + trait_database = tuple( + item for sublist in + (__fetch_data__(ssample_ids, temp_table) + for ssample_ids in partition_all(25, sample_ids)) + for item in sublist) + + if temp_table: + with conn.cursor() as cursor: + cursor.execute(f"DROP TEMPORARY TABLE {temp_table}") + + return trait_database, data_start_pos -- cgit v1.2.3 From 575da0baf4468d27782c73b19995b3adb934ba70 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 22 Nov 2021 13:56:03 +0300 Subject: Add test to query builders Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Add some tests for the query builders to ensure that the queries are built up correctly. --- gn3/db/correlations.py | 78 +++++++++++++++++---------------- tests/unit/db/test_correlation.py | 90 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 36 deletions(-) create mode 100644 tests/unit/db/test_correlation.py (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index ff570b4..7daff87 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -402,6 +402,43 @@ def fetch_sample_ids( species_name=species) return cursor.fetchall() +def build_query_sgo_lit_corr( + db_type: str, temp_table: str, sample_id_columns: str, + joins: Tuple[str, ...]) -> str: + """ + Build query for `SGO Literature Correlation` data, when querying the given + `temp_table` temporary table. 
+ """ + return ( + (f"SELECT {db_type}.Name, {temp_table}.value, " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + + " ".join(joins) + + f" WHERE ProbeSet.GeneId IS NOT NULL " + + f"AND {temp_table}.value IS NOT NULL " + + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + + f"ORDER BY {db_type}.Id"), + 2) + +def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): + return ( + (f"SELECT {db_type}.Name, {temp_table}.Correlation, " + + f"{temp_table}.PValue, " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + + " ".join(joins) + + f" WHERE ProbeSet.Symbol IS NOT NULL " + + f"AND {temp_table}.Correlation IS NOT NULL " + + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + f"ORDER BY {db_type}.Id"), + 3) + def fetch_all_database_data( conn: Any, species: str, gene_id: int, gene_symbol: str, samples: Tuple[str, ...], db_type: str, db_name: str, method: str, @@ -411,37 +448,6 @@ def fetch_all_database_data( `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in GeneNetwork1. 
""" - def __build_query_sgo_lit__(temp_table, sample_id_columns, joins): - return ( - (f"SELECT {db_type}.Name, {temp_table}.value " + - sample_id_columns + - f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + - f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + - " ".join(joins) + - f" WHERE ProbeSet.GeneId IS NOT NULL " + - f"AND {temp_table}.value IS NOT NULL " + - f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + - f"AND {db_type}Freeze.Name = %(db_name)s " + - f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + - f"ORDER BY {db_type}.Id"), - 2) - - def __build_query_tissue_corr__(temp_table, sample_id_columns, joins): - return ( - (f"SELECT {db_type}.Name, {temp_table}.Correlation, " + - f"{temp_table}.PValue, " + - sample_id_columns + - f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + - f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + - " ".join(joins) + - f" WHERE ProbeSet.Symbol IS NOT NULL " + - f"AND {temp_table}.Correlation IS NOT NULL " + - f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + - f"AND {db_type}Freeze.Name = %(db_name)s " + - f"AND {db_type}.Id = {db_type}XRef.%sId " - f"ORDER BY {db_type}.Id"), - 3) - def __build_query__(sample_ids, temp_table): sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in samples_ids) if db_type == "Publish": @@ -460,17 +466,17 @@ def fetch_all_database_data( 1) if temp_table is not None: joins = tuple( - ("LEFT JOIN {db_type}Data AS T{item} " - "ON T{item}.Id = {db_type}XRef.DataId " - "AND T{item}.StrainId=%(T{item}_sample_id)s") + (f"LEFT JOIN {db_type}Data AS T{item} " + f"ON T{item}.Id = {db_type}XRef.DataId " + f"AND T{item}.StrainId=%(T{item}_sample_id)s") for item in sample_ids) if method.lower() == "sgo literature correlation": - return __build_query_sgo_lit__( + return build_query_sgo_lit_corr( sample_ids, temp_table, sample_id_columns) if method.lower() in ( "tissue correlation, pearson's r", "tissue correlation, spearman's 
rho"): - return __build_query_tissue_corr__( + return build_query_tissue_corr( sample_ids, temp_table, sample_id_columns) joins = tuple( (f"LEFT JOIN {db_type}Data AS T{item} " diff --git a/tests/unit/db/test_correlation.py b/tests/unit/db/test_correlation.py new file mode 100644 index 0000000..866d28d --- /dev/null +++ b/tests/unit/db/test_correlation.py @@ -0,0 +1,90 @@ +""" +Tests for the gn3.db.correlations module +""" + +from unittest import TestCase + +from gn3.db.correlations import ( + build_query_sgo_lit_corr, + build_query_tissue_corr) + +class TestCorrelation(TestCase): + """Test cases for correlation data fetching functions""" + maxDiff = None + + def test_build_query_sgo_lit_corr(self): + self.assertEqual( + build_query_sgo_lit_corr( + "Probeset", + "temp_table_xy45i7wd", + "T1.value, T2.value, T3.value", + (("LEFT JOIN ProbesetData AS T1 " + "ON T1.Id = ProbesetXRef.DataId " + "AND T1.StrainId=%(T1_sample_id)s"), + ( + "LEFT JOIN ProbesetData AS T2 " + "ON T2.Id = ProbesetXRef.DataId " + "AND T2.StrainId=%(T2_sample_id)s"), + ( + "LEFT JOIN ProbesetData AS T3 " + "ON T3.Id = ProbesetXRef.DataId " + "AND T3.StrainId=%(T3_sample_id)s"))), + (("SELECT Probeset.Name, temp_table_xy45i7wd.value, " + "T1.value, T2.value, T3.value " + "FROM (Probeset, ProbesetXRef, ProbesetFreeze) " + "LEFT JOIN temp_table_xy45i7wd ON temp_table_xy45i7wd.GeneId2=ProbeSet.GeneId " + "LEFT JOIN ProbesetData AS T1 " + "ON T1.Id = ProbesetXRef.DataId " + "AND T1.StrainId=%(T1_sample_id)s " + "LEFT JOIN ProbesetData AS T2 " + "ON T2.Id = ProbesetXRef.DataId " + "AND T2.StrainId=%(T2_sample_id)s " + "LEFT JOIN ProbesetData AS T3 " + "ON T3.Id = ProbesetXRef.DataId " + "AND T3.StrainId=%(T3_sample_id)s " + "WHERE ProbeSet.GeneId IS NOT NULL " + "AND temp_table_xy45i7wd.value IS NOT NULL " + "AND ProbesetXRef.ProbesetFreezeId = ProbesetFreeze.Id " + "AND ProbesetFreeze.Name = %(db_name)s " + "AND Probeset.Id = ProbesetXRef.ProbesetId " + "ORDER BY Probeset.Id"), + 2)) + + def 
test_build_query_tissue_corr(self): + self.assertEqual( + build_query_tissue_corr( + "Probeset", + "temp_table_xy45i7wd", + "T1.value, T2.value, T3.value", + (("LEFT JOIN ProbesetData AS T1 " + "ON T1.Id = ProbesetXRef.DataId " + "AND T1.StrainId=%(T1_sample_id)s"), + ( + "LEFT JOIN ProbesetData AS T2 " + "ON T2.Id = ProbesetXRef.DataId " + "AND T2.StrainId=%(T2_sample_id)s"), + ( + "LEFT JOIN ProbesetData AS T3 " + "ON T3.Id = ProbesetXRef.DataId " + "AND T3.StrainId=%(T3_sample_id)s"))), + (("SELECT Probeset.Name, temp_table_xy45i7wd.Correlation, " + "temp_table_xy45i7wd.PValue, " + "T1.value, T2.value, T3.value " + "FROM (Probeset, ProbesetXRef, ProbesetFreeze) " + "LEFT JOIN temp_table_xy45i7wd ON temp_table_xy45i7wd.Symbol=ProbeSet.Symbol " + "LEFT JOIN ProbesetData AS T1 " + "ON T1.Id = ProbesetXRef.DataId " + "AND T1.StrainId=%(T1_sample_id)s " + "LEFT JOIN ProbesetData AS T2 " + "ON T2.Id = ProbesetXRef.DataId " + "AND T2.StrainId=%(T2_sample_id)s " + "LEFT JOIN ProbesetData AS T3 " + "ON T3.Id = ProbesetXRef.DataId " + "AND T3.StrainId=%(T3_sample_id)s " + "WHERE ProbeSet.Symbol IS NOT NULL " + "AND temp_table_xy45i7wd.Correlation IS NOT NULL " + "AND ProbesetXRef.ProbesetFreezeId = ProbesetFreeze.Id " + "AND ProbesetFreeze.Name = %(db_name)s " + "AND Probeset.Id = ProbesetXRef.ProbesetId " + "ORDER BY Probeset.Id"), + 3)) -- cgit v1.2.3 From e1fb18b9d4a3b4ab9783f58d78ff384141567a42 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 23 Nov 2021 11:49:32 +0300 Subject: Update documentation for functions Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Document functions for posterity. 
--- gn3/db/correlations.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 7daff87..5c3e7b8 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -408,6 +408,10 @@ def build_query_sgo_lit_corr( """ Build query for `SGO Literature Correlation` data, when querying the given `temp_table` temporary table. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in + GeneNetwork1. """ return ( (f"SELECT {db_type}.Name, {temp_table}.value, " + @@ -424,6 +428,14 @@ def build_query_sgo_lit_corr( 2) def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): + """ + Build query for `Tissue Correlation` data, when querying the given + `temp_table` temporary table. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in + GeneNetwork1. + """ return ( (f"SELECT {db_type}.Name, {temp_table}.Correlation, " + f"{temp_table}.PValue, " + -- cgit v1.2.3 From df4ed9183f3efd89d54bba1a144c48475f4b8169 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 23 Nov 2021 12:34:44 +0300 Subject: Fix a myriad of linting errors * Fix linting errors like: - Unused variables - Undeclared variable errors (mostly caused by typos, and wrong names) - Missing documentation strings for functions etc. 
--- gn3/computations/partial_correlations.py | 4 +++- gn3/db/correlations.py | 24 ++++++++++++------------ tests/unit/db/test_correlation.py | 6 ++++++ 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'gn3/db') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 4bd26a2..f43c4d4 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -200,11 +200,13 @@ def good_dataset_samples_indexes( samples_from_file.index(good) for good in set(samples).intersection(set(samples_from_file)))) -def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914] +def partial_correlations_fast(# pylint: disable=[R0913, R0914] samples, primary_vals, control_vals, database_filename, fetched_correlations, method: str, correlation_type: str) -> Tuple[ float, Tuple[float, ...]]: """ + Computes partial correlation coefficients using data from a CSV file. + This is a partial migration of the `web.webqtl.correlation.PartialCorrDBPage.getPartialCorrelationsFast` function in GeneNetwork1. 
diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 5c3e7b8..a1daa3c 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -398,8 +398,8 @@ def fetch_sample_ids( "AND Species.name=%(species_name)s") with conn.cursor() as cursor: cursor.execute( - query, samples_names=tuple(samples), - species_name=species) + query, samples_names=tuple(sample_names), + species_name=species_name) return cursor.fetchall() def build_query_sgo_lit_corr( @@ -419,7 +419,7 @@ def build_query_sgo_lit_corr( f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + " ".join(joins) + - f" WHERE ProbeSet.GeneId IS NOT NULL " + + " WHERE ProbeSet.GeneId IS NOT NULL " + f"AND {temp_table}.value IS NOT NULL " + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + f"AND {db_type}Freeze.Name = %(db_name)s " + @@ -443,7 +443,7 @@ def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + " ".join(joins) + - f" WHERE ProbeSet.Symbol IS NOT NULL " + + " WHERE ProbeSet.Symbol IS NOT NULL " + f"AND {temp_table}.Correlation IS NOT NULL " + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + f"AND {db_type}Freeze.Name = %(db_name)s " + @@ -451,17 +451,17 @@ def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): f"ORDER BY {db_type}.Id"), 3) -def fetch_all_database_data( - conn: Any, species: str, gene_id: int, gene_symbol: str, +def fetch_all_database_data(# pylint: disable=[R0913, R0914] + conn: Any, species: str, gene_id: int, trait_symbol: str, samples: Tuple[str, ...], db_type: str, db_name: str, method: str, - returnNumber: int, tissueProbeSetFreezeId: int) -> Tuple[Any, Any]: + return_number: int, probeset_freeze_id: int) -> Tuple[Any, Any]: """ This is a migration of the 
`web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in GeneNetwork1. """ def __build_query__(sample_ids, temp_table): - sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in samples_ids) + sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in sample_ids) if db_type == "Publish": joins = tuple( ("LEFT JOIN PublishData AS T{item} " @@ -484,12 +484,12 @@ def fetch_all_database_data( for item in sample_ids) if method.lower() == "sgo literature correlation": return build_query_sgo_lit_corr( - sample_ids, temp_table, sample_id_columns) + sample_ids, temp_table, sample_id_columns, joins) if method.lower() in ( "tissue correlation, pearson's r", "tissue correlation, spearman's rho"): return build_query_tissue_corr( - sample_ids, temp_table, sample_id_columns) + sample_ids, temp_table, sample_id_columns, joins) joins = tuple( (f"LEFT JOIN {db_type}Data AS T{item} " f"ON T{item}.Id = {db_type}XRef.DataId " @@ -513,7 +513,7 @@ def fetch_all_database_data( cursor.execute( query, db_name=db_name, **{f"T{item}_sample_id": item for item in sample_ids}) - return cursor.fetchall() + return (cursor.fetchall(), data_start_pos) sample_ids = tuple( # look into graduating this to an argument and removing the `samples` @@ -543,4 +543,4 @@ def fetch_all_database_data( with conn.cursor() as cursor: cursor.execute(f"DROP TEMPORARY TABLE {temp_table}") - return trait_database, data_start_pos + return (tuple(item[0] for item in trait_database), trait_database[0][1]) diff --git a/tests/unit/db/test_correlation.py b/tests/unit/db/test_correlation.py index 866d28d..3f940b2 100644 --- a/tests/unit/db/test_correlation.py +++ b/tests/unit/db/test_correlation.py @@ -13,6 +13,9 @@ class TestCorrelation(TestCase): maxDiff = None def test_build_query_sgo_lit_corr(self): + """ + Test that the literature correlation query is built correctly. 
+ """ self.assertEqual( build_query_sgo_lit_corr( "Probeset", @@ -51,6 +54,9 @@ class TestCorrelation(TestCase): 2)) def test_build_query_tissue_corr(self): + """ + Test that the tissue correlation query is built correctly. + """ self.assertEqual( build_query_tissue_corr( "Probeset", -- cgit v1.2.3 From a6d61f58dc07ba307698c90befab28bcaf691966 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 23 Nov 2021 17:32:41 +0300 Subject: db: traits: Remove "\n\n" when generating csv file In excel, "\n\n" is replaced with ",,,," during upload. --- gn3/db/traits.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gn3/db') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 1c6aaa7..56258e2 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -103,7 +103,7 @@ def get_trait_csv_sample_data(conn: Any, ",".join([str(val) if val else "x" for val in (strain_id, strain_name, value, error, count)])) - return f"# Publish Data Id: {publishdata_id}\n\n" + "\n".join(csv_data) + return f"# Publish Data Id: {publishdata_id}\n" + "\n".join(csv_data) def update_sample_data(conn: Any, -- cgit v1.2.3 From 6675388cfee2fb85b78f9310dd72f2c95c8ea41b Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 24 Nov 2021 11:32:49 +0300 Subject: db: traits: Remove trailing ".0" in int values --- gn3/db/traits.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'gn3/db') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 56258e2..ebb7e3c 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -79,6 +79,11 @@ def export_trait_data( def get_trait_csv_sample_data(conn: Any, trait_name: int, phenotype_id: int): """Fetch a trait and return it as a csv string""" + + def __float_strip(n): + if str(n)[-2:] == ".0": + return str(int(n)) + return str(n) sql = ("SELECT DISTINCT Strain.Id, PublishData.Id, Strain.Name, " "PublishData.value, " "PublishSE.error, NStrain.count FROM " @@ -100,7 +105,7 @@ def get_trait_csv_sample_data(conn: Any, (strain_id, publishdata_id, 
strain_name, value, error, count) = record csv_data.append( - ",".join([str(val) if val else "x" + ",".join([__float_strip(val) if val else "x" for val in (strain_id, strain_name, value, error, count)])) return f"# Publish Data Id: {publishdata_id}\n" + "\n".join(csv_data) -- cgit v1.2.3 From a1516993c7f6dc608f75ba42cb27b983e0c5c330 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Thu, 25 Nov 2021 20:52:14 +0300 Subject: db: traits: Support additions and deletions from csv file --- gn3/db/traits.py | 238 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 193 insertions(+), 45 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index ebb7e3c..75de4f4 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,5 +1,6 @@ """This class contains functions relating to trait data manipulation""" import os +import MySQLdb from functools import reduce from typing import Any, Dict, Union, Sequence @@ -76,16 +77,15 @@ def export_trait_data( return reduce(__exporter, samplelist, tuple()) + def get_trait_csv_sample_data(conn: Any, trait_name: int, phenotype_id: int): """Fetch a trait and return it as a csv string""" - def __float_strip(n): if str(n)[-2:] == ".0": return str(int(n)) return str(n) - sql = ("SELECT DISTINCT Strain.Id, PublishData.Id, Strain.Name, " - "PublishData.value, " + sql = ("SELECT DISTINCT Strain.Name, PublishData.value, " "PublishSE.error, NStrain.count FROM " "(PublishData, Strain, PublishXRef, PublishFreeze) " "LEFT JOIN PublishSE ON " @@ -97,65 +97,188 @@ def get_trait_csv_sample_data(conn: Any, "PublishData.Id = PublishXRef.DataId AND " "PublishXRef.Id = %s AND PublishXRef.PhenotypeId = %s " "AND PublishData.StrainId = Strain.Id Order BY Strain.Name") - csv_data = ["Strain Id,Strain Name,Value,SE,Count"] - publishdata_id = "" + csv_data = ["Strain Name,Value,SE,Count"] with conn.cursor() as cursor: cursor.execute(sql, (trait_name, phenotype_id,)) for record in cursor.fetchall(): - (strain_id, 
publishdata_id, - strain_name, value, error, count) = record + (strain_name, value, error, count) = record csv_data.append( ",".join([__float_strip(val) if val else "x" - for val in (strain_id, strain_name, - value, error, count)])) - return f"# Publish Data Id: {publishdata_id}\n" + "\n".join(csv_data) + for val in (strain_name, value, error, count)])) + return "\n".join(csv_data) def update_sample_data(conn: Any, + trait_name: str, strain_name: str, - strain_id: int, - publish_data_id: int, + phenotype_id: int, value: Union[int, float, str], error: Union[int, float, str], count: Union[int, str]): """Given the right parameters, update sample-data from the relevant table.""" - # pylint: disable=[R0913, R0914, C0103] - STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" - PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " - "WHERE StrainId = %s AND Id = %s") - PUBLISH_SE_SQL: str = ("UPDATE PublishSE SET error = %s " - "WHERE StrainId = %s AND DataId = %s") - N_STRAIN_SQL: str = ("UPDATE NStrain SET count = %s " - "WHERE StrainId = %s AND DataId = %s") - - updated_strains: int = 0 + strain_id, data_id = "", "" + + with conn.cursor() as cursor: + cursor.execute( + ("SELECT Strain.Id, PublishData.Id FROM " + "(PublishData, Strain, PublishXRef, PublishFreeze) " + "LEFT JOIN PublishSE ON " + "(PublishSE.DataId = PublishData.Id AND " + "PublishSE.StrainId = PublishData.StrainId) " + "LEFT JOIN NStrain ON " + "(NStrain.DataId = PublishData.Id AND " + "NStrain.StrainId = PublishData.StrainId) " + "WHERE PublishXRef.InbredSetId = " + "PublishFreeze.InbredSetId AND " + "PublishData.Id = PublishXRef.DataId AND " + "PublishXRef.Id = %s AND " + "PublishXRef.PhenotypeId = %s " + "AND PublishData.StrainId = Strain.Id " + "AND Strain.Name = \"%s\"") % (trait_name, + phenotype_id, + str(strain_name))) + strain_id, data_id = cursor.fetchone() updated_published_data: int = 0 updated_se_data: int = 0 updated_n_strains: int = 0 with conn.cursor() as cursor: - # 
Update the Strains table - cursor.execute(STRAIN_ID_SQL, (strain_name, strain_id)) - updated_strains = cursor.rowcount # Update the PublishData table - cursor.execute(PUBLISH_DATA_SQL, + cursor.execute(("UPDATE PublishData SET value = %s " + "WHERE StrainId = %s AND Id = %s"), (None if value == "x" else value, - strain_id, publish_data_id)) + strain_id, data_id)) updated_published_data = cursor.rowcount + # Update the PublishSE table - cursor.execute(PUBLISH_SE_SQL, + cursor.execute(("UPDATE PublishSE SET error = %s " + "WHERE StrainId = %s AND DataId = %s"), (None if error == "x" else error, - strain_id, publish_data_id)) + strain_id, data_id)) updated_se_data = cursor.rowcount + # Update the NStrain table - cursor.execute(N_STRAIN_SQL, + cursor.execute(("UPDATE NStrain SET count = %s " + "WHERE StrainId = %s AND DataId = %s"), (None if count == "x" else count, - strain_id, publish_data_id)) + strain_id, data_id)) updated_n_strains = cursor.rowcount - return (updated_strains, updated_published_data, + return (updated_published_data, updated_se_data, updated_n_strains) + +def delete_sample_data(conn: Any, + trait_name: str, + strain_name: str, + phenotype_id: int): + """Given the right parameters, delete sample-data from the relevant + table.""" + strain_id, data_id = "", "" + + deleted_published_data: int = 0 + deleted_se_data: int = 0 + deleted_n_strains: int = 0 + + with conn.cursor() as cursor: + # Delete the PublishData table + try: + cursor.execute( + ("SELECT Strain.Id, PublishData.Id FROM " + "(PublishData, Strain, PublishXRef, PublishFreeze) " + "LEFT JOIN PublishSE ON " + "(PublishSE.DataId = PublishData.Id AND " + "PublishSE.StrainId = PublishData.StrainId) " + "LEFT JOIN NStrain ON " + "(NStrain.DataId = PublishData.Id AND " + "NStrain.StrainId = PublishData.StrainId) " + "WHERE PublishXRef.InbredSetId = " + "PublishFreeze.InbredSetId AND " + "PublishData.Id = PublishXRef.DataId AND " + "PublishXRef.Id = %s AND " + "PublishXRef.PhenotypeId = %s " + "AND 
PublishData.StrainId = Strain.Id " + "AND Strain.Name = \"%s\"") % (trait_name, + phenotype_id, + str(strain_name))) + strain_id, data_id = cursor.fetchone() + + cursor.execute(("DELETE FROM PublishData " + "WHERE StrainId = %s AND Id = %s") + % (strain_id, data_id)) + deleted_published_data = cursor.rowcount + + # Delete the PublishSE table + cursor.execute(("DELETE FROM PublishSE " + "WHERE StrainId = %s AND DataId = %s") % + (strain_id, data_id)) + deleted_se_data = cursor.rowcount + + # Delete the NStrain table + cursor.execute(("DELETE FROM NStrain " + "WHERE StrainId = %s AND DataId = %s" % + (strain_id, data_id))) + deleted_n_strains = cursor.rowcount + except Exception as e: + conn.rollback() + raise MySQLdb.Error + conn.commit() + cursor.close() + cursor.close() + + return (deleted_published_data, + deleted_se_data, deleted_n_strains) + + +def insert_sample_data(conn: Any, + trait_name: str, + strain_name: str, + phenotype_id: int, + value: Union[int, float, str], + error: Union[int, float, str], + count: Union[int, str]): + """Given the right parameters, insert sample-data to the relevant table. 
+ + """ + + inserted_published_data, inserted_se_data, inserted_n_strains = 0, 0, 0 + with conn.cursor() as cursor: + try: + cursor.execute("SELECT DataId FROM PublishXRef WHERE Id = %s AND " + "PhenotypeId = %s", (trait_name, phenotype_id)) + data_id = cursor.fetchone() + + cursor.execute("SELECT Id FROM Strain WHERE Name = %s", + (strain_name,)) + strain_id = cursor.fetchone() + + # Insert the PublishData table + cursor.execute(("INSERT INTO PublishData (Id, StrainId, value)" + "VALUES (%s, %s, %s)"), + (data_id, strain_id, value)) + inserted_published_data = cursor.rowcount + + # Insert into the PublishSE table if error is specified + if error and error != "x": + cursor.execute(("INSERT INTO PublishSE (StrainId, DataId, " + " error) VALUES (%s, %s, %s)") % + (strain_id, data_id, error)) + inserted_se_data = cursor.rowcount + + # Insert into the NStrain table + if count and count != "x": + cursor.execute(("INSERT INTO NStrain " + "(StrainId, DataId, error) " + "VALUES (%s, %s, %s)") % + (strain_id, data_id, count)) + inserted_n_strains = cursor.rowcount + except Exception as e: + conn.rollback() + raise MySQLdb.Error + return (inserted_published_data, + inserted_se_data, inserted_n_strains) + + def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Publish` traits. @@ -195,11 +318,12 @@ def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): cursor.execute( query, { - k:v for k, v in trait_data_source.items() + k: v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_id"] }) return dict(zip([k.lower() for k in keys], cursor.fetchone())) + def set_confidential_field(trait_type, trait_info): """Post processing function for 'Publish' trait types. 
@@ -212,6 +336,7 @@ def set_confidential_field(trait_type, trait_info): and not trait_info.get("pubmed_id", None)) else 0} return trait_info + def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `ProbeSet` traits. @@ -239,11 +364,12 @@ def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): cursor.execute( query, { - k:v for k, v in trait_data_source.items() + k: v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_name"] }) return dict(zip(keys, cursor.fetchone())) + def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Geno` traits. @@ -263,11 +389,12 @@ def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): cursor.execute( query, { - k:v for k, v in trait_data_source.items() + k: v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_name"] }) return dict(zip(keys, cursor.fetchone())) + def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Temp` traits. @@ -280,11 +407,12 @@ def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): cursor.execute( query, { - k:v for k, v in trait_data_source.items() + k: v for k, v in trait_data_source.items() if k in ["trait_name"] }) return dict(zip(keys, cursor.fetchone())) + def set_haveinfo_field(trait_info): """ Common postprocessing function for all trait types. @@ -292,6 +420,7 @@ def set_haveinfo_field(trait_info): Sets the value for the 'haveinfo' field.""" return {**trait_info, "haveinfo": 1 if trait_info else 0} + def set_homologene_id_field_probeset(trait_info, conn): """ Postprocessing function for 'ProbeSet' traits. 
@@ -307,7 +436,7 @@ def set_homologene_id_field_probeset(trait_info, conn): cursor.execute( query, { - k:v for k, v in trait_info.items() + k: v for k, v in trait_info.items() if k in ["geneid", "group"] }) res = cursor.fetchone() @@ -315,12 +444,13 @@ def set_homologene_id_field_probeset(trait_info, conn): return {**trait_info, "homologeneid": res[0]} return {**trait_info, "homologeneid": None} + def set_homologene_id_field(trait_type, trait_info, conn): """ Common postprocessing function for all trait types. Sets the value for the 'homologene' key.""" - set_to_null = lambda ti: {**ti, "homologeneid": None} + def set_to_null(ti): return {**ti, "homologeneid": None} functions_table = { "Temp": set_to_null, "Geno": set_to_null, @@ -329,6 +459,7 @@ def set_homologene_id_field(trait_type, trait_info, conn): } return functions_table[trait_type](trait_info) + def load_publish_qtl_info(trait_info, conn): """ Load extra QTL information for `Publish` traits @@ -349,6 +480,7 @@ def load_publish_qtl_info(trait_info, conn): return dict(zip(["locus", "lrs", "additive"], cursor.fetchone())) return {"locus": "", "lrs": "", "additive": ""} + def load_probeset_qtl_info(trait_info, conn): """ Load extra QTL information for `ProbeSet` traits @@ -371,6 +503,7 @@ def load_probeset_qtl_info(trait_info, conn): ["locus", "lrs", "pvalue", "mean", "additive"], cursor.fetchone())) return {"locus": "", "lrs": "", "pvalue": "", "mean": "", "additive": ""} + def load_qtl_info(qtl, trait_type, trait_info, conn): """ Load extra QTL information for traits @@ -399,6 +532,7 @@ def load_qtl_info(qtl, trait_type, trait_info, conn): return qtl_info_functions[trait_type](trait_info, conn) + def build_trait_name(trait_fullname): """ Initialises the trait's name, and other values from the search data provided @@ -425,6 +559,7 @@ def build_trait_name(trait_fullname): "cellid": name_parts[2] if len(name_parts) == 3 else "" } + def retrieve_probeset_sequence(trait, conn): """ Retrieve a 'ProbeSet' trait's 
sequence information @@ -446,6 +581,7 @@ def retrieve_probeset_sequence(trait, conn): seq = cursor.fetchone() return {**trait, "sequence": seq[0] if seq else ""} + def retrieve_trait_info( threshold: int, trait_full_name: str, conn: Any, qtl=None): @@ -501,6 +637,7 @@ def retrieve_trait_info( } return trait_info + def retrieve_temp_trait_data(trait_info: dict, conn: Any): """ Retrieve trait data for `Temp` traits. @@ -520,9 +657,10 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): {"trait_name": trait_info["trait_name"]}) return [dict(zip( ["sample_name", "value", "se_error", "nstrain", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def retrieve_species_id(group, conn: Any): """ Retrieve a species id given the Group value @@ -534,6 +672,7 @@ def retrieve_species_id(group, conn: Any): return cursor.fetchone()[0] return None + def retrieve_geno_trait_data(trait_info: Dict, conn: Any): """ Retrieve trait data for `Geno` traits. @@ -559,9 +698,10 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): trait_info["db"]["group"], conn)}) return [dict(zip( ["sample_name", "value", "se_error", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def retrieve_publish_trait_data(trait_info: Dict, conn: Any): """ Retrieve trait data for `Publish` traits. @@ -590,9 +730,10 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any): "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( ["sample_name", "value", "se_error", "nstrain", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): """ Retrieve trait data for `Probe Data` types. 
@@ -623,9 +764,10 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( ["sample_name", "value", "se_error", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): """ Retrieve trait data for `ProbeSet` traits. @@ -652,9 +794,10 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): "dataset_name": trait_info["db"]["dataset_name"]}) return [dict(zip( ["sample_name", "value", "se_error", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def with_samplelist_data_setup(samplelist: Sequence[str]): """ Build function that computes the trait data from provided list of samples. @@ -681,6 +824,7 @@ def with_samplelist_data_setup(samplelist: Sequence[str]): return None return setup_fn + def without_samplelist_data_setup(): """ Build function that computes the trait data. @@ -701,6 +845,7 @@ def without_samplelist_data_setup(): return None return setup_fn + def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()): """ Retrieve trait data @@ -740,15 +885,17 @@ def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tupl "data": dict(map( lambda x: ( x["sample_name"], - {k:v for k, v in x.items() if x != "sample_name"}), + {k: v for k, v in x.items() if x != "sample_name"}), data))} return {} + def generate_traits_filename(base_path: str = TMPDIR): """Generate a unique filename for use with generated traits files.""" return "{}/traits_test_file_{}.txt".format( os.path.abspath(base_path), random_string(10)) + def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: """ Export informative strain @@ -770,5 +917,6 @@ def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: return acc return reduce( __exporter__, - filter(lambda td: td["value"] is not None, 
trait_data["data"].values()), + filter(lambda td: td["value"] is not None, + trait_data["data"].values()), (tuple(), tuple(), tuple())) -- cgit v1.2.3 From b8067b3a5e4c6891d0a0a99e23e03ac12d68d649 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 23 Nov 2021 17:32:41 +0300 Subject: db: traits: Remove "\n\n" when generating csv file In excel, "\n\n" is replaced with ",,,," during upload. --- gn3/db/traits.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gn3/db') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 1c6aaa7..56258e2 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -103,7 +103,7 @@ def get_trait_csv_sample_data(conn: Any, ",".join([str(val) if val else "x" for val in (strain_id, strain_name, value, error, count)])) - return f"# Publish Data Id: {publishdata_id}\n\n" + "\n".join(csv_data) + return f"# Publish Data Id: {publishdata_id}\n" + "\n".join(csv_data) def update_sample_data(conn: Any, -- cgit v1.2.3 From 8210007122c26daffcfbbb159ff846b928dfb18d Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Wed, 24 Nov 2021 11:32:49 +0300 Subject: db: traits: Remove trailing ".0" in int values --- gn3/db/traits.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'gn3/db') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 56258e2..ebb7e3c 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -79,6 +79,11 @@ def export_trait_data( def get_trait_csv_sample_data(conn: Any, trait_name: int, phenotype_id: int): """Fetch a trait and return it as a csv string""" + + def __float_strip(n): + if str(n)[-2:] == ".0": + return str(int(n)) + return str(n) sql = ("SELECT DISTINCT Strain.Id, PublishData.Id, Strain.Name, " "PublishData.value, " "PublishSE.error, NStrain.count FROM " @@ -100,7 +105,7 @@ def get_trait_csv_sample_data(conn: Any, (strain_id, publishdata_id, strain_name, value, error, count) = record csv_data.append( - ",".join([str(val) if val else "x" + ",".join([__float_strip(val) if val else "x" for 
val in (strain_id, strain_name, value, error, count)])) return f"# Publish Data Id: {publishdata_id}\n" + "\n".join(csv_data) -- cgit v1.2.3 From 6a3ee25e241bd4984c6959e2ccc1e569b53d6486 Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Thu, 25 Nov 2021 20:52:14 +0300 Subject: db: traits: Support additions and deletions from csv file --- gn3/db/traits.py | 238 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 193 insertions(+), 45 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index ebb7e3c..75de4f4 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,5 +1,6 @@ """This class contains functions relating to trait data manipulation""" import os +import MySQLdb from functools import reduce from typing import Any, Dict, Union, Sequence @@ -76,16 +77,15 @@ def export_trait_data( return reduce(__exporter, samplelist, tuple()) + def get_trait_csv_sample_data(conn: Any, trait_name: int, phenotype_id: int): """Fetch a trait and return it as a csv string""" - def __float_strip(n): if str(n)[-2:] == ".0": return str(int(n)) return str(n) - sql = ("SELECT DISTINCT Strain.Id, PublishData.Id, Strain.Name, " - "PublishData.value, " + sql = ("SELECT DISTINCT Strain.Name, PublishData.value, " "PublishSE.error, NStrain.count FROM " "(PublishData, Strain, PublishXRef, PublishFreeze) " "LEFT JOIN PublishSE ON " @@ -97,65 +97,188 @@ def get_trait_csv_sample_data(conn: Any, "PublishData.Id = PublishXRef.DataId AND " "PublishXRef.Id = %s AND PublishXRef.PhenotypeId = %s " "AND PublishData.StrainId = Strain.Id Order BY Strain.Name") - csv_data = ["Strain Id,Strain Name,Value,SE,Count"] - publishdata_id = "" + csv_data = ["Strain Name,Value,SE,Count"] with conn.cursor() as cursor: cursor.execute(sql, (trait_name, phenotype_id,)) for record in cursor.fetchall(): - (strain_id, publishdata_id, - strain_name, value, error, count) = record + (strain_name, value, error, count) = record csv_data.append( ",".join([__float_strip(val) if 
val else "x" - for val in (strain_id, strain_name, - value, error, count)])) - return f"# Publish Data Id: {publishdata_id}\n" + "\n".join(csv_data) + for val in (strain_name, value, error, count)])) + return "\n".join(csv_data) def update_sample_data(conn: Any, + trait_name: str, strain_name: str, - strain_id: int, - publish_data_id: int, + phenotype_id: int, value: Union[int, float, str], error: Union[int, float, str], count: Union[int, str]): """Given the right parameters, update sample-data from the relevant table.""" - # pylint: disable=[R0913, R0914, C0103] - STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" - PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " - "WHERE StrainId = %s AND Id = %s") - PUBLISH_SE_SQL: str = ("UPDATE PublishSE SET error = %s " - "WHERE StrainId = %s AND DataId = %s") - N_STRAIN_SQL: str = ("UPDATE NStrain SET count = %s " - "WHERE StrainId = %s AND DataId = %s") - - updated_strains: int = 0 + strain_id, data_id = "", "" + + with conn.cursor() as cursor: + cursor.execute( + ("SELECT Strain.Id, PublishData.Id FROM " + "(PublishData, Strain, PublishXRef, PublishFreeze) " + "LEFT JOIN PublishSE ON " + "(PublishSE.DataId = PublishData.Id AND " + "PublishSE.StrainId = PublishData.StrainId) " + "LEFT JOIN NStrain ON " + "(NStrain.DataId = PublishData.Id AND " + "NStrain.StrainId = PublishData.StrainId) " + "WHERE PublishXRef.InbredSetId = " + "PublishFreeze.InbredSetId AND " + "PublishData.Id = PublishXRef.DataId AND " + "PublishXRef.Id = %s AND " + "PublishXRef.PhenotypeId = %s " + "AND PublishData.StrainId = Strain.Id " + "AND Strain.Name = \"%s\"") % (trait_name, + phenotype_id, + str(strain_name))) + strain_id, data_id = cursor.fetchone() updated_published_data: int = 0 updated_se_data: int = 0 updated_n_strains: int = 0 with conn.cursor() as cursor: - # Update the Strains table - cursor.execute(STRAIN_ID_SQL, (strain_name, strain_id)) - updated_strains = cursor.rowcount # Update the PublishData table - 
cursor.execute(PUBLISH_DATA_SQL, + cursor.execute(("UPDATE PublishData SET value = %s " + "WHERE StrainId = %s AND Id = %s"), (None if value == "x" else value, - strain_id, publish_data_id)) + strain_id, data_id)) updated_published_data = cursor.rowcount + # Update the PublishSE table - cursor.execute(PUBLISH_SE_SQL, + cursor.execute(("UPDATE PublishSE SET error = %s " + "WHERE StrainId = %s AND DataId = %s"), (None if error == "x" else error, - strain_id, publish_data_id)) + strain_id, data_id)) updated_se_data = cursor.rowcount + # Update the NStrain table - cursor.execute(N_STRAIN_SQL, + cursor.execute(("UPDATE NStrain SET count = %s " + "WHERE StrainId = %s AND DataId = %s"), (None if count == "x" else count, - strain_id, publish_data_id)) + strain_id, data_id)) updated_n_strains = cursor.rowcount - return (updated_strains, updated_published_data, + return (updated_published_data, updated_se_data, updated_n_strains) + +def delete_sample_data(conn: Any, + trait_name: str, + strain_name: str, + phenotype_id: int): + """Given the right parameters, delete sample-data from the relevant + table.""" + strain_id, data_id = "", "" + + deleted_published_data: int = 0 + deleted_se_data: int = 0 + deleted_n_strains: int = 0 + + with conn.cursor() as cursor: + # Delete the PublishData table + try: + cursor.execute( + ("SELECT Strain.Id, PublishData.Id FROM " + "(PublishData, Strain, PublishXRef, PublishFreeze) " + "LEFT JOIN PublishSE ON " + "(PublishSE.DataId = PublishData.Id AND " + "PublishSE.StrainId = PublishData.StrainId) " + "LEFT JOIN NStrain ON " + "(NStrain.DataId = PublishData.Id AND " + "NStrain.StrainId = PublishData.StrainId) " + "WHERE PublishXRef.InbredSetId = " + "PublishFreeze.InbredSetId AND " + "PublishData.Id = PublishXRef.DataId AND " + "PublishXRef.Id = %s AND " + "PublishXRef.PhenotypeId = %s " + "AND PublishData.StrainId = Strain.Id " + "AND Strain.Name = \"%s\"") % (trait_name, + phenotype_id, + str(strain_name))) + strain_id, data_id = 
cursor.fetchone() + + cursor.execute(("DELETE FROM PublishData " + "WHERE StrainId = %s AND Id = %s") + % (strain_id, data_id)) + deleted_published_data = cursor.rowcount + + # Delete the PublishSE table + cursor.execute(("DELETE FROM PublishSE " + "WHERE StrainId = %s AND DataId = %s") % + (strain_id, data_id)) + deleted_se_data = cursor.rowcount + + # Delete the NStrain table + cursor.execute(("DELETE FROM NStrain " + "WHERE StrainId = %s AND DataId = %s" % + (strain_id, data_id))) + deleted_n_strains = cursor.rowcount + except Exception as e: + conn.rollback() + raise MySQLdb.Error + conn.commit() + cursor.close() + cursor.close() + + return (deleted_published_data, + deleted_se_data, deleted_n_strains) + + +def insert_sample_data(conn: Any, + trait_name: str, + strain_name: str, + phenotype_id: int, + value: Union[int, float, str], + error: Union[int, float, str], + count: Union[int, str]): + """Given the right parameters, insert sample-data to the relevant table. + + """ + + inserted_published_data, inserted_se_data, inserted_n_strains = 0, 0, 0 + with conn.cursor() as cursor: + try: + cursor.execute("SELECT DataId FROM PublishXRef WHERE Id = %s AND " + "PhenotypeId = %s", (trait_name, phenotype_id)) + data_id = cursor.fetchone() + + cursor.execute("SELECT Id FROM Strain WHERE Name = %s", + (strain_name,)) + strain_id = cursor.fetchone() + + # Insert the PublishData table + cursor.execute(("INSERT INTO PublishData (Id, StrainId, value)" + "VALUES (%s, %s, %s)"), + (data_id, strain_id, value)) + inserted_published_data = cursor.rowcount + + # Insert into the PublishSE table if error is specified + if error and error != "x": + cursor.execute(("INSERT INTO PublishSE (StrainId, DataId, " + " error) VALUES (%s, %s, %s)") % + (strain_id, data_id, error)) + inserted_se_data = cursor.rowcount + + # Insert into the NStrain table + if count and count != "x": + cursor.execute(("INSERT INTO NStrain " + "(StrainId, DataId, error) " + "VALUES (%s, %s, %s)") % + (strain_id, 
data_id, count)) + inserted_n_strains = cursor.rowcount + except Exception as e: + conn.rollback() + raise MySQLdb.Error + return (inserted_published_data, + inserted_se_data, inserted_n_strains) + + def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Publish` traits. @@ -195,11 +318,12 @@ def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): cursor.execute( query, { - k:v for k, v in trait_data_source.items() + k: v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_id"] }) return dict(zip([k.lower() for k in keys], cursor.fetchone())) + def set_confidential_field(trait_type, trait_info): """Post processing function for 'Publish' trait types. @@ -212,6 +336,7 @@ def set_confidential_field(trait_type, trait_info): and not trait_info.get("pubmed_id", None)) else 0} return trait_info + def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `ProbeSet` traits. @@ -239,11 +364,12 @@ def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): cursor.execute( query, { - k:v for k, v in trait_data_source.items() + k: v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_name"] }) return dict(zip(keys, cursor.fetchone())) + def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Geno` traits. @@ -263,11 +389,12 @@ def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): cursor.execute( query, { - k:v for k, v in trait_data_source.items() + k: v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_name"] }) return dict(zip(keys, cursor.fetchone())) + def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Temp` traits. 
@@ -280,11 +407,12 @@ def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): cursor.execute( query, { - k:v for k, v in trait_data_source.items() + k: v for k, v in trait_data_source.items() if k in ["trait_name"] }) return dict(zip(keys, cursor.fetchone())) + def set_haveinfo_field(trait_info): """ Common postprocessing function for all trait types. @@ -292,6 +420,7 @@ def set_haveinfo_field(trait_info): Sets the value for the 'haveinfo' field.""" return {**trait_info, "haveinfo": 1 if trait_info else 0} + def set_homologene_id_field_probeset(trait_info, conn): """ Postprocessing function for 'ProbeSet' traits. @@ -307,7 +436,7 @@ def set_homologene_id_field_probeset(trait_info, conn): cursor.execute( query, { - k:v for k, v in trait_info.items() + k: v for k, v in trait_info.items() if k in ["geneid", "group"] }) res = cursor.fetchone() @@ -315,12 +444,13 @@ def set_homologene_id_field_probeset(trait_info, conn): return {**trait_info, "homologeneid": res[0]} return {**trait_info, "homologeneid": None} + def set_homologene_id_field(trait_type, trait_info, conn): """ Common postprocessing function for all trait types. 
Sets the value for the 'homologene' key.""" - set_to_null = lambda ti: {**ti, "homologeneid": None} + def set_to_null(ti): return {**ti, "homologeneid": None} functions_table = { "Temp": set_to_null, "Geno": set_to_null, @@ -329,6 +459,7 @@ def set_homologene_id_field(trait_type, trait_info, conn): } return functions_table[trait_type](trait_info) + def load_publish_qtl_info(trait_info, conn): """ Load extra QTL information for `Publish` traits @@ -349,6 +480,7 @@ def load_publish_qtl_info(trait_info, conn): return dict(zip(["locus", "lrs", "additive"], cursor.fetchone())) return {"locus": "", "lrs": "", "additive": ""} + def load_probeset_qtl_info(trait_info, conn): """ Load extra QTL information for `ProbeSet` traits @@ -371,6 +503,7 @@ def load_probeset_qtl_info(trait_info, conn): ["locus", "lrs", "pvalue", "mean", "additive"], cursor.fetchone())) return {"locus": "", "lrs": "", "pvalue": "", "mean": "", "additive": ""} + def load_qtl_info(qtl, trait_type, trait_info, conn): """ Load extra QTL information for traits @@ -399,6 +532,7 @@ def load_qtl_info(qtl, trait_type, trait_info, conn): return qtl_info_functions[trait_type](trait_info, conn) + def build_trait_name(trait_fullname): """ Initialises the trait's name, and other values from the search data provided @@ -425,6 +559,7 @@ def build_trait_name(trait_fullname): "cellid": name_parts[2] if len(name_parts) == 3 else "" } + def retrieve_probeset_sequence(trait, conn): """ Retrieve a 'ProbeSet' trait's sequence information @@ -446,6 +581,7 @@ def retrieve_probeset_sequence(trait, conn): seq = cursor.fetchone() return {**trait, "sequence": seq[0] if seq else ""} + def retrieve_trait_info( threshold: int, trait_full_name: str, conn: Any, qtl=None): @@ -501,6 +637,7 @@ def retrieve_trait_info( } return trait_info + def retrieve_temp_trait_data(trait_info: dict, conn: Any): """ Retrieve trait data for `Temp` traits. 
@@ -520,9 +657,10 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): {"trait_name": trait_info["trait_name"]}) return [dict(zip( ["sample_name", "value", "se_error", "nstrain", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def retrieve_species_id(group, conn: Any): """ Retrieve a species id given the Group value @@ -534,6 +672,7 @@ def retrieve_species_id(group, conn: Any): return cursor.fetchone()[0] return None + def retrieve_geno_trait_data(trait_info: Dict, conn: Any): """ Retrieve trait data for `Geno` traits. @@ -559,9 +698,10 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): trait_info["db"]["group"], conn)}) return [dict(zip( ["sample_name", "value", "se_error", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def retrieve_publish_trait_data(trait_info: Dict, conn: Any): """ Retrieve trait data for `Publish` traits. @@ -590,9 +730,10 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any): "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( ["sample_name", "value", "se_error", "nstrain", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): """ Retrieve trait data for `Probe Data` types. @@ -623,9 +764,10 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): "dataset_id": trait_info["db"]["dataset_id"]}) return [dict(zip( ["sample_name", "value", "se_error", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): """ Retrieve trait data for `ProbeSet` traits. 
@@ -652,9 +794,10 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): "dataset_name": trait_info["db"]["dataset_name"]}) return [dict(zip( ["sample_name", "value", "se_error", "id"], row)) - for row in cursor.fetchall()] + for row in cursor.fetchall()] return [] + def with_samplelist_data_setup(samplelist: Sequence[str]): """ Build function that computes the trait data from provided list of samples. @@ -681,6 +824,7 @@ def with_samplelist_data_setup(samplelist: Sequence[str]): return None return setup_fn + def without_samplelist_data_setup(): """ Build function that computes the trait data. @@ -701,6 +845,7 @@ def without_samplelist_data_setup(): return None return setup_fn + def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tuple()): """ Retrieve trait data @@ -740,15 +885,17 @@ def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tupl "data": dict(map( lambda x: ( x["sample_name"], - {k:v for k, v in x.items() if x != "sample_name"}), + {k: v for k, v in x.items() if x != "sample_name"}), data))} return {} + def generate_traits_filename(base_path: str = TMPDIR): """Generate a unique filename for use with generated traits files.""" return "{}/traits_test_file_{}.txt".format( os.path.abspath(base_path), random_string(10)) + def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: """ Export informative strain @@ -770,5 +917,6 @@ def export_informative(trait_data: dict, inc_var: bool = False) -> tuple: return acc return reduce( __exporter__, - filter(lambda td: td["value"] is not None, trait_data["data"].values()), + filter(lambda td: td["value"] is not None, + trait_data["data"].values()), (tuple(), tuple(), tuple())) -- cgit v1.2.3 From 40f264876f2309fcccfcd3d04a2999bdf3fa5d98 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 24 Nov 2021 05:46:43 +0300 Subject: Retrieve the species name given the group Issue: 
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate the `web.webqtl.dbFunction.webqtlDatabaseFunction.retrieveSpecies` in GeneNetwork1 to `gn3.db.species.species_name` in GeneNetwork3 to enable the retrieval of the species name, given the group name (formerly RISet). --- gn3/db/species.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'gn3/db') diff --git a/gn3/db/species.py b/gn3/db/species.py index 702a9a8..20170ba 100644 --- a/gn3/db/species.py +++ b/gn3/db/species.py @@ -57,3 +57,20 @@ def translate_to_mouse_gene_id(species: str, geneid: int, conn: Any) -> int: return translated_gene_id[0] return 0 # default if all else fails + +def species_name(conn: Any, group: str) -> str: + """ + Retrieve the name of the species, given the group (RISet). + + This is a migration of the + `web.webqtl.dbFunction.webqtlDatabaseFunction.retrieveSpecies` function in + GeneNetwork1. + """ + with conn.cursor() as cursor: + cursor.execute( + ("SELECT Species.Name FROM Species, InbredSet " + "WHERE InbredSet.Name = %(group_name)s " + "AND InbredSet.SpeciesId = Species.Id"), + group_name=group_name) + return cursor.fetchone()[0] + return None -- cgit v1.2.3 From f88d8e4446c8c9d6693197a452669e0e9c8ea812 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:44:53 +0300 Subject: Fix query parametrisation Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Pass parameters to the query the way the MySQL driver expects. 
--- gn3/db/correlations.py | 16 ++++++++++------ gn3/db/species.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index a1daa3c..c838597 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -23,8 +23,8 @@ def get_filename(target_db_name: str, conn: Any) -> str: """ with conn.cursor() as cursor: cursor.execute( - "SELECT Id, FullName from ProbeSetFreeze WHERE Name-%s", - target_db_name) + "SELECT Id, FullName from ProbeSetFreeze WHERE Name=%s", + (target_db_name,)) result = cursor.fetchone() if result: return "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( @@ -398,9 +398,12 @@ def fetch_sample_ids( "AND Species.name=%(species_name)s") with conn.cursor() as cursor: cursor.execute( - query, samples_names=tuple(sample_names), - species_name=species_name) return cursor.fetchall() + query, + { + "samples_names": tuple(sample_names), + "species_name": species_name + }) def build_query_sgo_lit_corr( db_type: str, temp_table: str, sample_id_columns: str, @@ -511,8 +514,9 @@ def fetch_all_database_data(# pylint: disable=[R0913, R0914] query, data_start_pos = __build_query__(sample_ids, temp_table) with conn.cursor() as cursor: cursor.execute( - query, db_name=db_name, - **{f"T{item}_sample_id": item for item in sample_ids}) + query, + {"db_name": db_name, + **{f"T{item}_sample_id": item for item in sample_ids}}) return (cursor.fetchall(), data_start_pos) sample_ids = tuple( diff --git a/gn3/db/species.py b/gn3/db/species.py index 20170ba..5b8e096 100644 --- a/gn3/db/species.py +++ b/gn3/db/species.py @@ -71,6 +71,6 @@ def species_name(conn: Any, group: str) -> str: ("SELECT Species.Name FROM Species, InbredSet " "WHERE InbredSet.Name = %(group_name)s " "AND InbredSet.SpeciesId = Species.Id"), - group_name=group_name) + {"group_name": group}) return cursor.fetchone()[0] return None -- cgit v1.2.3 From d29ef85d44fff9414932d64850f65cb268b1cddd Mon Sep 17 00:00:00 
2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:48:26 +0300 Subject: Update typing notations on functions Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi --- gn3/db/correlations.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index c838597..898de75 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -4,7 +4,7 @@ feature to access the database to retrieve data needed for computations. """ from functools import reduce -from typing import Any, Dict, Tuple +from typing import Any, Dict, Tuple, Union from gn3.random import random_string from gn3.data_helpers import partition_all @@ -12,7 +12,8 @@ from gn3.db.species import translate_to_mouse_gene_id from gn3.computations.partial_correlations import correlations_of_all_tissue_traits -def get_filename(target_db_name: str, conn: Any) -> str: +def get_filename(conn: Any, target_db_name: str, text_files_dir: str) -> Union[ + str, bool]: """ Retrieve the name of the reference database file with which correlations are computed. 
@@ -456,8 +457,9 @@ def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): def fetch_all_database_data(# pylint: disable=[R0913, R0914] conn: Any, species: str, gene_id: int, trait_symbol: str, - samples: Tuple[str, ...], db_type: str, db_name: str, method: str, - return_number: int, probeset_freeze_id: int) -> Tuple[Any, Any]: + samples: Tuple[str, ...], dataset: dict, method: str, + return_number: int, probeset_freeze_id: int) -> Tuple[ + Tuple[float], int]: """ This is a migration of the `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in -- cgit v1.2.3 From ee200d60bd3065c4c9e69bcbb756715211b711d2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:50:33 +0300 Subject: Return only values Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Return the values from the database, not the tuples. --- gn3/db/correlations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 898de75..05c0809 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -399,12 +399,12 @@ def fetch_sample_ids( "AND Species.name=%(species_name)s") with conn.cursor() as cursor: cursor.execute( - return cursor.fetchall() query, { "samples_names": tuple(sample_names), "species_name": species_name }) + return tuple(row[0] for row in cursor.fetchall()) def build_query_sgo_lit_corr( db_type: str, temp_table: str, sample_id_columns: str, -- cgit v1.2.3 From 0f9247fffc7127bdb3c35492a37706a2db01a26c Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:52:37 +0300 Subject: Update return type Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Return the complete filename when found, or the boolean value False, when it is not found. 
--- gn3/db/correlations.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 05c0809..ab4dc2c 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -2,7 +2,7 @@ This module will hold functions that are used in the (partial) correlations feature to access the database to retrieve data needed for computations. """ - +import os from functools import reduce from typing import Any, Dict, Tuple, Union @@ -28,11 +28,13 @@ def get_filename(conn: Any, target_db_name: str, text_files_dir: str) -> Union[ (target_db_name,)) result = cursor.fetchone() if result: - return "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( + filename = "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( tid=result[0], fname=result[1].replace(' ', '_').replace('/', '_')) + return ((filename in os.listdir(text_file_dir)) + and f"{text_files_dir}/{filename}") - return "" + return False def build_temporary_literature_table( conn: Any, species: str, gene_id: int, return_number: int) -> str: -- cgit v1.2.3 From 109b233f698a2ce41bb5634f12e966ad02798819 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:54:17 +0300 Subject: Fix bugs in data Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Process the db_name and db_type values. * Return data correctly --- gn3/db/correlations.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gn3/db') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index ab4dc2c..401fd91 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -467,6 +467,8 @@ def fetch_all_database_data(# pylint: disable=[R0913, R0914] `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in GeneNetwork1. 
""" + db_type = dataset["dataset_type"] + db_name = dataset["dataset_name"] def __build_query__(sample_ids, temp_table): sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in sample_ids) if db_type == "Publish": @@ -551,4 +553,4 @@ def fetch_all_database_data(# pylint: disable=[R0913, R0914] with conn.cursor() as cursor: cursor.execute(f"DROP TEMPORARY TABLE {temp_table}") - return (tuple(item[0] for item in trait_database), trait_database[0][1]) + return (trait_database[0], trait_database[1]) -- cgit v1.2.3 From 6b147173d514093ec4e461f5843170c968290e5e Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 29 Nov 2021 11:52:26 +0300 Subject: Provide entry-point function for the partial correlations Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Provide the entry-point function to the partial correlation feature. This is the function that ochestrates the fetching of the data, and processing it for output by the API endpoint (to be implemented). --- gn3/computations/partial_correlations.py | 357 +++++++++++++++++++++++++++++-- gn3/db/correlations.py | 11 +- 2 files changed, 350 insertions(+), 18 deletions(-) (limited to 'gn3/db') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index f43c4d4..869bee4 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -6,15 +6,20 @@ GeneNetwork1. 
""" import math -from functools import reduce +from functools import reduce, partial from typing import Any, Tuple, Union, Sequence -from scipy.stats import pearsonr, spearmanr import pandas import pingouin +from scipy.stats import pearsonr, spearmanr from gn3.settings import TEXTDIR +from gn3.function_helpers import compose from gn3.data_helpers import parse_csv_line +from gn3.db.traits import export_informative +from gn3.db.traits import retrieve_trait_info, retrieve_trait_data +from gn3.db.species import species_name, translate_to_mouse_gene_id +from gn3.db.correlations import get_filename, fetch_all_database_data def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): """ @@ -112,7 +117,7 @@ def find_identical_traits( return acc + ident[1] def __dictify_controls__(acc, control_item): - ckey = "{:.3f}".format(control_item[0]) + ckey = tuple("{:.3f}".format(item) for item in control_item[0]) return {**acc, ckey: acc.get(ckey, tuple()) + (control_item[1],)} return (reduce(## for identical control traits @@ -212,7 +217,7 @@ def partial_correlations_fast(# pylint: disable=[R0913, R0914] function in GeneNetwork1. 
""" assert method in ("spearman", "pearson") - with open(f"{TEXTDIR}/{database_filename}", "r") as dataset_file: + with open(database_filename, "r") as dataset_file: dataset = tuple(dataset_file.readlines()) good_dataset_samples = good_dataset_samples_indexes( @@ -286,32 +291,37 @@ def compute_partial( """ # replace the R code with `pingouin.partial_corr` def __compute_trait_info__(target): + targ_vals = target[0] + targ_name = target[1] primary = [ - prim for targ, prim in zip(target, primary_vals) + prim for targ, prim in zip(targ_vals, primary_vals) if targ is not None] + datafrm = build_data_frame( primary, - [targ for targ in target if targ is not None], - [cont for i, cont in enumerate(control_vals) - if target[i] is not None]) + tuple(targ for targ in targ_vals if targ is not None), + tuple(cont for i, cont in enumerate(control_vals) + if target[i] is not None)) covariates = "z" if datafrm.shape[1] == 3 else [ col for col in datafrm.columns if col not in ("x", "y")] ppc = pingouin.partial_corr( - data=datafrm, x="x", y="y", covar=covariates, method=method) - pc_coeff = ppc["r"] + data=datafrm, x="x", y="y", covar=covariates, method=( + "pearson" if "pearson" in method.lower() else "spearman")) + pc_coeff = ppc["r"][0] zero_order_corr = pingouin.corr( - datafrm["x"], datafrm["y"], method=method) + datafrm["x"], datafrm["y"], method=( + "pearson" if "pearson" in method.lower() else "spearman")) if math.isnan(pc_coeff): return ( - target[1], len(primary), pc_coeff, 1, zero_order_corr["r"], - zero_order_corr["p-val"]) + targ_name, len(primary), pc_coeff, 1, zero_order_corr["r"][0], + zero_order_corr["p-val"][0]) return ( - target[1], len(primary), pc_coeff, - (ppc["p-val"] if not math.isnan(ppc["p-val"]) else ( + targ_name, len(primary), pc_coeff, + (ppc["p-val"][0] if not math.isnan(ppc["p-val"][0]) else ( 0 if (abs(pc_coeff - 1) < 0.0000001) else 1)), - zero_order_corr["r"], zero_order_corr["p-val"]) + zero_order_corr["r"][0], zero_order_corr["p-val"][0]) 
return tuple( __compute_trait_info__(target) @@ -360,3 +370,318 @@ def partial_correlations_normal(# pylint: disable=R0913 for idx, item in enumerate(all_correlations))) return len(trait_database), all_correlations + +def partial_corrs( + conn, samples , primary_vals, control_vals, return_number, species, input_trait_geneid, + input_trait_symbol, tissue_probeset_freeze_id, method, dataset, database_filename): + """ + Compute the partial correlations, selecting the fast or normal method + depending on the existence of the database text file. + + This is a partial migration of the + `web.webqtl.correlation.PartialCorrDBPage.__init__` function in + GeneNetwork1. + """ + if database_filename: + return partial_correlations_fast( + samples, primary_vals, control_vals, database_filename, + ( + fetch_literature_correlations( + species, input_trait_geneid, dataset, return_number, conn) + if "literature" in method.lower() else + fetch_tissue_correlations( + dataset, input_trait_symbol, tissue_probeset_freeze_id, + method, return_number, conn)), + method, + ("literature" if method.lower() == "sgo literature correlation" + else ("tissue" if "tissue" in method.lower() else "genetic"))) + + trait_database, data_start_pos = fetch_all_database_data( + conn, species, input_trait_geneid, input_trait_symbol, samples, dataset, + method, return_number, tissue_probeset_freeze_id) + return partial_correlations_normal( + primary_vals, control_vals, input_trait_geneid, trait_database, + data_start_pos, dataset, method) + +def literature_correlation_by_list( + conn: Any, input_trait_mouse_geneid: int, species: str, + trait_list: Tuple[dict]) -> Tuple[dict]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getLiteratureCorrelationByList` + function in GeneNetwork1. 
+ """ + if any((lambda t: ( + bool(t.get("tissue_corr")) and + bool(t.get("tissue_p_value"))))(trait) + for trait in trait_list): + temp_table_name = f"LITERATURE{random_string(8)}" + q1 = ( + f"CREATE TEMPORARY TABLE {temporary_table_name} " + "(GeneId1 INT(12) UNSIGNED, GeneId2 INT(12) UNSIGNED PRIMARY KEY, " + "value DOUBLE)") + q2 = ( + f"INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " + "SELECT GeneId1, GeneId2, value FROM LCorrRamin3 " + "WHERE GeneId1=%(geneid)s") + q3 = ( + "INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " + "SELECT GeneId2, GeneId1, value FROM LCorrRamin3 " + "WHERE GeneId2=%s AND GeneId1 != %(geneid)s") + + def __set_mouse_geneid__(trait): + if trait.get("geneid"): + return { + **trait, + "mouse_geneid": translate_to_mouse_gene_id(trait.get("geneid")) + } + return {**trait, "mouse_geneid": 0} + + def __retrieve_lcorr__(cursor, geneids): + cursor.execute( + f"SELECT GeneId2, value FROM {temporary_table_name} " + "WHERE GeneId2 IN %(geneids)s", + geneids = geneids) + return {geneid: value for geneid, value in cursor.fetchall()} + + with conn.cursor() as cursor: + cursor.execute(q1) + cursor.execute(q2) + cursor.execute(q3) + + traits = tuple(__set_mouse_geneid__(trait) for trait in trait_list) + lcorrs = __retrieve_lcorr__( + cursor, ( + trait["mouse_geneid"] for trait in traits + if (trait["mouse_geneid"] != 0 and + trait["mouse_geneid"].find(";") < 0))) + return tuple( + {**trait, "l_corr": lcorrs.get(trait["mouse_geneid"], None)} + for trait in traits) + + return trait_list + return trait_list + +def tissue_correlation_by_list( + conn: Any, primary_trait_symbol: str, tissue_probeset_freeze_id: int, + method: str, trait_list: Tuple[dict]) -> Tuple[dict]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTissueCorrelationByList` + function in GeneNetwork1. 
+ """ + def __add_tissue_corr__(trait, primary_trait_value, trait_value): + result = pingouin.corr( + primary_trait_values, target_trait_values, + method=("spearman" if "spearman" in method.lower() else "pearson")) + return { + **trait, + "tissue_corr": result["r"], + "tissue_p_value": result["p-val"] + } + + if any((lambda t: bool(t.get("l_corr")))(trait) for trait in trait_list): + prim_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + (primary_trait_symbol,), tissue_probeset_freeze_id, conn) + if primary_trait_symbol.lower() in prim_trait_symbol_value_dict: + primary_trait_value = prim_trait_symbol_value_dict[prim_trait_symbol.lower()] + gene_symbol_list = tuple( + trait for trait in trait_list if "symbol" in trait.keys()) + symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + gene_symbol_list, tissue_probeset_freeze_id, conn) + return tuple( + __add_tissue_corr__( + trait, primary_trait_value, + symbol_value_dict[trait["symbol"].lower()]) + for trait in trait_list + if ("symbol" in trait and + bool(trait["symbol"]) and + trait["symbol"].lower() in symbol_value_dict)) + return tuple({ + **trait, + "tissue_corr": None, + "tissue_p_value": None + } for trait in trait_list) + return trait_list + +def partial_correlations_entry( + conn: Any, primary_trait_name: str, + control_trait_names: Tuple[str, ...], method: str, + criteria: int, group: str, target_db_name: str) -> dict: + """ + This is the 'ochestration' function for the partial-correlation feature. + + This function will dispatch the functions doing data fetches from the + database (and various other places) and feed that data to the functions + doing the conversions and computations. It will then return the results of + all of that work. + + This function is doing way too much. Look into splitting out the + functionality into smaller functions that do fewer things. 
+ """ + threshold = 0 + corr_min_informative = 4 + + primary_trait = retrieve_trait_info(threshold, primary_trait_name, conn) + primary_trait_data = retrieve_trait_data(primary_trait, conn) + primary_samples, primary_values, primary_variances = export_informative( + primary_trait_data) + + cntrl_traits = tuple( + retrieve_trait_info(threshold, trait_full_name, conn) + for trait_full_name in control_trait_names) + cntrl_traits_data = tuple( + retrieve_trait_data(cntrl_trait, conn) + for cntrl_trait in cntrl_traits) + species = species_name(conn, group) + + (cntrl_samples, + cntrl_values, + cntrl_variances, + cntrl_ns) = control_samples(cntrl_traits_data, primary_samples) + + common_primary_control_samples = primary_samples + fixed_primary_vals = primary_values + fixed_control_vals = cntrl_values + if not all(cnt_smp == primary_samples for cnt_smp in cntrl_samples): + (common_primary_control_samples, + fixed_primary_vals, + fixed_control_vals, + primary_variances, + cntrl_variances) = fix_samples(primary_trait, cntrl_traits) + + if len(common_primary_control_samples) < corr_min_informative: + return { + "status": "error", + "message": ( + f"Fewer than {corr_min_informative} samples data entered for " + f"{group} dataset. No calculation of correlation has been " + "attempted."), + "error_type": "Inadequate Samples"} + + identical_traits_names = find_identical_traits( + primary_trait_name, primary_values, control_trait_names, cntrl_values) + if len(identical_traits_names) > 0: + return { + "status": "error", + "message": ( + f"{identical_traits_names[0]} and {identical_traits_names[1]} " + "have the same values for the {len(fixed_primary_vals)} " + "samples that will be used to compute the partial correlation " + "(common for all primary and control traits). In such cases, " + "partial correlation cannot be computed. 
Please re-select your " + "traits."), + "error_type": "Identical Traits"} + + input_trait_geneid = primary_trait.get("geneid") + input_trait_symbol = primary_trait.get("symbol") + input_trait_mouse_geneid = translate_to_mouse_gene_id( + species, input_trait_geneid, conn) + + tissue_probeset_freeze_id = 1 + db_type = primary_trait["db"]["dataset_type"] + db_name = primary_trait["db"]["dataset_name"] + + if db_type == "ProbeSet" and method.lower() in ( + "sgo literature correlation", + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + return { + "status": "error", + "message": ( + "Wrong correlation type: It is not possible to compute the " + f"{method} between your trait and data in the {target_db_name} " + "database. Please try again after selecting another type of " + "correlation."), + "error_type": "Correlation Type"} + + if (method.lower() == "sgo literature correlation" and ( + input_trait_geneid is None or + check_for_literature_info(conn, input_trait_mouse_geneid))): + return { + "status": "error", + "message": ( + "No Literature Information: This gene does not have any " + "associated Literature Information."), + "error_type": "Literature Correlation"} + + if (method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and input_trait_symbol is None): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + if (method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and check_symbol_for_tissue_correlation( + conn, tissue_probeset_freeze_id, input_trait_symbol)): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + database_filename = get_filename(conn, 
target_db_name, TEXTDIR) + total_traits, all_correlations = partial_corrs( + conn, common_primary_control_samples, fixed_primary_vals, + fixed_control_vals, len(fixed_primary_vals), species, + input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, + method, primary_trait["db"], database_filename) + + + def __make_sorter__(method): + def __sort_6__(x): + return x[6] + + def __sort_3__(x): + return x[3] + + if "literature" in method.lower(): + return __sort_6__ + + if "tissue" in method.lower(): + return __sort_6__ + + return __sort_3__ + + sorted_correlations = sorted( + all_correlations, key=__make_sorter__(method)) + + add_lit_corr_and_tiss_corr = compose( + partial( + literature_correlation_by_list, conn, input_trait_mouse_geneid, + species), + partial( + tissue_correlation_by_list, conn, input_trait_symbol, + tissue_probeset_freeze_id, method)) + + trait_list = add_lit_corr_and_tiss_corr(tuple( + { + **retrieve_trait_info( + threshold, + f"{primary_trait['db']['dataset_name']}::{item[0]}", + conn), + "noverlap": item[1], + "partial_corr": item[2], + "partial_corr_p_value": item[3], + "corr": item[4], + "corr_p_value": item[5], + "rank_order": (1 if "spearman" in method.lower() else 0), + **({ + "tissue_corr": item[6], + "tissue_p_value": item[7]} + if len(item) == 8 else {}), + **({"l_corr": item[6]} + if len(item) == 7 else {}) + } + for item in + sorted_correlations[:min(criteria, len(all_correlations))])) + + return trait_list diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 401fd91..2a38bae 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -10,8 +10,6 @@ from gn3.random import random_string from gn3.data_helpers import partition_all from gn3.db.species import translate_to_mouse_gene_id -from gn3.computations.partial_correlations import correlations_of_all_tissue_traits - def get_filename(conn: Any, target_db_name: str, text_files_dir: str) -> Union[ str, bool]: """ @@ -282,6 +280,15 @@ def 
build_temporary_tissue_correlations_table( # We should probably pass the `correlations_of_all_tissue_traits` function # as an argument to this function and get rid of the one call immediately # following this comment. + from gn3.computations.partial_correlations import correlations_of_all_tissue_traits + # This import above is necessary within the function to avoid + # circular-imports. + # + # + # This import above is indicative of convoluted code, with the computation + # being interwoven with the data retrieval. This needs to be changed, such + # that the function being imported here is no longer necessary, or have the + # imported function passed to this function as an argument. symbol_corr_dict, symbol_p_value_dict = correlations_of_all_tissue_traits( fetch_gene_symbol_tissue_value_dict_for_trait( (trait_symbol,), probeset_freeze_id, conn), -- cgit v1.2.3 From 99953f6e4a540da41d0517203eb63da4e19405cd Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 29 Nov 2021 14:01:44 +0300 Subject: Fix linting errors Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi --- gn3/computations/partial_correlations.py | 131 +++++++++++++++++-------------- gn3/db/correlations.py | 5 +- gn3/db/traits.py | 47 ++++++----- 3 files changed, 100 insertions(+), 83 deletions(-) (limited to 'gn3/db') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 869bee4..231b0a7 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -14,12 +14,20 @@ import pingouin from scipy.stats import pearsonr, spearmanr from gn3.settings import TEXTDIR +from gn3.random import random_string from gn3.function_helpers import compose from gn3.data_helpers import parse_csv_line from gn3.db.traits import export_informative from gn3.db.traits import retrieve_trait_info, retrieve_trait_data from gn3.db.species import species_name, 
translate_to_mouse_gene_id -from gn3.db.correlations import get_filename, fetch_all_database_data +from gn3.db.correlations import ( + get_filename, + fetch_all_database_data, + check_for_literature_info, + fetch_tissue_correlations, + fetch_literature_correlations, + check_symbol_for_tissue_correlation, + fetch_gene_symbol_tissue_value_dict_for_trait) def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): """ @@ -311,7 +319,7 @@ def compute_partial( zero_order_corr = pingouin.corr( datafrm["x"], datafrm["y"], method=( - "pearson" if "pearson" in method.lower() else "spearman")) + "pearson" if "pearson" in method.lower() else "spearman")) if math.isnan(pc_coeff): return ( @@ -371,9 +379,10 @@ def partial_correlations_normal(# pylint: disable=R0913 return len(trait_database), all_correlations -def partial_corrs( - conn, samples , primary_vals, control_vals, return_number, species, input_trait_geneid, - input_trait_symbol, tissue_probeset_freeze_id, method, dataset, database_filename): +def partial_corrs(# pylint: disable=[R0913] + conn, samples, primary_vals, control_vals, return_number, species, + input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, + method, dataset, database_filename): """ Compute the partial correlations, selecting the fast or normal method depending on the existence of the database text file. 
@@ -404,8 +413,7 @@ def partial_corrs( data_start_pos, dataset, method) def literature_correlation_by_list( - conn: Any, input_trait_mouse_geneid: int, species: str, - trait_list: Tuple[dict]) -> Tuple[dict]: + conn: Any, species: str, trait_list: Tuple[dict]) -> Tuple[dict]: """ This is a migration of the `web.webqtl.correlation.CorrelationPage.getLiteratureCorrelationByList` @@ -415,16 +423,16 @@ def literature_correlation_by_list( bool(t.get("tissue_corr")) and bool(t.get("tissue_p_value"))))(trait) for trait in trait_list): - temp_table_name = f"LITERATURE{random_string(8)}" - q1 = ( + temporary_table_name = f"LITERATURE{random_string(8)}" + query1 = ( f"CREATE TEMPORARY TABLE {temporary_table_name} " "(GeneId1 INT(12) UNSIGNED, GeneId2 INT(12) UNSIGNED PRIMARY KEY, " "value DOUBLE)") - q2 = ( + query2 = ( f"INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " "SELECT GeneId1, GeneId2, value FROM LCorrRamin3 " "WHERE GeneId1=%(geneid)s") - q3 = ( + query3 = ( "INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " "SELECT GeneId2, GeneId1, value FROM LCorrRamin3 " "WHERE GeneId2=%s AND GeneId1 != %(geneid)s") @@ -433,7 +441,8 @@ def literature_correlation_by_list( if trait.get("geneid"): return { **trait, - "mouse_geneid": translate_to_mouse_gene_id(trait.get("geneid")) + "mouse_geneid": translate_to_mouse_gene_id( + species, trait.get("geneid"), conn) } return {**trait, "mouse_geneid": 0} @@ -441,13 +450,13 @@ def literature_correlation_by_list( cursor.execute( f"SELECT GeneId2, value FROM {temporary_table_name} " "WHERE GeneId2 IN %(geneids)s", - geneids = geneids) - return {geneid: value for geneid, value in cursor.fetchall()} + geneids=geneids) + return dict(cursor.fetchall()) with conn.cursor() as cursor: - cursor.execute(q1) - cursor.execute(q2) - cursor.execute(q3) + cursor.execute(query1) + cursor.execute(query2) + cursor.execute(query3) traits = tuple(__set_mouse_geneid__(trait) for trait in trait_list) lcorrs = __retrieve_lcorr__( @@ 
-470,9 +479,9 @@ def tissue_correlation_by_list( `web.webqtl.correlation.CorrelationPage.getTissueCorrelationByList` function in GeneNetwork1. """ - def __add_tissue_corr__(trait, primary_trait_value, trait_value): + def __add_tissue_corr__(trait, primary_trait_values, trait_values): result = pingouin.corr( - primary_trait_values, target_trait_values, + primary_trait_values, trait_values, method=("spearman" if "spearman" in method.lower() else "pearson")) return { **trait, @@ -484,7 +493,8 @@ def tissue_correlation_by_list( prim_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( (primary_trait_symbol,), tissue_probeset_freeze_id, conn) if primary_trait_symbol.lower() in prim_trait_symbol_value_dict: - primary_trait_value = prim_trait_symbol_value_dict[prim_trait_symbol.lower()] + primary_trait_value = prim_trait_symbol_value_dict[ + primary_trait_symbol.lower()] gene_symbol_list = tuple( trait for trait in trait_list if "symbol" in trait.keys()) symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( @@ -504,7 +514,7 @@ def tissue_correlation_by_list( } for trait in trait_list) return trait_list -def partial_correlations_entry( +def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] conn: Any, primary_trait_name: str, control_trait_names: Tuple[str, ...], method: str, criteria: int, group: str, target_db_name: str) -> dict: @@ -524,7 +534,7 @@ def partial_correlations_entry( primary_trait = retrieve_trait_info(threshold, primary_trait_name, conn) primary_trait_data = retrieve_trait_data(primary_trait, conn) - primary_samples, primary_values, primary_variances = export_informative( + primary_samples, primary_values, _primary_variances = export_informative( primary_trait_data) cntrl_traits = tuple( @@ -537,8 +547,8 @@ def partial_correlations_entry( (cntrl_samples, cntrl_values, - cntrl_variances, - cntrl_ns) = control_samples(cntrl_traits_data, primary_samples) + _cntrl_variances, + _cntrl_ns) = 
control_samples(cntrl_traits_data, primary_samples) common_primary_control_samples = primary_samples fixed_primary_vals = primary_values @@ -547,8 +557,8 @@ def partial_correlations_entry( (common_primary_control_samples, fixed_primary_vals, fixed_control_vals, - primary_variances, - cntrl_variances) = fix_samples(primary_trait, cntrl_traits) + _primary_variances, + _cntrl_variances) = fix_samples(primary_trait, cntrl_traits) if len(common_primary_control_samples) < corr_min_informative: return { @@ -580,7 +590,6 @@ def partial_correlations_entry( tissue_probeset_freeze_id = 1 db_type = primary_trait["db"]["dataset_type"] - db_name = primary_trait["db"]["dataset_name"] if db_type == "ProbeSet" and method.lower() in ( "sgo literature correlation", @@ -605,10 +614,11 @@ def partial_correlations_entry( "associated Literature Information."), "error_type": "Literature Correlation"} - if (method.lower() in ( - "tissue correlation, pearson's r", - "tissue correlation, spearman's rho") - and input_trait_symbol is None): + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and input_trait_symbol is None): return { "status": "error", "message": ( @@ -616,11 +626,12 @@ def partial_correlations_entry( "any associated Tissue Correlation Information."), "error_type": "Tissue Correlation"} - if (method.lower() in ( - "tissue correlation, pearson's r", - "tissue correlation, spearman's rho") - and check_symbol_for_tissue_correlation( - conn, tissue_probeset_freeze_id, input_trait_symbol)): + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and check_symbol_for_tissue_correlation( + conn, tissue_probeset_freeze_id, input_trait_symbol)): return { "status": "error", "message": ( @@ -629,7 +640,7 @@ def partial_correlations_entry( "error_type": "Tissue Correlation"} database_filename = get_filename(conn, target_db_name, TEXTDIR) - total_traits, all_correlations = 
partial_corrs( + _total_traits, all_correlations = partial_corrs( conn, common_primary_control_samples, fixed_primary_vals, fixed_control_vals, len(fixed_primary_vals), species, input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, @@ -637,11 +648,11 @@ def partial_correlations_entry( def __make_sorter__(method): - def __sort_6__(x): - return x[6] + def __sort_6__(row): + return row[6] - def __sort_3__(x): - return x[3] + def __sort_3__(row): + return row[3] if "literature" in method.lower(): return __sort_6__ @@ -655,33 +666,31 @@ def partial_correlations_entry( all_correlations, key=__make_sorter__(method)) add_lit_corr_and_tiss_corr = compose( - partial( - literature_correlation_by_list, conn, input_trait_mouse_geneid, - species), + partial(literature_correlation_by_list, conn, species), partial( tissue_correlation_by_list, conn, input_trait_symbol, tissue_probeset_freeze_id, method)) trait_list = add_lit_corr_and_tiss_corr(tuple( - { - **retrieve_trait_info( - threshold, - f"{primary_trait['db']['dataset_name']}::{item[0]}", - conn), - "noverlap": item[1], - "partial_corr": item[2], - "partial_corr_p_value": item[3], - "corr": item[4], - "corr_p_value": item[5], - "rank_order": (1 if "spearman" in method.lower() else 0), - **({ - "tissue_corr": item[6], - "tissue_p_value": item[7]} + { + **retrieve_trait_info( + threshold, + f"{primary_trait['db']['dataset_name']}::{item[0]}", + conn), + "noverlap": item[1], + "partial_corr": item[2], + "partial_corr_p_value": item[3], + "corr": item[4], + "corr_p_value": item[5], + "rank_order": (1 if "spearman" in method.lower() else 0), + **({ + "tissue_corr": item[6], + "tissue_p_value": item[7]} if len(item) == 8 else {}), - **({"l_corr": item[6]} + **({"l_corr": item[6]} if len(item) == 7 else {}) - } + } for item in - sorted_correlations[:min(criteria, len(all_correlations))])) + sorted_correlations[:min(criteria, len(all_correlations))])) return trait_list diff --git a/gn3/db/correlations.py 
b/gn3/db/correlations.py index 2a38bae..3d12019 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -29,7 +29,7 @@ def get_filename(conn: Any, target_db_name: str, text_files_dir: str) -> Union[ filename = "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( tid=result[0], fname=result[1].replace(' ', '_').replace('/', '_')) - return ((filename in os.listdir(text_file_dir)) + return ((filename in os.listdir(text_files_dir)) and f"{text_files_dir}/{filename}") return False @@ -280,7 +280,8 @@ def build_temporary_tissue_correlations_table( # We should probably pass the `correlations_of_all_tissue_traits` function # as an argument to this function and get rid of the one call immediately # following this comment. - from gn3.computations.partial_correlations import correlations_of_all_tissue_traits + from gn3.computations.partial_correlations import (#pylint: disable=[C0415, R0401] + correlations_of_all_tissue_traits) # This import above is necessary within the function to avoid # circular-imports. 
# diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 75de4f4..d4a96f0 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,9 +1,10 @@ """This class contains functions relating to trait data manipulation""" import os -import MySQLdb from functools import reduce from typing import Any, Dict, Union, Sequence +import MySQLdb + from gn3.settings import TMPDIR from gn3.random import random_string from gn3.function_helpers import compose @@ -81,10 +82,10 @@ def export_trait_data( def get_trait_csv_sample_data(conn: Any, trait_name: int, phenotype_id: int): """Fetch a trait and return it as a csv string""" - def __float_strip(n): - if str(n)[-2:] == ".0": - return str(int(n)) - return str(n) + def __float_strip(num_str): + if str(num_str)[-2:] == ".0": + return str(int(num_str)) + return str(num_str) sql = ("SELECT DISTINCT Strain.Name, PublishData.value, " "PublishSE.error, NStrain.count FROM " "(PublishData, Strain, PublishXRef, PublishFreeze) " @@ -108,7 +109,7 @@ def get_trait_csv_sample_data(conn: Any, return "\n".join(csv_data) -def update_sample_data(conn: Any, +def update_sample_data(conn: Any, #pylint: disable=[R0913] trait_name: str, strain_name: str, phenotype_id: int, @@ -219,7 +220,7 @@ def delete_sample_data(conn: Any, "WHERE StrainId = %s AND DataId = %s" % (strain_id, data_id))) deleted_n_strains = cursor.rowcount - except Exception as e: + except Exception as e: #pylint: disable=[C0103, W0612] conn.rollback() raise MySQLdb.Error conn.commit() @@ -230,7 +231,7 @@ def delete_sample_data(conn: Any, deleted_se_data, deleted_n_strains) -def insert_sample_data(conn: Any, +def insert_sample_data(conn: Any, #pylint: disable=[R0913] trait_name: str, strain_name: str, phenotype_id: int, @@ -272,7 +273,7 @@ def insert_sample_data(conn: Any, "VALUES (%s, %s, %s)") % (strain_id, data_id, count)) inserted_n_strains = cursor.rowcount - except Exception as e: + except Exception as e: #pylint: disable=[C0103, W0612] conn.rollback() raise MySQLdb.Error return 
(inserted_published_data, @@ -450,7 +451,7 @@ def set_homologene_id_field(trait_type, trait_info, conn): Common postprocessing function for all trait types. Sets the value for the 'homologene' key.""" - def set_to_null(ti): return {**ti, "homologeneid": None} + def set_to_null(ti): return {**ti, "homologeneid": None} # pylint: disable=[C0103, C0321] functions_table = { "Temp": set_to_null, "Geno": set_to_null, @@ -656,8 +657,9 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): query, {"trait_name": trait_info["trait_name"]}) return [dict(zip( - ["sample_name", "value", "se_error", "nstrain", "id"], row)) - for row in cursor.fetchall()] + ["sample_name", "value", "se_error", "nstrain", "id"], + row)) + for row in cursor.fetchall()] return [] @@ -696,8 +698,10 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): "dataset_name": trait_info["db"]["dataset_name"], "species_id": retrieve_species_id( trait_info["db"]["group"], conn)}) - return [dict(zip( - ["sample_name", "value", "se_error", "id"], row)) + return [ + dict(zip( + ["sample_name", "value", "se_error", "id"], + row)) for row in cursor.fetchall()] return [] @@ -728,8 +732,9 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any): query, {"trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) - return [dict(zip( - ["sample_name", "value", "se_error", "nstrain", "id"], row)) + return [ + dict(zip( + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -762,8 +767,9 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): {"cellid": trait_info["cellid"], "trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) - return [dict(zip( - ["sample_name", "value", "se_error", "id"], row)) + return [ + dict(zip( + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -792,8 +798,9 @@ def retrieve_probeset_trait_data(trait_info: Dict, 
conn: Any): query, {"trait_name": trait_info["trait_name"], "dataset_name": trait_info["db"]["dataset_name"]}) - return [dict(zip( - ["sample_name", "value", "se_error", "id"], row)) + return [ + dict(zip( + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] -- cgit v1.2.3