From df8185078a52c89cc5a75ff9be413a236da29a6e Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 09:31:58 +0300 Subject: Implement `get_filename` for correlations Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Implement `get_filename` for the correlations, to be used to determine whether to do fast or normal correlations. This is a migration of the `web.webqtl.correlation.CorrelationPage.getFileName` function in GN1 --- gn3/db/correlations.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 gn3/db/correlations.py (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py new file mode 100644 index 0000000..fa8e7ca --- /dev/null +++ b/gn3/db/correlations.py @@ -0,0 +1,26 @@ +""" +This module will hold functions that are used in the (partial) correlations +feature to access the database to retrieve data needed for computations. +""" + +from typing import Any +def get_filename(target_db_name: str, conn: Any) -> str: + """ + Retrieve the name of the reference database file with which correlations are + computed. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getFileName` function in + GeneNetwork1. + """ + with conn.cursor() as cursor: + cursor.execute( + "SELECT Id, FullName from ProbeSetFreeze WHERE Name-%s", + target_db_name) + result = cursor.fetchone() + if result: + return "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( + tid=result[0], + fname=result[1].replace(' ', '_').replace('/', '_')) + + return "" -- cgit v1.2.3 From 0814eea6b57e45d4337424e63c164d204d03b64d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 12:38:24 +0300 Subject: Implement `fetch_literature_correlations` and depedencies Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate: * `web.webqtl.correlation.CorrelationPage.getTempLiteratureTable` * `web.webqtl.correlation.CorrelationPage.fetchLitCorrelations` from GeneNetwork1. The first function creates and populates a temporary table with the literature correlations data. The second function uses the data in the newly created temporary table to link the trait with the correlation value. --- gn3/db/correlations.py | 113 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index fa8e7ca..67cfef9 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -4,6 +4,10 @@ feature to access the database to retrieve data needed for computations. """ from typing import Any + +from gn3.random import random_string +from gn3.db.species import translate_to_mouse_gene_id + def get_filename(target_db_name: str, conn: Any) -> str: """ Retrieve the name of the reference database file with which correlations are @@ -24,3 +28,112 @@ def get_filename(target_db_name: str, conn: Any) -> str: fname=result[1].replace(' ', '_').replace('/', '_')) return "" + +def build_temporary_literature_table( + species: str, gene_id: int, return_number: int, conn: Any) -> str: + """ + Build and populate a temporary table to hold the literature correlation data + to be used in computations. + + "This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTempLiteratureTable` function in + GeneNetwork1. + """ + def __translated_species_id(row, cursor): + if species == "mouse": + return row[1] + query = { + "rat": "SELECT rat FROM GeneIDXRef WHERE mouse=%s", + "human": "SELECT human FROM GeneIDXRef WHERE mouse=%d"} + if species in query.keys(): + cursor.execute(query[species], row[1]) + record = cursor.fetchone() + if record: + return record[0] + return None + return None + + temp_table_name = f"TOPLITERATURE{random_string(8)}" + with conn.cursor as cursor: + mouse_geneid = translate_to_mouse_gene_id(species, gene_id, conn) + data_query = ( + "SELECT GeneId1, GeneId2, value FROM LCorrRamin3 " + "WHERE GeneId1 = %(mouse_gene_id)s " + "UNION ALL " + "SELECT GeneId2, GeneId1, value FROM LCorrRamin3 " + "WHERE GeneId2 = %(mouse_gene_id)s " + "AND GeneId1 != %(mouse_gene_id)s") + cursor.execute( + (f"CREATE TEMPORARY TABLE {temp_table_name} (" + "GeneId1 int(12) unsigned, " + "GeneId2 int(12) unsigned PRIMARY KEY, " + "value double)")) + cursor.execute(data_query, mouse_gene_id=mouse_geneid) + literature_data = [ + {"GeneId1": row[0], "GeneId2": row[1], "value": row[2]} + for row in cursor.fetchall() + if __translated_species_id(row, cursor)] + + cursor.execute( + (f"INSERT INTO {temp_table_name} " + "VALUES (%(GeneId1)s, %(GeneId2)s, %(value)s)"), + literature_data[0:(2 * return_number)]) + + return temp_table_name + +def fetch_geno_literature_correlations(temp_table: str) -> str: + """ + Helper function for `fetch_literature_correlations` below, to build query + for `Geno*` tables. + """ + return ( + f"SELECT Geno.Name, {temp_table}.value " + "FROM Geno, GenoXRef, GenoFreeze " + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + "WHERE ProbeSet.GeneId IS NOT NULL " + f"AND {temp_table}.value IS NOT NULL " + "AND GenoXRef.GenoFreezeId = GenoFreeze.Id " + "AND GenoFreeze.Name = %(db_name)s " + "AND Geno.Id=GenoXRef.GenoId " + "ORDER BY Geno.Id") + +def fetch_probeset_literature_correlations(temp_table: str) -> str: + """ + Helper function for `fetch_literature_correlations` below, to build query + for `ProbeSet*` tables. + """ + return ( + f"SELECT ProbeSet.Name, {temp_table}.value " + "FROM ProbeSet, ProbeSetXRef, ProbeSetFreeze " + "LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + "WHERE ProbeSet.GeneId IS NOT NULL " + "AND {temp_table}.value IS NOT NULL " + "AND ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetFreeze.Name = %(db_name)s " + "AND ProbeSet.Id=ProbeSetXRef.ProbeSetId " + "ORDER BY ProbeSet.Id") + +def fetch_literature_correlations( + species: str, gene_id: int, dataset: dict, return_number: int, + conn: Any) -> dict: + """ + Gather the literature correlation data and pair it with trait id string(s). + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.fetchLitCorrelations` function in + GeneNetwork1. + """ + temp_table = build_temporary_literature_table( + species, gene_id, return_number, conn) + query_fns = { + "Geno": fetch_geno_literature_correlations, + # "Temp": fetch_temp_literature_correlations, + # "Publish": fetch_publish_literature_correlations, + "ProbeSet": fetch_probeset_literature_correlations} + with conn.cursor as cursor: + cursor.execute( + query_fns[dataset["dataset_type"]](temp_table), + db_name=dataset["dataset_name"]) + results = cursor.fetchall() + cursor.execute("DROP TEMPORARY TABLE %s", temp_table) + return dict(results) # {trait_name: lit_corr for trait_name, lit_corr in results} -- cgit v1.2.3 From c13afb3af166d2b01e4f9fd9b09bb231f0a63cb1 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 25 Oct 2021 19:19:54 +0300 Subject: Start implementation of `fetch_tissue_correlations` and dependencies * compare_tissue_correlation_absolute_values: New function. Complete. Used for sorting of tissue correlation values * fetch_symbol_value_pair_dict: New function. Complete. Maps gene symbols to tissue expression data * fetch_gene_symbol_tissue_value_dict: New function. Complete. Wrapper for `gn3.db.correlations.fetch_symbol_value_pair_dict` function * fetch_tissue_probeset_xref_info: New function. Complete. Retrieves the Probeset XRef information for tissues from the database. * correlations_of_all_tissue_traits: Stub. Dependencies not completed yet. * build_temporary_tissue_correlations_table: Stub. Dependencies not completed yet. * fetch_tissue_correlations: New function. Incomplete. This function calls (a) stub(s) function(s) which is/are under development still. --- gn3/db/correlations.py | 183 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 181 insertions(+), 2 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 67cfef9..87ab082 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -3,9 +3,11 @@ This module will hold functions that are used in the (partial) correlations feature to access the database to retrieve data needed for computations. """ -from typing import Any +from functools import reduce +from typing import Any, Dict, Tuple from gn3.random import random_string +from gn3.data_helpers import partition_all from gn3.db.species import translate_to_mouse_gene_id def get_filename(target_db_name: str, conn: Any) -> str: @@ -136,4 +138,181 @@ def fetch_literature_correlations( db_name=dataset["dataset_name"]) results = cursor.fetchall() cursor.execute("DROP TEMPORARY TABLE %s", temp_table) - return dict(results) # {trait_name: lit_corr for trait_name, lit_corr in results} + return dict(results) + +def compare_tissue_correlation_absolute_values(val1, val2): + """ + Comparison function for use when sorting tissue correlation values. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in + GeneNetwork1.""" + try: + if abs(val1) < abs(val2): + return 1 + if abs(val1) == abs(val2): + return 0 + return -1 + except TypeError: + return 0 + +def fetch_symbol_value_pair_dict( + symbol_list: Tuple[str, ...], data_id_dict: dict, + conn: Any) -> Dict[str, Tuple[float, ...]]: + """ + Map each gene symbols to the corresponding tissue expression data. + + This is a migration of the + `web.webqtl.correlation.correlationFunction.getSymbolValuePairDict` function + in GeneNetwork1. + """ + data_ids = { + symbol: data_id_dict.get(symbol) for symbol in symbol_list + if data_id_dict.get(symbol) is not None + } + query = "SELECT Id, value FROM TissueProbeSetData WHERE Id IN %(data_ids)s" + with conn.cursor() as cursor: + cursor.execute( + query, + data_ids=tuple(data_ids.values())) + value_results = cursor.fetchall() + return { + key: tuple(row[1] for row in value_results if row[0] == key) + for key in data_ids.keys() + } + + return {} + +def fetch_gene_symbol_tissue_value_dict( + symbol_list: Tuple[str, ...], data_id_dict: dict, conn: Any, + limit_num: int = 1000) -> dict:#getGeneSymbolTissueValueDict + """ + Wrapper function for `gn3.db.correlations.fetch_symbol_value_pair_dict`. + + This is a migrations of the + `web.webqtl.correlation.correlationFunction.getGeneSymbolTissueValueDict` in + GeneNetwork1. + """ + count = len(symbol_list) + if count != 0 and count <= limit_num: + return fetch_symbol_value_pair_dict(symbol_list, data_id_dict, conn) + + if count > limit_num: + return { + key: value for dct in [ + fetch_symbol_value_pair_dict(sl, data_id_dict, conn) + for sl in partition_all(limit_num, symbol_list)] + for key, value in dct.items() + } + + return {} + +def fetch_tissue_probeset_xref_info( + gene_name_list: Tuple[str, ...], probeset_freeze_id: int, + conn: Any) -> Tuple[tuple, dict, dict, dict, dict, dict, dict]: + """ + Retrieve the ProbeSet XRef information for tissues. + + This is a migration of the + `web.webqtl.correlation.correlationFunction.getTissueProbeSetXRefInfo` + function in GeneNetwork1.""" + with conn.cursor() as cursor: + if len(gene_name_list) == 0: + query = ( + "SELECT t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, " + "t.description, t.Probe_Target_Description " + "FROM " + "(" + " SELECT Symbol, max(Mean) AS maxmean " + " FROM TissueProbeSetXRef " + " WHERE TissueProbeSetFreezeId=%(probeset_freeze_id)s " + " AND Symbol != '' " + " AND Symbol IS NOT NULL " + " GROUP BY Symbol" + ") AS x " + "INNER JOIN TissueProbeSetXRef AS t ON t.Symbol = x.Symbol " + "AND t.Mean = x.maxmean") + cursor.execute(query, probeset_freeze_id=probeset_freeze_id) + else: + query = ( + "SELECT t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, " + "t.description, t.Probe_Target_Description " + "FROM " + "(" + " SELECT Symbol, max(Mean) AS maxmean " + " FROM TissueProbeSetXRef " + " WHERE TissueProbeSetFreezeId=%(probeset_freeze_id)s " + " AND Symbol in %(symbols)s " + " GROUP BY Symbol" + ") AS x " + "INNER JOIN TissueProbeSetXRef AS t ON t.Symbol = x.Symbol " + "AND t.Mean = x.maxmean") + cursor.execute( + query, probeset_freeze_id=probeset_freeze_id, + symbols=tuple(gene_name_list)) + + results = cursor.fetchall() + + return reduce( + lambda acc, item: ( + acc[0] + (item[0],), + {**acc[1], item[0].lower(): item[1]}, + {**acc[1], item[0].lower(): item[2]}, + {**acc[1], item[0].lower(): item[3]}, + {**acc[1], item[0].lower(): item[4]}, + {**acc[1], item[0].lower(): item[5]}, + {**acc[1], item[0].lower(): item[6]}), + results or tuple(), + (tuple(), {}, {}, {}, {}, {}, {})) + +def correlations_of_all_tissue_traits() -> Tuple[dict, dict]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.calculateCorrOfAllTissueTrait` + function in GeneNetwork1. + """ + raise Exception("Unimplemented!!!") + return ({}, {}) + +def build_temporary_tissue_correlations_table( + trait_symbol: str, probeset_freeze_id: int, method: str, + return_number: int, conn: Any) -> str: + """ + Build a temporary table to hold the tissue correlations data. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in + GeneNetwork1.""" + raise Exception("Unimplemented!!!") + return "" + +def fetch_tissue_correlations( + dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str, + return_number: int, conn: Any) -> dict: + """ + Pair tissue correlations data with a trait id string. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.fetchTissueCorrelations` function in + GeneNetwork1. + """ + temp_table = build_temporary_tissue_correlations_table( + trait_symbol, probeset_freeze_id, method, return_number, conn) + with conn.cursor() as cursor: + cursor.execute( + ( + f"SELECT ProbeSet.Name, {temp_table}.Correlation, " + f"{temp_table}.PValue " + "FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze) " + "LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + "WHERE ProbeSetFreeze.Name = %(db_name) " + "AND ProbeSetFreeze.Id=ProbeSetXRef.ProbeSetFreezeId " + "AND ProbeSet.Id = ProbeSetXRef.ProbeSetId " + "AND ProbeSet.Symbol IS NOT NULL " + "AND %s.Correlation IS NOT NULL"), + db_name=dataset["dataset_name"]) + results = cursor.fetchall() + cursor.execute("DROP TEMPORARY TABLE %s", temp_table) + return { + trait_name: (tiss_corr, tiss_p_val) + for trait_name, tiss_corr, tiss_p_val in results} -- cgit v1.2.3 From 42dee16ec8a7d7620367dd31481999bfca9313db Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 26 Oct 2021 08:59:30 +0300 Subject: Implement `fetch_gene_symbol_tissue_value_dict_for_trait` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Implement `fetch_gene_symbol_tissue_value_dict_for_trait` function which is a migration of the `web.webqtl.correlation.correlationFunction.getGeneSymbolTissueValueDictForTrait` function in GeneNetwork1. --- gn3/db/correlations.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 87ab082..cae8080 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -266,7 +266,22 @@ def fetch_tissue_probeset_xref_info( (tuple(), {}, {}, {}, {}, {}, {})) def correlations_of_all_tissue_traits() -> Tuple[dict, dict]: +def fetch_gene_symbol_tissue_value_dict_for_trait( + gene_name_list: Tuple[str, ...], probeset_freeze_id: int, + conn: Any) -> dict: + """ + Fetches a map of the gene symbols to the tissue values. + + This is a migration of the + `web.webqtl.correlation.correlationFunction.getGeneSymbolTissueValueDictForTrait` + function in GeneNetwork1. """ + xref_info = fetch_tissue_probeset_xref_info( + gene_name_list, probeset_freeze_id, conn) + if xref_info[0]: + return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) + return {} + This is a migration of the `web.webqtl.correlation.CorrelationPage.calculateCorrOfAllTissueTrait` function in GeneNetwork1. -- cgit v1.2.3 From d6e392c2488421ae04b4ffd5de26be40ed86a9b3 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 26 Oct 2021 09:17:52 +0300 Subject: Complete `correlations_of_all_tissue_traits` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Complete the implementation of the `correlations_of_all_tissue_traits` function by providing a call to a non-implemented function. --- gn3/db/correlations.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index cae8080..f43b8a5 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -265,7 +265,6 @@ def fetch_tissue_probeset_xref_info( results or tuple(), (tuple(), {}, {}, {}, {}, {}, {})) -def correlations_of_all_tissue_traits() -> Tuple[dict, dict]: def fetch_gene_symbol_tissue_value_dict_for_trait( gene_name_list: Tuple[str, ...], probeset_freeze_id: int, conn: Any) -> dict: @@ -282,12 +281,25 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) return {} +def correlations_of_all_tissue_traits( + trait_symbol: str, probeset_freeze_id: int, + method: str, conn: Any) -> Tuple[dict, dict]: + """ + Computes and returns the correlation of all tissue traits. + This is a migration of the - `web.webqtl.correlation.CorrelationPage.calculateCorrOfAllTissueTrait` + `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` function in GeneNetwork1. """ - raise Exception("Unimplemented!!!") - return ({}, {}) + primary_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + (trait_symbol,), probeset_freeze_id, conn) + primary_trait_value = primary_trait_symbol_value_dict.vlaues()[0] + symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + tuple(), probeset_freeze_id, conn) + if method == "1": + return batch_computed_tissue_correlation( + primaryTraitValue,SymbolValueDict,method='spearman') + return batch_computed_tissue_correlation(primaryTraitValue,SymbolValueDict) def build_temporary_tissue_correlations_table( trait_symbol: str, probeset_freeze_id: int, method: str, @@ -298,6 +310,8 @@ def build_temporary_tissue_correlations_table( This is a migration of the `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in GeneNetwork1.""" + symbol_corr_dict, symbol_p_value_dict = correlations_of_all_tissue_traits( + trait_symbol, probeset_freeze_id, method, conn) raise Exception("Unimplemented!!!") return "" -- cgit v1.2.3 From 5079e5077adafdbfd0b7e7c0ef12431e9aed443d Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 26 Oct 2021 09:23:48 +0300 Subject: Stub out `batch_computed_tissue_correlation` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Stub out `batch_computed_tissue_correlation` function to be used in implementing the function down the line. --- gn3/db/correlations.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index f43b8a5..54d3079 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -281,6 +281,14 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) return {} +def batch_computed_tissue_correlation( + trait_value: str, symbol_value_dict: dict, + method: str = "pearson") -> Tuple[dict, dict]: + """ + `web.webqtl.correlation.correlationFunction.batchCalTissueCorr`""" + raise Exception("Not implemented!") + return ({}, {}) + def correlations_of_all_tissue_traits( trait_symbol: str, probeset_freeze_id: int, method: str, conn: Any) -> Tuple[dict, dict]: -- cgit v1.2.3 From 28b0ced4ec13451c5c7323ed5135d126f296836a Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 04:55:30 +0300 Subject: Move the function to computations module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * The function `batch_computed_tissue_correlation` is a pure computations function with no expressions accessing the database, as far as I can tell, therefore, this commit moves the function over to the gn3.computations.partial_correlations module that holds the pure computation functions. --- gn3/computations/partial_correlations.py | 8 ++++++++ gn3/db/correlations.py | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 1fb0ccc..b3de31c 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -122,3 +122,11 @@ def find_identical_traits( (primary_name,) + control_names), {}).items() if len(item[1]) > 1), tuple())) + +def batch_computed_tissue_correlation( + trait_value: str, symbol_value_dict: dict, + method: str = "pearson") -> Tuple[dict, dict]: + """ + `web.webqtl.correlation.correlationFunction.batchCalTissueCorr`""" + raise Exception("Not implemented!") + return ({}, {}) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 54d3079..f43b8a5 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -281,14 +281,6 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) return {} -def batch_computed_tissue_correlation( - trait_value: str, symbol_value_dict: dict, - method: str = "pearson") -> Tuple[dict, dict]: - """ - `web.webqtl.correlation.correlationFunction.batchCalTissueCorr`""" - raise Exception("Not implemented!") - return ({}, {}) - def correlations_of_all_tissue_traits( trait_symbol: str, probeset_freeze_id: int, method: str, conn: Any) -> Tuple[dict, dict]: -- cgit v1.2.3 From 5a9db2162a0a694a76a256996bb296ff06c75126 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 06:59:57 +0300 Subject: Move `correlations_of_all_tissue_traits` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/computations/partial_correlations.py: new function (`correlations_of_all_tissue_traits`). * gn3/db/correlations.py: delete function (`correlations_of_all_tissue_traits`). Move the function to `gn3.computations.partial_correlations` module and comment out the db-access code. Rework it to receive, as arguments, the data it previously fetched from the database, and add comments on future rework to get the function working again. --- gn3/computations/partial_correlations.py | 27 +++++++++++++++++++++++++++ gn3/db/correlations.py | 20 -------------------- 2 files changed, 27 insertions(+), 20 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index e73edfd..4ba2ba4 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -157,3 +157,30 @@ def batch_computed_tissue_correlation( `web.webqtl.correlation.correlationFunction.batchCalTissueCorr`""" raise Exception("Not implemented!") return ({}, {}) + +def correlations_of_all_tissue_traits( + primary_trait_symbol_value_dict: dict, symbol_value_dict: dict, + method: str) -> Tuple[dict, dict]: + """ + Computes and returns the correlation of all tissue traits. + + This is a migration of the + `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` + function in GeneNetwork1. + """ + # The section below existed in the original function, but with the migration + # and the proposed rework (in the near future), the values from the database + # should be passed into this function, rather than have the function fetch + # the data for itself. + # --------------------------------------------------- + # primary_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + # (trait_symbol,), probeset_freeze_id, conn) + # primary_trait_values = primary_trait_symbol_value_dict.vlaues()[0] + # symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + # tuple(), probeset_freeze_id, conn) + # --------------------------------------------------- + # We might end up actually getting rid of this function all together as the + # rework is done. + primary_trait_values = primary_trait_symbol_value_dict.values()[0] + return batch_computed_tissue_correlation( + primary_trait_values, symbol_value_dict, method) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index f43b8a5..39ed499 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -281,26 +281,6 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return fetch_gene_symbol_tissue_value_dict(xref_info[0], xref_info[2], conn) return {} -def correlations_of_all_tissue_traits( - trait_symbol: str, probeset_freeze_id: int, - method: str, conn: Any) -> Tuple[dict, dict]: - """ - Computes and returns the correlation of all tissue traits. - - This is a migration of the - `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` - function in GeneNetwork1. - """ - primary_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( - (trait_symbol,), probeset_freeze_id, conn) - primary_trait_value = primary_trait_symbol_value_dict.vlaues()[0] - symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( - tuple(), probeset_freeze_id, conn) - if method == "1": - return batch_computed_tissue_correlation( - primaryTraitValue,SymbolValueDict,method='spearman') - return batch_computed_tissue_correlation(primaryTraitValue,SymbolValueDict) - def build_temporary_tissue_correlations_table( trait_symbol: str, probeset_freeze_id: int, method: str, return_number: int, conn: Any) -> str: -- cgit v1.2.3 From 773c0896ccbed12170be2b5aed4554ab86d923b5 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 08:00:27 +0300 Subject: Complete `build_temporary_tissue_correlations_table` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/computations/partial_correlations.py: Remove comments after updating usage of the function at call point * gn3/db/correlations.py: Complete the implementation of the `build_temporary_tissue_correlations_table` function --- gn3/computations/partial_correlations.py | 13 ------------ gn3/db/correlations.py | 36 +++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index d095185..5777a0b 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -173,19 +173,6 @@ def correlations_of_all_tissue_traits( `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` function in GeneNetwork1. """ - # The section below existed in the original function, but with the migration - # and the proposed rework (in the near future), the values from the database - # should be passed into this function, rather than have the function fetch - # the data for itself. - # --------------------------------------------------- - # primary_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( - # (trait_symbol,), probeset_freeze_id, conn) - # primary_trait_values = primary_trait_symbol_value_dict.vlaues()[0] - # symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( - # tuple(), probeset_freeze_id, conn) - # --------------------------------------------------- - # We might end up actually getting rid of this function all together as the - # rework is done. primary_trait_values = primary_trait_symbol_value_dict.values()[0] return batch_computed_tissue_correlation( primary_trait_values, symbol_value_dict, method) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 39ed499..28f050a 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -290,10 +290,40 @@ def build_temporary_tissue_correlations_table( This is a migration of the `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in GeneNetwork1.""" + # We should probably pass the `correlations_of_all_tissue_traits` function + # as an argument to this function and get rid of the two lines immediately + # following this comment. + from gn3.computations.partial_correlations import correlations_of_all_tissue_traits symbol_corr_dict, symbol_p_value_dict = correlations_of_all_tissue_traits( - trait_symbol, probeset_freeze_id, method, conn) - raise Exception("Unimplemented!!!") - return "" + fetch_gene_symbol_tissue_value_dict_for_trait( + (trait_symbol,), probeset_freeze_id, conn), + fetch_gene_symbol_tissue_value_dict_for_trait( + tuple(), probeset_freeze_id, conn), + method) + + symbol_corr_list = sorted( + symbol_corr_dict.items(), + key=compare_tissue_correlation_absolute_values) + + temp_table_name = f"TOPTISSUE{random_string(8)}" + create_query = ( + "CREATE TEMPORARY TABLE {temp_table_name}" + "(Symbol varchar(100) PRIMARY KEY, Correlation float, PValue float)") + insert_query = ( + f"INSERT INTO {temp_table_name}(Symbol, Correlation, PValue) " + " VALUES (%(symbol)s, %(correlation)s, %(pvalue)s)") + + with conn.cursor() as cursor: + cursor.execute(create_query) + cursor.execute( + insert_query, + tuple({ + "symbol": symbol, + "correlation": corr, + "pvalue": symbol_p_value_dict[symbol] + } for symbol, corr in symbol_corr_list[0: 2 * return_number])) + + return temp_table_name def fetch_tissue_correlations( dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str, -- cgit v1.2.3 From 307a83b897b9ece7c9dd1af49bdedc9e1320eb61 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 08:25:13 +0300 Subject: Rework sorting: remove `compare_tissue_correlation_absolute_values` Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * gn3/db/correlations.py: Remove the `compare_tissue_correlation_absolute_values` function which is no longer needed. --- gn3/db/correlations.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 28f050a..d7954e5 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -140,22 +140,6 @@ def fetch_literature_correlations( cursor.execute("DROP TEMPORARY TABLE %s", temp_table) return dict(results) -def compare_tissue_correlation_absolute_values(val1, val2): - """ - Comparison function for use when sorting tissue correlation values. - - This is a partial migration of the - `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in - GeneNetwork1.""" - try: - if abs(val1) < abs(val2): - return 1 - if abs(val1) == abs(val2): - return 0 - return -1 - except TypeError: - return 0 - def fetch_symbol_value_pair_dict( symbol_list: Tuple[str, ...], data_id_dict: dict, conn: Any) -> Dict[str, Tuple[float, ...]]: @@ -302,8 +286,7 @@ def build_temporary_tissue_correlations_table( method) symbol_corr_list = sorted( - symbol_corr_dict.items(), - key=compare_tissue_correlation_absolute_values) + symbol_corr_dict.items(), key=lambda key_val: key_val[1]) temp_table_name = f"TOPTISSUE{random_string(8)}" create_query = ( -- cgit v1.2.3 From 9ceb958273b8d86d220fa0d2f040fcb4a8233586 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 29 Oct 2021 08:28:19 +0300 Subject: Fix linting and typing errors Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi --- gn3/computations/partial_correlations.py | 2 +- gn3/db/correlations.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 5777a0b..fce6ad2 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -173,6 +173,6 @@ def correlations_of_all_tissue_traits( `web.webqtl.correlation.correlationFunction.calculateCorrOfAllTissueTrait` function in GeneNetwork1. """ - primary_trait_values = primary_trait_symbol_value_dict.values()[0] + primary_trait_values = tuple(primary_trait_symbol_value_dict.values())[0] return batch_computed_tissue_correlation( primary_trait_values, symbol_value_dict, method) diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index d7954e5..d94759a 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -10,6 +10,8 @@ from gn3.random import random_string from gn3.data_helpers import partition_all from gn3.db.species import translate_to_mouse_gene_id +from gn3.computations.partial_correlations import correlations_of_all_tissue_traits + def get_filename(target_db_name: str, conn: Any) -> str: """ Retrieve the name of the reference database file with which correlations are @@ -275,9 +277,8 @@ def build_temporary_tissue_correlations_table( `web.webqtl.correlation.CorrelationPage.getTempTissueCorrTable` function in GeneNetwork1.""" # We should probably pass the `correlations_of_all_tissue_traits` function - # as an argument to this function and get rid of the two lines immediately + # as an argument to this function and get rid of the one call immediately # following this comment. - from gn3.computations.partial_correlations import correlations_of_all_tissue_traits symbol_corr_dict, symbol_p_value_dict = correlations_of_all_tissue_traits( fetch_gene_symbol_tissue_value_dict_for_trait( (trait_symbol,), probeset_freeze_id, conn), @@ -308,7 +309,7 @@ def build_temporary_tissue_correlations_table( return temp_table_name -def fetch_tissue_correlations( +def fetch_tissue_correlations(# pylint: disable=R0913 dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str, return_number: int, conn: Any) -> dict: """ -- cgit v1.2.3 From 4a6be7e1b6514f3c7db8c672970b27e27ecde305 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 1 Nov 2021 06:01:58 +0300 Subject: Add some condition checking functions Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Add the `check_for_literature_info` and `check_symbol_for_tissue_correlation` functions to check for the presence of specific data. --- gn3/db/correlations.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index d94759a..06b3310 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -339,3 +339,43 @@ def fetch_tissue_correlations(# pylint: disable=R0913 return { trait_name: (tiss_corr, tiss_p_val) for trait_name, tiss_corr, tiss_p_val in results} + +def check_for_literature_info(conn: Any, geneid: int) -> bool: + """ + Checks the database to find out whether the trait with `geneid` has any + associated literature. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.checkForLitInfo` function in + GeneNetwork1. + """ + query = "SELECT 1 FROM LCorrRamin3 WHERE GeneId1=%s LIMIT 1" + with conn.cursor() as cursor: + cursor.execute(query, geneid) + result = cursor.fetchone() + if result: + return True + + return False + +def check_symbol_for_tissue_correlation( + conn: Any, tissue_probeset_freeze_id: int, symbol: str = "") -> bool: + """ + Checks whether a symbol has any associated tissue correlations. + + This is a migration of the + `web.webqtl.correlation.CorrelationPage.checkSymbolForTissueCorr` function + in GeneNetwork1. + """ + query = ( + "SELECT 1 FROM TissueProbeSetXRef " + "WHERE TissueProbeSetFreezeId=%(probeset_freeze_id)s " + "AND Symbol=%(symbol)s LIMIT 1") + with conn.cursor() as cursor: + cursor.execute( + query, probeset_freeze_id=tissue_probeset_freeze_id, symbol=symbol) + result = cursor.fetchone() + if result: + return True + + return False -- cgit v1.2.3 From 854416ba850b7793aa9aa95528b89bc69df26888 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 22 Nov 2021 09:06:52 +0300 Subject: Make the DB connection argument the first Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * To make the code more composable down the line, make the database connection argument the first argument for functions that access the database, since they will always require the connection. --- gn3/db/correlations.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 06b3310..f327dc3 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -34,7 +34,7 @@ def get_filename(target_db_name: str, conn: Any) -> str: return "" def build_temporary_literature_table( - species: str, gene_id: int, return_number: int, conn: Any) -> str: + conn: Any, species: str, gene_id: int, return_number: int) -> str: """ Build and populate a temporary table to hold the literature correlation data to be used in computations. @@ -128,7 +128,7 @@ def fetch_literature_correlations( GeneNetwork1. """ temp_table = build_temporary_literature_table( - species, gene_id, return_number, conn) + conn, species, gene_id, return_number) query_fns = { "Geno": fetch_geno_literature_correlations, # "Temp": fetch_temp_literature_correlations, @@ -268,8 +268,8 @@ def fetch_gene_symbol_tissue_value_dict_for_trait( return {} def build_temporary_tissue_correlations_table( - trait_symbol: str, probeset_freeze_id: int, method: str, - return_number: int, conn: Any) -> str: + conn: Any, trait_symbol: str, probeset_freeze_id: int, method: str, + return_number: int) -> str: """ Build a temporary table to hold the tissue correlations data. @@ -320,7 +320,7 @@ def fetch_tissue_correlations(# pylint: disable=R0913 GeneNetwork1. """ temp_table = build_temporary_tissue_correlations_table( - trait_symbol, probeset_freeze_id, method, return_number, conn) + conn, trait_symbol, probeset_freeze_id, method, return_number) with conn.cursor() as cursor: cursor.execute( ( -- cgit v1.2.3 From 55d698b1fb07afe74bf1dd570f9f495aefea1086 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 22 Nov 2021 12:03:21 +0300 Subject: Migrate `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate the `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function from GN1 to GN3. --- gn3/db/correlations.py | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index f327dc3..ff570b4 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -379,3 +379,150 @@ def check_symbol_for_tissue_correlation( return True return False + +def fetch_sample_ids( + conn: Any, sample_names: Tuple[str, ...], species_name: str) -> Tuple[ + int, ...]: + """ + Given a sequence of sample names, and a species name, return the sample ids + that correspond to both. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in + GeneNetwork1. + """ + query = ( + "SELECT Strain.Id FROM Strain, Species " + "WHERE Strain.Name IN %(samples_names)s " + "AND Strain.SpeciesId=Species.Id " + "AND Species.name=%(species_name)s") + with conn.cursor() as cursor: + cursor.execute( + query, samples_names=tuple(samples), + species_name=species) + return cursor.fetchall() + +def fetch_all_database_data( + conn: Any, species: str, gene_id: int, gene_symbol: str, + samples: Tuple[str, ...], db_type: str, db_name: str, method: str, + returnNumber: int, tissueProbeSetFreezeId: int) -> Tuple[Any, Any]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in + GeneNetwork1. + """ + def __build_query_sgo_lit__(temp_table, sample_id_columns, joins): + return ( + (f"SELECT {db_type}.Name, {temp_table}.value " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + + " ".join(joins) + + f" WHERE ProbeSet.GeneId IS NOT NULL " + + f"AND {temp_table}.value IS NOT NULL " + + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + + f"ORDER BY {db_type}.Id"), + 2) + + def __build_query_tissue_corr__(temp_table, sample_id_columns, joins): + return ( + (f"SELECT {db_type}.Name, {temp_table}.Correlation, " + + f"{temp_table}.PValue, " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + + " ".join(joins) + + f" WHERE ProbeSet.Symbol IS NOT NULL " + + f"AND {temp_table}.Correlation IS NOT NULL " + + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.%sId " + f"ORDER BY {db_type}.Id"), + 3) + + def __build_query__(sample_ids, temp_table): + sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in samples_ids) + if db_type == "Publish": + joins = tuple( + ("LEFT JOIN PublishData AS T{item} " + "ON T{item}.Id = PublishXRef.DataId " + "AND T{item}.StrainId = %(T{item}_sample_id)s") + for item in sample_ids) + return ( + ("SELECT PublishXRef.Id, " + + sample_id_columns + + "FROM (PublishXRef, PublishFreeze) " + + " ".join(joins) + + " WHERE PublishXRef.InbredSetId = PublishFreeze.InbredSetId " + "AND PublishFreeze.Name = %(db_name)s"), + 1) + if temp_table is not None: + joins = tuple( + ("LEFT JOIN {db_type}Data AS T{item} " + "ON T{item}.Id = {db_type}XRef.DataId " + "AND T{item}.StrainId=%(T{item}_sample_id)s") + for item in sample_ids) + if method.lower() == "sgo literature correlation": + return __build_query_sgo_lit__( + sample_ids, temp_table, sample_id_columns) + if method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + return __build_query_tissue_corr__( + sample_ids, temp_table, sample_id_columns) + joins = tuple( + (f"LEFT JOIN {db_type}Data AS T{item} " + f"ON T{item}.Id = {db_type}XRef.DataId " + f"AND T{item}.StrainId = %(T{item}_sample_id)s") + for item in sample_ids) + return ( + ( + f"SELECT {db_type}.Name, " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + " ".join(joins) + + f" WHERE {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + + f"ORDER BY {db_type}.Id"), + 1) + + def __fetch_data__(sample_ids, temp_table): + query, data_start_pos = __build_query__(sample_ids, temp_table) + with conn.cursor() as cursor: + cursor.execute( + query, db_name=db_name, + **{f"T{item}_sample_id": item for item in sample_ids}) + return cursor.fetchall() + + sample_ids = tuple( + # look into graduating this to an argument and removing the `samples` + # and `species` argument: function currying and compositions might help + # with this + f"{sample_id}" for sample_id in + fetch_sample_ids(conn, samples, species)) + + temp_table = None + if gene_id and db_type == "probeset": + if method.lower() == "sgo literature correlation": + temp_table = build_temporary_literature_table( + conn, species, gene_id, return_number) + if method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + temp_table = build_temporary_tissue_correlations_table( + conn, trait_symbol, probeset_freeze_id, method, return_number) + + trait_database = tuple( + item for sublist in + (__fetch_data__(ssample_ids, temp_table) + for ssample_ids in partition_all(25, sample_ids)) + for item in sublist) + + if temp_table: + with conn.cursor() as cursor: + cursor.execute(f"DROP TEMPORARY TABLE {temp_table}") + + return trait_database, data_start_pos -- cgit v1.2.3 From 575da0baf4468d27782c73b19995b3adb934ba70 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 22 Nov 2021 13:56:03 +0300 Subject: Add test to query builders Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Add some tests for the query builders to ensure that the queries are built up correctly. --- gn3/db/correlations.py | 78 +++++++++++++++++---------------- tests/unit/db/test_correlation.py | 90 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 36 deletions(-) create mode 100644 tests/unit/db/test_correlation.py (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index ff570b4..7daff87 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -402,6 +402,43 @@ def fetch_sample_ids( species_name=species) return cursor.fetchall() +def build_query_sgo_lit_corr( + db_type: str, temp_table: str, sample_id_columns: str, + joins: Tuple[str, ...]) -> str: + """ + Build query for `SGO Literature Correlation` data, when querying the given + `temp_table` temporary table. + """ + return ( + (f"SELECT {db_type}.Name, {temp_table}.value, " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + + " ".join(joins) + + f" WHERE ProbeSet.GeneId IS NOT NULL " + + f"AND {temp_table}.value IS NOT NULL " + + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + + f"ORDER BY {db_type}.Id"), + 2) + +def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): + return ( + (f"SELECT {db_type}.Name, {temp_table}.Correlation, " + + f"{temp_table}.PValue, " + + sample_id_columns + + f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + + f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + + " ".join(joins) + + f" WHERE ProbeSet.Symbol IS NOT NULL " + + f"AND {temp_table}.Correlation IS NOT NULL " + + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + + f"AND {db_type}Freeze.Name = %(db_name)s " + + f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + f"ORDER BY {db_type}.Id"), + 3) + def fetch_all_database_data( conn: Any, species: str, gene_id: int, gene_symbol: str, samples: Tuple[str, ...], db_type: str, db_name: str, method: str, @@ -411,37 +448,6 @@ def fetch_all_database_data( `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in GeneNetwork1. """ - def __build_query_sgo_lit__(temp_table, sample_id_columns, joins): - return ( - (f"SELECT {db_type}.Name, {temp_table}.value " + - sample_id_columns + - f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + - f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + - " ".join(joins) + - f" WHERE ProbeSet.GeneId IS NOT NULL " + - f"AND {temp_table}.value IS NOT NULL " + - f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + - f"AND {db_type}Freeze.Name = %(db_name)s " + - f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " + - f"ORDER BY {db_type}.Id"), - 2) - - def __build_query_tissue_corr__(temp_table, sample_id_columns, joins): - return ( - (f"SELECT {db_type}.Name, {temp_table}.Correlation, " + - f"{temp_table}.PValue, " + - sample_id_columns + - f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + - f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + - " ".join(joins) + - f" WHERE ProbeSet.Symbol IS NOT NULL " + - f"AND {temp_table}.Correlation IS NOT NULL " + - f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + - f"AND {db_type}Freeze.Name = %(db_name)s " + - f"AND {db_type}.Id = {db_type}XRef.%sId " - f"ORDER BY {db_type}.Id"), - 3) - def __build_query__(sample_ids, temp_table): sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in samples_ids) if db_type == "Publish": @@ -460,17 +466,17 @@ def fetch_all_database_data( 1) if temp_table is not None: joins = tuple( - ("LEFT JOIN {db_type}Data AS T{item} " - "ON T{item}.Id = {db_type}XRef.DataId " - "AND T{item}.StrainId=%(T{item}_sample_id)s") + (f"LEFT JOIN {db_type}Data AS T{item} " + f"ON T{item}.Id = {db_type}XRef.DataId " + f"AND T{item}.StrainId=%(T{item}_sample_id)s") for item in sample_ids) if method.lower() == "sgo literature correlation": - return __build_query_sgo_lit__( + return build_query_sgo_lit_corr( sample_ids, temp_table, sample_id_columns) if method.lower() in ( "tissue correlation, pearson's r", "tissue correlation, spearman's rho"): - return __build_query_tissue_corr__( + return build_query_tissue_corr( sample_ids, temp_table, sample_id_columns) joins = tuple( (f"LEFT JOIN {db_type}Data AS T{item} " diff --git a/tests/unit/db/test_correlation.py b/tests/unit/db/test_correlation.py new file mode 100644 index 0000000..866d28d --- /dev/null +++ b/tests/unit/db/test_correlation.py @@ -0,0 +1,90 @@ +""" +Tests for the gn3.db.correlations module +""" + +from unittest import TestCase + +from gn3.db.correlations import ( + build_query_sgo_lit_corr, + build_query_tissue_corr) + +class TestCorrelation(TestCase): + """Test cases for correlation data fetching functions""" + maxDiff = None + + def test_build_query_sgo_lit_corr(self): + self.assertEqual( + build_query_sgo_lit_corr( + "Probeset", + "temp_table_xy45i7wd", + "T1.value, T2.value, T3.value", + (("LEFT JOIN ProbesetData AS T1 " + "ON T1.Id = ProbesetXRef.DataId " + "AND T1.StrainId=%(T1_sample_id)s"), + ( + "LEFT JOIN ProbesetData AS T2 " + "ON T2.Id = ProbesetXRef.DataId " + "AND T2.StrainId=%(T2_sample_id)s"), + ( + "LEFT JOIN ProbesetData AS T3 " + "ON T3.Id = ProbesetXRef.DataId " + "AND T3.StrainId=%(T3_sample_id)s"))), + (("SELECT Probeset.Name, temp_table_xy45i7wd.value, " + "T1.value, T2.value, T3.value " + "FROM (Probeset, ProbesetXRef, ProbesetFreeze) " + "LEFT JOIN temp_table_xy45i7wd ON temp_table_xy45i7wd.GeneId2=ProbeSet.GeneId " + "LEFT JOIN ProbesetData AS T1 " + "ON T1.Id = ProbesetXRef.DataId " + "AND T1.StrainId=%(T1_sample_id)s " + "LEFT JOIN ProbesetData AS T2 " + "ON T2.Id = ProbesetXRef.DataId " + "AND T2.StrainId=%(T2_sample_id)s " + "LEFT JOIN ProbesetData AS T3 " + "ON T3.Id = ProbesetXRef.DataId " + "AND T3.StrainId=%(T3_sample_id)s " + "WHERE ProbeSet.GeneId IS NOT NULL " + "AND temp_table_xy45i7wd.value IS NOT NULL " + "AND ProbesetXRef.ProbesetFreezeId = ProbesetFreeze.Id " + "AND ProbesetFreeze.Name = %(db_name)s " + "AND Probeset.Id = ProbesetXRef.ProbesetId " + "ORDER BY Probeset.Id"), + 2)) + + def test_build_query_tissue_corr(self): + self.assertEqual( + build_query_tissue_corr( + "Probeset", + "temp_table_xy45i7wd", + "T1.value, T2.value, T3.value", + (("LEFT JOIN ProbesetData AS T1 " + "ON T1.Id = ProbesetXRef.DataId " + "AND T1.StrainId=%(T1_sample_id)s"), + ( + "LEFT JOIN ProbesetData AS T2 " + "ON T2.Id = ProbesetXRef.DataId " + "AND T2.StrainId=%(T2_sample_id)s"), + ( + "LEFT JOIN ProbesetData AS T3 " + "ON T3.Id = ProbesetXRef.DataId " + "AND T3.StrainId=%(T3_sample_id)s"))), + (("SELECT Probeset.Name, temp_table_xy45i7wd.Correlation, " + "temp_table_xy45i7wd.PValue, " + "T1.value, T2.value, T3.value " + "FROM (Probeset, ProbesetXRef, ProbesetFreeze) " + "LEFT JOIN temp_table_xy45i7wd ON temp_table_xy45i7wd.Symbol=ProbeSet.Symbol " + "LEFT JOIN ProbesetData AS T1 " + "ON T1.Id = ProbesetXRef.DataId " + "AND T1.StrainId=%(T1_sample_id)s " + "LEFT JOIN ProbesetData AS T2 " + "ON T2.Id = ProbesetXRef.DataId " + "AND T2.StrainId=%(T2_sample_id)s " + "LEFT JOIN ProbesetData AS T3 " + "ON T3.Id = ProbesetXRef.DataId " + "AND T3.StrainId=%(T3_sample_id)s " + "WHERE ProbeSet.Symbol IS NOT NULL " + "AND temp_table_xy45i7wd.Correlation IS NOT NULL " + "AND ProbesetXRef.ProbesetFreezeId = ProbesetFreeze.Id " + "AND ProbesetFreeze.Name = %(db_name)s " + "AND Probeset.Id = ProbesetXRef.ProbesetId " + "ORDER BY Probeset.Id"), + 3)) -- cgit v1.2.3 From e1fb18b9d4a3b4ab9783f58d78ff384141567a42 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 23 Nov 2021 11:49:32 +0300 Subject: Update documentation for functions Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Document functions for posterity. --- gn3/db/correlations.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 7daff87..5c3e7b8 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -408,6 +408,10 @@ def build_query_sgo_lit_corr( """ Build query for `SGO Literature Correlation` data, when querying the given `temp_table` temporary table. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in + GeneNetwork1. """ return ( (f"SELECT {db_type}.Name, {temp_table}.value, " + @@ -424,6 +428,14 @@ def build_query_sgo_lit_corr( 2) def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): + """ + Build query for `Tissue Correlation` data, when querying the given + `temp_table` temporary table. + + This is a partial migration of the + `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in + GeneNetwork1. + """ return ( (f"SELECT {db_type}.Name, {temp_table}.Correlation, " + f"{temp_table}.PValue, " + -- cgit v1.2.3 From df4ed9183f3efd89d54bba1a144c48475f4b8169 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 23 Nov 2021 12:34:44 +0300 Subject: Fix a myriad of linting errors * Fix linting errors like: - Unused variables - Undeclared variable errors (mostly caused by typos, and wrong names) - Missing documentation strings for functions etc. --- gn3/computations/partial_correlations.py | 4 +++- gn3/db/correlations.py | 24 ++++++++++++------------ tests/unit/db/test_correlation.py | 6 ++++++ 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 4bd26a2..f43c4d4 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -200,11 +200,13 @@ def good_dataset_samples_indexes( samples_from_file.index(good) for good in set(samples).intersection(set(samples_from_file)))) -def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914] +def partial_correlations_fast(# pylint: disable=[R0913, R0914] samples, primary_vals, control_vals, database_filename, fetched_correlations, method: str, correlation_type: str) -> Tuple[ float, Tuple[float, ...]]: """ + Computes partial correlation coefficients using data from a CSV file. + This is a partial migration of the `web.webqtl.correlation.PartialCorrDBPage.getPartialCorrelationsFast` function in GeneNetwork1. diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 5c3e7b8..a1daa3c 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -398,8 +398,8 @@ def fetch_sample_ids( "AND Species.name=%(species_name)s") with conn.cursor() as cursor: cursor.execute( - query, samples_names=tuple(samples), - species_name=species) + query, samples_names=tuple(sample_names), + species_name=species_name) return cursor.fetchall() def build_query_sgo_lit_corr( @@ -419,7 +419,7 @@ def build_query_sgo_lit_corr( f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " + " ".join(joins) + - f" WHERE ProbeSet.GeneId IS NOT NULL " + + " WHERE ProbeSet.GeneId IS NOT NULL " + f"AND {temp_table}.value IS NOT NULL " + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + f"AND {db_type}Freeze.Name = %(db_name)s " + @@ -443,7 +443,7 @@ def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " + f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " + " ".join(joins) + - f" WHERE ProbeSet.Symbol IS NOT NULL " + + " WHERE ProbeSet.Symbol IS NOT NULL " + f"AND {temp_table}.Correlation IS NOT NULL " + f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " + f"AND {db_type}Freeze.Name = %(db_name)s " + @@ -451,17 +451,17 @@ def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): f"ORDER BY {db_type}.Id"), 3) -def fetch_all_database_data( - conn: Any, species: str, gene_id: int, gene_symbol: str, +def fetch_all_database_data(# pylint: disable=[R0913, R0914] + conn: Any, species: str, gene_id: int, trait_symbol: str, samples: Tuple[str, ...], db_type: str, db_name: str, method: str, - returnNumber: int, tissueProbeSetFreezeId: int) -> Tuple[Any, Any]: + return_number: int, probeset_freeze_id: int) -> Tuple[Any, Any]: """ This is a migration of the `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in GeneNetwork1. """ def __build_query__(sample_ids, temp_table): - sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in samples_ids) + sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in sample_ids) if db_type == "Publish": joins = tuple( ("LEFT JOIN PublishData AS T{item} " @@ -484,12 +484,12 @@ def fetch_all_database_data( for item in sample_ids) if method.lower() == "sgo literature correlation": return build_query_sgo_lit_corr( - sample_ids, temp_table, sample_id_columns) + sample_ids, temp_table, sample_id_columns, joins) if method.lower() in ( "tissue correlation, pearson's r", "tissue correlation, spearman's rho"): return build_query_tissue_corr( - sample_ids, temp_table, sample_id_columns) + sample_ids, temp_table, sample_id_columns, joins) joins = tuple( (f"LEFT JOIN {db_type}Data AS T{item} " f"ON T{item}.Id = {db_type}XRef.DataId " @@ -513,7 +513,7 @@ def fetch_all_database_data( cursor.execute( query, db_name=db_name, **{f"T{item}_sample_id": item for item in sample_ids}) - return cursor.fetchall() + return (cursor.fetchall(), data_start_pos) sample_ids = tuple( # look into graduating this to an argument and removing the `samples` @@ -543,4 +543,4 @@ def fetch_all_database_data( with conn.cursor() as cursor: cursor.execute(f"DROP TEMPORARY TABLE {temp_table}") - return trait_database, data_start_pos + return (tuple(item[0] for item in trait_database), trait_database[0][1]) diff --git a/tests/unit/db/test_correlation.py b/tests/unit/db/test_correlation.py index 866d28d..3f940b2 100644 --- a/tests/unit/db/test_correlation.py +++ b/tests/unit/db/test_correlation.py @@ -13,6 +13,9 @@ class TestCorrelation(TestCase): maxDiff = None def test_build_query_sgo_lit_corr(self): + """ + Test that the literature correlation query is built correctly. + """ self.assertEqual( build_query_sgo_lit_corr( "Probeset", @@ -51,6 +54,9 @@ class TestCorrelation(TestCase): 2)) def test_build_query_tissue_corr(self): + """ + Test that the tissue correlation query is built correctly. + """ self.assertEqual( build_query_tissue_corr( "Probeset", -- cgit v1.2.3 From f88d8e4446c8c9d6693197a452669e0e9c8ea812 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:44:53 +0300 Subject: Fix query parametrisation Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Pass parameters to the query the way the MySQL driver expects. --- gn3/db/correlations.py | 16 ++++++++++------ gn3/db/species.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index a1daa3c..c838597 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -23,8 +23,8 @@ def get_filename(target_db_name: str, conn: Any) -> str: """ with conn.cursor() as cursor: cursor.execute( - "SELECT Id, FullName from ProbeSetFreeze WHERE Name-%s", - target_db_name) + "SELECT Id, FullName from ProbeSetFreeze WHERE Name=%s", + (target_db_name,)) result = cursor.fetchone() if result: return "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( @@ -398,9 +398,12 @@ def fetch_sample_ids( "AND Species.name=%(species_name)s") with conn.cursor() as cursor: cursor.execute( - query, samples_names=tuple(sample_names), - species_name=species_name) return cursor.fetchall() + query, + { + "samples_names": tuple(sample_names), + "species_name": species_name + }) def build_query_sgo_lit_corr( db_type: str, temp_table: str, sample_id_columns: str, @@ -511,8 +514,9 @@ def fetch_all_database_data(# pylint: disable=[R0913, R0914] query, data_start_pos = __build_query__(sample_ids, temp_table) with conn.cursor() as cursor: cursor.execute( - query, db_name=db_name, - **{f"T{item}_sample_id": item for item in sample_ids}) + query, + {"db_name": db_name, + **{f"T{item}_sample_id": item for item in sample_ids}}) return (cursor.fetchall(), data_start_pos) sample_ids = tuple( diff --git a/gn3/db/species.py b/gn3/db/species.py index 20170ba..5b8e096 100644 --- a/gn3/db/species.py +++ b/gn3/db/species.py @@ -71,6 +71,6 @@ def species_name(conn: Any, group: str) -> str: ("SELECT Species.Name FROM Species, InbredSet " "WHERE InbredSet.Name = %(group_name)s " "AND InbredSet.SpeciesId = Species.Id"), - group_name=group_name) + {"group_name": group}) return cursor.fetchone()[0] return None -- cgit v1.2.3 From d29ef85d44fff9414932d64850f65cb268b1cddd Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:48:26 +0300 Subject: Update typing notations on functions Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi --- gn3/db/correlations.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index c838597..898de75 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -4,7 +4,7 @@ feature to access the database to retrieve data needed for computations. """ from functools import reduce -from typing import Any, Dict, Tuple +from typing import Any, Dict, Tuple, Union from gn3.random import random_string from gn3.data_helpers import partition_all @@ -12,7 +12,8 @@ from gn3.db.species import translate_to_mouse_gene_id from gn3.computations.partial_correlations import correlations_of_all_tissue_traits -def get_filename(target_db_name: str, conn: Any) -> str: +def get_filename(conn: Any, target_db_name: str, text_files_dir: str) -> Union[ + str, bool]: """ Retrieve the name of the reference database file with which correlations are computed. @@ -456,8 +457,9 @@ def build_query_tissue_corr(db_type, temp_table, sample_id_columns, joins): def fetch_all_database_data(# pylint: disable=[R0913, R0914] conn: Any, species: str, gene_id: int, trait_symbol: str, - samples: Tuple[str, ...], db_type: str, db_name: str, method: str, - return_number: int, probeset_freeze_id: int) -> Tuple[Any, Any]: + samples: Tuple[str, ...], dataset: dict, method: str, + return_number: int, probeset_freeze_id: int) -> Tuple[ + Tuple[float], int]: """ This is a migration of the `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in -- cgit v1.2.3 From ee200d60bd3065c4c9e69bcbb756715211b711d2 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:50:33 +0300 Subject: Return only values Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Return the values from the database, not the tuples. --- gn3/db/correlations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 898de75..05c0809 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -399,12 +399,12 @@ def fetch_sample_ids( "AND Species.name=%(species_name)s") with conn.cursor() as cursor: cursor.execute( - return cursor.fetchall() query, { "samples_names": tuple(sample_names), "species_name": species_name }) + return tuple(row[0] for row in cursor.fetchall()) def build_query_sgo_lit_corr( db_type: str, temp_table: str, sample_id_columns: str, -- cgit v1.2.3 From 0f9247fffc7127bdb3c35492a37706a2db01a26c Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:52:37 +0300 Subject: Update return type Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Return the complete filename when found, or the boolean value False, when it is not found. --- gn3/db/correlations.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 05c0809..ab4dc2c 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -2,7 +2,7 @@ This module will hold functions that are used in the (partial) correlations feature to access the database to retrieve data needed for computations. """ - +import os from functools import reduce from typing import Any, Dict, Tuple, Union @@ -28,11 +28,13 @@ def get_filename(conn: Any, target_db_name: str, text_files_dir: str) -> Union[ (target_db_name,)) result = cursor.fetchone() if result: - return "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( + filename = "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( tid=result[0], fname=result[1].replace(' ', '_').replace('/', '_')) + return ((filename in os.listdir(text_file_dir)) + and f"{text_files_dir}/{filename}") - return "" + return False def build_temporary_literature_table( conn: Any, species: str, gene_id: int, return_number: int) -> str: -- cgit v1.2.3 From 109b233f698a2ce41bb5634f12e966ad02798819 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 26 Nov 2021 11:54:17 +0300 Subject: Fix bugs in data Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Process the db_name and db_type values. * Return data correctly --- gn3/db/correlations.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index ab4dc2c..401fd91 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -467,6 +467,8 @@ def fetch_all_database_data(# pylint: disable=[R0913, R0914] `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in GeneNetwork1. """ + db_type = dataset["dataset_type"] + db_name = dataset["dataset_name"] def __build_query__(sample_ids, temp_table): sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in sample_ids) if db_type == "Publish": @@ -551,4 +553,4 @@ def fetch_all_database_data(# pylint: disable=[R0913, R0914] with conn.cursor() as cursor: cursor.execute(f"DROP TEMPORARY TABLE {temp_table}") - return (tuple(item[0] for item in trait_database), trait_database[0][1]) + return (trait_database[0], trait_database[1]) -- cgit v1.2.3 From 6b147173d514093ec4e461f5843170c968290e5e Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 29 Nov 2021 11:52:26 +0300 Subject: Provide entry-point function for the partial correlations Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Provide the entry-point function to the partial correlation feature. This is the function that ochestrates the fetching of the data, and processing it for output by the API endpoint (to be implemented). --- gn3/computations/partial_correlations.py | 357 +++++++++++++++++++++++++++++-- gn3/db/correlations.py | 11 +- 2 files changed, 350 insertions(+), 18 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index f43c4d4..869bee4 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -6,15 +6,20 @@ GeneNetwork1. """ import math -from functools import reduce +from functools import reduce, partial from typing import Any, Tuple, Union, Sequence -from scipy.stats import pearsonr, spearmanr import pandas import pingouin +from scipy.stats import pearsonr, spearmanr from gn3.settings import TEXTDIR +from gn3.function_helpers import compose from gn3.data_helpers import parse_csv_line +from gn3.db.traits import export_informative +from gn3.db.traits import retrieve_trait_info, retrieve_trait_data +from gn3.db.species import species_name, translate_to_mouse_gene_id +from gn3.db.correlations import get_filename, fetch_all_database_data def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): """ @@ -112,7 +117,7 @@ def find_identical_traits( return acc + ident[1] def __dictify_controls__(acc, control_item): - ckey = "{:.3f}".format(control_item[0]) + ckey = tuple("{:.3f}".format(item) for item in control_item[0]) return {**acc, ckey: acc.get(ckey, tuple()) + (control_item[1],)} return (reduce(## for identical control traits @@ -212,7 +217,7 @@ def partial_correlations_fast(# pylint: disable=[R0913, R0914] function in GeneNetwork1. """ assert method in ("spearman", "pearson") - with open(f"{TEXTDIR}/{database_filename}", "r") as dataset_file: + with open(database_filename, "r") as dataset_file: dataset = tuple(dataset_file.readlines()) good_dataset_samples = good_dataset_samples_indexes( @@ -286,32 +291,37 @@ def compute_partial( """ # replace the R code with `pingouin.partial_corr` def __compute_trait_info__(target): + targ_vals = target[0] + targ_name = target[1] primary = [ - prim for targ, prim in zip(target, primary_vals) + prim for targ, prim in zip(targ_vals, primary_vals) if targ is not None] + datafrm = build_data_frame( primary, - [targ for targ in target if targ is not None], - [cont for i, cont in enumerate(control_vals) - if target[i] is not None]) + tuple(targ for targ in targ_vals if targ is not None), + tuple(cont for i, cont in enumerate(control_vals) + if target[i] is not None)) covariates = "z" if datafrm.shape[1] == 3 else [ col for col in datafrm.columns if col not in ("x", "y")] ppc = pingouin.partial_corr( - data=datafrm, x="x", y="y", covar=covariates, method=method) - pc_coeff = ppc["r"] + data=datafrm, x="x", y="y", covar=covariates, method=( + "pearson" if "pearson" in method.lower() else "spearman")) + pc_coeff = ppc["r"][0] zero_order_corr = pingouin.corr( - datafrm["x"], datafrm["y"], method=method) + datafrm["x"], datafrm["y"], method=( + "pearson" if "pearson" in method.lower() else "spearman")) if math.isnan(pc_coeff): return ( - target[1], len(primary), pc_coeff, 1, zero_order_corr["r"], - zero_order_corr["p-val"]) + targ_name, len(primary), pc_coeff, 1, zero_order_corr["r"][0], + zero_order_corr["p-val"][0]) return ( - target[1], len(primary), pc_coeff, - (ppc["p-val"] if not math.isnan(ppc["p-val"]) else ( + targ_name, len(primary), pc_coeff, + (ppc["p-val"][0] if not math.isnan(ppc["p-val"][0]) else ( 0 if (abs(pc_coeff - 1) < 0.0000001) else 1)), - zero_order_corr["r"], zero_order_corr["p-val"]) + zero_order_corr["r"][0], zero_order_corr["p-val"][0]) return tuple( __compute_trait_info__(target) @@ -360,3 +370,318 @@ def partial_correlations_normal(# pylint: disable=R0913 for idx, item in enumerate(all_correlations))) return len(trait_database), all_correlations + +def partial_corrs( + conn, samples , primary_vals, control_vals, return_number, species, input_trait_geneid, + input_trait_symbol, tissue_probeset_freeze_id, method, dataset, database_filename): + """ + Compute the partial correlations, selecting the fast or normal method + depending on the existence of the database text file. + + This is a partial migration of the + `web.webqtl.correlation.PartialCorrDBPage.__init__` function in + GeneNetwork1. + """ + if database_filename: + return partial_correlations_fast( + samples, primary_vals, control_vals, database_filename, + ( + fetch_literature_correlations( + species, input_trait_geneid, dataset, return_number, conn) + if "literature" in method.lower() else + fetch_tissue_correlations( + dataset, input_trait_symbol, tissue_probeset_freeze_id, + method, return_number, conn)), + method, + ("literature" if method.lower() == "sgo literature correlation" + else ("tissue" if "tissue" in method.lower() else "genetic"))) + + trait_database, data_start_pos = fetch_all_database_data( + conn, species, input_trait_geneid, input_trait_symbol, samples, dataset, + method, return_number, tissue_probeset_freeze_id) + return partial_correlations_normal( + primary_vals, control_vals, input_trait_geneid, trait_database, + data_start_pos, dataset, method) + +def literature_correlation_by_list( + conn: Any, input_trait_mouse_geneid: int, species: str, + trait_list: Tuple[dict]) -> Tuple[dict]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getLiteratureCorrelationByList` + function in GeneNetwork1. + """ + if any((lambda t: ( + bool(t.get("tissue_corr")) and + bool(t.get("tissue_p_value"))))(trait) + for trait in trait_list): + temp_table_name = f"LITERATURE{random_string(8)}" + q1 = ( + f"CREATE TEMPORARY TABLE {temporary_table_name} " + "(GeneId1 INT(12) UNSIGNED, GeneId2 INT(12) UNSIGNED PRIMARY KEY, " + "value DOUBLE)") + q2 = ( + f"INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " + "SELECT GeneId1, GeneId2, value FROM LCorrRamin3 " + "WHERE GeneId1=%(geneid)s") + q3 = ( + "INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " + "SELECT GeneId2, GeneId1, value FROM LCorrRamin3 " + "WHERE GeneId2=%s AND GeneId1 != %(geneid)s") + + def __set_mouse_geneid__(trait): + if trait.get("geneid"): + return { + **trait, + "mouse_geneid": translate_to_mouse_gene_id(trait.get("geneid")) + } + return {**trait, "mouse_geneid": 0} + + def __retrieve_lcorr__(cursor, geneids): + cursor.execute( + f"SELECT GeneId2, value FROM {temporary_table_name} " + "WHERE GeneId2 IN %(geneids)s", + geneids = geneids) + return {geneid: value for geneid, value in cursor.fetchall()} + + with conn.cursor() as cursor: + cursor.execute(q1) + cursor.execute(q2) + cursor.execute(q3) + + traits = tuple(__set_mouse_geneid__(trait) for trait in trait_list) + lcorrs = __retrieve_lcorr__( + cursor, ( + trait["mouse_geneid"] for trait in traits + if (trait["mouse_geneid"] != 0 and + trait["mouse_geneid"].find(";") < 0))) + return tuple( + {**trait, "l_corr": lcorrs.get(trait["mouse_geneid"], None)} + for trait in traits) + + return trait_list + return trait_list + +def tissue_correlation_by_list( + conn: Any, primary_trait_symbol: str, tissue_probeset_freeze_id: int, + method: str, trait_list: Tuple[dict]) -> Tuple[dict]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTissueCorrelationByList` + function in GeneNetwork1. + """ + def __add_tissue_corr__(trait, primary_trait_value, trait_value): + result = pingouin.corr( + primary_trait_values, target_trait_values, + method=("spearman" if "spearman" in method.lower() else "pearson")) + return { + **trait, + "tissue_corr": result["r"], + "tissue_p_value": result["p-val"] + } + + if any((lambda t: bool(t.get("l_corr")))(trait) for trait in trait_list): + prim_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + (primary_trait_symbol,), tissue_probeset_freeze_id, conn) + if primary_trait_symbol.lower() in prim_trait_symbol_value_dict: + primary_trait_value = prim_trait_symbol_value_dict[prim_trait_symbol.lower()] + gene_symbol_list = tuple( + trait for trait in trait_list if "symbol" in trait.keys()) + symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + gene_symbol_list, tissue_probeset_freeze_id, conn) + return tuple( + __add_tissue_corr__( + trait, primary_trait_value, + symbol_value_dict[trait["symbol"].lower()]) + for trait in trait_list + if ("symbol" in trait and + bool(trait["symbol"]) and + trait["symbol"].lower() in symbol_value_dict)) + return tuple({ + **trait, + "tissue_corr": None, + "tissue_p_value": None + } for trait in trait_list) + return trait_list + +def partial_correlations_entry( + conn: Any, primary_trait_name: str, + control_trait_names: Tuple[str, ...], method: str, + criteria: int, group: str, target_db_name: str) -> dict: + """ + This is the 'ochestration' function for the partial-correlation feature. + + This function will dispatch the functions doing data fetches from the + database (and various other places) and feed that data to the functions + doing the conversions and computations. It will then return the results of + all of that work. + + This function is doing way too much. Look into splitting out the + functionality into smaller functions that do fewer things. + """ + threshold = 0 + corr_min_informative = 4 + + primary_trait = retrieve_trait_info(threshold, primary_trait_name, conn) + primary_trait_data = retrieve_trait_data(primary_trait, conn) + primary_samples, primary_values, primary_variances = export_informative( + primary_trait_data) + + cntrl_traits = tuple( + retrieve_trait_info(threshold, trait_full_name, conn) + for trait_full_name in control_trait_names) + cntrl_traits_data = tuple( + retrieve_trait_data(cntrl_trait, conn) + for cntrl_trait in cntrl_traits) + species = species_name(conn, group) + + (cntrl_samples, + cntrl_values, + cntrl_variances, + cntrl_ns) = control_samples(cntrl_traits_data, primary_samples) + + common_primary_control_samples = primary_samples + fixed_primary_vals = primary_values + fixed_control_vals = cntrl_values + if not all(cnt_smp == primary_samples for cnt_smp in cntrl_samples): + (common_primary_control_samples, + fixed_primary_vals, + fixed_control_vals, + primary_variances, + cntrl_variances) = fix_samples(primary_trait, cntrl_traits) + + if len(common_primary_control_samples) < corr_min_informative: + return { + "status": "error", + "message": ( + f"Fewer than {corr_min_informative} samples data entered for " + f"{group} dataset. No calculation of correlation has been " + "attempted."), + "error_type": "Inadequate Samples"} + + identical_traits_names = find_identical_traits( + primary_trait_name, primary_values, control_trait_names, cntrl_values) + if len(identical_traits_names) > 0: + return { + "status": "error", + "message": ( + f"{identical_traits_names[0]} and {identical_traits_names[1]} " + "have the same values for the {len(fixed_primary_vals)} " + "samples that will be used to compute the partial correlation " + "(common for all primary and control traits). In such cases, " + "partial correlation cannot be computed. Please re-select your " + "traits."), + "error_type": "Identical Traits"} + + input_trait_geneid = primary_trait.get("geneid") + input_trait_symbol = primary_trait.get("symbol") + input_trait_mouse_geneid = translate_to_mouse_gene_id( + species, input_trait_geneid, conn) + + tissue_probeset_freeze_id = 1 + db_type = primary_trait["db"]["dataset_type"] + db_name = primary_trait["db"]["dataset_name"] + + if db_type == "ProbeSet" and method.lower() in ( + "sgo literature correlation", + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + return { + "status": "error", + "message": ( + "Wrong correlation type: It is not possible to compute the " + f"{method} between your trait and data in the {target_db_name} " + "database. Please try again after selecting another type of " + "correlation."), + "error_type": "Correlation Type"} + + if (method.lower() == "sgo literature correlation" and ( + input_trait_geneid is None or + check_for_literature_info(conn, input_trait_mouse_geneid))): + return { + "status": "error", + "message": ( + "No Literature Information: This gene does not have any " + "associated Literature Information."), + "error_type": "Literature Correlation"} + + if (method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and input_trait_symbol is None): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + if (method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and check_symbol_for_tissue_correlation( + conn, tissue_probeset_freeze_id, input_trait_symbol)): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + database_filename = get_filename(conn, target_db_name, TEXTDIR) + total_traits, all_correlations = partial_corrs( + conn, common_primary_control_samples, fixed_primary_vals, + fixed_control_vals, len(fixed_primary_vals), species, + input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, + method, primary_trait["db"], database_filename) + + + def __make_sorter__(method): + def __sort_6__(x): + return x[6] + + def __sort_3__(x): + return x[3] + + if "literature" in method.lower(): + return __sort_6__ + + if "tissue" in method.lower(): + return __sort_6__ + + return __sort_3__ + + sorted_correlations = sorted( + all_correlations, key=__make_sorter__(method)) + + add_lit_corr_and_tiss_corr = compose( + partial( + literature_correlation_by_list, conn, input_trait_mouse_geneid, + species), + partial( + tissue_correlation_by_list, conn, input_trait_symbol, + tissue_probeset_freeze_id, method)) + + trait_list = add_lit_corr_and_tiss_corr(tuple( + { + **retrieve_trait_info( + threshold, + f"{primary_trait['db']['dataset_name']}::{item[0]}", + conn), + "noverlap": item[1], + "partial_corr": item[2], + "partial_corr_p_value": item[3], + "corr": item[4], + "corr_p_value": item[5], + "rank_order": (1 if "spearman" in method.lower() else 0), + **({ + "tissue_corr": item[6], + "tissue_p_value": item[7]} + if len(item) == 8 else {}), + **({"l_corr": item[6]} + if len(item) == 7 else {}) + } + for item in + sorted_correlations[:min(criteria, len(all_correlations))])) + + return trait_list diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 401fd91..2a38bae 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -10,8 +10,6 @@ from gn3.random import random_string from gn3.data_helpers import partition_all from gn3.db.species import translate_to_mouse_gene_id -from gn3.computations.partial_correlations import correlations_of_all_tissue_traits - def get_filename(conn: Any, target_db_name: str, text_files_dir: str) -> Union[ str, bool]: """ @@ -282,6 +280,15 @@ def build_temporary_tissue_correlations_table( # We should probably pass the `correlations_of_all_tissue_traits` function # as an argument to this function and get rid of the one call immediately # following this comment. + from gn3.computations.partial_correlations import correlations_of_all_tissue_traits + # This import above is necessary within the function to avoid + # circular-imports. + # + # + # This import above is indicative of convoluted code, with the computation + # being interwoven with the data retrieval. This needs to be changed, such + # that the function being imported here is no longer necessary, or have the + # imported function passed to this function as an argument. symbol_corr_dict, symbol_p_value_dict = correlations_of_all_tissue_traits( fetch_gene_symbol_tissue_value_dict_for_trait( (trait_symbol,), probeset_freeze_id, conn), -- cgit v1.2.3 From 99953f6e4a540da41d0517203eb63da4e19405cd Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 29 Nov 2021 14:01:44 +0300 Subject: Fix linting errors Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi --- gn3/computations/partial_correlations.py | 131 +++++++++++++++++-------------- gn3/db/correlations.py | 5 +- gn3/db/traits.py | 47 ++++++----- 3 files changed, 100 insertions(+), 83 deletions(-) (limited to 'gn3/db/correlations.py') diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 869bee4..231b0a7 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -14,12 +14,20 @@ import pingouin from scipy.stats import pearsonr, spearmanr from gn3.settings import TEXTDIR +from gn3.random import random_string from gn3.function_helpers import compose from gn3.data_helpers import parse_csv_line from gn3.db.traits import export_informative from gn3.db.traits import retrieve_trait_info, retrieve_trait_data from gn3.db.species import species_name, translate_to_mouse_gene_id -from gn3.db.correlations import get_filename, fetch_all_database_data +from gn3.db.correlations import ( + get_filename, + fetch_all_database_data, + check_for_literature_info, + fetch_tissue_correlations, + fetch_literature_correlations, + check_symbol_for_tissue_correlation, + fetch_gene_symbol_tissue_value_dict_for_trait) def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): """ @@ -311,7 +319,7 @@ def compute_partial( zero_order_corr = pingouin.corr( datafrm["x"], datafrm["y"], method=( - "pearson" if "pearson" in method.lower() else "spearman")) + "pearson" if "pearson" in method.lower() else "spearman")) if math.isnan(pc_coeff): return ( @@ -371,9 +379,10 @@ def partial_correlations_normal(# pylint: disable=R0913 return len(trait_database), all_correlations -def partial_corrs( - conn, samples , primary_vals, control_vals, return_number, species, input_trait_geneid, - input_trait_symbol, tissue_probeset_freeze_id, method, dataset, database_filename): +def partial_corrs(# pylint: disable=[R0913] + conn, samples, primary_vals, control_vals, return_number, species, + input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, + method, dataset, database_filename): """ Compute the partial correlations, selecting the fast or normal method depending on the existence of the database text file. @@ -404,8 +413,7 @@ def partial_corrs( data_start_pos, dataset, method) def literature_correlation_by_list( - conn: Any, input_trait_mouse_geneid: int, species: str, - trait_list: Tuple[dict]) -> Tuple[dict]: + conn: Any, species: str, trait_list: Tuple[dict]) -> Tuple[dict]: """ This is a migration of the `web.webqtl.correlation.CorrelationPage.getLiteratureCorrelationByList` @@ -415,16 +423,16 @@ def literature_correlation_by_list( bool(t.get("tissue_corr")) and bool(t.get("tissue_p_value"))))(trait) for trait in trait_list): - temp_table_name = f"LITERATURE{random_string(8)}" - q1 = ( + temporary_table_name = f"LITERATURE{random_string(8)}" + query1 = ( f"CREATE TEMPORARY TABLE {temporary_table_name} " "(GeneId1 INT(12) UNSIGNED, GeneId2 INT(12) UNSIGNED PRIMARY KEY, " "value DOUBLE)") - q2 = ( + query2 = ( f"INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " "SELECT GeneId1, GeneId2, value FROM LCorrRamin3 " "WHERE GeneId1=%(geneid)s") - q3 = ( + query3 = ( "INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " "SELECT GeneId2, GeneId1, value FROM LCorrRamin3 " "WHERE GeneId2=%s AND GeneId1 != %(geneid)s") @@ -433,7 +441,8 @@ def literature_correlation_by_list( if trait.get("geneid"): return { **trait, - "mouse_geneid": translate_to_mouse_gene_id(trait.get("geneid")) + "mouse_geneid": translate_to_mouse_gene_id( + species, trait.get("geneid"), conn) } return {**trait, "mouse_geneid": 0} @@ -441,13 +450,13 @@ def literature_correlation_by_list( cursor.execute( f"SELECT GeneId2, value FROM {temporary_table_name} " "WHERE GeneId2 IN %(geneids)s", - geneids = geneids) - return {geneid: value for geneid, value in cursor.fetchall()} + geneids=geneids) + return dict(cursor.fetchall()) with conn.cursor() as cursor: - cursor.execute(q1) - cursor.execute(q2) - cursor.execute(q3) + cursor.execute(query1) + cursor.execute(query2) + cursor.execute(query3) traits = tuple(__set_mouse_geneid__(trait) for trait in trait_list) lcorrs = __retrieve_lcorr__( @@ -470,9 +479,9 @@ def tissue_correlation_by_list( `web.webqtl.correlation.CorrelationPage.getTissueCorrelationByList` function in GeneNetwork1. """ - def __add_tissue_corr__(trait, primary_trait_value, trait_value): + def __add_tissue_corr__(trait, primary_trait_values, trait_values): result = pingouin.corr( - primary_trait_values, target_trait_values, + primary_trait_values, trait_values, method=("spearman" if "spearman" in method.lower() else "pearson")) return { **trait, @@ -484,7 +493,8 @@ def tissue_correlation_by_list( prim_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( (primary_trait_symbol,), tissue_probeset_freeze_id, conn) if primary_trait_symbol.lower() in prim_trait_symbol_value_dict: - primary_trait_value = prim_trait_symbol_value_dict[prim_trait_symbol.lower()] + primary_trait_value = prim_trait_symbol_value_dict[ + primary_trait_symbol.lower()] gene_symbol_list = tuple( trait for trait in trait_list if "symbol" in trait.keys()) symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( @@ -504,7 +514,7 @@ def tissue_correlation_by_list( } for trait in trait_list) return trait_list -def partial_correlations_entry( +def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] conn: Any, primary_trait_name: str, control_trait_names: Tuple[str, ...], method: str, criteria: int, group: str, target_db_name: str) -> dict: @@ -524,7 +534,7 @@ def partial_correlations_entry( primary_trait = retrieve_trait_info(threshold, primary_trait_name, conn) primary_trait_data = retrieve_trait_data(primary_trait, conn) - primary_samples, primary_values, primary_variances = export_informative( + primary_samples, primary_values, _primary_variances = export_informative( primary_trait_data) cntrl_traits = tuple( @@ -537,8 +547,8 @@ def partial_correlations_entry( (cntrl_samples, cntrl_values, - cntrl_variances, - cntrl_ns) = control_samples(cntrl_traits_data, primary_samples) + _cntrl_variances, + _cntrl_ns) = control_samples(cntrl_traits_data, primary_samples) common_primary_control_samples = primary_samples fixed_primary_vals = primary_values @@ -547,8 +557,8 @@ def partial_correlations_entry( (common_primary_control_samples, fixed_primary_vals, fixed_control_vals, - primary_variances, - cntrl_variances) = fix_samples(primary_trait, cntrl_traits) + _primary_variances, + _cntrl_variances) = fix_samples(primary_trait, cntrl_traits) if len(common_primary_control_samples) < corr_min_informative: return { @@ -580,7 +590,6 @@ def partial_correlations_entry( tissue_probeset_freeze_id = 1 db_type = primary_trait["db"]["dataset_type"] - db_name = primary_trait["db"]["dataset_name"] if db_type == "ProbeSet" and method.lower() in ( "sgo literature correlation", @@ -605,10 +614,11 @@ def partial_correlations_entry( "associated Literature Information."), "error_type": "Literature Correlation"} - if (method.lower() in ( - "tissue correlation, pearson's r", - "tissue correlation, spearman's rho") - and input_trait_symbol is None): + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and input_trait_symbol is None): return { "status": "error", "message": ( @@ -616,11 +626,12 @@ def partial_correlations_entry( "any associated Tissue Correlation Information."), "error_type": "Tissue Correlation"} - if (method.lower() in ( - "tissue correlation, pearson's r", - "tissue correlation, spearman's rho") - and check_symbol_for_tissue_correlation( - conn, tissue_probeset_freeze_id, input_trait_symbol)): + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and check_symbol_for_tissue_correlation( + conn, tissue_probeset_freeze_id, input_trait_symbol)): return { "status": "error", "message": ( @@ -629,7 +640,7 @@ def partial_correlations_entry( "error_type": "Tissue Correlation"} database_filename = get_filename(conn, target_db_name, TEXTDIR) - total_traits, all_correlations = partial_corrs( + _total_traits, all_correlations = partial_corrs( conn, common_primary_control_samples, fixed_primary_vals, fixed_control_vals, len(fixed_primary_vals), species, input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, @@ -637,11 +648,11 @@ def partial_correlations_entry( def __make_sorter__(method): - def __sort_6__(x): - return x[6] + def __sort_6__(row): + return row[6] - def __sort_3__(x): - return x[3] + def __sort_3__(row): + return row[3] if "literature" in method.lower(): return __sort_6__ @@ -655,33 +666,31 @@ def partial_correlations_entry( all_correlations, key=__make_sorter__(method)) add_lit_corr_and_tiss_corr = compose( - partial( - literature_correlation_by_list, conn, input_trait_mouse_geneid, - species), + partial(literature_correlation_by_list, conn, species), partial( tissue_correlation_by_list, conn, input_trait_symbol, tissue_probeset_freeze_id, method)) trait_list = add_lit_corr_and_tiss_corr(tuple( - { - **retrieve_trait_info( - threshold, - f"{primary_trait['db']['dataset_name']}::{item[0]}", - conn), - "noverlap": item[1], - "partial_corr": item[2], - "partial_corr_p_value": item[3], - "corr": item[4], - "corr_p_value": item[5], - "rank_order": (1 if "spearman" in method.lower() else 0), - **({ - "tissue_corr": item[6], - "tissue_p_value": item[7]} + { + **retrieve_trait_info( + threshold, + f"{primary_trait['db']['dataset_name']}::{item[0]}", + conn), + "noverlap": item[1], + "partial_corr": item[2], + "partial_corr_p_value": item[3], + "corr": item[4], + "corr_p_value": item[5], + "rank_order": (1 if "spearman" in method.lower() else 0), + **({ + "tissue_corr": item[6], + "tissue_p_value": item[7]} if len(item) == 8 else {}), - **({"l_corr": item[6]} + **({"l_corr": item[6]} if len(item) == 7 else {}) - } + } for item in - sorted_correlations[:min(criteria, len(all_correlations))])) + sorted_correlations[:min(criteria, len(all_correlations))])) return trait_list diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py index 2a38bae..3d12019 100644 --- a/gn3/db/correlations.py +++ b/gn3/db/correlations.py @@ -29,7 +29,7 @@ def get_filename(conn: Any, target_db_name: str, text_files_dir: str) -> Union[ filename = "ProbeSetFreezeId_{tid}_FullName_{fname}.txt".format( tid=result[0], fname=result[1].replace(' ', '_').replace('/', '_')) - return ((filename in os.listdir(text_file_dir)) + return ((filename in os.listdir(text_files_dir)) and f"{text_files_dir}/{filename}") return False @@ -280,7 +280,8 @@ def build_temporary_tissue_correlations_table( # We should probably pass the `correlations_of_all_tissue_traits` function # as an argument to this function and get rid of the one call immediately # following this comment. - from gn3.computations.partial_correlations import correlations_of_all_tissue_traits + from gn3.computations.partial_correlations import (#pylint: disable=[C0415, R0401] + correlations_of_all_tissue_traits) # This import above is necessary within the function to avoid # circular-imports. # diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 75de4f4..d4a96f0 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,9 +1,10 @@ """This class contains functions relating to trait data manipulation""" import os -import MySQLdb from functools import reduce from typing import Any, Dict, Union, Sequence +import MySQLdb + from gn3.settings import TMPDIR from gn3.random import random_string from gn3.function_helpers import compose @@ -81,10 +82,10 @@ def export_trait_data( def get_trait_csv_sample_data(conn: Any, trait_name: int, phenotype_id: int): """Fetch a trait and return it as a csv string""" - def __float_strip(n): - if str(n)[-2:] == ".0": - return str(int(n)) - return str(n) + def __float_strip(num_str): + if str(num_str)[-2:] == ".0": + return str(int(num_str)) + return str(num_str) sql = ("SELECT DISTINCT Strain.Name, PublishData.value, " "PublishSE.error, NStrain.count FROM " "(PublishData, Strain, PublishXRef, PublishFreeze) " @@ -108,7 +109,7 @@ def get_trait_csv_sample_data(conn: Any, return "\n".join(csv_data) -def update_sample_data(conn: Any, +def update_sample_data(conn: Any, #pylint: disable=[R0913] trait_name: str, strain_name: str, phenotype_id: int, @@ -219,7 +220,7 @@ def delete_sample_data(conn: Any, "WHERE StrainId = %s AND DataId = %s" % (strain_id, data_id))) deleted_n_strains = cursor.rowcount - except Exception as e: + except Exception as e: #pylint: disable=[C0103, W0612] conn.rollback() raise MySQLdb.Error conn.commit() @@ -230,7 +231,7 @@ def delete_sample_data(conn: Any, deleted_se_data, deleted_n_strains) -def insert_sample_data(conn: Any, +def insert_sample_data(conn: Any, #pylint: disable=[R0913] trait_name: str, strain_name: str, phenotype_id: int, @@ -272,7 +273,7 @@ def insert_sample_data(conn: Any, "VALUES (%s, %s, %s)") % (strain_id, data_id, count)) inserted_n_strains = cursor.rowcount - except Exception as e: + except Exception as e: #pylint: disable=[C0103, W0612] conn.rollback() raise MySQLdb.Error return (inserted_published_data, @@ -450,7 +451,7 @@ def set_homologene_id_field(trait_type, trait_info, conn): Common postprocessing function for all trait types. Sets the value for the 'homologene' key.""" - def set_to_null(ti): return {**ti, "homologeneid": None} + def set_to_null(ti): return {**ti, "homologeneid": None} # pylint: disable=[C0103, C0321] functions_table = { "Temp": set_to_null, "Geno": set_to_null, @@ -656,8 +657,9 @@ def retrieve_temp_trait_data(trait_info: dict, conn: Any): query, {"trait_name": trait_info["trait_name"]}) return [dict(zip( - ["sample_name", "value", "se_error", "nstrain", "id"], row)) - for row in cursor.fetchall()] + ["sample_name", "value", "se_error", "nstrain", "id"], + row)) + for row in cursor.fetchall()] return [] @@ -696,8 +698,10 @@ def retrieve_geno_trait_data(trait_info: Dict, conn: Any): "dataset_name": trait_info["db"]["dataset_name"], "species_id": retrieve_species_id( trait_info["db"]["group"], conn)}) - return [dict(zip( - ["sample_name", "value", "se_error", "id"], row)) + return [ + dict(zip( + ["sample_name", "value", "se_error", "id"], + row)) for row in cursor.fetchall()] return [] @@ -728,8 +732,9 @@ def retrieve_publish_trait_data(trait_info: Dict, conn: Any): query, {"trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) - return [dict(zip( - ["sample_name", "value", "se_error", "nstrain", "id"], row)) + return [ + dict(zip( + ["sample_name", "value", "se_error", "nstrain", "id"], row)) for row in cursor.fetchall()] return [] @@ -762,8 +767,9 @@ def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): {"cellid": trait_info["cellid"], "trait_name": trait_info["trait_name"], "dataset_id": trait_info["db"]["dataset_id"]}) - return [dict(zip( - ["sample_name", "value", "se_error", "id"], row)) + return [ + dict(zip( + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] @@ -792,8 +798,9 @@ def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): query, {"trait_name": trait_info["trait_name"], "dataset_name": trait_info["db"]["dataset_name"]}) - return [dict(zip( - ["sample_name", "value", "se_error", "id"], row)) + return [ + dict(zip( + ["sample_name", "value", "se_error", "id"], row)) for row in cursor.fetchall()] return [] -- cgit v1.2.3