From 1812e3eb2d230bf8d6ac043d5ed85ad1d8027f5f Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 28 Jul 2021 09:42:13 +0300 Subject: Retrieve 'ProbeSet' trait name Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/traits.py: new function (retrieve_probeset_trait_name) * tests/unit/db/test_traits.py: test(s) for new function Add a function to retrieve the name of a 'ProbeSet' trait in a manner similar to genenetwork1's retrieval of the same, as implemented here https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-154 Unlike in genenetwork1, we do not mutate an object, instead, we return the values as retrieved from the database, and the caller will deal with the returned values as appropriate. --- gn3/db/traits.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 4860a07..37b111e 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -90,3 +90,21 @@ def insert_publication(pubmed_id: int, publication: Optional[Dict], ", ".join(['%s'] * len(publication)))) with conn.cursor() as cursor: cursor.execute(insert_query, tuple(publication.values())) + +def retrieve_probeset_trait_name(threshold, name, connection): + """ + Retrieve the name for a Probeset trait + + This is extracted from the `webqtlDataset.retrieveName` function, + specifically the section dealing with 'ProbeSet' type traits + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-154""" + query = ( + 'SELECT Id, Name, FullName, ShortName, DataScale ' + 'FROM ProbeSetFreeze ' + 'WHERE ' + 'public > %(threshold)s ' + 'AND ' + '(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)') + with connection.cursor() as cursor: + cursor.execute(query, {"threshold": threshold, "name": name}) + return cursor.fetchone() -- cgit v1.2.3 From 9b66f428f341bc047030126ba1e4cc405a34570c Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 28 Jul 2021 10:20:18 +0300 Subject: Make name retrieval more general Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/traits.py: make function more general * tests/unit/db/test_traits.py: parametrize the tests Make the name retrieval more general for the different types of traits by changing the column specification and table as appropriate. --- gn3/db/traits.py | 26 +++++++++++++++----------- tests/unit/db/test_traits.py | 40 ++++++++++++++++++++++++++-------------- 2 files changed, 41 insertions(+), 25 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 37b111e..fddb8be 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -91,20 +91,24 @@ def insert_publication(pubmed_id: int, publication: Optional[Dict], with conn.cursor() as cursor: cursor.execute(insert_query, tuple(publication.values())) -def retrieve_probeset_trait_name(threshold, name, connection): +def retrieve_type_trait_name(trait_type, threshold, name, connection): """ - Retrieve the name for a Probeset trait + Retrieve the name of a trait given the trait's name - This is extracted from the `webqtlDataset.retrieveName` function, - specifically the section dealing with 'ProbeSet' type traits - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-154""" + This is extracted from the `webqtlDataset.retrieveName` function as is + implemented at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-L169 + """ + columns = "Id, Name, FullName, ShortName{}".format( + ", DataScale" if trait_type == "ProbeSet" else "") query = ( - 'SELECT Id, Name, FullName, ShortName, DataScale ' - 'FROM ProbeSetFreeze ' - 'WHERE ' - 'public > %(threshold)s ' - 'AND ' - '(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)') + "SELECT {columns} " + "FROM {trait_type}Freeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)").format( + columns=columns, trait_type=trait_type) with connection.cursor() as cursor: cursor.execute(query, {"threshold": threshold, "name": name}) return cursor.fetchone() diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 6d2ba4d..95c5b27 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -1,22 +1,34 @@ """Tests for gn3/db/traits.py""" from unittest import mock, TestCase -from gn3.db.traits import retrieve_probeset_trait_name +from gn3.db.traits import retrieve_type_trait_name class TestTraitsDBFunctions(TestCase): "Test cases for traits functions" def test_retrieve_probeset_trait_name(self): """Test that the function is called correctly.""" - db_mock = mock.MagicMock() - with db_mock.cursor() as cursor: - cursor.fetchone.return_value = ( - "testName", "testNameFull", "testNameShort", "dataScale") - self.assertEqual( - retrieve_probeset_trait_name(9, "testName", db_mock), - ("testName", "testNameFull", "testNameShort", "dataScale")) - cursor.execute.assert_called_once_with( - "SELECT Id, Name, FullName, ShortName, DataScale " - "FROM ProbeSetFreeze " - "WHERE public > %(threshold)s AND " - "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)", - {"threshold": 9, "name": "testName"}) + for trait_type, thresh, trait_name, columns in [ + ["ProbeSet", 9, "testName", + "Id, Name, FullName, ShortName, DataScale"], + ["Geno", 3, "genoTraitName", "Id, Name, FullName, ShortName"], + ["Publish", 6, "publishTraitName", + "Id, Name, FullName, ShortName"], + ["Temp", 4, "tempTraitName", "Id, Name, FullName, ShortName"]]: + db_mock = mock.MagicMock() + with self.subTest(trait_type=trait_type): + with db_mock.cursor() as cursor: + cursor.fetchone.return_value = ( + "testName", "testNameFull", "testNameShort", + "dataScale") + self.assertEqual( + retrieve_type_trait_name( + trait_type, thresh, trait_name, db_mock), + ("testName", "testNameFull", "testNameShort", + "dataScale")) + cursor.execute.assert_called_once_with( + "SELECT {cols} " + "FROM {ttype}Freeze " + "WHERE public > %(threshold)s AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)".format( + cols=columns, ttype=trait_type), + {"threshold": thresh, "name": trait_name}) -- cgit v1.2.3 From cdf5887506a0b035f5a51f9538815ff77cb71cdc Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 28 Jul 2021 12:32:43 +0300 Subject: Retrieve trait information Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/traits.py: add functions to retrieve traits information * tests/unit/db/test_traits.py: add tests for new function Add functions to retrieve traits information as is done in genenetwork1 https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 At this point, the data retrieval functions are probably incomplete, as there is more of the `retrieveInfo` function in GN1 that has not been considered as of this commit. --- gn3/db/traits.py | 133 ++++++++++++++++++++++++++++++++++++++++++- tests/unit/db/test_traits.py | 92 ++++++++++++++++++++++++++++-- 2 files changed, 218 insertions(+), 7 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index fddb8be..3c62df8 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -91,7 +91,7 @@ def insert_publication(pubmed_id: int, publication: Optional[Dict], with conn.cursor() as cursor: cursor.execute(insert_query, tuple(publication.values())) -def retrieve_type_trait_name(trait_type, threshold, name, connection): +def retrieve_trait_dataset_name(trait_type, threshold, name, connection): """ Retrieve the name of a trait given the trait's name @@ -112,3 +112,134 @@ def retrieve_type_trait_name(trait_type, threshold, name, connection): with connection.cursor() as cursor: cursor.execute(query, {"threshold": threshold, "name": name}) return cursor.fetchone() + +PUBLISH_TRAIT_INFO_QUERY = ( + "SELECT " + "PublishXRef.Id, Publication.PubMed_ID, " + "Phenotype.Pre_publication_description, " + "Phenotype.Post_publication_description, " + "Phenotype.Original_description, " + "Phenotype.Pre_publication_abbreviation, " + "Phenotype.Post_publication_abbreviation, " + "Phenotype.Lab_code, Phenotype.Submitter, Phenotype.Owner, " + "Phenotype.Authorized_Users, CAST(Publication.Authors AS BINARY), " + "Publication.Title, Publication.Abstract, Publication.Journal, " + "Publication.Volume, Publication.Pages, Publication.Month, " + "Publication.Year, PublishXRef.Sequence, Phenotype.Units, " + "PublishXRef.comments " + "FROM " + "PublishXRef, Publication, Phenotype, PublishFreeze " + "WHERE " + "PublishXRef.Id = %(trait_name)s AND " + "Phenotype.Id = PublishXRef.PhenotypeId AND " + "Publication.Id = PublishXRef.PublicationId AND " + "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND " + "PublishFreeze.Id =%(trait_dataset_id)s") + +def retrieve_publish_trait_info(trait_data_source, conn): + """Retrieve trait information for type `Publish` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L399-L421""" + with conn.cursor() as cursor: + cursor.execute( + PUBLISH_TRAIT_INFO_QUERY, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_id"] + }) + return cursor.fetchone() + +PROBESET_TRAIT_INFO_QUERY = ( + "SELECT " + "ProbeSet.name, ProbeSet.symbol, ProbeSet.description, " + "ProbeSet.probe_target_description, ProbeSet.chr, ProbeSet.mb, " + "ProbeSet.alias, ProbeSet.geneid, ProbeSet.genbankid, ProbeSet.unigeneid, " + "ProbeSet.omim, ProbeSet.refseq_transcriptid, ProbeSet.blatseq, " + "ProbeSet.targetseq, ProbeSet.chipid, ProbeSet.comments, " + "ProbeSet.strand_probe, ProbeSet.strand_gene, " + "ProbeSet.probe_set_target_region, ProbeSet.proteinid, " + "ProbeSet.probe_set_specificity, ProbeSet.probe_set_blat_score, " + "ProbeSet.probe_set_blat_mb_start, ProbeSet.probe_set_blat_mb_end, " + "ProbeSet.probe_set_strand, ProbeSet.probe_set_note_by_rw, " + "ProbeSet.flag " + "FROM " + "ProbeSet, ProbeSetFreeze, ProbeSetXRef " + "WHERE " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND " + "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " + "ProbeSetFreeze.Name = %(trait_dataset_name)s AND " + "ProbeSet.Name = %(trait_name)s") + +def retrieve_probeset_trait_info(trait_data_source, conn): + """Retrieve trait information for type `ProbeSet` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L424-L435""" + with conn.cursor() as cursor: + cursor.execute( + PROBESET_TRAIT_INFO_QUERY, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_name"] + }) + return cursor.fetchone() + +GENO_TRAIT_INFO_QUERY = ( + "SELECT " + "Geno.name, Geno.chr, Geno.mb, Geno.source2, Geno.sequence " + "FROM " + "Geno, GenoFreeze, GenoXRef " + "WHERE " + "GenoXRef.GenoFreezeId = GenoFreeze.Id AND GenoXRef.GenoId = Geno.Id AND " + "GenoFreeze.Name = %(trait_dataset_name)s AND Geno.Name = %(trait_name)s") + +def retrieve_geno_trait_info(trait_data_source, conn): + """Retrieve trait information for type `Geno` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L438-L449""" + with conn.cursor() as cursor: + cursor.execute( + GENO_TRAIT_INFO_QUERY, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name", "trait_dataset_name"] + }) + return cursor.fetchone() + +TEMP_TRAIT_INFO_QUERY = ( + "SELECT name, description FROM Temp " + "WHERE Name = %(trait_name)s") + +def retrieve_temp_trait_info(trait_data_source, conn): + """Retrieve trait information for type `Temp` traits. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L450-452""" + with conn.cursor() as cursor: + cursor.execute( + TEMP_TRAIT_INFO_QUERY, + { + k:v for k, v in trait_data_source.items() + if k in ["trait_name"] + }) + return cursor.fetchone() + +def retrieve_trait_info( + trait_type, trait_name, trait_dataset_id, trait_dataset_name, conn): + """Retrieves the trait information. + + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 + + This function, or the dependent functions, might be incomplete as they are + currently.""" + trait_info_function_table = { + "Publish": retrieve_publish_trait_info, + "ProbeSet": retrieve_probeset_trait_info, + "Geno": retrieve_geno_trait_info, + "Temp": retrieve_temp_trait_info + } + return trait_info_function_table[trait_type]( + { + "trait_name": trait_name, + "trait_dataset_id": trait_dataset_id, + "trait_dataset_name":trait_dataset_name + }, + conn) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 95c5b27..e3c5c28 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -1,13 +1,24 @@ """Tests for gn3/db/traits.py""" from unittest import mock, TestCase -from gn3.db.traits import retrieve_type_trait_name +from gn3.db.traits import ( + GENO_TRAIT_INFO_QUERY, + TEMP_TRAIT_INFO_QUERY, + PUBLISH_TRAIT_INFO_QUERY, + PROBESET_TRAIT_INFO_QUERY) +from gn3.db.traits import ( + retrieve_trait_info, + retrieve_geno_trait_info, + retrieve_temp_trait_info, + retrieve_trait_dataset_name, + retrieve_publish_trait_info, + retrieve_probeset_trait_info) class TestTraitsDBFunctions(TestCase): "Test cases for traits functions" - def test_retrieve_probeset_trait_name(self): + def test_retrieve_trait_dataset_name(self): """Test that the function is called correctly.""" - for trait_type, thresh, trait_name, columns in [ + for trait_type, thresh, trait_dataset_name, columns in [ ["ProbeSet", 9, "testName", "Id, Name, FullName, ShortName, DataScale"], ["Geno", 3, "genoTraitName", "Id, Name, FullName, ShortName"], @@ -21,8 +32,8 @@ class TestTraitsDBFunctions(TestCase): "testName", "testNameFull", "testNameShort", "dataScale") self.assertEqual( - retrieve_type_trait_name( - trait_type, thresh, trait_name, db_mock), + retrieve_trait_dataset_name( + trait_type, thresh, trait_dataset_name, db_mock), ("testName", "testNameFull", "testNameShort", "dataScale")) cursor.execute.assert_called_once_with( @@ -31,4 +42,73 @@ class TestTraitsDBFunctions(TestCase): "WHERE public > %(threshold)s AND " "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)".format( cols=columns, ttype=trait_type), - {"threshold": thresh, "name": trait_name}) + {"threshold": thresh, "name": trait_dataset_name}) + + def test_retrieve_publish_trait_info(self): + """Test retrieval of type `Publish` traits.""" + db_mock = mock.MagicMock() + with db_mock.cursor() as cursor: + cursor.fetchone.return_value = tuple() + trait_source = { + "trait_name": "PublishTraitName", "trait_dataset_id": 1} + self.assertEqual( + retrieve_publish_trait_info( + trait_source, + db_mock), + tuple()) + cursor.execute.assert_called_once_with( + PUBLISH_TRAIT_INFO_QUERY, trait_source) + + def test_retrieve_probeset_trait_info(self): + """Test retrieval of type `Probeset` traits.""" + db_mock = mock.MagicMock() + with db_mock.cursor() as cursor: + cursor.fetchone.return_value = tuple() + trait_source = { + "trait_name": "ProbeSetTraitName", + "trait_dataset_name": "ProbeSetDatasetTraitName"} + self.assertEqual( + retrieve_probeset_trait_info(trait_source, db_mock), tuple()) + cursor.execute.assert_called_once_with( + PROBESET_TRAIT_INFO_QUERY, trait_source) + + def test_retrieve_geno_trait_info(self): + """Test retrieval of type `Geno` traits.""" + db_mock = mock.MagicMock() + with db_mock.cursor() as cursor: + cursor.fetchone.return_value = tuple() + trait_source = { + "trait_name": "GenoTraitName", + "trait_dataset_name": "GenoDatasetTraitName"} + self.assertEqual( + retrieve_geno_trait_info(trait_source, db_mock), tuple()) + cursor.execute.assert_called_once_with( + GENO_TRAIT_INFO_QUERY, trait_source) + + def test_retrieve_temp_trait_info(self): + """Test retrieval of type `Temp` traits.""" + db_mock = mock.MagicMock() + with db_mock.cursor() as cursor: + cursor.fetchone.return_value = tuple() + trait_source = {"trait_name": "TempTraitName"} + self.assertEqual( + retrieve_temp_trait_info(trait_source, db_mock), tuple()) + cursor.execute.assert_called_once_with( + TEMP_TRAIT_INFO_QUERY, trait_source) + + def test_retrieve_trait_info(self): + """Test that information on traits is retrieved as appropriate.""" + for trait_type, trait_name, trait_dataset_id, trait_dataset_name, in [ + ["Publish", "PublishTraitName", 1, "PublishDatasetTraitName"], + ["ProbeSet", "ProbeSetTraitName", 2, "ProbeSetDatasetTraitName"], + ["Geno", "GenoTraitName", 3, "GenoDatasetTraitName"], + ["Temp", "TempTraitName", 4, "TempDatasetTraitName"]]: + db_mock = mock.MagicMock() + with self.subTest(trait_type=trait_type): + with db_mock.cursor() as cursor: + cursor.fetchone.return_value = tuple() + self.assertEqual( + retrieve_trait_info( + trait_type, trait_name, trait_dataset_id, + trait_dataset_name, db_mock), + tuple()) -- cgit v1.2.3 From bbc2d3d57a66f1c6815a3fdd243c6461115510a5 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 29 Jul 2021 12:28:21 +0300 Subject: Add type annotations to the function Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add some type annotations to the functions to reduce the chances of bugs creeping into the code. --- gn3/db/traits.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 3c62df8..f18e16a 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -91,7 +91,8 @@ def insert_publication(pubmed_id: int, publication: Optional[Dict], with conn.cursor() as cursor: cursor.execute(insert_query, tuple(publication.values())) -def retrieve_trait_dataset_name(trait_type, threshold, name, connection): +def retrieve_trait_dataset_name( + trait_type: str, threshold: int, name: str, connection: Any): """ Retrieve the name of a trait given the trait's name @@ -136,7 +137,7 @@ PUBLISH_TRAIT_INFO_QUERY = ( "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND " "PublishFreeze.Id =%(trait_dataset_id)s") -def retrieve_publish_trait_info(trait_data_source, conn): +def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Publish` traits. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L399-L421""" @@ -170,7 +171,7 @@ PROBESET_TRAIT_INFO_QUERY = ( "ProbeSetFreeze.Name = %(trait_dataset_name)s AND " "ProbeSet.Name = %(trait_name)s") -def retrieve_probeset_trait_info(trait_data_source, conn): +def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `ProbeSet` traits. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L424-L435""" @@ -192,7 +193,7 @@ GENO_TRAIT_INFO_QUERY = ( "GenoXRef.GenoFreezeId = GenoFreeze.Id AND GenoXRef.GenoId = Geno.Id AND " "GenoFreeze.Name = %(trait_dataset_name)s AND Geno.Name = %(trait_name)s") -def retrieve_geno_trait_info(trait_data_source, conn): +def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Geno` traits. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L438-L449""" @@ -209,7 +210,7 @@ TEMP_TRAIT_INFO_QUERY = ( "SELECT name, description FROM Temp " "WHERE Name = %(trait_name)s") -def retrieve_temp_trait_info(trait_data_source, conn): +def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Temp` traits. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L450-452""" @@ -223,7 +224,8 @@ def retrieve_temp_trait_info(trait_data_source, conn): return cursor.fetchone() def retrieve_trait_info( - trait_type, trait_name, trait_dataset_id, trait_dataset_name, conn): + trait_type: str, trait_name: str, trait_dataset_id: int, + trait_dataset_name: str, conn: Any): """Retrieves the trait information. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 -- cgit v1.2.3 From 77312535e643e4c8fecd7c20b3381996808dea11 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 29 Jul 2021 14:09:49 +0300 Subject: Add partial type annotations for slink module Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add some type annotations for the `nearest` function. * Leave some comments regarding the issues experienced when trying to add some typing annotations to the function to help with future endeavours of the same. --- gn3/computations/slink.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'gn3') diff --git a/gn3/computations/slink.py b/gn3/computations/slink.py index 4aac6b3..23d3d88 100644 --- a/gn3/computations/slink.py +++ b/gn3/computations/slink.py @@ -7,6 +7,10 @@ slink: TODO: Describe what the function does... """ import logging +from typing import List, Tuple, Union, Sequence + +NumType = Union[int, float] +SeqOfNums = Sequence[NumType] class LengthError(BaseException): """Raised whenever child lists/tuples are not the same length as the parent @@ -73,7 +77,10 @@ raise an exception.""" def __flatten_list_of_lists(parent): return [item for child in parent for item in child] -def nearest(lists, i, j): +# i and j are Union[SeqOfNums, NumType], but that leads to errors where the +# values of i or j are indexed, since the NumType type is not indexable. +# I don't know how to type this so that it does not fail on running `mypy .` +def nearest(lists: Sequence[SeqOfNums], i, j) -> NumType: """ Computes shortest distance between member(s) in `i` and member(s) in `j`. @@ -126,6 +133,10 @@ def nearest(lists, i, j): raise ValueError("member values (i or j) should be lists/tuples of integers or integers") +# `lists` here could be Sequence[SeqOfNums], but that leads to errors I do not +# understand down the line +# Might have to re-implement the function especially since the errors are thrown +# where `listindexcopy` is involved def slink(lists): """ DESCRIPTION: -- cgit v1.2.3 From c4f362d9a9b83f4fc6fadde0989663dd34fb0b07 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 30 Jul 2021 08:29:38 +0300 Subject: Return dict from query functions Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/traits.py: return dicts rather than tuples/list * tests/unit/db/test_traits.py: Update tests Return dicts with the key-value pairs set up so as to ease with the data manipulation down the pipeline. This is also useful to help with the retrieval of all other extra information that was left out in the first iteration. This commit also updates the tests by ensuring they expect dicts rather than tuples. --- gn3/db/traits.py | 141 +++++++++++++++++++++++-------------------- tests/unit/db/test_traits.py | 83 +++++++++++++++++++------ 2 files changed, 140 insertions(+), 84 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index ae1939a..9742fa2 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -100,119 +100,128 @@ def retrieve_trait_dataset_name( cursor.execute(query, {"threshold": threshold, "name": name}) return cursor.fetchone() -PUBLISH_TRAIT_INFO_QUERY = ( - "SELECT " - "PublishXRef.Id, Publication.PubMed_ID, " - "Phenotype.Pre_publication_description, " - "Phenotype.Post_publication_description, " - "Phenotype.Original_description, " - "Phenotype.Pre_publication_abbreviation, " - "Phenotype.Post_publication_abbreviation, " - "Phenotype.Lab_code, Phenotype.Submitter, Phenotype.Owner, " - "Phenotype.Authorized_Users, CAST(Publication.Authors AS BINARY), " - "Publication.Title, Publication.Abstract, Publication.Journal, " - "Publication.Volume, Publication.Pages, Publication.Month, " - "Publication.Year, PublishXRef.Sequence, Phenotype.Units, " - "PublishXRef.comments " - "FROM " - "PublishXRef, Publication, Phenotype, PublishFreeze " - "WHERE " - "PublishXRef.Id = %(trait_name)s AND " - "Phenotype.Id = PublishXRef.PhenotypeId AND " - "Publication.Id = PublishXRef.PublicationId AND " - "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND " - "PublishFreeze.Id =%(trait_dataset_id)s") - def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Publish` traits. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L399-L421""" + keys = ( + "Id", "PubMed_ID", "Pre_publication_description", + "Post_publication_description", "Original_description", + "Pre_publication_abbreviation", "Post_publication_abbreviation", + "Lab_code", "Submitter", "Owner", "Authorized_Users", "Authors", + "Title", "Abstract", "Journal", "Volume", "Pages", "Month", "Year", + "Sequence", "Units", "comments") + columns = ( + "PublishXRef.Id, Publication.PubMed_ID, " + "Phenotype.Pre_publication_description, " + "Phenotype.Post_publication_description, " + "Phenotype.Original_description, " + "Phenotype.Pre_publication_abbreviation, " + "Phenotype.Post_publication_abbreviation, " + "Phenotype.Lab_code, Phenotype.Submitter, Phenotype.Owner, " + "Phenotype.Authorized_Users, CAST(Publication.Authors AS BINARY), " + "Publication.Title, Publication.Abstract, Publication.Journal, " + "Publication.Volume, Publication.Pages, Publication.Month, " + "Publication.Year, PublishXRef.Sequence, Phenotype.Units, " + "PublishXRef.comments") + query = ( + "SELECT " + "{columns} " + "FROM " + "PublishXRef, Publication, Phenotype, PublishFreeze " + "WHERE " + "PublishXRef.Id = %(trait_name)s AND " + "Phenotype.Id = PublishXRef.PhenotypeId AND " + "Publication.Id = PublishXRef.PublicationId AND " + "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND " + "PublishFreeze.Id =%(trait_dataset_id)s").format( + columns = columns) with conn.cursor() as cursor: cursor.execute( - PUBLISH_TRAIT_INFO_QUERY, + query, { k:v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_id"] }) - return cursor.fetchone() - -PROBESET_TRAIT_INFO_QUERY = ( - "SELECT " - "ProbeSet.name, ProbeSet.symbol, ProbeSet.description, " - "ProbeSet.probe_target_description, ProbeSet.chr, ProbeSet.mb, " - "ProbeSet.alias, ProbeSet.geneid, ProbeSet.genbankid, ProbeSet.unigeneid, " - "ProbeSet.omim, ProbeSet.refseq_transcriptid, ProbeSet.blatseq, " - "ProbeSet.targetseq, ProbeSet.chipid, ProbeSet.comments, " - "ProbeSet.strand_probe, ProbeSet.strand_gene, " - "ProbeSet.probe_set_target_region, ProbeSet.proteinid, " - "ProbeSet.probe_set_specificity, ProbeSet.probe_set_blat_score, " - "ProbeSet.probe_set_blat_mb_start, ProbeSet.probe_set_blat_mb_end, " - "ProbeSet.probe_set_strand, ProbeSet.probe_set_note_by_rw, " - "ProbeSet.flag " - "FROM " - "ProbeSet, ProbeSetFreeze, ProbeSetXRef " - "WHERE " - "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND " - "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " - "ProbeSetFreeze.Name = %(trait_dataset_name)s AND " - "ProbeSet.Name = %(trait_name)s") + return dict(zip((k.lower() for k in keys), cursor.fetchone())) def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `ProbeSet` traits. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L424-L435""" + keys = ( + "name", "symbol", "description", "probe_target_description", "chr", + "mb", "alias", "geneid", "genbankid", "unigeneid", "omim", + "refseq_transcriptid", "blatseq", "targetseq", "chipid", "comments", + "strand_probe", "strand_gene", "probe_set_target_region", "proteinid", + "probe_set_specificity", "probe_set_blat_score", + "probe_set_blat_mb_start", "probe_set_blat_mb_end", "probe_set_strand", + "probe_set_note_by_rw", "flag") + query = ( + "SELECT " + "{columns} " + "FROM " + "ProbeSet, ProbeSetFreeze, ProbeSetXRef " + "WHERE " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND " + "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " + "ProbeSetFreeze.Name = %(trait_dataset_name)s AND " + "ProbeSet.Name = %(trait_name)s").format( + columns = ", ".join(["ProbeSet.{}".format(x) for x in keys])) with conn.cursor() as cursor: cursor.execute( - PROBESET_TRAIT_INFO_QUERY, + query, { k:v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_name"] }) - return cursor.fetchone() - -GENO_TRAIT_INFO_QUERY = ( - "SELECT " - "Geno.name, Geno.chr, Geno.mb, Geno.source2, Geno.sequence " - "FROM " - "Geno, GenoFreeze, GenoXRef " - "WHERE " - "GenoXRef.GenoFreezeId = GenoFreeze.Id AND GenoXRef.GenoId = Geno.Id AND " - "GenoFreeze.Name = %(trait_dataset_name)s AND Geno.Name = %(trait_name)s") + return dict(zip(keys, cursor.fetchone())) def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Geno` traits. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L438-L449""" + keys = ("name", "chr", "mb", "source2", "sequence") + query = ( + "SELECT " + "{columns} " + "FROM " + "Geno, GenoFreeze, GenoXRef " + "WHERE " + "GenoXRef.GenoFreezeId = GenoFreeze.Id AND GenoXRef.GenoId = Geno.Id AND " + "GenoFreeze.Name = %(trait_dataset_name)s AND " + "Geno.Name = %(trait_name)s").format( + columns = ", ".join(["Geno.{}".format(x) for x in keys])) with conn.cursor() as cursor: cursor.execute( - GENO_TRAIT_INFO_QUERY, + query, { k:v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_name"] }) - return cursor.fetchone() - -TEMP_TRAIT_INFO_QUERY = ( - "SELECT name, description FROM Temp " - "WHERE Name = %(trait_name)s") + return dict(zip(keys, cursor.fetchone())) def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Temp` traits. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L450-452""" + keys = ("name", "description") + query = ( + "SELECT {columns} FROM Temp " + "WHERE Name = %(trait_name)s").format(columns = ", ".join(keys)) with conn.cursor() as cursor: cursor.execute( - TEMP_TRAIT_INFO_QUERY, + query, { k:v for k, v in trait_data_source.items() if k in ["trait_name"] }) - return cursor.fetchone() + return dict(zip(keys, cursor.fetchone())) def retrieve_trait_info( trait_type: str, trait_name: str, trait_dataset_id: int, - trait_dataset_name: str, conn: Any): + trait_dataset_name: str, conn: Any, QTL = None): """Retrieves the trait information. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index c8f28b5..393983d 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -1,10 +1,5 @@ """Tests for gn3/db/traits.py""" from unittest import mock, TestCase -from gn3.db.traits import ( - GENO_TRAIT_INFO_QUERY, - TEMP_TRAIT_INFO_QUERY, - PUBLISH_TRAIT_INFO_QUERY, - PROBESET_TRAIT_INFO_QUERY) from gn3.db.traits import ( retrieve_trait_info, retrieve_geno_trait_info, @@ -14,7 +9,6 @@ from gn3.db.traits import ( retrieve_probeset_trait_info, update_sample_data) - class TestTraitsDBFunctions(TestCase): "Test cases for traits functions" @@ -54,12 +48,32 @@ class TestTraitsDBFunctions(TestCase): trait_source = { "trait_name": "PublishTraitName", "trait_dataset_id": 1} self.assertEqual( - retrieve_publish_trait_info( - trait_source, - db_mock), - tuple()) + retrieve_publish_trait_info(trait_source, db_mock), {}) cursor.execute.assert_called_once_with( - PUBLISH_TRAIT_INFO_QUERY, trait_source) + ("SELECT " + "PublishXRef.Id, Publication.PubMed_ID," + " Phenotype.Pre_publication_description," + " Phenotype.Post_publication_description," + " Phenotype.Original_description," + " Phenotype.Pre_publication_abbreviation," + " Phenotype.Post_publication_abbreviation," + " Phenotype.Lab_code, Phenotype.Submitter, Phenotype.Owner," + " Phenotype.Authorized_Users," + " CAST(Publication.Authors AS BINARY)," + " Publication.Title, Publication.Abstract," + " Publication.Journal," + " Publication.Volume, Publication.Pages, Publication.Month," + " Publication.Year, PublishXRef.Sequence, Phenotype.Units," + " PublishXRef.comments" + " FROM" + " PublishXRef, Publication, Phenotype, PublishFreeze" + " WHERE" + " PublishXRef.Id = %(trait_name)s " + " AND Phenotype.Id = PublishXRef.PhenotypeId" + " AND Publication.Id = PublishXRef.PublicationId" + " AND PublishXRef.InbredSetId = PublishFreeze.InbredSetId" + " AND PublishFreeze.Id =%(trait_dataset_id)s"), + trait_source) def test_retrieve_probeset_trait_info(self): """Test retrieval of type `Probeset` traits.""" @@ -70,9 +84,31 @@ class TestTraitsDBFunctions(TestCase): "trait_name": "ProbeSetTraitName", "trait_dataset_name": "ProbeSetDatasetTraitName"} self.assertEqual( - retrieve_probeset_trait_info(trait_source, db_mock), tuple()) + retrieve_probeset_trait_info(trait_source, db_mock), {}) cursor.execute.assert_called_once_with( - PROBESET_TRAIT_INFO_QUERY, trait_source) + ( + "SELECT " + "ProbeSet.name, ProbeSet.symbol, ProbeSet.description, " + "ProbeSet.probe_target_description, ProbeSet.chr, " + "ProbeSet.mb, ProbeSet.alias, ProbeSet.geneid, " + "ProbeSet.genbankid, ProbeSet.unigeneid, ProbeSet.omim, " + "ProbeSet.refseq_transcriptid, ProbeSet.blatseq, " + "ProbeSet.targetseq, ProbeSet.chipid, ProbeSet.comments, " + "ProbeSet.strand_probe, ProbeSet.strand_gene, " + "ProbeSet.probe_set_target_region, ProbeSet.proteinid, " + "ProbeSet.probe_set_specificity, " + "ProbeSet.probe_set_blat_score, " + "ProbeSet.probe_set_blat_mb_start, " + "ProbeSet.probe_set_blat_mb_end, " + "ProbeSet.probe_set_strand, ProbeSet.probe_set_note_by_rw, " + "ProbeSet.flag " + "FROM " + "ProbeSet, ProbeSetFreeze, ProbeSetXRef " + "WHERE " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetXRef.ProbeSetId = ProbeSet.Id " + "AND ProbeSetFreeze.Name = %(trait_dataset_name)s " + "AND ProbeSet.Name = %(trait_name)s"), trait_source) def test_retrieve_geno_trait_info(self): """Test retrieval of type `Geno` traits.""" @@ -83,9 +119,19 @@ class TestTraitsDBFunctions(TestCase): "trait_name": "GenoTraitName", "trait_dataset_name": "GenoDatasetTraitName"} self.assertEqual( - retrieve_geno_trait_info(trait_source, db_mock), tuple()) + retrieve_geno_trait_info(trait_source, db_mock), {}) cursor.execute.assert_called_once_with( - GENO_TRAIT_INFO_QUERY, trait_source) + ( + "SELECT " + "Geno.name, Geno.chr, Geno.mb, Geno.source2, Geno.sequence " + "FROM " + "Geno, GenoFreeze, GenoXRef " + "WHERE " + "GenoXRef.GenoFreezeId = GenoFreeze.Id " + "AND GenoXRef.GenoId = Geno.Id " + "AND GenoFreeze.Name = %(trait_dataset_name)s " + "AND Geno.Name = %(trait_name)s"), + trait_source) def test_retrieve_temp_trait_info(self): """Test retrieval of type `Temp` traits.""" @@ -94,9 +140,10 @@ class TestTraitsDBFunctions(TestCase): cursor.fetchone.return_value = tuple() trait_source = {"trait_name": "TempTraitName"} self.assertEqual( - retrieve_temp_trait_info(trait_source, db_mock), tuple()) + retrieve_temp_trait_info(trait_source, db_mock), {}) cursor.execute.assert_called_once_with( - TEMP_TRAIT_INFO_QUERY, trait_source) + "SELECT name, description FROM Temp WHERE Name = %(trait_name)s", + trait_source) def test_retrieve_trait_info(self): """Test that information on traits is retrieved as appropriate.""" @@ -113,7 +160,7 @@ class TestTraitsDBFunctions(TestCase): retrieve_trait_info( trait_type, trait_name, trait_dataset_id, trait_dataset_name, db_mock), - tuple()) + {}) def test_update_sample_data(self): """Test that the SQL queries when calling update_sample_data are called with -- cgit v1.2.3 From 3e5ce62f9c46ff63d3ba3d83140ed698a934a7c3 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 30 Jul 2021 10:09:56 +0300 Subject: Add module for common utilities Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/function_helpers.py: new file Provides a new module to hold common programming utilities that are generic enough that they will find use across the entire application. The first utility function provided in this commit is the `compose` function, whose purpose, as indicated by its name, is to take a number of functions and compose them into a single function, which when called, will return the same result that would have been got had the user called the functions in a chain from right to left. --- gn3/function_helpers.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 gn3/function_helpers.py (limited to 'gn3') diff --git a/gn3/function_helpers.py b/gn3/function_helpers.py new file mode 100644 index 0000000..397b2da --- /dev/null +++ b/gn3/function_helpers.py @@ -0,0 +1,36 @@ +""" +This module will contain helper functions that should assist in maintaining a +mostly functional way of programming. + +It will also contain miscellaneous functions that can be used globally, and thus +do not fit well in any other module. + +FUNCTIONS: +compose: This function is used to compose multiple functions into a single + function. It passes the results of calling one function to the other until + all the functions to be composed are called. +""" +from functools import reduce + +def compose(*functions): + """Compose multiple functions into a single function. + + The utility in this function is not specific to this module, and as such, + this function can, and probably should, be moved to a more global module. + + DESCRIPTION: + Given `cfn = compose(f_1, f_2, ... f_(n-1), f_n )`, calling + `cfn(arg_1, arg_2, ..., arg_m)` should call `f_n` with the arguments passed + to `cfn` and the results of that should be passed as arguments to `f_(n-1)` + and so on until `f_1` is called with the results of the cumulative calls and + that is the result of the entire chain of calls. + + PARAMETERS: + functions: a variable argument list of function. + """ + def composed_function(*args, **kwargs): + return reduce( + lambda res, fn: fn(res), + reversed(functions[:-1]), + functions[-1](*args, **kwargs)) + return composed_function -- cgit v1.2.3 From 238450af8aa3395b3ae5a636fada67206a863d85 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Fri, 30 Jul 2021 10:33:40 +0300 Subject: Rework db functions to enable postprocessing Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Rework the database functions to return a dict of key-value pairs, which eases the postprocessing of the trait information. The postprocessing is mainly to try an maintain data compatibility with the code that is at the following locations: https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py This was mainly a proof-of-concept, and the functions do not have testing added for them: there is therefore need to add testing for the new functions, and probably even rework them if they are found to be complicated. --- gn3/db/traits.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 8 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 9742fa2..d8d2b62 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,5 +1,6 @@ """This class contains functions relating to trait data manipulation""" from typing import Any, Dict, Union +from gn3.function_helpers import compose def get_trait_csv_sample_data(conn: Any, @@ -135,8 +136,7 @@ def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): "Phenotype.Id = PublishXRef.PhenotypeId AND " "Publication.Id = PublishXRef.PublicationId AND " "PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND " - "PublishFreeze.Id =%(trait_dataset_id)s").format( - columns = columns) + "PublishFreeze.Id =%(trait_dataset_id)s").format(columns=columns) with conn.cursor() as cursor: cursor.execute( query, @@ -144,7 +144,17 @@ def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): k:v for k, v in trait_data_source.items() if k in ["trait_name", "trait_dataset_id"] }) - return dict(zip((k.lower() for k in keys), cursor.fetchone())) + return dict(zip([k.lower() for k in keys], cursor.fetchone())) + +def set_confidential_field(trait_info): + """Post processing function for 'Publish' trait types. + + It sets the value for the 'confidential' key.""" + return { + **trait_info, + "confidential": 1 if ( + trait_info.get("pre_publication_description", None) + and not trait_info.get("pubmed_id", None)) else 0} def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `ProbeSet` traits. @@ -168,7 +178,7 @@ def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " "ProbeSetFreeze.Name = %(trait_dataset_name)s AND " "ProbeSet.Name = %(trait_name)s").format( - columns = ", ".join(["ProbeSet.{}".format(x) for x in keys])) + columns=", ".join(["ProbeSet.{}".format(x) for x in keys])) with conn.cursor() as cursor: cursor.execute( query, @@ -192,7 +202,7 @@ def retrieve_geno_trait_info(trait_data_source: Dict[str, Any], conn: Any): "GenoXRef.GenoFreezeId = GenoFreeze.Id AND GenoXRef.GenoId = Geno.Id AND " "GenoFreeze.Name = %(trait_dataset_name)s AND " "Geno.Name = %(trait_name)s").format( - columns = ", ".join(["Geno.{}".format(x) for x in keys])) + columns=", ".join(["Geno.{}".format(x) for x in keys])) with conn.cursor() as cursor: cursor.execute( query, @@ -209,7 +219,7 @@ def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): keys = ("name", "description") query = ( "SELECT {columns} FROM Temp " - "WHERE Name = %(trait_name)s").format(columns = ", ".join(keys)) + "WHERE Name = %(trait_name)s").format(columns=", ".join(keys)) with conn.cursor() as cursor: cursor.execute( query, @@ -219,9 +229,53 @@ def retrieve_temp_trait_info(trait_data_source: Dict[str, Any], conn: Any): }) return dict(zip(keys, cursor.fetchone())) +def set_haveinfo_field(trait_info): + """ + Common postprocessing function for all trait types. + + Sets the value for the 'haveinfo' field.""" + return {**trait_info, "haveinfo": 1 if trait_info else 0} + +def set_homologene_id_field_probeset(trait_info, conn): + """ + Postprocessing function for 'ProbeSet' traits. + + Sets the value for the 'homologene' key. + """ + query = ( + "SELECT HomologeneId FROM Homologene, Species, InbredSet" + " WHERE Homologene.GeneId = %(geneid)s AND InbredSet.Name = %(riset)s" + " AND InbredSet.SpeciesId = Species.Id AND" + " Species.TaxonomyId = Homologene.TaxonomyId") + with conn.cursor() as cursor: + cursor.execute( + query, + { + k:v for k, v in trait_info.items() + if k in ["geneid", "riset"] + }) + res = cursor.fetchone() + if res: + return {**trait_info, "homologeneid": res[0]} + return {**trait_info, "homologeneid": None} + +def set_homologene_id_field(trait_info, conn): + """ + Common postprocessing function for all trait types. + + Sets the value for the 'homologene' key.""" + set_to_null = lambda ti: {**ti, "homologeneid": None} + functions_table = { + "Temp": set_to_null, + "Geno": set_to_null, + "Publish": set_to_null, + "ProbeSet": lambda ti: set_homologene_id_field_probeset(ti, conn) + } + return functions_table[trait_info["type"]](trait_info) + def retrieve_trait_info( trait_type: str, trait_name: str, trait_dataset_id: int, - trait_dataset_name: str, conn: Any, QTL = None): + trait_dataset_name: str, conn: Any, QTL=None): """Retrieves the trait information. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 @@ -234,7 +288,24 @@ def retrieve_trait_info( "Geno": retrieve_geno_trait_info, "Temp": retrieve_temp_trait_info } - return trait_info_function_table[trait_type]( + + common_post_processing_fn = compose( + lambda ti: set_homologene_id_field(ti, conn), + lambda ti: {"type": trait_type, **ti}, + set_haveinfo_field) + + trait_post_processing_functions_table = { + "Publish": compose(set_confidential_field, common_post_processing_fn), + "ProbeSet": compose(common_post_processing_fn), + "Geno": common_post_processing_fn, + "Temp": common_post_processing_fn + } + + retrieve_info = compose( + trait_post_processing_functions_table[trait_type], + trait_info_function_table[trait_type]) + + return retrieve_info( { "trait_name": trait_name, "trait_dataset_id": trait_dataset_id, -- cgit v1.2.3 From 630488edfd75c428dc18e09d9336c6f16531130d Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 4 Aug 2021 09:39:04 +0300 Subject: Avoid string interpolation: use prepared statement Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Following Arun's comment at https://github.com/genenetwork/genenetwork3/pull/31#issuecomment-890915813 this commit eliminates string interpolation, and adds a map of tables for the various types of traits dataset names --- gn3/db/traits.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index d8d2b62..902eb8b 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -77,7 +77,6 @@ def update_sample_data(conn: Any, return (updated_strains, updated_published_data, updated_se_data, updated_n_strains) - def retrieve_trait_dataset_name( trait_type: str, threshold: int, name: str, connection: Any): """ @@ -87,18 +86,29 @@ def retrieve_trait_dataset_name( implemented at https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-L169 """ + table_map = { + "ProbeSet": "ProbeSetFreeze", + "Publish": "PublishFreeze", + "Geno": "GenoFreeze", + "Temp": "TempFreeze"} columns = "Id, Name, FullName, ShortName{}".format( ", DataScale" if trait_type == "ProbeSet" else "") query = ( - "SELECT {columns} " - "FROM {trait_type}Freeze " + "SELECT %(columns)s " + "FROM %(table)s " "WHERE " "public > %(threshold)s " "AND " - "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)").format( - columns=columns, trait_type=trait_type) + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") with connection.cursor() as cursor: - cursor.execute(query, {"threshold": threshold, "name": name}) + cursor.execute( + query, + { + "table": table_map[trait_type], + "columns": columns, + "threshold": threshold, + "name": name + }) return cursor.fetchone() -- cgit v1.2.3 From 53a8b6aa977bc6c051625a812009184f78da597d Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 4 Aug 2021 10:05:33 +0300 Subject: Add tests for post-processing functions Issues: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add missing tests for some post-processing functions --- gn3/db/traits.py | 13 ++++++++----- tests/unit/db/test_traits.py | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 7 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 902eb8b..ce6298f 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -160,11 +160,14 @@ def set_confidential_field(trait_info): """Post processing function for 'Publish' trait types. It sets the value for the 'confidential' key.""" - return { - **trait_info, - "confidential": 1 if ( - trait_info.get("pre_publication_description", None) - and not trait_info.get("pubmed_id", None)) else 0} + if trait_info["type"] == "Publish": + return { + **trait_info, + "confidential": 1 if ( + trait_info.get("pre_publication_description", None) + and not trait_info.get("pubmed_id", None)) else 0} + else: + return trait_info def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `ProbeSet` traits. diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 3840dd1..7e8b29c 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -1,13 +1,16 @@ """Tests for gn3/db/traits.py""" from unittest import mock, TestCase from gn3.db.traits import ( + set_haveinfo_field, + update_sample_data, retrieve_trait_info, + set_confidential_field, + set_homologene_id_field, retrieve_geno_trait_info, retrieve_temp_trait_info, retrieve_trait_dataset_name, retrieve_publish_trait_info, - retrieve_probeset_trait_info, - update_sample_data) + retrieve_probeset_trait_info) class TestTraitsDBFunctions(TestCase): "Test cases for traits functions" @@ -198,3 +201,35 @@ class TestTraitsDBFunctions(TestCase): mock.call(PUBLISH_SE_SQL, (2.3, 10, 8967049)), mock.call(N_STRAIN_SQL, (2, 10, 8967049))] ) + + def test_set_haveinfo_field(self): + for trait_info, expected in [ + [{}, {"haveinfo": 0}], + [{"k1": "v1"}, {"k1": "v1", "haveinfo": 1}]]: + with self.subTest(trait_info=trait_info, expected=expected): + self.assertEqual(set_haveinfo_field(trait_info), expected) + + def test_set_homologene_id_field(self): + for trait_info, expected in [ + [{"type": "Publish"}, + {"type": "Publish", "homologeneid": None}], + [{"type": "ProbeSet"}, + {"type": "ProbeSet", "homologeneid": None}], + [{"type": "Geno"}, {"type": "Geno", "homologeneid": None}], + [{"type": "Temp"}, {"type": "Temp", "homologeneid": None}]]: + db_mock = mock.MagicMock() + with self.subTest(trait_info=trait_info, expected=expected): + with db_mock.cursor() as cursor: + cursor.fetchone.return_value = () + self.assertEqual( + set_homologene_id_field(trait_info, db_mock), expected) + + def test_set_confidential_field(self): + for trait_info, expected in [ + [{"type": "Publish"}, {"type": "Publish", "confidential": 0}], + [{"type": "ProbeSet"}, {"type": "ProbeSet"}], + [{"type": "Geno"}, {"type": "Geno"}], + [{"type": "Temp"}, {"type": "Temp"}]]: + with self.subTest(trait_info=trait_info, expected=expected): + self.assertEqual( + set_confidential_field(trait_info), expected) -- cgit v1.2.3 From 0d7ebb87e2995207f23bc8b8e05e64aaab50b48d Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 4 Aug 2021 11:27:24 +0300 Subject: Retrieve the RISet and RISet ID values Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Retrieve the RISet and RISet ID values from the database. --- gn3/db/traits.py | 57 +++++++++++++++++++++++ tests/unit/db/test_traits.py | 105 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index ce6298f..ea35d7e 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -286,6 +286,62 @@ def set_homologene_id_field(trait_info, conn): } return functions_table[trait_info["type"]](trait_info) +def set_geno_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various Geno trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, GenoFreeze " + "WHERE GenoFreeze.InbredSetId = InbredSet.Id " + "AND GenoFreeze.Name = %(name)s") + with conn.cursor() as cursor: + return cursor.execute(query, {"name": name}) + +def set_publish_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various Publish trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, PublishFreeze " + "WHERE PublishFreeze.InbredSetId = InbredSet.Id " + "AND PublishFreeze.Name = %(name)s") + with conn.cursor() as cursor: + return cursor.execute(query, {"name": name}) + +def set_probeset_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various ProbeSet trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, ProbeSetFreeze, ProbeFreeze " + "WHERE ProbeFreeze.InbredSetId = InbredSet.Id " + "AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId " + "AND ProbeSetFreeze.Name = %(name)s") + with conn.cursor() as cursor: + return cursor.execute(query, {"name": name}) + +def set_riset_fields(trait_info, conn): + """ + Retrieve the RISet, and RISetID values for various trait types. + """ + riset_functions_map = { + "Temp": lambda ti, con: (None, None), + "Geno": set_geno_riset_fields, + "Publish": set_publish_riset_fields, + "ProbeSet": set_probeset_riset_fields + } + if not trait_info.get("haveinfo", None): + return trait_info + + riset, riid = riset_functions_map[trait_info["type"]]( + trait_info["name"], conn) + return { + **trait_info, "risetid": riid, + "riset": "BXD" if riset == "BXD300" else riset} + def retrieve_trait_info( trait_type: str, trait_name: str, trait_dataset_id: int, trait_dataset_name: str, conn: Any, QTL=None): @@ -303,6 +359,7 @@ def retrieve_trait_info( } common_post_processing_fn = compose( + lambda ti: set_riset_fields(ti, conn), lambda ti: set_homologene_id_field(ti, conn), lambda ti: {"type": trait_type, **ti}, set_haveinfo_field) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 7e8b29c..2445d26 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -1,13 +1,17 @@ """Tests for gn3/db/traits.py""" from unittest import mock, TestCase from gn3.db.traits import ( + set_riset_fields, set_haveinfo_field, update_sample_data, retrieve_trait_info, + set_geno_riset_fields, set_confidential_field, set_homologene_id_field, retrieve_geno_trait_info, retrieve_temp_trait_info, + set_publish_riset_fields, + set_probeset_riset_fields, retrieve_trait_dataset_name, retrieve_publish_trait_info, retrieve_probeset_trait_info) @@ -233,3 +237,104 @@ class TestTraitsDBFunctions(TestCase): with self.subTest(trait_info=trait_info, expected=expected): self.assertEqual( set_confidential_field(trait_info), expected) + + def test_set_geno_riset_fields(self): + """ + Test that the `riset` and `riset_id` fields are retrieved appropriately + for the 'Geno' trait type. + """ + for trait_name, expected in [ + ["testGenoName", ()]]: + db_mock = mock.MagicMock() + with self.subTest(trait_name=trait_name, expected=expected): + with db_mock.cursor() as cursor: + cursor.execute.return_value = () + self.assertEqual( + set_geno_riset_fields(trait_name, db_mock), expected) + cursor.execute.assert_called_once_with( + ( + "SELECT InbredSet.Name, InbredSet.Id" + " FROM InbredSet, GenoFreeze" + " WHERE GenoFreeze.InbredSetId = InbredSet.Id" + " AND GenoFreeze.Name = %(name)s"), + {"name": trait_name}) + + + def test_set_publish_riset_fields(self): + """ + Test that the `riset` and `riset_id` fields are retrieved appropriately + for the 'Publish' trait type. + """ + for trait_name, expected in [ + ["testPublishName", ()]]: + db_mock = mock.MagicMock() + with self.subTest(trait_name=trait_name, expected=expected): + with db_mock.cursor() as cursor: + cursor.execute.return_value = () + self.assertEqual( + set_publish_riset_fields(trait_name, db_mock), expected) + cursor.execute.assert_called_once_with( + ( + "SELECT InbredSet.Name, InbredSet.Id" + " FROM InbredSet, PublishFreeze" + " WHERE PublishFreeze.InbredSetId = InbredSet.Id" + " AND PublishFreeze.Name = %(name)s"), + {"name": trait_name}) + + + def test_set_probeset_riset_fields(self): + """ + Test that the `riset` and `riset_id` fields are retrieved appropriately + for the 'ProbeSet' trait type. + """ + for trait_name, expected in [ + ["testProbeSetName", ()]]: + db_mock = mock.MagicMock() + with self.subTest(trait_name=trait_name, expected=expected): + with db_mock.cursor() as cursor: + cursor.execute.return_value = () + self.assertEqual( + set_probeset_riset_fields(trait_name, db_mock), expected) + cursor.execute.assert_called_once_with( + ( + "SELECT InbredSet.Name, InbredSet.Id" + " FROM InbredSet, ProbeSetFreeze, ProbeFreeze" + " WHERE ProbeFreeze.InbredSetId = InbredSet.Id" + " AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId" + " AND ProbeSetFreeze.Name = %(name)s"), + {"name": trait_name}) + + def test_set_riset_fields(self): + """ + Test that the riset fields are set up correctly for the different trait + types. + """ + for trait_info, expected in [ + [{}, {}], + [{"haveinfo": 0, "type": "Publish"}, + {"haveinfo": 0, "type": "Publish"}], + [{"haveinfo": 0, "type": "ProbeSet"}, + {"haveinfo": 0, "type": "ProbeSet"}], + [{"haveinfo": 0, "type": "Geno"}, + {"haveinfo": 0, "type": "Geno"}], + [{"haveinfo": 0, "type": "Temp"}, + {"haveinfo": 0, "type": "Temp"}], + [{"haveinfo": 1, "type": "Publish", "name": "test"}, + {"haveinfo": 1, "type": "Publish", "name": "test", + "riset": "riset_name", "risetid": 0}], + [{"haveinfo": 1, "type": "ProbeSet", "name": "test"}, + {"haveinfo": 1, "type": "ProbeSet", "name": "test", + "riset": "riset_name", "risetid": 0}], + [{"haveinfo": 1, "type": "Geno", "name": "test"}, + {"haveinfo": 1, "type": "Geno", "name": "test", + "riset": "riset_name", "risetid": 0}], + [{"haveinfo": 1, "type": "Temp", "name": "test"}, + {"haveinfo": 1, "type": "Temp", "name": "test", "riset": None, + "risetid": None}] + ]: + db_mock = mock.MagicMock() + with self.subTest(trait_info=trait_info, expected=expected): + with db_mock.cursor() as cursor: + cursor.execute.return_value = ("riset_name", 0) + self.assertEqual( + set_riset_fields(trait_info, db_mock), expected) -- cgit v1.2.3 From f712da630c1a3642cb44b62c4b2b857373cd78d7 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Wed, 4 Aug 2021 11:30:44 +0300 Subject: Fix issues caught by pylint * gn3/computations/slink.py: remove unused imports * gn3/db/traits.py: remove unnecessary `else` clauses * tests/unit/db/test_traits.py: add docstrings for functions --- gn3/computations/slink.py | 2 +- gn3/db/traits.py | 6 +++--- tests/unit/db/test_traits.py | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'gn3') diff --git a/gn3/computations/slink.py b/gn3/computations/slink.py index 23d3d88..5953e6b 100644 --- a/gn3/computations/slink.py +++ b/gn3/computations/slink.py @@ -7,7 +7,7 @@ slink: TODO: Describe what the function does... """ import logging -from typing import List, Tuple, Union, Sequence +from typing import Union, Sequence NumType = Union[int, float] SeqOfNums = Sequence[NumType] diff --git a/gn3/db/traits.py b/gn3/db/traits.py index ea35d7e..29c91a6 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -166,8 +166,7 @@ def set_confidential_field(trait_info): "confidential": 1 if ( trait_info.get("pre_publication_description", None) and not trait_info.get("pubmed_id", None)) else 0} - else: - return trait_info + return trait_info def retrieve_probeset_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `ProbeSet` traits. @@ -344,13 +343,14 @@ def set_riset_fields(trait_info, conn): def retrieve_trait_info( trait_type: str, trait_name: str, trait_dataset_id: int, - trait_dataset_name: str, conn: Any, QTL=None): + trait_dataset_name: str, conn: Any, qtl=None): """Retrieves the trait information. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 This function, or the dependent functions, might be incomplete as they are currently.""" + # pylint: disable=[R0913] trait_info_function_table = { "Publish": retrieve_publish_trait_info, "ProbeSet": retrieve_probeset_trait_info, diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 2445d26..1c481a2 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -207,6 +207,7 @@ class TestTraitsDBFunctions(TestCase): ) def test_set_haveinfo_field(self): + """Test that the `haveinfo` field is set up correctly""" for trait_info, expected in [ [{}, {"haveinfo": 0}], [{"k1": "v1"}, {"k1": "v1", "haveinfo": 1}]]: @@ -214,6 +215,7 @@ class TestTraitsDBFunctions(TestCase): self.assertEqual(set_haveinfo_field(trait_info), expected) def test_set_homologene_id_field(self): + """Test that the `homologene_id` field is set up correctly""" for trait_info, expected in [ [{"type": "Publish"}, {"type": "Publish", "homologeneid": None}], @@ -229,6 +231,7 @@ class TestTraitsDBFunctions(TestCase): set_homologene_id_field(trait_info, db_mock), expected) def test_set_confidential_field(self): + """Test that the `confidential` field is set up correctly""" for trait_info, expected in [ [{"type": "Publish"}, {"type": "Publish", "confidential": 0}], [{"type": "ProbeSet"}, {"type": "ProbeSet"}], -- cgit v1.2.3 From 76ba5296c66e131301a9fdb692c3b2623f3331ed Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 5 Aug 2021 08:40:49 +0300 Subject: Build up trait_name items from full name Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * The full name of the traits from search contains multiple parts to it, and as such, we use it to retrieve the appropriate data and set it up in the final trait_info dictionary that is produced. --- gn3/db/traits.py | 16 +++++++++++++-- tests/unit/db/test_traits.py | 46 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 10 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 29c91a6..9f89510 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -341,8 +341,18 @@ def set_riset_fields(trait_info, conn): **trait_info, "risetid": riid, "riset": "BXD" if riset == "BXD300" else riset} +def build_trait_name(trait_fullname): + name_parts = trait_fullname.split("::") + assert len(name_parts) >= 2, "Name format error" + return { + "trait_db": name_parts[0], + "trait_fullname": trait_fullname, + "trait_name": name_parts[1], + "cellid": name_parts[2] if len(name_parts) == 3 else "" + } + def retrieve_trait_info( - trait_type: str, trait_name: str, trait_dataset_id: int, + trait_type: str, trait_full_name: str, trait_dataset_id: int, trait_dataset_name: str, conn: Any, qtl=None): """Retrieves the trait information. @@ -351,6 +361,7 @@ def retrieve_trait_info( This function, or the dependent functions, might be incomplete as they are currently.""" # pylint: disable=[R0913] + trait = build_trait_name(trait_full_name) trait_info_function_table = { "Publish": retrieve_publish_trait_info, "ProbeSet": retrieve_probeset_trait_info, @@ -362,6 +373,7 @@ def retrieve_trait_info( lambda ti: set_riset_fields(ti, conn), lambda ti: set_homologene_id_field(ti, conn), lambda ti: {"type": trait_type, **ti}, + lambda ti: {**ti, **trait}, set_haveinfo_field) trait_post_processing_functions_table = { @@ -377,7 +389,7 @@ def retrieve_trait_info( return retrieve_info( { - "trait_name": trait_name, + "trait_name": trait["trait_name"], "trait_dataset_id": trait_dataset_id, "trait_dataset_name":trait_dataset_name }, diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 1c481a2..39d7a31 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -1,6 +1,7 @@ """Tests for gn3/db/traits.py""" from unittest import mock, TestCase from gn3.db.traits import ( + build_trait_name, set_riset_fields, set_haveinfo_field, update_sample_data, @@ -155,18 +156,47 @@ class TestTraitsDBFunctions(TestCase): "SELECT name, description FROM Temp WHERE Name = %(trait_name)s", trait_source) + def test_build_trait_name_with_good_fullnames(self): + for fullname, expected in [ + ["testdb::testname", + {"trait_db": "testdb", "trait_name": "testname", "cellid": "", + "trait_fullname": "testdb::testname"}], + ["testdb::testname::testcell", + {"trait_db": "testdb", "trait_name": "testname", + "cellid": "testcell", + "trait_fullname": "testdb::testname::testcell"}]]: + with self.subTest(fullname=fullname): + self.assertEqual(build_trait_name(fullname), expected) + + def test_build_trait_name_with_bad_fullnames(self): + for fullname in ["", "test", "test:test"]: + with self.subTest(fullname=fullname): + with self.assertRaises(AssertionError, msg="Name format error"): + build_trait_name(fullname) + def test_retrieve_trait_info(self): """Test that information on traits is retrieved as appropriate.""" for trait_type, trait_name, trait_dataset_id, trait_dataset_name, expected in [ - ["Publish", "PublishTraitName", 1, "PublishDatasetTraitName", + ["Publish", "pubDb::PublishTraitName::pubCell", 1, + "PublishDatasetTraitName", {"haveinfo": 0, "homologeneid": None, "type": "Publish", - "confidential": 0}], - ["ProbeSet", "ProbeSetTraitName", 2, "ProbeSetDatasetTraitName", - {"haveinfo": 0, "homologeneid": None, "type": "ProbeSet"}], - ["Geno", "GenoTraitName", 3, "GenoDatasetTraitName", - {"haveinfo": 0, "homologeneid": None, "type": "Geno"}], - ["Temp", "TempTraitName", 4, "TempDatasetTraitName", - {"haveinfo": 0, "homologeneid": None, "type": "Temp"}]]: + "confidential": 0, "trait_db": "pubDb", + "trait_name": "PublishTraitName", "cellid": "pubCell", + "trait_fullname": "pubDb::PublishTraitName::pubCell"}], + ["ProbeSet", "prbDb::ProbeSetTraitName::prbCell", 2, + "ProbeSetDatasetTraitName", + {"haveinfo": 0, "homologeneid": None, "type": "ProbeSet", + "trait_fullname": "prbDb::ProbeSetTraitName::prbCell", + "trait_db": "prbDb", "trait_name": "ProbeSetTraitName", + "cellid": "prbCell"}], + ["Geno", "genDb::GenoTraitName", 3, "GenoDatasetTraitName", + {"haveinfo": 0, "homologeneid": None, "type": "Geno", + "trait_fullname": "genDb::GenoTraitName", "trait_db": "genDb", + "trait_name": "GenoTraitName", "cellid": ""}], + ["Temp", "tmpDb::TempTraitName", 4, "TempDatasetTraitName", + {"haveinfo": 0, "homologeneid": None, "type": "Temp", + "trait_fullname": "tmpDb::TempTraitName", "trait_db": "tmpDb", + "trait_name": "TempTraitName", "cellid": ""}]]: db_mock = mock.MagicMock() with self.subTest(trait_type=trait_type): with db_mock.cursor() as cursor: -- cgit v1.2.3 From f1876f8b9939a9b863dc88aab8d3fed3c16ad4e1 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 5 Aug 2021 13:08:57 +0300 Subject: Reorganise the database code Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Reorganise the code to separate the datasets from the traits, and to more closely conform to the same flow as that in GN1 --- gn3/db/datasets.py | 251 +++++++++++++++++++++++++++++++++++++++++ gn3/db/traits.py | 171 ++++++++++++---------------- tests/unit/db/test_datasets.py | 133 ++++++++++++++++++++++ tests/unit/db/test_traits.py | 196 +++++--------------------------- 4 files changed, 485 insertions(+), 266 deletions(-) create mode 100644 gn3/db/datasets.py create mode 100644 tests/unit/db/test_datasets.py (limited to 'gn3') diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py new file mode 100644 index 0000000..3ad50f6 --- /dev/null +++ b/gn3/db/datasets.py @@ -0,0 +1,251 @@ +from typing import Any, Dict, Union + +def retrieve_probeset_trait_dataset_name( + threshold: int, name: str, connection: Any): + query = ( + "SELECT Id, Name, FullName, ShortName, DataScale " + "FROM ProbeSetFreeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") + with connection.cursor() as cursor: + cursor.execute( + query, + { + "threshold": threshold, + "name": name + }) + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname", "dataset_datascale"], + cursor.fetchone)) + +def retrieve_publish_trait_dataset_name(threshold: int, name: str, connection: Any): + query = ( + "SELECT Id, Name, FullName, ShortName " + "FROM PublishFreeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") + with connection.cursor() as cursor: + cursor.execute( + query, + { + "threshold": threshold, + "name": name + }) + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname"], + cursor.fetchone)) + +def retrieve_geno_trait_dataset_name(threshold: int, name: str, connection: Any): + query = ( + "SELECT Id, Name, FullName, ShortName " + "FROM GenoFreeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") + with connection.cursor() as cursor: + cursor.execute( + query, + { + "threshold": threshold, + "name": name + }) + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname"], + cursor.fetchone)) + +def retrieve_temp_trait_dataset_name(threshold: int, name: str, connection: Any): + query = ( + "SELECT Id, Name, FullName, ShortName " + "FROM TempFreeze " + "WHERE " + "public > %(threshold)s " + "AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") + with connection.cursor() as cursor: + cursor.execute( + query, + { + "threshold": threshold, + "name": name + }) + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname"], + cursor.fetchone)) + +def retrieve_dataset_name( + trait_type: str, threshold: int, trait_name: str, dataset_name: str, + conn: Any): + """ + Retrieve the name of a trait given the trait's name + + This is extracted from the `webqtlDataset.retrieveName` function as is + implemented at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-L169 + """ + fn_map = { + "ProbeSet": retrieve_probeset_trait_dataset_name, + "Publish": retrieve_publish_trait_dataset_name, + "Geno": retrieve_geno_trait_dataset_name, + "Temp": retrieve_temp_trait_dataset_name} + if trait_type == "Temp": + return retrieve_temp_trait_dataset_name(threshold, trait_name, conn) + return fn_map[trait_type](threshold, dataset_name, conn) + + +def retrieve_geno_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various Geno trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, GenoFreeze " + "WHERE GenoFreeze.InbredSetId = InbredSet.Id " + "AND GenoFreeze.Name = %(name)s") + with conn.cursor() as cursor: + cursor.execute(query, {"name": name}) + return dict(zip(["riset", "risetid"], cursor.fetchone())) + return {} + +def retrieve_publish_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various Publish trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, PublishFreeze " + "WHERE PublishFreeze.InbredSetId = InbredSet.Id " + "AND PublishFreeze.Name = %(name)s") + with conn.cursor() as cursor: + cursor.execute(query, {"name": name}) + return dict(zip(["riset", "risetid"], cursor.fetchone())) + return {} + +def retrieve_probeset_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for various ProbeSet trait types. + """ + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, ProbeSetFreeze, ProbeFreeze " + "WHERE ProbeFreeze.InbredSetId = InbredSet.Id " + "AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId " + "AND ProbeSetFreeze.Name = %(name)s") + with conn.cursor() as cursor: + cursor.execute(query, {"name": name}) + return dict(zip(["riset", "risetid"], cursor.fetchone())) + return {} + +def retrieve_temp_riset_fields(name, conn): + query = ( + "SELECT InbredSet.Name, InbredSet.Id " + "FROM InbredSet, Temp " + "WHERE Temp.InbredSetId = InbredSet.Id " + "AND Temp.Name = %(name)s") + with conn.cursor() as cursor: + cursor.execute(query, {"name": name}) + return dict(zip(["riset", "risetid"], cursor.fetchone())) + return {} + +def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn): + """ + Retrieve the RISet, and RISetID values for various trait types. + """ + riset_fns_map = { + "Geno": retrieve_geno_riset_fields, + "Publish": retrieve_publish_riset_fields, + "ProbeSet": retrieve_probeset_riset_fields + } + + if trait_type == "Temp": + riset_info = retrieve_temp_riset_fields(trait_name, conn) + else: + riset_info = riset_fns_map[trait_type](dataset_info["dataset_name"], conn) + + return { + **dataset_info, + **riset_info, + "riset": ( + "BXD" if riset_info.get("riset") == "BXD300" + else riset_info.get("riset", "")) + } + +def retrieve_temp_trait_dataset(): + return { + "searchfield": ["name", "description"], + "disfield": ["name", "description"], + "type": "Temp", + "dataset_id": 1, + "fullname": "Temporary Storage", + "shortname": "Temp" + } + +def retrieve_geno_trait_dataset(): + return { + "searchfield": ["name","chr"], + "disfield": ["name","chr","mb", "source2", "sequence"], + "type": "Geno" + } + +def retrieve_publish_trait_dataset(): + return { + "searchfield": [ + "name", "post_publication_description", "abstract", "title", + "authors"], + "disfield": [ + "name","pubmed_id", "pre_publication_description", + "post_publication_description", "original_description", + "pre_publication_abbreviation", "post_publication_abbreviation", + "lab_code", "submitter", "owner", "authorized_users", + "authors","title","abstract", "journal","volume","pages","month", + "year","sequence", "units", "comments"], + "type": "Publish" + } + +def retrieve_probeset_trait_dataset(): + return { + "searchfield": [ + "name", "description", "probe_target_description", "symbol", + "alias", "genbankid", "unigeneid", "omim", "refseq_transcriptid", + "probe_set_specificity", "probe_set_blat_score"], + "disfield": [ + "name", "symbol", "description", "probe_target_description", "chr", + "mb", "alias", "geneid", "genbankid", "unigeneid", "omim", + "refseq_transcriptid", "blatseq", "targetseq", "chipid", "comments", + "strand_probe", "strand_gene", "probe_set_target_region", + "proteinid", "probe_set_specificity", "probe_set_blat_score", + "probe_set_blat_mb_start", "probe_set_blat_mb_end", + "probe_set_strand", "probe_set_note_by_rw", "flag"], + "type": "ProbeSet" + } + +def retrieve_trait_dataset(trait_type, trait, threshold, conn): + dataset_fns = { + "Temp": retrieve_temp_trait_dataset, + "Geno": retrieve_geno_trait_dataset, + "Publish": retrieve_publish_trait_dataset, + "ProbeSet": retrieve_probeset_trait_dataset + } + dataset_name_info = { + "dataset_id": None, + "dataset_name": trait["db"]["dataset_name"], + **retrieve_dataset_name( + trait_type, threshold, trait["trait_name"], trait["db"]["dataset_name"], + conn) + } + riset = retrieve_riset_fields( + trait_type, trait["trait_name"], dataset_name_info, conn) + return { + "display_name": dataset_name_info["dataset_name"], + **dataset_name_info, + **dataset_fns[trait_type](), + **riset + } diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 9f89510..85cccfa 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,6 +1,7 @@ """This class contains functions relating to trait data manipulation""" from typing import Any, Dict, Union from gn3.function_helpers import compose +from gn3.db.datasets import retrieve_trait_dataset def get_trait_csv_sample_data(conn: Any, @@ -77,41 +78,6 @@ def update_sample_data(conn: Any, return (updated_strains, updated_published_data, updated_se_data, updated_n_strains) -def retrieve_trait_dataset_name( - trait_type: str, threshold: int, name: str, connection: Any): - """ - Retrieve the name of a trait given the trait's name - - This is extracted from the `webqtlDataset.retrieveName` function as is - implemented at - https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-L169 - """ - table_map = { - "ProbeSet": "ProbeSetFreeze", - "Publish": "PublishFreeze", - "Geno": "GenoFreeze", - "Temp": "TempFreeze"} - columns = "Id, Name, FullName, ShortName{}".format( - ", DataScale" if trait_type == "ProbeSet" else "") - query = ( - "SELECT %(columns)s " - "FROM %(table)s " - "WHERE " - "public > %(threshold)s " - "AND " - "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") - with connection.cursor() as cursor: - cursor.execute( - query, - { - "table": table_map[trait_type], - "columns": columns, - "threshold": threshold, - "name": name - }) - return cursor.fetchone() - - def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): """Retrieve trait information for type `Publish` traits. @@ -156,11 +122,11 @@ def retrieve_publish_trait_info(trait_data_source: Dict[str, Any], conn: Any): }) return dict(zip([k.lower() for k in keys], cursor.fetchone())) -def set_confidential_field(trait_info): +def set_confidential_field(trait_type, trait_info): """Post processing function for 'Publish' trait types. It sets the value for the 'confidential' key.""" - if trait_info["type"] == "Publish": + if trait_type == "Publish": return { **trait_info, "confidential": 1 if ( @@ -271,7 +237,7 @@ def set_homologene_id_field_probeset(trait_info, conn): return {**trait_info, "homologeneid": res[0]} return {**trait_info, "homologeneid": None} -def set_homologene_id_field(trait_info, conn): +def set_homologene_id_field(trait_type, trait_info, conn): """ Common postprocessing function for all trait types. @@ -283,84 +249,83 @@ def set_homologene_id_field(trait_info, conn): "Publish": set_to_null, "ProbeSet": lambda ti: set_homologene_id_field_probeset(ti, conn) } - return functions_table[trait_info["type"]](trait_info) + return functions_table[trait_type](trait_info) -def set_geno_riset_fields(name, conn): - """ - Retrieve the RISet, and RISetID values for various Geno trait types. - """ +def load_publish_qtl_info(trait_info, conn): query = ( - "SELECT InbredSet.Name, InbredSet.Id " - "FROM InbredSet, GenoFreeze " - "WHERE GenoFreeze.InbredSetId = InbredSet.Id " - "AND GenoFreeze.Name = %(name)s") + "SELECT PublishXRef.Locus, PublishXRef.LRS, PublishXRef.additive " + "FROM PublishXRef, PublishFreeze " + "WHERE PublishXRef.Id = %(trait_name)s " + "AND PublishXRef.InbredSetId = PublishFreeze.InbredSetId " + "AND PublishFreeze.Id = %(dataset_id)s") with conn.cursor() as cursor: - return cursor.execute(query, {"name": name}) + cursor.execute() + return dict(zip(["locus", "lrs", "additive"], cursor.fetchone())) + return {"locus": "", "lrs": "", "additive": ""} -def set_publish_riset_fields(name, conn): - """ - Retrieve the RISet, and RISetID values for various Publish trait types. - """ +def load_probeset_qtl_info(trait_info, conn): query = ( - "SELECT InbredSet.Name, InbredSet.Id " - "FROM InbredSet, PublishFreeze " - "WHERE PublishFreeze.InbredSetId = InbredSet.Id " - "AND PublishFreeze.Name = %(name)s") + "SELECT ProbeSetXRef.Locus, ProbeSetXRef.LRS, ProbeSetXRef.pValue, " + "ProbeSetXRef.mean, ProbeSetXRef.additive " + "FROM ProbeSetXRef, ProbeSet " + "WHERE ProbeSetXRef.ProbeSetId = ProbeSet.Id " + " AND ProbeSet.Name = %(trait_name)s " + "AND ProbeSetXRef.ProbeSetFreezeId = %(dataset_id)s") with conn.cursor() as cursor: - return cursor.execute(query, {"name": name}) + cursor.execute() + return dict(zip( + ["locus", "lrs", "pvalue", "mean", "additive"], cursor.fetchone())) + return {"locus": "", "lrs": "", "pvalue": "", "mean": "", "additive": ""} -def set_probeset_riset_fields(name, conn): - """ - Retrieve the RISet, and RISetID values for various ProbeSet trait types. - """ - query = ( - "SELECT InbredSet.Name, InbredSet.Id " - "FROM InbredSet, ProbeSetFreeze, ProbeFreeze " - "WHERE ProbeFreeze.InbredSetId = InbredSet.Id " - "AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId " - "AND ProbeSetFreeze.Name = %(name)s") - with conn.cursor() as cursor: - return cursor.execute(query, {"name": name}) - -def set_riset_fields(trait_info, conn): - """ - Retrieve the RISet, and RISetID values for various trait types. - """ - riset_functions_map = { - "Temp": lambda ti, con: (None, None), - "Geno": set_geno_riset_fields, - "Publish": set_publish_riset_fields, - "ProbeSet": set_probeset_riset_fields +def load_qtl_info(qtl, trait_type, trait_info, conn): + if not qtl: + return trait_info + qtl_info_functions = { + "Publish": load_publish_qtl_info, + "ProbeSet": load_probeset_qtl_info } - if not trait_info.get("haveinfo", None): + if trait_inf["name"] not in qtl_info_functions.keys(): return trait_info - riset, riid = riset_functions_map[trait_info["type"]]( - trait_info["name"], conn) - return { - **trait_info, "risetid": riid, - "riset": "BXD" if riset == "BXD300" else riset} + return qtl_info_functions[trait_type](trait_info, conn) def build_trait_name(trait_fullname): name_parts = trait_fullname.split("::") assert len(name_parts) >= 2, "Name format error" return { - "trait_db": name_parts[0], + "db": {"dataset_name": name_parts[0]}, "trait_fullname": trait_fullname, "trait_name": name_parts[1], "cellid": name_parts[2] if len(name_parts) == 3 else "" } +def retrieve_probeset_sequence(trait, conn): + query = ( + "SELECT ProbeSet.BlatSeq " + "FROM ProbeSet, ProbeSetFreeze, ProbeSetXRef " + "WHERE ProbeSet.Id=ProbeSetXRef.ProbeSetId " + "AND ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId " + "AND ProbeSet.Name = %(trait_name)s " + "AND ProbeSetFreeze.Name = %(dataset_name)s") + with conn.cursor() as cursor: + cursor.execute( + query, + { + "trait_name": trait["trait_name"], + "dataset_name": trait["db"]["dataset_name"] + }) + seq = cursor.fetchone() + return {**trait, "sequence": seq[0] if seq else ""} + def retrieve_trait_info( - trait_type: str, trait_full_name: str, trait_dataset_id: int, - trait_dataset_name: str, conn: Any, qtl=None): + trait_type: str, threshold: int, trait_full_name: str, conn: Any, + qtl=None): """Retrieves the trait information. https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L397-L456 This function, or the dependent functions, might be incomplete as they are currently.""" - # pylint: disable=[R0913] trait = build_trait_name(trait_full_name) trait_info_function_table = { "Publish": retrieve_publish_trait_info, @@ -370,15 +335,19 @@ def retrieve_trait_info( } common_post_processing_fn = compose( - lambda ti: set_riset_fields(ti, conn), - lambda ti: set_homologene_id_field(ti, conn), - lambda ti: {"type": trait_type, **ti}, - lambda ti: {**ti, **trait}, + lambda ti: load_qtl_info(qtl, trait_type, ti, conn), + lambda ti: set_homologene_id_field(trait_type, ti, conn), + lambda ti: {"trait_type": trait_type, **ti}, + lambda ti: {**trait, **ti}, set_haveinfo_field) trait_post_processing_functions_table = { - "Publish": compose(set_confidential_field, common_post_processing_fn), - "ProbeSet": compose(common_post_processing_fn), + "Publish": compose( + lambda ti: set_confidential_field(trait_type, ti), + common_post_processing_fn), + "ProbeSet": compose( + lambda ti: retrieve_probeset_sequence(ti, conn), + common_post_processing_fn), "Geno": common_post_processing_fn, "Temp": common_post_processing_fn } @@ -387,10 +356,16 @@ def retrieve_trait_info( trait_post_processing_functions_table[trait_type], trait_info_function_table[trait_type]) - return retrieve_info( + trait_dataset = retrieve_trait_dataset(trait_type, trait, threshold, conn) + trait_info = retrieve_info( { "trait_name": trait["trait_name"], - "trait_dataset_id": trait_dataset_id, - "trait_dataset_name":trait_dataset_name + "trait_dataset_id": trait_dataset["dataset_id"], + "trait_dataset_name": trait_dataset["dataset_name"] }, conn) + return { + **trait_info, + "db": {**trait["db"], **trait_dataset}, + "riset": trait_dataset["riset"] + } diff --git a/tests/unit/db/test_datasets.py b/tests/unit/db/test_datasets.py new file mode 100644 index 0000000..34fe7f0 --- /dev/null +++ b/tests/unit/db/test_datasets.py @@ -0,0 +1,133 @@ +from unittest import mock, TestCase + +class TestDatasetsDBFunctions(TestCase): + + def test_retrieve_trait_dataset_name(self): + """Test that the function is called correctly.""" + for trait_type, thresh, trait_dataset_name, columns, table in [ + ["ProbeSet", 9, "testName", + "Id, Name, FullName, ShortName, DataScale", "ProbeSetFreeze"], + ["Geno", 3, "genoTraitName", "Id, Name, FullName, ShortName", + "GenoFreeze"], + ["Publish", 6, "publishTraitName", + "Id, Name, FullName, ShortName", "PublishFreeze"], + ["Temp", 4, "tempTraitName", "Id, Name, FullName, ShortName", + "TempFreeze"]]: + db_mock = mock.MagicMock() + with self.subTest(trait_type=trait_type): + with db_mock.cursor() as cursor: + cursor.fetchone.return_value = ( + "testName", "testNameFull", "testNameShort", + "dataScale") + self.assertEqual( + retrieve_trait_dataset_name( + trait_type, thresh, trait_dataset_name, db_mock), + ("testName", "testNameFull", "testNameShort", + "dataScale")) + cursor.execute.assert_called_once_with( + "SELECT %(columns)s " + "FROM %(table)s " + "WHERE public > %(threshold)s AND " + "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)".format( + cols=columns, ttype=trait_type), + {"threshold": thresh, "name": trait_dataset_name, + "table": table, "columns": columns}) + + def test_set_probeset_riset_fields(self): + """ + Test that the `riset` and `riset_id` fields are retrieved appropriately + for the 'ProbeSet' trait type. + """ + for trait_name, expected in [ + ["testProbeSetName", ()]]: + db_mock = mock.MagicMock() + with self.subTest(trait_name=trait_name, expected=expected): + with db_mock.cursor() as cursor: + cursor.execute.return_value = () + self.assertEqual( + set_probeset_riset_fields(trait_name, db_mock), expected) + cursor.execute.assert_called_once_with( + ( + "SELECT InbredSet.Name, InbredSet.Id" + " FROM InbredSet, ProbeSetFreeze, ProbeFreeze" + " WHERE ProbeFreeze.InbredSetId = InbredSet.Id" + " AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId" + " AND ProbeSetFreeze.Name = %(name)s"), + {"name": trait_name}) + + def test_set_riset_fields(self): + """ + Test that the riset fields are set up correctly for the different trait + types. + """ + for trait_info, expected in [ + [{}, {}], + [{"haveinfo": 0, "type": "Publish"}, + {"haveinfo": 0, "type": "Publish"}], + [{"haveinfo": 0, "type": "ProbeSet"}, + {"haveinfo": 0, "type": "ProbeSet"}], + [{"haveinfo": 0, "type": "Geno"}, + {"haveinfo": 0, "type": "Geno"}], + [{"haveinfo": 0, "type": "Temp"}, + {"haveinfo": 0, "type": "Temp"}], + [{"haveinfo": 1, "type": "Publish", "name": "test"}, + {"haveinfo": 1, "type": "Publish", "name": "test", + "riset": "riset_name", "risetid": 0}], + [{"haveinfo": 1, "type": "ProbeSet", "name": "test"}, + {"haveinfo": 1, "type": "ProbeSet", "name": "test", + "riset": "riset_name", "risetid": 0}], + [{"haveinfo": 1, "type": "Geno", "name": "test"}, + {"haveinfo": 1, "type": "Geno", "name": "test", + "riset": "riset_name", "risetid": 0}], + [{"haveinfo": 1, "type": "Temp", "name": "test"}, + {"haveinfo": 1, "type": "Temp", "name": "test", "riset": None, + "risetid": None}] + ]: + db_mock = mock.MagicMock() + with self.subTest(trait_info=trait_info, expected=expected): + with db_mock.cursor() as cursor: + cursor.execute.return_value = ("riset_name", 0) + self.assertEqual( + set_riset_fields(trait_info, db_mock), expected) + + def test_set_publish_riset_fields(self): + """ + Test that the `riset` and `riset_id` fields are retrieved appropriately + for the 'Publish' trait type. + """ + for trait_name, expected in [ + ["testPublishName", ()]]: + db_mock = mock.MagicMock() + with self.subTest(trait_name=trait_name, expected=expected): + with db_mock.cursor() as cursor: + cursor.execute.return_value = () + self.assertEqual( + set_publish_riset_fields(trait_name, db_mock), expected) + cursor.execute.assert_called_once_with( + ( + "SELECT InbredSet.Name, InbredSet.Id" + " FROM InbredSet, PublishFreeze" + " WHERE PublishFreeze.InbredSetId = InbredSet.Id" + " AND PublishFreeze.Name = %(name)s"), + {"name": trait_name}) + + def test_set_geno_riset_fields(self): + """ + Test that the `riset` and `riset_id` fields are retrieved appropriately + for the 'Geno' trait type. + """ + for trait_name, expected in [ + ["testGenoName", ()]]: + db_mock = mock.MagicMock() + with self.subTest(trait_name=trait_name, expected=expected): + with db_mock.cursor() as cursor: + cursor.execute.return_value = () + self.assertEqual( + set_geno_riset_fields(trait_name, db_mock), expected) + cursor.execute.assert_called_once_with( + ( + "SELECT InbredSet.Name, InbredSet.Id" + " FROM InbredSet, GenoFreeze" + " WHERE GenoFreeze.InbredSetId = InbredSet.Id" + " AND GenoFreeze.Name = %(name)s"), + {"name": trait_name}) diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 39d7a31..7d161bf 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -2,55 +2,19 @@ from unittest import mock, TestCase from gn3.db.traits import ( build_trait_name, - set_riset_fields, set_haveinfo_field, update_sample_data, retrieve_trait_info, - set_geno_riset_fields, set_confidential_field, set_homologene_id_field, retrieve_geno_trait_info, retrieve_temp_trait_info, - set_publish_riset_fields, - set_probeset_riset_fields, - retrieve_trait_dataset_name, retrieve_publish_trait_info, retrieve_probeset_trait_info) class TestTraitsDBFunctions(TestCase): "Test cases for traits functions" - def test_retrieve_trait_dataset_name(self): - """Test that the function is called correctly.""" - for trait_type, thresh, trait_dataset_name, columns, table in [ - ["ProbeSet", 9, "testName", - "Id, Name, FullName, ShortName, DataScale", "ProbeSetFreeze"], - ["Geno", 3, "genoTraitName", "Id, Name, FullName, ShortName", - "GenoFreeze"], - ["Publish", 6, "publishTraitName", - "Id, Name, FullName, ShortName", "PublishFreeze"], - ["Temp", 4, "tempTraitName", "Id, Name, FullName, ShortName", - "TempFreeze"]]: - db_mock = mock.MagicMock() - with self.subTest(trait_type=trait_type): - with db_mock.cursor() as cursor: - cursor.fetchone.return_value = ( - "testName", "testNameFull", "testNameShort", - "dataScale") - self.assertEqual( - retrieve_trait_dataset_name( - trait_type, thresh, trait_dataset_name, db_mock), - ("testName", "testNameFull", "testNameShort", - "dataScale")) - cursor.execute.assert_called_once_with( - "SELECT %(columns)s " - "FROM %(table)s " - "WHERE public > %(threshold)s AND " - "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)".format( - cols=columns, ttype=trait_type), - {"threshold": thresh, "name": trait_dataset_name, - "table": table, "columns": columns}) - def test_retrieve_publish_trait_info(self): """Test retrieval of type `Publish` traits.""" db_mock = mock.MagicMock() @@ -159,10 +123,10 @@ class TestTraitsDBFunctions(TestCase): def test_build_trait_name_with_good_fullnames(self): for fullname, expected in [ ["testdb::testname", - {"trait_db": "testdb", "trait_name": "testname", "cellid": "", - "trait_fullname": "testdb::testname"}], + {"db": {"dataset_name": "testdb"}, "trait_name": "testname", + "cellid": "", "trait_fullname": "testdb::testname"}], ["testdb::testname::testcell", - {"trait_db": "testdb", "trait_name": "testname", + {"db": {"dataset_name": "testdb"}, "trait_name": "testname", "cellid": "testcell", "trait_fullname": "testdb::testname::testcell"}]]: with self.subTest(fullname=fullname): @@ -176,26 +140,26 @@ class TestTraitsDBFunctions(TestCase): def test_retrieve_trait_info(self): """Test that information on traits is retrieved as appropriate.""" - for trait_type, trait_name, trait_dataset_id, trait_dataset_name, expected in [ - ["Publish", "pubDb::PublishTraitName::pubCell", 1, - "PublishDatasetTraitName", + for trait_type, threshold, trait_fullname, expected in [ + ["Publish", 9, "pubDb::PublishTraitName::pubCell", {"haveinfo": 0, "homologeneid": None, "type": "Publish", - "confidential": 0, "trait_db": "pubDb", + "confidential": 0, "db": {"dataset_name": "pubDb"}, "trait_name": "PublishTraitName", "cellid": "pubCell", "trait_fullname": "pubDb::PublishTraitName::pubCell"}], - ["ProbeSet", "prbDb::ProbeSetTraitName::prbCell", 2, - "ProbeSetDatasetTraitName", + ["ProbeSet", 5, "prbDb::ProbeSetTraitName::prbCell", {"haveinfo": 0, "homologeneid": None, "type": "ProbeSet", "trait_fullname": "prbDb::ProbeSetTraitName::prbCell", - "trait_db": "prbDb", "trait_name": "ProbeSetTraitName", - "cellid": "prbCell"}], - ["Geno", "genDb::GenoTraitName", 3, "GenoDatasetTraitName", + "db": {"dataset_name": "prbDb"}, + "trait_name": "ProbeSetTraitName", "cellid": "prbCell"}], + ["Geno", 12, "genDb::GenoTraitName", {"haveinfo": 0, "homologeneid": None, "type": "Geno", - "trait_fullname": "genDb::GenoTraitName", "trait_db": "genDb", + "trait_fullname": "genDb::GenoTraitName", + "db": {"dataset_name": "genDb"}, "trait_name": "GenoTraitName", "cellid": ""}], - ["Temp", "tmpDb::TempTraitName", 4, "TempDatasetTraitName", + ["Temp", 6, "tmpDb::TempTraitName", {"haveinfo": 0, "homologeneid": None, "type": "Temp", - "trait_fullname": "tmpDb::TempTraitName", "trait_db": "tmpDb", + "trait_fullname": "tmpDb::TempTraitName", + "db": {"dataset_name": "tmpDb"}, "trait_name": "TempTraitName", "cellid": ""}]]: db_mock = mock.MagicMock() with self.subTest(trait_type=trait_type): @@ -203,8 +167,7 @@ class TestTraitsDBFunctions(TestCase): cursor.fetchone.return_value = tuple() self.assertEqual( retrieve_trait_info( - trait_type, trait_name, trait_dataset_id, - trait_dataset_name, db_mock), + trait_type, threshold, trait_fullname, db_mock), expected) def test_update_sample_data(self): @@ -246,128 +209,25 @@ class TestTraitsDBFunctions(TestCase): def test_set_homologene_id_field(self): """Test that the `homologene_id` field is set up correctly""" - for trait_info, expected in [ - [{"type": "Publish"}, - {"type": "Publish", "homologeneid": None}], - [{"type": "ProbeSet"}, - {"type": "ProbeSet", "homologeneid": None}], - [{"type": "Geno"}, {"type": "Geno", "homologeneid": None}], - [{"type": "Temp"}, {"type": "Temp", "homologeneid": None}]]: + for trait_type, trait_info, expected in [ + ["Publish", {}, {"homologeneid": None}], + ["ProbeSet", {}, {"homologeneid": None}], + ["Geno", {}, {"homologeneid": None}], + ["Temp", {}, {"homologeneid": None}]]: db_mock = mock.MagicMock() with self.subTest(trait_info=trait_info, expected=expected): with db_mock.cursor() as cursor: cursor.fetchone.return_value = () self.assertEqual( - set_homologene_id_field(trait_info, db_mock), expected) + set_homologene_id_field(trait_type, trait_info, db_mock), expected) def test_set_confidential_field(self): """Test that the `confidential` field is set up correctly""" - for trait_info, expected in [ - [{"type": "Publish"}, {"type": "Publish", "confidential": 0}], - [{"type": "ProbeSet"}, {"type": "ProbeSet"}], - [{"type": "Geno"}, {"type": "Geno"}], - [{"type": "Temp"}, {"type": "Temp"}]]: + for trait_type, trait_info, expected in [ + ["Publish", {}, {"confidential": 0}], + ["ProbeSet", {}, {}], + ["Geno", {}, {}], + ["Temp", {}, {}]]: with self.subTest(trait_info=trait_info, expected=expected): self.assertEqual( - set_confidential_field(trait_info), expected) - - def test_set_geno_riset_fields(self): - """ - Test that the `riset` and `riset_id` fields are retrieved appropriately - for the 'Geno' trait type. - """ - for trait_name, expected in [ - ["testGenoName", ()]]: - db_mock = mock.MagicMock() - with self.subTest(trait_name=trait_name, expected=expected): - with db_mock.cursor() as cursor: - cursor.execute.return_value = () - self.assertEqual( - set_geno_riset_fields(trait_name, db_mock), expected) - cursor.execute.assert_called_once_with( - ( - "SELECT InbredSet.Name, InbredSet.Id" - " FROM InbredSet, GenoFreeze" - " WHERE GenoFreeze.InbredSetId = InbredSet.Id" - " AND GenoFreeze.Name = %(name)s"), - {"name": trait_name}) - - - def test_set_publish_riset_fields(self): - """ - Test that the `riset` and `riset_id` fields are retrieved appropriately - for the 'Publish' trait type. - """ - for trait_name, expected in [ - ["testPublishName", ()]]: - db_mock = mock.MagicMock() - with self.subTest(trait_name=trait_name, expected=expected): - with db_mock.cursor() as cursor: - cursor.execute.return_value = () - self.assertEqual( - set_publish_riset_fields(trait_name, db_mock), expected) - cursor.execute.assert_called_once_with( - ( - "SELECT InbredSet.Name, InbredSet.Id" - " FROM InbredSet, PublishFreeze" - " WHERE PublishFreeze.InbredSetId = InbredSet.Id" - " AND PublishFreeze.Name = %(name)s"), - {"name": trait_name}) - - - def test_set_probeset_riset_fields(self): - """ - Test that the `riset` and `riset_id` fields are retrieved appropriately - for the 'ProbeSet' trait type. - """ - for trait_name, expected in [ - ["testProbeSetName", ()]]: - db_mock = mock.MagicMock() - with self.subTest(trait_name=trait_name, expected=expected): - with db_mock.cursor() as cursor: - cursor.execute.return_value = () - self.assertEqual( - set_probeset_riset_fields(trait_name, db_mock), expected) - cursor.execute.assert_called_once_with( - ( - "SELECT InbredSet.Name, InbredSet.Id" - " FROM InbredSet, ProbeSetFreeze, ProbeFreeze" - " WHERE ProbeFreeze.InbredSetId = InbredSet.Id" - " AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId" - " AND ProbeSetFreeze.Name = %(name)s"), - {"name": trait_name}) - - def test_set_riset_fields(self): - """ - Test that the riset fields are set up correctly for the different trait - types. - """ - for trait_info, expected in [ - [{}, {}], - [{"haveinfo": 0, "type": "Publish"}, - {"haveinfo": 0, "type": "Publish"}], - [{"haveinfo": 0, "type": "ProbeSet"}, - {"haveinfo": 0, "type": "ProbeSet"}], - [{"haveinfo": 0, "type": "Geno"}, - {"haveinfo": 0, "type": "Geno"}], - [{"haveinfo": 0, "type": "Temp"}, - {"haveinfo": 0, "type": "Temp"}], - [{"haveinfo": 1, "type": "Publish", "name": "test"}, - {"haveinfo": 1, "type": "Publish", "name": "test", - "riset": "riset_name", "risetid": 0}], - [{"haveinfo": 1, "type": "ProbeSet", "name": "test"}, - {"haveinfo": 1, "type": "ProbeSet", "name": "test", - "riset": "riset_name", "risetid": 0}], - [{"haveinfo": 1, "type": "Geno", "name": "test"}, - {"haveinfo": 1, "type": "Geno", "name": "test", - "riset": "riset_name", "risetid": 0}], - [{"haveinfo": 1, "type": "Temp", "name": "test"}, - {"haveinfo": 1, "type": "Temp", "name": "test", "riset": None, - "risetid": None}] - ]: - db_mock = mock.MagicMock() - with self.subTest(trait_info=trait_info, expected=expected): - with db_mock.cursor() as cursor: - cursor.execute.return_value = ("riset_name", 0) - self.assertEqual( - set_riset_fields(trait_info, db_mock), expected) + set_confidential_field(trait_type, trait_info), expected) -- cgit v1.2.3 From 86abf16313ac4579f33c357525115827a39451af Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Sun, 8 Aug 2021 12:13:45 +0300 Subject: Only load extra data if the traits have basic info Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Only load the extra trait data if the basic trait information is found. --- gn3/db/traits.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index f66ead3..6c31a4d 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -338,8 +338,7 @@ def retrieve_trait_info( lambda ti: load_qtl_info(qtl, trait_type, ti, conn), lambda ti: set_homologene_id_field(trait_type, ti, conn), lambda ti: {"trait_type": trait_type, **ti}, - lambda ti: {**trait, **ti}, - set_haveinfo_field) + lambda ti: {**trait, **ti}) trait_post_processing_functions_table = { "Publish": compose( @@ -353,8 +352,7 @@ def retrieve_trait_info( } retrieve_info = compose( - trait_post_processing_functions_table[trait_type], - trait_info_function_table[trait_type]) + set_haveinfo_field, trait_info_function_table[trait_type]) trait_dataset = retrieve_trait_dataset(trait_type, trait, threshold, conn) trait_info = retrieve_info( @@ -364,8 +362,10 @@ def retrieve_trait_info( "trait_dataset_name": trait_dataset["dataset_name"] }, conn) - return { - **trait_info, - "db": {**trait["db"], **trait_dataset}, - "riset": trait_dataset["riset"] - } + if trait_info["haveinfo"]: + return { + **trait_post_processing_functions_table[trait_type](trait_info), + "db": {**trait["db"], **trait_dataset}, + "riset": trait_dataset["riset"] + } + return trait_info -- cgit v1.2.3 From 667e67bae832ca5083f3319ada4fda67aca41f44 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 9 Aug 2021 11:44:47 +0300 Subject: Fix linting errors Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add module, class and function docstrings * Deactivate some irrelevant pylint errors * Fix indentations and line-lengths --- gn3/db/datasets.py | 64 ++++++++++++++++++++++++++++++++++-------- gn3/db/traits.py | 29 +++++++++++++++++++ tests/unit/db/test_datasets.py | 5 +++- tests/unit/db/test_traits.py | 6 ++++ 4 files changed, 91 insertions(+), 13 deletions(-) (limited to 'gn3') diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 3ad50f6..53d6811 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -1,7 +1,13 @@ -from typing import Any, Dict, Union +""" +This module contains functions relating to specific trait dataset manipulation +""" +from typing import Any def retrieve_probeset_trait_dataset_name( threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `ProbeSet` trait. + """ query = ( "SELECT Id, Name, FullName, ShortName, DataScale " "FROM ProbeSetFreeze " @@ -21,7 +27,11 @@ def retrieve_probeset_trait_dataset_name( "dataset_shortname", "dataset_datascale"], cursor.fetchone)) -def retrieve_publish_trait_dataset_name(threshold: int, name: str, connection: Any): +def retrieve_publish_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Publish` trait. + """ query = ( "SELECT Id, Name, FullName, ShortName " "FROM PublishFreeze " @@ -41,7 +51,11 @@ def retrieve_publish_trait_dataset_name(threshold: int, name: str, connection: A "dataset_shortname"], cursor.fetchone)) -def retrieve_geno_trait_dataset_name(threshold: int, name: str, connection: Any): +def retrieve_geno_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Geno` trait. + """ query = ( "SELECT Id, Name, FullName, ShortName " "FROM GenoFreeze " @@ -61,7 +75,11 @@ def retrieve_geno_trait_dataset_name(threshold: int, name: str, connection: Any) "dataset_shortname"], cursor.fetchone)) -def retrieve_temp_trait_dataset_name(threshold: int, name: str, connection: Any): +def retrieve_temp_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Temp` trait. + """ query = ( "SELECT Id, Name, FullName, ShortName " "FROM TempFreeze " @@ -145,6 +163,9 @@ def retrieve_probeset_riset_fields(name, conn): return {} def retrieve_temp_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for `Temp` trait types. + """ query = ( "SELECT InbredSet.Name, InbredSet.Id " "FROM InbredSet, Temp " @@ -179,6 +200,10 @@ def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn): } def retrieve_temp_trait_dataset(): + """ + Retrieve the dataset that relates to `Temp` traits + """ + # pylint: disable=[C0330] return { "searchfield": ["name", "description"], "disfield": ["name", "description"], @@ -189,28 +214,40 @@ def retrieve_temp_trait_dataset(): } def retrieve_geno_trait_dataset(): + """ + Retrieve the dataset that relates to `Geno` traits + """ + # pylint: disable=[C0330] return { - "searchfield": ["name","chr"], - "disfield": ["name","chr","mb", "source2", "sequence"], + "searchfield": ["name", "chr"], + "disfield": ["name", "chr", "mb", "source2", "sequence"], "type": "Geno" } def retrieve_publish_trait_dataset(): + """ + Retrieve the dataset that relates to `Publish` traits + """ + # pylint: disable=[C0330] return { "searchfield": [ "name", "post_publication_description", "abstract", "title", "authors"], "disfield": [ - "name","pubmed_id", "pre_publication_description", - "post_publication_description", "original_description", + "name", "pubmed_id", "pre_publication_description", + "post_publication_description", "original_description", "pre_publication_abbreviation", "post_publication_abbreviation", "lab_code", "submitter", "owner", "authorized_users", - "authors","title","abstract", "journal","volume","pages","month", - "year","sequence", "units", "comments"], + "authors", "title", "abstract", "journal", "volume", "pages", + "month", "year", "sequence", "units", "comments"], "type": "Publish" } def retrieve_probeset_trait_dataset(): + """ + Retrieve the dataset that relates to `ProbeSet` traits + """ + # pylint: disable=[C0330] return { "searchfield": [ "name", "description", "probe_target_description", "symbol", @@ -228,6 +265,9 @@ def retrieve_probeset_trait_dataset(): } def retrieve_trait_dataset(trait_type, trait, threshold, conn): + """ + Retrieve the dataset that relates to a specific trait. + """ dataset_fns = { "Temp": retrieve_temp_trait_dataset, "Geno": retrieve_geno_trait_dataset, @@ -238,8 +278,8 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): "dataset_id": None, "dataset_name": trait["db"]["dataset_name"], **retrieve_dataset_name( - trait_type, threshold, trait["trait_name"], trait["db"]["dataset_name"], - conn) + trait_type, threshold, trait["trait_name"], + trait["db"]["dataset_name"], conn) } riset = retrieve_riset_fields( trait_type, trait["trait_name"], dataset_name_info, conn) diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 6c31a4d..fb48fc3 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -43,6 +43,7 @@ def update_sample_data(conn: Any, count: Union[int, str]): """Given the right parameters, update sample-data from the relevant table.""" + # pylint: disable=[R0913, R0914] STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " "WHERE StrainId = %s AND Id = %s") @@ -252,6 +253,9 @@ def set_homologene_id_field(trait_type, trait_info, conn): return functions_table[trait_type](trait_info) def load_publish_qtl_info(trait_info, conn): + """ + Load extra QTL information for `Publish` traits + """ query = ( "SELECT PublishXRef.Locus, PublishXRef.LRS, PublishXRef.additive " "FROM PublishXRef, PublishFreeze " @@ -264,6 +268,9 @@ def load_publish_qtl_info(trait_info, conn): return {"locus": "", "lrs": "", "additive": ""} def load_probeset_qtl_info(trait_info, conn): + """ + Load extra QTL information for `ProbeSet` traits + """ query = ( "SELECT ProbeSetXRef.Locus, ProbeSetXRef.LRS, ProbeSetXRef.pValue, " "ProbeSetXRef.mean, ProbeSetXRef.additive " @@ -278,6 +285,22 @@ def load_probeset_qtl_info(trait_info, conn): return {"locus": "", "lrs": "", "pvalue": "", "mean": "", "additive": ""} def load_qtl_info(qtl, trait_type, trait_info, conn): + """ + Load extra QTL information for traits + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L500-L534 + + PARAMETERS: + qtl: boolean + trait_type: string + The type of the trait in consideration + trait_info: map/dictionary + A dictionary of the trait's key-value pairs + conn: + A database connection object + """ if not qtl: return trait_info qtl_info_functions = { @@ -290,6 +313,9 @@ def load_qtl_info(qtl, trait_type, trait_info, conn): return qtl_info_functions[trait_type](trait_info, conn) def build_trait_name(trait_fullname): + """ + Initialises the trait's name, and other values from the search data provided + """ name_parts = trait_fullname.split("::") assert len(name_parts) >= 2, "Name format error" return { @@ -300,6 +326,9 @@ def build_trait_name(trait_fullname): } def retrieve_probeset_sequence(trait, conn): + """ + Retrieve a 'ProbeSet' trait's sequence information + """ query = ( "SELECT ProbeSet.BlatSeq " "FROM ProbeSet, ProbeSetFreeze, ProbeSetXRef " diff --git a/tests/unit/db/test_datasets.py b/tests/unit/db/test_datasets.py index 4f405cb..38de0e2 100644 --- a/tests/unit/db/test_datasets.py +++ b/tests/unit/db/test_datasets.py @@ -1,3 +1,5 @@ +"""Tests for gn3/db/datasets.py""" + from unittest import mock, TestCase from gn3.db.datasets import ( retrieve_dataset_name, @@ -7,6 +9,7 @@ from gn3.db.datasets import ( retrieve_probeset_riset_fields) class TestDatasetsDBFunctions(TestCase): + """Test cases for datasets functions.""" def test_retrieve_dataset_name(self): """Test that the function is called correctly.""" @@ -34,7 +37,7 @@ class TestDatasetsDBFunctions(TestCase): "(Name = %(name)s " "OR FullName = %(name)s " "OR ShortName = %(name)s)".format( - table=table, cols=columns, ttype=trait_type), + table=table, cols=columns), {"threshold": thresh, "name": dataset_name}) def test_retrieve_probeset_riset_fields(self): diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 5f52c18..d9d7bbb 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -121,6 +121,9 @@ class TestTraitsDBFunctions(TestCase): trait_source) def test_build_trait_name_with_good_fullnames(self): + """ + Check that the name is built correctly. + """ for fullname, expected in [ ["testdb::testname", {"db": {"dataset_name": "testdb"}, "trait_name": "testname", @@ -133,6 +136,9 @@ class TestTraitsDBFunctions(TestCase): self.assertEqual(build_trait_name(fullname), expected) def test_build_trait_name_with_bad_fullnames(self): + """ + Check that an exception is raised if the full name format is wrong. + """ for fullname in ["", "test", "test:test"]: with self.subTest(fullname=fullname): with self.assertRaises(AssertionError, msg="Name format error"): -- cgit v1.2.3 From 9ed71447892922c0039dd749a9052be502f1b818 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 9 Aug 2021 11:46:26 +0300 Subject: Add missing arguments. Fix typo. Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Fix minor bugs in the code. --- gn3/db/traits.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index fb48fc3..be46437 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -263,7 +263,12 @@ def load_publish_qtl_info(trait_info, conn): "AND PublishXRef.InbredSetId = PublishFreeze.InbredSetId " "AND PublishFreeze.Id = %(dataset_id)s") with conn.cursor() as cursor: - cursor.execute() + cursor.execute( + query, + { + "trait_name": trait_info["trait_name"], + "dataset_id": trait_info["db"]["dataset_id"] + }) return dict(zip(["locus", "lrs", "additive"], cursor.fetchone())) return {"locus": "", "lrs": "", "additive": ""} @@ -279,7 +284,12 @@ def load_probeset_qtl_info(trait_info, conn): " AND ProbeSet.Name = %(trait_name)s " "AND ProbeSetXRef.ProbeSetFreezeId = %(dataset_id)s") with conn.cursor() as cursor: - cursor.execute() + cursor.execute( + query, + { + "trait_name": trait_info["trait_name"], + "dataset_id": trait_info["db"]["dataset_id"] + }) return dict(zip( ["locus", "lrs", "pvalue", "mean", "additive"], cursor.fetchone())) return {"locus": "", "lrs": "", "pvalue": "", "mean": "", "additive": ""} @@ -307,7 +317,7 @@ def load_qtl_info(qtl, trait_type, trait_info, conn): "Publish": load_publish_qtl_info, "ProbeSet": load_probeset_qtl_info } - if trait_inf["name"] not in qtl_info_functions.keys(): + if trait_info["name"] not in qtl_info_functions.keys(): return trait_info return qtl_info_functions[trait_type](trait_info, conn) -- cgit v1.2.3 From 8f022ae1a31224d0526443ad9779f30206b4a770 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 9 Aug 2021 14:22:54 +0300 Subject: Retrieve the trait data Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add functions to retrieve the `value`, `variance`, and `ndata` values for any given trait. --- gn3/db/traits.py | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 244 insertions(+), 1 deletion(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index be46437..a740352 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -1,5 +1,5 @@ """This class contains functions relating to trait data manipulation""" -from typing import Any, Dict, Union +from typing import Any, Dict, Union, Sequence from gn3.function_helpers import compose from gn3.db.datasets import retrieve_trait_dataset @@ -408,3 +408,246 @@ def retrieve_trait_info( "riset": trait_dataset["riset"] } return trait_info + +def retrieve_temp_trait_data(trait_info: dict, conn: Any): + """ + Retrieve trait data for `Temp` traits. + """ + query = ( + "SELECT " + "Strain.Name, TempData.value, TempData.SE, TempData.NStrain, " + "TempData.Id " + "FROM TempData, Temp, Strain " + "WHERE TempData.StrainId = Strain.Id " + "AND TempData.Id = Temp.DataId " + "AND Temp.name = %(trait_name)s " + "ORDER BY Strain.Name") + with conn.cursor() as cursor: + cursor.execute( + query, + {"trait_name": trait_info["trait_name"]}) + return [dict(zip( + ["strain_name", "value", "se_error", "nstrain", "id"], row)) + for row in cursor.fetchall()] + return [] + +def retrieve_species_id(riset, conn: Any): + """ + Retrieve a species id given the RISet value + """ + with conn.cursor as cursor: + cursor.execute( + "SELECT SpeciesId from InbredSet WHERE Name = %(riset)s", + {"riset": riset}) + return cursor.fetchone()[0] + return None + +def retrieve_geno_trait_data(trait_info: Dict, conn: Any): + """ + Retrieve trait data for `Geno` traits. + """ + query = ( + "SELECT Strain.Name, GenoData.value, GenoSE.error, GenoData.Id " + "FROM (GenoData, GenoFreeze, Strain, Geno, GenoXRef) " + "LEFT JOIN GenoSE ON " + "(GenoSE.DataId = GenoData.Id AND GenoSE.StrainId = GenoData.StrainId) " + "WHERE Geno.SpeciesId = %(species_id)s " + "AND Geno.Name = %(trait_name)s AND GenoXRef.GenoId = Geno.Id " + "AND GenoXRef.GenoFreezeId = GenoFreeze.Id " + "AND GenoFreeze.Name = %(dataset_name)s " + "AND GenoXRef.DataId = GenoData.Id " + "AND GenoData.StrainId = Strain.Id " + "ORDER BY Strain.Name") + with conn.cursor() as cursor: + cursor.execute( + query, + {"trait_name": trait_info["trait_name"], + "dataset_name": trait_info["db"]["dataset_name"], + "species_id": retrieve_species_id( + trait_info["db"]["riset"], conn)}) + return [dict(zip( + ["strain_name", "value", "se_error", "id"], row)) + for row in cursor.fetchall()] + return [] + +def retrieve_publish_trait_data(trait_info: Dict, conn: Any): + """ + Retrieve trait data for `Publish` traits. + """ + query = ( + "SELECT " + "Strain.Name, PublishData.value, PublishSE.error, NStrain.count, " + "PublishData.Id " + "FROM (PublishData, Strain, PublishXRef, PublishFreeze) " + "LEFT JOIN PublishSE ON " + "(PublishSE.DataId = PublishData.Id " + "AND PublishSE.StrainId = PublishData.StrainId) " + "LEFT JOIN NStrain ON " + "(NStrain.DataId = PublishData.Id " + "AND NStrain.StrainId = PublishData.StrainId) " + "WHERE PublishXRef.InbredSetId = PublishFreeze.InbredSetId " + "AND PublishData.Id = PublishXRef.DataId " + "AND PublishXRef.Id = %(trait_name)s " + "AND PublishFreeze.Id = %(dataset_id)s " + "AND PublishData.StrainId = Strain.Id " + "ORDER BY Strain.Name") + with conn.cursor() as cursor: + cursor.execute( + query, + {"trait_name": trait_info["trait_name"], + "dataset_id": trait_info["db"]["dataset_id"]}) + return [dict(zip( + ["strain_name", "value", "se_error", "nstrain", "id"], row)) + for row in cursor.fetchall()] + return [] + +def retrieve_cellid_trait_data(trait_info: Dict, conn: Any): + """ + Retrieve trait data for `Probe Data` types. + """ + query = ( + "SELECT " + "Strain.Name, ProbeData.value, ProbeSE.error, ProbeData.Id " + "FROM (ProbeData, ProbeFreeze, ProbeSetFreeze, ProbeXRef, Strain," + " Probe, ProbeSet) " + "LEFT JOIN ProbeSE ON " + "(ProbeSE.DataId = ProbeData.Id " + " AND ProbeSE.StrainId = ProbeData.StrainId) " + "WHERE Probe.Name = %(cellid)s " + "AND ProbeSet.Name = %(trait_name)s " + "AND Probe.ProbeSetId = ProbeSet.Id " + "AND ProbeXRef.ProbeId = Probe.Id " + "AND ProbeXRef.ProbeFreezeId = ProbeFreeze.Id " + "AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id " + "AND ProbeSetFreeze.Name = %(dataset_name)s " + "AND ProbeXRef.DataId = ProbeData.Id " + "AND ProbeData.StrainId = Strain.Id " + "ORDER BY Strain.Name") + with conn.cursor() as cursor: + cursor.execute( + query, + {"cellid": trait_info["cellid"], + "trait_name": trait_info["trait_name"], + "dataset_id": trait_info["db"]["dataset_id"]}) + return [dict(zip( + ["strain_name", "value", "se_error", "id"], row)) + for row in cursor.fetchall()] + return [] + +def retrieve_probeset_trait_data(trait_info: Dict, conn: Any): + """ + Retrieve trait data for `ProbeSet` traits. + """ + query = ( + "SELECT Strain.Name, ProbeSetData.value, ProbeSetSE.error, " + "ProbeSetData.Id " + "FROM (ProbeSetData, ProbeSetFreeze, Strain, ProbeSet, ProbeSetXRef) " + "LEFT JOIN ProbeSetSE ON " + "(ProbeSetSE.DataId = ProbeSetData.Id " + "AND ProbeSetSE.StrainId = ProbeSetData.StrainId) " + "WHERE ProbeSet.Name = %(trait_name)s " + "AND ProbeSetXRef.ProbeSetId = ProbeSet.Id " + "AND ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetFreeze.Name = %(dataset_name)s " + "AND ProbeSetXRef.DataId = ProbeSetData.Id " + "AND ProbeSetData.StrainId = Strain.Id " + "ORDER BY Strain.Name") + + with conn.cursor() as cursor: + cursor.execute( + query, + {"trait_name": trait_info["trait_name"], + "dataset_name": trait_info["db"]["dataset_name"]}) + return [dict(zip( + ["strain_name", "value", "se_error", "id"], row)) + for row in cursor.fetchall()] + return [] + +def with_strainlist_data_setup(strainlist: Sequence[str]): + """ + Build function that computes the trait data from provided list of strains. + + PARAMETERS + strainlist: (list) + A list of strain names + + RETURNS: + Returns a function that given some data from the database, computes the + strain's value, variance and ndata values, only if the strain is present + in the provided `strainlist` variable. + """ + def setup_fn(tdata): + if tdata["strain_name"] in strainlist: + val = tdata["value"] + if val is not None: + return { + "strain_name": tdata["strain_name"], + "value": val, + "variance": tdata["se_error"], + "ndata": tdata.get("nstrain", None) + } + return None + return setup_fn + +def without_strainlist_data_setup(): + """ + Build function that computes the trait data. + + RETURNS: + Returns a function that given some data from the database, computes the + strain's value, variance and ndata values. + """ + def setup_fn(tdata): + val = tdata["value"] + if val is not None: + return { + "strain_name": tdata["strain_name"], + "value": val, + "variance": tdata["se_error"], + "ndata": tdata.get("nstrain", None) + } + return None + return setup_fn + +def retrieve_trait_data(trait: dict, conn: Any, strainlist: Sequence[str] = tuple()): + """ + Retrieve trait data + + DESCRIPTION + Retrieve trait data as is done in + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L258-L386 + """ + # I do not like this section, but it retains the flow in the old codebase + if trait["db"]["dataset_type"] == "Temp": + results = retrieve_temp_trait_data(trait, conn) + elif trait["db"]["dataset_type"] == "Publish": + results = retrieve_publish_trait_data(trait, conn) + elif trait["cellid"]: + results = retrieve_cellid_trait_data(trait, conn) + elif trait["db"]["dataset_type"] == "ProbeSet": + results = retrieve_probeset_trait_data(trait, conn) + else: + results = retrieve_geno_trait_data(trait, conn) + + if results: + # do something with mysqlid + mysqlid = results[0]["id"] + if strainlist: + data = [ + item for item in + map(with_strainlist_data_setup(strainlist), results) + if item is not None] + else: + data = [ + item for item in + map(without_strainlist_data_setup(), results) + if item is not None] + + return { + "mysqlid": mysqlid, + "data": dict(map( + lambda x: ( + x["strain_name"], + {k:v for k, v in x.items() if x != "strain_name"}), + data))} + return {} -- cgit v1.2.3 From 243d76bd5cdb989ee7d3311e44aafb7e8f7da712 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 9 Aug 2021 14:25:49 +0300 Subject: Set up the trait dataset type correctly * gn3/db/traits.py: setup `trait_dataset_type` * tests/unit/db/test_traits.py: fix tests The type ('Temp', 'Geno', 'Publish', and 'ProbeSet') relate to a trait's dataset, and not the trait itself. This commit updates the code to take this into consideration. The dataset type is also set up from a trait's full name, therefore this commit removes the `trait_type` argument from the `retrieve_trait_info` function. --- gn3/db/traits.py | 33 ++++++++++++++++++++++++--------- tests/unit/db/test_traits.py | 27 ++++++++++++--------------- 2 files changed, 36 insertions(+), 24 deletions(-) (limited to 'gn3') diff --git a/gn3/db/traits.py b/gn3/db/traits.py index a740352..6ea24be 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -326,10 +326,23 @@ def build_trait_name(trait_fullname): """ Initialises the trait's name, and other values from the search data provided """ + def dataset_type(dset_name): + if dset_name.find('Temp') >= 0: + return "Temp" + if dset_name.find('Geno') >= 0: + return "Geno" + if dset_name.find('Publish') >= 0: + return "Publish" + return "ProbeSet" + name_parts = trait_fullname.split("::") assert len(name_parts) >= 2, "Name format error" + dataset_name = name_parts[0] + dataset_type = dataset_type(dataset_name) return { - "db": {"dataset_name": name_parts[0]}, + "db": { + "dataset_name": dataset_name, + "dataset_type": dataset_type}, "trait_fullname": trait_fullname, "trait_name": name_parts[1], "cellid": name_parts[2] if len(name_parts) == 3 else "" @@ -357,7 +370,7 @@ def retrieve_probeset_sequence(trait, conn): return {**trait, "sequence": seq[0] if seq else ""} def retrieve_trait_info( - trait_type: str, threshold: int, trait_full_name: str, conn: Any, + threshold: int, trait_full_name: str, conn: Any, qtl=None): """Retrieves the trait information. @@ -366,6 +379,7 @@ def retrieve_trait_info( This function, or the dependent functions, might be incomplete as they are currently.""" trait = build_trait_name(trait_full_name) + trait_dataset_type = trait["db"]["dataset_type"] trait_info_function_table = { "Publish": retrieve_publish_trait_info, "ProbeSet": retrieve_probeset_trait_info, @@ -374,14 +388,14 @@ def retrieve_trait_info( } common_post_processing_fn = compose( - lambda ti: load_qtl_info(qtl, trait_type, ti, conn), - lambda ti: set_homologene_id_field(trait_type, ti, conn), - lambda ti: {"trait_type": trait_type, **ti}, + lambda ti: load_qtl_info(qtl, trait_dataset_type, ti, conn), + lambda ti: set_homologene_id_field(trait_dataset_type, ti, conn), + lambda ti: {"trait_type": trait_dataset_type, **ti}, lambda ti: {**trait, **ti}) trait_post_processing_functions_table = { "Publish": compose( - lambda ti: set_confidential_field(trait_type, ti), + lambda ti: set_confidential_field(trait_dataset_type, ti), common_post_processing_fn), "ProbeSet": compose( lambda ti: retrieve_probeset_sequence(ti, conn), @@ -391,9 +405,10 @@ def retrieve_trait_info( } retrieve_info = compose( - set_haveinfo_field, trait_info_function_table[trait_type]) + set_haveinfo_field, trait_info_function_table[trait_dataset_type]) - trait_dataset = retrieve_trait_dataset(trait_type, trait, threshold, conn) + trait_dataset = retrieve_trait_dataset( + trait_dataset_type, trait, threshold, conn) trait_info = retrieve_info( { "trait_name": trait["trait_name"], @@ -403,7 +418,7 @@ def retrieve_trait_info( conn) if trait_info["haveinfo"]: return { - **trait_post_processing_functions_table[trait_type](trait_info), + **trait_post_processing_functions_table[trait_dataset_type](trait_info), "db": {**trait["db"], **trait_dataset}, "riset": trait_dataset["riset"] } diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index d9d7bbb..ee98893 100644 --- a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -126,11 +126,12 @@ class TestTraitsDBFunctions(TestCase): """ for fullname, expected in [ ["testdb::testname", - {"db": {"dataset_name": "testdb"}, "trait_name": "testname", - "cellid": "", "trait_fullname": "testdb::testname"}], + {"db": {"dataset_name": "testdb", "dataset_type": "ProbeSet"}, + "trait_name": "testname", "cellid": "", + "trait_fullname": "testdb::testname"}], ["testdb::testname::testcell", - {"db": {"dataset_name": "testdb"}, "trait_name": "testname", - "cellid": "testcell", + {"db": {"dataset_name": "testdb", "dataset_type": "ProbeSet"}, + "trait_name": "testname", "cellid": "testcell", "trait_fullname": "testdb::testname::testcell"}]]: with self.subTest(fullname=fullname): self.assertEqual(build_trait_name(fullname), expected) @@ -146,22 +147,18 @@ class TestTraitsDBFunctions(TestCase): def test_retrieve_trait_info(self): """Test that information on traits is retrieved as appropriate.""" - for trait_type, threshold, trait_fullname, expected in [ - ["Publish", 9, "pubDb::PublishTraitName::pubCell", - {"haveinfo": 0}], - ["ProbeSet", 5, "prbDb::ProbeSetTraitName::prbCell", - {"haveinfo": 0}], - ["Geno", 12, "genDb::GenoTraitName", - {"haveinfo": 0}], - ["Temp", 6, "tmpDb::TempTraitName", - {"haveinfo": 0}]]: + for threshold, trait_fullname, expected in [ + [9, "pubDb::PublishTraitName::pubCell", {"haveinfo": 0}], + [5, "prbDb::ProbeSetTraitName::prbCell", {"haveinfo": 0}], + [12, "genDb::GenoTraitName", {"haveinfo": 0}], + [6, "tmpDb::TempTraitName", {"haveinfo": 0}]]: db_mock = mock.MagicMock() - with self.subTest(trait_type=trait_type): + with self.subTest(trait_fullname=trait_fullname): with db_mock.cursor() as cursor: cursor.fetchone.return_value = tuple() self.assertEqual( retrieve_trait_info( - trait_type, threshold, trait_fullname, db_mock), + threshold, trait_fullname, db_mock), expected) def test_update_sample_data(self): -- cgit v1.2.3 From 72ab476e5825c8c2b0d5102d6f1227ace8f7fa68 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Mon, 9 Aug 2021 18:34:46 +0300 Subject: Build up the heatmap data Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Add code to compute and organise the data that will be used to draw the final heatmap. This varies significantly in how it works from the original, but it still tries to retain the general flow of data. --- gn3/computations/heatmap.py | 173 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 gn3/computations/heatmap.py (limited to 'gn3') diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py new file mode 100644 index 0000000..98ea26f --- /dev/null +++ b/gn3/computations/heatmap.py @@ -0,0 +1,173 @@ +""" +This module will contain functions to be used in computation of the data used to +generate various kinds of heatmaps. +""" + +from gn3.computations.slink import slink +from gn3.computations.correlations2 import compute_correlation + +def export_trait_data( + trait_data: dict, strainlist: Sequence[str], dtype: str="val", + var_exists: bool=False, n_exists: bool=False): + """ + Export data according to `strainlist`. Mostly used in calculating + correlations. + + DESCRIPTION: + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L166-L211 + + PARAMETERS + trait: (dict) + The dictionary of key-value pairs representing a trait + strainlist: (list) + A list of strain names + type: (str) + ... verify what this is ... + var_exists: (bool) + A flag indicating existence of variance + n_exists: (bool) + A flag indicating existence of ndata + """ + def __export_all_types(tdata, strain): + sample_data = [] + if tdata[strain]["val"]: + sample_data.append(tdata[strain]["val"]) + if var_exists: + if tdata[strain].var: + sample_data.append(tdata[strain]["var"]) + else: + sample_data.append(None) + if n_exists: + if tdata[strain]["ndata"]: + sample_data.append(tdata[strain]["ndata"]) + else: + sample_data.append(None) + else: + if var_exists and n_exists: + sample_data += [None, None, None] + elif var_exists or n_exists: + sample_data += [None, None] + else: + sample_data.append(None) + + return tuple(sample_data) + + def __exporter(accumulator, strain): + if tdata.has_key(strain): + if dtype == "val": + return accumulator + (tdata[strain]["val"], ) + if dtype == "var": + return accumulator + (tdata[strain]["var"], ) + if dtype == "N": + return tdata[strain]["ndata"] + if dtype == "all": + return accumulator + __export_all_types( + accumulator, tdata, strain) + else: + raise KeyError("Type `%s` is incorrect" % dtype) + else: + if var_exists and n_exists: + return accumulator + (None, None, None) + if var_exists or n_exists: + return accumulator + (None, None) + return accumulator + (None,) + + return reduce(__exporter(strain), strainlist, tuple()) + +def trait_display_name(trait: Dict): + """ + Given a trait, return a name to use to display the trait on a heatmap. + + DESCRIPTION + Migrated from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L141-L157 + """ + if trait.get("db", None) and trait.get("trait_name", None): + if trait["db"]["dataset_type"] == "Temp": + desc = trait["description"] + if desc.find("PCA") >= 0: + return "%s::%s" % ( + trait["db"]["displayname"], + desc[desc.rindex(':')+1:].strip()) + return "%s::%s" % ( + trait["db"]["displayname"], + desc[:desc.index('entered')].strip()) + else: + prefix = "%s::%s" % ( + trait["db"]["dataset_name"], trait["trait_name"]) + if trait["cellid"]: + return "%s::%s" % (prefix, trait["cellid"]) + return prefix + return trait["description"] + +def cluster_traits(traits_data_list: Sequence[Dict]): + """ + Clusters the trait values. + + DESCRIPTION + Attempts to replicate the clustering of the traits, as done at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L162 + """ + def __compute_corr(tdata_i, tdata_j): + if tdata_j[0] < tdata_i[0]: + corr, nOverlap = compute_correlation(tdata_i, tdata_j) + if (1 - corr) < 0: + return 0.0 + return 1 - corr + return 0.0 + + def __cluster(tdata_i): + res2 = tuple( + __compute_corr(tdata_i, tdata_j) for tdata_j in enumerate(traits)) + + return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) + +def heatmap_data( + fd, search_result, conn: Any, colorScheme=None, userPrivilege=None, + userName=None): + """ + heatmap function + + DESCRIPTION + This function is an attempt to reproduce the initialisation at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L46-L64 + and also the clustering and slink computations at + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/heatmap/Heatmap.py#L138-L165 + with the help of the `gn3.computations.heatmap.cluster_traits` function. + + It does not try to actually draw the heatmap image. + + PARAMETERS: + TODO: Elaborate on the parameters here... + """ + cluster_checked = fd.formdata.getvalue("clusterCheck", "") + strainlist = [strain for strain in fd.strainlist if strain not in fd.parlist] + genotype = fd.genotype + + def __retrieve_traitlist_and_datalist(threshold, fullname): + trait = retrieve_trait_info(threshold, fullname, conn) + return (trait, export_trait_data(retrieve_trait_data(trait), strainlist)) + + traits_details = [ + __retrieve_traitlist_and_datalist(threshold, fullname) + for fullname in search_result] + traits_list = map(lambda x: x[0], traits_details) + traits_data_list = map(lambda x: x[1], traits_details) + + return { + "target_description_checked": fd.formdata.getvalue( + "targetDescriptionCheck", ""), + "cluster_checked": cluster_checked, + "slink_data": ( + slink(cluster_traits(traits_list, strainlist)) + if cluster_checked else False) + "sessionfile": fd.formdata.getvalue("session"), + "genotype": genotype, + "nLoci": sum(map(lambda x: len(x), genotype)) + "strainlist": strainlist, + "ppolar": fd.ppolar, + "mpolar":fd.mpolar, + "traits_list": traits_list + "traits_data_list": traits_data_list + } -- cgit v1.2.3 From d97e3141554c3a13a1ec163373b19b9552a79fb0 Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 12 Aug 2021 13:08:05 +0300 Subject: Import missing definitions Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Import some missing definitions. --- gn3/computations/heatmap.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'gn3') diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 98ea26f..1b7dfb7 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -3,12 +3,15 @@ This module will contain functions to be used in computation of the data used to generate various kinds of heatmaps. """ +from functools import reduce +from typing import Any, Dict, Sequence from gn3.computations.slink import slink +from gn3.db.traits import retrieve_trait_data, retrieve_trait_info from gn3.computations.correlations2 import compute_correlation def export_trait_data( - trait_data: dict, strainlist: Sequence[str], dtype: str="val", - var_exists: bool=False, n_exists: bool=False): + trait_data: dict, strainlist: Sequence[str], dtype: str = "val", + var_exists: bool = False, n_exists: bool = False): """ Export data according to `strainlist`. Mostly used in calculating correlations. -- cgit v1.2.3 From 3420e378a614f1ecec85f633cd9f202764a54eda Mon Sep 17 00:00:00 2001 From: Muriithi Frederick Muriuki Date: Thu, 12 Aug 2021 13:09:32 +0300 Subject: Fix linting errors Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * Fix some errors caught by the linter. --- gn3/computations/heatmap.py | 79 +++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 39 deletions(-) (limited to 'gn3') diff --git a/gn3/computations/heatmap.py b/gn3/computations/heatmap.py index 1b7dfb7..a0e778a 100644 --- a/gn3/computations/heatmap.py +++ b/gn3/computations/heatmap.py @@ -57,26 +57,24 @@ def export_trait_data( return tuple(sample_data) def __exporter(accumulator, strain): - if tdata.has_key(strain): + # pylint: disable=[R0911] + if trait_data.has_key(strain): if dtype == "val": - return accumulator + (tdata[strain]["val"], ) + return accumulator + (trait_data[strain]["val"], ) if dtype == "var": - return accumulator + (tdata[strain]["var"], ) + return accumulator + (trait_data[strain]["var"], ) if dtype == "N": - return tdata[strain]["ndata"] + return trait_data[strain]["ndata"] if dtype == "all": - return accumulator + __export_all_types( - accumulator, tdata, strain) - else: - raise KeyError("Type `%s` is incorrect" % dtype) - else: - if var_exists and n_exists: - return accumulator + (None, None, None) - if var_exists or n_exists: - return accumulator + (None, None) - return accumulator + (None,) + return accumulator + __export_all_types(trait_data, strain) + raise KeyError("Type `%s` is incorrect" % dtype) + if var_exists and n_exists: + return accumulator + (None, None, None) + if var_exists or n_exists: + return accumulator + (None, None) + return accumulator + (None,) - return reduce(__exporter(strain), strainlist, tuple()) + return reduce(__exporter, strainlist, tuple()) def trait_display_name(trait: Dict): """ @@ -96,12 +94,11 @@ def trait_display_name(trait: Dict): return "%s::%s" % ( trait["db"]["displayname"], desc[:desc.index('entered')].strip()) - else: - prefix = "%s::%s" % ( - trait["db"]["dataset_name"], trait["trait_name"]) - if trait["cellid"]: - return "%s::%s" % (prefix, trait["cellid"]) - return prefix + prefix = "%s::%s" % ( + trait["db"]["dataset_name"], trait["trait_name"]) + if trait["cellid"]: + return "%s::%s" % (prefix, trait["cellid"]) + return prefix return trait["description"] def cluster_traits(traits_data_list: Sequence[Dict]): @@ -114,21 +111,21 @@ def cluster_traits(traits_data_list: Sequence[Dict]): """ def __compute_corr(tdata_i, tdata_j): if tdata_j[0] < tdata_i[0]: - corr, nOverlap = compute_correlation(tdata_i, tdata_j) + corr_vals = compute_correlation(tdata_i, tdata_j) + corr = corr_vals[0] if (1 - corr) < 0: return 0.0 return 1 - corr return 0.0 def __cluster(tdata_i): - res2 = tuple( - __compute_corr(tdata_i, tdata_j) for tdata_j in enumerate(traits)) + return tuple( + __compute_corr(tdata_i, tdata_j) + for tdata_j in enumerate(traits_data_list)) return tuple(__cluster(tdata_i) for tdata_i in enumerate(traits_data_list)) -def heatmap_data( - fd, search_result, conn: Any, colorScheme=None, userPrivilege=None, - userName=None): +def heatmap_data(formd, search_result, conn: Any): """ heatmap function @@ -144,13 +141,17 @@ def heatmap_data( PARAMETERS: TODO: Elaborate on the parameters here... """ - cluster_checked = fd.formdata.getvalue("clusterCheck", "") - strainlist = [strain for strain in fd.strainlist if strain not in fd.parlist] - genotype = fd.genotype + threshold = 0 # webqtlConfig.PUBLICTHRESH + cluster_checked = formd.formdata.getvalue("clusterCheck", "") + strainlist = [ + strain for strain in formd.strainlist if strain not in formd.parlist] + genotype = formd.genotype def __retrieve_traitlist_and_datalist(threshold, fullname): trait = retrieve_trait_info(threshold, fullname, conn) - return (trait, export_trait_data(retrieve_trait_data(trait), strainlist)) + return ( + trait, + export_trait_data(retrieve_trait_data(trait, conn), strainlist)) traits_details = [ __retrieve_traitlist_and_datalist(threshold, fullname) @@ -159,18 +160,18 @@ def heatmap_data( traits_data_list = map(lambda x: x[1], traits_details) return { - "target_description_checked": fd.formdata.getvalue( + "target_description_checked": formd.formdata.getvalue( "targetDescriptionCheck", ""), "cluster_checked": cluster_checked, "slink_data": ( - slink(cluster_traits(traits_list, strainlist)) - if cluster_checked else False) - "sessionfile": fd.formdata.getvalue("session"), + slink(cluster_traits(traits_data_list)) + if cluster_checked else False), + "sessionfile": formd.formdata.getvalue("session"), "genotype": genotype, - "nLoci": sum(map(lambda x: len(x), genotype)) + "nLoci": sum(map(len, genotype)), "strainlist": strainlist, - "ppolar": fd.ppolar, - "mpolar":fd.mpolar, - "traits_list": traits_list + "ppolar": formd.ppolar, + "mpolar":formd.mpolar, + "traits_list": traits_list, "traits_data_list": traits_data_list } -- cgit v1.2.3