"""
This module contains functions relating to specific trait dataset manipulation
"""
from typing import Any, Dict

# Shared SQL template for the `*Freeze` dataset-name lookups. The rendered text
# is kept identical to the hand-written queries it replaces.
_DATASET_NAME_QUERY = (
    "SELECT {columns} "
    "FROM {table} "
    "WHERE "
    "public > %(threshold)s "
    "AND "
    "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)")

# Result keys shared by every dataset-name lookup, in SELECT column order.
_DATASET_NAME_KEYS = (
    "dataset_id", "dataset_name", "dataset_fullname", "dataset_shortname")

# Result keys shared by every RISet lookup, in SELECT column order.
_RISET_KEYS = ("riset", "risetid")


def _fetch_one_as_dict(
        query: str, params: Dict[str, Any], keys, connection: Any
) -> Dict[str, Any]:
    """
    Run `query` with `params` and return the first row as a dictionary.

    The values of the first row are paired, in order, with `keys`. An empty
    dictionary is returned when the query matches no row, so that callers can
    safely merge the result into other dictionaries.
    """
    with connection.cursor() as cursor:
        cursor.execute(query, params)
        # `fetchone` must be *called*: zipping over the bound method itself
        # raises `TypeError` with a real database cursor.
        row = cursor.fetchone()
    if row is None:
        return {}
    return dict(zip(keys, row))


def _retrieve_dataset_name_fields(
        table: str, columns: str, keys, threshold: int, name: str,
        connection: Any) -> Dict[str, Any]:
    """
    Look up a dataset in `table` by its Name, FullName or ShortName.

    `table` and `columns` are only ever supplied by the functions in this
    module (never from user input), so formatting them into the SQL is safe.
    `threshold` and `name` are passed as bound query parameters.
    """
    return _fetch_one_as_dict(
        _DATASET_NAME_QUERY.format(columns=columns, table=table),
        {"threshold": threshold, "name": name},
        keys,
        connection)


def retrieve_probeset_trait_dataset_name(
        threshold: int, name: str, connection: Any) -> Dict[str, Any]:
    """
    Get the ID, DataScale and various name formats for a `ProbeSet` trait.

    Queries `ProbeSetFreeze` for a record whose `public` value is greater than
    `threshold` and whose Name, FullName or ShortName equals `name`.

    Returns a dictionary with the keys `dataset_id`, `dataset_name`,
    `dataset_fullname`, `dataset_shortname` and `dataset_datascale`, or an
    empty dictionary if no record matches.
    """
    return _retrieve_dataset_name_fields(
        "ProbeSetFreeze",
        "Id, Name, FullName, ShortName, DataScale",
        _DATASET_NAME_KEYS + ("dataset_datascale",),
        threshold, name, connection)


def retrieve_publish_trait_dataset_name(
        threshold: int, name: str, connection: Any) -> Dict[str, Any]:
    """
    Get the ID and various name formats for a `Publish` trait.

    Queries `PublishFreeze` for a record whose `public` value is greater than
    `threshold` and whose Name, FullName or ShortName equals `name`.

    Returns a dictionary with the keys `dataset_id`, `dataset_name`,
    `dataset_fullname` and `dataset_shortname`, or an empty dictionary if no
    record matches.
    """
    return _retrieve_dataset_name_fields(
        "PublishFreeze", "Id, Name, FullName, ShortName",
        _DATASET_NAME_KEYS, threshold, name, connection)


def retrieve_geno_trait_dataset_name(
        threshold: int, name: str, connection: Any) -> Dict[str, Any]:
    """
    Get the ID and various name formats for a `Geno` trait.

    Queries `GenoFreeze` for a record whose `public` value is greater than
    `threshold` and whose Name, FullName or ShortName equals `name`.

    Returns a dictionary with the keys `dataset_id`, `dataset_name`,
    `dataset_fullname` and `dataset_shortname`, or an empty dictionary if no
    record matches.
    """
    return _retrieve_dataset_name_fields(
        "GenoFreeze", "Id, Name, FullName, ShortName",
        _DATASET_NAME_KEYS, threshold, name, connection)


def retrieve_temp_trait_dataset_name(
        threshold: int, name: str, connection: Any) -> Dict[str, Any]:
    """
    Get the ID and various name formats for a `Temp` trait.

    Queries `TempFreeze` for a record whose `public` value is greater than
    `threshold` and whose Name, FullName or ShortName equals `name`.

    Returns a dictionary with the keys `dataset_id`, `dataset_name`,
    `dataset_fullname` and `dataset_shortname`, or an empty dictionary if no
    record matches.
    """
    return _retrieve_dataset_name_fields(
        "TempFreeze", "Id, Name, FullName, ShortName",
        _DATASET_NAME_KEYS, threshold, name, connection)


def retrieve_dataset_name(
        trait_type: str, threshold: int, trait_name: str, dataset_name: str,
        conn: Any) -> Dict[str, Any]:
    """
    Retrieve the ID and names of the dataset that a trait belongs to.

    The lookup is dispatched on `trait_type` ("ProbeSet", "Publish", "Geno" or
    "Temp"). `Temp` traits are looked up by `trait_name`; every other type is
    looked up by `dataset_name`.

    Raises `KeyError` if `trait_type` is not one of the supported types.

    This is extracted from the `webqtlDataset.retrieveName` function as is
    implemented at
    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlDataset.py#L140-L169
    """
    fn_map = {
        "ProbeSet": retrieve_probeset_trait_dataset_name,
        "Publish": retrieve_publish_trait_dataset_name,
        "Geno": retrieve_geno_trait_dataset_name,
        "Temp": retrieve_temp_trait_dataset_name}
    lookup_name = trait_name if trait_type == "Temp" else dataset_name
    return fn_map[trait_type](threshold, lookup_name, conn)


def retrieve_geno_riset_fields(name: str, conn: Any) -> Dict[str, Any]:
    """
    Retrieve the RISet, and RISetID values for various Geno trait types.

    Returns a dictionary with the keys `riset` and `risetid`, or an empty
    dictionary if no `GenoFreeze` record is named `name`.
    """
    query = (
        "SELECT InbredSet.Name, InbredSet.Id "
        "FROM InbredSet, GenoFreeze "
        "WHERE GenoFreeze.InbredSetId = InbredSet.Id "
        "AND GenoFreeze.Name = %(name)s")
    return _fetch_one_as_dict(query, {"name": name}, _RISET_KEYS, conn)


def retrieve_publish_riset_fields(name: str, conn: Any) -> Dict[str, Any]:
    """
    Retrieve the RISet, and RISetID values for various Publish trait types.

    Returns a dictionary with the keys `riset` and `risetid`, or an empty
    dictionary if no `PublishFreeze` record is named `name`.
    """
    query = (
        "SELECT InbredSet.Name, InbredSet.Id "
        "FROM InbredSet, PublishFreeze "
        "WHERE PublishFreeze.InbredSetId = InbredSet.Id "
        "AND PublishFreeze.Name = %(name)s")
    return _fetch_one_as_dict(query, {"name": name}, _RISET_KEYS, conn)


def retrieve_probeset_riset_fields(name: str, conn: Any) -> Dict[str, Any]:
    """
    Retrieve the RISet, and RISetID values for various ProbeSet trait types.

    Returns a dictionary with the keys `riset` and `risetid`, or an empty
    dictionary if no `ProbeSetFreeze` record is named `name`.
    """
    query = (
        "SELECT InbredSet.Name, InbredSet.Id "
        "FROM InbredSet, ProbeSetFreeze, ProbeFreeze "
        "WHERE ProbeFreeze.InbredSetId = InbredSet.Id "
        "AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId "
        "AND ProbeSetFreeze.Name = %(name)s")
    return _fetch_one_as_dict(query, {"name": name}, _RISET_KEYS, conn)


def retrieve_temp_riset_fields(name: str, conn: Any) -> Dict[str, Any]:
    """
    Retrieve the RISet, and RISetID values for `Temp` trait types.

    Returns a dictionary with the keys `riset` and `risetid`, or an empty
    dictionary if no `Temp` record is named `name`.
    """
    query = (
        "SELECT InbredSet.Name, InbredSet.Id "
        "FROM InbredSet, Temp "
        "WHERE Temp.InbredSetId = InbredSet.Id "
        "AND Temp.Name = %(name)s")
    return _fetch_one_as_dict(query, {"name": name}, _RISET_KEYS, conn)


def retrieve_riset_fields(
        trait_type: str, trait_name: str, dataset_info: Dict[str, Any],
        conn: Any) -> Dict[str, Any]:
    """
    Retrieve the RISet, and RISetID values for various trait types.

    `Temp` traits are looked up by `trait_name`; every other type is looked up
    by `dataset_info["dataset_name"]`.

    Returns a new dictionary holding everything in `dataset_info` plus the
    retrieved `riset` and `risetid` values. `riset` is always present: it is
    the empty string when no record was found, and "BXD300" is reported as
    "BXD".

    Raises `KeyError` if `trait_type` is not one of the supported types.
    """
    riset_fns_map = {
        "Geno": retrieve_geno_riset_fields,
        "Publish": retrieve_publish_riset_fields,
        "ProbeSet": retrieve_probeset_riset_fields,
        "Temp": retrieve_temp_riset_fields
    }
    lookup_name = (
        trait_name if trait_type == "Temp" else dataset_info["dataset_name"])
    riset_info = riset_fns_map[trait_type](lookup_name, conn)
    riset_name = riset_info.get("riset", "")

    return {
        **dataset_info,
        **riset_info,
        "riset": "BXD" if riset_name == "BXD300" else riset_name
    }


def retrieve_temp_trait_dataset() -> Dict[str, Any]:
    """
    Retrieve the dataset that relates to `Temp` traits
    """
    return {
        "searchfield": ["name", "description"],
        "disfield": ["name", "description"],
        "type": "Temp",
        "dataset_id": 1,
        "fullname": "Temporary Storage",
        "shortname": "Temp"
    }


def retrieve_geno_trait_dataset() -> Dict[str, Any]:
    """
    Retrieve the dataset that relates to `Geno` traits
    """
    return {
        "searchfield": ["name", "chr"],
        "disfield": ["name", "chr", "mb", "source2", "sequence"],
        "type": "Geno"
    }


def retrieve_publish_trait_dataset() -> Dict[str, Any]:
    """
    Retrieve the dataset that relates to `Publish` traits
    """
    return {
        "searchfield": [
            "name", "post_publication_description", "abstract", "title",
            "authors"],
        "disfield": [
            "name", "pubmed_id", "pre_publication_description",
            "post_publication_description", "original_description",
            "pre_publication_abbreviation", "post_publication_abbreviation",
            "lab_code", "submitter", "owner", "authorized_users",
            "authors", "title", "abstract", "journal", "volume", "pages",
            "month", "year", "sequence", "units", "comments"],
        "type": "Publish"
    }


def retrieve_probeset_trait_dataset() -> Dict[str, Any]:
    """
    Retrieve the dataset that relates to `ProbeSet` traits
    """
    return {
        "searchfield": [
            "name", "description", "probe_target_description", "symbol",
            "alias", "genbankid", "unigeneid", "omim", "refseq_transcriptid",
            "probe_set_specificity", "probe_set_blat_score"],
        "disfield": [
            "name", "symbol", "description", "probe_target_description", "chr",
            "mb", "alias", "geneid", "genbankid", "unigeneid", "omim",
            "refseq_transcriptid", "blatseq", "targetseq", "chipid", "comments",
            "strand_probe", "strand_gene", "probe_set_target_region",
            "proteinid", "probe_set_specificity", "probe_set_blat_score",
            "probe_set_blat_mb_start", "probe_set_blat_mb_end",
            "probe_set_strand", "probe_set_note_by_rw", "flag"],
        "type": "ProbeSet"
    }


def retrieve_trait_dataset(
        trait_type: str, trait: Dict[str, Any], threshold: int,
        conn: Any) -> Dict[str, Any]:
    """
    Retrieve the dataset that relates to a specific trait.

    `trait` must provide `trait["trait_name"]` and
    `trait["db"]["dataset_name"]`.

    The result merges, in increasing order of precedence: the dataset's ID and
    names, the static search/display fields for `trait_type`, and the RISet
    fields. `display_name` is set to the resolved dataset name.

    Raises `KeyError` if `trait_type` is not one of the supported types.
    """
    dataset_fns = {
        "Temp": retrieve_temp_trait_dataset,
        "Geno": retrieve_geno_trait_dataset,
        "Publish": retrieve_publish_trait_dataset,
        "ProbeSet": retrieve_probeset_trait_dataset
    }
    # The defaults come first so that they survive a lookup that finds nothing.
    dataset_name_info = {
        "dataset_id": None,
        "dataset_name": trait["db"]["dataset_name"],
        **retrieve_dataset_name(
            trait_type, threshold, trait["trait_name"],
            trait["db"]["dataset_name"], conn)
    }
    riset = retrieve_riset_fields(
        trait_type, trait["trait_name"], dataset_name_info, conn)
    return {
        "display_name": dataset_name_info["dataset_name"],
        **dataset_name_info,
        **dataset_fns[trait_type](),
        **riset
    }
+ """ query = ( "SELECT Id, Name, FullName, ShortName, DataScale " "FROM ProbeSetFreeze " @@ -21,7 +27,11 @@ def retrieve_probeset_trait_dataset_name( "dataset_shortname", "dataset_datascale"], cursor.fetchone)) -def retrieve_publish_trait_dataset_name(threshold: int, name: str, connection: Any): +def retrieve_publish_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Publish` trait. + """ query = ( "SELECT Id, Name, FullName, ShortName " "FROM PublishFreeze " @@ -41,7 +51,11 @@ def retrieve_publish_trait_dataset_name(threshold: int, name: str, connection: A "dataset_shortname"], cursor.fetchone)) -def retrieve_geno_trait_dataset_name(threshold: int, name: str, connection: Any): +def retrieve_geno_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Geno` trait. + """ query = ( "SELECT Id, Name, FullName, ShortName " "FROM GenoFreeze " @@ -61,7 +75,11 @@ def retrieve_geno_trait_dataset_name(threshold: int, name: str, connection: Any) "dataset_shortname"], cursor.fetchone)) -def retrieve_temp_trait_dataset_name(threshold: int, name: str, connection: Any): +def retrieve_temp_trait_dataset_name( + threshold: int, name: str, connection: Any): + """ + Get the ID, DataScale and various name formats for a `Temp` trait. + """ query = ( "SELECT Id, Name, FullName, ShortName " "FROM TempFreeze " @@ -145,6 +163,9 @@ def retrieve_probeset_riset_fields(name, conn): return {} def retrieve_temp_riset_fields(name, conn): + """ + Retrieve the RISet, and RISetID values for `Temp` trait types. 
+ """ query = ( "SELECT InbredSet.Name, InbredSet.Id " "FROM InbredSet, Temp " @@ -179,6 +200,10 @@ def retrieve_riset_fields(trait_type, trait_name, dataset_info, conn): } def retrieve_temp_trait_dataset(): + """ + Retrieve the dataset that relates to `Temp` traits + """ + # pylint: disable=[C0330] return { "searchfield": ["name", "description"], "disfield": ["name", "description"], @@ -189,28 +214,40 @@ def retrieve_temp_trait_dataset(): } def retrieve_geno_trait_dataset(): + """ + Retrieve the dataset that relates to `Geno` traits + """ + # pylint: disable=[C0330] return { - "searchfield": ["name","chr"], - "disfield": ["name","chr","mb", "source2", "sequence"], + "searchfield": ["name", "chr"], + "disfield": ["name", "chr", "mb", "source2", "sequence"], "type": "Geno" } def retrieve_publish_trait_dataset(): + """ + Retrieve the dataset that relates to `Publish` traits + """ + # pylint: disable=[C0330] return { "searchfield": [ "name", "post_publication_description", "abstract", "title", "authors"], "disfield": [ - "name","pubmed_id", "pre_publication_description", - "post_publication_description", "original_description", + "name", "pubmed_id", "pre_publication_description", + "post_publication_description", "original_description", "pre_publication_abbreviation", "post_publication_abbreviation", "lab_code", "submitter", "owner", "authorized_users", - "authors","title","abstract", "journal","volume","pages","month", - "year","sequence", "units", "comments"], + "authors", "title", "abstract", "journal", "volume", "pages", + "month", "year", "sequence", "units", "comments"], "type": "Publish" } def retrieve_probeset_trait_dataset(): + """ + Retrieve the dataset that relates to `ProbeSet` traits + """ + # pylint: disable=[C0330] return { "searchfield": [ "name", "description", "probe_target_description", "symbol", @@ -228,6 +265,9 @@ def retrieve_probeset_trait_dataset(): } def retrieve_trait_dataset(trait_type, trait, threshold, conn): + """ + Retrieve the dataset 
that relates to a specific trait. + """ dataset_fns = { "Temp": retrieve_temp_trait_dataset, "Geno": retrieve_geno_trait_dataset, @@ -238,8 +278,8 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): "dataset_id": None, "dataset_name": trait["db"]["dataset_name"], **retrieve_dataset_name( - trait_type, threshold, trait["trait_name"], trait["db"]["dataset_name"], - conn) + trait_type, threshold, trait["trait_name"], + trait["db"]["dataset_name"], conn) } riset = retrieve_riset_fields( trait_type, trait["trait_name"], dataset_name_info, conn) diff --git a/gn3/db/traits.py b/gn3/db/traits.py index 6c31a4d..fb48fc3 100644 --- a/gn3/db/traits.py +++ b/gn3/db/traits.py @@ -43,6 +43,7 @@ def update_sample_data(conn: Any, count: Union[int, str]): """Given the right parameters, update sample-data from the relevant table.""" + # pylint: disable=[R0913, R0914] STRAIN_ID_SQL: str = "UPDATE Strain SET Name = %s WHERE Id = %s" PUBLISH_DATA_SQL: str = ("UPDATE PublishData SET value = %s " "WHERE StrainId = %s AND Id = %s") @@ -252,6 +253,9 @@ def set_homologene_id_field(trait_type, trait_info, conn): return functions_table[trait_type](trait_info) def load_publish_qtl_info(trait_info, conn): + """ + Load extra QTL information for `Publish` traits + """ query = ( "SELECT PublishXRef.Locus, PublishXRef.LRS, PublishXRef.additive " "FROM PublishXRef, PublishFreeze " @@ -264,6 +268,9 @@ def load_publish_qtl_info(trait_info, conn): return {"locus": "", "lrs": "", "additive": ""} def load_probeset_qtl_info(trait_info, conn): + """ + Load extra QTL information for `ProbeSet` traits + """ query = ( "SELECT ProbeSetXRef.Locus, ProbeSetXRef.LRS, ProbeSetXRef.pValue, " "ProbeSetXRef.mean, ProbeSetXRef.additive " @@ -278,6 +285,22 @@ def load_probeset_qtl_info(trait_info, conn): return {"locus": "", "lrs": "", "pvalue": "", "mean": "", "additive": ""} def load_qtl_info(qtl, trait_type, trait_info, conn): + """ + Load extra QTL information for traits + + DESCRIPTION: + Migrated 
from + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/base/webqtlTrait.py#L500-L534 + + PARAMETERS: + qtl: boolean + trait_type: string + The type of the trait in consideration + trait_info: map/dictionary + A dictionary of the trait's key-value pairs + conn: + A database connection object + """ if not qtl: return trait_info qtl_info_functions = { @@ -290,6 +313,9 @@ def load_qtl_info(qtl, trait_type, trait_info, conn): return qtl_info_functions[trait_type](trait_info, conn) def build_trait_name(trait_fullname): + """ + Initialises the trait's name, and other values from the search data provided + """ name_parts = trait_fullname.split("::") assert len(name_parts) >= 2, "Name format error" return { @@ -300,6 +326,9 @@ def build_trait_name(trait_fullname): } def retrieve_probeset_sequence(trait, conn): + """ + Retrieve a 'ProbeSet' trait's sequence information + """ query = ( "SELECT ProbeSet.BlatSeq " "FROM ProbeSet, ProbeSetFreeze, ProbeSetXRef " diff --git a/tests/unit/db/test_datasets.py b/tests/unit/db/test_datasets.py index 4f405cb..38de0e2 100644 --- a/tests/unit/db/test_datasets.py +++ b/tests/unit/db/test_datasets.py @@ -1,3 +1,5 @@ +"""Tests for gn3/db/datasets.py""" + from unittest import mock, TestCase from gn3.db.datasets import ( retrieve_dataset_name, @@ -7,6 +9,7 @@ from gn3.db.datasets import ( retrieve_probeset_riset_fields) class TestDatasetsDBFunctions(TestCase): + """Test cases for datasets functions.""" def test_retrieve_dataset_name(self): """Test that the function is called correctly.""" @@ -34,7 +37,7 @@ class TestDatasetsDBFunctions(TestCase): "(Name = %(name)s " "OR FullName = %(name)s " "OR ShortName = %(name)s)".format( - table=table, cols=columns, ttype=trait_type), + table=table, cols=columns), {"threshold": thresh, "name": dataset_name}) def test_retrieve_probeset_riset_fields(self): diff --git a/tests/unit/db/test_traits.py b/tests/unit/db/test_traits.py index 5f52c18..d9d7bbb 100644 --- 
a/tests/unit/db/test_traits.py +++ b/tests/unit/db/test_traits.py @@ -121,6 +121,9 @@ class TestTraitsDBFunctions(TestCase): trait_source) def test_build_trait_name_with_good_fullnames(self): + """ + Check that the name is built correctly. + """ for fullname, expected in [ ["testdb::testname", {"db": {"dataset_name": "testdb"}, "trait_name": "testname", @@ -133,6 +136,9 @@ class TestTraitsDBFunctions(TestCase): self.assertEqual(build_trait_name(fullname), expected) def test_build_trait_name_with_bad_fullnames(self): + """ + Check that an exception is raised if the full name format is wrong. + """ for fullname in ["", "test", "test:test"]: with self.subTest(fullname=fullname): with self.assertRaises(AssertionError, msg="Name format error"): -- cgit v1.2.3