diff options
Diffstat (limited to 'gn3/db/datasets.py')
-rw-r--r-- | gn3/db/datasets.py | 152 |
1 files changed, 112 insertions, 40 deletions
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 6c328f5..b19db53 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -1,7 +1,11 @@ """ This module contains functions relating to specific trait dataset manipulation """ -from typing import Any +import re +from string import Template +from typing import Any, Dict, List, Optional +from SPARQLWrapper import JSON, SPARQLWrapper +from gn3.settings import SPARQL_ENDPOINT def retrieve_probeset_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -22,10 +26,13 @@ def retrieve_probeset_trait_dataset_name( "threshold": threshold, "name": name }) - return dict(zip( - ["dataset_id", "dataset_name", "dataset_fullname", - "dataset_shortname", "dataset_datascale"], - cursor.fetchone())) + res = cursor.fetchone() + if res: + return dict(zip( + ["dataset_id", "dataset_name", "dataset_fullname", + "dataset_shortname", "dataset_datascale"], + res)) + return {"dataset_id": None, "dataset_name": name, "dataset_fullname": name} def retrieve_publish_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -75,33 +82,8 @@ def retrieve_geno_trait_dataset_name( "dataset_shortname"], cursor.fetchone())) -def retrieve_temp_trait_dataset_name( - threshold: int, name: str, connection: Any): - """ - Get the ID, DataScale and various name formats for a `Temp` trait. - """ - query = ( - "SELECT Id, Name, FullName, ShortName " - "FROM TempFreeze " - "WHERE " - "public > %(threshold)s " - "AND " - "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)") - with connection.cursor() as cursor: - cursor.execute( - query, - { - "threshold": threshold, - "name": name - }) - return dict(zip( - ["dataset_id", "dataset_name", "dataset_fullname", - "dataset_shortname"], - cursor.fetchone())) - def retrieve_dataset_name( - trait_type: str, threshold: int, trait_name: str, dataset_name: str, - conn: Any): + trait_type: str, threshold: int, dataset_name: str, conn: Any): """ Retrieve the name of a trait given the trait's name @@ -113,9 +95,7 @@ def retrieve_dataset_name( "ProbeSet": retrieve_probeset_trait_dataset_name, "Publish": retrieve_publish_trait_dataset_name, "Geno": retrieve_geno_trait_dataset_name, - "Temp": retrieve_temp_trait_dataset_name} - if trait_type == "Temp": - return retrieve_temp_trait_dataset_name(threshold, trait_name, conn) + "Temp": lambda threshold, dataset_name, conn: {}} return fn_map[trait_type](threshold, dataset_name, conn) @@ -203,7 +183,6 @@ def retrieve_temp_trait_dataset(): """ Retrieve the dataset that relates to `Temp` traits """ - # pylint: disable=[C0330] return { "searchfield": ["name", "description"], "disfield": ["name", "description"], @@ -217,7 +196,6 @@ def retrieve_geno_trait_dataset(): """ Retrieve the dataset that relates to `Geno` traits """ - # pylint: disable=[C0330] return { "searchfield": ["name", "chr"], "disfield": ["name", "chr", "mb", "source2", "sequence"], @@ -228,7 +206,6 @@ def retrieve_publish_trait_dataset(): """ Retrieve the dataset that relates to `Publish` traits """ - # pylint: disable=[C0330] return { "searchfield": [ "name", "post_publication_description", "abstract", "title", @@ -247,7 +224,6 @@ def retrieve_probeset_trait_dataset(): """ Retrieve the dataset that relates to `ProbeSet` traits """ - # pylint: disable=[C0330] return { "searchfield": [ "name", "description", "probe_target_description", "symbol", @@ -278,8 +254,7 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): "dataset_id": None, "dataset_name": trait["db"]["dataset_name"], **retrieve_dataset_name( - trait_type, threshold, trait["trait_name"], - trait["db"]["dataset_name"], conn) + trait_type, threshold, trait["db"]["dataset_name"], conn) } group = retrieve_group_fields( trait_type, trait["trait_name"], dataset_name_info, conn) @@ -289,3 +264,100 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): **dataset_fns[trait_type](), **group } + +def sparql_query(query: str) -> List[Dict[str, Any]]: + """Run a SPARQL query and return the bound variables.""" + sparql = SPARQLWrapper(SPARQL_ENDPOINT) + sparql.setQuery(query) + sparql.setReturnFormat(JSON) + return sparql.queryAndConvert()['results']['bindings'] + +def dataset_metadata(accession_id: str) -> Optional[Dict[str, Any]]: + """Return info about dataset with ACCESSION_ID.""" + # Check accession_id to protect against query injection. + # TODO: This function doesn't yet return the names of the actual dataset files. + pattern = re.compile(r'GN\d+', re.ASCII) + if not pattern.fullmatch(accession_id): + return None + # KLUDGE: We split the SPARQL query because virtuoso is very slow on a + # single large query. + queries = [""" +PREFIX gn: <http://genenetwork.org/> +SELECT ?name ?dataset_group ?status ?title ?geo_series +WHERE { + ?dataset gn:accessionId "$accession_id" ; + rdf:type gn:dataset ; + gn:name ?name . + OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } . + # FIXME: gn:datasetStatus should not be optional. But, some records don't + # have it. + OPTIONAL { ?dataset gn:datasetStatus ?status } . + OPTIONAL { ?dataset gn:title ?title } . + OPTIONAL { ?dataset gn:geoSeries ?geo_series } . +} +""", + """ +PREFIX gn: <http://genenetwork.org/> +SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name +WHERE { + ?dataset gn:accessionId "$accession_id" ; + rdf:type gn:dataset ; + gn:normalization / gn:name ?normalization_name ; + gn:datasetOfSpecies / gn:menuName ?species_name ; + gn:datasetOfInbredSet / gn:name ?inbred_set_name . + OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } . + OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } . +} +""", + """ +PREFIX gn: <http://genenetwork.org/> +SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform + ?about_data_processing ?notes ?experiment_design ?contributors + ?citation ?acknowledgment +WHERE { + ?dataset gn:accessionId "$accession_id" ; + rdf:type gn:dataset . + OPTIONAL { ?dataset gn:specifics ?specifics . } + OPTIONAL { ?dataset gn:summary ?summary . } + OPTIONAL { ?dataset gn:aboutCases ?about_cases . } + OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . } + OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . } + OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . } + OPTIONAL { ?dataset gn:notes ?notes . } + OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . } + OPTIONAL { ?dataset gn:contributors ?contributors . } + OPTIONAL { ?dataset gn:citation ?citation . } + OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . } +} +"""] + result: Dict[str, Any] = {'accession_id': accession_id, + 'investigator': {}} + query_result = {} + for query in queries: + if sparql_result := sparql_query(Template(query).substitute(accession_id=accession_id)): + query_result.update(sparql_result[0]) + else: + return None + for key, value in query_result.items(): + result[key] = value['value'] + investigator_query_result = sparql_query(Template(""" +PREFIX gn: <http://genenetwork.org/> +SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage +WHERE { + ?dataset gn:accessionId "$accession_id" ; + rdf:type gn:dataset ; + gn:datasetOfInvestigator ?investigator . + OPTIONAL { ?investigator foaf:name ?name . } + OPTIONAL { ?investigator gn:address ?address . } + OPTIONAL { ?investigator gn:city ?city . } + OPTIONAL { ?investigator gn:state ?state . } + OPTIONAL { ?investigator gn:zipCode ?zip . } + OPTIONAL { ?investigator foaf:phone ?phone . } + OPTIONAL { ?investigator foaf:mbox ?email . } + OPTIONAL { ?investigator gn:country ?country . } + OPTIONAL { ?investigator foaf:homepage ?homepage . } +} +""").substitute(accession_id=accession_id))[0] + for key, value in investigator_query_result.items(): + result['investigator'][key] = value['value'] + return result |