From 6e58cb5ffe3b145aa6ba9b7a54672e7a7f18a0b7 Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Wed, 12 Oct 2022 15:50:33 +0300 Subject: Re-implement RDF related code to use monads in its own module * gn3/api/general.py: Replace gn3.db.datasets import with gn3.db.rdf. (dataset_metadata) : Replace datasets.dataset_metadata with rdf.get_dataset_metadata. * gn3/db/datasets.py: Remove unused imports. (sparql_query, dataset_metadata): Delete. * gn3/db/rdf.py: (sparql_query, get_dataset_metadata): New functions. --- gn3/db/datasets.py | 104 ++--------------------------------------------------- 1 file changed, 2 insertions(+), 102 deletions(-) (limited to 'gn3/db/datasets.py') diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 1d2f071..bc5467b 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -1,11 +1,8 @@ """ This module contains functions relating to specific trait dataset manipulation """ -import re -from string import Template -from typing import Any, Dict, List, Optional -from SPARQLWrapper import JSON, SPARQLWrapper -from gn3.settings import SPARQL_ENDPOINT +from typing import Any + def retrieve_probeset_trait_dataset_name( threshold: int, name: str, connection: Any): @@ -264,100 +261,3 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): **dataset_fns[trait_type](), **group } - -def sparql_query(query: str) -> List[Dict[str, Any]]: - """Run a SPARQL query and return the bound variables.""" - sparql = SPARQLWrapper(SPARQL_ENDPOINT) - sparql.setQuery(query) - sparql.setReturnFormat(JSON) - return sparql.queryAndConvert()['results']['bindings'] # type: ignore - -def dataset_metadata(accession_id: str) -> Optional[Dict[str, Any]]: - """Return info about dataset with ACCESSION_ID.""" - # Check accession_id to protect against query injection. - # TODO: This function doesn't yet return the names of the actual dataset files. - pattern = re.compile(r'GN\d+', re.ASCII) - if not pattern.fullmatch(accession_id): - return None - # KLUDGE: We split the SPARQL query because virtuoso is very slow on a - # single large query. - queries = [""" -PREFIX gn: -SELECT ?name ?dataset_group ?status ?title ?geo_series -WHERE { - ?dataset gn:accessionId "$accession_id" ; - rdf:type gn:dataset ; - gn:name ?name . - OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } . - # FIXME: gn:datasetStatus should not be optional. But, some records don't - # have it. - OPTIONAL { ?dataset gn:datasetStatus ?status } . - OPTIONAL { ?dataset gn:title ?title } . - OPTIONAL { ?dataset gn:geoSeries ?geo_series } . -} -""", - """ -PREFIX gn: -SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name -WHERE { - ?dataset gn:accessionId "$accession_id" ; - rdf:type gn:dataset ; - gn:normalization / gn:name ?normalization_name ; - gn:datasetOfSpecies / gn:menuName ?species_name ; - gn:datasetOfInbredSet / gn:name ?inbred_set_name . - OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } . - OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } . -} -""", - """ -PREFIX gn: -SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform - ?about_data_processing ?notes ?experiment_design ?contributors - ?citation ?acknowledgment -WHERE { - ?dataset gn:accessionId "$accession_id" ; - rdf:type gn:dataset . - OPTIONAL { ?dataset gn:specifics ?specifics . } - OPTIONAL { ?dataset gn:summary ?summary . } - OPTIONAL { ?dataset gn:aboutCases ?about_cases . } - OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . } - OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . } - OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . } - OPTIONAL { ?dataset gn:notes ?notes . } - OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . } - OPTIONAL { ?dataset gn:contributors ?contributors . } - OPTIONAL { ?dataset gn:citation ?citation . } - OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . } -} -"""] - result: Dict[str, Any] = {'accession_id': accession_id, - 'investigator': {}} - query_result = {} - for query in queries: - if sparql_result := sparql_query(Template(query).substitute(accession_id=accession_id)): - query_result.update(sparql_result[0]) - else: - return None - for key, value in query_result.items(): - result[key] = value['value'] - investigator_query_result = sparql_query(Template(""" -PREFIX gn: -SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage -WHERE { - ?dataset gn:accessionId "$accession_id" ; - rdf:type gn:dataset ; - gn:datasetOfInvestigator ?investigator . - OPTIONAL { ?investigator foaf:name ?name . } - OPTIONAL { ?investigator gn:address ?address . } - OPTIONAL { ?investigator gn:city ?city . } - OPTIONAL { ?investigator gn:state ?state . } - OPTIONAL { ?investigator gn:zipCode ?zip . } - OPTIONAL { ?investigator foaf:phone ?phone . } - OPTIONAL { ?investigator foaf:mbox ?email . } - OPTIONAL { ?investigator gn:country ?country . } - OPTIONAL { ?investigator foaf:homepage ?homepage . } -} -""").substitute(accession_id=accession_id))[0] - for key, value in investigator_query_result.items(): - result['investigator'][key] = value['value'] - return result -- cgit v1.2.3