aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gn3/api/general.py4
-rw-r--r--gn3/db/datasets.py104
-rw-r--r--gn3/db/rdf.py131
3 files changed, 135 insertions, 104 deletions
diff --git a/gn3/api/general.py b/gn3/api/general.py
index e0bfc81..68b128b 100644
--- a/gn3/api/general.py
+++ b/gn3/api/general.py
@@ -7,7 +7,7 @@ from flask import request
from gn3.fs_helpers import extract_uploaded_file
from gn3.commands import run_cmd
-from gn3.db import datasets
+from gn3.db import rdf
general = Blueprint("general", __name__)
@@ -72,4 +72,4 @@ def run_r_qtl(geno_filestr, pheno_filestr):
@general.route("/dataset/<accession_id>")
def dataset_metadata(accession_id):
"""Return info as JSON for dataset with ACCESSION_ID."""
- return jsonify(datasets.dataset_metadata(accession_id))
+ return jsonify(rdf.get_dataset_metadata(accession_id).data)
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 1d2f071..bc5467b 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -1,11 +1,8 @@
"""
This module contains functions relating to specific trait dataset manipulation
"""
-import re
-from string import Template
-from typing import Any, Dict, List, Optional
-from SPARQLWrapper import JSON, SPARQLWrapper
-from gn3.settings import SPARQL_ENDPOINT
+from typing import Any
+
def retrieve_probeset_trait_dataset_name(
threshold: int, name: str, connection: Any):
@@ -264,100 +261,3 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
**dataset_fns[trait_type](),
**group
}
-
-def sparql_query(query: str) -> List[Dict[str, Any]]:
- """Run a SPARQL query and return the bound variables."""
- sparql = SPARQLWrapper(SPARQL_ENDPOINT)
- sparql.setQuery(query)
- sparql.setReturnFormat(JSON)
- return sparql.queryAndConvert()['results']['bindings'] # type: ignore
-
-def dataset_metadata(accession_id: str) -> Optional[Dict[str, Any]]:
- """Return info about dataset with ACCESSION_ID."""
- # Check accession_id to protect against query injection.
- # TODO: This function doesn't yet return the names of the actual dataset files.
- pattern = re.compile(r'GN\d+', re.ASCII)
- if not pattern.fullmatch(accession_id):
- return None
- # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
- # single large query.
- queries = ["""
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?name ?dataset_group ?status ?title ?geo_series
-WHERE {
- ?dataset gn:accessionId "$accession_id" ;
- rdf:type gn:dataset ;
- gn:name ?name .
- OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
- # FIXME: gn:datasetStatus should not be optional. But, some records don't
- # have it.
- OPTIONAL { ?dataset gn:datasetStatus ?status } .
- OPTIONAL { ?dataset gn:title ?title } .
- OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
-}
-""",
- """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
-WHERE {
- ?dataset gn:accessionId "$accession_id" ;
- rdf:type gn:dataset ;
- gn:normalization / gn:name ?normalization_name ;
- gn:datasetOfSpecies / gn:menuName ?species_name ;
- gn:datasetOfInbredSet / gn:name ?inbred_set_name .
- OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
- OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
-}
-""",
- """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
- ?about_data_processing ?notes ?experiment_design ?contributors
- ?citation ?acknowledgment
-WHERE {
- ?dataset gn:accessionId "$accession_id" ;
- rdf:type gn:dataset .
- OPTIONAL { ?dataset gn:specifics ?specifics . }
- OPTIONAL { ?dataset gn:summary ?summary . }
- OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
- OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
- OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
- OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
- OPTIONAL { ?dataset gn:notes ?notes . }
- OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
- OPTIONAL { ?dataset gn:contributors ?contributors . }
- OPTIONAL { ?dataset gn:citation ?citation . }
- OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
-}
-"""]
- result: Dict[str, Any] = {'accession_id': accession_id,
- 'investigator': {}}
- query_result = {}
- for query in queries:
- if sparql_result := sparql_query(Template(query).substitute(accession_id=accession_id)):
- query_result.update(sparql_result[0])
- else:
- return None
- for key, value in query_result.items():
- result[key] = value['value']
- investigator_query_result = sparql_query(Template("""
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
-WHERE {
- ?dataset gn:accessionId "$accession_id" ;
- rdf:type gn:dataset ;
- gn:datasetOfInvestigator ?investigator .
- OPTIONAL { ?investigator foaf:name ?name . }
- OPTIONAL { ?investigator gn:address ?address . }
- OPTIONAL { ?investigator gn:city ?city . }
- OPTIONAL { ?investigator gn:state ?state . }
- OPTIONAL { ?investigator gn:zipCode ?zip . }
- OPTIONAL { ?investigator foaf:phone ?phone . }
- OPTIONAL { ?investigator foaf:mbox ?email . }
- OPTIONAL { ?investigator gn:country ?country . }
- OPTIONAL { ?investigator foaf:homepage ?homepage . }
-}
-""").substitute(accession_id=accession_id))[0]
- for key, value in investigator_query_result.items():
- result['investigator'][key] = value['value']
- return result
diff --git a/gn3/db/rdf.py b/gn3/db/rdf.py
new file mode 100644
index 0000000..f1312e3
--- /dev/null
+++ b/gn3/db/rdf.py
@@ -0,0 +1,131 @@
+"""RDF utilities
+
+This module is a collection of functions that handle SPARQL queries.
+
+"""
+import re
+from typing import Tuple
+from string import Template
+from SPARQLWrapper import JSON, SPARQLWrapper
+from pymonad.maybe import Just
+from gn3.monads import MonadicDict
+from gn3.settings import SPARQL_ENDPOINT
+
+
+def sparql_query(query: str) -> Tuple[MonadicDict, ...]:
+ """Run a SPARQL query and return the bound variables."""
+ sparql = SPARQLWrapper(SPARQL_ENDPOINT)
+ sparql.setQuery(query)
+ sparql.setReturnFormat(JSON)
+ results = sparql.queryAndConvert()
+ if _r := results["results"]["bindings"]: # type: ignore
+ return (*(MonadicDict(bindings) for bindings in _r),) # type: ignore
+ return (MonadicDict(),)
+
+
+def get_dataset_metadata(accession_id: str) -> MonadicDict:
+ """Return info about dataset with ACCESSION_ID."""
+ # Check accession_id to protect against query injection.
+ # TODO: This function doesn't yet return the names of the actual dataset
+ # files.
+ pattern = re.compile(r"GN\d+", re.ASCII)
+ if not pattern.fullmatch(accession_id):
+ return MonadicDict()
+ # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
+ # single large query.
+ queries = [
+ """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?dataset_group ?status ?title ?geo_series
+WHERE {
+ ?dataset gn:accessionId "$accession_id" ;
+ rdf:type gn:dataset .
+ OPTIONAL { ?dataset gn:name ?name } .
+ OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
+ # FIXME: gn:datasetStatus should not be optional. But, some records don't
+ # have it.
+ OPTIONAL { ?dataset gn:datasetStatus ?status } .
+ OPTIONAL { ?dataset gn:title ?title } .
+ OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
+}
+""",
+ """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
+WHERE {
+ ?dataset gn:accessionId "$accession_id" ;
+ rdf:type gn:dataset ;
+ gn:normalization / gn:name ?normalization_name ;
+ gn:datasetOfSpecies / gn:menuName ?species_name ;
+ gn:datasetOfInbredSet / gn:name ?inbred_set_name .
+ OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
+ OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
+}
+""",
+ """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
+ ?about_data_processing ?notes ?experiment_design ?contributors
+ ?citation ?acknowledgment
+WHERE {
+ ?dataset gn:accessionId "$accession_id" ;
+ rdf:type gn:dataset .
+ OPTIONAL { ?dataset gn:specifics ?specifics . }
+ OPTIONAL { ?dataset gn:summary ?summary . }
+ OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
+ OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
+ OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
+ OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
+ OPTIONAL { ?dataset gn:notes ?notes . }
+ OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
+ OPTIONAL { ?dataset gn:contributors ?contributors . }
+ OPTIONAL { ?dataset gn:citation ?citation . }
+ OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
+}
+""",
+ ]
+ result: MonadicDict = MonadicDict(
+ {
+ "accession_id": accession_id,
+ }
+ )
+ query_result: MonadicDict = MonadicDict()
+ for query in queries:
+ if not (
+ # Expecting only one result
+ sparql_result := sparql_query(
+ Template(query).substitute(accession_id=accession_id)
+ )[0]
+ ):
+ return MonadicDict()
+ query_result |= sparql_result
+ for key, value in query_result.items():
+ result[key] = value.bind(lambda x: Just(x["value"]))
+
+ investigator_query_result = sparql_query(
+ Template(
+ """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
+WHERE {
+ ?dataset gn:accessionId "$accession_id" ;
+ rdf:type gn:dataset ;
+ gn:datasetOfInvestigator ?investigator .
+ OPTIONAL { ?investigator foaf:name ?name . }
+ OPTIONAL { ?investigator gn:address ?address . }
+ OPTIONAL { ?investigator gn:city ?city . }
+ OPTIONAL { ?investigator gn:state ?state . }
+ OPTIONAL { ?investigator gn:zipCode ?zip . }
+ OPTIONAL { ?investigator foaf:phone ?phone . }
+ OPTIONAL { ?investigator foaf:mbox ?email . }
+ OPTIONAL { ?investigator gn:country ?country . }
+ OPTIONAL { ?investigator foaf:homepage ?homepage . }
+}
+ """
+ ).substitute(accession_id=accession_id)
+ )[0]
+ result["investigators"] = Just({
+ key: value.bind(lambda a: a["value"])
+ for key, value in investigator_query_result.items()
+ })
+ return result