aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Arun Isaac 2021-11-19 16:42:00 +0530
committer Arun Isaac 2021-12-02 17:03:52 +0530
commit 0a29e362bd8627b9346e2260a14c81a46e2a76d3 (patch)
tree 5c5d061aec5fbef5a63a8ddbe95a9a59f72ded7f
parent a1516993c7f6dc608f75ba42cb27b983e0c5c330 (diff)
download genenetwork3-0a29e362bd8627b9346e2260a14c81a46e2a76d3.tar.gz
Implement dataset metadata API endpoint.
* guix.scm: Import (gnu packages rdf).
(genenetwork3)[propagated-inputs]: Add python-sparqlwrapper.
* gn3/settings.py (SPARQL_ENDPOINT): New variable.
* gn3/api/general.py: Import datasets from gn3.db.
(dataset_metadata): New API endpoint.
* gn3/db/datasets.py: Import re, Template from string, Dict and Optional from typing, JSON and SPARQLWrapper from SPARQLWrapper, SPARQL_ENDPOINT from gn3.settings.
(sparql_query, dataset_metadata): New functions.
-rw-r--r-- gn3/api/general.py 7
-rw-r--r-- gn3/db/datasets.py 103
-rw-r--r-- gn3/settings.py 3
-rw-r--r-- guix.scm 2
4 files changed, 113 insertions, 2 deletions
diff --git a/gn3/api/general.py b/gn3/api/general.py
index 69ec343..e0bfc81 100644
--- a/gn3/api/general.py
+++ b/gn3/api/general.py
@@ -7,7 +7,7 @@ from flask import request
from gn3.fs_helpers import extract_uploaded_file
from gn3.commands import run_cmd
-
+from gn3.db import datasets
general = Blueprint("general", __name__)
@@ -68,3 +68,8 @@ def run_r_qtl(geno_filestr, pheno_filestr):
cmd = (f"Rscript {rqtl_wrapper} "
f"{geno_filestr} {pheno_filestr}")
return jsonify(run_cmd(cmd)), 201
+
+@general.route("/dataset/<accession_id>")
+def dataset_metadata(accession_id):
+    """Respond with the metadata of the dataset named by ACCESSION_ID as JSON.
+
+    The lookup is delegated to datasets.dataset_metadata and whatever it
+    returns is handed to jsonify unchanged.
+    """
+    metadata = datasets.dataset_metadata(accession_id)
+    return jsonify(metadata)
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 6c328f5..e4c779a 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -1,7 +1,11 @@
"""
This module contains functions relating to specific trait dataset manipulation
"""
-from typing import Any
+import re
+from string import Template
+from typing import Any, Dict, List, Optional
+from SPARQLWrapper import JSON, SPARQLWrapper
+from gn3.settings import SPARQL_ENDPOINT
def retrieve_probeset_trait_dataset_name(
threshold: int, name: str, connection: Any):
@@ -289,3 +293,100 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
**dataset_fns[trait_type](),
**group
}
+
+def sparql_query(query: str) -> List[Dict[str, Any]]:
+    """Run a SPARQL query and return the bound variables.
+
+    QUERY is sent to SPARQL_ENDPOINT with JSON requested as the return
+    format. The value returned is the 'bindings' entry found under
+    'results' in the converted response. Callers in this module index it
+    positionally and read each variable's 'value' key, so it is annotated
+    as a list of dictionaries keyed by variable name.
+    """
+    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    return sparql.queryAndConvert()['results']['bindings']
+
+def dataset_metadata(accession_id: str) -> Optional[Dict[str, Any]]:
+    """Return info about dataset with ACCESSION_ID.
+
+    ACCESSION_ID must be "GN" followed by one or more ASCII digits.
+
+    Returns None when ACCESSION_ID is malformed or when any of the dataset
+    queries yields no bindings. Otherwise returns a dictionary holding
+    'accession_id', the value of every variable bound by the dataset
+    queries, and an 'investigator' dictionary with the values bound by the
+    investigator query. 'investigator' is left empty when that query yields
+    no bindings.
+    """
+    # TODO: This function doesn't yet return the names of the actual dataset files.
+    # Check accession_id to protect against query injection.
+    pattern = re.compile(r'GN\d+', re.ASCII)
+    if not pattern.fullmatch(accession_id):
+        return None
+    # NOTE(review): the queries below use the rdf: and foaf: prefixes without
+    # declaring them; presumably the endpoint predefines them -- confirm.
+    # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
+    # single large query.
+    queries = ["""
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?dataset_group ?status ?title ?geo_series
+WHERE {
+ ?dataset gn:accessionId "$accession_id" ;
+ rdf:type gn:dataset ;
+ gn:name ?name .
+ OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
+ # FIXME: gn:datasetStatus should not be optional. But, some records don't
+ # have it.
+ OPTIONAL { ?dataset gn:datasetStatus ?status } .
+ OPTIONAL { ?dataset gn:title ?title } .
+ OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
+}
+""",
+               """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
+WHERE {
+ ?dataset gn:accessionId "$accession_id" ;
+ rdf:type gn:dataset ;
+ gn:normalization / gn:name ?normalization_name ;
+ gn:datasetOfSpecies / gn:menuName ?species_name ;
+ gn:datasetOfInbredSet / gn:name ?inbred_set_name .
+ OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
+ OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
+}
+""",
+               """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
+ ?about_data_processing ?notes ?experiment_design ?contributors
+ ?citation ?acknowledgment
+WHERE {
+ ?dataset gn:accessionId "$accession_id" ;
+ rdf:type gn:dataset .
+ OPTIONAL { ?dataset gn:specifics ?specifics . }
+ OPTIONAL { ?dataset gn:summary ?summary . }
+ OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
+ OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
+ OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
+ OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
+ OPTIONAL { ?dataset gn:notes ?notes . }
+ OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
+ OPTIONAL { ?dataset gn:contributors ?contributors . }
+ OPTIONAL { ?dataset gn:citation ?citation . }
+ OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
+}
+"""]
+    result = {'accession_id': accession_id,
+              'investigator': {}}
+    query_result = {}
+    for query in queries:
+        # Only the first solution of each query is used; an empty result
+        # from any of them means the dataset is reported as not found.
+        if sparql_result := sparql_query(Template(query).substitute(accession_id=accession_id)):
+            query_result.update(sparql_result[0])
+        else:
+            return None
+    for key, value in query_result.items():
+        result[key] = value['value']
+    investigator_bindings = sparql_query(Template("""
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
+WHERE {
+ ?dataset gn:accessionId "$accession_id" ;
+ rdf:type gn:dataset ;
+ gn:datasetOfInvestigator ?investigator .
+ OPTIONAL { ?investigator foaf:name ?name . }
+ OPTIONAL { ?investigator gn:address ?address . }
+ OPTIONAL { ?investigator gn:city ?city . }
+ OPTIONAL { ?investigator gn:state ?state . }
+ OPTIONAL { ?investigator gn:zipCode ?zip . }
+ OPTIONAL { ?investigator foaf:phone ?phone . }
+ OPTIONAL { ?investigator foaf:mbox ?email . }
+ OPTIONAL { ?investigator gn:country ?country . }
+ OPTIONAL { ?investigator foaf:homepage ?homepage . }
+}
+""").substitute(accession_id=accession_id))
+    # gn:datasetOfInvestigator is a required pattern in the query above, so a
+    # dataset without one yields no bindings. Indexing [0] unconditionally
+    # would then raise IndexError, so keep the empty 'investigator' instead.
+    if investigator_bindings:
+        for key, value in investigator_bindings[0].items():
+            result['investigator'][key] = value['value']
+    return result
diff --git a/gn3/settings.py b/gn3/settings.py
index 0ac6698..c945fbf 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -13,6 +13,9 @@ REDIS_JOB_QUEUE = "GN3::job-queue"
TMPDIR = os.environ.get("TMPDIR", tempfile.gettempdir())
RQTL_WRAPPER = "rqtl_wrapper.R"
+# SPARQL endpoint; overridable through the environment like the other
+# settings in this module, falling back to the local default.
+SPARQL_ENDPOINT = os.environ.get(
+    "SPARQL_ENDPOINT", "http://localhost:8891/sparql")
+
# SQL confs
SQL_URI = os.environ.get(
"SQL_URI", "mysql://webqtlout:webqtlout@localhost/db_webqtl")
diff --git a/guix.scm b/guix.scm
index 9bf23c8..e2e49ab 100644
--- a/guix.scm
+++ b/guix.scm
@@ -42,6 +42,7 @@
(gnu packages python-web)
(gnu packages python-xyz)
(gnu packages python-science)
+ (gnu packages rdf)
((guix build utils) #:select (with-directory-excursion))
(guix build-system python)
(guix gexp)
@@ -78,6 +79,7 @@
("python-redis" ,python-redis)
("python-requests" ,python-requests)
("python-scipy" ,python-scipy)
+ ("python-sparqlwrapper" ,python-sparqlwrapper)
("r-optparse" ,r-optparse)
("r-qtl" ,r-qtl)
("r-rjson" ,r-rjson)