aboutsummaryrefslogtreecommitdiff
path: root/gn3
diff options
context:
space:
mode:
author: Munyoki Kilyungi, 2022-12-01 15:32:42 +0300
committer: BonfaceKilz, 2022-12-06 11:28:28 +0300
commit: c75ed75c6a5987eed6672be924870f86282ed6d6 (patch)
tree: 9d91afcbef1ea9c2ec6557b86b5187f5093374ab /gn3
parent: 09f4d9c4bec6e3395296a69ce718dea30240ed54 (diff)
download: genenetwork3-c75ed75c6a5987eed6672be924870f86282ed6d6.tar.gz
Use a dataset's name to fetch its metadata from RDF
* gn3/api/metadata.py (jsonify_dataset_metadata): Rewrite metadata end-point to use a dataset's name instead of its accession_id.
* gn3/db/rdf.py (get_dataset_metadata): Replace accession_id with name. Use a single RDF query instead of multiple queries.
Diffstat (limited to 'gn3')
-rw-r--r-- gn3/api/metadata.py 6
-rw-r--r-- gn3/db/rdf.py 158
2 files changed, 61 insertions, 103 deletions
diff --git a/gn3/api/metadata.py b/gn3/api/metadata.py
index 35e4067..8f6127f 100644
--- a/gn3/api/metadata.py
+++ b/gn3/api/metadata.py
@@ -13,14 +13,14 @@ from gn3.db.rdf import get_dataset_metadata
metadata = Blueprint("metadata", __name__)
-@metadata.route("/dataset/<accession_id>", methods=["GET"])
-def jsonify_dataset_metadata(accession_id):
+@metadata.route("/dataset/<name>", methods=["GET"])
+def jsonify_dataset_metadata(name):
"""Fetch a dataset's metadata given it's ACCESSION_ID"""
try:
return jsonify(
get_dataset_metadata(
SPARQLWrapper(current_app.config.get("SPARQL_ENDPOINT")),
- accession_id,
+ name,
).data
)
# The virtuoso server is misconfigured or it isn't running at all
diff --git a/gn3/db/rdf.py b/gn3/db/rdf.py
index 7ed80b1..b690d5a 100644
--- a/gn3/db/rdf.py
+++ b/gn3/db/rdf.py
@@ -24,112 +24,70 @@ def sparql_query(
def get_dataset_metadata(
- sparql_conn: SPARQLWrapper, accession_id: str
+ sparql_conn: SPARQLWrapper, name: str
) -> MonadicDict:
- """Return info about dataset with ACCESSION_ID."""
- # Check accession_id to protect against query injection.
- # TODO: This function doesn't yet return the names of the actual dataset
- # files.
- pattern = re.compile(r"GN\d+", re.ASCII)
- if not pattern.fullmatch(accession_id):
- return MonadicDict()
- # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
- # single large query.
- queries = [
- """
+ """Return info about dataset with a given NAME"""
+ __metadata_query = """
PREFIX gn: <http://genenetwork.org/>
-SELECT ?name ?dataset_group ?status ?title ?geo_series
+
+SELECT ?accession_id ?dataset_group ?status ?title ?geo_series ?specifics ?summary ?about_tissue
+?about_platform ?about_data_processing ?notes ?experiment_design ?contributors ?citation ?acknowledgement
+?platform_name ?tissue_name ?normalization_name ?species_name ?inbred_set_name
+?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
WHERE {
- ?dataset gn:accessionId "$accession_id" ;
- rdf:type gn:dataset .
- OPTIONAL { ?dataset gn:name ?name } .
+ ?dataset gn:accessionId ?accession_id ;
+ rdf:type gn:dataset ;
+ gn:name "$name" .
+ OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing } .
+ OPTIONAL { ?dataset gn:aboutPlatform ?about_platform } .
+ OPTIONAL { ?dataset gn:aboutTissue ?about_tissue } .
+ OPTIONAL { ?dataset gn:acknowledgement ?acknowledgement } .
+ OPTIONAL { ?dataset gn:citation ?citation } .
+ OPTIONAL { ?dataset gn:contributors ?contributors } .
OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
- # FIXME: gn:datasetStatus should not be optional. But, some records don't
- # have it.
OPTIONAL { ?dataset gn:datasetStatus ?status } .
- OPTIONAL { ?dataset gn:title ?title } .
+ OPTIONAL { ?dataset gn:experimentDesign ?experiment_design } .
OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
-}
-""",
- """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
-WHERE {
- ?dataset gn:accessionId "$accession_id" ;
- rdf:type gn:dataset ;
- gn:normalization / gn:name ?normalization_name ;
- gn:datasetOfSpecies / gn:menuName ?species_name ;
- gn:datasetOfInbredSet / gn:name ?inbred_set_name .
- OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
- OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
-}
-""",
- """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
- ?about_data_processing ?notes ?experiment_design ?contributors
- ?citation ?acknowledgment
-WHERE {
- ?dataset gn:accessionId "$accession_id" ;
- rdf:type gn:dataset .
- OPTIONAL { ?dataset gn:specifics ?specifics . }
- OPTIONAL { ?dataset gn:summary ?summary . }
- OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
- OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
- OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
- OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
- OPTIONAL { ?dataset gn:notes ?notes . }
- OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
- OPTIONAL { ?dataset gn:contributors ?contributors . }
- OPTIONAL { ?dataset gn:citation ?citation . }
- OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
-}
-""",
- ]
- result: MonadicDict = MonadicDict(
- {
- "accession_id": accession_id,
- }
- )
- query_result: MonadicDict = MonadicDict()
- for query in queries:
- if not (
- # Expecting only one result
- sparql_result := sparql_query(
- sparql_conn,
- Template(query).substitute(accession_id=accession_id)
- )[0]
- ):
- return MonadicDict()
- query_result |= sparql_result
- for key, value in query_result.items():
- result[key] = value.bind(lambda x: Just(x["value"]))
-
- investigator_query_result = sparql_query(
- sparql_conn,
- Template(
- """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
-WHERE {
- ?dataset gn:accessionId "$accession_id" ;
- rdf:type gn:dataset ;
- gn:datasetOfInvestigator ?investigator .
- OPTIONAL { ?investigator foaf:name ?name . }
- OPTIONAL { ?investigator gn:address ?address . }
- OPTIONAL { ?investigator gn:city ?city . }
- OPTIONAL { ?investigator gn:state ?state . }
- OPTIONAL { ?investigator gn:zipCode ?zip . }
- OPTIONAL { ?investigator foaf:phone ?phone . }
- OPTIONAL { ?investigator foaf:mbox ?email . }
- OPTIONAL { ?investigator gn:country ?country . }
- OPTIONAL { ?investigator foaf:homepage ?homepage . }
+ OPTIONAL { ?dataset gn:notes ?notes } .
+ OPTIONAL { ?dataset gn:specifics ?specifics } .
+ OPTIONAL { ?dataset gn:summary ?summary } .
+ OPTIONAL { ?dataset gn:title ?title } .
+ OPTIONAL {
+ ?dataset gn:normalization ?normalization .
+ ?normalization gn:name ?normalization_name .
+ } .
+ OPTIONAL {
+ ?dataset gn:datasetOfPlatform ?platform .
+ ?platform gn:name ?platform_name .
+ } .
+ OPTIONAL {
+ ?dataset gn:datasetOfTissue ?tissue .
+ ?tissue gn:name ?tissue_name .
+ } .
+ OPTIONAL {
+ ?dataset gn:datasetOfSpecies ?species ;
+ gn:datasetOfInbredSet ?inbred_set .
+ ?species gn:name ?species_name .
+ ?inbred_set gn:name ?inbred_set_name .
+ } .
+ OPTIONAL {
+ ?dataset gn:datasetOfInvestigator ?investigator .
+ OPTIONAL { ?investigator foaf:name ?name . }
+ OPTIONAL { ?investigator gn:address ?address . }
+ OPTIONAL { ?investigator gn:city ?city . }
+ OPTIONAL { ?investigator gn:state ?state . }
+ OPTIONAL { ?investigator gn:zipCode ?zip . }
+ OPTIONAL { ?investigator foaf:phone ?phone . }
+ OPTIONAL { ?investigator foaf:mbox ?email . }
+ OPTIONAL { ?investigator gn:country ?country . }
+ OPTIONAL { ?investigator foaf:homepage ?homepage . }
+ }
}
"""
- ).substitute(accession_id=accession_id)
- )[0]
- result["investigators"] = Just({
- key: value.bind(lambda a: a["value"])
- for key, value in investigator_query_result.items()
- })
+ result: MonadicDict = MonadicDict()
+ for key, value in sparql_query(
+ sparql_conn,
+ Template(__metadata_query).substitute(name=name)
+ )[0].items():
+ result[key] = value.bind(lambda x: Just(x["value"]))
return result