about summary refs log tree commit diff
diff options
context:
space:
mode:
authorMunyoki Kilyungi2022-12-01 15:32:42 +0300
committerBonfaceKilz2022-12-06 11:28:28 +0300
commitc75ed75c6a5987eed6672be924870f86282ed6d6 (patch)
tree9d91afcbef1ea9c2ec6557b86b5187f5093374ab
parent09f4d9c4bec6e3395296a69ce718dea30240ed54 (diff)
downloadgenenetwork3-c75ed75c6a5987eed6672be924870f86282ed6d6.tar.gz
Use a dataset's name to fetch its metadata from RDF
* gn3/api/metadata.py (jsonify_dataset_metadata): Rewrite metadata
end-point to use a dataset's name instead of its accession_id.
* gn3/db/rdf.py (get_dataset_metadata): Replace accession_id with
name.  Use one single RDF query instead of multiple queries.
-rw-r--r--gn3/api/metadata.py6
-rw-r--r--gn3/db/rdf.py158
2 files changed, 61 insertions, 103 deletions
diff --git a/gn3/api/metadata.py b/gn3/api/metadata.py
index 35e4067..8f6127f 100644
--- a/gn3/api/metadata.py
+++ b/gn3/api/metadata.py
@@ -13,14 +13,14 @@ from gn3.db.rdf import get_dataset_metadata
 metadata = Blueprint("metadata", __name__)
 
 
-@metadata.route("/dataset/<accession_id>", methods=["GET"])
-def jsonify_dataset_metadata(accession_id):
+@metadata.route("/dataset/<name>", methods=["GET"])
+def jsonify_dataset_metadata(name):
     """Fetch a dataset's metadata given it's ACCESSION_ID"""
     try:
         return jsonify(
             get_dataset_metadata(
                 SPARQLWrapper(current_app.config.get("SPARQL_ENDPOINT")),
-                accession_id,
+                name,
             ).data
         )
     # The virtuoso server is misconfigured or it isn't running at all
diff --git a/gn3/db/rdf.py b/gn3/db/rdf.py
index 7ed80b1..b690d5a 100644
--- a/gn3/db/rdf.py
+++ b/gn3/db/rdf.py
@@ -24,112 +24,70 @@ def sparql_query(
 
 
 def get_dataset_metadata(
-        sparql_conn: SPARQLWrapper, accession_id: str
+        sparql_conn: SPARQLWrapper, name: str
 ) -> MonadicDict:
-    """Return info about dataset with ACCESSION_ID."""
-    # Check accession_id to protect against query injection.
-    # TODO: This function doesn't yet return the names of the actual dataset
-    # files.
-    pattern = re.compile(r"GN\d+", re.ASCII)
-    if not pattern.fullmatch(accession_id):
-        return MonadicDict()
-    # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
-    # single large query.
-    queries = [
-        """
+    """Return info about dataset with a given NAME"""
+    __metadata_query = """
 PREFIX gn: <http://genenetwork.org/>
-SELECT ?name ?dataset_group ?status ?title ?geo_series
+
+SELECT ?accession_id ?dataset_group ?status ?title ?geo_series ?specifics ?summary ?about_tissue
+?about_platform ?about_data_processing ?notes ?experiment_design ?contributors ?citation ?acknowledgement
+?platform_name ?tissue_name ?normalization_name ?species_name ?inbred_set_name
+?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
 WHERE {
-  ?dataset gn:accessionId "$accession_id" ;
-           rdf:type gn:dataset .
-  OPTIONAL { ?dataset gn:name ?name } .
+  ?dataset gn:accessionId ?accession_id ;
+           rdf:type gn:dataset ;
+           gn:name "$name" .
+  OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing } .
+  OPTIONAL { ?dataset gn:aboutPlatform ?about_platform } .
+  OPTIONAL { ?dataset gn:aboutTissue ?about_tissue } .
+  OPTIONAL { ?dataset gn:acknowledgement ?acknowledgement } .
+  OPTIONAL { ?dataset gn:citation ?citation } .
+  OPTIONAL { ?dataset gn:contributors ?contributors } .
   OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
-  # FIXME: gn:datasetStatus should not be optional. But, some records don't
-  # have it.
   OPTIONAL { ?dataset gn:datasetStatus ?status } .
-  OPTIONAL { ?dataset gn:title ?title } .
+  OPTIONAL { ?dataset gn:experimentDesign ?experiment_design } .
   OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
-}
-""",
-        """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
-WHERE {
-  ?dataset gn:accessionId "$accession_id" ;
-           rdf:type gn:dataset ;
-           gn:normalization / gn:name ?normalization_name ;
-           gn:datasetOfSpecies / gn:menuName ?species_name ;
-           gn:datasetOfInbredSet / gn:name ?inbred_set_name .
-  OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
-  OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
-}
-""",
-        """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
-       ?about_data_processing ?notes ?experiment_design ?contributors
-       ?citation ?acknowledgment
-WHERE {
-  ?dataset gn:accessionId "$accession_id" ;
-           rdf:type gn:dataset .
-  OPTIONAL { ?dataset gn:specifics ?specifics . }
-  OPTIONAL { ?dataset gn:summary ?summary . }
-  OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
-  OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
-  OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
-  OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
-  OPTIONAL { ?dataset gn:notes ?notes . }
-  OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
-  OPTIONAL { ?dataset gn:contributors ?contributors . }
-  OPTIONAL { ?dataset gn:citation ?citation . }
-  OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
-}
-""",
-    ]
-    result: MonadicDict = MonadicDict(
-        {
-            "accession_id": accession_id,
-        }
-    )
-    query_result: MonadicDict = MonadicDict()
-    for query in queries:
-        if not (
-            # Expecting only one result
-            sparql_result := sparql_query(
-                sparql_conn,
-                Template(query).substitute(accession_id=accession_id)
-            )[0]
-        ):
-            return MonadicDict()
-        query_result |= sparql_result
-    for key, value in query_result.items():
-        result[key] = value.bind(lambda x: Just(x["value"]))
-
-    investigator_query_result = sparql_query(
-        sparql_conn,
-        Template(
-            """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
-WHERE {
-  ?dataset gn:accessionId "$accession_id" ;
-           rdf:type gn:dataset ;
-           gn:datasetOfInvestigator ?investigator .
-  OPTIONAL { ?investigator foaf:name ?name . }
-  OPTIONAL { ?investigator gn:address ?address . }
-  OPTIONAL { ?investigator gn:city ?city . }
-  OPTIONAL { ?investigator gn:state ?state . }
-  OPTIONAL { ?investigator gn:zipCode ?zip . }
-  OPTIONAL { ?investigator foaf:phone ?phone . }
-  OPTIONAL { ?investigator foaf:mbox ?email . }
-  OPTIONAL { ?investigator gn:country ?country . }
-  OPTIONAL { ?investigator foaf:homepage ?homepage . }
+  OPTIONAL { ?dataset gn:notes ?notes } .
+  OPTIONAL { ?dataset gn:specifics ?specifics } .
+  OPTIONAL { ?dataset gn:summary ?summary } .
+  OPTIONAL { ?dataset gn:title ?title } .
+  OPTIONAL {
+    ?dataset gn:normalization ?normalization .
+    ?normalization gn:name ?normalization_name .
+  } .
+  OPTIONAL {
+    ?dataset gn:datasetOfPlatform ?platform .
+    ?platform gn:name ?platform_name .
+  } .
+  OPTIONAL {
+    ?dataset gn:datasetOfTissue ?tissue .
+    ?tissue gn:name ?tissue_name .
+  } .
+  OPTIONAL {
+      ?dataset gn:datasetOfSpecies ?species ;
+               gn:datasetOfInbredSet ?inbred_set .
+      ?species gn:name ?species_name .
+      ?inbred_set gn:name ?inbred_set_name .
+  } .
+  OPTIONAL {
+      ?dataset gn:datasetOfInvestigator ?investigator .
+           OPTIONAL { ?investigator foaf:name ?name . }
+           OPTIONAL { ?investigator gn:address ?address . }
+           OPTIONAL { ?investigator gn:city ?city . }
+           OPTIONAL { ?investigator gn:state ?state . }
+           OPTIONAL { ?investigator gn:zipCode ?zip . }
+           OPTIONAL { ?investigator foaf:phone ?phone . }
+           OPTIONAL { ?investigator foaf:mbox ?email . }
+           OPTIONAL { ?investigator gn:country ?country . }
+           OPTIONAL { ?investigator foaf:homepage ?homepage . }
+  }
 }
     """
-        ).substitute(accession_id=accession_id)
-    )[0]
-    result["investigators"] = Just({
-        key: value.bind(lambda a: a["value"])
-                for key, value in investigator_query_result.items()
-    })
+    result: MonadicDict = MonadicDict()
+    for key, value in sparql_query(
+            sparql_conn,
+            Template(__metadata_query).substitute(name=name)
+    )[0].items():
+        result[key] = value.bind(lambda x: Just(x["value"]))
     return result