Re-implement RDF related code to use monads in its own module

* gn3/api/general.py: Replace gn3.db.datasets import with gn3.db.rdf. (dataset_metadata) <jsonify>: Replace datasets.dataset_metadata with rdf.get_dataset_metadata. * gn3/db/datasets.py: Remove unused imports. (sparql_query, dataset_metadata): Delete. * gn3/db/rdf.py: (sparql_query, get_dataset_metadata): New functions.
author: Munyoki Kilyungi 2022-10-12 15:50:33 +0300
committer: BonfaceKilz 2022-11-02 17:36:22 +0300
commit: 6e58cb5ffe3b145aa6ba9b7a54672e7a7f18a0b7 (patch)
tree: 0a7d5d2bbd5774b91188cf90f3c58c9be1d77c31 /gn3/db
parent: 2e85b5c7c4be181182f263fcb644abd173be83c1 (diff)
download: genenetwork3-6e58cb5ffe3b145aa6ba9b7a54672e7a7f18a0b7.tar.gz
2 files changed, 133 insertions, 102 deletions
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 1d2f071..bc5467b 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -1,11 +1,8 @@
 """
 This module contains functions relating to specific trait dataset manipulation
 """
-import re
-from string import Template
-from typing import Any, Dict, List, Optional
-from SPARQLWrapper import JSON, SPARQLWrapper
-from gn3.settings import SPARQL_ENDPOINT
+from typing import Any
+
 
 def retrieve_probeset_trait_dataset_name(
         threshold: int, name: str, connection: Any):
@@ -264,100 +261,3 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
         **dataset_fns[trait_type](),
         **group
     }
-
-def sparql_query(query: str) -> List[Dict[str, Any]]:
-    """Run a SPARQL query and return the bound variables."""
-    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
-    sparql.setQuery(query)
-    sparql.setReturnFormat(JSON)
-    return sparql.queryAndConvert()['results']['bindings']  # type: ignore
-
-def dataset_metadata(accession_id: str) -> Optional[Dict[str, Any]]:
-    """Return info about dataset with ACCESSION_ID."""
-    # Check accession_id to protect against query injection.
-    # TODO: This function doesn't yet return the names of the actual dataset files.
-    pattern = re.compile(r'GN\d+', re.ASCII)
-    if not pattern.fullmatch(accession_id):
-        return None
-    # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
-    # single large query.
-    queries = ["""
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?name ?dataset_group ?status ?title ?geo_series
-WHERE {
-  ?dataset gn:accessionId "$accession_id" ;
-           rdf:type gn:dataset ;
-           gn:name ?name .
-  OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
-  # FIXME: gn:datasetStatus should not be optional. But, some records don't
-  # have it.
-  OPTIONAL { ?dataset gn:datasetStatus ?status } .
-  OPTIONAL { ?dataset gn:title ?title } .
-  OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
-}
-""",
-               """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
-WHERE {
-  ?dataset gn:accessionId "$accession_id" ;
-           rdf:type gn:dataset ;
-           gn:normalization / gn:name ?normalization_name ;
-           gn:datasetOfSpecies / gn:menuName ?species_name ;
-           gn:datasetOfInbredSet / gn:name ?inbred_set_name .
-  OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
-  OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
-}
-""",
-               """
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
-       ?about_data_processing ?notes ?experiment_design ?contributors
-       ?citation ?acknowledgment
-WHERE {
-  ?dataset gn:accessionId "$accession_id" ;
-           rdf:type gn:dataset .
-  OPTIONAL { ?dataset gn:specifics ?specifics . }
-  OPTIONAL { ?dataset gn:summary ?summary . }
-  OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
-  OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
-  OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
-  OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
-  OPTIONAL { ?dataset gn:notes ?notes . }
-  OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
-  OPTIONAL { ?dataset gn:contributors ?contributors . }
-  OPTIONAL { ?dataset gn:citation ?citation . }
-  OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
-}
-"""]
-    result: Dict[str, Any] = {'accession_id': accession_id,
-                              'investigator': {}}
-    query_result = {}
-    for query in queries:
-        if sparql_result := sparql_query(Template(query).substitute(accession_id=accession_id)):
-            query_result.update(sparql_result[0])
-        else:
-            return None
-    for key, value in query_result.items():
-        result[key] = value['value']
-    investigator_query_result = sparql_query(Template("""
-PREFIX gn: <http://genenetwork.org/>
-SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
-WHERE {
-  ?dataset gn:accessionId "$accession_id" ;
-           rdf:type gn:dataset ;
-           gn:datasetOfInvestigator ?investigator .
-  OPTIONAL { ?investigator foaf:name ?name . }
-  OPTIONAL { ?investigator gn:address ?address . }
-  OPTIONAL { ?investigator gn:city ?city . }
-  OPTIONAL { ?investigator gn:state ?state . }
-  OPTIONAL { ?investigator gn:zipCode ?zip . }
-  OPTIONAL { ?investigator foaf:phone ?phone . }
-  OPTIONAL { ?investigator foaf:mbox ?email . }
-  OPTIONAL { ?investigator gn:country ?country . }
-  OPTIONAL { ?investigator foaf:homepage ?homepage . }
-}
-""").substitute(accession_id=accession_id))[0]
-    for key, value in investigator_query_result.items():
-        result['investigator'][key] = value['value']
-    return result
diff --git a/gn3/db/rdf.py b/gn3/db/rdf.py
new file mode 100644
index 0000000..f1312e3
--- /dev/null
+++ b/gn3/db/rdf.py
@@ -0,0 +1,131 @@
+"""RDF utilities
+
+This module is a collection of functions that handle SPARQL queries.
+
+"""
+import re
+from typing import Tuple
+from string import Template
+from SPARQLWrapper import JSON, SPARQLWrapper
+from pymonad.maybe import Just
+from gn3.monads import MonadicDict
+from gn3.settings import SPARQL_ENDPOINT
+
+
+def sparql_query(query: str) -> Tuple[MonadicDict, ...]:
+    """Run a SPARQL query and return the bound variables."""
+    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    results = sparql.queryAndConvert()
+    if _r := results["results"]["bindings"]:  # type: ignore
+        return (*(MonadicDict(bindings) for bindings in _r),)  # type: ignore
+    return (MonadicDict(),)
+
+
+def get_dataset_metadata(accession_id: str) -> MonadicDict:
+    """Return info about dataset with ACCESSION_ID."""
+    # Check accession_id to protect against query injection.
+    # TODO: This function doesn't yet return the names of the actual dataset
+    # files.
+    pattern = re.compile(r"GN\d+", re.ASCII)
+    if not pattern.fullmatch(accession_id):
+        return MonadicDict()
+    # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
+    # single large query.
+    queries = [
+        """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?dataset_group ?status ?title ?geo_series
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset .
+  OPTIONAL { ?dataset gn:name ?name } .
+  OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
+  # FIXME: gn:datasetStatus should not be optional. But, some records don't
+  # have it.
+  OPTIONAL { ?dataset gn:datasetStatus ?status } .
+  OPTIONAL { ?dataset gn:title ?title } .
+  OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
+}
+""",
+        """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset ;
+           gn:normalization / gn:name ?normalization_name ;
+           gn:datasetOfSpecies / gn:menuName ?species_name ;
+           gn:datasetOfInbredSet / gn:name ?inbred_set_name .
+  OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
+  OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
+}
+""",
+        """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
+       ?about_data_processing ?notes ?experiment_design ?contributors
+       ?citation ?acknowledgment
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset .
+  OPTIONAL { ?dataset gn:specifics ?specifics . }
+  OPTIONAL { ?dataset gn:summary ?summary . }
+  OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
+  OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
+  OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
+  OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
+  OPTIONAL { ?dataset gn:notes ?notes . }
+  OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
+  OPTIONAL { ?dataset gn:contributors ?contributors . }
+  OPTIONAL { ?dataset gn:citation ?citation . }
+  OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
+}
+""",
+    ]
+    result: MonadicDict = MonadicDict(
+        {
+            "accession_id": accession_id,
+        }
+    )
+    query_result: MonadicDict = MonadicDict()
+    for query in queries:
+        if not (
+            # Expecting only one result
+            sparql_result := sparql_query(
+                Template(query).substitute(accession_id=accession_id)
+            )[0]
+        ):
+            return MonadicDict()
+        query_result |= sparql_result
+    for key, value in query_result.items():
+        result[key] = value.bind(lambda x: Just(x["value"]))
+
+    investigator_query_result = sparql_query(
+        Template(
+            """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset ;
+           gn:datasetOfInvestigator ?investigator .
+  OPTIONAL { ?investigator foaf:name ?name . }
+  OPTIONAL { ?investigator gn:address ?address . }
+  OPTIONAL { ?investigator gn:city ?city . }
+  OPTIONAL { ?investigator gn:state ?state . }
+  OPTIONAL { ?investigator gn:zipCode ?zip . }
+  OPTIONAL { ?investigator foaf:phone ?phone . }
+  OPTIONAL { ?investigator foaf:mbox ?email . }
+  OPTIONAL { ?investigator gn:country ?country . }
+  OPTIONAL { ?investigator foaf:homepage ?homepage . }
+}
+    """
+        ).substitute(accession_id=accession_id)
+    )[0]
+    result["investigators"] = Just({
+        key: value.bind(lambda a: a["value"])
+                for key, value in investigator_query_result.items()
+    })
+    return result
author	Munyoki Kilyungi	2022-10-12 15:50:33 +0300
committer	BonfaceKilz	2022-11-02 17:36:22 +0300
commit	6e58cb5ffe3b145aa6ba9b7a54672e7a7f18a0b7 (patch)
tree	0a7d5d2bbd5774b91188cf90f3c58c9be1d77c31 /gn3/db
parent	2e85b5c7c4be181182f263fcb644abd173be83c1 (diff)
download	genenetwork3-6e58cb5ffe3b145aa6ba9b7a54672e7a7f18a0b7.tar.gz