about summary refs log tree commit diff
path: root/gn3/db/datasets.py
diff options
context:
space:
mode:
Diffstat (limited to 'gn3/db/datasets.py')
-rw-r--r-- gn3/db/datasets.py 152
1 files changed, 112 insertions, 40 deletions
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 6c328f5..b19db53 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -1,7 +1,11 @@
 """
 This module contains functions relating to specific trait dataset manipulation
 """
-from typing import Any
+import re
+from string import Template
+from typing import Any, Dict, List, Optional
+from SPARQLWrapper import JSON, SPARQLWrapper
+from gn3.settings import SPARQL_ENDPOINT
 
 def retrieve_probeset_trait_dataset_name(
         threshold: int, name: str, connection: Any):
@@ -22,10 +26,13 @@ def retrieve_probeset_trait_dataset_name(
                 "threshold": threshold,
                 "name": name
             })
-        return dict(zip(
-            ["dataset_id", "dataset_name", "dataset_fullname",
-             "dataset_shortname", "dataset_datascale"],
-            cursor.fetchone()))
+        res = cursor.fetchone()
+        if res:
+            return dict(zip(
+                ["dataset_id", "dataset_name", "dataset_fullname",
+                 "dataset_shortname", "dataset_datascale"],
+                res))
+        return {"dataset_id": None, "dataset_name": name, "dataset_fullname": name}
 
 def retrieve_publish_trait_dataset_name(
         threshold: int, name: str, connection: Any):
@@ -75,33 +82,8 @@ def retrieve_geno_trait_dataset_name(
              "dataset_shortname"],
             cursor.fetchone()))
 
-def retrieve_temp_trait_dataset_name(
-        threshold: int, name: str, connection: Any):
-    """
-    Get the ID, DataScale and various name formats for a `Temp` trait.
-    """
-    query = (
-        "SELECT Id, Name, FullName, ShortName "
-        "FROM TempFreeze "
-        "WHERE "
-        "public > %(threshold)s "
-        "AND "
-        "(Name = %(name)s OR FullName = %(name)s OR ShortName = %(name)s)")
-    with connection.cursor() as cursor:
-        cursor.execute(
-            query,
-            {
-                "threshold": threshold,
-                "name": name
-            })
-        return dict(zip(
-            ["dataset_id", "dataset_name", "dataset_fullname",
-             "dataset_shortname"],
-            cursor.fetchone()))
-
 def retrieve_dataset_name(
-        trait_type: str, threshold: int, trait_name: str, dataset_name: str,
-        conn: Any):
+        trait_type: str, threshold: int, dataset_name: str, conn: Any):
     """
     Retrieve the name of a trait given the trait's name
 
@@ -113,9 +95,7 @@ def retrieve_dataset_name(
         "ProbeSet": retrieve_probeset_trait_dataset_name,
         "Publish": retrieve_publish_trait_dataset_name,
         "Geno": retrieve_geno_trait_dataset_name,
-        "Temp": retrieve_temp_trait_dataset_name}
-    if trait_type == "Temp":
-        return retrieve_temp_trait_dataset_name(threshold, trait_name, conn)
+        "Temp": lambda threshold, dataset_name, conn: {}}
     return fn_map[trait_type](threshold, dataset_name, conn)
 
 
@@ -203,7 +183,6 @@ def retrieve_temp_trait_dataset():
     """
     Retrieve the dataset that relates to `Temp` traits
     """
-    # pylint: disable=[C0330]
     return {
         "searchfield": ["name", "description"],
         "disfield": ["name", "description"],
@@ -217,7 +196,6 @@ def retrieve_geno_trait_dataset():
     """
     Retrieve the dataset that relates to `Geno` traits
     """
-    # pylint: disable=[C0330]
     return {
         "searchfield": ["name", "chr"],
 	"disfield": ["name", "chr", "mb", "source2", "sequence"],
@@ -228,7 +206,6 @@ def retrieve_publish_trait_dataset():
     """
     Retrieve the dataset that relates to `Publish` traits
     """
-    # pylint: disable=[C0330]
     return {
         "searchfield": [
             "name", "post_publication_description", "abstract", "title",
@@ -247,7 +224,6 @@ def retrieve_probeset_trait_dataset():
     """
     Retrieve the dataset that relates to `ProbeSet` traits
     """
-    # pylint: disable=[C0330]
     return {
         "searchfield": [
             "name", "description", "probe_target_description", "symbol",
@@ -278,8 +254,7 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
         "dataset_id": None,
         "dataset_name": trait["db"]["dataset_name"],
         **retrieve_dataset_name(
-            trait_type, threshold, trait["trait_name"],
-            trait["db"]["dataset_name"], conn)
+            trait_type, threshold, trait["db"]["dataset_name"], conn)
     }
     group = retrieve_group_fields(
         trait_type, trait["trait_name"], dataset_name_info, conn)
@@ -289,3 +264,100 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
         **dataset_fns[trait_type](),
         **group
     }
+
+def sparql_query(query: str) -> List[Dict[str, Any]]:
+    """Send QUERY to SPARQL_ENDPOINT and return the result bindings."""
+    client = SPARQLWrapper(SPARQL_ENDPOINT)
+    client.setReturnFormat(JSON)
+    client.setQuery(query)
+    return client.queryAndConvert()['results']['bindings']
+
+def dataset_metadata(accession_id: str) -> Optional[Dict[str, Any]]:
+    """Return info about dataset with ACCESSION_ID.
+
+    Return None if ACCESSION_ID is not of the form GN<digits> or if any of the
+    three metadata queries finds nothing; 'investigator' is {} if none is found.
+    """
+    # Check accession_id to protect against query injection.
+    # TODO: This function doesn't yet return the names of the actual dataset files.
+    if not re.fullmatch(r'GN\d+', accession_id, re.ASCII):
+        return None
+    # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
+    # single large query.
+    queries = ["""
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?dataset_group ?status ?title ?geo_series
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset ;
+           gn:name ?name .
+  OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
+  # FIXME: gn:datasetStatus should not be optional. But, some records don't
+  # have it.
+  OPTIONAL { ?dataset gn:datasetStatus ?status } .
+  OPTIONAL { ?dataset gn:title ?title } .
+  OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
+}
+""",
+               """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset ;
+           gn:normalization / gn:name ?normalization_name ;
+           gn:datasetOfSpecies / gn:menuName ?species_name ;
+           gn:datasetOfInbredSet / gn:name ?inbred_set_name .
+  OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
+  OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
+}
+""",
+               """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
+       ?about_data_processing ?notes ?experiment_design ?contributors
+       ?citation ?acknowledgment
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset .
+  OPTIONAL { ?dataset gn:specifics ?specifics . }
+  OPTIONAL { ?dataset gn:summary ?summary . }
+  OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
+  OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
+  OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
+  OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
+  OPTIONAL { ?dataset gn:notes ?notes . }
+  OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
+  OPTIONAL { ?dataset gn:contributors ?contributors . }
+  OPTIONAL { ?dataset gn:citation ?citation . }
+  OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
+}
+"""]
+    result: Dict[str, Any] = {'accession_id': accession_id, 'investigator': {}}
+    for query in queries:
+        rows = sparql_query(Template(query).substitute(accession_id=accession_id))
+        if not rows:
+            return None
+        result.update({key: value['value'] for key, value in rows[0].items()})
+    investigator_rows = sparql_query(Template("""
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset ;
+           gn:datasetOfInvestigator ?investigator .
+  OPTIONAL { ?investigator foaf:name ?name . }
+  OPTIONAL { ?investigator gn:address ?address . }
+  OPTIONAL { ?investigator gn:city ?city . }
+  OPTIONAL { ?investigator gn:state ?state . }
+  OPTIONAL { ?investigator gn:zipCode ?zip . }
+  OPTIONAL { ?investigator foaf:phone ?phone . }
+  OPTIONAL { ?investigator foaf:mbox ?email . }
+  OPTIONAL { ?investigator gn:country ?country . }
+  OPTIONAL { ?investigator foaf:homepage ?homepage . }
+}
+""").substitute(accession_id=accession_id))
+    if investigator_rows:  # No linked investigator means no rows: keep {}.
+        result['investigator'] = {
+            key: value['value'] for key, value in investigator_rows[0].items()}
+    return result