about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- gn3/api/general.py 7
-rw-r--r-- gn3/db/datasets.py 103
-rw-r--r-- gn3/settings.py 3
-rw-r--r-- guix-system.scm 120
-rw-r--r-- guix.scm 2
5 files changed, 233 insertions, 2 deletions
diff --git a/gn3/api/general.py b/gn3/api/general.py
index 69ec343..e0bfc81 100644
--- a/gn3/api/general.py
+++ b/gn3/api/general.py
@@ -7,7 +7,7 @@ from flask import request
 
 from gn3.fs_helpers import extract_uploaded_file
 from gn3.commands import run_cmd
-
+from gn3.db import datasets
 
 general = Blueprint("general", __name__)
 
@@ -68,3 +68,8 @@ def run_r_qtl(geno_filestr, pheno_filestr):
     cmd = (f"Rscript {rqtl_wrapper} "
            f"{geno_filestr} {pheno_filestr}")
     return jsonify(run_cmd(cmd)), 201
+
+@general.route("/dataset/<accession_id>")
+def dataset_metadata(accession_id):
+    """Return info as JSON for dataset ACCESSION_ID (info is None if invalid or not found)."""
+    return jsonify(datasets.dataset_metadata(accession_id))
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index 6c328f5..e4c779a 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -1,7 +1,11 @@
 """
 This module contains functions relating to specific trait dataset manipulation
 """
-from typing import Any
+import re
+from string import Template
+from typing import Any, Dict, List, Optional
+from SPARQLWrapper import JSON, SPARQLWrapper
+from gn3.settings import SPARQL_ENDPOINT
 
 def retrieve_probeset_trait_dataset_name(
         threshold: int, name: str, connection: Any):
@@ -289,3 +293,100 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn):
         **dataset_fns[trait_type](),
         **group
     }
+
+def sparql_query(query: str) -> List[Dict[str, Any]]:
+    """Run QUERY on SPARQL_ENDPOINT and return its list of result bindings."""
+    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    return sparql.queryAndConvert()['results']['bindings']
+
+def dataset_metadata(accession_id: str) -> Optional[Dict[str, Any]]:
+    """Return info about dataset with ACCESSION_ID.
+
+    Return None if ACCESSION_ID is not of the form GN<digits> or if any
+    dataset query has no results. 'investigator' may be an empty dict.
+    """
+    # TODO: This function doesn't yet return the names of the actual dataset files.
+    # Check accession_id to protect against query injection.
+    if not re.fullmatch(r'GN\d+', accession_id, re.ASCII):
+        return None
+    # KLUDGE: We split the SPARQL query because virtuoso is very slow on a
+    # single large query.
+    queries = ["""
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?dataset_group ?status ?title ?geo_series
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset ;
+           gn:name ?name .
+  OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } .
+  # FIXME: gn:datasetStatus should not be optional. But, some records don't
+  # have it.
+  OPTIONAL { ?dataset gn:datasetStatus ?status } .
+  OPTIONAL { ?dataset gn:title ?title } .
+  OPTIONAL { ?dataset gn:geoSeries ?geo_series } .
+}
+""",
+             """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset ;
+           gn:normalization / gn:name ?normalization_name ;
+           gn:datasetOfSpecies / gn:menuName ?species_name ;
+           gn:datasetOfInbredSet / gn:name ?inbred_set_name .
+  OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } .
+  OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } .
+}
+""",
+             """
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform
+       ?about_data_processing ?notes ?experiment_design ?contributors
+       ?citation ?acknowledgment
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset .
+  OPTIONAL { ?dataset gn:specifics ?specifics . }
+  OPTIONAL { ?dataset gn:summary ?summary . }
+  OPTIONAL { ?dataset gn:aboutCases ?about_cases . }
+  OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . }
+  OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . }
+  OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . }
+  OPTIONAL { ?dataset gn:notes ?notes . }
+  OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . }
+  OPTIONAL { ?dataset gn:contributors ?contributors . }
+  OPTIONAL { ?dataset gn:citation ?citation . }
+  OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . }
+}
+"""]
+    result: Dict[str, Any] = {'accession_id': accession_id, 'investigator': {}}
+    for query in queries:
+        rows = sparql_query(Template(query).substitute(accession_id=accession_id))
+        if not rows:
+            return None
+        result.update({key: value['value'] for key, value in rows[0].items()})
+    investigator_rows = sparql_query(Template("""
+PREFIX gn: <http://genenetwork.org/>
+SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage
+WHERE {
+  ?dataset gn:accessionId "$accession_id" ;
+           rdf:type gn:dataset ;
+           gn:datasetOfInvestigator ?investigator .
+  OPTIONAL { ?investigator foaf:name ?name . }
+  OPTIONAL { ?investigator gn:address ?address . }
+  OPTIONAL { ?investigator gn:city ?city . }
+  OPTIONAL { ?investigator gn:state ?state . }
+  OPTIONAL { ?investigator gn:zipCode ?zip . }
+  OPTIONAL { ?investigator foaf:phone ?phone . }
+  OPTIONAL { ?investigator foaf:mbox ?email . }
+  OPTIONAL { ?investigator gn:country ?country . }
+  OPTIONAL { ?investigator foaf:homepage ?homepage . }
+}
+""").substitute(accession_id=accession_id))
+    # A dataset without an investigator yields no rows; leave the dict empty.
+    if investigator_rows:
+        result['investigator'] = {k: v['value'] for k, v in investigator_rows[0].items()}
+    return result
diff --git a/gn3/settings.py b/gn3/settings.py
index 0ac6698..c945fbf 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -13,6 +13,9 @@ REDIS_JOB_QUEUE = "GN3::job-queue"
 TMPDIR = os.environ.get("TMPDIR", tempfile.gettempdir())
 RQTL_WRAPPER = "rqtl_wrapper.R"
 
+# SPARQL endpoint; override with the SPARQL_ENDPOINT environment variable.
+SPARQL_ENDPOINT = os.environ.get("SPARQL_ENDPOINT", "http://localhost:8891/sparql")
+
 # SQL confs
 SQL_URI = os.environ.get(
     "SQL_URI", "mysql://webqtlout:webqtlout@localhost/db_webqtl")
diff --git a/guix-system.scm b/guix-system.scm
new file mode 100644
index 0000000..c154d01
--- /dev/null
+++ b/guix-system.scm
@@ -0,0 +1,120 @@
+(use-modules (gnu)
+             (gn services databases)
+             (gnu packages admin)
+             (gnu services shepherd)
+             (guix derivations)
+             (guix monads)
+             (guix profiles)
+             (guix search-paths)
+             (guix records)
+             (guix store)
+             (ice-9 match))
+
+;; Result of loading guix.scm; the default package of the service below.
+(define genenetwork3 (load "guix.scm"))
+
+(define (packages->profile packages)
+  "Build a profile containing PACKAGES and return the file name of its
+output in the store."
+  (with-store store
+    (run-with-store store
+      (mlet %store-monad
+          ((drv (profile-derivation (packages->manifest packages))))
+        (mbegin %store-monad
+          (built-derivations (list drv))
+          (return (derivation->output-path drv)))))))
+
+(define (packages->environment-variables packages)
+  "Return the environment variables of a profile with PACKAGES as an
+association list mapping variable names to their values."
+  (let ((search-paths (profile-search-paths (packages->profile packages))))
+    (map (match-lambda
+           ((spec . value)
+            (cons (search-path-specification-variable spec) value)))
+         search-paths)))
+
+(define (packages->profile-environment packages)
+  "Return the environment of a profile with PACKAGES as a list of
+NAME=VALUE strings suitable as input to the environ function."
+  (let ((search-paths (profile-search-paths (packages->profile packages))))
+    (map (match-lambda
+           ((spec . value)
+            (string-append (search-path-specification-variable spec)
+                           "=" value)))
+         search-paths)))
+
+(define-record-type* <genenetwork3-configuration>
+  genenetwork3-configuration make-genenetwork3-configuration
+  genenetwork3-configuration?
+  (package genenetwork3-configuration-package ; package whose profile the service uses
+           (default genenetwork3))
+  (port genenetwork3-configuration-port ; port gunicorn binds to on 127.0.0.1
+        (default 5000)))
+
+(define %genenetwork3-accounts ; group and user the gunicorn process runs as
+  (list (user-group (name "genenetwork3")
+                    (system? #t))
+        (user-account
+         (name "genenetwork3")
+         (group "genenetwork3")
+         (system? #t)
+         (comment "GeneNetwork 3 user")
+         (home-directory "/var/empty")
+         (shell (file-append shadow "/sbin/nologin"))))) ; no login shell
+
+;; FIXME: Factorize this service into two. We should have a gunicorn
+;; service that is extended by the genenetwork service. This way, the
+;; app is better decoupled from the deployment.
+(define genenetwork3-shepherd-service ; <genenetwork3-configuration> -> shepherd-service
+  (match-lambda
+    (($ <genenetwork3-configuration> package port)
+     (shepherd-service
+      (documentation "Run GeneNetwork 3.")
+      (provision '(genenetwork3))
+      (requirement '(networking virtuoso)) ; services that must be up first
+      (start #~(begin
+                 ;; Reference the profile of PACKAGE (built when this file is evaluated).
+                 #$(packages->profile (list package))
+                 ;; Start gunicorn on 127.0.0.1:PORT, serving gn3.app:create_app().
+                 (make-forkexec-constructor
+                  (list #$(file-append gunicorn "/bin/gunicorn")
+                        "-b" #$(string-append "127.0.0.1:" (number->string port))
+                        "gn3.app:create_app()")
+                  #:user "genenetwork3"
+                  #:group "genenetwork3"
+                  #:environment-variables ; NAME=VALUE strings from the profile's search paths
+                  '#$(packages->profile-environment (list package)))))
+      (stop #~(make-kill-destructor))))))
+
+(define genenetwork3-service-type
+  (service-type
+   (name 'genenetwork3)
+   (description "Run GeneNetwork 3.")
+   (extensions
+    (list (service-extension account-service-type ; contributes the genenetwork3 user and group
+                             (const %genenetwork3-accounts))
+          (service-extension shepherd-root-service-type ; contributes the shepherd service
+                             (compose list genenetwork3-shepherd-service))))
+   (default-value (genenetwork3-configuration)))) ; every field at its default
+
+(operating-system ; system declaration running virtuoso and genenetwork3
+  (host-name "genenetwork3")
+  (timezone "UTC")
+  (locale "en_US.utf8")
+  (bootloader (bootloader-configuration
+               (bootloader grub-bootloader)
+               (targets (list "/dev/sdX")))) ; NOTE(review): looks like a placeholder device; confirm
+  (file-systems (cons (file-system
+                        (device "root") ; NOTE(review): presumably a placeholder too; confirm
+                        (mount-point "/")
+                        (type "ext4"))
+                      %base-file-systems))
+  (users %base-user-accounts)
+  (packages %base-packages)
+  (services (cons* (service virtuoso-service-type
+                            (virtuoso-configuration
+                             (http-server-port 8891))) ; port in gn3/settings.py's SPARQL_ENDPOINT URL
+                   (service genenetwork3-service-type
+                            (genenetwork3-configuration
+                             (port 5000))) ; same value as the record's default
+                   %base-services)))
diff --git a/guix.scm b/guix.scm
index 9bf23c8..e2e49ab 100644
--- a/guix.scm
+++ b/guix.scm
@@ -42,6 +42,7 @@
              (gnu packages python-web)
              (gnu packages python-xyz)
              (gnu packages python-science)
+             (gnu packages rdf)
              ((guix build utils) #:select (with-directory-excursion))
              (guix build-system python)
              (guix gexp)
@@ -78,6 +79,7 @@
                        ("python-redis" ,python-redis)
                        ("python-requests" ,python-requests)
                        ("python-scipy" ,python-scipy)
+                       ("python-sparqlwrapper" ,python-sparqlwrapper)
                        ("r-optparse" ,r-optparse)
                        ("r-qtl" ,r-qtl)
                        ("r-rjson" ,r-rjson)