From 0a29e362bd8627b9346e2260a14c81a46e2a76d3 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Fri, 19 Nov 2021 16:42:00 +0530 Subject: Implement dataset metadata API endpoint. * guix.scm: Import (gnu packages rdf). (genenetwork3)[propagated-inputs]: Add python-sparqlwrapper. * gn3/settings.py (SPARQL_ENDPOINT): New variable. * gn3/api/general.py: Import datasets from gn3.db. (dataset_metadata): New API endpoint. * gn3/db/datasets.py: Import re, Template from string, Dict and Optional from typing, JSON and SPARQLWrapper from SPARQLWrapper, SPARQL_ENDPOINT from gn3.settings. (sparql_query, dataset_metadata): New functions. --- gn3/api/general.py | 7 +++- gn3/db/datasets.py | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++- gn3/settings.py | 3 ++ guix.scm | 2 ++ 4 files changed, 113 insertions(+), 2 deletions(-) diff --git a/gn3/api/general.py b/gn3/api/general.py index 69ec343..e0bfc81 100644 --- a/gn3/api/general.py +++ b/gn3/api/general.py @@ -7,7 +7,7 @@ from flask import request from gn3.fs_helpers import extract_uploaded_file from gn3.commands import run_cmd - +from gn3.db import datasets general = Blueprint("general", __name__) @@ -68,3 +68,8 @@ def run_r_qtl(geno_filestr, pheno_filestr): cmd = (f"Rscript {rqtl_wrapper} " f"{geno_filestr} {pheno_filestr}") return jsonify(run_cmd(cmd)), 201 + +@general.route("/dataset/<accession_id>") +def dataset_metadata(accession_id): + """Return info as JSON for dataset with ACCESSION_ID.""" + return jsonify(datasets.dataset_metadata(accession_id)) diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py index 6c328f5..e4c779a 100644 --- a/gn3/db/datasets.py +++ b/gn3/db/datasets.py @@ -1,7 +1,11 @@ """ This module contains functions relating to specific trait dataset manipulation """ -from typing import Any +import re +from string import Template +from typing import Any, Dict, Optional +from SPARQLWrapper import JSON, SPARQLWrapper +from gn3.settings import SPARQL_ENDPOINT def retrieve_probeset_trait_dataset_name( threshold: int, 
name: str, connection: Any): @@ -289,3 +293,103 @@ def retrieve_trait_dataset(trait_type, trait, threshold, conn): **dataset_fns[trait_type](), **group } + +def sparql_query(query: str) -> Dict[str, Any]: + """Run a SPARQL query and return the bound variables.""" + sparql = SPARQLWrapper(SPARQL_ENDPOINT) + sparql.setQuery(query) + sparql.setReturnFormat(JSON) + return sparql.queryAndConvert()['results']['bindings'] + +def dataset_metadata(accession_id: str) -> Optional[Dict[str, Any]]: + """Return info about dataset with ACCESSION_ID, or None if it is malformed or no such dataset is found.""" + # Check accession_id to protect against query injection. + # TODO: This function doesn't yet return the names of the actual dataset files. + pattern = re.compile(r'GN\d+', re.ASCII) + if not pattern.fullmatch(accession_id): + return None + # KLUDGE: We split the SPARQL query because virtuoso is very slow on a + # single large query. + queries = [""" +PREFIX gn: <http://genenetwork.org/> +SELECT ?name ?dataset_group ?status ?title ?geo_series +WHERE { + ?dataset gn:accessionId "$accession_id" ; + rdf:type gn:dataset ; + gn:name ?name . + OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } . + # FIXME: gn:datasetStatus should not be optional. But, some records don't + # have it. + OPTIONAL { ?dataset gn:datasetStatus ?status } . + OPTIONAL { ?dataset gn:title ?title } . + OPTIONAL { ?dataset gn:geoSeries ?geo_series } . +} +""", + """ +PREFIX gn: <http://genenetwork.org/> +SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name +WHERE { + ?dataset gn:accessionId "$accession_id" ; + rdf:type gn:dataset ; + gn:normalization / gn:name ?normalization_name ; + gn:datasetOfSpecies / gn:menuName ?species_name ; + gn:datasetOfInbredSet / gn:name ?inbred_set_name . + OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } . + OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } . 
+} +""", + """ +PREFIX gn: <http://genenetwork.org/> +SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform + ?about_data_processing ?notes ?experiment_design ?contributors + ?citation ?acknowledgment +WHERE { + ?dataset gn:accessionId "$accession_id" ; + rdf:type gn:dataset . + OPTIONAL { ?dataset gn:specifics ?specifics . } + OPTIONAL { ?dataset gn:summary ?summary . } + OPTIONAL { ?dataset gn:aboutCases ?about_cases . } + OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . } + OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . } + OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . } + OPTIONAL { ?dataset gn:notes ?notes . } + OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . } + OPTIONAL { ?dataset gn:contributors ?contributors . } + OPTIONAL { ?dataset gn:citation ?citation . } + OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . } +} +"""] + result = {'accession_id': accession_id, + 'investigator': {}} + query_result = {} + for query in queries: + if sparql_result := sparql_query(Template(query).substitute(accession_id=accession_id)): + query_result.update(sparql_result[0]) + else: + return None + for key, value in query_result.items(): + result[key] = value['value'] + investigator_query_result = sparql_query(Template(""" +PREFIX gn: <http://genenetwork.org/> +SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage +WHERE { + ?dataset gn:accessionId "$accession_id" ; + rdf:type gn:dataset ; + gn:datasetOfInvestigator ?investigator . + OPTIONAL { ?investigator foaf:name ?name . } + OPTIONAL { ?investigator gn:address ?address . } + OPTIONAL { ?investigator gn:city ?city . } + OPTIONAL { ?investigator gn:state ?state . } + OPTIONAL { ?investigator gn:zipCode ?zip . } + OPTIONAL { ?investigator foaf:phone ?phone . } + OPTIONAL { ?investigator foaf:mbox ?email . } + OPTIONAL { ?investigator gn:country ?country . } + OPTIONAL { ?investigator foaf:homepage ?homepage . 
} +} +""").substitute(accession_id=accession_id)) + # gn:datasetOfInvestigator is required by the query above, so a dataset without + # one yields no bindings; keep 'investigator' empty instead of indexing a missing row. + if investigator_query_result: + for key, value in investigator_query_result[0].items(): + result['investigator'][key] = value['value'] + return result diff --git a/gn3/settings.py b/gn3/settings.py index 0ac6698..c945fbf 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -13,6 +13,9 @@ REDIS_JOB_QUEUE = "GN3::job-queue" TMPDIR = os.environ.get("TMPDIR", tempfile.gettempdir()) RQTL_WRAPPER = "rqtl_wrapper.R" +# SPARQL endpoint +SPARQL_ENDPOINT = "http://localhost:8891/sparql" + # SQL confs SQL_URI = os.environ.get( "SQL_URI", "mysql://webqtlout:webqtlout@localhost/db_webqtl") diff --git a/guix.scm b/guix.scm index 9bf23c8..e2e49ab 100644 --- a/guix.scm +++ b/guix.scm @@ -42,6 +42,7 @@ (gnu packages python-web) (gnu packages python-xyz) (gnu packages python-science) + (gnu packages rdf) ((guix build utils) #:select (with-directory-excursion)) (guix build-system python) (guix gexp) @@ -78,6 +79,7 @@ ("python-redis" ,python-redis) ("python-requests" ,python-requests) ("python-scipy" ,python-scipy) + ("python-sparqlwrapper" ,python-sparqlwrapper) ("r-optparse" ,r-optparse) ("r-qtl" ,r-qtl) ("r-rjson" ,r-rjson) -- cgit v1.2.3 From 63923aee24a5f605961821ea31e64213f885db74 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Thu, 2 Dec 2021 17:02:57 +0530 Subject: Add a Guix operating-system configuration. * guix-system.scm: New file. 
--- guix-system.scm | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 guix-system.scm diff --git a/guix-system.scm b/guix-system.scm new file mode 100644 index 0000000..c154d01 --- /dev/null +++ b/guix-system.scm @@ -0,0 +1,120 @@ +(use-modules (gnu) + (gn services databases) + (gnu packages admin) + (gnu services shepherd) + (guix derivations) + (guix monads) + (guix profiles) + (guix search-paths) + (guix records) + (guix store) + (ice-9 match)) + +(define genenetwork3 + (load "guix.scm")) + +(define (packages->profile packages) + "Return profile with PACKAGES." + (with-store store + (run-with-store store + (mlet* %store-monad ((prof-drv (profile-derivation + (packages->manifest packages))) + (profile -> (derivation->output-path prof-drv))) + (mbegin %store-monad + (built-derivations (list prof-drv)) + (return profile)))))) + +(define (packages->environment-variables packages) + "Return environment variables of a profile with PACKAGES. Return value is an +association list mapping the names of environment variables to their values." + (map (match-lambda + ((search-path . value) + (cons (search-path-specification-variable search-path) + value))) + (profile-search-paths (packages->profile packages)))) + +(define (packages->profile-environment packages) + "Return environment of a profile with PACKAGES. Return value is a +list of environment variables suitable as input to the environ +function." + (map (match-lambda + ((search-path . value) + (string-append (search-path-specification-variable search-path) + "=" value))) + (profile-search-paths (packages->profile packages)))) + +(define-record-type* <genenetwork3-configuration> + genenetwork3-configuration make-genenetwork3-configuration + genenetwork3-configuration? + (package genenetwork3-configuration-package + (default genenetwork3)) + (port genenetwork3-configuration-port + (default 5000))) + +(define %genenetwork3-accounts + (list (user-group (name "genenetwork3") + (system? 
#t)) + (user-account + (name "genenetwork3") + (group "genenetwork3") + (system? #t) + (comment "GeneNetwork 3 user") + (home-directory "/var/empty") + (shell (file-append shadow "/sbin/nologin"))))) + +;; FIXME: Factorize this service into two. We should have a gunicorn +;; service that is extended by the genenetwork service. This way, the +;; app is better decoupled from the deployment. +(define genenetwork3-shepherd-service + (match-lambda + (($ <genenetwork3-configuration> package port) + (shepherd-service + (documentation "Run GeneNetwork 3.") + (provision '(genenetwork3)) + (requirement '(networking virtuoso)) + (start #~(begin + ;; Reference the profile. + #$(packages->profile (list package)) + ;; Start the gunicorn process. + (make-forkexec-constructor + (list #$(file-append gunicorn "/bin/gunicorn") + "-b" #$(string-append "127.0.0.1:" (number->string port)) + "gn3.app:create_app()") + #:user "genenetwork3" + #:group "genenetwork3" + #:environment-variables + '#$(packages->profile-environment (list package))))) + (stop #~(make-kill-destructor)))))) + +(define genenetwork3-service-type + (service-type + (name 'genenetwork3) + (description "Run GeneNetwork 3.") + (extensions + (list (service-extension account-service-type + (const %genenetwork3-accounts)) + (service-extension shepherd-root-service-type + (compose list genenetwork3-shepherd-service)))) + (default-value (genenetwork3-configuration)))) + +(operating-system + (host-name "genenetwork3") + (timezone "UTC") + (locale "en_US.utf8") + (bootloader (bootloader-configuration + (bootloader grub-bootloader) + (targets (list "/dev/sdX")))) + (file-systems (cons (file-system + (device "root") + (mount-point "/") + (type "ext4")) + %base-file-systems)) + (users %base-user-accounts) + (packages %base-packages) + (services (cons* (service virtuoso-service-type + (virtuoso-configuration + (http-server-port 8891))) + (service genenetwork3-service-type + (genenetwork3-configuration + (port 5000))) + %base-services))) -- cgit v1.2.3