#!

  Module for handling SPARQL primitives.

  Note that GN queries should go into gn/db/sources - this is currently not the case.

!#

(define-module (gn db sparql)
  #:use-module (json)
  #:use-module (ice-9 match)
  #:use-module (ice-9 format)
  #:use-module (ice-9 iconv)
  #:use-module (ice-9 receive)
  #:use-module (ice-9 string-fun)
  #:use-module (web client)
  #:use-module (web request)
  #:use-module (web uri)
  #:use-module (gn cache memoize)
  #:use-module (gn db sources wikidata)
  #:use-module (web gn-uri)

  #:export (memo-sparql-species
            memo-sparql-species-meta
            sparql-species-meta
            sparql-groups-meta
            sparql-group-info
            memo-sparql-wd-species-info
            compile-species
            compile-groups-meta
            get-rows
            tsv->scm
            strip-lang
            make-table
            make-pairs))

(define (strip-lang s)
  "Strip quotes and language tag (@en) from RDF entries.  Strings that
are neither quoted nor quoted-with-@en are returned unchanged."
  (list->string
   (match (string->list s)
     [(#\" rest ... #\") rest]
     [(#\" rest ... #\" #\@ #\e #\n) rest]
     [rest rest])))

;; URL of the GeneNetwork SPARQL endpoint.
(define (gn-sparql-endpoint-url)
  "https://sparql.genenetwork.org/sparql")

;; URL of the Wikidata SPARQL endpoint.
(define (wd-sparql-endpoint-url)
  "https://query.wikidata.org/sparql")

(define (gn-sparql-prefix query)
  "Prepend the GN namespace PREFIX declarations to QUERY.  rdfs: is
declared because the queries in this module use rdfs:label."
  (string-append "
PREFIX gn: <http://genenetwork.org/id/>
PREFIX gnt: <http://genenetwork.org/term/>
PREFIX gnc: <http://genenetwork.org/category/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
" query))

;; Wikidata property used for the taxon name.
(define (wdt-taxon-name)
  "wdt:P225")

(define (sparql-exec endpoint-url query)
  "Execute raw SPARQL query returning response as a UTF8 string.  The
result format requested from the endpoint is application/sparql-results+json."
  (receive (response body)
      (http-request (string-append endpoint-url
                                   "?default-graph-uri=&query="
                                   (uri-encode query)
                                   "&format=application%2Fsparql-results%2Bjson"))
    ;; The web client hands back either a bytevector or an already
    ;; decoded string; only the former needs converting.
    (if (string? body)
        body
        (bytevector->string body "UTF-8"))))

(define (sparql-tsv endpoint-url query)
  "Execute raw SPARQL query asking for tab-separated values and return
the response body, e.g.

  (tsv->scm (sparql-tsv (wd-sparql-endpoint-url)
                        \"SELECT DISTINCT * where { wd:Q158695 wdt:P225 ?o . } limit 5\"))
"
  ;; GET /sparql?query=SELECT%20DISTINCT%20%2A%20where%20%7B%0A%20%20wd%3AQ158695%20wdt%3AP225%20%3Fo%20.%0A%7D%20limit%205 HTTP/2
  (receive (response body)
      (http-get (string-append endpoint-url "?query=" (uri-encode query))
                #:headers '((Accept . "text/tab-separated-values")
                            (user-agent . "curl/7.74.0")))
    body))

(define (unpack field response)
  "Helper to get nested JSON field from SPARQL response.  FIELD must be
present in RESPONSE; a missing field is an error."
  (cdr (assoc field response)))

(define (sparql-names response)
  "Helper to get the names part of a SPARQL query"
  (unpack "vars" (unpack "head" response)))

(define (sparql-results response)
  "Helper to get the results part of a SPARQL query"
  (unpack "bindings" (unpack "results" response)))

(define (sparql-scm endpoint-url query)
  "Return dual S-exp 'resultset' of varnames and results.  QUERY is
prefixed with the GN namespace declarations before it is sent."
  (let ((response (json-string->scm
                   (sparql-exec endpoint-url (gn-sparql-prefix query)))))
    (values (sparql-names response) (sparql-results response))))

(define (tsv->scm text)
  "Split a TSV string into lists of fields.  Returns two values: the
list of names (header) and the list of rows.  Empty lines are ignored;
text without any line yields two empty lists."
  (let ([lst (map (lambda (f) (string-split f #\tab))
                  (delete "" (string-split text #\newline)))])
    (if (null? lst)
        (values '() '())
        (values (car lst) (cdr lst)))))

#!

(define-values (names res) (sparql-species-meta))
(define table (get-rows names res))
(define recs '())
(define h (compile-species recs table))
(assoc "http://genenetwork.org/species_drosophila_melanogaster" h)
(assoc-ref h "http://genenetwork.org/id/Drosophila_melanogaster")
(define d (car h))
(assoc-ref (list d) "http://genenetwork.org/species_drosophila_melanogaster")

(scm->json #(1 (("2" . 3))))
;; [1,{"2":3}]
(scm->json #("http://genenetwork.org/species_drosophila_melanogaster" (("http://genenetwork.org/menuName" . "Drosophila") ("http://genenetwork.org/binomialName" . "Drosophila melanogaster") )))
;; ["http://genenetwork.org/species_drosophila_melanogaster",{"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"}]

l
;; (("http://genenetwork.org/menuName" "Drosophila") ("http://genenetwork.org/name" "Drosophila") ("http://genenetwork.org/binomialName" "Drosophila melanogaster"))
(scm->json (map (lambda (i) (cons (car i) (car (cdr i)))) l))
;; {"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/name":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"}

curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query="SELECT DISTINCT * where {
  wd:Q158695 wdt:P225 ?o .
} limit 5"
{
  "head" : {
    "vars" : [ "o" ]
  },
  "results" : {
    "bindings" : [ {
      "o" : {
        "type" : "literal",
        "value" : "Arabidopsis thaliana"
      }
    } ]
  }
}

!#

(define (sparql-wd-species-info species)
  "Returns wikidata entry for species, e.g.:

   (sparql-wd-species-info \"Q158695\")

generates something like

SELECT DISTINCT * where {
  wd:Q158695 wdt:P225 ?o .
} limit 10
"
  (sparql-tsv (wd-sparql-endpoint-url) (string-append "
SELECT DISTINCT ?taxon ?ncbi ?descr where {
    wd:" species " " (wdt-taxon-name) " ?taxon ;
      wdt:P685 ?ncbi ;
      schema:description ?descr .
    ?species wdt:P685 ?ncbi .
    FILTER (lang(?descr)='en')
} limit 5
")))

;; Memoized variant of sparql-wd-species-info (memoize comes from
;; (gn cache memoize)).
(define memo-sparql-wd-species-info
  (memoize sparql-wd-species-info))

#!

gn:Mus_musculus rdf:type gnc:species .
gn:Mus_musculus gnt:name "Mouse" .
gn:Mus_musculus rdfs:label "Mouse (Mus musculus, mm10)" .
gn:Mus_musculus gnt:binomialName "Mus musculus" .
gn:Mus_musculus gnt:family "Vertebrates" .
gn:Mus_musculus gnt:organism taxon:10090 .

!#

(define (sparql-species)
  "Return values names recs for all subjects of type gnc:species"
  (sparql-scm (gn-sparql-endpoint-url) "
SELECT DISTINCT ?species WHERE {
    ?species rdf:type gnc:species .
}"))

;; Memoized variant of sparql-species (memoize2 comes from
;; (gn cache memoize)).
(define memo-sparql-species
  (memoize2 sparql-species))

(define (sparql-species-meta)
  "Return values names recs"
  (sparql-scm (gn-sparql-endpoint-url) "
SELECT ?species ?p ?o WHERE {
    MINUS { ?species rdf:type ?o . }
    {
        SELECT DISTINCT ?species ?p ?o WHERE {
            ?species rdf:type gnc:species .
            ?species ?p ?o .
        }
    }
}"))

;; Memoized variant of sparql-species-meta.
(define memo-sparql-species-meta
  (memoize2 sparql-species-meta))

#!

dump-species-metadata.ttl:gn:Axbxa rdf:type gnc:inbredSet .
dump-species-metadata.ttl:gn:Axbxa rdfs:label "AXB/BXA Family" .
dump-species-metadata.ttl:gn:Axbxa gnt:family "Reference Populations (replicate average, SE, N)" .
dump-species-metadata.ttl:gn:Axbxa gnt:mappingMethod "AXBXA" .
dump-species-metadata.ttl:gn:Axbxa gnt:code "AXB" .
dump-species-metadata.ttl:gn:Axbxa gnt:belongsToSpecies gn:Mus_musculus .

!#

(define (get-values names row)
  "Get values by name from a resultset row.  A name that has no binding
in ROW (e.g. a variable from an OPTIONAL clause that did not match)
yields #f instead of raising an error."
  (map (lambda (n)
         (let ([binding (assoc-ref row n)])
           (and binding (assoc-ref binding "value"))))
       (array->list names)))

(define (get-rows names results)
  "Format results as a list of values ordered by names"
  (map (lambda (row) (get-values names row))
       (array->list results)))

(define (get-pairs names results)
  "Format results as a list of key-values ordered by names.  Only the
first two columns of every row are used: (first . second)."
  (map (lambda (row)
         (let ([tuple (get-values names row)])
           (cons (car tuple) (cadr tuple))))
       (array->list results)))

;; from the triples first harvest the species URIs, followed by creating records of information

(define (compile-species recs rows)
  "Compile a matrix of species triples into records.  Every row is a
list (species predicate object ...); the result is RECS extended to an
alist keyed on species, each record itself an alist that starts out as
((\"gnid\" species)) and maps every predicate to the remainder of its
row (a list)."
  (for-each
   (lambda (r)
     (let* ([species (car r)]
            [p (cadr r)]
            [v (cddr r)]
            ;; find record to fill based on subject
            [kvs (assoc species recs)]
            ;; for each gnid add value pair
            [nrec (if kvs
                      (cdr kvs)
                      (list (list "gnid" species)))])
       (set! recs (assoc-set! recs species (assoc-set! nrec p v)))))
   rows)
  recs)

;; ------------------------------------------------------------------------------

(define (sparql-groups-meta)
  "Return values names recs - (set-id, species, descr)"
  (sparql-scm (gn-sparql-endpoint-url) "
SELECT DISTINCT ?set ?species ?descr WHERE {
    ?set rdf:type gnc:inbredSet ;
         gnt:belongsToSpecies ?species .
    OPTIONAL {?set rdfs:label ?descr } .
}"))

;; Memoized variant of sparql-groups-meta.
(define memo-sparql-groups-meta
  (memoize2 sparql-groups-meta))

(define (make-table sparql-thunk)
  "Make a tuple of column names and rows.  SPARQL-THUNK is called
without arguments and must return the two values names and results."
  (receive (names res) (sparql-thunk)
    (list names (get-rows names res))))

(define (make-pairs sparql-thunk)
  "Make a tuple of column names and key-value pairs built from the
first two columns.  SPARQL-THUNK is called without arguments and must
return the two values names and results."
  (receive (names res) (sparql-thunk)
    (list names (get-pairs names res))))

(define (compile-groups-meta)
  "Return tuple of names and rows containing #(set species descr)"
  (make-table memo-sparql-groups-meta))

(define (sparql-group-info gnid)
  "Return set/group info - used by meta and data output.  GNID is
inserted verbatim as the subject of the triple pattern."
  ;; A filter that used to be parked inside the query as a SPARQL
  ;; comment; kept here so the query text does not depend on line
  ;; breaks surviving:
  ;;   FILTER ( !EXISTS{ <gnid> gnt:hasTissue ?value })
  (sparql-scm (gn-sparql-endpoint-url) (string-append "
SELECT DISTINCT ?key ?value WHERE {
    " gnid " ?key ?value .
}")))