From 6cad296ed1c3f4ffbc621c645cc53e2392d9faff Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 6 Aug 2023 11:13:23 +0200 Subject: SPARQL: move into separate module --- gn/cache/memoize.scm | 4 ++ gn/db/sparql.scm | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++ web/webserver.scm | 179 +--------------------------------------------- 3 files changed, 202 insertions(+), 177 deletions(-) create mode 100644 gn/db/sparql.scm diff --git a/gn/cache/memoize.scm b/gn/cache/memoize.scm index e7ccc33..42b2163 100644 --- a/gn/cache/memoize.scm +++ b/gn/cache/memoize.scm @@ -7,6 +7,10 @@ ;; https://lispdreams.wordpress.com/2016/04/08/lisp-memoization-techniques/ (define-module (gn cache memoize) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-13) ; hash table for memoize + #:use-module (srfi srfi-11) ; let-values + #:export (memoize memoize2)) diff --git a/gn/db/sparql.scm b/gn/db/sparql.scm new file mode 100644 index 0000000..6c3b5b2 --- /dev/null +++ b/gn/db/sparql.scm @@ -0,0 +1,196 @@ +(define-module (gn db sparql) + #:use-module (json) + #:use-module (ice-9 match) + #:use-module (ice-9 format) + #:use-module (ice-9 iconv) + #:use-module (ice-9 receive) + #:use-module (ice-9 string-fun) + #:use-module (gn cache memoize) + #:use-module (web client) + #:use-module (web uri) + + #:export (memo-sparql-species + memo-sparql-species-meta + memo-sparql-wd-species-info + compile-species + get-rows + tsv->scm + ) +) + +(define (gn-sparql-endpoint-url) + "https://sparql.genenetwork.org/sparql") + +(define (wd-sparql-endpoint-url) + "https://query.wikidata.org/sparql") + +(define (wdt-taxon-name) "wdt:P225") + +(define (sparql-exec endpoint-url query) + "Execute raw SPARQL query returning response as a UTF8 string" + (bytevector->string (receive (response-status response-body) + (http-request (string-append endpoint-url "?default-graph-uri=&query=" (uri-encode query) "&format=application%2Fsparql-results%2Bjson")) + + response-body) "UTF-8")) + +(define (sparql-tsv 
endpoint-url query) + "Execute raw SPARQL query returning response as a UTF8 string, e.g. +(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\")) +" + ; GET /sparql?query=SELECT%20DISTINCT%20%2A%20where%20%7B%0A%20%20wd%3AQ158695%20wdt%3AP225%20%3Fo%20.%0A%7D%20limit%205 HTTP/2 + (receive (response-status response-body) + (http-get (pk (string-append endpoint-url "?query=" (uri-encode query))) #:headers '((Accept . "text/tab-separated-values")(user-agent . "curl/7.74.0"))) + response-body)) + +(define (unpack field response) + "Helper to get nested JSON field from SPARQL response" + (cdr (assoc field response))) + +(define (sparql-names response) + "Helper to get the names part of a SPARQL query" + (unpack "vars" (unpack "head" response))) + +(define (sparql-results response) + "Helper to get the results part of a SPARQL query" + (unpack "bindings" (unpack "results" response))) + +(define (sparql-scm endpoint-url query) + "Return dual S-exp 'resultset' of varnames and results" + (let ((response (json-string->scm (sparql-exec endpoint-url query)))) + (values (sparql-names response) (sparql-results response)))) + +(define (tsv->scm text) + "Split a TSV string into a list of fields. Returns two values: the header (list of names) and the rows" + (let ([lst (map (lambda (f) (string-split f #\tab) ) (delete "" (string-split text #\newline)))]) + (values (car lst) (cdr lst)) + )) + +#! +(define-values (names res) (sparql-species-meta)) +(define table (get-rows names res)) +(define recs '()) +(define h (compile-species recs table)) +(assoc "http://genenetwork.org/species_drosophila_melanogaster" h) +(assoc-ref h "http://genenetwork.org/species_drosophila_melanogaster") ;; note switch! +(define d (car h)) +(assoc-ref (list d) "http://genenetwork.org/species_drosophila_melanogaster") + +(scm->json #(1 (("2" . 3)))) +;; [1,{"2":3}] +(scm->json #("http://genenetwork.org/species_drosophila_melanogaster" (("http://genenetwork.org/menuName" . 
"Drosophila") ("http://genenetwork.org/binomialName" . "Drosophila melanogaster") ))) +;; ["http://genenetwork.org/species_drosophila_melanogaster",{"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"}] +l +;; (("http://genenetwork.org/menuName" "Drosophila") ("http://genenetwork.org/name" "Drosophila") ("http://genenetwork.org/binomialName" "Drosophila melanogaster")) +(scm->json (map (lambda (i) (cons (car i) (car (cdr i)))) l)) +;; {"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/name":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"} + + +curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query="SELECT DISTINCT * where { + wd:Q158695 wdt:P225 ?o . +} limit 5" +{ + "head" : { + "vars" : [ "o" ] + }, + "results" : { + "bindings" : [ { + "o" : { + "type" : "literal", + "value" : "Arabidopsis thaliana" + } + } ] + } +} +!# + +(define (sparql-wd-species-info species) + "Returns wikidata entry for species, e.g.: + + (sparql-wd-species-info \"Q158695\") generates something like + +SELECT DISTINCT * where { wd:Q158695 wdt:P225 ?o . } limit 10 + +" + (sparql-tsv (wd-sparql-endpoint-url) (string-append " +SELECT DISTINCT ?taxon ?ncbi ?descr where { + wd:" species " " (wdt-taxon-name) " ?taxon ; + wdt:P685 ?ncbi ; + schema:description ?descr . + ?species wdt:P685 ?ncbi . + FILTER (lang(?descr)='en') +} limit 5 + +"))) + +(define memo-sparql-wd-species-info + (memoize sparql-wd-species-info)) + +#! +gn:Mus_musculus rdf:type gnc:species . +gn:Mus_musculus gnt:name "Mouse" . +gn:Mus_musculus rdfs:label "Mouse (Mus musculus, mm10)" . +gn:Mus_musculus gnt:binomialName "Mus musculus" . +gn:Mus_musculus gnt:family "Vertebrates" . +gn:Mus_musculus gnt:organism taxon:10090 . 
+!# + +(define (sparql-species) + (sparql-scm (gn-sparql-endpoint-url) " +PREFIX gn: +PREFIX gnc: +PREFIX rdf: + +SELECT DISTINCT ?species WHERE { + ?species rdf:type gnc:species . +}")) + +(define memo-sparql-species + (memoize2 sparql-species)) + +(define (sparql-species-meta) + "Return two values: the query variable names and the result bindings" + (sparql-scm (gn-sparql-endpoint-url) " +PREFIX gn: +PREFIX gnc: +PREFIX gnt: +PREFIX rdf: + +SELECT ?species ?p ?o WHERE { + MINUS { ?species rdf:type ?o . } +{ + SELECT DISTINCT ?species ?p ?o WHERE { + ?species rdf:type gnc:species . + ?species ?p ?o . + }}}")) + +(define memo-sparql-species-meta + (memoize2 sparql-species-meta)) + + +(define (get-values names row) + "Get values by name from a resultset row" + (map (lambda (n) (unpack "value" (unpack n row))) (array->list names))) + +(define (get-rows names results) + "Format results as a list of values ordered by names" + (map (lambda (row) (get-values names row)) (array->list results))) + +;; from the triples first harvest the species URIs, followed by creating records of information + +(define (compile-species recs rows) + "Compile a matrix of species triples into records" + (for-each (lambda (r) + (let* ([s (car r)] + [v (cdr (cdr r))] + [p (car (cdr r))] + [nrec '()] + [kv (assoc s recs)]) ; find record to fill based on subject + (if (not kv) + (set! nrec '()) + (set! nrec (cdr kv)) + ) + (set! nrec (assoc-set! nrec p v)) + (set! recs (assoc-set! 
recs s nrec)) + )) + rows) + recs) diff --git a/web/webserver.scm b/web/webserver.scm index 910024f..657d82e 100755 --- a/web/webserver.scm +++ b/web/webserver.scm @@ -18,7 +18,6 @@ ;; (ice-9 source) (srfi srfi-1) (srfi srfi-11) ; let-values - (srfi srfi-13) ; hash table for memoize (srfi srfi-26) (web http) (web client) @@ -26,7 +25,8 @@ (web response) (web uri) (fibers web server) - (gn cache memoize)) + (gn cache memoize) + (gn db sparql)) (define get-version "2.0") @@ -34,11 +34,6 @@ (define (base-url) "https://genenetwork.org") -(define (gn-sparql-endpoint-url) - "https://sparql.genenetwork.org/sparql") - -(define (wd-sparql-endpoint-url) - "https://query.wikidata.org/sparql") (define (prefix) "Build the API URL including version" @@ -73,7 +68,6 @@ (define (mk-predicate postfix) (mk-html (string-append "predicate" "/" postfix))) -(define (wdt-taxon-name) "wdt:P225") (define info `( ("name" . "GeneNetwork REST API") @@ -92,175 +86,6 @@ (,(mk-url "datasets")."Get a list of datasets"))))) -(define (sparql-exec endpoint-url query) - "Execute raw SPARQL query returning response as a UTF8 string" - (bytevector->string (receive (response-status response-body) - (http-request (string-append endpoint-url "?default-graph-uri=&query=" (uri-encode query) "&format=application%2Fsparql-results%2Bjson")) - - response-body) "UTF-8")) - -(define (sparql-tsv endpoint-url query) - "Execute raw SPARQL query returning response as a UTF8 string, e.g. -(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\")) -" - ; GET /sparql?query=SELECT%20DISTINCT%20%2A%20where%20%7B%0A%20%20wd%3AQ158695%20wdt%3AP225%20%3Fo%20.%0A%7D%20limit%205 HTTP/2 - (receive (response-status response-body) - (http-get (pk (string-append endpoint-url "?query=" (uri-encode query))) #:headers '((Accept . "text/tab-separated-values")(user-agent . 
"curl/7.74.0"))) - response-body)) - -(define (unpack field response) - "Helper to get nested JSON field from SPARQL response" - (cdr (assoc field response))) - -(define (sparql-names response) - "Helper to get the names part of a SPARQL query" - (unpack "vars" (unpack "head" response))) - -(define (sparql-results response) - "Helper to get the results part of a SPARQL query" - (unpack "bindings" (unpack "results" response))) - -(define (sparql-scm endpoint-url query) - "Return dual S-exp 'resultset' of varnames and results" - (let ((response (json-string->scm (sparql-exec endpoint-url query)))) - (values (sparql-names response) (sparql-results response)))) - -(define (tsv->scm text) - "Split a TSV string into a list of fields. Returns list of names header) and rows" - (let ([lst (map (lambda (f) (string-split f #\tab) ) (delete "" (string-split text #\newline)))]) - (values (car lst) (cdr lst)) - )) - -#! -(define-values (names res) (sparql-species-meta)) -(define table (get-rows names res)) -(define recs '()) -(define h (compile-species recs table)) -(assoc "http://genenetwork.org/species_drosophila_melanogaster" h) -(assoc-ref h "http://genenetwork.org/species_drosophila_melanogaster") ;; note switch! -(define d (car h)) -(assoc-ref (list d) "http://genenetwork.org/species_drosophila_melanogaster") - -(scm->json #(1 (("2" . 3)))) -;; [1,{"2":3}] -(scm->json #("http://genenetwork.org/species_drosophila_melanogaster" (("http://genenetwork.org/menuName" . "Drosophila") ("http://genenetwork.org/binomialName" . 
"Drosophila melanogaster") ))) -;; ["http://genenetwork.org/species_drosophila_melanogaster",{"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"}] -l -;; (("http://genenetwork.org/menuName" "Drosophila") ("http://genenetwork.org/name" "Drosophila") ("http://genenetwork.org/binomialName" "Drosophila melanogaster")) -(scm->json (map (lambda (i) (cons (car i) (car (cdr i)))) l)) -;; {"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/name":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"} - - -curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query="SELECT DISTINCT * where { - wd:Q158695 wdt:P225 ?o . -} limit 5" -{ - "head" : { - "vars" : [ "o" ] - }, - "results" : { - "bindings" : [ { - "o" : { - "type" : "literal", - "value" : "Arabidopsis thaliana" - } - } ] - } -} -!# - -(define (sparql-wd-species-info species) - "Returns wikidata entry for species, e.g.: - - (sparql-wd-species-info \"Q158695\") generates something like - -SELECT DISTINCT * where { wd:Q158695 wdt:P225 ?o . } limit 10 - -" - (sparql-tsv (wd-sparql-endpoint-url) (string-append " -SELECT DISTINCT ?taxon ?ncbi ?descr where { - wd:" species " " (wdt-taxon-name) " ?taxon ; - wdt:P685 ?ncbi ; - schema:description ?descr . - ?species wdt:P685 ?ncbi . - FILTER (lang(?descr)='en') -} limit 5 - -"))) - -(define memo-sparql-wd-species-info - (memoize sparql-wd-species-info)) - -#! -gn:Mus_musculus rdf:type gnc:species . -gn:Mus_musculus gnt:name "Mouse" . -gn:Mus_musculus rdfs:label "Mouse (Mus musculus, mm10)" . -gn:Mus_musculus gnt:binomialName "Mus musculus" . -gn:Mus_musculus gnt:family "Vertebrates" . -gn:Mus_musculus gnt:organism taxon:10090 . -!# - -(define (sparql-species) - (sparql-scm (gn-sparql-endpoint-url) " -PREFIX gn: -PREFIX gnc: -PREFIX rdf: - -SELECT DISTINCT ?species WHERE { - ?species rdf:type gnc:species . 
-}")) - -(define memo-sparql-species - (memoize2 sparql-species)) - -(define (sparql-species-meta) - "Return values names recs" - (sparql-scm (gn-sparql-endpoint-url) " -PREFIX gn: -PREFIX gnc: -PREFIX gnt: -PREFIX rdf: - -SELECT ?species ?p ?o WHERE { - MINUS { ?species rdf:type ?o . } -{ - SELECT DISTINCT ?species ?p ?o WHERE { - ?species rdf:type gnc:species . - ?species ?p ?o . - }}}")) - -(define memo-sparql-species-meta - (memoize2 sparql-species-meta)) - - -(define (get-values names row) - "Get values by name from a resultset row" - (map (lambda (n) (unpack "value" (unpack n row))) (array->list names))) - -(define (get-rows names results) - "Format results as a list of values ordered by names" - (map (lambda (row) (get-values names row)) (array->list results))) - -;; from the triples first harvest the species URIs, followed by creating records of information - -(define (compile-species recs rows) - "Compile a matrix of species triples into records" - (for-each (lambda (r) - (let* ([s (car r)] - [v (cdr (cdr r))] - [p (car (cdr r))] - [nrec '()] - [kv (assoc s recs)]) ; find record to fill based on subject - (if (not kv) - (set! nrec '()) - (set! nrec (cdr kv)) - ) - (set! nrec (assoc-set! nrec p v)) - (set! recs (assoc-set! recs s nrec)) - )) - rows) - recs) - ;; result should be a vector of list of pair (define (species-digest recs) (map (lambda (r) -- cgit v1.2.3