From 6cad296ed1c3f4ffbc621c645cc53e2392d9faff Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sun, 6 Aug 2023 11:13:23 +0200
Subject: SPARQL: move into separate module
---
gn/db/sparql.scm | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 196 insertions(+)
create mode 100644 gn/db/sparql.scm
(limited to 'gn/db')
diff --git a/gn/db/sparql.scm b/gn/db/sparql.scm
new file mode 100644
index 0000000..6c3b5b2
--- /dev/null
+++ b/gn/db/sparql.scm
@@ -0,0 +1,196 @@
+(define-module (gn db sparql)
+ #:use-module (json)
+ #:use-module (ice-9 match)
+ #:use-module (ice-9 format)
+ #:use-module (ice-9 iconv)
+ #:use-module (ice-9 receive)
+ #:use-module (ice-9 string-fun)
+ #:use-module (gn cache memoize)
+ #:use-module (web client)
+ #:use-module (web uri)
+
+ #:export (memo-sparql-species
+ memo-sparql-species-meta
+ memo-sparql-wd-species-info
+ compile-species
+ get-rows
+ tsv->scm
+ )
+)
+
+(define (gn-sparql-endpoint-url)
+  "URL of the GeneNetwork SPARQL endpoint." "https://sparql.genenetwork.org/sparql")
+
+(define (wd-sparql-endpoint-url)
+  "URL of the Wikidata SPARQL endpoint." "https://query.wikidata.org/sparql")
+
+(define (wdt-taxon-name) "wdt:P225")
+
+(define (sparql-exec endpoint-url query)
+  "Execute raw SPARQL QUERY at ENDPOINT-URL via http-request, asking for SPARQL JSON results,
+and return the response body decoded as a UTF-8 string."
+  (let ((url (string-append endpoint-url "?default-graph-uri=&query=" (uri-encode query) "&format=application%2Fsparql-results%2Bjson")))
+    (receive (response body) (http-request url)   ; two values: response object, body
+      (bytevector->string body "UTF-8"))))
+
+(define (sparql-tsv endpoint-url query)
+  "Execute raw SPARQL QUERY at ENDPOINT-URL, asking for tab-separated values, and return the response body, e.g.
+(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"SELECT DISTINCT * WHERE { wd:Q158695 wdt:P225 ?o . } LIMIT 5\"))"
+  ;; NOTE(review): `Accept' stays capitalised on purpose; Guile's known header symbols are lowercase and presumably want parsed values rather than a raw string - confirm before renaming.
+  (receive (response body)
+      (http-get (string-append endpoint-url "?query=" (uri-encode query))   ; QUERY is uri-encoded into the GET URL
+                #:headers '((Accept . "text/tab-separated-values") (user-agent . "curl/7.74.0")))
+    body))
+
+(define (unpack field response)
+  "Return the value stored under key FIELD in the association list RESPONSE; an absent FIELD signals an error."
+  (let ((entry (assoc field response))) (cdr entry)))
+
+(define (sparql-names response)
+  "Return the variable names of a parsed SPARQL RESPONSE, i.e. the \"vars\" entry nested under \"head\"."
+  (let ((head (unpack "head" response))) (unpack "vars" head)))
+
+(define (sparql-results response)
+  "Return the result rows of a parsed SPARQL RESPONSE, i.e. the \"bindings\" entry nested under \"results\"."
+  (let ((results (unpack "results" response))) (unpack "bindings" results)))
+
+(define (sparql-scm endpoint-url query)
+  "Run QUERY at ENDPOINT-URL and return two values: the variable names and the result bindings of the parsed JSON reply."
+  (let* ((raw (sparql-exec endpoint-url query)) (parsed (json-string->scm raw)))
+    (values (sparql-names parsed) (sparql-results parsed))))
+
+(define (tsv->scm text)
+  "Split the TSV string TEXT into fields.  Returns two values: the header row (a list of names) and
+the list of data rows.  Blank lines are dropped; a TEXT without any non-blank line yields two empty lists."
+  (let ((rows (map (lambda (line) (string-split line #\tab)) (delete "" (string-split text #\newline)))))
+    (if (null? rows) (values '() '()) (values (car rows) (cdr rows)))))
+
+#!
+(define-values (names res) (sparql-species-meta))
+(define table (get-rows names res))
+(define recs '())
+(define h (compile-species recs table))
+(assoc "http://genenetwork.org/species_drosophila_melanogaster" h)
+(assoc-ref h "http://genenetwork.org/species_drosophila_melanogaster") ;; note: argument order is switched compared with assoc
+(define d (car h))
+(assoc-ref (list d) "http://genenetwork.org/species_drosophila_melanogaster")
+
+(scm->json #(1 (("2" . 3))))
+;; [1,{"2":3}]
+(scm->json #("http://genenetwork.org/species_drosophila_melanogaster" (("http://genenetwork.org/menuName" . "Drosophila") ("http://genenetwork.org/binomialName" . "Drosophila melanogaster") )))
+;; ["http://genenetwork.org/species_drosophila_melanogaster",{"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"}]
+l
+;; (("http://genenetwork.org/menuName" "Drosophila") ("http://genenetwork.org/name" "Drosophila") ("http://genenetwork.org/binomialName" "Drosophila melanogaster"))
+(scm->json (map (lambda (i) (cons (car i) (car (cdr i)))) l))
+;; {"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/name":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"}
+
+
+curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query="SELECT DISTINCT * where {
+ wd:Q158695 wdt:P225 ?o .
+} limit 5"
+{
+ "head" : {
+ "vars" : [ "o" ]
+ },
+ "results" : {
+ "bindings" : [ {
+ "o" : {
+ "type" : "literal",
+ "value" : "Arabidopsis thaliana"
+ }
+ } ]
+ }
+}
+!#
+
+(define (sparql-wd-species-info species)
+  "Query Wikidata about SPECIES, an entity id such as \"Q158695\", and return the raw TSV
+reply of sparql-tsv.  At most 5 rows are requested, with the columns ?taxon (the
+wdt-taxon-name property), ?ncbi (wdt:P685) and ?descr (schema:description in English).
+
+  (tsv->scm (sparql-wd-species-info \"Q158695\"))
+"
+  (let ((query (string-append "
+SELECT DISTINCT ?taxon ?ncbi ?descr where {
+ wd:" species " " (wdt-taxon-name) " ?taxon ;
+ wdt:P685 ?ncbi ;
+ schema:description ?descr .
+ ?species wdt:P685 ?ncbi .
+ FILTER (lang(?descr)='en')
+} limit 5
+
+")))
+    (sparql-tsv (wd-sparql-endpoint-url) query)))
+
+(define memo-sparql-wd-species-info ; sparql-wd-species-info wrapped by `memoize' (presumably from (gn cache memoize))
+ (memoize sparql-wd-species-info)) ; NOTE(review): presumably caches the reply per SPECIES argument - confirm in the memoize module
+
+#!
+gn:Mus_musculus rdf:type gnc:species .
+gn:Mus_musculus gnt:name "Mouse" .
+gn:Mus_musculus rdfs:label "Mouse (Mus musculus, mm10)" .
+gn:Mus_musculus gnt:binomialName "Mus musculus" .
+gn:Mus_musculus gnt:family "Vertebrates" .
+gn:Mus_musculus gnt:organism taxon:10090 .
+!#
+
+(define (sparql-species)
+  "Return two values (variable names and bindings, see sparql-scm): every ?species of rdf:type gnc:species."
+  ;; NOTE(review): the PREFIX lines had no IRIs (invalid SPARQL); the gnc: IRI is assumed, confirm.  Unused gn: dropped.
+  (sparql-scm (gn-sparql-endpoint-url) "
+PREFIX gnc: <http://genenetwork.org/category/>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+SELECT DISTINCT ?species WHERE {
+ ?species rdf:type gnc:species .
+}"))
+
+(define memo-sparql-species ; sparql-species wrapped by `memoize2' (presumably from (gn cache memoize))
+ (memoize2 sparql-species)) ; NOTE(review): memoize2 presumably caches both return values of the thunk - confirm
+
+(define (sparql-species-meta)
+  "Return two values, names and recs (see sparql-scm): the ?species ?p ?o triples of every subject of rdf:type gnc:species."
+  ;; NOTE(review): the PREFIX lines had no IRIs (invalid SPARQL); the gnc: IRI is assumed, confirm.  Unused gn:/gnt: dropped.
+  ;; NOTE(review): MINUS comes before the pattern it is meant to prune; check that rdf:type rows are really excluded.
+  (sparql-scm (gn-sparql-endpoint-url) "
+PREFIX gnc: <http://genenetwork.org/category/>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+SELECT ?species ?p ?o WHERE {
+ MINUS { ?species rdf:type ?o . }
+{
+ SELECT DISTINCT ?species ?p ?o WHERE {
+ ?species rdf:type gnc:species .
+ ?species ?p ?o .
+ }}}"))
+
+(define memo-sparql-species-meta ; sparql-species-meta wrapped by `memoize2' (presumably from (gn cache memoize))
+ (memoize2 sparql-species-meta)) ; NOTE(review): memoize2 presumably caches both return values of the thunk - confirm
+
+
+(define (get-values names row)
+  "Return, ordered as the array NAMES, the \"value\" field of each named binding in the resultset ROW."
+  (map (lambda (column) (let ((binding (unpack column row))) (unpack "value" binding))) (array->list names)))
+
+(define (get-rows names results)
+  "Return one list of values per row of the array RESULTS, each ordered as the array NAMES."
+  (let ((rows (array->list results))) (map (lambda (row) (get-values names row)) rows)))
+
+;; From the triples, first harvest the species URIs, then build a record of information for each.
+
+(define (compile-species recs rows)
+  "Compile ROWS, a list of (subject predicate . rest) triples, into RECS and return it.  RECS is
+an association list from subject to a record; a record is an association list from predicate
+to the rest of its row (normally a one-element list holding the object).  Entries are written
+with assoc-set!, so an existing RECS may be modified in place."
+  (for-each
+   (lambda (triple)
+     (let* ((subject (car triple))
+            (predicate (cadr triple))
+            (tail (cddr triple))                ; whatever follows the predicate, kept as a list
+            (found (assoc subject recs))        ; record already started for this subject, or #f
+            (record (if found (cdr found) '())))
+       ;; Store the pair in the subject's record, then the record back under its subject.
+       (set! recs
+             (assoc-set! recs subject (assoc-set! record predicate tail)))))
+   rows)
+  recs)
--
cgit v1.2.3