aboutsummaryrefslogtreecommitdiff
#!

Module for handling SPARQL primitives.

Note that GN queries should go into gn/db/sources - this is currently not
the case.

!#

(define-module (gn db sparql)
  #:use-module (json)
  #:use-module (ice-9 match)
  #:use-module (ice-9 format)
  #:use-module (ice-9 iconv)
  #:use-module (ice-9 receive)
  #:use-module (ice-9 string-fun)
  #:use-module (web client)
  #:use-module (web request)
  #:use-module (web uri)
  #:use-module (gn cache memoize)
  #:use-module (gn db sources wikidata)
  #:use-module (web gn-uri)

  #:export (memo-sparql-species
            memo-sparql-species-meta
            sparql-species-meta
            sparql-groups-meta
            sparql-group-info
            memo-sparql-wd-species-info
            compile-species
            compile-groups-meta
            get-rows
            tsv->scm
            strip-lang
            make-table
            make-pairs
            )
)


(define (strip-lang s)
  "Return a fresh copy of the RDF literal S without its surrounding double
quotes.  Two shapes are unwrapped: a plain quoted literal (\"text\") and a
quoted literal followed by the English language tag (\"text\"@en).  Any
other string, including literals tagged with another language, is
returned unchanged."
  (let ([len (string-length s)])
    (cond
     ;; "text" -- starts and ends with a double quote.
     [(and (>= len 2)
           (char=? (string-ref s 0) #\")
           (char=? (string-ref s (- len 1)) #\"))
      (substring s 1 (- len 1))]
     ;; "text"@en -- opening quote plus the four character tail "@en.
     [(and (>= len 5)
           (char=? (string-ref s 0) #\")
           (string-suffix? "\"@en" s))
      (substring s 1 (- len 4))]
     [else (string-copy s)])))

(define (gn-sparql-endpoint-url)
  "Return the URL of the GeneNetwork SPARQL endpoint as a string."
  "https://sparql.genenetwork.org/sparql")

(define (wd-sparql-endpoint-url)
  "Return the URL of the Wikidata SPARQL endpoint as a string."
  "https://query.wikidata.org/sparql")

(define (gn-sparql-prefix query)
  "Return QUERY with the PREFIX declarations used by the GeneNetwork
queries in this module prepended (gn:, gnt:, gnc:, rdf: and rdfs:).
rdfs: is declared explicitly because queries in this module use
rdfs:label; SPARQL requires every prefix used in a query to be declared,
so this avoids relying on a prefix the server happens to predefine."
  (string-append
  "
PREFIX gn:  <http://genenetwork.org/id/>
PREFIX gnt: <http://genenetwork.org/term/>
PREFIX gnc: <http://genenetwork.org/category/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

" query))

(define (wdt-taxon-name)
  "Return the Wikidata property (as a prefixed name) used for the taxon
name in Wikidata queries."
  "wdt:P225")

(define (sparql-exec endpoint-url query)
  "Execute the raw SPARQL QUERY against ENDPOINT-URL and return the
response body as a string decoded from UTF-8.  The query is URI-encoded
into the `query' parameter and results are requested as
application/sparql-results+json through the `format' parameter.  The
HTTP status of the response is not inspected."
  (let ([url (string-append endpoint-url
                            "?default-graph-uri=&query=" (uri-encode query)
                            "&format=application%2Fsparql-results%2Bjson")])
    (receive (response body)
        ;; Ask for the undecoded body: by default http-request turns
        ;; textual content types into a string, which bytevector->string
        ;; would reject with a type error.
        (http-request url #:decode-body? #f)
      (bytevector->string body "UTF-8"))))

(define (sparql-tsv endpoint-url query)
  "Execute the raw SPARQL QUERY against ENDPOINT-URL with an HTTP GET that
asks for tab-separated values, and return the response body as handed
back by http-get, e.g.

(tsv->scm (sparql-tsv (wd-sparql-endpoint-url)
  \"SELECT DISTINCT * where { wd:Q158695 wdt:P225 ?o . } limit 5\"))

The HTTP status of the response is not inspected."
  ;; Resulting request looks like:
  ;; GET /sparql?query=SELECT%20DISTINCT%20%2A%20where%20%7B%0A%20%20wd%3AQ158695%20wdt%3AP225%20%3Fo%20.%0A%7D%20limit%205 HTTP/2
  (receive (response body)
      (http-get (string-append endpoint-url "?query=" (uri-encode query))
                #:headers '((Accept . "text/tab-separated-values")
                            (user-agent . "curl/7.74.0")))
    body))

(define (unpack field response)
  "Return the value stored under the key FIELD in the association list
RESPONSE (a parsed JSON object).  FIELD is expected to be present."
  (let ([entry (assoc field response)])
    (cdr entry)))

(define (sparql-names response)
  "Return the `vars' entry stored under `head' in the parsed SPARQL JSON
RESPONSE, i.e. the variable names of the result set."
  (let ([head (unpack "head" response)])
    (unpack "vars" head)))

(define (sparql-results response)
  "Return the `bindings' entry stored under `results' in the parsed SPARQL
JSON RESPONSE, i.e. the result rows."
  (let ([results (unpack "results" response)])
    (unpack "bindings" results)))

(define (sparql-scm endpoint-url query)
  "Run QUERY, with the GeneNetwork PREFIX prologue prepended, against
ENDPOINT-URL and parse the JSON reply.  Return two values: the variable
names and the result bindings of the response."
  (let* ([full-query (gn-sparql-prefix query)]
         [raw (sparql-exec endpoint-url full-query)]
         [parsed (json-string->scm raw)])
    (values (sparql-names parsed)
            (sparql-results parsed))))

(define (tsv->scm text)
  "Parse the TSV string TEXT.  Return two values: the header, i.e. the
list of fields of the first non-empty line, and the rows, a list holding
one list of fields for every following non-empty line.  Empty lines are
skipped.  When TEXT has no non-empty line both values are the empty
list."
  (let ([lines (delete "" (string-split text #\newline))])
    (if (null? lines)
        ;; Nothing to parse: avoid taking the car of an empty list.
        (values '() '())
        (let ([table (map (lambda (line) (string-split line #\tab)) lines)])
          (values (car table) (cdr table))))))

#!
(define-values (names res) (sparql-species-meta))
(define table (get-rows names res))
(define recs '())
(define h (compile-species recs table))
(assoc "http://genenetwork.org/species_drosophila_melanogaster" h)
(assoc-ref h "http://genenetwork.org/id/Drosophila_melanogaster")
(define d (car h))
(assoc-ref (list d) "http://genenetwork.org/species_drosophila_melanogaster")

(scm->json #(1  (("2" . 3))))
;; [1,{"2":3}]
(scm->json #("http://genenetwork.org/species_drosophila_melanogaster" (("http://genenetwork.org/menuName" . "Drosophila") ("http://genenetwork.org/binomialName" . "Drosophila melanogaster") )))
;; ["http://genenetwork.org/species_drosophila_melanogaster",{"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"}]
l
;; (("http://genenetwork.org/menuName" "Drosophila") ("http://genenetwork.org/name" "Drosophila") ("http://genenetwork.org/binomialName" "Drosophila melanogaster"))
(scm->json (map (lambda (i) (cons (car i) (car (cdr i)))) l))
;; {"http://genenetwork.org/menuName":"Drosophila","http://genenetwork.org/name":"Drosophila","http://genenetwork.org/binomialName":"Drosophila melanogaster"}


curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query="SELECT DISTINCT * where {
  wd:Q158695 wdt:P225 ?o .
} limit 5"
{
  "head" : {
    "vars" : [ "o" ]
  },
  "results" : {
    "bindings" : [ {
      "o" : {
        "type" : "literal",
        "value" : "Arabidopsis thaliana"
      }
    } ]
  }
}
!#

(define (sparql-wd-species-info species)
  "Query the Wikidata endpoint for the entity SPECIES (a Wikidata id
without the wd: prefix) and return the TSV response of `sparql-tsv'.
The query selects ?taxon (the taxon name property), ?ncbi (wdt:P685) and
the English ?descr (schema:description), limited to 5 rows.  E.g.

   (sparql-wd-species-info \"Q158695\")

sends a query along the lines of

SELECT DISTINCT * where {  wd:Q158695 wdt:P225 ?o . } limit 10
"
  (let ([endpoint (wd-sparql-endpoint-url)]
        [query (string-append "
SELECT DISTINCT ?taxon ?ncbi ?descr where {
    wd:" species " " (wdt-taxon-name) " ?taxon ;
               wdt:P685 ?ncbi ;
      schema:description ?descr .
    ?species wdt:P685 ?ncbi .
    FILTER (lang(?descr)='en')
} limit 5

")])
    (sparql-tsv endpoint query)))

;; Memoized wrapper around `sparql-wd-species-info'.  `memoize' is not
;; defined in this file; presumably it is provided by (gn cache memoize)
;; and caches the result per argument -- TODO confirm.
(define memo-sparql-wd-species-info
  (memoize sparql-wd-species-info))

#!
gn:Mus_musculus rdf:type gnc:species .
gn:Mus_musculus gnt:name "Mouse" .
gn:Mus_musculus rdfs:label "Mouse (Mus musculus, mm10)" .
gn:Mus_musculus gnt:binomialName "Mus musculus" .
gn:Mus_musculus gnt:family "Vertebrates" .
gn:Mus_musculus gnt:organism taxon:10090 .
!#

(define (sparql-species)
  "Query the GeneNetwork endpoint for every subject typed gnc:species.
Return the two values of `sparql-scm': variable names and bindings."
  (let ([query "
SELECT DISTINCT ?species WHERE {
    ?species rdf:type gnc:species .
}"])
    (sparql-scm (gn-sparql-endpoint-url) query)))

;; Memoized wrapper around `sparql-species'.  `memoize2' is not defined in
;; this file; presumably it comes from (gn cache memoize) and copes with
;; the two values `sparql-species' returns -- TODO confirm.
(define memo-sparql-species
  (memoize2 sparql-species))

(define (sparql-species-meta)
  "Query the GeneNetwork endpoint for the (?species ?p ?o) triples of
every subject typed gnc:species.  Return the two values of `sparql-scm':
variable names and bindings."
  ;; NOTE(review): the MINUS group precedes the sub-select; on an engine
  ;; that strictly follows SPARQL 1.1 evaluation order this may not
  ;; filter out the rdf:type triples -- confirm against the endpoint.
  (let ([query "
SELECT ?species ?p ?o WHERE {
   MINUS { ?species rdf:type ?o . }
{
  SELECT DISTINCT ?species ?p ?o WHERE {
    ?species rdf:type gnc:species .
    ?species ?p ?o .
   }}}"])
    (sparql-scm (gn-sparql-endpoint-url) query)))

;; Memoized wrapper around `sparql-species-meta'.  `memoize2' is not
;; defined in this file; presumably it comes from (gn cache memoize) and
;; copes with the two values the wrapped procedure returns -- TODO confirm.
(define memo-sparql-species-meta
  (memoize2 sparql-species-meta))

#!
dump-species-metadata.ttl:gn:Axbxa rdf:type gnc:inbredSet .
dump-species-metadata.ttl:gn:Axbxa rdfs:label "AXB/BXA Family" .
dump-species-metadata.ttl:gn:Axbxa gnt:family "Reference Populations (replicate average, SE, N)" .
dump-species-metadata.ttl:gn:Axbxa gnt:mappingMethod "AXBXA" .
dump-species-metadata.ttl:gn:Axbxa gnt:code "AXB" .
dump-species-metadata.ttl:gn:Axbxa gnt:belongsToSpecies gn:Mus_musculus .
!#

(define (get-values names row)
  "Return the list of values of the result set binding ROW, one per
variable in the array NAMES and in the same order.  ROW is an
association list mapping a variable name to an association list that
holds a \"value\" entry.  A variable that has no binding in ROW yields
#f: SPARQL JSON results leave unbound variables (e.g. those of an
OPTIONAL pattern) out of the binding object."
  (map (lambda (name)
         (let ([binding (assoc name row)])
           ;; Unbound variable: report #f instead of failing on (cdr #f).
           (and binding
                (assoc-ref (cdr binding) "value"))))
       (array->list names)))

(define (get-rows names results)
  "Convert the array RESULTS of bindings into a list of rows, each row
being the list of values ordered like the variables in NAMES."
  (let ([row->values (lambda (row) (get-values names row))])
    (map row->values (array->list results))))

(define (get-pairs names results)
  "Convert the array RESULTS of bindings into a list of pairs.  For each
binding the values are looked up in the order of NAMES; the first value
becomes the car and the second value the cdr of the pair."
  (define (row->pair row)
    (let ([vals (get-values names row)])
      (cons (car vals) (cadr vals))))
  (map row->pair (array->list results)))

;; from the triples first harvest the species URIs, followed by creating records of information

(define (compile-species recs rows)
  "Fold ROWS, a list of (species predicate object ...) lists, into the
association list RECS and return the updated list.  Each entry of the
result is keyed by the species and holds an association list that maps
each predicate to the remaining elements of its row.  A species seen for
the first time starts from a record holding only a \"gnid\" entry.
Updates are made with `assoc-set!', so RECS may be modified in place."
  (let loop ([acc recs]
             [todo rows])
    (if (null? todo)
        acc
        (let* ([row (car todo)]
               [species (car row)]
               [objects (cddr row)]
               [predicate (cadr row)]
               ;; Record to fill, looked up by subject.
               [known (assoc species acc)]
               [record (if known
                           (cdr known)
                           `(("gnid" ,species)))]
               [filled (assoc-set! record predicate objects)])
          (loop (assoc-set! acc species filled)
                (cdr todo))))))

;; ------------------------------------------------------------------------------

(define (sparql-groups-meta)
  "Query the GeneNetwork endpoint for every gnc:inbredSet together with
the species it belongs to and, when present, its rdfs:label.  Return
the two values of `sparql-scm': the variable names (set, species, descr)
and the bindings."
  (let ([query "
       SELECT DISTINCT ?set ?species ?descr WHERE {
            ?set rdf:type gnc:inbredSet ;
                 gnt:belongsToSpecies ?species .
            OPTIONAL {?set rdfs:label ?descr } .
   }"])
    (sparql-scm (gn-sparql-endpoint-url) query)))

;; Memoized wrapper around `sparql-groups-meta'.  `memoize2' is not
;; defined in this file; presumably it comes from (gn cache memoize) and
;; copes with the two values the wrapped procedure returns -- TODO confirm.
(define memo-sparql-groups-meta
  (memoize2 sparql-groups-meta))

(define (make-table sparql-thunk)
  "Call SPARQL-THUNK, which must return two values (variable names and
bindings), and return a two element list: the names and the rows built
by `get-rows'."
  (call-with-values sparql-thunk
    (lambda (names res)
      (list names (get-rows names res)))))

(define (make-pairs sparql-thunk)
  "Call SPARQL-THUNK, which must return two values (variable names and
bindings), and return a two element list: the names and the key-value
pairs built by `get-pairs'."
  (call-with-values sparql-thunk
    (lambda (names res)
      (list names (get-pairs names res)))))

(define (compile-groups-meta)
  "Return a two element list for the groups metadata: the variable names
and the rows of values (set species descr), taken from the memoized
groups query."
  ;; Same names/rows shape as any other table, so reuse `make-table'.
  (make-table memo-sparql-groups-meta))

(define (sparql-group-info gnid)
  "Query the GeneNetwork endpoint for every ?key ?value pair whose subject
is GNID; used by meta and data output.  GNID is spliced into the query
text as is.  Return the two values of `sparql-scm': variable names and
bindings."
  (let ([query (string-append "
       SELECT DISTINCT ?key ?value WHERE {
            " gnid " ?key ?value .
            # FILTER ( !EXISTS{ " gnid " gnt:hasTissue ?value })
}")])
    (sparql-scm (gn-sparql-endpoint-url) query)))