about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPjotr Prins2025-07-16 09:53:15 +0200
committerPjotr Prins2025-07-16 09:53:15 +0200
commit8058b2755f0e7794667a6c0cfb9721f6ce64dbd5 (patch)
tree45b9d04fe22f60e7a7bc59cadc945ad6d8b23606
parent7477f9af79f6a6ee17d3bc1e6e138bd0c1c5f8c0 (diff)
downloadgn-guile-8058b2755f0e7794667a6c0cfb9721f6ce64dbd5.tar.gz
Wikidata: query for gene aliases
-rw-r--r--gn/data/strains.scm2
-rw-r--r--gn/db/sources/wikidata.scm49
-rw-r--r--gn/db/sparql.scm26
-rw-r--r--web/webserver.scm7
4 files changed, 74 insertions, 10 deletions
diff --git a/gn/data/strains.scm b/gn/data/strains.scm
index c560d9b..07b69ff 100644
--- a/gn/data/strains.scm
+++ b/gn/data/strains.scm
@@ -25,7 +25,7 @@
   "Return assoc list of tuples of strain id+names:
    ((4 . BXD1) (5 . BXD2) (6 . BXD5) (7 . BXD6)...
 
-used-for-mapping? will say whether the strains/individuals are used for mapping.
+optional key used-for-mapping? will say whether the strains/individuals are used for mapping.
 "
   (call-with-db
    (lambda (db)
diff --git a/gn/db/sources/wikidata.scm b/gn/db/sources/wikidata.scm
index 7397426..fe495c5 100644
--- a/gn/db/sources/wikidata.scm
+++ b/gn/db/sources/wikidata.scm
@@ -1,10 +1,38 @@
 #!
 
-Wikidata queries
+Wikidata queries, initially lifted over from the gn3 gene-alias code (that was written in Racket).
 
+Note that any generated SPARQL query can be tested by pasting it into the Wikidata Query Service at
+https://query.wikidata.org/. For example, generate a query at the REPL and copy the output into the query service:
+
+scheme@(guile-user) [3]> (display (wikidata-query-geneids "Shh"))
+```
+SELECT DISTINCT ?wikidata_id
+            WHERE {
+              ?wikidata_id wdt:P31 wd:Q7187;
+                           wdt:P703 ?species .
+              VALUES (?species) { (wd:Q15978631 ) ( wd:Q83310 ) ( wd:Q184224 ) } .
+              ?wikidata_id rdfs:label "Shh"@en .
+              }
+```
+
+It is possible to run queries through curl with
+
+```
+curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query="
+    SELECT DISTINCT ?alias
+             WHERE {
+                     wd:Q24420953 rdfs:label ?name ;
+                         skos:altLabel ?alias .
+                     FILTER(LANG(?name) = \"en\" && LANG(?alias) = \"en\").
+                   }"
+```
 !#
 
 (define-module (gn db sources wikidata)
+  #:export (wikidata-query-geneids
+            wikidata-query-gene-aliases
+            )
 )
 
 (define ps-encoded-by "ps:P702")
@@ -14,16 +42,23 @@ Wikidata queries
 (define wd-mouse "wd:Q83310")
 (define wd-rat "wd:Q184224")
 (define wd-gene "wd:Q7187")
+(define wd-shh-rat "wd:Q24420953")
 
-(define (wikidata_query_geneids gene_name)
-  "Return the wikidata identifiers pointing to genes of listed species"
+(define (wikidata-query-geneids gene_name)
+  "Build the SPARQL query text that selects the wikidata identifiers of genes
+whose English label is GENE_NAME (e.g. 'Shh') in human, mouse or rat.
+
+Returns the query as a string; nothing is sent to an endpoint here.
+GENE_NAME is spliced into the query verbatim, without any escaping."
   (string-append
      "SELECT DISTINCT ?wikidata_id
             WHERE {
               ?wikidata_id " wdt-instance-of " " wd-gene ";
                            " wdt-in-taxon " ?species .
               VALUES (?species) { (" wd-human " ) ( " wd-mouse" ) ( " wd-rat" ) } .
-              ?wikidata_id rdfs:label \"" gene_name "\"@en .
-        }
-"
-              ))
+              ?wikidata_id rdfs:label \"" gene_name "\"@en .}"))
+
+(define (wikidata-query-gene-aliases wikidata_id)
+  "Build the SPARQL query text that lists the English aliases (skos:altLabel)
+of the wikidata entity WIKIDATA_ID.
+
+WIKIDATA_ID may be a bare id (Q24420953), a prefixed name (wd:Q24420953) or a
+full IRI in angle brackets (<http://www.wikidata.org/entity/Q24420953>), the
+expanded form in which ids appear in TSV query results.  Returns the query as
+a string; WIKIDATA_ID is not otherwise validated or escaped."
+  (let ((subject (if (or (string-prefix? "<" wikidata_id)
+                         (string-prefix? "wd:" wikidata_id))
+                     wikidata_id          ; already a complete SPARQL term
+                     (string-append "wd:" wikidata_id))))
+    (string-append
+      "SELECT DISTINCT ?alias
+             WHERE {
+                     " subject " rdfs:label ?name ;
+                         skos:altLabel ?alias .
+                     FILTER(LANG(?name) = \"en\" && LANG(?alias) = \"en\").}")))
diff --git a/gn/db/sparql.scm b/gn/db/sparql.scm
index f03389b..86d163f 100644
--- a/gn/db/sparql.scm
+++ b/gn/db/sparql.scm
@@ -73,7 +73,9 @@ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 
 (define (sparql-tsv endpoint-url query)
   "Execute raw SPARQL query returning response as a UTF8 string, e.g.
-(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\"))
+(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\")).
+
+Note this procedure works for wikidata, but not for gn!
 "
   ; GET /sparql?query=SELECT%20DISTINCT%20%2A%20where%20%7B%0A%20%20wd%3AQ158695%20wdt%3AP225%20%3Fo%20.%0A%7D%20limit%205 HTTP/2
   (receive (response-status response-body)
@@ -93,7 +95,9 @@ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
   (unpack "bindings" (unpack "results" response)))
 
 (define (sparql-scm endpoint-url query)
-  "Return dual S-exp 'resultset' of varnames and results"
+  "Run QUERY against ENDPOINT-URL and return two values taken from the parsed
+JSON response: the variable names and the results (a dual S-exp 'resultset').
+QUERY is passed through gn-sparql-prefix before it is executed.
+
+Note this procedure works for GN, but does not yet work for wikidata"
   (let ((response (json-string->scm
                    (sparql-exec endpoint-url (gn-sparql-prefix query)))))
    (values (sparql-names response) (sparql-results response))))
@@ -164,6 +168,24 @@ SELECT DISTINCT ?taxon ?ncbi ?descr where {
 (define memo-sparql-wd-species-info
   (memoize sparql-wd-species-info))
 
+(define (sparql-wd-geneids gene-name)
+  "Query wikidata for the genes labelled GENE-NAME and return a flat list of
+the matching ids in expanded form, e.g.
+(\"<http://www.wikidata.org/entity/Q14860079>\" \"<http://www.wikidata.org/entity/Q24420953>\")"
+  (receive (header rows)
+      (tsv->scm (sparql-tsv (wd-sparql-endpoint-url) (wikidata-query-geneids gene-name)))
+    ;; The query selects a single column, so keep the first field of each row
+    ;; to flatten the result.  The closing parens must stay outside any
+    ;; trailing comment, otherwise this definition is never terminated.
+    (map car rows)))
+
+;; sparql-wd-geneids wrapped by memoize (defined elsewhere; presumably it
+;; caches the result per gene name -- confirm against its definition).
+(define memo-sparql-wd-geneids
+  (memoize sparql-wd-geneids))
+
+(define (sparql-wd-gene-aliases geneids)
+  "Return a flat list of aliases for the first wikidata id in GENEIDS, or the
+empty list when GENEIDS is empty.  Only the first id is queried; any further
+ids in GENEIDS are ignored."
+  (if (null? geneids)
+      '()                               ; nothing to look up; avoids (car '()) raising
+      (receive (header rows)
+          (tsv->scm (sparql-tsv (wd-sparql-endpoint-url)
+                                (wikidata-query-gene-aliases (car geneids))))
+        ;; The query selects a single column, so keep the first field of each row.
+        (map car rows))))
+
 #!
 gn:Mus_musculus rdf:type gnc:species .
 gn:Mus_musculus gnt:name "Mouse" .
diff --git a/web/webserver.scm b/web/webserver.scm
index d2a8c8d..430529b 100644
--- a/web/webserver.scm
+++ b/web/webserver.scm
@@ -56,6 +56,11 @@ otherwise search for set/group data"
     (if taxoninfo taxoninfo
         (cdr (get-group-data id)))))
 
+(define (get-gene-aliases genename)
+  "Return a vector of aliases for GENENAME.
+Placeholder: GENENAME is not consulted yet and the result is always #(\"Hx\")."
+  #("Hx"))
+
 (define (not-found2 request)
   (values (build-response #:code 404)
           (string-append "Resource X not found: "
@@ -247,6 +252,8 @@ otherwise search for set/group data"
     (('GET "doc" path ... page)
      ;; serve documents from /doc/
      (render-doc path page))
+    (('GET "gene" "aliases" genename)
+     (render-json (get-gene-aliases genename)))
     (('GET "species.json")
      (render-json (get-species-data)))
     (('GET "species.meta.json")