about summary refs log tree commit diff
path: root/gn
diff options
context:
space:
mode:
Diffstat (limited to 'gn')
-rw-r--r-- gn/data/dataset.scm 61
-rw-r--r-- gn/data/genotype.scm 1
-rw-r--r-- gn/data/strains.scm 2
-rw-r--r-- gn/db/mysql.scm 9
-rw-r--r-- gn/db/sources/wikidata.scm 65
-rw-r--r-- gn/db/sparql.scm 60
-rw-r--r-- gn/runner/gemma.scm 15
7 files changed, 200 insertions, 13 deletions
diff --git a/gn/data/dataset.scm b/gn/data/dataset.scm
index c28cf25..afe75ba 100644
--- a/gn/data/dataset.scm
+++ b/gn/data/dataset.scm
@@ -4,14 +4,21 @@
   #:use-module (ice-9 iconv)
   #:use-module (ice-9 receive)
   #:use-module (ice-9 string-fun)
+  #:use-module (srfi srfi-1)
   #:use-module (dbi dbi)
   #:use-module (gn db mysql)
+  #:use-module (gn data genotype)
   #:use-module (gn data group)
   #:use-module (gn util convert)
   #:use-module (web gn-uri)
+  #:use-module (rnrs base) ; for assert
 
   #:export (
             dataset-name
+            get-bxd-publish-list
+            get-bxd-publish-values-list
+            get-bxd-publish-name-value-dict
+            get-bxd-publish-dataid-name-value-dict
             ))
 
 (define (get-dataset db probesetfreeze-id)
@@ -22,3 +29,57 @@
 
 ;; Return the "Name" entry of the record that get-dataset yields for
 ;; PROBESETFREEZE-ID, using the database handle DB.
 (define (dataset-name db probesetfreeze-id)
   (assoc-ref (get-dataset db probesetfreeze-id) "Name"))
+
+(define (get-dataid-from-publishxrefid id)
+  "Look up the PublishXRef row with Id=ID and InbredSetId=1 and return its DataId converted with int-to-string - the internal dataid, which is the same as used in the GN2 web interface. ID is spliced into the SQL text as-is, so it has to be a string."
+  (call-with-db
+   (lambda (db)
+     (dbi-query db (string-append "SELECT Id,PhenotypeId,DataId FROM PublishXRef WHERE Id=" id " AND InbredSetId=1 LIMIT 1"))
+     (let* [(row (get-row db))
+            (dataid (assoc-ref row "DataId"))]
+       ;; pk echoes the converted id for debugging and passes it through
+       (pk (int-to-string dataid))))))
+
+(define (get-bxd-publish-list)
+  "Return the Id, PhenotypeId and DataId columns of every PublishXRef row with InbredSetId=1, as collected by get-rows."
+  (call-with-db
+   (lambda (db)
+     (dbi-query db "SELECT Id,PhenotypeId,DataId FROM PublishXRef WHERE InbredSetId=1")
+     (get-rows db '()))))
+
+(define* (get-bxd-publish-values-list dataid #:optional used-for-mapping?)
+  "Fetch the Strain.Name / PublishData.value rows whose PublishData.Id equals DATAID (a string that is spliced into the SQL text). Without used-for-mapping? the rows are returned as get-rows collects them. With used-for-mapping? only rows whose Name contains \"BXD\" are kept, each rebuilt as ((\"Name\" . name) (\"value\" . value)); this skips the founders and maybe other unmappable inds. Note, currently unused."
+  (define (bxd-row-or-empty row)
+    ;; rows that are not BXD strains become '() and are filtered out afterwards
+    (let [(name (assoc-ref row "Name"))]
+      (if (string-contains name "BXD")
+          `(("Name" . ,name) ("value" . ,(assoc-ref row "value")))
+          '())))
+  (call-with-db
+   (lambda (db)
+     (dbi-query db (string-append "SELECT Strain.Name, PublishData.value FROM Strain, PublishData WHERE PublishData.Id=" dataid " and Strain.Id=StrainID;"))
+     (if used-for-mapping?
+         (remove null? (pk (get-rows-apply db bxd-row-or-empty '())))
+         (get-rows db '())))))
+
+(define* (get-bxd-publish-dataid-name-value-dict dataid #:optional used-for-mapping?)
+  "Return an alist of (strain-name . value) pairs for the PublishData rows with Id DATAID, e.g. ((\"C57BL/6J\" . 9.136) (\"DBA/2J\" . 4.401) (\"BXD9\" . 4.36) ...). DATAID is a string that is spliced into the SQL text. With used-for-mapping? only strains whose name contains \"BXD\" are kept, which skips the founders and maybe other unmappable inds."
+  (define (name+value row)
+    (cons (assoc-ref row "Name") (assoc-ref row "value")))
+  (define (bxd-name+value row)
+    ;; non-BXD rows become '() and are dropped by the remove below
+    (if (string-contains (assoc-ref row "Name") "BXD")
+        (name+value row)
+        '()))
+  (call-with-db
+   (lambda (db)
+     (dbi-query db (string-append "SELECT Strain.Name, PublishData.value FROM Strain, PublishData WHERE PublishData.Id=" dataid " and Strain.Id=StrainID;"))
+     (let [(convert (if used-for-mapping? bxd-name+value name+value))]
+       (remove null? (pk (get-rows-apply db convert '())))))))
+
+(define* (get-bxd-publish-name-value-dict id #:optional used-for-mapping?)
+  "Return the (strain-name . value) alist for the PublishXRef id ID: ID is first translated with get-dataid-from-publishxrefid and the resulting data id is handed, together with used-for-mapping?, to get-bxd-publish-dataid-name-value-dict."
+  (let [(dataid (get-dataid-from-publishxrefid id))]
+    (get-bxd-publish-dataid-name-value-dict dataid used-for-mapping?)))
diff --git a/gn/data/genotype.scm b/gn/data/genotype.scm
index c7cb63c..5574382 100644
--- a/gn/data/genotype.scm
+++ b/gn/data/genotype.scm
@@ -16,6 +16,7 @@
             ))
 
 (define (geno-inds-bxd fn)
+  "Returns information from GN's BXD.json, note it fetches the first geno file info, now BXD.8.geno"
   (let [(js (call-with-input-file fn
               (lambda (port)
                 (json->scm port))))]
diff --git a/gn/data/strains.scm b/gn/data/strains.scm
index e5f839b..07b69ff 100644
--- a/gn/data/strains.scm
+++ b/gn/data/strains.scm
@@ -25,7 +25,7 @@
   "Return assoc list of tuples of strain id+names:
    ((4 . BXD1) (5 . BXD2) (6 . BXD5) (7 . BXD6)...
 
-used-for-mapping? will say whether the strains/individuals are used for mapping. Always True, FIXME
+optional key used-for-mapping? will say whether the strains/individuals are used for mapping.
 "
   (call-with-db
    (lambda (db)
diff --git a/gn/db/mysql.scm b/gn/db/mysql.scm
index ccd414a..223b5fd 100644
--- a/gn/db/mysql.scm
+++ b/gn/db/mysql.scm
@@ -32,22 +32,23 @@
     ;; (display "===> OPENING DB")
     ;; (newline)
     (let [(db (dbi-open "mysql" "webqtlout:webqtlout:db_webqtl:tcp:127.0.0.1:3306"))]
-      (ensure db)
+      (ensure db "Can't open connection")
       db
     )))
 
 ;; Apply THUNK to the handle returned by (db-open) and return its result.
 ;; Despite the name, THUNK takes one argument: the database handle.
 (define (call-with-db thunk)
   (thunk (db-open)))
 
-(define (ensure db)
+;; Check the status pair reported by dbi-get_status for DB.  Status 0 yields
+;; #t; any other status prints MSG1 (caller-supplied context) followed by the
+;; message from the status pair and then fails an assertion.
+(define (ensure db msg1)
   "Use DBI-style handle to report an error. On error the program will stop."
   (match (dbi-get_status db)
     ((stat . msg) (if (= stat 0)
                       #t
                       (begin
-                       (display msg)
+                       (display "SQL Connection ERROR! ")
+                       ;; the display call is closed here so the text is written before the assert fires
+                       (display (string-append msg1 " - " msg))
                        (newline)
-                       (assert stat))))))
+                       (assert #f))))))
 
 (define (has-result? db)
   "Return #t or #f if result is valid"
diff --git a/gn/db/sources/wikidata.scm b/gn/db/sources/wikidata.scm
new file mode 100644
index 0000000..954ce93
--- /dev/null
+++ b/gn/db/sources/wikidata.scm
@@ -0,0 +1,65 @@
+#!
+
+Wikidata queries, initially lifted over from the gn3 gene-alias code (that was written in Racket).
+
+Note you can take a SPARQL query and push it into https://query.wikidata.org/. E.g. generate a query and
+copy paste into the query service:
+
+scheme@(guile-user) [3]> (display (wikidata-query-geneids "Shh"))
+```
+SELECT DISTINCT ?wikidata_id
+            WHERE {
+              ?wikidata_id wdt:P31 wd:Q7187;
+                           wdt:P703 ?species .
+              VALUES (?species) { (wd:Q15978631 ) ( wd:Q83310 ) ( wd:Q184224 ) } .
+              ?wikidata_id rdfs:label "Shh"@en .
+              }
+```
+
+It is possible to run queries through curl with
+
+```
+curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query="
+    SELECT DISTINCT ?alias
+             WHERE {
+                     wd:Q24420953 rdfs:label ?name ;
+                         skos:altLabel ?alias .
+                     FILTER(LANG(?name) = \"en\" && LANG(?alias) = \"en\").
+                   }"
+```
+!#
+
+(define-module (gn db sources wikidata)
+  #:export (wikidata-query-geneids
+            wikidata-query-gene-aliases
+            )
+)
+
+;; Wikidata property and entity identifiers, including their SPARQL prefix,
+;; that get spliced into the query text built in this module.
+;; NOTE(review): ps-encoded-by and wd-shh-rat are not referenced by either
+;; query builder in this module and are not exported - confirm they are needed.
+(define ps-encoded-by "ps:P702")
+(define wdt-instance-of "wdt:P31")
+(define wdt-in-taxon "wdt:P703")
+(define wd-human "wd:Q15978631")
+(define wd-mouse "wd:Q83310")
+(define wd-rat "wd:Q184224")
+(define wd-gene "wd:Q7187")
+(define wd-shh-rat "wd:Q24420953")
+
+(define (wikidata-query-geneids gene_name)
+  "Build the SPARQL text that selects the wikidata ids of gene entities whose English label is GENE_NAME (e.g. 'Shh'), restricted to the human, mouse and rat taxa. GENE_NAME is placed between double quotes without any escaping."
+  (let [(species-values (string-append "(" wd-human " ) ( " wd-mouse " ) ( " wd-rat " )"))]
+    (string-append
+     "SELECT DISTINCT ?wikidata_id
+            WHERE {
+              ?wikidata_id " wdt-instance-of " " wd-gene ";
+                           " wdt-in-taxon " ?species .
+              VALUES (?species) { " species-values " } .
+              ?wikidata_id rdfs:label \"" gene_name "\"@en .}")))
+
+(define (wikidata-query-gene-aliases wikidata_id)
+  "Build the SPARQL text that lists the English aliases (skos:altLabel, language tag stripped with STR) of WIKIDATA_ID. WIKIDATA_ID is spliced in verbatim; this version supports the expanded id only, so <http://www.wikidata.org/entity/Q24420953> including the <,>."
+  (let [(select-head "SELECT DISTINCT ?stripped_alias
+             WHERE { ")
+        (pattern-tail " rdfs:label ?name ;
+                         skos:altLabel ?alias .
+                         BIND (STR(?alias)  AS ?stripped_alias) .
+                     FILTER(LANG(?name) = \"en\" && LANG(?alias) = \"en\").}")]
+    (string-append select-head wikidata_id pattern-tail)))
diff --git a/gn/db/sparql.scm b/gn/db/sparql.scm
index b7d94f3..bd7a306 100644
--- a/gn/db/sparql.scm
+++ b/gn/db/sparql.scm
@@ -2,23 +2,25 @@
 
 Module for handling SPARQL primitives.
 
-Note that GN queries should go into gn/data - this is currently not
+Note that GN queries should go into gn/db/sources - this is currently not
 the case.
 
 !#
 
 (define-module (gn db sparql)
-  #:use-module (json)
-  #:use-module (ice-9 match)
+  #:use-module (gn cache memoize)
+  #:use-module (gn db sources wikidata)
   #:use-module (ice-9 format)
   #:use-module (ice-9 iconv)
+  #:use-module (ice-9 match)
   #:use-module (ice-9 receive)
   #:use-module (ice-9 string-fun)
+  #:use-module (json)
+  #:use-module (srfi srfi-1)
   #:use-module (web client)
+  #:use-module (web gn-uri)
   #:use-module (web request)
   #:use-module (web uri)
-  #:use-module (gn cache memoize)
-  #:use-module (web gn-uri)
 
   #:export (memo-sparql-species
             memo-sparql-species-meta
@@ -26,6 +28,8 @@ the case.
             sparql-groups-meta
             sparql-group-info
             memo-sparql-wd-species-info
+            memo-sparql-wd-gene-aliases
+            memo-sparql-wd-geneids
             compile-species
             compile-groups-meta
             get-rows
@@ -72,7 +76,9 @@ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 
 (define (sparql-tsv endpoint-url query)
   "Execute raw SPARQL query returning response as a UTF8 string, e.g.
-(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\"))
+(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\")).
+
+Note this procedure works for wikidata, but not for gn!
 "
   ; GET /sparql?query=SELECT%20DISTINCT%20%2A%20where%20%7B%0A%20%20wd%3AQ158695%20wdt%3AP225%20%3Fo%20.%0A%7D%20limit%205 HTTP/2
   (receive (response-status response-body)
@@ -92,7 +98,9 @@ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
   (unpack "bindings" (unpack "results" response)))
 
 (define (sparql-scm endpoint-url query)
-  "Return dual S-exp 'resultset' of varnames and results"
+  "Return dual S-exp 'resultset' of varnames and results: QUERY is wrapped by
+gn-sparql-prefix, sent to ENDPOINT-URL with sparql-exec, parsed as JSON and
+handed back as two values (sparql-names and sparql-results of the response).
+
+Note this procedure works for GN, but does not yet work for wikidata"
   (let ((response (json-string->scm
                    (sparql-exec endpoint-url (gn-sparql-prefix query)))))
    (values (sparql-names response) (sparql-results response))))
@@ -160,9 +168,47 @@ SELECT DISTINCT ?taxon ?ncbi ?descr where {
 
 ")))
 
+(define (flatten lst)
+  "Return the atoms of the arbitrarily nested list LST as one flat list, in left-to-right order. Empty lists contribute nothing; a non-pair, non-null argument yields a one-element list."
+  ;; walk the tree while threading the already-flattened remainder through
+  (let walk [(node lst) (tail '())]
+    (cond ((null? node) tail)
+          ((pair? node) (walk (car node) (walk (cdr node) tail)))
+          (else (cons node tail)))))
+
+(define (remove-quotes s)
+  "Strip one pair of surrounding double quotes from the string S. A string that is not wrapped in double quotes - including the empty string and one-character strings - is returned unchanged instead of losing characters or raising a range error."
+  (let [(len (string-length s))]
+    (if (and (>= len 2)
+             (char=? (string-ref s 0) #\")
+             (char=? (string-ref s (- len 1)) #\"))
+        (substring s 1 (- len 1))
+        s)))
+
 ;; sparql-wd-species-info wrapped with memoize.
 ;; NOTE(review): the caching policy is decided by memoize, which is not visible here.
 (define memo-sparql-wd-species-info
   (memoize sparql-wd-species-info))
 
+(define (sparql-wd-geneids gene-name)
+  "Query the wikidata endpoint for GENE-NAME and return the first column of every result row, i.e. a list of expanded wikidata ids such as
+(\"<http://www.wikidata.org/entity/Q14860079>\" \"<http://www.wikidata.org/entity/Q24420953>\")"
+  (let [(query (wikidata-query-geneids gene-name))]
+    (receive (header rows)
+        (tsv->scm (sparql-tsv (wd-sparql-endpoint-url) query))
+      ;; every row is a one-element list; keep just that element
+      (map car rows))))
+
+;; sparql-wd-geneids wrapped with memoize.
+;; NOTE(review): the caching policy is decided by memoize, which is not visible here.
+(define memo-sparql-wd-geneids
+  (memoize sparql-wd-geneids))
+
+(define (sparql-wd-gene-aliases geneids)
+  "Return a flattened and deduplicated list of the aliases wikidata knows for the ids in the list GENEIDS, with the surrounding quotes removed. Call as
+(sparql-wd-gene-aliases '(\"Q14860079\" \"Q24420953\"))
+NOTE(review): each id is spliced verbatim into the text built by wikidata-query-gene-aliases, whose docstring asks for the expanded <http://www.wikidata.org/entity/Q...> form - confirm the bare ids in the example above are accepted.
+"
+  (define (aliases-for geneid)
+    ;; one endpoint round trip per id; pk echoes the id for debugging
+    (receive (header rows)
+        (tsv->scm (sparql-tsv (wd-sparql-endpoint-url)
+                              (wikidata-query-gene-aliases (pk geneid))))
+      (map car rows)))
+  (delete-duplicates
+   (map remove-quotes
+        (flatten (map aliases-for geneids)))))
+
+;; sparql-wd-gene-aliases wrapped with memoize.
+;; NOTE(review): the caching policy is decided by memoize, which is not visible here.
+(define memo-sparql-wd-gene-aliases
+  (memoize sparql-wd-gene-aliases))
+
+
 #!
 gn:Mus_musculus rdf:type gnc:species .
 gn:Mus_musculus gnt:name "Mouse" .
diff --git a/gn/runner/gemma.scm b/gn/runner/gemma.scm
index 9a5c0fc..c577305 100644
--- a/gn/runner/gemma.scm
+++ b/gn/runner/gemma.scm
@@ -10,11 +10,24 @@
   #:use-module (rnrs base)
 
   #:export (
-            write-pheno-file
+            gemma-pheno-txt
             invoke-gemma-wrapper-loco
             run-gemma
             ))
 
+(define (gemma-pheno-txt family traits)
+  "Return a list of text lines for GEMMA, one per individual returned by (geno-inds-bxd \"BXD.json\"): the value found for that individual in the alist TRAITS, or NA when there is none. Every element ends in a newline. FAMILY must be \"BXD\"; anything else fails the assert."
+  (assert (string=? family "BXD")) ; only supported right now
+  (define bxd-inds (geno-inds-bxd "BXD.json"))
+  (assert (= 235 (length bxd-inds))) ; sanity check on the number of individuals
+  (map (lambda (ind)
+         (let [(value (assoc-ref traits ind))]
+           (if value
+               ;; terminate values with a newline, exactly like the NA entries
+               (format #f "~a\n" value)
+               "NA\n")))
+       bxd-inds))
+
 (define (write-pheno-file fn traits)
   (define bxd-inds (geno-inds-bxd "BXD.json"))
   (assert (= 235 (length bxd-inds)))