aboutsummaryrefslogtreecommitdiff
path: root/gn
diff options
context:
space:
mode:
Diffstat (limited to 'gn')
-rw-r--r--gn/data/hits.scm22
-rw-r--r--gn/data/strains.scm2
-rw-r--r--gn/db/sources/wikidata.scm65
-rw-r--r--gn/db/sparql.scm60
-rw-r--r--gn/runner/gemma.scm19
5 files changed, 149 insertions, 19 deletions
diff --git a/gn/data/hits.scm b/gn/data/hits.scm
index f7ce49e..85c4912 100644
--- a/gn/data/hits.scm
+++ b/gn/data/hits.scm
@@ -5,6 +5,7 @@
#:use-module (ice-9 iconv)
#:use-module (ice-9 receive)
#:use-module (ice-9 string-fun)
+ #:use-module (srfi srfi-9)
;; #:use-module (gn db sparql)
#:use-module (dbi dbi)
#:use-module (gn db mysql)
@@ -17,11 +18,26 @@
get-precompute-hit
set-precompute-hit-status!
update-precompute!
+ hit-data-id
+ hit-probeset-id
+ hit-probesetfreeze-id
))
-(define (get-precompute-hits db prev-id num)
- (dbi-query db (string-append "select Locus, DataId, ProbeSetId, ProbeSetFreezeId from ProbeSetXRef where DataId>" (int-to-string prev-id) " AND Locus_old is NULL ORDER BY DataId LIMIT " (format #f "~d" num)))
- (get-rows db '()))
+
+;; Record for one precompute hit. get-precompute-hits fills it from the DataId,
+;; ProbeSetId and ProbeSetFreezeId columns of a ProbeSetXRef row.
+;; Only accessors are declared, so the fields cannot be modified after construction.
+(define-record-type <hit>
+ (make-hit data-id probeset-id probesetfreeze-id)
+ hit?
+ (data-id hit-data-id)
+ (probeset-id hit-probeset-id)
+ (probesetfreeze-id hit-probesetfreeze-id)
+ )
+
+(define (get-precompute-hits db first-id num)
+  "Query ProbeSetXRef for at most NUM rows whose DataId is greater than FIRST-ID
+and whose Locus_old is NULL, ordered by DataId, and return them as a list of
+<hit> records."
+  (define (row->hit row)
+    ;; Only the three id columns are kept; Locus is selected but not stored.
+    (make-hit (assoc-ref row "DataId")
+              (assoc-ref row "ProbeSetId")
+              (assoc-ref row "ProbeSetFreezeId")))
+  (let ((sql (string-append "select Locus, DataId, ProbeSetId, ProbeSetFreezeId from ProbeSetXRef where DataId>" (int-to-string first-id) " AND Locus_old is NULL ORDER BY DataId LIMIT " (int-to-string num))))
+    (dbi-query db sql)
+    (map row->hit (get-rows db '()))))
(define (get-precompute-hit db prev-id)
  "Return the first hit that get-precompute-hits yields after PREV-ID.
The head of the result list is taken with car, so a non-empty result is expected."
  (let ((hits (get-precompute-hits db prev-id 1)))
    (car hits)))
diff --git a/gn/data/strains.scm b/gn/data/strains.scm
index e5f839b..07b69ff 100644
--- a/gn/data/strains.scm
+++ b/gn/data/strains.scm
@@ -25,7 +25,7 @@
"Return assoc list of tuples of strain id+names:
((4 . BXD1) (5 . BXD2) (6 . BXD5) (7 . BXD6)...
-used-for-mapping? will say whether the strains/individuals are used for mapping. Always True, FIXME
+optional key used-for-mapping? will say whether the strains/individuals are used for mapping.
"
(call-with-db
(lambda (db)
diff --git a/gn/db/sources/wikidata.scm b/gn/db/sources/wikidata.scm
new file mode 100644
index 0000000..954ce93
--- /dev/null
+++ b/gn/db/sources/wikidata.scm
@@ -0,0 +1,65 @@
+#!
+
+Wikidata queries, initially lifted over from the gn3 gene-alias code (that was written in Racket).
+
+Note you can take a SPARQL query and push it into https://query.wikidata.org/. E.g. generate a query and
+copy paste into the query service:
+
+scheme@(guile-user) [3]> (display (wikidata-query-geneids "Shh"))
+```
+SELECT DISTINCT ?wikidata_id
+ WHERE {
+ ?wikidata_id wdt:P31 wd:Q7187;
+ wdt:P703 ?species .
+ VALUES (?species) { (wd:Q15978631 ) ( wd:Q83310 ) ( wd:Q184224 ) } .
+ ?wikidata_id rdfs:label "Shh"@en .
+ }
+```
+
+It is possible to run queries through curl with
+
+```
+curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query="
+ SELECT DISTINCT ?alias
+ WHERE {
+ wd:Q24420953 rdfs:label ?name ;
+ skos:altLabel ?alias .
+ FILTER(LANG(?name) = \"en\" && LANG(?alias) = \"en\").
+ }"
+```
+!#
+
+(define-module (gn db sources wikidata)
+ #:export (wikidata-query-geneids
+ wikidata-query-gene-aliases
+ )
+)
+
+;; Wikidata properties and entities written as prefixed names. They are spliced
+;; verbatim into the query text built by the procedures in this module.
+;; NOTE(review): ps-encoded-by and wd-shh-rat are not referenced by the query
+;; builders visible in this file.
+(define ps-encoded-by "ps:P702")
+;; wdt-instance-of and wdt-in-taxon are the predicates used by wikidata-query-geneids.
+(define wdt-instance-of "wdt:P31")
+(define wdt-in-taxon "wdt:P703")
+;; The three taxon entities listed in the VALUES clause of wikidata-query-geneids.
+(define wd-human "wd:Q15978631")
+(define wd-mouse "wd:Q83310")
+(define wd-rat "wd:Q184224")
+;; Object of the wdt-instance-of triple in wikidata-query-geneids.
+(define wd-gene "wd:Q7187")
+(define wd-shh-rat "wd:Q24420953")
+
+(define (sparql-escape-literal s)
+  "Escape S so that it can be placed between the double quotes of a SPARQL
+string literal: backslash, double quote, newline and carriage return each get
+a backslash escape, every other character is copied unchanged."
+  (string-concatenate
+   (map (lambda (c)
+          (case c
+            ((#\\) "\\\\")
+            ((#\") "\\\"")
+            ((#\newline) "\\n")
+            ((#\return) "\\r")
+            (else (string c))))
+        (string->list s))))
+
+(define (wikidata-query-geneids gene_name)
+  "SPARQL query to get the wikidata identifiers pointing to genes of listed species, e.g. 'Shh'.
+GENE_NAME is matched against the English rdfs:label. It is escaped before being
+placed in the query, so quotes or backslashes in the name cannot terminate the
+string literal."
+  (string-append
+   "SELECT DISTINCT ?wikidata_id
+ WHERE {
+ ?wikidata_id " wdt-instance-of " " wd-gene ";
+ " wdt-in-taxon " ?species .
+ VALUES (?species) { (" wd-human " ) ( " wd-mouse" ) ( " wd-rat" ) } .
+ ?wikidata_id rdfs:label \"" (sparql-escape-literal gene_name) "\"@en .}"))
+
+(define (wikidata-query-gene-aliases wikidata_id)
+  "Build the SPARQL text that selects the English skos:altLabel aliases of
+WIKIDATA_ID as plain strings. Only the expanded form of the id is supported,
+e.g. <http://www.wikidata.org/entity/Q24420953>, angle brackets included."
+  (let ((select-head "SELECT DISTINCT ?stripped_alias
+ WHERE { ")
+        (pattern-tail " rdfs:label ?name ;
+ skos:altLabel ?alias .
+ BIND (STR(?alias) AS ?stripped_alias) .
+ FILTER(LANG(?name) = \"en\" && LANG(?alias) = \"en\").}"))
+    ;; The id is inserted as-is between the fixed head and tail of the query.
+    (string-append select-head wikidata_id pattern-tail)))
diff --git a/gn/db/sparql.scm b/gn/db/sparql.scm
index b7d94f3..bd7a306 100644
--- a/gn/db/sparql.scm
+++ b/gn/db/sparql.scm
@@ -2,23 +2,25 @@
Module for handling SPARQL primitives.
-Note that GN queries should go into gn/data - this is currently not
+Note that GN queries should go into gn/db/sources - this is currently not
the case.
!#
(define-module (gn db sparql)
- #:use-module (json)
- #:use-module (ice-9 match)
+ #:use-module (gn cache memoize)
+ #:use-module (gn db sources wikidata)
#:use-module (ice-9 format)
#:use-module (ice-9 iconv)
+ #:use-module (ice-9 match)
#:use-module (ice-9 receive)
#:use-module (ice-9 string-fun)
+ #:use-module (json)
+ #:use-module (srfi srfi-1)
#:use-module (web client)
+ #:use-module (web gn-uri)
#:use-module (web request)
#:use-module (web uri)
- #:use-module (gn cache memoize)
- #:use-module (web gn-uri)
#:export (memo-sparql-species
memo-sparql-species-meta
@@ -26,6 +28,8 @@ the case.
sparql-groups-meta
sparql-group-info
memo-sparql-wd-species-info
+ memo-sparql-wd-gene-aliases
+ memo-sparql-wd-geneids
compile-species
compile-groups-meta
get-rows
@@ -72,7 +76,9 @@ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
(define (sparql-tsv endpoint-url query)
"Execute raw SPARQL query returning response as a UTF8 string, e.g.
-(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\"))
+(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\")).
+
+Note this procedure works for wikidata, but not for gn!
"
; GET /sparql?query=SELECT%20DISTINCT%20%2A%20where%20%7B%0A%20%20wd%3AQ158695%20wdt%3AP225%20%3Fo%20.%0A%7D%20limit%205 HTTP/2
(receive (response-status response-body)
@@ -92,7 +98,9 @@ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
(unpack "bindings" (unpack "results" response)))
(define (sparql-scm endpoint-url query)
- "Return dual S-exp 'resultset' of varnames and results"
+ "Return dual S-exp 'resultset' of varnames and results.
+
+Note this procedure works for GN, but does not yet work for wikidata"
+ ;; QUERY is wrapped by gn-sparql-prefix and run through sparql-exec; the reply is
+ ;; parsed with json-string->scm, and names and results come back as two values.
(let ((response (json-string->scm
(sparql-exec endpoint-url (gn-sparql-prefix query)))))
(values (sparql-names response) (sparql-results response))))
@@ -160,9 +168,47 @@ SELECT DISTINCT ?taxon ?ncbi ?descr where {
")))
+(define (flatten lst)
+  "Return the atoms of the arbitrarily nested LST as one flat list, left to
+right. Empty sublists contribute nothing; a non-pair argument yields a
+one-element list."
+  ;; Walk the tree right to left so every atom can be consed onto the front
+  ;; of the accumulated result.
+  (let walk ((node lst) (acc '()))
+    (cond ((null? node) acc)
+          ((pair? node) (walk (car node) (walk (cdr node) acc)))
+          (else (cons node acc)))))
+
+(define (remove-quotes s)
+  "Return S without its surrounding double quotes. A string that is not
+wrapped in a pair of double quotes (including the empty string and
+one-character strings) is returned unchanged."
+  (let ((len (string-length s)))
+    ;; Two characters are the minimum for an opening and a closing quote;
+    ;; checking the length first also keeps substring within bounds.
+    (if (and (>= len 2)
+             (char=? (string-ref s 0) #\")
+             (char=? (string-ref s (- len 1)) #\"))
+        (substring s 1 (- len 1))
+        s)))
+
;; sparql-wd-species-info wrapped by memoize from (gn cache memoize).
;; NOTE(review): presumably repeated calls with the same argument reuse the first result - confirm there.
(define memo-sparql-wd-species-info
(memoize sparql-wd-species-info))
+(define (sparql-wd-geneids gene-name)
+  "Look up GENE-NAME on the wikidata endpoint and return the expanded wikidata ids, e.g.
+(\"<http://www.wikidata.org/entity/Q14860079>\" \"<http://www.wikidata.org/entity/Q24420953>\")"
+  (let ((query (wikidata-query-geneids gene-name)))
+    (receive (header rows)
+        (tsv->scm (sparql-tsv (wd-sparql-endpoint-url) query))
+      ;; Every row contributes its first column only.
+      (map car rows))))
+
+;; sparql-wd-geneids wrapped by memoize from (gn cache memoize).
+;; NOTE(review): presumably a repeated gene name is answered without a new request - confirm there.
+(define memo-sparql-wd-geneids
+ (memoize sparql-wd-geneids))
+
+(define (sparql-wd-gene-aliases geneids)
+  "Return a flattened and deduplicated list of gene aliases for GENEIDS, a list
+of expanded wikidata ids in the form returned by sparql-wd-geneids, e.g.
+(sparql-wd-gene-aliases '(\"<http://www.wikidata.org/entity/Q14860079>\" \"<http://www.wikidata.org/entity/Q24420953>\"))
+"
+  (define (aliases-for geneid)
+    ;; One query per id; keep the first column of every returned row.
+    (receive (header rows)
+        (tsv->scm (sparql-tsv (wd-sparql-endpoint-url)
+                              (wikidata-query-gene-aliases geneid)))
+      (map car rows)))
+  ;; The per-id results form a list of lists; flatten them, strip the
+  ;; surrounding quotes from each value and drop repeated aliases.
+  (let* ((quoted (flatten (map aliases-for geneids)))
+         (aliases (map remove-quotes quoted)))
+    (delete-duplicates aliases)))
+
+;; sparql-wd-gene-aliases wrapped by memoize from (gn cache memoize).
+;; NOTE(review): the argument is a list of ids; whether an equal list gives a cache
+;; hit depends on how memoize compares its keys - confirm there.
+(define memo-sparql-wd-gene-aliases
+ (memoize sparql-wd-gene-aliases))
+
+
#!
gn:Mus_musculus rdf:type gnc:species .
gn:Mus_musculus gnt:name "Mouse" .
diff --git a/gn/runner/gemma.scm b/gn/runner/gemma.scm
index 69991dd..9a5c0fc 100644
--- a/gn/runner/gemma.scm
+++ b/gn/runner/gemma.scm
@@ -39,24 +39,27 @@
))
)
-(define (invoke-gemma-wrapper-loco name trait-name pheno-fn)
+(define (invoke-gemma-wrapper-loco name trait-name trait-fn pheno-fn geno-fn)
"Create a tmpdir and invoke gemma-wrapper using parallel LOCO. Note that at this point we use a number of defaults for BXD"
(let* [(population "BXD")
(sys-tmpdir (getenv "TMPDIR"))
(tmpdir (mkdtemp (string-append sys-tmpdir "/run-gemma-XXXXXX")))
(k-json-fn (string-append tmpdir "/K.json"))
- (gwa-json-fn (string-append tmpdir "/GWA.json"))]
+ (gwa-json-fn (string-append tmpdir "/GWA.json"))
+ ;; NOTE(review): trait-fn is appended to tmpdir verbatim, so this presumably expects
+ ;; a bare file name rather than a path containing directories - confirm against callers.
+ (trait-json-fn (string-append tmpdir "/" trait-fn))]
+ ;; The copy inside tmpdir is what the second gemma-wrapper call receives via --meta;
+ ;; it is deleted again at the end of this procedure.
+ (copy-file trait-fn trait-json-fn)
;; --- First we compute K - control output goes to K.json
- (let [(err (system (string-append "/gemma-wrapper/bin/gemma-wrapper --verbose --population \"" population "\" --name \"" name "\" --trait \"" trait-name "\" --verbose --loco --json --parallel -- -gk -g BXD.8_geno.txt.gz -p " pheno-fn " -a BXD.8_snps.txt > " k-json-fn )))]
+ (let [(err (system (string-append "/gemma-wrapper/bin/gemma-wrapper --verbose --population \"" population "\" --name \"" name "\" --trait \"" trait-name "\" --verbose --loco --json --parallel -- -gk -g " geno-fn " -p " pheno-fn " -a BXD.8_snps.txt > " k-json-fn )))]
(if (not (= err 0))
(exit err)))
- (let [(err (system (string-append "/gemma-wrapper/bin/gemma-wrapper --population \"" population "\" --name \"" name "\" --id \"" trait-name "\" --trait \"" trait-name "\" --verbose --loco --json --input " k-json-fn " -- -g BXD.8_geno.txt.gz -p " pheno-fn " -a BXD.8_snps.txt -lmm 9 -maf 0.1 > " gwa-json-fn)))]
+ (let [(err (system (string-append "/gemma-wrapper/bin/gemma-wrapper --meta \"" trait-json-fn "\" --population \"" population "\" --name \"" name "\" --id \"" trait-name "\" --trait \"" trait-name "\" --verbose --loco --json --lmdb --input " k-json-fn " -- -g " geno-fn " -p " pheno-fn " -a BXD.8_snps.txt -lmm 9 -maf 0.1 > " gwa-json-fn)))]
(if (not (= err 0))
(exit err)))
- ;; (delete-file pheno-fn)
- ;; (delete-file gwa-json-fn)
- ;; (delete-file k-json-fn)
- ;; (rmdir tmpdir)
+ ;; Cleanup, reached only when both commands returned 0 (otherwise exit is called above):
+ ;; the caller-supplied pheno-fn, both JSON outputs and the trait copy are removed,
+ ;; then tmpdir itself.
+ ;; NOTE(review): GWA.json is deleted right after it is written; presumably the
+ ;; results are kept elsewhere through --lmdb - confirm.
+ (delete-file pheno-fn)
+ (delete-file gwa-json-fn)
+ (delete-file k-json-fn)
+ (delete-file trait-json-fn)
+ (rmdir tmpdir)
)
)