diff options
Diffstat (limited to 'gn')
-rw-r--r-- | gn/data/hits.scm | 22 | ||||
-rw-r--r-- | gn/data/strains.scm | 2 | ||||
-rw-r--r-- | gn/db/sources/wikidata.scm | 65 | ||||
-rw-r--r-- | gn/db/sparql.scm | 60 | ||||
-rw-r--r-- | gn/runner/gemma.scm | 19 |
5 files changed, 149 insertions, 19 deletions
diff --git a/gn/data/hits.scm b/gn/data/hits.scm index f7ce49e..85c4912 100644 --- a/gn/data/hits.scm +++ b/gn/data/hits.scm @@ -5,6 +5,7 @@ #:use-module (ice-9 iconv) #:use-module (ice-9 receive) #:use-module (ice-9 string-fun) + #:use-module (srfi srfi-9) ;; #:use-module (gn db sparql) #:use-module (dbi dbi) #:use-module (gn db mysql) @@ -17,11 +18,26 @@ get-precompute-hit set-precompute-hit-status! update-precompute! + hit-data-id + hit-probeset-id + hit-probesetfreeze-id )) -(define (get-precompute-hits db prev-id num) - (dbi-query db (string-append "select Locus, DataId, ProbeSetId, ProbeSetFreezeId from ProbeSetXRef where DataId>" (int-to-string prev-id) " AND Locus_old is NULL ORDER BY DataId LIMIT " (format #f "~d" num))) - (get-rows db '())) + +(define-record-type <hit> + (make-hit data-id probeset-id probesetfreeze-id) + hit? + (data-id hit-data-id) + (probeset-id hit-probeset-id) + (probesetfreeze-id hit-probesetfreeze-id) + ) + +(define (get-precompute-hits db first-id num) + (dbi-query db (string-append "select Locus, DataId, ProbeSetId, ProbeSetFreezeId from ProbeSetXRef where DataId>" (int-to-string first-id) " AND Locus_old is NULL ORDER BY DataId LIMIT " (int-to-string num))) + (map (lambda (r) + (make-hit (assoc-ref r "DataId") (assoc-ref r "ProbeSetId") (assoc-ref r "ProbeSetFreezeId"))) + (get-rows db '()) + )) (define (get-precompute-hit db prev-id) (car (get-precompute-hits db prev-id 1))) diff --git a/gn/data/strains.scm b/gn/data/strains.scm index e5f839b..07b69ff 100644 --- a/gn/data/strains.scm +++ b/gn/data/strains.scm @@ -25,7 +25,7 @@ "Return assoc list of tuples of strain id+names: ((4 . BXD1) (5 . BXD2) (6 . BXD5) (7 . BXD6)... -used-for-mapping? will say whether the strains/individuals are used for mapping. Always True, FIXME +optional key used-for-mapping? will say whether the strains/individuals are used for mapping. 
" (call-with-db (lambda (db) diff --git a/gn/db/sources/wikidata.scm b/gn/db/sources/wikidata.scm new file mode 100644 index 0000000..954ce93 --- /dev/null +++ b/gn/db/sources/wikidata.scm @@ -0,0 +1,65 @@ +#! + +Wikidata queries, initially lifted over from the gn3 gene-alias code (that was written in Racket). + +Note you can take a SPARQL query and push it into https://query.wikidata.org/. E.g. generate a query and +copy paste into the query service: + +scheme@(guile-user) [3]> (display (wikidata-query-geneids "Shh")) +``` +SELECT DISTINCT ?wikidata_id + WHERE { + ?wikidata_id wdt:P31 wd:Q7187; + wdt:P703 ?species . + VALUES (?species) { (wd:Q15978631 ) ( wd:Q83310 ) ( wd:Q184224 ) } . + ?wikidata_id rdfs:label "Shh"@en . + } +``` + +It is possible to run queries through curl with + +``` +curl -G https://query.wikidata.org/sparql -H "Accept: application/json; charset=utf-8" --data-urlencode query=" + SELECT DISTINCT ?alias + WHERE { + wd:Q24420953 rdfs:label ?name ; + skos:altLabel ?alias . + FILTER(LANG(?name) = \"en\" && LANG(?alias) = \"en\"). + }" +``` +!# + +(define-module (gn db sources wikidata) + #:export (wikidata-query-geneids + wikidata-query-gene-aliases + ) +) + +(define ps-encoded-by "ps:P702") +(define wdt-instance-of "wdt:P31") +(define wdt-in-taxon "wdt:P703") +(define wd-human "wd:Q15978631") +(define wd-mouse "wd:Q83310") +(define wd-rat "wd:Q184224") +(define wd-gene "wd:Q7187") +(define wd-shh-rat "wd:Q24420953") + +(define (wikidata-query-geneids gene_name) + "SPARQL query to get the wikidata identifiers pointing to genes of listed species, e.g. 'Shh'" + (string-append + "SELECT DISTINCT ?wikidata_id + WHERE { + ?wikidata_id " wdt-instance-of " " wd-gene "; + " wdt-in-taxon " ?species . + VALUES (?species) { (" wd-human " ) ( " wd-mouse" ) ( " wd-rat" ) } . + ?wikidata_id rdfs:label \"" gene_name "\"@en .}")) + +(define (wikidata-query-gene-aliases wikidata_id) + "SPARQL query to get a list of gene aliases based on a wikidata identifier, e.g. 
for Q24420953. This +version supports the expanded id only, so <http://www.wikidata.org/entity/Q24420953> including the <,>." + (string-append + "SELECT DISTINCT ?stripped_alias + WHERE { " wikidata_id " rdfs:label ?name ; + skos:altLabel ?alias . + BIND (STR(?alias) AS ?stripped_alias) . + FILTER(LANG(?name) = \"en\" && LANG(?alias) = \"en\").}")) diff --git a/gn/db/sparql.scm b/gn/db/sparql.scm index b7d94f3..bd7a306 100644 --- a/gn/db/sparql.scm +++ b/gn/db/sparql.scm @@ -2,23 +2,25 @@ Module for handling SPARQL primitives. -Note that GN queries should go into gn/data - this is currently not +Note that GN queries should go into gn/db/sources - this is currently not the case. !# (define-module (gn db sparql) - #:use-module (json) - #:use-module (ice-9 match) + #:use-module (gn cache memoize) + #:use-module (gn db sources wikidata) #:use-module (ice-9 format) #:use-module (ice-9 iconv) + #:use-module (ice-9 match) #:use-module (ice-9 receive) #:use-module (ice-9 string-fun) + #:use-module (json) + #:use-module (srfi srfi-1) #:use-module (web client) + #:use-module (web gn-uri) #:use-module (web request) #:use-module (web uri) - #:use-module (gn cache memoize) - #:use-module (web gn-uri) #:export (memo-sparql-species memo-sparql-species-meta @@ -26,6 +28,8 @@ the case. sparql-groups-meta sparql-group-info memo-sparql-wd-species-info + memo-sparql-wd-gene-aliases + memo-sparql-wd-geneids compile-species compile-groups-meta get-rows @@ -72,7 +76,9 @@ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> (define (sparql-tsv endpoint-url query) "Execute raw SPARQL query returning response as a UTF8 string, e.g. -(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\")) +(tsv->scm (sparql-tsv (wd-sparql-endpoint-url) \"wd:Q158695\")). + +Note this procedure works for wikidata, but not for gn! 
" ; GET /sparql?query=SELECT%20DISTINCT%20%2A%20where%20%7B%0A%20%20wd%3AQ158695%20wdt%3AP225%20%3Fo%20.%0A%7D%20limit%205 HTTP/2 (receive (response-status response-body) @@ -92,7 +98,9 @@ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> (unpack "bindings" (unpack "results" response))) (define (sparql-scm endpoint-url query) - "Return dual S-exp 'resultset' of varnames and results" + "Return dual S-exp 'resultset' of varnames and results. + +Note this procedure works for GN, but does not yet work for wikidata" (let ((response (json-string->scm (sparql-exec endpoint-url (gn-sparql-prefix query))))) (values (sparql-names response) (sparql-results response)))) @@ -160,9 +168,47 @@ SELECT DISTINCT ?taxon ?ncbi ?descr where { "))) +(define (flatten lst) + (cond ((null? lst) '()) + ((pair? lst) (append (flatten (car lst)) (flatten (cdr lst)))) + (else (list lst)))) + +(define (remove-quotes s) + (substring s 1 (- (string-length s) 1))) + (define memo-sparql-wd-species-info (memoize sparql-wd-species-info)) +(define (sparql-wd-geneids gene-name) + "Return a list of expanded wikidata ids, e.g. 
+(\"<http://www.wikidata.org/entity/Q14860079>\" \"<http://www.wikidata.org/entity/Q24420953>\")" + (receive (type values) + (tsv->scm (sparql-tsv (wd-sparql-endpoint-url) (wikidata-query-geneids gene-name))) + (map (lambda (item) (car item)) values) ;; flatten list + )) + +(define memo-sparql-wd-geneids + (memoize sparql-wd-geneids)) + +(define (sparql-wd-gene-aliases geneids) + "Returns a flattened and deduplicated list of gene aliases for the given geneids, e.g. with +(sparql-wd-gene-aliases '(\"Q14860079\" \"Q24420953\")) +" + (let* ([aliases + (map (lambda (geneid) + (receive (type values) + (tsv->scm (sparql-tsv (wd-sparql-endpoint-url) (wikidata-query-gene-aliases (pk geneid)))) + (map (lambda (item) (car item)) values) ;; flatten list)) + ) + ) geneids)] + [rm-quotes-aliases (map (lambda (s) (remove-quotes s)) (flatten aliases))] + ) + (delete-duplicates rm-quotes-aliases))) + +(define memo-sparql-wd-gene-aliases + (memoize sparql-wd-gene-aliases)) + + #! gn:Mus_musculus rdf:type gnc:species . gn:Mus_musculus gnt:name "Mouse" . diff --git a/gn/runner/gemma.scm b/gn/runner/gemma.scm index 69991dd..9a5c0fc 100644 --- a/gn/runner/gemma.scm +++ b/gn/runner/gemma.scm @@ -39,24 +39,27 @@ )) ) -(define (invoke-gemma-wrapper-loco name trait-name pheno-fn) +(define (invoke-gemma-wrapper-loco name trait-name trait-fn pheno-fn geno-fn) "Create a tmpdir and invoke gemma-wrapper using parallel LOCO. 
Note that at this point we use a number of defaults for BXD" (let* [(population "BXD") (sys-tmpdir (getenv "TMPDIR")) (tmpdir (mkdtemp (string-append sys-tmpdir "/run-gemma-XXXXXX"))) (k-json-fn (string-append tmpdir "/K.json")) - (gwa-json-fn (string-append tmpdir "/GWA.json"))] + (gwa-json-fn (string-append tmpdir "/GWA.json")) + (trait-json-fn (string-append tmpdir "/" trait-fn))] + (copy-file trait-fn trait-json-fn) ;; --- First we compute K - control output goes to K.json - (let [(err (system (string-append "/gemma-wrapper/bin/gemma-wrapper --verbose --population \"" population "\" --name \"" name "\" --trait \"" trait-name "\" --verbose --loco --json --parallel -- -gk -g BXD.8_geno.txt.gz -p " pheno-fn " -a BXD.8_snps.txt > " k-json-fn )))] + (let [(err (system (string-append "/gemma-wrapper/bin/gemma-wrapper --verbose --population \"" population "\" --name \"" name "\" --trait \"" trait-name "\" --verbose --loco --json --parallel -- -gk -g " geno-fn " -p " pheno-fn " -a BXD.8_snps.txt > " k-json-fn )))] (if (not (= err 0)) (exit err))) - (let [(err (system (string-append "/gemma-wrapper/bin/gemma-wrapper --population \"" population "\" --name \"" name "\" --id \"" trait-name "\" --trait \"" trait-name "\" --verbose --loco --json --input " k-json-fn " -- -g BXD.8_geno.txt.gz -p " pheno-fn " -a BXD.8_snps.txt -lmm 9 -maf 0.1 > " gwa-json-fn)))] + (let [(err (system (string-append "/gemma-wrapper/bin/gemma-wrapper --meta \"" trait-json-fn "\" --population \"" population "\" --name \"" name "\" --id \"" trait-name "\" --trait \"" trait-name "\" --verbose --loco --json --lmdb --input " k-json-fn " -- -g " geno-fn " -p " pheno-fn " -a BXD.8_snps.txt -lmm 9 -maf 0.1 > " gwa-json-fn)))] (if (not (= err 0)) (exit err))) - ;; (delete-file pheno-fn) - ;; (delete-file gwa-json-fn) - ;; (delete-file k-json-fn) - ;; (rmdir tmpdir) + (delete-file pheno-fn) + (delete-file gwa-json-fn) + (delete-file k-json-fn) + (delete-file trait-json-fn) + (rmdir tmpdir) ) ) |