diff options
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/lmdb-publishdata-export.scm | 229 | ||||
-rwxr-xr-x | scripts/precompute/list-traits-to-compute.scm | 76 | ||||
-rwxr-xr-x | scripts/precompute/run-gemma.scm | 42 |
3 files changed, 318 insertions, 29 deletions
diff --git a/scripts/lmdb-publishdata-export.scm b/scripts/lmdb-publishdata-export.scm new file mode 100755 index 0000000..8427112 --- /dev/null +++ b/scripts/lmdb-publishdata-export.scm @@ -0,0 +1,229 @@ +#! /usr/bin/env guile +!# +;; To run this script run: +;; +;; $ guix shell guile guile-hashing guile3-dbi guile3-dbd-mysql \ +;; guile-lmdb -- guile lmdb-publishdata-export.scm +;; conn.scm +;; +;; Example conn.scm: +;; ((sql-username . "webqtlout") +;; (sql-password . "xxxx") +;; (sql-database . "db_webqtl") +;; (sql-host . "localhost") +;; (sql-port . 3306) +;; (output-dir . "/tmp/data") +;; (log-file . "/tmp/export.log")) + +(use-modules (dbi dbi) + (rnrs bytevectors) + (system foreign) + (ice-9 match) + (srfi srfi-1) + (srfi srfi-26) + (srfi srfi-43) + (rnrs io ports) + (hashing md5) + ((lmdb lmdb) #:prefix mdb:) + (ice-9 format) + (ice-9 exceptions) + (json) + (logging logger) + (logging rotating-log) + (logging port-log) + (oop goops)) + + +;; Set up logging +(define* (setup-logging #:key (log-file "lmdb-dump-log")) + "Initialize the logging system with rotating file logs and error port output. + Creates a new logger, adds rotating and error port handlers, + sets it as the default logger, and opens the log for writing." + (let ((lgr (make <logger>)) + (rotating (make <rotating-log> + #:num-files 3 + #:size-limit 1024 + #:file-name log-file)) + (err (make <port-log> #:port (current-error-port)))) + ;; add the handlers to our logger + (add-handler! lgr rotating) + (add-handler! lgr err) + ;; make this the application's default logger + (set-default-logger! lgr) + (open-log! lgr))) + +(define (shutdown-logging) + "Properly shutdown the logging system. + Flushes any pending log messages, closes the log handlers, + and removes the default logger reference." + (flush-log) ;; since no args, it uses the default + (close-log!) ;; since no args, it uses the default + (set-default-logger! #f)) + +(define (call-with-database backend connection-string proc) + "Execute PROC with an open database connection. BACKEND is the +database type (e.g. \"mysql\"). CONNECTION-STRING is the database +connection string." + (let ((db #f)) + (dynamic-wind + (lambda () + (set! db (dbi-open backend connection-string))) + (cut proc db) + (lambda () + (when db + (dbi-close db)))))) + +(define (call-with-target-database connection-settings proc) + "Connect to the target database using CONNECTION-SETTINGS and execute +PROC." + (call-with-database "mysql" (string-join + (list (assq-ref connection-settings 'sql-username) + (assq-ref connection-settings 'sql-password) + (assq-ref connection-settings 'sql-database) + "tcp" + (assq-ref connection-settings 'sql-host) + (number->string + (assq-ref connection-settings 'sql-port))) + ":") + proc)) + +(define* (lmdb-save path key value) + "Save a NUM with KEY to PATH." + (mdb:with-env-and-txn + (path) (env txn) + (let ((dbi (mdb:dbi-open txn #f 0))) + (mdb:put txn dbi key + (if (number? value) + (number->string value) + value))))) + +(define (sql-exec db statement) + "Execute an SQL STATEMENT on database connection DB. Throws an error +if the statement execution fails." + (dbi-query db statement) + (database-check-status db)) + +(define (sql-fold proc init db statement) + "Fold over SQL query results." + (sql-exec db statement) + (let loop ((result init)) + (let ((row (dbi-get_row db))) + (if row + (loop (proc row result)) + result)))) + +(define (sql-for-each proc db statement) + "Apply PROC to each row returned by STATEMENT." + (sql-fold (lambda (row _) + (proc row)) + #f db statement)) + +(define (sql-map proc db statement) + "Map PROC over rows returned by STATEMENT." + (sql-fold (lambda (row result) + (cons (proc row) result)) + (list) db statement)) + +(define (sql-find db statement) + (sql-exec db statement) + (dbi-get_row db)) + +(define (database-check-status db) + "Check the status of the last database operation on DB. Throws an +error if the status code is non-zero." + (match (dbi-get_status db) + ((code . str) + (unless (zero? code) + (error str))))) + +(define* (save-dataset-values settings) + "Main function to extract and save dataset values. Queries the +database for datasets and their values, computes MD5 hashes for +dataset-trait combinations, and saves strain values to LMDB files in +/export5/lmdb-data-hashes/." + (dynamic-wind + (lambda () + (setup-logging #:log-file (assq-ref settings 'log-file)) + (log-msg 'INFO "Starting dataset value extraction")) + (lambda () + (call-with-target-database + settings + (lambda (db) + (sql-for-each + (lambda (row) + (match row + ((("Name" . dataset-name) + ("Id" . trait-id)) + (let* ((data-query (format #f "SELECT +JSON_ARRAYAGG(JSON_ARRAY(Strain.Name, PublishData.Value)) AS data, + MD5(JSON_ARRAY(Strain.Name, PublishData.Value)) as md5hash +FROM + PublishData + INNER JOIN Strain ON PublishData.StrainId = Strain.Id + INNER JOIN PublishXRef ON PublishData.Id = PublishXRef.DataId + INNER JOIN PublishFreeze ON PublishXRef.InbredSetId = PublishFreeze.InbredSetId +LEFT JOIN PublishSE ON + PublishSE.DataId = PublishData.Id AND + PublishSE.StrainId = PublishData.StrainId +LEFT JOIN NStrain ON + NStrain.DataId = PublishData.Id AND + NStrain.StrainId = PublishData.StrainId +WHERE + PublishFreeze.Name = \"~a\" AND + PublishXRef.Id = ~a AND + PublishFreeze.public > 0 AND + PublishData.value IS NOT NULL AND + PublishFreeze.confidentiality < 1 +ORDER BY + LENGTH(Strain.Name), Strain.Name" dataset-name trait-id))) + (match (call-with-target-database + settings + (lambda (db2) (sql-find db2 data-query))) + ((("data" . data) + ("md5hash" . md5-hash)) + (let* ((trait-name (format #f "~a~a" dataset-name trait-id)) + (base-dir (assq-ref settings 'output-dir)) + (out (format #f "~a-~a" trait-name + (substring md5-hash 0 12))) + (out-dir (format #f "~a/~a" base-dir out))) + (log-msg + 'INFO (format #f "Writing ~a to: ~a" trait-name out-dir)) + (unless (file-exists? out-dir) + (mkdir out-dir)) + (lmdb-save (format #f "~a/index" base-dir) trait-name out) + (vector-for-each + (lambda (_ x) + (match x + (#(strain value) + (lmdb-save out-dir strain value)))) + (json-string->scm data))))))))) + db + "SELECT DISTINCT PublishFreeze.Name, PublishXRef.Id FROM +PublishData INNER JOIN Strain ON PublishData.StrainId = Strain.Id +INNER JOIN PublishXRef ON PublishData.Id = PublishXRef.DataId +INNER JOIN PublishFreeze ON PublishXRef.InbredSetId = PublishFreeze.InbredSetId +LEFT JOIN PublishSE ON + PublishSE.DataId = PublishData.Id AND + PublishSE.StrainId = PublishData.StrainId +LEFT JOIN NStrain ON + NStrain.DataId = PublishData.Id AND + NStrain.StrainId = PublishData.StrainId +WHERE + PublishFreeze.public > 0 AND + PublishFreeze.confidentiality < 1 +ORDER BY + PublishFreeze.Id, PublishXRef.Id")))) + (lambda () + (shutdown-logging)))) + +(define main + (match-lambda* + ((_ connection-settings-file) + (save-dataset-values (call-with-input-file connection-settings-file + read))) + ((arg0 _ ...) + (display (format "Usage: ~a CONNECTION-SETTINGS-FILE~%" arg0) + (current-error-port)) + (exit #f)))) + +(apply main (command-line)) diff --git a/scripts/precompute/list-traits-to-compute.scm b/scripts/precompute/list-traits-to-compute.scm index 2c48d83..9f900d1 100755 --- a/scripts/precompute/list-traits-to-compute.scm +++ b/scripts/precompute/list-traits-to-compute.scm @@ -78,19 +78,50 @@ When that is the case we might as well write the phenotype file because we have (json) (rnrs bytevectors) (srfi srfi-1) + (srfi srfi-9) (srfi srfi-19) ; time ) +(define (get-trait db probeset-id) + (dbi-query db (string-append "select Id,Chr,Mb,Name,Symbol,description from ProbeSet where Id=" (int-to-string probeset-id) " limit 1")) + (get-row db) + ) + +#! + +The following is produced by gemma-wrapper as metadata + + "meta": { + "type": "gemma-wrapper", + "version": "0.99.7-pre1", + "population": "BXD", + "name": "HC_U_0304_R", + "trait": "101500_at", + "url": "https://genenetwork.org/show_trait?trait_id=101500_at&dataset=HC_U_0304_R", + "archive_GRM": "46bfba373fe8c19e68be6156cad3750120280e2e-gemma-cXX.tar.xz", + "archive_GWA": "779a54a59e4cd03608178db4068791db4ca44ab3-gemma-GWA.tar.xz", + "dataid": 75629, + "probesetid": 1097, + "probesetfreezeid": 7 + } -(define (write-json-ld id recs) +!# + + +(define (write-json-ld id name trait trait-name probesetfreeze-id probeset-id recs) ;; see also https://www.w3.org/2018/jsonld-cg-reports/json-ld/ (display id) + (display ":") + (display name) + (display ":") + (display trait-name) (newline) (let* [(traits (map (lambda (r) (match r [(strain-id . value) (cons (bxd-name strain-id) value)] )) (reverse recs))) + (uri (format #f "https://genenetwork.org/show_trait?trait_id=~a&dataset=~a" trait-name name)) (sha256 (sha-256->string (sha-256 (string->utf8 (scm->json-string traits))))) (json-data `(("@context" . "https://genenetwork.org/resource") (type . traits) @@ -98,8 +129,16 @@ When that is the case we might as well write the phenotype file because we have (steps . ()) (sha256 . ((input-traits . ,sha256))) (time . ,(date->string (time-utc->date (current-time)))))) - (traits . - ((,id . ,traits)))))] + (data . + ((,id . + ((group . "BXD") + (probesetfreeze-id . ,probesetfreeze-id) + (probeset-id . ,probeset-id) + (name . ,name) + (trait-name . ,trait-name) + (uri . ,uri) + (traits . ,traits) + ))))))] (call-with-output-file (string-append (number->string id) ".json") (lambda (port) (put-string port (scm->json-string json-data)))) @@ -111,22 +150,23 @@ When that is the case we might as well write the phenotype file because we have (begin ;; (let [(bxd-strains (memo-bxd-strain-id-names #:used-for-mapping? #t))] (define (run-list-traits-to-compute db num prev-id) + ;; ---- Build a query to collect num traits (let* [(count (if (< batch-size num) batch-size num)) (rest (- num count)) (hits (get-precompute-hits db prev-id count)) - (data-ids (map (lambda (hit) - (let* [(data-id (assoc-ref hit "DataId")) - ; (data-id-str (int-to-string data-id)) - ] - data-id)) + (data-ids (map (lambda (h) + (hit-data-id h)) hits)) (data-str-ids (map (lambda (id) (string-append "Id=" (int-to-string id))) data-ids)) (data-ids-query (string-join data-str-ids " OR ")) (query (string-append "SELECT Id,StrainId,value FROM ProbeSetData WHERE " data-ids-query)) ] + (format #t "writing-phenotypes from ~d to ~d batch ~d to ~d\n" first-id (+ first-id num) prev-id (+ prev-id count)) + (display query) (dbi-query db query) + ;; ---- Walk each resulting trait and build a hash of data-id and list of trait values (let [(id-traits (get-rows db '())) (nrecs '())] (for-each (lambda (r) @@ -140,12 +180,20 @@ When that is the case we might as well write the phenotype file because we have )] (set! nrecs (assoc-set! nrecs data-id lst)))) id-traits) - (for-each (lambda (r) - (match r - ((id . recs) (if (has-bxd? recs) - (write-json-ld id recs) - )) - )) nrecs) + ;; --- create the json output as a file by walking traits and hits + (for-each (lambda (h) + (let* [ + (id (hit-data-id h)) + (probeset-id (hit-probeset-id h)) + (probesetfreeze-id (hit-probesetfreeze-id h)) + (name (dataset-name db probesetfreeze-id)) + (trait (get-trait db probeset-id)) + (trait-name (assoc-ref trait "Name")) + (recs (assoc-ref nrecs id)) + ] + (if (has-bxd? recs) + (write-json-ld id name trait trait-name probesetfreeze-id probeset-id recs) + ))) hits) (if (> rest 0) (run-list-traits-to-compute db rest (first (reverse data-ids)))) ;; start precompute ))) diff --git a/scripts/precompute/run-gemma.scm b/scripts/precompute/run-gemma.scm index e6a4e26..4952834 100755 --- a/scripts/precompute/run-gemma.scm +++ b/scripts/precompute/run-gemma.scm @@ -9,7 +9,7 @@ Run from base dir with and with some extra paths (for gemma) -~/opt/guix-pull/bin/guix shell -C -F xz tar time parallel coreutils-minimal guile guile-dbi guile-json ruby --expose=/home/wrk/iwrk/opensource/code/genetics/gemma-wrapper/=/gemma-wrapper --expose=/home/wrk/iwrk/opensource/code/genetics/gemma/=/gemma -- env TMPDIR=tmp GEMMA_COMMAND=/gemma/bin/gemma-0.98.5-linux-static-debug guile -L . -e main -s ./scripts/precompute/run-gemma.scm +~/opt/guix-pull/bin/guix shell -C -F xz python python-lmdb tar time parallel coreutils-minimal guile guile-dbi guile-json ruby --expose=/home/wrk/iwrk/opensource/code/genetics/gemma-wrapper/=/gemma-wrapper --expose=/home/wrk/iwrk/opensource/code/genetics/gemma/=/gemma -- env TMPDIR=tmp GEMMA_COMMAND=/gemma/bin/gemma-0.98.5-linux-static-debug guile -L . -e main -s ./scripts/precompute/run-gemma.scm --id 21529 !# @@ -30,24 +30,36 @@ and with some extra paths (for gemma) ;; (write args) (let* [ (option-spec '( (version (single-char #\v) (value #f)) + (id (value #t)) (help (single-char #\h) (value #f)))) (options (getopt-long args option-spec)) + (show-version (option-ref options 'version #f)) (help-wanted (option-ref options 'help #f))] - (display "RUNNING") + (if show-version + (begin + (display "run-gemma version 1.0\n") + (exit))) (if help-wanted - (format #t "list-traits-to-compute writes JSON traits files from the GN DB -Usage: list-traits-to-compute [options...] - -h, --help Display this help -")) - (let [(trait-name "115475")] - (call-with-input-file "115475.json" + (format #t "run-gemma +Usage: run-gemma [options...] filename(s) + --id Run on identifier + -v --version Display version + -h --help Display this help +") + (let* [(trait-id (option-ref options 'id "0")) + (trait-fn (string-append trait-id ".json")) + ] + + (call-with-input-file trait-fn (lambda (port) (let* [(json (json->scm port)) - (dataset (assoc-ref json "traits")) - (dataset-name (car (car dataset))) - (traits (assoc-ref dataset dataset-name)) + (dataset (car (assoc-ref json "data"))) + (data (cdr dataset)) + (dataset-name (assoc-ref data "name")) + (trait-name (assoc-ref data "trait-name")) + (traits (assoc-ref data "traits")) + (pheno-fn (string-append trait-id "-pheno.txt")) ] - (display dataset) - (write-pheno-file "pheno.txt" traits) - (invoke-gemma-wrapper-loco dataset-name trait-name "pheno.txt") - )))))) + (write-pheno-file pheno-fn traits) + (invoke-gemma-wrapper-loco dataset-name trait-name trait-fn pheno-fn "BXD.8_geno.txt.gz") + ))))))) |