aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/lmdb-publishdata-export.scm229
-rwxr-xr-xscripts/precompute/list-traits-to-compute.scm76
-rwxr-xr-xscripts/precompute/run-gemma.scm42
3 files changed, 318 insertions, 29 deletions
diff --git a/scripts/lmdb-publishdata-export.scm b/scripts/lmdb-publishdata-export.scm
new file mode 100755
index 0000000..8427112
--- /dev/null
+++ b/scripts/lmdb-publishdata-export.scm
@@ -0,0 +1,229 @@
+#! /usr/bin/env guile
+!#
+;; To run this script run:
+;;
+;; $ guix shell guile guile-hashing guile3-dbi guile3-dbd-mysql \
+;; guile-lmdb -- guile lmdb-publishdata-export.scm
+;; conn.scm
+;;
+;; Example conn.scm:
+;; ((sql-username . "webqtlout")
+;; (sql-password . "xxxx")
+;; (sql-database . "db_webqtl")
+;; (sql-host . "localhost")
+;; (sql-port . 3306)
+;; (output-dir . "/tmp/data")
+;; (log-file . "/tmp/export.log"))
+
+(use-modules (dbi dbi)
+ (rnrs bytevectors)
+ (system foreign)
+ (ice-9 match)
+ (srfi srfi-1)
+ (srfi srfi-26)
+ (srfi srfi-43)
+ (rnrs io ports)
+ (hashing md5)
+ ((lmdb lmdb) #:prefix mdb:)
+ (ice-9 format)
+ (ice-9 exceptions)
+ (json)
+ (logging logger)
+ (logging rotating-log)
+ (logging port-log)
+ (oop goops))
+
+
+;; Set up logging
+(define* (setup-logging #:key (log-file "lmdb-dump-log"))
+ "Initialize the logging system with rotating file logs and error port output.
+ Creates a new logger, adds rotating and error port handlers,
+ sets it as the default logger, and opens the log for writing."
+ (let ((lgr (make <logger>))
+ (rotating (make <rotating-log>
+ #:num-files 3
+ #:size-limit 1024
+ #:file-name log-file))
+ (err (make <port-log> #:port (current-error-port))))
+ ;; add the handlers to our logger
+ (add-handler! lgr rotating)
+ (add-handler! lgr err)
+ ;; make this the application's default logger
+ (set-default-logger! lgr)
+ (open-log! lgr)))
+
+(define (shutdown-logging)
+ "Properly shutdown the logging system.
+ Flushes any pending log messages, closes the log handlers,
+ and removes the default logger reference."
+ (flush-log) ;; since no args, it uses the default
+ (close-log!) ;; since no args, it uses the default
+ (set-default-logger! #f))
+
+(define (call-with-database backend connection-string proc)
+ "Execute PROC with an open database connection. BACKEND is the
+database type (e.g. \"mysql\"). CONNECTION-STRING is the database
+connection string."
+ (let ((db #f))
+ (dynamic-wind
+ (lambda ()
+ (set! db (dbi-open backend connection-string)))
+ (cut proc db)
+ (lambda ()
+ (when db
+ (dbi-close db))))))
+
+(define (call-with-target-database connection-settings proc)
+ "Connect to the target database using CONNECTION-SETTINGS and execute
+PROC."
+ (call-with-database "mysql" (string-join
+ (list (assq-ref connection-settings 'sql-username)
+ (assq-ref connection-settings 'sql-password)
+ (assq-ref connection-settings 'sql-database)
+ "tcp"
+ (assq-ref connection-settings 'sql-host)
+ (number->string
+ (assq-ref connection-settings 'sql-port)))
+ ":")
+ proc))
+
+(define* (lmdb-save path key value)
+ "Save a NUM with KEY to PATH."
+ (mdb:with-env-and-txn
+ (path) (env txn)
+ (let ((dbi (mdb:dbi-open txn #f 0)))
+ (mdb:put txn dbi key
+ (if (number? value)
+ (number->string value)
+ value)))))
+
+(define (sql-exec db statement)
+ "Execute an SQL STATEMENT on database connection DB. Throws an error
+if the statement execution fails."
+ (dbi-query db statement)
+ (database-check-status db))
+
+(define (sql-fold proc init db statement)
+ "Fold over SQL query results."
+ (sql-exec db statement)
+ (let loop ((result init))
+ (let ((row (dbi-get_row db)))
+ (if row
+ (loop (proc row result))
+ result))))
+
+(define (sql-for-each proc db statement)
+ "Apply PROC to each row returned by STATEMENT."
+ (sql-fold (lambda (row _)
+ (proc row))
+ #f db statement))
+
+(define (sql-map proc db statement)
+ "Map PROC over rows returned by STATEMENT."
+ (sql-fold (lambda (row result)
+ (cons (proc row) result))
+ (list) db statement))
+
+(define (sql-find db statement)
+ (sql-exec db statement)
+ (dbi-get_row db))
+
+(define (database-check-status db)
+ "Check the status of the last database operation on DB. Throws an
+error if the status code is non-zero."
+ (match (dbi-get_status db)
+ ((code . str)
+ (unless (zero? code)
+ (error str)))))
+
+(define* (save-dataset-values settings)
+ "Main function to extract and save dataset values. Queries the
+database for datasets and their values, computes MD5 hashes for
+dataset-trait combinations, and saves strain values to LMDB files in
+/export5/lmdb-data-hashes/."
+ (dynamic-wind
+ (lambda ()
+ (setup-logging #:log-file (assq-ref settings 'log-file))
+ (log-msg 'INFO "Starting dataset value extraction"))
+ (lambda ()
+ (call-with-target-database
+ settings
+ (lambda (db)
+ (sql-for-each
+ (lambda (row)
+ (match row
+ ((("Name" . dataset-name)
+ ("Id" . trait-id))
+ (let* ((data-query (format #f "SELECT
+JSON_ARRAYAGG(JSON_ARRAY(Strain.Name, PublishData.Value)) AS data,
+ MD5(JSON_ARRAY(Strain.Name, PublishData.Value)) as md5hash
+FROM
+ PublishData
+ INNER JOIN Strain ON PublishData.StrainId = Strain.Id
+ INNER JOIN PublishXRef ON PublishData.Id = PublishXRef.DataId
+ INNER JOIN PublishFreeze ON PublishXRef.InbredSetId = PublishFreeze.InbredSetId
+LEFT JOIN PublishSE ON
+ PublishSE.DataId = PublishData.Id AND
+ PublishSE.StrainId = PublishData.StrainId
+LEFT JOIN NStrain ON
+ NStrain.DataId = PublishData.Id AND
+ NStrain.StrainId = PublishData.StrainId
+WHERE
+ PublishFreeze.Name = \"~a\" AND
+ PublishXRef.Id = ~a AND
+ PublishFreeze.public > 0 AND
+ PublishData.value IS NOT NULL AND
+ PublishFreeze.confidentiality < 1
+ORDER BY
+ LENGTH(Strain.Name), Strain.Name" dataset-name trait-id)))
+ (match (call-with-target-database
+ settings
+ (lambda (db2) (sql-find db2 data-query)))
+ ((("data" . data)
+ ("md5hash" . md5-hash))
+ (let* ((trait-name (format #f "~a~a" dataset-name trait-id))
+ (base-dir (assq-ref settings 'output-dir))
+ (out (format #f "~a-~a" trait-name
+ (substring md5-hash 0 12)))
+ (out-dir (format #f "~a/~a" base-dir out)))
+ (log-msg
+ 'INFO (format #f "Writing ~a to: ~a" trait-name out-dir))
+ (unless (file-exists? out-dir)
+ (mkdir out-dir))
+ (lmdb-save (format #f "~a/index" base-dir) trait-name out)
+ (vector-for-each
+ (lambda (_ x)
+ (match x
+ (#(strain value)
+ (lmdb-save out-dir strain value))))
+ (json-string->scm data)))))))))
+ db
+ "SELECT DISTINCT PublishFreeze.Name, PublishXRef.Id FROM
+PublishData INNER JOIN Strain ON PublishData.StrainId = Strain.Id
+INNER JOIN PublishXRef ON PublishData.Id = PublishXRef.DataId
+INNER JOIN PublishFreeze ON PublishXRef.InbredSetId = PublishFreeze.InbredSetId
+LEFT JOIN PublishSE ON
+ PublishSE.DataId = PublishData.Id AND
+ PublishSE.StrainId = PublishData.StrainId
+LEFT JOIN NStrain ON
+ NStrain.DataId = PublishData.Id AND
+ NStrain.StrainId = PublishData.StrainId
+WHERE
+ PublishFreeze.public > 0 AND
+ PublishFreeze.confidentiality < 1
+ORDER BY
+ PublishFreeze.Id, PublishXRef.Id"))))
+ (lambda ()
+ (shutdown-logging))))
+
+(define main
+ (match-lambda*
+ ((_ connection-settings-file)
+ (save-dataset-values (call-with-input-file connection-settings-file
+ read)))
+ ((arg0 _ ...)
+ (display (format "Usage: ~a CONNECTION-SETTINGS-FILE~%" arg0)
+ (current-error-port))
+ (exit #f))))
+
+(apply main (command-line))
diff --git a/scripts/precompute/list-traits-to-compute.scm b/scripts/precompute/list-traits-to-compute.scm
index 2c48d83..9f900d1 100755
--- a/scripts/precompute/list-traits-to-compute.scm
+++ b/scripts/precompute/list-traits-to-compute.scm
@@ -78,19 +78,50 @@ When that is the case we might as well write the phenotype file because we have
(json)
(rnrs bytevectors)
(srfi srfi-1)
+ (srfi srfi-9)
(srfi srfi-19) ; time
)
+(define (get-trait db probeset-id)
+ (dbi-query db (string-append "select Id,Chr,Mb,Name,Symbol,description from ProbeSet where Id=" (int-to-string probeset-id) " limit 1"))
+ (get-row db)
+ )
+
+#!
+
+The following is produced by gemma-wrapper as metadata
+
+ "meta": {
+ "type": "gemma-wrapper",
+ "version": "0.99.7-pre1",
+ "population": "BXD",
+ "name": "HC_U_0304_R",
+ "trait": "101500_at",
+ "url": "https://genenetwork.org/show_trait?trait_id=101500_at&dataset=HC_U_0304_R",
+ "archive_GRM": "46bfba373fe8c19e68be6156cad3750120280e2e-gemma-cXX.tar.xz",
+ "archive_GWA": "779a54a59e4cd03608178db4068791db4ca44ab3-gemma-GWA.tar.xz",
+ "dataid": 75629,
+ "probesetid": 1097,
+ "probesetfreezeid": 7
+ }
-(define (write-json-ld id recs)
+!#
+
+
+(define (write-json-ld id name trait trait-name probesetfreeze-id probeset-id recs)
;; see also https://www.w3.org/2018/jsonld-cg-reports/json-ld/
(display id)
+ (display ":")
+ (display name)
+ (display ":")
+ (display trait-name)
(newline)
(let* [(traits (map (lambda (r)
(match r
[(strain-id . value) (cons (bxd-name strain-id) value)]
))
(reverse recs)))
+ (uri (format #f "https://genenetwork.org/show_trait?trait_id=~a&dataset=~a" trait-name name))
(sha256 (sha-256->string (sha-256 (string->utf8 (scm->json-string traits)))))
(json-data `(("@context" . "https://genenetwork.org/resource")
(type . traits)
@@ -98,8 +129,16 @@ When that is the case we might as well write the phenotype file because we have
(steps . ())
(sha256 . ((input-traits . ,sha256)))
(time . ,(date->string (time-utc->date (current-time))))))
- (traits .
- ((,id . ,traits)))))]
+ (data .
+ ((,id .
+ ((group . "BXD")
+ (probesetfreeze-id . ,probesetfreeze-id)
+ (probeset-id . ,probeset-id)
+ (name . ,name)
+ (trait-name . ,trait-name)
+ (uri . ,uri)
+ (traits . ,traits)
+ ))))))]
(call-with-output-file (string-append (number->string id) ".json")
(lambda (port)
(put-string port (scm->json-string json-data))))
@@ -111,22 +150,23 @@ When that is the case we might as well write the phenotype file because we have
(begin
;; (let [(bxd-strains (memo-bxd-strain-id-names #:used-for-mapping? #t))]
(define (run-list-traits-to-compute db num prev-id)
+ ;; ---- Build a query to collect num traits
(let* [(count (if (< batch-size num)
batch-size
num))
(rest (- num count))
(hits (get-precompute-hits db prev-id count))
- (data-ids (map (lambda (hit)
- (let* [(data-id (assoc-ref hit "DataId"))
- ; (data-id-str (int-to-string data-id))
- ]
- data-id))
+ (data-ids (map (lambda (h)
+ (hit-data-id h))
hits))
(data-str-ids (map (lambda (id) (string-append "Id=" (int-to-string id))) data-ids))
(data-ids-query (string-join data-str-ids " OR "))
(query (string-append "SELECT Id,StrainId,value FROM ProbeSetData WHERE " data-ids-query))
]
+ (format #t "writing-phenotypes from ~d to ~d batch ~d to ~d\n" first-id (+ first-id num) prev-id (+ prev-id count))
+ (display query)
(dbi-query db query)
+ ;; ---- Walk each resulting trait and build a hash of data-id and list of trait values
(let [(id-traits (get-rows db '()))
(nrecs '())]
(for-each (lambda (r)
@@ -140,12 +180,20 @@ When that is the case we might as well write the phenotype file because we have
)]
(set! nrecs (assoc-set! nrecs data-id lst))))
id-traits)
- (for-each (lambda (r)
- (match r
- ((id . recs) (if (has-bxd? recs)
- (write-json-ld id recs)
- ))
- )) nrecs)
+ ;; --- create the json output as a file by walking traits and hits
+ (for-each (lambda (h)
+ (let* [
+ (id (hit-data-id h))
+ (probeset-id (hit-probeset-id h))
+ (probesetfreeze-id (hit-probesetfreeze-id h))
+ (name (dataset-name db probesetfreeze-id))
+ (trait (get-trait db probeset-id))
+ (trait-name (assoc-ref trait "Name"))
+ (recs (assoc-ref nrecs id))
+ ]
+ (if (has-bxd? recs)
+ (write-json-ld id name trait trait-name probesetfreeze-id probeset-id recs)
+ ))) hits)
(if (> rest 0)
(run-list-traits-to-compute db rest (first (reverse data-ids)))) ;; start precompute
)))
diff --git a/scripts/precompute/run-gemma.scm b/scripts/precompute/run-gemma.scm
index e6a4e26..4952834 100755
--- a/scripts/precompute/run-gemma.scm
+++ b/scripts/precompute/run-gemma.scm
@@ -9,7 +9,7 @@ Run from base dir with
and with some extra paths (for gemma)
-~/opt/guix-pull/bin/guix shell -C -F xz tar time parallel coreutils-minimal guile guile-dbi guile-json ruby --expose=/home/wrk/iwrk/opensource/code/genetics/gemma-wrapper/=/gemma-wrapper --expose=/home/wrk/iwrk/opensource/code/genetics/gemma/=/gemma -- env TMPDIR=tmp GEMMA_COMMAND=/gemma/bin/gemma-0.98.5-linux-static-debug guile -L . -e main -s ./scripts/precompute/run-gemma.scm
+~/opt/guix-pull/bin/guix shell -C -F xz python python-lmdb tar time parallel coreutils-minimal guile guile-dbi guile-json ruby --expose=/home/wrk/iwrk/opensource/code/genetics/gemma-wrapper/=/gemma-wrapper --expose=/home/wrk/iwrk/opensource/code/genetics/gemma/=/gemma -- env TMPDIR=tmp GEMMA_COMMAND=/gemma/bin/gemma-0.98.5-linux-static-debug guile -L . -e main -s ./scripts/precompute/run-gemma.scm --id 21529
!#
@@ -30,24 +30,36 @@ and with some extra paths (for gemma)
;; (write args)
(let* [
(option-spec '( (version (single-char #\v) (value #f))
+ (id (value #t))
(help (single-char #\h) (value #f))))
(options (getopt-long args option-spec))
+ (show-version (option-ref options 'version #f))
(help-wanted (option-ref options 'help #f))]
- (display "RUNNING")
+ (if show-version
+ (begin
+ (display "run-gemma version 1.0\n")
+ (exit)))
(if help-wanted
- (format #t "list-traits-to-compute writes JSON traits files from the GN DB
-Usage: list-traits-to-compute [options...]
- -h, --help Display this help
-"))
- (let [(trait-name "115475")]
- (call-with-input-file "115475.json"
+ (format #t "run-gemma
+Usage: run-gemma [options...] filename(s)
+ --id Run on identifier
+ -v --version Display version
+ -h --help Display this help
+")
+ (let* [(trait-id (option-ref options 'id "0"))
+ (trait-fn (string-append trait-id ".json"))
+ ]
+
+ (call-with-input-file trait-fn
(lambda (port)
(let* [(json (json->scm port))
- (dataset (assoc-ref json "traits"))
- (dataset-name (car (car dataset)))
- (traits (assoc-ref dataset dataset-name))
+ (dataset (car (assoc-ref json "data")))
+ (data (cdr dataset))
+ (dataset-name (assoc-ref data "name"))
+ (trait-name (assoc-ref data "trait-name"))
+ (traits (assoc-ref data "traits"))
+ (pheno-fn (string-append trait-id "-pheno.txt"))
]
- (display dataset)
- (write-pheno-file "pheno.txt" traits)
- (invoke-gemma-wrapper-loco dataset-name trait-name "pheno.txt")
- ))))))
+ (write-pheno-file pheno-fn traits)
+ (invoke-gemma-wrapper-loco dataset-name trait-name trait-fn pheno-fn "BXD.8_geno.txt.gz")
+ )))))))