about summary refs log tree commit diff
path: root/scripts/precompute
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/precompute')
-rwxr-xr-xscripts/precompute/list-traits-to-compute.scm80
-rwxr-xr-xscripts/precompute/run-gemma.scm42
2 files changed, 93 insertions, 29 deletions
diff --git a/scripts/precompute/list-traits-to-compute.scm b/scripts/precompute/list-traits-to-compute.scm
index 2c48d83..102a6fa 100755
--- a/scripts/precompute/list-traits-to-compute.scm
+++ b/scripts/precompute/list-traits-to-compute.scm
@@ -15,6 +15,10 @@ You may want to forward a mysql port if there is no DB locally
 
     ssh -L 3306:127.0.0.1:3306 -f -N tux02.genenetwork.org
 
+ignore IPv6 message:
+
+    bind [::1]:3306: Cannot assign requested address
+
 test connection with mysql client:
 
     mysql -uwebqtlout -pwebqtlout -A -h 127.0.0.1 -P 3306 db_webqtl -e "show tables;"
@@ -78,19 +82,50 @@ When that is the case we might as well write the phenotype file because we have
              (json)
              (rnrs bytevectors)
              (srfi srfi-1)
+             (srfi srfi-9)
              (srfi srfi-19) ; time
              )
 
+(define (get-trait db probeset-id)
+  (dbi-query db (string-append "select Id,Chr,Mb,Name,Symbol,description from ProbeSet where Id=" (int-to-string probeset-id) " limit 1"))
+  (get-row db)
+  )
+
+#!
+
+The following is produced by gemma-wrapper as metadata
+
+  "meta": {
+    "type": "gemma-wrapper",
+    "version": "0.99.7-pre1",
+    "population": "BXD",
+    "name": "HC_U_0304_R",
+    "trait": "101500_at",
+    "url": "https://genenetwork.org/show_trait?trait_id=101500_at&dataset=HC_U_0304_R",
+    "archive_GRM": "46bfba373fe8c19e68be6156cad3750120280e2e-gemma-cXX.tar.xz",
+    "archive_GWA": "779a54a59e4cd03608178db4068791db4ca44ab3-gemma-GWA.tar.xz",
+    "dataid": 75629,
+    "probesetid": 1097,
+    "probesetfreezeid": 7
+    }
 
-(define (write-json-ld id recs)
+!#
+
+
+(define (write-json-ld id name trait trait-name probesetfreeze-id probeset-id recs)
   ;; see also https://www.w3.org/2018/jsonld-cg-reports/json-ld/
   (display id)
+  (display ":")
+  (display name)
+  (display ":")
+  (display trait-name)
   (newline)
   (let* [(traits (map (lambda (r)
                         (match r
                           [(strain-id . value) (cons (bxd-name strain-id) value)]
                           ))
                       (reverse recs)))
+         (uri (format #f "https://genenetwork.org/show_trait?trait_id=~a&dataset=~a" trait-name name))
          (sha256 (sha-256->string (sha-256 (string->utf8 (scm->json-string traits)))))
          (json-data `(("@context" . "https://genenetwork.org/resource")
                       (type . traits)
@@ -98,8 +133,16 @@ When that is the case we might as well write the phenotype file because we have
                                (steps . ())
                                (sha256 . ((input-traits . ,sha256)))
                                (time . ,(date->string (time-utc->date (current-time))))))
-                      (traits .
-                      ((,id . ,traits)))))]
+                      (data .
+                            ((,id .
+                                  ((group . "BXD")
+                                   (probesetfreeze-id . ,probesetfreeze-id)
+                                   (probeset-id . ,probeset-id)
+                                   (name . ,name)
+                                   (trait-name . ,trait-name)
+                                   (uri . ,uri)
+                                   (traits . ,traits)
+                                   ))))))]
     (call-with-output-file (string-append (number->string id) ".json")
       (lambda (port)
         (put-string port (scm->json-string json-data))))
@@ -111,22 +154,23 @@ When that is the case we might as well write the phenotype file because we have
    (begin
      ;; (let [(bxd-strains (memo-bxd-strain-id-names #:used-for-mapping? #t))]
        (define (run-list-traits-to-compute db num prev-id)
+         ;; ---- Build a query to collect num traits
          (let* [(count (if (< batch-size num)
                            batch-size
                            num))
                 (rest (- num count))
                 (hits (get-precompute-hits db prev-id count))
-                (data-ids (map (lambda (hit)
-                                 (let* [(data-id (assoc-ref hit "DataId"))
-                                        ; (data-id-str (int-to-string data-id))
-                                        ]
-                                   data-id))
+                (data-ids (map (lambda (h)
+                                 (hit-data-id h))
                                hits))
                 (data-str-ids (map (lambda (id) (string-append "Id=" (int-to-string id))) data-ids))
                 (data-ids-query (string-join data-str-ids " OR "))
                 (query (string-append "SELECT Id,StrainId,value FROM ProbeSetData WHERE " data-ids-query))
                 ]
+           (format #t "writing-phenotypes from ~d to ~d batch ~d to ~d\n" first-id (+ first-id num) prev-id (+ prev-id count))
+           (display query)
            (dbi-query db query)
+           ;; ---- Walk each resulting trait and build a hash of data-id and list of trait values
            (let [(id-traits (get-rows db '()))
                  (nrecs '())]
              (for-each (lambda (r)
@@ -140,12 +184,20 @@ When that is the case we might as well write the phenotype file because we have
                                 )]
                       (set! nrecs (assoc-set! nrecs data-id lst))))
                        id-traits)
-             (for-each (lambda (r)
-                         (match r
-                           ((id . recs) (if (has-bxd? recs)
-                                              (write-json-ld id recs)
-                                            ))
-                            )) nrecs)
+             ;; --- create the json output as a file by walking traits and hits
+             (for-each (lambda (h)
+                         (let* [
+                                (id (hit-data-id h))
+                                (probeset-id (hit-probeset-id h))
+                                (probesetfreeze-id (hit-probesetfreeze-id h))
+                                (name (dataset-name db probesetfreeze-id))
+                                (trait (get-trait db probeset-id))
+                                (trait-name (assoc-ref trait "Name"))
+                                (recs (assoc-ref nrecs id))
+                                ]
+                             (if (has-bxd? recs)
+                                              (write-json-ld id name trait trait-name probesetfreeze-id probeset-id recs)
+                             ))) hits)
              (if (> rest 0)
                  (run-list-traits-to-compute db rest (first (reverse data-ids)))) ;; start precompute
            )))
diff --git a/scripts/precompute/run-gemma.scm b/scripts/precompute/run-gemma.scm
index e6a4e26..4952834 100755
--- a/scripts/precompute/run-gemma.scm
+++ b/scripts/precompute/run-gemma.scm
@@ -9,7 +9,7 @@ Run from base dir with
 
 and with some extra paths (for gemma)
 
-~/opt/guix-pull/bin/guix shell -C -F xz tar time parallel coreutils-minimal guile guile-dbi guile-json ruby --expose=/home/wrk/iwrk/opensource/code/genetics/gemma-wrapper/=/gemma-wrapper --expose=/home/wrk/iwrk/opensource/code/genetics/gemma/=/gemma -- env TMPDIR=tmp GEMMA_COMMAND=/gemma/bin/gemma-0.98.5-linux-static-debug  guile -L . -e main -s ./scripts/precompute/run-gemma.scm
+~/opt/guix-pull/bin/guix shell -C -F xz python python-lmdb tar time parallel coreutils-minimal guile guile-dbi guile-json ruby --expose=/home/wrk/iwrk/opensource/code/genetics/gemma-wrapper/=/gemma-wrapper --expose=/home/wrk/iwrk/opensource/code/genetics/gemma/=/gemma -- env TMPDIR=tmp GEMMA_COMMAND=/gemma/bin/gemma-0.98.5-linux-static-debug  guile -L . -e main -s ./scripts/precompute/run-gemma.scm --id 21529
 
 !#
 
@@ -30,24 +30,36 @@ and with some extra paths (for gemma)
   ;; (write args)
   (let* [
          (option-spec '( (version (single-char #\v) (value #f))
+                         (id      (value #t))
                          (help    (single-char #\h) (value #f))))
          (options (getopt-long args option-spec))
+         (show-version (option-ref options 'version #f))
          (help-wanted (option-ref options 'help #f))]
-    (display "RUNNING")
+    (if show-version
+        (begin
+          (display "run-gemma version 1.0\n")
+          (exit)))
     (if help-wanted
-        (format #t "list-traits-to-compute writes JSON traits files from the GN DB
-Usage: list-traits-to-compute [options...]
-  -h, --help          Display this help
-"))
-    (let [(trait-name "115475")]
-      (call-with-input-file "115475.json"
+        (format #t "run-gemma
+Usage: run-gemma [options...] filename(s)
+  --id               Run on identifier
+  -v --version       Display version
+  -h --help          Display this help
+")
+        (let* [(trait-id (option-ref options 'id "0"))
+          (trait-fn (string-append trait-id ".json"))
+          ]
+
+      (call-with-input-file trait-fn
         (lambda (port)
           (let* [(json (json->scm port))
-                 (dataset (assoc-ref json "traits"))
-                 (dataset-name (car (car dataset)))
-                 (traits (assoc-ref dataset dataset-name))
+                 (dataset (car (assoc-ref json "data")))
+                 (data (cdr dataset))
+                 (dataset-name (assoc-ref data "name"))
+                 (trait-name (assoc-ref data "trait-name"))
+                 (traits (assoc-ref data "traits"))
+                 (pheno-fn (string-append trait-id "-pheno.txt"))
                  ]
-            (display dataset)
-            (write-pheno-file "pheno.txt" traits)
-            (invoke-gemma-wrapper-loco dataset-name trait-name "pheno.txt")
-          ))))))
+            (write-pheno-file pheno-fn traits)
+            (invoke-gemma-wrapper-loco dataset-name trait-name trait-fn pheno-fn "BXD.8_geno.txt.gz")
+            )))))))