From a950fc8d6c856bf700841514af113d689e30afc5 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 7 May 2024 13:53:32 +0200
Subject: Fetch a batch of traits so we can process faster

---
 gn/data/strains.scm                           | 17 +++--
 gn/db/mysql.scm                               |  4 +-
 scripts/precompute/list-traits-to-compute.scm | 89 +++++++++++++--------------
 3 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/gn/data/strains.scm b/gn/data/strains.scm
index f1348ac..4a251d4 100644
--- a/gn/data/strains.scm
+++ b/gn/data/strains.scm
@@ -17,17 +17,17 @@
             bxd-strain-id-names
             ))
 
-(define* (strain-id-names inbred-set #:key (used-for-mapping? #f))
+(define* (strain-id-names inbred-set #:key (used-for-mapping? #t))
   "Return assoc list of tuples of strain id+names:
    ((4 . BXD1) (5 . BXD2) (6 . BXD5) (7 . BXD6)...
 
-used-for-mapping? will say whether the strains/individuals are used for mapping.
+used-for-mapping? is meant to select only strains/individuals used for mapping, but that filter is currently disabled (FIXME), so the flag has no effect.
 "
   (call-with-db
    (lambda (db)
      (dbi-query db (string-append "SELECT StrainId,Strain.Name FROM Strain, StrainXRef WHERE StrainXRef.StrainId = Strain.Id AND StrainXRef.InbredSetId = " (int-to-string inbred-set)
                                   (if used-for-mapping?
-                                      " AND Used_for_mapping='Y'"
+                                      "" ;; FIXME filter disabled, was " AND Used_for_mapping='Y'"; keeps `if` two-armed so #f also yields a string
                                       "")
                                   " ORDER BY StrainId;"))
       (get-rows-apply db (lambda (r) `(,(assoc-ref r "StrainId") . ,(assoc-ref r "Name"))) '()))))
@@ -35,5 +35,12 @@ used-for-mapping? will say whether the strains/individuals are used for mapping.
 (define* (bxd-strain-id-names #:key (used-for-mapping? #f))
   "Return assoc list of tuples of strain id + names. Same as strain-id-names, but just for the BXD
 
-used-for-mapping? will say whether the strains/individuals are used for mapping."
-   (strain-id-names 1))
+used-for-mapping? will say whether the strains/individuals are used for mapping. Always True, FIXME"
+  ;; A pair is kept when its id is below 42 or its name contains "BXD";
+  ;; every other pair returned for inbred set 1 is dropped.
+  (define (bxd-pair? pair)
+    (let [(strain-id (car pair))
+          (strain-name (cdr pair))]
+      (or (< strain-id 42)
+          (string-contains strain-name "BXD"))))
+  (filter bxd-pair? (strain-id-names 1 #:used-for-mapping? used-for-mapping?)))
diff --git a/gn/db/mysql.scm b/gn/db/mysql.scm
index 66f28db..ccd414a 100644
--- a/gn/db/mysql.scm
+++ b/gn/db/mysql.scm
@@ -29,8 +29,8 @@
 
 (define (db-open)
   (begin
-    (display "===> OPENING DB")
-    (newline)
+    ;; (display "===> OPENING DB")
+    ;; (newline)
     (let [(db (dbi-open "mysql" "webqtlout:webqtlout:db_webqtl:tcp:127.0.0.1:3306"))]
       (ensure db)
       db
diff --git a/scripts/precompute/list-traits-to-compute.scm b/scripts/precompute/list-traits-to-compute.scm
index 68f0711..db12eed 100755
--- a/scripts/precompute/list-traits-to-compute.scm
+++ b/scripts/precompute/list-traits-to-compute.scm
@@ -35,6 +35,27 @@ Now list the next 1000 trait IDs:
 
     . .guix-shell -- guile -L . -s ./scripts/precompute/list-traits-to-compute.scm --next 1000
 
+The current logic is to list all datasets that contain a
+BXD. (bxd-strain-id-names #:used-for-mapping? #t) fetches all ids and
+strain names listed in GN. Note that this differs from the actual
+genotype file.
+
+To find the StrainId in a dataset:
+
+MariaDB [db_webqtl]> SELECT StrainId,value from ProbeSetData WHERE Id=115467;
++----------+---------+
+| StrainId | value   |
++----------+---------+
+|        1 | 9.47169 |
+|        2 | 9.21621 |
+|        3 |   9.728 |
+|        4 | 9.28976 |
+|        5 | 9.55523 |
+|        6 | 9.63562 ...
+
+To speed things up a little we fetch the traits in batches and check whether BXD strains are part of each one.
+When that is the case we might as well write the phenotype file, because we already have the trait values.
+
 !#
 
 (use-modules (dbi dbi)
@@ -49,51 +70,29 @@ Now list the next 1000 trait IDs:
              (srfi srfi-1)
              )
 
-
-
 (call-with-db
  (lambda (db)
    (begin
-     (define bxd-strains (bxd-strain-id-names #:used-for-mapping? #t))
-     (define (get-trait db probeset-id)
-       (dbi-query db (string-append "select Id,Chr,Mb,Name,Symbol,description from ProbeSet where Id=" (int-to-string probeset-id) " limit 1"))
-       (get-row db))
-     (define (run-list-traits-to-compute db prev-id count)
-       (let [(hits (get-precompute-hits db prev-id count))]
-         (for-each (lambda (hit)
-                     (let* [(data-id (assoc-ref hit "DataId"))
-                            (data-id-str (int-to-string data-id))
-                            ;; (probesetfreeze-id (assoc-ref hit "ProbeSetFreezeId"))
-                            ;; (probeset-id (assoc-ref hit "ProbeSetId"))
-                            ;; (trait (get-trait db probeset-id))
-                            ;; (trait-name (assoc-ref trait "Name"))
-                            ;; (name (dataset-name db probesetfreeze-id))
-                            ]
-                       (display hit)
-                       (newline)
-                       ;; ---- Get strains and phenotypes for this dataset
-                       (dbi-query db (string-append "SELECT StrainId,value from ProbeSetData WHERE Id=" data-id-str))
-                       (define id_traits (get-rows-apply db
-                                                         (lambda (r) `(,(assoc-ref r "StrainId") . ,(assoc-ref r "value")))
-                                                         '()))
-                       ;; ---- Now we need to make sure that all strains belong to BXD
-                       (define non-bxd (fold
-                                        (lambda (strain lst)
-                                          (let* [(id (car strain))
-                                                 (name (assoc id bxd-strains))]
-                                            (if name
-                                                lst
-                                                (append lst `(,name)))))
-
-                                        '()
-                                        id_traits))
-                       (define traits (map
-                                       (lambda (t)
-                                         (match t
-                                           ((id . value) (cons (assoc-ref bxd-strains id) value)
-                                            )))
-                                       id_traits))
-                       #t))
-                   hits)))
-         (run-list-traits-to-compute db 0 5) ;; start precompute
-       )))
+     ;; Fetch the BXD strain id/name pairs first; the batch query built below
+     ;; does not consult them yet.
+     (let [(bxd-strains (bxd-strain-id-names #:used-for-mapping? #t))]
+       ;; Turn one hit into the SQL condition "Id=<DataId>".
+       (define (id-clause hit)
+         (let [(data-id (assoc-ref hit "DataId"))]
+           (string-append "Id=" (int-to-string data-id))))
+       ;; Display a single SELECT that asks ProbeSetData for the rows of every
+       ;; hit returned by (get-precompute-hits db prev-id count). The query is
+       ;; only printed here, not executed. With no hits the text after WHERE
+       ;; is empty.
+       (define (run-list-traits-to-compute db prev-id count)
+         (let* [(batch (get-precompute-hits db prev-id count))
+                (clauses (map id-clause batch))
+                ;; OR the per-hit conditions together so one query covers the batch.
+                (where (string-join clauses " OR "))]
+           (display (string-append
+                     "SELECT Id,StrainId,value FROM ProbeSetData WHERE "
+                     where))))
+       ;; Start the precompute listing: prev-id 0, count 50.
+       (run-list-traits-to-compute db 0 50)
+       ;; Closes the let, the begin, the lambda and call-with-db.
+       ))))
-- 
cgit v1.2.3