From a950fc8d6c856bf700841514af113d689e30afc5 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 7 May 2024 13:53:32 +0200
Subject: Fetch a batch of traits so we can process faster

---
Review notes (this section is not part of the commit message):

* strain-id-names: the `if` now keeps an explicit "" consequent next to the
  commented-out Used_for_mapping clause. With only a comment in that position
  the former alternative "" becomes the consequent, the `if` has no
  alternative, and string-append receives an unspecified value whenever
  used-for-mapping? is #f. bxd-strain-id-names still defaults the key to #f
  and forwards it, so a plain (bxd-strain-id-names) call would fail.
* The blob ids on the `index` line of gn/data/strains.scm describe the
  original post-image, not this amended hunk.
* This patch was re-flowed from a whitespace-collapsed copy. Hunk line counts
  match the @@ headers, but indentation and blank context lines are
  reconstructed, so applying it may need `git apply --ignore-whitespace`.

 gn/data/strains.scm                           | 17 +++--
 gn/db/mysql.scm                               |  4 +-
 scripts/precompute/list-traits-to-compute.scm | 89 +++++++++++++--------------
 3 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/gn/data/strains.scm b/gn/data/strains.scm
index f1348ac..4a251d4 100644
--- a/gn/data/strains.scm
+++ b/gn/data/strains.scm
@@ -17,17 +17,17 @@
             bxd-strain-id-names
             ))
 
-(define* (strain-id-names inbred-set #:key (used-for-mapping? #f))
+(define* (strain-id-names inbred-set #:key (used-for-mapping? #t))
   "Return assoc list of tuples of strain id+names:
 
 ((4 . BXD1) (5 . BXD2) (6 . BXD5) (7 . BXD6)...
 
-used-for-mapping? will say whether the strains/individuals are used for mapping.
+used-for-mapping? will say whether the strains/individuals are used for mapping. Always True, FIXME
 "
   (call-with-db
    (lambda (db)
      (dbi-query db (string-append "SELECT StrainId,Strain.Name FROM Strain, StrainXRef WHERE StrainXRef.StrainId = Strain.Id AND StrainXRef.InbredSetId = " (int-to-string inbred-set) (if used-for-mapping?
-                                        " AND Used_for_mapping='Y'"
+                                        "" ;; " AND Used_for_mapping='Y'" -- disabled (FIXME); "" keeps the if two-armed
                                         "") " ORDER BY StrainId;"))
      (get-rows-apply db (lambda (r) `(,(assoc-ref r "StrainId") . ,(assoc-ref r "Name"))) '()))))
 
@@ -35,5 +35,12 @@ used-for-mapping? will say whether the strains/individuals are used for mapping.
 (define* (bxd-strain-id-names #:key (used-for-mapping? #f))
   "Return assoc list of tuples of strain id + names. Same as strain-id-names, but just for the BXD
 
-used-for-mapping? will say whether the strains/individuals are used for mapping."
-  (strain-id-names 1))
+used-for-mapping? will say whether the strains/individuals are used for mapping. Always True, FIXME"
+  (filter (lambda (l) l)
+          (map (lambda (l)
+                 (let [(id (car l))
+                       (name (cdr l))]
+                   (if (or (< id 42) (string-contains name "BXD"))
+                       l
+                       #f))
+                 ) (strain-id-names 1 #:used-for-mapping? used-for-mapping?))))
diff --git a/gn/db/mysql.scm b/gn/db/mysql.scm
index 66f28db..ccd414a 100644
--- a/gn/db/mysql.scm
+++ b/gn/db/mysql.scm
@@ -29,8 +29,8 @@
 
 (define (db-open)
   (begin
-    (display "===> OPENING DB")
-    (newline)
+    ;; (display "===> OPENING DB")
+    ;; (newline)
     (let [(db (dbi-open "mysql" "webqtlout:webqtlout:db_webqtl:tcp:127.0.0.1:3306"))]
       (ensure db)
       db
diff --git a/scripts/precompute/list-traits-to-compute.scm b/scripts/precompute/list-traits-to-compute.scm
index 68f0711..db12eed 100755
--- a/scripts/precompute/list-traits-to-compute.scm
+++ b/scripts/precompute/list-traits-to-compute.scm
@@ -35,6 +35,27 @@ Now list the next 1000 trait IDs:
 
     . .guix-shell -- guile -L . -s ./scripts/precompute/list-traits-to-compute.scm --next 1000
 
+The current logic is to list all datasets that contain a
+BXD. (bxd-strain-id-names #:used-for-mapping? #t) fetches all ids and
+strain names listed in GN. Note that this differs from the actual
+genotype file.
+
+To find the StrainId in a dataset:
+
+MariaDB [db_webqtl]> SELECT StrainId,value from ProbeSetData WHERE Id=115467;
++----------+---------+
+| StrainId | value   |
++----------+---------+
+|        1 | 9.47169 |
+|        2 | 9.21621 |
+|        3 |   9.728 |
+|        4 | 9.28976 |
+|        5 | 9.55523 |
+|        6 | 9.63562 ...
+
+to speed things up a little we batch them up and check whether the BXD is part of it.
+When that is the case we might as well write the phenotype file because we have the trait values.
+
 !#
 
 (use-modules (dbi dbi)
@@ -49,51 +70,29 @@ Now list the next 1000 trait IDs:
              (srfi srfi-1)
              )
 
-
-
 (call-with-db
  (lambda (db)
    (begin
-     (define bxd-strains (bxd-strain-id-names #:used-for-mapping? #t))
-     (define (get-trait db probeset-id)
-       (dbi-query db (string-append "select Id,Chr,Mb,Name,Symbol,description from ProbeSet where Id=" (int-to-string probeset-id) " limit 1"))
-       (get-row db))
-     (define (run-list-traits-to-compute db prev-id count)
-       (let [(hits (get-precompute-hits db prev-id count))]
-         (for-each (lambda (hit)
-                     (let* [(data-id (assoc-ref hit "DataId"))
-                            (data-id-str (int-to-string data-id))
-                            ;; (probesetfreeze-id (assoc-ref hit "ProbeSetFreezeId"))
-                            ;; (probeset-id (assoc-ref hit "ProbeSetId"))
-                            ;; (trait (get-trait db probeset-id))
-                            ;; (trait-name (assoc-ref trait "Name"))
-                            ;; (name (dataset-name db probesetfreeze-id))
-                            ]
-                       (display hit)
-                       (newline)
-                       ;; ---- Get strains and phenotypes for this dataset
-                       (dbi-query db (string-append "SELECT StrainId,value from ProbeSetData WHERE Id=" data-id-str))
-                       (define id_traits (get-rows-apply db
-                                                         (lambda (r) `(,(assoc-ref r "StrainId") . ,(assoc-ref r "value")))
-                                                         '()))
-                       ;; ---- Now we need to make sure that all strains belong to BXD
-                       (define non-bxd (fold
-                                        (lambda (strain lst)
-                                          (let* [(id (car strain))
-                                                 (name (assoc id bxd-strains))]
-                                            (if name
-                                                lst
-                                                (append lst `(,name)))))
-
-                                        '()
-                                        id_traits))
-                       (define traits (map
-                                       (lambda (t)
-                                         (match t
-                                           ((id . value) (cons (assoc-ref bxd-strains id) value)
-                                            )))
-                                       id_traits))
-                       #t))
-                   hits)))
-     (run-list-traits-to-compute db 0 5) ;; start precompute
-     )))
+     (let [(bxd-strains (bxd-strain-id-names #:used-for-mapping? #t))]
+       (define (run-list-traits-to-compute db prev-id count)
+         (let* [(hits (get-precompute-hits db prev-id count))
+                (data-ids (map (lambda (hit)
+                                 (let* [(data-id (assoc-ref hit "DataId"))
+                                        ; (data-id-str (int-to-string data-id))
+                                        ]
+                                   data-id))
+                               hits))
+                ;; (data-ids-query (fold (lambda (id query) "" (string-append query "Id=" (int-to-string id) " OR ")) "" data-ids))
+                (data-str-ids (map (lambda (id) (string-append "Id=" (int-to-string id))) data-ids))
+                (data-ids-query (string-join data-str-ids " OR "))
+                (query (string-append "SELECT Id,StrainId,value FROM ProbeSetData WHERE " data-ids-query))
+                ]
+           ; (display data-str-ids)
+           ; (display data-ids-query)
+           ; (display data-ids)
+           (display query)
+           ))
+       ; (display data-ids)
+       (run-list-traits-to-compute db 0 50) ;; start precompute
+       ;; (write bxd-strains)
+       ))))
-- 
cgit v1.2.3