4 files changed, 224 insertions, 0 deletions
diff --git a/test/performance/releases.org b/test/performance/releases.org
index b208e54..b9c451d 100644
--- a/test/performance/releases.org
+++ b/test/performance/releases.org
@@ -1,5 +1,108 @@
 * GEMMA performance stats
 
+** GEMMA 1.00-pre1
+
+
+Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
+
+Introducing the mdb genotype format led to a 30% speed increase on the small mouse set:
+
+#+begin_src sh
+real    0m6.403s
+user    0m11.529s
+sys     0m6.325s
+#+end_src
+
+That may not look like much, but we are only getting started!
+
+** Picking up the pieces
+
+We are facing a time regression.
+
+#+begin_src sh
+premake5 gmake && make verbose=1 config=release -j 8 gemma && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g ./example/mouse_hs1940.geno.txt.mdb -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check
+#+end_src
+
+With openblas 0.3.21 we go a bit faster. We are still 10% behind though, so there is room for tweaking. The difference may actually be due to a new SSD. I want to run some bigger files first.
+
+#+begin_src sh
+Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025
+Reading Files ...
+## number of total individuals = 1940
+## number of analyzed individuals = 1410
+## number of covariates = 1
+## number of phenotypes = 1
+## number of total SNPs/var        =    12226
+## number of analyzed SNPs         =    10768
+Start Eigen-Decomposition...
+pve estimate =0.608801
+se(pve) =0.032774
+================================================== 100%
+real    0m9.017s
+user    0m13.168s
+sys     0m5.919s
+#+end_src
+
+Before it was
+
+#+begin_src sh
+Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025
+Reading Files ...
+## number of total individuals = 1940
+## number of analyzed individuals = 1410
+## number of covariates = 1
+## number of phenotypes = 1
+## number of total SNPs/var        =    12226
+## number of analyzed SNPs         =    10768
+Start Eigen-Decomposition...
+pve estimate =0.608801
+se(pve) =0.032774
+================================================== 100%
+real    0m16.772s
+user    0m25.443s
+sys     0m0.901s
+#+end_src
+
+The output looks the same. Good. So far the first difference is a much later openblas 0.3.30 (over 0.3.9). In the source code we added checkpoints and more debugging, particularly write statements. I disabled the latter, but still no dice.
+
+When compiled with the profiler library, prefix the gemma run with ~CPUPROFILE=gemma.prof~ and inspect the resulting profile with pprof:
+
+#+begin_src sh
+premake5 gmake && make verbose=1 config=debug -j 8 gemma && time CPUPROFILE=gemma.prof LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check -debug
+CPUPROFILE=gemma.prof
+pprof --text build/bin/Debug/gemma gemma.prof
+
+    1007  49.2%  49.2%     1015  49.6% dot_compute
+      94   4.6%  53.8%       94   4.6% rpcc
+      74   3.6%  57.5%       74   3.6% gsl_vector_div
+      62   3.0%  60.5%       92   4.5% ____strtod_l_internal
+      42   2.1%  62.5%       42   2.1% dgemm_kernel_ZEN
+#+end_src
+
+This led me to try the newer openblas on the older gemma - and indeed, the regression comes from the openblas version. Even though it says 'OpenBLAS 0.3.30 DYNAMIC_ARCH NO_AFFINITY Zen MAX_THREADS=128' I suspect the dynamic arch is not really optimizing.
+
+Well, at least I found the problem. Time for a special openblas build like I used to do.
+
+
+*** Bigger run
+
+We translate this 10GB (gzip compressed) job from our pangenome precompute
+
+#+begin_src sh
+/bin/gemma -loco 3 -k /export2/data/wrk/services/gemma-wrapper/tmp/tmp/panlmm/93f6b39ec06c09fb9ba9ca628b5fb990921b6c60.3.cXX.txt.cXX.txt -o a3248cec40b3fe6b9e8672352b3ab2d7280c426c.3.assoc.txt -p pheno.json.txt -g pangenome-13M-genotypes.txt -a snps-matched.txt -lmm 9 -maf 0.1 -n 2 -outdir /export2/data/wrk/services/gemma-wrapper/tmp/tmp/panlmm/d20251126-4190721-c8bbo8
+#+end_src
+
+to
+
+#+begin_src sh
+time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g tmp/pangenome-13M-genotypes.txt -p tmp/pheno.json.txt -n 1 -a tmp/snps-matched.txt -k tmp/93f6b39ec06c09fb9ba9ca628b5fb990921b6c60.3.cXX.txt.cXX.txt -lmm 9 -no-check
+real    20m4.687s
+user    23m42.508s
+sys     9m51.929s
+#+end_src
+
+On my AMD Ryzen 7 3700X it uses about 10GB of RAM. With the -debug switch it clapped out because of sqrt(NaN). There is a lot that can be gained with better IO and multi-core use.
+
 ** GEMMA 0.98.5-pre1
 
 Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
diff --git a/test/runner b/test/runner
new file mode 100755
index 0000000..5002d80
--- /dev/null
+++ b/test/runner
@@ -0,0 +1,18 @@
+#!/bin/sh
+# -*- mode: scheme; -*-
+exec guile --debug -s "$0" "$@"
+!#
+
+(define-module (test-runner)
+  #:use-module (ice-9 match)
+  #:use-module (srfi srfi-1)  ; for last
+  #:use-module (srfi srfi-13)
+  #:use-module (srfi srfi-64) ; for tests
+  #:use-module (ice-9 rdelim)
+  )
+
+(test-begin "all-tests")  ; open the enclosing SRFI-64 test group
+
+(load "test-uvlmm-integration.scm")  ; NOTE(review): test-mdb-integration.scm is not loaded here - confirm this is intentional
+
+(test-end "all-tests")  ; close the enclosing group
diff --git a/test/test-mdb-integration.scm b/test/test-mdb-integration.scm
new file mode 100755
index 0000000..006c241
--- /dev/null
+++ b/test/test-mdb-integration.scm
@@ -0,0 +1,51 @@
+#!/bin/sh
+# -*- mode: scheme; -*-
+exec guile --debug -s "$0" "$@"
+!#
+
+(define-module (test-runner)
+  #:use-module (ice-9 match)
+  #:use-module (srfi srfi-1)  ; for last
+  #:use-module (srfi srfi-13)
+  #:use-module (srfi srfi-64) ; for tests
+  #:use-module (ice-9 rdelim)
+  )
+
+(define kinship-fn "./output/mouse_hs1940.cXX.txt")  ; kinship matrix file: removed before the -gk run, passed to -k in the GWA run
+(define gwa-fn "./output/mouse_hs1940.assoc.txt")  ; association output file: removed before, and parsed after, the GWA run
+
+(test-begin "uvlmm-mdb-kinship-run")  ; kinship (-gk) run on the mdb genotype file; only the exit status is checked
+
+(when (file-exists? kinship-fn)  ; remove any kinship file left over from a previous run
+  (delete-file kinship-fn))
+(let [(err (system "./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.mdb -p ./example/mouse_hs1940.pheno.txt -gk -o mouse_hs1940 -debug"))]  ; all paths are relative to the current directory
+  (test-eqv 0 err))  ; the status returned by `system` must be 0
+
+(test-end "uvlmm-mdb-kinship-run")
+
+(test-begin "uvlmm-mdb-gwa-run")  ; GWA (-lmm 9) run on the mdb genotype file, checked against fixed column totals
+
+(when (file-exists? gwa-fn)  ; remove any association file left over from a previous run
+  (delete-file gwa-fn))
+;; Integration test: run gemma uvlmm (-lmm 9), then sum the last tab-separated column of the output as a check.
+;; The -k argument is kinship-fn, the kinship matrix file handled by the earlier kinship-run test.
+(let [(err (system (string-append "./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.mdb -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k " kinship-fn " -o mouse_hs1940 -lmm 9 -debug")))]
+  (test-eqv 0 err))  ; the status returned by `system` must be 0
+(call-with-input-file gwa-fn
+  (lambda (port)
+    (read-line port)  ; skip the first line (presumably the column header)
+    (let* ((fields (string-split (read-line port) #\tab))  ; second line, split on tabs
+           (last-field (last fields)))
+      (test-eqv 208.0 (truncate (* 1000 (string->number last-field)))))  ; last column times 1000, truncated, must be 208
+    (test-eqv 5720672.0  ; expected total over the second line and all following lines
+      (let loop ((line (read-line port))
+                 (sum 208.0))  ; seeded with the second line's value, which was already consumed above
+        (if (eof-object? line)
+            sum
+            (let* ((fields (string-split line #\tab))
+                   (last-field (last fields))
+                   (value (string->number last-field)))  ; NOTE(review): #f for a blank or non-numeric field, which would make `*` fail
+              (loop (read-line port)
+                    (+ sum (truncate (* 1000 value))))))))))  ; each value is scaled by 1000 and truncated before being added
+
+(test-end "uvlmm-mdb-gwa-run")
diff --git a/test/test-uvlmm-integration.scm b/test/test-uvlmm-integration.scm
new file mode 100755
index 0000000..91eb14a
--- /dev/null
+++ b/test/test-uvlmm-integration.scm
@@ -0,0 +1,52 @@
+#!/bin/sh
+# -*- mode: scheme; -*-
+exec guile --debug -s "$0" "$@"
+!#
+
+(define-module (test-runner)
+  #:use-module (ice-9 match)
+  #:use-module (srfi srfi-1)  ; for last
+  #:use-module (srfi srfi-13)
+  #:use-module (srfi srfi-64) ; for tests
+  #:use-module (ice-9 rdelim)
+  )
+
+(define kinship-fn "./output/mouse_hs1940.cXX.txt")  ; kinship matrix file: removed before the -gk run, passed to -k in the GWA run
+(define gwa-fn "./output/mouse_hs1940.assoc.txt")  ; association output file: removed before, and parsed after, the GWA run
+
+(test-begin "uvlmm-bimbam-kinship-run")  ; kinship (-gk) run on the gzipped text genotype file; only the exit status is checked
+
+(when (file-exists? kinship-fn)  ; remove any kinship file left over from a previous run
+  (delete-file kinship-fn))
+(let [(err (system "./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz -gk -p ./example/mouse_hs1940.pheno.txt -o mouse_hs1940 -debug"))]  ; all paths are relative to the current directory
+  (test-eqv 0 err))  ; the status returned by `system` must be 0
+
+(test-end "uvlmm-bimbam-kinship-run")
+
+
+(test-begin "uvlmm-bimbam-gwa-run")  ; GWA (-lmm 9) run on the gzipped text genotype file, checked against fixed column totals
+
+(when (file-exists? gwa-fn)  ; remove any association file left over from a previous run
+  (delete-file gwa-fn))
+;; Integration test: run gemma uvlmm (-lmm 9), then sum the last tab-separated column of the output as a check.
+;; The -k argument is kinship-fn, the kinship matrix file handled by the earlier kinship-run test.
+(let [(err (system (string-append "./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k " kinship-fn " -o mouse_hs1940 -lmm 9 -debug")))]
+  (test-eqv 0 err))  ; the status returned by `system` must be 0
+(call-with-input-file gwa-fn
+  (lambda (port)
+    (read-line port)  ; skip the first line (presumably the column header)
+    (let* ((fields (string-split (read-line port) #\tab))  ; second line, split on tabs
+           (last-field (last fields)))
+      (test-eqv 208.0 (truncate (* 1000 (string->number last-field)))))  ; last column times 1000, truncated, must be 208
+    (test-eqv 5720672.0  ; expected total over the second line and all following lines
+      (let loop ((line (read-line port))
+                 (sum 208.0))  ; seeded with the second line's value, which was already consumed above
+        (if (eof-object? line)
+            sum
+            (let* ((fields (string-split line #\tab))
+                   (last-field (last fields))
+                   (value (string->number last-field)))  ; NOTE(review): #f for a blank or non-numeric field, which would make `*` fail
+              (loop (read-line port)
+                    (+ sum (truncate (* 1000 value))))))))))  ; each value is scaled by 1000 and truncated before being added
+
+(test-end "uvlmm-bimbam-gwa-run")