about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--guix.scm101
-rw-r--r--premake5.lua10
-rw-r--r--src/debug.cpp4
-rw-r--r--src/debug.h11
-rw-r--r--test/performance/releases.org67
-rwxr-xr-xtest/runner24
7 files changed, 213 insertions, 6 deletions
diff --git a/.gitignore b/.gitignore
index d915f60..86c3228 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@ PanGemma.make
 gemma.make
 gemmalib.make
 Makefile
+gdb.txt
+*.prof
 *.o
 *.tar.gz
 src/Eigen
diff --git a/guix.scm b/guix.scm
index 299332e..23b7583 100644
--- a/guix.scm
+++ b/guix.scm
@@ -7,10 +7,15 @@
 ;;
 ;;   guix shell -C -D -F -v 3 -L . pangemma-shell-git  # pangemma-shell-git
 ;;
-;;   see premake5.lua header for examples.
+;; optimized for arch:
+;;
+;;   guix shell --tune=native -C -D -F -v 3 -L . pangemma-shell-git  # pangemma-shell-git
+;;
+;; see premake5.lua header for examples.
 ;;
 ;;   guix shell -C -D -F -v 3 -L . gemma-git           # for specific packages
 ;;
+;; To optimize for a CPU, pass --tune=<cpu-type> to guix shell (e.g. --tune=native).
 
 (define-module (guix)
   #:use-module ((guix licenses) #:prefix license:)
@@ -18,13 +23,17 @@
   #:use-module (guix packages)
   #:use-module (guix git-download)
   #:use-module (guix build-system gnu)
+  #:use-module (guix utils)
+
   #:use-module (gnu packages algebra)
   #:use-module (gnu packages base)
   #:use-module (gnu packages build-tools)
   #:use-module (gnu packages compression)
   #:use-module (gnu packages commencement)
   #:use-module (gnu packages check)
+  #:use-module (gnu packages cpp)
   #:use-module (gnu packages databases)
+  #:use-module (gnu packages gcc)
   #:use-module (gnu packages gdb)
   #:use-module (gnu packages guile)
   #:use-module (gnu packages guile-xyz)
@@ -45,6 +54,90 @@
 (define %pangemma-version
     (read-string (open-pipe "cat VERSION|tr -d $'\n'" OPEN_READ)))
 
+(define-public openblas-pangemma
+;; Pin OpenBLAS to 0.3.21 for now: 0.3.30 was measured slower (see test/performance/releases.org).
+  (package
+    (name "openblas-pangemma")
+    (version "0.3.21")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/xianyi/OpenBLAS")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32
+         "0yx1axiki12y0xz0d5s76vvl7ds36k0npv1sww08k2qslhz1g9qp"))))
+    (build-system gnu-build-system)
+    (properties `((tunable? . #t)))
+    (arguments
+     (list
+      #:tests? #f ;; the test suite is not run for this package
+      #:test-target "test"
+      ;; No default baseline is supplied for powerpc-linux.
+      #:substitutable? (not (target-ppc32?))
+      #:make-flags
+      #~(list (string-append "PREFIX=" #$output)
+              (string-append "CFLAGS=-O3 -g -Wno-incompatible-pointer-types -Wno-error=implicit-function-declaration")
+              "SHELL=bash"
+              "MAKE_NB_JOBS=0"          ;use jobserver for submakes
+
+              ;; This is the maximum number of threads OpenBLAS will ever use (that
+              ;; is, if $OPENBLAS_NUM_THREADS is greater than that, then NUM_THREADS
+              ;; is used.)  If we don't set it, the makefile sets it to the number
+              ;; of cores of the build machine, which is obviously wrong.
+              "NUM_THREADS=128"
+
+              ;; DYNAMIC_ARCH is only supported on some architectures.
+              ;; DYNAMIC_ARCH combined with TARGET=GENERIC provides a library
+              ;; which uses the optimizations for the detected CPU.  This can
+              ;; be overridden at runtime with the environment variable
+              ;; OPENBLAS_CORETYPE=<type>, where "type" is a supported CPU
+              ;; type.  On other architectures we target only the baseline CPU
+              ;; supported by Guix.
+              #$@(cond
+                    ((or (target-x86-64?)
+                         (target-x86-32?)
+                         (target-ppc64le?)
+                         (target-aarch64?))
+                     ;; DYNAMIC_OLDER enables a few extra CPU architectures
+                     ;; on x86_64 that were released before 2010.
+                     '("DYNAMIC_ARCH=1" "DYNAMIC_OLDER=1" "TARGET=GENERIC"))
+                    ;; On some of these architectures the CPU type can't be detected.
+                    ;; We list the oldest CPU core we want to have support for.
+                    ;; On MIPS we force the "SICORTEX" TARGET, as for the other
+                    ;; two available MIPS targets special extended instructions
+                    ;; for Loongson cores are used.
+                    ((target-mips64el?)
+                     '("TARGET=SICORTEX"))
+                    ((target-arm32?)
+                     '("TARGET=ARMV7"))
+                    ((target-riscv64?)
+                     '("TARGET=RISCV64_GENERIC"))
+                    (else '())))
+      ;; There is no configure script, so the configure phase is deleted.
+      #:phases
+      #~(modify-phases %standard-phases
+          (delete 'configure)
+          (add-before 'build 'set-extralib
+            (lambda* (#:key inputs #:allow-other-keys)
+              ;; Point FEXTRALIB at libgfortran's directory so it is found when building in utest.
+              (setenv "FEXTRALIB"
+                      (string-append
+                       "-L"
+                       (dirname
+                        (search-input-file inputs "/lib/libgfortran.so")))))))))
+    (inputs
+     (list `(,gfortran "lib")))
+    (native-inputs
+     (list cunit gfortran perl))
+    (home-page "https://www.openblas.net/")
+    (synopsis "Optimized BLAS library based on GotoBLAS")
+    (description
+     "OpenBLAS is a BLAS library forked from the GotoBLAS2-1.13 BSD version.")
+    (license license:bsd-3)))
+
 (define-public pangemma-base-git
   "Pangemma base build package"
   (package
@@ -54,7 +147,7 @@
     (build-system gnu-build-system)
     (inputs
      (list gsl
-           openblas
+           openblas-pangemma
            guile-3.0
            `(,guile-3.0 "debug")
            ;; `(,guile-3.0 "dev")
@@ -102,7 +195,7 @@ genome-wide association studies (GWAS).")
     (build-system gnu-build-system)
     (propagated-inputs
      (modify-inputs (package-inputs pangemma-base-git)
-                    (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb ;; for the shell
+                    (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb gperftools ;; for the shell
                  )))
     (arguments
      `(#:phases (modify-phases %standard-phases
@@ -126,7 +219,7 @@ genome-wide association studies (GWAS).")
      (list catch2
            gdb
            gsl
-           openblas
+           openblas-pangemma
            zlib))
        ;; ("gsl-static" ,gsl-static)
        ;; ("zlib:static" ,zlib "static")
diff --git a/premake5.lua b/premake5.lua
index 0c5483d..bb06cbd 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -1,6 +1,6 @@
 -- Build with
 --
---   make clean && rm build/Release/ -rf
+--   make clean && rm build -rf
 --   premake5 gmake2 && make verbose=1 gemmalib -j 8
 --
 -- Including bin
@@ -39,10 +39,14 @@ workspace "PanGemma"
 
    filter "configurations:Debug"
       defines { "DEBUG" }
+      buildoptions { pkg_cpp_flags }
+      linkoptions { pkg_linker_flags }
       symbols "On"
 
    filter "configurations:Release"
       defines { "NDEBUG", "HAVE_INLINE" }
+      buildoptions { pkg_cpp_flags }
+      linkoptions { pkg_linker_flags }
       buildoptions { "-pthread", "-Wall" }
       optimize "Speed"
 
@@ -59,9 +63,11 @@ project "gemma"
    includedirs { "src/" }
    links { "openblas" }
 
-
    filter "configurations:Debug"
       defines { "DEBUG" }
+      buildoptions { pkg_cpp_flags }
+      linkoptions { pkg_linker_flags }
+      links { "profiler" }
       symbols "On"
 
    filter "configurations:Release"
diff --git a/src/debug.cpp b/src/debug.cpp
index b26e173..6cefcc7 100644
--- a/src/debug.cpp
+++ b/src/debug.cpp
@@ -162,6 +162,8 @@ void disable_segfpe() {
 }
 */
 
+#ifndef NDEBUG
+
 void write(const char *s, const char *msg) {
   if (!is_debug_data_mode() && !is_debug_dump_mode()) return;
   ofstream out(debug_dump_path + "debug-dump-" + msg + ".txt");
@@ -232,6 +234,8 @@ void write(const gsl_matrix *m, const char *msg) {
           cout << "}" << endl;
 }
 
+#endif // NDEBUG
+
 /*
   Helper function to make sure gsl allocations do their job because
   gsl_matrix_alloc does not initialize values (behaviour that changed
diff --git a/src/debug.h b/src/debug.h
index 0489a81..a32bfd2 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -60,11 +60,22 @@ void disable_segfpe();
   { auto x = m * n;                                      \
     enforce_msg(x / m == n, "multiply integer overflow"); }
 
+#ifndef NDEBUG
+// Debug builds: the write() overloads are declared here and defined in debug.cpp.
 void write(const double d, const char *msg = "");
 void write(const char *s, const char *msg = "");
 void write(const gsl_vector *v, const char *msg = "");
 void write(const gsl_matrix *m, const char *msg = "");
 
+#else // NDEBUG
+// NDEBUG builds: same overloads as empty inline no-ops; parameters are unnamed because unused.
+inline void write(const double, const char * = "") {}
+inline void write(const char *, const char * = "") {}
+inline void write(const gsl_vector *, const char * = "") {}
+inline void write(const gsl_matrix *, const char * = "") {}
+
+#endif // NDEBUG
+
 gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols);
 int gsl_matrix_safe_memcpy (gsl_matrix *dest, const gsl_matrix *src);
 void gsl_matrix_safe_free (gsl_matrix *v);
diff --git a/test/performance/releases.org b/test/performance/releases.org
index b208e54..4cc92f1 100644
--- a/test/performance/releases.org
+++ b/test/performance/releases.org
@@ -1,5 +1,72 @@
 * GEMMA performance stats
 
+** GEMMA 0.98.5
+
+Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
+
+We are facing a time regression.
+
+premake5 gmake2 && make verbose=1 config=release -j 8 gemma && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check -debug
+
+With openblas 0.3.21 we go a bit faster. Still behind though, there is room for tweaking. But I want to run some bigger files first.
+
+#+begin_src sh
+Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025
+Reading Files ...
+## number of total individuals = 1940
+## number of analyzed individuals = 1410
+## number of covariates = 1
+## number of phenotypes = 1
+## number of total SNPs/var        =    12226
+## number of analyzed SNPs         =    10768
+Start Eigen-Decomposition...
+pve estimate =0.608801
+se(pve) =0.032774
+================================================== 100%
+real    0m9.017s
+user    0m13.168s
+sys     0m5.919s
+#+end_src
+
+
+#+begin_src sh
+Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025
+Reading Files ...
+## number of total individuals = 1940
+## number of analyzed individuals = 1410
+## number of covariates = 1
+## number of phenotypes = 1
+## number of total SNPs/var        =    12226
+## number of analyzed SNPs         =    10768
+Start Eigen-Decomposition...
+pve estimate =0.608801
+se(pve) =0.032774
+================================================== 100%
+real    0m16.772s
+user    0m25.443s
+sys     0m0.901s
+#+end_src
+
+The output looks the same. Good. The first difference found so far is a much newer openblas, 0.3.30 (up from 0.3.9). In the source code we added checkpoints and more debugging, particularly write statements. I disabled the latter, but still no dice.
+
+When compiled with the profiler library, prefix the gemma run with CPUPROFILE=gemma.prof to record a profile, then inspect it with pprof:
+
+#+begin_src sh
+premake5 gmake2 && make verbose=1 config=debug -j 8 gemma && time CPUPROFILE=gemma.prof LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check -debug
+CPUPROFILE=gemma.prof
+pprof --text build/bin/Debug/gemma gemma.prof
+
+    1007  49.2%  49.2%     1015  49.6% dot_compute
+      94   4.6%  53.8%       94   4.6% rpcc
+      74   3.6%  57.5%       74   3.6% gsl_vector_div
+      62   3.0%  60.5%       92   4.5% ____strtod_l_internal
+      42   2.1%  62.5%       42   2.1% dgemm_kernel_ZEN
+#+end_src
+
+This led me to try the newer openblas on the older gemma - and indeed, the regression is coming from the openblas version. Even though it reports 'OpenBLAS 0.3.30 DYNAMIC_ARCH NO_AFFINITY Zen MAX_THREADS=128', I suspect the dynamic arch is not really optimizing.
+
+Well, at least I found the problem. Time for a special openblas build like I used to do.
+
 ** GEMMA 0.98.5-pre1
 
 Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
diff --git a/test/runner b/test/runner
new file mode 100755
index 0000000..ad5b381
--- /dev/null
+++ b/test/runner
@@ -0,0 +1,24 @@
+#!/bin/sh
+# -*- mode: scheme; -*-
+exec guile --debug -s "$0" "$@"
+!#
+;; Minimal SRFI-64 script: an outer "runner" group wrapping one "vec-test" group on a vector.
+(define-module (test-runner)
+  #:use-module (ice-9 match)
+  #:use-module (srfi srfi-64)
+  )
+
+(test-begin "runner")
+
+(test-begin "vec-test")
+(define v (make-vector 5 99))
+;; The freshly made object must satisfy vector?.
+(test-assert (vector? v))
+;; Slot 2 starts out as the fill value 99, then reads back 7 after vector-set!.
+(test-eqv 99 (vector-ref v 2))
+(vector-set! v 2 7)
+(test-eqv 7 (vector-ref v 2))
+;; Close the inner group; the final test-end below closes the outer "runner" group.
+(test-end "vec-test")
+
+(test-end "runner")