Added profiler and figured out speed regression with openblas

author: Pjotr Prins 2025-11-24 13:06:50 +0100
committer: Pjotr Prins 2025-11-24 13:06:50 +0100
commit: f03c82ea21acda54de8cced07ba8150cfafb3769 (patch)
tree: 2432c99cbfed02f3fe9a84a5b55643aff44c1bdb
parent: c5a402a651d3c6393b1f758fc011c7247e4f042f (diff)
download: pangemma-f03c82ea21acda54de8cced07ba8150cfafb3769.tar.gz
7 files changed, 94 insertions, 3 deletions
diff --git a/.gitignore b/.gitignore
index d915f60..86c3228 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@ PanGemma.make
 gemma.make
 gemmalib.make
 Makefile
+gdb.txt
+*.prof
 *.o
 *.tar.gz
 src/Eigen
diff --git a/guix.scm b/guix.scm
index 299332e..e142d7e 100644
--- a/guix.scm
+++ b/guix.scm
@@ -5,12 +5,13 @@
 ;;
 ;; To get a development container (e.g., run in emacs shell).
 ;;
-;;   guix shell -C -D -F -v 3 -L . pangemma-shell-git  # pangemma-shell-git
+;;   guix shell --tune=native -C -D -F -v 3 -L . pangemma-shell-git  # pangemma-shell-git
 ;;
 ;;   see premake5.lua header for examples.
 ;;
 ;;   guix shell -C -D -F -v 3 -L . gemma-git           # for specific packages
 ;;
+;; To optimize, use guix --tune=march-type (e.g. --tune=native)
 
 (define-module (guix)
   #:use-module ((guix licenses) #:prefix license:)
@@ -24,6 +25,7 @@
   #:use-module (gnu packages compression)
   #:use-module (gnu packages commencement)
   #:use-module (gnu packages check)
+  #:use-module (gnu packages cpp)
   #:use-module (gnu packages databases)
   #:use-module (gnu packages gdb)
   #:use-module (gnu packages guile)
@@ -52,6 +54,7 @@
     (version (git-version %pangemma-version "HEAD" %git-commit))
     (source (local-file %source-dir #:recursive? #t))
     (build-system gnu-build-system)
+    (properties `((tunable? . #t)))
     (inputs
      (list gsl
            openblas
@@ -102,7 +105,7 @@ genome-wide association studies (GWAS).")
     (build-system gnu-build-system)
     (propagated-inputs
      (modify-inputs (package-inputs pangemma-base-git)
-                    (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb ;; for the shell
+                    (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb gperftools ;; for the shell
                  )))
     (arguments
      `(#:phases (modify-phases %standard-phases
diff --git a/premake5.lua b/premake5.lua
index 0c5483d..1091cd8 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -59,9 +59,11 @@ project "gemma"
    includedirs { "src/" }
    links { "openblas" }
 
-
    filter "configurations:Debug"
       defines { "DEBUG" }
+      buildoptions { pkg_cpp_flags }
+      linkoptions { pkg_linker_flags }
+      links { "profiler" }
       symbols "On"
 
    filter "configurations:Release"
diff --git a/src/debug.cpp b/src/debug.cpp
index b26e173..6cefcc7 100644
--- a/src/debug.cpp
+++ b/src/debug.cpp
@@ -162,6 +162,8 @@ void disable_segfpe() {
 }
 */
 
+#ifndef NDEBUG
+
 void write(const char *s, const char *msg) {
   if (!is_debug_data_mode() && !is_debug_dump_mode()) return;
   ofstream out(debug_dump_path + "debug-dump-" + msg + ".txt");
@@ -232,6 +234,8 @@ void write(const gsl_matrix *m, const char *msg) {
           cout << "}" << endl;
 }
 
+#endif // NDEBUG
+
 /*
   Helper function to make sure gsl allocations do their job because
   gsl_matrix_alloc does not initiatize values (behaviour that changed
diff --git a/src/debug.h b/src/debug.h
index 0489a81..a32bfd2 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -60,11 +60,22 @@ void disable_segfpe();
   { auto x = m * n;                                      \
     enforce_msg(x / m == n, "multiply integer overflow"); }
 
+#ifndef NDEBUG
+
 void write(const double d, const char *msg = "");
 void write(const char *s, const char *msg = "");
 void write(const gsl_vector *v, const char *msg = "");
 void write(const gsl_matrix *m, const char *msg = "");
 
+#else // NDEBUG
+
+inline void write(const double d, const char *msg = "") {};
+inline void write(const char *s, const char *msg = "") {};
+inline void write(const gsl_vector *v, const char *msg = "") {};
+inline void write(const gsl_matrix *m, const char *msg = "") {};
+
+#endif // NDEBUG
+
 gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols);
 int gsl_matrix_safe_memcpy (gsl_matrix *dest, const gsl_matrix *src);
 void gsl_matrix_safe_free (gsl_matrix *v);
diff --git a/test/performance/releases.org b/test/performance/releases.org
index b208e54..c973607 100644
--- a/test/performance/releases.org
+++ b/test/performance/releases.org
@@ -1,5 +1,50 @@
 * GEMMA performance stats
 
+** GEMMA 0.98.5
+
+Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
+
+We are facing a time regression.
+
+premake5 gmake2 && make verbose=1 config=release -j 8 gemma && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check -debug
+
+
+#+begin_src sh
+Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025
+Reading Files ...
+## number of total individuals = 1940
+## number of analyzed individuals = 1410
+## number of covariates = 1
+## number of phenotypes = 1
+## number of total SNPs/var        =    12226
+## number of analyzed SNPs         =    10768
+Start Eigen-Decomposition...
+pve estimate =0.608801
+se(pve) =0.032774
+================================================== 100%
+real    0m16.772s
+user    0m25.443s
+sys     0m0.901s
+#+end_src
+
+The output looks the same. Good. So far the first difference is a much newer openblas: 0.3.30 instead of 0.3.9. In the source code we added checkpoints and more debugging, particularly write statements. I disabled the latter, but still no dice.
+
+When compiled with the profiler library, prefix the gemma run with CPUPROFILE and inspect the resulting profile with pprof:
+
+#+begin_src sh
+CPUPROFILE=gemma.prof
+pprof --text build/bin/Debug/gemma gemma.prof
+
+    1024  50.7%  50.7%     1024  50.7% dcopy_k_ZEN
+      99   4.9%  55.6%       99   4.9% openblas_read_env
+      67   3.3%  58.9%      107   5.3% ____strtod_l_internal
+      67   3.3%  62.3%       67   3.3% gsl_vector_div
+#+end_src
+
+This led me to try the newer openblas on the older gemma - and indeed, the regression is coming from the openblas version. Even though it reports 'OpenBLAS 0.3.30 DYNAMIC_ARCH NO_AFFINITY Zen MAX_THREADS=128', I suspect the dynamic arch is not really optimizing.
+
+Well, at least I found the problem. Time for a special openblas build like I used to do.
+
 ** GEMMA 0.98.5-pre1
 
 Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
diff --git a/test/runner b/test/runner
new file mode 100755
index 0000000..ad5b381
--- /dev/null
+++ b/test/runner
@@ -0,0 +1,24 @@
+#!/bin/sh
+# -*- mode: scheme; -*-
+exec guile --debug -s "$0" "$@"
+!#
+
+(define-module (test-runner)
+  #:use-module (ice-9 match)
+  #:use-module (srfi srfi-64)
+  )
+
+(test-begin "runner")
+
+(test-begin "vec-test")
+(define v (make-vector 5 99))
+;; Require that an expression evaluate to true.
+(test-assert (vector? v))
+;; Test that an expression is eqv? to some other expression.
+(test-eqv 99 (vector-ref v 2))
+(vector-set! v 2 7)
+(test-eqv 7 (vector-ref v 2))
+;; Finish the testsuite, and report results.
+(test-end "vec-test")
+
+(test-end "runner")
author	Pjotr Prins	2025-11-24 13:06:50 +0100
committer	Pjotr Prins	2025-11-24 13:06:50 +0100
commit	f03c82ea21acda54de8cced07ba8150cfafb3769 (patch)
tree	2432c99cbfed02f3fe9a84a5b55643aff44c1bdb
parent	c5a402a651d3c6393b1f758fc011c7247e4f042f (diff)
download	pangemma-f03c82ea21acda54de8cced07ba8150cfafb3769.tar.gz