diff options
| author | Pjotr Prins | 2025-11-24 13:06:50 +0100 |
|---|---|---|
| committer | Pjotr Prins | 2025-11-24 13:06:50 +0100 |
| commit | f03c82ea21acda54de8cced07ba8150cfafb3769 (patch) | |
| tree | 2432c99cbfed02f3fe9a84a5b55643aff44c1bdb | |
| parent | c5a402a651d3c6393b1f758fc011c7247e4f042f (diff) | |
| download | pangemma-f03c82ea21acda54de8cced07ba8150cfafb3769.tar.gz | |
Added profiler and figured out speed regression with openblas
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | guix.scm | 7 | ||||
| -rw-r--r-- | premake5.lua | 4 | ||||
| -rw-r--r-- | src/debug.cpp | 4 | ||||
| -rw-r--r-- | src/debug.h | 11 | ||||
| -rw-r--r-- | test/performance/releases.org | 45 | ||||
| -rwxr-xr-x | test/runner | 24 |
7 files changed, 94 insertions, 3 deletions
diff --git a/.gitignore b/.gitignore index d915f60..86c3228 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ PanGemma.make gemma.make gemmalib.make Makefile +gdb.txt +*.prof *.o *.tar.gz src/Eigen diff --git a/guix.scm b/guix.scm index 299332e..e142d7e 100644 --- a/guix.scm +++ b/guix.scm @@ -5,12 +5,13 @@ ;; ;; To get a development container (e.g., run in emacs shell). ;; -;; guix shell -C -D -F -v 3 -L . pangemma-shell-git # pangemma-shell-git +;; guix shell --tune=native -C -D -F -v 3 -L . pangemma-shell-git # pangemma-shell-git ;; ;; see premake5.lua header for examples. ;; ;; guix shell -C -D -F -v 3 -L . gemma-git # for specific packages ;; +;; To optimize use guix --tune=march-type (e.g. --tune=native) (define-module (guix) #:use-module ((guix licenses) #:prefix license:) @@ -24,6 +25,7 @@ #:use-module (gnu packages compression) #:use-module (gnu packages commencement) #:use-module (gnu packages check) + #:use-module (gnu packages cpp) #:use-module (gnu packages databases) #:use-module (gnu packages gdb) #:use-module (gnu packages guile) @@ -52,6 +54,7 @@ (version (git-version %pangemma-version "HEAD" %git-commit)) (source (local-file %source-dir #:recursive? #t)) (build-system gnu-build-system) + (properties `((tunable? . 
#t))) (inputs (list gsl openblas @@ -102,7 +105,7 @@ genome-wide association studies (GWAS).") (build-system gnu-build-system) (propagated-inputs (modify-inputs (package-inputs pangemma-base-git) - (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb ;; for the shell + (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb gperftools ;; for the shell ))) (arguments `(#:phases (modify-phases %standard-phases diff --git a/premake5.lua b/premake5.lua index 0c5483d..1091cd8 100644 --- a/premake5.lua +++ b/premake5.lua @@ -59,9 +59,11 @@ project "gemma" includedirs { "src/" } links { "openblas" } - filter "configurations:Debug" defines { "DEBUG" } + buildoptions { pkg_cpp_flags } + linkoptions { pkg_linker_flags } + links { "profiler" } symbols "On" filter "configurations:Release" diff --git a/src/debug.cpp b/src/debug.cpp index b26e173..6cefcc7 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -162,6 +162,8 @@ void disable_segfpe() { } */ +#ifndef NDEBUG + void write(const char *s, const char *msg) { if (!is_debug_data_mode() && !is_debug_dump_mode()) return; ofstream out(debug_dump_path + "debug-dump-" + msg + ".txt"); @@ -232,6 +234,8 @@ void write(const gsl_matrix *m, const char *msg) { cout << "}" << endl; } +#endif // NDEBUG + /* Helper function to make sure gsl allocations do their job because gsl_matrix_alloc does not initiatize values (behaviour that changed diff --git a/src/debug.h b/src/debug.h index 0489a81..a32bfd2 100644 --- a/src/debug.h +++ b/src/debug.h @@ -60,11 +60,22 @@ void disable_segfpe(); { auto x = m * n; \ enforce_msg(x / m == n, "multiply integer overflow"); } +#ifndef NDEBUG + void write(const double d, const char *msg = ""); void write(const char *s, const char *msg = ""); void write(const gsl_vector *v, const char *msg = ""); void write(const gsl_matrix *m, const char *msg = ""); +#else // NDEBUG + +inline void write(const double d, const char *msg = "") {}; +inline void write(const char *s, const char *msg = 
"") {}; +inline void write(const gsl_vector *v, const char *msg = "") {}; +inline void write(const gsl_matrix *m, const char *msg = "") {}; + +#endif // NDEBUG + gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols); int gsl_matrix_safe_memcpy (gsl_matrix *dest, const gsl_matrix *src); void gsl_matrix_safe_free (gsl_matrix *v); diff --git a/test/performance/releases.org b/test/performance/releases.org index b208e54..c973607 100644 --- a/test/performance/releases.org +++ b/test/performance/releases.org @@ -1,5 +1,50 @@ * GEMMA performance stats +** GEMMA 0.98.5 + +Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz. + +We are facing a time regression. + +premake5 gmake2 && make verbose=1 config=release -j 8 gemma && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check -debug + + +#+begin_src sh +Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025 +Reading Files ... +## number of total individuals = 1940 +## number of analyzed individuals = 1410 +## number of covariates = 1 +## number of phenotypes = 1 +## number of total SNPs/var = 12226 +## number of analyzed SNPs = 10768 +Start Eigen-Decomposition... +pve estimate =0.608801 +se(pve) =0.032774 +================================================== 100% +real 0m16.772s +user 0m25.443s +sys 0m0.901s +#+end_src sh + +The output looks the same. Good. So far the first difference is a much later openblas 0.3.30 (over 0.3.9). In the source code we added checkpoints and more debugging, particularly write statements. I disabled the latter, but still no dice. 
+ +When compiled with the profiler library, prefix the gemma run with + +#+begin_src sh +CPUPROFILE=gemma.prof +pprof --text build/bin/Debug/gemma gemma.prof + + 1024 50.7% 50.7% 1024 50.7% dcopy_k_ZEN + 99 4.9% 55.6% 99 4.9% openblas_read_env + 67 3.3% 58.9% 107 5.3% ____strtod_l_internal + 67 3.3% 62.3% 67 3.3% gsl_vector_div +#+end_src sh + +This led me to try the newer openblas on the older gemma - and indeed, the regression is coming from the openblas version. Even though it says 'OpenBLAS 0.3.30 DYNAMIC_ARCH NO_AFFINITY Zen MAX_THREADS=128' I suspect the dynamic arch is not really optimizing. + +Well, at least I found the problem. Time for a special openblas build like I used to do. + ** GEMMA 0.98.5-pre1 Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz. diff --git a/test/runner b/test/runner new file mode 100755 index 0000000..ad5b381 --- /dev/null +++ b/test/runner @@ -0,0 +1,24 @@ +#!/bin/sh +# -*- mode: scheme; -*- +exec guile --debug -s "$0" "$@" +!# + +(define-module (test-runner) + #:use-module (ice-9 match) + #:use-module (srfi srfi-64) + ) + +(test-begin "runner") + +(test-begin "vec-test") +(define v (make-vector 5 99)) +;; Require that an expression evaluate to true. +(test-assert (vector? v)) +;; Test that an expression is eqv? to some other expression. +(test-eqv 99 (vector-ref v 2)) +(vector-set! v 2 7) +(test-eqv 7 (vector-ref v 2)) +;; Finish the testsuite, and report results. +(test-end "vec-test") + +(test-end "runner") |
