diff options
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | guix.scm | 101 | ||||
| -rw-r--r-- | premake5.lua | 10 | ||||
| -rw-r--r-- | src/debug.cpp | 4 | ||||
| -rw-r--r-- | src/debug.h | 11 | ||||
| -rw-r--r-- | test/performance/releases.org | 67 | ||||
| -rwxr-xr-x | test/runner | 24 |
7 files changed, 213 insertions, 6 deletions
diff --git a/.gitignore b/.gitignore index d915f60..86c3228 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ PanGemma.make gemma.make gemmalib.make Makefile +gdb.txt +*.prof *.o *.tar.gz src/Eigen diff --git a/guix.scm b/guix.scm index 299332e..23b7583 100644 --- a/guix.scm +++ b/guix.scm @@ -7,10 +7,15 @@ ;; ;; guix shell -C -D -F -v 3 -L . pangemma-shell-git # pangemma-shell-git ;; -;; see premake5.lua header for examples. +;; optimized for arch: +;; +;; guix shell --tune=native -C -D -F -v 3 -L . pangemma-shell-git # pangemma-shell-git +;; +;; see premake5.lua header for examples. ;; ;; guix shell -C -D -F -v 3 -L . gemma-git # for specific packages ;; +;; To optimize use guix --tune=march-type (e.g. --tune=native) (define-module (guix) #:use-module ((guix licenses) #:prefix license:) @@ -18,13 +23,17 @@ #:use-module (guix packages) #:use-module (guix git-download) #:use-module (guix build-system gnu) + #:use-module (guix utils) + #:use-module (gnu packages algebra) #:use-module (gnu packages base) #:use-module (gnu packages build-tools) #:use-module (gnu packages compression) #:use-module (gnu packages commencement) #:use-module (gnu packages check) + #:use-module (gnu packages cpp) #:use-module (gnu packages databases) + #:use-module (gnu packages gcc) #:use-module (gnu packages gdb) #:use-module (gnu packages guile) #:use-module (gnu packages guile-xyz) @@ -45,6 +54,90 @@ (define %pangemma-version (read-string (open-pipe "cat VERSION|tr -d $'\n'" OPEN_READ))) +(define-public openblas-pangemma +;; we are fixating on an older openblas, for now + (package + (name "openblas-pangemma") + (version "0.3.21") + (source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/xianyi/OpenBLAS") + (commit (string-append "v" version)))) + (file-name (git-file-name name version)) + (sha256 + (base32 + "0yx1axiki12y0xz0d5s76vvl7ds36k0npv1sww08k2qslhz1g9qp")))) + (build-system gnu-build-system) + (properties `((tunable? . 
#t))) + (arguments + (list + #:tests? #f ;; skip tests + #:test-target "test" + ;; No default baseline is supplied for powerpc-linux. + #:substitutable? (not (target-ppc32?)) + #:make-flags + #~(list (string-append "PREFIX=" #$output) + (string-append "CFLAGS=-O3 -g -Wno-incompatible-pointer-types -Wno-error=implicit-function-declaration") + "SHELL=bash" + "MAKE_NB_JOBS=0" ;use jobserver for submakes + + ;; This is the maximum number of threads OpenBLAS will ever use (that + ;; is, if $OPENBLAS_NUM_THREADS is greater than that, then NUM_THREADS + ;; is used.) If we don't set it, the makefile sets it to the number + ;; of cores of the build machine, which is obviously wrong. + "NUM_THREADS=128" + + ;; DYNAMIC_ARCH is only supported on some architectures. + ;; DYNAMIC_ARCH combined with TARGET=GENERIC provides a library + ;; which uses the optimizations for the detected CPU. This can + ;; be overridden at runtime with the environment variable + ;; OPENBLAS_CORETYPE=<type>, where "type" is a supported CPU + ;; type. On other architectures we target only the baseline CPU + ;; supported by Guix. + #$@(cond + ((or (target-x86-64?) + (target-x86-32?) + (target-ppc64le?) + (target-aarch64?)) + ;; Dynamic older enables a few extra CPU architectures + ;; on x86_64 that were released before 2010. + '("DYNAMIC_ARCH=1" "DYNAMIC_OLDER=1" "TARGET=GENERIC")) + ;; On some of these architectures the CPU type can't be detected. + ;; We list the oldest CPU core we want to have support for. + ;; On MIPS we force the "SICORTEX" TARGET, as for the other + ;; two available MIPS targets special extended instructions + ;; for Loongson cores are used. + ((target-mips64el?) + '("TARGET=SICORTEX")) + ((target-arm32?) + '("TARGET=ARMV7")) + ((target-riscv64?) 
+ '("TARGET=RISCV64_GENERIC")) + (else '()))) + ;; no configure script + #:phases + #~(modify-phases %standard-phases + (delete 'configure) + (add-before 'build 'set-extralib + (lambda* (#:key inputs #:allow-other-keys) + ;; Get libgfortran found when building in utest. + (setenv "FEXTRALIB" + (string-append + "-L" + (dirname + (search-input-file inputs "/lib/libgfortran.so"))))))))) + (inputs + (list `(,gfortran "lib"))) + (native-inputs + (list cunit gfortran perl)) + (home-page "https://www.openblas.net/") + (synopsis "Optimized BLAS library based on GotoBLAS") + (description + "OpenBLAS is a BLAS library forked from the GotoBLAS2-1.13 BSD version.") + (license license:bsd-3))) + (define-public pangemma-base-git "Pangemma base build package" (package @@ -54,7 +147,7 @@ (build-system gnu-build-system) (inputs (list gsl - openblas + openblas-pangemma guile-3.0 `(,guile-3.0 "debug") ;; `(,guile-3.0 "dev") @@ -102,7 +195,7 @@ genome-wide association studies (GWAS).") (build-system gnu-build-system) (propagated-inputs (modify-inputs (package-inputs pangemma-base-git) - (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb ;; for the shell + (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb gperftools ;; for the shell ))) (arguments `(#:phases (modify-phases %standard-phases @@ -126,7 +219,7 @@ genome-wide association studies (GWAS).") (list catch2 gdb gsl - openblas + openblas-pangemma zlib)) ;; ("gsl-static" ,gsl-static) ;; ("zlib:static" ,zlib "static") diff --git a/premake5.lua b/premake5.lua index 0c5483d..bb06cbd 100644 --- a/premake5.lua +++ b/premake5.lua @@ -1,6 +1,6 @@ -- Build with -- --- make clean && rm build/Release/ -rf +-- make clean && rm build -rf -- premake5 gmake2 && make verbose=1 gemmalib -j 8 -- -- Including bin @@ -39,10 +39,14 @@ workspace "PanGemma" filter "configurations:Debug" defines { "DEBUG" } + buildoptions { pkg_cpp_flags } + linkoptions { pkg_linker_flags } symbols "On" filter "configurations:Release" 
defines { "NDEBUG", "HAVE_INLINE" } + buildoptions { pkg_cpp_flags } + linkoptions { pkg_linker_flags } buildoptions { "-pthread", "-Wall" } optimize "Speed" @@ -59,9 +63,11 @@ project "gemma" includedirs { "src/" } links { "openblas" } - filter "configurations:Debug" defines { "DEBUG" } + buildoptions { pkg_cpp_flags } + linkoptions { pkg_linker_flags } + links { "profiler" } symbols "On" filter "configurations:Release" diff --git a/src/debug.cpp b/src/debug.cpp index b26e173..6cefcc7 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -162,6 +162,8 @@ void disable_segfpe() { } */ +#ifndef NDEBUG + void write(const char *s, const char *msg) { if (!is_debug_data_mode() && !is_debug_dump_mode()) return; ofstream out(debug_dump_path + "debug-dump-" + msg + ".txt"); @@ -232,6 +234,8 @@ void write(const gsl_matrix *m, const char *msg) { cout << "}" << endl; } +#endif // NDEBUG + /* Helper function to make sure gsl allocations do their job because gsl_matrix_alloc does not initiatize values (behaviour that changed diff --git a/src/debug.h b/src/debug.h index 0489a81..a32bfd2 100644 --- a/src/debug.h +++ b/src/debug.h @@ -60,11 +60,22 @@ void disable_segfpe(); { auto x = m * n; \ enforce_msg(x / m == n, "multiply integer overflow"); } +#ifndef NDEBUG + void write(const double d, const char *msg = ""); void write(const char *s, const char *msg = ""); void write(const gsl_vector *v, const char *msg = ""); void write(const gsl_matrix *m, const char *msg = ""); +#else // NDEBUG + +inline void write(const double d, const char *msg = "") {}; +inline void write(const char *s, const char *msg = "") {}; +inline void write(const gsl_vector *v, const char *msg = "") {}; +inline void write(const gsl_matrix *m, const char *msg = "") {}; + +#endif // NDEBUG + gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols); int gsl_matrix_safe_memcpy (gsl_matrix *dest, const gsl_matrix *src); void gsl_matrix_safe_free (gsl_matrix *v); diff --git a/test/performance/releases.org 
b/test/performance/releases.org index b208e54..4cc92f1 100644 --- a/test/performance/releases.org +++ b/test/performance/releases.org @@ -1,5 +1,72 @@ * GEMMA performance stats +** GEMMA 0.98.5 + +Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz. + +We are facing a time regression. + +premake5 gmake2 && make verbose=1 config=release -j 8 gemma && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check -debug + +With openblas 0.3.21 we go a bit faster. We are still behind, though; there is room for tweaking. But I want to run some bigger files first. + +#+begin_src sh +Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025 +Reading Files ... +## number of total individuals = 1940 +## number of analyzed individuals = 1410 +## number of covariates = 1 +## number of phenotypes = 1 +## number of total SNPs/var = 12226 +## number of analyzed SNPs = 10768 +Start Eigen-Decomposition... +pve estimate =0.608801 +se(pve) =0.032774 +================================================== 100% +real 0m9.017s +user 0m13.168s +sys 0m5.919s +#+end_src + + +#+begin_src sh +Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025 +Reading Files ... +## number of total individuals = 1940 +## number of analyzed individuals = 1410 +## number of covariates = 1 +## number of phenotypes = 1 +## number of total SNPs/var = 12226 +## number of analyzed SNPs = 10768 +Start Eigen-Decomposition... +pve estimate =0.608801 +se(pve) =0.032774 +================================================== 100% +real 0m16.772s +user 0m25.443s +sys 0m0.901s +#+end_src + +The output looks the same. Good. 
So far the first difference is a much later openblas 0.3.30 (over 0.3.9). In the source code we added checkpoints and more debugging, particularly write statements. I disabled the latter, but still no dice. + +When compiled with the profiler library, prefix the gemma run with the CPUPROFILE variable: + +#+begin_src sh +premake5 gmake2 && make verbose=1 config=debug -j 8 gemma && time CPUPROFILE=gemma.prof LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check -debug +CPUPROFILE=gemma.prof +pprof --text build/bin/Debug/gemma gemma.prof + + 1007 49.2% 49.2% 1015 49.6% dot_compute + 94 4.6% 53.8% 94 4.6% rpcc + 74 3.6% 57.5% 74 3.6% gsl_vector_div + 62 3.0% 60.5% 92 4.5% ____strtod_l_internal + 42 2.1% 62.5% 42 2.1% dgemm_kernel_ZEN +#+end_src + +This led me to try the newer openblas on the older gemma - and indeed, the regression is coming from the openblas version. Even though it says 'OpenBLAS 0.3.30 DYNAMIC_ARCH NO_AFFINITY Zen MAX_THREADS=128' I suspect the dynamic arch is not really optimizing. + +Well, at least I found the problem. Time for a special openblas build like I used to do. + ** GEMMA 0.98.5-pre1 Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz. diff --git a/test/runner b/test/runner new file mode 100755 index 0000000..ad5b381 --- /dev/null +++ b/test/runner @@ -0,0 +1,24 @@ +#!/bin/sh +# -*- mode: scheme; -*- +exec guile --debug -s "$0" "$@" +!# + +(define-module (test-runner) + #:use-module (ice-9 match) + #:use-module (srfi srfi-64) + ) + +(test-begin "runner") + +(test-begin "vec-test") +(define v (make-vector 5 99)) +;; Require that an expression evaluate to true. +(test-assert (vector? v)) +;; Test that an expression is eqv? to some other expression. +(test-eqv 99 (vector-ref v 2)) +(vector-set! v 2 7) +(test-eqv 7 (vector-ref v 2)) +;; Finish the testsuite, and report results. 
+(test-end "vec-test") + +(test-end "runner") |
