diff options
| -rw-r--r-- | README.md | 1 | ||||
| -rw-r--r-- | guix.scm | 105 | ||||
| -rw-r--r-- | guix/guix.scm | 256 | ||||
| -rw-r--r-- | src/lmm.cpp | 85 | ||||
| -rw-r--r-- | src/lmm.h | 10 |
5 files changed, 165 insertions, 292 deletions
diff --git a/README.md b/README.md index 3239ab9..1ed831d 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ GEMMA is the original software toolkit for fast application of linear mixed mode NOTE: December 2024 main software development has moved to [PanGEMMA](https://git.genenetwork.org/pangemma/about/)! Pangemma is essentially a fork of GEMMA that is meant to scale up for pangenomics. We are also taking the opportunity to revamp the code base. GEMMA itself is in *maintenance* mode. +See also [Speeding up GEMMA](https://issues.genenetwork.org/issues/genetics/speeding-up-gemma). Check out [RELEASE-NOTES.md](./RELEASE-NOTES.md) to see what's new in each release. diff --git a/guix.scm b/guix.scm new file mode 100644 index 0000000..ca4d365 --- /dev/null +++ b/guix.scm @@ -0,0 +1,105 @@ +;; To use this file to build HEAD of pangemma: +;; +;; guix build -f guix.scm --tune=native +;; +;; To get a development container (e.g., run in emacs shell). +;; +;; guix shell -C -D -f guix.scm +;; + +(use-modules + ((guix licenses) #:prefix license:) + (guix gexp) + (guix packages) + (guix git-download) + (guix build-system copy) + (guix build-system gnu) + (guix utils) + (gnu packages compression) + (gnu packages build-tools) + (gnu packages databases) + (gnu packages gcc) + (gnu packages guile) + (gnu packages maths) + (gnu packages pkg-config) + (srfi srfi-1) + (ice-9 popen) + (ice-9 rdelim)) + +(define %source-dir (dirname (current-filename))) + +(define %git-commit + (string-trim-right (read-string (open-pipe "git show HEAD | head -1 | cut -d ' ' -f 2" OPEN_READ)))) + +(define %pangemma-version + (string-trim-right (read-string (open-pipe "cat VERSION" OPEN_READ)))) + +(define-public lmdbxx + (let ((commit "6f497d1d8e1a6e0afa8ae83891d13b3fac68b62c") + (revision "0")) + (package + (name "lmdbxx") + (version (git-version "1.0.0" revision commit)) + (source (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/hoytech/lmdbxx") + (commit commit))) + (file-name (git-file-name name version)) + (sha256 + (base32 "1www519kfm0lq4g79i8hpfydwnfrf6p21fqrqcfl2sv072rghygj")))) + (build-system copy-build-system) ;; This is a headed only package + (arguments + (list #:install-plan + #~'(("include/lmdbxx/lmdb++.h" "include/lmdb++.h")))) + (home-page "https://github.com/hoytech/lmdbxx") + (synopsis "C++17 wrapper for the LMDB database library") + (description "Header-only C++17 wrapper for the LMDB embedded +key-value store, with string_view-based API.") + (license license:unlicense)))) + +(define-public pangemma-git + (package + (name "pangemma-git") + (version (git-version %pangemma-version "HEAD" %git-commit)) + (source (local-file %source-dir #:recursive? #t)) + (build-system gnu-build-system) + (inputs + (list gsl + guile-3.0 + lmdb + openblas ;; already optimized for AVX-512 kernels alongside SSE2/AVX2/etc. + zlib)) + (native-inputs + (list lmdbxx + pkg-config + premake5)) + (arguments + (list + #:tests? #f ; tests need test data and ruby + #:phases + #~(modify-phases %standard-phases + (delete 'configure) + (replace 'build + (lambda _ + (invoke "premake5" "gmake") + (invoke "make" "verbose=1" "config=release" "gemma" + (string-append "CXXFLAGS=-std=c++17" + " -I" #$(this-package-native-input "lmdbxx") "/include" + " -I" #$(this-package-input "lmdb") "/include") + (string-append "-j" (number->string + (parallel-job-count)))))) + (replace 'install + (lambda _ + (install-file "build/bin/Release/gemma" + (string-append #$output "/bin"))))))) + (properties '((tunable? . #t))) + (home-page "https://github.com/genetics-statistics/pangemma") + (synopsis "Pangenome-enabled GEMMA for genome-wide association studies") + (description "PanGemma is a pangenome-enabled version of GEMMA +(Genome-wide Efficient Mixed Model Association) for genome-wide association +studies (GWAS) with support for LMDB-backed genotype storage and Guile +scripting.") + (license license:gpl3))) + +pangemma-git diff --git a/guix/guix.scm b/guix/guix.scm deleted file mode 100644 index d8d593b..0000000 --- a/guix/guix.scm +++ /dev/null @@ -1,256 +0,0 @@ -;; To use this file to build HEAD of gemma: -;; -;; guix build -f guix/guix.scm # default builds pangemma-git -;; -;; To get a development container (e.g., run in emacs shell). -;; -;; guix shell -C -D -F -f guix/guix.scm # pangemma-shell-git -;; -;; see premake5.lua for build/test instructions -;; -;; optimized for arch: -;; -;; guix shell --tune=native -C -D -F # pangemma-shell-git -;; -;; see premake5.lua header for examples. -;; -;; To optimize use guix --tune=march-type (e.g. --tune=native) - -(define-module (guix) - #:use-module ((guix licenses) #:prefix license:) - #:use-module (guix gexp) - #:use-module (guix packages) - #:use-module (guix git-download) - #:use-module (guix build-system gnu) - #:use-module (guix utils) - - #:use-module (gnu packages algebra) - #:use-module (gnu packages base) - #:use-module (gnu packages build-tools) - #:use-module (gnu packages compression) - #:use-module (gnu packages commencement) - #:use-module (gnu packages check) - #:use-module (gnu packages cpp) - #:use-module (gnu packages databases) - #:use-module (gnu packages gcc) - #:use-module (gnu packages gdb) - #:use-module (gnu packages guile) - #:use-module (gnu packages guile-xyz) - #:use-module (gnu packages maths) - #:use-module (gnu packages ninja) - #:use-module (gnu packages perl) - #:use-module (gnu packages pkg-config) - #:use-module (gnu packages ruby) - #:use-module (gnu packages time) - #:use-module (srfi srfi-1) - #:use-module (ice-9 popen) - #:use-module (ice-9 rdelim)) - -(define %source-dir (dirname (current-filename))) - -(define %git-commit - (read-string (open-pipe "git describe --always --tags --long|tr -d $'\n'" OPEN_READ))) - -(define %pangemma-version - (read-string (open-pipe "cat VERSION|tr -d $'\n'" OPEN_READ))) - -(define-public openblas-pangemma -;; we are fixating on an older openblas, for now - (package - (name "openblas-pangemma") - (version "0.3.21") - (source - (origin - (method git-fetch) - (uri (git-reference - (url "https://github.com/xianyi/OpenBLAS") - (commit (string-append "v" version)))) - (file-name (git-file-name name version)) - (sha256 - (base32 - "0yx1axiki12y0xz0d5s76vvl7ds36k0npv1sww08k2qslhz1g9qp")))) - (build-system gnu-build-system) - (properties `((tunable? . #t))) - (arguments - (list - #:tests? #f ;; skip tests - #:test-target "test" - ;; No default baseline is supplied for powerpc-linux. - #:substitutable? (not (target-ppc32?)) - #:make-flags - #~(list (string-append "PREFIX=" #$output) - (string-append "CFLAGS=-O3 -g -Wno-incompatible-pointer-types -Wno-error=implicit-function-declaration") - "COPT=" - "COMMON_OPT=" - "DYNAMIC_ARCH=" - "SHELL=bash" - "MAKE_NB_JOBS=0" ;use jobserver for submakes - - ;; This is the maximum number of threads OpenBLAS will ever use (that - ;; is, if $OPENBLAS_NUM_THREADS is greater than that, then NUM_THREADS - ;; is used.) If we don't set it, the makefile sets it to the number - ;; of cores of the build machine, which is obviously wrong. - "NUM_THREADS=128" - - ;; DYNAMIC_ARCH is only supported on some architectures. - ;; DYNAMIC_ARCH combined with TARGET=GENERIC provides a library - ;; which uses the optimizations for the detected CPU. This can - ;; be overridden at runtime with the environment variable - ;; OPENBLAS_CORETYPE=<type>, where "type" is a supported CPU - ;; type. On other architectures we target only the baseline CPU - ;; supported by Guix. - #$@(cond - ((or (target-x86-64?) - (target-x86-32?) - (target-ppc64le?) - (target-aarch64?)) - ;; Dynamic older enables a few extra CPU architectures - ;; on x86_64 that were released before 2010. - '("DYNAMIC_ARCH=1" "TARGET=GENERIC")) - ;; '("DYNAMIC_ARCH=" "TARGET_CORE=ZEN")) - ;; On some of these architectures the CPU type can't be detected. - ;; We list the oldest CPU core we want to have support for. - ;; On MIPS we force the "SICORTEX" TARGET, as for the other - ;; two available MIPS targets special extended instructions - ;; for Loongson cores are used. - ((target-mips64el?) - '("TARGET=SICORTEX")) - ((target-arm32?) - '("TARGET=ARMV7")) - ((target-riscv64?) - '("TARGET=RISCV64_GENERIC")) - (else '()))) - ;; no configure script - #:phases - #~(modify-phases %standard-phases - (delete 'configure) - (add-before 'build 'set-extralib - (lambda* (#:key inputs #:allow-other-keys) - ;; Get libgfortran found when building in utest. - (setenv "FEXTRALIB" - (string-append - "-L" - (dirname - (search-input-file inputs "/lib/libgfortran.so"))))))))) - (inputs - (list `(,gfortran "lib"))) - (native-inputs - (list cunit gfortran perl)) - (home-page "https://www.openblas.net/") - (synopsis "Optimized BLAS library based on GotoBLAS") - (description - "OpenBLAS is a BLAS library forked from the GotoBLAS2-1.13 BSD version.") - (license license:bsd-3))) - -(define-public pangemma-base-git - "Pangemma base build package" - (package - (name "pangemma-git") - (version (git-version %pangemma-version "HEAD" %git-commit)) - (source (local-file %source-dir #:recursive? #t)) - (build-system gnu-build-system) - (inputs - (list gsl - openblas-pangemma - guile-3.0 - `(,guile-3.0 "debug") - ;; `(,guile-3.0 "dev") - guile-lmdb - lmdb - lmdbxx - pkg-config - ;; ninja - ;; ruby - time - zlib)) - ;; (propagated-inputs - ;; (list - ;; `("guile" ,guile-3.0-latest) - ;; `("guile-debug" ,guile-3.0-latest "debug") - ;; `("guile" ,guile-3.0-latest "dev"))) - - ;; ("gsl-static" ,gsl-static) - ;; ("zlib:static" ,zlib "static") - (arguments - `(#:phases - (modify-phases %standard-phases - (delete 'configure) - (delete 'validate-runpath) - (add-before 'build 'bin-mkdir - (lambda _ - (mkdir-p "bin") - )) - (replace 'install - (lambda* (#:key outputs #:allow-other-keys) - (let ((out (assoc-ref outputs "out"))) - (install-file "bin/gemma" (string-append out "/bin")))))) - #:tests? #t - #:parallel-tests? #f)) - (home-page "https://git.genenetwork.org/pangemma/") - (synopsis "Tool for genome-wide efficient mixed model association") - (description "New version of Genome-wide Efficient Mixed Model Association (PANGEMMA) -provides a standard linear mixed model resolver with application in -genome-wide association studies (GWAS).") - (license license:gpl3))) - -(define-public pangemma-shell-git - "Shell version for development" - (package - (inherit pangemma-base-git) - (name "pangemma-shell-git") - (build-system gnu-build-system) - (propagated-inputs - (modify-inputs (package-inputs pangemma-base-git) - (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb gperftools ;; for the shell - ))) - (arguments - `(#:phases (modify-phases %standard-phases - (delete 'configure) - (delete 'build) - (delete 'package) - (delete 'check) - (delete 'install)))) - (description "Pangemma shell for development") - )) - -;; ---- legacy build ----------------------------------------------------------------- -(define-public gemma-git - "Original legacy gemma -- for as long as it compiles" - (package - (name "gemma-git") - (version (git-version %pangemma-version "HEAD" %git-commit)) - (source (local-file %source-dir #:recursive? #t)) - (build-system gnu-build-system) - (inputs - (list catch2 - gdb - gsl - openblas-pangemma - zlib)) - ;; ("gsl-static" ,gsl-static) - ;; ("zlib:static" ,zlib "static") - (native-inputs ; for running tests - (list perl which)) - (arguments - `(#:phases - (modify-phases %standard-phases - (delete 'configure) - (delete 'validate-runpath) - (add-before 'build 'bin-mkdir - (lambda _ - (mkdir-p "bin") - )) - (replace 'install - (lambda* (#:key outputs #:allow-other-keys) - (let ((out (assoc-ref outputs "out"))) - (install-file "bin/gemma" (string-append out "/bin")))))) - #:tests? #t - #:parallel-tests? #f)) - (home-page "https://github.com/genetics-statistics") - (synopsis "Tool for genome-wide efficient mixed model association") - (description "Genome-wide Efficient Mixed Model Association (GEMMA) -provides a standard linear mixed model resolver with application in -genome-wide association studies (GWAS).") - (license license:gpl3))) - -pangemma-shell-git diff --git a/src/lmm.cpp b/src/lmm.cpp index 969e6fc..9fe6bbb 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -1981,7 +1981,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); } - time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); auto markerinfo = markers[i]; @@ -1991,14 +1990,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, } }; - /* - const auto num_markers = indicator_snp.size(); - enforce_msg(num_markers > 0,"Zero SNPs to process - data corrupt?"); - if (num_markers < 50) { - cerr << num_markers << " SNPs" << endl; - warning_msg("very few SNPs processed"); - } - */ const size_t progress_step = (num_markers/50>d_pace ? num_markers/50 : d_pace); Markers markers; @@ -2012,23 +2003,17 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, // continue; auto tup = fetch_snp(t); // use the callback - auto success = get<0>(tup); - if (!success) + auto state = get<0>(tup); + if (state == SKIP) continue; - // typedef tuple< bool,MarkerInfo,vector<double> > SnpNameValues2; - // auto marker = get<1>(tup); - // auto chr = get<2>(tup); - // auto mpos = get<3>(tup); + if (state == LAST) + break; // marker loop because of LOCO + auto markerinfo = get<1>(tup); auto gs = get<2>(tup); markers.push_back(markerinfo); - // check whether SNP is included in gwasnps (used by LOCO) - /* - if (process_gwasnps && gwasnps.count(snp) == 0) - continue; - */ // drop missing idv and plug mean values for missing geno double x_total = 0.0; // sum genotype values to compute x_mean uint vpos = 0; // position in target vector @@ -2067,7 +2052,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, gsl_vector_safe_memcpy(&Xlarge_col.vector, x); c++; // count markers going in - if (c % msize == 0) { batch_compute(msize,markers); markers.clear(); @@ -2094,6 +2078,44 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, return; } + outfile << "chr" << "\t" + << "rs" << "\t" + << "ps" << "\t" + << "n_miss" << "\t" + << "allele1" << "\t" + << "allele0" << "\t" + << "af" << "\t"; + + if (a_mode != M_LMM2) { + outfile << "beta" << "\t"; + outfile << "se" << "\t"; + } + + if (a_mode != M_LMM3 && a_mode != M_LMM9) + outfile << "logl_H1" << "\t"; + + switch(a_mode) { + case M_LMM1: + outfile << "l_remle" << "\t" + << "p_wald" << endl; + break; + case M_LMM2: + case M_LMM9: + outfile << "l_mle" << "\t" + << "p_lrt" << endl; + break; + case M_LMM3: + outfile << "p_score" << endl; + break; + case M_LMM4: + outfile << "l_remle" << "\t" + << "l_mle" << "\t" + << "p_wald" << "\t" + << "p_lrt" << "\t" + << "p_score" << endl; + break; + } + auto sumstats = [&] (SUMSTAT2 st) { outfile << scientific << setprecision(6); auto m = st.markerinfo; @@ -2212,10 +2234,6 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, mdb_stat(rtxn, geno_mdb, &stat); auto num_markers = stat.ms_entries; - // fetch_snp is a callback function for every SNP row - // returns typedef std::tuple<bool,string,std::vector<double> > SnpNameValues; - - // size_t prev_line = 0; auto mdb_fetch = MDB_FIRST; auto cursor = lmdb::cursor::open(rtxn, geno_mdb); @@ -2227,20 +2245,14 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, string_view key,value; - /* - while (prev_line <= num) { - cursor.get(key, value, MDB_NEXT); - prev_line++; - } - */ - auto success = cursor.get(key, value, mdb_fetch); + auto mdb_success = cursor.get(key, value, mdb_fetch); mdb_fetch = MDB_NEXT; // uint8_t chr; vector<double> gs; MarkerInfo markerinfo; - if (success) { + if (mdb_success) { size_t size = 0; // ---- Depending on the format we get different buffers - currently float and byte are supported: if (format == "Gb") { @@ -2282,7 +2294,10 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, // printf("%#02x %#02x\n", chr, loco_chr); if (is_loco && loco_chr != chr) { - return make_tuple(false, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + if (chr > loco_chr) + return make_tuple(LAST, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + else + return make_tuple(SKIP, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); } string_view value2; @@ -2299,7 +2314,7 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl; } - return make_tuple(success, markerinfo, gs); + return make_tuple(COMPUTE, markerinfo, gs); }; LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,num_markers); diff --git a/src/lmm.h b/src/lmm.h index da5ad21..295602a 100644 --- a/src/lmm.h +++ b/src/lmm.h @@ -55,7 +55,15 @@ struct MarkerInfo { typedef vector<MarkerInfo> Markers; typedef tuple< string,vector<double> > SnpNameValues; -typedef tuple< bool,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed) + +enum MarkerState { + FAIL, + COMPUTE, + SKIP, + LAST +}; + +typedef tuple< MarkerState,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed) // Results for LMM. class SUMSTAT2 { public: |
