about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--README.md1
-rw-r--r--guix.scm105
-rw-r--r--guix/guix.scm256
-rw-r--r--src/lmm.cpp85
-rw-r--r--src/lmm.h10
5 files changed, 165 insertions, 292 deletions
diff --git a/README.md b/README.md
index 3239ab9..1ed831d 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ GEMMA is the original software toolkit for fast application of linear mixed mode
 NOTE: December 2024 main software development has moved to [PanGEMMA](https://git.genenetwork.org/pangemma/about/)!
 Pangemma is essentially a fork of GEMMA that is meant to scale up for pangenomics. We are also taking the opportunity to revamp the code base. GEMMA itself is in *maintenance* mode.
 
+See also [Speeding up GEMMA](https://issues.genenetwork.org/issues/genetics/speeding-up-gemma).
 
 Check out [RELEASE-NOTES.md](./RELEASE-NOTES.md) to see what's new in each release.
 
diff --git a/guix.scm b/guix.scm
new file mode 100644
index 0000000..ca4d365
--- /dev/null
+++ b/guix.scm
@@ -0,0 +1,105 @@
+;; To use this file to build HEAD of pangemma:
+;;
+;;   guix build -f guix.scm --tune=native
+;;
+;; To get a development container (e.g., run in emacs shell).
+;;
+;;   guix shell -C -D -f guix.scm
+;;
+
+(use-modules
+  ((guix licenses) #:prefix license:)
+  (guix gexp)
+  (guix packages)
+  (guix git-download)
+  (guix build-system copy)
+  (guix build-system gnu)
+  (guix utils)
+  (gnu packages compression)
+  (gnu packages build-tools)
+  (gnu packages databases)
+  (gnu packages gcc)
+  (gnu packages guile)
+  (gnu packages maths)
+  (gnu packages pkg-config)
+  (srfi srfi-1)
+  (ice-9 popen)
+  (ice-9 rdelim))
+
+(define %source-dir (dirname (current-filename)))
+
+(define %git-commit
+    (string-trim-right (read-string (open-pipe "git show HEAD | head -1 | cut -d ' ' -f 2" OPEN_READ))))
+
+(define %pangemma-version
+    (string-trim-right (read-string (open-pipe "cat VERSION" OPEN_READ))))
+
+(define-public lmdbxx
+  (let ((commit "6f497d1d8e1a6e0afa8ae83891d13b3fac68b62c")
+        (revision "0"))
+    (package
+      (name "lmdbxx")
+      (version (git-version "1.0.0" revision commit))
+      (source (origin
+                (method git-fetch)
+                (uri (git-reference
+                      (url "https://github.com/hoytech/lmdbxx")
+                      (commit commit)))
+                (file-name (git-file-name name version))
+                (sha256
+                 (base32 "1www519kfm0lq4g79i8hpfydwnfrf6p21fqrqcfl2sv072rghygj"))))
+      (build-system copy-build-system) ;; This is a headed only package
+      (arguments
+       (list #:install-plan
+             #~'(("include/lmdbxx/lmdb++.h" "include/lmdb++.h"))))
+      (home-page "https://github.com/hoytech/lmdbxx")
+      (synopsis "C++17 wrapper for the LMDB database library")
+      (description "Header-only C++17 wrapper for the LMDB embedded
+key-value store, with string_view-based API.")
+      (license license:unlicense))))
+
+(define-public pangemma-git
+  (package
+    (name "pangemma-git")
+    (version (git-version %pangemma-version "HEAD" %git-commit))
+    (source (local-file %source-dir #:recursive? #t))
+    (build-system gnu-build-system)
+    (inputs
+     (list gsl
+           guile-3.0
+           lmdb
+           openblas ;; already optimized for AVX-512 kernels alongside SSE2/AVX2/etc.
+           zlib))
+    (native-inputs
+     (list lmdbxx
+           pkg-config
+           premake5))
+    (arguments
+     (list
+      #:tests? #f ; tests need test data and ruby
+      #:phases
+      #~(modify-phases %standard-phases
+          (delete 'configure)
+          (replace 'build
+            (lambda _
+              (invoke "premake5" "gmake")
+              (invoke "make" "verbose=1" "config=release" "gemma"
+                      (string-append "CXXFLAGS=-std=c++17"
+                                     " -I" #$(this-package-native-input "lmdbxx") "/include"
+                                     " -I" #$(this-package-input "lmdb") "/include")
+                      (string-append "-j" (number->string
+                                            (parallel-job-count))))))
+          (replace 'install
+            (lambda _
+              (install-file "build/bin/Release/gemma"
+                            (string-append #$output "/bin")))))))
+    (properties '((tunable? . #t)))
+    (home-page "https://github.com/genetics-statistics/pangemma")
+    (synopsis "Pangenome-enabled GEMMA for genome-wide association studies")
+    (description "PanGemma is a pangenome-enabled version of GEMMA
+(Genome-wide Efficient Mixed Model Association) for genome-wide association
+studies (GWAS) with support for LMDB-backed genotype storage and Guile
+scripting.")
+    (license license:gpl3)))
+
+pangemma-git
diff --git a/guix/guix.scm b/guix/guix.scm
deleted file mode 100644
index d8d593b..0000000
--- a/guix/guix.scm
+++ /dev/null
@@ -1,256 +0,0 @@
-;; To use this file to build HEAD of gemma:
-;;
-;;   guix build -f guix/guix.scm       # default builds pangemma-git
-;;
-;; To get a development container (e.g., run in emacs shell).
-;;
-;;   guix shell -C -D -F -f guix/guix.scm # pangemma-shell-git
-;;
-;; see premake5.lua for build/test instructions
-;;
-;; optimized for arch:
-;;
-;;   guix shell --tune=native -C -D -F # pangemma-shell-git
-;;
-;; see premake5.lua header for examples.
-;;
-;; To optimize use guix --tune=march-type (e.g. --tune=native)
-
-(define-module (guix)
-  #:use-module ((guix licenses) #:prefix license:)
-  #:use-module (guix gexp)
-  #:use-module (guix packages)
-  #:use-module (guix git-download)
-  #:use-module (guix build-system gnu)
-  #:use-module (guix utils)
-
-  #:use-module (gnu packages algebra)
-  #:use-module (gnu packages base)
-  #:use-module (gnu packages build-tools)
-  #:use-module (gnu packages compression)
-  #:use-module (gnu packages commencement)
-  #:use-module (gnu packages check)
-  #:use-module (gnu packages cpp)
-  #:use-module (gnu packages databases)
-  #:use-module (gnu packages gcc)
-  #:use-module (gnu packages gdb)
-  #:use-module (gnu packages guile)
-  #:use-module (gnu packages guile-xyz)
-  #:use-module (gnu packages maths)
-  #:use-module (gnu packages ninja)
-  #:use-module (gnu packages perl)
-  #:use-module (gnu packages pkg-config)
-  #:use-module (gnu packages ruby)
-  #:use-module (gnu packages time)
-  #:use-module (srfi srfi-1)
-  #:use-module (ice-9 popen)
-  #:use-module (ice-9 rdelim))
-
-(define %source-dir (dirname (current-filename)))
-
-(define %git-commit
-  (read-string (open-pipe "git describe --always --tags --long|tr -d $'\n'" OPEN_READ)))
-
-(define %pangemma-version
-    (read-string (open-pipe "cat VERSION|tr -d $'\n'" OPEN_READ)))
-
-(define-public openblas-pangemma
-;; we are fixating on an older openblas, for now
-  (package
-    (name "openblas-pangemma")
-    (version "0.3.21")
-    (source
-     (origin
-       (method git-fetch)
-       (uri (git-reference
-             (url "https://github.com/xianyi/OpenBLAS")
-             (commit (string-append "v" version))))
-       (file-name (git-file-name name version))
-       (sha256
-        (base32
-         "0yx1axiki12y0xz0d5s76vvl7ds36k0npv1sww08k2qslhz1g9qp"))))
-    (build-system gnu-build-system)
-    (properties `((tunable? . #t)))
-    (arguments
-     (list
-      #:tests? #f ;; skip tests
-      #:test-target "test"
-      ;; No default baseline is supplied for powerpc-linux.
-      #:substitutable? (not (target-ppc32?))
-      #:make-flags
-      #~(list (string-append "PREFIX=" #$output)
-              (string-append "CFLAGS=-O3 -g -Wno-incompatible-pointer-types -Wno-error=implicit-function-declaration")
-              "COPT="
-              "COMMON_OPT="
-              "DYNAMIC_ARCH="
-              "SHELL=bash"
-              "MAKE_NB_JOBS=0"          ;use jobserver for submakes
-
-              ;; This is the maximum number of threads OpenBLAS will ever use (that
-              ;; is, if $OPENBLAS_NUM_THREADS is greater than that, then NUM_THREADS
-              ;; is used.)  If we don't set it, the makefile sets it to the number
-              ;; of cores of the build machine, which is obviously wrong.
-              "NUM_THREADS=128"
-
-              ;; DYNAMIC_ARCH is only supported on some architectures.
-              ;; DYNAMIC_ARCH combined with TARGET=GENERIC provides a library
-              ;; which uses the optimizations for the detected CPU.  This can
-              ;; be overridden at runtime with the environment variable
-              ;; OPENBLAS_CORETYPE=<type>, where "type" is a supported CPU
-              ;; type.  On other architectures we target only the baseline CPU
-              ;; supported by Guix.
-              #$@(cond
-                    ((or (target-x86-64?)
-                         (target-x86-32?)
-                         (target-ppc64le?)
-                         (target-aarch64?))
-                     ;; Dynamic older enables a few extra CPU architectures
-                     ;; on x86_64 that were released before 2010.
-                     '("DYNAMIC_ARCH=1" "TARGET=GENERIC"))
-                     ;; '("DYNAMIC_ARCH=" "TARGET_CORE=ZEN"))
-                    ;; On some of these architectures the CPU type can't be detected.
-                    ;; We list the oldest CPU core we want to have support for.
-                    ;; On MIPS we force the "SICORTEX" TARGET, as for the other
-                    ;; two available MIPS targets special extended instructions
-                    ;; for Loongson cores are used.
-                    ((target-mips64el?)
-                     '("TARGET=SICORTEX"))
-                    ((target-arm32?)
-                     '("TARGET=ARMV7"))
-                    ((target-riscv64?)
-                     '("TARGET=RISCV64_GENERIC"))
-                    (else '())))
-      ;; no configure script
-      #:phases
-      #~(modify-phases %standard-phases
-          (delete 'configure)
-          (add-before 'build 'set-extralib
-            (lambda* (#:key inputs #:allow-other-keys)
-              ;; Get libgfortran found when building in utest.
-              (setenv "FEXTRALIB"
-                      (string-append
-                       "-L"
-                       (dirname
-                        (search-input-file inputs "/lib/libgfortran.so")))))))))
-    (inputs
-     (list `(,gfortran "lib")))
-    (native-inputs
-     (list cunit gfortran perl))
-    (home-page "https://www.openblas.net/")
-    (synopsis "Optimized BLAS library based on GotoBLAS")
-    (description
-     "OpenBLAS is a BLAS library forked from the GotoBLAS2-1.13 BSD version.")
-    (license license:bsd-3)))
-
-(define-public pangemma-base-git
-  "Pangemma base build package"
-  (package
-    (name "pangemma-git")
-    (version (git-version %pangemma-version "HEAD" %git-commit))
-    (source (local-file %source-dir #:recursive? #t))
-    (build-system gnu-build-system)
-    (inputs
-     (list gsl
-           openblas-pangemma
-           guile-3.0
-           `(,guile-3.0 "debug")
-           ;; `(,guile-3.0 "dev")
-           guile-lmdb
-           lmdb
-           lmdbxx
-           pkg-config
-           ;; ninja
-           ;; ruby
-           time
-           zlib))
-    ;; (propagated-inputs
-    ;;  (list
-    ;;   `("guile" ,guile-3.0-latest)
-    ;;   `("guile-debug" ,guile-3.0-latest "debug")
-    ;;  `("guile" ,guile-3.0-latest "dev")))
-
-    ;; ("gsl-static" ,gsl-static)
-    ;; ("zlib:static" ,zlib "static")
-    (arguments
-     `(#:phases
-       (modify-phases %standard-phases
-                      (delete 'configure)
-                      (delete 'validate-runpath)
-                      (add-before 'build 'bin-mkdir
-                                  (lambda _
-                                    (mkdir-p "bin")
-                                    ))
-                      (replace 'install
-                               (lambda* (#:key outputs #:allow-other-keys)
-                                 (let ((out (assoc-ref outputs "out")))
-                                   (install-file "bin/gemma" (string-append out "/bin"))))))
-       #:tests? #t
-       #:parallel-tests? #f))
-    (home-page "https://git.genenetwork.org/pangemma/")
-    (synopsis "Tool for genome-wide efficient mixed model association")
-    (description "New version of Genome-wide Efficient Mixed Model Association (PANGEMMA)
-provides a standard linear mixed model resolver with application in
-genome-wide association studies (GWAS).")
-    (license license:gpl3)))
-
-(define-public pangemma-shell-git
-  "Shell version for development"
-  (package
-    (inherit pangemma-base-git)
-    (name "pangemma-shell-git")
-    (build-system gnu-build-system)
-    (propagated-inputs
-     (modify-inputs (package-inputs pangemma-base-git)
-                    (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb gperftools ;; for the shell
-                 )))
-    (arguments
-     `(#:phases (modify-phases %standard-phases
-                               (delete 'configure)
-                               (delete 'build)
-                               (delete 'package)
-                               (delete 'check)
-                               (delete 'install))))
-    (description "Pangemma shell for development")
-    ))
-
-;; ---- legacy build -----------------------------------------------------------------
-(define-public gemma-git
-  "Original legacy gemma -- for as long as it compiles"
-  (package
-    (name "gemma-git")
-    (version (git-version %pangemma-version "HEAD" %git-commit))
-    (source (local-file %source-dir #:recursive? #t))
-    (build-system gnu-build-system)
-    (inputs
-     (list catch2
-           gdb
-           gsl
-           openblas-pangemma
-           zlib))
-       ;; ("gsl-static" ,gsl-static)
-       ;; ("zlib:static" ,zlib "static")
-    (native-inputs ; for running tests
-     (list perl which))
-    (arguments
-     `(#:phases
-       (modify-phases %standard-phases
-                      (delete 'configure)
-                      (delete 'validate-runpath)
-                      (add-before 'build 'bin-mkdir
-                                  (lambda _
-                                    (mkdir-p "bin")
-                                    ))
-                      (replace 'install
-                               (lambda* (#:key outputs #:allow-other-keys)
-                                 (let ((out (assoc-ref outputs "out")))
-                                   (install-file "bin/gemma" (string-append out "/bin"))))))
-       #:tests? #t
-       #:parallel-tests? #f))
-    (home-page "https://github.com/genetics-statistics")
-    (synopsis "Tool for genome-wide efficient mixed model association")
-    (description "Genome-wide Efficient Mixed Model Association (GEMMA)
-provides a standard linear mixed model resolver with application in
-genome-wide association studies (GWAS).")
-    (license license:gpl3)))
-
-pangemma-shell-git
diff --git a/src/lmm.cpp b/src/lmm.cpp
index 969e6fc..9fe6bbb 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -1981,7 +1981,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
         CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
         p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
       }
-
       time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
 
       auto markerinfo = markers[i];
@@ -1991,14 +1990,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
     }
   };
 
-  /*
-  const auto num_markers = indicator_snp.size();
-  enforce_msg(num_markers > 0,"Zero SNPs to process - data corrupt?");
-  if (num_markers < 50) {
-    cerr << num_markers << " SNPs" << endl;
-    warning_msg("very few SNPs processed");
-  }
-  */
   const size_t progress_step = (num_markers/50>d_pace ? num_markers/50 : d_pace);
   Markers markers;
 
@@ -2012,23 +2003,17 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
     // continue;
 
     auto tup = fetch_snp(t); // use the callback
-    auto success = get<0>(tup);
-    if (!success)
+    auto state = get<0>(tup);
+    if (state == SKIP)
       continue;
-    // typedef tuple< bool,MarkerInfo,vector<double> > SnpNameValues2;
-    // auto marker = get<1>(tup);
-    // auto chr = get<2>(tup);
-    // auto mpos = get<3>(tup);
+    if (state == LAST)
+      break; // marker loop because of LOCO
+
     auto markerinfo = get<1>(tup);
     auto gs = get<2>(tup);
 
     markers.push_back(markerinfo);
 
-    // check whether SNP is included in gwasnps (used by LOCO)
-    /*
-    if (process_gwasnps && gwasnps.count(snp) == 0)
-      continue;
-    */
     // drop missing idv and plug mean values for missing geno
     double x_total = 0.0; // sum genotype values to compute x_mean
     uint vpos = 0;        // position in target vector
@@ -2067,7 +2052,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
     gsl_vector_safe_memcpy(&Xlarge_col.vector, x);
     c++; // count markers going in
 
-
     if (c % msize == 0) {
       batch_compute(msize,markers);
       markers.clear();
@@ -2094,6 +2078,44 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
     return;
   }
 
+  outfile << "chr" << "\t"
+          << "rs" << "\t"
+          << "ps" << "\t"
+          << "n_miss" << "\t"
+          << "allele1" << "\t"
+          << "allele0" << "\t"
+          << "af" << "\t";
+
+  if (a_mode != M_LMM2) {
+    outfile << "beta" << "\t";
+    outfile << "se" << "\t";
+  }
+
+  if (a_mode != M_LMM3 && a_mode != M_LMM9)
+    outfile << "logl_H1" << "\t";
+
+  switch(a_mode) {
+  case M_LMM1:
+    outfile << "l_remle" << "\t"
+            << "p_wald" << endl;
+    break;
+  case M_LMM2:
+  case M_LMM9:
+    outfile << "l_mle" << "\t"
+            << "p_lrt" << endl;
+    break;
+  case M_LMM3:
+    outfile << "p_score" << endl;
+    break;
+  case M_LMM4:
+    outfile << "l_remle" << "\t"
+            << "l_mle" << "\t"
+            << "p_wald" << "\t"
+            << "p_lrt" << "\t"
+            << "p_score" << endl;
+    break;
+  }
+
   auto sumstats = [&] (SUMSTAT2 st) {
     outfile << scientific << setprecision(6);
     auto m = st.markerinfo;
@@ -2212,10 +2234,6 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
   mdb_stat(rtxn, geno_mdb, &stat);
   auto num_markers = stat.ms_entries;
 
-  // fetch_snp is a callback function for every SNP row
-  // returns typedef std::tuple<bool,string,std::vector<double> > SnpNameValues;
-
-  // size_t prev_line = 0;
   auto mdb_fetch = MDB_FIRST;
 
   auto cursor = lmdb::cursor::open(rtxn, geno_mdb);
@@ -2227,20 +2245,14 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
 
     string_view key,value;
 
-    /*
-    while (prev_line <= num) {
-      cursor.get(key, value, MDB_NEXT);
-      prev_line++;
-    }
-    */
-    auto success = cursor.get(key, value, mdb_fetch);
+    auto mdb_success = cursor.get(key, value, mdb_fetch);
     mdb_fetch = MDB_NEXT;
 
     // uint8_t chr;
     vector<double> gs;
     MarkerInfo markerinfo;
 
-    if (success) {
+    if (mdb_success) {
       size_t size = 0;
       // ---- Depending on the format we get different buffers - currently float and byte are supported:
       if (format == "Gb") {
@@ -2282,7 +2294,10 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
       // printf("%#02x %#02x\n", chr, loco_chr);
 
       if (is_loco && loco_chr != chr) {
-        return make_tuple(false, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs);
+        if (chr > loco_chr)
+            return make_tuple(LAST, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs);
+          else
+            return make_tuple(SKIP, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs);
       }
 
       string_view value2;
@@ -2299,7 +2314,7 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
 
       // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl;
     }
-    return make_tuple(success, markerinfo, gs);
+    return make_tuple(COMPUTE, markerinfo, gs);
   };
   LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,num_markers);
 
diff --git a/src/lmm.h b/src/lmm.h
index da5ad21..295602a 100644
--- a/src/lmm.h
+++ b/src/lmm.h
@@ -55,7 +55,15 @@ struct MarkerInfo {
 
 typedef vector<MarkerInfo> Markers;
 typedef tuple< string,vector<double> > SnpNameValues;
-typedef tuple< bool,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed)
+
+enum MarkerState {
+  FAIL,
+  COMPUTE,
+  SKIP,
+  LAST
+};
+
+typedef tuple< MarkerState,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed)
 // Results for LMM.
 class SUMSTAT2 {
 public: