1 files changed, 853 insertions, 26 deletions
diff --git a/src/lmm.cpp b/src/lmm.cpp
index 85e92fe..2b730f3 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -2,7 +2,7 @@
     Genome-wide Efficient Mixed Model Association (GEMMA)
     Copyright © 2011-2017, Xiang Zhou
     Copyright © 2017, Peter Carbonetto
-    Copyright © 2017-2022 Pjotr Prins
+    Copyright © 2017-2025 Pjotr Prins
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -40,10 +40,14 @@
 #include "gsl/gsl_min.h"
 #include "gsl/gsl_roots.h"
 #include "gsl/gsl_vector.h"
+#include <lmdb++.h>
+// #include <lmdb.h>
+#include <sys/mman.h>
 
 #include "gzstream.h"
 #include "gemma.h"
 #include "gemma_io.h"
+#include "checkpoint.h"
 #include "fastblas.h"
 #include "lapack.h"
 #include "lmm.h"
@@ -54,6 +58,7 @@
 using namespace std;
 
 void LMM::CopyFromParam(PARAM &cPar) {
+  checkpoint_nofile("lmm-copy-from-param");
   a_mode = cPar.a_mode;
   d_pace = cPar.d_pace;
 
@@ -90,10 +95,12 @@ void LMM::CopyFromParam(PARAM &cPar) {
 }
 
 void LMM::CopyToParam(PARAM &cPar) {
+  checkpoint_nofile("lmm-copy-to-param");
   cPar.time_UtX = time_UtX;
   cPar.time_opt = time_opt;
-
-  cPar.ng_test = ng_test;
+  cPar.ns_total = ns_total;
+  cPar.ns_test  = ns_test;
+  cPar.ng_test  = ng_test; // number of markers tested
 
   return;
 }
@@ -104,10 +111,11 @@ void LMM::WriteFiles() {
 
   file_str = path_out + "/" + file_out;
   file_str += ".assoc.txt";
+  checkpoint("lmm-write-files",file_str);
 
   ofstream outfile(file_str.c_str(), ofstream::out);
   if (!outfile) {
-    cout << "error writing file: " << file_str.c_str() << endl;
+    cout << "error writing file: " << file_str << endl;
     return;
   }
 
@@ -147,7 +155,7 @@ void LMM::WriteFiles() {
     outfile << scientific << setprecision(6);
 
     if (a_mode != M_LMM2) {
-      outfile << st.beta << "\t";
+      outfile << scientific << st.beta << "\t";
       outfile << st.se << "\t";
     }
 
@@ -201,21 +209,29 @@ void LMM::WriteFiles() {
 
     common_header();
 
-    size_t t = 0;
-    for (size_t i = 0; i < snpInfo.size(); ++i) {
-      if (indicator_snp[i] == 0)
-        continue;
-      auto snp = snpInfo[i].rs_number;
-      if (process_gwasnps && setGWASnps.count(snp) == 0)
-        continue;
-      // cout << t << endl;
-      outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
-              << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t"
-              << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t"
-              << fixed << setprecision(3) << snpInfo[i].maf << "\t";
-
-      sumstats(sumStat[t]);
-      t++;
+    if (snpInfo.size()) {
+      size_t t = 0;
+      for (size_t i = 0; i < snpInfo.size(); ++i) {
+        if (indicator_snp[i] == 0)
+          continue;
+        auto snp = snpInfo[i].rs_number;
+        if (process_gwasnps && setGWASnps.count(snp) == 0)
+          continue;
+        // cout << t << endl;
+        outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
+                << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t"
+                << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t"
+                << fixed << setprecision(3) << snpInfo[i].maf << "\t";
+
+        sumstats(sumStat[t]);
+        t++;
+      }
+    }
+    else
+    {
+      for (auto &s : sumStat) {
+        sumstats(s);
+      }
     }
   }
 
@@ -1210,6 +1226,142 @@ void LMM::CalcRLScore(const double l, const FUNC_PARAM &params, double &beta,
   gsl_vector_safe_free(v_temp);
 }
 
+/*
+  ## What `CalcUab` does:
+
+`CalcUab` **computes element-wise products of transformed covariates and phenotypes** to create sufficient statistics needed for linear mixed model calculations.
+
+### Purpose:
+
+Creates a matrix `Uab` containing all pairwise element-wise products of:
+- Transformed covariates: columns of `UtW` (U^T × W)
+- Transformed phenotype: vector `Uty` (U^T × y)
+
+These products are used later to efficiently compute variance components and test statistics without repeatedly accessing the original data.
+
+### Function Signature:
+```cpp
+void CalcUab(const gsl_matrix *UtW,   // U^T × W (transformed covariates)
+             const gsl_vector *Uty,   // U^T × y (transformed phenotype)
+             gsl_matrix *Uab)         // OUTPUT: matrix of products
+```
+
+### Matrix Structure:
+
+Given:
+- **n_cvt** covariates (columns in W)
+- 1 phenotype (y)
+
+The function creates products for indices `a` and `b` where:
+- `a, b ∈ {1, 2, ..., n_cvt, n_cvt+2}` (note: n_cvt+1 is **skipped**)
+- Only computes for `b ≤ a` (lower triangular, symmetric)
+
+**Index mapping:**
+- `1` to `n_cvt`: Covariate columns from UtW
+- `n_cvt+1`: **SKIPPED** (reserved for genotype, added later)
+- `n_cvt+2`: Phenotype (Uty)
+
+### Algorithm Walkthrough:
+
+```cpp
+for (size_t a = 1; a <= n_cvt + 2; ++a) {
+  // Get column 'a'
+  if (a == n_cvt + 2) {
+    u_a = Uty;  // Phenotype
+  } else {
+    u_a = UtW[:, a-1];  // Covariate a
+  }
+
+  for (size_t b = a; b >= 1; --b) {
+    // Get column 'b'
+    if (b == n_cvt + 2) {
+      u_b = Uty;  // Phenotype
+    } else {
+      u_b = UtW[:, b-1];  // Covariate b
+    }
+
+    // Store element-wise product: u_a .* u_b
+    index_ab = GetabIndex(a, b, n_cvt);
+    Uab[:, index_ab] = u_a .* u_b;
+  }
+}
+```
+
+### Example:
+
+With **n_cvt = 2** (2 covariates):
+
+**Columns stored in Uab:**
+```
+GetabIndex(1, 1, 2) → UtW₁ .* UtW₁    (covariate 1 × covariate 1)
+GetabIndex(2, 1, 2) → UtW₂ .* UtW₁    (covariate 2 × covariate 1)
+GetabIndex(2, 2, 2) → UtW₂ .* UtW₂    (covariate 2 × covariate 2)
+GetabIndex(4, 1, 2) → Uty .* UtW₁     (phenotype × covariate 1)
+GetabIndex(4, 2, 2) → Uty .* UtW₂     (phenotype × covariate 2)
+GetabIndex(4, 4, 2) → Uty .* Uty      (phenotype × phenotype)
+
+Note: Index 3 (n_cvt+1) is skipped - reserved for genotype
+```
+
+### Why Skip `n_cvt+1`?
+
+```cpp
+if (a == n_cvt + 1) continue;
+if (b == n_cvt + 1) continue;
+```
+
+The slot `n_cvt+1` is **reserved for the genotype (SNP)** being tested, which is added by the overloaded version:
+
+```cpp
+void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty,
+             const gsl_vector *Utx,  // ← GENOTYPE
+             gsl_matrix *Uab);
+```
+
+This design allows the base covariate products to be computed once, then genotype-specific products added for each SNP.
+
+### Index Function:
+
+```cpp
+size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt)
+```
+
+Maps `(a, b)` pairs to column indices in `Uab`, ensuring:
+- Only lower triangular entries (b ≤ a)
+- Skips the genotype position (n_cvt+1)
+
+### Matrix Dimensions:
+
+```cpp
+size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2;
+```
+
+Number of unique products in the symmetric matrix (including diagonal), accounting for the skipped genotype position.
+
+### In Context (from `batch_compute`):
+
+```cpp
+// Pre-compute covariate products once
+CalcUab(UtW, Uty, Uab);
+
+// For each SNP:
+for (size_t i = 0; i < batch_size; i++) {
+  gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
+  gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector);
+
+  // Add genotype-specific products
+  CalcUab(UtW, Uty, Utx, Uab);  // Overloaded version
+
+  // Use Uab for likelihood calculations
+  CalcLambda(..., Uab, ...);
+}
+```
+
+### Summary:
+
+**`CalcUab` pre-computes all pairwise element-wise products** of transformed covariates and phenotype (UtW columns and Uty). These products are sufficient statistics used repeatedly in likelihood calculations for different SNPs, avoiding redundant computation. The design reserves space for genotype products to be added later per-SNP while reusing the fixed covariate products.
+*/
+
 void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) {
   size_t index_ab;
   size_t n_cvt = UtW->size2;
@@ -1470,6 +1622,53 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval,
   return;
 }
 
+/*
+Looking at the LMM::Analyze function, here are the parameters that get modified:
+Modified (Mutated) Parameters:
+
+None of the function parameters are directly modified.
+
+All parameters are either:
+
+    const gsl_matrix * - pointer to const (read-only)
+    const gsl_vector * - pointer to const (read-only)
+    const set<string> - const set passed by value (copy)
+    std::function<...>& - reference to a function, which is called but not modified itself
+
+What Actually Gets Modified:
+
+The function modifies member variables of the LMM class object:
+
+    sumStat (member variable)
+        Modified: sumStat.push_back(SNPs) is called in the batch_compute lambda
+        Accumulates summary statistics (beta, se, lambda values, p-values) for each SNP that passes filters
+        This is the primary output of the analysis
+    Timing member variables (implied by member access):
+        time_UtX: time_UtX += (clock() - time_start) / ...
+        time_opt: time_opt += (clock() - time_start) / ...
+        These accumulate timing information for performance profiling
+    Member variables read but not modified:
+        indicator_snp - read to determine which SNPs to process
+        indicator_idv - read to determine which individuals to include
+        ni_total - number of total individuals
+        ni_test - number of individuals in test
+        n_cvt - number of covariates
+        l_mle_null, l_min, l_max, n_region, logl_mle_H0 - parameters for likelihood calculations
+        a_mode - analysis mode (M_LMM1, M_LMM2, etc.)
+        d_pace - for progress display
+
+Summary:
+
+From the outside caller's perspective: None of the parameters passed to Analyze() are mutated. All inputs are const.
+
+The actual output is stored in the member variable sumStat, which accumulates one SUMSTAT structure per analyzed SNP containing:
+
+    beta - effect size
+    se - standard error
+    lambda_remle, lambda_mle - variance component estimates
+    p_wald, p_lrt, p_score - p-values from different tests
+    logl_H1 - log-likelihood under alternative hypothesis
+*/
 
 void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
                   const gsl_matrix *U, const gsl_vector *eval,
@@ -1478,12 +1677,9 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
                   const set<string> gwasnps) {
   clock_t time_start = clock();
 
-  write(W, "W");
-  write(y, "y");
+  checkpoint_nofile("start-lmm-analyze");
   // Subset/LOCO support
   bool process_gwasnps = gwasnps.size();
-  if (process_gwasnps)
-    debug_msg("Analyze subset of SNPs (LOCO)");
 
   // Calculate basic quantities.
   size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2;
@@ -1510,24 +1706,36 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
   // start reading genotypes and analyze
   size_t c = 0;
 
+  /*
+    batch_compute(l) takes l SNPs that have been loaded into Xlarge,
+    transforms them all at once using the eigenvector matrix U, then
+    loops through each transformed SNP to compute association
+    statistics (beta, standard errors, p-values) and stores results in
+    sumStat.
+  */
   auto batch_compute = [&](size_t l) { // using a C++ closure
     // Compute SNPs in batch, note the computations are independent per SNP
     // debug_msg("enter batch_compute");
+    checkpoint_nofile("\nstart-batch_compute");
     gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l);
     gsl_matrix_view UtXlarge_sub =
         gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l);
 
     time_start = clock();
+    // Transforms all l SNPs in the batch at once: UtXlarge = U^T × Xlarge
+    // This is much faster than doing l separate matrix-vector products
+    // U is the eigenvector matrix from the spectral decomposition of the kinship matrix
     fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0,
                    &UtXlarge_sub.matrix);
     time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
 
     gsl_matrix_set_zero(Xlarge);
     for (size_t i = 0; i < l; i++) {
-      // for every batch...
+      // for each snp batch item extract transformed genotype:
       gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
       gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector);
 
+      // Calculate design matrix components and compute sufficient statistics for the regression model
       CalcUab(UtW, Uty, Utx, Uab);
 
       time_start = clock();
@@ -1537,17 +1745,24 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
       double p_lrt = 0.0, p_score = 0.0;
       double logl_H1 = 0.0;
 
+      // Run statistical tests based on analysis mode
       // 3 is before 1.
       if (a_mode == M_LMM3 || a_mode == M_LMM4 || a_mode == M_LMM9 ) {
         CalcRLScore(l_mle_null, param1, beta, se, p_score);
       }
 
+      // Computes Wald statistic for testing association
       if (a_mode == M_LMM1 || a_mode == M_LMM4) {
         // for univariate a_mode is 1
+        // Estimates variance component (lambda) via REML
         CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
         CalcRLWald(lambda_remle, param1, beta, se, p_wald);
       }
 
+      // Estimates variance component (lambda) via REML
+      // Likelihood Ratio Test (modes 2, 4, 9):
+      // Estimates variance component via MLE
+      // Compares log-likelihood under alternative vs null hypothesis
       if (a_mode == M_LMM2 || a_mode == M_LMM4 || a_mode == M_LMM9) {
         CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
         p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
@@ -1561,6 +1776,7 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
       sumStat.push_back(SNPs);
     }
     // debug_msg("exit batch_compute");
+    checkpoint_nofile("end-batch_compute");
   };
 
   const auto num_snps = indicator_snp.size();
@@ -1642,9 +1858,10 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
   }
 
   batch_compute(c % msize);
-  ProgressBar("Reading SNPs", num_snps - 1, num_snps - 1);
+  ProgressBar("Reading SNPS", num_snps - 1, num_snps - 1);
   // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl;
   cout << endl;
+  checkpoint_nofile("end-lmm-analyze");
 
   gsl_vector_safe_free(x);
   gsl_vector_safe_free(x_miss);
@@ -1657,12 +1874,474 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
 
 }
 
+/*
+  This is the mirror function of LMM::Analyze and AnalyzeBimbam, but uses mdb input instead.
+ */
+
+
+
+void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
+                      const gsl_matrix *U, const gsl_vector *eval,
+                      const gsl_matrix *UtW, const gsl_vector *Uty,
+                      const gsl_matrix *W, const gsl_vector *y,
+                      size_t num_markers) {
+  vector<SUMSTAT2> sumstat2;
+  clock_t time_start = clock();
+
+  checkpoint_nofile("start-lmm-mdb-analyze");
+
+  // Calculate basic quantities.
+  size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2;
+
+  const size_t inds = U->size1;
+  enforce(inds == ni_test);
+  assert(inds > 0);
+
+  gsl_vector *x = gsl_vector_safe_alloc(inds); // #inds
+  gsl_vector *x_miss = gsl_vector_safe_alloc(inds);
+  assert(ni_test == U->size2);
+  assert(ni_test > 0);
+  assert(ni_total > 0);
+  assert(n_index > 0);
+  gsl_vector *Utx = gsl_vector_safe_alloc(ni_test);
+  gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index);
+  gsl_vector *ab = gsl_vector_safe_alloc(n_index);
+
+  const size_t msize = LMM_BATCH_SIZE;
+  gsl_matrix *Xlarge = gsl_matrix_safe_alloc(inds, msize);
+  gsl_matrix *UtXlarge = gsl_matrix_safe_alloc(inds, msize);
+  enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure
+  enforce(Xlarge->size1 == inds);
+  gsl_matrix_set_zero(Xlarge);
+  gsl_matrix_set_zero(Uab);
+  CalcUab(UtW, Uty, Uab);
+
+  // start reading genotypes and analyze
+  size_t c = 0;
+
+  /*
+    batch_compute(l) takes l x SNPs that have been loaded into Xlarge,
+    transforms them all at once using the eigenvector matrix U, then
+    loops through each transformed SNP to compute association
+    statistics (beta, standard errors, p-values) and stores results in
+    sumStat.
+  */
+  auto batch_compute = [&](size_t l, const Markers &markers) { // using a C++ closure
+    // Compute SNPs in batch, note the computations are independent per SNP
+    debug_msg("enter batch_compute");
+    assert(l>0);
+    assert(inds>0);
+    gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l);
+    gsl_matrix_view UtXlarge_sub =
+        gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l);
+
+    time_start = clock();
+    // Transforms all l SNPs in the batch at once: UtXlarge = U^T × Xlarge
+    // This is much faster than doing l separate matrix-vector products
+    // U is the eigenvector matrix from the spectral decomposition of the kinship matrix
+    fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0,
+                   &UtXlarge_sub.matrix);
+    time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
+
+    gsl_matrix_set_zero(Xlarge);
+    for (size_t i = 0; i < l; i++) {
+      // for each snp batch item extract transformed genotype:
+      gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
+      gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector);
+
+      // Calculate design matrix components and compute sufficient statistics for the regression model
+      CalcUab(UtW, Uty, Utx, Uab);
+
+      time_start = clock();
+      FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0};
+
+      double lambda_mle = 0.0, lambda_remle = 0.0, beta = 0.0, se = 0.0, p_wald = 0.0;
+      double p_lrt = 0.0, p_score = 0.0;
+      double logl_H1 = 0.0;
+
+      // Run statistical tests based on analysis mode
+      // 3 is before 1.
+      if (a_mode == M_LMM3 || a_mode == M_LMM4 || a_mode == M_LMM9 ) {
+        CalcRLScore(l_mle_null, param1, beta, se, p_score);
+      }
+
+      // Computes Wald statistic for testing association
+      if (a_mode == M_LMM1 || a_mode == M_LMM4) {
+        // for univariate a_mode is 1
+        // Estimates variance component (lambda) via REML
+        CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+        CalcRLWald(lambda_remle, param1, beta, se, p_wald);
+      }
+
+      // Estimates variance component (lambda) via REML
+      // Likelihood Ratio Test (modes 2, 4, 9):
+      // Estimates variance component via MLE
+      // Compares log-likelihood under alternative vs null hypothesis
+      if (a_mode == M_LMM2 || a_mode == M_LMM4 || a_mode == M_LMM9) {
+        CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+        p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
+      }
+
+      time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
+
+      auto markerinfo = markers[i];
+      // Store summary data.
+      SUMSTAT2 st = {markerinfo, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score, logl_H1};
+      sumstat2.push_back(st);
+    }
+  };
+
+  /*
+  const auto num_markers = indicator_snp.size();
+  enforce_msg(num_markers > 0,"Zero SNPs to process - data corrupt?");
+  if (num_markers < 50) {
+    cerr << num_markers << " SNPs" << endl;
+    warning_msg("very few SNPs processed");
+  }
+  */
+  const size_t progress_step = (num_markers/50>d_pace ? num_markers/50 : d_pace);
+  Markers markers;
+
+  assert(num_markers > 0);
+  for (size_t t = 0; t < num_markers; ++t) {
+  // for (size_t t = 0; t < 2; ++t) {
+    if (t % progress_step == 0 || t == (num_markers - 1)) {
+      ProgressBar("Reading markers", t, num_markers - 1);
+    }
+    // if (indicator_snp[t] == 0)
+    // continue;
+
+    auto tup = fetch_snp(t); // use the callback
+    auto state = get<0>(tup);
+    if (state == SKIP)
+      continue;
+    if (state == LAST)
+      break; // marker loop because of LOCO
+
+    auto markerinfo = get<1>(tup);
+    auto gs = get<2>(tup);
+
+    markers.push_back(markerinfo);
+
+    // drop missing idv and plug mean values for missing geno
+    double x_total = 0.0; // sum genotype values to compute x_mean
+    uint vpos = 0;        // position in target vector
+    uint n_miss = 0;      // count NA genotypes
+    gsl_vector_set_zero(x_miss);
+    for (size_t i = 0; i < ni_total; ++i) {
+      // get the genotypes per individual and compute stats per SNP
+      if (indicator_idv[i] == 0) // skip individual
+        continue;
+
+      double geno = gs[i];
+      if (isnan(geno)) {
+        gsl_vector_set(x_miss, vpos, 1.0);
+        n_miss++;
+      } else {
+        gsl_vector_set(x, vpos, geno);
+        x_total += geno;
+      }
+      vpos++;
+    }
+    enforce(vpos == ni_test);
+
+    const double x_mean = x_total/(double)(ni_test - n_miss);
+
+    // plug x_mean back into missing values
+    for (size_t i = 0; i < ni_test; ++i) {
+      if (gsl_vector_get(x_miss, i) == 1.0) {
+        gsl_vector_set(x, i, x_mean);
+      }
+    }
+
+    enforce(x->size == ni_test);
+
+    // copy genotype values for SNP into Xlarge cache
+    gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize);
+    gsl_vector_safe_memcpy(&Xlarge_col.vector, x);
+    c++; // count markers going in
+
+    if (c % msize == 0) {
+      batch_compute(msize,markers);
+      markers.clear();
+      markers.reserve(msize);
+    }
+  }
+
+  if (c % msize)
+    batch_compute(c % msize,markers);
+  ProgressBar("Reading markers", num_markers - 1, num_markers - 1);
+  cout << endl;
+  cout << "Counted markers " << c << " sumStat " << sumstat2.size() << endl;
+  checkpoint_nofile("end-lmm-mdb-analyze");
+
+  string file_str;
+  debug_msg("LMM::WriteFiles");
+  file_str = path_out + "/" + file_out;
+  file_str += ".assoc.txt";
+  checkpoint("lmm-write-files",file_str);
+
+  ofstream outfile(file_str.c_str(), ofstream::out);
+  if (!outfile) {
+    cout << "error writing file: " << file_str << endl;
+    return;
+  }
+
+  auto sumstats = [&] (SUMSTAT2 st) {
+    outfile << scientific << setprecision(6);
+    auto m = st.markerinfo;
+    auto name = m.name;
+    auto chr  = m.chr;
+    string chr_s;
+    if (chr == CHR_X)
+      chr_s = "X";
+    else if (chr == CHR_Y)
+      chr_s = "Y";
+    else if (chr == CHR_M)
+      chr_s = "M";
+    else
+      chr_s = to_string(chr);
+
+    outfile << chr_s << "\t";
+    outfile << name << "\t";
+    outfile << m.pos << "\t";
+    outfile << m.n_miss << "\t-\t-\t"; // n_miss column + allele1 + allele0
+    outfile << fixed << setprecision(3) << m.maf << "\t";
+    outfile << scientific << setprecision(6);
+    if (a_mode != M_LMM2) {
+      outfile << st.beta << "\t";
+      outfile << st.se << "\t";
+    }
+
+    if (a_mode != M_LMM3 && a_mode != M_LMM9)
+      outfile << st.logl_H1 << "\t";
+
+    switch(a_mode) {
+    case M_LMM1:
+      outfile << st.lambda_remle << "\t"
+              << st.p_wald << endl;
+      break;
+    case M_LMM2:
+    case M_LMM9:
+      outfile << st.lambda_mle << "\t"
+              << st.p_lrt << endl;
+      break;
+    case M_LMM3:
+      outfile << st.p_score << endl;
+      break;
+    case M_LMM4:
+      outfile << st.lambda_remle << "\t"
+              << st.lambda_mle << "\t"
+              << st.p_wald << "\t"
+              << st.p_lrt << "\t"
+              << st.p_score << endl;
+      break;
+    }
+  };
+
+  for (auto &s : sumstat2) {
+    sumstats(s);
+  }
+
+  gsl_vector_safe_free(x);
+  gsl_vector_safe_free(x_miss);
+  gsl_vector_safe_free(Utx);
+  gsl_matrix_safe_free(Uab);
+  gsl_vector_free(ab); // unused
+
+  gsl_matrix_safe_free(Xlarge);
+  gsl_matrix_safe_free(UtXlarge);
+
+}
+
+
+void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
+                          const gsl_matrix *UtW, const gsl_vector *Uty,
+                       const gsl_matrix *W, const gsl_vector *y, const string loco) {
+  checkpoint("mdb-calc-gwa",file_geno);
+  bool is_loco = !loco.empty();
+
+  // Convert loco string to what we use in the chrpos index
+  uint8_t loco_chr;
+  if (is_loco) {
+    if (loco == "X") {
+      loco_chr = CHR_X;
+    } else if (loco == "Y") {
+      loco_chr = CHR_Y;
+    } else if (loco == "M") {
+      loco_chr = CHR_M;
+    } else {
+      try {
+        loco_chr = static_cast<uint8_t>(stoi(loco));
+      } catch (...) {
+        loco_chr = 0;
+      }
+    }
+  }
+
+  auto env = lmdb::env::create();
+  env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL);
+  env.set_max_dbs(10);
+  env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664);
+  // Get mmap info using lmdb++ wrapper
+  MDB_envinfo info;
+  mdb_env_info(env.handle(), &info);
+  // Linux kernel aggressive readahead hints
+  madvise(info.me_mapaddr, info.me_mapsize, MADV_SEQUENTIAL);
+  madvise(info.me_mapaddr, info.me_mapsize, MADV_WILLNEED);
+
+  std::cout << "## LMDB opened with optimized readahead; map size = " << (info.me_mapsize / 1024 / 1024) << " MB" << std::endl;
+
+  auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
+  auto geno_mdb = lmdb::dbi::open(rtxn, "geno");
+
+  auto marker_mdb = lmdb::dbi::open(rtxn, "marker");
+  auto info_mdb = lmdb::dbi::open(rtxn, "info");
+  string_view info_key,info_value;
+  info_mdb.get(rtxn,"format",info_value);
+  auto format = string(info_value);
+
+  MDB_stat stat;
+  mdb_stat(rtxn, geno_mdb, &stat);
+  auto num_markers = stat.ms_entries;
+
+  auto mdb_fetch = MDB_FIRST;
+
+  auto cursor = lmdb::cursor::open(rtxn, geno_mdb);
+  cout << "## number of total individuals = " << ni_total << endl;
+  cout << "## number of analyzed individuals = " << ni_test << endl;
+  cout << "## number of analyzed SNPs/var = " << num_markers << endl;
+
+  std::function<SnpNameValues2(size_t)>  fetch_snp = [&](size_t num) {
+
+    string_view key,value;
+
+    auto mdb_success = cursor.get(key, value, mdb_fetch);
+    mdb_fetch = MDB_NEXT;
+
+    // uint8_t chr;
+    vector<double> gs;
+    MarkerInfo markerinfo;
+
+    if (mdb_success) {
+      size_t size = 0;
+      // ---- Depending on the format we get different buffers - currently float and byte are supported:
+      if (format == "Gb") {
+        size_t num_bytes = value.size() / sizeof(uint8_t);
+        assert(num_bytes == ni_total);
+        size = num_bytes;
+        const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data());
+        gs.reserve(size);
+        for (size_t i = 0; i < size; ++i) {
+          double g = static_cast<double>(gs_bbuf[i])/127.0;
+          gs.push_back(g);
+        }
+      } else {
+        size_t num_floats = value.size() / sizeof(float);
+        assert(num_floats == ni_total);
+        size = num_floats;
+        const float* gs_fbuf = reinterpret_cast<const float*>(value.data());
+        gs.reserve(size);
+        for (size_t i = 0; i < size; ++i) {
+          gs.push_back(static_cast<double>(gs_fbuf[i]));
+        }
+      }
+      // Start unpacking key chr+pos
+      if (key.size() != 10) {
+        cerr << "key.size=" << key.size() << endl;
+        throw std::runtime_error("Invalid key size");
+      }
+      // "S>L>L>"
+      const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data());
+      auto chr = static_cast<uint8_t>(data[1]);
+      // Extract big-endian uint32
+      // uint32_t rest = static_cast<uint32_t>(data[2]);
+      uint32_t pos =  (data[2] << 24) | (data[3] << 16) |
+        (data[4] << 8) | data[5];
+
+      uint32_t num = (data[6] << 24) | (data[7] << 16) |
+        (data[8] << 8) | data[9];
+
+      // printf("%#02x %#02x\n", chr, loco_chr);
+
+      if (is_loco && loco_chr != chr) {
+        if (chr > loco_chr)
+            return make_tuple(LAST, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs);
+          else
+            return make_tuple(SKIP, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs);
+      }
+
+      string_view value2;
+      marker_mdb.get(rtxn,key,value2);
+      auto marker = string(value2);
+      // 1       rs13476251      174792257
+      // cout << static_cast<int>(chr) << ":" << pos2 << " line " << rest2 << ":" << marker << endl ;
+
+      // compute maf and n_miss (NAs)
+      size_t n_miss = 0; // count NAs: FIXME
+      double maf = compute_maf(ni_total, ni_test, n_miss, gs.data(), indicator_idv);
+
+      markerinfo = MarkerInfo { .name=marker,.chr=chr,.pos=pos,.line_no=num,.n_miss=n_miss,.maf=maf };
+
+      // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl;
+    }
+    return make_tuple(COMPUTE, markerinfo, gs);
+  };
+  LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,num_markers);
+
+  ns_total = ns_test = num_markers; // track global number of snps in original gemma (goes to cPar)
+}
+
+
+/*
+
+Looking at the `LMM::AnalyzeBimbam` function, here are the parameters that get modified:
+
+## Modified (Mutated) Parameters:
+
+**None of the parameters are modified.**
+
+All parameters are passed as:
+- `const gsl_matrix *` - pointer to const matrix (read-only)
+- `const gsl_vector *` - pointer to const vector (read-only)
+- `const set<string>` - const set passed by value (copy)
+
+## What Actually Gets Modified:
+
+The function modifies **internal state** and **local variables** only:
+
+1. **Local variables modified**:
+   - `prev_line` - tracks current line position in file
+   - `gs` - vector that gets resized and reassigned for each SNP
+   - `infile` - file stream that gets opened, read from, and closed
+
+2. **Member variables** (implied by `this->`):
+   - `file_geno` - used to open the file (read-only here)
+   - `ni_total` - number of individuals (read-only here)
+
+3. **The lambda function `fetch_snp`** captures local variables by reference (`[&]`) and modifies:
+   - `prev_line` - incremented as lines are read
+   - `gs` - reassigned with new genotype values for each SNP
+   - `infile` - advanced through the file
+
+## Summary:
+
+**From the outside caller's perspective**: Nothing passed into `AnalyzeBimbam` gets mutated. All inputs are treated as read-only (`const`).
+
+The function's work happens through:
+- Reading from the genotype file
+- Calling `LMM::Analyze(fetch_snp, ...)` with a callback that reads SNP data
+- Any mutations likely happen inside the `LMM::Analyze` method (which would need to be examined separately to see what it modifies)
+
+*/
+
 void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
                         const gsl_matrix *UtW, const gsl_vector *Uty,
                         const gsl_matrix *W, const gsl_vector *y,
                         const set<string> gwasnps) {
   debug_msg(file_geno);
   auto infilen = file_geno.c_str();
+  checkpoint("start-read-geno-file",file_geno);
 
   igzstream infile(infilen, igzstream::in);
   enforce_msg(infile, "error reading genotype file");
@@ -1942,6 +2621,153 @@ void MatrixCalcLR(const gsl_matrix *U, const gsl_matrix *UtX,
   return;
 }
 
+/*
+
+## What `CalcLambda` does:
+
+`CalcLambda` **estimates the variance component ratio (lambda)** in a linear mixed model by finding the value that maximizes either the log-likelihood or log-restricted likelihood function.
+
+### Purpose:
+
+In LMM, the variance structure is:
+- **Var(y) = λτσ² K + τσ² I**
+
+Where:
+- **λ (lambda)**: The ratio of genetic variance to environmental variance (Vg/Ve)
+- **K**: Kinship/relatedness matrix
+- **I**: Identity matrix
+
+Lambda determines how much of the phenotypic variance is explained by genetic relatedness vs random environmental effects.
+
+### Function Signature:
+```cpp
+void CalcLambda(const char func_name,      // 'R' for REML, 'L' for MLE
+                FUNC_PARAM &params,        // Model parameters
+                const double l_min,        // Min lambda to search
+                const double l_max,        // Max lambda to search
+                const size_t n_region,     // # intervals to divide search
+                double &lambda,            // OUTPUT: estimated lambda
+                double &logf)              // OUTPUT: log-likelihood at lambda
+```
+
+### Algorithm Overview:
+
+#### **Phase 1: Identify Candidate Intervals**
+```cpp
+for (size_t i = 0; i < n_region; ++i) {
+  lambda_l = l_min * exp(lambda_interval * i);
+  lambda_h = l_min * exp(lambda_interval * (i + 1.0));
+
+  dev1_l = LogRL_dev1(lambda_l, &params);  // First derivative at lower bound
+  dev1_h = LogRL_dev1(lambda_h, &params);  // First derivative at upper bound
+
+  if (dev1_l * dev1_h <= 0) {
+    lambda_lh.push_back(make_pair(lambda_l, lambda_h));  // Sign change = root
+  }
+}
+```
+- Divides the search range [l_min, l_max] into `n_region` intervals (log-spaced)
+- Evaluates the **first derivative** at interval boundaries
+- If the derivative changes sign, there's a local maximum in that interval
+
+#### **Phase 2: Find Maximum**
+
+**Case A: No sign changes found**
+```cpp
+if (lambda_lh.empty()) {
+  // Maximum is at a boundary
+  if (logf_l >= logf_h) {
+    lambda = l_min;
+  } else {
+    lambda = l_max;
+  }
+}
+```
+
+**Case B: Sign changes found (interior maximum)**
+
+For each candidate interval:
+
+1. **Brent's Method** (robust root-finding):
+   ```cpp
+   gsl_root_fsolver_brent
+   ```
+   - Finds where the first derivative = 0 (critical point)
+   - Doesn't require computing second derivatives
+   - Robust but slower
+
+2. **Newton-Raphson Method** (refinement):
+   ```cpp
+   gsl_root_fdfsolver_newton
+   ```
+   - Uses both first and second derivatives
+   - Refines the estimate from Brent's method
+   - Faster convergence near the root
+
+3. **Compare across intervals**:
+   ```cpp
+   if (logf < logf_l) {
+     logf = logf_l;
+     lambda = l;
+   }
+   ```
+   - Keeps track of the lambda with the highest log-likelihood
+   - Handles multiple local maxima
+
+#### **Phase 3: Check Boundaries**
+```cpp
+if (logf_l > logf) {
+  lambda = l_min;
+}
+if (logf_h > logf) {
+  lambda = l_max;
+}
+```
+- Ensures the global maximum isn't at a boundary
+
+### Function Types:
+
+- **`func_name = 'R'` or `'r'`**: REML (Restricted Maximum Likelihood)
+  - Uses `LogRL_f`, `LogRL_dev1`, `LogRL_dev2`
+  - Accounts for loss of degrees of freedom from fixed effects
+  - Preferred for variance component estimation
+
+- **`func_name = 'L'` or `'l'`**: MLE (Maximum Likelihood)
+  - Uses `LogL_f`, `LogL_dev1`, `LogL_dev2`
+  - Direct likelihood maximization
+  - Used for likelihood ratio tests
+
+### Error Handling:
+
+```cpp
+if (status == GSL_CONTINUE || status != GSL_SUCCESS) {
+  logf = nan("NAN");
+  lambda = nan("NAN");
+  return;
+}
+```
+- If optimization fails, returns NaN values
+- Warnings issued for non-convergence
+
+### In Context (from `batch_compute`):
+
+```cpp
+// REML for Wald test
+CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+CalcRLWald(lambda_remle, param1, beta, se, p_wald);
+
+// MLE for LRT test
+CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
+```
+
+### Summary:
+
+**`CalcLambda` finds the optimal variance component ratio (λ)** that maximizes the (restricted) log-likelihood for a given SNP's genotype data in a linear mixed model. This λ value quantifies the relative contribution of genetic relatedness vs environmental noise to phenotypic variation, and is essential for computing association test statistics (Wald, LRT) while properly accounting for population structure and relatedness.
+
+*/
+
+
 void CalcLambda(const char func_name, FUNC_PARAM &params, const double l_min,
                 const double l_max, const size_t n_region, double &lambda,
                 double &logf) {
@@ -2291,6 +3117,7 @@ void LMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
     cout << "error reading genotype file:" << file_geno << endl;
     return;
   }
+  checkpoint("start-read-geno-file",file_geno);
 
   clock_t time_start = clock();