Code loads markers - getting ready to compute

author: Pjotr Prins 2025-12-01 16:48:13 +0100
committer: Pjotr Prins 2025-12-01 16:48:13 +0100
commit: a144f052139b3c4e6fa761ffb0a1e435bd1bad22 (patch)
tree: e36483a8745b556860c710d9bdfe13c310353847 /src
parent: 16a0419f5bd2e5122a57ae557b556b3cbe6b402b (diff)
download: pangemma-a144f052139b3c4e6fa761ffb0a1e435bd1bad22.tar.gz
2 files changed, 52 insertions, 46 deletions
diff --git a/src/lmm.cpp b/src/lmm.cpp
index ee7ba42..d015988 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -1864,18 +1864,19 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
 }
 
 /*
-  This is the mirror function of below AnalyzeBimbam, but uses mdb input instead.
+  This is the mirror function of LMM::Analyze and AnalyzeBimbam, but uses mdb input instead.
  */
 
 
 void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
-                  const gsl_matrix *U, const gsl_vector *eval,
-                  const gsl_matrix *UtW, const gsl_vector *Uty,
-                  const gsl_matrix *W, const gsl_vector *y,
-                  const set<string> gwasnps) {
+                      const gsl_matrix *U, const gsl_vector *eval,
+                      const gsl_matrix *UtW, const gsl_vector *Uty,
+                      const gsl_matrix *W, const gsl_vector *y,
+                      const set<string> gwasnps,
+                      size_t num_markers) {
   clock_t time_start = clock();
 
-  checkpoint_nofile("start-lmm-analyze");
+  checkpoint_nofile("start-lmm-mdb-analyze");
   // Subset/LOCO support
   bool process_gwasnps = gwasnps.size();
 
@@ -1884,10 +1885,16 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
 
   const size_t inds = U->size1;
   enforce(inds == ni_test);
+  assert(inds > 0);
+
   gsl_vector *x = gsl_vector_safe_alloc(inds); // #inds
   gsl_vector *x_miss = gsl_vector_safe_alloc(inds);
-  gsl_vector *Utx = gsl_vector_safe_alloc(U->size2);
-  gsl_matrix *Uab = gsl_matrix_safe_alloc(U->size2, n_index);
+  assert(ni_test == U->size2);
+  assert(ni_test > 0);
+  assert(ni_total > 0);
+  assert(n_index > 0);
+  gsl_vector *Utx = gsl_vector_safe_alloc(ni_test);
+  gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index);
   gsl_vector *ab = gsl_vector_safe_alloc(n_index);
 
   // Create a large matrix with LMM_BATCH_SIZE columns for batched processing
@@ -1905,7 +1912,7 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
   size_t c = 0;
 
   /*
-    batch_compute(l) takes l SNPs that have been loaded into Xlarge,
+    batch_compute(l) takes l x SNPs that have been loaded into Xlarge,
     transforms them all at once using the eigenvector matrix U, then
     loops through each transformed SNP to compute association
     statistics (beta, standard errors, p-values) and stores results in
@@ -1913,8 +1920,9 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
   */
   auto batch_compute = [&](size_t l) { // using a C++ closure
     // Compute SNPs in batch, note the computations are independent per SNP
-    // debug_msg("enter batch_compute");
-    checkpoint_nofile("\nstart-batch_compute");
+    debug_msg("enter batch_compute");
+    assert(l>0);
+    assert(inds>0);
     gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l);
     gsl_matrix_view UtXlarge_sub =
         gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l);
@@ -1973,33 +1981,36 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
                       p_wald, p_lrt, p_score, logl_H1};
       sumStat.push_back(SNPs);
     }
-    // debug_msg("exit batch_compute");
-    checkpoint_nofile("end-batch_compute");
   };
 
-  const auto num_snps = indicator_snp.size();
-  enforce_msg(num_snps > 0,"Zero SNPs to process - data corrupt?");
-  if (num_snps < 50) {
-    cerr << num_snps << " SNPs" << endl;
+  /*
+  const auto num_markers = indicator_snp.size();
+  enforce_msg(num_markers > 0,"Zero SNPs to process - data corrupt?");
+  if (num_markers < 50) {
+    cerr << num_markers << " SNPs" << endl;
     warning_msg("very few SNPs processed");
   }
-  const size_t progress_step = (num_snps/50>d_pace ? num_snps/50 : d_pace);
+  */
+  const size_t progress_step = (num_markers/50>d_pace ? num_markers/50 : d_pace);
 
-  for (size_t t = 0; t < num_snps; ++t) {
-    if (t % progress_step == 0 || t == (num_snps - 1)) {
-      ProgressBar("Reading SNPs", t, num_snps - 1);
+  assert(num_markers > 0);
+  for (size_t t = 0; t < num_markers; ++t) {
+    if (t % progress_step == 0 || t == (num_markers - 1)) {
+      ProgressBar("Reading SNPs", t, num_markers - 1);
     }
-    if (indicator_snp[t] == 0)
-      continue;
+    // if (indicator_snp[t] == 0)
+    // continue;
 
-    auto tup = fetch_snp(t);
+    auto tup = fetch_snp(t); // use the call back
     auto snp = get<0>(tup);
     auto gs = get<1>(tup);
+    cout << "SNP: " << snp << endl;
 
     // check whether SNP is included in gwasnps (used by LOCO)
+    /*
     if (process_gwasnps && gwasnps.count(snp) == 0)
       continue;
-
+    */
     // drop missing idv and plug mean values for missing geno
     double x_total = 0.0; // sum genotype values to compute x_mean
     uint pos = 0;         // position in target vector
@@ -2031,18 +2042,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
       }
     }
 
-    /* this is what below GxE does
-    for (size_t i = 0; i < ni_test; ++i) {
-      auto geno = gsl_vector_get(x, i);
-      if (std::isnan(geno)) {
-        gsl_vector_set(x, i, x_mean);
-        geno = x_mean;
-      }
-      if (x_mean > 1.0) {
-        gsl_vector_set(x, i, 2 - geno);
-      }
-    }
-    */
     enforce(x->size == ni_test);
 
     // copy genotype values for SNP into Xlarge cache
@@ -2055,11 +2054,12 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
     }
   }
 
-  batch_compute(c % msize);
-  ProgressBar("Reading SNPS", num_snps - 1, num_snps - 1);
+  if (c % msize)
+    batch_compute(c % msize);
+  ProgressBar("Reading SNPS", num_markers - 1, num_markers - 1);
   // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl;
   cout << endl;
-  checkpoint_nofile("end-lmm-analyze");
+  checkpoint_nofile("end-lmm-mdb-analyze");
 
   gsl_vector_safe_free(x);
   gsl_vector_safe_free(x_miss);
@@ -2089,6 +2089,12 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
   auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
   auto geno_mdb = lmdb::dbi::open(rtxn, "geno");
 
+
+  MDB_stat stat;
+  mdb_stat(rtxn, geno_mdb, &stat);
+  cout << "Number of records: " << stat.ms_entries << endl;
+  auto num_markers = stat.ms_entries;
+
   // fetch_snp is a callback function for every SNP row
   // returns typedef std::tuple<string,std::vector<double> > SnpNameValues;
 
@@ -2112,12 +2118,12 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
     const float* gsbuf = reinterpret_cast<const float*>(value.data());
     auto snp = string(key);
     std::vector<double> gs(gsbuf,gsbuf+sizeof(float)*value.size());
-    cout << "!!!!" << snp << endl;
+    cout << "!!!!" << snp << endl << endl;
 
     return std::make_tuple(snp,gs);
   };
 
-  LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps);
+  LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps,num_markers);
 }
 
 
diff --git a/src/lmm.h b/src/lmm.h
index c628baa..29e6341 100644
--- a/src/lmm.h
+++ b/src/lmm.h
@@ -101,10 +101,10 @@ public:
                const gsl_matrix *W, const gsl_vector *y,
                const set<string> gwasnps);
   void mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
-               const gsl_matrix *U, const gsl_vector *eval,
-               const gsl_matrix *UtW, const gsl_vector *Uty,
-               const gsl_matrix *W, const gsl_vector *y,
-               const set<string> gwasnps);
+                   const gsl_matrix *U, const gsl_vector *eval,
+                   const gsl_matrix *UtW, const gsl_vector *Uty,
+                   const gsl_matrix *W, const gsl_vector *y,
+                   const set<string> gwasnps, size_t num_markers);
   void mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
                      const gsl_matrix *UtW, const gsl_vector *Uty,
                      const gsl_matrix *W, const gsl_vector *y,
author	Pjotr Prins	2025-12-01 16:48:13 +0100
committer	Pjotr Prins	2025-12-01 16:48:13 +0100
commit	a144f052139b3c4e6fa761ffb0a1e435bd1bad22 (patch)
tree	e36483a8745b556860c710d9bdfe13c310353847 /src
parent	16a0419f5bd2e5122a57ae557b556b3cbe6b402b (diff)
download	pangemma-a144f052139b3c4e6fa761ffb0a1e435bd1bad22.tar.gz