about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPjotr Prins2025-11-28 20:13:34 +0100
committerPjotr Prins2025-11-28 20:13:34 +0100
commit1dcecc851c02785b19bdd9b25dbab3317ac9c886 (patch)
treea6c65e38da36bffcbfaa5c95a044f2480e87adb5
parent01fa01a3553eeadbdd56e11f5fcd020f4dd71310 (diff)
downloadpangemma-1dcecc851c02785b19bdd9b25dbab3317ac9c886.tar.gz
Toying with lmdbi
-rw-r--r--guix/guix.scm1
-rw-r--r--premake5.lua4
-rw-r--r--src/gemma.cpp21
-rw-r--r--src/gemma_io.cpp273
-rw-r--r--src/gemma_io.h2
-rw-r--r--src/param.cpp17
-rw-r--r--src/param.h1
7 files changed, 299 insertions, 20 deletions
diff --git a/guix/guix.scm b/guix/guix.scm
index 90ef001..ad8841d 100644
--- a/guix/guix.scm
+++ b/guix/guix.scm
@@ -156,6 +156,7 @@
            ;; `(,guile-3.0 "dev")
            guile-lmdb
            lmdb
+           lmdbxx
            pkg-config
            ninja
            ruby
diff --git a/premake5.lua b/premake5.lua
index d380dc4..9229800 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -5,7 +5,7 @@
 --
 -- Including bin
 --
---   premake5 gmake && make verbose=1 config=debug -j 8 && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./test/runner
+--   premake5 gmake && make verbose=1 config=debug gemma -j 8 && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./test/runner
 --
 -- Or
 --
@@ -61,7 +61,7 @@ project "gemma"
    files { "src/*.h src/*.c src/**.hpp", "src/**.cpp" }
    removefiles { "src/gemma_api.cpp" }
    includedirs { "src/" }
-   links { }
+   links { "lmdb" }
 
    filter "configurations:Debug"
       defines { "DEBUG" }
diff --git a/src/gemma.cpp b/src/gemma.cpp
index fdb19fd..6b42763 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -1914,8 +1914,14 @@ void GEMMA::BatchRun(PARAM &cPar) {
   if (cPar.a_mode == M_KIN || cPar.a_mode == M_KIN2) {
     cout << "Calculating Relatedness Matrix ... " << endl;
 
-    gsl_matrix *G = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total);
-    enforce_msg(G, "allocate G"); // just to be sure
+    if (cPar.is_mdb) {
+      cPar.MdbCalcKin();
+      return;
+    }
+
+    enforce(cPar.ni_total > 0);
+    gsl_matrix *K = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total);
+    enforce_msg(K, "allocate G"); // just to be sure
 
     time_start = clock();
 
@@ -1923,7 +1929,7 @@ void GEMMA::BatchRun(PARAM &cPar) {
       cout << "error! failed to prepare for calculating relatedness matrix. " << endl;
       return;
     }
-    cPar.CalcKin(G);
+    cPar.CalcKin(K);
 
     cPar.time_G = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
     if (cPar.error == true) {
@@ -1932,15 +1938,15 @@ void GEMMA::BatchRun(PARAM &cPar) {
     }
 
     // Now we have the Kinship matrix test it
-    validate_K(G);
+    validate_K(K);
 
     if (cPar.a_mode == M_KIN) {
-      cPar.WriteMatrix(G, "cXX");
+      cPar.WriteMatrix(K, "cXX");
     } else { // M_KIN2
-      cPar.WriteMatrix(G, "sXX");
+      cPar.WriteMatrix(K, "sXX");
     }
 
-    gsl_matrix_safe_free(G);
+    gsl_matrix_safe_free(K);
   }
 
   // Compute the LDSC weights (not implemented yet)
@@ -2582,6 +2588,7 @@ void GEMMA::BatchRun(PARAM &cPar) {
     gsl_matrix *B = gsl_matrix_safe_alloc(Y->size2, W->size2);      // B is a d by c (effect size)
                                                           // matrix
     gsl_matrix *se_B = gsl_matrix_safe_alloc(Y->size2, W->size2);   // se
+    enforce(Y->size1 > 0);
     gsl_matrix *K = gsl_matrix_safe_alloc(Y->size1, Y->size1);      // Kinship aka as G/GRM in the equation ind x ind
     gsl_matrix *U = gsl_matrix_safe_alloc(Y->size1, Y->size1);
     gsl_matrix *UtW = gsl_matrix_calloc(Y->size1, W->size2);        // Transformed ind x covariates
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index 664a4bd..a727337 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -1474,6 +1474,277 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
   return;
 }
 
+#include <lmdb.h>
+#include <lmdb++.h>
+
+// Read bimbam mean genotype file and calculate kinship matrix.
+int mdb_calc_kin(const string file_geno, const set<string> ksnps,
+               vector<int> &indicator_snp, const int k_mode) {
+  checkpoint("mdb-kin",file_geno);
+
+  /* Create and open the LMDB environment: */
+  auto env = lmdb::env::create();
+  env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL); /* 1 GiB */
+  env.open("./example.mdb/", 0, 0664);
+  lmdb::dbi dbi;
+
+
+  MDB_env *env;
+  MDB_dbi dbi;
+  MDB_val key, data;
+  MDB_txn *txn;
+  MDB_cursor *cursor;
+  int rc;
+
+  // Create environment handle
+  rc = mdb_env_create(&env);
+  if (rc != 0) {
+    std::cerr << "mdb_env_create: " << mdb_strerror(rc) << std::endl;
+    return 1;
+  }
+  // Open the environment (directory containing the LMDB files)
+  rc = mdb_env_open(env, file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664);
+  if (rc != 0) {
+    std::cerr << "mdb_env_open: " << mdb_strerror(rc) << std::endl;
+    return 1;
+  }
+  mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
+  // Open the database in the environment
+  rc = mdb_open(txn, NULL, 0, &dbi);
+  if (rc != 0) {
+    std::cerr << "mdb_dbi_open: " << mdb_strerror(rc) << std::endl;
+    mdb_txn_abort(txn);
+    return 1;
+  }
+
+  char *skey = "meta\0";
+  key.mv_size = strlen(skey);
+  key.mv_data = skey;
+
+  // std::string skey = "rs13475963";
+  // key.mv_size = skey.size();
+  // key.mv_data = (void*)skey.c_str();
+  cerr << "HERE" << endl;
+
+  MDB_val val;
+  rc = mdb_get(txn, dbi, &key, &val);
+  if (rc != 0) {
+    std::cerr << "mdb_get: " << mdb_strerror(rc) << std::endl;
+    mdb_txn_abort(txn);
+    return 1;
+  }
+
+  printf("!!!!!!! %s\n", val.mv_data);
+
+  mdb_txn_abort(txn);
+
+//---
+  MDB_stat stat;
+  // Get statistics
+  rc = mdb_stat(txn, dbi, &stat);
+  if (rc != 0) {
+    std::cerr << "mdb_stat: " << mdb_strerror(rc) << std::endl;
+    mdb_txn_abort(txn);
+    return 1;
+  }
+
+  // Print number of entries
+  std::cout << "Number of keys: " << stat.ms_entries << std::endl;
+
+  cerr << "HERE" << endl;
+
+  // Create a transaction for reading
+  rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
+  if (rc != 0) {
+    std::cerr << "mdb_txn_begin: " << mdb_strerror(rc) << std::endl;
+    return 1;
+  }
+
+
+  // Create a cursor to iterate through all entries
+  rc = mdb_cursor_open(txn, dbi, &cursor);
+  if (rc != 0) {
+    std::cerr << "mdb_cursor_open: " << mdb_strerror(rc) << std::endl;
+    mdb_txn_abort(txn);
+    return 1;
+  }
+
+  cerr << "HERE" << endl;
+
+
+  std::string searchKey = "meta";
+  key.mv_size = searchKey.size();
+  key.mv_data = (void*)searchKey.c_str();
+
+  rc = mdb_get(txn, dbi, &key, &data);
+  if (rc == 0) {
+    std::string value((char*)data.mv_data, data.mv_size);
+    std::cout << "- Found: " << value << std::endl;
+  } else if (rc == MDB_NOTFOUND) {
+    std::cout << "--- Key not found" << std::endl;
+  } else {
+    std::cerr << "--- Error: " << mdb_strerror(rc) << std::endl;
+  }
+
+  cerr << "HERE HERE" << endl;
+
+  // Iterate through all key-value pairs
+  while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) {
+    std::string keyStr((char*)key.mv_data, key.mv_size);
+    std::string dataStr((char*)data.mv_data, data.mv_size);
+
+    std::cout << "Key: " << keyStr << std::endl;
+    // std::cout << "Key: " << keyStr << ", Value: " << dataStr << std::endl;
+  }
+
+  // Clean up
+  mdb_cursor_close(cursor);
+  mdb_txn_abort(txn);
+  mdb_dbi_close(env, dbi);
+  mdb_env_close(env);
+
+  size_t n_miss;
+  double geno_mean, geno_var;
+
+  // setKSnp and/or LOCO support
+  bool process_ksnps = ksnps.size();
+
+  gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(100,100);
+  size_t ni_total = matrix_kin->size1;
+  cout << "!!!!" << ni_total << endl;
+  gsl_vector *geno = gsl_vector_safe_alloc(ni_total);
+  gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total);
+
+  // Xlarge contains inds x markers
+  const size_t msize = K_BATCH_SIZE;
+  gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize);
+  enforce_msg(Xlarge, "allocate Xlarge");
+
+  gsl_matrix_set_zero(Xlarge);
+  write(matrix_kin,"K before");
+
+  // For every SNP read the genotype per individual
+  size_t ns_test = 0;
+  size_t display_pace = 1000;
+  for (size_t t = 0; t < indicator_snp.size(); ++t) {
+    string line;
+    /// safeGetline(infile, line).eof();
+    // if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) {
+    if (t % display_pace == 0) {
+      ProgressBar("Reading SNPs", t, indicator_snp.size() - 1);
+    }
+    if (indicator_snp[t] == 0)
+      continue;
+
+    // Using regular expressions is slow:
+    // std::regex_token_iterator<std::string::iterator> rend;
+    // regex split_on("[,[:blank:]]+");
+    // regex_token_iterator<string::iterator> tokens(line.begin(), line.end(),
+    //                                               split_on, -1);
+    /// auto tokens = tokenize_whitespace(line,ni_total+3,infilen);
+    auto tokens = tokenize_whitespace(line,ni_total+3,"");
+
+    auto token_i = tokens.begin();
+    // const char *snp = *token_i; // first field
+    const char *snp = tokens[0]; // first field
+    // cout << snp << "!";
+    // check whether SNP is included in ksnps (used by LOCO)
+    if (process_ksnps && ksnps.count(snp) == 0)
+      continue;
+
+    token_i++; // skip snp name
+    token_i++; // skip nucleotide field
+    token_i++; // skip nucleotide field
+
+    // calc SNP stats
+    geno_mean = 0.0;
+    n_miss = 0;
+    geno_var = 0.0;
+    gsl_vector_set_all(geno_miss, 0);
+    for (size_t i = 0; i < ni_total; ++i) {
+      enforce_str(token_i != tokens.end(), line + " number of fields");
+      auto field = *token_i;
+      auto sfield = std::string(field);
+      // cout << i << ":" << sfield << "," << endl;
+      if (strncmp(field,"NA",2)==0) { // missing value
+        gsl_vector_set(geno_miss, i, 0);
+        n_miss++;
+      } else {
+        double d = atof(field);
+        if (is_strict_mode() && d == 0.0)
+          enforce_is_float(std::string(field));  // rule out non NA and non-float fields
+        gsl_vector_set(geno, i, d);
+        gsl_vector_set(geno_miss, i, 1);
+        geno_mean += d;
+        geno_var += d * d;
+      }
+      token_i++;
+    }
+
+    geno_mean /= (double)(ni_total - n_miss);
+    geno_var += geno_mean * geno_mean * (double)n_miss;
+    geno_var /= (double)ni_total;
+    geno_var -= geno_mean * geno_mean;
+
+    if (ns_test<1) {
+      write(geno,"geno raw");
+      write(geno_mean,"geno mean");
+    }
+
+    // impute missing values by plugging in the mean
+    for (size_t i = 0; i < ni_total; ++i) {
+      if (gsl_vector_get(geno_miss, i) == 0) {
+        gsl_vector_set(geno, i, geno_mean);
+      }
+    }
+
+    if (ns_test<1) write(geno,"geno imputed");
+
+    // subtract the mean (centering genotype values)
+    gsl_vector_add_constant(geno, -1.0 * geno_mean);
+    if (ns_test<1) write(geno,"geno mean");
+
+    // scale the genotypes
+    if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2
+                                        // flag does this
+      gsl_vector_scale(geno, 1.0 / sqrt(geno_var));
+    }
+
+    if (ns_test<1) {
+      write(geno_var,"geno var");
+      write(geno,"geno z-scored");
+    }
+
+    // set the SNP column ns_test to copy geno into the compute matrix Xlarge
+    gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize);
+    enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno));
+
+    ns_test++;
+
+    // Every msize rows batch compute kinship matrix and return
+    // by adding to matrix_kin
+    if (ns_test % msize == 0) {
+      fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+      gsl_matrix_set_zero(Xlarge);
+      write(matrix_kin,"K updated");
+    }
+  }
+  if (ns_test % msize != 0) { // compute last batch
+    fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+    write(matrix_kin,"K updated");
+  }
+  ProgressBar("Reading SNPs", 100, 100);
+  cout << endl;
+
+  enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test));
+
+  gsl_vector_free(geno);
+  gsl_vector_free(geno_miss);
+  gsl_matrix_free(Xlarge);
+
+  return true;
+}
+
 // Read bimbam mean genotype file and calculate kinship matrix.
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
@@ -1483,7 +1754,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
   auto infilen = file_geno.c_str();
   igzstream infile(infilen, igzstream::in);
   enforce_msg(infilen, "error reading genotype file");
-  checkpoint("read-geno-file",file_geno);
+  checkpoint("bimbam-kin",file_geno);
 
   size_t n_miss;
   double geno_mean, geno_var;
diff --git a/src/gemma_io.h b/src/gemma_io.h
index 139f379..a20007a 100644
--- a/src/gemma_io.h
+++ b/src/gemma_io.h
@@ -87,6 +87,8 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv,
 void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U);
 void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval);
 
+int mdb_calc_kin(const string file_geno, const set<string> ksnps,
+            vector<int> &indicator_snp, const int mode);
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
                const int display_pace, gsl_matrix *matrix_kin,
diff --git a/src/param.cpp b/src/param.cpp
index 017d588..a720801 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -1293,28 +1293,25 @@ void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_
   return;
 }
 
+void PARAM::MdbCalcKin() {
+  error = !mdb_calc_kin(file_geno, setKSnps, indicator_snp, a_mode - 20);
+  return;
+}
+
 void PARAM::CalcKin(gsl_matrix *matrix_kin) {
   checkpoint_nofile("PARAM::CalcKin");
   string file_str;
-
   gsl_matrix_set_zero(matrix_kin);
 
   if (!file_bfile.empty()) {
     file_str = file_bfile + ".bed";
-    // enforce_msg(loco.empty(), "FIXME: LOCO nyi");
     if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
         false) {
       error = true;
     }
   } else {
-    file_str = file_geno;
-    if (is_mdb)
-      error = !BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,
-                      matrix_kin, ni_max == 0);
-    else
-      error = !BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,
-                         matrix_kin, ni_max == 0);
-
+    error = !BimbamKin(file_geno, setKSnps, indicator_snp, a_mode - 20, d_pace,
+                       matrix_kin, ni_max == 0);
   }
 
   return;
diff --git a/src/param.h b/src/param.h
index 38b9430..a6dfac8 100644
--- a/src/param.h
+++ b/src/param.h
@@ -347,6 +347,7 @@ public:
   void ProcessCvtPhen();
   void CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag);
   void CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag);
+  void MdbCalcKin();
   void CalcKin(gsl_matrix *matrix_kin);
   void CalcS(const map<string, double> &mapRS2wA,
              const map<string, double> &mapRS2wK, const gsl_matrix *W,