about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/gemma_io.cpp159
-rw-r--r--src/gemma_io.h3
-rw-r--r--src/param.cpp3
3 files changed, 61 insertions, 104 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index 6265417..8fa0eab 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -41,6 +41,7 @@
 #include "gsl/gsl_linalg.h"
 #include "gsl/gsl_matrix.h"
 #include "gsl/gsl_vector.h"
+#include <lmdb++.h>
 
 #include "checkpoint.h"
 #include "debug.h"
@@ -1474,12 +1475,9 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
   return;
 }
 
-// #include <lmdb.h>
-#include <lmdb++.h>
 
 // Read bimbam mean genotype file and calculate kinship matrix.
-int mdb_calc_kin(const string file_geno, bool is_loco,
-               vector<int> &indicator_snp, const int k_mode) {
+int mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode, const vector<int> &indicator_snp) {
   checkpoint("mdb-kin",file_geno);
 
   /* Create and open the LMDB environment: */
@@ -1489,25 +1487,7 @@ int mdb_calc_kin(const string file_geno, bool is_loco,
   env.set_max_dbs(10);
   env.open("example/mouse_hs1940.geno.mdb", MDB_RDONLY | MDB_NOSUBDIR, 0664);
 
-  /*
-  {
-    // Print out all the values using a cursor:
-    auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
-    dbi = lmdb::dbi::open(rtxn, "mouse_hs1940.geno.txt.mdb");
-    {
-      auto cursor = lmdb::cursor::open(rtxn, dbi);
-
-      string_view key, value;
-      int count = 0;
-      if (cursor.get(key, value, MDB_FIRST)) {
-        do {
-          cout << count++ << " key: " << key << endl ; // "  value: " << value << endl;
-        } while (cursor.get(key, value, MDB_NEXT));
-      }
-    } // destroying cursor before committing/aborting transaction (see below)
-  }
-  */
-  size_t ni_total = 0;
+  size_t ni_total = 0, num_markers = 0;
   {
     auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
     auto info = lmdb::dbi::open(rtxn, "info");
@@ -1526,86 +1506,76 @@ int mdb_calc_kin(const string file_geno, bool is_loco,
     mdb_stat(rtxn, info, &stat);
     assert(stat.ms_entries == 3);
   }
-  {
-    auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
-    auto geno = lmdb::dbi::open(rtxn, "geno");
+  auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
+  auto geno_mdb = lmdb::dbi::open(rtxn, "geno");
 
-    // Get statistics - ms_entries contains the number of records
-    MDB_stat stat;
-    mdb_stat(rtxn, geno, &stat);
-
-    cout << "Number of records: " << stat.ms_entries << endl;
-  }
+  MDB_stat stat;
+  mdb_stat(rtxn, geno_mdb, &stat);
+  cout << "Number of records: " << stat.ms_entries << endl;
+  num_markers = stat.ms_entries;
 
   gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(ni_total,ni_total);
-  // cout << "ni_total: " << ni_total << endl;
   gsl_vector *geno = gsl_vector_safe_alloc(ni_total);
   gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total);
 
   // Xlarge contains inds x markers
-  const size_t msize = K_BATCH_SIZE;
-  gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize);
-  enforce_msg(Xlarge, "allocate Xlarge");
+  gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE);
+  enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE");
 
-  gsl_matrix_set_zero(Xlarge);
-  write(matrix_kin,"K before");
+  // [ ] check XLarge is zeroed properly
+  // [ ] handle missing data
 
-  // For every SNP read the genotype per individual
+  // For every marker read the genotype x inds
   size_t ns_test = 0;
   size_t display_pace = 1000;
-  for (size_t t = 0; t < indicator_snp.size(); ++t) {
-    string line;
-    /// safeGetline(infile, line).eof();
-    // if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) {
-    if (t % display_pace == 0) {
-      ProgressBar("Reading SNPs", t, indicator_snp.size() - 1);
-    }
-    if (indicator_snp[t] == 0)
-      continue;
-
-    // Using regular expressions is slow:
-    // std::regex_token_iterator<std::string::iterator> rend;
-    // regex split_on("[,[:blank:]]+");
-    // regex_token_iterator<string::iterator> tokens(line.begin(), line.end(),
-    //                                               split_on, -1);
-    /// auto tokens = tokenize_whitespace(line,ni_total+3,infilen);
-    auto tokens = tokenize_whitespace(line,ni_total+3,"");
 
-    auto token_i = tokens.begin();
-    // const char *snp = *token_i; // first field
-    const char *snp = tokens[0]; // first field
-    // cout << snp << "!";
-    // check whether SNP is included in ksnps (used by LOCO)
-    // if (process_ksnps && ksnps.count(snp) == 0)
-    //   continue;
+  auto cursor = lmdb::cursor::open(rtxn, geno_mdb);
 
-    token_i++; // skip snp name
-    token_i++; // skip nucleotide field
-    token_i++; // skip nucleotide field
-
-    // calc SNP stats
+  auto mdb_fetch = MDB_FIRST;
+  for (size_t t = 0; t < num_markers; ++t) {
+    string line;
+    if (t % display_pace == 0) {
+      ProgressBar("Reading SNPs", t, num_markers - 1);
+    }
+
+    // auto tokens = tokenize_whitespace(line,ni_total+3,"");
+    // auto token_i = tokens.begin();
+    // const char *snp = tokens[0]; // first field
+    // token_i++; // skip snp name
+    // token_i++; // skip nucleotide field
+    // token_i++; // skip nucleotide field
+
+    string_view key, value;
+    cursor.get(key, value, mdb_fetch);
+    mdb_fetch = MDB_NEXT;
+    // cout << " key: " << key << endl ;
+    size_t num_floats = value.size() / sizeof(float);
+    // cout << " num: " << num_floats << endl ;
+    assert(num_floats == ni_total);
+    const float* gs = reinterpret_cast<const float*>(value.data());
     size_t n_miss = 0;
     double geno_mean = 0.0;
     double geno_var = 0.0;
     gsl_vector_set_all(geno_miss, 0);
+
     for (size_t i = 0; i < ni_total; ++i) {
-      enforce_str(token_i != tokens.end(), line + " number of fields");
-      auto field = *token_i;
-      auto sfield = std::string(field);
+      // enforce_str(token_i != tokens.end(), line + " number of fields");
+      // auto field = *token_i;
+      // auto sfield = std::string(field);
       // cout << i << ":" << sfield << "," << endl;
-      if (strncmp(field,"NA",2)==0) { // missing value
-        gsl_vector_set(geno_miss, i, 0);
-        n_miss++;
-      } else {
-        double d = atof(field);
-        if (is_strict_mode() && d == 0.0)
-          enforce_is_float(std::string(field));  // rule out non NA and non-float fields
-        gsl_vector_set(geno, i, d);
-        gsl_vector_set(geno_miss, i, 1);
-        geno_mean += d;
-        geno_var += d * d;
-      }
-      token_i++;
+      // if (strncmp(field,"NA",2)==0) { // missing value
+      //  gsl_vector_set(geno_miss, i, 0);
+      //  n_miss++;
+      // } else {
+      double d = gs[i];
+      // if (is_strict_mode() && d == 0.0)
+      //   enforce_is_float(std::string(field));  // rule out non NA and non-float fields
+      gsl_vector_set(geno, i, d);
+      gsl_vector_set(geno_miss, i, 1);
+      geno_mean += d;
+      geno_var += d * d;
+      // }
+      // token_i++;
     }
 
     geno_mean /= (double)(ni_total - n_miss);
@@ -1613,11 +1583,6 @@ int mdb_calc_kin(const string file_geno, bool is_loco,
     geno_var /= (double)ni_total;
     geno_var -= geno_mean * geno_mean;
 
-    if (ns_test<1) {
-      write(geno,"geno raw");
-      write(geno_mean,"geno mean");
-    }
-
     // impute missing values by plugging in the mean
     for (size_t i = 0; i < ni_total; ++i) {
       if (gsl_vector_get(geno_miss, i) == 0) {
@@ -1625,11 +1590,8 @@ int mdb_calc_kin(const string file_geno, bool is_loco,
       }
     }
 
-    if (ns_test<1) write(geno,"geno imputed");
-
     // subtract the mean (centering genotype values)
     gsl_vector_add_constant(geno, -1.0 * geno_mean);
-    if (ns_test<1) write(geno,"geno mean");
 
     // scale the genotypes
     if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2
@@ -1637,26 +1599,21 @@ int mdb_calc_kin(const string file_geno, bool is_loco,
       gsl_vector_scale(geno, 1.0 / sqrt(geno_var));
     }
 
-    if (ns_test<1) {
-      write(geno_var,"geno var");
-      write(geno,"geno z-scored");
-    }
-
     // set the SNP column ns_test to copy geno into the compute matrix Xlarge
-    gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize);
+    gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % K_BATCH_SIZE);
     enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno));
 
     ns_test++;
 
-    // Every msize rows batch compute kinship matrix and return
+    // Every K_BATCH_SIZE rows batch compute kinship matrix and return
     // by adding to matrix_kin
-    if (ns_test % msize == 0) {
+    if (ns_test % K_BATCH_SIZE == 0) {
       fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
       gsl_matrix_set_zero(Xlarge);
       write(matrix_kin,"K updated");
     }
   }
-  if (ns_test % msize != 0) { // compute last batch
+  if (ns_test % K_BATCH_SIZE != 0) { // compute last batch
     fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
     write(matrix_kin,"K updated");
   }
diff --git a/src/gemma_io.h b/src/gemma_io.h
index 593d704..ad13379 100644
--- a/src/gemma_io.h
+++ b/src/gemma_io.h
@@ -87,8 +87,7 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv,
 void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U);
 void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval);
 
-int mdb_calc_kin(const string file_geno, bool is_loco,
-            vector<int> &indicator_snp, const int mode);
+int mdb_calc_kin(const string file_geno, const bool is_loco, const int mode, const vector<int> &indicator_snp);
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
                const int display_pace, gsl_matrix *matrix_kin,
diff --git a/src/param.cpp b/src/param.cpp
index 05e4a7b..3b74e53 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -1294,7 +1294,7 @@ void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_
 }
 
 void PARAM::MdbCalcKin() {
-  error = !mdb_calc_kin(file_geno, is_loco(), indicator_snp, a_mode - 20);
+  error = !mdb_calc_kin(file_geno, is_loco(), a_mode - 20, indicator_snp);
   return;
 }
 
@@ -1310,6 +1310,7 @@ void PARAM::CalcKin(gsl_matrix *matrix_kin) {
       error = true;
     }
   } else {
+    // indicator_snp is an array of bool
     error = !BimbamKin(file_geno, setKSnps, indicator_snp, a_mode - 20, d_pace,
                        matrix_kin, ni_max == 0);
   }