about summary refs log tree commit diff
path: root/src/gemma_io.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gemma_io.cpp')
-rw-r--r--src/gemma_io.cpp273
1 files changed, 272 insertions, 1 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index 664a4bd..a727337 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -1474,6 +1474,277 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
   return;
 }
 
+#include <lmdb.h>
+#include <lmdb++.h>
+
+// Read bimbam mean genotype file and calculate kinship matrix.
+int mdb_calc_kin(const string file_geno, const set<string> ksnps,
+               vector<int> &indicator_snp, const int k_mode) {
+  checkpoint("mdb-kin",file_geno);
+
+  /* Create and open the LMDB environment: */
+  auto env = lmdb::env::create();
+  env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL); /* 1 GiB */
+  env.open("./example.mdb/", 0, 0664);
+  lmdb::dbi dbi;
+
+
+  MDB_env *env;
+  MDB_dbi dbi;
+  MDB_val key, data;
+  MDB_txn *txn;
+  MDB_cursor *cursor;
+  int rc;
+
+  // Create environment handle
+  rc = mdb_env_create(&env);
+  if (rc != 0) {
+    std::cerr << "mdb_env_create: " << mdb_strerror(rc) << std::endl;
+    return 1;
+  }
+  // Open the environment (directory containing the LMDB files)
+  rc = mdb_env_open(env, file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664);
+  if (rc != 0) {
+    std::cerr << "mdb_env_open: " << mdb_strerror(rc) << std::endl;
+    return 1;
+  }
+  mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
+  // Open the database in the environment
+  rc = mdb_open(txn, NULL, 0, &dbi);
+  if (rc != 0) {
+    std::cerr << "mdb_dbi_open: " << mdb_strerror(rc) << std::endl;
+    mdb_txn_abort(txn);
+    return 1;
+  }
+
+  char *skey = "meta\0";
+  key.mv_size = strlen(skey);
+  key.mv_data = skey;
+
+  // std::string skey = "rs13475963";
+  // key.mv_size = skey.size();
+  // key.mv_data = (void*)skey.c_str();
+  cerr << "HERE" << endl;
+
+  MDB_val val;
+  rc = mdb_get(txn, dbi, &key, &val);
+  if (rc != 0) {
+    std::cerr << "mdb_get: " << mdb_strerror(rc) << std::endl;
+    mdb_txn_abort(txn);
+    return 1;
+  }
+
+  printf("!!!!!!! %s\n", val.mv_data);
+
+  mdb_txn_abort(txn);
+
+//---
+  MDB_stat stat;
+  // Get statistics
+  rc = mdb_stat(txn, dbi, &stat);
+  if (rc != 0) {
+    std::cerr << "mdb_stat: " << mdb_strerror(rc) << std::endl;
+    mdb_txn_abort(txn);
+    return 1;
+  }
+
+  // Print number of entries
+  std::cout << "Number of keys: " << stat.ms_entries << std::endl;
+
+  cerr << "HERE" << endl;
+
+  // Create a transaction for reading
+  rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
+  if (rc != 0) {
+    std::cerr << "mdb_txn_begin: " << mdb_strerror(rc) << std::endl;
+    return 1;
+  }
+
+
+  // Create a cursor to iterate through all entries
+  rc = mdb_cursor_open(txn, dbi, &cursor);
+  if (rc != 0) {
+    std::cerr << "mdb_cursor_open: " << mdb_strerror(rc) << std::endl;
+    mdb_txn_abort(txn);
+    return 1;
+  }
+
+  cerr << "HERE" << endl;
+
+
+  std::string searchKey = "meta";
+  key.mv_size = searchKey.size();
+  key.mv_data = (void*)searchKey.c_str();
+
+  rc = mdb_get(txn, dbi, &key, &data);
+  if (rc == 0) {
+    std::string value((char*)data.mv_data, data.mv_size);
+    std::cout << "- Found: " << value << std::endl;
+  } else if (rc == MDB_NOTFOUND) {
+    std::cout << "--- Key not found" << std::endl;
+  } else {
+    std::cerr << "--- Error: " << mdb_strerror(rc) << std::endl;
+  }
+
+  cerr << "HERE HERE" << endl;
+
+  // Iterate through all key-value pairs
+  while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) {
+    std::string keyStr((char*)key.mv_data, key.mv_size);
+    std::string dataStr((char*)data.mv_data, data.mv_size);
+
+    std::cout << "Key: " << keyStr << std::endl;
+    // std::cout << "Key: " << keyStr << ", Value: " << dataStr << std::endl;
+  }
+
+  // Clean up
+  mdb_cursor_close(cursor);
+  mdb_txn_abort(txn);
+  mdb_dbi_close(env, dbi);
+  mdb_env_close(env);
+
+  size_t n_miss;
+  double geno_mean, geno_var;
+
+  // setKSnp and/or LOCO support
+  bool process_ksnps = ksnps.size();
+
+  gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(100,100);
+  size_t ni_total = matrix_kin->size1;
+  cout << "!!!!" << ni_total << endl;
+  gsl_vector *geno = gsl_vector_safe_alloc(ni_total);
+  gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total);
+
+  // Xlarge contains inds x markers
+  const size_t msize = K_BATCH_SIZE;
+  gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize);
+  enforce_msg(Xlarge, "allocate Xlarge");
+
+  gsl_matrix_set_zero(Xlarge);
+  write(matrix_kin,"K before");
+
+  // For every SNP read the genotype per individual
+  size_t ns_test = 0;
+  size_t display_pace = 1000;
+  for (size_t t = 0; t < indicator_snp.size(); ++t) {
+    string line;
+    /// safeGetline(infile, line).eof();
+    // if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) {
+    if (t % display_pace == 0) {
+      ProgressBar("Reading SNPs", t, indicator_snp.size() - 1);
+    }
+    if (indicator_snp[t] == 0)
+      continue;
+
+    // Using regular expressions is slow:
+    // std::regex_token_iterator<std::string::iterator> rend;
+    // regex split_on("[,[:blank:]]+");
+    // regex_token_iterator<string::iterator> tokens(line.begin(), line.end(),
+    //                                               split_on, -1);
+    /// auto tokens = tokenize_whitespace(line,ni_total+3,infilen);
+    auto tokens = tokenize_whitespace(line,ni_total+3,"");
+
+    auto token_i = tokens.begin();
+    // const char *snp = *token_i; // first field
+    const char *snp = tokens[0]; // first field
+    // cout << snp << "!";
+    // check whether SNP is included in ksnps (used by LOCO)
+    if (process_ksnps && ksnps.count(snp) == 0)
+      continue;
+
+    token_i++; // skip snp name
+    token_i++; // skip nucleotide field
+    token_i++; // skip nucleotide field
+
+    // calc SNP stats
+    geno_mean = 0.0;
+    n_miss = 0;
+    geno_var = 0.0;
+    gsl_vector_set_all(geno_miss, 0);
+    for (size_t i = 0; i < ni_total; ++i) {
+      enforce_str(token_i != tokens.end(), line + " number of fields");
+      auto field = *token_i;
+      auto sfield = std::string(field);
+      // cout << i << ":" << sfield << "," << endl;
+      if (strncmp(field,"NA",2)==0) { // missing value
+        gsl_vector_set(geno_miss, i, 0);
+        n_miss++;
+      } else {
+        double d = atof(field);
+        if (is_strict_mode() && d == 0.0)
+          enforce_is_float(std::string(field));  // rule out non NA and non-float fields
+        gsl_vector_set(geno, i, d);
+        gsl_vector_set(geno_miss, i, 1);
+        geno_mean += d;
+        geno_var += d * d;
+      }
+      token_i++;
+    }
+
+    geno_mean /= (double)(ni_total - n_miss);
+    geno_var += geno_mean * geno_mean * (double)n_miss;
+    geno_var /= (double)ni_total;
+    geno_var -= geno_mean * geno_mean;
+
+    if (ns_test<1) {
+      write(geno,"geno raw");
+      write(geno_mean,"geno mean");
+    }
+
+    // impute missing values by plugging in the mean
+    for (size_t i = 0; i < ni_total; ++i) {
+      if (gsl_vector_get(geno_miss, i) == 0) {
+        gsl_vector_set(geno, i, geno_mean);
+      }
+    }
+
+    if (ns_test<1) write(geno,"geno imputed");
+
+    // subtract the mean (centering genotype values)
+    gsl_vector_add_constant(geno, -1.0 * geno_mean);
+    if (ns_test<1) write(geno,"geno mean");
+
+    // scale the genotypes
+    if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2
+                                        // flag does this
+      gsl_vector_scale(geno, 1.0 / sqrt(geno_var));
+    }
+
+    if (ns_test<1) {
+      write(geno_var,"geno var");
+      write(geno,"geno z-scored");
+    }
+
+    // set the SNP column ns_test to copy geno into the compute matrix Xlarge
+    gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize);
+    enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno));
+
+    ns_test++;
+
+    // Every msize rows batch compute kinship matrix and return
+    // by adding to matrix_kin
+    if (ns_test % msize == 0) {
+      fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+      gsl_matrix_set_zero(Xlarge);
+      write(matrix_kin,"K updated");
+    }
+  }
+  if (ns_test % msize != 0) { // compute last batch
+    fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+    write(matrix_kin,"K updated");
+  }
+  ProgressBar("Reading SNPs", 100, 100);
+  cout << endl;
+
+  enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test));
+
+  gsl_vector_free(geno);
+  gsl_vector_free(geno_miss);
+  gsl_matrix_free(Xlarge);
+
+  return true;
+}
+
 // Read bimbam mean genotype file and calculate kinship matrix.
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
@@ -1483,7 +1754,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
   auto infilen = file_geno.c_str();
   igzstream infile(infilen, igzstream::in);
   enforce_msg(infilen, "error reading genotype file");
-  checkpoint("read-geno-file",file_geno);
+  checkpoint("bimbam-kin",file_geno);
 
   size_t n_miss;
   double geno_mean, geno_var;