about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/gemma_io.cpp66
-rw-r--r--src/gemma_io.h8
-rw-r--r--src/lmm.cpp50
-rw-r--r--src/lmm.h15
-rw-r--r--src/param.cpp2
5 files changed, 91 insertions, 50 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index e183e3f..e630f64 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -1478,8 +1478,27 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
 
 
 // Read bimbam mean genotype file and calculate kinship matrix.
-gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) {
+gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco) {
   checkpoint("mdb-kin",file_geno);
+  bool is_loco = !loco.empty();
+
+  // Convert loco string to what we use in the chrpos index
+  uint8_t loco_chr = 0;
+  if (is_loco) {
+    if (loco == "X") {
+      loco_chr = CHR_X;
+    } else if (loco == "Y") {
+      loco_chr = CHR_Y;
+    } else if (loco == "M") {
+      loco_chr = CHR_M;
+    } else {
+      try {
+        loco_chr = static_cast<uint8_t>(stoi(loco));
+      } catch (...) {
+        loco_chr = 0;
+      }
+    }
+  }
 
   /* Create and open the LMDB environment: */
   auto env = lmdb::env::create();
@@ -1487,6 +1506,7 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode)
   env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */
   env.set_max_dbs(10);
   env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664);
+  string format;
 
   size_t ni_total = 0, num_markers = 0;
   {
@@ -1502,12 +1522,17 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode)
     std::string_view view_int;
     info.get(rtxn, "numsamples", view_int);
     ni_total = lmdb::from_sv<size_t>(view_int);
+    string_view info_key,info_value;
+    info.get(rtxn,"format",info_value);
+    format = string(info_value);
 
     MDB_stat stat;
     mdb_stat(rtxn, info, &stat);
-    assert(stat.ms_entries == 3);
+    assert(stat.ms_entries == 5);
   }
+
   auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
+
   auto geno_mdb = lmdb::dbi::open(rtxn, "geno");
 
   MDB_stat stat;
@@ -1548,15 +1573,36 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode)
     string_view key, value;
     cursor.get(key, value, mdb_fetch);
     mdb_fetch = MDB_NEXT;
-    /* FIXME!
-    if (indicator_snp[t] == 0)
+
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data());
+    auto chr = static_cast<uint8_t>(data[1]);
+
+    if (is_loco && loco_chr == chr)
       continue;
-    */
-#if !defined NDEBUG
-    size_t num_floats = value.size() / sizeof(float);
-    assert(num_floats == ni_total);
-#endif
-    const float* gs = reinterpret_cast<const float*>(value.data());
+
+    // ---- Depending on the format we get different buffers - currently float and byte are supported:
+    vector<double> gs;
+    if (format == "Gb") {
+      size_t num_bytes = value.size() / sizeof(uint8_t);
+      assert(num_bytes == ni_total);
+      auto size = num_bytes;
+      const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data());
+      gs.reserve(size);
+      for (size_t i = 0; i < size; ++i) {
+        double g = static_cast<double>(gs_bbuf[i])/127.0;
+        gs.push_back(g);
+      }
+    } else {
+      size_t num_floats = value.size() / sizeof(float);
+      assert(num_floats == ni_total);
+      auto size = num_floats;
+      const float* gs_fbuf = reinterpret_cast<const float*>(value.data());
+      gs.reserve(size);
+      for (size_t i = 0; i < size; ++i) {
+        gs.push_back(static_cast<double>(gs_fbuf[i]));
+      }
+    }
+
     size_t n_miss = 0;
     double geno_mean = 0.0;
     double geno_var = 0.0;
diff --git a/src/gemma_io.h b/src/gemma_io.h
index 4e9f4c1..06cd0ee 100644
--- a/src/gemma_io.h
+++ b/src/gemma_io.h
@@ -26,12 +26,17 @@
 #include <algorithm>
 #include <map>
 #include <vector>
+#include <cstdint>
 
 #include "gzstream.h"
 #include "param.h"
 
 #define tab(col) ( col ? "\t" : "")
 
+constexpr uint8_t CHR_X = 'X';
+constexpr uint8_t CHR_Y = 'Y';
+constexpr uint8_t CHR_M = 'M';
+
 using namespace std;
 
 void ProgressBar(string str, double p, double total, double ratio = -1.0);
@@ -87,7 +92,8 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv,
 void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U);
 void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval);
 
-gsl_matrix *mdb_calc_kin(const string file_geno, const bool is_loco, const int mode);
+gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco);
+
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
                const int display_pace, gsl_matrix *matrix_kin,
diff --git a/src/lmm.cpp b/src/lmm.cpp
index 129cbd6..2b730f3 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -2012,23 +2012,17 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
     // continue;
 
     auto tup = fetch_snp(t); // use the callback
-    auto success = get<0>(tup);
-    if (!success)
+    auto state = get<0>(tup);
+    if (state == SKIP)
       continue;
-    // typedef tuple< bool,MarkerInfo,vector<double> > SnpNameValues2;
-    // auto marker = get<1>(tup);
-    // auto chr = get<2>(tup);
-    // auto mpos = get<3>(tup);
+    if (state == LAST)
+      break; // marker loop because of LOCO
+
     auto markerinfo = get<1>(tup);
     auto gs = get<2>(tup);
 
     markers.push_back(markerinfo);
 
-    // check whether SNP is included in gwasnps (used by LOCO)
-    /*
-    if (process_gwasnps && gwasnps.count(snp) == 0)
-      continue;
-    */
     // drop missing idv and plug mean values for missing geno
     double x_total = 0.0; // sum genotype values to compute x_mean
     uint vpos = 0;        // position in target vector
@@ -2067,7 +2061,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
     gsl_vector_safe_memcpy(&Xlarge_col.vector, x);
     c++; // count markers going in
 
-
     if (c % msize == 0) {
       batch_compute(msize,markers);
       markers.clear();
@@ -2212,10 +2205,6 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
   mdb_stat(rtxn, geno_mdb, &stat);
   auto num_markers = stat.ms_entries;
 
-  // fetch_snp is a callback function for every SNP row
-  // returns typedef std::tuple<bool,string,std::vector<double> > SnpNameValues;
-
-  // size_t prev_line = 0;
   auto mdb_fetch = MDB_FIRST;
 
   auto cursor = lmdb::cursor::open(rtxn, geno_mdb);
@@ -2227,21 +2216,16 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
 
     string_view key,value;
 
-    /*
-    while (prev_line <= num) {
-      cursor.get(key, value, MDB_NEXT);
-      prev_line++;
-    }
-    */
-    auto success = cursor.get(key, value, mdb_fetch);
+    auto mdb_success = cursor.get(key, value, mdb_fetch);
     mdb_fetch = MDB_NEXT;
 
     // uint8_t chr;
     vector<double> gs;
     MarkerInfo markerinfo;
 
-    if (success) {
+    if (mdb_success) {
       size_t size = 0;
+      // ---- Depending on the format we get different buffers - currently float and byte are supported:
       if (format == "Gb") {
         size_t num_bytes = value.size() / sizeof(uint8_t);
         assert(num_bytes == ni_total);
@@ -2270,13 +2254,6 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
       // "S>L>L>"
       const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data());
       auto chr = static_cast<uint8_t>(data[1]);
-
-      // printf("%#02x %#02x\n", chr, loco_chr);
-
-      if (is_loco && loco_chr != chr) {
-        return make_tuple(false, markerinfo, gs);
-      }
-
       // Extract big-endian uint32
       // uint32_t rest = static_cast<uint32_t>(data[2]);
       uint32_t pos =  (data[2] << 24) | (data[3] << 16) |
@@ -2285,6 +2262,15 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
       uint32_t num = (data[6] << 24) | (data[7] << 16) |
         (data[8] << 8) | data[9];
 
+      // printf("%#02x %#02x\n", chr, loco_chr);
+
+      if (is_loco && loco_chr != chr) {
+        if (chr > loco_chr)
+            return make_tuple(LAST, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs);
+          else
+            return make_tuple(SKIP, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs);
+      }
+
       string_view value2;
       marker_mdb.get(rtxn,key,value2);
       auto marker = string(value2);
@@ -2299,7 +2285,7 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
 
       // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl;
     }
-    return make_tuple(success, markerinfo, gs);
+    return make_tuple(COMPUTE, markerinfo, gs);
   };
   LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,num_markers);
 
diff --git a/src/lmm.h b/src/lmm.h
index 8a65f8d..295602a 100644
--- a/src/lmm.h
+++ b/src/lmm.h
@@ -27,7 +27,6 @@
 #include "param.h"
 #include <functional>
 #include <tuple>
-#include <cstdint>
 
 using namespace std;
 
@@ -46,10 +45,6 @@ public:
 };
 
 
-constexpr uint8_t CHR_X = 'X';
-constexpr uint8_t CHR_Y = 'Y';
-constexpr uint8_t CHR_M = 'M';
-
 // typedef tuple< string, uint16_t, uint32_t, uint32_t > MarkerInfo; // name, chr, pos, line
 struct MarkerInfo {
   string name;
@@ -60,7 +55,15 @@ struct MarkerInfo {
 
 typedef vector<MarkerInfo> Markers;
 typedef tuple< string,vector<double> > SnpNameValues;
-typedef tuple< bool,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed)
+
+enum MarkerState {
+  FAIL,
+  COMPUTE,
+  SKIP,
+  LAST
+};
+
+typedef tuple< MarkerState,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed)
 // Results for LMM.
 class SUMSTAT2 {
 public:
diff --git a/src/param.cpp b/src/param.cpp
index 516dfd8..96a9cd2 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -1303,7 +1303,7 @@ void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_
 }
 
 gsl_matrix *PARAM::MdbCalcKin() {
-  return mdb_calc_kin(file_geno, is_loco(), a_mode - 20);
+  return mdb_calc_kin(file_geno, a_mode - 20, loco);
 }
 
 void PARAM::CalcKin(gsl_matrix *matrix_kin) {