diff options
Diffstat (limited to 'src/gemma_io.cpp')
| -rw-r--r-- | src/gemma_io.cpp | 66 |
1 files changed, 56 insertions, 10 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index e183e3f..e630f64 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -1478,8 +1478,27 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { // Read bimbam mean genotype file and calculate kinship matrix. -gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) { +gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco) { checkpoint("mdb-kin",file_geno); + bool is_loco = !loco.empty(); + + // Convert loco string to what we use in the chrpos index + uint8_t loco_chr = 0; + if (is_loco) { + if (loco == "X") { + loco_chr = CHR_X; + } else if (loco == "Y") { + loco_chr = CHR_Y; + } else if (loco == "M") { + loco_chr = CHR_M; + } else { + try { + loco_chr = static_cast<uint8_t>(stoi(loco)); + } catch (...) { + loco_chr = 0; + } + } + } /* Create and open the LMDB environment: */ auto env = lmdb::env::create(); @@ -1487,6 +1506,7 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */ env.set_max_dbs(10); env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + string format; size_t ni_total = 0, num_markers = 0; { @@ -1502,12 +1522,17 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) std::string_view view_int; info.get(rtxn, "numsamples", view_int); ni_total = lmdb::from_sv<size_t>(view_int); + string_view info_key,info_value; + info.get(rtxn,"format",info_value); + format = string(info_value); MDB_stat stat; mdb_stat(rtxn, info, &stat); - assert(stat.ms_entries == 3); + assert(stat.ms_entries == 5); } + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); MDB_stat stat; @@ -1548,15 +1573,36 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) string_view key, value; cursor.get(key, value, mdb_fetch); mdb_fetch = MDB_NEXT; - /* FIXME! - if (indicator_snp[t] == 0) + + const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data()); + auto chr = static_cast<uint8_t>(data[1]); + + if (is_loco && loco_chr == chr) continue; - */ -#if !defined NDEBUG - size_t num_floats = value.size() / sizeof(float); - assert(num_floats == ni_total); -#endif - const float* gs = reinterpret_cast<const float*>(value.data()); + + // ---- Depending on the format we get different buffers - currently float and byte are supported: + vector<double> gs; + if (format == "Gb") { + size_t num_bytes = value.size() / sizeof(uint8_t); + assert(num_bytes == ni_total); + auto size = num_bytes; + const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + double g = static_cast<double>(gs_bbuf[i])/127.0; + gs.push_back(g); + } + } else { + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + auto size = num_floats; + const float* gs_fbuf = reinterpret_cast<const float*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + gs.push_back(static_cast<double>(gs_fbuf[i])); + } + } + size_t n_miss = 0; double geno_mean = 0.0; double geno_var = 0.0; |
