diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/gemma_io.cpp | 66 | ||||
| -rw-r--r-- | src/gemma_io.h | 8 | ||||
| -rw-r--r-- | src/lmm.cpp | 50 | ||||
| -rw-r--r-- | src/lmm.h | 15 | ||||
| -rw-r--r-- | src/param.cpp | 2 |
5 files changed, 91 insertions, 50 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index e183e3f..e630f64 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -1478,8 +1478,27 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { // Read bimbam mean genotype file and calculate kinship matrix. -gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) { +gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco) { checkpoint("mdb-kin",file_geno); + bool is_loco = !loco.empty(); + + // Convert loco string to what we use in the chrpos index + uint8_t loco_chr = 0; + if (is_loco) { + if (loco == "X") { + loco_chr = CHR_X; + } else if (loco == "Y") { + loco_chr = CHR_Y; + } else if (loco == "M") { + loco_chr = CHR_M; + } else { + try { + loco_chr = static_cast<uint8_t>(stoi(loco)); + } catch (...) { + loco_chr = 0; + } + } + } /* Create and open the LMDB environment: */ auto env = lmdb::env::create(); @@ -1487,6 +1506,7 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */ env.set_max_dbs(10); env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + string format; size_t ni_total = 0, num_markers = 0; { @@ -1502,12 +1522,17 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) std::string_view view_int; info.get(rtxn, "numsamples", view_int); ni_total = lmdb::from_sv<size_t>(view_int); + string_view info_key,info_value; + info.get(rtxn,"format",info_value); + format = string(info_value); MDB_stat stat; mdb_stat(rtxn, info, &stat); - assert(stat.ms_entries == 3); + assert(stat.ms_entries == 5); } + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); MDB_stat stat; @@ -1548,15 +1573,36 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) string_view key, value; cursor.get(key, value, mdb_fetch); mdb_fetch = MDB_NEXT; - /* FIXME! - if (indicator_snp[t] == 0) + + const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data()); + auto chr = static_cast<uint8_t>(data[1]); + + if (is_loco && loco_chr == chr) continue; - */ -#if !defined NDEBUG - size_t num_floats = value.size() / sizeof(float); - assert(num_floats == ni_total); -#endif - const float* gs = reinterpret_cast<const float*>(value.data()); + + // ---- Depending on the format we get different buffers - currently float and byte are supported: + vector<double> gs; + if (format == "Gb") { + size_t num_bytes = value.size() / sizeof(uint8_t); + assert(num_bytes == ni_total); + auto size = num_bytes; + const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + double g = static_cast<double>(gs_bbuf[i])/127.0; + gs.push_back(g); + } + } else { + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + auto size = num_floats; + const float* gs_fbuf = reinterpret_cast<const float*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + gs.push_back(static_cast<double>(gs_fbuf[i])); + } + } + size_t n_miss = 0; double geno_mean = 0.0; double geno_var = 0.0; diff --git a/src/gemma_io.h b/src/gemma_io.h index 4e9f4c1..06cd0ee 100644 --- a/src/gemma_io.h +++ b/src/gemma_io.h @@ -26,12 +26,17 @@ #include <algorithm> #include <map> #include <vector> +#include <cstdint> #include "gzstream.h" #include "param.h" #define tab(col) ( col ? "\t" : "") +constexpr uint8_t CHR_X = 'X'; +constexpr uint8_t CHR_Y = 'Y'; +constexpr uint8_t CHR_M = 'M'; + using namespace std; void ProgressBar(string str, double p, double total, double ratio = -1.0); @@ -87,7 +92,8 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv, void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); -gsl_matrix *mdb_calc_kin(const string file_geno, const bool is_loco, const int mode); +gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco); + bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin, diff --git a/src/lmm.cpp b/src/lmm.cpp index 129cbd6..2b730f3 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -2012,23 +2012,17 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, // continue; auto tup = fetch_snp(t); // use the callback - auto success = get<0>(tup); - if (!success) + auto state = get<0>(tup); + if (state == SKIP) continue; - // typedef tuple< bool,MarkerInfo,vector<double> > SnpNameValues2; - // auto marker = get<1>(tup); - // auto chr = get<2>(tup); - // auto mpos = get<3>(tup); + if (state == LAST) + break; // marker loop because of LOCO + auto markerinfo = get<1>(tup); auto gs = get<2>(tup); markers.push_back(markerinfo); - // check whether SNP is included in gwasnps (used by LOCO) - /* - if (process_gwasnps && gwasnps.count(snp) == 0) - continue; - */ // drop missing idv and plug mean values for missing geno double x_total = 0.0; // sum genotype values to compute x_mean uint vpos = 0; // position in target vector @@ -2067,7 +2061,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, gsl_vector_safe_memcpy(&Xlarge_col.vector, x); c++; // count markers going in - if (c % msize == 0) { batch_compute(msize,markers); markers.clear(); @@ -2212,10 +2205,6 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, mdb_stat(rtxn, geno_mdb, &stat); auto num_markers = stat.ms_entries; - // fetch_snp is a callback function for every SNP row - // returns typedef std::tuple<bool,string,std::vector<double> > SnpNameValues; - - // size_t prev_line = 0; auto mdb_fetch = MDB_FIRST; auto cursor = lmdb::cursor::open(rtxn, geno_mdb); @@ -2227,21 +2216,16 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, string_view key,value; - /* - while (prev_line <= num) { - cursor.get(key, value, MDB_NEXT); - prev_line++; - } - */ - auto success = cursor.get(key, value, mdb_fetch); + auto mdb_success = cursor.get(key, value, mdb_fetch); mdb_fetch = MDB_NEXT; // uint8_t chr; vector<double> gs; MarkerInfo markerinfo; - if (success) { + if (mdb_success) { size_t size = 0; + // ---- Depending on the format we get different buffers - currently float and byte are supported: if (format == "Gb") { size_t num_bytes = value.size() / sizeof(uint8_t); assert(num_bytes == ni_total); @@ -2270,13 +2254,6 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, // "S>L>L>" const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data()); auto chr = static_cast<uint8_t>(data[1]); - - // printf("%#02x %#02x\n", chr, loco_chr); - - if (is_loco && loco_chr != chr) { - return make_tuple(false, markerinfo, gs); - } - // Extract big-endian uint32 // uint32_t rest = static_cast<uint32_t>(data[2]); uint32_t pos = (data[2] << 24) | (data[3] << 16) | @@ -2285,6 +2262,15 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, uint32_t num = (data[6] << 24) | (data[7] << 16) | (data[8] << 8) | data[9]; + // printf("%#02x %#02x\n", chr, loco_chr); + + if (is_loco && loco_chr != chr) { + if (chr > loco_chr) + return make_tuple(LAST, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + else + return make_tuple(SKIP, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + } + string_view value2; marker_mdb.get(rtxn,key,value2); auto marker = string(value2); @@ -2299,7 +2285,7 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl; } - return make_tuple(success, markerinfo, gs); + return make_tuple(COMPUTE, markerinfo, gs); }; LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,num_markers); diff --git a/src/lmm.h b/src/lmm.h index 8a65f8d..295602a 100644 --- a/src/lmm.h +++ b/src/lmm.h @@ -27,7 +27,6 @@ #include "param.h" #include <functional> #include <tuple> -#include <cstdint> using namespace std; @@ -46,10 +45,6 @@ public: }; -constexpr uint8_t CHR_X = 'X'; -constexpr uint8_t CHR_Y = 'Y'; -constexpr uint8_t CHR_M = 'M'; - // typedef tuple< string, uint16_t, uint32_t, uint32_t > MarkerInfo; // name, chr, pos, line struct MarkerInfo { string name; @@ -60,7 +55,15 @@ struct MarkerInfo { typedef vector<MarkerInfo> Markers; typedef tuple< string,vector<double> > SnpNameValues; -typedef tuple< bool,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed) + +enum MarkerState { + FAIL, + COMPUTE, + SKIP, + LAST +}; + +typedef tuple< MarkerState,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed) // Results for LMM. class SUMSTAT2 { public: diff --git a/src/param.cpp b/src/param.cpp index 516dfd8..96a9cd2 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -1303,7 +1303,7 @@ void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_ } gsl_matrix *PARAM::MdbCalcKin() { - return mdb_calc_kin(file_geno, is_loco(), a_mode - 20); + return mdb_calc_kin(file_geno, a_mode - 20, loco); } void PARAM::CalcKin(gsl_matrix *matrix_kin) { |
