diff options
Diffstat (limited to 'src/lmm.cpp')
| -rw-r--r-- | src/lmm.cpp | 58 |
1 files changed, 29 insertions, 29 deletions
diff --git a/src/lmm.cpp b/src/lmm.cpp index 6d89f77..2ac9835 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -40,6 +40,7 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_roots.h" #include "gsl/gsl_vector.h" +#include <lmdb++.h> #include "gzstream.h" #include "gemma.h" @@ -1870,49 +1871,48 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const set<string> gwasnps) { - debug_msg(file_geno); - auto genofilen = file_geno.c_str(); checkpoint("mdb-calc-gwa",file_geno); - igzstream genofile(genofilen, igzstream::in); - enforce_msg(genofile, "error reading genotype file"); - size_t prev_line = 0; + const auto num_snps = indicator_snp.size(); + enforce_msg(num_snps > 0,"Zero SNPs to process - data corrupt?"); - std::vector <double> gs; - gs.resize(ni_total); + auto env = lmdb::env::create(); + + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */ + env.set_max_dbs(10); + env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); // fetch_snp is a callback function for every SNP row + // returns typedef std::tuple<string,std::vector<double> > SnpNameValues; + + size_t prev_line = 0; + auto mdb_fetch = MDB_FIRST; + + auto cursor = lmdb::cursor::open(rtxn, geno_mdb); + std::function<SnpNameValues(size_t)> fetch_snp = [&](size_t num) { - string line; + + string_view key,value; + while (prev_line <= num) { - // also read SNPs that were skipped - safeGetline(genofile, line); + cursor.get(key, value, MDB_NEXT); prev_line++; } - char *ch_ptr = strtok_safe2((char *)line.c_str(), " , \t",genofilen); - // enforce_msg(ch_ptr, "Parsing BIMBAM genofile"); // ch_ptr should not be NULL - - auto snp = string(ch_ptr); - ch_ptr = strtok_safe2(NULL, " , \t",genofilen); // skip column - ch_ptr = strtok_safe2(NULL, " , \t",genofilen); // skip column + cursor.get(key, value, mdb_fetch); + mdb_fetch = MDB_NEXT; + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + const float* gsbuf = reinterpret_cast<const float*>(value.data()); + auto snp = string(key); + std::vector<double> gs(gsbuf,gsbuf+sizeof(float)*value.size()); + cout << "!!!!" << snp << endl; - gs.assign (ni_total,nan("")); // wipe values - - for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok_safe2(NULL, " , \t",genofilen); - if (strcmp(ch_ptr, "NA") != 0) { - gs[i] = atof(ch_ptr); - if (is_strict_mode() && gs[i] == 0.0) - enforce_is_float(std::string(ch_ptr)); // only allow for NA and valid numbers - } - } return std::make_tuple(snp,gs); }; LMM::Analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps); - - genofile.close(); - genofile.clear(); } |
