diff options
| author | Pjotr Prins | 2025-12-01 13:24:32 +0100 |
|---|---|---|
| committer | Pjotr Prins | 2025-12-01 13:24:32 +0100 |
| commit | a31832901f66735433e8fcdbae51627b0a26fa0c (patch) | |
| tree | 3702cc4466324d234ec87ab9486f83c6dd294bf5 | |
| parent | 9091c780b9816fba987287ece703ec2317d3cc2f (diff) | |
| download | pangemma-a31832901f66735433e8fcdbae51627b0a26fa0c.tar.gz | |
Set up lmdb reading of geno file again
| -rw-r--r-- | src/gemma_io.cpp | 11 | ||||
| -rw-r--r-- | src/lmm.cpp | 58 |
2 files changed, 35 insertions, 34 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index cfa92d1..959e1f5 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -406,13 +406,14 @@ bool ReadFile_pheno(const string &file_pheno, return false; } - string line; - char *ch_ptr; - - string id; + // string id; double p; vector<double> pheno_row; + + string line; + char *ch_ptr; + vector<int> indicator_pheno_row; size_t p_max = *max_element(p_column.begin(), p_column.end()); @@ -1516,7 +1517,7 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */ env.set_max_dbs(10); - env.open("example/mouse_hs1940.geno.mdb", MDB_RDONLY | MDB_NOSUBDIR, 0664); + env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); size_t ni_total = 0, num_markers = 0; { diff --git a/src/lmm.cpp b/src/lmm.cpp index 6d89f77..2ac9835 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -40,6 +40,7 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_roots.h" #include "gsl/gsl_vector.h" +#include <lmdb++.h> #include "gzstream.h" #include "gemma.h" @@ -1870,49 +1871,48 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const set<string> gwasnps) { - debug_msg(file_geno); - auto genofilen = file_geno.c_str(); checkpoint("mdb-calc-gwa",file_geno); - igzstream genofile(genofilen, igzstream::in); - enforce_msg(genofile, "error reading genotype file"); - size_t prev_line = 0; + const auto num_snps = indicator_snp.size(); + enforce_msg(num_snps > 0,"Zero SNPs to process - data corrupt?"); - std::vector <double> gs; - gs.resize(ni_total); + auto env = lmdb::env::create(); + + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */ + env.set_max_dbs(10); + env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); // fetch_snp is a callback function for every SNP row + // returns typedef std::tuple<string,std::vector<double> > SnpNameValues; + + size_t prev_line = 0; + auto mdb_fetch = MDB_FIRST; + + auto cursor = lmdb::cursor::open(rtxn, geno_mdb); + std::function<SnpNameValues(size_t)> fetch_snp = [&](size_t num) { - string line; + + string_view key,value; + while (prev_line <= num) { - // also read SNPs that were skipped - safeGetline(genofile, line); + cursor.get(key, value, MDB_NEXT); prev_line++; } - char *ch_ptr = strtok_safe2((char *)line.c_str(), " , \t",genofilen); - // enforce_msg(ch_ptr, "Parsing BIMBAM genofile"); // ch_ptr should not be NULL - - auto snp = string(ch_ptr); - ch_ptr = strtok_safe2(NULL, " , \t",genofilen); // skip column - ch_ptr = strtok_safe2(NULL, " , \t",genofilen); // skip column + cursor.get(key, value, mdb_fetch); + mdb_fetch = MDB_NEXT; + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + const float* gsbuf = reinterpret_cast<const float*>(value.data()); + auto snp = string(key); + std::vector<double> gs(gsbuf,gsbuf+sizeof(float)*value.size()); + cout << "!!!!" << snp << endl; - gs.assign (ni_total,nan("")); // wipe values - - for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok_safe2(NULL, " , \t",genofilen); - if (strcmp(ch_ptr, "NA") != 0) { - gs[i] = atof(ch_ptr); - if (is_strict_mode() && gs[i] == 0.0) - enforce_is_float(std::string(ch_ptr)); // only allow for NA and valid numbers - } - } return std::make_tuple(snp,gs); }; LMM::Analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps); - - genofile.close(); - genofile.clear(); } |
