From 254113f4e02d5b423af302e9facb0ed580749ef5 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 30 Nov 2025 08:32:33 +0100 Subject: K: reads geno mdb --- src/gemma_io.cpp | 159 ++++++++++++++++++++----------------------------------- src/gemma_io.h | 3 +- src/param.cpp | 3 +- 3 files changed, 61 insertions(+), 104 deletions(-) (limited to 'src') diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index 6265417..8fa0eab 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -41,6 +41,7 @@ #include "gsl/gsl_linalg.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" +#include #include "checkpoint.h" #include "debug.h" @@ -1474,12 +1475,9 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { return; } -// #include -#include // Read bimbam mean genotype file and calculate kinship matrix. -int mdb_calc_kin(const string file_geno, bool is_loco, - vector &indicator_snp, const int k_mode) { +int mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode, const vector &indicator_snp) { checkpoint("mdb-kin",file_geno); /* Create and open the LMDB environment: */ @@ -1489,25 +1487,7 @@ int mdb_calc_kin(const string file_geno, bool is_loco, env.set_max_dbs(10); env.open("example/mouse_hs1940.geno.mdb", MDB_RDONLY | MDB_NOSUBDIR, 0664); - /* - { - // Print out all the values using a cursor: - auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); - dbi = lmdb::dbi::open(rtxn, "mouse_hs1940.geno.txt.mdb"); - { - auto cursor = lmdb::cursor::open(rtxn, dbi); - - string_view key, value; - int count = 0; - if (cursor.get(key, value, MDB_FIRST)) { - do { - cout << count++ << " key: " << key << endl ; // " value: " << value << endl; - } while (cursor.get(key, value, MDB_NEXT)); - } - } // destroying cursor before committing/aborting transaction (see below) - } - */ - size_t ni_total = 0; + size_t ni_total = 0, num_markers = 0; { auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); auto info = lmdb::dbi::open(rtxn, "info"); @@ -1526,86 +1506,76 @@ int mdb_calc_kin(const string file_geno, bool is_loco, mdb_stat(rtxn, info, &stat); assert(stat.ms_entries == 3); } - { - auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); - auto geno = lmdb::dbi::open(rtxn, "geno"); + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); - // Get statistics - ms_entries contains the number of records - MDB_stat stat; - mdb_stat(rtxn, geno, &stat); - - cout << "Number of records: " << stat.ms_entries << endl; - } + MDB_stat stat; + mdb_stat(rtxn, geno_mdb, &stat); + cout << "Number of records: " << stat.ms_entries << endl; + num_markers = stat.ms_entries; gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(ni_total,ni_total); - // cout << "ni_total: " << ni_total << endl; gsl_vector *geno = gsl_vector_safe_alloc(ni_total); gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); // Xlarge contains inds x markers - const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); - enforce_msg(Xlarge, "allocate Xlarge"); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE); + enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE"); - gsl_matrix_set_zero(Xlarge); - write(matrix_kin,"K before"); + // [ ] check XLarge is zeroed properly + // [ ] handle missing data - // For every SNP read the genotype per individual + // For every marker read the genotype x inds size_t ns_test = 0; size_t display_pace = 1000; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - string line; - /// safeGetline(infile, line).eof(); - // if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - if (t % display_pace == 0) { - ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); - } - if (indicator_snp[t] == 0) - continue; - - // Using regular expressions is slow: - // std::regex_token_iterator rend; - // regex split_on("[,[:blank:]]+"); - // regex_token_iterator tokens(line.begin(), line.end(), - // split_on, -1); - /// auto tokens = tokenize_whitespace(line,ni_total+3,infilen); - auto tokens = tokenize_whitespace(line,ni_total+3,""); - auto token_i = tokens.begin(); - // const char *snp = *token_i; // first field - const char *snp = tokens[0]; // first field - // cout << snp << "!"; - // check whether SNP is included in ksnps (used by LOCO) - // if (process_ksnps && ksnps.count(snp) == 0) - // continue; + auto cursor = lmdb::cursor::open(rtxn, geno_mdb); - token_i++; // skip snp name - token_i++; // skip nucleotide field - token_i++; // skip nucleotide field - - // calc SNP stats + auto mdb_fetch = MDB_FIRST; + for (size_t t = 0; t < num_markers; ++t) { + string line; + if (t % display_pace == 0) { + ProgressBar("Reading SNPs", t, num_markers - 1); + } + + // auto tokens = tokenize_whitespace(line,ni_total+3,""); + // auto token_i = tokens.begin(); + // const char *snp = tokens[0]; // first field + // token_i++; // skip snp name + // token_i++; // skip nucleotide field + // token_i++; // skip nucleotide field + + string_view key, value; + cursor.get(key, value, mdb_fetch); + mdb_fetch = MDB_NEXT; + // cout << " key: " << key << endl ; + size_t num_floats = value.size() / sizeof(float); + // cout << " num: " << num_floats << endl ; + assert(num_floats == ni_total); + const float* gs = reinterpret_cast(value.data()); size_t n_miss = 0; double geno_mean = 0.0; double geno_var = 0.0; gsl_vector_set_all(geno_miss, 0); + for (size_t i = 0; i < ni_total; ++i) { - enforce_str(token_i != tokens.end(), line + " number of fields"); - auto field = *token_i; - auto sfield = std::string(field); + // enforce_str(token_i != tokens.end(), line + " number of fields"); + // auto field = *token_i; + // auto sfield = std::string(field); // cout << i << ":" << sfield << "," << endl; - if (strncmp(field,"NA",2)==0) { // missing value - gsl_vector_set(geno_miss, i, 0); - n_miss++; - } else { - double d = atof(field); - if (is_strict_mode() && d == 0.0) - enforce_is_float(std::string(field)); // rule out non NA and non-float fields - gsl_vector_set(geno, i, d); - gsl_vector_set(geno_miss, i, 1); - geno_mean += d; - geno_var += d * d; - } - token_i++; + // if (strncmp(field,"NA",2)==0) { // missing value + // gsl_vector_set(geno_miss, i, 0); + // n_miss++; + // } else { + double d = gs[i]; + // if (is_strict_mode() && d == 0.0) + // enforce_is_float(std::string(field)); // rule out non NA and non-float fields + gsl_vector_set(geno, i, d); + gsl_vector_set(geno_miss, i, 1); + geno_mean += d; + geno_var += d * d; + // } + // token_i++; } geno_mean /= (double)(ni_total - n_miss); @@ -1613,11 +1583,6 @@ int mdb_calc_kin(const string file_geno, bool is_loco, geno_var /= (double)ni_total; geno_var -= geno_mean * geno_mean; - if (ns_test<1) { - write(geno,"geno raw"); - write(geno_mean,"geno mean"); - } - // impute missing values by plugging in the mean for (size_t i = 0; i < ni_total; ++i) { if (gsl_vector_get(geno_miss, i) == 0) { @@ -1625,11 +1590,8 @@ int mdb_calc_kin(const string file_geno, bool is_loco, } } - if (ns_test<1) write(geno,"geno imputed"); - // subtract the mean (centering genotype values) gsl_vector_add_constant(geno, -1.0 * geno_mean); - if (ns_test<1) write(geno,"geno mean"); // scale the genotypes if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2 @@ -1637,26 +1599,21 @@ int mdb_calc_kin(const string file_geno, bool is_loco, gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); } - if (ns_test<1) { - write(geno_var,"geno var"); - write(geno,"geno z-scored"); - } - // set the SNP column ns_test to copy geno into the compute matrix Xlarge - gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % K_BATCH_SIZE); enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); ns_test++; - // Every msize rows batch compute kinship matrix and return + // Every K_BATCH_SIZE rows batch compute kinship matrix and return // by adding to matrix_kin - if (ns_test % msize == 0) { + if (ns_test % K_BATCH_SIZE == 0) { fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); write(matrix_kin,"K updated"); } } - if (ns_test % msize != 0) { // compute last batch + if (ns_test % K_BATCH_SIZE != 0) { // compute last batch fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); write(matrix_kin,"K updated"); } diff --git a/src/gemma_io.h b/src/gemma_io.h index 593d704..ad13379 100644 --- a/src/gemma_io.h +++ b/src/gemma_io.h @@ -87,8 +87,7 @@ void ReadFile_mk(const string &file_mk, vector &indicator_idv, void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); -int mdb_calc_kin(const string file_geno, bool is_loco, - vector &indicator_snp, const int mode); +int mdb_calc_kin(const string file_geno, const bool is_loco, const int mode, const vector &indicator_snp); bool BimbamKin(const string file_geno, const set ksnps, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin, diff --git a/src/param.cpp b/src/param.cpp index 05e4a7b..3b74e53 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -1294,7 +1294,7 @@ void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_ } void PARAM::MdbCalcKin() { - error = !mdb_calc_kin(file_geno, is_loco(), indicator_snp, a_mode - 20); + error = !mdb_calc_kin(file_geno, is_loco(), a_mode - 20, indicator_snp); return; } @@ -1310,6 +1310,7 @@ void PARAM::CalcKin(gsl_matrix *matrix_kin) { error = true; } } else { + // indicator_snp is an array of bool error = !BimbamKin(file_geno, setKSnps, indicator_snp, a_mode - 20, d_pace, matrix_kin, ni_max == 0); } -- cgit 1.4.1