diff options
Diffstat (limited to 'src/gemma_io.cpp')
| -rw-r--r-- | src/gemma_io.cpp | 273 |
1 files changed, 272 insertions, 1 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index 664a4bd..a727337 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -1474,6 +1474,277 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { return; } +#include <lmdb.h> +#include <lmdb++.h> + +// Read bimbam mean genotype file and calculate kinship matrix. +int mdb_calc_kin(const string file_geno, const set<string> ksnps, + vector<int> &indicator_snp, const int k_mode) { + checkpoint("mdb-kin",file_geno); + + /* Create and open the LMDB environment: */ + auto env = lmdb::env::create(); + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL); /* 1 GiB */ + env.open("./example.mdb/", 0, 0664); + lmdb::dbi dbi; + + + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDB_cursor *cursor; + int rc; + + // Create environment handle + rc = mdb_env_create(&env); + if (rc != 0) { + std::cerr << "mdb_env_create: " << mdb_strerror(rc) << std::endl; + return 1; + } + // Open the environment (directory containing the LMDB files) + rc = mdb_env_open(env, file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + if (rc != 0) { + std::cerr << "mdb_env_open: " << mdb_strerror(rc) << std::endl; + return 1; + } + mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + // Open the database in the environment + rc = mdb_open(txn, NULL, 0, &dbi); + if (rc != 0) { + std::cerr << "mdb_dbi_open: " << mdb_strerror(rc) << std::endl; + mdb_txn_abort(txn); + return 1; + } + + char *skey = "meta\0"; + key.mv_size = strlen(skey); + key.mv_data = skey; + + // std::string skey = "rs13475963"; + // key.mv_size = skey.size(); + // key.mv_data = (void*)skey.c_str(); + cerr << "HERE" << endl; + + MDB_val val; + rc = mdb_get(txn, dbi, &key, &val); + if (rc != 0) { + std::cerr << "mdb_get: " << mdb_strerror(rc) << std::endl; + mdb_txn_abort(txn); + return 1; + } + + printf("!!!!!!! %s\n", val.mv_data); + + mdb_txn_abort(txn); + +//--- + MDB_stat stat; + // Get statistics + rc = mdb_stat(txn, dbi, &stat); + if (rc != 0) { + std::cerr << "mdb_stat: " << mdb_strerror(rc) << std::endl; + mdb_txn_abort(txn); + return 1; + } + + // Print number of entries + std::cout << "Number of keys: " << stat.ms_entries << std::endl; + + cerr << "HERE" << endl; + + // Create a transaction for reading + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc != 0) { + std::cerr << "mdb_txn_begin: " << mdb_strerror(rc) << std::endl; + return 1; + } + + + // Create a cursor to iterate through all entries + rc = mdb_cursor_open(txn, dbi, &cursor); + if (rc != 0) { + std::cerr << "mdb_cursor_open: " << mdb_strerror(rc) << std::endl; + mdb_txn_abort(txn); + return 1; + } + + cerr << "HERE" << endl; + + + std::string searchKey = "meta"; + key.mv_size = searchKey.size(); + key.mv_data = (void*)searchKey.c_str(); + + rc = mdb_get(txn, dbi, &key, &data); + if (rc == 0) { + std::string value((char*)data.mv_data, data.mv_size); + std::cout << "- Found: " << value << std::endl; + } else if (rc == MDB_NOTFOUND) { + std::cout << "--- Key not found" << std::endl; + } else { + std::cerr << "--- Error: " << mdb_strerror(rc) << std::endl; + } + + cerr << "HERE HERE" << endl; + + // Iterate through all key-value pairs + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + std::string keyStr((char*)key.mv_data, key.mv_size); + std::string dataStr((char*)data.mv_data, data.mv_size); + + std::cout << "Key: " << keyStr << std::endl; + // std::cout << "Key: " << keyStr << ", Value: " << dataStr << std::endl; + } + + // Clean up + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + mdb_dbi_close(env, dbi); + mdb_env_close(env); + + size_t n_miss; + double geno_mean, geno_var; + + // setKSnp and/or LOCO support + bool process_ksnps = ksnps.size(); + + gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(100,100); + size_t ni_total = matrix_kin->size1; + cout << "!!!!" << ni_total << endl; + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); + + // Xlarge contains inds x markers + const size_t msize = K_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); + enforce_msg(Xlarge, "allocate Xlarge"); + + gsl_matrix_set_zero(Xlarge); + write(matrix_kin,"K before"); + + // For every SNP read the genotype per individual + size_t ns_test = 0; + size_t display_pace = 1000; + for (size_t t = 0; t < indicator_snp.size(); ++t) { + string line; + /// safeGetline(infile, line).eof(); + // if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { + if (t % display_pace == 0) { + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); + } + if (indicator_snp[t] == 0) + continue; + + // Using regular expressions is slow: + // std::regex_token_iterator<std::string::iterator> rend; + // regex split_on("[,[:blank:]]+"); + // regex_token_iterator<string::iterator> tokens(line.begin(), line.end(), + // split_on, -1); + /// auto tokens = tokenize_whitespace(line,ni_total+3,infilen); + auto tokens = tokenize_whitespace(line,ni_total+3,""); + + auto token_i = tokens.begin(); + // const char *snp = *token_i; // first field + const char *snp = tokens[0]; // first field + // cout << snp << "!"; + // check whether SNP is included in ksnps (used by LOCO) + if (process_ksnps && ksnps.count(snp) == 0) + continue; + + token_i++; // skip snp name + token_i++; // skip nucleotide field + token_i++; // skip nucleotide field + + // calc SNP stats + geno_mean = 0.0; + n_miss = 0; + geno_var = 0.0; + gsl_vector_set_all(geno_miss, 0); + for (size_t i = 0; i < ni_total; ++i) { + enforce_str(token_i != tokens.end(), line + " number of fields"); + auto field = *token_i; + auto sfield = std::string(field); + // cout << i << ":" << sfield << "," << endl; + if (strncmp(field,"NA",2)==0) { // missing value + gsl_vector_set(geno_miss, i, 0); + n_miss++; + } else { + double d = atof(field); + if (is_strict_mode() && d == 0.0) + enforce_is_float(std::string(field)); // rule out non NA and non-float fields + gsl_vector_set(geno, i, d); + gsl_vector_set(geno_miss, i, 1); + geno_mean += d; + geno_var += d * d; + } + token_i++; + } + + geno_mean /= (double)(ni_total - n_miss); + geno_var += geno_mean * geno_mean * (double)n_miss; + geno_var /= (double)ni_total; + geno_var -= geno_mean * geno_mean; + + if (ns_test<1) { + write(geno,"geno raw"); + write(geno_mean,"geno mean"); + } + + // impute missing values by plugging in the mean + for (size_t i = 0; i < ni_total; ++i) { + if (gsl_vector_get(geno_miss, i) == 0) { + gsl_vector_set(geno, i, geno_mean); + } + } + + if (ns_test<1) write(geno,"geno imputed"); + + // subtract the mean (centering genotype values) + gsl_vector_add_constant(geno, -1.0 * geno_mean); + if (ns_test<1) write(geno,"geno mean"); + + // scale the genotypes + if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2 + // flag does this + gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); + } + + if (ns_test<1) { + write(geno_var,"geno var"); + write(geno,"geno z-scored"); + } + + // set the SNP column ns_test to copy geno into the compute matrix Xlarge + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); + enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); + + ns_test++; + + // Every msize rows batch compute kinship matrix and return + // by adding to matrix_kin + if (ns_test % msize == 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + write(matrix_kin,"K updated"); + } + } + if (ns_test % msize != 0) { // compute last batch + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + write(matrix_kin,"K updated"); + } + ProgressBar("Reading SNPs", 100, 100); + cout << endl; + + enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test)); + + gsl_vector_free(geno); + gsl_vector_free(geno_miss); + gsl_matrix_free(Xlarge); + + return true; +} + // Read bimbam mean genotype file and calculate kinship matrix. bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, @@ -1483,7 +1754,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, auto infilen = file_geno.c_str(); igzstream infile(infilen, igzstream::in); enforce_msg(infilen, "error reading genotype file"); - checkpoint("read-geno-file",file_geno); + checkpoint("bimbam-kin",file_geno); size_t n_miss; double geno_mean, geno_var; |
