diff options
| -rw-r--r-- | guix/guix.scm | 1 | ||||
| -rw-r--r-- | premake5.lua | 4 | ||||
| -rw-r--r-- | src/gemma.cpp | 21 | ||||
| -rw-r--r-- | src/gemma_io.cpp | 273 | ||||
| -rw-r--r-- | src/gemma_io.h | 2 | ||||
| -rw-r--r-- | src/param.cpp | 17 | ||||
| -rw-r--r-- | src/param.h | 1 |
7 files changed, 299 insertions, 20 deletions
diff --git a/guix/guix.scm b/guix/guix.scm index 90ef001..ad8841d 100644 --- a/guix/guix.scm +++ b/guix/guix.scm @@ -156,6 +156,7 @@ ;; `(,guile-3.0 "dev") guile-lmdb lmdb + lmdbxx pkg-config ninja ruby diff --git a/premake5.lua b/premake5.lua index d380dc4..9229800 100644 --- a/premake5.lua +++ b/premake5.lua @@ -5,7 +5,7 @@ -- -- Including bin -- --- premake5 gmake && make verbose=1 config=debug -j 8 && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./test/runner +-- premake5 gmake && make verbose=1 config=debug gemma -j 8 && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./test/runner -- -- Or -- @@ -61,7 +61,7 @@ project "gemma" files { "src/*.h src/*.c src/**.hpp", "src/**.cpp" } removefiles { "src/gemma_api.cpp" } includedirs { "src/" } - links { } + links { "lmdb" } filter "configurations:Debug" defines { "DEBUG" } diff --git a/src/gemma.cpp b/src/gemma.cpp index fdb19fd..6b42763 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -1914,8 +1914,14 @@ void GEMMA::BatchRun(PARAM &cPar) { if (cPar.a_mode == M_KIN || cPar.a_mode == M_KIN2) { cout << "Calculating Relatedness Matrix ... " << endl; - gsl_matrix *G = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total); - enforce_msg(G, "allocate G"); // just to be sure + if (cPar.is_mdb) { + cPar.MdbCalcKin(); + return; + } + + enforce(cPar.ni_total > 0); + gsl_matrix *K = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total); + enforce_msg(K, "allocate G"); // just to be sure time_start = clock(); @@ -1923,7 +1929,7 @@ void GEMMA::BatchRun(PARAM &cPar) { cout << "error! failed to prepare for calculating relatedness matrix. " << endl; return; } - cPar.CalcKin(G); + cPar.CalcKin(K); cPar.time_G = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); if (cPar.error == true) { @@ -1932,15 +1938,15 @@ void GEMMA::BatchRun(PARAM &cPar) { } // Now we have the Kinship matrix test it - validate_K(G); + validate_K(K); if (cPar.a_mode == M_KIN) { - cPar.WriteMatrix(G, "cXX"); + cPar.WriteMatrix(K, "cXX"); } else { // M_KIN2 - cPar.WriteMatrix(G, "sXX"); + cPar.WriteMatrix(K, "sXX"); } - gsl_matrix_safe_free(G); + gsl_matrix_safe_free(K); } // Compute the LDSC weights (not implemented yet) @@ -2582,6 +2588,7 @@ void GEMMA::BatchRun(PARAM &cPar) { gsl_matrix *B = gsl_matrix_safe_alloc(Y->size2, W->size2); // B is a d by c (effect size) // matrix gsl_matrix *se_B = gsl_matrix_safe_alloc(Y->size2, W->size2); // se + enforce(Y->size1 > 0); gsl_matrix *K = gsl_matrix_safe_alloc(Y->size1, Y->size1); // Kinship aka as G/GRM in the equation ind x ind gsl_matrix *U = gsl_matrix_safe_alloc(Y->size1, Y->size1); gsl_matrix *UtW = gsl_matrix_calloc(Y->size1, W->size2); // Transformed ind x covariates diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index 664a4bd..a727337 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -1474,6 +1474,277 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { return; } +#include <lmdb.h> +#include <lmdb++.h> + +// Read bimbam mean genotype file and calculate kinship matrix. +int mdb_calc_kin(const string file_geno, const set<string> ksnps, + vector<int> &indicator_snp, const int k_mode) { + checkpoint("mdb-kin",file_geno); + + /* Create and open the LMDB environment: */ + auto env = lmdb::env::create(); + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL); /* 1 GiB */ + env.open("./example.mdb/", 0, 0664); + lmdb::dbi dbi; + + + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDB_cursor *cursor; + int rc; + + // Create environment handle + rc = mdb_env_create(&env); + if (rc != 0) { + std::cerr << "mdb_env_create: " << mdb_strerror(rc) << std::endl; + return 1; + } + // Open the environment (directory containing the LMDB files) + rc = mdb_env_open(env, file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + if (rc != 0) { + std::cerr << "mdb_env_open: " << mdb_strerror(rc) << std::endl; + return 1; + } + mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + // Open the database in the environment + rc = mdb_open(txn, NULL, 0, &dbi); + if (rc != 0) { + std::cerr << "mdb_dbi_open: " << mdb_strerror(rc) << std::endl; + mdb_txn_abort(txn); + return 1; + } + + char *skey = "meta\0"; + key.mv_size = strlen(skey); + key.mv_data = skey; + + // std::string skey = "rs13475963"; + // key.mv_size = skey.size(); + // key.mv_data = (void*)skey.c_str(); + cerr << "HERE" << endl; + + MDB_val val; + rc = mdb_get(txn, dbi, &key, &val); + if (rc != 0) { + std::cerr << "mdb_get: " << mdb_strerror(rc) << std::endl; + mdb_txn_abort(txn); + return 1; + } + + printf("!!!!!!! %s\n", val.mv_data); + + mdb_txn_abort(txn); + +//--- + MDB_stat stat; + // Get statistics + rc = mdb_stat(txn, dbi, &stat); + if (rc != 0) { + std::cerr << "mdb_stat: " << mdb_strerror(rc) << std::endl; + mdb_txn_abort(txn); + return 1; + } + + // Print number of entries + std::cout << "Number of keys: " << stat.ms_entries << std::endl; + + cerr << "HERE" << endl; + + // Create a transaction for reading + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc != 0) { + std::cerr << "mdb_txn_begin: " << mdb_strerror(rc) << std::endl; + return 1; + } + + + // Create a cursor to iterate through all entries + rc = mdb_cursor_open(txn, dbi, &cursor); + if (rc != 0) { + std::cerr << "mdb_cursor_open: " << mdb_strerror(rc) << std::endl; + mdb_txn_abort(txn); + return 1; + } + + cerr << "HERE" << endl; + + + std::string searchKey = "meta"; + key.mv_size = searchKey.size(); + key.mv_data = (void*)searchKey.c_str(); + + rc = mdb_get(txn, dbi, &key, &data); + if (rc == 0) { + std::string value((char*)data.mv_data, data.mv_size); + std::cout << "- Found: " << value << std::endl; + } else if (rc == MDB_NOTFOUND) { + std::cout << "--- Key not found" << std::endl; + } else { + std::cerr << "--- Error: " << mdb_strerror(rc) << std::endl; + } + + cerr << "HERE HERE" << endl; + + // Iterate through all key-value pairs + while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + std::string keyStr((char*)key.mv_data, key.mv_size); + std::string dataStr((char*)data.mv_data, data.mv_size); + + std::cout << "Key: " << keyStr << std::endl; + // std::cout << "Key: " << keyStr << ", Value: " << dataStr << std::endl; + } + + // Clean up + mdb_cursor_close(cursor); + mdb_txn_abort(txn); + mdb_dbi_close(env, dbi); + mdb_env_close(env); + + size_t n_miss; + double geno_mean, geno_var; + + // setKSnp and/or LOCO support + bool process_ksnps = ksnps.size(); + + gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(100,100); + size_t ni_total = matrix_kin->size1; + cout << "!!!!" << ni_total << endl; + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); + + // Xlarge contains inds x markers + const size_t msize = K_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); + enforce_msg(Xlarge, "allocate Xlarge"); + + gsl_matrix_set_zero(Xlarge); + write(matrix_kin,"K before"); + + // For every SNP read the genotype per individual + size_t ns_test = 0; + size_t display_pace = 1000; + for (size_t t = 0; t < indicator_snp.size(); ++t) { + string line; + /// safeGetline(infile, line).eof(); + // if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { + if (t % display_pace == 0) { + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); + } + if (indicator_snp[t] == 0) + continue; + + // Using regular expressions is slow: + // std::regex_token_iterator<std::string::iterator> rend; + // regex split_on("[,[:blank:]]+"); + // regex_token_iterator<string::iterator> tokens(line.begin(), line.end(), + // split_on, -1); + /// auto tokens = tokenize_whitespace(line,ni_total+3,infilen); + auto tokens = tokenize_whitespace(line,ni_total+3,""); + + auto token_i = tokens.begin(); + // const char *snp = *token_i; // first field + const char *snp = tokens[0]; // first field + // cout << snp << "!"; + // check whether SNP is included in ksnps (used by LOCO) + if (process_ksnps && ksnps.count(snp) == 0) + continue; + + token_i++; // skip snp name + token_i++; // skip nucleotide field + token_i++; // skip nucleotide field + + // calc SNP stats + geno_mean = 0.0; + n_miss = 0; + geno_var = 0.0; + gsl_vector_set_all(geno_miss, 0); + for (size_t i = 0; i < ni_total; ++i) { + enforce_str(token_i != tokens.end(), line + " number of fields"); + auto field = *token_i; + auto sfield = std::string(field); + // cout << i << ":" << sfield << "," << endl; + if (strncmp(field,"NA",2)==0) { // missing value + gsl_vector_set(geno_miss, i, 0); + n_miss++; + } else { + double d = atof(field); + if (is_strict_mode() && d == 0.0) + enforce_is_float(std::string(field)); // rule out non NA and non-float fields + gsl_vector_set(geno, i, d); + gsl_vector_set(geno_miss, i, 1); + geno_mean += d; + geno_var += d * d; + } + token_i++; + } + + geno_mean /= (double)(ni_total - n_miss); + geno_var += geno_mean * geno_mean * (double)n_miss; + geno_var /= (double)ni_total; + geno_var -= geno_mean * geno_mean; + + if (ns_test<1) { + write(geno,"geno raw"); + write(geno_mean,"geno mean"); + } + + // impute missing values by plugging in the mean + for (size_t i = 0; i < ni_total; ++i) { + if (gsl_vector_get(geno_miss, i) == 0) { + gsl_vector_set(geno, i, geno_mean); + } + } + + if (ns_test<1) write(geno,"geno imputed"); + + // subtract the mean (centering genotype values) + gsl_vector_add_constant(geno, -1.0 * geno_mean); + if (ns_test<1) write(geno,"geno mean"); + + // scale the genotypes + if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2 + // flag does this + gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); + } + + if (ns_test<1) { + write(geno_var,"geno var"); + write(geno,"geno z-scored"); + } + + // set the SNP column ns_test to copy geno into the compute matrix Xlarge + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); + enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); + + ns_test++; + + // Every msize rows batch compute kinship matrix and return + // by adding to matrix_kin + if (ns_test % msize == 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + write(matrix_kin,"K updated"); + } + } + if (ns_test % msize != 0) { // compute last batch + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + write(matrix_kin,"K updated"); + } + ProgressBar("Reading SNPs", 100, 100); + cout << endl; + + enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test)); + + gsl_vector_free(geno); + gsl_vector_free(geno_miss); + gsl_matrix_free(Xlarge); + + return true; +} + // Read bimbam mean genotype file and calculate kinship matrix. bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, @@ -1483,7 +1754,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, auto infilen = file_geno.c_str(); igzstream infile(infilen, igzstream::in); enforce_msg(infilen, "error reading genotype file"); - checkpoint("read-geno-file",file_geno); + checkpoint("bimbam-kin",file_geno); size_t n_miss; double geno_mean, geno_var; diff --git a/src/gemma_io.h b/src/gemma_io.h index 139f379..a20007a 100644 --- a/src/gemma_io.h +++ b/src/gemma_io.h @@ -87,6 +87,8 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv, void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); +int mdb_calc_kin(const string file_geno, const set<string> ksnps, + vector<int> &indicator_snp, const int mode); bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin, diff --git a/src/param.cpp b/src/param.cpp index 017d588..a720801 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -1293,28 +1293,25 @@ void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_ return; } +void PARAM::MdbCalcKin() { + error = !mdb_calc_kin(file_geno, setKSnps, indicator_snp, a_mode - 20); + return; +} + void PARAM::CalcKin(gsl_matrix *matrix_kin) { checkpoint_nofile("PARAM::CalcKin"); string file_str; - gsl_matrix_set_zero(matrix_kin); if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; - // enforce_msg(loco.empty(), "FIXME: LOCO nyi"); if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == false) { error = true; } } else { - file_str = file_geno; - if (is_mdb) - error = !BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace, - matrix_kin, ni_max == 0); - else - error = !BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace, - matrix_kin, ni_max == 0); - + error = !BimbamKin(file_geno, setKSnps, indicator_snp, a_mode - 20, d_pace, + matrix_kin, ni_max == 0); } return; diff --git a/src/param.h b/src/param.h index 38b9430..a6dfac8 100644 --- a/src/param.h +++ b/src/param.h @@ -347,6 +347,7 @@ public: void ProcessCvtPhen(); void CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag); void CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag); + void MdbCalcKin(); void CalcKin(gsl_matrix *matrix_kin); void CalcS(const map<string, double> &mapRS2wA, const map<string, double> &mapRS2wK, const gsl_matrix *W, |
