diff options
| author | Pjotr Prins | 2025-12-01 16:48:13 +0100 |
|---|---|---|
| committer | Pjotr Prins | 2025-12-01 16:48:13 +0100 |
| commit | a144f052139b3c4e6fa761ffb0a1e435bd1bad22 (patch) | |
| tree | e36483a8745b556860c710d9bdfe13c310353847 /src | |
| parent | 16a0419f5bd2e5122a57ae557b556b3cbe6b402b (diff) | |
| download | pangemma-a144f052139b3c4e6fa761ffb0a1e435bd1bad22.tar.gz | |
Code loads markers - getting ready to compute
Diffstat (limited to 'src')
| -rw-r--r-- | src/lmm.cpp | 90 | ||||
| -rw-r--r-- | src/lmm.h | 8 |
2 files changed, 52 insertions, 46 deletions
diff --git a/src/lmm.cpp b/src/lmm.cpp index ee7ba42..d015988 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -1864,18 +1864,19 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } /* - This is the mirror function of below AnalyzeBimbam, but uses mdb input instead. + This is the mirror function of LMM::Analyze and AnalyzeBimbam, but uses mdb input instead. */ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp, - const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y, - const set<string> gwasnps) { + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps, + size_t num_markers) { clock_t time_start = clock(); - checkpoint_nofile("start-lmm-analyze"); + checkpoint_nofile("start-lmm-mdb-analyze"); // Subset/LOCO support bool process_gwasnps = gwasnps.size(); @@ -1884,10 +1885,16 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp, const size_t inds = U->size1; enforce(inds == ni_test); + assert(inds > 0); + gsl_vector *x = gsl_vector_safe_alloc(inds); // #inds gsl_vector *x_miss = gsl_vector_safe_alloc(inds); - gsl_vector *Utx = gsl_vector_safe_alloc(U->size2); - gsl_matrix *Uab = gsl_matrix_safe_alloc(U->size2, n_index); + assert(ni_test == U->size2); + assert(ni_test > 0); + assert(ni_total > 0); + assert(n_index > 0); + gsl_vector *Utx = gsl_vector_safe_alloc(ni_test); + gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index); gsl_vector *ab = gsl_vector_safe_alloc(n_index); // Create a large matrix with LMM_BATCH_SIZE columns for batched processing @@ -1905,7 +1912,7 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp, size_t c = 0; /* - batch_compute(l) takes l SNPs that have been loaded into Xlarge, + batch_compute(l) takes l x SNPs that have been loaded into Xlarge, transforms them all at once using the eigenvector matrix U, then loops through each transformed SNP to compute association statistics (beta, standard errors, p-values) and stores results in @@ -1913,8 +1920,9 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp, */ auto batch_compute = [&](size_t l) { // using a C++ closure // Compute SNPs in batch, note the computations are independent per SNP - // debug_msg("enter batch_compute"); - checkpoint_nofile("\nstart-batch_compute"); + debug_msg("enter batch_compute"); + assert(l>0); + assert(inds>0); gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); gsl_matrix_view UtXlarge_sub = gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); @@ -1973,33 +1981,36 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp, p_wald, p_lrt, p_score, logl_H1}; sumStat.push_back(SNPs); } - // debug_msg("exit batch_compute"); - checkpoint_nofile("end-batch_compute"); }; - const auto num_snps = indicator_snp.size(); - enforce_msg(num_snps > 0,"Zero SNPs to process - data corrupt?"); - if (num_snps < 50) { - cerr << num_snps << " SNPs" << endl; + /* + const auto num_markers = indicator_snp.size(); + enforce_msg(num_markers > 0,"Zero SNPs to process - data corrupt?"); + if (num_markers < 50) { + cerr << num_markers << " SNPs" << endl; warning_msg("very few SNPs processed"); } - const size_t progress_step = (num_snps/50>d_pace ? num_snps/50 : d_pace); + */ + const size_t progress_step = (num_markers/50>d_pace ? num_markers/50 : d_pace); - for (size_t t = 0; t < num_snps; ++t) { - if (t % progress_step == 0 || t == (num_snps - 1)) { - ProgressBar("Reading SNPs", t, num_snps - 1); + assert(num_markers > 0); + for (size_t t = 0; t < num_markers; ++t) { + if (t % progress_step == 0 || t == (num_markers - 1)) { + ProgressBar("Reading SNPs", t, num_markers - 1); } - if (indicator_snp[t] == 0) - continue; + // if (indicator_snp[t] == 0) + // continue; - auto tup = fetch_snp(t); + auto tup = fetch_snp(t); // use the call back auto snp = get<0>(tup); auto gs = get<1>(tup); + cout << "SNP: " << snp << endl; // check whether SNP is included in gwasnps (used by LOCO) + /* if (process_gwasnps && gwasnps.count(snp) == 0) continue; - + */ // drop missing idv and plug mean values for missing geno double x_total = 0.0; // sum genotype values to compute x_mean uint pos = 0; // position in target vector @@ -2031,18 +2042,6 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } } - /* this is what below GxE does - for (size_t i = 0; i < ni_test; ++i) { - auto geno = gsl_vector_get(x, i); - if (std::isnan(geno)) { - gsl_vector_set(x, i, x_mean); - geno = x_mean; - } - if (x_mean > 1.0) { - gsl_vector_set(x, i, 2 - geno); - } - } - */ enforce(x->size == ni_test); // copy genotype values for SNP into Xlarge cache @@ -2055,11 +2054,12 @@ void LMM::mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } } - batch_compute(c % msize); - ProgressBar("Reading SNPS", num_snps - 1, num_snps - 1); + if (c % msize) + batch_compute(c % msize); + ProgressBar("Reading SNPS", num_markers - 1, num_markers - 1); // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl; cout << endl; - checkpoint_nofile("end-lmm-analyze"); + checkpoint_nofile("end-lmm-mdb-analyze"); gsl_vector_safe_free(x); gsl_vector_safe_free(x_miss); @@ -2089,6 +2089,12 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); + + MDB_stat stat; + mdb_stat(rtxn, geno_mdb, &stat); + cout << "Number of records: " << stat.ms_entries << endl; + auto num_markers = stat.ms_entries; + // fetch_snp is a callback function for every SNP row // returns typedef std::tuple<string,std::vector<double> > SnpNameValues; @@ -2112,12 +2118,12 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, const float* gsbuf = reinterpret_cast<const float*>(value.data()); auto snp = string(key); std::vector<double> gs(gsbuf,gsbuf+sizeof(float)*value.size()); - cout << "!!!!" << snp << endl; + cout << "!!!!" << snp << endl << endl; return std::make_tuple(snp,gs); }; - LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps); + LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps,num_markers); } diff --git a/src/lmm.h b/src/lmm.h index c628baa..29e6341 100644 --- a/src/lmm.h +++ b/src/lmm.h @@ -101,10 +101,10 @@ public: const gsl_matrix *W, const gsl_vector *y, const set<string> gwasnps); void mdb_analyze(std::function< SnpNameValues(size_t) >& fetch_snp, - const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y, - const set<string> gwasnps); + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps, size_t num_markers); void mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, |
