diff options
Diffstat (limited to 'src/lmm.cpp')
-rw-r--r-- | src/lmm.cpp | 203 |
1 files changed, 103 insertions, 100 deletions
diff --git a/src/lmm.cpp b/src/lmm.cpp index 3f51073..eb76265 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -80,6 +80,7 @@ void LMM::CopyFromParam(PARAM &cPar) { indicator_idv = cPar.indicator_idv; indicator_snp = cPar.indicator_snp; snpInfo = cPar.snpInfo; + setGWASnps = cPar.setGWASnps; return; } @@ -165,6 +166,7 @@ void LMM::WriteFiles() { } } } else { + bool process_gwasnps = setGWASnps.size(); outfile << "chr" << "\t" << "rs" @@ -217,9 +219,13 @@ void LMM::WriteFiles() { size_t t = 0; for (size_t i = 0; i < snpInfo.size(); ++i) { - if (indicator_snp[i] == 0) { + + if (indicator_snp[i] == 0) continue; - } + auto snp = snpInfo[i].rs_number; + if (process_gwasnps && setGWASnps.count(snp) == 0) + continue; + // cout << t << endl; outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" @@ -1311,78 +1317,126 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval, void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y) { - igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return; - } - + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps) { clock_t time_start = clock(); - string line; - char *ch_ptr; - - double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0; - double p_lrt = 0, p_score = 0; - double logl_H1 = 0.0; - int n_miss, c_phen; - double geno, x_mean; + // LOCO support + bool process_gwasnps = gwasnps.size(); + if (process_gwasnps) + debug_msg("AnalyzeBimbam w. LOCO"); // Calculate basic quantities. size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; - gsl_vector *x = gsl_vector_alloc(U->size1); - gsl_vector *x_miss = gsl_vector_alloc(U->size1); + const size_t inds = U->size1; + gsl_vector *x = gsl_vector_alloc(inds); // #inds + gsl_vector *x_miss = gsl_vector_alloc(inds); gsl_vector *Utx = gsl_vector_alloc(U->size2); gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index); gsl_vector *ab = gsl_vector_alloc(n_index); - // Create a large matrix. - size_t msize = 10000; - gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix_set_zero(Xlarge); + // Create a large matrix with LMM_BATCH_SIZE columns for batched processing + // const size_t msize=(process_gwasnps ? 1 : LMM_BATCH_SIZE); + const size_t msize = LMM_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_alloc(inds, msize); + gsl_matrix *UtXlarge = gsl_matrix_alloc(inds, msize); + enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure + gsl_matrix_set_zero(Xlarge); gsl_matrix_set_zero(Uab); CalcUab(UtW, Uty, Uab); // start reading genotypes and analyze - size_t c = 0, t_last = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (indicator_snp[t] == 0) { - continue; + size_t c = 0; + + igzstream infile(file_geno.c_str(), igzstream::in); + enforce_msg(infile, "error reading genotype file"); + + auto batch_compute = [&](size_t l) { // using a C++ closure + // Compute SNPs in batch, note the computations are independent per SNP + gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); + gsl_matrix_view UtXlarge_sub = + gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); + + time_start = clock(); + eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, + &UtXlarge_sub.matrix); + time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + gsl_matrix_set_zero(Xlarge); + for (size_t i = 0; i < l; i++) { + // for every batch... + gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); + gsl_vector_memcpy(Utx, &UtXlarge_col.vector); + + CalcUab(UtW, Uty, Utx, Uab); + + time_start = clock(); + FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; + + double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0; + double p_lrt = 0, p_score = 0; + double logl_H1 = 0.0; + + // 3 is before 1. + if (a_mode == 3 || a_mode == 4) { + CalcRLScore(l_mle_null, param1, beta, se, p_score); + } + + if (a_mode == 1 || a_mode == 4) { + // for univariate a_mode is 1 + CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald(lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode == 2 || a_mode == 4) { + CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); + } + + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + // Store summary data. + SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle, + p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); } - t_last++; - } + }; + for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + // for every SNP + string line; + safeGetline(infile, line); if (t % d_pace == 0 || t == (ns_total - 1)) { ProgressBar("Reading SNPs ", t, ns_total - 1); } - if (indicator_snp[t] == 0) { + if (indicator_snp[t] == 0) continue; - } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + char *ch_ptr = strtok((char *)line.c_str(), " , \t"); + auto snp = string(ch_ptr); + // check whether SNP is included in gwasnps (used by LOCO) + if (process_gwasnps && gwasnps.count(snp) == 0) + continue; ch_ptr = strtok(NULL, " , \t"); ch_ptr = strtok(NULL, " , \t"); - x_mean = 0.0; - c_phen = 0; - n_miss = 0; + double x_mean = 0.0; + int c_phen = 0; + int n_miss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { + // get the genotypes per individual and compute stats per SNP ch_ptr = strtok(NULL, " , \t"); - if (indicator_idv[i] == 0) { + if (indicator_idv[i] == 0) continue; - } if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(x_miss, c_phen, 0.0); n_miss++; } else { - geno = atof(ch_ptr); + double geno = atof(ch_ptr); gsl_vector_set(x, c_phen, geno); gsl_vector_set(x_miss, c_phen, 1.0); @@ -1398,66 +1452,16 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, gsl_vector_set(x, i, x_mean); } } - + // copy genotype values for SNP into Xlarge cache gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize); gsl_vector_memcpy(&Xlarge_col.vector, x); - c++; - - if (c % msize == 0 || c == t_last) { - size_t l = 0; - if (c % msize == 0) { - l = msize; - } else { - l = c % msize; - } - - gsl_matrix_view Xlarge_sub = - gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); - gsl_matrix_view UtXlarge_sub = - gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - - time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, - &UtXlarge_sub.matrix); - time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - gsl_matrix_set_zero(Xlarge); + c++; // count SNPs going in - for (size_t i = 0; i < l; i++) { - gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); - gsl_vector_memcpy(Utx, &UtXlarge_col.vector); - - CalcUab(UtW, Uty, Utx, Uab); - - time_start = clock(); - FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; - - // 3 is before 1. - if (a_mode == 3 || a_mode == 4) { - CalcRLScore(l_mle_null, param1, beta, se, p_score); - } - - if (a_mode == 1 || a_mode == 4) { - CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, - logl_H1); - CalcRLWald(lambda_remle, param1, beta, se, p_wald); - } - - if (a_mode == 2 || a_mode == 4) { - CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); - } - - time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - // Store summary data. - SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle, - p_wald, p_lrt, p_score}; - - sumStat.push_back(SNPs); - } - } + if (c == msize) + batch_compute(msize); } + batch_compute(c % msize); + // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl; cout << endl; gsl_vector_free(x); @@ -1505,7 +1509,7 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, gsl_vector *ab = gsl_vector_alloc(n_index); // Create a large matrix. - size_t msize = 10000; + size_t msize = LMM_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix_set_zero(Xlarge); @@ -1528,9 +1532,8 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, size_t c = 0, t_last = 0; for (size_t t = 0; t < snpInfo.size(); ++t) { - if (indicator_snp[t] == 0) { + if (indicator_snp[t] == 0) continue; - } t_last++; } for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) { @@ -1697,7 +1700,7 @@ void LMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval, gsl_vector *ab = gsl_vector_alloc(n_index); // Create a large matrix. - size_t msize = 10000; + size_t msize = LMM_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix_set_zero(Xlarge); |