aboutsummaryrefslogtreecommitdiff
path: root/src/lmm.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/lmm.cpp')
-rw-r--r--src/lmm.cpp203
1 files changed, 103 insertions, 100 deletions
diff --git a/src/lmm.cpp b/src/lmm.cpp
index 3f51073..eb76265 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -80,6 +80,7 @@ void LMM::CopyFromParam(PARAM &cPar) {
indicator_idv = cPar.indicator_idv;
indicator_snp = cPar.indicator_snp;
snpInfo = cPar.snpInfo;
+ setGWASnps = cPar.setGWASnps;
return;
}
@@ -165,6 +166,7 @@ void LMM::WriteFiles() {
}
}
} else {
+ bool process_gwasnps = setGWASnps.size();
outfile << "chr"
<< "\t"
<< "rs"
@@ -217,9 +219,13 @@ void LMM::WriteFiles() {
size_t t = 0;
for (size_t i = 0; i < snpInfo.size(); ++i) {
- if (indicator_snp[i] == 0) {
+
+ if (indicator_snp[i] == 0)
continue;
- }
+ auto snp = snpInfo[i].rs_number;
+ if (process_gwasnps && setGWASnps.count(snp) == 0)
+ continue;
+ // cout << t << endl;
outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
<< snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t"
@@ -1311,78 +1317,126 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval,
void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
const gsl_matrix *UtW, const gsl_vector *Uty,
- const gsl_matrix *W, const gsl_vector *y) {
- igzstream infile(file_geno.c_str(), igzstream::in);
- if (!infile) {
- cout << "error reading genotype file:" << file_geno << endl;
- return;
- }
-
+ const gsl_matrix *W, const gsl_vector *y,
+ const set<string> gwasnps) {
clock_t time_start = clock();
- string line;
- char *ch_ptr;
-
- double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0;
- double p_lrt = 0, p_score = 0;
- double logl_H1 = 0.0;
- int n_miss, c_phen;
- double geno, x_mean;
+ // LOCO support
+ bool process_gwasnps = gwasnps.size();
+ if (process_gwasnps)
+ debug_msg("AnalyzeBimbam w. LOCO");
// Calculate basic quantities.
size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2;
- gsl_vector *x = gsl_vector_alloc(U->size1);
- gsl_vector *x_miss = gsl_vector_alloc(U->size1);
+ const size_t inds = U->size1;
+ gsl_vector *x = gsl_vector_alloc(inds); // #inds
+ gsl_vector *x_miss = gsl_vector_alloc(inds);
gsl_vector *Utx = gsl_vector_alloc(U->size2);
gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index);
gsl_vector *ab = gsl_vector_alloc(n_index);
- // Create a large matrix.
- size_t msize = 10000;
- gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
- gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
- gsl_matrix_set_zero(Xlarge);
+ // Create a large matrix with LMM_BATCH_SIZE columns for batched processing
+ // const size_t msize=(process_gwasnps ? 1 : LMM_BATCH_SIZE);
+ const size_t msize = LMM_BATCH_SIZE;
+ gsl_matrix *Xlarge = gsl_matrix_alloc(inds, msize);
+ gsl_matrix *UtXlarge = gsl_matrix_alloc(inds, msize);
+ enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure
+ gsl_matrix_set_zero(Xlarge);
gsl_matrix_set_zero(Uab);
CalcUab(UtW, Uty, Uab);
// start reading genotypes and analyze
- size_t c = 0, t_last = 0;
- for (size_t t = 0; t < indicator_snp.size(); ++t) {
- if (indicator_snp[t] == 0) {
- continue;
+ size_t c = 0;
+
+ igzstream infile(file_geno.c_str(), igzstream::in);
+ enforce_msg(infile, "error reading genotype file");
+
+ auto batch_compute = [&](size_t l) { // using a C++ closure
+ // Compute SNPs in batch, note the computations are independent per SNP
+ gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l);
+ gsl_matrix_view UtXlarge_sub =
+ gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l);
+
+ time_start = clock();
+ eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0,
+ &UtXlarge_sub.matrix);
+ time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
+
+ gsl_matrix_set_zero(Xlarge);
+ for (size_t i = 0; i < l; i++) {
+ // for every batch...
+ gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
+ gsl_vector_memcpy(Utx, &UtXlarge_col.vector);
+
+ CalcUab(UtW, Uty, Utx, Uab);
+
+ time_start = clock();
+ FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0};
+
+ double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0;
+ double p_lrt = 0, p_score = 0;
+ double logl_H1 = 0.0;
+
+ // 3 is before 1.
+ if (a_mode == 3 || a_mode == 4) {
+ CalcRLScore(l_mle_null, param1, beta, se, p_score);
+ }
+
+ if (a_mode == 1 || a_mode == 4) {
+ // for univariate a_mode is 1
+ CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+ CalcRLWald(lambda_remle, param1, beta, se, p_wald);
+ }
+
+ if (a_mode == 2 || a_mode == 4) {
+ CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+ p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
+ }
+
+ time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
+
+ // Store summary data.
+ SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle,
+ p_wald, p_lrt, p_score};
+ sumStat.push_back(SNPs);
}
- t_last++;
- }
+ };
+
for (size_t t = 0; t < indicator_snp.size(); ++t) {
- !safeGetline(infile, line).eof();
+ // for every SNP
+ string line;
+ safeGetline(infile, line);
if (t % d_pace == 0 || t == (ns_total - 1)) {
ProgressBar("Reading SNPs ", t, ns_total - 1);
}
- if (indicator_snp[t] == 0) {
+ if (indicator_snp[t] == 0)
continue;
- }
- ch_ptr = strtok((char *)line.c_str(), " , \t");
+ char *ch_ptr = strtok((char *)line.c_str(), " , \t");
+ auto snp = string(ch_ptr);
+ // check whether SNP is included in gwasnps (used by LOCO)
+ if (process_gwasnps && gwasnps.count(snp) == 0)
+ continue;
ch_ptr = strtok(NULL, " , \t");
ch_ptr = strtok(NULL, " , \t");
- x_mean = 0.0;
- c_phen = 0;
- n_miss = 0;
+ double x_mean = 0.0;
+ int c_phen = 0;
+ int n_miss = 0;
gsl_vector_set_zero(x_miss);
for (size_t i = 0; i < ni_total; ++i) {
+ // get the genotypes per individual and compute stats per SNP
ch_ptr = strtok(NULL, " , \t");
- if (indicator_idv[i] == 0) {
+ if (indicator_idv[i] == 0)
continue;
- }
if (strcmp(ch_ptr, "NA") == 0) {
gsl_vector_set(x_miss, c_phen, 0.0);
n_miss++;
} else {
- geno = atof(ch_ptr);
+ double geno = atof(ch_ptr);
gsl_vector_set(x, c_phen, geno);
gsl_vector_set(x_miss, c_phen, 1.0);
@@ -1398,66 +1452,16 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
gsl_vector_set(x, i, x_mean);
}
}
-
+ // copy genotype values for SNP into Xlarge cache
gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize);
gsl_vector_memcpy(&Xlarge_col.vector, x);
- c++;
-
- if (c % msize == 0 || c == t_last) {
- size_t l = 0;
- if (c % msize == 0) {
- l = msize;
- } else {
- l = c % msize;
- }
-
- gsl_matrix_view Xlarge_sub =
- gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
- gsl_matrix_view UtXlarge_sub =
- gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);
-
- time_start = clock();
- eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0,
- &UtXlarge_sub.matrix);
- time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
-
- gsl_matrix_set_zero(Xlarge);
+ c++; // count SNPs going in
- for (size_t i = 0; i < l; i++) {
- gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
- gsl_vector_memcpy(Utx, &UtXlarge_col.vector);
-
- CalcUab(UtW, Uty, Utx, Uab);
-
- time_start = clock();
- FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0};
-
- // 3 is before 1.
- if (a_mode == 3 || a_mode == 4) {
- CalcRLScore(l_mle_null, param1, beta, se, p_score);
- }
-
- if (a_mode == 1 || a_mode == 4) {
- CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle,
- logl_H1);
- CalcRLWald(lambda_remle, param1, beta, se, p_wald);
- }
-
- if (a_mode == 2 || a_mode == 4) {
- CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
- p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
- }
-
- time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
-
- // Store summary data.
- SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle,
- p_wald, p_lrt, p_score};
-
- sumStat.push_back(SNPs);
- }
- }
+ if (c == msize)
+ batch_compute(msize);
}
+ batch_compute(c % msize);
+ // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl;
cout << endl;
gsl_vector_free(x);
@@ -1505,7 +1509,7 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
gsl_vector *ab = gsl_vector_alloc(n_index);
// Create a large matrix.
- size_t msize = 10000;
+ size_t msize = LMM_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix_set_zero(Xlarge);
@@ -1528,9 +1532,8 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
size_t c = 0, t_last = 0;
for (size_t t = 0; t < snpInfo.size(); ++t) {
- if (indicator_snp[t] == 0) {
+ if (indicator_snp[t] == 0)
continue;
- }
t_last++;
}
for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) {
@@ -1697,7 +1700,7 @@ void LMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval,
gsl_vector *ab = gsl_vector_alloc(n_index);
// Create a large matrix.
- size_t msize = 10000;
+ size_t msize = LMM_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix_set_zero(Xlarge);