diff options
Diffstat (limited to 'src/mvlmm.cpp')
-rw-r--r-- | src/mvlmm.cpp | 551 |
1 files changed, 0 insertions, 551 deletions
diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index c5efb6e..88df111 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -54,7 +54,6 @@ void MVLMM::CopyFromParam(PARAM &cPar) { file_bfile = cPar.file_bfile; file_geno = cPar.file_geno; - file_oxford = cPar.file_oxford; file_out = cPar.file_out; path_out = cPar.path_out; @@ -2950,556 +2949,6 @@ double PCRT(const size_t mode, const size_t d_size, const double p_value, return p_crt; } -// WJA added. -void MVLMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_matrix *UtY) { - debug_msg("entering"); - string file_bgen = file_oxford + ".bgen"; - ifstream infile(file_bgen.c_str(), ios::binary); - if (!infile) { - cout << "error reading bgen file:" << file_bgen << endl; - return; - } - - clock_t time_start = clock(); - time_UtX = 0; - time_opt = 0; - - string line; - - // Create a large matrix. - size_t msize = LMM_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix_set_zero(Xlarge); - - double logl_H0 = 0.0, logl_H1 = 0.0, p_wald = 0, p_lrt = 0, p_score = 0; - double crt_a, crt_b, crt_c; - int n_miss, c_phen; - double geno, x_mean; - size_t c = 0; - size_t n_size = UtY->size1, d_size = UtY->size2, c_size = UtW->size2; - - size_t dc_size = d_size * (c_size + 1), v_size = d_size * (d_size + 1) / 2; - - // Large matrices for EM. - gsl_matrix *U_hat = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *E_hat = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *OmegaU = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *OmegaE = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *UltVehiY = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *UltVehiBX = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *UltVehiU = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *UltVehiE = gsl_matrix_alloc(d_size, n_size); - - // Large matrices for NR. Each dxd block is H_k^{-1}. - gsl_matrix *Hi_all = gsl_matrix_alloc(d_size, d_size * n_size); - - // Each column is H_k^{-1}y_k. - gsl_matrix *Hiy_all = gsl_matrix_alloc(d_size, n_size); - - // Each dcxdc block is x_k\otimes H_k^{-1}. - gsl_matrix *xHi_all = gsl_matrix_alloc(dc_size, d_size * n_size); - gsl_matrix *Hessian = gsl_matrix_alloc(v_size * 2, v_size * 2); - gsl_vector *x = gsl_vector_alloc(n_size); - gsl_vector *x_miss = gsl_vector_alloc(n_size); - - gsl_matrix *Y = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *X = gsl_matrix_alloc(c_size + 1, n_size); - gsl_matrix *V_g = gsl_matrix_alloc(d_size, d_size); - gsl_matrix *V_e = gsl_matrix_alloc(d_size, d_size); - gsl_matrix *B = gsl_matrix_alloc(d_size, c_size + 1); - gsl_vector *beta = gsl_vector_alloc(d_size); - gsl_matrix *Vbeta = gsl_matrix_alloc(d_size, d_size); - - // Null estimates for initial values. - gsl_matrix *V_g_null = gsl_matrix_alloc(d_size, d_size); - gsl_matrix *V_e_null = gsl_matrix_alloc(d_size, d_size); - gsl_matrix *B_null = gsl_matrix_alloc(d_size, c_size + 1); - gsl_matrix *se_B_null = gsl_matrix_alloc(d_size, c_size); - - gsl_matrix_view X_sub = gsl_matrix_submatrix(X, 0, 0, c_size, n_size); - gsl_matrix_view B_sub = gsl_matrix_submatrix(B, 0, 0, d_size, c_size); - gsl_matrix_view xHi_all_sub = - gsl_matrix_submatrix(xHi_all, 0, 0, d_size * c_size, d_size * n_size); - - gsl_matrix_transpose_memcpy(Y, UtY); - - gsl_matrix_transpose_memcpy(&X_sub.matrix, UtW); - - gsl_vector_view X_row = gsl_matrix_row(X, c_size); - gsl_vector_set_zero(&X_row.vector); - gsl_vector_view B_col = gsl_matrix_column(B, c_size); - gsl_vector_set_zero(&B_col.vector); - - MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, - l_max, n_region, V_g, V_e, &B_sub.matrix); - logl_H0 = MphEM('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, - OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, - V_e, &B_sub.matrix); - logl_H0 = MphNR('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, - &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, - crt_c); - MphCalcBeta(eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, - se_B_null); - - c = 0; - Vg_remle_null.clear(); - Ve_remle_null.clear(); - for (size_t i = 0; i < d_size; i++) { - for (size_t j = i; j < d_size; j++) { - Vg_remle_null.push_back(gsl_matrix_get(V_g, i, j)); - Ve_remle_null.push_back(gsl_matrix_get(V_e, i, j)); - VVg_remle_null.push_back(gsl_matrix_get(Hessian, c, c)); - VVe_remle_null.push_back(gsl_matrix_get(Hessian, c + v_size, c + v_size)); - c++; - } - } - beta_remle_null.clear(); - se_beta_remle_null.clear(); - for (size_t i = 0; i < se_B_null->size1; i++) { - for (size_t j = 0; j < se_B_null->size2; j++) { - beta_remle_null.push_back(gsl_matrix_get(B, i, j)); - se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j)); - } - } - logl_remle_H0 = logl_H0; - - cout.setf(std::ios_base::fixed, std::ios_base::floatfield); - cout.precision(4); - - cout << "REMLE estimate for Vg in the null model: " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - cout << gsl_matrix_get(V_g, i, j) << "\t"; - } - cout << endl; - } - cout << "se(Vg): " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - c = GetIndex(i, j, d_size); - cout << sqrt(gsl_matrix_get(Hessian, c, c)) << "\t"; - } - cout << endl; - } - cout << "REMLE estimate for Ve in the null model: " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - cout << gsl_matrix_get(V_e, i, j) << "\t"; - } - cout << endl; - } - cout << "se(Ve): " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - c = GetIndex(i, j, d_size); - cout << sqrt(gsl_matrix_get(Hessian, c + v_size, c + v_size)) << "\t"; - } - cout << endl; - } - cout << "REMLE likelihood = " << logl_H0 << endl; - - logl_H0 = MphEM('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, - OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, - V_e, &B_sub.matrix); - logl_H0 = MphNR('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, - &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, - crt_c); - MphCalcBeta(eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, - se_B_null); - - c = 0; - Vg_mle_null.clear(); - Ve_mle_null.clear(); - for (size_t i = 0; i < d_size; i++) { - for (size_t j = i; j < d_size; j++) { - Vg_mle_null.push_back(gsl_matrix_get(V_g, i, j)); - Ve_mle_null.push_back(gsl_matrix_get(V_e, i, j)); - VVg_mle_null.push_back(gsl_matrix_get(Hessian, c, c)); - VVe_mle_null.push_back(gsl_matrix_get(Hessian, c + v_size, c + v_size)); - c++; - } - } - beta_mle_null.clear(); - se_beta_mle_null.clear(); - for (size_t i = 0; i < se_B_null->size1; i++) { - for (size_t j = 0; j < se_B_null->size2; j++) { - beta_mle_null.push_back(gsl_matrix_get(B, i, j)); - se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j)); - } - } - logl_mle_H0 = logl_H0; - - cout << "MLE estimate for Vg in the null model: " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - cout << gsl_matrix_get(V_g, i, j) << "\t"; - } - cout << endl; - } - cout << "se(Vg): " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - c = GetIndex(i, j, d_size); - cout << sqrt(gsl_matrix_get(Hessian, c, c)) << "\t"; - } - cout << endl; - } - cout << "MLE estimate for Ve in the null model: " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - cout << gsl_matrix_get(V_e, i, j) << "\t"; - } - cout << endl; - } - cout << "se(Ve): " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - c = GetIndex(i, j, d_size); - cout << sqrt(gsl_matrix_get(Hessian, c + v_size, c + v_size)) << "\t"; - } - cout << endl; - } - cout << "MLE likelihood = " << logl_H0 << endl; - - vector<double> v_beta, v_Vg, v_Ve, v_Vbeta; - for (size_t i = 0; i < d_size; i++) { - v_beta.push_back(0.0); - } - for (size_t i = 0; i < d_size; i++) { - for (size_t j = i; j < d_size; j++) { - v_Vg.push_back(0.0); - v_Ve.push_back(0.0); - v_Vbeta.push_back(0.0); - } - } - - gsl_matrix_memcpy(V_g_null, V_g); - gsl_matrix_memcpy(V_e_null, V_e); - gsl_matrix_memcpy(B_null, B); - - // Read in header. - uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); - infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); - bgen_snp_block_offset -= 4; - infile.ignore(4 + bgen_header_length - 20); - bgen_snp_block_offset -= 4 + bgen_header_length - 20; - infile.read(reinterpret_cast<char *>(&bgen_flags), 4); - bgen_snp_block_offset -= 4; - bool CompressedSNPBlocks = bgen_flags & 0x1; - - infile.ignore(bgen_snp_block_offset); - - double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB; - double bgen_geno_prob_non_miss; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - string id; - string rs; - string chr; - std::cout << "Warning: WJA hard coded SNP missingness threshold " - << "of 10%" << std::endl; - - // Start reading genotypes and analyze. - size_t csnp = 0, t_last = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (indicator_snp[t] == 0) { - continue; - } - t_last++; - } - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); - } - if (indicator_snp[t] == 0) { - continue; - } - - // Read SNP header. - id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char *>(&bgen_N), 4); - infile.read(reinterpret_cast<char *>(&bgen_LS), 2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char *>(&bgen_LR), 2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char *>(&bgen_LC), 2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); - - infile.read(reinterpret_cast<char *>(&bgen_LA), 4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char *>(&bgen_LB), 4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - uint16_t unzipped_data[3 * bgen_N]; - - if (indicator_snp[t] == 0) { - if (CompressedSNPBlocks) - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - else - bgen_P = 6 * bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if (CompressedSNPBlocks) { - - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size = 6 * bgen_N; - - infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); - - int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), - reinterpret_cast<uLongf *>(&unzipped_data_size), - reinterpret_cast<Bytef *>(zipped_data), - static_cast<uLong>(bgen_P)); - assert(result == Z_OK); - - } else { - - bgen_P = 6 * bgen_N; - infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); - } - - x_mean = 0.0; - c_phen = 0; - n_miss = 0; - gsl_vector_set_zero(x_miss); - for (size_t i = 0; i < bgen_N; ++i) { - if (indicator_idv[i] == 0) { - continue; - } - - bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; - bgen_geno_prob_AB = - static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; - bgen_geno_prob_BB = - static_cast<double>(unzipped_data[i * 3 + 2]) / 32768.0; - - // WJA. - bgen_geno_prob_non_miss = - bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; - if (bgen_geno_prob_non_miss < 0.9) { - gsl_vector_set(x_miss, c_phen, 0.0); - n_miss++; - } else { - - bgen_geno_prob_AA /= bgen_geno_prob_non_miss; - bgen_geno_prob_AB /= bgen_geno_prob_non_miss; - bgen_geno_prob_BB /= bgen_geno_prob_non_miss; - - geno = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); - x_mean += geno; - } - c_phen++; - } - - x_mean /= static_cast<double>(ni_test - n_miss); - - for (size_t i = 0; i < ni_test; ++i) { - if (gsl_vector_get(x_miss, i) == 0) { - gsl_vector_set(x, i, x_mean); - } - } - - gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, csnp % msize); - gsl_vector_memcpy(&Xlarge_col.vector, x); - csnp++; - - if (csnp % msize == 0 || csnp == t_last) { - size_t l = 0; - if (csnp % msize == 0) { - l = msize; - } else { - l = csnp % msize; - } - - gsl_matrix_view Xlarge_sub = - gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); - gsl_matrix_view UtXlarge_sub = - gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - - time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, - &UtXlarge_sub.matrix); - time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - gsl_matrix_set_zero(Xlarge); - - for (size_t i = 0; i < l; i++) { - gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); - gsl_vector_memcpy(&X_row.vector, &UtXlarge_col.vector); - - // Initial values. - gsl_matrix_memcpy(V_g, V_g_null); - gsl_matrix_memcpy(V_e, V_e_null); - gsl_matrix_memcpy(B, B_null); - - time_start = clock(); - - // 3 is before 1. - if (a_mode == 3 || a_mode == 4) { - p_score = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, - V_e_null, UltVehiY, beta, Vbeta); - if (p_score < p_nr && crt == 1) { - logl_H1 = MphNR('R', 1, nr_prec * 10, eval, X, Y, Hi_all, xHi_all, - Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_score = PCRT(3, d_size, p_score, crt_a, crt_b, crt_c); - } - } - - if (a_mode == 2 || a_mode == 4) { - logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, X, Y, U_hat, - E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, - UltVehiE, V_g, V_e, B); - - // Calculate beta and Vbeta. - p_lrt = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, - UltVehiY, beta, Vbeta); - p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size); - - if (p_lrt < p_nr) { - logl_H1 = - MphNR('L', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all, - xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - - // Calculate beta and Vbeta. - p_lrt = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, - UltVehiY, beta, Vbeta); - p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size); - - if (crt == 1) { - p_lrt = PCRT(2, d_size, p_lrt, crt_a, crt_b, crt_c); - } - } - } - - if (a_mode == 1 || a_mode == 4) { - logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, X, Y, U_hat, - E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, - UltVehiE, V_g, V_e, B); - p_wald = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, - UltVehiY, beta, Vbeta); - - if (p_wald < p_nr) { - logl_H1 = - MphNR('R', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all, - xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_wald = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, - UltVehiY, beta, Vbeta); - - if (crt == 1) { - p_wald = PCRT(1, d_size, p_wald, crt_a, crt_b, crt_c); - } - } - } - - time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - // Store summary data. - for (size_t i = 0; i < d_size; i++) { - v_beta[i] = gsl_vector_get(beta, i); - } - - c = 0; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = i; j < d_size; j++) { - v_Vg[c] = gsl_matrix_get(V_g, i, j); - v_Ve[c] = gsl_matrix_get(V_e, i, j); - v_Vbeta[c] = gsl_matrix_get(Vbeta, i, j); - c++; - } - } - - MPHSUMSTAT SNPs = {v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; - sumStat.push_back(SNPs); - } - } - } - cout << endl; - - infile.close(); - infile.clear(); - - gsl_matrix_free(U_hat); - gsl_matrix_free(E_hat); - gsl_matrix_free(OmegaU); - gsl_matrix_free(OmegaE); - gsl_matrix_free(UltVehiY); - gsl_matrix_free(UltVehiBX); - gsl_matrix_free(UltVehiU); - gsl_matrix_free(UltVehiE); - - gsl_matrix_free(Hi_all); - gsl_matrix_free(Hiy_all); - gsl_matrix_free(xHi_all); - gsl_matrix_free(Hessian); - - gsl_vector_free(x); - gsl_vector_free(x_miss); - - gsl_matrix_free(Y); - gsl_matrix_free(X); - gsl_matrix_free(V_g); - gsl_matrix_free(V_e); - gsl_matrix_free(B); - gsl_vector_free(beta); - gsl_matrix_free(Vbeta); - - gsl_matrix_free(V_g_null); - gsl_matrix_free(V_e_null); - gsl_matrix_free(B_null); - gsl_matrix_free(se_B_null); - - gsl_matrix_free(Xlarge); - gsl_matrix_free(UtXlarge); - - return; -} - void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) { debug_msg("entering"); |