/* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011-2017 Xiang Zhou This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include "gsl/gsl_blas.h" #include "gsl/gsl_linalg.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" #include "gsl/gsl_cdf.h" #include "gsl/gsl_integration.h" #include "gsl/gsl_min.h" #include "gsl/gsl_roots.h" #include "eigenlib.h" #include "gzstream.h" #include "lapack.h" #include "lm.h" using namespace std; void LM::CopyFromParam(PARAM &cPar) { a_mode = cPar.a_mode; d_pace = cPar.d_pace; file_bfile = cPar.file_bfile; file_geno = cPar.file_geno; file_out = cPar.file_out; path_out = cPar.path_out; file_gene = cPar.file_gene; time_opt = 0.0; ni_total = cPar.ni_total; ns_total = cPar.ns_total; ni_test = cPar.ni_test; ns_test = cPar.ns_test; n_cvt = cPar.n_cvt; ng_total = cPar.ng_total; ng_test = 0; indicator_idv = cPar.indicator_idv; indicator_snp = cPar.indicator_snp; snpInfo = cPar.snpInfo; return; } void LM::CopyToParam(PARAM &cPar) { cPar.time_opt = time_opt; cPar.ng_test = ng_test; return; } void LM::WriteFiles() { string file_str; file_str = path_out + "/" + file_out; file_str += ".assoc.txt"; ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { cout << "error writing file: " << file_str.c_str() << endl; return; } if (!file_gene.empty()) { outfile << "geneID" << "\t"; if (a_mode == 51) { outfile << "beta" << "\t" << "se" << "\t" << "p_wald" << endl; } else if (a_mode == 52) { outfile << "p_lrt" << endl; } else if (a_mode == 53) { outfile << "beta" << "\t" << "se" << "\t" << "p_score" << endl; } else if (a_mode == 54) { outfile << "beta" << "\t" << "se" << "\t" << "p_wald" << "\t" << "p_lrt" << "\t" << "p_score" << endl; } else { } for (vector::size_type t = 0; t < sumStat.size(); ++t) { outfile << snpInfo[t].rs_number << "\t"; if (a_mode == 51) { outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" << sumStat[t].se << "\t" << sumStat[t].p_wald << endl; } else if (a_mode == 52) { outfile << scientific << setprecision(6) << "\t" << sumStat[t].p_lrt << endl; } else if (a_mode == 53) { outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" << sumStat[t].se << "\t" << sumStat[t].p_score << endl; } else if (a_mode == 54) { outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" << sumStat[t].se << "\t" << sumStat[t].p_wald << "\t" << sumStat[t].p_lrt << "\t" << sumStat[t].p_score << endl; } else { } } } else { outfile << "chr" << "\t" << "rs" << "\t" << "ps" << "\t" << "n_mis" << "\t" << "n_obs" << "\t" << "allele1" << "\t" << "allele0" << "\t" << "af" << "\t"; if (a_mode == 51) { outfile << "beta" << "\t" << "se" << "\t" << "p_wald" << endl; } else if (a_mode == 52) { outfile << "p_lrt" << endl; } else if (a_mode == 53) { outfile << "beta" << "\t" << "se" << "\t" << "p_score" << endl; } else if (a_mode == 54) { outfile << "beta" << "\t" << "se" << "\t" << "p_wald" << "\t" << "p_lrt" << "\t" << "p_score" << endl; } else { } size_t t = 0; for (size_t i = 0; i < snpInfo.size(); ++i) { if (indicator_snp[i] == 0) { continue; } outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" << ni_test - snpInfo[i].n_miss << "\t" << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t" << fixed << setprecision(3) << snpInfo[i].maf << "\t"; if (a_mode == 51) { outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" << sumStat[t].se << "\t" << sumStat[t].p_wald << endl; } else if (a_mode == 52) { outfile << scientific << setprecision(6) << sumStat[t].p_lrt << endl; } else if (a_mode == 53) { outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" << sumStat[t].se << "\t" << sumStat[t].p_score << endl; } else if (a_mode == 54) { outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" << sumStat[t].se << "\t" << sumStat[t].p_wald << "\t" << sumStat[t].p_lrt << "\t" << sumStat[t].p_score << endl; } else { } t++; } } outfile.close(); outfile.clear(); return; } void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *Wtx, const gsl_vector *y, const gsl_vector *x, double &xPwy, double &xPwx) { size_t c_size = Wty->size; double d; gsl_vector *WtWiWtx = gsl_vector_alloc(c_size); gsl_blas_ddot(x, x, &xPwx); gsl_blas_ddot(x, y, &xPwy); gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot(WtWiWtx, Wtx, &d); xPwx -= d; gsl_blas_ddot(WtWiWtx, Wty, &d); xPwy -= d; gsl_vector_free(WtWiWtx); return; } void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, double &yPwy) { size_t c_size = Wty->size; double d; gsl_vector *WtWiWty = gsl_vector_alloc(c_size); gsl_blas_ddot(y, y, &yPwy); gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); gsl_blas_ddot(WtWiWty, Wty, &d); yPwy -= d; gsl_vector_free(WtWiWty); return; } // Calculate p-values and beta/se in a linear model. void LmCalcP(const size_t test_mode, const double yPwy, const double xPwy, const double xPwx, const double df, const size_t n_size, double &beta, double &se, double &p_wald, double &p_lrt, double &p_score) { double yPxy = yPwy - xPwy * xPwy / xPwx; double se_wald, se_score; beta = xPwy / xPwx; se_wald = sqrt(yPxy / (df * xPwx)); se_score = sqrt(yPwy / ((double)n_size * xPwx)); p_wald = gsl_cdf_fdist_Q(beta * beta / (se_wald * se_wald), 1.0, df); p_score = gsl_cdf_fdist_Q(beta * beta / (se_score * se_score), 1.0, df); p_lrt = gsl_cdf_chisq_Q((double)n_size * (log(yPwy) - log(yPxy)), 1); if (test_mode == 3) { se = se_score; } else { se = se_wald; } return; } void LM::AnalyzeGene(const gsl_matrix *W, const gsl_vector *x) { debug_msg("entering"); ifstream infile(file_gene.c_str(), ifstream::in); if (!infile) { cout << "error reading gene expression file:" << file_gene << endl; return; } clock_t time_start = clock(); string line; char *ch_ptr; double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0; int c_phen; string rs; // Gene id. double d; // Calculate some basic quantities. double yPwy, xPwy, xPwx; double df = (double)W->size1 - (double)W->size2 - 1.0; gsl_vector *y = gsl_vector_alloc(W->size1); gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); gsl_vector *Wty = gsl_vector_alloc(W->size2); gsl_vector *Wtx = gsl_vector_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); int sig; LUDecomp(WtW, pmt, &sig); LUInvert(WtW, pmt, WtWi); gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wtx, x, xPwx); // Header. getline(infile, line); for (size_t t = 0; t < ng_total; t++) { getline(infile, line); if (t % d_pace == 0 || t == ng_total - 1) { ProgressBar("Performing Analysis", t, ng_total - 1); } ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; c_phen = 0; for (size_t i = 0; i < indicator_idv.size(); ++i) { ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } d = atof(ch_ptr); gsl_vector_set(y, c_phen, d); c_phen++; } // Calculate statistics. time_start = clock(); gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy); LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); // Store summary data. SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0 }; sumStat.push_back(SNPs); } cout << endl; gsl_vector_free(y); gsl_matrix_free(WtW); gsl_matrix_free(WtWi); gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); infile.close(); infile.clear(); return; } void LM::AnalyzeBimbam(const gsl_matrix *W, const gsl_vector *y) { debug_msg("entering"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { cout << "error reading genotype file:" << file_geno << endl; return; } clock_t time_start = clock(); string line; char *ch_ptr; double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0; int n_miss, c_phen; double geno, x_mean; // Calculate some basic quantities. double yPwy, xPwy, xPwx; double df = (double)W->size1 - (double)W->size2 - 1.0; gsl_vector *x = gsl_vector_alloc(W->size1); gsl_vector *x_miss = gsl_vector_alloc(W->size1); gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); gsl_vector *Wty = gsl_vector_alloc(W->size2); gsl_vector *Wtx = gsl_vector_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); int sig; LUDecomp(WtW, pmt, &sig); LUInvert(WtW, pmt, WtWi); gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); // Start reading genotypes and analyze. for (size_t t = 0; t < indicator_snp.size(); ++t) { getline(infile, line); if (t % d_pace == 0 || t == (ns_total - 1)) { ProgressBar("Reading SNPs", t, ns_total - 1); } if (indicator_snp[t] == 0) { continue; } ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); ch_ptr = strtok_safe(NULL, " , \t"); ch_ptr = strtok_safe(NULL, " , \t"); x_mean = 0.0; c_phen = 0; n_miss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(x_miss, c_phen, 0.0); n_miss++; } else { geno = atof(ch_ptr); gsl_vector_set(x, c_phen, geno); gsl_vector_set(x_miss, c_phen, 1.0); x_mean += geno; } c_phen++; } x_mean /= (double)(ni_test - n_miss); for (size_t i = 0; i < ni_test; ++i) { if (gsl_vector_get(x_miss, i) == 0) { gsl_vector_set(x, i, x_mean); } geno = gsl_vector_get(x, i); } // Calculate statistics. time_start = clock(); gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); // Store summary data. SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0}; sumStat.push_back(SNPs); } cout << endl; gsl_vector_free(x); gsl_vector_free(x_miss); gsl_matrix_free(WtW); gsl_matrix_free(WtWi); gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); infile.close(); infile.clear(); return; } void LM::AnalyzePlink(const gsl_matrix *W, const gsl_vector *y) { debug_msg("entering"); string file_bed = file_bfile + ".bed"; ifstream infile(file_bed.c_str(), ios::binary); if (!infile) { cout << "error reading bed file:" << file_bed << endl; return; } clock_t time_start = clock(); char ch[1]; bitset<8> b; double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0; int n_bit, n_miss, ci_total, ci_test; double geno, x_mean; // Calculate some basic quantities. double yPwy, xPwy, xPwx; double df = (double)W->size1 - (double)W->size2 - 1.0; gsl_vector *x = gsl_vector_alloc(W->size1); gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); gsl_vector *Wty = gsl_vector_alloc(W->size2); gsl_vector *Wtx = gsl_vector_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); int sig; LUDecomp(WtW, pmt, &sig); LUInvert(WtW, pmt, WtWi); gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); // Calculate n_bit and c, the number of bit for each SNP. if (ni_total % 4 == 0) { n_bit = ni_total / 4; } else { n_bit = ni_total / 4 + 1; } // Print the first three magic numbers. for (int i = 0; i < 3; ++i) { infile.read(ch, 1); b = ch[0]; } for (vector::size_type t = 0; t < snpInfo.size(); ++t) { if (t % d_pace == 0 || t == snpInfo.size() - 1) { ProgressBar("Reading SNPs", t, snpInfo.size() - 1); } if (indicator_snp[t] == 0) { continue; } // n_bit, and 3 is the number of magic numbers. infile.seekg(t * n_bit + 3); // Read genotypes. x_mean = 0.0; n_miss = 0; ci_total = 0; ci_test = 0; for (int i = 0; i < n_bit; ++i) { infile.read(ch, 1); b = ch[0]; // Minor allele homozygous: 2.0; major: 0.0; for (size_t j = 0; j < 4; ++j) { if ((i == (n_bit - 1)) && ci_total == (int)ni_total) { break; } if (indicator_idv[ci_total] == 0) { ci_total++; continue; } if (b[2 * j] == 0) { if (b[2 * j + 1] == 0) { gsl_vector_set(x, ci_test, 2); x_mean += 2.0; } else { gsl_vector_set(x, ci_test, 1); x_mean += 1.0; } } else { if (b[2 * j + 1] == 1) { gsl_vector_set(x, ci_test, 0); } else { gsl_vector_set(x, ci_test, -9); n_miss++; } } ci_total++; ci_test++; } } x_mean /= (double)(ni_test - n_miss); for (size_t i = 0; i < ni_test; ++i) { geno = gsl_vector_get(x, i); if (geno == -9) { gsl_vector_set(x, i, x_mean); geno = x_mean; } } // Calculate statistics. time_start = clock(); gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); // store summary data SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0}; sumStat.push_back(SNPs); time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); } cout << endl; gsl_vector_free(x); gsl_matrix_free(WtW); gsl_matrix_free(WtWi); gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); infile.close(); infile.clear(); return; } // Make sure that both y and X are centered already. void MatrixCalcLmLR(const gsl_matrix *X, const gsl_vector *y, vector> &pos_loglr) { double yty, xty, xtx, log_lr; gsl_blas_ddot(y, y, &yty); for (size_t i = 0; i < X->size2; ++i) { gsl_vector_const_view X_col = gsl_matrix_const_column(X, i); gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx); gsl_blas_ddot(&X_col.vector, y, &xty); log_lr = 0.5 * (double)y->size * (log(yty) - log(yty - xty * xty / xtx)); pos_loglr.push_back(make_pair(i, log_lr)); } return; }