/* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_linalg.h" #include "gsl/gsl_blas.h" #include "gsl/gsl_cdf.h" #include "gsl/gsl_roots.h" #include "gsl/gsl_min.h" #include "gsl/gsl_integration.h" #include "eigenlib.h" #include "gzstream.h" #include "lapack.h" #ifdef FORCE_FLOAT #include "lm_float.h" #else #include "lm.h" #endif using namespace std; void LM::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; file_out=cPar.file_out; path_out=cPar.path_out; file_gene=cPar.file_gene; // WJA added file_oxford=cPar.file_oxford; time_opt=0.0; ni_total=cPar.ni_total; ns_total=cPar.ns_total; ni_test=cPar.ni_test; ns_test=cPar.ns_test; n_cvt=cPar.n_cvt; ng_total=cPar.ng_total; ng_test=0; indicator_idv=cPar.indicator_idv; indicator_snp=cPar.indicator_snp; snpInfo=cPar.snpInfo; return; } void LM::CopyToParam (PARAM &cPar) { cPar.time_opt=time_opt; cPar.ng_test=ng_test; return; } void LM::WriteFiles () { string file_str; file_str=path_out+"/"+file_out; file_str+=".assoc.txt"; ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) {cout<<"error writing file: "<::size_type t=0; tsize; double d; gsl_vector *WtWiWtx=gsl_vector_alloc (c_size); gsl_blas_ddot (x, x, &xPwx); gsl_blas_ddot (x, y, &xPwy); gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot (WtWiWtx, Wtx, &d); xPwx-=d; gsl_blas_ddot (WtWiWtx, Wty, &d); xPwy-=d; gsl_vector_free (WtWiWtx); return; } void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, double &yPwy) { size_t c_size=Wty->size; double d; gsl_vector *WtWiWty=gsl_vector_alloc (c_size); gsl_blas_ddot (y, y, &yPwy); gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); gsl_blas_ddot (WtWiWty, Wty, &d); yPwy-=d; gsl_vector_free (WtWiWty); return; } //calculate p values and beta/se in a linear model void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, const double xPwx, const double df, const size_t n_size, double &beta, double &se, double &p_wald, double &p_lrt, double &p_score) { double yPxy=yPwy-xPwy*xPwy/xPwx; double se_wald, se_score; beta=xPwy/xPwx; se_wald=sqrt(yPxy/(df*xPwx) ); se_score=sqrt(yPwy/((double)n_size*xPwx) ); p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df); p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df); p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1); if (test_mode==3) {se=se_score;} else {se=se_wald;} return; } void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) { ifstream infile (file_gene.c_str(), ifstream::in); if (!infile) {cout<<"error reading gene expression file:"<size1-(double)W->size2-1.0; gsl_vector *y=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wtx, x, xPwx); //header getline(infile, line); for (size_t t=0; tsize1, beta, se, p_wald, p_lrt, p_score); time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); } cout< void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y) { string file_bgen=file_oxford+".bgen"; ifstream infile (file_bgen.c_str(), ios::binary); if (!infile) {cout<<"error reading bgen file:"<size1-(double)W->size2-1.0; gsl_vector *x=gsl_vector_alloc (W->size1); gsl_vector *x_miss=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); // read in header uint32_t bgen_snp_block_offset; uint32_t bgen_header_length; uint32_t bgen_nsamples; uint32_t bgen_nsnps; uint32_t bgen_flags; infile.read(reinterpret_cast(&bgen_snp_block_offset),4); infile.read(reinterpret_cast(&bgen_header_length),4); bgen_snp_block_offset-=4; infile.read(reinterpret_cast(&bgen_nsnps),4); bgen_snp_block_offset-=4; infile.read(reinterpret_cast(&bgen_nsamples),4); bgen_snp_block_offset-=4; infile.ignore(4+bgen_header_length-20); bgen_snp_block_offset-=4+bgen_header_length-20; infile.read(reinterpret_cast(&bgen_flags),4); bgen_snp_block_offset-=4; bool CompressedSNPBlocks=bgen_flags&0x1; // bool LongIds=bgen_flags&0x4; infile.ignore(bgen_snp_block_offset); double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; uint32_t bgen_N; uint16_t bgen_LS; uint16_t bgen_LR; uint16_t bgen_LC; uint32_t bgen_SNP_pos; uint32_t bgen_LA; std::string bgen_A_allele; uint32_t bgen_LB; std::string bgen_B_allele; uint32_t bgen_P; size_t unzipped_data_size; string id; string rs; string chr; std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<1) {break;} if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} // read SNP header id.clear(); rs.clear(); chr.clear(); bgen_A_allele.clear(); bgen_B_allele.clear(); infile.read(reinterpret_cast(&bgen_N),4); infile.read(reinterpret_cast(&bgen_LS),2); id.resize(bgen_LS); infile.read(&id[0], bgen_LS); infile.read(reinterpret_cast(&bgen_LR),2); rs.resize(bgen_LR); infile.read(&rs[0], bgen_LR); infile.read(reinterpret_cast(&bgen_LC),2); chr.resize(bgen_LC); infile.read(&chr[0], bgen_LC); infile.read(reinterpret_cast(&bgen_SNP_pos),4); infile.read(reinterpret_cast(&bgen_LA),4); bgen_A_allele.resize(bgen_LA); infile.read(&bgen_A_allele[0], bgen_LA); infile.read(reinterpret_cast(&bgen_LB),4); bgen_B_allele.resize(bgen_LB); infile.read(&bgen_B_allele[0], bgen_LB); uint16_t unzipped_data[3*bgen_N]; if (indicator_snp[t]==0) { if(CompressedSNPBlocks) infile.read(reinterpret_cast(&bgen_P),4); else bgen_P=6*bgen_N; infile.ignore(static_cast(bgen_P)); continue; } if(CompressedSNPBlocks) { infile.read(reinterpret_cast(&bgen_P),4); uint8_t zipped_data[bgen_P]; unzipped_data_size=6*bgen_N; infile.read(reinterpret_cast(zipped_data),bgen_P); int result=uncompress(reinterpret_cast(unzipped_data), reinterpret_cast(&unzipped_data_size), reinterpret_cast(zipped_data), static_cast (bgen_P)); assert(result == Z_OK); } else { bgen_P=6*bgen_N; infile.read(reinterpret_cast(unzipped_data),bgen_P); } x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); for (size_t i=0; i(unzipped_data[i*3])/32768.0; bgen_geno_prob_AB=static_cast(unzipped_data[i*3+1])/32768.0; bgen_geno_prob_BB=static_cast(unzipped_data[i*3+2])/32768.0; // WJA bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} else { bgen_geno_prob_AA/=bgen_geno_prob_non_miss; bgen_geno_prob_AB/=bgen_geno_prob_non_miss; bgen_geno_prob_BB/=bgen_geno_prob_non_miss; geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; gsl_vector_set(x, c_phen, geno); gsl_vector_set(x_miss, c_phen, 1.0); x_mean+=geno; } c_phen++; } x_mean/=static_cast(ni_test-n_miss); for (size_t i=0; i1) { //gsl_vector_set(x, i, 2-geno); //} } //calculate statistics time_start=clock(); gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); } cout<size1-(double)W->size2-1.0; gsl_vector *x=gsl_vector_alloc (W->size1); gsl_vector *x_miss=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); //start reading genotypes and analyze for (size_t t=0; t1) {break;} getline(infile, line); if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} if (indicator_snp[t]==0) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); for (size_t i=0; i1) { //gsl_vector_set(x, i, 2-geno); //} } //calculate statistics time_start=clock(); gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); } cout< b; double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; int n_bit, n_miss, ci_total, ci_test; double geno, x_mean; //calculate some basic quantities double yPwy, xPwy, xPwx; double df=(double)W->size1-(double)W->size2-1.0; gsl_vector *x=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } //print the first three majic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } for (vector::size_type t=0; t1) { //gsl_vector_set(x, i, 2-geno); //} } //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } cout< > &pos_loglr) { double yty, xty, xtx, log_lr; gsl_blas_ddot(y, y, &yty); for (size_t i=0; isize2; ++i) { gsl_vector_const_view X_col=gsl_matrix_const_column (X, i); gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx); gsl_blas_ddot(&X_col.vector, y, &xty); log_lr=0.5*(double)y->size*(log(yty)-log(yty-xty*xty/xtx)); pos_loglr.push_back(make_pair(i,log_lr) ); } return; }