diff options
author | xiangzhou | 2015-07-11 12:57:37 -0400 |
---|---|---|
committer | xiangzhou | 2015-07-11 12:57:37 -0400 |
commit | b3b491cd9143d33bfebd4c5b26629573afcf0970 (patch) | |
tree | 37fc935d3e11a7b28fca4a0e4033125efb870490 /src/lm.cpp | |
parent | e6c7cc839136b84f9486b7baa18b6f4a163db7ac (diff) | |
download | pangemma-b3b491cd9143d33bfebd4c5b26629573afcf0970.tar.gz |
add GXE test
Diffstat (limited to 'src/lm.cpp')
-rw-r--r-- | src/lm.cpp | 516 |
1 files changed, 380 insertions, 136 deletions
@@ -1,17 +1,17 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ @@ -26,7 +26,7 @@ #include <cmath> #include <iostream> #include <stdio.h> -#include <stdlib.h> +#include <stdlib.h> #include <bitset> #include <cstring> @@ -57,48 +57,50 @@ using namespace std; -void LM::CopyFromParam (PARAM &cPar) +void LM::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; - + file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; file_out=cPar.file_out; path_out=cPar.path_out; file_gene=cPar.file_gene; - + // WJA added + file_oxford=cPar.file_oxford; + time_opt=0.0; - + ni_total=cPar.ni_total; ns_total=cPar.ns_total; ni_test=cPar.ni_test; ns_test=cPar.ns_test; n_cvt=cPar.n_cvt; - + ng_total=cPar.ng_total; ng_test=0; - - indicator_idv=cPar.indicator_idv; - indicator_snp=cPar.indicator_snp; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; snpInfo=cPar.snpInfo; - + return; } -void LM::CopyToParam (PARAM &cPar) +void LM::CopyToParam (PARAM &cPar) { - cPar.time_opt=time_opt; - + cPar.time_opt=time_opt; + cPar.ng_test=ng_test; - + return; } -void LM::WriteFiles () +void LM::WriteFiles () { string file_str; file_str=path_out+"/"+file_out; @@ -109,7 +111,7 @@ void LM::WriteFiles () if (!file_gene.empty()) { outfile<<"geneID"<<"\t"; - + if (a_mode==51) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl; } else if (a_mode==52) { @@ -119,10 +121,10 @@ void LM::WriteFiles () } else if (a_mode==54) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl; } else {} - - for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) { + + for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) { outfile<<snpInfo[t].rs_number<<"\t"; - + if (a_mode==51) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl; } else if (a_mode==52) { @@ -132,10 +134,10 @@ void LM::WriteFiles () } else if (a_mode==54) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl; } else {} - } + } } else { - outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t"; - + outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_mis"<<"\t"<<"n_obs"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t"; + if (a_mode==51) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl; } else if (a_mode==52) { @@ -145,13 +147,13 @@ void LM::WriteFiles () } else if (a_mode==54) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl; } else {} - + size_t t=0; for (size_t i=0; i<snpInfo.size(); ++i) { if (indicator_snp[i]==0) {continue;} - - outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t"; - + + outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<ni_test-snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t"; + if (a_mode==51) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl; } else if (a_mode==52) { @@ -164,8 +166,8 @@ void LM::WriteFiles () t++; } } - - + + outfile.close(); outfile.clear(); return; @@ -179,21 +181,21 @@ void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *Wt { size_t c_size=Wty->size; double d; - + gsl_vector *WtWiWtx=gsl_vector_alloc (c_size); - + gsl_blas_ddot (x, x, &xPwx); gsl_blas_ddot (x, y, &xPwy); - gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - - gsl_blas_ddot (WtWiWtx, Wtx, &d); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + + gsl_blas_ddot (WtWiWtx, Wtx, &d); xPwx-=d; - - gsl_blas_ddot (WtWiWtx, Wty, &d); + + gsl_blas_ddot (WtWiWtx, Wty, &d); xPwy-=d; - + gsl_vector_free (WtWiWtx); - + return; } @@ -202,17 +204,17 @@ void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, { size_t c_size=Wty->size; double d; - + gsl_vector *WtWiWty=gsl_vector_alloc (c_size); - + gsl_blas_ddot (y, y, &yPwy); - gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); - - gsl_blas_ddot (WtWiWty, Wty, &d); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); + + gsl_blas_ddot (WtWiWty, Wty, &d); yPwy-=d; - + gsl_vector_free (WtWiWty); - + return; } @@ -223,38 +225,38 @@ void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, cons { double yPxy=yPwy-xPwy*xPwy/xPwx; double se_wald, se_score; - + beta=xPwy/xPwx; se_wald=sqrt(yPxy/(df*xPwx) ); se_score=sqrt(yPwy/((double)n_size*xPwx) ); - + p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df); p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df); p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1); - + if (test_mode==3) {se=se_score;} else {se=se_wald;} - + return; } -void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) +void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) { ifstream infile (file_gene.c_str(), ifstream::in); if (!infile) {cout<<"error reading gene expression file:"<<file_gene<<endl; return;} - + clock_t time_start=clock(); - + string line; char *ch_ptr; - + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; int c_phen; string rs; //gene id double d; - + //calculate some basic quantities double yPwy, xPwy, xPwx; double df=(double)W->size1-(double)W->size2-1.0; @@ -262,7 +264,7 @@ void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) gsl_vector *y=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -274,42 +276,42 @@ void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wtx, x, xPwx); - + //header getline(infile, line); - + for (size_t t=0; t<ng_total; t++) { getline(infile, line); if (t%d_pace==0 || t==ng_total-1) {ProgressBar ("Performing Analysis ", t, ng_total-1);} ch_ptr=strtok ((char *)line.c_str(), " , \t"); rs=ch_ptr; - - c_phen=0; + + c_phen=0; for (size_t i=0; i<indicator_idv.size(); ++i) { ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[i]==0) {continue;} - - d=atof(ch_ptr); + + d=atof(ch_ptr); gsl_vector_set(y, c_phen, d); - + c_phen++; } - - //calculate statistics - time_start=clock(); - + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); - + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); } cout<<endl; - + gsl_vector_free(y); gsl_matrix_free(WtW); @@ -317,31 +319,259 @@ void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); - + infile.close(); infile.clear(); - + return; } +// WJA added +#include <assert.h> +void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y) +{ + string file_bgen=file_oxford+".bgen"; + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return;} + + + clock_t time_start=clock(); + + string line; + char *ch_ptr; + + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + int n_miss, c_phen; + double geno, x_mean; + + //calculate some basic quantities + double yPwy, xPwy, xPwx; + double df=(double)W->size1-(double)W->size2-1.0; + + gsl_vector *x=gsl_vector_alloc (W->size1); + gsl_vector *x_miss=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast<char*>(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast<char*>(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast<char*>(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; +// bool LongIds=bgen_flags&0x4; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<<std::endl; + + + + //start reading genotypes and analyze + for (size_t t=0; t<indicator_snp.size(); ++t) + { + +// if (t>1) {break;} + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + // read SNP header + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast<char*>(&bgen_N),4); + infile.read(reinterpret_cast<char*>(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast<char*>(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast<char*>(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast<char*>(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast<char*>(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast<char*>(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast<char*>(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast<size_t>(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast<char*>(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); + } + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i<bgen_N; ++i) { + if (indicator_idv[i]==0) {continue;} + + + bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; + } + c_phen++; + } + + x_mean/=static_cast<double>(ni_test-n_miss); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} + geno=gsl_vector_get(x, i); + if (x_mean>1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<<endl; + + gsl_vector_free(x); + gsl_vector_free(x_miss); + + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(Wty); + gsl_vector_free(Wtx); + gsl_permutation_free(pmt); + + infile.close(); + infile.clear(); + + return; +} + + + void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) { igzstream infile (file_geno.c_str(), igzstream::in); // ifstream infile (file_geno.c_str(), ifstream::in); if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;} - + clock_t time_start=clock(); - + string line; char *ch_ptr; - + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; int n_miss, c_phen; double geno, x_mean; - + //calculate some basic quantities double yPwy, xPwy, xPwx; double df=(double)W->size1-(double)W->size2-1.0; @@ -350,7 +580,7 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) gsl_vector *x_miss=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -362,58 +592,58 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); - - //start reading genotypes and analyze + + //start reading genotypes and analyze for (size_t t=0; t<indicator_snp.size(); ++t) { //if (t>1) {break;} getline(infile, line); if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} if (indicator_snp[t]==0) {continue;} - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); - + x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); for (size_t i=0; i<ni_total; ++i) { ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[i]==0) {continue;} - + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} else { - geno=atof(ch_ptr); - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); + geno=atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); x_mean+=geno; } c_phen++; - } - + } + x_mean/=(double)(ni_test-n_miss); - + for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); if (x_mean>1) { gsl_vector_set(x, i, 2-geno); } - } - - //calculate statistics - time_start=clock(); + } - gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout<<endl; gsl_vector_free(x); @@ -424,10 +654,10 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); - + infile.close(); infile.clear(); - + return; } @@ -437,21 +667,21 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) -void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) +void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) { string file_bed=file_bfile+".bed"; ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;} - + clock_t time_start=clock(); - + char ch[1]; - bitset<8> b; - + bitset<8> b; + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; int n_bit, n_miss, ci_total, ci_test; double geno, x_mean; - + //calculate some basic quantities double yPwy, xPwy, xPwx; double df=(double)W->size1-(double)W->size2-1.0; @@ -459,7 +689,7 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) gsl_vector *x=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -471,90 +701,104 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); - + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } - + //print the first three majic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } - - + + for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} if (indicator_snp[t]==0) {continue;} - + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - + //read genotypes - x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; + x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;} if (indicator_idv[ci_total]==0) {ci_total++; continue;} - + if (b[2*j]==0) { if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; } else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; } } else { - if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } + if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } else {gsl_vector_set(x, ci_test, -9); n_miss++; } } - + ci_total++; ci_test++; } } - + x_mean/=(double)(ni_test-n_miss); - - for (size_t i=0; i<ni_test; ++i) { + + for (size_t i=0; i<ni_test; ++i) { geno=gsl_vector_get(x,i); if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} if (x_mean>1) { gsl_vector_set(x, i, 2-geno); } } - - //calculate statistics - time_start=clock(); - + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); - CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout<<endl; - + gsl_vector_free(x); gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); + gsl_matrix_free(WtWi); gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); - + infile.close(); - infile.clear(); - + infile.clear(); + return; } + + + + + + + + + + + + + + //make sure that both y and X are centered already -void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr) +void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr) { double yty, xty, xtx, log_lr; gsl_blas_ddot(y, y, &yty); @@ -567,6 +811,6 @@ void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_ log_lr=0.5*(double)y->size*(log(yty)-log(yty-xty*xty/xtx)); pos_loglr.push_back(make_pair(i,log_lr) ); } - + return; } |