diff options
Diffstat (limited to 'src/lm.cpp')
-rw-r--r-- | src/lm.cpp | 1500 |
1 files changed, 776 insertions, 724 deletions
@@ -16,28 +16,28 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#include <iostream> #include <fstream> +#include <iostream> #include <sstream> -#include <iomanip> +#include <assert.h> +#include <bitset> #include <cmath> +#include <cstring> +#include <iomanip> #include <iostream> #include <stdio.h> #include <stdlib.h> -#include <assert.h> -#include <bitset> -#include <cstring> -#include "gsl/gsl_vector.h" -#include "gsl/gsl_matrix.h" -#include "gsl/gsl_linalg.h" #include "gsl/gsl_blas.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_vector.h" #include "gsl/gsl_cdf.h" -#include "gsl/gsl_roots.h" -#include "gsl/gsl_min.h" #include "gsl/gsl_integration.h" +#include "gsl/gsl_min.h" +#include "gsl/gsl_roots.h" #include "eigenlib.h" #include "gzstream.h" @@ -46,783 +46,835 @@ using namespace std; -void LM::CopyFromParam (PARAM &cPar) { - a_mode=cPar.a_mode; - d_pace=cPar.d_pace; +void LM::CopyFromParam(PARAM &cPar) { + a_mode = cPar.a_mode; + d_pace = cPar.d_pace; - file_bfile=cPar.file_bfile; - file_geno=cPar.file_geno; - file_out=cPar.file_out; - path_out=cPar.path_out; - file_gene=cPar.file_gene; - // WJA added - file_oxford=cPar.file_oxford; + file_bfile = cPar.file_bfile; + file_geno = cPar.file_geno; + file_out = cPar.file_out; + path_out = cPar.path_out; + file_gene = cPar.file_gene; + // WJA added + file_oxford = cPar.file_oxford; - time_opt=0.0; + time_opt = 0.0; - ni_total=cPar.ni_total; - ns_total=cPar.ns_total; - ni_test=cPar.ni_test; - ns_test=cPar.ns_test; - n_cvt=cPar.n_cvt; + ni_total = cPar.ni_total; + ns_total = cPar.ns_total; + ni_test = cPar.ni_test; + ns_test = cPar.ns_test; + n_cvt = cPar.n_cvt; - ng_total=cPar.ng_total; - ng_test=0; + ng_total = cPar.ng_total; + ng_test = 0; - indicator_idv=cPar.indicator_idv; - indicator_snp=cPar.indicator_snp; - snpInfo=cPar.snpInfo; + indicator_idv = cPar.indicator_idv; + indicator_snp = cPar.indicator_snp; + snpInfo = cPar.snpInfo; - return; + return; } -void LM::CopyToParam (PARAM &cPar) { - cPar.time_opt=time_opt; - cPar.ng_test=ng_test; - return; +void LM::CopyToParam(PARAM &cPar) { + cPar.time_opt = time_opt; + cPar.ng_test = ng_test; + return; } -void LM::WriteFiles () { - string file_str; - file_str=path_out+"/"+file_out; - file_str+=".assoc.txt"; - - ofstream outfile (file_str.c_str(), ofstream::out); - if (!outfile) { - cout << "error writing file: " << file_str.c_str() << endl; - return; - } - - if (!file_gene.empty()) { - outfile<<"geneID"<<"\t"; - - if (a_mode==51) { - outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl; - } else if (a_mode==52) { - outfile<<"p_lrt"<<endl; - } else if (a_mode==53) { - outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl; - } else if (a_mode==54) { - outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<< - "\t"<<"p_lrt"<<"\t"<<"p_score"<<endl; - } else {} - - for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) { - outfile<<snpInfo[t].rs_number<<"\t"; - - if (a_mode==51) { - outfile<<scientific<<setprecision(6)<< - sumStat[t].beta<<"\t"<<sumStat[t].se<< - "\t"<<sumStat[t].p_wald <<endl; - } else if (a_mode==52) { - outfile<<scientific<<setprecision(6)<< - "\t"<<sumStat[t].p_lrt<<endl; - } else if (a_mode==53) { - outfile<<scientific<<setprecision(6)<< - sumStat[t].beta<<"\t"<<sumStat[t].se<< - "\t"<<sumStat[t].p_score<<endl; - } else if (a_mode==54) { - outfile<<scientific<<setprecision(6)<< - sumStat[t].beta<<"\t"<<sumStat[t].se<< - "\t"<<sumStat[t].p_wald <<"\t"<< - sumStat[t].p_lrt<<"\t"<< - sumStat[t].p_score<<endl; - } else {} - } - } else { - outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_mis"<< - "\t"<<"n_obs"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<< - "af"<<"\t"; - - if (a_mode==51) { - outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl; - } else if (a_mode==52) { - outfile<<"p_lrt"<<endl; - } else if (a_mode==53) { - outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl; - } else if (a_mode==54) { - outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t" - <<"p_lrt"<<"\t"<<"p_score"<<endl; - } else {} - - size_t t=0; - for (size_t i=0; i<snpInfo.size(); ++i) { - if (indicator_snp[i]==0) {continue;} - - outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<< - "\t"<<snpInfo[i].base_position<<"\t"<< - snpInfo[i].n_miss<<"\t"<<ni_test-snpInfo[i].n_miss<< - "\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<< - "\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t"; - - if (a_mode==51) { - outfile<<scientific<<setprecision(6)<< - sumStat[t].beta<<"\t"<<sumStat[t].se<< - "\t"<<sumStat[t].p_wald <<endl; - } else if (a_mode==52) { - outfile<<scientific<<setprecision(6)<< - sumStat[t].p_lrt<<endl; - } else if (a_mode==53) { - outfile<<scientific<<setprecision(6)<< - sumStat[t].beta<<"\t"<<sumStat[t].se<< - "\t"<<sumStat[t].p_score<<endl; - } else if (a_mode==54) { - outfile<<scientific<<setprecision(6)<< - sumStat[t].beta<<"\t"<<sumStat[t].se<< - "\t"<<sumStat[t].p_wald <<"\t"<< - sumStat[t].p_lrt<<"\t"<< - sumStat[t].p_score<<endl; - } else {} - t++; - } - } - - outfile.close(); - outfile.clear(); - return; +void LM::WriteFiles() { + string file_str; + file_str = path_out + "/" + file_out; + file_str += ".assoc.txt"; + + ofstream outfile(file_str.c_str(), ofstream::out); + if (!outfile) { + cout << "error writing file: " << file_str.c_str() << endl; + return; + } + + if (!file_gene.empty()) { + outfile << "geneID" + << "\t"; + + if (a_mode == 51) { + outfile << "beta" + << "\t" + << "se" + << "\t" + << "p_wald" << endl; + } else if (a_mode == 52) { + outfile << "p_lrt" << endl; + } else if (a_mode == 53) { + outfile << "beta" + << "\t" + << "se" + << "\t" + << "p_score" << endl; + } else if (a_mode == 54) { + outfile << "beta" + << "\t" + << "se" + << "\t" + << "p_wald" + << "\t" + << "p_lrt" + << "\t" + << "p_score" << endl; + } else { + } + + for (vector<SUMSTAT>::size_type t = 0; t < sumStat.size(); ++t) { + outfile << snpInfo[t].rs_number << "\t"; + + if (a_mode == 51) { + outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" + << sumStat[t].se << "\t" << sumStat[t].p_wald << endl; + } else if (a_mode == 52) { + outfile << scientific << setprecision(6) << "\t" << sumStat[t].p_lrt + << endl; + } else if (a_mode == 53) { + outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" + << sumStat[t].se << "\t" << sumStat[t].p_score << endl; + } else if (a_mode == 54) { + outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" + << sumStat[t].se << "\t" << sumStat[t].p_wald << "\t" + << sumStat[t].p_lrt << "\t" << sumStat[t].p_score << endl; + } else { + } + } + } else { + outfile << "chr" + << "\t" + << "rs" + << "\t" + << "ps" + << "\t" + << "n_mis" + << "\t" + << "n_obs" + << "\t" + << "allele1" + << "\t" + << "allele0" + << "\t" + << "af" + << "\t"; + + if (a_mode == 51) { + outfile << "beta" + << "\t" + << "se" + << "\t" + << "p_wald" << endl; + } else if (a_mode == 52) { + outfile << "p_lrt" << endl; + } else if (a_mode == 53) { + outfile << "beta" + << "\t" + << "se" + << "\t" + << "p_score" << endl; + } else if (a_mode == 54) { + outfile << "beta" + << "\t" + << "se" + << "\t" + << "p_wald" + << "\t" + << "p_lrt" + << "\t" + << "p_score" << endl; + } else { + } + + size_t t = 0; + for (size_t i = 0; i < snpInfo.size(); ++i) { + if (indicator_snp[i] == 0) { + continue; + } + + outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" + << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" + << ni_test - snpInfo[i].n_miss << "\t" << snpInfo[i].a_minor + << "\t" << snpInfo[i].a_major << "\t" << fixed << setprecision(3) + << snpInfo[i].maf << "\t"; + + if (a_mode == 51) { + outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" + << sumStat[t].se << "\t" << sumStat[t].p_wald << endl; + } else if (a_mode == 52) { + outfile << scientific << setprecision(6) << sumStat[t].p_lrt << endl; + } else if (a_mode == 53) { + outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" + << sumStat[t].se << "\t" << sumStat[t].p_score << endl; + } else if (a_mode == 54) { + outfile << scientific << setprecision(6) << sumStat[t].beta << "\t" + << sumStat[t].se << "\t" << sumStat[t].p_wald << "\t" + << sumStat[t].p_lrt << "\t" << sumStat[t].p_score << endl; + } else { + } + t++; + } + } + + outfile.close(); + outfile.clear(); + return; } void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, - const gsl_vector *Wtx, const gsl_vector *y, - const gsl_vector *x, double &xPwy, double &xPwx) { - size_t c_size=Wty->size; - double d; + const gsl_vector *Wtx, const gsl_vector *y, const gsl_vector *x, + double &xPwy, double &xPwx) { + size_t c_size = Wty->size; + double d; - gsl_vector *WtWiWtx=gsl_vector_alloc (c_size); + gsl_vector *WtWiWtx = gsl_vector_alloc(c_size); - gsl_blas_ddot (x, x, &xPwx); - gsl_blas_ddot (x, y, &xPwy); - gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot(x, x, &xPwx); + gsl_blas_ddot(x, y, &xPwy); + gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - gsl_blas_ddot (WtWiWtx, Wtx, &d); - xPwx-=d; + gsl_blas_ddot(WtWiWtx, Wtx, &d); + xPwx -= d; - gsl_blas_ddot (WtWiWtx, Wty, &d); - xPwy-=d; + gsl_blas_ddot(WtWiWtx, Wty, &d); + xPwy -= d; - gsl_vector_free (WtWiWtx); + gsl_vector_free(WtWiWtx); - return; + return; } -void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, - const gsl_vector *y, double &yPwy) { - size_t c_size=Wty->size; - double d; +void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, + double &yPwy) { + size_t c_size = Wty->size; + double d; - gsl_vector *WtWiWty=gsl_vector_alloc (c_size); + gsl_vector *WtWiWty = gsl_vector_alloc(c_size); - gsl_blas_ddot (y, y, &yPwy); - gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); + gsl_blas_ddot(y, y, &yPwy); + gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); - gsl_blas_ddot (WtWiWty, Wty, &d); - yPwy-=d; + gsl_blas_ddot(WtWiWty, Wty, &d); + yPwy -= d; - gsl_vector_free (WtWiWty); + gsl_vector_free(WtWiWty); - return; + return; } // Calculate p-values and beta/se in a linear model. -void LmCalcP (const size_t test_mode, const double yPwy, - const double xPwy, const double xPwx, const double df, - const size_t n_size, double &beta, double &se, - double &p_wald, double &p_lrt, double &p_score) { - double yPxy=yPwy-xPwy*xPwy/xPwx; - double se_wald, se_score; - - beta=xPwy/xPwx; - se_wald=sqrt(yPxy/(df*xPwx) ); - se_score=sqrt(yPwy/((double)n_size*xPwx) ); - - p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df); - p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df); - p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1); - - if (test_mode==3) {se=se_score;} else {se=se_wald;} - - return; +void LmCalcP(const size_t test_mode, const double yPwy, const double xPwy, + const double xPwx, const double df, const size_t n_size, + double &beta, double &se, double &p_wald, double &p_lrt, + double &p_score) { + double yPxy = yPwy - xPwy * xPwy / xPwx; + double se_wald, se_score; + + beta = xPwy / xPwx; + se_wald = sqrt(yPxy / (df * xPwx)); + se_score = sqrt(yPwy / ((double)n_size * xPwx)); + + p_wald = gsl_cdf_fdist_Q(beta * beta / (se_wald * se_wald), 1.0, df); + p_score = gsl_cdf_fdist_Q(beta * beta / (se_score * se_score), 1.0, df); + p_lrt = gsl_cdf_chisq_Q((double)n_size * (log(yPwy) - log(yPxy)), 1); + + if (test_mode == 3) { + se = se_score; + } else { + se = se_wald; + } + + return; } -void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) { - ifstream infile (file_gene.c_str(), ifstream::in); - if (!infile) { - cout<<"error reading gene expression file:"<<file_gene<<endl; - return; - } +void LM::AnalyzeGene(const gsl_matrix *W, const gsl_vector *x) { + ifstream infile(file_gene.c_str(), ifstream::in); + if (!infile) { + cout << "error reading gene expression file:" << file_gene << endl; + return; + } - clock_t time_start=clock(); + clock_t time_start = clock(); - string line; - char *ch_ptr; + string line; + char *ch_ptr; - double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; - int c_phen; - string rs; // Gene id. - double d; + double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0; + int c_phen; + string rs; // Gene id. + double d; - // Calculate some basic quantities. - double yPwy, xPwy, xPwx; - double df=(double)W->size1-(double)W->size2-1.0; + // Calculate some basic quantities. + double yPwy, xPwy, xPwx; + double df = (double)W->size1 - (double)W->size2 - 1.0; - gsl_vector *y=gsl_vector_alloc (W->size1); + gsl_vector *y = gsl_vector_alloc(W->size1); - gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); - gsl_vector *Wty=gsl_vector_alloc (W->size2); - gsl_vector *Wtx=gsl_vector_alloc (W->size2); - gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); + gsl_vector *Wty = gsl_vector_alloc(W->size2); + gsl_vector *Wtx = gsl_vector_alloc(W->size2); + gsl_permutation *pmt = gsl_permutation_alloc(W->size2); - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp (WtW, pmt, &sig); - LUInvert (WtW, pmt, WtWi); + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp(WtW, pmt, &sig); + LUInvert(WtW, pmt, WtWi); - gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); - CalcvPv(WtWi, Wtx, x, xPwx); + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wtx, x, xPwx); - // Header. - getline(infile, line); + // Header. + getline(infile, line); - for (size_t t=0; t<ng_total; t++) { - getline(infile, line); - if (t%d_pace==0 || t==ng_total-1) { - ProgressBar ("Performing Analysis ", t, ng_total-1); - } - ch_ptr=strtok ((char *)line.c_str(), " , \t"); - rs=ch_ptr; + for (size_t t = 0; t < ng_total; t++) { + getline(infile, line); + if (t % d_pace == 0 || t == ng_total - 1) { + ProgressBar("Performing Analysis ", t, ng_total - 1); + } + ch_ptr = strtok((char *)line.c_str(), " , \t"); + rs = ch_ptr; - c_phen=0; - for (size_t i=0; i<indicator_idv.size(); ++i) { - ch_ptr=strtok (NULL, " , \t"); - if (indicator_idv[i]==0) {continue;} + c_phen = 0; + for (size_t i = 0; i < indicator_idv.size(); ++i) { + ch_ptr = strtok(NULL, " , \t"); + if (indicator_idv[i] == 0) { + continue; + } - d=atof(ch_ptr); - gsl_vector_set(y, c_phen, d); + d = atof(ch_ptr); + gsl_vector_set(y, c_phen, d); - c_phen++; - } + c_phen++; + } - // Calculate statistics. - time_start=clock(); + // Calculate statistics. + time_start = clock(); - gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); - CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, - beta, se, p_wald, p_lrt, p_score); + gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy); + LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, + p_lrt, p_score); - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - // Store summary data. - SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; - sumStat.push_back(SNPs); - } - cout<<endl; + // Store summary data. + SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout << endl; - gsl_vector_free(y); + gsl_vector_free(y); - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(Wty); - gsl_vector_free(Wtx); - gsl_permutation_free(pmt); + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(Wty); + gsl_vector_free(Wtx); + gsl_permutation_free(pmt); - infile.close(); - infile.clear(); + infile.close(); + infile.clear(); - return; + return; } // WJA added -void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y) { - string file_bgen=file_oxford+".bgen"; - ifstream infile (file_bgen.c_str(), ios::binary); - if (!infile) { - cout<<"error reading bgen file:"<<file_bgen<<endl; - return; - } - - clock_t time_start=clock(); - - string line; - char *ch_ptr; - - double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; - int n_miss, c_phen; - double geno, x_mean; - - // Calculate some basic quantities. - double yPwy, xPwy, xPwx; - double df=(double)W->size1-(double)W->size2-1.0; - - gsl_vector *x=gsl_vector_alloc (W->size1); - gsl_vector *x_miss=gsl_vector_alloc (W->size1); - - gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); - gsl_vector *Wty=gsl_vector_alloc (W->size2); - gsl_vector *Wtx=gsl_vector_alloc (W->size2); - gsl_permutation * pmt=gsl_permutation_alloc (W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp (WtW, pmt, &sig); - LUInvert (WtW, pmt, WtWi); - - gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); - CalcvPv(WtWi, Wty, y, yPwy); - - // Read in header. - uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char*>(&bgen_snp_block_offset),4); - infile.read(reinterpret_cast<char*>(&bgen_header_length),4); - bgen_snp_block_offset-=4; - infile.read(reinterpret_cast<char*>(&bgen_nsnps),4); - bgen_snp_block_offset-=4; - infile.read(reinterpret_cast<char*>(&bgen_nsamples),4); - bgen_snp_block_offset-=4; - infile.ignore(4+bgen_header_length-20); - bgen_snp_block_offset-=4+bgen_header_length-20; - infile.read(reinterpret_cast<char*>(&bgen_flags),4); - bgen_snp_block_offset-=4; - bool CompressedSNPBlocks=bgen_flags&0x1; - - infile.ignore(bgen_snp_block_offset); - - double bgen_geno_prob_AA, bgen_geno_prob_AB; - double bgen_geno_prob_BB, bgen_geno_prob_non_miss; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - string id; - string rs; - string chr; - std::cout << "Warning: WJA hard coded SNP missingness " << - "threshold of 10%" << std::endl; - - // Start reading genotypes and analyze. - for (size_t t=0; t<indicator_snp.size(); ++t) { - if (t%d_pace==0 || t==(ns_total-1)) { - ProgressBar ("Reading SNPs ", t, ns_total-1); - } - - // Read SNP header. - id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char*>(&bgen_N),4); - infile.read(reinterpret_cast<char*>(&bgen_LS),2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char*>(&bgen_LR),2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char*>(&bgen_LC),2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char*>(&bgen_SNP_pos),4); - - infile.read(reinterpret_cast<char*>(&bgen_LA),4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char*>(&bgen_LB),4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - uint16_t unzipped_data[3*bgen_N]; - - if (indicator_snp[t]==0) { - if(CompressedSNPBlocks) - infile.read(reinterpret_cast<char*>(&bgen_P),4); - else - bgen_P=6*bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if(CompressedSNPBlocks) { - infile.read(reinterpret_cast<char*>(&bgen_P),4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size=6*bgen_N; - - infile.read(reinterpret_cast<char*>(zipped_data), - bgen_P); - - int result= - uncompress(reinterpret_cast<Bytef*>(unzipped_data), - reinterpret_cast<uLongf*>(&unzipped_data_size), - reinterpret_cast<Bytef*>(zipped_data), - static_cast<uLong> (bgen_P)); - assert(result == Z_OK); - - } - else - { - - bgen_P=6*bgen_N; - infile.read(reinterpret_cast<char*>(unzipped_data), - bgen_P); - } - - x_mean=0.0; c_phen=0; n_miss=0; - gsl_vector_set_zero(x_miss); - for (size_t i=0; i<bgen_N; ++i) { - if (indicator_idv[i]==0) {continue;} - - - bgen_geno_prob_AA= - static_cast<double>(unzipped_data[i*3])/32768.0; - bgen_geno_prob_AB= - static_cast<double>(unzipped_data[i*3+1])/32768.0; - bgen_geno_prob_BB= - static_cast<double>(unzipped_data[i*3+2])/32768.0; - - // WJA - bgen_geno_prob_non_miss= - bgen_geno_prob_AA + - bgen_geno_prob_AB + - bgen_geno_prob_BB; - if (bgen_geno_prob_non_miss<0.9) { - gsl_vector_set(x_miss, c_phen, 0.0); - n_miss++; - } - else { - bgen_geno_prob_AA/=bgen_geno_prob_non_miss; - bgen_geno_prob_AB/=bgen_geno_prob_non_miss; - bgen_geno_prob_BB/=bgen_geno_prob_non_miss; - - geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); - x_mean+=geno; - } - c_phen++; - } - - x_mean/=static_cast<double>(ni_test-n_miss); - - for (size_t i=0; i<ni_test; ++i) { - if (gsl_vector_get (x_miss, i)==0) { - gsl_vector_set(x, i, x_mean); - } - geno=gsl_vector_get(x, i); - } - - // Calculate statistics. - time_start=clock(); - - gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); - CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, - beta, se, p_wald, p_lrt, p_score); - - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - - // Store summary data. - SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; - sumStat.push_back(SNPs); - } - cout<<endl; - - gsl_vector_free(x); - gsl_vector_free(x_miss); - - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(Wty); - gsl_vector_free(Wtx); - gsl_permutation_free(pmt); - - infile.close(); - infile.clear(); - - return; +void LM::Analyzebgen(const gsl_matrix *W, const gsl_vector *y) { + string file_bgen = file_oxford + ".bgen"; + ifstream infile(file_bgen.c_str(), ios::binary); + if (!infile) { + cout << "error reading bgen file:" << file_bgen << endl; + return; + } + + clock_t time_start = clock(); + + string line; + char *ch_ptr; + + double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0; + int n_miss, c_phen; + double geno, x_mean; + + // Calculate some basic quantities. + double yPwy, xPwy, xPwx; + double df = (double)W->size1 - (double)W->size2 - 1.0; + + gsl_vector *x = gsl_vector_alloc(W->size1); + gsl_vector *x_miss = gsl_vector_alloc(W->size1); + + gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); + gsl_vector *Wty = gsl_vector_alloc(W->size2); + gsl_vector *Wtx = gsl_vector_alloc(W->size2); + gsl_permutation *pmt = gsl_permutation_alloc(W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp(WtW, pmt, &sig); + LUInvert(WtW, pmt, WtWi); + + gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + // Read in header. + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); + infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); + bgen_snp_block_offset -= 4; + infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); + bgen_snp_block_offset -= 4; + infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); + bgen_snp_block_offset -= 4; + infile.ignore(4 + bgen_header_length - 20); + bgen_snp_block_offset -= 4 + bgen_header_length - 20; + infile.read(reinterpret_cast<char *>(&bgen_flags), 4); + bgen_snp_block_offset -= 4; + bool CompressedSNPBlocks = bgen_flags & 0x1; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB; + double bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + std::cout << "Warning: WJA hard coded SNP missingness " + << "threshold of 10%" << std::endl; + + // Start reading genotypes and analyze. + for (size_t t = 0; t < indicator_snp.size(); ++t) { + if (t % d_pace == 0 || t == (ns_total - 1)) { + ProgressBar("Reading SNPs ", t, ns_total - 1); + } + + // Read SNP header. + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast<char *>(&bgen_N), 4); + infile.read(reinterpret_cast<char *>(&bgen_LS), 2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast<char *>(&bgen_LR), 2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast<char *>(&bgen_LC), 2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); + + infile.read(reinterpret_cast<char *>(&bgen_LA), 4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + infile.read(reinterpret_cast<char *>(&bgen_LB), 4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + uint16_t unzipped_data[3 * bgen_N]; + + if (indicator_snp[t] == 0) { + if (CompressedSNPBlocks) + infile.read(reinterpret_cast<char *>(&bgen_P), 4); + else + bgen_P = 6 * bgen_N; + + infile.ignore(static_cast<size_t>(bgen_P)); + + continue; + } + + if (CompressedSNPBlocks) { + infile.read(reinterpret_cast<char *>(&bgen_P), 4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size = 6 * bgen_N; + + infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); + + int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), + reinterpret_cast<uLongf *>(&unzipped_data_size), + reinterpret_cast<Bytef *>(zipped_data), + static_cast<uLong>(bgen_P)); + assert(result == Z_OK); + + } else { + + bgen_P = 6 * bgen_N; + infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); + } + + x_mean = 0.0; + c_phen = 0; + n_miss = 0; + gsl_vector_set_zero(x_miss); + for (size_t i = 0; i < bgen_N; ++i) { + if (indicator_idv[i] == 0) { + continue; + } + + bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; + bgen_geno_prob_AB = + static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; + bgen_geno_prob_BB = + static_cast<double>(unzipped_data[i * 3 + 2]) / 32768.0; + + // WJA + bgen_geno_prob_non_miss = + bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss < 0.9) { + gsl_vector_set(x_miss, c_phen, 0.0); + n_miss++; + } else { + bgen_geno_prob_AA /= bgen_geno_prob_non_miss; + bgen_geno_prob_AB /= bgen_geno_prob_non_miss; + bgen_geno_prob_BB /= bgen_geno_prob_non_miss; + + geno = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean += geno; + } + c_phen++; + } + + x_mean /= static_cast<double>(ni_test - n_miss); + + for (size_t i = 0; i < ni_test; ++i) { + if (gsl_vector_get(x_miss, i) == 0) { + gsl_vector_set(x, i, x_mean); + } + geno = gsl_vector_get(x, i); + } + + // Calculate statistics. + time_start = clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, + p_lrt, p_score); + + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + // Store summary data. + SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout << endl; + + gsl_vector_free(x); + gsl_vector_free(x_miss); + + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(Wty); + gsl_vector_free(Wtx); + gsl_permutation_free(pmt); + + infile.close(); + infile.clear(); + + return; } -void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) { - igzstream infile (file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return; - } - - clock_t time_start=clock(); - - string line; - char *ch_ptr; - - double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; - int n_miss, c_phen; - double geno, x_mean; - - // Calculate some basic quantities. - double yPwy, xPwy, xPwx; - double df=(double)W->size1-(double)W->size2-1.0; - - gsl_vector *x=gsl_vector_alloc (W->size1); - gsl_vector *x_miss=gsl_vector_alloc (W->size1); - - gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); - gsl_vector *Wty=gsl_vector_alloc (W->size2); - gsl_vector *Wtx=gsl_vector_alloc (W->size2); - gsl_permutation * pmt=gsl_permutation_alloc (W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp (WtW, pmt, &sig); - LUInvert (WtW, pmt, WtWi); - - gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); - CalcvPv(WtWi, Wty, y, yPwy); - - // Start reading genotypes and analyze. - for (size_t t=0; t<indicator_snp.size(); ++t) { - getline(infile, line); - if (t%d_pace==0 || t==(ns_total-1)) { - ProgressBar ("Reading SNPs ", t, ns_total-1); - } - if (indicator_snp[t]==0) {continue;} - - ch_ptr=strtok ((char *)line.c_str(), " , \t"); - ch_ptr=strtok (NULL, " , \t"); - ch_ptr=strtok (NULL, " , \t"); - - x_mean=0.0; c_phen=0; n_miss=0; - gsl_vector_set_zero(x_miss); - for (size_t i=0; i<ni_total; ++i) { - ch_ptr=strtok (NULL, " , \t"); - if (indicator_idv[i]==0) {continue;} - - if (strcmp(ch_ptr, "NA")==0) { - gsl_vector_set(x_miss, c_phen, 0.0); - n_miss++; - } - else { - geno=atof(ch_ptr); - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); - x_mean+=geno; - } - c_phen++; - } - - x_mean/=(double)(ni_test-n_miss); - - for (size_t i=0; i<ni_test; ++i) { - if (gsl_vector_get (x_miss, i)==0) { - gsl_vector_set(x, i, x_mean); - } - geno=gsl_vector_get(x, i); - } - - // Calculate statistics. - time_start=clock(); - - gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); - CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, - beta, se, p_wald, p_lrt, p_score); - - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - - // Store summary data. - SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; - sumStat.push_back(SNPs); - } - cout<<endl; - - gsl_vector_free(x); - gsl_vector_free(x_miss); - - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(Wty); - gsl_vector_free(Wtx); - gsl_permutation_free(pmt); - - infile.close(); - infile.clear(); - - return; +void LM::AnalyzeBimbam(const gsl_matrix *W, const gsl_vector *y) { + igzstream infile(file_geno.c_str(), igzstream::in); + if (!infile) { + cout << "error reading genotype file:" << file_geno << endl; + return; + } + + clock_t time_start = clock(); + + string line; + char *ch_ptr; + + double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0; + int n_miss, c_phen; + double geno, x_mean; + + // Calculate some basic quantities. + double yPwy, xPwy, xPwx; + double df = (double)W->size1 - (double)W->size2 - 1.0; + + gsl_vector *x = gsl_vector_alloc(W->size1); + gsl_vector *x_miss = gsl_vector_alloc(W->size1); + + gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); + gsl_vector *Wty = gsl_vector_alloc(W->size2); + gsl_vector *Wtx = gsl_vector_alloc(W->size2); + gsl_permutation *pmt = gsl_permutation_alloc(W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp(WtW, pmt, &sig); + LUInvert(WtW, pmt, WtWi); + + gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + // Start reading genotypes and analyze. + for (size_t t = 0; t < indicator_snp.size(); ++t) { + getline(infile, line); + if (t % d_pace == 0 || t == (ns_total - 1)) { + ProgressBar("Reading SNPs ", t, ns_total - 1); + } + if (indicator_snp[t] == 0) { + continue; + } + + ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok(NULL, " , \t"); + + x_mean = 0.0; + c_phen = 0; + n_miss = 0; + gsl_vector_set_zero(x_miss); + for (size_t i = 0; i < ni_total; ++i) { + ch_ptr = strtok(NULL, " , \t"); + if (indicator_idv[i] == 0) { + continue; + } + + if (strcmp(ch_ptr, "NA") == 0) { + gsl_vector_set(x_miss, c_phen, 0.0); + n_miss++; + } else { + geno = atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean += geno; + } + c_phen++; + } + + x_mean /= (double)(ni_test - n_miss); + + for (size_t i = 0; i < ni_test; ++i) { + if (gsl_vector_get(x_miss, i) == 0) { + gsl_vector_set(x, i, x_mean); + } + geno = gsl_vector_get(x, i); + } + + // Calculate statistics. + time_start = clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, + p_lrt, p_score); + + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + // Store summary data. + SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout << endl; + + gsl_vector_free(x); + gsl_vector_free(x_miss); + + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(Wty); + gsl_vector_free(Wtx); + gsl_permutation_free(pmt); + + infile.close(); + infile.clear(); + + return; } -void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) { - string file_bed=file_bfile+".bed"; - ifstream infile (file_bed.c_str(), ios::binary); - if (!infile) { - cout<<"error reading bed file:"<<file_bed<<endl; - return; - } - - clock_t time_start=clock(); - - char ch[1]; - bitset<8> b; - - double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; - int n_bit, n_miss, ci_total, ci_test; - double geno, x_mean; - - // Calculate some basic quantities. - double yPwy, xPwy, xPwx; - double df=(double)W->size1-(double)W->size2-1.0; - - gsl_vector *x=gsl_vector_alloc (W->size1); - - gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); - gsl_vector *Wty=gsl_vector_alloc (W->size2); - gsl_vector *Wtx=gsl_vector_alloc (W->size2); - gsl_permutation * pmt=gsl_permutation_alloc (W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp (WtW, pmt, &sig); - LUInvert (WtW, pmt, WtWi); - - gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); - CalcvPv(WtWi, Wty, y, yPwy); - - // Calculate n_bit and c, the number of bit for each SNP. - if (ni_total%4==0) {n_bit=ni_total/4;} - else {n_bit=ni_total/4+1;} - - // Print the first three magic numbers. - for (int i=0; i<3; ++i) { - infile.read(ch,1); - b=ch[0]; - } - - for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { - if (t%d_pace==0 || t==snpInfo.size()-1) { - ProgressBar ("Reading SNPs ", t, snpInfo.size()-1); - } - if (indicator_snp[t]==0) {continue;} - - // n_bit, and 3 is the number of magic numbers. - infile.seekg(t*n_bit+3); - - // Read genotypes. - x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; - for (int i=0; i<n_bit; ++i) { - infile.read(ch,1); - b=ch[0]; - - // Minor allele homozygous: 2.0; major: 0.0; - for (size_t j=0; j<4; ++j) { - if ((i==(n_bit-1)) && ci_total==(int)ni_total) { - break; - } - if (indicator_idv[ci_total]==0) { - ci_total++; - continue; - } - - if (b[2*j]==0) { - if (b[2*j+1]==0) { - gsl_vector_set(x, ci_test, 2); - x_mean+=2.0; - } - else { - gsl_vector_set(x, ci_test, 1); - x_mean+=1.0; } - } - else { - if (b[2*j+1]==1) { - gsl_vector_set(x, ci_test, 0); - } - else { - gsl_vector_set(x, ci_test, -9); - n_miss++; - } - } - - ci_total++; - ci_test++; - } - } - - x_mean/=(double)(ni_test-n_miss); - - for (size_t i=0; i<ni_test; ++i) { - geno=gsl_vector_get(x,i); - if (geno==-9) { - gsl_vector_set(x, i, x_mean); - geno=x_mean; - } - } - - // Calculate statistics. - time_start=clock(); - - gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); - CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, - beta, se, p_wald, p_lrt, p_score); - - //store summary data - SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; - sumStat.push_back(SNPs); - - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - } - cout<<endl; - - gsl_vector_free(x); - - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(Wty); - gsl_vector_free(Wtx); - gsl_permutation_free(pmt); - - infile.close(); - infile.clear(); - - return; +void LM::AnalyzePlink(const gsl_matrix *W, const gsl_vector *y) { + string file_bed = file_bfile + ".bed"; + ifstream infile(file_bed.c_str(), ios::binary); + if (!infile) { + cout << "error reading bed file:" << file_bed << endl; + return; + } + + clock_t time_start = clock(); + + char ch[1]; + bitset<8> b; + + double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + + // Calculate some basic quantities. + double yPwy, xPwy, xPwx; + double df = (double)W->size1 - (double)W->size2 - 1.0; + + gsl_vector *x = gsl_vector_alloc(W->size1); + + gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); + gsl_vector *Wty = gsl_vector_alloc(W->size2); + gsl_vector *Wtx = gsl_vector_alloc(W->size2); + gsl_permutation *pmt = gsl_permutation_alloc(W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp(WtW, pmt, &sig); + LUInvert(WtW, pmt, WtWi); + + gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + // Calculate n_bit and c, the number of bit for each SNP. + if (ni_total % 4 == 0) { + n_bit = ni_total / 4; + } else { + n_bit = ni_total / 4 + 1; + } + + // Print the first three magic numbers. + for (int i = 0; i < 3; ++i) { + infile.read(ch, 1); + b = ch[0]; + } + + for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) { + if (t % d_pace == 0 || t == snpInfo.size() - 1) { + ProgressBar("Reading SNPs ", t, snpInfo.size() - 1); + } + if (indicator_snp[t] == 0) { + continue; + } + + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t * n_bit + 3); + + // Read genotypes. + x_mean = 0.0; + n_miss = 0; + ci_total = 0; + ci_test = 0; + for (int i = 0; i < n_bit; ++i) { + infile.read(ch, 1); + b = ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0; + for (size_t j = 0; j < 4; ++j) { + if ((i == (n_bit - 1)) && ci_total == (int)ni_total) { + break; + } + if (indicator_idv[ci_total] == 0) { + ci_total++; + continue; + } + + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + gsl_vector_set(x, ci_test, 2); + x_mean += 2.0; + } else { + gsl_vector_set(x, ci_test, 1); + x_mean += 1.0; + } + } else { + if (b[2 * j + 1] == 1) { + gsl_vector_set(x, ci_test, 0); + } else { + gsl_vector_set(x, ci_test, -9); + n_miss++; + } + } + + ci_total++; + ci_test++; + } + } + + x_mean /= (double)(ni_test - n_miss); + + for (size_t i = 0; i < ni_test; ++i) { + geno = gsl_vector_get(x, i); + if (geno == -9) { + gsl_vector_set(x, i, x_mean); + geno = x_mean; + } + } + + // Calculate statistics. + time_start = clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, + p_lrt, p_score); + + // store summary data + SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + } + cout << endl; + + gsl_vector_free(x); + + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(Wty); + gsl_vector_free(Wtx); + gsl_permutation_free(pmt); + + infile.close(); + infile.clear(); + + return; } // Make sure that both y and X are centered already. -void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, - vector<pair<size_t, double> > &pos_loglr) { - double yty, xty, xtx, log_lr; - gsl_blas_ddot(y, y, &yty); +void MatrixCalcLmLR(const gsl_matrix *X, const gsl_vector *y, + vector<pair<size_t, double>> &pos_loglr) { + double yty, xty, xtx, log_lr; + gsl_blas_ddot(y, y, &yty); - for (size_t i=0; i<X->size2; ++i) { - gsl_vector_const_view X_col=gsl_matrix_const_column (X, i); - gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx); - gsl_blas_ddot(&X_col.vector, y, &xty); + for (size_t i = 0; i < X->size2; ++i) { + gsl_vector_const_view X_col = gsl_matrix_const_column(X, i); + gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx); + gsl_blas_ddot(&X_col.vector, y, &xty); - log_lr=0.5*(double)y->size*(log(yty)-log(yty-xty*xty/xtx)); - pos_loglr.push_back(make_pair(i,log_lr) ); - } + log_lr = 0.5 * (double)y->size * (log(yty) - log(yty - xty * xty / xtx)); + pos_loglr.push_back(make_pair(i, log_lr)); + } - return; + return; } |