From b3b491cd9143d33bfebd4c5b26629573afcf0970 Mon Sep 17 00:00:00 2001 From: xiangzhou Date: Sat, 11 Jul 2015 12:57:37 -0400 Subject: add GXE test --- src/io.cpp | 2482 +++++++++++++++++++++++++++++++++++++------- src/io.h | 36 +- src/lm.cpp | 516 ++++++--- src/lm.h | 28 +- src/lmm.cpp | 1517 ++++++++++++++++++--------- src/lmm.h | 30 +- src/mvlmm.cpp | 3213 ++++++++++++++++++++++++++++++++++++++++----------------- src/mvlmm.h | 30 +- src/param.cpp | 1122 ++++++++++++++++---- src/param.h | 122 ++- 10 files changed, 6894 insertions(+), 2202 deletions(-) (limited to 'src') diff --git a/src/io.cpp b/src/io.cpp index 7ed95c4..d6a70dd 100644 --- a/src/io.cpp +++ b/src/io.cpp @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" @@ -39,6 +39,7 @@ #include "lapack.h" #include "gzstream.h" #include "mathfunc.h" +#include "eigenlib.h" #ifdef FORCE_FLOAT #include "io_float.h" @@ -54,10 +55,10 @@ using namespace std; //Print process bar void ProgressBar (string str, double p, double total) { - double progress = (100.0 * p / total); - int barsize = (int) (progress / 2.0); + double progress = (100.0 * p / total); + int barsize = (int) (progress / 2.0); char bar[51]; - + cout< &setSnps) ifstream infile (file_snps.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open snps file: "< &mapRS2chr, map { mapRS2chr.clear(); mapRS2bp.clear(); - + ifstream infile (file_anno.c_str(), ifstream::in); if (!infile) {cout<<"error opening annotation file: "< &mapRS2chr, map if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {chr="-9";} else {chr=ch_ptr;} ch_ptr=strtok (NULL, " , \t"); if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {cM=-9;} else {cM=atof(ch_ptr);} - + mapRS2chr[rs]=chr; mapRS2bp[rs]=b_pos; mapRS2cM[rs]=cM; } - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -225,28 +226,28 @@ bool ReadFile_column (const string &file_pheno, vector &indicator_idv, vect { indicator_idv.clear(); pheno.clear(); - + igzstream infile (file_pheno.c_str(), igzstream::in); // ifstream infile (file_pheno.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open phenotype file: "< > &indicator_p { indicator_pheno.clear(); pheno.clear(); - + igzstream infile (file_pheno.c_str(), igzstream::in); // ifstream infile (file_pheno.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open phenotype file: "< pheno_row; vector ind_pheno_row; - + size_t p_max=*max_element(p_column.begin(), p_column.end() ); map mapP2c; for (size_t i=0; i > &indicator_p bool ReadFile_cvt (const string &file_cvt, vector &indicator_cvt, vector > &cvt, size_t &n_cvt) { indicator_cvt.clear(); - + ifstream infile (file_cvt.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open covariates file: "< v_d; flag_na=0; ch_ptr=strtok ((char *)line.c_str(), " , \t"); while (ch_ptr!=NULL) { if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;} else {d=atof(ch_ptr);} - + v_d.push_back(d); - ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); } - if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} + if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} cvt.push_back(v_d); } - + if (indicator_cvt.empty()) {n_cvt=0;} else { flag_na=0; for (vector::size_type i=0; i &indicator_cvt, vector &snpInfo) { snpInfo.clear(); - + ifstream infile (file_bim.c_str(), ifstream::in); if (!infile) {cout<<"error opening .bim file: "< &snpInfo) minor=ch_ptr; ch_ptr=strtok (NULL, " \t"); major=ch_ptr; - - SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, -9, -9, -9}; + + SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, 0, -9, -9, 0, 0, 0}; snpInfo.push_back(sInfo); } - + infile.close(); - infile.clear(); + infile.clear(); return true; } @@ -396,8 +397,8 @@ bool ReadFile_fam (const string &file_fam, vector > &indicator_pheno { indicator_pheno.clear(); pheno.clear(); - mapID2num.clear(); - + mapID2num.clear(); + igzstream infile (file_fam.c_str(), igzstream::in); //ifstream infile (file_fam.c_str(), ifstream::in); if (!infile) {cout<<"error opening .fam file: "< > &indicator_pheno vector pheno_row; vector ind_pheno_row; - + size_t p_max=*max_element(p_column.begin(), p_column.end() ); map mapP2c; for (size_t i=0; i > &indicator_pheno ch_ptr=strtok (NULL, " \t"); ch_ptr=strtok (NULL, " \t"); ch_ptr=strtok (NULL, " \t"); - + size_t i=0; while (i > &indicator_pheno ind_pheno_row[mapP2c[i+1]]=0; pheno_row[mapP2c[i+1]]=-9; } else { p=atof(ch_ptr); - + if (p==-9) {ind_pheno_row[mapP2c[i+1]]=0; pheno_row[mapP2c[i+1]]=-9;} else {ind_pheno_row[mapP2c[i+1]]=1; pheno_row[mapP2c[i+1]]=p;} } } i++; - ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); } - + indicator_pheno.push_back(ind_pheno_row); - pheno.push_back(pheno_row); - + pheno.push_back(pheno_row); + mapID2num[id]=c; c++; } - + infile.close(); - infile.clear(); + infile.clear(); return true; } @@ -466,7 +467,7 @@ bool ReadFile_geno (const string &file_geno, const set &setSnps, const g { indicator_snp.clear(); snpInfo.clear(); - + igzstream infile (file_geno.c_str(), igzstream::in); // ifstream infile (file_geno.c_str(), ifstream::in); if (!infile) {cout<<"error reading genotype file:"< &setSnps, const g gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); - + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; + //eigenlib_dgemm("T", "N", 1.0, W, W, 0.0, WtW); + int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); - + double v_x, v_w; int c_idv=0; - + string line; char *ch_ptr; - + string rs; long int b_pos; string chr; string major; string minor; double cM; - + size_t file_pos; + double maf, geno, geno_old; size_t n_miss; size_t n_0, n_1, n_2; int flag_poly; - + int ni_total=indicator_idv.size(); int ni_test=0; for (int i=0; i=0 && geno<=0.5) {n_0++;} if (geno>0.5 && geno<1.5) {n_1++;} if (geno>=1.5 && geno<=2.0) {n_2++;} - - gsl_vector_set (genotype, c_idv, geno); - + + gsl_vector_set (genotype, c_idv, geno); + // if (geno<0) {n_miss++; continue;} - + if (flag_poly==0) {geno_old=geno; flag_poly=2;} if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} - + maf+=geno; - + c_idv++; } - maf/=2.0*(double)(ni_test-n_miss); - - SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf}; + maf/=2.0*(double)(ni_test-n_miss); + + SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf, ni_test-n_miss, 0, file_pos}; snpInfo.push_back(sInfo); - + file_pos++; + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} - + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} - + if (flag_poly!=1) {indicator_snp.push_back(0); continue;} - + if (hwe_level!=0 && maf_level!=-1) { if (CalcHWE(n_0, n_2, n_1)size; ++i) { - if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + for (size_t i=0; isize; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} } - + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot (genotype, genotype, &v_x); gsl_blas_ddot (Wtx, WtWiWtx, &v_w); - + if (W->size2!=1 && v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} - - indicator_snp.push_back(1); + + indicator_snp.push_back(1); ns_test++; } - + gsl_vector_free (genotype); gsl_vector_free (genotype_miss); gsl_matrix_free (WtW); @@ -591,10 +598,10 @@ bool ReadFile_geno (const string &file_geno, const set &setSnps, const g gsl_vector_free (Wtx); gsl_vector_free (WtWiWtx); gsl_permutation_free (pmt); - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -602,13 +609,13 @@ bool ReadFile_geno (const string &file_geno, const set &setSnps, const g - + //Read bed file, the first time bool ReadFile_bed (const string &file_bed, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) { indicator_snp.clear(); size_t ns_total=snpInfo.size(); - + ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"< &setSnps, const gsl gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); - + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; + int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); - + double v_x, v_w, geno; size_t c_idv=0; - + char ch[1]; bitset<8> b; - + size_t ni_total=indicator_idv.size(); size_t ni_test=0; for (size_t i=0; i &setSnps, const gsl infile.read(ch,1); b=ch[0]; } - + double maf; size_t n_miss; - size_t n_0, n_1, n_2, c; - + size_t n_0, n_1, n_2, c; + //start reading snps and doing association test for (size_t t=0; t &setSnps, const gsl if ((i==(n_bit-1)) && c==ni_total) {break;} if (indicator_idv[c]==0) {c++; continue;} c++; - + if (b[2*j]==0) { if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); maf+=2.0; n_2++;} else {gsl_vector_set(genotype, c_idv, 1.0); maf+=1.0; n_1++;} } else { - if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); maf+=0.0; n_0++;} + if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); maf+=0.0; n_0++;} else {gsl_vector_set(genotype_miss, c_idv, 1); n_miss++; } } c_idv++; } } maf/=2.0*(double)(ni_test-n_miss); - + snpInfo[t].n_miss=n_miss; snpInfo[t].missingness=(double)n_miss/(double)ni_test; snpInfo[t].maf=maf; - + snpInfo[t].n_idv=ni_test-n_miss; + snpInfo[t].n_nb=0; + snpInfo[t].file_position=t; + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} - + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} - + if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;} - + if (hwe_level!=1 && maf_level!=-1) { if (CalcHWE(n_0, n_2, n_1)size; ++i) { - if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + for (size_t i=0; isize; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} } - + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot (genotype, genotype, &v_x); gsl_blas_ddot (Wtx, WtWiWtx, &v_w); - + if (W->size2!=1 && v_w/v_x > r2_level) {indicator_snp.push_back(0); continue;} - - indicator_snp.push_back(1); + + indicator_snp.push_back(1); ns_test++; } - + gsl_vector_free (genotype); gsl_vector_free (genotype_miss); gsl_matrix_free (WtW); @@ -728,63 +739,177 @@ bool ReadFile_bed (const string &file_bed, const set &setSnps, const gsl gsl_vector_free (Wtx); gsl_vector_free (WtWiWtx); gsl_permutation_free (pmt); - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } -void ReadFile_kin (const string &file_kin, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) + + +//read the genotype for one SNP; remember to read empty lines +//geno stores original genotypes without centering +//missing values are replaced by mean +bool Bimbam_ReadOneSNP (const size_t inc, const vector &indicator_idv, igzstream &infile, gsl_vector *geno, double &geno_mean) +{ + size_t ni_total=indicator_idv.size(); + + // if (infile.eof()) {infile.clear();} + // infile.seekg(pos); + + string line; + char *ch_ptr; + bool flag=false; + + for (size_t i=0; i geno_miss; + + for (size_t i=0; i &indicator_idv, ifstream &infile, gsl_vector *geno, double &geno_mean) +{ + size_t ni_total=indicator_idv.size(), n_bit; + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1;} + infile.seekg(pos*n_bit+3); //n_bit, and 3 is the number of magic numbers + + //read genotypes + char ch[1]; + bitset<8> b; + + geno_mean=0.0; + size_t c=0, c_idv=0; + vector geno_miss; + + for (size_t i=0; i &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) { igzstream infile (file_kin.c_str(), igzstream::in); // ifstream infile (file_kin.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open kinship file: "< mapID2ID; size_t c=0; for (size_t i=0; i &indicator_idv, map &indicator_idv, mapsize1, n_col=U->size2, i_row=0, i_col=0; - + gsl_matrix_set_zero (U); - + string line; - char *ch_ptr; + char *ch_ptr; double d; - + while (getline(infile, line)) { - if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<size, i_row=0; - + gsl_vector_set_zero (eval); - + string line; - char *ch_ptr; + char *ch_ptr; double d; - + while (getline(infile, line)) { - if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<size1; gsl_vector *geno=gsl_vector_alloc (ni_total); gsl_vector *geno_miss=gsl_vector_alloc (ni_total); @@ -934,11 +1059,11 @@ bool BimbamKin (const string &file_geno, vector &indicator_snp, const int k !safeGetline(infile, line).eof(); if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} if (indicator_snp[t]==0) {continue;} - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); - + geno_mean=0.0; n_miss=0; geno_var=0.0; gsl_vector_set_all(geno_miss, 0); for (size_t i=0; i &indicator_snp, const int k geno_var+=d*d; } } - + geno_mean/=(double)(ni_total-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_total; geno_var-=geno_mean*geno_mean; // geno_var=geno_mean*(1-geno_mean*0.5); - + for (size_t i=0; i &indicator_snp, const int k -bool PlinkKin (const string &file_bed, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) +bool PlinkKin (const string &file_bed, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) { ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"< b; - + size_t n_miss, ci_total; double d, geno_mean, geno_var; - + size_t ni_total=matrix_kin->size1; gsl_vector *geno=gsl_vector_alloc (ni_total); size_t ns_test=0; int n_bit; - + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } @@ -1024,14 +1154,14 @@ bool PlinkKin (const string &file_bed, vector &indicator_snp, const int k_m for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; - } - + } + for (size_t t=0; t &indicator_snp, const int k_m else {gsl_vector_set(geno, ci_total, 1.0); geno_mean+=1.0; geno_var+=1.0;} } else { - if (b[2*j+1]==1) {gsl_vector_set(geno, ci_total, 0.0); } + if (b[2*j+1]==1) {gsl_vector_set(geno, ci_total, 0.0); } else {gsl_vector_set(geno, ci_total, -9.0); n_miss++; } } ci_total++; } } - + geno_mean/=(double)(ni_total-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_total; geno_var-=geno_mean*geno_mean; // geno_var=geno_mean*(1-geno_mean*0.5); - + for (size_t i=0; i &indicator_idv, vector< igzstream infile (file_geno.c_str(), igzstream::in); // ifstream infile (file_geno.c_str(), ifstream::in); if (!infile) {cout<<"error reading genotype file:"<size1); gsl_vector *genotype_miss=gsl_vector_alloc (UtX->size1); double geno, geno_mean; size_t n_miss; - + int ni_total=(int)indicator_idv.size(); int ns_total=(int)indicator_snp.size(); int ni_test=UtX->size1; int ns_test=UtX->size2; - + int c_idv=0, c_snp=0; - + for (int i=0; isize; ++i) { + + for (size_t i=0; isize; ++i) { if (gsl_vector_get (genotype_miss, i)==1) {geno=0;} else {geno=gsl_vector_get (genotype, i); geno-=geno_mean;} - + gsl_vector_set (genotype, i, geno); gsl_matrix_set (UtX, i, c_snp, geno); } - + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} - + c_snp++; - } - + } + if (calc_K==true) { gsl_matrix_scale (K, 1.0/(double)ns_test); - + for (size_t i=0; isize; ++i) { for (size_t j=0; j &indicator_idv, vector< } } } - + gsl_vector_free (genotype); gsl_vector_free (genotype_miss); - + infile.clear(); infile.close(); - + return true; } +//compact version of the above function, using uchar instead of gsl_matrix +bool ReadFile_geno (const string &file_geno, vector &indicator_idv, vector &indicator_snp, vector > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test) +{ + igzstream infile (file_geno.c_str(), igzstream::in); + // ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"< Xt_row; + for (size_t i=0; isize; ++j) { + if (gsl_vector_get (genotype_miss, j)==1) { + geno=geno_mean; + } else { + geno=gsl_vector_get (genotype, j); + } + + Xt_row[j]=Double02ToUchar(geno); + gsl_vector_set (genotype, j, (geno-geno_mean)); + } + Xt.push_back(Xt_row); + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; isize; ++i) { + for (size_t j=0; j &indicator_idv, vector b; - - int ni_total=(int)indicator_idv.size(); - int ns_total=(int)indicator_snp.size(); - int ni_test=UtX->size1; - int ns_test=UtX->size2; + + size_t ni_total=indicator_idv.size(); + size_t ns_total=indicator_snp.size(); + size_t ni_test=UtX->size1; + size_t ns_test=UtX->size2; int n_bit; - + if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1;} - + //print the first three majic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } - + if (calc_K==true) {gsl_matrix_set_zero (K);} - - gsl_vector *genotype=gsl_vector_alloc (UtX->size1); - + + gsl_vector *genotype=gsl_vector_alloc (UtX->size1); + double geno, geno_mean; - size_t n_miss; - int c_idv=0, c_snp=0, c=0; - + size_t n_miss; + size_t c_idv=0, c_snp=0, c=0; + //start reading snps and doing association test - for (int t=0; tsize; ++i) { + + for (size_t i=0; isize; ++i) { geno=gsl_vector_get (genotype, i); if (geno==-9) {geno=0;} else {geno-=geno_mean;} - + gsl_vector_set (genotype, i, geno); gsl_matrix_set (UtX, i, c_snp, geno); } - + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; isize; ++i) { + for (size_t j=0; j &indicator_idv, vector &indicator_snp, vector > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< Xt_row; + for (size_t i=0; i b; + + size_t ni_total=indicator_idv.size(); + size_t ns_total=indicator_snp.size(); + int n_bit; + + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1;} + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + if (calc_K==true) {gsl_matrix_set_zero (K);} + + gsl_vector *genotype=gsl_vector_alloc (ni_test); + + double geno, geno_mean; + size_t n_miss; + size_t c_idv=0, c_snp=0, c=0; + + //start reading snps and doing association test + for (size_t t=0; tsize; ++i) { + geno=gsl_vector_get (genotype, i); + if (geno==-9) {geno=geno_mean;} + + Xt_row[i]=Double02ToUchar(geno); + + geno-=geno_mean; + + gsl_vector_set (genotype, i, geno); + } + Xt.push_back(Xt_row); + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} - + c_snp++; - } - + } + if (calc_K==true) { gsl_matrix_scale (K, 1.0/(double)ns_test); - + for (size_t i=0; isize; ++i) { for (size_t j=0; j &indicator_idv, vector &indicator_idv, vector &est_column, map &mapRS2est) { mapRS2est.clear(); - + ifstream infile (file_est.c_str(), ifstream::in); if (!infile) {cout<<"error opening estimated parameter file: "< &est_column, map if (i==est_column[3]-1) {gamma=atof(ch_ptr);} if (i &est_column, map cout<<"the same SNP occurs more than once in estimated parameter file: "<(infile), istreambuf_iterator(), '\n'); infile.seekg (0, ios::beg); - + return true; } @@ -1348,25 +1671,25 @@ bool ReadFile_gene (const string &file_gene, vector &vec_read, vector &vec_read, vector > &indicator_pheno, vector > &pheno, const vector &p_column, vector &indicator_cvt, vector > &cvt, size_t &n_cvt) +{ + indicator_pheno.clear(); + pheno.clear(); + indicator_cvt.clear(); + + igzstream infile (file_sample.c_str(), igzstream::in); + + if (!infile) {cout<<"error! fail to open sample file: "< pheno_row; + vector ind_pheno_row; + int flag_na=0; + + size_t num_cols=0; + size_t num_p_in_file=0; + size_t num_cvt_in_file=0; + +// size_t p_max=*max_element(p_column.begin(), p_column.end()); + + map mapP2c; + for (size_t i=0; i > cvt_factor_levels; + + char col_type[num_cols]; + // read header line2 + if(!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " "); + if(strcmp(ch_ptr, "0")!=0) {return false;} + ch_ptr=strtok(NULL, " "); + if(strcmp(ch_ptr, "0")!=0) {return false;} + ch_ptr=strtok(NULL, " "); + if(strcmp(ch_ptr, "0")!=0) {return false;} + size_t it=0; + ch_ptr=strtok (NULL, " "); + if(ch_ptr!=NULL) + while(ch_ptr!=NULL){ + col_type[it++]=ch_ptr[0]; + if(ch_ptr[0]=='D') {cvt_factor_levels.push_back(map());num_cvt_in_file++;} + if(ch_ptr[0]=='C') {num_cvt_in_file++;} + if((ch_ptr[0]=='P')||(ch_ptr[0]=='B')) {num_p_in_file++;} + ch_ptr=strtok(NULL, " "); + } + + } + + while (!safeGetline(infile, line).eof()) { + + ch_ptr=strtok ((char *)line.c_str(), " "); + + for(int it=0;it<3;it++){ch_ptr=strtok(NULL, " ");} + + + size_t i=0; + size_t p_i=0; + size_t fac_cvt_i=0; + + while (i0) + { + igzstream infile2 (file_sample.c_str(), igzstream::in); + + if (!infile2) {cout<<"error! fail to open sample file: "< v_d; flag_na=0; + ch_ptr=strtok ((char *)line.c_str(), " "); + + for(int it=0;it<3;it++){ch_ptr=strtok(NULL, " ");} + + + size_t i=0; + size_t fac_cvt_i=0; + size_t num_fac_levels; + while (i1) + { + if (strcmp(ch_ptr, "NA")==0) {flag_na=1; for(size_t it=0;it::size_type i=0; i +#include +bool ReadFile_bgen(const string &file_bgen, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) +{ + + indicator_snp.clear(); + + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; + bool LongIds=bgen_flags&0x4; + + if(!LongIds) {return false;} + + infile.ignore(bgen_snp_block_offset); + + ns_test=0; + + size_t ns_total=static_cast(bgen_nsnps); + + snpInfo.clear(); + string rs; + long int b_pos; + string chr; +// double cM; + string major; + string minor; + string id; + + double v_x, v_w; + int c_idv=0; + + + double maf, geno, geno_old; + size_t n_miss; + size_t n_0, n_1, n_2; + int flag_poly; + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + + size_t ni_total=indicator_idv.size(); // total number of samples in phenotype file + size_t ni_test=0; // number of samples to use in test + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + + for (size_t i=0; i(&bgen_N),4); + infile.read(reinterpret_cast(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + // should we switch according to MAF? + minor=bgen_B_allele; + major=bgen_A_allele; + b_pos=static_cast(bgen_SNP_pos); + + uint16_t unzipped_data[3*bgen_N]; + + if (setSnps.size()!=0 && setSnps.count(rs)==0) { + SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, -9}; + snpInfo.push_back(sInfo); + indicator_snp.push_back(0); + if(CompressedSNPBlocks) + infile.read(reinterpret_cast(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + infile.read(reinterpret_cast(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast(zipped_data),bgen_P); + int result=uncompress(reinterpret_cast(unzipped_data), reinterpret_cast(&unzipped_data_size), reinterpret_cast(zipped_data), static_cast (bgen_P)); + assert(result == Z_OK); + + } + else + { + bgen_P=6*bgen_N; + infile.read(reinterpret_cast(unzipped_data),bgen_P); + + } + + + maf=0; n_miss=0; flag_poly=0; geno_old=-9; + n_0=0; n_1=0; n_2=0; + c_idv=0; + gsl_vector_set_zero (genotype_miss); + for (size_t i=0; i(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast(unzipped_data[i*3+2])/32768.0; + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + + //CHECK 0.1 OK + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++; c_idv++; continue;} + + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + if (geno>=0 && geno<=0.5) {n_0++;} + if (geno>0.5 && geno<1.5) {n_1++;} + if (geno>=1.5 && geno<=2.0) {n_2++;} + + gsl_vector_set (genotype, c_idv, geno); + + // CHECK WHAT THIS DOES + if (flag_poly==0) {geno_old=geno; flag_poly=2;} + if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} + + maf+=geno; + + c_idv++; + } + + maf/=2.0*static_cast(ni_test-n_miss); + + SNPINFO sInfo={chr, rs, -9, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf}; + snpInfo.push_back(sInfo); + + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} + + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + + if (flag_poly!=1) {indicator_snp.push_back(0); continue;} + + if (hwe_level!=0 && maf_level!=-1) { + if (CalcHWE(n_0, n_2, n_1)size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + } + + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot (genotype, genotype, &v_x); + gsl_blas_ddot (Wtx, WtWiWtx, &v_w); + + if (W->size2!=1 && v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} + + indicator_snp.push_back(1); + ns_test++; + + } + + + + + return true; + +} + + +//read oxford genotype file and calculate kinship matrix +bool bgenKin (const string &file_oxford, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) +{ + string file_bgen=file_oxford; + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; +// bool LongIds=bgen_flags&0x4; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + double genotype; + + + size_t n_miss; + double d, geno_mean, geno_var; + + size_t ni_total=matrix_kin->size1; + gsl_vector *geno=gsl_vector_alloc (ni_total); + gsl_vector *geno_miss=gsl_vector_alloc (ni_total); + + size_t ns_test=0; + for (size_t t=0; t(&bgen_N),4); + infile.read(reinterpret_cast(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast(bgen_P)); + + continue; + } + + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast(unzipped_data), reinterpret_cast(&unzipped_data_size), reinterpret_cast(zipped_data), static_cast (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast(unzipped_data),bgen_P); + } + + + + geno_mean=0.0; n_miss=0; geno_var=0.0; + gsl_vector_set_all(geno_miss, 0); + + for (size_t i=0; i(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(geno_miss, i, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + genotype=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(geno, i, genotype); + gsl_vector_set(geno_miss, i, 1.0); + geno_mean+=genotype; + geno_var+=genotype*genotype; + } + + } + + + geno_mean/=(double)(ni_total-n_miss); + geno_var+=geno_mean*geno_mean*(double)n_miss; + geno_var/=(double)ni_total; + geno_var-=geno_mean*geno_mean; +// geno_var=geno_mean*(1-geno_mean*0.5); + + for (size_t i=0; i rs_set(rs_ptr, rs_ptr+10); + string chr_ptr[]={"chr","CHR"}; + set chr_set(chr_ptr, chr_ptr+2); + string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"}; + set pos_set(pos_ptr, pos_ptr+8); + string cm_ptr[]={"cm","CM"}; + set cm_set(cm_ptr, cm_ptr+2); + string a1_ptr[]={"a1","A1","allele1","ALLELE1"}; + set a1_set(a1_ptr, a1_ptr+4); + string a0_ptr[]={"a0","A0","allele0","ALLELE0"}; + set a0_set(a0_ptr, a0_ptr+4); + + string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"}; + set z_set(z_ptr, z_ptr+6); + string beta_ptr[]={"beta","BETA","b","B"}; + set beta_set(beta_ptr, beta_ptr+4); + string sebeta_ptr[]={"se_beta","SE_BETA","se","SE"}; + set sebeta_set(sebeta_ptr, sebeta_ptr+4); + string chisq_ptr[]={"chisq","CHISQ","chisquare","CHISQUARE"}; + set chisq_set(chisq_ptr, chisq_ptr+4); + string p_ptr[]={"p","P","pvalue","PVALUE","p-value","P-VALUE"}; + set p_set(p_ptr, p_ptr+6); + + string n_ptr[]={"n","N","ntotal","NTOTAL","n_total","N_TOTAL"}; + set n_set(n_ptr, n_ptr+6); + string nmis_ptr[]={"nmis","NMIS","n_mis","N_MIS","n_miss","N_MISS"}; + set nmis_set(nmis_ptr, nmis_ptr+6); + string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"}; + set nobs_set(nobs_ptr, nobs_ptr+4); + + string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"}; + set af_set(af_ptr, af_ptr+10); + string var_ptr[]={"var","VAR"}; + set var_set(var_ptr, var_ptr+2); + + string ws_ptr[]={"window_size","WINDOW_SIZE","ws","WS"}; + set ws_set(ws_ptr, ws_ptr+4); + string cor_ptr[]={"cor","COR","r","R"}; + set cor_set(cor_ptr, cor_ptr+4); + + header.rs_col=0; header.chr_col=0; header.pos_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; + + char *ch_ptr; + string type; + size_t n_error=0; + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + while (ch_ptr!=NULL) { + type=ch_ptr; + if (rs_set.count(type)!=0) { + if (header.rs_col==0) {header.rs_col=header.coln+1;} else {cout<<"error! more than two rs columns in the file."< &mapRS2cat, size_t &n_vc) +{ + mapRS2cat.clear(); + + igzstream infile (file_cat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open category file: "<0) {n_vc++;} + + infile.clear(); + infile.close(); + + return true; +} + + + + +//read bimbam mean genotype file and calculate kinship matrix; this time, the kinship matrix is not centered, and can contain multiple K matrix +bool BimbamKin (const string &file_geno, vector &indicator_idv, vector &indicator_snp, const int k_mode, const int display_pace, const map &mapRS2cat, map &mapRS2var, vector &snpInfo, gsl_matrix *matrix_kin) +{ + igzstream infile (file_geno.c_str(), igzstream::in); + //ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1; + gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *geno_miss=gsl_vector_alloc (ni_test); + + size_t n_vc=matrix_kin->size2/ni_test, i_vc; + string rs; + vector ns_vec; + for (size_t i=0; i &indicator_idv, vector &indicator_snp, const int k_mode, const int display_pace, const map &mapRS2cat, map &mapRS2var, vector &snpInfo, gsl_matrix *matrix_kin) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + size_t n_miss, ci_total, ci_test; + double d, geno_mean, geno_var; + + size_t ni_test=matrix_kin->size1; + size_t ni_total=indicator_idv.size(); + gsl_vector *geno=gsl_vector_alloc (ni_test); + + size_t ns_test=0; + int n_bit; + + size_t n_vc=matrix_kin->size2/ni_test, i_vc; + string rs; + vector ns_vec; + for (size_t i=0; i &mapRS2cat, const map &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test) +{ + gsl_vector_set_zero(q); + ni_total=0; ns_total=0; ns_test=0; + + igzstream infile (file_beta.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open beta file: "< vec_q, vec_s; + for (size_t i=0; isize; i++) { + vec_q.push_back(0.0); + vec_s.push_back(0.0); + } + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + if (header.n_col==0 ) { + if (header.nobs_col==0 && header.nmis_col==0) { + cout<<"error! missing sample size in the beta file."<size; i++) { + if (vec_s[i]!=0) { + gsl_vector_set(q, i, vec_q[i]/vec_s[i]); + } + gsl_vector_set(s, i, vec_s[i]); + } + + infile.clear(); + infile.close(); + + return; +} + + + + +//read S file: S and Svar +void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar) +{ + igzstream infile (file_s.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open s file: "<size1; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t j=0; jsize2; j++) { + d=gsl_matrix_get(S, i, j)+atof(ch_ptr); + gsl_matrix_set(S, i, j, d); + ch_ptr=strtok (NULL, " , \t"); + } + } + + for (size_t i=0; isize1; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t j=0; jsize2; j++) { + d=gsl_matrix_get(Svar, i, j)+atof(ch_ptr); + gsl_matrix_set(Svar, i, j, d); + ch_ptr=strtok (NULL, " , \t"); + } + } + + infile.clear(); + infile.close(); + + return; +} + + + + +void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar) +{ + gsl_matrix_set_zero(S); + gsl_matrix_set_zero(Svar); + + string file_name; + + igzstream infile (file_ms.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open ms file: "<size1; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t j=0; jsize2; j++) { + d=gsl_matrix_get(V, i, j)+atof(ch_ptr); + gsl_matrix_set(V, i, j, d); + ch_ptr=strtok (NULL, " , \t"); + } + } + + infile.clear(); + infile.close(); + + return; +} + + +void ReadFile_mv (const string &file_mv, gsl_matrix *V) +{ + gsl_matrix_set_zero(V); + + string file_name; + + igzstream infile (file_mv.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open ms file: "<size; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + d=gsl_vector_get(q_vec, i)+atof(ch_ptr); + gsl_vector_set(q_vec, i, d); + } + + for (size_t i=0; isize; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + d=gsl_vector_get(s_vec, i)+atof(ch_ptr); + gsl_vector_set(s_vec, i, d); + } + + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + df=atof(ch_ptr); + + infile.clear(); + infile.close(); + + return; +} + + + +void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df) +{ + gsl_vector_set_zero(q_vec); + gsl_vector_set_zero(s_vec); + + string file_name; + + igzstream infile (file_mq.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mq file: "<. */ -#ifndef __IO_H__ +#ifndef __IO_H__ #define __IO_H__ @@ -26,6 +26,8 @@ #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" +#include "gzstream.h" + #ifdef FORCE_FLOAT #include "param_float.h" #else @@ -34,6 +36,9 @@ using namespace std; + + + void ProgressBar (string str, double p, double total); void ProgressBar (string str, double p, double total, double ratio); std::istream& safeGetline(std::istream& is, std::string& t); @@ -51,17 +56,21 @@ bool ReadFile_column (const string &file_pheno, vector &indicator_idv, vect bool ReadFile_geno (const string &file_geno, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map &mapRS2chr, map &mapRS2bp, map &mapRS2cM, vector &snpInfo, size_t &ns_test); bool ReadFile_bed (const string &file_bed, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test); +bool Bimbam_ReadOneSNP (const size_t inc, const vector &indicator_idv, igzstream &infile, gsl_vector *geno, double &geno_mean); +void Plink_ReadOneSNP (const int pos, const vector &indicator_idv, ifstream &infile, gsl_vector *geno, double &geno_mean); void ReadFile_kin (const string &file_kin, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); void ReadFile_mk (const string &file_mk, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); void ReadFile_eigenU (const string &file_u, bool &error, gsl_matrix *U); -void ReadFile_eigenD (const string &file_d, bool &error, gsl_vector *eval); +void ReadFile_eigenD (const string &file_d, bool &error, gsl_vector *eval); bool BimbamKin (const string &file_geno, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); bool PlinkKin (const string &file_bed, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); bool ReadFile_geno (const string &file_geno, vector &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); bool ReadFile_bed (const string &file_bed, vector &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); +bool ReadFile_geno (const string &file_geno, vector &indicator_idv, vector &indicator_snp, vector > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test); +bool ReadFile_bed (const string &file_bed, vector &indicator_idv, vector &indicator_snp, vector > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test); bool ReadFile_est (const string &file_est, const vector &est_column, map &mapRS2est); @@ -69,6 +78,29 @@ bool CountFileLines (const string &file_input, size_t &n_lines); bool ReadFile_gene (const string &file_gene, vector &vec_read, vector &snpInfo, size_t &ng_total); +bool ReadHeader (const string &line, HEADER &header); +bool ReadFile_cat (const string &file_cat, map &mapRS2cat, size_t &n_vc); + +bool BimbamKin (const string &file_geno, vector &indicator_idv, vector &indicator_snp, const int k_mode, const int display_pace, const map &mapRS2cat, map &mapRS2var, vector &snpInfo, gsl_matrix *matrix_kin); +bool PlinkKin (const string &file_bed, vector &indicator_idv, vector &indicator_snp, const int k_mode, const int display_pace, const map &mapRS2cat, map &mapRS2var, vector &snpInfo, gsl_matrix *matrix_kin); + +bool ReadFile_var (const string &file_var, map &mapRS2var); +void ReadFile_beta (const string &file_beta, const int k_mode, const map &mapRS2cat, const map &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test); + + +void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar); +void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar); +void ReadFile_v (const string &file_v, gsl_matrix *V); +void ReadFile_mv (const string &file_mq, gsl_matrix *V); +void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df); +void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df); + +// WJA added +bool bgenKin (const string &file_geno, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); +bool ReadFile_bgen(const string &file_bgen, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test); +bool ReadFile_sample(const string &file_sample, vector > &indicator_pheno, vector > &pheno, const vector &p_column, vector &indicator_cvt, vector > &cvt, size_t &n_cvt); + + #endif diff --git a/src/lm.cpp b/src/lm.cpp index 7577d0a..b4bc010 100644 --- a/src/lm.cpp +++ b/src/lm.cpp @@ -1,17 +1,17 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see . */ @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include @@ -57,48 +57,50 @@ using namespace std; -void LM::CopyFromParam (PARAM &cPar) +void LM::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; - + file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; file_out=cPar.file_out; path_out=cPar.path_out; file_gene=cPar.file_gene; - + // WJA added + file_oxford=cPar.file_oxford; + time_opt=0.0; - + ni_total=cPar.ni_total; ns_total=cPar.ns_total; ni_test=cPar.ni_test; ns_test=cPar.ns_test; n_cvt=cPar.n_cvt; - + ng_total=cPar.ng_total; ng_test=0; - - indicator_idv=cPar.indicator_idv; - indicator_snp=cPar.indicator_snp; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; snpInfo=cPar.snpInfo; - + return; } -void LM::CopyToParam (PARAM &cPar) +void LM::CopyToParam (PARAM &cPar) { - cPar.time_opt=time_opt; - + cPar.time_opt=time_opt; + cPar.ng_test=ng_test; - + return; } -void LM::WriteFiles () +void LM::WriteFiles () { string file_str; file_str=path_out+"/"+file_out; @@ -109,7 +111,7 @@ void LM::WriteFiles () if (!file_gene.empty()) { outfile<<"geneID"<<"\t"; - + if (a_mode==51) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<::size_type t=0; t::size_type t=0; tsize; double d; - + gsl_vector *WtWiWtx=gsl_vector_alloc (c_size); - + gsl_blas_ddot (x, x, &xPwx); gsl_blas_ddot (x, y, &xPwy); - gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - - gsl_blas_ddot (WtWiWtx, Wtx, &d); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + + gsl_blas_ddot (WtWiWtx, Wtx, &d); xPwx-=d; - - gsl_blas_ddot (WtWiWtx, Wty, &d); + + gsl_blas_ddot (WtWiWtx, Wty, &d); xPwy-=d; - + gsl_vector_free (WtWiWtx); - + return; } @@ -202,17 +204,17 @@ void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, { size_t c_size=Wty->size; double d; - + gsl_vector *WtWiWty=gsl_vector_alloc (c_size); - + gsl_blas_ddot (y, y, &yPwy); - gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); - - gsl_blas_ddot (WtWiWty, Wty, &d); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); + + gsl_blas_ddot (WtWiWty, Wty, &d); yPwy-=d; - + gsl_vector_free (WtWiWty); - + return; } @@ -223,38 +225,38 @@ void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, cons { double yPxy=yPwy-xPwy*xPwy/xPwx; double se_wald, se_score; - + beta=xPwy/xPwx; se_wald=sqrt(yPxy/(df*xPwx) ); se_score=sqrt(yPwy/((double)n_size*xPwx) ); - + p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df); p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df); p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1); - + if (test_mode==3) {se=se_score;} else {se=se_wald;} - + return; } -void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) +void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) { ifstream infile (file_gene.c_str(), ifstream::in); if (!infile) {cout<<"error reading gene expression file:"<size1-(double)W->size2-1.0; @@ -262,7 +264,7 @@ void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) gsl_vector *y=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -274,42 +276,42 @@ void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wtx, x, xPwx); - + //header getline(infile, line); - + for (size_t t=0; tsize1, beta, se, p_wald, p_lrt, p_score); - + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); } cout< +void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y) +{ + string file_bgen=file_oxford+".bgen"; + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<size1-(double)W->size2-1.0; + + gsl_vector *x=gsl_vector_alloc (W->size1); + gsl_vector *x_miss=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; +// bool LongIds=bgen_flags&0x4; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<1) {break;} + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + // read SNP header + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast(&bgen_N),4); + infile.read(reinterpret_cast(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast(unzipped_data), reinterpret_cast(&unzipped_data_size), reinterpret_cast(zipped_data), static_cast (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast(unzipped_data),bgen_P); + } + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; + } + c_phen++; + } + + x_mean/=static_cast(ni_test-n_miss); + + for (size_t i=0; i1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<size1-(double)W->size2-1.0; @@ -350,7 +580,7 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) gsl_vector *x_miss=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -362,58 +592,58 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); - - //start reading genotypes and analyze + + //start reading genotypes and analyze for (size_t t=0; t1) {break;} getline(infile, line); if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} if (indicator_snp[t]==0) {continue;} - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); - + x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); for (size_t i=0; i1) { gsl_vector_set(x, i, 2-geno); } - } - - //calculate statistics - time_start=clock(); + } - gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout< b; - + bitset<8> b; + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; int n_bit, n_miss, ci_total, ci_test; double geno, x_mean; - + //calculate some basic quantities double yPwy, xPwy, xPwx; double df=(double)W->size1-(double)W->size2-1.0; @@ -459,7 +689,7 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) gsl_vector *x=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -471,90 +701,104 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); - + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } - + //print the first three majic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } - - + + for (vector::size_type t=0; t1) { gsl_vector_set(x, i, 2-geno); } } - - //calculate statistics - time_start=clock(); - + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); - CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout< > &pos_loglr) +void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector > &pos_loglr) { double yty, xty, xtx, log_lr; gsl_blas_ddot(y, y, &yty); @@ -567,6 +811,6 @@ void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vectorsize*(log(yty)-log(yty-xty*xty/xtx)); pos_loglr.push_back(make_pair(i,log_lr) ); } - + return; } diff --git a/src/lm.h b/src/lm.h index ceec060..656dd52 100644 --- a/src/lm.h +++ b/src/lm.h @@ -1,22 +1,22 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see . */ -#ifndef __LM_H__ +#ifndef __LM_H__ #define __LM_H__ #include "gsl/gsl_vector.h" @@ -35,40 +35,44 @@ using namespace std; class LM { - + public: // IO related parameters int a_mode; //analysis mode, 50+1/2/3/4 for Frequentist tests size_t d_pace; //display pace - + string file_bfile; string file_geno; + string file_oxford; string file_out; string path_out; - + string file_gene; - + // Summary statistics size_t ni_total, ni_test; //number of individuals size_t ns_total, ns_test; //number of snps size_t ng_total, ng_test; //number of genes size_t n_cvt; double time_opt; //time spent - + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis - + vector snpInfo; //record SNP information - + // Not included in PARAM vector sumStat; //Output SNPSummary Data - + // Main functions void CopyFromParam (PARAM &cPar); void CopyToParam (PARAM &cPar); void AnalyzeGene (const gsl_matrix *W, const gsl_vector *x); void AnalyzePlink (const gsl_matrix *W, const gsl_vector *y); void AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y); + // WJA added + void Analyzebgen (const gsl_matrix *W, const gsl_vector *y); + void WriteFiles (); }; void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector > &pos_loglr); diff --git a/src/lmm.cpp b/src/lmm.cpp index e0b4160..7bcf89a 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include @@ -58,56 +58,58 @@ using namespace std; -void LMM::CopyFromParam (PARAM &cPar) +void LMM::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; - + file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; file_out=cPar.file_out; path_out=cPar.path_out; file_gene=cPar.file_gene; - + // WJA added + file_oxford=cPar.file_oxford; + l_min=cPar.l_min; l_max=cPar.l_max; - n_region=cPar.n_region; + n_region=cPar.n_region; l_mle_null=cPar.l_mle_null; logl_mle_H0=cPar.logl_mle_H0; - + time_UtX=0.0; time_opt=0.0; - + ni_total=cPar.ni_total; ns_total=cPar.ns_total; ni_test=cPar.ni_test; ns_test=cPar.ns_test; n_cvt=cPar.n_cvt; - + ng_total=cPar.ng_total; ng_test=0; - - indicator_idv=cPar.indicator_idv; - indicator_snp=cPar.indicator_snp; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; snpInfo=cPar.snpInfo; - + return; } -void LMM::CopyToParam (PARAM &cPar) +void LMM::CopyToParam (PARAM &cPar) { cPar.time_UtX=time_UtX; - cPar.time_opt=time_opt; - + cPar.time_opt=time_opt; + cPar.ng_test=ng_test; - + return; } -void LMM::WriteFiles () +void LMM::WriteFiles () { string file_str; file_str=path_out+"/"+file_out; @@ -118,7 +120,7 @@ void LMM::WriteFiles () if (!file_gene.empty()) { outfile<<"geneID"<<"\t"; - + if (a_mode==1) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"p_wald"<::size_type t=0; t::size_type t=0; ta) {l=a; h=b;} else {l=b; h=a;} - + size_t n=n_cvt+2; - index=(2*n-l+2)*(l-1)/2+h-l; - + index=(2*n-l+2)*(l-1)/2+h-l; + return index; } @@ -209,12 +211,12 @@ void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval size_t index_ab, index_aw, index_bw, index_ww; double p_ab; double ps_ab, ps_aw, ps_bw, ps_ww; - + for (size_t p=0; p<=n_cvt+1; ++p) { for (size_t a=p+1; a<=n_cvt+2; ++a) { for (size_t b=a; b<=n_cvt+2; ++b) { index_ab=GetabIndex (a, b, n_cvt); - if (p==0) { + if (p==0) { gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); gsl_blas_ddot (Hi_eval, &Uab_col.vector, &p_ab); if (e_mode!=0) {p_ab=gsl_vector_get (ab, index_ab)-p_ab;} @@ -224,12 +226,12 @@ void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval index_aw=GetabIndex (a, p, n_cvt); index_bw=GetabIndex (b, p, n_cvt); index_ww=GetabIndex (p, p, n_cvt); - + ps_ab=gsl_matrix_get (Pab, p-1, index_ab); ps_aw=gsl_matrix_get (Pab, p-1, index_aw); ps_bw=gsl_matrix_get (Pab, p-1, index_bw); ps_ww=gsl_matrix_get (Pab, p-1, index_ww); - + p_ab=ps_ab-ps_aw*ps_bw/ps_ww; gsl_matrix_set (Pab, p, index_ab, p_ab); } @@ -245,12 +247,12 @@ void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_e size_t index_ab, index_aw, index_bw, index_ww; double p2_ab; double ps2_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww; - + for (size_t p=0; p<=n_cvt+1; ++p) { for (size_t a=p+1; a<=n_cvt+2; ++a) { for (size_t b=a; b<=n_cvt+2; ++b) { index_ab=GetabIndex (a, b, n_cvt); - if (p==0) { + if (p==0) { gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); gsl_blas_ddot (HiHi_eval, &Uab_col.vector, &p2_ab); if (e_mode!=0) {p2_ab=p2_ab-gsl_vector_get (ab, index_ab)+2.0*gsl_matrix_get (Pab, 0, index_ab);} @@ -260,7 +262,7 @@ void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_e index_aw=GetabIndex (a, p, n_cvt); index_bw=GetabIndex (b, p, n_cvt); index_ww=GetabIndex (p, p, n_cvt); - + ps2_ab=gsl_matrix_get (PPab, p-1, index_ab); ps_aw=gsl_matrix_get (Pab, p-1, index_aw); ps_bw=gsl_matrix_get (Pab, p-1, index_bw); @@ -268,11 +270,11 @@ void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_e ps2_aw=gsl_matrix_get (PPab, p-1, index_aw); ps2_bw=gsl_matrix_get (PPab, p-1, index_bw); ps2_ww=gsl_matrix_get (PPab, p-1, index_ww); - + p2_ab=ps2_ab+ps_aw*ps_bw*ps2_ww/(ps_ww*ps_ww); p2_ab-=(ps_aw*ps2_bw+ps_bw*ps2_aw)/ps_ww; gsl_matrix_set (PPab, p, index_ab, p2_ab); - + } } } @@ -286,12 +288,12 @@ void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiH size_t index_ab, index_aw, index_bw, index_ww; double p3_ab; double ps3_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww, ps3_aw, ps3_bw, ps3_ww; - + for (size_t p=0; p<=n_cvt+1; ++p) { for (size_t a=p+1; a<=n_cvt+2; ++a) { for (size_t b=a; b<=n_cvt+2; ++b) { index_ab=GetabIndex (a, b, n_cvt); - if (p==0) { + if (p==0) { gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); gsl_blas_ddot (HiHiHi_eval, &Uab_col.vector, &p3_ab); if (e_mode!=0) {p3_ab=gsl_vector_get (ab, index_ab)-p3_ab+3.0*gsl_matrix_get (PPab, 0, index_ab)-3.0*gsl_matrix_get (Pab, 0, index_ab);} @@ -301,7 +303,7 @@ void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiH index_aw=GetabIndex (a, p, n_cvt); index_bw=GetabIndex (b, p, n_cvt); index_ww=GetabIndex (p, p, n_cvt); - + ps3_ab=gsl_matrix_get (PPPab, p-1, index_ab); ps_aw=gsl_matrix_get (Pab, p-1, index_aw); ps_bw=gsl_matrix_get (Pab, p-1, index_bw); @@ -312,11 +314,11 @@ void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiH ps3_aw=gsl_matrix_get (PPPab, p-1, index_aw); ps3_bw=gsl_matrix_get (PPPab, p-1, index_bw); ps3_ww=gsl_matrix_get (PPPab, p-1, index_ww); - + p3_ab=ps3_ab-ps_aw*ps_bw*ps2_ww*ps2_ww/(ps_ww*ps_ww*ps_ww); p3_ab-=(ps_aw*ps3_bw+ps_bw*ps3_aw+ps2_aw*ps2_bw)/ps_ww; p3_ab+=(ps_aw*ps2_bw*ps2_ww+ps_bw*ps2_aw*ps2_ww+ps_aw*ps_bw*ps3_ww)/(ps_ww*ps_ww); - + gsl_matrix_set (PPPab, p, index_ab, p3_ab); } } @@ -331,119 +333,119 @@ double LogL_f (double l, void *params) { FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} - + double f=0.0, logdet_h=0.0, d; size_t index_yy; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); - gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_div (Hi_eval, v_temp); + for (size_t i=0; i<(p->eval)->size; ++i) { d=gsl_vector_get (v_temp, i); logdet_h+=log(fabs(d)); - } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + double c=0.5*(double)ni_test*(log((double)ni_test)-log(2*M_PI)-1.0); - - index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); f=c-0.5*logdet_h-0.5*(double)ni_test*log(P_yy); - + gsl_matrix_free (Pab); gsl_vector_free (Hi_eval); gsl_vector_free (v_temp); return f; } - - + + double LogL_dev1 (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} - + double dev1=0.0, trace_Hi=0.0; size_t index_yy; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); - + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); - + if (p->e_mode!=0) {trace_Hi=(double)ni_test-trace_Hi;} - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - - double trace_HiK=((double)ni_test-trace_Hi)/l; - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + + double trace_HiK=((double)ni_test-trace_Hi)/l; + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); - + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); - double yPKPy=(P_yy-PP_yy)/l; + double yPKPy=(P_yy-PP_yy)/l; dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy; - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return dev1; } - - + + double LogL_dev2 (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} - + double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0; size_t index_yy; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); @@ -451,71 +453,71 @@ double LogL_dev2 (double l, void *params) gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); gsl_vector_mul (HiHiHi_eval, Hi_eval); - + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l); - + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); - double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); - + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); + double yPKPy=(P_yy-PP_yy)/l; double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); - + dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_matrix_free (PPPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); gsl_vector_free (HiHiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return dev2; } - - - - - + + + + + void LogL_dev12 (double l, void *params, double *dev1, double *dev2) { FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} - + double trace_Hi=0.0, trace_HiHi=0.0; size_t index_yy; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); @@ -523,54 +525,54 @@ void LogL_dev12 (double l, void *params, double *dev1, double *dev2) gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); gsl_vector_mul (HiHiHi_eval, Hi_eval); - + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + double trace_HiK=((double)ni_test-trace_Hi)/l; double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l); - + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); - + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); - double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); - - double yPKPy=(P_yy-PP_yy)/l; + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); + + double yPKPy=(P_yy-PP_yy)/l; double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); - + *dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy; *dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_matrix_free (PPPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); gsl_vector_free (HiHiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return; } @@ -578,39 +580,39 @@ void LogL_dev12 (double l, void *params, double *dev1, double *dev2) double LogRL_f (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + double df; size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} - + double f=0.0, logdet_h=0.0, logdet_hiw=0.0, d; size_t index_ww; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *Iab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); - gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_div (Hi_eval, v_temp); + for (size_t i=0; i<(p->eval)->size; ++i) { d=gsl_vector_get (v_temp, i); logdet_h+=log(fabs(d)); } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); gsl_vector_set_all (v_temp, 1.0); - CalcPab (n_cvt, p->e_mode, v_temp, p->Uab, p->ab, Iab); - + CalcPab (n_cvt, p->e_mode, v_temp, p->Uab, p->ab, Iab); + //calculate |WHiW|-|WW| logdet_hiw=0.0; for (size_t i=0; in_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + double df; size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} - + double dev1=0.0, trace_Hi=0.0; size_t index_ww; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); - + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + //calculate tracePK and trace PKPK double trace_P=trace_Hi; double ps_ww, ps2_ww; @@ -685,21 +687,21 @@ double LogRL_dev1 (double l, void *params) trace_P-=ps2_ww/ps_ww; } double trace_PK=(df-trace_P)/l; - + //calculate yPKPy, yPKPKPy index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_ww); - double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww); - double yPKPy=(P_yy-PP_yy)/l; - - dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy; - + double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww); + double yPKPy=(P_yy-PP_yy)/l; + + dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy; + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return dev1; } @@ -708,19 +710,19 @@ double LogRL_dev1 (double l, void *params) double LogRL_dev2 (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + double df; size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} - + double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0; size_t index_ww; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); @@ -728,31 +730,31 @@ double LogRL_dev2 (double l, void *params) gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); gsl_vector_mul (HiHiHi_eval, Hi_eval); - + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + //calculate tracePK and trace PKPK double trace_P=trace_Hi, trace_PP=trace_HiHi; double ps_ww, ps2_ww, ps3_ww; @@ -765,46 +767,46 @@ double LogRL_dev2 (double l, void *params) trace_PP+=ps2_ww*ps2_ww/(ps_ww*ps_ww)-2.0*ps3_ww/ps_ww; } double trace_PKPK=(df+trace_PP-2.0*trace_P)/(l*l); - + //calculate yPKPy, yPKPKPy index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_ww); double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww); - double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww); - double yPKPy=(P_yy-PP_yy)/l; + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww); + double yPKPy=(P_yy-PP_yy)/l; double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); - + dev2=0.5*trace_PKPK-0.5*df*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_matrix_free (PPPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); gsl_vector_free (HiHiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return dev2; } - + void LogRL_dev12 (double l, void *params, double *dev1, double *dev2) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + double df; size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} - + double trace_Hi=0.0, trace_HiHi=0.0; size_t index_ww; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); @@ -812,31 +814,31 @@ void LogRL_dev12 (double l, void *params, double *dev1, double *dev2) gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); gsl_vector_mul (HiHiHi_eval, Hi_eval); - + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + //calculate tracePK and trace PKPK double trace_P=trace_Hi, trace_PP=trace_HiHi; double ps_ww, ps2_ww, ps3_ww; @@ -850,29 +852,29 @@ void LogRL_dev12 (double l, void *params, double *dev1, double *dev2) } double trace_PK=(df-trace_P)/l; double trace_PKPK=(df+trace_PP-2.0*trace_P)/(l*l); - + //calculate yPKPy, yPKPKPy index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_ww); double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww); - double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww); - double yPKPy=(P_yy-PP_yy)/l; + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww); + double yPKPy=(P_yy-PP_yy)/l; double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); - + *dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy; *dev2=0.5*trace_PKPK-0.5*df*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_matrix_free (PPPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); gsl_vector_free (HiHiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return ; } - + @@ -884,35 +886,35 @@ void LMM::CalcRLWald (const double &l, const FUNC_PARAM ¶ms, double &beta, d { size_t n_cvt=params.n_cvt; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + int df=(int)ni_test-(int)n_cvt-1; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc(params.eval->size); gsl_vector *v_temp=gsl_vector_alloc(params.eval->size); - + gsl_vector_memcpy (v_temp, params.eval); gsl_vector_scale (v_temp, l); if (params.e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); - gsl_vector_div (Hi_eval, v_temp); - - CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); - - size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + gsl_vector_div (Hi_eval, v_temp); + + CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); size_t index_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt); size_t index_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt); double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); double P_xx=gsl_matrix_get (Pab, n_cvt, index_xx); - double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); - double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); - + double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); + double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); + beta=P_xy/P_xx; double tau=(double)df/Px_yy; - se=sqrt(1.0/(tau*P_xx)); - p_wald=gsl_cdf_fdist_Q ((P_yy-Px_yy)*tau, 1.0, df); -// p_wald=gsl_cdf_chisq_Q ((P_yy-Px_yy)*tau, 1); - + se=sqrt(1.0/(tau*P_xx)); + p_wald=gsl_cdf_fdist_Q ((P_yy-Px_yy)*tau, 1.0, df); +// p_wald=gsl_cdf_chisq_Q ((P_yy-Px_yy)*tau, 1); + gsl_matrix_free (Pab); gsl_vector_free (Hi_eval); gsl_vector_free (v_temp); @@ -924,36 +926,36 @@ void LMM::CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, { size_t n_cvt=params.n_cvt; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + int df=(int)ni_test-(int)n_cvt-1; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc(params.eval->size); gsl_vector *v_temp=gsl_vector_alloc(params.eval->size); - + gsl_vector_memcpy (v_temp, params.eval); gsl_vector_scale (v_temp, l); if (params.e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); - gsl_vector_div (Hi_eval, v_temp); - - CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); - - size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + gsl_vector_div (Hi_eval, v_temp); + + CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); size_t index_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt); size_t index_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt); double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); double P_xx=gsl_matrix_get (Pab, n_cvt, index_xx); - double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); - double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); - + double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); + double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); + beta=P_xy/P_xx; double tau=(double)df/Px_yy; - se=sqrt(1.0/(tau*P_xx)); - + se=sqrt(1.0/(tau*P_xx)); + p_score=gsl_cdf_fdist_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1.0, df); -// p_score=gsl_cdf_chisq_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1); - +// p_score=gsl_cdf_chisq_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1); + gsl_matrix_free (Pab); gsl_vector_free (Hi_eval); gsl_vector_free (v_temp); @@ -967,131 +969,131 @@ void LMM::CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, -void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) +void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) { size_t index_ab; size_t n_cvt=UtW->size2; - + gsl_vector *u_a=gsl_vector_alloc (Uty->size); - + for (size_t a=1; a<=n_cvt+2; ++a) { if (a==n_cvt+1) {continue;} - + if (a==n_cvt+2) {gsl_vector_memcpy (u_a, Uty);} else { gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, a-1); gsl_vector_memcpy (u_a, &UtW_col.vector); } - - for (size_t b=a; b>=1; --b) { + + for (size_t b=a; b>=1; --b) { if (b==n_cvt+1) {continue;} - + index_ab=GetabIndex (a, b, n_cvt); gsl_vector_view Uab_col=gsl_matrix_column (Uab, index_ab); - + if (b==n_cvt+2) {gsl_vector_memcpy (&Uab_col.vector, Uty);} else { gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, b-1); gsl_vector_memcpy (&Uab_col.vector, &UtW_col.vector); - } - + } + gsl_vector_mul(&Uab_col.vector, u_a); } } - + gsl_vector_free (u_a); return; } -void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_vector *Utx, gsl_matrix *Uab) -{ +void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_vector *Utx, gsl_matrix *Uab) +{ size_t index_ab; size_t n_cvt=UtW->size2; - - for (size_t b=1; b<=n_cvt+2; ++b) { + + for (size_t b=1; b<=n_cvt+2; ++b) { index_ab=GetabIndex (n_cvt+1, b, n_cvt); gsl_vector_view Uab_col=gsl_matrix_column (Uab, index_ab); - + if (b==n_cvt+2) {gsl_vector_memcpy (&Uab_col.vector, Uty);} else if (b==n_cvt+1) {gsl_vector_memcpy (&Uab_col.vector, Utx);} else { gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, b-1); gsl_vector_memcpy (&Uab_col.vector, &UtW_col.vector); } - + gsl_vector_mul(&Uab_col.vector, Utx); } - + return; } -void Calcab (const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) +void Calcab (const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) { size_t index_ab; size_t n_cvt=W->size2; - + double d; gsl_vector *v_a=gsl_vector_alloc (y->size); gsl_vector *v_b=gsl_vector_alloc (y->size); - + for (size_t a=1; a<=n_cvt+2; ++a) { if (a==n_cvt+1) {continue;} - + if (a==n_cvt+2) {gsl_vector_memcpy (v_a, y);} else { gsl_vector_const_view W_col=gsl_matrix_const_column (W, a-1); gsl_vector_memcpy (v_a, &W_col.vector); } - - for (size_t b=a; b>=1; --b) { + + for (size_t b=a; b>=1; --b) { if (b==n_cvt+1) {continue;} - + index_ab=GetabIndex (a, b, n_cvt); - + if (b==n_cvt+2) {gsl_vector_memcpy (v_b, y);} else { gsl_vector_const_view W_col=gsl_matrix_const_column (W, b-1); gsl_vector_memcpy (v_b, &W_col.vector); - } - + } + gsl_blas_ddot (v_a, v_b, &d); gsl_vector_set(ab, index_ab, d); } } - + gsl_vector_free (v_a); gsl_vector_free (v_b); return; } -void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_vector *ab) -{ +void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_vector *ab) +{ size_t index_ab; size_t n_cvt=W->size2; - + double d; gsl_vector *v_b=gsl_vector_alloc (y->size); - - for (size_t b=1; b<=n_cvt+2; ++b) { + + for (size_t b=1; b<=n_cvt+2; ++b) { index_ab=GetabIndex (n_cvt+1, b, n_cvt); - + if (b==n_cvt+2) {gsl_vector_memcpy (v_b, y);} else if (b==n_cvt+1) {gsl_vector_memcpy (v_b, x);} else { gsl_vector_const_view W_col=gsl_matrix_const_column (W, b-1); gsl_vector_memcpy (v_b, &W_col.vector); } - + gsl_blas_ddot (x, v_b, &d); gsl_vector_set(ab, index_ab, d); } - + gsl_vector_free (v_b); - + return; } @@ -1099,101 +1101,101 @@ void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_ -void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) +void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) { - ifstream infile (file_gene.c_str(), ifstream::in); + igzstream infile (file_gene.c_str(), igzstream::in); if (!infile) {cout<<"error reading gene expression file:"<size1); gsl_vector *Uty=gsl_vector_alloc (U->size2); gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + gsl_vector *ab=gsl_vector_alloc (n_index); + //header getline(infile, line); - + for (size_t t=0; tsize1); gsl_vector *Utx=gsl_vector_alloc (U->size2); gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { // gsl_vector_set_zero (ab); // Calcab (W, y, ab); -// } - - //start reading genotypes and analyze +// } + + //start reading genotypes and analyze for (size_t t=0; t1) {break;} !safeGetline(infile, line).eof(); if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} if (indicator_snp[t]==0) {continue;} - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); - ch_ptr=strtok (NULL, " , \t"); - + ch_ptr=strtok (NULL, " , \t"); + x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); for (size_t i=0; i1) {beta*=-1;} - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout< b; - + bitset<8> b; + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; double logl_H1=0.0; int n_bit, n_miss, ci_total, ci_test; double geno, x_mean; - + //Calculate basic quantities size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; gsl_vector *x=gsl_vector_alloc (U->size1); gsl_vector *Utx=gsl_vector_alloc (U->size2); - gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { // gsl_vector_set_zero (ab); // Calcab (W, y, ab); // } - + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } @@ -1368,16 +1370,16 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m infile.read(ch,1); b=ch[0]; } - - + + for (vector::size_type t=0; t1) { gsl_vector_set(x, i, 2-geno); } } - + //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + CalcUab(UtW, Uty, Utx, Uab); // if (e_mode!=0) { // Calcab (W, y, x, ab); // } - + time_start=clock(); FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; - + //3 is before 1, for beta if (a_mode==3 || a_mode==4) { CalcRLScore (l_mle_null, param1, beta, se, p_score); } - + if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); CalcRLWald (lambda_remle, param1, beta, se, p_wald); } - + if (a_mode==2 || a_mode==4) { CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } - - if (x_mean>1) {beta*=-1;} - + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout< +void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) +{ + string file_bgen=file_oxford+".bgen"; + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<size1); + gsl_vector *x_miss=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; +// bool LongIds=bgen_flags&0x4; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<1) {break;} + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + // read SNP header + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast(&bgen_N),4); + infile.read(reinterpret_cast(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast(unzipped_data), reinterpret_cast(&unzipped_data_size), reinterpret_cast(zipped_data), static_cast (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast(unzipped_data),bgen_P); + } + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; + } + c_phen++; + } + + x_mean/=static_cast(ni_test-n_miss); + + for (size_t i=0; i1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + CalcUab(UtW, Uty, Utx, Uab); +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + //3 is before 1 + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } -void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector > &pos_loglr) + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< > &pos_loglr) { double logl_H0, logl_H1, log_lr, lambda0, lambda1; - + gsl_vector *w=gsl_vector_alloc (Uty->size); - gsl_matrix *Utw=gsl_matrix_alloc (Uty->size, 1); + gsl_matrix *Utw=gsl_matrix_alloc (Uty->size, 1); gsl_matrix *Uab=gsl_matrix_alloc (Uty->size, 6); - gsl_vector *ab=gsl_vector_alloc (6); - + gsl_vector *ab=gsl_vector_alloc (6); + gsl_vector_set_zero(ab); gsl_vector_set_all (w, 1.0); - gsl_vector_view Utw_col=gsl_matrix_column (Utw, 0); - gsl_blas_dgemv (CblasTrans, 1.0, U, w, 0.0, &Utw_col.vector); - - CalcUab (Utw, Uty, Uab) ; - FUNC_PARAM param0={true, Uty->size, 1, K_eval, Uab, ab, 0}; - + gsl_vector_view Utw_col=gsl_matrix_column (Utw, 0); + gsl_blas_dgemv (CblasTrans, 1.0, U, w, 0.0, &Utw_col.vector); + + CalcUab (Utw, Uty, Uab) ; + FUNC_PARAM param0={true, Uty->size, 1, K_eval, Uab, ab, 0}; + CalcLambda('L', param0, l_min, l_max, n_region, lambda0, logl_H0); - + for (size_t i=0; isize2; ++i) { gsl_vector_const_view UtX_col=gsl_matrix_const_column (UtX, i); CalcUab(Utw, Uty, &UtX_col.vector, Uab); FUNC_PARAM param1={false, UtX->size1, 1, K_eval, Uab, ab, 0}; - + CalcLambda ('L', param1, l_min, l_max, n_region, lambda1, logl_H1); - log_lr=logl_H1-logl_H0; - + log_lr=logl_H1-logl_H0; + pos_loglr.push_back(make_pair(i,log_lr) ); } - + gsl_vector_free (w); gsl_matrix_free (Utw); gsl_matrix_free (Uab); gsl_vector_free (ab); - + return; } @@ -1506,17 +1748,17 @@ void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf) { if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."< > lambda_lh; - + //evaluate first order derivates in different intervals double lambda_l, lambda_h, lambda_interval=log(l_max/l_min)/(double)n_region; double dev1_l, dev1_h, logf_l, logf_h; - + for (size_t i=0; i=logf_h) {lambda=l_min; logf=logf_l;} else {lambda=l_max; logf=logf_h;} } else { //if derivates change signs int status; int iter=0, max_iter=100; - double l, l_temp; - + double l, l_temp; + gsl_function F; gsl_function_fdf FDF; - + F.params=¶ms; FDF.params=¶ms; - + if (func_name=='R' || func_name=='r') { F.function=&LogRL_dev1; FDF.f=&LogRL_dev1; @@ -1568,57 +1810,57 @@ void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, c FDF.df=&LogL_dev2; FDF.fdf=&LogL_dev12; } - + const gsl_root_fsolver_type *T_f; gsl_root_fsolver *s_f; T_f=gsl_root_fsolver_brent; s_f=gsl_root_fsolver_alloc (T_f); - + const gsl_root_fdfsolver_type *T_fdf; gsl_root_fdfsolver *s_fdf; T_fdf=gsl_root_fdfsolver_newton; - s_fdf=gsl_root_fdfsolver_alloc(T_fdf); - + s_fdf=gsl_root_fdfsolver_alloc(T_fdf); + for (vector::size_type i=0; il_min && ll_min && ll_max) {l=l_max;} - if (func_name=='R' || func_name=='r') {logf_l=LogRL_f (l, ¶ms);} else {logf_l=LogL_f (l, ¶ms);} - + if (func_name=='R' || func_name=='r') {logf_l=LogRL_f (l, ¶ms);} else {logf_l=LogL_f (l, ¶ms);} + if (i==0) {logf=logf_l; lambda=l;} else if (logflogf) {lambda=l_min; logf=logf_l;} + + if (logf_l>logf) {lambda=l_min; logf=logf_l;} if (logf_h>logf) {lambda=l_max; logf=logf_h;} } - + return; } @@ -1646,53 +1888,53 @@ void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix size_t n_cvt=UtW->size2, ni_test=UtW->size1; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - - gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { // gsl_vector_set_zero (ab); // Calcab (W, y, ab); // } - + FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0}; - + CalcLambda(func_name, param0, l_min, l_max, n_region, lambda, logl_H0); - - gsl_matrix_free(Uab); - gsl_vector_free(ab); - + + gsl_matrix_free(Uab); + gsl_vector_free(ab); + return; } - - + + //obtain REMLE estimate for PVE using lambda_remle void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se) { size_t n_cvt=UtW->size2, ni_test=UtW->size1; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - - gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { // gsl_vector_set_zero (ab); // Calcab (W, y, ab); // } - + FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0}; - + double se=sqrt(-1.0/LogRL_dev2 (lambda, ¶m0)); - + pve=trace_G*lambda/(trace_G*lambda+1.0); pve_se=trace_G/((trace_G*lambda+1.0)*(trace_G*lambda+1.0))*se; - + gsl_matrix_free (Uab); - gsl_vector_free (ab); + gsl_vector_free (ab); return; } @@ -1703,9 +1945,9 @@ void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_v { size_t n_cvt=UtW->size2, ni_test=UtW->size1; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - - gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc(eval->size); gsl_vector *v_temp=gsl_vector_alloc(eval->size); @@ -1713,16 +1955,16 @@ void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_v gsl_matrix *WHiW=gsl_matrix_alloc(UtW->size2, UtW->size2); gsl_vector *WHiy=gsl_vector_alloc(UtW->size2); gsl_matrix *Vbeta=gsl_matrix_alloc(UtW->size2, UtW->size2); - + gsl_matrix_set_zero (Uab); - CalcUab (UtW, Uty, Uab); - + CalcUab (UtW, Uty, Uab); + gsl_vector_memcpy (v_temp, eval); gsl_vector_scale (v_temp, lambda); gsl_vector_set_all (Hi_eval, 1.0); gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + //calculate beta gsl_matrix_memcpy (HiW, UtW); for (size_t i=0; isize2; i++) { @@ -1731,30 +1973,30 @@ void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_v } gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, HiW, UtW, 0.0, WHiW); gsl_blas_dgemv (CblasTrans, 1.0, HiW, Uty, 0.0, WHiy); - + int sig; gsl_permutation * pmt=gsl_permutation_alloc (UtW->size2); LUDecomp (WHiW, pmt, &sig); LUSolve (WHiW, pmt, WHiy, beta); LUInvert (WHiW, pmt, Vbeta); - + //calculate vg and ve - CalcPab (n_cvt, 0, Hi_eval, Uab, ab, Pab); - - size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); - double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); - + CalcPab (n_cvt, 0, Hi_eval, Uab, ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); + ve=P_yy/(double)(ni_test-n_cvt); vg=ve*lambda; - + //with ve, calculate se(beta) gsl_matrix_scale(Vbeta, ve); - + //obtain se_beta for (size_t i=0; isize1; i++) { gsl_vector_set (se_beta, i, sqrt(gsl_matrix_get(Vbeta, i, i) ) ); } - + gsl_matrix_free(Uab); gsl_matrix_free(Pab); gsl_vector_free(ab); @@ -1764,8 +2006,309 @@ void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_v gsl_matrix_free(WHiW); gsl_vector_free(WHiy); gsl_matrix_free(Vbeta); - + gsl_permutation_free(pmt); return; } + + + + + + +void LMM::AnalyzeBimbamGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const gsl_vector *env) +{ + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1); + gsl_vector *x_miss=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix *UtW_expand=gsl_matrix_alloc (U->size1, UtW->size2+2); + gsl_matrix_view UtW_expand_mat=gsl_matrix_submatrix(UtW_expand, 0, 0, U->size1, UtW->size2); + gsl_matrix_memcpy (&UtW_expand_mat.matrix, UtW); + gsl_vector_view UtW_expand_env=gsl_matrix_column(UtW_expand, UtW->size2); + gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &UtW_expand_env.vector); + gsl_vector_view UtW_expand_x=gsl_matrix_column(UtW_expand, UtW->size2+1); + + //gsl_matrix_set_zero (Uab); + // CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + //start reading genotypes and analyze + for (size_t t=0; t1) {break;} + !safeGetline(infile, line).eof(); + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &UtW_expand_x.vector); + gsl_vector_mul (x, env); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW_expand, Uty, Uab); + + if (a_mode==2 || a_mode==4) { + FUNC_PARAM param0={true, ni_test, n_cvt+2, eval, Uab, ab, 0}; + CalcLambda ('L', param0, l_min, l_max, n_region, lambda_mle, logl_H0); + } + + CalcUab(UtW_expand, Uty, Utx, Uab); +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt+2, eval, Uab, ab, 0}; + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< b; + + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + double logl_H1=0.0, logl_H0=0.0; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + + //Calculate basic quantities + size_t n_index=(n_cvt+2+2+1)*(n_cvt+2+2)/2; + + gsl_vector *x=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix *UtW_expand=gsl_matrix_alloc (U->size1, UtW->size2+2); + gsl_matrix_view UtW_expand_mat=gsl_matrix_submatrix(UtW_expand, 0, 0, U->size1, UtW->size2); + gsl_matrix_memcpy (&UtW_expand_mat.matrix, UtW); + gsl_vector_view UtW_expand_env=gsl_matrix_column(UtW_expand, UtW->size2); + gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &UtW_expand_env.vector); + gsl_vector_view UtW_expand_x=gsl_matrix_column(UtW_expand, UtW->size2+1); + + //gsl_matrix_set_zero (Uab); + //CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + + for (vector::size_type t=0; t1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &UtW_expand_x.vector); + gsl_vector_mul (x, env); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW_expand, Uty, Uab); + + if (a_mode==2 || a_mode==4) { + FUNC_PARAM param0={true, ni_test, n_cvt+2, eval, Uab, ab, 0}; + CalcLambda ('L', param0, l_min, l_max, n_region, lambda_mle, logl_H0); + } + + CalcUab(UtW_expand, Uty, Utx, Uab); + +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt+2, eval, Uab, ab, 0}; + + //3 is before 1, for beta + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<. */ -#ifndef __LMM_H__ +#ifndef __LMM_H__ #define __LMM_H__ #include "gsl/gsl_vector.h" @@ -57,21 +57,23 @@ public: // IO related parameters int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests size_t d_pace; //display pace - + string file_bfile; string file_geno; string file_out; string path_out; - + string file_gene; - + // WJA added + string file_oxford; + // LMM related parameters double l_min; double l_max; size_t n_region; double l_mle_null; - double logl_mle_H0; - + double logl_mle_H0; + // Summary statistics size_t ni_total, ni_test; //number of individuals size_t ns_total, ns_test; //number of snps @@ -79,25 +81,29 @@ public: size_t n_cvt; double time_UtX; //time spent on optimization iterations double time_opt; //time spent on optimization iterations - + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis - + vector snpInfo; //record SNP information - + // Not included in PARAM vector sumStat; //Output SNPSummary Data - + // Main functions void CopyFromParam (PARAM &cPar); void CopyToParam (PARAM &cPar); void AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x); void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); + // WJA added + void Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); + void AnalyzePlinkGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const gsl_vector *env); + void AnalyzeBimbamGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const gsl_vector *env); void WriteFiles (); - + void CalcRLWald (const double &lambda, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_wald); - void CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_score); + void CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_score); }; void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector > &pos_loglr); diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index 4b910ee..5826a1f 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -1,17 +1,17 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see . */ @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include @@ -60,16 +60,17 @@ using namespace std; //in this file, X, Y are already transformed (i.e. UtX and UtY) -void MVLMM::CopyFromParam (PARAM &cPar) +void MVLMM::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; - + file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; + file_oxford=cPar.file_oxford; file_out=cPar.file_out; path_out=cPar.path_out; - + l_min=cPar.l_min; l_max=cPar.l_max; n_region=cPar.n_region; @@ -79,68 +80,68 @@ void MVLMM::CopyFromParam (PARAM &cPar) em_prec=cPar.em_prec; nr_prec=cPar.nr_prec; crt=cPar.crt; - + Vg_remle_null=cPar.Vg_remle_null; Ve_remle_null=cPar.Ve_remle_null; Vg_mle_null=cPar.Vg_mle_null; Ve_mle_null=cPar.Ve_mle_null; - + time_UtX=0.0; time_opt=0.0; - + ni_total=cPar.ni_total; ns_total=cPar.ns_total; ni_test=cPar.ni_test; ns_test=cPar.ns_test; n_cvt=cPar.n_cvt; - + n_ph=cPar.n_ph; - - indicator_idv=cPar.indicator_idv; + + indicator_idv=cPar.indicator_idv; indicator_snp=cPar.indicator_snp; snpInfo=cPar.snpInfo; - + return; } -void MVLMM::CopyToParam (PARAM &cPar) +void MVLMM::CopyToParam (PARAM &cPar) { cPar.time_UtX=time_UtX; - cPar.time_opt=time_opt; - + cPar.time_opt=time_opt; + cPar.Vg_remle_null=Vg_remle_null; cPar.Ve_remle_null=Ve_remle_null; cPar.Vg_mle_null=Vg_mle_null; cPar.Ve_mle_null=Ve_mle_null; - + cPar.VVg_remle_null=VVg_remle_null; cPar.VVe_remle_null=VVe_remle_null; cPar.VVg_mle_null=VVg_mle_null; cPar.VVe_mle_null=VVe_mle_null; - + cPar.beta_remle_null=beta_remle_null; cPar.se_beta_remle_null=se_beta_remle_null; cPar.beta_mle_null=beta_mle_null; cPar.se_beta_mle_null=se_beta_mle_null; - + cPar.logl_remle_H0=logl_remle_H0; - cPar.logl_mle_H0=logl_mle_H0; + cPar.logl_mle_H0=logl_mle_H0; return; } -void MVLMM::WriteFiles () +void MVLMM::WriteFiles () { string file_str; file_str=path_out+"/"+file_out; file_str+=".assoc.txt"; - + ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) {cout<<"error writing file: "<size1; - double d, logdet_Ve=0.0; - + double d, logdet_Ve=0.0; + //eigen decomposition of V_e gsl_matrix *Lambda=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_temp=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_h=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_hi=gsl_matrix_alloc (d_size, d_size); - gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size); - gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size); - + gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size); + gsl_matrix_memcpy(V_e_temp, V_e); EigenDecomp(V_e_temp, U_l, D_l, 0); - + //calculate V_e_h and V_e_hi gsl_matrix_set_zero(V_e_h); gsl_matrix_set_zero(V_e_hi); @@ -233,14 +234,14 @@ double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, d=gsl_vector_get (D_l, i); if (d<=0) {continue;} logdet_Ve+=log(d); - + gsl_vector_view U_col=gsl_matrix_column(U_l, i); d=sqrt(d); gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_h); d=1.0/d; gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_hi); } - + //copy the upper part to lower part for (size_t i=0; isize, d_size=D_l->size, dc_size=Qi->size1; size_t c_size=dc_size/d_size; - + double delta, dl, d1, d2, d, logdet_Q; - + gsl_matrix *Q=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix_set_zero (Q); - - for (size_t i=0; isize, c_size=X->size1, d_size=D_l->size; - + gsl_vector_set_zero (xHiy); - + double x, delta, dl, y, d; - for (size_t i=0; isize, d_size=D_l->size; double delta, dl, d_u, d_e; - + for (size_t k=0; ksize1, d_size=UltVehiY->size1; - + gsl_matrix *YUX=gsl_matrix_alloc (d_size, c_size); - + gsl_matrix_memcpy (UltVehiBX, UltVehiY); gsl_matrix_sub (UltVehiBX, UltVehiU); - + gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, UltVehiBX, X, 0.0, YUX); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, YUX, XXti, 0.0, UltVehiB); - - gsl_matrix_free(YUX); - + + gsl_matrix_free(YUX); + return; } void UpdateRL_B (const gsl_vector *xHiy, const gsl_matrix *Qi, gsl_matrix *UltVehiB) { size_t d_size=UltVehiB->size1, c_size=UltVehiB->size2, dc_size=Qi->size1; - + gsl_vector *b=gsl_vector_alloc (dc_size); - + //calculate b=Qiv gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, b); - + //copy b to UltVehiB for (size_t i=0; isize, d_size=U->size1; - + gsl_matrix_set_zero (V_g); gsl_matrix_set_zero (V_e); - + double delta; - - //calculate the first part: UD^{-1}U^T and EE^T + + //calculate the first part: UD^{-1}U^T and EE^T for (size_t k=0; ksize, c_size=X->size1, d_size=D_l->size, dc_size=Qi->size1; - + gsl_matrix_set_zero(Sigma_uu); gsl_matrix_set_zero(Sigma_ee); - - double delta, dl, x, d; - + + double delta, dl, x, d; + //calculate the first diagonal term gsl_vector_view Suu_diag=gsl_matrix_diagonal (Sigma_uu); gsl_vector_view See_diag=gsl_matrix_diagonal (Sigma_ee); - + for (size_t k=0; ksize, d_size=D_l->size, dc_size=Qi->size1; double logl=0.0, delta, dl, y, d; - + //calculate yHiy+log|H_k| - for (size_t k=0; ksize, c_size=X->size1, d_size=Y->size1; - size_t dc_size=d_size*c_size; - + size_t dc_size=d_size*c_size; + gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size); gsl_matrix *XXti=gsl_matrix_alloc (c_size, c_size); gsl_vector *D_l=gsl_vector_alloc (d_size); @@ -633,11 +634,11 @@ double MphEM (const char func_name, const size_t max_iter, const double max_prec gsl_matrix *Sigma_uu=gsl_matrix_alloc (d_size, d_size); gsl_matrix *Sigma_ee=gsl_matrix_alloc (d_size, d_size); gsl_vector *xHiy=gsl_vector_alloc (dc_size); - gsl_permutation * pmt=gsl_permutation_alloc (c_size); - + gsl_permutation * pmt=gsl_permutation_alloc (c_size); + double logl_const=0.0, logl_old=0.0, logl_new=0.0, logdet_Q, logdet_Ve; int sig; - + //calculate |XXt| and (XXt)^{-1} gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt); for (size_t i=0; isize, c_size=W->size1, d_size=V_g->size1; size_t dc_size=d_size*c_size; double delta, dl, d, d1, d2, dy, dx, dw, logdet_Ve, logdet_Q, p_value; - + gsl_vector *D_l=gsl_vector_alloc (d_size); gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); - gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size); + gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size); gsl_matrix *QiWHix=gsl_matrix_alloc(dc_size, d_size); - - gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size); gsl_vector *xPy=gsl_vector_alloc (d_size); //gsl_vector *UltVehiy=gsl_vector_alloc (d_size); gsl_vector *WHiy=gsl_vector_alloc (dc_size); - + gsl_matrix_set_zero (xPx); gsl_matrix_set_zero (WHix); gsl_vector_set_zero (xPy); gsl_vector_set_zero (WHiy); - + //eigen decomposition and calculate log|Ve| - logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); - + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + //calculate Qi and log|Q| - logdet_Q=CalcQi (eval, D_l, W, Qi); - + logdet_Q=CalcQi (eval, D_l, W, Qi); + //calculate UltVehiY gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); - + //calculate WHix, WHiy, xHiy, xHix for (size_t i=0; isize, c_size=W->size1, d_size=V_g->size1; size_t dc_size=d_size*c_size; double delta, dl, d, dy, dw, logdet_Ve, logdet_Q; - + gsl_vector *D_l=gsl_vector_alloc (d_size); gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); @@ -870,67 +871,67 @@ void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix gsl_vector *QiWHiy=gsl_vector_alloc (dc_size); gsl_vector *beta=gsl_vector_alloc (dc_size); gsl_matrix *Vbeta=gsl_matrix_alloc (dc_size, dc_size); - + gsl_vector_set_zero (WHiy); - + //eigen decomposition and calculate log|Ve| - logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); - + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + //calculate Qi and log|Q| - logdet_Q=CalcQi (eval, D_l, W, Qi); - + logdet_Q=CalcQi (eval, D_l, W, Qi); + //calculate UltVehiY gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); - + //calculate WHiy for (size_t i=0; isize2; j++) { for (size_t i=0; isize1; i++) { gsl_matrix_set(B, i, j, gsl_vector_get(beta, j*d_size+i)); gsl_matrix_set(se_B, i, j, sqrt(gsl_matrix_get(Vbeta, j*d_size+i, j*d_size+i))); } - } - + } + //free matrices gsl_vector_free(D_l); gsl_matrix_free(UltVeh); @@ -941,7 +942,7 @@ void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix gsl_vector_free(QiWHiy); gsl_vector_free(beta); gsl_matrix_free(Vbeta); - + return; } @@ -961,42 +962,42 @@ void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_ gsl_matrix_set_zero (Hi_all); gsl_matrix_set_zero (Qi); logdet_H=0.0; logdet_Q=0.0; - + size_t n_size=eval->size, c_size=X->size1, d_size=V_g->size1; - double logdet_Ve=0.0, delta, dl, d; - + double logdet_Ve=0.0, delta, dl, d; + gsl_matrix *mat_dd=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); gsl_vector *D_l=gsl_vector_alloc (d_size); - + //calculate D_l, UltVeh and UltVehi logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); - + //calculate each Hi and log|H_k| logdet_H=(double)n_size*logdet_Ve; for (size_t k=0; ksize2, d_size=Y->size1; - + for (size_t k=0; ksize2, c_size=X->size1, d_size=Hi_all->size1; double d; - + for (size_t k=0; ksize2; - + for (size_t k=0; ksize2, d_size=Y->size1, dc_size=xHi->size1; - + for (size_t k=0; k=d_size || j>=d_size) {cout<<"error in GetIndex."<size; - + double delta, d1, d2; - + for (size_t k=0; ksize, d_size=Hiy->size1; - + double delta, d; - + for (size_t k=0; ksize, dc_size=xHi->size1; size_t d_size=xHi->size2/n_size; - + double delta; - + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *mat_dcdc_t=gsl_matrix_alloc (dc_size, dc_size); - + for (size_t k=0; ksize, d_size=Hiy->size1; - + double delta, d_Hiy_i1, d_Hiy_j1, d_Hiy_i2, d_Hiy_j2, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; - + for (size_t k=0; ksize, d_size=Hiy->size1; - + double delta, d_Hiy_i, d_Hiy_j, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; - + for (size_t k=0; ksize, d_size=Hi->size1, dc_size=xHi->size1; - + double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; - + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); - + for (size_t k=0; ksize, d_size=Hi->size1; double delta, d; - + for (size_t k=0; ksize, d_size=Hi->size1; double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; - + for (size_t k=0; ksize1, d_size=Hi->size1; size_t v=GetIndex(i, j, d_size); - + double d; - + //calculate the first part: trace(HiD) Calc_traceHiD (eval, Hi, i, j, tPD_g, tPD_e); - + //calculate the second part: -trace(HixQixHiD) for (size_t k=0; ksize1, d_size=Hi->size1; size_t v_size=d_size*(d_size+1)/2; size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size); - + double d; - + //calculate the first part: trace(HiDHiD) Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge); @@ -1549,7 +1550,7 @@ void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_mat gsl_vector_const_view xHiDHiDHix_gg_row=gsl_matrix_const_row (xHiDHiDHix_gg, i); gsl_vector_const_view xHiDHiDHix_ee_row=gsl_matrix_const_row (xHiDHiDHix_ee, i); gsl_vector_const_view xHiDHiDHix_ge_row=gsl_matrix_const_row (xHiDHiDHix_ge, i); - + gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_gg_row.vector, &d); tPDPD_gg-=d; gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ee_row.vector, &d); @@ -1560,7 +1561,7 @@ void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_mat } //calculate the fourth part: trace(HixQixHiDHixQixHiD) - for (size_t i=0; isize1; size_t v; - + for (size_t i=0; isize2/eval->size, dc_size=xHi->size1; size_t v; - + for (size_t i=0; isize1; size_t v1, v2; - + for (size_t i1=0; i1size2/eval->size, dc_size=xHi->size1; - size_t v1, v2; - + size_t v1, v2; + for (size_t i1=0; i1size; double delta, d_Hi_ij; - + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *mat_dcdc_temp=gsl_matrix_alloc (dc_size, dc_size); - + for (size_t k=0; ksize1; size_t v_size=xHiDHix_all_g->size2/dc_size; - - for (size_t i=0; isize2; i++) { gsl_vector_const_view vec_g=gsl_matrix_const_column (vec_all_g, i); gsl_vector_const_view vec_e=gsl_matrix_const_column (vec_all_e, i); - + gsl_vector_view Qivec_g=gsl_matrix_column (Qivec_all_g, i); gsl_vector_view Qivec_e=gsl_matrix_column (Qivec_all_e, i); - + gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_g.vector, 0.0, &Qivec_g.vector); gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_e.vector, 0.0, &Qivec_e.vector); } - + return; } @@ -1833,18 +1834,18 @@ void Calc_QiMat_all (const gsl_matrix *Qi, const gsl_matrix *mat_all_g, const gs { size_t dc_size=Qi->size1; size_t v_size=mat_all_g->size2/mat_all_g->size1; - + for (size_t i=0; isize1; size_t v=GetIndex(i, j, d_size); - - double d; - + + double d; + //first part: ytHiDHiy Calc_yHiDHiy (eval, Hiy, i, j, yPDPy_g, yPDPy_e); - + //second and third parts: -(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy) gsl_vector_const_view xHiDHiy_g=gsl_matrix_const_column (xHiDHiy_all_g, v); gsl_vector_const_view xHiDHiy_e=gsl_matrix_const_column (xHiDHiy_all_e, v); - + gsl_blas_ddot(QixHiy, &xHiDHiy_g.vector, &d); yPDPy_g-=d*2.0; gsl_blas_ddot(QixHiy, &xHiDHiy_e.vector, &d); - yPDPy_e-=d*2.0; - + yPDPy_e-=d*2.0; + //fourth part: +(yHix)Qi(xHiDHix)Qi(xHiy) gsl_vector_const_view xHiDHixQixHiy_g=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v); gsl_vector_const_view xHiDHixQixHiy_e=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v); - + gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_g.vector, &d); yPDPy_g+=d; gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_e.vector, &d); @@ -1894,73 +1895,73 @@ void Calc_yPDPy (const gsl_vector *eval, const gsl_matrix *Hiy, const gsl_vector //+(yHix)Qi(xHiDHiDHix)Qi(xHiy) //-(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy) void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *QixHiDHiy_all_g, const gsl_matrix *QixHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const gsl_matrix *QixHiDHixQixHiy_all_g, const gsl_matrix *QixHiDHixQixHiy_all_e, const gsl_matrix *xHiDHiDHiy_all_gg, const gsl_matrix *xHiDHiDHiy_all_ee, const gsl_matrix *xHiDHiDHiy_all_ge, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yPDPDPy_gg, double &yPDPDPy_ee, double &yPDPDPy_ge) -{ +{ size_t d_size=Hi->size1, dc_size=xHi->size1; size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size); - size_t v_size=d_size*(d_size+1)/2; - + size_t v_size=d_size*(d_size+1)/2; + double d; - + gsl_vector *xHiDHiDHixQixHiy=gsl_vector_alloc (dc_size); - + //first part: yHiDHiDHiy - Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge); - - //second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy) + Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge); + + //second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy) gsl_vector_const_view xHiDHiDHiy_gg1=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v1*v_size+v2); gsl_vector_const_view xHiDHiDHiy_ee1=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v1*v_size+v2); gsl_vector_const_view xHiDHiDHiy_ge1=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v1*v_size+v2); - + gsl_vector_const_view xHiDHiDHiy_gg2=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v2*v_size+v1); gsl_vector_const_view xHiDHiDHiy_ee2=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v2*v_size+v1); gsl_vector_const_view xHiDHiDHiy_ge2=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v2*v_size+v1); - - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d); + + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d); yPDPDPy_gg-=d; - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d); + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d); yPDPDPy_ee-=d; - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d); + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d); yPDPDPy_ge-=d; - - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d); + + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d); yPDPDPy_gg-=d; - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d); + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d); yPDPDPy_ee-=d; - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d); + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d); yPDPDPy_ge-=d; - + //fourth part: -(yHiDHix)Qi(xHiDHiy) gsl_vector_const_view xHiDHiy_g1=gsl_matrix_const_column (xHiDHiy_all_g, v1); gsl_vector_const_view xHiDHiy_e1=gsl_matrix_const_column (xHiDHiy_all_e, v1); gsl_vector_const_view QixHiDHiy_g2=gsl_matrix_const_column (QixHiDHiy_all_g, v2); gsl_vector_const_view QixHiDHiy_e2=gsl_matrix_const_column (QixHiDHiy_all_e, v2); - + gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_g2.vector, &d); yPDPDPy_gg-=d; gsl_blas_ddot(&xHiDHiy_e1.vector, &QixHiDHiy_e2.vector, &d); yPDPDPy_ee-=d; gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_e2.vector, &d); yPDPDPy_ge-=d; - + //fifth and sixth parts: +(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy) gsl_vector_const_view QixHiDHiy_g1=gsl_matrix_const_column (QixHiDHiy_all_g, v1); gsl_vector_const_view QixHiDHiy_e1=gsl_matrix_const_column (QixHiDHiy_all_e, v1); - + gsl_vector_const_view xHiDHixQixHiy_g1=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v1); gsl_vector_const_view xHiDHixQixHiy_e1=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v1); gsl_vector_const_view xHiDHixQixHiy_g2=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v2); gsl_vector_const_view xHiDHixQixHiy_e2=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v2); - + gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_g2.vector, &d); yPDPDPy_gg+=d; gsl_blas_ddot(&xHiDHixQixHiy_g2.vector, &QixHiDHiy_g1.vector, &d); yPDPDPy_gg+=d; - + gsl_blas_ddot(&xHiDHixQixHiy_e1.vector, &QixHiDHiy_e2.vector, &d); yPDPDPy_ee+=d; gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_e1.vector, &d); yPDPDPy_ee+=d; - + gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_e2.vector, &d); yPDPDPy_ge+=d; gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_g1.vector, &d); @@ -1970,7 +1971,7 @@ void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matri gsl_matrix_const_view xHiDHiDHix_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_const_view xHiDHiDHix_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_const_view xHiDHiDHix_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); - + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_gg.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); yPDPDPy_gg+=d; @@ -1980,21 +1981,21 @@ void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matri gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ge.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); yPDPDPy_ge+=d; - + //eighth part: -(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy) gsl_vector_const_view QixHiDHixQixHiy_g1=gsl_matrix_const_column (QixHiDHixQixHiy_all_g, v1); gsl_vector_const_view QixHiDHixQixHiy_e1=gsl_matrix_const_column (QixHiDHixQixHiy_all_e, v1); - + gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_g2.vector, &d); yPDPDPy_gg-=d; gsl_blas_ddot(&QixHiDHixQixHiy_e1.vector, &xHiDHixQixHiy_e2.vector, &d); yPDPDPy_ee-=d; gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_e2.vector, &d); yPDPDPy_ge-=d; - - //free memory - gsl_vector_free(xHiDHiDHixQixHiy); - + + //free memory + gsl_vector_free(xHiDHiDHixQixHiy); + return; } @@ -2005,62 +2006,62 @@ void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matri void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t d_size, double &crt_a, double &crt_b, double &crt_c) { crt_a=0.0; crt_b=0.0; crt_c=0.0; - + size_t dc_size=Qi->size1, v_size=Hessian_inv->size1/2; size_t c_size=dc_size/d_size; double h_gg, h_ge, h_ee, d, B=0.0, C=0.0, D=0.0; double trCg1, trCe1, trCg2, trCe2, trB_gg, trB_ge, trB_ee, trCC_gg, trCC_ge, trCC_ee, trD_gg=0.0, trD_ge=0.0, trD_ee=0.0; - + gsl_matrix *QiMQi_g1=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQi_e1=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQi_g2=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQi_e2=gsl_matrix_alloc (dc_size, dc_size); - + gsl_matrix *QiMQisQisi_g1=gsl_matrix_alloc (d_size, d_size); gsl_matrix *QiMQisQisi_e1=gsl_matrix_alloc (d_size, d_size); gsl_matrix *QiMQisQisi_g2=gsl_matrix_alloc (d_size, d_size); gsl_matrix *QiMQisQisi_e2=gsl_matrix_alloc (d_size, d_size); - + gsl_matrix *QiMQiMQi_gg=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQiMQi_ge=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQiMQi_ee=gsl_matrix_alloc (dc_size, dc_size); - + gsl_matrix *QiMMQi_gg=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMMQi_ge=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMMQi_ee=gsl_matrix_alloc (dc_size, dc_size); - - gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size); - + + gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *M_dd=gsl_matrix_alloc (d_size, d_size); gsl_matrix *M_dcdc=gsl_matrix_alloc (dc_size, dc_size); - + //invert Qi_sub to Qi_si gsl_matrix *Qi_sub=gsl_matrix_alloc (d_size, d_size); - + gsl_matrix_const_view Qi_s=gsl_matrix_const_submatrix (Qi, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); - + int sig; gsl_permutation * pmt=gsl_permutation_alloc (d_size); - + gsl_matrix_memcpy (Qi_sub, &Qi_s.matrix); LUDecomp (Qi_sub, pmt, &sig); LUInvert (Qi_sub, pmt, Qi_si); - + gsl_permutation_free(pmt); gsl_matrix_free(Qi_sub); - + //calculate correctation factors for (size_t v1=0; v1size1, d_size=Hi->size1; @@ -2276,73 +2277,73 @@ void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi double dev1_g, dev1_e, dev2_gg, dev2_ee, dev2_ge; gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); - + gsl_matrix *xHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *xHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *xHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size); - gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); gsl_matrix *xHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *xHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size); - + gsl_matrix *QixHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *QixHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *QixHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size); - gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); gsl_matrix *QixHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *QixHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size); - + gsl_matrix *xHiDHiDHiy_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size); gsl_matrix *xHiDHiDHiy_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size); gsl_matrix *xHiDHiDHiy_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size); gsl_matrix *xHiDHiDHix_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); gsl_matrix *xHiDHiDHix_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); gsl_matrix *xHiDHiDHix_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); - + //calculate xHiDHiy_all, xHiDHix_all and xHiDHixQixHiy_all - Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e); + Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e); Calc_xHiDHix_all (eval, xHi, xHiDHix_all_g, xHiDHix_all_e); Calc_xHiDHixQixHiy_all (xHiDHix_all_g, xHiDHix_all_e, QixHiy, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e); - + Calc_xHiDHiDHiy_all (v_size, eval, Hi, xHi, Hiy, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge); Calc_xHiDHiDHix_all (v_size, eval, Hi, xHi, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge); - + //calculate QixHiDHiy_all, QixHiDHix_all and QixHiDHixQixHiy_all Calc_QiVec_all (Qi, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e); Calc_QiVec_all (Qi, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e); Calc_QiMat_all (Qi, xHiDHix_all_g, xHiDHix_all_e, QixHiDHix_all_g, QixHiDHix_all_e); - + double tHiD_g, tHiD_e, tPD_g, tPD_e, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge, tPDPD_gg, tPDPD_ee, tPDPD_ge; double yPDPy_g, yPDPy_e, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge; - //calculate gradient and Hessian for Vg + //calculate gradient and Hessian for Vg for (size_t i1=0; i11) { CalcCRT (Hessian_inv, Qi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, d_size, crt_a, crt_b, crt_c); } else { - crt_a=0.0; crt_b=0.0; crt_c=0.0; - } - + crt_a=0.0; crt_b=0.0; crt_c=0.0; + } + gsl_matrix_free(xHiDHiy_all_g); gsl_matrix_free(xHiDHiy_all_e); gsl_matrix_free(xHiDHix_all_g); - gsl_matrix_free(xHiDHix_all_e); + gsl_matrix_free(xHiDHix_all_e); gsl_matrix_free(xHiDHixQixHiy_all_g); gsl_matrix_free(xHiDHixQixHiy_all_e); - + gsl_matrix_free(QixHiDHiy_all_g); gsl_matrix_free(QixHiDHiy_all_e); gsl_matrix_free(QixHiDHix_all_g); - gsl_matrix_free(QixHiDHix_all_e); + gsl_matrix_free(QixHiDHix_all_e); gsl_matrix_free(QixHiDHixQixHiy_all_g); gsl_matrix_free(QixHiDHixQixHiy_all_e); - + gsl_matrix_free(xHiDHiDHiy_all_gg); gsl_matrix_free(xHiDHiDHiy_all_ee); gsl_matrix_free(xHiDHiDHiy_all_ge); gsl_matrix_free(xHiDHiDHix_all_gg); gsl_matrix_free(xHiDHiDHix_all_ee); gsl_matrix_free(xHiDHiDHix_all_ge); - + return; } @@ -2452,25 +2453,25 @@ void UpdateVgVe (const gsl_matrix *Hessian_inv, const gsl_vector *gradient, cons { size_t v_size=gradient->size/2, d_size=V_g->size1; size_t v; - + gsl_vector *vec_v=gsl_vector_alloc (v_size*2); - + double d; - + //vectorize Vg and Ve for (size_t i=0; isize, c_size=X->size1, d_size=Y->size1; size_t dc_size=d_size*c_size; size_t v_size=d_size*(d_size+1)/2; - + double logdet_H, logdet_Q, yPy, logl_const, logl_old=0.0, logl_new=0.0, step_scale; int sig; size_t step_iter, flag_pd; - + gsl_matrix *Vg_save=gsl_matrix_alloc (d_size, d_size); gsl_matrix *Ve_save=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_temp=gsl_matrix_alloc (d_size, d_size); gsl_matrix *U_temp=gsl_matrix_alloc (d_size, d_size); gsl_vector *D_temp=gsl_vector_alloc (d_size); gsl_vector *xHiy=gsl_vector_alloc (dc_size); - gsl_vector *QixHiy=gsl_vector_alloc (dc_size); + gsl_vector *QixHiy=gsl_vector_alloc (dc_size); gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size); - - gsl_vector *gradient=gsl_vector_alloc (v_size*2); - + + gsl_vector *gradient=gsl_vector_alloc (v_size*2); + //calculate |XXt| and (XXt)^{-1} gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt); for (size_t i=0; i10 ) && step_iter<10 && t!=0); @@ -2602,21 +2603,21 @@ double MphNR (const char func_name, const size_t max_iter, const double max_prec gsl_matrix_memcpy (V_e, Ve_save); break; } - + if (logl_new-logl_oldsize, c_size=X->size1, d_size=Y->size1; + + size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1; double a, b, c; double lambda, logl, vg, ve; - + //Initial the diagonal elements of Vg and Ve using univariate LMM and REML estimates - gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size); + gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size); gsl_vector *beta_temp=gsl_vector_alloc(c_size); gsl_vector *se_beta_temp=gsl_vector_alloc(c_size); - - gsl_matrix_transpose_memcpy (Xt, X); - + + gsl_matrix_transpose_memcpy (Xt, X); + for (size_t i=0; i4) { //first obtain good initial values @@ -2707,48 +2708,48 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter gsl_matrix *UltVehiY=gsl_matrix_alloc (2, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (2, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (2, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (2, 2*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (2, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc (2*c_size, 2*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc (6, 6); - + //2 by n matrix of Y gsl_matrix *Y_sub=gsl_matrix_alloc (2, n_size); gsl_matrix *Vg_sub=gsl_matrix_alloc (2, 2); gsl_matrix *Ve_sub=gsl_matrix_alloc (2, 2); gsl_matrix *B_sub=gsl_matrix_alloc (2, c_size); - + for (size_t i=0; i +void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) { - igzstream infile (file_geno.c_str(), igzstream::in); -// ifstream infile (file_geno.c_str(), ifstream::in); - if (!infile) {cout<<"error reading genotype file:"<size1, d_size=UtY->size2, c_size=UtW->size2; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; - + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -2959,17 +2960,17 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); - + gsl_vector *x=gsl_vector_alloc (n_size); gsl_vector *x_miss=gsl_vector_alloc (n_size); - + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); @@ -2977,31 +2978,31 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); gsl_vector *beta=gsl_vector_alloc (d_size); gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); - + //null estimates for initial values gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); - - gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); - + gsl_matrix_transpose_memcpy (Y, UtY); gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); - + gsl_vector_view X_row=gsl_matrix_row(X, c_size); gsl_vector_set_zero(&X_row.vector); gsl_vector_view B_col=gsl_matrix_column(B, c_size); - gsl_vector_set_zero(&B_col.vector); + gsl_vector_set_zero(&B_col.vector); MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); - logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); - + c=0; Vg_remle_null.clear(); Ve_remle_null.clear(); @@ -3014,7 +3015,7 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs c++; } } - beta_remle_null.clear(); + beta_remle_null.clear(); se_beta_remle_null.clear(); for (size_t i=0; isize1; i++) { for (size_t j=0; jsize2; j++) { @@ -3023,10 +3024,10 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs } } logl_remle_H0=logl_H0; - + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); cout.precision(4); - + cout<<"REMLE estimate for Vg in the null model: "<=1) {break;} - !safeGetline(infile, line).eof(); + + +// if (t>1) {break;} if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} - if (indicator_snp[t]==0) {continue;} - - ch_ptr=strtok ((char *)line.c_str(), " , \t"); - ch_ptr=strtok (NULL, " , \t"); - ch_ptr=strtok (NULL, " , \t"); + // read SNP header + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast(&bgen_N),4); + infile.read(reinterpret_cast(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast(unzipped_data), reinterpret_cast(&unzipped_data_size), reinterpret_cast(zipped_data), static_cast (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast(unzipped_data),bgen_P); + } x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); - for (size_t i=0; i(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; } c_phen++; } - x_mean/=(double)(ni_test-n_miss); - + x_mean/=static_cast(ni_test-n_miss); + for (size_t i=0; i1) {gsl_vector_scale(beta, -1.0);} - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; for (size_t i=0; i b; - + + string line; + char *ch_ptr; + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; double crt_a, crt_b, crt_c; - int n_bit, n_miss, ci_total, ci_test; + int n_miss, c_phen; double geno, x_mean; size_t c=0; // double s=0.0; - size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; - + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -3324,50 +3438,49 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); - + gsl_vector *x=gsl_vector_alloc (n_size); - + gsl_vector *x_miss=gsl_vector_alloc (n_size); + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); gsl_vector *beta=gsl_vector_alloc (d_size); gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); - + //null estimates for initial values gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); - gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); - - gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); - + gsl_matrix_transpose_memcpy (Y, UtY); + gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); - + gsl_vector_view X_row=gsl_matrix_row(X, c_size); gsl_vector_set_zero(&X_row.vector); gsl_vector_view B_col=gsl_matrix_column(B, c_size); - gsl_vector_set_zero(&B_col.vector); - - //time_start=clock(); + gsl_vector_set_zero(&B_col.vector); + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); - logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); - //cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<size1; i++) { for (size_t j=0; jsize2; j++) { @@ -3389,9 +3502,10 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } } logl_remle_H0=logl_H0; - + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); cout.precision(4); + cout<<"REMLE estimate for Vg in the null model: "<=1) {break;} + !safeGetline(infile, line).eof(); + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} if (indicator_snp[t]==0) {continue;} - - //if (t>=0) {break;} - //if (snpInfo[t].rs_number!="MAG18140902") {continue;} - //cout<1) { gsl_vector_set(x, i, 2-geno); } - } - - /* - if (t==0) { - ofstream outfile ("./snp1.txt", ofstream::out); - if (!outfile) {cout<<"error writing file: "<size; i++) { - outfile<1) {gsl_vector_scale(beta, -1.0);} - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; for (size_t i=0; i b; + + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; + double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; + double crt_a, crt_b, crt_c; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + size_t c=0; + // double s=0.0; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; - double logl, crt_a, crt_b, crt_c; - //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -3706,44 +3803,1248 @@ void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); - - //transpose matrices + + gsl_vector *x=gsl_vector_alloc (n_size); + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *W=gsl_matrix_alloc (c_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + gsl_matrix_transpose_memcpy (Y, UtY); - gsl_matrix_transpose_memcpy (W, UtW); - - //initial, EM, NR, and calculate B - MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, W, Y, l_min, l_max, n_region, V_g, V_e, B); - logl=MphEM ('R', em_iter, em_prec, eval, W, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); - logl=MphNR ('R', nr_iter, nr_prec, eval, W, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - MphCalcBeta (eval, W, Y, V_g, V_e, UltVehiY, B, se_B); + gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); - //free matrices - gsl_matrix_free(U_hat); - gsl_matrix_free(E_hat); - gsl_matrix_free(OmegaU); - gsl_matrix_free(OmegaE); - gsl_matrix_free(UltVehiY); - gsl_matrix_free(UltVehiBX); - gsl_matrix_free(UltVehiU); - gsl_matrix_free(UltVehiE); - - gsl_matrix_free(Hi_all); - gsl_matrix_free(Hiy_all); - gsl_matrix_free(xHi_all); - gsl_matrix_free(Hessian); - - gsl_matrix_free(Y); - gsl_matrix_free(W); - - return; -} + gsl_vector_view X_row=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row.vector); + gsl_vector_view B_col=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col.vector); + + //time_start=clock(); + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); + + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); + //cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<size1; i++) { + for (size_t j=0; jsize2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + cout<<"REMLE estimate for Vg in the null model: "<=0) {break;} + //if (snpInfo[t].rs_number!="MAG18140902") {continue;} + //cout<1) { + gsl_vector_set(x, i, 2-geno); + } + } + + /* + if (t==0) { + ofstream outfile ("./snp1.txt", ofstream::out); + if (!outfile) {cout<<"error writing file: "<size; i++) { + outfile<1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; isize1, d_size=UtY->size2, c_size=UtW->size2+2; + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + gsl_vector *x_miss=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values; including env but not including x + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null1=gsl_matrix_alloc (d_size, c_size-1); + gsl_matrix *se_B_null2=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub1=gsl_matrix_submatrix (X, 0, 0, c_size-1, n_size); + gsl_matrix_view B_sub1=gsl_matrix_submatrix (B, 0, 0, d_size, c_size-1); + gsl_matrix_view xHi_all_sub1=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*(c_size-1), d_size*n_size); + + gsl_matrix_view X_sub2=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub2=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub2=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + + gsl_matrix_view X_sub0=gsl_matrix_submatrix (X, 0, 0, c_size-2, n_size); + gsl_matrix_transpose_memcpy (&X_sub0.matrix, UtW); + gsl_vector_view X_row0=gsl_matrix_row(X, c_size-2); + gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &X_row0.vector); + + gsl_vector_view X_row1=gsl_matrix_row(X, c_size-1); + gsl_vector_set_zero(&X_row1.vector); + gsl_vector_view X_row2=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row2.vector); + + gsl_vector_view B_col1=gsl_matrix_column(B, c_size-1); + gsl_vector_set_zero(&B_col1.vector); + gsl_vector_view B_col2=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col2.vector); + + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub1.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub1.matrix); + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1); + + c=0; + Vg_remle_null.clear(); + Ve_remle_null.clear(); + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null1, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + + cout<<"REMLE estimate for Vg in the null model: "<1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row1.vector); + gsl_vector_mul (x, env); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row2.vector); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + if (a_mode==2 || a_mode==3 || a_mode==4) { + if (a_mode==3 || a_mode==4) { + logl_H0=MphEM ('R', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix); + logl_H0=MphNR ('R', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2); + } + + if (a_mode==2 || a_mode==4) { + logl_H0=MphEM ('L', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix); + logl_H0=MphNR ('L', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2); + } + } + + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + if (p_score1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i b; + + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; + double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; + double crt_a, crt_b, crt_c; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + size_t c=0; + // double s=0.0; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2+2; + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null1=gsl_matrix_alloc (d_size, c_size-1); + gsl_matrix *se_B_null2=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub1=gsl_matrix_submatrix (X, 0, 0, c_size-1, n_size); + gsl_matrix_view B_sub1=gsl_matrix_submatrix (B, 0, 0, d_size, c_size-1); + gsl_matrix_view xHi_all_sub1=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*(c_size-1), d_size*n_size); + + gsl_matrix_view X_sub2=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub2=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub2=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + + gsl_matrix_view X_sub0=gsl_matrix_submatrix (X, 0, 0, c_size-2, n_size); + gsl_matrix_transpose_memcpy (&X_sub0.matrix, UtW); + gsl_vector_view X_row0=gsl_matrix_row(X, c_size-2); + gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &X_row0.vector); + + gsl_vector_view X_row1=gsl_matrix_row(X, c_size-1); + gsl_vector_set_zero(&X_row1.vector); + gsl_vector_view X_row2=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row2.vector); + + gsl_vector_view B_col1=gsl_matrix_column(B, c_size-1); + gsl_vector_set_zero(&B_col1.vector); + gsl_vector_view B_col2=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col2.vector); + + //time_start=clock(); + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub1.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub1.matrix); + + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1); + //cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<size1; i++) { + for (size_t j=0; jsize2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null1, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + cout<<"REMLE estimate for Vg in the null model: "<=0) {break;} + //if (snpInfo[t].rs_number!="MAG18140902") {continue;} + //cout<1) { + gsl_vector_set(x, i, 2-geno); + } + } + + /* + if (t==0) { + ofstream outfile ("./snp1.txt", ofstream::out); + if (!outfile) {cout<<"error writing file: "<size; i++) { + outfile<1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis - + vector snpInfo; //record SNP information - + // Not included in PARAM vector sumStat; //Output SNPSummary Data - + // Main functions void CopyFromParam (PARAM &cPar); void CopyToParam (PARAM &cPar); void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); + void Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); + void AnalyzeBimbamGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const gsl_vector *env); + void AnalyzePlinkGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const gsl_vector *env); void WriteFiles (); - + }; void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B); diff --git a/src/param.cpp b/src/param.cpp index 7a89ff8..c4b234a 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -24,6 +24,15 @@ #include #include +#include "gsl/gsl_randist.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + +#include "eigenlib.h" +#include "mathfunc.h" #ifdef FORCE_FLOAT #include "param_float.h" @@ -39,12 +48,12 @@ using namespace std; -PARAM::PARAM(void): +PARAM::PARAM(void): mode_silence (false), a_mode (0), k_mode(1), d_pace (100000), file_out("result"), path_out("./output/"), miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5), n_region(10),p_nr(0.001),em_prec(0.0001),nr_prec(0.0001),em_iter(10000),nr_iter(100),crt(0), -pheno_mean(0), +pheno_mean(0), noconstrain (false), h_min(-1), h_max(-1), h_scale(-1), rho_min(0.0), rho_max(1.0), rho_scale(-1), logp_min(0.0), logp_max(0.0), logp_scale(-1), @@ -55,53 +64,64 @@ n_accept(0), n_mh(10), geo_mean(2000.0), randseed(-1), +window_cm(0), window_bp(0), window_ns(0), error(false), - n_cvt(1), n_vc(1), +ni_subsample(0), n_cvt(1), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} //read files //obtain ns_total, ng_total, ns_test, ni_test -void PARAM::ReadFiles (void) +void PARAM::ReadFiles (void) { string file_str; - if (!file_mk.empty()) { + + + if (!file_cat.empty()) { + if (ReadFile_cat (file_cat, mapRS2cat, n_vc)==false) {error=true;} + } + + if (!file_var.empty()) { + if (ReadFile_var (file_var, mapRS2var)==false) {error=true;} + } + + if (!file_mk.empty()) { if (CountFileLines (file_mk, n_vc)==false) {error=true;} } - + if (!file_snps.empty()) { if (ReadFile_snps (file_snps, setSnps)==false) {error=true;} } else { setSnps.clear(); } - + //for prediction if (!file_epm.empty()) { if (ReadFile_est (file_epm, est_column, mapRS2est)==false) {error=true;} - + if (!file_bfile.empty()) { file_str=file_bfile+".bim"; - if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} - + if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} + file_str=file_bfile+".fam"; - if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} + if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} } - - if (!file_geno.empty()) { - if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} - - if (CountFileLines (file_geno, ns_total)==false) {error=true;} + + if (!file_geno.empty()) { + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + + if (CountFileLines (file_geno, ns_total)==false) {error=true;} } - + if (!file_ebv.empty() ) { if (ReadFile_column (file_ebv, indicator_bv, vec_bv, 1)==false) {error=true;} } - + if (!file_log.empty() ) { if (ReadFile_log (file_log, pheno_mean)==false) {error=true;} } - + //convert indicator_pheno to indicator_idv int k=1; for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_read[i]; ni_test+=indicator_idv[i]; } - + if (ni_test==0) { error=true; cout<<"error! number of analyzed individuals equals 0. "<1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<1) {cout<<"error! hwe level needs to be between 0 and 1. current value = "<1) {cout<<"error! r2 level needs to be between 0 and 1. current value = "<1) {cout<<"error! h values must be bewtween 0 and 1. current values = "<1) {cout<<"error! rho values must be between 0 and 1. current values = "<0) {cout<<"error! maximum logp value must be smaller than 0. current values = "<1.0) {cout<<"error! hscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! rscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! pscale value must be between 0 and 1. current value = "<1) { cout<<"error! pnr value must be between 0 and 1. current value = "<::size_type i=0; i<(indicator_idv).size(); ++i) { if (indicator_idv[i]==0) {continue;} ni_test++; } - + ni_cvt=0; for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_cvt[i]; ni_test+=indicator_idv[i]; } - } - + } + if ((indicator_read).size()!=0) { - ni_test=0; + ni_test=0; for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_read[i]; ni_test+=indicator_idv[i]; } } */ - if (ni_test==0) { + if (ni_test==0 && file_cor.empty() && file_mq.empty() && file_q.empty() && file_beta.empty() ) { error=true; cout<<"error! number of analyzed individuals equals 0. "<ns_test) {s_max=ns_test; cout<<"s_max is re-set to the number of analyzed SNPs."< > &Xt, gsl_matrix *K, const bool calc_K) { + string file_str; + + if (!file_bfile.empty()) { + file_str=file_bfile+".bed"; + if (ReadFile_bed (file_str, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test)==false) {error=true;} + } else { + if (ReadFile_geno (file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test)==false) {error=true;} + } + + return; +} + void PARAM::CalcKin (gsl_matrix *matrix_kin) { string file_str; - + gsl_matrix_set_zero (matrix_kin); - - if (!file_bfile.empty() ) { + + if (!file_bfile.empty() ) { file_str=file_bfile+".bed"; if (PlinkKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} } + else if (!file_oxford.empty() ) { + file_str=file_oxford+".bgen"; + if (bgenKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} + } else { file_str=file_geno; if (BimbamKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} } - + + return; +} + + + +//from an existing n by nd G matrix, compute the d by d S matrix +void compKtoS (const gsl_matrix *G, gsl_matrix *S) { + size_t n_vc=S->size1, ni_test=G->size1; + double di, dj, tr_KiKj, sum_Ki, sum_Kj, s_Ki, s_Kj, s_KiKj, si, sj, d; + + for (size_t i=0; in_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + +//from an existing n by nd (centered) G matrix, compute the d+1 by d*(d+1) Q matrix +//where inside i'th d+1 by d+1 matrix, each element is tr(KiKjKiKl)-r*tr(KjKiKl)-r*tr(KlKiKj)+r^2*tr(KjKl), where r=n/(n-1) +void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) { + size_t n_vc=G->size2/G->size1, ni_test=G->size1; + + gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, n_vc*(n_vc+1)/2*ni_test); + gsl_vector *trKiKjKi=gsl_vector_alloc ( n_vc*n_vc ); + gsl_vector *trKiKj=gsl_vector_alloc( n_vc*(n_vc+1)/2 ); + gsl_vector *trKi=gsl_vector_alloc(n_vc); + + double d, tr, r=(double)ni_test/(double)(ni_test-1); + size_t t, t_ij, t_il, t_jl, t_ii; + + //compute KiKj for all pairs of i and j (including the identity matrix) + t=0; + for (size_t i=0; il) { + gsl_blas_ddot (&KiKj_row.vector, &KiKl_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d); + tr-=r*d; + } else if (i>j && i<=l) { + gsl_blas_ddot (&KiKj_col.vector, &KiKl_col.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); + tr-=r*d; + } else { + gsl_blas_ddot (&KiKj_col.vector, &KiKl_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); + tr-=r*d; + } + } + + tr+=r*r*gsl_vector_get (trKiKj, t_jl); + } else if (j!=n_vc && l==n_vc) { + t_ij=GetabIndex (i+1, j+1, n_vc-2); + tr=gsl_vector_get (trKiKjKi, i*n_vc+j)-2*r*gsl_vector_get (trKiKj, t_ij)+r*r*gsl_vector_get (trKi, j); + } else if (j==n_vc && l==n_vc) { + t_ii=GetabIndex (i+1, i+1, n_vc-2); + tr=gsl_vector_get (trKiKj, t_ii)-2*r*gsl_vector_get (trKi, i)+r*r*(double)(ni_test-1); + } + + gsl_matrix_set (Q, j, i*(n_vc+1)+l, tr); + if (l!=j) {gsl_matrix_set (Q, l, i*(n_vc+1)+j, tr);} + } + } + } + + gsl_matrix_scale (Q, 1.0/pow((double)ni_test, 2) ); + + gsl_matrix_free(KiKj); + gsl_vector_free(trKiKjKi); + gsl_vector_free(trKiKj); + gsl_vector_free(trKi); + + return; +} + + + +//perform Jacknife sampling for variance of S +void JacknifeGtoS (const gsl_matrix *G, gsl_matrix *S, gsl_matrix *Svar) { + size_t n_vc=Svar->size1, ni_test=G->size1; + vector > > tr_KiKj, s_KiKj; + vector > sum_Ki, s_Ki, si; + vector vec_tmp; + double di, dj, d, m, v; + + //initialize and set all elements to zero + for (size_t i=0; itm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + //bootstrap: in each iteration, sample individuals and compute S_pmt + size_t n_pmt=100; + vector idv_order, idv_remove; + for (size_t i=0; i(&idv_remove[0]), n_pmt, static_cast(&idv_order[0]), ni_test, sizeof(size_t)); + + gsl_matrix *S_pmt=gsl_matrix_alloc(n_vc, n_vc*n_pmt); + for (size_t i=0; isize; ++i) { + outfile<size; ++i) { + outfile<size1; ++i) { for (size_t j=0; jsize2; ++j) { outfile<size; ++i) { outfile<::size_type i=0; i set_remove; - + //check if any columns is an intercept for (size_t i=0; isize2; i++) { gsl_vector_view w_col=gsl_matrix_column (W, i); gsl_vector_minmax (&w_col.vector, &v_min, &v_max); if (v_min==v_max) {flag_ipt=1; set_remove.insert (i);} } - + //add an intecept term if needed if (n_cvt==set_remove.size()) { indicator_cvt.clear(); @@ -697,19 +1326,19 @@ void PARAM::CheckCvt () if (indicator_idv[i]==0 || indicator_cvt[i]==0) {continue;} cvt[i].push_back(1.0); } - + n_cvt++; - } else {} - + } else {} + gsl_matrix_free(W); - + return; } //post-process phentoypes, covariates void PARAM::ProcessCvtPhen () -{ +{ //convert indicator_pheno to indicator_idv int k=1; indicator_idv.clear(); @@ -720,27 +1349,88 @@ void PARAM::ProcessCvtPhen () } indicator_idv.push_back(k); } - + //remove individuals with missing covariates if ((indicator_cvt).size()!=0) { for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_cvt[i]; } } - + + //remove individuals with missing gxe variables + if ((indicator_gxe).size()!=0) { + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_gxe[i]; + } + } + + //remove individuals with missing residual weights + if ((indicator_weight).size()!=0) { + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_weight[i]; + } + } + //obtain ni_test - ni_test=0; + ni_test=0; for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { - if (indicator_idv[i]==0) {continue;} + if (indicator_idv[i]==0) {continue;} ni_test++; } - + + + + //if subsample number is set, perform a random sub-sampling to determine the subsampled ids + if (ni_subsample!=0) { + if (ni_testtm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + //from ni_test, sub-sample ni_subsample + vector a, b; + for (size_t i=0; i(&a[0]), ni_subsample, static_cast(&b[0]), ni_test, sizeof (size_t) ); + + //re-set indicator_idv and ni_test + int j=0; + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + if (indicator_idv[i]==0) {continue;} + if(find(a.begin(), a.end(), j) == a.end()) { + indicator_idv[i]=0; + } + j++; + } + ni_test=ni_subsample; + } + } + + //check ni_test if (ni_test==0) { error=true; cout<<"error! number of analyzed individuals equals 0. "< cvt_row; cvt_row.push_back(1); - + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_cvt.push_back(1); - + cvt.push_back(cvt_row); } } - + return; } -void PARAM::CopyCvt (gsl_matrix *W) +void PARAM::CopyCvt (gsl_matrix *W) { size_t ci_test=0; - + for (vector::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i. */ -#ifndef __PARAM_H__ +#ifndef __PARAM_H__ #define __PARAM_H__ #include @@ -39,14 +39,17 @@ public: string a_major; size_t n_miss; double missingness; - double maf; + double maf; + size_t n_idv;//number of non-missing individuals + size_t n_nb;//number of neighbours on the right hand side + size_t file_position;//snp location on file }; //results for lmm class SUMSTAT { public: double beta; //REML estimator for beta - double se; //SE for beta + double se; //SE for beta double lambda_remle; //REML estimator for lambda double lambda_mle; //MLE estimator for lambda double p_wald; //p value from a Wald test @@ -75,50 +78,87 @@ public: double rho; double pge; double logp; - + size_t n_gamma; }; +//header class +class HEADER +{ + +public: + size_t rs_col; + size_t chr_col; + size_t pos_col; + size_t cm_col; + size_t a1_col; + size_t a0_col; + size_t z_col; + size_t beta_col; + size_t sebeta_col; + size_t chisq_col; + size_t p_col; + size_t n_col; + size_t nmis_col; + size_t nobs_col; + size_t af_col; + size_t var_col; + size_t ws_col; + size_t cor_col; + size_t coln;//number of columns +}; + class PARAM { -public: +public: // IO related parameters bool mode_silence; int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests - int k_mode; //kinship read mode: 1: n by n matrix, 2: id/id/k_value; + int k_mode; //kinship read mode: 1: n by n matrix, 2: id/id/k_value; vector p_column; //which phenotype column needs analysis size_t d_pace; //display pace - + string file_bfile; string file_geno; string file_pheno; string file_anno; //optional + string file_gxe; //optional string file_cvt; //optional + string file_cat; + string file_var; + string file_beta; + string file_cor; string file_kin; string file_ku, file_kd; string file_mk; + string file_q, file_mq; + string file_s, file_ms; + string file_v, file_mv; + string file_weight; string file_out; string path_out; - + + string file_epm; //estimated parameter file string file_ebv; //estimated breeding value file string file_log; //log file containing mean estimate - + string file_read; //file containing total number of reads string file_gene; //gene expression file - + string file_snps; //file containing analyzed snps or genes - - - - // QC related parameters +// WJA Added + string file_oxford; + + + // QC related parameters double miss_level; - double maf_level; + double maf_level; double hwe_level; double r2_level; - + // LMM related parameters double l_min; double l_max; @@ -130,7 +170,7 @@ public: vector Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; vector VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; vector beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null; - double p_nr; + double p_nr; double em_prec, nr_prec; size_t em_iter, nr_iter; size_t crt; @@ -138,15 +178,16 @@ public: //for fitting multiple variance components //the first three are of size n_vc, and the next two are of size n_vc+1 + bool noconstrain; vector v_traceG; vector v_pve; vector v_se_pve; vector v_sigma2; - vector v_se_sigma2; + vector v_se_sigma2; vector v_beta; - vector v_se_beta; - + vector v_se_beta; + // BSLMM MCMC related parameters double h_min, h_max, h_scale; //priors for h double rho_min, rho_max, rho_scale; //priors for rho @@ -163,7 +204,12 @@ public: double trace_G; HYPBSLMM cHyp_initial; - + + //VARCOV related parameters + double window_cm; + size_t window_bp; + size_t window_ns; + // Summary statistics bool error; size_t ni_total, ni_test, ni_cvt; //number of individuals @@ -171,6 +217,8 @@ public: size_t ns_total, ns_test; //number of snps size_t ng_total, ng_test; //number of genes size_t ni_control, ni_case; //number of controls and number of cases + size_t ni_subsample; //number of subsampled individuals + size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel size_t n_cvt; //number of covariates size_t n_ph; //number of phenotypes size_t n_vc; //number of variance components (including the diagonal matrix) @@ -186,42 +234,54 @@ public: // Data vector > pheno; //a vector record all phenotypes, NA replaced with -9 - vector > cvt; //a vector record all covariates, NA replaced with -9 + vector > cvt; //a vector record all covariates, NA replaced with -9 + vector gxe; //a vector record all covariates, NA replaced with -9 + vector weight; //a vector record weights for the individuals, which is useful for animal breeding studies vector > indicator_pheno; //a matrix record when a phenotype is missing for an individual; 0 missing, 1 available vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis vector indicator_cvt; //indicator for covariates, 0 missing, 1 available for analysis - + vector indicator_gxe; //indicator for gxe, 0 missing, 1 available for analysis + vector indicator_weight; //indicator for weight, 0 missing, 1 available for analysis + vector indicator_bv; //indicator for estimated breeding value file, 0 missing, 1 available for analysis vector indicator_read; //indicator for read file, 0 missing, 1 available for analysis vector vec_read; //total number of reads vector vec_bv; //breeding values vector est_column; - + map mapID2num; //map small ID number to number, from 0 to n-1 map mapRS2chr; //map rs# to chromosome location map mapRS2bp; //map rs# to base position map mapRS2cM; //map rs# to cM map mapRS2est; //map rs# to parameters - + map mapRS2cat; //map rs# to category number + map mapRS2var; //map rs# to category number + vector snpInfo; //record SNP information set setSnps; //a set of snps for analysis - + //constructor PARAM(); - + //functions - void ReadFiles (); - void CheckParam (); - void CheckData (); + void ReadFiles (); + void CheckParam (); + void CheckData (); void PrintSummary (); - void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + void ReadGenotypes (vector > &Xt, gsl_matrix *K, const bool calc_K); void CheckCvt (); void CopyCvt (gsl_matrix *W); + void CopyGxe (gsl_vector *gxe); + void CopyWeight (gsl_vector *w); void ProcessCvtPhen(); void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag); void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag); void CalcKin (gsl_matrix *matrix_kin); + void CalcS (gsl_matrix *S, gsl_matrix *Svar, gsl_matrix *Q); + void WriteVector (const gsl_vector *q, const gsl_vector *s, const size_t n_total, const string suffix); + void WriteVar (const string suffix); void WriteMatrix (const gsl_matrix *matrix_U, const string suffix); void WriteVector (const gsl_vector *vector_D, const string suffix); void CopyRead (gsl_vector *log_N); -- cgit v1.2.3