diff options
-rw-r--r-- | src/io.cpp | 2482 | ||||
-rw-r--r-- | src/io.h | 36 | ||||
-rw-r--r-- | src/lm.cpp | 516 | ||||
-rw-r--r-- | src/lm.h | 28 | ||||
-rw-r--r-- | src/lmm.cpp | 1517 | ||||
-rw-r--r-- | src/lmm.h | 30 | ||||
-rw-r--r-- | src/mvlmm.cpp | 3005 | ||||
-rw-r--r-- | src/mvlmm.h | 30 | ||||
-rw-r--r-- | src/param.cpp | 1122 | ||||
-rw-r--r-- | src/param.h | 122 |
10 files changed, 6790 insertions, 2098 deletions
@@ -28,7 +28,7 @@ #include <cstring> #include <cmath> #include <stdio.h> -#include <stdlib.h> +#include <stdlib.h> #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" @@ -39,6 +39,7 @@ #include "lapack.h" #include "gzstream.h" #include "mathfunc.h" +#include "eigenlib.h" #ifdef FORCE_FLOAT #include "io_float.h" @@ -54,10 +55,10 @@ using namespace std; //Print process bar void ProgressBar (string str, double p, double total) { - double progress = (100.0 * p / total); - int barsize = (int) (progress / 2.0); + double progress = (100.0 * p / total); + int barsize = (int) (progress / 2.0); char bar[51]; - + cout<<str; for (int i = 0; i <50; i++) { if (i<barsize) {bar[i] = '=';} @@ -65,7 +66,7 @@ void ProgressBar (string str, double p, double total) cout<<bar[i]; } cout<<setprecision(2)<<fixed<<progress<<"%\r"<<flush; - + return; } @@ -73,10 +74,10 @@ void ProgressBar (string str, double p, double total) //Print process bar (with acceptance ratio) void ProgressBar (string str, double p, double total, double ratio) { - double progress = (100.0 * p / total); - int barsize = (int) (progress / 2.0); + double progress = (100.0 * p / total); + int barsize = (int) (progress / 2.0); char bar[51]; - + cout<<str; for (int i = 0; i <50; i++) { if (i<barsize) {bar[i] = '=';} @@ -84,8 +85,8 @@ void ProgressBar (string str, double p, double total, double ratio) cout<<bar[i]; } cout<<setprecision(2)<<fixed<<progress<<"% "<<ratio<<"\r"<<flush; - - + + return; } @@ -130,18 +131,18 @@ bool ReadFile_snps (const string &file_snps, set<string> &setSnps) ifstream infile (file_snps.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;} - + string line; char *ch_ptr; - + while (getline(infile, line)) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); - setSnps.insert(ch_ptr); + setSnps.insert(ch_ptr); } - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -151,15 +152,15 @@ bool ReadFile_log (const string &file_log, double &pheno_mean) { ifstream infile (file_log.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open log file: "<<file_log<<endl; return false;} - + string line; char *ch_ptr; size_t flag=0; - + while (getline(infile, line)) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); - + if (ch_ptr!=NULL && strcmp(ch_ptr, "estimated")==0) { ch_ptr=strtok (NULL, " , \t"); if (ch_ptr!=NULL && strcmp(ch_ptr, "mean")==0) { @@ -171,13 +172,13 @@ bool ReadFile_log (const string &file_log, double &pheno_mean) } } } - + if (flag==1) {break;} } - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -187,18 +188,18 @@ bool ReadFile_anno (const string &file_anno, map<string, string> &mapRS2chr, map { mapRS2chr.clear(); mapRS2bp.clear(); - + ifstream infile (file_anno.c_str(), ifstream::in); if (!infile) {cout<<"error opening annotation file: "<<file_anno<<endl; return false;} - + string line; char *ch_ptr; - + string rs; long int b_pos; string chr; double cM; - + while (!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); rs=ch_ptr; @@ -208,15 +209,15 @@ bool ReadFile_anno (const string &file_anno, map<string, string> &mapRS2chr, map if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {chr="-9";} else {chr=ch_ptr;} ch_ptr=strtok (NULL, " , \t"); if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {cM=-9;} else {cM=atof(ch_ptr);} - + mapRS2chr[rs]=chr; mapRS2bp[rs]=b_pos; mapRS2cM[rs]=cM; } - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -225,28 +226,28 @@ bool ReadFile_column (const string &file_pheno, vector<int> &indicator_idv, vect { indicator_idv.clear(); pheno.clear(); - + igzstream infile (file_pheno.c_str(), igzstream::in); // ifstream infile (file_pheno.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open phenotype file: "<<file_pheno<<endl; return false;} - + string line; char *ch_ptr; - + string id; double p; while (!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); for (int i=0; i<(p_column-1); ++i) { - ch_ptr=strtok (NULL, " , \t"); - } + ch_ptr=strtok (NULL, " , \t"); + } if (strcmp(ch_ptr, "NA")==0) {indicator_idv.push_back(0); pheno.push_back(-9);} //pheno is different from pimass2 else {p=atof(ch_ptr); indicator_idv.push_back(1); pheno.push_back(p);} } - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -257,48 +258,48 @@ bool ReadFile_pheno (const string &file_pheno, vector<vector<int> > &indicator_p { indicator_pheno.clear(); pheno.clear(); - + igzstream infile (file_pheno.c_str(), igzstream::in); // ifstream infile (file_pheno.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open phenotype file: "<<file_pheno<<endl; return false;} string line; char *ch_ptr; - + string id; double p; - + vector<double> pheno_row; vector<int> ind_pheno_row; - + size_t p_max=*max_element(p_column.begin(), p_column.end() ); map<size_t, size_t> mapP2c; for (size_t i=0; i<p_column.size(); i++) { mapP2c[p_column[i]]=i; pheno_row.push_back(-9); ind_pheno_row.push_back(0); - } - + } + while (!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); - + size_t i=0; - while (i<p_max ) { + while (i<p_max ) { if (mapP2c.count(i+1)!=0) { if (strcmp(ch_ptr, "NA")==0) {ind_pheno_row[mapP2c[i+1]]=0; pheno_row[mapP2c[i+1]]=-9;} else {p=atof(ch_ptr); ind_pheno_row[mapP2c[i+1]]=1; pheno_row[mapP2c[i+1]]=p;} } i++; - ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); } - - indicator_pheno.push_back(ind_pheno_row); - pheno.push_back(pheno_row); + + indicator_pheno.push_back(ind_pheno_row); + pheno.push_back(pheno_row); } - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -306,44 +307,44 @@ bool ReadFile_pheno (const string &file_pheno, vector<vector<int> > &indicator_p bool ReadFile_cvt (const string &file_cvt, vector<int> &indicator_cvt, vector<vector<double> > &cvt, size_t &n_cvt) { indicator_cvt.clear(); - + ifstream infile (file_cvt.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open covariates file: "<<file_cvt<<endl; return false;} - + string line; char *ch_ptr; - double d; - - int flag_na=0; - + double d; + + int flag_na=0; + while (!safeGetline(infile, line).eof()) { vector<double> v_d; flag_na=0; ch_ptr=strtok ((char *)line.c_str(), " , \t"); while (ch_ptr!=NULL) { if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;} else {d=atof(ch_ptr);} - + v_d.push_back(d); - ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); } - if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} + if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} cvt.push_back(v_d); } - + if (indicator_cvt.empty()) {n_cvt=0;} else { flag_na=0; for (vector<int>::size_type i=0; i<indicator_cvt.size(); ++i) { if (indicator_cvt[i]==0) {continue;} - + if (flag_na==0) {flag_na=1; n_cvt=cvt[i].size();} if (flag_na!=0 && n_cvt!=cvt[i].size()) {cout<<"error! number of covariates in row "<<i<<" do not match other rows."<<endl; return false;} } } - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -353,20 +354,20 @@ bool ReadFile_cvt (const string &file_cvt, vector<int> &indicator_cvt, vector<ve bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo) { snpInfo.clear(); - + ifstream infile (file_bim.c_str(), ifstream::in); if (!infile) {cout<<"error opening .bim file: "<<file_bim<<endl; return false;} - + string line; char *ch_ptr; - + string rs; long int b_pos; string chr; double cM; string major; string minor; - + while (getline(infile, line)) { ch_ptr=strtok ((char *)line.c_str(), " \t"); chr=ch_ptr; @@ -380,13 +381,13 @@ bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo) minor=ch_ptr; ch_ptr=strtok (NULL, " \t"); major=ch_ptr; - - SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, -9, -9, -9}; + + SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, 0, -9, -9, 0, 0, 0}; snpInfo.push_back(sInfo); } - + infile.close(); - infile.clear(); + infile.clear(); return true; } @@ -396,8 +397,8 @@ bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno { indicator_pheno.clear(); pheno.clear(); - mapID2num.clear(); - + mapID2num.clear(); + igzstream infile (file_fam.c_str(), igzstream::in); //ifstream infile (file_fam.c_str(), ifstream::in); if (!infile) {cout<<"error opening .fam file: "<<file_fam<<endl; return false;} @@ -411,15 +412,15 @@ bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno vector<double> pheno_row; vector<int> ind_pheno_row; - + size_t p_max=*max_element(p_column.begin(), p_column.end() ); map<size_t, size_t> mapP2c; for (size_t i=0; i<p_column.size(); i++) { mapP2c[p_column[i]]=i; pheno_row.push_back(-9); ind_pheno_row.push_back(0); - } - + } + while (!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " \t"); ch_ptr=strtok (NULL, " \t"); @@ -428,7 +429,7 @@ bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno ch_ptr=strtok (NULL, " \t"); ch_ptr=strtok (NULL, " \t"); ch_ptr=strtok (NULL, " \t"); - + size_t i=0; while (i<p_max ) { if (mapP2c.count(i+1)!=0 ) { @@ -436,23 +437,23 @@ bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno ind_pheno_row[mapP2c[i+1]]=0; pheno_row[mapP2c[i+1]]=-9; } else { p=atof(ch_ptr); - + if (p==-9) {ind_pheno_row[mapP2c[i+1]]=0; pheno_row[mapP2c[i+1]]=-9;} else {ind_pheno_row[mapP2c[i+1]]=1; pheno_row[mapP2c[i+1]]=p;} } } i++; - ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); } - + indicator_pheno.push_back(ind_pheno_row); - pheno.push_back(pheno_row); - + pheno.push_back(pheno_row); + mapID2num[id]=c; c++; } - + infile.close(); - infile.clear(); + infile.clear(); return true; } @@ -466,7 +467,7 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g { indicator_snp.clear(); snpInfo.clear(); - + igzstream infile (file_geno.c_str(), igzstream::in); // ifstream infile (file_geno.c_str(), ifstream::in); if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} @@ -478,112 +479,118 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); - + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; + //eigenlib_dgemm("T", "N", 1.0, W, W, 0.0, WtW); + int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); - + double v_x, v_w; int c_idv=0; - + string line; char *ch_ptr; - + string rs; long int b_pos; string chr; string major; string minor; double cM; - + size_t file_pos; + double maf, geno, geno_old; size_t n_miss; size_t n_0, n_1, n_2; int flag_poly; - + int ni_total=indicator_idv.size(); int ni_test=0; for (int i=0; i<ni_total; ++i) { ni_test+=indicator_idv[i]; } ns_test=0; - - while (!safeGetline(infile, line).eof()) { + + file_pos=0; + while (!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); rs=ch_ptr; ch_ptr=strtok (NULL, " , \t"); minor=ch_ptr; ch_ptr=strtok (NULL, " , \t"); major=ch_ptr; - + if (setSnps.size()!=0 && setSnps.count(rs)==0) { - SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, -9}; - snpInfo.push_back(sInfo); - indicator_snp.push_back(0); - continue; + SNPINFO sInfo={"-9", rs, -9, -9, minor, major, 0, -9, -9, 0, 0, file_pos}; + snpInfo.push_back(sInfo); + indicator_snp.push_back(0); + + file_pos++; + continue; } - + if (mapRS2bp.count(rs)==0) {chr="-9"; b_pos=-9;cM=-9;} - else {b_pos=mapRS2bp[rs]; chr=mapRS2chr[rs]; cM=mapRS2cM[rs];} - + else {b_pos=mapRS2bp[rs]; chr=mapRS2chr[rs]; cM=mapRS2cM[rs];} + maf=0; n_miss=0; flag_poly=0; geno_old=-9; n_0=0; n_1=0; n_2=0; c_idv=0; gsl_vector_set_zero (genotype_miss); for (int i=0; i<ni_total; ++i) { ch_ptr=strtok (NULL, " , \t"); - if (indicator_idv[i]==0) {continue;} + if (indicator_idv[i]==0) {continue;} if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++; c_idv++; continue;} - + geno=atof(ch_ptr); if (geno>=0 && geno<=0.5) {n_0++;} if (geno>0.5 && geno<1.5) {n_1++;} if (geno>=1.5 && geno<=2.0) {n_2++;} - - gsl_vector_set (genotype, c_idv, geno); - + + gsl_vector_set (genotype, c_idv, geno); + // if (geno<0) {n_miss++; continue;} - + if (flag_poly==0) {geno_old=geno; flag_poly=2;} if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} - + maf+=geno; - + c_idv++; } - maf/=2.0*(double)(ni_test-n_miss); - - SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf}; + maf/=2.0*(double)(ni_test-n_miss); + + SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf, ni_test-n_miss, 0, file_pos}; snpInfo.push_back(sInfo); - + file_pos++; + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} - + if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} - + if (flag_poly!=1) {indicator_snp.push_back(0); continue;} - + if (hwe_level!=0 && maf_level!=-1) { if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;} } - + //filter SNP if it is correlated with W //unless W has only one column, of 1s - for (size_t i=0; i<genotype->size; ++i) { - if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + for (size_t i=0; i<genotype->size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} } - + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot (genotype, genotype, &v_x); gsl_blas_ddot (Wtx, WtWiWtx, &v_w); - + if (W->size2!=1 && v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} - - indicator_snp.push_back(1); + + indicator_snp.push_back(1); ns_test++; } - + gsl_vector_free (genotype); gsl_vector_free (genotype_miss); gsl_matrix_free (WtW); @@ -591,10 +598,10 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g gsl_vector_free (Wtx); gsl_vector_free (WtWiWtx); gsl_permutation_free (pmt); - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -602,13 +609,13 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g - + //Read bed file, the first time bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) { indicator_snp.clear(); size_t ns_total=snpInfo.size(); - + ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} @@ -619,25 +626,25 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); - + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; + int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); - + double v_x, v_w, geno; size_t c_idv=0; - + char ch[1]; bitset<8> b; - + size_t ni_total=indicator_idv.size(); size_t ni_test=0; for (size_t i=0; i<ni_total; ++i) { ni_test+=indicator_idv[i]; } ns_test=0; - + //calculate n_bit and c, the number of bit for each snp size_t n_bit; if (ni_total%4==0) {n_bit=ni_total/4;} @@ -648,19 +655,20 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl infile.read(ch,1); b=ch[0]; } - + double maf; size_t n_miss; - size_t n_0, n_1, n_2, c; - + size_t n_0, n_1, n_2, c; + //start reading snps and doing association test for (size_t t=0; t<ns_total; ++t) { infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - + if (setSnps.size()!=0 && setSnps.count(snpInfo[t].rs_number)==0) { snpInfo[t].n_miss=-9; snpInfo[t].missingness=-9; snpInfo[t].maf=-9; + snpInfo[t].file_position=t; indicator_snp.push_back(0); continue; } @@ -675,52 +683,55 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl if ((i==(n_bit-1)) && c==ni_total) {break;} if (indicator_idv[c]==0) {c++; continue;} c++; - + if (b[2*j]==0) { if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); maf+=2.0; n_2++;} else {gsl_vector_set(genotype, c_idv, 1.0); maf+=1.0; n_1++;} } else { - if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); maf+=0.0; n_0++;} + if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); maf+=0.0; n_0++;} else {gsl_vector_set(genotype_miss, c_idv, 1); n_miss++; } } c_idv++; } } maf/=2.0*(double)(ni_test-n_miss); - + snpInfo[t].n_miss=n_miss; snpInfo[t].missingness=(double)n_miss/(double)ni_test; snpInfo[t].maf=maf; - + snpInfo[t].n_idv=ni_test-n_miss; + snpInfo[t].n_nb=0; + snpInfo[t].file_position=t; + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} - + if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} - + if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;} - + if (hwe_level!=1 && maf_level!=-1) { if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;} } - - + + //filter SNP if it is correlated with W //unless W has only one column, of 1s - for (size_t i=0; i<genotype->size; ++i) { - if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + for (size_t i=0; i<genotype->size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} } - + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot (genotype, genotype, &v_x); gsl_blas_ddot (Wtx, WtWiWtx, &v_w); - + if (W->size2!=1 && v_w/v_x > r2_level) {indicator_snp.push_back(0); continue;} - - indicator_snp.push_back(1); + + indicator_snp.push_back(1); ns_test++; } - + gsl_vector_free (genotype); gsl_vector_free (genotype_miss); gsl_matrix_free (WtW); @@ -728,63 +739,177 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl gsl_vector_free (Wtx); gsl_vector_free (WtWiWtx); gsl_permutation_free (pmt); - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } -void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) + + +//read the genotype for one SNP; remember to read empty lines +//geno stores original genotypes without centering +//missing values are replaced by mean +bool Bimbam_ReadOneSNP (const size_t inc, const vector<int> &indicator_idv, igzstream &infile, gsl_vector *geno, double &geno_mean) +{ + size_t ni_total=indicator_idv.size(); + + // if (infile.eof()) {infile.clear();} + // infile.seekg(pos); + + string line; + char *ch_ptr; + bool flag=false; + + for (size_t i=0; i<inc; i++) { + !safeGetline(infile, line).eof(); + } + + if (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + geno_mean=0.0; + double d; + size_t c_idv=0; + vector<size_t> geno_miss; + + for (size_t i=0; i<ni_total; ++i) { + ch_ptr=strtok (NULL, " , \t"); + if (indicator_idv[i]==0) {continue;} + + if (strcmp(ch_ptr, "NA")==0) { + geno_miss.push_back(c_idv); + } else { + d=atof(ch_ptr); + gsl_vector_set (geno, c_idv, d); + geno_mean+=d; + } + c_idv++; + } + + geno_mean/=(double)(c_idv-geno_miss.size() ); + + for (size_t i=0; i<geno_miss.size(); ++i) { + gsl_vector_set(geno, geno_miss[i], geno_mean); + } + flag=true; + } + + return flag; +} + + +//for plink, store SNPs as double too +void Plink_ReadOneSNP (const int pos, const vector<int> &indicator_idv, ifstream &infile, gsl_vector *geno, double &geno_mean) +{ + size_t ni_total=indicator_idv.size(), n_bit; + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1;} + infile.seekg(pos*n_bit+3); //n_bit, and 3 is the number of magic numbers + + //read genotypes + char ch[1]; + bitset<8> b; + + geno_mean=0.0; + size_t c=0, c_idv=0; + vector<size_t> geno_miss; + + for (size_t i=0; i<n_bit; ++i) { + infile.read(ch,1); + b=ch[0]; + for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + if ((i==(n_bit-1)) && c==ni_total) {break;} + if (indicator_idv[c]==0) {c++; continue;} + c++; + + if (b[2*j]==0) { + if (b[2*j+1]==0) { + gsl_vector_set (geno, c_idv, 2); + geno_mean+=2.0; + } else { + gsl_vector_set (geno, c_idv, 1); + geno_mean+=1.0; + } + } else { + if (b[2*j+1]==1) { + gsl_vector_set (geno, c_idv, 0); + geno_mean+=0.0; + } else { + geno_miss.push_back(c_idv); + } + } + + c_idv++; + } + } + + geno_mean/=(double)(c_idv-geno_miss.size()); + + for (size_t i=0; i<geno_miss.size(); ++i) { + gsl_vector_set(geno, geno_miss[i], geno_mean); + } + + return; +} + + + + + +void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) { igzstream infile (file_kin.c_str(), igzstream::in); // ifstream infile (file_kin.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open kinship file: "<<file_kin<<endl; error=true; return;} - + size_t ni_total=indicator_idv.size(); - + gsl_matrix_set_zero (G); - + string line; - char *ch_ptr; + char *ch_ptr; double d; - + if (k_mode==1) { size_t i_test=0, i_total=0, j_test=0, j_total=0; while (getline(infile, line)) { - if (i_total==ni_total) {cout<<"error! number of rows in the kinship file is larger than the number of phentypes."<<endl; error=true;} - + if (i_total==ni_total) {cout<<"error! number of rows in the kinship file is larger than the number of phentypes."<<endl; error=true;} + if (indicator_idv[i_total]==0) {i_total++; continue;} - + j_total=0; j_test=0; ch_ptr=strtok ((char *)line.c_str(), " , \t"); while (ch_ptr!=NULL) { if (j_total==ni_total) {cout<<"error! number of columns in the kinship file is larger than the number of phentypes for row = "<<i_total<<endl; error=true;} - + d=atof(ch_ptr); - if (indicator_idv[j_total]==1) {gsl_matrix_set (G, i_test, j_test, d); j_test++;} + if (indicator_idv[j_total]==1) {gsl_matrix_set (G, i_test, j_test, d); j_test++;} j_total++; - + ch_ptr=strtok (NULL, " , \t"); } if (j_total!=ni_total) {cout<<"error! number of columns in the kinship file do not match the number of phentypes for row = "<<i_total<<endl; error=true;} - i_total++; i_test++; + i_total++; i_test++; } if (i_total!=ni_total) {cout<<"error! number of rows in the kinship file do not match the number of phentypes."<<endl; error=true;} - } - else { + } + else { map<size_t, size_t> mapID2ID; size_t c=0; for (size_t i=0; i<indicator_idv.size(); i++) { if (indicator_idv[i]==1) {mapID2ID[i]=c; c++;} } - + string id1, id2; double Cov_d; size_t n_id1, n_id2; - + while (getline(infile, line)) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); id1=ch_ptr; @@ -794,10 +919,10 @@ void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<strin d=atof(ch_ptr); if (mapID2num.count(id1)==0 || mapID2num.count(id2)==0) {continue;} if (indicator_idv[mapID2num[id1]]==0 || indicator_idv[mapID2num[id2]]==0) {continue;} - + n_id1=mapID2ID[mapID2num[id1]]; n_id2=mapID2ID[mapID2num[id2]]; - + Cov_d=gsl_matrix_get(G, n_id1, n_id2); if (Cov_d!=0 && Cov_d!=d) {cout<<"error! redundant and unequal terms in the kinship file, for id1 = "<<id1<<" and id2 = "<<id2<<endl;} else { @@ -806,15 +931,15 @@ void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<strin } } } - + infile.close(); - infile.clear(); - + infile.clear(); + return; } -void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) +void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) { igzstream infile (file_mk.c_str(), igzstream::in); if (!infile) {cout<<"error! fail to open file: "<<file_mk<<endl; error=true; return;} @@ -830,101 +955,101 @@ void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, } infile.close(); - infile.clear(); + infile.clear(); return; } -void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) +void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) { igzstream infile (file_ku.c_str(), igzstream::in); // ifstream infile (file_ku.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open the U file: "<<file_ku<<endl; error=true; return;} - + size_t n_row=U->size1, n_col=U->size2, i_row=0, i_col=0; - + gsl_matrix_set_zero (U); - + string line; - char *ch_ptr; + char *ch_ptr; double d; - + while (getline(infile, line)) { - if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<<endl; error=true;} - + if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<<endl; error=true;} + i_col=0; ch_ptr=strtok ((char *)line.c_str(), " , \t"); while (ch_ptr!=NULL) { if (i_col==n_col) {cout<<"error! number of columns in the U file is larger than expected, for row = "<<i_row<<endl; error=true;} - + d=atof(ch_ptr); - gsl_matrix_set (U, i_row, i_col, d); + gsl_matrix_set (U, i_row, i_col, d); i_col++; - + ch_ptr=strtok (NULL, " , \t"); } - + i_row++; } - + infile.close(); - infile.clear(); - + infile.clear(); + return; } -void ReadFile_eigenD (const string &file_kd, bool &error, gsl_vector *eval) +void ReadFile_eigenD (const string &file_kd, bool &error, gsl_vector *eval) { igzstream infile (file_kd.c_str(), igzstream::in); // ifstream infile (file_kd.c_str(), ifstream::in); if (!infile) {cout<<"error! fail to open the D file: "<<file_kd<<endl; error=true; return;} - + size_t n_row=eval->size, i_row=0; - + gsl_vector_set_zero (eval); - + string line; - char *ch_ptr; + char *ch_ptr; double d; - + while (getline(infile, line)) { - if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<<endl; error=true;} - + if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<<endl; error=true;} + ch_ptr=strtok ((char *)line.c_str(), " , \t"); d=atof(ch_ptr); - + ch_ptr=strtok (NULL, " , \t"); if (ch_ptr!=NULL) {cout<<"error! number of columns in the D file is larger than expected, for row = "<<i_row<<endl; error=true;} - + gsl_vector_set (eval, i_row, d); - + i_row++; } - + infile.close(); - infile.clear(); - + infile.clear(); + return; } //read bimbam mean genotype file and calculate kinship matrix -bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) +bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) { igzstream infile (file_geno.c_str(), igzstream::in); //ifstream infile (file_geno.c_str(), ifstream::in); if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} - + string line; char *ch_ptr; - + size_t n_miss; double d, geno_mean, geno_var; - + size_t ni_total=matrix_kin->size1; gsl_vector *geno=gsl_vector_alloc (ni_total); gsl_vector *geno_miss=gsl_vector_alloc (ni_total); @@ -934,11 +1059,11 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k !safeGetline(infile, line).eof(); if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} if (indicator_snp[t]==0) {continue;} - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); - + geno_mean=0.0; n_miss=0; geno_var=0.0; gsl_vector_set_all(geno_miss, 0); for (size_t i=0; i<ni_total; ++i) { @@ -952,44 +1077,49 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k geno_var+=d*d; } } - + geno_mean/=(double)(ni_total-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_total; geno_var-=geno_mean*geno_mean; // geno_var=geno_mean*(1-geno_mean*0.5); - + for (size_t i=0; i<ni_total; ++i) { if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} - } - + } + gsl_vector_add_constant (geno, -1.0*geno_mean); - + if (geno_var!=0) { - if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);} - else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);} - else {cout<<"Unknown kinship mode."<<endl;} + if (k_mode==1) { + gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); + //eigenlib_dsyr (1.0, geno, matrix_kin); + } else if (k_mode==2) { + gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin); + //eigenlib_dsyr (1.0/geno_var, geno, matrix_kin); + } else { + cout<<"Unknown kinship mode."<<endl; + } } - ns_test++; - } + } cout<<endl; - + gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test); - + for (size_t i=0; i<ni_total; ++i) { for (size_t j=0; j<i; ++j) { d=gsl_matrix_get (matrix_kin, j, i); gsl_matrix_set (matrix_kin, i, j, d); } } - + gsl_vector_free (geno); gsl_vector_free (geno_miss); - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -999,23 +1129,23 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k -bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) +bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) { ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} - + char ch[1]; bitset<8> b; - + size_t n_miss, ci_total; double d, geno_mean, geno_var; - + size_t ni_total=matrix_kin->size1; gsl_vector *geno=gsl_vector_alloc (ni_total); size_t ns_test=0; int n_bit; - + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } @@ -1024,14 +1154,14 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; - } - + } + for (size_t t=0; t<indicator_snp.size(); ++t) { if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} if (indicator_snp[t]==0) {continue;} - + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - + //read genotypes geno_mean=0.0; n_miss=0; ci_total=0; geno_var=0.0; for (int i=0; i<n_bit; ++i) { @@ -1045,51 +1175,51 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m else {gsl_vector_set(geno, ci_total, 1.0); geno_mean+=1.0; geno_var+=1.0;} } else { - if (b[2*j+1]==1) {gsl_vector_set(geno, ci_total, 0.0); } + if (b[2*j+1]==1) {gsl_vector_set(geno, ci_total, 0.0); } else {gsl_vector_set(geno, ci_total, -9.0); n_miss++; } } ci_total++; } } - + geno_mean/=(double)(ni_total-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_total; geno_var-=geno_mean*geno_mean; // geno_var=geno_mean*(1-geno_mean*0.5); - + for (size_t i=0; i<ni_total; ++i) { d=gsl_vector_get(geno,i); if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);} - } - + } + gsl_vector_add_constant (geno, -1.0*geno_mean); - + if (geno_var!=0) { if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);} else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);} else {cout<<"Unknown kinship mode."<<endl;} } - + ns_test++; - } + } cout<<endl; - + gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test); - + for (size_t i=0; i<ni_total; ++i) { for (size_t j=0; j<i; ++j) { d=gsl_matrix_get (matrix_kin, j, i); gsl_matrix_set (matrix_kin, i, j, d); } } - + gsl_vector_free (geno); - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } @@ -1103,65 +1233,65 @@ bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector< igzstream infile (file_geno.c_str(), igzstream::in); // ifstream infile (file_geno.c_str(), ifstream::in); if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} - + string line; char *ch_ptr; - + if (calc_K==true) {gsl_matrix_set_zero (K);} - + gsl_vector *genotype=gsl_vector_alloc (UtX->size1); gsl_vector *genotype_miss=gsl_vector_alloc (UtX->size1); double geno, geno_mean; size_t n_miss; - + int ni_total=(int)indicator_idv.size(); int ns_total=(int)indicator_snp.size(); int ni_test=UtX->size1; int ns_test=UtX->size2; - + int c_idv=0, c_snp=0; - + for (int i=0; i<ns_total; ++i) { !safeGetline(infile, line).eof(); - if (indicator_snp[i]==0) {continue;} - + if (indicator_snp[i]==0) {continue;} + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); - + c_idv=0; geno_mean=0; n_miss=0; gsl_vector_set_zero (genotype_miss); for (int j=0; j<ni_total; ++j) { ch_ptr=strtok (NULL, " , \t"); - if (indicator_idv[j]==0) {continue;} - + if (indicator_idv[j]==0) {continue;} + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++;} - else { + else { geno=atof(ch_ptr); - gsl_vector_set (genotype, c_idv, geno); + gsl_vector_set (genotype, c_idv, geno); geno_mean+=geno; } c_idv++; } - + geno_mean/=(double)(ni_test-n_miss); - - for (size_t i=0; i<genotype->size; ++i) { + + for (size_t i=0; i<genotype->size; ++i) { if (gsl_vector_get (genotype_miss, i)==1) {geno=0;} else {geno=gsl_vector_get (genotype, i); geno-=geno_mean;} - + gsl_vector_set (genotype, i, geno); gsl_matrix_set (UtX, i, c_snp, geno); } - + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} - + c_snp++; - } - + } + if (calc_K==true) { gsl_matrix_scale (K, 1.0/(double)ns_test); - + for (size_t i=0; i<genotype->size; ++i) { for (size_t j=0; j<i; ++j) { geno=gsl_matrix_get (K, j, i); @@ -1169,18 +1299,106 @@ bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector< } } } - + gsl_vector_free (genotype); gsl_vector_free (genotype_miss); - + infile.clear(); infile.close(); - + return true; } +//compact version of the above function, using uchar instead of gsl_matrix +bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test) +{ + igzstream infile (file_geno.c_str(), igzstream::in); + // ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + + Xt.clear(); + vector<unsigned char> Xt_row; + for (size_t i=0; i<ni_test; i++) { + Xt_row.push_back(0); + } + + string line; + char *ch_ptr; + + if (calc_K==true) {gsl_matrix_set_zero (K);} + + gsl_vector *genotype=gsl_vector_alloc (ni_test); + gsl_vector *genotype_miss=gsl_vector_alloc (ni_test); + double geno, geno_mean; + size_t n_miss; + + size_t ni_total= indicator_idv.size(); + size_t ns_total= indicator_snp.size(); + + size_t c_idv=0, c_snp=0; + + for (size_t i=0; i<ns_total; ++i) { + !safeGetline(infile, line).eof(); + if (indicator_snp[i]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + c_idv=0; geno_mean=0; n_miss=0; + gsl_vector_set_zero (genotype_miss); + for (uint j=0; j<ni_total; ++j) { + ch_ptr=strtok (NULL, " , \t"); + if (indicator_idv[j]==0) {continue;} + + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++;} else { + geno=atof(ch_ptr); + gsl_vector_set (genotype, c_idv, geno); + geno_mean+=geno; + } + c_idv++; + } + + geno_mean/=(double)(ni_test-n_miss); + + for (size_t j=0; j<genotype->size; ++j) { + if (gsl_vector_get (genotype_miss, j)==1) { + geno=geno_mean; + } else { + geno=gsl_vector_get (genotype, j); + } + + Xt_row[j]=Double02ToUchar(geno); + gsl_vector_set (genotype, j, (geno-geno_mean)); + } + Xt.push_back(Xt_row); + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; i<genotype->size; ++i) { + for (size_t j=0; j<i; ++j) { + geno=gsl_matrix_get (K, j, i); + gsl_matrix_set (K, i, j, geno); + } + } + } + + gsl_vector_free (genotype); + gsl_vector_free (genotype_miss); + + infile.clear(); + infile.close(); + + return true; +} @@ -1190,79 +1408,79 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in { ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} - + char ch[1]; bitset<8> b; - - int ni_total=(int)indicator_idv.size(); - int ns_total=(int)indicator_snp.size(); - int ni_test=UtX->size1; - int ns_test=UtX->size2; + + size_t ni_total=indicator_idv.size(); + size_t ns_total=indicator_snp.size(); + size_t ni_test=UtX->size1; + size_t ns_test=UtX->size2; int n_bit; - + if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1;} - + //print the first three majic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } - + if (calc_K==true) {gsl_matrix_set_zero (K);} - - gsl_vector *genotype=gsl_vector_alloc (UtX->size1); - + + gsl_vector *genotype=gsl_vector_alloc (UtX->size1); + double geno, geno_mean; - size_t n_miss; - int c_idv=0, c_snp=0, c=0; - + size_t n_miss; + size_t c_idv=0, c_snp=0, c=0; + //start reading snps and doing association test - for (int t=0; t<ns_total; ++t) { - if (indicator_snp[t]==0) {continue;} + for (size_t t=0; t<ns_total; ++t) { + if (indicator_snp[t]==0) {continue;} infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - + //read genotypes c_idv=0; geno_mean=0.0; n_miss=0; c=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; - if ((i==(n_bit-1)) && c==ni_total) {break;} + if ((i==(n_bit-1)) && c==ni_total) {break;} if (indicator_idv[c]==0) {c++; continue;} c++; - + if (b[2*j]==0) { if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); geno_mean+=2.0;} else {gsl_vector_set(genotype, c_idv, 1.0); geno_mean+=1.0;} } else { - if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); geno_mean+=0.0;} + if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); geno_mean+=0.0;} else {gsl_vector_set(genotype, c_idv, -9.0); n_miss++;} } c_idv++; } } - + geno_mean/=(double)(ni_test-n_miss); - - for (size_t i=0; i<genotype->size; ++i) { + + for (size_t i=0; i<genotype->size; ++i) { geno=gsl_vector_get (genotype, i); if (geno==-9) {geno=0;} else {geno-=geno_mean;} - + gsl_vector_set (genotype, i, geno); gsl_matrix_set (UtX, i, c_snp, geno); } - + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} - + c_snp++; - } - + } + if (calc_K==true) { gsl_matrix_scale (K, 1.0/(double)ns_test); - + for (size_t i=0; i<genotype->size; ++i) { for (size_t j=0; j<i; ++j) { geno=gsl_matrix_get (K, j, i); @@ -1270,39 +1488,144 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in } } } - - gsl_vector_free (genotype); + + gsl_vector_free (genotype); infile.clear(); infile.close(); - + return true; } +//compact version of the above function, using uchar instead of gsl_matrix +bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + + Xt.clear(); + vector<unsigned char> Xt_row; + for (size_t i=0; i<ni_test; i++) { + Xt_row.push_back(0); + } + + char ch[1]; + bitset<8> b; + + size_t ni_total=indicator_idv.size(); + size_t ns_total=indicator_snp.size(); + int n_bit; + + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1;} + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + if (calc_K==true) {gsl_matrix_set_zero (K);} + + gsl_vector *genotype=gsl_vector_alloc (ni_test); + + double geno, geno_mean; + size_t n_miss; + size_t c_idv=0, c_snp=0, c=0; + + //start reading snps and doing association test + for (size_t t=0; t<ns_total; ++t) { + if (indicator_snp[t]==0) {continue;} + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + + //read genotypes + c_idv=0; geno_mean=0.0; n_miss=0; c=0; + for (int i=0; i<n_bit; ++i) { + infile.read(ch,1); + b=ch[0]; + for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + if ((i==(n_bit-1)) && c==ni_total) {break;} + if (indicator_idv[c]==0) {c++; continue;} + c++; + + if (b[2*j]==0) { + if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); geno_mean+=2.0;} + else {gsl_vector_set(genotype, c_idv, 1.0); geno_mean+=1.0;} + } + else { + if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); geno_mean+=0.0;} + else {gsl_vector_set(genotype, c_idv, -9.0); n_miss++;} + } + c_idv++; + } + } + + geno_mean/=(double)(ni_test-n_miss); + + for (size_t i=0; i<genotype->size; ++i) { + geno=gsl_vector_get (genotype, i); + if (geno==-9) {geno=geno_mean;} + + Xt_row[i]=Double02ToUchar(geno); + + geno-=geno_mean; + + gsl_vector_set (genotype, i, geno); + } + Xt.push_back(Xt_row); + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; i<genotype->size; ++i) { + for (size_t j=0; j<i; ++j) { + geno=gsl_matrix_get (K, j, i); + gsl_matrix_set (K, i, j, geno); + } + } + } + + gsl_vector_free (genotype); + infile.clear(); + infile.close(); + + return true; +} + + + + + + bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est) { mapRS2est.clear(); - + ifstream infile (file_est.c_str(), ifstream::in); if (!infile) {cout<<"error opening estimated parameter file: "<<file_est<<endl; return false;} - + string line; char *ch_ptr; - + string rs; double alpha, beta, gamma, d; - + //header getline(infile, line); - + size_t n=*max_element(est_column.begin(), est_column.end()); - + while (getline(infile, line)) { - ch_ptr=strtok ((char *)line.c_str(), " \t"); - + ch_ptr=strtok ((char *)line.c_str(), " \t"); + alpha=0.0; beta=0.0; gamma=1.0; for (size_t i=0; i<n+1; ++i) { if (i==est_column[0]-1) {rs=ch_ptr;} @@ -1311,9 +1634,9 @@ bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map if (i==est_column[3]-1) {gamma=atof(ch_ptr);} if (i<n) {ch_ptr=strtok (NULL, " \t");} } - + d=alpha+beta*gamma; - + if (mapRS2est.count(rs)==0) { mapRS2est[rs]=d; } @@ -1321,7 +1644,7 @@ bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map cout<<"the same SNP occurs more than once in estimated parameter file: "<<rs<<endl; return false; } } - + infile.clear(); infile.close(); return true; @@ -1337,7 +1660,7 @@ bool CountFileLines (const string &file_input, size_t &n_lines) n_lines=count(istreambuf_iterator<char>(infile), istreambuf_iterator<char>(), '\n'); infile.seekg (0, ios::beg); - + return true; } @@ -1348,25 +1671,25 @@ bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SN { vec_read.clear(); ng_total=0; - - ifstream infile (file_gene.c_str(), ifstream::in); + + igzstream infile (file_gene.c_str(), igzstream::in); if (!infile) {cout<<"error! fail to open gene expression file: "<<file_gene<<endl; return false;} - + string line; char *ch_ptr; string rs; - + size_t n_idv=0, t=0; - + //header getline(infile, line); - + while (getline(infile, line)) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); rs=ch_ptr; - - ch_ptr=strtok (NULL, " , \t"); - + + ch_ptr=strtok (NULL, " , \t"); + t=0; while (ch_ptr!=NULL) { if (ng_total==0) { @@ -1374,25 +1697,1482 @@ bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SN t++; n_idv++; } else { - vec_read[t]+=atof(ch_ptr); + vec_read[t]+=atof(ch_ptr); t++; } - - ch_ptr=strtok (NULL, " , \t"); + + ch_ptr=strtok (NULL, " , \t"); } - + if (t!=n_idv) {cout<<"error! number of columns doesn't match in row: "<<ng_total<<endl; return false;} - - SNPINFO sInfo={"-9", rs, -9, -9, "-9", "-9", -9, -9, -9}; + + SNPINFO sInfo={"-9", rs, -9, -9, "-9", "-9", 0, -9, -9, 0, 0, 0}; snpInfo.push_back(sInfo); - + ng_total++; } - + infile.close(); - infile.clear(); - + infile.clear(); + return true; } + + + + + +// WJA Added +//Read Oxford sample file +bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, const vector<size_t> &p_column, vector<int> &indicator_cvt, vector<vector<double> > &cvt, size_t &n_cvt) +{ + indicator_pheno.clear(); + pheno.clear(); + indicator_cvt.clear(); + + igzstream infile (file_sample.c_str(), igzstream::in); + + if (!infile) {cout<<"error! fail to open sample file: "<<file_sample<<endl; return false;} + + string line; + char *ch_ptr; + + + string id; + double p,d; + + vector<double> pheno_row; + vector<int> ind_pheno_row; + int flag_na=0; + + size_t num_cols=0; + size_t num_p_in_file=0; + size_t num_cvt_in_file=0; + +// size_t p_max=*max_element(p_column.begin(), p_column.end()); + + map<size_t, size_t> mapP2c; + for (size_t i=0; i<p_column.size(); i++) { + mapP2c[p_column[i]]=i; + pheno_row.push_back(-9); + ind_pheno_row.push_back(0); + } + // read header line1 + if(!safeGetline(infile, line).eof()) { + ch_ptr=strtok((char *)line.c_str(), " "); + if(strcmp(ch_ptr, "ID_1")!=0) {return false;} + ch_ptr=strtok(NULL, " "); + if(strcmp(ch_ptr, "ID_2")!=0) {return false;} + ch_ptr=strtok(NULL, " "); + if(strcmp(ch_ptr, "missing")!=0) {return false;} + while (ch_ptr!=NULL) { + num_cols++; + ch_ptr=strtok (NULL, " "); + + } + num_cols--; + } + + vector<map<uint32_t, size_t> > cvt_factor_levels; + + char col_type[num_cols]; + // read header line2 + if(!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " "); + if(strcmp(ch_ptr, "0")!=0) {return false;} + ch_ptr=strtok(NULL, " "); + if(strcmp(ch_ptr, "0")!=0) {return false;} + ch_ptr=strtok(NULL, " "); + if(strcmp(ch_ptr, "0")!=0) {return false;} + size_t it=0; + ch_ptr=strtok (NULL, " "); + if(ch_ptr!=NULL) + while(ch_ptr!=NULL){ + col_type[it++]=ch_ptr[0]; + if(ch_ptr[0]=='D') {cvt_factor_levels.push_back(map<uint32_t, size_t>());num_cvt_in_file++;} + if(ch_ptr[0]=='C') {num_cvt_in_file++;} + if((ch_ptr[0]=='P')||(ch_ptr[0]=='B')) {num_p_in_file++;} + ch_ptr=strtok(NULL, " "); + } + + } + + while (!safeGetline(infile, line).eof()) { + + ch_ptr=strtok ((char *)line.c_str(), " "); + + for(int it=0;it<3;it++){ch_ptr=strtok(NULL, " ");} + + + size_t i=0; + size_t p_i=0; + size_t fac_cvt_i=0; + + while (i<num_cols) { + + if((col_type[i]=='P')||(col_type[i]=='B')) + { + if (mapP2c.count(p_i+1)!=0) { + if (strcmp(ch_ptr, "NA")==0) {ind_pheno_row[mapP2c[p_i+1]]=0; pheno_row[mapP2c[p_i+1]]=-9;} + else {p=atof(ch_ptr); ind_pheno_row[mapP2c[p_i+1]]=1; pheno_row[mapP2c[p_i+1]]=p;} + } + p_i++; + } + if(col_type[i]=='D') + { + // NOTE THIS DOES NOT CHECK TO BE SURE LEVEL IS INTEGRAL i.e for atoi error + if (strcmp(ch_ptr, "NA")!=0) {uint32_t level=atoi(ch_ptr); if(cvt_factor_levels[fac_cvt_i].count(level) == 0) {cvt_factor_levels[fac_cvt_i][level]=cvt_factor_levels[fac_cvt_i].size();}} + fac_cvt_i++; + } + + ch_ptr=strtok (NULL, " "); + i++; + } + + + indicator_pheno.push_back(ind_pheno_row); + pheno.push_back(pheno_row); + + } + // close and reopen the file + infile.close(); + infile.clear(); + + if(num_cvt_in_file>0) + { + igzstream infile2 (file_sample.c_str(), igzstream::in); + + if (!infile2) {cout<<"error! fail to open sample file: "<<file_sample<<endl; return false;} + // skip header + safeGetline(infile2, line); + safeGetline(infile2, line); + + // pull in the covariates now we now the number of factor levels + while (!safeGetline(infile2, line).eof()) { + + vector<double> v_d; flag_na=0; + ch_ptr=strtok ((char *)line.c_str(), " "); + + for(int it=0;it<3;it++){ch_ptr=strtok(NULL, " ");} + + + size_t i=0; + size_t fac_cvt_i=0; + size_t num_fac_levels; + while (i<num_cols) { + + if(col_type[i]=='C') + { + if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;} + else {d=atof(ch_ptr);} + + v_d.push_back(d); + } + + + if(col_type[i]=='D') + { + // NOTE THIS DOES NOT CHECK TO BE SURE LEVEL IS INTEGRAL i.e for atoi error + num_fac_levels=cvt_factor_levels[fac_cvt_i].size(); + if(num_fac_levels>1) + { + if (strcmp(ch_ptr, "NA")==0) {flag_na=1; for(size_t it=0;it<num_fac_levels-1; it++) {v_d.push_back(-9);}} + else {uint32_t level=atoi(ch_ptr); for(size_t it=0;it<num_fac_levels-1;it++) {cvt_factor_levels[fac_cvt_i][level]==it+1 ? v_d.push_back(1.0) : v_d.push_back(0.0); }} + } + fac_cvt_i++; + } + + ch_ptr=strtok (NULL, " "); + i++; + } + + if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} + cvt.push_back(v_d); + + + } + + if (indicator_cvt.empty()) {n_cvt=0;} + else { + flag_na=0; + for (vector<int>::size_type i=0; i<indicator_cvt.size(); ++i) { + if (indicator_cvt[i]==0) {continue;} + + if (flag_na==0) {flag_na=1; n_cvt=cvt[i].size();} + if (flag_na!=0 && n_cvt!=cvt[i].size()) {cout<<"error! number of covariates in row "<<i<<" do not match other rows."<<endl; return false;} + } + } + + infile2.close(); + infile2.clear(); + } + return true; +} + + + +// WJA Added +//Read bgen file, the first time +#include <cstdint> +#include <assert.h> +bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) +{ + + indicator_snp.clear(); + + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return false;} + + gsl_vector *genotype=gsl_vector_alloc (W->size1); + gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast<char*>(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast<char*>(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast<char*>(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; + bool LongIds=bgen_flags&0x4; + + if(!LongIds) {return false;} + + infile.ignore(bgen_snp_block_offset); + + ns_test=0; + + size_t ns_total=static_cast<size_t>(bgen_nsnps); + + snpInfo.clear(); + string rs; + long int b_pos; + string chr; +// double cM; + string major; + string minor; + string id; + + double v_x, v_w; + int c_idv=0; + + + double maf, geno, geno_old; + size_t n_miss; + size_t n_0, n_1, n_2; + int flag_poly; + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + + size_t ni_total=indicator_idv.size(); // total number of samples in phenotype file + size_t ni_test=0; // number of samples to use in test + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + + for (size_t i=0; i<ni_total; ++i) { + + ni_test+=indicator_idv[i]; + } + + + +// ns_total=1; + for (size_t t=0; t<ns_total; ++t) { + + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast<char*>(&bgen_N),4); + infile.read(reinterpret_cast<char*>(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast<char*>(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast<char*>(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast<char*>(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast<char*>(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast<char*>(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + // should we switch according to MAF? + minor=bgen_B_allele; + major=bgen_A_allele; + b_pos=static_cast<long int>(bgen_SNP_pos); + + uint16_t unzipped_data[3*bgen_N]; + + if (setSnps.size()!=0 && setSnps.count(rs)==0) { + SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, -9}; + snpInfo.push_back(sInfo); + indicator_snp.push_back(0); + if(CompressedSNPBlocks) + infile.read(reinterpret_cast<char*>(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast<size_t>(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + infile.read(reinterpret_cast<char*>(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); + int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P)); + assert(result == Z_OK); + + } + else + { + bgen_P=6*bgen_N; + infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); + + } + + + maf=0; n_miss=0; flag_poly=0; geno_old=-9; + n_0=0; n_1=0; n_2=0; + c_idv=0; + gsl_vector_set_zero (genotype_miss); + for (size_t i=0; i<bgen_N; ++i) { + // CHECK this set correctly! + if (indicator_idv[i]==0) {continue;} + + + bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0; + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + + //CHECK 0.1 OK + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++; c_idv++; continue;} + + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + if (geno>=0 && geno<=0.5) {n_0++;} + if (geno>0.5 && geno<1.5) {n_1++;} + if (geno>=1.5 && geno<=2.0) {n_2++;} + + gsl_vector_set (genotype, c_idv, geno); + + // CHECK WHAT THIS DOES + if (flag_poly==0) {geno_old=geno; flag_poly=2;} + if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} + + maf+=geno; + + c_idv++; + } + + maf/=2.0*static_cast<double>(ni_test-n_miss); + + SNPINFO sInfo={chr, rs, -9, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf}; + snpInfo.push_back(sInfo); + + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} + + if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + + if (flag_poly!=1) {indicator_snp.push_back(0); continue;} + + if (hwe_level!=0 && maf_level!=-1) { + if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;} + } + + //filter SNP if it is correlated with W + //unless W has only one column, of 1s + for (size_t i=0; i<genotype->size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + } + + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot (genotype, genotype, &v_x); + gsl_blas_ddot (Wtx, WtWiWtx, &v_w); + + if (W->size2!=1 && v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} + + indicator_snp.push_back(1); + ns_test++; + + } + + + + + return true; + +} + + +//read oxford genotype file and calculate kinship matrix +bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) +{ + string file_bgen=file_oxford; + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return false;} + + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast<char*>(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast<char*>(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast<char*>(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; +// bool LongIds=bgen_flags&0x4; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + double genotype; + + + size_t n_miss; + double d, geno_mean, geno_var; + + size_t ni_total=matrix_kin->size1; + gsl_vector *geno=gsl_vector_alloc (ni_total); + gsl_vector *geno_miss=gsl_vector_alloc (ni_total); + + size_t ns_test=0; + for (size_t t=0; t<indicator_snp.size(); ++t) { + + if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast<char*>(&bgen_N),4); + infile.read(reinterpret_cast<char*>(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast<char*>(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast<char*>(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast<char*>(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast<char*>(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast<char*>(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast<char*>(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast<size_t>(bgen_P)); + + continue; + } + + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast<char*>(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); + } + + + + geno_mean=0.0; n_miss=0; geno_var=0.0; + gsl_vector_set_all(geno_miss, 0); + + for (size_t i=0; i<bgen_N; ++i) { + + + bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(geno_miss, i, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + genotype=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(geno, i, genotype); + gsl_vector_set(geno_miss, i, 1.0); + geno_mean+=genotype; + geno_var+=genotype*genotype; + } + + } + + + geno_mean/=(double)(ni_total-n_miss); + geno_var+=geno_mean*geno_mean*(double)n_miss; + geno_var/=(double)ni_total; + geno_var-=geno_mean*geno_mean; +// geno_var=geno_mean*(1-geno_mean*0.5); + + for (size_t i=0; i<ni_total; ++i) { + if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} + } + + gsl_vector_add_constant (geno, -1.0*geno_mean); + + if (geno_var!=0) { + if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);} + else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);} + else {cout<<"Unknown kinship mode."<<endl;} + } + + ns_test++; + } + cout<<endl; + + gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test); + + for (size_t i=0; i<ni_total; ++i) { + for (size_t j=0; j<i; ++j) { + d=gsl_matrix_get (matrix_kin, j, i); + gsl_matrix_set (matrix_kin, i, j, d); + } + } + + gsl_vector_free (geno); + gsl_vector_free (geno_miss); + + infile.close(); + infile.clear(); + + return true; +} + + + + + + + + + + + + + + + + + + + + + + + +//read header to determine which column contains which item +bool ReadHeader (const string &line, HEADER &header) +{ + string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID"}; + set<string> rs_set(rs_ptr, rs_ptr+10); + string chr_ptr[]={"chr","CHR"}; + set<string> chr_set(chr_ptr, chr_ptr+2); + string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"}; + set<string> pos_set(pos_ptr, pos_ptr+8); + string cm_ptr[]={"cm","CM"}; + set<string> cm_set(cm_ptr, cm_ptr+2); + string a1_ptr[]={"a1","A1","allele1","ALLELE1"}; + set<string> a1_set(a1_ptr, a1_ptr+4); + string a0_ptr[]={"a0","A0","allele0","ALLELE0"}; + set<string> a0_set(a0_ptr, a0_ptr+4); + + string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"}; + set<string> z_set(z_ptr, z_ptr+6); + string beta_ptr[]={"beta","BETA","b","B"}; + set<string> beta_set(beta_ptr, beta_ptr+4); + string sebeta_ptr[]={"se_beta","SE_BETA","se","SE"}; + set<string> sebeta_set(sebeta_ptr, sebeta_ptr+4); + string chisq_ptr[]={"chisq","CHISQ","chisquare","CHISQUARE"}; + set<string> chisq_set(chisq_ptr, chisq_ptr+4); + string p_ptr[]={"p","P","pvalue","PVALUE","p-value","P-VALUE"}; + set<string> p_set(p_ptr, p_ptr+6); + + string n_ptr[]={"n","N","ntotal","NTOTAL","n_total","N_TOTAL"}; + set<string> n_set(n_ptr, n_ptr+6); + string nmis_ptr[]={"nmis","NMIS","n_mis","N_MIS","n_miss","N_MISS"}; + set<string> nmis_set(nmis_ptr, nmis_ptr+6); + string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"}; + set<string> nobs_set(nobs_ptr, nobs_ptr+4); + + string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"}; + set<string> af_set(af_ptr, af_ptr+10); + string var_ptr[]={"var","VAR"}; + set<string> var_set(var_ptr, var_ptr+2); + + string ws_ptr[]={"window_size","WINDOW_SIZE","ws","WS"}; + set<string> ws_set(ws_ptr, ws_ptr+4); + string cor_ptr[]={"cor","COR","r","R"}; + set<string> cor_set(cor_ptr, cor_ptr+4); + + header.rs_col=0; header.chr_col=0; header.pos_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; + + char *ch_ptr; + string type; + size_t n_error=0; + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + while (ch_ptr!=NULL) { + type=ch_ptr; + if (rs_set.count(type)!=0) { + if (header.rs_col==0) {header.rs_col=header.coln+1;} else {cout<<"error! more than two rs columns in the file."<<endl; n_error++;} + } else if (chr_set.count(type)!=0) { + if (header.chr_col==0) {header.chr_col=header.coln+1;} else {cout<<"error! more than two chr columns in the file."<<endl; n_error++;} + } else if (pos_set.count(type)!=0) { + if (header.pos_col==0) {header.pos_col=header.coln+1;} else {cout<<"error! more than two pos columns in the file."<<endl; n_error++;} + } else if (cm_set.count(type)!=0) { + if (header.cm_col==0) {header.cm_col=header.coln+1;} else {cout<<"error! more than two cm columns in the file."<<endl; n_error++;} + } else if (a1_set.count(type)!=0) { + if (header.a1_col==0) {header.a1_col=header.coln+1;} else {cout<<"error! more than two allele1 columns in the file."<<endl; n_error++;} + } else if (a0_set.count(type)!=0) { + if (header.a0_col==0) {header.a0_col=header.coln+1;} else {cout<<"error! more than two allele0 columns in the file."<<endl; n_error++;} + } else if (z_set.count(type)!=0) { + if (header.z_col==0) {header.z_col=header.coln+1;} else {cout<<"error! more than two z columns in the file."<<endl; n_error++;} + } else if (beta_set.count(type)!=0) { + if (header.beta_col==0) {header.beta_col=header.coln+1;} else {cout<<"error! more than two beta columns in the file."<<endl; n_error++;} + } else if (sebeta_set.count(type)!=0) { + if (header.sebeta_col==0) {header.sebeta_col=header.coln+1;} else {cout<<"error! more than two se_beta columns in the file."<<endl; n_error++;} + } else if (chisq_set.count(type)!=0) { + if (header.chisq_col==0) {header.chisq_col=header.coln+1;} else {cout<<"error! more than two z columns in the file."<<endl; n_error++;} + } else if (p_set.count(type)!=0) { + if (header.p_col==0) {header.p_col=header.coln+1;} else {cout<<"error! more than two p columns in the file."<<endl; n_error++;} + } else if (n_set.count(type)!=0) { + if (header.n_col==0) {header.n_col=header.coln+1;} else {cout<<"error! more than two n_total columns in the file."<<endl; n_error++;} + } else if (nmis_set.count(type)!=0) { + if (header.nmis_col==0) {header.nmis_col=header.coln+1;} else {cout<<"error! more than two n_mis columns in the file."<<endl; n_error++;} + } else if (nobs_set.count(type)!=0) { + if (header.nobs_col==0) {header.nobs_col=header.coln+1;} else {cout<<"error! more than two n_obs columns in the file."<<endl; n_error++;} + } else if (ws_set.count(type)!=0) { + if (header.ws_col==0) {header.ws_col=header.coln+1;} else {cout<<"error! more than two window_size columns in the file."<<endl; n_error++;} + } else if (af_set.count(type)!=0) { + if (header.af_col==0) {header.af_col=header.coln+1;} else {cout<<"error! more than two af columns in the file."<<endl; n_error++;} + } else if (cor_set.count(type)!=0) { + if (header.cor_col==0) {header.cor_col=header.coln+1;} else {cout<<"error! more than two cor columns in the file."<<endl; n_error++;} + } else {} + + ch_ptr=strtok (NULL, " , \t"); + header.coln++; + } + + if (header.cor_col!=0 && header.cor_col!=header.coln) {cout<<"error! the cor column should be the last column."<<endl; n_error++;} + + if (header.rs_col==0) { + if (header.chr_col!=0 && header.pos_col!=0) { + cout<<"missing an rs column. rs id will be replaced by chr:pos"<<endl; + } else { + cout<<"error! missing an rs column."<<endl; n_error++; + } + } + + if (n_error==0) {return true;} else {return false;} +} + + + + +//read category file, record mapRS2in +//the category file does not contain a null category +//so if a snp has 0 for all categories, then it is not included in the analysis +bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_t &n_vc) +{ + mapRS2cat.clear(); + + igzstream infile (file_cat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open category file: "<<file_cat<<endl; return false;} + + string line; + char *ch_ptr; + + string rs, chr, a1, a0, pos, cm; + size_t i_cat;// ns_vc=0; + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + //use the header to count the number of categories + n_vc=header.coln; + if (header.rs_col!=0) {n_vc--;} + if (header.chr_col!=0) {n_vc--;} + if (header.pos_col!=0) {n_vc--;} + if (header.cm_col!=0) {n_vc--;} + if (header.a1_col!=0) {n_vc--;} + if (header.a0_col!=0) {n_vc--;} + + //read the following lines to record mapRS2cat + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + i_cat=0; + for (size_t i=0; i<header.coln; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) { + rs=ch_ptr; + } else if (header.chr_col!=0 && header.chr_col==i+1) { + chr=ch_ptr; + } else if (header.pos_col!=0 && header.pos_col==i+1) { + pos=ch_ptr; + } else if (header.cm_col!=0 && header.cm_col==i+1) { + cm=ch_ptr; + } else if (header.a1_col!=0 && header.a1_col==i+1) { + a1=ch_ptr; + } else if (header.a0_col!=0 && header.a0_col==i+1) { + a0=ch_ptr; + } else if (atoi(ch_ptr)==1 || atoi(ch_ptr)==0) { + if (i_cat==0) { + if (header.rs_col==0) { + rs=chr+":"+pos; + } + } + + if (atoi(ch_ptr)==1 && mapRS2cat.count(rs)==0) {mapRS2cat[rs]=i_cat;} + i_cat++; + } else {} + + ch_ptr=strtok (NULL, " , \t"); + } + + //if (mapRS2cat.count(rs)==0) {mapRS2cat[rs]=n_vc+1; ns_vc++;} + } + + //if (ns_vc>0) {n_vc++;} + + infile.clear(); + infile.close(); + + return true; +} + + + + +//read bimbam mean genotype file and calculate kinship matrix; this time, the kinship matrix is not centered, and can contain multiple K matrix +bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin) +{ + igzstream infile (file_geno.c_str(), igzstream::in); + //ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + + string line; + char *ch_ptr; + + size_t n_miss; + double d, geno_mean, geno_var; + + size_t ni_test=matrix_kin->size1; + gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *geno_miss=gsl_vector_alloc (ni_test); + + size_t n_vc=matrix_kin->size2/ni_test, i_vc; + string rs; + vector<size_t> ns_vec; + for (size_t i=0; i<n_vc; i++) { + ns_vec.push_back(0); + } + + size_t ns_test=0; + for (size_t t=0; t<indicator_snp.size(); ++t) { + !safeGetline(infile, line).eof(); + if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + rs=snpInfo[t].rs_number;//this line is new + + geno_mean=0.0; n_miss=0; geno_var=0.0; + gsl_vector_set_all(geno_miss, 0); + + size_t j=0; + for (size_t i=0; i<indicator_idv.size(); ++i) { + if (indicator_idv[i]==0) {continue;} + ch_ptr=strtok (NULL, " , \t"); + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_miss, i, 0); n_miss++;} + else { + d=atof(ch_ptr); + gsl_vector_set (geno, j, d); + gsl_vector_set (geno_miss, j, 1); + geno_mean+=d; + geno_var+=d*d; + } + j++; + } + + geno_mean/=(double)(ni_test-n_miss); + geno_var+=geno_mean*geno_mean*(double)n_miss; + geno_var/=(double)ni_test; + geno_var-=geno_mean*geno_mean; +// geno_var=geno_mean*(1-geno_mean*0.5); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} + } + + //this line is new; removed + //gsl_vector_add_constant (geno, -1.0*geno_mean); + + if (geno_var!=0) { + mapRS2var[rs]=geno_var; + + if (k_mode==1) { + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); + ns_vec[0]++; + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); + ns_vec[i_vc]++; + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + gsl_blas_dsyr (CblasUpper, 1.0, geno, &kin_sub.matrix); + } + + //eigenlib_dsyr (1.0, geno, matrix_kin); + } else if (k_mode==2) { + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin); + ns_vec[0]++; + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); + ns_vec[i_vc]++; + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, &kin_sub.matrix); + } + } else { + cout<<"Unknown kinship mode."<<endl; + } + } + ns_test++; + } + cout<<endl; + + for (size_t t=0; t<n_vc; t++) { + if (ns_vec[t]!=0) {gsl_matrix_scale (matrix_kin, 1.0/(double)ns_vec[t]);} + + for (size_t i=0; i<ni_test; ++i) { + for (size_t j=0; j<i; ++j) { + d=gsl_matrix_get (matrix_kin, j, i+ni_test*t); + gsl_matrix_set (matrix_kin, i, j+ni_test*t, d); + } + } + } + + gsl_vector_free (geno); + gsl_vector_free (geno_miss); + + infile.close(); + infile.clear(); + + return true; +} + + + + + + + +bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + + char ch[1]; + bitset<8> b; + + size_t n_miss, ci_total, ci_test; + double d, geno_mean, geno_var; + + size_t ni_test=matrix_kin->size1; + size_t ni_total=indicator_idv.size(); + gsl_vector *geno=gsl_vector_alloc (ni_test); + + size_t ns_test=0; + int n_bit; + + size_t n_vc=matrix_kin->size2/ni_test, i_vc; + string rs; + vector<size_t> ns_vec; + for (size_t i=0; i<n_vc; i++) { + ns_vec.push_back(0); + } + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (size_t t=0; t<indicator_snp.size(); ++t) { + if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (indicator_snp[t]==0) {continue;} + + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + + rs=snpInfo[t].rs_number;//this line is new + + //read genotypes + geno_mean=0.0; n_miss=0; ci_total=0; geno_var=0.0; ci_test=0; + for (int i=0; i<n_bit; ++i) { + infile.read(ch,1); + b=ch[0]; + for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + if ((i==(n_bit-1)) && ci_total==ni_total) {break;} + if (indicator_idv[ci_total]==0) {ci_total++; continue;} + + if (b[2*j]==0) { + if (b[2*j+1]==0) {gsl_vector_set(geno, ci_test, 2.0); geno_mean+=2.0; geno_var+=4.0; } + else {gsl_vector_set(geno, ci_test, 1.0); geno_mean+=1.0; geno_var+=1.0;} + } + else { + if (b[2*j+1]==1) {gsl_vector_set(geno, ci_test, 0.0); } + else {gsl_vector_set(geno, ci_test, -9.0); n_miss++; } + } + + ci_test++; + ci_total++; + } + } + + + geno_mean/=(double)(ni_test-n_miss); + geno_var+=geno_mean*geno_mean*(double)n_miss; + geno_var/=(double)ni_test; + geno_var-=geno_mean*geno_mean; +// geno_var=geno_mean*(1-geno_mean*0.5); + + for (size_t i=0; i<ni_test; ++i) { + d=gsl_vector_get(geno,i); + if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);} + } + + //this line is new; removed + //gsl_vector_add_constant (geno, -1.0*geno_mean); + + if (geno_var!=0) { + mapRS2var[rs]=geno_var; + if (k_mode==1) { + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); + ns_vec[0]++; + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); + ns_vec[i_vc]++; + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + gsl_blas_dsyr (CblasUpper, 1.0, geno, &kin_sub.matrix); + } + } else if (k_mode==2) { + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin); + ns_vec[0]++; + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); + ns_vec[i_vc]++; + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, &kin_sub.matrix); + } + } else { + cout<<"Unknown kinship mode."<<endl; + } + } + + ns_test++; + } + cout<<endl; + + for (size_t t=0; t<n_vc; t++) { + if (ns_vec[t]!=0) {gsl_matrix_scale (matrix_kin, 1.0/(double)ns_vec[t]);} + + for (size_t i=0; i<ni_test; ++i) { + for (size_t j=0; j<i; ++j) { + d=gsl_matrix_get (matrix_kin, j, i+ni_test*t); + gsl_matrix_set (matrix_kin, i, j+ni_test*t, d); + //cout<<d<<" "; + } + //cout<<endl; + } + } + + d=0; + for (size_t i=0; i<ni_test; ++i) { + for (size_t j=0; j<ni_test; ++j) { + d+=gsl_matrix_get (matrix_kin, i, j)*gsl_matrix_get (matrix_kin, i, j); + } + } + d/=(double)ni_test*(double)ni_test; + //cout<<"trace = "<<scientific<<d-1/(double)ni_test<<endl; + + + + gsl_vector_free (geno); + + infile.close(); + infile.clear(); + + return true; +} + + + +//read var file, store mapRS2var +bool ReadFile_var (const string &file_var, map<string, double> &mapRS2var) +{ + mapRS2var.clear(); + + igzstream infile (file_var.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open var file: "<<file_var<<endl; return false;} + + char *ch_ptr; + string line, rs; + double var; + + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + rs=ch_ptr; + ch_ptr=strtok (NULL, " , \t"); + var=atof(ch_ptr); + mapRS2var[rs]=var; + } + + return true; +} + + +//read beta file, use the mapRS2var to select snps (and to provide var if maf/var is not provided in the beta file), calculate q +void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test) +{ + gsl_vector_set_zero(q); + ni_total=0; ns_total=0; ns_test=0; + + igzstream infile (file_beta.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open beta file: "<<file_beta<<endl; return;} + + string line; + char *ch_ptr; + string type; + + string rs, chr, a1, a0, pos, cm; + double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, zsquare=0, af=0, var_x=0; + size_t n_total=0, n_mis=0, n_obs=0; + + vector<double> vec_q, vec_s; + for (size_t i=0; i<q->size; i++) { + vec_q.push_back(0.0); + vec_s.push_back(0.0); + } + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + if (header.n_col==0 ) { + if (header.nobs_col==0 && header.nmis_col==0) { + cout<<"error! missing sample size in the beta file."<<endl; + } else { + cout<<"total sample size will be replaced by obs/mis sample size."<<endl; + } + } + + if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0) && header.chisq_col==0 && header.p_col==0) { + cout<<"error! missing z scores in the beta file."<<endl; + } + + if (header.af_col==0 && header.var_col==0 && mapRS2var.size()==0) { + cout<<"error! missing allele frequency in the beta file."<<endl; + } + + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + z=0; beta=0; se_beta=0; chisq=0; pvalue=0; + n_total=0; n_mis=0; n_obs=0; af=0; var_x=0; + for (size_t i=0; i<header.coln; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} + if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} + if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr;} + if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr;} + if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;} + if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;} + + if (header.z_col!=0 && header.z_col==i+1) {z=atof(ch_ptr);} + if (header.beta_col!=0 && header.beta_col==i+1) {beta=atof(ch_ptr);} + if (header.sebeta_col!=0 && header.sebeta_col==i+1) {se_beta=atof(ch_ptr);} + if (header.chisq_col!=0 && header.chisq_col==i+1) {chisq=atof(ch_ptr);} + if (header.p_col!=0 && header.p_col==i+1) {pvalue=atof(ch_ptr);} + + if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} + if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} + if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} + + if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} + if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} + + ch_ptr=strtok (NULL, " , \t"); + } + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + if (header.n_col==0) { + n_total=n_mis+n_obs; + } + + //both z values and beta/se_beta have directions, while chisq/pvalue do not + if (header.z_col!=0) { + zsquare=z*z; + } else if (header.beta_col!=0 && header.sebeta_col!=0) { + z=beta/se_beta; + zsquare=z*z; + } else if (header.chisq_col!=0) { + zsquare=chisq; + } else if (header.p_col!=0) { + zsquare=gsl_cdf_chisq_Qinv (pvalue, 1); + } else {zsquare=0;} + + //if the snp is also present in cor file, then do calculations + if (mapRS2var.count(rs)!=0 && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) { + //obtain var_x + if (k_mode==1) { + if (header.var_col==0) { + if (header.af_col!=0) { + var_x=2.0*af*(1.0-af); + } else { + var_x=mapRS2var.at(rs); + } + } + } else { + var_x=1.0; + } + + //compute q + if (mapRS2cat.size()!=0) { + vec_q[mapRS2cat.at(rs) ]+=(zsquare-1.0)*var_x/(double)n_total; + vec_s[mapRS2cat.at(rs) ]+=var_x; + } else { + vec_q[0]+=(zsquare-1.0)*var_x/(double)n_total; + vec_s[0]+=var_x; + } + + ni_total=max(ni_total, n_total); + ns_test++; + } + + ns_total++; + } + + //save q + for (size_t i=0; i<q->size; i++) { + if (vec_s[i]!=0) { + gsl_vector_set(q, i, vec_q[i]/vec_s[i]); + } + gsl_vector_set(s, i, vec_s[i]); + } + + infile.clear(); + infile.close(); + + return; +} + + + + +//read S file: S and Svar +void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar) +{ + igzstream infile (file_s.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open s file: "<<file_s<<endl; return;} + + string line; + char *ch_ptr; + double d; + + for (size_t i=0; i<S->size1; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t j=0; j<S->size2; j++) { + d=gsl_matrix_get(S, i, j)+atof(ch_ptr); + gsl_matrix_set(S, i, j, d); + ch_ptr=strtok (NULL, " , \t"); + } + } + + for (size_t i=0; i<Svar->size1; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t j=0; j<Svar->size2; j++) { + d=gsl_matrix_get(Svar, i, j)+atof(ch_ptr); + gsl_matrix_set(Svar, i, j, d); + ch_ptr=strtok (NULL, " , \t"); + } + } + + infile.clear(); + infile.close(); + + return; +} + + + + +void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar) +{ + gsl_matrix_set_zero(S); + gsl_matrix_set_zero(Svar); + + string file_name; + + igzstream infile (file_ms.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open ms file: "<<file_ms<<endl; return;} + + while (!safeGetline(infile, file_name).eof()) { + ReadFile_s(file_name, S, Svar); + } + + infile.clear(); + infile.close(); + + return; +} + + + + +//read V file: V (i.e. Q) +void ReadFile_v (const string &file_v, gsl_matrix *V) +{ + igzstream infile (file_v.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open v file: "<<file_v<<endl; return;} + + string line; + char *ch_ptr; + double d; + + for (size_t i=0; i<V->size1; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t j=0; j<V->size2; j++) { + d=gsl_matrix_get(V, i, j)+atof(ch_ptr); + gsl_matrix_set(V, i, j, d); + ch_ptr=strtok (NULL, " , \t"); + } + } + + infile.clear(); + infile.close(); + + return; +} + + +void ReadFile_mv (const string &file_mv, gsl_matrix *V) +{ + gsl_matrix_set_zero(V); + + string file_name; + + igzstream infile (file_mv.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open ms file: "<<file_mv<<endl; return;} + + while (!safeGetline(infile, file_name).eof()) { + ReadFile_v(file_name, V); + } + + infile.clear(); + infile.close(); + + return; +} + + +//read q file: q, s and ni_test +void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df) +{ + igzstream infile (file_s.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open s file: "<<file_s<<endl; return;} + + string line; + char *ch_ptr; + double d; + + for (size_t i=0; i<q_vec->size; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + d=gsl_vector_get(q_vec, i)+atof(ch_ptr); + gsl_vector_set(q_vec, i, d); + } + + for (size_t i=0; i<s_vec->size; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + d=gsl_vector_get(s_vec, i)+atof(ch_ptr); + gsl_vector_set(s_vec, i, d); + } + + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + df=atof(ch_ptr); + + infile.clear(); + infile.close(); + + return; +} + + + +void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df) +{ + gsl_vector_set_zero(q_vec); + gsl_vector_set_zero(s_vec); + + string file_name; + + igzstream infile (file_mq.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mq file: "<<file_mq<<endl; return;} + + while (!safeGetline(infile, file_name).eof()) { + ReadFile_q(file_name, q_vec, s_vec, df); + } + + infile.clear(); + infile.close(); + + return; +} @@ -16,7 +16,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef __IO_H__ +#ifndef __IO_H__ #define __IO_H__ @@ -26,6 +26,8 @@ #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" +#include "gzstream.h" + #ifdef FORCE_FLOAT #include "param_float.h" #else @@ -34,6 +36,9 @@ using namespace std; + + + void ProgressBar (string str, double p, double total); void ProgressBar (string str, double p, double total, double ratio); std::istream& safeGetline(std::istream& is, std::string& t); @@ -51,17 +56,21 @@ bool ReadFile_column (const string &file_pheno, vector<int> &indicator_idv, vect bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, size_t &ns_test); bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test); +bool Bimbam_ReadOneSNP (const size_t inc, const vector<int> &indicator_idv, igzstream &infile, gsl_vector *geno, double &geno_mean); +void Plink_ReadOneSNP (const int pos, const vector<int> &indicator_idv, ifstream &infile, gsl_vector *geno, double &geno_mean); void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); void ReadFile_eigenU (const string &file_u, bool &error, gsl_matrix *U); -void ReadFile_eigenD (const string &file_d, bool &error, gsl_vector *eval); +void ReadFile_eigenD (const string &file_d, bool &error, gsl_vector *eval); bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); +bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test); +bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test); bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est); @@ -69,6 +78,29 @@ bool CountFileLines (const string &file_input, size_t &n_lines); bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SNPINFO> &snpInfo, size_t &ng_total); +bool ReadHeader (const string &line, HEADER &header); +bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_t &n_vc); + +bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin); +bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin); + +bool ReadFile_var (const string &file_var, map<string, double> &mapRS2var); +void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test); + + +void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar); +void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar); +void ReadFile_v (const string &file_v, gsl_matrix *V); +void ReadFile_mv (const string &file_mq, gsl_matrix *V); +void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df); +void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df); + +// WJA added +bool bgenKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); +bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test); +bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, const vector<size_t> &p_column, vector<int> &indicator_cvt, vector<vector<double> > &cvt, size_t &n_cvt); + + #endif @@ -1,17 +1,17 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ @@ -26,7 +26,7 @@ #include <cmath> #include <iostream> #include <stdio.h> -#include <stdlib.h> +#include <stdlib.h> #include <bitset> #include <cstring> @@ -57,48 +57,50 @@ using namespace std; -void LM::CopyFromParam (PARAM &cPar) +void LM::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; - + file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; file_out=cPar.file_out; path_out=cPar.path_out; file_gene=cPar.file_gene; - + // WJA added + file_oxford=cPar.file_oxford; + time_opt=0.0; - + ni_total=cPar.ni_total; ns_total=cPar.ns_total; ni_test=cPar.ni_test; ns_test=cPar.ns_test; n_cvt=cPar.n_cvt; - + ng_total=cPar.ng_total; ng_test=0; - - indicator_idv=cPar.indicator_idv; - indicator_snp=cPar.indicator_snp; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; snpInfo=cPar.snpInfo; - + return; } -void LM::CopyToParam (PARAM &cPar) +void LM::CopyToParam (PARAM &cPar) { - cPar.time_opt=time_opt; - + cPar.time_opt=time_opt; + cPar.ng_test=ng_test; - + return; } -void LM::WriteFiles () +void LM::WriteFiles () { string file_str; file_str=path_out+"/"+file_out; @@ -109,7 +111,7 @@ void LM::WriteFiles () if (!file_gene.empty()) { outfile<<"geneID"<<"\t"; - + if (a_mode==51) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl; } else if (a_mode==52) { @@ -119,10 +121,10 @@ void LM::WriteFiles () } else if (a_mode==54) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl; } else {} - - for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) { + + for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) { outfile<<snpInfo[t].rs_number<<"\t"; - + if (a_mode==51) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl; } else if (a_mode==52) { @@ -132,10 +134,10 @@ void LM::WriteFiles () } else if (a_mode==54) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl; } else {} - } + } } else { - outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t"; - + outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_mis"<<"\t"<<"n_obs"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t"; + if (a_mode==51) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl; } else if (a_mode==52) { @@ -145,13 +147,13 @@ void LM::WriteFiles () } else if (a_mode==54) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl; } else {} - + size_t t=0; for (size_t i=0; i<snpInfo.size(); ++i) { if (indicator_snp[i]==0) {continue;} - - outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t"; - + + outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<ni_test-snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t"; + if (a_mode==51) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl; } else if (a_mode==52) { @@ -164,8 +166,8 @@ void LM::WriteFiles () t++; } } - - + + outfile.close(); outfile.clear(); return; @@ -179,21 +181,21 @@ void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *Wt { size_t c_size=Wty->size; double d; - + gsl_vector *WtWiWtx=gsl_vector_alloc (c_size); - + gsl_blas_ddot (x, x, &xPwx); gsl_blas_ddot (x, y, &xPwy); - gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - - gsl_blas_ddot (WtWiWtx, Wtx, &d); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + + gsl_blas_ddot (WtWiWtx, Wtx, &d); xPwx-=d; - - gsl_blas_ddot (WtWiWtx, Wty, &d); + + gsl_blas_ddot (WtWiWtx, Wty, &d); xPwy-=d; - + gsl_vector_free (WtWiWtx); - + return; } @@ -202,17 +204,17 @@ void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, { size_t c_size=Wty->size; double d; - + gsl_vector *WtWiWty=gsl_vector_alloc (c_size); - + gsl_blas_ddot (y, y, &yPwy); - gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); - - gsl_blas_ddot (WtWiWty, Wty, &d); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); + + gsl_blas_ddot (WtWiWty, Wty, &d); yPwy-=d; - + gsl_vector_free (WtWiWty); - + return; } @@ -223,38 +225,38 @@ void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, cons { double yPxy=yPwy-xPwy*xPwy/xPwx; double se_wald, se_score; - + beta=xPwy/xPwx; se_wald=sqrt(yPxy/(df*xPwx) ); se_score=sqrt(yPwy/((double)n_size*xPwx) ); - + p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df); p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df); p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1); - + if (test_mode==3) {se=se_score;} else {se=se_wald;} - + return; } -void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) +void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) { ifstream infile (file_gene.c_str(), ifstream::in); if (!infile) {cout<<"error reading gene expression file:"<<file_gene<<endl; return;} - + clock_t time_start=clock(); - + string line; char *ch_ptr; - + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; int c_phen; string rs; //gene id double d; - + //calculate some basic quantities double yPwy, xPwy, xPwx; double df=(double)W->size1-(double)W->size2-1.0; @@ -262,7 +264,7 @@ void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) gsl_vector *y=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -274,42 +276,42 @@ void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wtx, x, xPwx); - + //header getline(infile, line); - + for (size_t t=0; t<ng_total; t++) { getline(infile, line); if (t%d_pace==0 || t==ng_total-1) {ProgressBar ("Performing Analysis ", t, ng_total-1);} ch_ptr=strtok ((char *)line.c_str(), " , \t"); rs=ch_ptr; - - c_phen=0; + + c_phen=0; for (size_t i=0; i<indicator_idv.size(); ++i) { ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[i]==0) {continue;} - - d=atof(ch_ptr); + + d=atof(ch_ptr); gsl_vector_set(y, c_phen, d); - + c_phen++; } - - //calculate statistics - time_start=clock(); - + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); - + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); } cout<<endl; - + gsl_vector_free(y); gsl_matrix_free(WtW); @@ -317,31 +319,259 @@ void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); - + infile.close(); infile.clear(); - + return; } +// WJA added +#include <assert.h> +void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y) +{ + string file_bgen=file_oxford+".bgen"; + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return;} + + + clock_t time_start=clock(); + + string line; + char *ch_ptr; + + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + int n_miss, c_phen; + double geno, x_mean; + + //calculate some basic quantities + double yPwy, xPwy, xPwx; + double df=(double)W->size1-(double)W->size2-1.0; + + gsl_vector *x=gsl_vector_alloc (W->size1); + gsl_vector *x_miss=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast<char*>(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast<char*>(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast<char*>(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; +// bool LongIds=bgen_flags&0x4; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<<std::endl; + + + + //start reading genotypes and analyze + for (size_t t=0; t<indicator_snp.size(); ++t) + { + +// if (t>1) {break;} + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + // read SNP header + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast<char*>(&bgen_N),4); + infile.read(reinterpret_cast<char*>(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast<char*>(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast<char*>(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast<char*>(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast<char*>(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast<char*>(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast<char*>(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast<size_t>(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast<char*>(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); + } + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i<bgen_N; ++i) { + if (indicator_idv[i]==0) {continue;} + + + bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; + } + c_phen++; + } + + x_mean/=static_cast<double>(ni_test-n_miss); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} + geno=gsl_vector_get(x, i); + if (x_mean>1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<<endl; + + gsl_vector_free(x); + gsl_vector_free(x_miss); + + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(Wty); + gsl_vector_free(Wtx); + gsl_permutation_free(pmt); + + infile.close(); + infile.clear(); + + return; +} + + + void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) { igzstream infile (file_geno.c_str(), igzstream::in); // ifstream infile (file_geno.c_str(), ifstream::in); if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;} - + clock_t time_start=clock(); - + string line; char *ch_ptr; - + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; int n_miss, c_phen; double geno, x_mean; - + //calculate some basic quantities double yPwy, xPwy, xPwx; double df=(double)W->size1-(double)W->size2-1.0; @@ -350,7 +580,7 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) gsl_vector *x_miss=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -362,58 +592,58 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); - - //start reading genotypes and analyze + + //start reading genotypes and analyze for (size_t t=0; t<indicator_snp.size(); ++t) { //if (t>1) {break;} getline(infile, line); if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} if (indicator_snp[t]==0) {continue;} - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); - + x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); for (size_t i=0; i<ni_total; ++i) { ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[i]==0) {continue;} - + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} else { - geno=atof(ch_ptr); - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); + geno=atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); x_mean+=geno; } c_phen++; - } - + } + x_mean/=(double)(ni_test-n_miss); - + for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); if (x_mean>1) { gsl_vector_set(x, i, 2-geno); } - } - - //calculate statistics - time_start=clock(); + } - gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout<<endl; gsl_vector_free(x); @@ -424,10 +654,10 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); - + infile.close(); infile.clear(); - + return; } @@ -437,21 +667,21 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) -void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) +void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) { string file_bed=file_bfile+".bed"; ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;} - + clock_t time_start=clock(); - + char ch[1]; - bitset<8> b; - + bitset<8> b; + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; int n_bit, n_miss, ci_total, ci_test; double geno, x_mean; - + //calculate some basic quantities double yPwy, xPwy, xPwx; double df=(double)W->size1-(double)W->size2-1.0; @@ -459,7 +689,7 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) gsl_vector *x=gsl_vector_alloc (W->size1); gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); - gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); gsl_vector *Wty=gsl_vector_alloc (W->size2); gsl_vector *Wtx=gsl_vector_alloc (W->size2); gsl_permutation * pmt=gsl_permutation_alloc (W->size2); @@ -471,90 +701,104 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); CalcvPv(WtWi, Wty, y, yPwy); - + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } - + //print the first three majic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } - - + + for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} if (indicator_snp[t]==0) {continue;} - + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - + //read genotypes - x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; + x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;} if (indicator_idv[ci_total]==0) {ci_total++; continue;} - + if (b[2*j]==0) { if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; } else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; } } else { - if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } + if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } else {gsl_vector_set(x, ci_test, -9); n_miss++; } } - + ci_total++; ci_test++; } } - + x_mean/=(double)(ni_test-n_miss); - - for (size_t i=0; i<ni_test; ++i) { + + for (size_t i=0; i<ni_test; ++i) { geno=gsl_vector_get(x,i); if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} if (x_mean>1) { gsl_vector_set(x, i, 2-geno); } } - - //calculate statistics - time_start=clock(); - + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); - CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); - LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout<<endl; - + gsl_vector_free(x); gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); + gsl_matrix_free(WtWi); gsl_vector_free(Wty); gsl_vector_free(Wtx); gsl_permutation_free(pmt); - + infile.close(); - infile.clear(); - + infile.clear(); + return; } + + + + + + + + + + + + + + //make sure that both y and X are centered already -void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr) +void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr) { double yty, xty, xtx, log_lr; gsl_blas_ddot(y, y, &yty); @@ -567,6 +811,6 @@ void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_ log_lr=0.5*(double)y->size*(log(yty)-log(yty-xty*xty/xtx)); pos_loglr.push_back(make_pair(i,log_lr) ); } - + return; } @@ -1,22 +1,22 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef __LM_H__ +#ifndef __LM_H__ #define __LM_H__ #include "gsl/gsl_vector.h" @@ -35,40 +35,44 @@ using namespace std; class LM { - + public: // IO related parameters int a_mode; //analysis mode, 50+1/2/3/4 for Frequentist tests size_t d_pace; //display pace - + string file_bfile; string file_geno; + string file_oxford; string file_out; string path_out; - + string file_gene; - + // Summary statistics size_t ni_total, ni_test; //number of individuals size_t ns_total, ns_test; //number of snps size_t ng_total, ng_test; //number of genes size_t n_cvt; double time_opt; //time spent - + vector<int> indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector<int> indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis - + vector<SNPINFO> snpInfo; //record SNP information - + // Not included in PARAM vector<SUMSTAT> sumStat; //Output SNPSummary Data - + // Main functions void CopyFromParam (PARAM &cPar); void CopyToParam (PARAM &cPar); void AnalyzeGene (const gsl_matrix *W, const gsl_vector *x); void AnalyzePlink (const gsl_matrix *W, const gsl_vector *y); void AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y); + // WJA added + void Analyzebgen (const gsl_matrix *W, const gsl_vector *y); + void WriteFiles (); }; void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr); diff --git a/src/lmm.cpp b/src/lmm.cpp index e0b4160..7bcf89a 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -26,7 +26,7 @@ #include <cmath> #include <iostream> #include <stdio.h> -#include <stdlib.h> +#include <stdlib.h> #include <bitset> #include <cstring> @@ -58,56 +58,58 @@ using namespace std; -void LMM::CopyFromParam (PARAM &cPar) +void LMM::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; - + file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; file_out=cPar.file_out; path_out=cPar.path_out; file_gene=cPar.file_gene; - + // WJA added + file_oxford=cPar.file_oxford; + l_min=cPar.l_min; l_max=cPar.l_max; - n_region=cPar.n_region; + n_region=cPar.n_region; l_mle_null=cPar.l_mle_null; logl_mle_H0=cPar.logl_mle_H0; - + time_UtX=0.0; time_opt=0.0; - + ni_total=cPar.ni_total; ns_total=cPar.ns_total; ni_test=cPar.ni_test; ns_test=cPar.ns_test; n_cvt=cPar.n_cvt; - + ng_total=cPar.ng_total; ng_test=0; - - indicator_idv=cPar.indicator_idv; - indicator_snp=cPar.indicator_snp; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; snpInfo=cPar.snpInfo; - + return; } -void LMM::CopyToParam (PARAM &cPar) +void LMM::CopyToParam (PARAM &cPar) { cPar.time_UtX=time_UtX; - cPar.time_opt=time_opt; - + cPar.time_opt=time_opt; + cPar.ng_test=ng_test; - + return; } -void LMM::WriteFiles () +void LMM::WriteFiles () { string file_str; file_str=path_out+"/"+file_out; @@ -118,7 +120,7 @@ void LMM::WriteFiles () if (!file_gene.empty()) { outfile<<"geneID"<<"\t"; - + if (a_mode==1) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"p_wald"<<endl; } else if (a_mode==2) { @@ -128,10 +130,10 @@ void LMM::WriteFiles () } else if (a_mode==4) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"l_mle"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl; } else {} - - for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) { + + for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) { outfile<<snpInfo[t].rs_number<<"\t"; - + if (a_mode==1) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].lambda_remle<<"\t"<<sumStat[t].p_wald <<endl; } else if (a_mode==2) { @@ -141,10 +143,10 @@ void LMM::WriteFiles () } else if (a_mode==4) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].lambda_remle<<"\t"<<sumStat[t].lambda_mle<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl; } else {} - } + } } else { outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t"; - + if (a_mode==1) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"p_wald"<<endl; } else if (a_mode==2) { @@ -154,13 +156,13 @@ void LMM::WriteFiles () } else if (a_mode==4) { outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"l_mle"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl; } else {} - + size_t t=0; for (size_t i=0; i<snpInfo.size(); ++i) { if (indicator_snp[i]==0) {continue;} - + outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t"; - + if (a_mode==1) { outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].lambda_remle<<"\t"<<sumStat[t].p_wald <<endl; } else if (a_mode==2) { @@ -173,8 +175,8 @@ void LMM::WriteFiles () t++; } } - - + + outfile.close(); outfile.clear(); return; @@ -196,10 +198,10 @@ size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { size_t index; size_t l, h; if (b>a) {l=a; h=b;} else {l=b; h=a;} - + size_t n=n_cvt+2; - index=(2*n-l+2)*(l-1)/2+h-l; - + index=(2*n-l+2)*(l-1)/2+h-l; + return index; } @@ -209,12 +211,12 @@ void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval size_t index_ab, index_aw, index_bw, index_ww; double p_ab; double ps_ab, ps_aw, ps_bw, ps_ww; - + for (size_t p=0; p<=n_cvt+1; ++p) { for (size_t a=p+1; a<=n_cvt+2; ++a) { for (size_t b=a; b<=n_cvt+2; ++b) { index_ab=GetabIndex (a, b, n_cvt); - if (p==0) { + if (p==0) { gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); gsl_blas_ddot (Hi_eval, &Uab_col.vector, &p_ab); if (e_mode!=0) {p_ab=gsl_vector_get (ab, index_ab)-p_ab;} @@ -224,12 +226,12 @@ void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval index_aw=GetabIndex (a, p, n_cvt); index_bw=GetabIndex (b, p, n_cvt); index_ww=GetabIndex (p, p, n_cvt); - + ps_ab=gsl_matrix_get (Pab, p-1, index_ab); ps_aw=gsl_matrix_get (Pab, p-1, index_aw); ps_bw=gsl_matrix_get (Pab, p-1, index_bw); ps_ww=gsl_matrix_get (Pab, p-1, index_ww); - + p_ab=ps_ab-ps_aw*ps_bw/ps_ww; gsl_matrix_set (Pab, p, index_ab, p_ab); } @@ -245,12 +247,12 @@ void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_e size_t index_ab, index_aw, index_bw, index_ww; double p2_ab; double ps2_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww; - + for (size_t p=0; p<=n_cvt+1; ++p) { for (size_t a=p+1; a<=n_cvt+2; ++a) { for (size_t b=a; b<=n_cvt+2; ++b) { index_ab=GetabIndex (a, b, n_cvt); - if (p==0) { + if (p==0) { gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); gsl_blas_ddot (HiHi_eval, &Uab_col.vector, &p2_ab); if (e_mode!=0) {p2_ab=p2_ab-gsl_vector_get (ab, index_ab)+2.0*gsl_matrix_get (Pab, 0, index_ab);} @@ -260,7 +262,7 @@ void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_e index_aw=GetabIndex (a, p, n_cvt); index_bw=GetabIndex (b, p, n_cvt); index_ww=GetabIndex (p, p, n_cvt); - + ps2_ab=gsl_matrix_get (PPab, p-1, index_ab); ps_aw=gsl_matrix_get (Pab, p-1, index_aw); ps_bw=gsl_matrix_get (Pab, p-1, index_bw); @@ -268,11 +270,11 @@ void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_e ps2_aw=gsl_matrix_get (PPab, p-1, index_aw); ps2_bw=gsl_matrix_get (PPab, p-1, index_bw); ps2_ww=gsl_matrix_get (PPab, p-1, index_ww); - + p2_ab=ps2_ab+ps_aw*ps_bw*ps2_ww/(ps_ww*ps_ww); p2_ab-=(ps_aw*ps2_bw+ps_bw*ps2_aw)/ps_ww; gsl_matrix_set (PPab, p, index_ab, p2_ab); - + } } } @@ -286,12 +288,12 @@ void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiH size_t index_ab, index_aw, index_bw, index_ww; double p3_ab; double ps3_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww, ps3_aw, ps3_bw, ps3_ww; - + for (size_t p=0; p<=n_cvt+1; ++p) { for (size_t a=p+1; a<=n_cvt+2; ++a) { for (size_t b=a; b<=n_cvt+2; ++b) { index_ab=GetabIndex (a, b, n_cvt); - if (p==0) { + if (p==0) { gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); gsl_blas_ddot (HiHiHi_eval, &Uab_col.vector, &p3_ab); if (e_mode!=0) {p3_ab=gsl_vector_get (ab, index_ab)-p3_ab+3.0*gsl_matrix_get (PPab, 0, index_ab)-3.0*gsl_matrix_get (Pab, 0, index_ab);} @@ -301,7 +303,7 @@ void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiH index_aw=GetabIndex (a, p, n_cvt); index_bw=GetabIndex (b, p, n_cvt); index_ww=GetabIndex (p, p, n_cvt); - + ps3_ab=gsl_matrix_get (PPPab, p-1, index_ab); ps_aw=gsl_matrix_get (Pab, p-1, index_aw); ps_bw=gsl_matrix_get (Pab, p-1, index_bw); @@ -312,11 +314,11 @@ void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiH ps3_aw=gsl_matrix_get (PPPab, p-1, index_aw); ps3_bw=gsl_matrix_get (PPPab, p-1, index_bw); ps3_ww=gsl_matrix_get (PPPab, p-1, index_ww); - + p3_ab=ps3_ab-ps_aw*ps_bw*ps2_ww*ps2_ww/(ps_ww*ps_ww*ps_ww); p3_ab-=(ps_aw*ps3_bw+ps_bw*ps3_aw+ps2_aw*ps2_bw)/ps_ww; p3_ab+=(ps_aw*ps2_bw*ps2_ww+ps_bw*ps2_aw*ps2_ww+ps_aw*ps_bw*ps3_ww)/(ps_ww*ps_ww); - + gsl_matrix_set (PPPab, p, index_ab, p3_ab); } } @@ -331,119 +333,119 @@ double LogL_f (double l, void *params) { FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} - + double f=0.0, logdet_h=0.0, d; size_t index_yy; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); - gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_div (Hi_eval, v_temp); + for (size_t i=0; i<(p->eval)->size; ++i) { d=gsl_vector_get (v_temp, i); logdet_h+=log(fabs(d)); - } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + double c=0.5*(double)ni_test*(log((double)ni_test)-log(2*M_PI)-1.0); - - index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); f=c-0.5*logdet_h-0.5*(double)ni_test*log(P_yy); - + gsl_matrix_free (Pab); gsl_vector_free (Hi_eval); gsl_vector_free (v_temp); return f; } - - + + double LogL_dev1 (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} - + double dev1=0.0, trace_Hi=0.0; size_t index_yy; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); - + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); - + if (p->e_mode!=0) {trace_Hi=(double)ni_test-trace_Hi;} - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - - double trace_HiK=((double)ni_test-trace_Hi)/l; - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + + double trace_HiK=((double)ni_test-trace_Hi)/l; + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); - + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); - double yPKPy=(P_yy-PP_yy)/l; + double yPKPy=(P_yy-PP_yy)/l; dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy; - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return dev1; } - - + + double LogL_dev2 (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} - + double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0; size_t index_yy; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); @@ -451,71 +453,71 @@ double LogL_dev2 (double l, void *params) gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); gsl_vector_mul (HiHiHi_eval, Hi_eval); - + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l); - + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); - double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); - + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); + double yPKPy=(P_yy-PP_yy)/l; double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); - + dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_matrix_free (PPPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); gsl_vector_free (HiHiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return dev2; } - - - - - + + + + + void LogL_dev12 (double l, void *params, double *dev1, double *dev2) { FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} - + double trace_Hi=0.0, trace_HiHi=0.0; size_t index_yy; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); @@ -523,54 +525,54 @@ void LogL_dev12 (double l, void *params, double *dev1, double *dev2) gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); gsl_vector_mul (HiHiHi_eval, Hi_eval); - + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + double trace_HiK=((double)ni_test-trace_Hi)/l; double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l); - + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); - + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); - double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); - - double yPKPy=(P_yy-PP_yy)/l; + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); + + double yPKPy=(P_yy-PP_yy)/l; double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); - + *dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy; *dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_matrix_free (PPPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); gsl_vector_free (HiHiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return; } @@ -578,39 +580,39 @@ void LogL_dev12 (double l, void *params, double *dev1, double *dev2) double LogRL_f (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + double df; size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} - + double f=0.0, logdet_h=0.0, logdet_hiw=0.0, d; size_t index_ww; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *Iab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); - gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_div (Hi_eval, v_temp); + for (size_t i=0; i<(p->eval)->size; ++i) { d=gsl_vector_get (v_temp, i); logdet_h+=log(fabs(d)); } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); gsl_vector_set_all (v_temp, 1.0); - CalcPab (n_cvt, p->e_mode, v_temp, p->Uab, p->ab, Iab); - + CalcPab (n_cvt, p->e_mode, v_temp, p->Uab, p->ab, Iab); + //calculate |WHiW|-|WW| logdet_hiw=0.0; for (size_t i=0; i<nc_total; ++i) { @@ -620,12 +622,12 @@ double LogRL_f (double l, void *params) d=gsl_matrix_get (Iab, i, index_ww); logdet_hiw-=log(d); } - index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_ww); - - double c=0.5*df*(log(df)-log(2*M_PI)-1.0); + + double c=0.5*df*(log(df)-log(2*M_PI)-1.0); f=c-0.5*logdet_h-0.5*logdet_hiw-0.5*df*log(P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (Iab); gsl_vector_free (Hi_eval); @@ -637,44 +639,44 @@ double LogRL_f (double l, void *params) double LogRL_dev1 (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + double df; size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} - + double dev1=0.0, trace_Hi=0.0; size_t index_ww; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); - + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + //calculate tracePK and trace PKPK double trace_P=trace_Hi; double ps_ww, ps2_ww; @@ -685,21 +687,21 @@ double LogRL_dev1 (double l, void *params) trace_P-=ps2_ww/ps_ww; } double trace_PK=(df-trace_P)/l; - + //calculate yPKPy, yPKPKPy index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_ww); - double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww); - double yPKPy=(P_yy-PP_yy)/l; - - dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy; - + double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww); + double yPKPy=(P_yy-PP_yy)/l; + + dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy; + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return dev1; } @@ -708,19 +710,19 @@ double LogRL_dev1 (double l, void *params) double LogRL_dev2 (double l, void *params) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + double df; size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} - + double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0; size_t index_ww; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); @@ -728,31 +730,31 @@ double LogRL_dev2 (double l, void *params) gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); gsl_vector_mul (HiHiHi_eval, Hi_eval); - + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + //calculate tracePK and trace PKPK double trace_P=trace_Hi, trace_PP=trace_HiHi; double ps_ww, ps2_ww, ps3_ww; @@ -765,46 +767,46 @@ double LogRL_dev2 (double l, void *params) trace_PP+=ps2_ww*ps2_ww/(ps_ww*ps_ww)-2.0*ps3_ww/ps_ww; } double trace_PKPK=(df+trace_PP-2.0*trace_P)/(l*l); - + //calculate yPKPy, yPKPKPy index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_ww); double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww); - double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww); - double yPKPy=(P_yy-PP_yy)/l; + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww); + double yPKPy=(P_yy-PP_yy)/l; double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); - + dev2=0.5*trace_PKPK-0.5*df*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_matrix_free (PPPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); gsl_vector_free (HiHiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return dev2; } - + void LogRL_dev12 (double l, void *params, double *dev1, double *dev2) { - FUNC_PARAM *p=(FUNC_PARAM *) params; + FUNC_PARAM *p=(FUNC_PARAM *) params; size_t n_cvt=p->n_cvt; - size_t ni_test=p->ni_test; + size_t ni_test=p->ni_test; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + double df; size_t nc_total; if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} - + double trace_Hi=0.0, trace_HiHi=0.0; size_t index_ww; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); @@ -812,31 +814,31 @@ void LogRL_dev12 (double l, void *params, double *dev1, double *dev2) gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); - + gsl_vector_memcpy (v_temp, p->eval); gsl_vector_scale (v_temp, l); if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + gsl_vector_memcpy (HiHi_eval, Hi_eval); - gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); gsl_vector_mul (HiHiHi_eval, Hi_eval); - + gsl_vector_set_all (v_temp, 1.0); gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); - - if (p->e_mode!=0) { + + if (p->e_mode!=0) { trace_Hi=(double)ni_test-trace_Hi; trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; } - - CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); - CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); - CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); - + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + //calculate tracePK and trace PKPK double trace_P=trace_Hi, trace_PP=trace_HiHi; double ps_ww, ps2_ww, ps3_ww; @@ -850,29 +852,29 @@ void LogRL_dev12 (double l, void *params, double *dev1, double *dev2) } double trace_PK=(df-trace_P)/l; double trace_PKPK=(df+trace_PP-2.0*trace_P)/(l*l); - + //calculate yPKPy, yPKPKPy index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); double P_yy=gsl_matrix_get (Pab, nc_total, index_ww); double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww); - double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww); - double yPKPy=(P_yy-PP_yy)/l; + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww); + double yPKPy=(P_yy-PP_yy)/l; double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); - + *dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy; *dev2=0.5*trace_PKPK-0.5*df*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); - + gsl_matrix_free (Pab); gsl_matrix_free (PPab); gsl_matrix_free (PPPab); gsl_vector_free (Hi_eval); gsl_vector_free (HiHi_eval); gsl_vector_free (HiHiHi_eval); - gsl_vector_free (v_temp); - + gsl_vector_free (v_temp); + return ; } - + @@ -884,35 +886,35 @@ void LMM::CalcRLWald (const double &l, const FUNC_PARAM ¶ms, double &beta, d { size_t n_cvt=params.n_cvt; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + int df=(int)ni_test-(int)n_cvt-1; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc(params.eval->size); gsl_vector *v_temp=gsl_vector_alloc(params.eval->size); - + gsl_vector_memcpy (v_temp, params.eval); gsl_vector_scale (v_temp, l); if (params.e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); - gsl_vector_div (Hi_eval, v_temp); - - CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); - - size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + gsl_vector_div (Hi_eval, v_temp); + + CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); size_t index_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt); size_t index_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt); double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); double P_xx=gsl_matrix_get (Pab, n_cvt, index_xx); - double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); - double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); - + double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); + double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); + beta=P_xy/P_xx; double tau=(double)df/Px_yy; - se=sqrt(1.0/(tau*P_xx)); - p_wald=gsl_cdf_fdist_Q ((P_yy-Px_yy)*tau, 1.0, df); -// p_wald=gsl_cdf_chisq_Q ((P_yy-Px_yy)*tau, 1); - + se=sqrt(1.0/(tau*P_xx)); + p_wald=gsl_cdf_fdist_Q ((P_yy-Px_yy)*tau, 1.0, df); +// p_wald=gsl_cdf_chisq_Q ((P_yy-Px_yy)*tau, 1); + gsl_matrix_free (Pab); gsl_vector_free (Hi_eval); gsl_vector_free (v_temp); @@ -924,36 +926,36 @@ void LMM::CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, { size_t n_cvt=params.n_cvt; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + int df=(int)ni_test-(int)n_cvt-1; - + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc(params.eval->size); gsl_vector *v_temp=gsl_vector_alloc(params.eval->size); - + gsl_vector_memcpy (v_temp, params.eval); gsl_vector_scale (v_temp, l); if (params.e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} gsl_vector_add_constant (v_temp, 1.0); - gsl_vector_div (Hi_eval, v_temp); - - CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); - - size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + gsl_vector_div (Hi_eval, v_temp); + + CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); size_t index_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt); size_t index_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt); double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); double P_xx=gsl_matrix_get (Pab, n_cvt, index_xx); - double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); - double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); - + double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); + double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); + beta=P_xy/P_xx; double tau=(double)df/Px_yy; - se=sqrt(1.0/(tau*P_xx)); - + se=sqrt(1.0/(tau*P_xx)); + p_score=gsl_cdf_fdist_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1.0, df); -// p_score=gsl_cdf_chisq_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1); - +// p_score=gsl_cdf_chisq_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1); + gsl_matrix_free (Pab); gsl_vector_free (Hi_eval); gsl_vector_free (v_temp); @@ -967,131 +969,131 @@ void LMM::CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, -void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) +void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) { size_t index_ab; size_t n_cvt=UtW->size2; - + gsl_vector *u_a=gsl_vector_alloc (Uty->size); - + for (size_t a=1; a<=n_cvt+2; ++a) { if (a==n_cvt+1) {continue;} - + if (a==n_cvt+2) {gsl_vector_memcpy (u_a, Uty);} else { gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, a-1); gsl_vector_memcpy (u_a, &UtW_col.vector); } - - for (size_t b=a; b>=1; --b) { + + for (size_t b=a; b>=1; --b) { if (b==n_cvt+1) {continue;} - + index_ab=GetabIndex (a, b, n_cvt); gsl_vector_view Uab_col=gsl_matrix_column (Uab, index_ab); - + if (b==n_cvt+2) {gsl_vector_memcpy (&Uab_col.vector, Uty);} else { gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, b-1); gsl_vector_memcpy (&Uab_col.vector, &UtW_col.vector); - } - + } + gsl_vector_mul(&Uab_col.vector, u_a); } } - + gsl_vector_free (u_a); return; } -void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_vector *Utx, gsl_matrix *Uab) -{ +void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_vector *Utx, gsl_matrix *Uab) +{ size_t index_ab; size_t n_cvt=UtW->size2; - - for (size_t b=1; b<=n_cvt+2; ++b) { + + for (size_t b=1; b<=n_cvt+2; ++b) { index_ab=GetabIndex (n_cvt+1, b, n_cvt); gsl_vector_view Uab_col=gsl_matrix_column (Uab, index_ab); - + if (b==n_cvt+2) {gsl_vector_memcpy (&Uab_col.vector, Uty);} else if (b==n_cvt+1) {gsl_vector_memcpy (&Uab_col.vector, Utx);} else { gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, b-1); gsl_vector_memcpy (&Uab_col.vector, &UtW_col.vector); } - + gsl_vector_mul(&Uab_col.vector, Utx); } - + return; } -void Calcab (const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) +void Calcab (const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) { size_t index_ab; size_t n_cvt=W->size2; - + double d; gsl_vector *v_a=gsl_vector_alloc (y->size); gsl_vector *v_b=gsl_vector_alloc (y->size); - + for (size_t a=1; a<=n_cvt+2; ++a) { if (a==n_cvt+1) {continue;} - + if (a==n_cvt+2) {gsl_vector_memcpy (v_a, y);} else { gsl_vector_const_view W_col=gsl_matrix_const_column (W, a-1); gsl_vector_memcpy (v_a, &W_col.vector); } - - for (size_t b=a; b>=1; --b) { + + for (size_t b=a; b>=1; --b) { if (b==n_cvt+1) {continue;} - + index_ab=GetabIndex (a, b, n_cvt); - + if (b==n_cvt+2) {gsl_vector_memcpy (v_b, y);} else { gsl_vector_const_view W_col=gsl_matrix_const_column (W, b-1); gsl_vector_memcpy (v_b, &W_col.vector); - } - + } + gsl_blas_ddot (v_a, v_b, &d); gsl_vector_set(ab, index_ab, d); } } - + gsl_vector_free (v_a); gsl_vector_free (v_b); return; } -void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_vector *ab) -{ +void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_vector *ab) +{ size_t index_ab; size_t n_cvt=W->size2; - + double d; gsl_vector *v_b=gsl_vector_alloc (y->size); - - for (size_t b=1; b<=n_cvt+2; ++b) { + + for (size_t b=1; b<=n_cvt+2; ++b) { index_ab=GetabIndex (n_cvt+1, b, n_cvt); - + if (b==n_cvt+2) {gsl_vector_memcpy (v_b, y);} else if (b==n_cvt+1) {gsl_vector_memcpy (v_b, x);} else { gsl_vector_const_view W_col=gsl_matrix_const_column (W, b-1); gsl_vector_memcpy (v_b, &W_col.vector); } - + gsl_blas_ddot (x, v_b, &d); gsl_vector_set(ab, index_ab, d); } - + gsl_vector_free (v_b); - + return; } @@ -1099,101 +1101,101 @@ void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_ -void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) +void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) { - ifstream infile (file_gene.c_str(), ifstream::in); + igzstream infile (file_gene.c_str(), igzstream::in); if (!infile) {cout<<"error reading gene expression file:"<<file_gene<<endl; return;} - + clock_t time_start=clock(); - + string line; char *ch_ptr; - + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; double logl_H1=0.0, logl_H0=0.0, l_H0; int c_phen; string rs; //gene id double d; - + //Calculate basic quantities size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - + gsl_vector *y=gsl_vector_alloc (U->size1); gsl_vector *Uty=gsl_vector_alloc (U->size2); gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + gsl_vector *ab=gsl_vector_alloc (n_index); + //header getline(infile, line); - + for (size_t t=0; t<ng_total; t++) { !safeGetline(infile, line).eof(); if (t%d_pace==0 || t==ng_total-1) {ProgressBar ("Performing Analysis ", t, ng_total-1);} ch_ptr=strtok ((char *)line.c_str(), " , \t"); rs=ch_ptr; - - c_phen=0; + + c_phen=0; for (size_t i=0; i<indicator_idv.size(); ++i) { ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[i]==0) {continue;} - - d=atof(ch_ptr); + + d=atof(ch_ptr); gsl_vector_set(y, c_phen, d); - + c_phen++; } - + time_start=clock(); - gsl_blas_dgemv (CblasTrans, 1.0, U, y, 0.0, Uty); + gsl_blas_dgemv (CblasTrans, 1.0, U, y, 0.0, Uty); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //calculate null time_start=clock(); - + gsl_matrix_set_zero (Uab); - + CalcUab (UtW, Uty, Uab); FUNC_PARAM param0={false, ni_test, n_cvt, eval, Uab, ab, 0}; - + if (a_mode==2 || a_mode==3 || a_mode==4) { CalcLambda('L', param0, l_min, l_max, n_region, l_H0, logl_H0); } - + //calculate alternative CalcUab(UtW, Uty, Utx, Uab); FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; - + //3 is before 1 if (a_mode==3 || a_mode==4) { CalcRLScore (l_H0, param1, beta, se, p_score); } - + if (a_mode==1 || a_mode==4) { CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); CalcRLWald (lambda_remle, param1, beta, se, p_wald); } - + if (a_mode==2 || a_mode==4) { CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), 1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), 1); } - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); } cout<<endl; - + gsl_vector_free (y); gsl_vector_free (Uty); gsl_matrix_free (Uab); gsl_vector_free (ab); - + infile.close(); infile.clear(); - + return; } @@ -1201,22 +1203,22 @@ void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma -void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) +void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) { igzstream infile (file_geno.c_str(), igzstream::in); // ifstream infile (file_geno.c_str(), ifstream::in); if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;} clock_t time_start=clock(); - + string line; char *ch_ptr; - + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; double logl_H1=0.0; int n_miss, c_phen; double geno, x_mean; - + //Calculate basic quantities size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; @@ -1224,45 +1226,45 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ gsl_vector *x_miss=gsl_vector_alloc (U->size1); gsl_vector *Utx=gsl_vector_alloc (U->size2); gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { // gsl_vector_set_zero (ab); // Calcab (W, y, ab); -// } - - //start reading genotypes and analyze +// } + + //start reading genotypes and analyze for (size_t t=0; t<indicator_snp.size(); ++t) { // if (t>1) {break;} !safeGetline(infile, line).eof(); if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} if (indicator_snp[t]==0) {continue;} - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); - ch_ptr=strtok (NULL, " , \t"); - + ch_ptr=strtok (NULL, " , \t"); + x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); for (size_t i=0; i<ni_total; ++i) { ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[i]==0) {continue;} - + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} else { - geno=atof(ch_ptr); - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); + geno=atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); x_mean+=geno; } c_phen++; - } - + } + x_mean/=(double)(ni_test-n_miss); - + for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); @@ -1270,55 +1272,55 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ gsl_vector_set(x, i, 2-geno); } } - - + + //calculate statistics time_start=clock(); - gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + CalcUab(UtW, Uty, Utx, Uab); // if (e_mode!=0) { // Calcab (W, y, x, ab); // } - + time_start=clock(); FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; - + //3 is before 1 if (a_mode==3 || a_mode==4) { CalcRLScore (l_mle_null, param1, beta, se, p_score); } - + if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); CalcRLWald (lambda_remle, param1, beta, se, p_wald); } - + if (a_mode==2 || a_mode==4) { CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } - + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + if (x_mean>1) {beta*=-1;} - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout<<endl; - + gsl_vector_free (x); gsl_vector_free (x_miss); gsl_vector_free (Utx); gsl_matrix_free (Uab); gsl_vector_free (ab); - + infile.close(); infile.clear(); - + return; } @@ -1328,37 +1330,37 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ -void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) +void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) { string file_bed=file_bfile+".bed"; ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;} - + clock_t time_start=clock(); - + char ch[1]; - bitset<8> b; - + bitset<8> b; + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; double logl_H1=0.0; int n_bit, n_miss, ci_total, ci_test; double geno, x_mean; - + //Calculate basic quantities size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; gsl_vector *x=gsl_vector_alloc (U->size1); gsl_vector *Utx=gsl_vector_alloc (U->size2); - gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { // gsl_vector_set_zero (ab); // Calcab (W, y, ab); // } - + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } @@ -1368,16 +1370,16 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m infile.read(ch,1); b=ch[0]; } - - + + for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} if (indicator_snp[t]==0) {continue;} - + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - + //read genotypes - x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; + x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; @@ -1390,7 +1392,7 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; } } else { - if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } + if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } else {gsl_vector_set(x, ci_test, -9); n_miss++; } } @@ -1398,105 +1400,345 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m ci_test++; } } - + x_mean/=(double)(ni_test-n_miss); - - for (size_t i=0; i<ni_test; ++i) { + + for (size_t i=0; i<ni_test; ++i) { geno=gsl_vector_get(x,i); if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} if (x_mean>1) { gsl_vector_set(x, i, 2-geno); } } - + //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + CalcUab(UtW, Uty, Utx, Uab); // if (e_mode!=0) { // Calcab (W, y, x, ab); // } - + time_start=clock(); FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; - + //3 is before 1, for beta if (a_mode==3 || a_mode==4) { CalcRLScore (l_mle_null, param1, beta, se, p_score); } - + if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); CalcRLWald (lambda_remle, param1, beta, se, p_wald); } - + if (a_mode==2 || a_mode==4) { CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } - - if (x_mean>1) {beta*=-1;} - + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); - } + } cout<<endl; - + gsl_vector_free (x); gsl_vector_free (Utx); gsl_matrix_free (Uab); gsl_vector_free (ab); - + infile.close(); - infile.clear(); - + infile.clear(); + return; } +// WJA added +#include <assert.h> +void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) +{ + string file_bgen=file_oxford+".bgen"; + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return;} + + + clock_t time_start=clock(); + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + double logl_H1=0.0; + int n_miss, c_phen; + double geno, x_mean; + + //Calculate basic quantities + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_vector *x=gsl_vector_alloc (U->size1); + gsl_vector *x_miss=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast<char*>(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast<char*>(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast<char*>(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; +// bool LongIds=bgen_flags&0x4; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<<std::endl; + + + + //start reading genotypes and analyze + for (size_t t=0; t<indicator_snp.size(); ++t) + { + +// if (t>1) {break;} + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + // read SNP header + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); -void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector<pair<size_t, double> > &pos_loglr) + infile.read(reinterpret_cast<char*>(&bgen_N),4); + infile.read(reinterpret_cast<char*>(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast<char*>(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast<char*>(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast<char*>(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast<char*>(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast<char*>(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast<char*>(&bgen_P),4); + else + bgen_P=6*bgen_N; + + infile.ignore(static_cast<size_t>(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast<char*>(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); + } + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i<bgen_N; ++i) { + if (indicator_idv[i]==0) {continue;} + + + bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; + } + c_phen++; + } + + x_mean/=static_cast<double>(ni_test-n_miss); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} + geno=gsl_vector_get(x, i); + if (x_mean>1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + CalcUab(UtW, Uty, Utx, Uab); +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<<endl; + + gsl_vector_free (x); + gsl_vector_free (x_miss); + gsl_vector_free (Utx); + gsl_matrix_free (Uab); + gsl_vector_free (ab); + + infile.close(); + infile.clear(); + + return; + +} + + + +void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector<pair<size_t, double> > &pos_loglr) { double logl_H0, logl_H1, log_lr, lambda0, lambda1; - + gsl_vector *w=gsl_vector_alloc (Uty->size); - gsl_matrix *Utw=gsl_matrix_alloc (Uty->size, 1); + gsl_matrix *Utw=gsl_matrix_alloc (Uty->size, 1); gsl_matrix *Uab=gsl_matrix_alloc (Uty->size, 6); - gsl_vector *ab=gsl_vector_alloc (6); - + gsl_vector *ab=gsl_vector_alloc (6); + gsl_vector_set_zero(ab); gsl_vector_set_all (w, 1.0); - gsl_vector_view Utw_col=gsl_matrix_column (Utw, 0); - gsl_blas_dgemv (CblasTrans, 1.0, U, w, 0.0, &Utw_col.vector); - - CalcUab (Utw, Uty, Uab) ; - FUNC_PARAM param0={true, Uty->size, 1, K_eval, Uab, ab, 0}; - + gsl_vector_view Utw_col=gsl_matrix_column (Utw, 0); + gsl_blas_dgemv (CblasTrans, 1.0, U, w, 0.0, &Utw_col.vector); + + CalcUab (Utw, Uty, Uab) ; + FUNC_PARAM param0={true, Uty->size, 1, K_eval, Uab, ab, 0}; + CalcLambda('L', param0, l_min, l_max, n_region, lambda0, logl_H0); - + for (size_t i=0; i<UtX->size2; ++i) { gsl_vector_const_view UtX_col=gsl_matrix_const_column (UtX, i); CalcUab(Utw, Uty, &UtX_col.vector, Uab); FUNC_PARAM param1={false, UtX->size1, 1, K_eval, Uab, ab, 0}; - + CalcLambda ('L', param1, l_min, l_max, n_region, lambda1, logl_H1); - log_lr=logl_H1-logl_H0; - + log_lr=logl_H1-logl_H0; + pos_loglr.push_back(make_pair(i,log_lr) ); } - + gsl_vector_free (w); gsl_matrix_free (Utw); gsl_matrix_free (Uab); gsl_vector_free (ab); - + return; } @@ -1506,17 +1748,17 @@ void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf) { if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;} - + vector<pair<double, double> > lambda_lh; - + //evaluate first order derivates in different intervals double lambda_l, lambda_h, lambda_interval=log(l_max/l_min)/(double)n_region; double dev1_l, dev1_h, logf_l, logf_h; - + for (size_t i=0; i<n_region; ++i) { lambda_l=l_min*exp(lambda_interval*i); lambda_h=l_min*exp(lambda_interval*(i+1.0)); - + if (func_name=='R' || func_name=='r') { dev1_l=LogRL_dev1 (lambda_l, ¶ms); dev1_h=LogRL_dev1 (lambda_h, ¶ms); @@ -1525,12 +1767,12 @@ void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, c dev1_l=LogL_dev1 (lambda_l, ¶ms); dev1_h=LogL_dev1 (lambda_h, ¶ms); } - + if (dev1_l*dev1_h<=0) { lambda_lh.push_back(make_pair(lambda_l, lambda_h)); } } - + //if derivates do not change signs in any interval if (lambda_lh.empty()) { if (func_name=='R' || func_name=='r') { @@ -1541,21 +1783,21 @@ void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, c logf_l=LogL_f (l_min, ¶ms); logf_h=LogL_f (l_max, ¶ms); } - + if (logf_l>=logf_h) {lambda=l_min; logf=logf_l;} else {lambda=l_max; logf=logf_h;} } else { //if derivates change signs int status; int iter=0, max_iter=100; - double l, l_temp; - + double l, l_temp; + gsl_function F; gsl_function_fdf FDF; - + F.params=¶ms; FDF.params=¶ms; - + if (func_name=='R' || func_name=='r') { F.function=&LogRL_dev1; FDF.f=&LogRL_dev1; @@ -1568,57 +1810,57 @@ void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, c FDF.df=&LogL_dev2; FDF.fdf=&LogL_dev12; } - + const gsl_root_fsolver_type *T_f; gsl_root_fsolver *s_f; T_f=gsl_root_fsolver_brent; s_f=gsl_root_fsolver_alloc (T_f); - + const gsl_root_fdfsolver_type *T_fdf; gsl_root_fdfsolver *s_fdf; T_fdf=gsl_root_fdfsolver_newton; - s_fdf=gsl_root_fdfsolver_alloc(T_fdf); - + s_fdf=gsl_root_fdfsolver_alloc(T_fdf); + for (vector<double>::size_type i=0; i<lambda_lh.size(); ++i) { lambda_l=lambda_lh[i].first; lambda_h=lambda_lh[i].second; - + gsl_root_fsolver_set (s_f, &F, lambda_l, lambda_h); - + do { iter++; status=gsl_root_fsolver_iterate (s_f); l=gsl_root_fsolver_root (s_f); lambda_l=gsl_root_fsolver_x_lower (s_f); lambda_h=gsl_root_fsolver_x_upper (s_f); - status=gsl_root_test_interval (lambda_l, lambda_h, 0, 1e-1); + status=gsl_root_test_interval (lambda_l, lambda_h, 0, 1e-1); } - while (status==GSL_CONTINUE && iter<max_iter); - + while (status==GSL_CONTINUE && iter<max_iter); + iter=0; - - gsl_root_fdfsolver_set (s_fdf, &FDF, l); - + + gsl_root_fdfsolver_set (s_fdf, &FDF, l); + do { iter++; status=gsl_root_fdfsolver_iterate (s_fdf); l_temp=l; l=gsl_root_fdfsolver_root (s_fdf); - status=gsl_root_test_delta (l, l_temp, 0, 1e-5); + status=gsl_root_test_delta (l, l_temp, 0, 1e-5); } - while (status==GSL_CONTINUE && iter<max_iter && l>l_min && l<l_max); - + while (status==GSL_CONTINUE && iter<max_iter && l>l_min && l<l_max); + l=l_temp; if (l<l_min) {l=l_min;} if (l>l_max) {l=l_max;} - if (func_name=='R' || func_name=='r') {logf_l=LogRL_f (l, ¶ms);} else {logf_l=LogL_f (l, ¶ms);} - + if (func_name=='R' || func_name=='r') {logf_l=LogRL_f (l, ¶ms);} else {logf_l=LogL_f (l, ¶ms);} + if (i==0) {logf=logf_l; lambda=l;} else if (logf<logf_l) {logf=logf_l; lambda=l;} else {} } - gsl_root_fsolver_free (s_f); - gsl_root_fdfsolver_free (s_fdf); - + gsl_root_fsolver_free (s_f); + gsl_root_fdfsolver_free (s_fdf); + if (func_name=='R' || func_name=='r') { logf_l=LogRL_f (l_min, ¶ms); logf_h=LogRL_f (l_max, ¶ms); @@ -1627,11 +1869,11 @@ void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, c logf_l=LogL_f (l_min, ¶ms); logf_h=LogL_f (l_max, ¶ms); } - - if (logf_l>logf) {lambda=l_min; logf=logf_l;} + + if (logf_l>logf) {lambda=l_min; logf=logf_l;} if (logf_h>logf) {lambda=l_max; logf=logf_h;} } - + return; } @@ -1646,53 +1888,53 @@ void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix size_t n_cvt=UtW->size2, ni_test=UtW->size1; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - - gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { // gsl_vector_set_zero (ab); // Calcab (W, y, ab); // } - + FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0}; - + CalcLambda(func_name, param0, l_min, l_max, n_region, lambda, logl_H0); - - gsl_matrix_free(Uab); - gsl_vector_free(ab); - + + gsl_matrix_free(Uab); + gsl_vector_free(ab); + return; } - - + + //obtain REMLE estimate for PVE using lambda_remle void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se) { size_t n_cvt=UtW->size2, ni_test=UtW->size1; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - - gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); - + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { // gsl_vector_set_zero (ab); // Calcab (W, y, ab); // } - + FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0}; - + double se=sqrt(-1.0/LogRL_dev2 (lambda, ¶m0)); - + pve=trace_G*lambda/(trace_G*lambda+1.0); pve_se=trace_G/((trace_G*lambda+1.0)*(trace_G*lambda+1.0))*se; - + gsl_matrix_free (Uab); - gsl_vector_free (ab); + gsl_vector_free (ab); return; } @@ -1703,9 +1945,9 @@ void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_v { size_t n_cvt=UtW->size2, ni_test=UtW->size1; size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; - - gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); - gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); gsl_vector *Hi_eval=gsl_vector_alloc(eval->size); gsl_vector *v_temp=gsl_vector_alloc(eval->size); @@ -1713,16 +1955,16 @@ void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_v gsl_matrix *WHiW=gsl_matrix_alloc(UtW->size2, UtW->size2); gsl_vector *WHiy=gsl_vector_alloc(UtW->size2); gsl_matrix *Vbeta=gsl_matrix_alloc(UtW->size2, UtW->size2); - + gsl_matrix_set_zero (Uab); - CalcUab (UtW, Uty, Uab); - + CalcUab (UtW, Uty, Uab); + gsl_vector_memcpy (v_temp, eval); gsl_vector_scale (v_temp, lambda); gsl_vector_set_all (Hi_eval, 1.0); gsl_vector_add_constant (v_temp, 1.0); gsl_vector_div (Hi_eval, v_temp); - + //calculate beta gsl_matrix_memcpy (HiW, UtW); for (size_t i=0; i<UtW->size2; i++) { @@ -1731,30 +1973,30 @@ void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_v } gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, HiW, UtW, 0.0, WHiW); gsl_blas_dgemv (CblasTrans, 1.0, HiW, Uty, 0.0, WHiy); - + int sig; gsl_permutation * pmt=gsl_permutation_alloc (UtW->size2); LUDecomp (WHiW, pmt, &sig); LUSolve (WHiW, pmt, WHiy, beta); LUInvert (WHiW, pmt, Vbeta); - + //calculate vg and ve - CalcPab (n_cvt, 0, Hi_eval, Uab, ab, Pab); - - size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); - double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); - + CalcPab (n_cvt, 0, Hi_eval, Uab, ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); + ve=P_yy/(double)(ni_test-n_cvt); vg=ve*lambda; - + //with ve, calculate se(beta) gsl_matrix_scale(Vbeta, ve); - + //obtain se_beta for (size_t i=0; i<Vbeta->size1; i++) { gsl_vector_set (se_beta, i, sqrt(gsl_matrix_get(Vbeta, i, i) ) ); } - + gsl_matrix_free(Uab); gsl_matrix_free(Pab); gsl_vector_free(ab); @@ -1764,8 +2006,309 @@ void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_v gsl_matrix_free(WHiW); gsl_vector_free(WHiy); gsl_matrix_free(Vbeta); - + gsl_permutation_free(pmt); return; } + + + + + + +void LMM::AnalyzeBimbamGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const gsl_vector *env) +{ + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;} + + clock_t time_start=clock(); + + string line; + char *ch_ptr; + + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + double logl_H1=0.0, logl_H0=0.0; + int n_miss, c_phen; + double geno, x_mean; + + //Calculate basic quantities + size_t n_index=(n_cvt+2+2+1)*(n_cvt+2+2)/2; + + gsl_vector *x=gsl_vector_alloc (U->size1); + gsl_vector *x_miss=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix *UtW_expand=gsl_matrix_alloc (U->size1, UtW->size2+2); + gsl_matrix_view UtW_expand_mat=gsl_matrix_submatrix(UtW_expand, 0, 0, U->size1, UtW->size2); + gsl_matrix_memcpy (&UtW_expand_mat.matrix, UtW); + gsl_vector_view UtW_expand_env=gsl_matrix_column(UtW_expand, UtW->size2); + gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &UtW_expand_env.vector); + gsl_vector_view UtW_expand_x=gsl_matrix_column(UtW_expand, UtW->size2+1); + + //gsl_matrix_set_zero (Uab); + // CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + //start reading genotypes and analyze + for (size_t t=0; t<indicator_snp.size(); ++t) { +// if (t>1) {break;} + !safeGetline(infile, line).eof(); + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i<ni_total; ++i) { + ch_ptr=strtok (NULL, " , \t"); + if (indicator_idv[i]==0) {continue;} + + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + geno=atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; + } + c_phen++; + } + + x_mean/=(double)(ni_test-n_miss); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} + geno=gsl_vector_get(x, i); + if (x_mean>1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &UtW_expand_x.vector); + gsl_vector_mul (x, env); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW_expand, Uty, Uab); + + if (a_mode==2 || a_mode==4) { + FUNC_PARAM param0={true, ni_test, n_cvt+2, eval, Uab, ab, 0}; + CalcLambda ('L', param0, l_min, l_max, n_region, lambda_mle, logl_H0); + } + + CalcUab(UtW_expand, Uty, Utx, Uab); +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt+2, eval, Uab, ab, 0}; + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<<endl; + + gsl_vector_free (x); + gsl_vector_free (x_miss); + gsl_vector_free (Utx); + gsl_matrix_free (Uab); + gsl_vector_free (ab); + + gsl_matrix_free (UtW_expand); + + infile.close(); + infile.clear(); + + return; +} + + + + + + + +void LMM::AnalyzePlinkGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const gsl_vector *env) +{ + string file_bed=file_bfile+".bed"; + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;} + + clock_t time_start=clock(); + + char ch[1]; + bitset<8> b; + + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + double logl_H1=0.0, logl_H0=0.0; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + + //Calculate basic quantities + size_t n_index=(n_cvt+2+2+1)*(n_cvt+2+2)/2; + + gsl_vector *x=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix *UtW_expand=gsl_matrix_alloc (U->size1, UtW->size2+2); + gsl_matrix_view UtW_expand_mat=gsl_matrix_submatrix(UtW_expand, 0, 0, U->size1, UtW->size2); + gsl_matrix_memcpy (&UtW_expand_mat.matrix, UtW); + gsl_vector_view UtW_expand_env=gsl_matrix_column(UtW_expand, UtW->size2); + gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &UtW_expand_env.vector); + gsl_vector_view UtW_expand_x=gsl_matrix_column(UtW_expand, UtW->size2+1); + + //gsl_matrix_set_zero (Uab); + //CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + + for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { + if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} + if (indicator_snp[t]==0) {continue;} + + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + + //read genotypes + x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; + for (int i=0; i<n_bit; ++i) { + infile.read(ch,1); + b=ch[0]; + for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;} + if (indicator_idv[ci_total]==0) {ci_total++; continue;} + + if (b[2*j]==0) { + if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; } + else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; } + } + else { + if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } + else {gsl_vector_set(x, ci_test, -9); n_miss++; } + } + + ci_total++; + ci_test++; + } + } + + x_mean/=(double)(ni_test-n_miss); + + for (size_t i=0; i<ni_test; ++i) { + geno=gsl_vector_get(x,i); + if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} + if (x_mean>1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &UtW_expand_x.vector); + gsl_vector_mul (x, env); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW_expand, Uty, Uab); + + if (a_mode==2 || a_mode==4) { + FUNC_PARAM param0={true, ni_test, n_cvt+2, eval, Uab, ab, 0}; + CalcLambda ('L', param0, l_min, l_max, n_region, lambda_mle, logl_H0); + } + + CalcUab(UtW_expand, Uty, Utx, Uab); + +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt+2, eval, Uab, ab, 0}; + + //3 is before 1, for beta + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<<endl; + + gsl_vector_free (x); + gsl_vector_free (Utx); + gsl_matrix_free (Uab); + gsl_vector_free (ab); + + gsl_matrix_free (UtW_expand); + + infile.close(); + infile.clear(); + + return; +} @@ -16,7 +16,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef __LMM_H__ +#ifndef __LMM_H__ #define __LMM_H__ #include "gsl/gsl_vector.h" @@ -57,21 +57,23 @@ public: // IO related parameters int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests size_t d_pace; //display pace - + string file_bfile; string file_geno; string file_out; string path_out; - + string file_gene; - + // WJA added + string file_oxford; + // LMM related parameters double l_min; double l_max; size_t n_region; double l_mle_null; - double logl_mle_H0; - + double logl_mle_H0; + // Summary statistics size_t ni_total, ni_test; //number of individuals size_t ns_total, ns_test; //number of snps @@ -79,25 +81,29 @@ public: size_t n_cvt; double time_UtX; //time spent on optimization iterations double time_opt; //time spent on optimization iterations - + vector<int> indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector<int> indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis - + vector<SNPINFO> snpInfo; //record SNP information - + // Not included in PARAM vector<SUMSTAT> sumStat; //Output SNPSummary Data - + // Main functions void CopyFromParam (PARAM &cPar); void CopyToParam (PARAM &cPar); void AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x); void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); + // WJA added + void Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); + void AnalyzePlinkGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const gsl_vector *env); + void AnalyzeBimbamGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const gsl_vector *env); void WriteFiles (); - + void CalcRLWald (const double &lambda, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_wald); - void CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_score); + void CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_score); }; void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector<pair<size_t, double> > &pos_loglr); diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index 4b910ee..5826a1f 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -1,17 +1,17 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ @@ -26,7 +26,7 @@ #include <cmath> #include <iostream> #include <stdio.h> -#include <stdlib.h> +#include <stdlib.h> #include <bitset> #include <cstring> @@ -60,16 +60,17 @@ using namespace std; //in this file, X, Y are already transformed (i.e. UtX and UtY) -void MVLMM::CopyFromParam (PARAM &cPar) +void MVLMM::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; - + file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; + file_oxford=cPar.file_oxford; file_out=cPar.file_out; path_out=cPar.path_out; - + l_min=cPar.l_min; l_max=cPar.l_max; n_region=cPar.n_region; @@ -79,68 +80,68 @@ void MVLMM::CopyFromParam (PARAM &cPar) em_prec=cPar.em_prec; nr_prec=cPar.nr_prec; crt=cPar.crt; - + Vg_remle_null=cPar.Vg_remle_null; Ve_remle_null=cPar.Ve_remle_null; Vg_mle_null=cPar.Vg_mle_null; Ve_mle_null=cPar.Ve_mle_null; - + time_UtX=0.0; time_opt=0.0; - + ni_total=cPar.ni_total; ns_total=cPar.ns_total; ni_test=cPar.ni_test; ns_test=cPar.ns_test; n_cvt=cPar.n_cvt; - + n_ph=cPar.n_ph; - - indicator_idv=cPar.indicator_idv; + + indicator_idv=cPar.indicator_idv; indicator_snp=cPar.indicator_snp; snpInfo=cPar.snpInfo; - + return; } -void MVLMM::CopyToParam (PARAM &cPar) +void MVLMM::CopyToParam (PARAM &cPar) { cPar.time_UtX=time_UtX; - cPar.time_opt=time_opt; - + cPar.time_opt=time_opt; + cPar.Vg_remle_null=Vg_remle_null; cPar.Ve_remle_null=Ve_remle_null; cPar.Vg_mle_null=Vg_mle_null; cPar.Ve_mle_null=Ve_mle_null; - + cPar.VVg_remle_null=VVg_remle_null; cPar.VVe_remle_null=VVe_remle_null; cPar.VVg_mle_null=VVg_mle_null; cPar.VVe_mle_null=VVe_mle_null; - + cPar.beta_remle_null=beta_remle_null; cPar.se_beta_remle_null=se_beta_remle_null; cPar.beta_mle_null=beta_mle_null; cPar.se_beta_mle_null=se_beta_mle_null; - + cPar.logl_remle_H0=logl_remle_H0; - cPar.logl_mle_H0=logl_mle_H0; + cPar.logl_mle_H0=logl_mle_H0; return; } -void MVLMM::WriteFiles () +void MVLMM::WriteFiles () { string file_str; file_str=path_out+"/"+file_out; file_str+=".assoc.txt"; - + ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} - + outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t"; - + for (size_t i=0; i<n_ph; i++) { outfile<<"beta_"<<i+1<<"\t"; } @@ -149,7 +150,7 @@ void MVLMM::WriteFiles () outfile<<"Vbeta_"<<i+1<<"_"<<j+1<<"\t"; } } - + if (a_mode==1) { outfile<<"p_wald"<<endl; } else if (a_mode==2) { @@ -159,20 +160,20 @@ void MVLMM::WriteFiles () } else if (a_mode==4) { outfile<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl; } else {} - - + + size_t t=0, c=0; for (size_t i=0; i<snpInfo.size(); ++i) { if (indicator_snp[i]==0) {continue;} - + outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t"; - + outfile<<scientific<<setprecision(6); - + for (size_t i=0; i<n_ph; i++) { outfile<<sumStat[t].v_beta[i]<<"\t"; } - + c=0; for (size_t i=0; i<n_ph; i++) { for (size_t j=i; j<n_ph; j++) { @@ -180,7 +181,7 @@ void MVLMM::WriteFiles () c++; } } - + if (a_mode==1) { outfile<<sumStat[t].p_wald <<endl; } else if (a_mode==2) { @@ -190,11 +191,11 @@ void MVLMM::WriteFiles () } else if (a_mode==4) { outfile<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl; } else {} - + t++; } - - + + outfile.close(); outfile.clear(); return; @@ -208,24 +209,24 @@ void MVLMM::WriteFiles () - + double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, gsl_matrix *UltVeh, gsl_matrix *UltVehi) { size_t d_size=V_g->size1; - double d, logdet_Ve=0.0; - + double d, logdet_Ve=0.0; + //eigen decomposition of V_e gsl_matrix *Lambda=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_temp=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_h=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_hi=gsl_matrix_alloc (d_size, d_size); - gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size); - gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size); - + gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size); + gsl_matrix_memcpy(V_e_temp, V_e); EigenDecomp(V_e_temp, U_l, D_l, 0); - + //calculate V_e_h and V_e_hi gsl_matrix_set_zero(V_e_h); gsl_matrix_set_zero(V_e_hi); @@ -233,14 +234,14 @@ double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, d=gsl_vector_get (D_l, i); if (d<=0) {continue;} logdet_Ve+=log(d); - + gsl_vector_view U_col=gsl_matrix_column(U_l, i); d=sqrt(d); gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_h); d=1.0/d; gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_hi); } - + //copy the upper part to lower part for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<i; j++) { @@ -248,19 +249,19 @@ double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, gsl_matrix_set (V_e_hi, i, j, gsl_matrix_get(V_e_hi, j, i)); } } - + //calculate Lambda=V_ehi V_g V_ehi gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, V_g, V_e_hi, 0.0, VgVehi); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, V_e_hi, VgVehi, 0.0, Lambda); - + //eigen decomposition of Lambda EigenDecomp(Lambda, U_l, D_l, 0); - + for (size_t i=0; i<d_size; i++) { d=gsl_vector_get (D_l, i); if (d<0) {gsl_vector_set (D_l, i, 0);} } - + //calculate UltVeh and UltVehi gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U_l, V_e_h, 0.0, UltVeh); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U_l, V_e_hi, 0.0, UltVehi); @@ -279,7 +280,7 @@ double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, } cout<<endl; } - + cout<<"Dl: "<<endl; for (size_t i=0; i<d_size; i++) { cout<<gsl_vector_get (D_l, i)<<endl; @@ -292,7 +293,7 @@ double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, cout<<endl; } */ - + //free memory gsl_matrix_free (Lambda); gsl_matrix_free (V_e_temp); @@ -300,54 +301,54 @@ double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, gsl_matrix_free (V_e_hi); gsl_matrix_free (VgVehi); gsl_matrix_free (U_l); - + return logdet_Ve; } - + //Qi=(\sum_{k=1}^n x_kx_k^T\otimes(delta_k*Dl+I)^{-1} )^{-1} double CalcQi (const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, gsl_matrix *Qi) { size_t n_size=eval->size, d_size=D_l->size, dc_size=Qi->size1; size_t c_size=dc_size/d_size; - + double delta, dl, d1, d2, d, logdet_Q; - + gsl_matrix *Q=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix_set_zero (Q); - - for (size_t i=0; i<c_size; i++) { - for (size_t j=0; j<c_size; j++) { + + for (size_t i=0; i<c_size; i++) { + for (size_t j=0; j<c_size; j++) { for (size_t l=0; l<d_size; l++) { dl=gsl_vector_get(D_l, l); - + if (j<i) { - d=gsl_matrix_get (Q, j*d_size+l, i*d_size+l); + d=gsl_matrix_get (Q, j*d_size+l, i*d_size+l); } else { d=0.0; for (size_t k=0; k<n_size; k++) { d1=gsl_matrix_get(X, i, k); d2=gsl_matrix_get(X, j, k); delta=gsl_vector_get(eval, k); - d+=d1*d2/(dl*delta+1.0); + d+=d1*d2/(dl*delta+1.0); } } - + gsl_matrix_set (Q, i*d_size+l, j*d_size+l, d); } } } - - //calculate LU decomposition of Q, and invert Q and calculate |Q| + + //calculate LU decomposition of Q, and invert Q and calculate |Q| int sig; gsl_permutation * pmt=gsl_permutation_alloc (dc_size); - LUDecomp (Q, pmt, &sig); + LUDecomp (Q, pmt, &sig); LUInvert (Q, pmt, Qi); - + logdet_Q=LULndet (Q); - + gsl_matrix_free (Q); gsl_permutation_free (pmt); - + return logdet_Q; } @@ -355,13 +356,13 @@ double CalcQi (const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix * void CalcXHiY(const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, const gsl_matrix *UltVehiY, gsl_vector *xHiy) { size_t n_size=eval->size, c_size=X->size1, d_size=D_l->size; - + gsl_vector_set_zero (xHiy); - + double x, delta, dl, y, d; - for (size_t i=0; i<d_size; i++) { + for (size_t i=0; i<d_size; i++) { dl=gsl_vector_get(D_l, i); - for (size_t j=0; j<c_size; j++) { + for (size_t j=0; j<c_size; j++) { d=0.0; for (size_t k=0; k<n_size; k++) { x=gsl_matrix_get(X, j, k); @@ -388,20 +389,20 @@ void CalcOmega (const gsl_vector *eval, const gsl_vector *D_l, gsl_matrix *Omega { size_t n_size=eval->size, d_size=D_l->size; double delta, dl, d_u, d_e; - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get(eval, k); for (size_t i=0; i<d_size; i++) { dl=gsl_vector_get(D_l, i); - + d_u=dl/(delta*dl+1.0); d_e=delta*d_u; - + gsl_matrix_set(OmegaU, i, k, d_u); gsl_matrix_set(OmegaE, i, k, d_e); } } - + return; } @@ -410,8 +411,8 @@ void UpdateU (const gsl_matrix *OmegaE, const gsl_matrix *UltVehiY, const gsl_ma { gsl_matrix_memcpy (UltVehiU, UltVehiY); gsl_matrix_sub (UltVehiU, UltVehiBX); - - gsl_matrix_mul_elements (UltVehiU, OmegaE); + + gsl_matrix_mul_elements (UltVehiU, OmegaE); return; } @@ -421,7 +422,7 @@ void UpdateE (const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiBX, const gsl gsl_matrix_memcpy (UltVehiE, UltVehiY); gsl_matrix_sub (UltVehiE, UltVehiBX); gsl_matrix_sub (UltVehiE, UltVehiU); - + return; } @@ -430,38 +431,38 @@ void UpdateE (const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiBX, const gsl void UpdateL_B (const gsl_matrix *X, const gsl_matrix *XXti, const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiU, gsl_matrix *UltVehiBX, gsl_matrix *UltVehiB) { size_t c_size=X->size1, d_size=UltVehiY->size1; - + gsl_matrix *YUX=gsl_matrix_alloc (d_size, c_size); - + gsl_matrix_memcpy (UltVehiBX, UltVehiY); gsl_matrix_sub (UltVehiBX, UltVehiU); - + gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, UltVehiBX, X, 0.0, YUX); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, YUX, XXti, 0.0, UltVehiB); - - gsl_matrix_free(YUX); - + + gsl_matrix_free(YUX); + return; } void UpdateRL_B (const gsl_vector *xHiy, const gsl_matrix *Qi, gsl_matrix *UltVehiB) { size_t d_size=UltVehiB->size1, c_size=UltVehiB->size2, dc_size=Qi->size1; - + gsl_vector *b=gsl_vector_alloc (dc_size); - + //calculate b=Qiv gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, b); - + //copy b to UltVehiB for (size_t i=0; i<c_size; i++) { gsl_vector_view UltVehiB_col=gsl_matrix_column (UltVehiB, i); gsl_vector_const_view b_subcol=gsl_vector_const_subvector (b, i*d_size, d_size); gsl_vector_memcpy (&UltVehiB_col.vector, &b_subcol.vector); - } - + } + gsl_vector_free(b); - + return; } @@ -470,23 +471,23 @@ void UpdateRL_B (const gsl_vector *xHiy, const gsl_matrix *Qi, gsl_matrix *UltVe void UpdateV (const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *E, const gsl_matrix *Sigma_uu, const gsl_matrix *Sigma_ee, gsl_matrix *V_g, gsl_matrix *V_e) { size_t n_size=eval->size, d_size=U->size1; - + gsl_matrix_set_zero (V_g); gsl_matrix_set_zero (V_e); - + double delta; - - //calculate the first part: UD^{-1}U^T and EE^T + + //calculate the first part: UD^{-1}U^T and EE^T for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); if (delta==0) {continue;} - - gsl_vector_const_view U_col=gsl_matrix_const_column (U, k); + + gsl_vector_const_view U_col=gsl_matrix_const_column (U, k); gsl_blas_dsyr (CblasUpper, 1.0/delta, &U_col.vector, V_g); } - - gsl_blas_dsyrk(CblasUpper, CblasNoTrans, 1.0, E, 0.0, V_e); - + + gsl_blas_dsyrk(CblasUpper, CblasNoTrans, 1.0, E, 0.0, V_e); + //copy the upper part to lower part for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<i; j++) { @@ -494,84 +495,84 @@ void UpdateV (const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *E, gsl_matrix_set (V_e, i, j, gsl_matrix_get(V_e, j, i)); } } - + //add Sigma gsl_matrix_add (V_g, Sigma_uu); gsl_matrix_add (V_e, Sigma_ee); - + //scale by 1/n gsl_matrix_scale (V_g, 1.0/(double)n_size); gsl_matrix_scale (V_e, 1.0/(double)n_size); - + return; } void CalcSigma (const char func_name, const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, const gsl_matrix *OmegaU, const gsl_matrix *OmegaE, const gsl_matrix *UltVeh, const gsl_matrix *Qi, gsl_matrix *Sigma_uu, gsl_matrix *Sigma_ee) -{ +{ if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;} size_t n_size=eval->size, c_size=X->size1, d_size=D_l->size, dc_size=Qi->size1; - + gsl_matrix_set_zero(Sigma_uu); gsl_matrix_set_zero(Sigma_ee); - - double delta, dl, x, d; - + + double delta, dl, x, d; + //calculate the first diagonal term gsl_vector_view Suu_diag=gsl_matrix_diagonal (Sigma_uu); gsl_vector_view See_diag=gsl_matrix_diagonal (Sigma_ee); - + for (size_t k=0; k<n_size; k++) { gsl_vector_const_view OmegaU_col=gsl_matrix_const_column (OmegaU, k); gsl_vector_const_view OmegaE_col=gsl_matrix_const_column (OmegaE, k); - + gsl_vector_add (&Suu_diag.vector, &OmegaU_col.vector); gsl_vector_add (&See_diag.vector, &OmegaE_col.vector); - } - + } + //calculate the second term for reml - if (func_name=='R' || func_name=='r') { + if (func_name=='R' || func_name=='r') { gsl_matrix *M_u=gsl_matrix_alloc(dc_size, d_size); gsl_matrix *M_e=gsl_matrix_alloc(dc_size, d_size); - gsl_matrix *QiM=gsl_matrix_alloc(dc_size, d_size); - + gsl_matrix *QiM=gsl_matrix_alloc(dc_size, d_size); + gsl_matrix_set_zero(M_u); gsl_matrix_set_zero(M_e); - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get(eval, k); //if (delta==0) {continue;} - + for (size_t i=0; i<d_size; i++) { dl=gsl_vector_get(D_l, i); - for (size_t j=0; j<c_size; j++) { + for (size_t j=0; j<c_size; j++) { x=gsl_matrix_get(X, j, k); d=x/(delta*dl+1.0); gsl_matrix_set(M_e, j*d_size+i, i, d); - gsl_matrix_set(M_u, j*d_size+i, i, d*dl); + gsl_matrix_set(M_u, j*d_size+i, i, d*dl); } - } + } gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_u, 0.0, QiM); gsl_blas_dgemm(CblasTrans, CblasNoTrans, delta, M_u, QiM, 1.0, Sigma_uu); - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_e, 0.0, QiM); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, M_e, QiM, 1.0, Sigma_ee); - } - + } + gsl_matrix_free(M_u); gsl_matrix_free(M_e); - gsl_matrix_free(QiM); + gsl_matrix_free(QiM); } - + //multiply both sides by VehUl gsl_matrix *M=gsl_matrix_alloc (d_size, d_size); - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Sigma_uu, UltVeh, 0.0, M); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, M, 0.0, Sigma_uu); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Sigma_ee, UltVeh, 0.0, M); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, M, 0.0, Sigma_ee); - + gsl_matrix_free(M); return; } @@ -580,33 +581,33 @@ void CalcSigma (const char func_name, const gsl_vector *eval, const gsl_vector * //'R' for restricted likelihood and 'L' for likelihood //'R' update B and 'L' don't //only calculate -0.5*\sum_{k=1}^n|H_k|-0.5yPxy -double MphCalcLogL (const gsl_vector *eval, const gsl_vector *xHiy, const gsl_vector *D_l, const gsl_matrix *UltVehiY, const gsl_matrix *Qi) +double MphCalcLogL (const gsl_vector *eval, const gsl_vector *xHiy, const gsl_vector *D_l, const gsl_matrix *UltVehiY, const gsl_matrix *Qi) { size_t n_size=eval->size, d_size=D_l->size, dc_size=Qi->size1; double logl=0.0, delta, dl, y, d; - + //calculate yHiy+log|H_k| - for (size_t k=0; k<n_size; k++) { + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get(eval, k); for (size_t i=0; i<d_size; i++) { y=gsl_matrix_get(UltVehiY, i, k); dl=gsl_vector_get(D_l, i); d=delta*dl+1.0; - + logl+=y*y/d+log(d); } } - + //calculate the rest of yPxy gsl_vector *Qiv=gsl_vector_alloc(dc_size); - + gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, Qiv); gsl_blas_ddot(xHiy, Qiv, &d); - + logl-=d; - + gsl_vector_free(Qiv); - + return -0.5*logl; } @@ -619,10 +620,10 @@ double MphCalcLogL (const gsl_vector *eval, const gsl_vector *xHiy, const gsl_ve double MphEM (const char func_name, const size_t max_iter, const double max_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, gsl_matrix *U_hat, gsl_matrix *E_hat, gsl_matrix *OmegaU, gsl_matrix *OmegaE, gsl_matrix *UltVehiY, gsl_matrix *UltVehiBX, gsl_matrix *UltVehiU, gsl_matrix *UltVehiE, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B) { if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return 0.0;} - + size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1; - size_t dc_size=d_size*c_size; - + size_t dc_size=d_size*c_size; + gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size); gsl_matrix *XXti=gsl_matrix_alloc (c_size, c_size); gsl_vector *D_l=gsl_vector_alloc (d_size); @@ -633,11 +634,11 @@ double MphEM (const char func_name, const size_t max_iter, const double max_prec gsl_matrix *Sigma_uu=gsl_matrix_alloc (d_size, d_size); gsl_matrix *Sigma_ee=gsl_matrix_alloc (d_size, d_size); gsl_vector *xHiy=gsl_vector_alloc (dc_size); - gsl_permutation * pmt=gsl_permutation_alloc (c_size); - + gsl_permutation * pmt=gsl_permutation_alloc (c_size); + double logl_const=0.0, logl_old=0.0, logl_new=0.0, logdet_Q, logdet_Ve; int sig; - + //calculate |XXt| and (XXt)^{-1} gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt); for (size_t i=0; i<c_size; ++i) { @@ -645,17 +646,17 @@ double MphEM (const char func_name, const size_t max_iter, const double max_prec gsl_matrix_set (XXt, i, j, gsl_matrix_get (XXt, j, i)); } } - + LUDecomp (XXt, pmt, &sig); LUInvert (XXt, pmt, XXti); - - //calculate the constant for logl - if (func_name=='R' || func_name=='r') { + + //calculate the constant for logl + if (func_name=='R' || func_name=='r') { logl_const=-0.5*(double)(n_size-c_size)*(double)d_size*log(2.0*M_PI)+0.5*(double)d_size*LULndet (XXt); } else { logl_const=-0.5*(double)n_size*(double)d_size*log(2.0*M_PI); - } - + } + //start EM for (size_t t=0; t<max_iter; t++) { logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); @@ -665,17 +666,17 @@ double MphEM (const char func_name, const size_t max_iter, const double max_prec gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); CalcXHiY(eval, D_l, X, UltVehiY, xHiy); - //calculate log likelihood/restricted likelihood value, and terminate if change is small + //calculate log likelihood/restricted likelihood value, and terminate if change is small logl_new=logl_const+MphCalcLogL (eval, xHiy, D_l, UltVehiY, Qi)-0.5*(double)n_size*logdet_Ve; - if (func_name=='R' || func_name=='r') { + if (func_name=='R' || func_name=='r') { logl_new+=-0.5*(logdet_Q-(double)c_size*logdet_Ve); - } + } if (t!=0 && abs(logl_new-logl_old)<max_prec) {break;} logl_old=logl_new; - + /* cout<<"iteration = "<<t<<" log-likelihood = "<<logl_old<<"\t"<<logl_new<<endl; - + cout<<"Vg: "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<d_size; j++) { @@ -691,28 +692,28 @@ double MphEM (const char func_name, const size_t max_iter, const double max_prec cout<<endl; } */ - + CalcOmega (eval, D_l, OmegaU, OmegaE); //Update UltVehiB, UltVehiU - if (func_name=='R' || func_name=='r') { + if (func_name=='R' || func_name=='r') { UpdateRL_B(xHiy, Qi, UltVehiB); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX); } else if (t==0) { gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, B, 0.0, UltVehiB); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX); } - + UpdateU(OmegaE, UltVehiY, UltVehiBX, UltVehiU); - - if (func_name=='L' || func_name=='l') { + + if (func_name=='L' || func_name=='l') { //UltVehiBX is destroyed here UpdateL_B(X, XXti, UltVehiY, UltVehiU, UltVehiBX, UltVehiB); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX); } UpdateE(UltVehiY, UltVehiBX, UltVehiU, UltVehiE); - + //calculate U_hat, E_hat and B gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiU, 0.0, U_hat); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiE, 0.0, E_hat); @@ -720,11 +721,11 @@ double MphEM (const char func_name, const size_t max_iter, const double max_prec //calculate Sigma_uu and Sigma_ee CalcSigma (func_name, eval, D_l, X, OmegaU, OmegaE, UltVeh, Qi, Sigma_uu, Sigma_ee); - + //update V_g and V_e - UpdateV (eval, U_hat, E_hat, Sigma_uu, Sigma_ee, V_g, V_e); + UpdateV (eval, U_hat, E_hat, Sigma_uu, Sigma_ee, V_g, V_e); } - + gsl_matrix_free(XXt); gsl_matrix_free(XXti); gsl_vector_free(D_l); @@ -736,7 +737,7 @@ double MphEM (const char func_name, const size_t max_iter, const double max_prec gsl_matrix_free(Sigma_ee); gsl_vector_free(xHiy); gsl_permutation_free(pmt); - + return logl_new; } @@ -747,63 +748,63 @@ double MphEM (const char func_name, const size_t max_iter, const double max_prec //calculate p-value, beta (d by 1 vector) and V(beta) -double MphCalcP (const gsl_vector *eval, const gsl_vector *x_vec, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_vector *beta, gsl_matrix *Vbeta) +double MphCalcP (const gsl_vector *eval, const gsl_vector *x_vec, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_vector *beta, gsl_matrix *Vbeta) { size_t n_size=eval->size, c_size=W->size1, d_size=V_g->size1; size_t dc_size=d_size*c_size; double delta, dl, d, d1, d2, dy, dx, dw, logdet_Ve, logdet_Q, p_value; - + gsl_vector *D_l=gsl_vector_alloc (d_size); gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); - gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size); + gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size); gsl_matrix *QiWHix=gsl_matrix_alloc(dc_size, d_size); - - gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size); gsl_vector *xPy=gsl_vector_alloc (d_size); //gsl_vector *UltVehiy=gsl_vector_alloc (d_size); gsl_vector *WHiy=gsl_vector_alloc (dc_size); - + gsl_matrix_set_zero (xPx); gsl_matrix_set_zero (WHix); gsl_vector_set_zero (xPy); gsl_vector_set_zero (WHiy); - + //eigen decomposition and calculate log|Ve| - logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); - + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + //calculate Qi and log|Q| - logdet_Q=CalcQi (eval, D_l, W, Qi); - + logdet_Q=CalcQi (eval, D_l, W, Qi); + //calculate UltVehiY gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); - + //calculate WHix, WHiy, xHiy, xHix for (size_t i=0; i<d_size; i++) { dl=gsl_vector_get(D_l, i); - + d1=0.0; d2=0.0; for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get(eval, k); dx=gsl_vector_get(x_vec, k); dy=gsl_matrix_get(UltVehiY, i, k); - + d1+=dx*dy/(delta*dl+1.0); d2+=dx*dx/(delta*dl+1.0); } gsl_vector_set (xPy, i, d1); gsl_matrix_set (xPx, i, i, d2); - - for (size_t j=0; j<c_size; j++) { + + for (size_t j=0; j<c_size; j++) { d1=0.0; d2=0.0; for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get(eval, k); dx=gsl_vector_get(x_vec, k); dw=gsl_matrix_get(W, j, k); dy=gsl_matrix_get(UltVehiY, i, k); - - //if (delta==0) {continue;} + + //if (delta==0) {continue;} d1+=dx*dw/(delta*dl+1.0); d2+=dy*dw/(delta*dl+1.0); } @@ -811,11 +812,11 @@ double MphCalcP (const gsl_vector *eval, const gsl_vector *x_vec, const gsl_matr gsl_vector_set(WHiy, j*d_size+i, d2); } } - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, WHix, 0.0, QiWHix); gsl_blas_dgemm(CblasTrans, CblasNoTrans, -1.0, WHix, QiWHix, 1.0, xPx); gsl_blas_dgemv(CblasTrans, -1.0, QiWHix, WHiy, 1.0, xPy); - + //calculate V(beta) and beta int sig; gsl_permutation * pmt=gsl_permutation_alloc (d_size); @@ -826,40 +827,40 @@ double MphCalcP (const gsl_vector *eval, const gsl_vector *x_vec, const gsl_matr //need to multiply UltVehi on both sides or one side gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, D_l, 0.0, beta); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Vbeta, UltVeh, 0.0, xPx); - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, xPx, 0.0, Vbeta); + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, xPx, 0.0, Vbeta); - //calculate test statistic and p value + //calculate test statistic and p value gsl_blas_ddot(D_l, xPy, &d); - + p_value=gsl_cdf_chisq_Q (d, (double)d_size); //d*=(double)(n_size-c_size-d_size)/((double)d_size*(double)(n_size-c_size-1)); - //p_value=gsl_cdf_fdist_Q (d, (double)d_size, (double)(n_size-c_size-d_size)); - + //p_value=gsl_cdf_fdist_Q (d, (double)d_size, (double)(n_size-c_size-d_size)); + gsl_vector_free(D_l); gsl_matrix_free(UltVeh); gsl_matrix_free(UltVehi); gsl_matrix_free(Qi); - gsl_matrix_free(WHix); + gsl_matrix_free(WHix); gsl_matrix_free(QiWHix); - - gsl_matrix_free(xPx); + + gsl_matrix_free(xPx); gsl_vector_free(xPy); gsl_vector_free(WHiy); - + gsl_permutation_free(pmt); - + return p_value; } //calculate B and its standard error (which is a matrix of the same dimension as B) -void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_matrix *B, gsl_matrix *se_B) +void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_matrix *B, gsl_matrix *se_B) { size_t n_size=eval->size, c_size=W->size1, d_size=V_g->size1; size_t dc_size=d_size*c_size; double delta, dl, d, dy, dw, logdet_Ve, logdet_Q; - + gsl_vector *D_l=gsl_vector_alloc (d_size); gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); @@ -870,67 +871,67 @@ void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix gsl_vector *QiWHiy=gsl_vector_alloc (dc_size); gsl_vector *beta=gsl_vector_alloc (dc_size); gsl_matrix *Vbeta=gsl_matrix_alloc (dc_size, dc_size); - + gsl_vector_set_zero (WHiy); - + //eigen decomposition and calculate log|Ve| - logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); - + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + //calculate Qi and log|Q| - logdet_Q=CalcQi (eval, D_l, W, Qi); - + logdet_Q=CalcQi (eval, D_l, W, Qi); + //calculate UltVehiY gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); - + //calculate WHiy for (size_t i=0; i<d_size; i++) { dl=gsl_vector_get(D_l, i); - - for (size_t j=0; j<c_size; j++) { + + for (size_t j=0; j<c_size; j++) { d=0.0; for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get(eval, k); dw=gsl_matrix_get(W, j, k); dy=gsl_matrix_get(UltVehiY, i, k); - - //if (delta==0) {continue;} + + //if (delta==0) {continue;} d+=dy*dw/(delta*dl+1.0); } gsl_vector_set(WHiy, j*d_size+i, d); } } - + gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, WHiy, 0.0, QiWHiy); - + //need to multiply I_c\otimes UltVehi on both sides or one side for (size_t i=0; i<c_size; i++) { gsl_vector_view QiWHiy_sub=gsl_vector_subvector(QiWHiy, i*d_size, d_size); - gsl_vector_view beta_sub=gsl_vector_subvector(beta, i*d_size, d_size); + gsl_vector_view beta_sub=gsl_vector_subvector(beta, i*d_size, d_size); gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, &QiWHiy_sub.vector, 0.0, &beta_sub.vector); - + for (size_t j=0; j<c_size; j++) { gsl_matrix_view Qi_sub=gsl_matrix_submatrix (Qi, i*d_size, j*d_size, d_size, d_size); gsl_matrix_view Qitemp_sub=gsl_matrix_submatrix (Qi_temp, i*d_size, j*d_size, d_size, d_size); gsl_matrix_view Vbeta_sub=gsl_matrix_submatrix (Vbeta, i*d_size, j*d_size, d_size, d_size); - + if (j<i) { gsl_matrix_view Vbeta_sym=gsl_matrix_submatrix (Vbeta, j*d_size, i*d_size, d_size, d_size); gsl_matrix_transpose_memcpy (&Vbeta_sub.matrix, &Vbeta_sym.matrix); } else { gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &Qi_sub.matrix, UltVeh, 0.0, &Qitemp_sub.matrix); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, &Qitemp_sub.matrix, 0.0, &Vbeta_sub.matrix); - } + } } } - + //copy beta to B, and Vbeta to se_B for (size_t j=0; j<B->size2; j++) { for (size_t i=0; i<B->size1; i++) { gsl_matrix_set(B, i, j, gsl_vector_get(beta, j*d_size+i)); gsl_matrix_set(se_B, i, j, sqrt(gsl_matrix_get(Vbeta, j*d_size+i, j*d_size+i))); } - } - + } + //free matrices gsl_vector_free(D_l); gsl_matrix_free(UltVeh); @@ -941,7 +942,7 @@ void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix gsl_vector_free(QiWHiy); gsl_vector_free(beta); gsl_matrix_free(Vbeta); - + return; } @@ -961,42 +962,42 @@ void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_ gsl_matrix_set_zero (Hi_all); gsl_matrix_set_zero (Qi); logdet_H=0.0; logdet_Q=0.0; - + size_t n_size=eval->size, c_size=X->size1, d_size=V_g->size1; - double logdet_Ve=0.0, delta, dl, d; - + double logdet_Ve=0.0, delta, dl, d; + gsl_matrix *mat_dd=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); gsl_vector *D_l=gsl_vector_alloc (d_size); - + //calculate D_l, UltVeh and UltVehi logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); - + //calculate each Hi and log|H_k| logdet_H=(double)n_size*logdet_Ve; for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); - + gsl_matrix_memcpy (mat_dd, UltVehi); for (size_t i=0; i<d_size; i++) { dl=gsl_vector_get(D_l, i); d=delta*dl+1.0; - + gsl_vector_view mat_row=gsl_matrix_row (mat_dd, i); gsl_vector_scale (&mat_row.vector, 1.0/d); - + logdet_H+=log(d); } - + gsl_matrix_view Hi_k=gsl_matrix_submatrix(Hi_all, 0, k*d_size, d_size, d_size); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVehi, mat_dd, 0.0, &Hi_k.matrix); - } - + } + //calculate Qi, and multiply I\otimes UtVeh on both side //and calculate logdet_Q, don't forget to substract c_size*logdet_Ve logdet_Q=CalcQi (eval, D_l, X, Qi)-(double)c_size*logdet_Ve; - + for (size_t i=0; i<c_size; i++) { for (size_t j=0; j<c_size; j++) { gsl_matrix_view Qi_sub=gsl_matrix_submatrix (Qi, i*d_size, j*d_size, d_size, d_size); @@ -1015,7 +1016,7 @@ void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_ gsl_matrix_free(UltVeh); gsl_matrix_free(UltVehi); gsl_vector_free(D_l); - + return; } @@ -1026,17 +1027,17 @@ void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_ void Calc_Hiy_all (const gsl_matrix *Y, const gsl_matrix *Hi_all, gsl_matrix *Hiy_all) { gsl_matrix_set_zero (Hiy_all); - + size_t n_size=Y->size2, d_size=Y->size1; - + for (size_t k=0; k<n_size; k++) { gsl_matrix_const_view Hi_k=gsl_matrix_const_submatrix(Hi_all, 0, k*d_size, d_size, d_size); gsl_vector_const_view y_k=gsl_matrix_const_column(Y, k); gsl_vector_view Hiy_k=gsl_matrix_column(Hiy_all, k); - + gsl_blas_dgemv (CblasNoTrans, 1.0, &Hi_k.matrix, &y_k.vector, 0.0, &Hiy_k.vector); } - + return; } @@ -1045,11 +1046,11 @@ void Calc_Hiy_all (const gsl_matrix *Y, const gsl_matrix *Hi_all, gsl_matrix *Hi void Calc_xHi_all (const gsl_matrix *X, const gsl_matrix *Hi_all, gsl_matrix *xHi_all) { gsl_matrix_set_zero (xHi_all); - + size_t n_size=X->size2, c_size=X->size1, d_size=Hi_all->size1; double d; - + for (size_t k=0; k<n_size; k++) { gsl_matrix_const_view Hi_k=gsl_matrix_const_submatrix(Hi_all, 0, k*d_size, d_size, d_size); @@ -1060,7 +1061,7 @@ void Calc_xHi_all (const gsl_matrix *X, const gsl_matrix *Hi_all, gsl_matrix *xH gsl_matrix_scale(&xHi_sub.matrix, d); } } - + return; } @@ -1070,15 +1071,15 @@ double Calc_yHiy (const gsl_matrix *Y, const gsl_matrix *Hiy_all) { double yHiy=0.0, d; size_t n_size=Y->size2; - + for (size_t k=0; k<n_size; k++) { gsl_vector_const_view y_k=gsl_matrix_const_column(Y, k); gsl_vector_const_view Hiy_k=gsl_matrix_const_column(Hiy_all, k); - + gsl_blas_ddot (&Hiy_k.vector, &y_k.vector, &d); yHiy+=d; } - + return yHiy; } @@ -1087,16 +1088,16 @@ double Calc_yHiy (const gsl_matrix *Y, const gsl_matrix *Hiy_all) void Calc_xHiy (const gsl_matrix *Y, const gsl_matrix *xHi, gsl_vector *xHiy) { gsl_vector_set_zero (xHiy); - + size_t n_size=Y->size2, d_size=Y->size1, dc_size=xHi->size1; - + for (size_t k=0; k<n_size; k++) { gsl_matrix_const_view xHi_k=gsl_matrix_const_submatrix(xHi, 0, k*d_size, dc_size, d_size); gsl_vector_const_view y_k=gsl_matrix_const_column(Y, k); - + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHi_k.matrix, &y_k.vector, 1.0, xHiy); } - + return; } @@ -1107,10 +1108,10 @@ void Calc_xHiy (const gsl_matrix *Y, const gsl_matrix *xHi, gsl_vector *xHiy) size_t GetIndex (const size_t i, const size_t j, const size_t d_size) { if (i>=d_size || j>=d_size) {cout<<"error in GetIndex."<<endl; return 0;} - + size_t s, l; if (j<i) {s=j; l=i;} else {s=i; l=j;} - + return (2*d_size-s+1)*s/2+l-s; } @@ -1120,16 +1121,16 @@ void Calc_yHiDHiy (const gsl_vector *eval, const gsl_matrix *Hiy, const size_t i { yHiDHiy_g=0.0; yHiDHiy_e=0.0; - + size_t n_size=eval->size; - + double delta, d1, d2; - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); d1=gsl_matrix_get (Hiy, i, k); d2=gsl_matrix_get (Hiy, j, k); - + if (i==j) { yHiDHiy_g+=delta*d1*d2; yHiDHiy_e+=d1*d2; @@ -1137,8 +1138,8 @@ void Calc_yHiDHiy (const gsl_vector *eval, const gsl_matrix *Hiy, const size_t i yHiDHiy_g+=delta*d1*d2*2.0; yHiDHiy_e+=d1*d2*2.0; } - } - + } + return; } @@ -1148,29 +1149,29 @@ void Calc_xHiDHiy (const gsl_vector *eval, const gsl_matrix *xHi, const gsl_matr { gsl_vector_set_zero(xHiDHiy_g); gsl_vector_set_zero(xHiDHiy_e); - + size_t n_size=eval->size, d_size=Hiy->size1; - + double delta, d; - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); - + gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i); d=gsl_matrix_get (Hiy, j, k); - + gsl_blas_daxpy (d*delta, &xHi_col_i.vector, xHiDHiy_g); - gsl_blas_daxpy (d, &xHi_col_i.vector, xHiDHiy_e); - + gsl_blas_daxpy (d, &xHi_col_i.vector, xHiDHiy_e); + if (i!=j) { gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j); d=gsl_matrix_get (Hiy, i, k); - + gsl_blas_daxpy (d*delta, &xHi_col_j.vector, xHiDHiy_g); - gsl_blas_daxpy (d, &xHi_col_j.vector, xHiDHiy_e); + gsl_blas_daxpy (d, &xHi_col_j.vector, xHiDHiy_e); } } - + return; } @@ -1179,42 +1180,42 @@ void Calc_xHiDHix (const gsl_vector *eval, const gsl_matrix *xHi, const size_t i { gsl_matrix_set_zero(xHiDHix_g); gsl_matrix_set_zero(xHiDHix_e); - + size_t n_size=eval->size, dc_size=xHi->size1; size_t d_size=xHi->size2/n_size; - + double delta; - + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *mat_dcdc_t=gsl_matrix_alloc (dc_size, dc_size); - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); - + gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i); gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j); - + gsl_matrix_set_zero (mat_dcdc); gsl_blas_dger (1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc); - + gsl_matrix_transpose_memcpy (mat_dcdc_t, mat_dcdc); - + gsl_matrix_add (xHiDHix_e, mat_dcdc); - + gsl_matrix_scale (mat_dcdc, delta); gsl_matrix_add (xHiDHix_g, mat_dcdc); - + if (i!=j) { - gsl_matrix_add (xHiDHix_e, mat_dcdc_t); - + gsl_matrix_add (xHiDHix_e, mat_dcdc_t); + gsl_matrix_scale (mat_dcdc_t, delta); gsl_matrix_add (xHiDHix_g, mat_dcdc_t); } } - + gsl_matrix_free(mat_dcdc); gsl_matrix_free(mat_dcdc_t); - + return; } @@ -1225,30 +1226,30 @@ void Calc_yHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_ma yHiDHiDHiy_gg=0.0; yHiDHiDHiy_ee=0.0; yHiDHiDHiy_ge=0.0; - + size_t n_size=eval->size, d_size=Hiy->size1; - + double delta, d_Hiy_i1, d_Hiy_j1, d_Hiy_i2, d_Hiy_j2, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); - + d_Hiy_i1=gsl_matrix_get (Hiy, i1, k); d_Hiy_j1=gsl_matrix_get (Hiy, j1, k); d_Hiy_i2=gsl_matrix_get (Hiy, i2, k); d_Hiy_j2=gsl_matrix_get (Hiy, j2, k); - - d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); - d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); - d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); - d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); - - if (i1==j1) { + + d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); + d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); + d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); + d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); + + if (i1==j1) { yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2); yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2); yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2); - - if (i2!=j2) { + + if (i2!=j2) { yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2); yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2); yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2); @@ -1257,7 +1258,7 @@ void Calc_yHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_ma yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2); yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2); yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2); - + if (i2!=j2) { yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2+d_Hiy_j1*d_Hi_i1j2*d_Hiy_i2); yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2+d_Hiy_j1*d_Hi_i1j2*d_Hiy_i2); @@ -1265,7 +1266,7 @@ void Calc_yHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_ma } } } - + return; } @@ -1275,56 +1276,56 @@ void Calc_xHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_ma gsl_vector_set_zero(xHiDHiDHiy_gg); gsl_vector_set_zero(xHiDHiDHiy_ee); gsl_vector_set_zero(xHiDHiDHiy_ge); - + size_t n_size=eval->size, d_size=Hiy->size1; - + double delta, d_Hiy_i, d_Hiy_j, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); - + gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i1); gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j1); - + d_Hiy_i=gsl_matrix_get (Hiy, i2, k); d_Hiy_j=gsl_matrix_get (Hiy, j2, k); - - d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); - d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); - d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); - d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); - + + d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); + d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); + d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); + d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); + if (i1==j1) { gsl_blas_daxpy (delta*delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_gg); gsl_blas_daxpy (d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee); gsl_blas_daxpy (delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ge); - + if (i2!=j2) { gsl_blas_daxpy (delta*delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_gg); gsl_blas_daxpy (d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ee); gsl_blas_daxpy (delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ge); } - } else { + } else { gsl_blas_daxpy (delta*delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_gg); gsl_blas_daxpy (d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee); gsl_blas_daxpy (delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ge); - + gsl_blas_daxpy (delta*delta*d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_gg); gsl_blas_daxpy (d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_ee); gsl_blas_daxpy (delta*d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_ge); - + if (i2!=j2) { gsl_blas_daxpy (delta*delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_gg); gsl_blas_daxpy (d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ee); gsl_blas_daxpy (delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ge); - + gsl_blas_daxpy (delta*delta*d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_gg); gsl_blas_daxpy (d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_ee); gsl_blas_daxpy (delta*d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_ge); } } } - + return; } @@ -1334,106 +1335,106 @@ void Calc_xHiDHiDHix (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_ma gsl_matrix_set_zero(xHiDHiDHix_gg); gsl_matrix_set_zero(xHiDHiDHix_ee); gsl_matrix_set_zero(xHiDHiDHix_ge); - + size_t n_size=eval->size, d_size=Hi->size1, dc_size=xHi->size1; - + double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; - + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); - + gsl_vector_const_view xHi_col_i1=gsl_matrix_const_column (xHi, k*d_size+i1); gsl_vector_const_view xHi_col_j1=gsl_matrix_const_column (xHi, k*d_size+j1); gsl_vector_const_view xHi_col_i2=gsl_matrix_const_column (xHi, k*d_size+i2); - gsl_vector_const_view xHi_col_j2=gsl_matrix_const_column (xHi, k*d_size+j2); - - d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); - d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); - d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); - d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); - + gsl_vector_const_view xHi_col_j2=gsl_matrix_const_column (xHi, k*d_size+j2); + + d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); + d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); + d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); + d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); + if (i1==j1) { gsl_matrix_set_zero (mat_dcdc); gsl_blas_dger (d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector, mat_dcdc); - - gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); + + gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); - gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); + gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc); if (i2!=j2) { gsl_matrix_set_zero (mat_dcdc); gsl_blas_dger (d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector, mat_dcdc); - - gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); + + gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); - gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); + gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc); } } else { gsl_matrix_set_zero (mat_dcdc); gsl_blas_dger (d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector, mat_dcdc); - - gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); + + gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); - gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); + gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc); - + gsl_matrix_set_zero (mat_dcdc); gsl_blas_dger (d_Hi_i1i2, &xHi_col_j1.vector, &xHi_col_j2.vector, mat_dcdc); - - gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); + + gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); - gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); + gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc); - + if (i2!=j2) { gsl_matrix_set_zero (mat_dcdc); gsl_blas_dger (d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector, mat_dcdc); - - gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); + + gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); - gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); + gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc); - + gsl_matrix_set_zero (mat_dcdc); gsl_blas_dger (d_Hi_i1j2, &xHi_col_j1.vector, &xHi_col_i2.vector, mat_dcdc); - - gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); + + gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); - gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); + gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc); gsl_matrix_scale(mat_dcdc, delta); gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc); } } } - + gsl_matrix_free(mat_dcdc); - + return; } -void Calc_traceHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i, const size_t j, double &tHiD_g, double &tHiD_e) +void Calc_traceHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i, const size_t j, double &tHiD_g, double &tHiD_e) { tHiD_g=0.0; tHiD_e=0.0; - + size_t n_size=eval->size, d_size=Hi->size1; double delta, d; - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); d=gsl_matrix_get (Hi, j, k*d_size+i); - + if (i==j) { tHiD_g+=delta*d; tHiD_e+=d; @@ -1442,33 +1443,33 @@ void Calc_traceHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i tHiD_e+=d*2.0; } } - + return; } -void Calc_traceHiDHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tHiDHiD_gg, double &tHiDHiD_ee, double &tHiDHiD_ge) +void Calc_traceHiDHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tHiDHiD_gg, double &tHiDHiD_ee, double &tHiDHiD_ge) { tHiDHiD_gg=0.0; tHiDHiD_ee=0.0; tHiDHiD_ge=0.0; - + size_t n_size=eval->size, d_size=Hi->size1; double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); - - d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); - d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); - d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); + + d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); + d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); + d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); - + if (i1==j1) { tHiDHiD_gg+=delta*delta*d_Hi_i1j2*d_Hi_j1i2; tHiDHiD_ee+=d_Hi_i1j2*d_Hi_j1i2; tHiDHiD_ge+=delta*d_Hi_i1j2*d_Hi_j1i2; - + if (i2!=j2) { tHiDHiD_gg+=delta*delta*d_Hi_i1i2*d_Hi_j1j2; tHiDHiD_ee+=d_Hi_i1i2*d_Hi_j1j2; @@ -1478,7 +1479,7 @@ void Calc_traceHiDHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_ tHiDHiD_gg+=delta*delta*(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2); tHiDHiD_ee+=(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2); tHiDHiD_ge+=delta*(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2); - + if (i2!=j2) { tHiDHiD_gg+=delta*delta*(d_Hi_i1i2*d_Hi_j1j2+d_Hi_j1i2*d_Hi_i1j2); tHiDHiD_ee+=(d_Hi_i1i2*d_Hi_j1j2+d_Hi_j1i2*d_Hi_i1j2); @@ -1486,34 +1487,34 @@ void Calc_traceHiDHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_ } } } - + return; } //trace(PD)=trace((Hi-HixQixHi)D)=trace(HiD)-trace(HixQixHiD) -void Calc_tracePD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHiDHix_all_g, const gsl_matrix *xHiDHix_all_e, const size_t i, const size_t j, double &tPD_g, double &tPD_e) +void Calc_tracePD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHiDHix_all_g, const gsl_matrix *xHiDHix_all_e, const size_t i, const size_t j, double &tPD_g, double &tPD_e) { size_t dc_size=Qi->size1, d_size=Hi->size1; size_t v=GetIndex(i, j, d_size); - + double d; - + //calculate the first part: trace(HiD) Calc_traceHiD (eval, Hi, i, j, tPD_g, tPD_e); - + //calculate the second part: -trace(HixQixHiD) for (size_t k=0; k<dc_size; k++) { gsl_vector_const_view Qi_row=gsl_matrix_const_row (Qi, k); gsl_vector_const_view xHiDHix_g_col=gsl_matrix_const_column (xHiDHix_all_g, v*dc_size+k); gsl_vector_const_view xHiDHix_e_col=gsl_matrix_const_column (xHiDHix_all_e, v*dc_size+k); - + gsl_blas_ddot(&Qi_row.vector, &xHiDHix_g_col.vector, &d); tPD_g-=d; gsl_blas_ddot(&Qi_row.vector, &xHiDHix_e_col.vector, &d); tPD_e-=d; } - + return; } @@ -1521,14 +1522,14 @@ void Calc_tracePD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matri //trace(PDPD)=trace((Hi-HixQixHi)D(Hi-HixQixHi)D) //=trace(HiDHiD)-trace(HixQixHiDHiD)-trace(HiDHixQixHiD)+trace(HixQixHiDHixQixHiD) -void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tPDPD_gg, double &tPDPD_ee, double &tPDPD_ge) +void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tPDPD_gg, double &tPDPD_ee, double &tPDPD_ge) { size_t dc_size=Qi->size1, d_size=Hi->size1; size_t v_size=d_size*(d_size+1)/2; size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size); - + double d; - + //calculate the first part: trace(HiDHiD) Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge); @@ -1549,7 +1550,7 @@ void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_mat gsl_vector_const_view xHiDHiDHix_gg_row=gsl_matrix_const_row (xHiDHiDHix_gg, i); gsl_vector_const_view xHiDHiDHix_ee_row=gsl_matrix_const_row (xHiDHiDHix_ee, i); gsl_vector_const_view xHiDHiDHix_ge_row=gsl_matrix_const_row (xHiDHiDHix_ge, i); - + gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_gg_row.vector, &d); tPDPD_gg-=d; gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ee_row.vector, &d); @@ -1560,7 +1561,7 @@ void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_mat } //calculate the fourth part: trace(HixQixHiDHixQixHiD) - for (size_t i=0; i<dc_size; i++) { + for (size_t i=0; i<dc_size; i++) { //gsl_vector_const_view QixHiDHix_g_row1=gsl_matrix_const_subrow (QixHiDHix_all_g, i, v1*dc_size, dc_size); //gsl_vector_const_view QixHiDHix_e_row1=gsl_matrix_const_subrow (QixHiDHix_all_e, i, v1*dc_size, dc_size); @@ -1578,7 +1579,7 @@ void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_mat tPDPD_ee+=d; gsl_blas_ddot(&QixHiDHix_g_row1.vector, &QixHiDHix_e_col2.vector, &d); tPDPD_ge+=d; - } + } return; } @@ -1590,18 +1591,18 @@ void Calc_xHiDHiy_all (const gsl_vector *eval, const gsl_matrix *xHi, const gsl_ { gsl_matrix_set_zero(xHiDHiy_all_g); gsl_matrix_set_zero(xHiDHiy_all_e); - + size_t d_size=Hiy->size1; size_t v; - + for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<d_size; j++) { if (j<i) {continue;} v=GetIndex(i, j, d_size); - + gsl_vector_view xHiDHiy_g=gsl_matrix_column (xHiDHiy_all_g, v); gsl_vector_view xHiDHiy_e=gsl_matrix_column (xHiDHiy_all_e, v); - + Calc_xHiDHiy (eval, xHi, Hiy, i, j, &xHiDHiy_g.vector, &xHiDHiy_e.vector); } } @@ -1614,18 +1615,18 @@ void Calc_xHiDHix_all (const gsl_vector *eval, const gsl_matrix *xHi, gsl_matrix { gsl_matrix_set_zero(xHiDHix_all_g); gsl_matrix_set_zero(xHiDHix_all_e); - + size_t d_size=xHi->size2/eval->size, dc_size=xHi->size1; size_t v; - + for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<d_size; j++) { if (j<i) {continue;} v=GetIndex(i, j, d_size); - + gsl_matrix_view xHiDHix_g=gsl_matrix_submatrix (xHiDHix_all_g, 0, v*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHix_e=gsl_matrix_submatrix (xHiDHix_all_e, 0, v*dc_size, dc_size, dc_size); - + Calc_xHiDHix (eval, xHi, i, j, &xHiDHix_g.matrix, &xHiDHix_e.matrix); } } @@ -1640,24 +1641,24 @@ void Calc_xHiDHiDHiy_all (const size_t v_size, const gsl_vector *eval, const gsl gsl_matrix_set_zero(xHiDHiDHiy_all_gg); gsl_matrix_set_zero(xHiDHiDHiy_all_ee); gsl_matrix_set_zero(xHiDHiDHiy_all_ge); - + size_t d_size=Hiy->size1; size_t v1, v2; - + for (size_t i1=0; i1<d_size; i1++) { for (size_t j1=0; j1<d_size; j1++) { if (j1<i1) {continue;} v1=GetIndex(i1, j1, d_size); - + for (size_t i2=0; i2<d_size; i2++) { for (size_t j2=0; j2<d_size; j2++) { if (j2<i2) {continue;} v2=GetIndex(i2, j2, d_size); - + gsl_vector_view xHiDHiDHiy_gg=gsl_matrix_column (xHiDHiDHiy_all_gg, v1*v_size+v2); gsl_vector_view xHiDHiDHiy_ee=gsl_matrix_column (xHiDHiDHiy_all_ee, v1*v_size+v2); gsl_vector_view xHiDHiDHiy_ge=gsl_matrix_column (xHiDHiDHiy_all_ge, v1*v_size+v2); - + Calc_xHiDHiDHiy (eval, Hi, xHi, Hiy, i1, j1, i2, j2, &xHiDHiDHiy_gg.vector, &xHiDHiDHiy_ee.vector, &xHiDHiDHiy_ge.vector); } } @@ -1673,33 +1674,33 @@ void Calc_xHiDHiDHix_all (const size_t v_size, const gsl_vector *eval, const gsl gsl_matrix_set_zero(xHiDHiDHix_all_gg); gsl_matrix_set_zero(xHiDHiDHix_all_ee); gsl_matrix_set_zero(xHiDHiDHix_all_ge); - + size_t d_size=xHi->size2/eval->size, dc_size=xHi->size1; - size_t v1, v2; - + size_t v1, v2; + for (size_t i1=0; i1<d_size; i1++) { for (size_t j1=0; j1<d_size; j1++) { if (j1<i1) {continue;} v1=GetIndex(i1, j1, d_size); - + for (size_t i2=0; i2<d_size; i2++) { for (size_t j2=0; j2<d_size; j2++) { if (j2<i2) {continue;} v2=GetIndex(i2, j2, d_size); - + if (v2<v1) {continue;} - + gsl_matrix_view xHiDHiDHix_gg1=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHiDHix_ee1=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHiDHix_ge1=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); - + Calc_xHiDHiDHix (eval, Hi, xHi, i1, j1, i2, j2, &xHiDHiDHix_gg1.matrix, &xHiDHiDHix_ee1.matrix, &xHiDHiDHix_ge1.matrix); - + if (v2!=v1) { gsl_matrix_view xHiDHiDHix_gg2=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHiDHix_ee2=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHiDHix_ge2=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size); - + gsl_matrix_memcpy (&xHiDHiDHix_gg2.matrix, &xHiDHiDHix_gg1.matrix); gsl_matrix_memcpy (&xHiDHiDHix_ee2.matrix, &xHiDHiDHix_ee1.matrix); gsl_matrix_memcpy (&xHiDHiDHix_ge2.matrix, &xHiDHiDHix_ge1.matrix); @@ -1708,39 +1709,39 @@ void Calc_xHiDHiDHix_all (const size_t v_size, const gsl_vector *eval, const gsl } } } - - + + /* size_t n_size=eval->size; double delta, d_Hi_ij; - + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *mat_dcdc_temp=gsl_matrix_alloc (dc_size, dc_size); - + for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get (eval, k); - + for (size_t i1=0; i1<d_size; i1++) { - for (size_t j2=0; j2<d_size; j2++) { + for (size_t j2=0; j2<d_size; j2++) { gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i1); gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j2); - + gsl_matrix_set_zero (mat_dcdc); - gsl_blas_dger (1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc); - + gsl_blas_dger (1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc); + for (size_t j1=0; j1<d_size; j1++) { for (size_t i2=0; i2<d_size; i2++) { - d_Hi_ij=gsl_matrix_get (Hi, j1, k*d_size+i2); - + d_Hi_ij=gsl_matrix_get (Hi, j1, k*d_size+i2); + v1=GetIndex(i1, j1, d_size); - v2=GetIndex(i2, j2, d_size); - + v2=GetIndex(i2, j2, d_size); + gsl_matrix_view xHiDHiDHix_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHiDHix_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHiDHix_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); - + gsl_matrix_memcpy (mat_dcdc_temp, mat_dcdc); - + gsl_matrix_scale (mat_dcdc_temp, d_Hi_ij); gsl_matrix_add(&xHiDHiDHix_ee.matrix, mat_dcdc_temp); gsl_matrix_scale(mat_dcdc_temp, delta); @@ -1752,21 +1753,21 @@ void Calc_xHiDHiDHix_all (const size_t v_size, const gsl_vector *eval, const gsl } } } - + for (size_t i1=0; i1<d_size; i1++) { for (size_t j1=0; j1<d_size; j1++) { v1=GetIndex(i1, j1, d_size); - + for (size_t i2=0; i2<d_size; i2++) { for (size_t j2=0; j2<d_size; j2++) { v2=GetIndex(i2, j2, d_size); - + if (i1!=j1 && i2!=j2) {continue;} - + gsl_matrix_view xHiDHiDHix_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHiDHix_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_view xHiDHiDHix_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); - + if ( (i1==j1 && i2!=j2) || (i1!=j1 && i2==j2) ) { gsl_matrix_scale (&xHiDHiDHix_gg.matrix, 0.5); gsl_matrix_scale (&xHiDHiDHix_ee.matrix, 0.5); @@ -1780,11 +1781,11 @@ void Calc_xHiDHiDHix_all (const size_t v_size, const gsl_vector *eval, const gsl } } } - + gsl_matrix_free (mat_dcdc); - gsl_matrix_free (mat_dcdc_temp); + gsl_matrix_free (mat_dcdc_temp); */ - + return; } @@ -1795,18 +1796,18 @@ void Calc_xHiDHixQixHiy_all (const gsl_matrix *xHiDHix_all_g, const gsl_matrix * { size_t dc_size=xHiDHix_all_g->size1; size_t v_size=xHiDHix_all_g->size2/dc_size; - - for (size_t i=0; i<v_size; i++) { + + for (size_t i=0; i<v_size; i++) { gsl_matrix_const_view xHiDHix_g=gsl_matrix_const_submatrix (xHiDHix_all_g, 0, i*dc_size, dc_size, dc_size); gsl_matrix_const_view xHiDHix_e=gsl_matrix_const_submatrix (xHiDHix_all_e, 0, i*dc_size, dc_size, dc_size); - + gsl_vector_view xHiDHixQixHiy_g=gsl_matrix_column (xHiDHixQixHiy_all_g, i); gsl_vector_view xHiDHixQixHiy_e=gsl_matrix_column (xHiDHixQixHiy_all_e, i); - + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHix_g.matrix, QixHiy, 0.0, &xHiDHixQixHiy_g.vector); gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHix_e.matrix, QixHiy, 0.0, &xHiDHixQixHiy_e.vector); } - + return; } @@ -1816,14 +1817,14 @@ void Calc_QiVec_all (const gsl_matrix *Qi, const gsl_matrix *vec_all_g, const gs for (size_t i=0; i<vec_all_g->size2; i++) { gsl_vector_const_view vec_g=gsl_matrix_const_column (vec_all_g, i); gsl_vector_const_view vec_e=gsl_matrix_const_column (vec_all_e, i); - + gsl_vector_view Qivec_g=gsl_matrix_column (Qivec_all_g, i); gsl_vector_view Qivec_e=gsl_matrix_column (Qivec_all_e, i); - + gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_g.vector, 0.0, &Qivec_g.vector); gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_e.vector, 0.0, &Qivec_e.vector); } - + return; } @@ -1833,18 +1834,18 @@ void Calc_QiMat_all (const gsl_matrix *Qi, const gsl_matrix *mat_all_g, const gs { size_t dc_size=Qi->size1; size_t v_size=mat_all_g->size2/mat_all_g->size1; - + for (size_t i=0; i<v_size; i++) { gsl_matrix_const_view mat_g=gsl_matrix_const_submatrix (mat_all_g, 0, i*dc_size, dc_size, dc_size); gsl_matrix_const_view mat_e=gsl_matrix_const_submatrix (mat_all_e, 0, i*dc_size, dc_size, dc_size); - + gsl_matrix_view Qimat_g=gsl_matrix_submatrix (Qimat_all_g, 0, i*dc_size, dc_size, dc_size); gsl_matrix_view Qimat_e=gsl_matrix_submatrix (Qimat_all_e, 0, i*dc_size, dc_size, dc_size); - + gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, Qi, &mat_g.matrix, 0.0, &Qimat_g.matrix); gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, Qi, &mat_e.matrix, 0.0, &Qimat_e.matrix); } - + return; } @@ -1856,28 +1857,28 @@ void Calc_QiMat_all (const gsl_matrix *Qi, const gsl_matrix *mat_all_g, const gs //-(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy) //+(yHix)Qi(xHiDHix)Qi(xtHiy) void Calc_yPDPy (const gsl_vector *eval, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const size_t i, const size_t j, double &yPDPy_g, double &yPDPy_e) -{ +{ size_t d_size=Hiy->size1; size_t v=GetIndex(i, j, d_size); - - double d; - + + double d; + //first part: ytHiDHiy Calc_yHiDHiy (eval, Hiy, i, j, yPDPy_g, yPDPy_e); - + //second and third parts: -(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy) gsl_vector_const_view xHiDHiy_g=gsl_matrix_const_column (xHiDHiy_all_g, v); gsl_vector_const_view xHiDHiy_e=gsl_matrix_const_column (xHiDHiy_all_e, v); - + gsl_blas_ddot(QixHiy, &xHiDHiy_g.vector, &d); yPDPy_g-=d*2.0; gsl_blas_ddot(QixHiy, &xHiDHiy_e.vector, &d); - yPDPy_e-=d*2.0; - + yPDPy_e-=d*2.0; + //fourth part: +(yHix)Qi(xHiDHix)Qi(xHiy) gsl_vector_const_view xHiDHixQixHiy_g=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v); gsl_vector_const_view xHiDHixQixHiy_e=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v); - + gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_g.vector, &d); yPDPy_g+=d; gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_e.vector, &d); @@ -1894,73 +1895,73 @@ void Calc_yPDPy (const gsl_vector *eval, const gsl_matrix *Hiy, const gsl_vector //+(yHix)Qi(xHiDHiDHix)Qi(xHiy) //-(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy) void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *QixHiDHiy_all_g, const gsl_matrix *QixHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const gsl_matrix *QixHiDHixQixHiy_all_g, const gsl_matrix *QixHiDHixQixHiy_all_e, const gsl_matrix *xHiDHiDHiy_all_gg, const gsl_matrix *xHiDHiDHiy_all_ee, const gsl_matrix *xHiDHiDHiy_all_ge, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yPDPDPy_gg, double &yPDPDPy_ee, double &yPDPDPy_ge) -{ +{ size_t d_size=Hi->size1, dc_size=xHi->size1; size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size); - size_t v_size=d_size*(d_size+1)/2; - + size_t v_size=d_size*(d_size+1)/2; + double d; - + gsl_vector *xHiDHiDHixQixHiy=gsl_vector_alloc (dc_size); - + //first part: yHiDHiDHiy - Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge); - - //second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy) + Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge); + + //second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy) gsl_vector_const_view xHiDHiDHiy_gg1=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v1*v_size+v2); gsl_vector_const_view xHiDHiDHiy_ee1=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v1*v_size+v2); gsl_vector_const_view xHiDHiDHiy_ge1=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v1*v_size+v2); - + gsl_vector_const_view xHiDHiDHiy_gg2=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v2*v_size+v1); gsl_vector_const_view xHiDHiDHiy_ee2=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v2*v_size+v1); gsl_vector_const_view xHiDHiDHiy_ge2=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v2*v_size+v1); - - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d); + + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d); yPDPDPy_gg-=d; - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d); + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d); yPDPDPy_ee-=d; - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d); + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d); yPDPDPy_ge-=d; - - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d); + + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d); yPDPDPy_gg-=d; - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d); + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d); yPDPDPy_ee-=d; - gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d); + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d); yPDPDPy_ge-=d; - + //fourth part: -(yHiDHix)Qi(xHiDHiy) gsl_vector_const_view xHiDHiy_g1=gsl_matrix_const_column (xHiDHiy_all_g, v1); gsl_vector_const_view xHiDHiy_e1=gsl_matrix_const_column (xHiDHiy_all_e, v1); gsl_vector_const_view QixHiDHiy_g2=gsl_matrix_const_column (QixHiDHiy_all_g, v2); gsl_vector_const_view QixHiDHiy_e2=gsl_matrix_const_column (QixHiDHiy_all_e, v2); - + gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_g2.vector, &d); yPDPDPy_gg-=d; gsl_blas_ddot(&xHiDHiy_e1.vector, &QixHiDHiy_e2.vector, &d); yPDPDPy_ee-=d; gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_e2.vector, &d); yPDPDPy_ge-=d; - + //fifth and sixth parts: +(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy) gsl_vector_const_view QixHiDHiy_g1=gsl_matrix_const_column (QixHiDHiy_all_g, v1); gsl_vector_const_view QixHiDHiy_e1=gsl_matrix_const_column (QixHiDHiy_all_e, v1); - + gsl_vector_const_view xHiDHixQixHiy_g1=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v1); gsl_vector_const_view xHiDHixQixHiy_e1=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v1); gsl_vector_const_view xHiDHixQixHiy_g2=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v2); gsl_vector_const_view xHiDHixQixHiy_e2=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v2); - + gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_g2.vector, &d); yPDPDPy_gg+=d; gsl_blas_ddot(&xHiDHixQixHiy_g2.vector, &QixHiDHiy_g1.vector, &d); yPDPDPy_gg+=d; - + gsl_blas_ddot(&xHiDHixQixHiy_e1.vector, &QixHiDHiy_e2.vector, &d); yPDPDPy_ee+=d; gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_e1.vector, &d); yPDPDPy_ee+=d; - + gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_e2.vector, &d); yPDPDPy_ge+=d; gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_g1.vector, &d); @@ -1970,7 +1971,7 @@ void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matri gsl_matrix_const_view xHiDHiDHix_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_const_view xHiDHiDHix_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_const_view xHiDHiDHix_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); - + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_gg.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); yPDPDPy_gg+=d; @@ -1980,21 +1981,21 @@ void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matri gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ge.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); yPDPDPy_ge+=d; - + //eighth part: -(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy) gsl_vector_const_view QixHiDHixQixHiy_g1=gsl_matrix_const_column (QixHiDHixQixHiy_all_g, v1); gsl_vector_const_view QixHiDHixQixHiy_e1=gsl_matrix_const_column (QixHiDHixQixHiy_all_e, v1); - + gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_g2.vector, &d); yPDPDPy_gg-=d; gsl_blas_ddot(&QixHiDHixQixHiy_e1.vector, &xHiDHixQixHiy_e2.vector, &d); yPDPDPy_ee-=d; gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_e2.vector, &d); yPDPDPy_ge-=d; - - //free memory - gsl_vector_free(xHiDHiDHixQixHiy); - + + //free memory + gsl_vector_free(xHiDHiDHixQixHiy); + return; } @@ -2005,62 +2006,62 @@ void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matri void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t d_size, double &crt_a, double &crt_b, double &crt_c) { crt_a=0.0; crt_b=0.0; crt_c=0.0; - + size_t dc_size=Qi->size1, v_size=Hessian_inv->size1/2; size_t c_size=dc_size/d_size; double h_gg, h_ge, h_ee, d, B=0.0, C=0.0, D=0.0; double trCg1, trCe1, trCg2, trCe2, trB_gg, trB_ge, trB_ee, trCC_gg, trCC_ge, trCC_ee, trD_gg=0.0, trD_ge=0.0, trD_ee=0.0; - + gsl_matrix *QiMQi_g1=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQi_e1=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQi_g2=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQi_e2=gsl_matrix_alloc (dc_size, dc_size); - + gsl_matrix *QiMQisQisi_g1=gsl_matrix_alloc (d_size, d_size); gsl_matrix *QiMQisQisi_e1=gsl_matrix_alloc (d_size, d_size); gsl_matrix *QiMQisQisi_g2=gsl_matrix_alloc (d_size, d_size); gsl_matrix *QiMQisQisi_e2=gsl_matrix_alloc (d_size, d_size); - + gsl_matrix *QiMQiMQi_gg=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQiMQi_ge=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMQiMQi_ee=gsl_matrix_alloc (dc_size, dc_size); - + gsl_matrix *QiMMQi_gg=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMMQi_ge=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *QiMMQi_ee=gsl_matrix_alloc (dc_size, dc_size); - - gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size); - + + gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *M_dd=gsl_matrix_alloc (d_size, d_size); gsl_matrix *M_dcdc=gsl_matrix_alloc (dc_size, dc_size); - + //invert Qi_sub to Qi_si gsl_matrix *Qi_sub=gsl_matrix_alloc (d_size, d_size); - + gsl_matrix_const_view Qi_s=gsl_matrix_const_submatrix (Qi, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); - + int sig; gsl_permutation * pmt=gsl_permutation_alloc (d_size); - + gsl_matrix_memcpy (Qi_sub, &Qi_s.matrix); LUDecomp (Qi_sub, pmt, &sig); LUInvert (Qi_sub, pmt, Qi_si); - + gsl_permutation_free(pmt); gsl_matrix_free(Qi_sub); - + //calculate correctation factors for (size_t v1=0; v1<v_size; v1++) { //calculate Qi(xHiDHix)Qi, and subpart of it gsl_matrix_const_view QiM_g1=gsl_matrix_const_submatrix (QixHiDHix_all_g, 0, v1*dc_size, dc_size, dc_size); gsl_matrix_const_view QiM_e1=gsl_matrix_const_submatrix (QixHiDHix_all_e, 0, v1*dc_size, dc_size, dc_size); - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, Qi, 0.0, QiMQi_g1); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, Qi, 0.0, QiMQi_e1); - + gsl_matrix_view QiMQi_g1_s=gsl_matrix_submatrix (QiMQi_g1, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); gsl_matrix_view QiMQi_e1_s=gsl_matrix_submatrix (QiMQi_e1, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); - + /* for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<d_size; j++) { @@ -2075,7 +2076,7 @@ void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_mat for (size_t k=0; k<d_size; k++) { trCg1-=gsl_matrix_get (QiMQisQisi_g1, k, k); } - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_e1_s.matrix, Qi_si, 0.0, QiMQisQisi_e1); trCe1=0.0; for (size_t k=0; k<d_size; k++) { @@ -2083,64 +2084,64 @@ void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_mat } /* cout<<v1<<endl; - cout<<"trCg1 = "<<trCg1<<", trCe1 = "<<trCe1<<endl; + cout<<"trCg1 = "<<trCg1<<", trCe1 = "<<trCe1<<endl; */ for (size_t v2=0; v2<v_size; v2++) { if (v2<v1) {continue;} - + //calculate Qi(xHiDHix)Qi, and subpart of it gsl_matrix_const_view QiM_g2=gsl_matrix_const_submatrix (QixHiDHix_all_g, 0, v2*dc_size, dc_size, dc_size); gsl_matrix_const_view QiM_e2=gsl_matrix_const_submatrix (QixHiDHix_all_e, 0, v2*dc_size, dc_size, dc_size); - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g2.matrix, Qi, 0.0, QiMQi_g2); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e2.matrix, Qi, 0.0, QiMQi_e2); - + gsl_matrix_view QiMQi_g2_s=gsl_matrix_submatrix (QiMQi_g2, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); gsl_matrix_view QiMQi_e2_s=gsl_matrix_submatrix (QiMQi_e2, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); - + //calculate trCg2 and trCe2 gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_g2_s.matrix, Qi_si, 0.0, QiMQisQisi_g2); trCg2=0.0; for (size_t k=0; k<d_size; k++) { trCg2-=gsl_matrix_get (QiMQisQisi_g2, k, k); } - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_e2_s.matrix, Qi_si, 0.0, QiMQisQisi_e2); trCe2=0.0; for (size_t k=0; k<d_size; k++) { trCe2-=gsl_matrix_get (QiMQisQisi_e2, k, k); } - + //calculate trCC_gg, trCC_ge, trCC_ee gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_g1, QiMQisQisi_g2, 0.0, M_dd); trCC_gg=0.0; for (size_t k=0; k<d_size; k++) { trCC_gg+=gsl_matrix_get (M_dd, k, k); } - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_g1, QiMQisQisi_e2, 0.0, M_dd); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_e1, QiMQisQisi_g2, 1.0, M_dd); trCC_ge=0.0; for (size_t k=0; k<d_size; k++) { trCC_ge+=gsl_matrix_get (M_dd, k, k); } - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_e1, QiMQisQisi_e2, 0.0, M_dd); trCC_ee=0.0; for (size_t k=0; k<d_size; k++) { trCC_ee+=gsl_matrix_get (M_dd, k, k); } - - //calculate Qi(xHiDHix)Qi(xHiDHix)Qi, and subpart of it + + //calculate Qi(xHiDHix)Qi(xHiDHix)Qi, and subpart of it gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, QiMQi_g2, 0.0, QiMQiMQi_gg); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, QiMQi_e2, 0.0, QiMQiMQi_ge); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, QiMQi_g2, 1.0, QiMQiMQi_ge); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, QiMQi_e2, 0.0, QiMQiMQi_ee); - + gsl_matrix_view QiMQiMQi_gg_s=gsl_matrix_submatrix (QiMQiMQi_gg, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); gsl_matrix_view QiMQiMQi_ge_s=gsl_matrix_submatrix (QiMQiMQi_ge, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); gsl_matrix_view QiMQiMQi_ee_s=gsl_matrix_submatrix (QiMQiMQi_ee, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); - + //and part of trB_gg, trB_ge, trB_ee gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_gg_s.matrix, Qi_si, 0.0, M_dd); trB_gg=0.0; @@ -2148,37 +2149,37 @@ void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_mat d=gsl_matrix_get (M_dd, k, k); trB_gg-=d; } - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_ge_s.matrix, Qi_si, 0.0, M_dd); trB_ge=0.0; for (size_t k=0; k<d_size; k++) { d=gsl_matrix_get (M_dd, k, k); trB_ge-=d; } - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_ee_s.matrix, Qi_si, 0.0, M_dd); trB_ee=0.0; for (size_t k=0; k<d_size; k++) { d=gsl_matrix_get (M_dd, k, k); trB_ee-=d; } - - //calculate Qi(xHiDHiDHix)Qi, and subpart of it + + //calculate Qi(xHiDHiDHix)Qi, and subpart of it gsl_matrix_const_view MM_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_const_view MM_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); gsl_matrix_const_view MM_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); - + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_gg.matrix, 0.0, M_dcdc); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_gg); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_ge.matrix, 0.0, M_dcdc); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_ge); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_ee.matrix, 0.0, M_dcdc); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_ee); - + gsl_matrix_view QiMMQi_gg_s=gsl_matrix_submatrix (QiMMQi_gg, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); gsl_matrix_view QiMMQi_ge_s=gsl_matrix_submatrix (QiMMQi_ge, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); gsl_matrix_view QiMMQi_ee_s=gsl_matrix_submatrix (QiMMQi_ee, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); - + //calculate the other part of trB_gg, trB_ge, trB_ee gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_gg_s.matrix, Qi_si, 0.0, M_dd); for (size_t k=0; k<d_size; k++) { @@ -2192,28 +2193,28 @@ void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_mat for (size_t k=0; k<d_size; k++) { trB_ee+=gsl_matrix_get (M_dd, k, k); } - - + + //calculate trD_gg, trD_ge, trD_ee trD_gg=2.0*trB_gg; trD_ge=2.0*trB_ge; trD_ee=2.0*trB_ee; - + //calculate B, C and D h_gg=-1.0*gsl_matrix_get (Hessian_inv, v1, v2); h_ge=-1.0*gsl_matrix_get (Hessian_inv, v1, v2+v_size); h_ee=-1.0*gsl_matrix_get (Hessian_inv, v1+v_size, v2+v_size); - + B+=h_gg*trB_gg+h_ge*trB_ge+h_ee*trB_ee; C+=h_gg*(trCC_gg+0.5*trCg1*trCg2)+h_ge*(trCC_ge+0.5*trCg1*trCe2+0.5*trCe1*trCg2)+h_ee*(trCC_ee+0.5*trCe1*trCe2); D+=h_gg*(trCC_gg+0.5*trD_gg)+h_ge*(trCC_ge+0.5*trD_ge)+h_ee*(trCC_ee+0.5*trD_ee); - + if (v1!=v2) { B+=h_gg*trB_gg+h_ge*trB_ge+h_ee*trB_ee; C+=h_gg*(trCC_gg+0.5*trCg1*trCg2)+h_ge*(trCC_ge+0.5*trCg1*trCe2+0.5*trCe1*trCg2)+h_ee*(trCC_ee+0.5*trCe1*trCe2); D+=h_gg*(trCC_gg+0.5*trD_gg)+h_ge*(trCC_ge+0.5*trD_ge)+h_ee*(trCC_ee+0.5*trD_ee); } - + /* cout<<v1<<"\t"<<v2<<endl; cout<<h_gg<<"\t"<<h_ge<<"\t"<<h_ee<<endl; @@ -2224,7 +2225,7 @@ void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_mat */ } } - + //calculate a, b, c from B C D crt_a=2.0*D-C; crt_b=2.0*B; @@ -2238,25 +2239,25 @@ void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_mat gsl_matrix_free(QiMQi_e1); gsl_matrix_free(QiMQi_g2); gsl_matrix_free(QiMQi_e2); - + gsl_matrix_free(QiMQisQisi_g1); gsl_matrix_free(QiMQisQisi_e1); gsl_matrix_free(QiMQisQisi_g2); gsl_matrix_free(QiMQisQisi_e2); - + gsl_matrix_free(QiMQiMQi_gg); gsl_matrix_free(QiMQiMQi_ge); gsl_matrix_free(QiMQiMQi_ee); - + gsl_matrix_free(QiMMQi_gg); gsl_matrix_free(QiMMQi_ge); gsl_matrix_free(QiMMQi_ee); - + gsl_matrix_free(Qi_si); - + gsl_matrix_free(M_dd); gsl_matrix_free(M_dcdc); - + return; } @@ -2266,7 +2267,7 @@ void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_mat //calculate first-order and second-order derivatives void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, gsl_vector *gradient, gsl_matrix *Hessian_inv, double &crt_a, double &crt_b, double &crt_c) -{ +{ if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;} size_t dc_size=Qi->size1, d_size=Hi->size1; @@ -2276,73 +2277,73 @@ void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi double dev1_g, dev1_e, dev2_gg, dev2_ee, dev2_ge; gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); - + gsl_matrix *xHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *xHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *xHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size); - gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); gsl_matrix *xHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *xHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size); - + gsl_matrix *QixHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *QixHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *QixHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size); - gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); gsl_matrix *QixHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size); gsl_matrix *QixHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size); - + gsl_matrix *xHiDHiDHiy_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size); gsl_matrix *xHiDHiDHiy_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size); gsl_matrix *xHiDHiDHiy_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size); gsl_matrix *xHiDHiDHix_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); gsl_matrix *xHiDHiDHix_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); gsl_matrix *xHiDHiDHix_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); - + //calculate xHiDHiy_all, xHiDHix_all and xHiDHixQixHiy_all - Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e); + Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e); Calc_xHiDHix_all (eval, xHi, xHiDHix_all_g, xHiDHix_all_e); Calc_xHiDHixQixHiy_all (xHiDHix_all_g, xHiDHix_all_e, QixHiy, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e); - + Calc_xHiDHiDHiy_all (v_size, eval, Hi, xHi, Hiy, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge); Calc_xHiDHiDHix_all (v_size, eval, Hi, xHi, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge); - + //calculate QixHiDHiy_all, QixHiDHix_all and QixHiDHixQixHiy_all Calc_QiVec_all (Qi, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e); Calc_QiVec_all (Qi, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e); Calc_QiMat_all (Qi, xHiDHix_all_g, xHiDHix_all_e, QixHiDHix_all_g, QixHiDHix_all_e); - + double tHiD_g, tHiD_e, tPD_g, tPD_e, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge, tPDPD_gg, tPDPD_ee, tPDPD_ge; double yPDPy_g, yPDPy_e, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge; - //calculate gradient and Hessian for Vg + //calculate gradient and Hessian for Vg for (size_t i1=0; i1<d_size; i1++) { for (size_t j1=0; j1<d_size; j1++) { if (j1<i1) {continue;} v1=GetIndex (i1, j1, d_size); Calc_yPDPy (eval, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, i1, j1, yPDPy_g, yPDPy_e); - - if (func_name=='R' || func_name=='r') { - Calc_tracePD (eval, Qi, Hi, xHiDHix_all_g, xHiDHix_all_e, i1, j1, tPD_g, tPD_e); + + if (func_name=='R' || func_name=='r') { + Calc_tracePD (eval, Qi, Hi, xHiDHix_all_g, xHiDHix_all_e, i1, j1, tPD_g, tPD_e); //cout<<i1<<" "<<j1<<" "<<yPDPy_g<<" "<<yPDPy_e<<" "<<tPD_g<<" "<<tPD_e<<endl; - + dev1_g=-0.5*tPD_g+0.5*yPDPy_g; dev1_e=-0.5*tPD_e+0.5*yPDPy_e; } else { Calc_traceHiD (eval, Hi, i1, j1, tHiD_g, tHiD_e); - + dev1_g=-0.5*tHiD_g+0.5*yPDPy_g; dev1_e=-0.5*tHiD_e+0.5*yPDPy_e; } gsl_vector_set (gradient, v1, dev1_g); gsl_vector_set (gradient, v1+v_size, dev1_e); - + for (size_t i2=0; i2<d_size; i2++) { for (size_t j2=0; j2<d_size; j2++) { if (j2<i2) {continue;} v2=GetIndex (i2, j2, d_size); - + if (v2<v1) {continue;} Calc_yPDPDPy (eval, Hi, xHi, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge); @@ -2351,21 +2352,21 @@ void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi //AI for reml if (func_name=='R' || func_name=='r') { Calc_tracePDPD (eval, Qi, Hi, xHi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge); - - dev2_gg=0.5*tPDPD_gg-yPDPDPy_gg; - dev2_ee=0.5*tPDPD_ee-yPDPDPy_ee; - dev2_ge=0.5*tPDPD_ge-yPDPDPy_ge; + + dev2_gg=0.5*tPDPD_gg-yPDPDPy_gg; + dev2_ee=0.5*tPDPD_ee-yPDPDPy_ee; + dev2_ge=0.5*tPDPD_ge-yPDPDPy_ge; /* - dev2_gg=-0.5*yPDPDPy_gg; - dev2_ee=-0.5*yPDPDPy_ee; - dev2_ge=-0.5*yPDPDPy_ge; + dev2_gg=-0.5*yPDPDPy_gg; + dev2_ee=-0.5*yPDPDPy_ee; + dev2_ge=-0.5*yPDPDPy_ge; */ } else { Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge); - - dev2_gg=0.5*tHiDHiD_gg-yPDPDPy_gg; - dev2_ee=0.5*tHiDHiD_ee-yPDPDPy_ee; - dev2_ge=0.5*tHiDHiD_ge-yPDPDPy_ge; + + dev2_gg=0.5*tHiDHiD_gg-yPDPDPy_gg; + dev2_ee=0.5*tHiDHiD_ee-yPDPDPy_ee; + dev2_ge=0.5*tHiDHiD_ge-yPDPDPy_ge; } //set up Hessian @@ -2373,7 +2374,7 @@ void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi gsl_matrix_set (Hessian, v1+v_size, v2+v_size, dev2_ee); gsl_matrix_set (Hessian, v1, v2+v_size, dev2_ge); gsl_matrix_set (Hessian, v2+v_size, v1, dev2_ge); - + if (v1!=v2) { gsl_matrix_set (Hessian, v2, v1, dev2_gg); gsl_matrix_set (Hessian, v2+v_size, v1+v_size, dev2_ee); @@ -2384,7 +2385,7 @@ void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi } } } - + /* cout<<"Hessian: "<<endl; for (size_t i=0; i<2*v_size; i++) { @@ -2394,12 +2395,12 @@ void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi cout<<endl; } */ - - + + //Invert Hessian int sig; gsl_permutation * pmt=gsl_permutation_alloc (v_size*2); - + LUDecomp (Hessian, pmt, &sig); LUInvert (Hessian, pmt, Hessian_inv); /* @@ -2411,38 +2412,38 @@ void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi cout<<endl; } */ - gsl_permutation_free(pmt); + gsl_permutation_free(pmt); gsl_matrix_free(Hessian); - + //calculate Edgeworth correction factors //after inverting Hessian if (c_size>1) { CalcCRT (Hessian_inv, Qi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, d_size, crt_a, crt_b, crt_c); } else { - crt_a=0.0; crt_b=0.0; crt_c=0.0; - } - + crt_a=0.0; crt_b=0.0; crt_c=0.0; + } + gsl_matrix_free(xHiDHiy_all_g); gsl_matrix_free(xHiDHiy_all_e); gsl_matrix_free(xHiDHix_all_g); - gsl_matrix_free(xHiDHix_all_e); + gsl_matrix_free(xHiDHix_all_e); gsl_matrix_free(xHiDHixQixHiy_all_g); gsl_matrix_free(xHiDHixQixHiy_all_e); - + gsl_matrix_free(QixHiDHiy_all_g); gsl_matrix_free(QixHiDHiy_all_e); gsl_matrix_free(QixHiDHix_all_g); - gsl_matrix_free(QixHiDHix_all_e); + gsl_matrix_free(QixHiDHix_all_e); gsl_matrix_free(QixHiDHixQixHiy_all_g); gsl_matrix_free(QixHiDHixQixHiy_all_e); - + gsl_matrix_free(xHiDHiDHiy_all_gg); gsl_matrix_free(xHiDHiDHiy_all_ee); gsl_matrix_free(xHiDHiDHiy_all_ge); gsl_matrix_free(xHiDHiDHix_all_gg); gsl_matrix_free(xHiDHiDHix_all_ee); gsl_matrix_free(xHiDHiDHix_all_ge); - + return; } @@ -2452,25 +2453,25 @@ void UpdateVgVe (const gsl_matrix *Hessian_inv, const gsl_vector *gradient, cons { size_t v_size=gradient->size/2, d_size=V_g->size1; size_t v; - + gsl_vector *vec_v=gsl_vector_alloc (v_size*2); - + double d; - + //vectorize Vg and Ve for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<d_size; j++) { if (j<i) {continue;} v=GetIndex(i, j, d_size); - + d=gsl_matrix_get (V_g, i, j); gsl_vector_set (vec_v, v, d); - + d=gsl_matrix_get (V_e, i, j); gsl_vector_set (vec_v, v+v_size, d); } - } - + } + gsl_blas_dgemv (CblasNoTrans, -1.0*step_scale, Hessian_inv, gradient, 1.0, vec_v); //save Vg and Ve @@ -2478,19 +2479,19 @@ void UpdateVgVe (const gsl_matrix *Hessian_inv, const gsl_vector *gradient, cons for (size_t j=0; j<d_size; j++) { if (j<i) {continue;} v=GetIndex(i, j, d_size); - + d=gsl_vector_get (vec_v, v); gsl_matrix_set (V_g, i, j, d); gsl_matrix_set (V_g, j, i, d); - + d=gsl_vector_get (vec_v, v+v_size); gsl_matrix_set (V_e, i, j, d); gsl_matrix_set (V_e, j, i, d); } - } - + } + gsl_vector_free(vec_v); - + return; } @@ -2505,23 +2506,23 @@ double MphNR (const char func_name, const size_t max_iter, const double max_prec size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1; size_t dc_size=d_size*c_size; size_t v_size=d_size*(d_size+1)/2; - + double logdet_H, logdet_Q, yPy, logl_const, logl_old=0.0, logl_new=0.0, step_scale; int sig; size_t step_iter, flag_pd; - + gsl_matrix *Vg_save=gsl_matrix_alloc (d_size, d_size); gsl_matrix *Ve_save=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_temp=gsl_matrix_alloc (d_size, d_size); gsl_matrix *U_temp=gsl_matrix_alloc (d_size, d_size); gsl_vector *D_temp=gsl_vector_alloc (d_size); gsl_vector *xHiy=gsl_vector_alloc (dc_size); - gsl_vector *QixHiy=gsl_vector_alloc (dc_size); + gsl_vector *QixHiy=gsl_vector_alloc (dc_size); gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size); - - gsl_vector *gradient=gsl_vector_alloc (v_size*2); - + + gsl_vector *gradient=gsl_vector_alloc (v_size*2); + //calculate |XXt| and (XXt)^{-1} gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt); for (size_t i=0; i<c_size; ++i) { @@ -2533,17 +2534,17 @@ double MphNR (const char func_name, const size_t max_iter, const double max_prec gsl_permutation * pmt=gsl_permutation_alloc (c_size); LUDecomp (XXt, pmt, &sig); gsl_permutation_free (pmt); -// LUInvert (XXt, pmt, XXti); - - //calculate the constant for logl - if (func_name=='R' || func_name=='r') { +// LUInvert (XXt, pmt, XXti); + + //calculate the constant for logl + if (func_name=='R' || func_name=='r') { logl_const=-0.5*(double)(n_size-c_size)*(double)d_size*log(2.0*M_PI)+0.5*(double)d_size*LULndet (XXt); } else { logl_const=-0.5*(double)n_size*(double)d_size*log(2.0*M_PI); } //optimization iterations - - for (size_t t=0; t<max_iter; t++) { + + for (size_t t=0; t<max_iter; t++) { gsl_matrix_memcpy (Vg_save, V_g); gsl_matrix_memcpy (Ve_save, V_e); @@ -2551,10 +2552,10 @@ double MphNR (const char func_name, const size_t max_iter, const double max_prec do { gsl_matrix_memcpy (V_g, Vg_save); gsl_matrix_memcpy (V_e, Ve_save); - + //update Vg, Ve, and invert Hessian if (t!=0) {UpdateVgVe (Hessian_inv, gradient, step_scale, V_g, V_e);} - + //check if both Vg and Ve are positive definite flag_pd=1; gsl_matrix_memcpy (V_temp, V_e); @@ -2565,33 +2566,33 @@ double MphNR (const char func_name, const size_t max_iter, const double max_prec gsl_matrix_memcpy (V_temp, V_g); EigenDecomp(V_temp, U_temp, D_temp, 0); for (size_t i=0; i<d_size; i++) { - if (gsl_vector_get (D_temp, i)<=0) {flag_pd=0;} + if (gsl_vector_get (D_temp, i)<=0) {flag_pd=0;} } //if flag_pd==1 continue to calculate quantities and logl - if (flag_pd==1) { + if (flag_pd==1) { CalcHiQi (eval, X, V_g, V_e, Hi_all, Qi, logdet_H, logdet_Q); Calc_Hiy_all (Y, Hi_all, Hiy_all); Calc_xHi_all (X, Hi_all, xHi_all); - + //calculate QixHiy and yPy Calc_xHiy (Y, xHi_all, xHiy); gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, xHiy, 0.0, QixHiy); - + gsl_blas_ddot (QixHiy, xHiy, &yPy); yPy=Calc_yHiy (Y, Hiy_all)-yPy; - + //calculate log likelihood/restricted likelihood value - if (func_name=='R' || func_name=='r') { + if (func_name=='R' || func_name=='r') { logl_new=logl_const-0.5*logdet_H-0.5*logdet_Q-0.5*yPy; } else { logl_new=logl_const-0.5*logdet_H-0.5*yPy; - } + } } - step_scale/=2.0; + step_scale/=2.0; step_iter++; - + //cout<<t<<"\t"<<step_iter<<"\t"<<logl_old<<"\t"<<logl_new<<"\t"<<flag_pd<<endl; } while ( (flag_pd==0 || logl_new<logl_old || logl_new-logl_old>10 ) && step_iter<10 && t!=0); @@ -2602,21 +2603,21 @@ double MphNR (const char func_name, const size_t max_iter, const double max_prec gsl_matrix_memcpy (V_e, Ve_save); break; } - + if (logl_new-logl_old<max_prec) { break; } } logl_old=logl_new; - + CalcDev (func_name, eval, Qi, Hi_all, xHi_all, Hiy_all, QixHiy, gradient, Hessian_inv, crt_a, crt_b, crt_c); - - + + //output estimates in each iteration /* cout<<func_name<<" iteration = "<<t<<" log-likelihood = "<<logl_old<<"\t"<<logl_new<<endl; - + cout<<"Vg: "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<d_size; j++) { @@ -2640,24 +2641,24 @@ double MphNR (const char func_name, const size_t max_iter, const double max_prec } */ } - + //mutiply Hessian_inv with -1.0 //now Hessian_inv is the variance matrix gsl_matrix_scale (Hessian_inv, -1.0); - + gsl_matrix_free(Vg_save); gsl_matrix_free(Ve_save); gsl_matrix_free(V_temp); gsl_matrix_free(U_temp); gsl_vector_free(D_temp); gsl_vector_free(xHiy); - gsl_vector_free(QixHiy); - + gsl_vector_free(QixHiy); + gsl_matrix_free(Qi); gsl_matrix_free(XXt); - + gsl_vector_free(gradient); - + return logl_new; } @@ -2671,23 +2672,23 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter gsl_matrix_set_zero (V_g); gsl_matrix_set_zero (V_e); gsl_matrix_set_zero (B); - - size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1; + + size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1; double a, b, c; double lambda, logl, vg, ve; - + //Initial the diagonal elements of Vg and Ve using univariate LMM and REML estimates - gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size); + gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size); gsl_vector *beta_temp=gsl_vector_alloc(c_size); gsl_vector *se_beta_temp=gsl_vector_alloc(c_size); - - gsl_matrix_transpose_memcpy (Xt, X); - + + gsl_matrix_transpose_memcpy (Xt, X); + for (size_t i=0; i<d_size; i++) { gsl_vector_const_view Y_row=gsl_matrix_const_row (Y, i); CalcLambda ('R', eval, Xt, &Y_row.vector, l_min, l_max, n_region, lambda, logl); CalcLmmVgVeBeta (eval, Xt, &Y_row.vector, lambda, vg, ve, beta_temp, se_beta_temp); - + gsl_matrix_set(V_g, i, i, vg); gsl_matrix_set(V_e, i, i, ve); } @@ -2695,7 +2696,7 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter gsl_matrix_free (Xt); gsl_vector_free (beta_temp); gsl_vector_free (se_beta_temp); - + //if number of phenotypes is above four, then obtain the off diagonal elements with two trait models if (d_size>4) { //first obtain good initial values @@ -2707,48 +2708,48 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter gsl_matrix *UltVehiY=gsl_matrix_alloc (2, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (2, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (2, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (2, 2*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (2, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc (2*c_size, 2*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc (6, 6); - + //2 by n matrix of Y gsl_matrix *Y_sub=gsl_matrix_alloc (2, n_size); gsl_matrix *Vg_sub=gsl_matrix_alloc (2, 2); gsl_matrix *Ve_sub=gsl_matrix_alloc (2, 2); gsl_matrix *B_sub=gsl_matrix_alloc (2, c_size); - + for (size_t i=0; i<d_size; i++) { gsl_vector_view Y_sub1=gsl_matrix_row (Y_sub, 0); gsl_vector_const_view Y_1=gsl_matrix_const_row (Y, i); gsl_vector_memcpy (&Y_sub1.vector, &Y_1.vector); - + for (size_t j=i+1; j<d_size; j++) { gsl_vector_view Y_sub2=gsl_matrix_row (Y_sub, 1); gsl_vector_const_view Y_2=gsl_matrix_const_row (Y, j); gsl_vector_memcpy (&Y_sub2.vector, &Y_2.vector); - + gsl_matrix_set_zero (Vg_sub); gsl_matrix_set_zero (Ve_sub); gsl_matrix_set (Vg_sub, 0, 0, gsl_matrix_get (V_g, i, i)); gsl_matrix_set (Ve_sub, 0, 0, gsl_matrix_get (V_e, i, i)); gsl_matrix_set (Vg_sub, 1, 1, gsl_matrix_get (V_g, j, j)); gsl_matrix_set (Ve_sub, 1, 1, gsl_matrix_get (V_e, j, j)); - - logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub); + + logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub); logl=MphNR ('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all, Hiy_all, Vg_sub, Ve_sub, Hessian, a, b, c); - + gsl_matrix_set(V_g, i, j, gsl_matrix_get (Vg_sub, 0, 1)); gsl_matrix_set(V_g, j, i, gsl_matrix_get (Vg_sub, 0, 1)); - + gsl_matrix_set(V_e, i, j, ve=gsl_matrix_get (Ve_sub, 0, 1)); gsl_matrix_set(V_e, j, i, ve=gsl_matrix_get (Ve_sub, 0, 1)); } } - + //free matrices gsl_matrix_free(U_hat); gsl_matrix_free(E_hat); @@ -2757,21 +2758,21 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter gsl_matrix_free(UltVehiY); gsl_matrix_free(UltVehiBX); gsl_matrix_free(UltVehiU); - gsl_matrix_free(UltVehiE); - + gsl_matrix_free(UltVehiE); + gsl_matrix_free(Hi_all); gsl_matrix_free(Hiy_all); gsl_matrix_free(xHi_all); gsl_matrix_free(Hessian); - + gsl_matrix_free(Y_sub); gsl_matrix_free(Vg_sub); gsl_matrix_free(Ve_sub); gsl_matrix_free(B_sub); - + /* //second, maximize a increasingly large matrix - for (size_t i=1; i<d_size; i++) { + for (size_t i=1; i<d_size; i++) { //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (i+1, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (i+1, n_size); @@ -2780,34 +2781,34 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter gsl_matrix *UltVehiY=gsl_matrix_alloc (i+1, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (i+1, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (i+1, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (i+1, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (i+1, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (i+1, (i+1)*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (i+1, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc ((i+1)*c_size, (i+1)*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc ((i+1)*(i+2), (i+1)*(i+2)); - + //(i+1) by n matrix of Y gsl_matrix *Y_sub=gsl_matrix_alloc (i+1, n_size); gsl_matrix *Vg_sub=gsl_matrix_alloc (i+1, i+1); gsl_matrix *Ve_sub=gsl_matrix_alloc (i+1, i+1); gsl_matrix *B_sub=gsl_matrix_alloc (i+1, c_size); - + gsl_matrix_const_view Y_sub_view=gsl_matrix_const_submatrix (Y, 0, 0, i+1, n_size); gsl_matrix_view Vg_sub_view=gsl_matrix_submatrix (V_g, 0, 0, i+1, i+1); gsl_matrix_view Ve_sub_view=gsl_matrix_submatrix (V_e, 0, 0, i+1, i+1); - + gsl_matrix_memcpy (Y_sub, &Y_sub_view.matrix); gsl_matrix_memcpy (Vg_sub, &Vg_sub_view.matrix); gsl_matrix_memcpy (Ve_sub, &Ve_sub_view.matrix); - - logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub); + + logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub); logl=MphNR ('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all, Hiy_all, Vg_sub, Ve_sub, Hessian, crt_a, crt_b, crt_c); - + gsl_matrix_memcpy (&Vg_sub_view.matrix, Vg_sub); gsl_matrix_memcpy (&Ve_sub_view.matrix, Ve_sub); - + //free matrices gsl_matrix_free(U_hat); gsl_matrix_free(E_hat); @@ -2816,13 +2817,13 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter gsl_matrix_free(UltVehiY); gsl_matrix_free(UltVehiBX); gsl_matrix_free(UltVehiU); - gsl_matrix_free(UltVehiE); - + gsl_matrix_free(UltVehiE); + gsl_matrix_free(Hi_all); gsl_matrix_free(Hiy_all); gsl_matrix_free(xHi_all); gsl_matrix_free(Hessian); - + gsl_matrix_free(Y_sub); gsl_matrix_free(Vg_sub); gsl_matrix_free(Ve_sub); @@ -2830,42 +2831,42 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter } */ } - + //calculate B hat using GSL estimate gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); - + gsl_vector *D_l=gsl_vector_alloc (d_size); gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); gsl_matrix *Qi=gsl_matrix_alloc (d_size*c_size, d_size*c_size); gsl_vector *XHiy=gsl_vector_alloc (d_size*c_size); gsl_vector *beta=gsl_vector_alloc (d_size*c_size); - + gsl_vector_set_zero (XHiy); - + double logdet_Ve, logdet_Q, dl, d, delta, dx, dy; - + //eigen decomposition and calculate log|Ve| - logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); - + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + //calculate Qi and log|Q| - logdet_Q=CalcQi (eval, D_l, X, Qi); - + logdet_Q=CalcQi (eval, D_l, X, Qi); + //calculate UltVehiY gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); //calculate XHiy for (size_t i=0; i<d_size; i++) { dl=gsl_vector_get(D_l, i); - - for (size_t j=0; j<c_size; j++) { + + for (size_t j=0; j<c_size; j++) { d=0.0; for (size_t k=0; k<n_size; k++) { delta=gsl_vector_get(eval, k); dx=gsl_matrix_get(X, j, k); dy=gsl_matrix_get(UltVehiY, i, k); - - //if (delta==0) {continue;} + + //if (delta==0) {continue;} d+=dy*dx/(delta*dl+1.0); } gsl_vector_set(XHiy, j*d_size+i, d); @@ -2877,20 +2878,20 @@ void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter //multiply beta by UltVeh and save to B for (size_t i=0; i<c_size; i++) { gsl_vector_view B_col=gsl_matrix_column (B, i); - gsl_vector_view beta_sub=gsl_vector_subvector (beta, i*d_size, d_size); + gsl_vector_view beta_sub=gsl_vector_subvector (beta, i*d_size, d_size); gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, &beta_sub.vector, 0.0, &B_col.vector); } //free memory gsl_matrix_free(UltVehiY); - + gsl_vector_free(D_l); gsl_matrix_free(UltVeh); gsl_matrix_free(UltVehi); gsl_matrix_free(Qi); gsl_vector_free(XHiy); gsl_vector_free(beta); - + return; } @@ -2902,33 +2903,511 @@ double PCRT (const size_t mode, const size_t d_size, const double p_value, const { double p_crt=0.0, chisq_crt=0.0, q=(double)d_size; double chisq=gsl_cdf_chisq_Qinv(p_value, (double)d_size ); - - if (mode==1) { + + if (mode==1) { double a=crt_c/(2.0*q*(q+2.0)); - double b=1.0+(crt_a+crt_b)/(2.0*q); - chisq_crt=(-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a); + double b=1.0+(crt_a+crt_b)/(2.0*q); + chisq_crt=(-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a); } else if (mode==2) { - chisq_crt=chisq/(1.0+crt_a/(2.0*q) ); + chisq_crt=chisq/(1.0+crt_a/(2.0*q) ); } else { /* double a=-1.0*crt_c/(2.0*q*(q+2.0)); - double b=1.0+(crt_a-crt_b)/(2.0*q); + double b=1.0+(crt_a-crt_b)/(2.0*q); chisq_crt=(-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a); */ chisq_crt=chisq; } - - p_crt=gsl_cdf_chisq_Q (chisq_crt, (double)d_size ); - + + p_crt=gsl_cdf_chisq_Q (chisq_crt, (double)d_size ); + //cout<<crt_a<<"\t"<<crt_b<<"\t"<<crt_c<<endl; //cout<<setprecision(10)<<p_value<<"\t"<<p_crt<<endl; - + return p_crt; } +// WJA added +#include <assert.h> +void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) +{ + string file_bgen=file_oxford+".bgen"; + ifstream infile (file_bgen.c_str(), ios::binary); + if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return;} + + + clock_t time_start=clock(); + time_UtX=0; time_opt=0; + + string line; + + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; + double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; + double crt_a, crt_b, crt_c; + int n_miss, c_phen; + double geno, x_mean; + size_t c=0; + // double s=0.0; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + gsl_vector *x_miss=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + + gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); + + gsl_vector_view X_row=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row.vector); + gsl_vector_view B_col=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col.vector); + + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); + + c=0; + Vg_remle_null.clear(); + Ve_remle_null.clear(); + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) ); + Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) ); + VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) ); + VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) ); + c++; + } + } + beta_remle_null.clear(); + se_beta_remle_null.clear(); + for (size_t i=0; i<se_B_null->size1; i++) { + for (size_t j=0; j<se_B_null->size2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + + cout<<"REMLE estimate for Vg in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_g, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Vg): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; + } + cout<<endl; + } + cout<<"REMLE estimate for Ve in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_e, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Ve): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; + } + cout<<endl; + } + cout<<"REMLE likelihood = "<<logl_H0<<endl; + + + logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); + + c=0; + Vg_mle_null.clear(); + Ve_mle_null.clear(); + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) ); + Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) ); + VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) ); + VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) ); + c++; + } + } + beta_mle_null.clear(); + se_beta_mle_null.clear(); + for (size_t i=0; i<se_B_null->size1; i++) { + for (size_t j=0; j<se_B_null->size2; j++) { + beta_mle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j) ); + } + } + logl_mle_H0=logl_H0; + + cout<<"MLE estimate for Vg in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_g, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Vg): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; + } + cout<<endl; + } + cout<<"MLE estimate for Ve in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_e, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Ve): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; + } + cout<<endl; + } + cout<<"MLE likelihood = "<<logl_H0<<endl; + + + vector<double> v_beta, v_Vg, v_Ve, v_Vbeta; + for (size_t i=0; i<d_size; i++) { + v_beta.push_back(0.0); + } + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg.push_back(0.0); + v_Ve.push_back(0.0); + v_Vbeta.push_back(0.0); + } + } + + gsl_matrix_memcpy (V_g_null, V_g); + gsl_matrix_memcpy (V_e_null, V_e); + gsl_matrix_memcpy (B_null, B); + + // read in header + uint32_t bgen_snp_block_offset; + uint32_t bgen_header_length; + uint32_t bgen_nsamples; + uint32_t bgen_nsnps; + uint32_t bgen_flags; + infile.read(reinterpret_cast<char*>(&bgen_snp_block_offset),4); + infile.read(reinterpret_cast<char*>(&bgen_header_length),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsnps),4); + bgen_snp_block_offset-=4; + infile.read(reinterpret_cast<char*>(&bgen_nsamples),4); + bgen_snp_block_offset-=4; + infile.ignore(4+bgen_header_length-20); + bgen_snp_block_offset-=4+bgen_header_length-20; + infile.read(reinterpret_cast<char*>(&bgen_flags),4); + bgen_snp_block_offset-=4; + bool CompressedSNPBlocks=bgen_flags&0x1; +// bool LongIds=bgen_flags&0x4; + + infile.ignore(bgen_snp_block_offset); + + double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + + uint32_t bgen_N; + uint16_t bgen_LS; + uint16_t bgen_LR; + uint16_t bgen_LC; + uint32_t bgen_SNP_pos; + uint32_t bgen_LA; + std::string bgen_A_allele; + uint32_t bgen_LB; + std::string bgen_B_allele; + uint32_t bgen_P; + size_t unzipped_data_size; + string id; + string rs; + string chr; + std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<<std::endl; + + + + //start reading genotypes and analyze + for (size_t t=0; t<indicator_snp.size(); ++t) { + + +// if (t>1) {break;} + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + // read SNP header + id.clear(); + rs.clear(); + chr.clear(); + bgen_A_allele.clear(); + bgen_B_allele.clear(); + + infile.read(reinterpret_cast<char*>(&bgen_N),4); + infile.read(reinterpret_cast<char*>(&bgen_LS),2); + + id.resize(bgen_LS); + infile.read(&id[0], bgen_LS); + + infile.read(reinterpret_cast<char*>(&bgen_LR),2); + rs.resize(bgen_LR); + infile.read(&rs[0], bgen_LR); + + infile.read(reinterpret_cast<char*>(&bgen_LC),2); + chr.resize(bgen_LC); + infile.read(&chr[0], bgen_LC); + + infile.read(reinterpret_cast<char*>(&bgen_SNP_pos),4); + + infile.read(reinterpret_cast<char*>(&bgen_LA),4); + bgen_A_allele.resize(bgen_LA); + infile.read(&bgen_A_allele[0], bgen_LA); + + + infile.read(reinterpret_cast<char*>(&bgen_LB),4); + bgen_B_allele.resize(bgen_LB); + infile.read(&bgen_B_allele[0], bgen_LB); + + + + + uint16_t unzipped_data[3*bgen_N]; + + if (indicator_snp[t]==0) { + if(CompressedSNPBlocks) + infile.read(reinterpret_cast<char*>(&bgen_P),4); + else + bgen_P=6*bgen_N; + infile.ignore(static_cast<size_t>(bgen_P)); + + continue; + } + + + if(CompressedSNPBlocks) + { + + + infile.read(reinterpret_cast<char*>(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); + + int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P)); + assert(result == Z_OK); + + } + else + { + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); + } + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i<bgen_N; ++i) { + if (indicator_idv[i]==0) {continue;} + + + bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; + } + c_phen++; + } + + x_mean/=static_cast<double>(ni_test-n_miss); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} + geno=gsl_vector_get(x, i); + if (x_mean>1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + if (p_score<p_nr && crt==1) { + logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); + } + } + + if (a_mode==2 || a_mode==4) { + logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + + if (p_lrt<p_nr) { + logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + + if (crt==1) { + p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); + } + } + } + + if (a_mode==1 || a_mode==4) { + logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + + if (p_wald<p_nr) { + logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + + if (crt==1) { + p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); + } + } + } + + if (x_mean>1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i<d_size; i++) { + v_beta[i]=gsl_vector_get (beta, i); + } + + c=0; + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg[c]=gsl_matrix_get (V_g, i, j); + v_Ve[c]=gsl_matrix_get (V_e, i, j); + v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); + c++; + } + } -void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) + MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; + sumStat.push_back(SNPs); + } + cout<<endl; + + + infile.close(); + infile.clear(); + + gsl_matrix_free(U_hat); + gsl_matrix_free(E_hat); + gsl_matrix_free(OmegaU); + gsl_matrix_free(OmegaE); + gsl_matrix_free(UltVehiY); + gsl_matrix_free(UltVehiBX); + gsl_matrix_free(UltVehiU); + gsl_matrix_free(UltVehiE); + + gsl_matrix_free(Hi_all); + gsl_matrix_free(Hiy_all); + gsl_matrix_free(xHi_all); + gsl_matrix_free(Hessian); + + gsl_vector_free(x); + gsl_vector_free(x_miss); + + gsl_matrix_free(Y); + gsl_matrix_free(X); + gsl_matrix_free(V_g); + gsl_matrix_free(V_e); + gsl_matrix_free(B); + gsl_vector_free(beta); + gsl_matrix_free(Vbeta); + + gsl_matrix_free(V_g_null); + gsl_matrix_free(V_e_null); + gsl_matrix_free(B_null); + gsl_matrix_free(se_B_null); + + return; +} + +void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) { igzstream infile (file_geno.c_str(), igzstream::in); // ifstream infile (file_geno.c_str(), ifstream::in); @@ -2936,10 +3415,10 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs clock_t time_start=clock(); time_UtX=0; time_opt=0; - + string line; char *ch_ptr; - + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; double crt_a, crt_b, crt_c; @@ -2947,10 +3426,10 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs double geno, x_mean; size_t c=0; // double s=0.0; - size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; - + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -2959,17 +3438,17 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); - + gsl_vector *x=gsl_vector_alloc (n_size); gsl_vector *x_miss=gsl_vector_alloc (n_size); - + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); @@ -2977,31 +3456,31 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); gsl_vector *beta=gsl_vector_alloc (d_size); gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); - + //null estimates for initial values gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); - - gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); - + gsl_matrix_transpose_memcpy (Y, UtY); gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); - + gsl_vector_view X_row=gsl_matrix_row(X, c_size); gsl_vector_set_zero(&X_row.vector); gsl_vector_view B_col=gsl_matrix_column(B, c_size); - gsl_vector_set_zero(&B_col.vector); + gsl_vector_set_zero(&B_col.vector); MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); - logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); - + c=0; Vg_remle_null.clear(); Ve_remle_null.clear(); @@ -3014,7 +3493,7 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs c++; } } - beta_remle_null.clear(); + beta_remle_null.clear(); se_beta_remle_null.clear(); for (size_t i=0; i<se_B_null->size1; i++) { for (size_t j=0; j<se_B_null->size2; j++) { @@ -3023,10 +3502,10 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs } } logl_remle_H0=logl_H0; - + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); cout.precision(4); - + cout<<"REMLE estimate for Vg in the null model: "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { @@ -3034,13 +3513,13 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs } cout<<endl; } - cout<<"se(Vg): "<<endl; + cout<<"se(Vg): "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { c=GetIndex(i, j, d_size); cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; } - cout<<endl; + cout<<endl; } cout<<"REMLE estimate for Ve in the null model: "<<endl; for (size_t i=0; i<d_size; i++) { @@ -3049,21 +3528,21 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs } cout<<endl; } - cout<<"se(Ve): "<<endl; + cout<<"se(Ve): "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { c=GetIndex(i, j, d_size); cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; } - cout<<endl; + cout<<endl; } cout<<"REMLE likelihood = "<<logl_H0<<endl; - - + + logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); - + c=0; Vg_mle_null.clear(); Ve_mle_null.clear(); @@ -3076,7 +3555,7 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs c++; } } - beta_mle_null.clear(); + beta_mle_null.clear(); se_beta_mle_null.clear(); for (size_t i=0; i<se_B_null->size1; i++) { for (size_t j=0; j<se_B_null->size2; j++) { @@ -3085,7 +3564,7 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs } } logl_mle_H0=logl_H0; - + cout<<"MLE estimate for Vg in the null model: "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { @@ -3093,13 +3572,13 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs } cout<<endl; } - cout<<"se(Vg): "<<endl; + cout<<"se(Vg): "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { c=GetIndex(i, j, d_size); cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; } - cout<<endl; + cout<<endl; } cout<<"MLE estimate for Ve in the null model: "<<endl; for (size_t i=0; i<d_size; i++) { @@ -3108,17 +3587,17 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs } cout<<endl; } - cout<<"se(Ve): "<<endl; + cout<<"se(Ve): "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { c=GetIndex(i, j, d_size); cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; } - cout<<endl; + cout<<endl; } cout<<"MLE likelihood = "<<logl_H0<<endl; - + vector<double> v_beta, v_Vg, v_Ve, v_Vbeta; for (size_t i=0; i<d_size; i++) { v_beta.push_back(0.0); @@ -3130,41 +3609,41 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs v_Vbeta.push_back(0.0); } } - + gsl_matrix_memcpy (V_g_null, V_g); gsl_matrix_memcpy (V_e_null, V_e); gsl_matrix_memcpy (B_null, B); - - //start reading genotypes and analyze + + //start reading genotypes and analyze for (size_t t=0; t<indicator_snp.size(); ++t) { //if (t>=1) {break;} !safeGetline(infile, line).eof(); if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} if (indicator_snp[t]==0) {continue;} - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); - ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); x_mean=0.0; c_phen=0; n_miss=0; gsl_vector_set_zero(x_miss); for (size_t i=0; i<ni_total; ++i) { ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[i]==0) {continue;} - + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} else { - geno=atof(ch_ptr); - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); + geno=atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); x_mean+=geno; } c_phen++; } x_mean/=(double)(ni_test-n_miss); - + for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); @@ -3174,68 +3653,68 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs } //calculate statistics - time_start=clock(); + time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //initial values gsl_matrix_memcpy (V_g, V_g_null); gsl_matrix_memcpy (V_e, V_e_null); gsl_matrix_memcpy (B, B_null); - + time_start=clock(); - + //3 is before 1 - if (a_mode==3 || a_mode==4) { + if (a_mode==3 || a_mode==4) { p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); if (p_score<p_nr && crt==1) { logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); } - } + } if (a_mode==2 || a_mode==4) { logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); //calculate beta and Vbeta p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + if (p_lrt<p_nr) { logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); //calculate beta and Vbeta p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + if (crt==1) { p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); } - } - } + } + } if (a_mode==1 || a_mode==4) { logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - + if (p_wald<p_nr) { logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - + if (crt==1) { p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); } - } - } + } + } if (x_mean>1) {gsl_vector_scale(beta, -1.0);} - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; for (size_t i=0; i<d_size; i++) { - v_beta[i]=gsl_vector_get (beta, i); + v_beta[i]=gsl_vector_get (beta, i); } - + c=0; for (size_t i=0; i<d_size; i++) { for (size_t j=i; j<d_size; j++) { @@ -3245,16 +3724,16 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs c++; } } - + MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; sumStat.push_back(SNPs); - } + } cout<<endl; - - + + infile.close(); infile.clear(); - + gsl_matrix_free(U_hat); gsl_matrix_free(E_hat); gsl_matrix_free(OmegaU); @@ -3263,28 +3742,28 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs gsl_matrix_free(UltVehiBX); gsl_matrix_free(UltVehiU); gsl_matrix_free(UltVehiE); - + gsl_matrix_free(Hi_all); gsl_matrix_free(Hiy_all); gsl_matrix_free(xHi_all); gsl_matrix_free(Hessian); - + gsl_vector_free(x); gsl_vector_free(x_miss); - + gsl_matrix_free(Y); - gsl_matrix_free(X); + gsl_matrix_free(X); gsl_matrix_free(V_g); gsl_matrix_free(V_e); gsl_matrix_free(B); gsl_vector_free(beta); gsl_matrix_free(Vbeta); - + gsl_matrix_free(V_g_null); gsl_matrix_free(V_e_null); - gsl_matrix_free(B_null); + gsl_matrix_free(B_null); gsl_matrix_free(se_B_null); - + return; } @@ -3294,18 +3773,18 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs -void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) +void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) { string file_bed=file_bfile+".bed"; ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;} - + clock_t time_start=clock(); time_UtX=0; time_opt=0; - + char ch[1]; bitset<8> b; - + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; double crt_a, crt_b, crt_c; @@ -3313,9 +3792,9 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl double geno, x_mean; size_t c=0; // double s=0.0; - size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; - + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -3324,50 +3803,50 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); - + gsl_vector *x=gsl_vector_alloc (n_size); - + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); gsl_vector *beta=gsl_vector_alloc (d_size); gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); - + //null estimates for initial values gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); - gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); - - gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); - + gsl_matrix_transpose_memcpy (Y, UtY); gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); - + gsl_vector_view X_row=gsl_matrix_row(X, c_size); gsl_vector_set_zero(&X_row.vector); gsl_vector_view B_col=gsl_matrix_column(B, c_size); - gsl_vector_set_zero(&B_col.vector); - - //time_start=clock(); + gsl_vector_set_zero(&B_col.vector); + + //time_start=clock(); MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); - + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); //cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; - + c=0; Vg_remle_null.clear(); Ve_remle_null.clear(); @@ -3380,7 +3859,7 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl c++; } } - beta_remle_null.clear(); + beta_remle_null.clear(); se_beta_remle_null.clear(); for (size_t i=0; i<se_B_null->size1; i++) { for (size_t j=0; j<se_B_null->size2; j++) { @@ -3389,7 +3868,7 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } } logl_remle_H0=logl_H0; - + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); cout.precision(4); cout<<"REMLE estimate for Vg in the null model: "<<endl; @@ -3399,13 +3878,13 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } cout<<endl; } - cout<<"se(Vg): "<<endl; + cout<<"se(Vg): "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { c=GetIndex(i, j, d_size); cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; } - cout<<endl; + cout<<endl; } cout<<"REMLE estimate for Ve in the null model: "<<endl; for (size_t i=0; i<d_size; i++) { @@ -3414,22 +3893,22 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } cout<<endl; } - cout<<"se(Ve): "<<endl; + cout<<"se(Ve): "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { c=GetIndex(i, j, d_size); cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; } - cout<<endl; + cout<<endl; } cout<<"REMLE likelihood = "<<logl_H0<<endl; - - //time_start=clock(); + + //time_start=clock(); logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); //cout<<"time for MLE in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; - + c=0; Vg_mle_null.clear(); Ve_mle_null.clear(); @@ -3442,7 +3921,7 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl c++; } } - beta_mle_null.clear(); + beta_mle_null.clear(); se_beta_mle_null.clear(); for (size_t i=0; i<se_B_null->size1; i++) { for (size_t j=0; j<se_B_null->size2; j++) { @@ -3451,7 +3930,7 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } } logl_mle_H0=logl_H0; - + cout<<"MLE estimate for Vg in the null model: "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { @@ -3459,13 +3938,13 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } cout<<endl; } - cout<<"se(Vg): "<<endl; + cout<<"se(Vg): "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { c=GetIndex(i, j, d_size); cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; } - cout<<endl; + cout<<endl; } cout<<"MLE estimate for Ve in the null model: "<<endl; for (size_t i=0; i<d_size; i++) { @@ -3474,16 +3953,16 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } cout<<endl; } - cout<<"se(Ve): "<<endl; + cout<<"se(Ve): "<<endl; for (size_t i=0; i<d_size; i++) { for (size_t j=0; j<=i; j++) { c=GetIndex(i, j, d_size); cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; } - cout<<endl; + cout<<endl; } cout<<"MLE likelihood = "<<logl_H0<<endl; - + vector<double> v_beta, v_Vg, v_Ve, v_Vbeta; for (size_t i=0; i<d_size; i++) { v_beta.push_back(0.0); @@ -3495,143 +3974,143 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl v_Vbeta.push_back(0.0); } } - + gsl_matrix_memcpy (V_g_null, V_g); gsl_matrix_memcpy (V_e_null, V_e); - gsl_matrix_memcpy (B_null, B); - - - //start reading genotypes and analyze - + gsl_matrix_memcpy (B_null, B); + + + //start reading genotypes and analyze + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } - + //print the first three majic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } - + for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} if (indicator_snp[t]==0) {continue;} - + //if (t>=0) {break;} //if (snpInfo[t].rs_number!="MAG18140902") {continue;} //cout<<t<<endl; - + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - + //read genotypes - x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; + x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;} if (indicator_idv[ci_total]==0) {ci_total++; continue;} - + if (b[2*j]==0) { if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; } else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; } } else { - if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } + if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } else {gsl_vector_set(x, ci_test, -9); n_miss++; } } - + ci_total++; ci_test++; } } - + x_mean/=(double)(ni_test-n_miss); - - for (size_t i=0; i<ni_test; ++i) { + + for (size_t i=0; i<ni_test; ++i) { geno=gsl_vector_get(x,i); if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} if (x_mean>1) { gsl_vector_set(x, i, 2-geno); } - } - + } + /* - if (t==0) { + if (t==0) { ofstream outfile ("./snp1.txt", ofstream::out); if (!outfile) {cout<<"error writing file: "<<endl; return;} for (size_t i=0; i<x->size; i++) { outfile<<gsl_vector_get(x, i)<<endl; } outfile.clear(); - outfile.close(); + outfile.close(); } */ - + //calculate statistics - time_start=clock(); + time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //initial values gsl_matrix_memcpy (V_g, V_g_null); gsl_matrix_memcpy (V_e, V_e_null); gsl_matrix_memcpy (B, B_null); - + time_start=clock(); - + //3 is before 1 if (a_mode==3 || a_mode==4) { p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); - + if (p_score<p_nr && crt==1) { logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); } - } - + } + if (a_mode==2 || a_mode==4) { logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); //calculate beta and Vbeta p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + if (p_lrt<p_nr) { logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); //calculate beta and Vbeta p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); if (crt==1) { p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); - } + } } - } - + } + if (a_mode==1 || a_mode==4) { logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - + if (p_wald<p_nr) { logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - + if (crt==1) { p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); } } } - + //cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl; - + if (x_mean>1) {gsl_vector_scale(beta, -1.0);} - + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //store summary data //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; for (size_t i=0; i<d_size; i++) { - v_beta[i]=gsl_vector_get (beta, i); + v_beta[i]=gsl_vector_get (beta, i); } c=0; @@ -3643,17 +4122,17 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl c++; } } - + MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; sumStat.push_back(SNPs); - } - cout<<endl; - + } + cout<<endl; + //cout<<"time_opt = "<<time_opt<<endl; - + infile.close(); infile.clear(); - + gsl_matrix_free(U_hat); gsl_matrix_free(E_hat); gsl_matrix_free(OmegaU); @@ -3662,27 +4141,27 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl gsl_matrix_free(UltVehiBX); gsl_matrix_free(UltVehiU); gsl_matrix_free(UltVehiE); - + gsl_matrix_free(Hi_all); gsl_matrix_free(Hiy_all); gsl_matrix_free(xHi_all); gsl_matrix_free(Hessian); - + gsl_vector_free(x); - + gsl_matrix_free(Y); - gsl_matrix_free(X); + gsl_matrix_free(X); gsl_matrix_free(V_g); gsl_matrix_free(V_e); gsl_matrix_free(B); gsl_vector_free(beta); gsl_matrix_free(Vbeta); - + gsl_matrix_free(V_g_null); gsl_matrix_free(V_e_null); gsl_matrix_free(B_null); gsl_matrix_free(se_B_null); - + return; } @@ -3693,11 +4172,11 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl //both B and se_B are d by c matrices void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B) { - size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; size_t dc_size=d_size*c_size, v_size=d_size*(d_size+1)/2; double logl, crt_a, crt_b, crt_c; - + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -3706,22 +4185,22 @@ void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); - gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); - + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + //large matrices for NR gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); - + //transpose matrices gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); gsl_matrix *W=gsl_matrix_alloc (c_size, n_size); gsl_matrix_transpose_memcpy (Y, UtY); gsl_matrix_transpose_memcpy (W, UtW); - + //initial, EM, NR, and calculate B - MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, W, Y, l_min, l_max, n_region, V_g, V_e, B); + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, W, Y, l_min, l_max, n_region, V_g, V_e, B); logl=MphEM ('R', em_iter, em_prec, eval, W, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); logl=MphNR ('R', nr_iter, nr_prec, eval, W, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); MphCalcBeta (eval, W, Y, V_g, V_e, UltVehiY, B, se_B); @@ -3735,15 +4214,837 @@ void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl gsl_matrix_free(UltVehiBX); gsl_matrix_free(UltVehiU); gsl_matrix_free(UltVehiE); - + gsl_matrix_free(Hi_all); gsl_matrix_free(Hiy_all); gsl_matrix_free(xHi_all); gsl_matrix_free(Hessian); - + gsl_matrix_free(Y); gsl_matrix_free(W); - + return; } + + + + +void MVLMM::AnalyzeBimbamGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const gsl_vector *env) +{ + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;} + + clock_t time_start=clock(); + time_UtX=0; time_opt=0; + + string line; + char *ch_ptr; + + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; + double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; + double crt_a, crt_b, crt_c; + int n_miss, c_phen; + double geno, x_mean; + size_t c=0; + // double s=0.0; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2+2; + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + gsl_vector *x_miss=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values; including env but not including x + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null1=gsl_matrix_alloc (d_size, c_size-1); + gsl_matrix *se_B_null2=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub1=gsl_matrix_submatrix (X, 0, 0, c_size-1, n_size); + gsl_matrix_view B_sub1=gsl_matrix_submatrix (B, 0, 0, d_size, c_size-1); + gsl_matrix_view xHi_all_sub1=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*(c_size-1), d_size*n_size); + + gsl_matrix_view X_sub2=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub2=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub2=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + + gsl_matrix_view X_sub0=gsl_matrix_submatrix (X, 0, 0, c_size-2, n_size); + gsl_matrix_transpose_memcpy (&X_sub0.matrix, UtW); + gsl_vector_view X_row0=gsl_matrix_row(X, c_size-2); + gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &X_row0.vector); + + gsl_vector_view X_row1=gsl_matrix_row(X, c_size-1); + gsl_vector_set_zero(&X_row1.vector); + gsl_vector_view X_row2=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row2.vector); + + gsl_vector_view B_col1=gsl_matrix_column(B, c_size-1); + gsl_vector_set_zero(&B_col1.vector); + gsl_vector_view B_col2=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col2.vector); + + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub1.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub1.matrix); + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1); + + c=0; + Vg_remle_null.clear(); + Ve_remle_null.clear(); + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) ); + Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) ); + VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) ); + VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) ); + c++; + } + } + beta_remle_null.clear(); + se_beta_remle_null.clear(); + for (size_t i=0; i<se_B_null1->size1; i++) { + for (size_t j=0; j<se_B_null1->size2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null1, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + + cout<<"REMLE estimate for Vg in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_g, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Vg): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; + } + cout<<endl; + } + cout<<"REMLE estimate for Ve in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_e, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Ve): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; + } + cout<<endl; + } + cout<<"REMLE likelihood = "<<logl_H0<<endl; + + + logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix); + logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1); + + c=0; + Vg_mle_null.clear(); + Ve_mle_null.clear(); + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) ); + Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) ); + VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) ); + VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) ); + c++; + } + } + beta_mle_null.clear(); + se_beta_mle_null.clear(); + for (size_t i=0; i<se_B_null1->size1; i++) { + for (size_t j=0; j<se_B_null1->size2; j++) { + beta_mle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_mle_null.push_back(gsl_matrix_get(se_B_null1, i, j) ); + } + } + logl_mle_H0=logl_H0; + + cout<<"MLE estimate for Vg in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_g, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Vg): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; + } + cout<<endl; + } + cout<<"MLE estimate for Ve in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_e, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Ve): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; + } + cout<<endl; + } + cout<<"MLE likelihood = "<<logl_H0<<endl; + + + vector<double> v_beta, v_Vg, v_Ve, v_Vbeta; + for (size_t i=0; i<d_size; i++) { + v_beta.push_back(0.0); + } + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg.push_back(0.0); + v_Ve.push_back(0.0); + v_Vbeta.push_back(0.0); + } + } + + gsl_matrix_memcpy (V_g_null, V_g); + gsl_matrix_memcpy (V_e_null, V_e); + gsl_matrix_memcpy (B_null, B); + + //start reading genotypes and analyze + for (size_t t=0; t<indicator_snp.size(); ++t) { + //if (t>=1) {break;} + !safeGetline(infile, line).eof(); + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i<ni_total; ++i) { + ch_ptr=strtok (NULL, " , \t"); + if (indicator_idv[i]==0) {continue;} + + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;} + else { + geno=atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean+=geno; + } + c_phen++; + } + + x_mean/=(double)(ni_test-n_miss); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} + geno=gsl_vector_get(x, i); + if (x_mean>1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row1.vector); + gsl_vector_mul (x, env); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row2.vector); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + if (a_mode==2 || a_mode==3 || a_mode==4) { + if (a_mode==3 || a_mode==4) { + logl_H0=MphEM ('R', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix); + logl_H0=MphNR ('R', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2); + } + + if (a_mode==2 || a_mode==4) { + logl_H0=MphEM ('L', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix); + logl_H0=MphNR ('L', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2); + } + } + + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + if (p_score<p_nr && crt==1) { + logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); + } + } + + if (a_mode==2 || a_mode==4) { + logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + + if (p_lrt<p_nr) { + logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + + if (crt==1) { + p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); + } + } + } + + if (a_mode==1 || a_mode==4) { + logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + p_wald=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + + if (p_wald<p_nr) { + logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_wald=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + + if (crt==1) { + p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); + } + } + } + + if (x_mean>1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i<d_size; i++) { + v_beta[i]=gsl_vector_get (beta, i); + } + + c=0; + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg[c]=gsl_matrix_get (V_g, i, j); + v_Ve[c]=gsl_matrix_get (V_e, i, j); + v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); + c++; + } + } + + MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; + sumStat.push_back(SNPs); + } + cout<<endl; + + + infile.close(); + infile.clear(); + + gsl_matrix_free(U_hat); + gsl_matrix_free(E_hat); + gsl_matrix_free(OmegaU); + gsl_matrix_free(OmegaE); + gsl_matrix_free(UltVehiY); + gsl_matrix_free(UltVehiBX); + gsl_matrix_free(UltVehiU); + gsl_matrix_free(UltVehiE); + + gsl_matrix_free(Hi_all); + gsl_matrix_free(Hiy_all); + gsl_matrix_free(xHi_all); + gsl_matrix_free(Hessian); + + gsl_vector_free(x); + gsl_vector_free(x_miss); + + gsl_matrix_free(Y); + gsl_matrix_free(X); + gsl_matrix_free(V_g); + gsl_matrix_free(V_e); + gsl_matrix_free(B); + gsl_vector_free(beta); + gsl_matrix_free(Vbeta); + + gsl_matrix_free(V_g_null); + gsl_matrix_free(V_e_null); + gsl_matrix_free(B_null); + gsl_matrix_free(se_B_null1); + gsl_matrix_free(se_B_null2); + + return; +} + + + + + + + +void MVLMM::AnalyzePlinkGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const gsl_vector *env) +{ + string file_bed=file_bfile+".bed"; + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;} + + clock_t time_start=clock(); + time_UtX=0; time_opt=0; + + char ch[1]; + bitset<8> b; + + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; + double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; + double crt_a, crt_b, crt_c; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + size_t c=0; + // double s=0.0; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2+2; + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null1=gsl_matrix_alloc (d_size, c_size-1); + gsl_matrix *se_B_null2=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub1=gsl_matrix_submatrix (X, 0, 0, c_size-1, n_size); + gsl_matrix_view B_sub1=gsl_matrix_submatrix (B, 0, 0, d_size, c_size-1); + gsl_matrix_view xHi_all_sub1=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*(c_size-1), d_size*n_size); + + gsl_matrix_view X_sub2=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub2=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub2=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + + gsl_matrix_view X_sub0=gsl_matrix_submatrix (X, 0, 0, c_size-2, n_size); + gsl_matrix_transpose_memcpy (&X_sub0.matrix, UtW); + gsl_vector_view X_row0=gsl_matrix_row(X, c_size-2); + gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &X_row0.vector); + + gsl_vector_view X_row1=gsl_matrix_row(X, c_size-1); + gsl_vector_set_zero(&X_row1.vector); + gsl_vector_view X_row2=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row2.vector); + + gsl_vector_view B_col1=gsl_matrix_column(B, c_size-1); + gsl_vector_set_zero(&B_col1.vector); + gsl_vector_view B_col2=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col2.vector); + + //time_start=clock(); + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub1.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub1.matrix); + + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1); + //cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + + c=0; + Vg_remle_null.clear(); + Ve_remle_null.clear(); + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) ); + Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) ); + VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) ); + VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) ); + c++; + } + } + beta_remle_null.clear(); + se_beta_remle_null.clear(); + for (size_t i=0; i<se_B_null1->size1; i++) { + for (size_t j=0; j<se_B_null1->size2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null1, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + cout<<"REMLE estimate for Vg in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_g, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Vg): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; + } + cout<<endl; + } + cout<<"REMLE estimate for Ve in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_e, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Ve): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; + } + cout<<endl; + } + cout<<"REMLE likelihood = "<<logl_H0<<endl; + + //time_start=clock(); + logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix); + logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1); + //cout<<"time for MLE in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + + c=0; + Vg_mle_null.clear(); + Ve_mle_null.clear(); + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) ); + Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) ); + VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) ); + VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) ); + c++; + } + } + beta_mle_null.clear(); + se_beta_mle_null.clear(); + for (size_t i=0; i<se_B_null1->size1; i++) { + for (size_t j=0; j<se_B_null1->size2; j++) { + beta_mle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_mle_null.push_back(gsl_matrix_get(se_B_null1, i, j) ); + } + } + logl_mle_H0=logl_H0; + + cout<<"MLE estimate for Vg in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_g, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Vg): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t"; + } + cout<<endl; + } + cout<<"MLE estimate for Ve in the null model: "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + cout<<gsl_matrix_get(V_e, i, j)<<"\t"; + } + cout<<endl; + } + cout<<"se(Ve): "<<endl; + for (size_t i=0; i<d_size; i++) { + for (size_t j=0; j<=i; j++) { + c=GetIndex(i, j, d_size); + cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t"; + } + cout<<endl; + } + cout<<"MLE likelihood = "<<logl_H0<<endl; + + vector<double> v_beta, v_Vg, v_Ve, v_Vbeta; + for (size_t i=0; i<d_size; i++) { + v_beta.push_back(0.0); + } + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg.push_back(0.0); + v_Ve.push_back(0.0); + v_Vbeta.push_back(0.0); + } + } + + gsl_matrix_memcpy (V_g_null, V_g); + gsl_matrix_memcpy (V_e_null, V_e); + gsl_matrix_memcpy (B_null, B); + + + //start reading genotypes and analyze + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { + if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} + if (indicator_snp[t]==0) {continue;} + + //if (t>=0) {break;} + //if (snpInfo[t].rs_number!="MAG18140902") {continue;} + //cout<<t<<endl; + + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + + //read genotypes + x_mean=0.0; n_miss=0; ci_total=0; ci_test=0; + for (int i=0; i<n_bit; ++i) { + infile.read(ch,1); + b=ch[0]; + for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;} + if (indicator_idv[ci_total]==0) {ci_total++; continue;} + + if (b[2*j]==0) { + if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; } + else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; } + } + else { + if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); } + else {gsl_vector_set(x, ci_test, -9); n_miss++; } + } + + ci_total++; + ci_test++; + } + } + + x_mean/=(double)(ni_test-n_miss); + + for (size_t i=0; i<ni_test; ++i) { + geno=gsl_vector_get(x,i); + if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} + if (x_mean>1) { + gsl_vector_set(x, i, 2-geno); + } + } + + /* + if (t==0) { + ofstream outfile ("./snp1.txt", ofstream::out); + if (!outfile) {cout<<"error writing file: "<<endl; return;} + for (size_t i=0; i<x->size; i++) { + outfile<<gsl_vector_get(x, i)<<endl; + } + outfile.clear(); + outfile.close(); + } + */ + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row1.vector); + gsl_vector_mul (x, env); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row2.vector); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + if (a_mode==2 || a_mode==3 || a_mode==4) { + if (a_mode==3 || a_mode==4) { + logl_H0=MphEM ('R', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix); + logl_H0=MphNR ('R', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2); + } + + if (a_mode==2 || a_mode==4) { + logl_H0=MphEM ('L', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix); + logl_H0=MphNR ('L', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2); + } + } + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + + if (p_score<p_nr && crt==1) { + logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); + } + } + + if (a_mode==2 || a_mode==4) { + logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + + if (p_lrt<p_nr) { + logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + if (crt==1) { + p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); + } + } + } + + if (a_mode==1 || a_mode==4) { + logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + p_wald=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + + if (p_wald<p_nr) { + logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_wald=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + + if (crt==1) { + p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); + } + } + } + + //cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl; + + if (x_mean>1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i<d_size; i++) { + v_beta[i]=gsl_vector_get (beta, i); + } + + c=0; + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg[c]=gsl_matrix_get (V_g, i, j); + v_Ve[c]=gsl_matrix_get (V_e, i, j); + v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); + c++; + } + } + + MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; + sumStat.push_back(SNPs); + } + cout<<endl; + + //cout<<"time_opt = "<<time_opt<<endl; + + infile.close(); + infile.clear(); + + gsl_matrix_free(U_hat); + gsl_matrix_free(E_hat); + gsl_matrix_free(OmegaU); + gsl_matrix_free(OmegaE); + gsl_matrix_free(UltVehiY); + gsl_matrix_free(UltVehiBX); + gsl_matrix_free(UltVehiU); + gsl_matrix_free(UltVehiE); + + gsl_matrix_free(Hi_all); + gsl_matrix_free(Hiy_all); + gsl_matrix_free(xHi_all); + gsl_matrix_free(Hessian); + + gsl_vector_free(x); + + gsl_matrix_free(Y); + gsl_matrix_free(X); + gsl_matrix_free(V_g); + gsl_matrix_free(V_e); + gsl_matrix_free(B); + gsl_vector_free(beta); + gsl_matrix_free(Vbeta); + + gsl_matrix_free(V_g_null); + gsl_matrix_free(V_e_null); + gsl_matrix_free(B_null); + gsl_matrix_free(se_B_null1); + gsl_matrix_free(se_B_null2); + + return; +} diff --git a/src/mvlmm.h b/src/mvlmm.h index 129879c..9ff567c 100644 --- a/src/mvlmm.h +++ b/src/mvlmm.h @@ -1,22 +1,22 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef __MVLMM_H__ +#ifndef __MVLMM_H__ #define __MVLMM_H__ #include "gsl/gsl_vector.h" @@ -38,17 +38,18 @@ using namespace std; class MVLMM { - + public: // IO related parameters int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests size_t d_pace; //display pace - + string file_bfile; string file_geno; + string file_oxford; string file_out; string path_out; - + // MVLMM related parameters double l_min; double l_max; @@ -61,7 +62,7 @@ public: size_t em_iter, nr_iter; double em_prec, nr_prec; size_t crt; - + // Summary statistics size_t ni_total, ni_test; //number of individuals size_t ns_total, ns_test; //number of snps @@ -69,22 +70,25 @@ public: size_t n_ph; double time_UtX; //time spent on optimization iterations double time_opt; //time spent on optimization iterations - + vector<int> indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector<int> indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis - + vector<SNPINFO> snpInfo; //record SNP information - + // Not included in PARAM vector<MPHSUMSTAT> sumStat; //Output SNPSummary Data - + // Main functions void CopyFromParam (PARAM &cPar); void CopyToParam (PARAM &cPar); void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); + void Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); + void AnalyzeBimbamGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const gsl_vector *env); + void AnalyzePlinkGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const gsl_vector *env); void WriteFiles (); - + }; void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B); diff --git a/src/param.cpp b/src/param.cpp index 7a89ff8..c4b234a 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -24,6 +24,15 @@ #include <cmath> #include <algorithm> +#include "gsl/gsl_randist.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + +#include "eigenlib.h" +#include "mathfunc.h" #ifdef FORCE_FLOAT #include "param_float.h" @@ -39,12 +48,12 @@ using namespace std; -PARAM::PARAM(void): +PARAM::PARAM(void): mode_silence (false), a_mode (0), k_mode(1), d_pace (100000), file_out("result"), path_out("./output/"), miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5), n_region(10),p_nr(0.001),em_prec(0.0001),nr_prec(0.0001),em_iter(10000),nr_iter(100),crt(0), -pheno_mean(0), +pheno_mean(0), noconstrain (false), h_min(-1), h_max(-1), h_scale(-1), rho_min(0.0), rho_max(1.0), rho_scale(-1), logp_min(0.0), logp_max(0.0), logp_scale(-1), @@ -55,53 +64,64 @@ n_accept(0), n_mh(10), geo_mean(2000.0), randseed(-1), +window_cm(0), window_bp(0), window_ns(0), error(false), - n_cvt(1), n_vc(1), +ni_subsample(0), n_cvt(1), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} //read files //obtain ns_total, ng_total, ns_test, ni_test -void PARAM::ReadFiles (void) +void PARAM::ReadFiles (void) { string file_str; - if (!file_mk.empty()) { + + + if (!file_cat.empty()) { + if (ReadFile_cat (file_cat, mapRS2cat, n_vc)==false) {error=true;} + } + + if (!file_var.empty()) { + if (ReadFile_var (file_var, mapRS2var)==false) {error=true;} + } + + if (!file_mk.empty()) { if (CountFileLines (file_mk, n_vc)==false) {error=true;} } - + if (!file_snps.empty()) { if (ReadFile_snps (file_snps, setSnps)==false) {error=true;} } else { setSnps.clear(); } - + //for prediction if (!file_epm.empty()) { if (ReadFile_est (file_epm, est_column, mapRS2est)==false) {error=true;} - + if (!file_bfile.empty()) { file_str=file_bfile+".bim"; - if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} - + if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} + file_str=file_bfile+".fam"; - if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} + if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} } - - if (!file_geno.empty()) { - if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} - - if (CountFileLines (file_geno, ns_total)==false) {error=true;} + + if (!file_geno.empty()) { + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + + if (CountFileLines (file_geno, ns_total)==false) {error=true;} } - + if (!file_ebv.empty() ) { if (ReadFile_column (file_ebv, indicator_bv, vec_bv, 1)==false) {error=true;} } - + if (!file_log.empty() ) { if (ReadFile_log (file_log, pheno_mean)==false) {error=true;} } - + //convert indicator_pheno to indicator_idv int k=1; for (size_t i=0; i<indicator_pheno.size(); i++) { @@ -111,46 +131,80 @@ void PARAM::ReadFiles (void) } indicator_idv.push_back(k); } - + ns_test=0; - + return; } - + //read covariates before the genotype files if (!file_cvt.empty() ) { if (ReadFile_cvt (file_cvt, indicator_cvt, cvt, n_cvt)==false) {error=true;} if ((indicator_cvt).size()==0) { n_cvt=1; - } + } } else { n_cvt=1; } + if (!file_gxe.empty() ) { + if (ReadFile_column (file_gxe, indicator_gxe, gxe, 1)==false) {error=true;} + } + if (!file_weight.empty() ) { + if (ReadFile_column (file_weight, indicator_weight, weight, 1)==false) {error=true;} + } + + + // WJA added + //read genotype and phenotype file for bgen format + if (!file_oxford.empty()) { + file_str=file_oxford+".sample"; + if (ReadFile_sample(file_str, indicator_pheno, pheno, p_column,indicator_cvt, cvt, n_cvt)==false) {error=true;} + if ((indicator_cvt).size()==0) { + n_cvt=1; + } + // n_cvt=1; + + //post-process covariates and phenotypes, obtain ni_test, save all useful covariates + ProcessCvtPhen(); + + + //obtain covariate matrix + gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt); + CopyCvt (W); + + file_str=file_oxford+".bgen"; + if (ReadFile_bgen (file_str, setSnps, W, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test)==false) {error=true;} + gsl_matrix_free(W); + + ns_total=indicator_snp.size(); + } + + //read genotype and phenotype file for plink format if (!file_bfile.empty()) { file_str=file_bfile+".bim"; - if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} - + if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} + file_str=file_bfile+".fam"; if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} - + //post-process covariates and phenotypes, obtain ni_test, save all useful covariates ProcessCvtPhen(); - + //obtain covariate matrix gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt); CopyCvt (W); - + file_str=file_bfile+".bed"; if (ReadFile_bed (file_str, setSnps, W, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test)==false) {error=true;} - + gsl_matrix_free(W); - + ns_total=indicator_snp.size(); } - + //read genotype and phenotype file for bimbam format if (!file_geno.empty()) { //annotation file before genotype file @@ -163,7 +217,7 @@ void PARAM::ReadFiles (void) //post-process covariates and phenotypes, obtain ni_test, save all useful covariates ProcessCvtPhen(); - + //obtain covariate matrix gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt); CopyCvt (W); @@ -171,13 +225,13 @@ void PARAM::ReadFiles (void) if (ReadFile_geno (file_geno, setSnps, W, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test)==false) {error=true;} gsl_matrix_free(W); - + ns_total=indicator_snp.size(); } - + if (!file_gene.empty()) { if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} - + //convert indicator_pheno to indicator_idv int k=1; for (size_t i=0; i<indicator_pheno.size(); i++) { @@ -187,32 +241,39 @@ void PARAM::ReadFiles (void) } indicator_idv.push_back(k); } - - if (ReadFile_gene (file_gene, vec_read, snpInfo, ng_total)==false) {error=true;} + + //post-process covariates and phenotypes, obtain ni_test, save all useful covariates + ProcessCvtPhen(); + + //obtain covariate matrix + gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt); + CopyCvt (W); + + if (ReadFile_gene (file_gene, vec_read, snpInfo, ng_total)==false) {error=true;} } - - + + //read is after gene file if (!file_read.empty() ) { if (ReadFile_column (file_read, indicator_read, vec_read, 1)==false) {error=true;} - - ni_test=0; + + ni_test=0; for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_read[i]; ni_test+=indicator_idv[i]; } - + if (ni_test==0) { error=true; cout<<"error! number of analyzed individuals equals 0. "<<endl; return; } } - + //for ridge prediction, read phenotype only if (file_geno.empty() && file_gene.empty() && !file_pheno.empty()) { - if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} - + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + //post-process covariates and phenotypes, obtain ni_test, save all useful covariates ProcessCvtPhen(); } @@ -225,37 +286,43 @@ void PARAM::ReadFiles (void) -void PARAM::CheckParam (void) -{ +void PARAM::CheckParam (void) +{ struct stat fileInfo; string str; - + //check parameters if (k_mode!=1 && k_mode!=2) {cout<<"error! unknown kinship/relatedness input mode: "<<k_mode<<endl; error=true;} - if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=21 && a_mode!=22 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61) - {cout<<"error! unknown analysis mode: "<<a_mode<<". make sure -gk or -eigen or -lmm or -bslmm or -predict is sepcified correctly."<<endl; error=true;} + if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=14 && a_mode!=21 && a_mode!=22 && a_mode!=25 && a_mode!=26 && a_mode!=27 && a_mode!=28 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61 && a_mode!=62 && a_mode!=71) + {cout<<"error! unknown analysis mode: "<<a_mode<<". make sure -gk or -eigen or -lmm or -bslmm -predict or -calccov is sepcified correctly."<<endl; error=true;} if (miss_level>1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<<miss_level<<endl; error=true;} if (maf_level>0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<<maf_level<<endl; error=true;} if (hwe_level>1) {cout<<"error! hwe level needs to be between 0 and 1. current value = "<<hwe_level<<endl; error=true;} if (r2_level>1) {cout<<"error! r2 level needs to be between 0 and 1. current value = "<<r2_level<<endl; error=true;} - - if (l_max<l_min) {cout<<"error! maximum lambda value must be larger than the minimal value. current values = "<<l_max<<" and "<<l_min<<endl; error=true;} + + if (l_max<l_min) {cout<<"error! maximum lambda value must be larger than the minimal value. current values = "<<l_max<<" and "<<l_min<<endl; error=true;} if (h_max<h_min) {cout<<"error! maximum h value must be larger than the minimal value. current values = "<<h_max<<" and "<<h_min<<endl; error=true;} if (s_max<s_min) {cout<<"error! maximum s value must be larger than the minimal value. current values = "<<s_max<<" and "<<s_min<<endl; error=true;} if (rho_max<rho_min) {cout<<"error! maximum rho value must be larger than the minimal value. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;} if (logp_max<logp_min) {cout<<"error! maximum logp value must be larger than the minimal value. current values = "<<logp_max/log(10)<<" and "<<logp_min/log(10)<<endl; error=true;} - + if (h_max>1) {cout<<"error! h values must be bewtween 0 and 1. current values = "<<h_max<<" and "<<h_min<<endl; error=true;} if (rho_max>1) {cout<<"error! rho values must be between 0 and 1. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;} if (logp_max>0) {cout<<"error! maximum logp value must be smaller than 0. current values = "<<logp_max/log(10)<<" and "<<logp_min/log(10)<<endl; error=true;} if (l_max<l_min) {cout<<"error! maximum lambda value must be larger than the minimal value. current values = "<<l_max<<" and "<<l_min<<endl; error=true;} - + if (h_scale>1.0) {cout<<"error! hscale value must be between 0 and 1. current value = "<<h_scale<<endl; error=true;} if (rho_scale>1.0) {cout<<"error! rscale value must be between 0 and 1. current value = "<<rho_scale<<endl; error=true;} if (logp_scale>1.0) {cout<<"error! pscale value must be between 0 and 1. current value = "<<logp_scale<<endl; error=true;} if (rho_max==1 && rho_min==1 && a_mode==12) {cout<<"error! ridge regression does not support a rho parameter. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;} - + + if (window_cm<0) {cout<<"error! windowcm values must be non-negative. current values = "<<window_cm<<endl; error=true;} + + if (window_cm==0 && window_bp==0 && window_ns==0) { + window_bp=1000000; + } + //check p_column, and (no need to) sort p_column into ascending order if (p_column.size()==0) { p_column.push_back(1); @@ -266,12 +333,12 @@ void PARAM::CheckParam (void) } } } - + //sort (p_column.begin(), p_column.end() ); n_ph=p_column.size(); - - - + + + //only lmm option (and one prediction option) can deal with multiple phenotypes //and no gene expression files if (n_ph>1 && a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=43) { @@ -280,11 +347,11 @@ void PARAM::CheckParam (void) if (n_ph>1 && !file_gene.empty() ) { cout<<"error! multiple phenotype analysis option not allowed with gene expression files. "<<endl; error=true; } - + if (p_nr>1) { cout<<"error! pnr value must be between 0 and 1. current value = "<<p_nr<<endl; error=true; } - + //check est_column if (est_column.size()==0) { if (file_ebv.empty()) { @@ -299,10 +366,10 @@ void PARAM::CheckParam (void) est_column.push_back(7); } } - - if (est_column.size()!=4) {cout<<"error! -en not followed by four numbers. current number = "<<est_column.size()<<endl; error=true;} + + if (est_column.size()!=4) {cout<<"error! -en not followed by four numbers. current number = "<<est_column.size()<<endl; error=true;} if (est_column[0]==0) {cout<<"error! -en rs column can not be zero. current number = "<<est_column.size()<<endl; error=true;} - + //check if files are compatible with each other, and if files exist if (!file_bfile.empty()) { str=file_bfile+".bim"; @@ -310,44 +377,101 @@ void PARAM::CheckParam (void) str=file_bfile+".bed"; if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .bed file: "<<str<<endl; error=true;} str=file_bfile+".fam"; - if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .fam file: "<<str<<endl; error=true;} + if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .fam file: "<<str<<endl; error=true;} + } + + if (!file_oxford.empty()) { + str=file_bfile+".bgen"; + if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .bgen file: "<<str<<endl; error=true;} + str=file_bfile+".sample"; + if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .sample file: "<<str<<endl; error=true;} } - + if ((!file_geno.empty() || !file_gene.empty()) ) { str=file_pheno; if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open phenotype file: "<<str<<endl; error=true;} - } - + } + str=file_geno; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mean genotype file: "<<str<<endl; error=true;} - + str=file_gene; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open gene expression file: "<<str<<endl; error=true;} - + + str=file_cat; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open category file: "<<str<<endl; error=true;} + + str=file_var; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open category file: "<<str<<endl; error=true;} + + str=file_beta; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open beta file: "<<str<<endl; error=true;} + + str=file_cor; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open correlation file: "<<str<<endl; error=true;} + + str=file_q; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open q file: "<<str<<endl; error=true;} + + str=file_s; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open s file: "<<str<<endl; error=true;} + + str=file_v; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open v file: "<<str<<endl; error=true;} + + str=file_mq; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mq file: "<<str<<endl; error=true;} + + str=file_ms; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open ms file: "<<str<<endl; error=true;} + + str=file_mv; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mv file: "<<str<<endl; error=true;} + size_t flag=0; if (!file_bfile.empty()) {flag++;} if (!file_geno.empty()) {flag++;} if (!file_gene.empty()) {flag++;} - - if (flag!=1 && a_mode!=43 && a_mode!=5 && a_mode!=61) { + // WJA added + if (!file_oxford.empty()) {flag++;} + + if (flag!=1 && a_mode!=27 && a_mode!=28 && a_mode!=43 && a_mode!=5 && a_mode!=61 && a_mode!=62) { cout<<"error! either plink binary files, or bimbam mean genotype files, or gene expression files are required."<<endl; error=true; } - - if (file_pheno.empty() && (a_mode==43 || a_mode==5 || a_mode==61) ) { + + if (file_pheno.empty() && (a_mode==43 || a_mode==5) ) { cout<<"error! phenotype file is required."<<endl; error=true; } - + + if (a_mode==61 || a_mode==62) { + if (!file_pheno.empty()) { + if (file_kin.empty() && (file_ku.empty()||file_kd.empty()) && file_mk.empty() ) { + cout<<"error! missing relatedness file. "<<endl; error=true; + } + } else if (!file_cor.empty()) { + if (file_beta.empty() ) { + cout<<"error! missing cor file."<<endl; error=true; + } + } else { + if ( (file_mq.empty() || file_ms.empty() || file_mv.empty() ) && (file_q.empty() || file_s.empty() || file_v.empty() ) ) { + cout<<"error! either phenotype/kinship files or ms/mq/mv s/q/v files are required."<<endl; error=true; + } + } + } + + + if (!file_epm.empty() && file_bfile.empty() && file_geno.empty() ) {cout<<"error! estimated parameter file also requires genotype file."<<endl; error=true;} if (!file_ebv.empty() && file_kin.empty()) {cout<<"error! estimated breeding value file also requires relatedness file."<<endl; error=true;} - + if (!file_log.empty() && pheno_mean!=0) {cout<<"error! either log file or mu value can be provide."<<endl; error=true;} - + str=file_snps; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open snps file: "<<str<<endl; error=true;} - + str=file_log; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open log file: "<<str<<endl; error=true;} - + str=file_anno; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open annotation file: "<<str<<endl; error=true;} @@ -356,52 +480,75 @@ void PARAM::CheckParam (void) str=file_mk; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open relatedness matrix file: "<<str<<endl; error=true;} - + str=file_cvt; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open covariates file: "<<str<<endl; error=true;} - + + str=file_gxe; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open environmental covariate file: "<<str<<endl; error=true;} + + str=file_weight; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open the residual weight file: "<<str<<endl; error=true;} + str=file_epm; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open estimated parameter file: "<<str<<endl; error=true;} - + str=file_ebv; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open estimated breeding value file: "<<str<<endl; error=true;} - + str=file_read; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open total read file: "<<str<<endl; error=true;} - + //check if files are compatible with analysis mode if (k_mode==2 && !file_geno.empty() ) {cout<<"error! use \"-km 1\" when using bimbam mean genotype file. "<<endl; error=true;} - - if ((a_mode==1 || a_mode==2 || a_mode==3 || a_mode==4 || a_mode==5 || a_mode==31) && (file_kin.empty() && (file_ku.empty()||file_kd.empty())) ) {cout<<"error! missing relatedness file. "<<endl; error=true;} - if (a_mode==61 && (file_kin.empty() && (file_ku.empty()||file_kd.empty()) && file_mk.empty() ) ) {cout<<"error! missing relatedness file. "<<endl; error=true;} + if ((a_mode==1 || a_mode==2 || a_mode==3 || a_mode==4 || a_mode==5 || a_mode==31) && (file_kin.empty() && (file_ku.empty()||file_kd.empty())) ) {cout<<"error! missing relatedness file. "<<endl; error=true;} if ((a_mode==43) && file_kin.empty()) {cout<<"error! missing relatedness file. -predict option requires -k option to provide a relatedness file."<<endl; error=true;} - + if ((a_mode==11 || a_mode==12 || a_mode==13) && !file_cvt.empty() ) {cout<<"error! -bslmm option does not support covariates files."<<endl; error=true;} - + if (a_mode==41 || a_mode==42) { - if (!file_cvt.empty() ) {cout<<"error! -predict option does not support covariates files."<<endl; error=true;} - if (file_epm.empty() ) {cout<<"error! -predict option requires estimated parameter files."<<endl; error=true;} + if (!file_cvt.empty() ) {cout<<"error! -predict option does not support covariates files."<<endl; error=true;} + if (file_epm.empty() ) {cout<<"error! -predict option requires estimated parameter files."<<endl; error=true;} + } + + if (file_beta.empty() && (a_mode==27 || a_mode==28) ) { + cout<<"error! beta effects file is required."<<endl; error=true; } return; } - + void PARAM::CheckData (void) { - if ((file_cvt).empty() || (indicator_cvt).size()==0) { - n_cvt=1; + if(file_oxford.empty()) // WJA NOTE: I added this condition so that covariates can be added through sample, probably not exactly what is wanted + + { + if ((file_cvt).empty() || (indicator_cvt).size()==0) { + n_cvt=1; + } } + if ( (indicator_cvt).size()!=0 && (indicator_cvt).size()!=(indicator_idv).size()) { error=true; cout<<"error! number of rows in the covariates file do not match the number of individuals. "<<endl; return; } - + if ( (indicator_gxe).size()!=0 && (indicator_gxe).size()!=(indicator_idv).size()) { + error=true; + cout<<"error! number of rows in the gxe file do not match the number of individuals. "<<endl; + return; + } + if ( (indicator_weight).size()!=0 && (indicator_weight).size()!=(indicator_idv).size()) { + error=true; + cout<<"error! number of rows in the weight file do not match the number of individuals. "<<endl; + return; + } + if ( (indicator_read).size()!=0 && (indicator_read).size()!=(indicator_idv).size()) { error=true; cout<<"error! number of rows in the total read file do not match the number of individuals. "<<endl; @@ -411,13 +558,13 @@ void PARAM::CheckData (void) { //calculate ni_total and ni_test, and set indicator_idv to 0 whenever indicator_cvt=0 //and calculate np_obs and np_miss ni_total=(indicator_idv).size(); - - ni_test=0; + + ni_test=0; for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { if (indicator_idv[i]==0) {continue;} ni_test++; } - + ni_cvt=0; for (size_t i=0; i<indicator_cvt.size(); i++) { if (indicator_cvt[i]==0) {continue;} @@ -429,8 +576,16 @@ void PARAM::CheckData (void) { if (indicator_cvt.size()!=0) { if (indicator_cvt[i]==0) {continue;} } - - for (size_t j=0; j<indicator_pheno[i].size(); j++) { + + if (indicator_gxe.size()!=0) { + if (indicator_gxe[i]==0) {continue;} + } + + if (indicator_weight.size()!=0) { + if (indicator_weight[i]==0) {continue;} + } + + for (size_t j=0; j<indicator_pheno[i].size(); j++) { if (indicator_pheno[i][j]==0) { np_miss++; } else { @@ -441,101 +596,103 @@ void PARAM::CheckData (void) { /* if ((indicator_cvt).size()!=0) { - ni_test=0; + ni_test=0; for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_cvt[i]; ni_test+=indicator_idv[i]; } - } - + } + if ((indicator_read).size()!=0) { - ni_test=0; + ni_test=0; for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_read[i]; ni_test+=indicator_idv[i]; } } */ - if (ni_test==0) { + if (ni_test==0 && file_cor.empty() && file_mq.empty() && file_q.empty() && file_beta.empty() ) { error=true; cout<<"error! number of analyzed individuals equals 0. "<<endl; return; } - + if (a_mode==43) { if (ni_cvt==ni_test) { error=true; - cout<<"error! no individual has missing phenotypes."<<endl; + cout<<"error! no individual has missing phenotypes."<<endl; return; } if ((np_obs+np_miss)!=(ni_cvt*n_ph)) { error=true; //cout<<ni_cvt<<"\t"<<ni_test<<"\t"<<ni_total<<"\t"<<np_obs<<"\t"<<np_miss<<"\t"<<indicator_cvt.size()<<endl; - cout<<"error! number of phenotypes do not match the summation of missing and observed phenotypes."<<endl; + cout<<"error! number of phenotypes do not match the summation of missing and observed phenotypes."<<endl; return; } } //output some information - cout<<"## number of total individuals = "<<ni_total<<endl; - if (a_mode==43) { - cout<<"## number of analyzed individuals = "<<ni_cvt<<endl; - cout<<"## number of individuals with full phenotypes = "<<ni_test<<endl; - } else { - cout<<"## number of analyzed individuals = "<<ni_test<<endl; + if (file_cor.empty() && file_mq.empty() && file_q.empty() ) { + cout<<"## number of total individuals = "<<ni_total<<endl; + if (a_mode==43) { + cout<<"## number of analyzed individuals = "<<ni_cvt<<endl; + cout<<"## number of individuals with full phenotypes = "<<ni_test<<endl; + } else { + cout<<"## number of analyzed individuals = "<<ni_test<<endl; + } + cout<<"## number of covariates = "<<n_cvt<<endl; + cout<<"## number of phenotypes = "<<n_ph<<endl; + if (a_mode==43) { + cout<<"## number of observed data = "<<np_obs<<endl; + cout<<"## number of missing data = "<<np_miss<<endl; + } + if (!file_gene.empty()) { + cout<<"## number of total genes = "<<ng_total<<endl; + } else if (file_epm.empty() && a_mode!=43 && a_mode!=5) { + cout<<"## number of total SNPs = "<<ns_total<<endl; + cout<<"## number of analyzed SNPs = "<<ns_test<<endl; + } else {} } - cout<<"## number of covariates = "<<n_cvt<<endl; - cout<<"## number of phenotypes = "<<n_ph<<endl; - if (a_mode==43) { - cout<<"## number of observed data = "<<np_obs<<endl; - cout<<"## number of missing data = "<<np_miss<<endl; - } - if (!file_gene.empty()) { - cout<<"## number of total genes = "<<ng_total<<endl; - } else if (file_epm.empty() && a_mode!=43 && a_mode!=5) { - cout<<"## number of total SNPs = "<<ns_total<<endl; - cout<<"## number of analyzed SNPs = "<<ns_test<<endl; - } else {} - + //set d_pace to 1000 for gene expression if (!file_gene.empty() && d_pace==100000) { d_pace=1000; } - + //for case-control studies, count #cases and #controls int flag_cc=0; - if (a_mode==13) { + if (a_mode==13) { ni_case=0; ni_control=0; for (size_t i=0; i<indicator_idv.size(); i++) { if (indicator_idv[i]==0) {continue;} - + if (pheno[i][0]==0) {ni_control++;} else if (pheno[i][0]==1) {ni_case++;} else {flag_cc=1;} } - cout<<"## number of cases = "<<ni_case<<endl; - cout<<"## number of controls = "<<ni_control<<endl; - } - + cout<<"## number of cases = "<<ni_case<<endl; + cout<<"## number of controls = "<<ni_control<<endl; + } + if (flag_cc==1) {cout<<"Unexpected non-binary phenotypes for case/control analysis. Use default (BSLMM) analysis instead."<<endl; a_mode=11;} - + //set parameters for BSLMM //and check for predict if (a_mode==11 || a_mode==12 || a_mode==13) { - if (a_mode==11) {n_mh=1;} + if (a_mode==11) {n_mh=1;} if (logp_min==0) {logp_min=-1.0*log((double)ns_test);} - + if (h_scale==-1) {h_scale=min(1.0, 10.0/sqrt((double)ni_test) );} if (rho_scale==-1) {rho_scale=min(1.0, 10.0/sqrt((double)ni_test) );} if (logp_scale==-1) {logp_scale=min(1.0, 5.0/sqrt((double)ni_test) );} - + if (h_min==-1) {h_min=0.0;} if (h_max==-1) {h_max=1.0;} - + if (s_max>ns_test) {s_max=ns_test; cout<<"s_max is re-set to the number of analyzed SNPs."<<endl;} if (s_max<s_min) {cout<<"error! maximum s value must be larger than the minimal value. current values = "<<s_max<<" and "<<s_min<<endl; error=true;} - } else if (a_mode==41 || a_mode==42) { + } else if (a_mode==41 || a_mode==42) { if (indicator_bv.size()!=0) { if (indicator_idv.size()!=indicator_bv.size()) { cout<<"error! number of rows in the phenotype file does not match that in the estimated breeding value file: "<<indicator_idv.size()<<"\t"<<indicator_bv.size()<<endl; @@ -555,18 +712,18 @@ void PARAM::CheckData (void) { //file_mk needs to contain more than one line if (n_vc==1 && !file_mk.empty()) {cout<<"error! -mk file should contain more than one line."<<endl; error=true;} - + return; } -void PARAM::PrintSummary () +void PARAM::PrintSummary () { if (n_ph==1) { cout<<"pve estimate ="<<pve_null<<endl; cout<<"se(pve) ="<<pve_se_null<<endl; } else { - + } return; } @@ -575,7 +732,7 @@ void PARAM::PrintSummary () void PARAM::ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { string file_str; - + if (!file_bfile.empty()) { file_str=file_bfile+".bed"; if (ReadFile_bed (file_str, indicator_idv, indicator_snp, UtX, K, calc_K)==false) {error=true;} @@ -583,91 +740,563 @@ void PARAM::ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { else { if (ReadFile_geno (file_geno, indicator_idv, indicator_snp, UtX, K, calc_K)==false) {error=true;} } - + return; } - + + +void PARAM::ReadGenotypes (vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K) { + string file_str; + + if (!file_bfile.empty()) { + file_str=file_bfile+".bed"; + if (ReadFile_bed (file_str, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test)==false) {error=true;} + } else { + if (ReadFile_geno (file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test)==false) {error=true;} + } + + return; +} + void PARAM::CalcKin (gsl_matrix *matrix_kin) { string file_str; - + gsl_matrix_set_zero (matrix_kin); - - if (!file_bfile.empty() ) { + + if (!file_bfile.empty() ) { file_str=file_bfile+".bed"; if (PlinkKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} } + else if (!file_oxford.empty() ) { + file_str=file_oxford+".bgen"; + if (bgenKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} + } else { file_str=file_geno; if (BimbamKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} } - + + return; +} + + + +//from an existing n by nd G matrix, compute the d by d S matrix +void compKtoS (const gsl_matrix *G, gsl_matrix *S) { + size_t n_vc=S->size1, ni_test=G->size1; + double di, dj, tr_KiKj, sum_Ki, sum_Kj, s_Ki, s_Kj, s_KiKj, si, sj, d; + + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + tr_KiKj=0; sum_Ki=0; sum_Kj=0; s_KiKj=0; si=0; sj=0; + for (size_t l=0; l<ni_test; l++) { + s_Ki=0; s_Kj=0; + for (size_t k=0; k<ni_test; k++) { + di=gsl_matrix_get(G, l, k+ni_test*i); + dj=gsl_matrix_get(G, l, k+ni_test*j); + s_Ki+=di; s_Kj+=dj; + + tr_KiKj+=di*dj; sum_Ki+=di; sum_Kj+=dj; + if (l==k) {si+=di; sj+=dj;} + } + s_KiKj+=s_Ki*s_Kj; + } + + sum_Ki/=(double)ni_test; + sum_Kj/=(double)ni_test; + s_KiKj/=(double)ni_test; + si-=sum_Ki; + sj-=sum_Kj; + d=tr_KiKj-2*s_KiKj+sum_Ki*sum_Kj; + d=d/(si*sj)-1/(double)(ni_test-1); + + gsl_matrix_set (S, i, j, d); + if (i!=j) {gsl_matrix_set (S, j, i, d);} + } + } + //cout<<tr_KiKj<<" "<<s_KiKj<<" "<<sum_Ki<<" "<<sum_Kj<<" "<<si<<" "<<sj<<" "<<d*1000000<<endl; + return; +} + + + +//copied from lmm.cpp; is used in the following function compKtoQ +//map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 +size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { + if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<<endl; return 0;} + size_t index; + size_t l, h; + if (b>a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + +//from an existing n by nd (centered) G matrix, compute the d+1 by d*(d+1) Q matrix +//where inside i'th d+1 by d+1 matrix, each element is tr(KiKjKiKl)-r*tr(KjKiKl)-r*tr(KlKiKj)+r^2*tr(KjKl), where r=n/(n-1) +void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) { + size_t n_vc=G->size2/G->size1, ni_test=G->size1; + + gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, n_vc*(n_vc+1)/2*ni_test); + gsl_vector *trKiKjKi=gsl_vector_alloc ( n_vc*n_vc ); + gsl_vector *trKiKj=gsl_vector_alloc( n_vc*(n_vc+1)/2 ); + gsl_vector *trKi=gsl_vector_alloc(n_vc); + + double d, tr, r=(double)ni_test/(double)(ni_test-1); + size_t t, t_ij, t_il, t_jl, t_ii; + + //compute KiKj for all pairs of i and j (including the identity matrix) + t=0; + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_const_view Ki=gsl_matrix_const_submatrix(G, 0, i*ni_test, ni_test, ni_test); + for (size_t j=i; j<n_vc; j++) { + gsl_matrix_const_view Kj=gsl_matrix_const_submatrix(G, 0, j*ni_test, ni_test, ni_test); + gsl_matrix_view KiKj_sub=gsl_matrix_submatrix (KiKj, 0, t*ni_test, ni_test, ni_test); + eigenlib_dgemm ("N", "N", 1.0, &Ki.matrix, &Kj.matrix, 0.0, &KiKj_sub.matrix); + t++; + } + } + /* + for (size_t i=0; i<5; i++) { + for (size_t j=0; j<5; j++) { + cout<<gsl_matrix_get (G, i, j)<<" "; + } + cout<<endl; + } + */ + + //compute trKi, trKiKj + t=0; + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + tr=0; + for (size_t k=0; k<ni_test; k++) { + tr+=gsl_matrix_get (KiKj, k, t*ni_test+k); + } + gsl_vector_set (trKiKj, t, tr); + + t++; + } + + tr=0; + for (size_t k=0; k<ni_test; k++) { + tr+=gsl_matrix_get (G, k, i*ni_test+k); + } + gsl_vector_set (trKi, i, tr); + } + + //compute trKiKjKi (it is not symmetric w.r.t. i and j) + for (size_t i=0; i<n_vc; i++) { + for (size_t j=0; j<n_vc; j++) { + tr=0; + t=GetabIndex (i+1, j+1, n_vc-2); + for (size_t k=0; k<ni_test; k++) { + gsl_vector_const_view KiKj_row=gsl_matrix_const_subrow (KiKj, k, t*ni_test, ni_test); + gsl_vector_const_view KiKj_col=gsl_matrix_const_column (KiKj, t*ni_test+k); + + gsl_vector_const_view Ki_col=gsl_matrix_const_column (G, i*ni_test+k); + + if (i<=j) { + gsl_blas_ddot (&KiKj_row.vector, &Ki_col.vector, &d); + tr+=d; + } else { + gsl_blas_ddot (&KiKj_col.vector, &Ki_col.vector, &d); + tr+=d; + } + } + gsl_vector_set (trKiKjKi, i*n_vc+j, tr); + } + } + + //compute Q + for (size_t i=0; i<n_vc; i++) { + for (size_t j=0; j<n_vc+1; j++) { + for (size_t l=j; l<n_vc+1; l++) { + if (j!=n_vc && l!=n_vc) { + t_ij=GetabIndex (i+1, j+1, n_vc-2); + t_il=GetabIndex (i+1, l+1, n_vc-2); + t_jl=GetabIndex (j+1, l+1, n_vc-2); + + //cout<<ni_test<<" "<<r<<t_ij<<" "<<t_il<<" "<<t_jl<<" "<<endl; + tr=0; + for (size_t k=0; k<ni_test; k++) { + gsl_vector_const_view KiKj_row=gsl_matrix_const_subrow (KiKj, k, t_ij*ni_test, ni_test); + gsl_vector_const_view KiKj_col=gsl_matrix_const_column (KiKj, t_ij*ni_test+k); + gsl_vector_const_view KiKl_row=gsl_matrix_const_subrow (KiKj, k, t_il*ni_test, ni_test); + gsl_vector_const_view KiKl_col=gsl_matrix_const_column (KiKj, t_il*ni_test+k); + + gsl_vector_const_view Kj_row=gsl_matrix_const_subrow (G, k, j*ni_test, ni_test); + gsl_vector_const_view Kl_row=gsl_matrix_const_subrow (G, k, l*ni_test, ni_test); + + if (i<=j && i<=l) { + gsl_blas_ddot (&KiKj_row.vector, &KiKl_col.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d); + tr-=r*d; + } else if (i<=j && i>l) { + gsl_blas_ddot (&KiKj_row.vector, &KiKl_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d); + tr-=r*d; + } else if (i>j && i<=l) { + gsl_blas_ddot (&KiKj_col.vector, &KiKl_col.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); + tr-=r*d; + } else { + gsl_blas_ddot (&KiKj_col.vector, &KiKl_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); + tr-=r*d; + } + } + + tr+=r*r*gsl_vector_get (trKiKj, t_jl); + } else if (j!=n_vc && l==n_vc) { + t_ij=GetabIndex (i+1, j+1, n_vc-2); + tr=gsl_vector_get (trKiKjKi, i*n_vc+j)-2*r*gsl_vector_get (trKiKj, t_ij)+r*r*gsl_vector_get (trKi, j); + } else if (j==n_vc && l==n_vc) { + t_ii=GetabIndex (i+1, i+1, n_vc-2); + tr=gsl_vector_get (trKiKj, t_ii)-2*r*gsl_vector_get (trKi, i)+r*r*(double)(ni_test-1); + } + + gsl_matrix_set (Q, j, i*(n_vc+1)+l, tr); + if (l!=j) {gsl_matrix_set (Q, l, i*(n_vc+1)+j, tr);} + } + } + } + + gsl_matrix_scale (Q, 1.0/pow((double)ni_test, 2) ); + + gsl_matrix_free(KiKj); + gsl_vector_free(trKiKjKi); + gsl_vector_free(trKiKj); + gsl_vector_free(trKi); + + return; +} + + + +//perform Jacknife sampling for variance of S +void JacknifeGtoS (const gsl_matrix *G, gsl_matrix *S, gsl_matrix *Svar) { + size_t n_vc=Svar->size1, ni_test=G->size1; + vector<vector<vector<double> > > tr_KiKj, s_KiKj; + vector<vector<double> > sum_Ki, s_Ki, si; + vector<double> vec_tmp; + double di, dj, d, m, v; + + //initialize and set all elements to zero + for (size_t i=0; i<ni_test; i++) { + vec_tmp.push_back(0); + } + + for (size_t i=0; i<n_vc; i++) { + sum_Ki.push_back(vec_tmp); + s_Ki.push_back(vec_tmp); + si.push_back(vec_tmp); + } + + for (size_t i=0; i<n_vc; i++) { + tr_KiKj.push_back(sum_Ki); + s_KiKj.push_back(sum_Ki); + } + + //run jacknife + for (size_t i=0; i<n_vc; i++) { + for (size_t l=0; l<ni_test; l++) { + for (size_t k=0; k<ni_test; k++) { + di=gsl_matrix_get(G, l, k+ni_test*i); + + for (size_t t=0; t<ni_test; t++) { + if (t==l || t==k) {continue;} + sum_Ki[i][t]+=di; + if (l==k) {si[i][t]+=di;} + } + s_Ki[i][l]+=di; + } + } + + for (size_t t=0; t<ni_test; t++) { + sum_Ki[i][t]/=(double)(ni_test-1); + } + } + + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + for (size_t l=0; l<ni_test; l++) { + for (size_t k=0; k<ni_test; k++) { + di=gsl_matrix_get(G, l, k+ni_test*i); + dj=gsl_matrix_get(G, l, k+ni_test*j); + d=di*dj; + + for (size_t t=0; t<ni_test; t++) { + if (t==l || t==k) {continue;} + tr_KiKj[i][j][t]+=d; + } + } + + for (size_t t=0; t<ni_test; t++) { + if (t==l) {continue;} + di=gsl_matrix_get(G, l, t+ni_test*i); + dj=gsl_matrix_get(G, l, t+ni_test*j); + + s_KiKj[i][j][t]+=(s_Ki[i][l]-di)*(s_Ki[j][l]-dj); + } + } + + for (size_t t=0; t<ni_test; t++) { + s_KiKj[i][j][t]/=(double)(ni_test-1); + } + + m=0; v=0; + for (size_t t=0; t<ni_test; t++) { + d=tr_KiKj[i][j][t]-2*s_KiKj[i][j][t]+sum_Ki[i][t]*sum_Ki[j][t]; + d/=(si[i][t]-sum_Ki[i][t])*(si[j][t]-sum_Ki[j][t]); + d-=1/(double)(ni_test-2); + + m+=d; v+=d*d; + } + m/=(double)ni_test; + v/=(double)ni_test; + v-=m*m; + v*=(double)(ni_test-1); + + gsl_matrix_set (Svar, i, j, v); + d=gsl_matrix_get (S, i, j); + d=(double)ni_test*d-(double)(ni_test-1)*m; + gsl_matrix_set (S, i, j, d); + if (i!=j) {gsl_matrix_set (Svar, j, i, v); gsl_matrix_set (S, j, i, d);} + } + } + + return; +} + + + +//compute the d by d S matrix with its d by d variance matrix of Svar, and the d+1 by d(d+1) matrix of Q for V(q) +void PARAM::CalcS (gsl_matrix *S, gsl_matrix *Svar, gsl_matrix *Q) { + string file_str; + + gsl_matrix_set_zero (S); + gsl_matrix_set_zero (Svar); + gsl_matrix_set_zero (Q); + + //compute the kinship matrix G for multiple categories; these matrices are not centered, for convienence of Jacknife sampling + gsl_matrix *G=gsl_matrix_alloc (ni_test, n_vc*ni_test); + gsl_matrix_set_zero (G); + + if (!file_bfile.empty() ) { + file_str=file_bfile+".bed"; + if (PlinkKin (file_str, indicator_idv, indicator_snp, a_mode-24, d_pace, mapRS2cat, mapRS2var, snpInfo, G)==false) {error=true;} + } else { + file_str=file_geno; + if (BimbamKin (file_str, indicator_idv, indicator_snp, a_mode-24, d_pace, mapRS2cat, mapRS2var, snpInfo, G)==false) {error=true;} + } + + //center and scale every kinship matrix inside G + double d; + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_view K=gsl_matrix_submatrix(G, 0, i*ni_test, ni_test, ni_test); + CenterMatrix(&K.matrix); + d=ScaleMatrix(&K.matrix); + } + + //based on G, compute S + compKtoS (G, S); + + //based on G, compute a matrix Q that can be used to calculate the variance of q + compKtoQ (G, Q); + + /* + //set up random environment + gsl_rng_env_setup(); + gsl_rng *gsl_r; + const gsl_rng_type * gslType; + gslType = gsl_rng_default; + if (randseed<0) { + time_t rawtime; + time (&rawtime); + tm * ptm = gmtime (&rawtime); + + randseed = (unsigned) (ptm->tm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + //bootstrap: in each iteration, sample individuals and compute S_pmt + size_t n_pmt=100; + vector<size_t> idv_order, idv_remove; + for (size_t i=0; i<ni_test; i++) { + idv_order.push_back(i); + } + for (size_t i=0; i<n_pmt; i++) { + idv_remove.push_back(0); + } + gsl_ran_choose (gsl_r, static_cast<void*>(&idv_remove[0]), n_pmt, static_cast<void*>(&idv_order[0]), ni_test, sizeof(size_t)); + + gsl_matrix *S_pmt=gsl_matrix_alloc(n_vc, n_vc*n_pmt); + for (size_t i=0; i<n_pmt; i++) { + gsl_matrix_view S_sub=gsl_matrix_submatrix (S_pmt, 0, n_vc*i, n_vc, n_vc); + compKtoS (G, idv_remove[i], &S_sub.matrix); + } + + //based on S_pmt, compute Svar + double m, v, d; + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + m=0; v=0; + for (size_t t=0; t<n_pmt; t++) { + d=gsl_matrix_get(S_pmt, i, j); + m+=d; v+=d*d; + } + m/=(double)n_pmt; v/=(double)n_pmt; + v=v-m*m; + gsl_matrix_set(Svar, i, j, v); + if (i!=j) {gsl_matrix_set(Svar, j, i, v);} + } + } + */ + + //compute Svar and update S with Jacknife + JacknifeGtoS (G, S, Svar); + + gsl_matrix_free(G); + return; +} + + + +void PARAM::WriteVector (const gsl_vector *q, const gsl_vector *s, const size_t n_total, const string suffix) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+="."; + file_str+=suffix; + file_str+=".txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} + + outfile.precision(10); + + for (size_t i=0; i<q->size; ++i) { + outfile<<gsl_vector_get (q, i)<<endl; + } + + for (size_t i=0; i<s->size; ++i) { + outfile<<gsl_vector_get (s, i)<<endl; + } + + outfile<<n_total<<endl; + + outfile.close(); + outfile.clear(); + return; +} + + + +void PARAM::WriteVar (const string suffix) +{ + string file_str, rs; + file_str=path_out+"/"+file_out; + file_str+="."; + file_str+=suffix; + file_str+=".txt.gz"; + + ogzstream outfile (file_str.c_str(), ogzstream::out); + if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} + + outfile.precision(10); + + for (size_t i=0; i<indicator_snp.size(); i++) { + if (indicator_snp[i]==0) {continue;} + rs=snpInfo[i].rs_number; + if (mapRS2var.count(rs)!=0) { + outfile<<rs<<"\t"<<mapRS2var.at(rs)<<endl; + } + } + + outfile.close(); + outfile.clear(); return; } - -void PARAM::WriteMatrix (const gsl_matrix *matrix_U, const string suffix) +void PARAM::WriteMatrix (const gsl_matrix *matrix_U, const string suffix) { string file_str; file_str=path_out+"/"+file_out; file_str+="."; file_str+=suffix; - file_str+=".txt"; - + file_str+=".txt"; + ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} - + outfile.precision(10); - + for (size_t i=0; i<matrix_U->size1; ++i) { for (size_t j=0; j<matrix_U->size2; ++j) { outfile<<gsl_matrix_get (matrix_U, i, j)<<"\t"; } outfile<<endl; } - + outfile.close(); outfile.clear(); return; } -void PARAM::WriteVector (const gsl_vector *vector_D, const string suffix) +void PARAM::WriteVector (const gsl_vector *vector_D, const string suffix) { string file_str; file_str=path_out+"/"+file_out; file_str+="."; file_str+=suffix; file_str+=".txt"; - + ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} - + outfile.precision(10); - + for (size_t i=0; i<vector_D->size; ++i) { outfile<<gsl_vector_get (vector_D, i)<<endl; } - + outfile.close(); outfile.clear(); return; } -void PARAM::CheckCvt () +void PARAM::CheckCvt () { if (indicator_cvt.size()==0) {return;} - + size_t ci_test=0; - + gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt); - + for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) { if (indicator_idv[i]==0 || indicator_cvt[i]==0) {continue;} for (size_t j=0; j<n_cvt; ++j) { @@ -679,14 +1308,14 @@ void PARAM::CheckCvt () size_t flag_ipt=0; double v_min, v_max; set<size_t> set_remove; - + //check if any columns is an intercept for (size_t i=0; i<W->size2; i++) { gsl_vector_view w_col=gsl_matrix_column (W, i); gsl_vector_minmax (&w_col.vector, &v_min, &v_max); if (v_min==v_max) {flag_ipt=1; set_remove.insert (i);} } - + //add an intecept term if needed if (n_cvt==set_remove.size()) { indicator_cvt.clear(); @@ -697,19 +1326,19 @@ void PARAM::CheckCvt () if (indicator_idv[i]==0 || indicator_cvt[i]==0) {continue;} cvt[i].push_back(1.0); } - + n_cvt++; - } else {} - + } else {} + gsl_matrix_free(W); - + return; } //post-process phentoypes, covariates void PARAM::ProcessCvtPhen () -{ +{ //convert indicator_pheno to indicator_idv int k=1; indicator_idv.clear(); @@ -720,27 +1349,88 @@ void PARAM::ProcessCvtPhen () } indicator_idv.push_back(k); } - + //remove individuals with missing covariates if ((indicator_cvt).size()!=0) { for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_cvt[i]; } } - + + //remove individuals with missing gxe variables + if ((indicator_gxe).size()!=0) { + for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_gxe[i]; + } + } + + //remove individuals with missing residual weights + if ((indicator_weight).size()!=0) { + for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_weight[i]; + } + } + //obtain ni_test - ni_test=0; + ni_test=0; for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { - if (indicator_idv[i]==0) {continue;} + if (indicator_idv[i]==0) {continue;} ni_test++; } - + + + + //if subsample number is set, perform a random sub-sampling to determine the subsampled ids + if (ni_subsample!=0) { + if (ni_test<ni_subsample) { + cout<<"error! number of subsamples is less than number of analyzed individuals. "<<endl; + } else { + //set up random environment + gsl_rng_env_setup(); + gsl_rng *gsl_r; + const gsl_rng_type * gslType; + gslType = gsl_rng_default; + if (randseed<0) { + time_t rawtime; + time (&rawtime); + tm * ptm = gmtime (&rawtime); + + randseed = (unsigned) (ptm->tm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + //from ni_test, sub-sample ni_subsample + vector<size_t> a, b; + for (size_t i=0; i<ni_subsample; i++) { + a.push_back(0); + } + for (size_t i=0; i<ni_test; i++) { + b.push_back(i); + } + + gsl_ran_choose (gsl_r, static_cast<void*>(&a[0]), ni_subsample, static_cast<void*>(&b[0]), ni_test, sizeof (size_t) ); + + //re-set indicator_idv and ni_test + int j=0; + for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { + if (indicator_idv[i]==0) {continue;} + if(find(a.begin(), a.end(), j) == a.end()) { + indicator_idv[i]=0; + } + j++; + } + ni_test=ni_subsample; + } + } + + //check ni_test if (ni_test==0) { error=true; cout<<"error! number of analyzed individuals equals 0. "<<endl; return; } - + //check covariates to see if they are correlated with each other, and to see if the intercept term is included //after getting ni_test //add or remove covariates @@ -749,24 +1439,24 @@ void PARAM::ProcessCvtPhen () } else { vector<double> cvt_row; cvt_row.push_back(1); - + for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_cvt.push_back(1); - + cvt.push_back(cvt_row); } } - + return; } -void PARAM::CopyCvt (gsl_matrix *W) +void PARAM::CopyCvt (gsl_matrix *W) { size_t ci_test=0; - + for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) { if (indicator_idv[i]==0 || indicator_cvt[i]==0) {continue;} for (size_t j=0; j<n_cvt; ++j) { @@ -774,57 +1464,85 @@ void PARAM::CopyCvt (gsl_matrix *W) } ci_test++; } - + + return; +} + + +void PARAM::CopyGxe (gsl_vector *env) +{ + size_t ci_test=0; + + for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) { + if (indicator_idv[i]==0 || indicator_gxe[i]==0) {continue;} + gsl_vector_set (env, ci_test, gxe[i]); + ci_test++; + } + + return; +} + +void PARAM::CopyWeight (gsl_vector *w) +{ + size_t ci_test=0; + + for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) { + if (indicator_idv[i]==0 || indicator_weight[i]==0) {continue;} + gsl_vector_set (w, ci_test, weight[i]); + ci_test++; + } + return; } //if flag=0, then use indicator_idv to load W and Y //else, use indicator_cvt to load them -void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag) +void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag) { size_t ci_test=0; - + for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) { if (flag==0) { if (indicator_idv[i]==0) {continue;} } else { if (indicator_cvt[i]==0) {continue;} } - + gsl_vector_set (y, ci_test, (pheno)[i][0]); - + for (size_t j=0; j<n_cvt; ++j) { gsl_matrix_set (W, ci_test, j, (cvt)[i][j]); } ci_test++; } - + return; } //if flag=0, then use indicator_idv to load W and Y //else, use indicator_cvt to load them -void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag) +void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag) { size_t ci_test=0; - + for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) { if (flag==0) { if (indicator_idv[i]==0) {continue;} } else { if (indicator_cvt[i]==0) {continue;} - } - - for (size_t j=0; j<n_ph; ++j) { + } + + for (size_t j=0; j<n_ph; ++j) { gsl_matrix_set (Y, ci_test, j, (pheno)[i][j]); } for (size_t j=0; j<n_cvt; ++j) { gsl_matrix_set (W, ci_test, j, (cvt)[i][j]); } + ci_test++; } - + return; } @@ -832,18 +1550,18 @@ void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag) -void PARAM::CopyRead (gsl_vector *log_N) +void PARAM::CopyRead (gsl_vector *log_N) { size_t ci_test=0; - + for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) { if (indicator_idv[i]==0) {continue;} - gsl_vector_set (log_N, ci_test, log(vec_read[i]) ); + gsl_vector_set (log_N, ci_test, log(vec_read[i]) ); ci_test++; } - + return; } - - + + diff --git a/src/param.h b/src/param.h index fa18181..3c3b42e 100644 --- a/src/param.h +++ b/src/param.h @@ -16,7 +16,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef __PARAM_H__ +#ifndef __PARAM_H__ #define __PARAM_H__ #include <vector> @@ -39,14 +39,17 @@ public: string a_major; size_t n_miss; double missingness; - double maf; + double maf; + size_t n_idv;//number of non-missing individuals + size_t n_nb;//number of neighbours on the right hand side + size_t file_position;//snp location on file }; //results for lmm class SUMSTAT { public: double beta; //REML estimator for beta - double se; //SE for beta + double se; //SE for beta double lambda_remle; //REML estimator for lambda double lambda_mle; //MLE estimator for lambda double p_wald; //p value from a Wald test @@ -75,50 +78,87 @@ public: double rho; double pge; double logp; - + size_t n_gamma; }; +//header class +class HEADER +{ + +public: + size_t rs_col; + size_t chr_col; + size_t pos_col; + size_t cm_col; + size_t a1_col; + size_t a0_col; + size_t z_col; + size_t beta_col; + size_t sebeta_col; + size_t chisq_col; + size_t p_col; + size_t n_col; + size_t nmis_col; + size_t nobs_col; + size_t af_col; + size_t var_col; + size_t ws_col; + size_t cor_col; + size_t coln;//number of columns +}; + class PARAM { -public: +public: // IO related parameters bool mode_silence; int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests - int k_mode; //kinship read mode: 1: n by n matrix, 2: id/id/k_value; + int k_mode; //kinship read mode: 1: n by n matrix, 2: id/id/k_value; vector<size_t> p_column; //which phenotype column needs analysis size_t d_pace; //display pace - + string file_bfile; string file_geno; string file_pheno; string file_anno; //optional + string file_gxe; //optional string file_cvt; //optional + string file_cat; + string file_var; + string file_beta; + string file_cor; string file_kin; string file_ku, file_kd; string file_mk; + string file_q, file_mq; + string file_s, file_ms; + string file_v, file_mv; + string file_weight; string file_out; string path_out; - + + string file_epm; //estimated parameter file string file_ebv; //estimated breeding value file string file_log; //log file containing mean estimate - + string file_read; //file containing total number of reads string file_gene; //gene expression file - + string file_snps; //file containing analyzed snps or genes - - - - // QC related parameters +// WJA Added + string file_oxford; + + + // QC related parameters double miss_level; - double maf_level; + double maf_level; double hwe_level; double r2_level; - + // LMM related parameters double l_min; double l_max; @@ -130,7 +170,7 @@ public: vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; vector<double> beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null; - double p_nr; + double p_nr; double em_prec, nr_prec; size_t em_iter, nr_iter; size_t crt; @@ -138,15 +178,16 @@ public: //for fitting multiple variance components //the first three are of size n_vc, and the next two are of size n_vc+1 + bool noconstrain; vector<double> v_traceG; vector<double> v_pve; vector<double> v_se_pve; vector<double> v_sigma2; - vector<double> v_se_sigma2; + vector<double> v_se_sigma2; vector<double> v_beta; - vector<double> v_se_beta; - + vector<double> v_se_beta; + // BSLMM MCMC related parameters double h_min, h_max, h_scale; //priors for h double rho_min, rho_max, rho_scale; //priors for rho @@ -163,7 +204,12 @@ public: double trace_G; HYPBSLMM cHyp_initial; - + + //VARCOV related parameters + double window_cm; + size_t window_bp; + size_t window_ns; + // Summary statistics bool error; size_t ni_total, ni_test, ni_cvt; //number of individuals @@ -171,6 +217,8 @@ public: size_t ns_total, ns_test; //number of snps size_t ng_total, ng_test; //number of genes size_t ni_control, ni_case; //number of controls and number of cases + size_t ni_subsample; //number of subsampled individuals + size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel size_t n_cvt; //number of covariates size_t n_ph; //number of phenotypes size_t n_vc; //number of variance components (including the diagonal matrix) @@ -186,42 +234,54 @@ public: // Data vector<vector<double> > pheno; //a vector record all phenotypes, NA replaced with -9 - vector<vector<double> > cvt; //a vector record all covariates, NA replaced with -9 + vector<vector<double> > cvt; //a vector record all covariates, NA replaced with -9 + vector<double> gxe; //a vector record all covariates, NA replaced with -9 + vector<double> weight; //a vector record weights for the individuals, which is useful for animal breeding studies vector<vector<int> > indicator_pheno; //a matrix record when a phenotype is missing for an individual; 0 missing, 1 available vector<int> indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector<int> indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis vector<int> indicator_cvt; //indicator for covariates, 0 missing, 1 available for analysis - + vector<int> indicator_gxe; //indicator for gxe, 0 missing, 1 available for analysis + vector<int> indicator_weight; //indicator for weight, 0 missing, 1 available for analysis + vector<int> indicator_bv; //indicator for estimated breeding value file, 0 missing, 1 available for analysis vector<int> indicator_read; //indicator for read file, 0 missing, 1 available for analysis vector<double> vec_read; //total number of reads vector<double> vec_bv; //breeding values vector<size_t> est_column; - + map<string, int> mapID2num; //map small ID number to number, from 0 to n-1 map<string, string> mapRS2chr; //map rs# to chromosome location map<string, long int> mapRS2bp; //map rs# to base position map<string, double> mapRS2cM; //map rs# to cM map<string, double> mapRS2est; //map rs# to parameters - + map<string, size_t> mapRS2cat; //map rs# to category number + map<string, double> mapRS2var; //map rs# to category number + vector<SNPINFO> snpInfo; //record SNP information set<string> setSnps; //a set of snps for analysis - + //constructor PARAM(); - + //functions - void ReadFiles (); - void CheckParam (); - void CheckData (); + void ReadFiles (); + void CheckParam (); + void CheckData (); void PrintSummary (); - void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + void ReadGenotypes (vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K); void CheckCvt (); void CopyCvt (gsl_matrix *W); + void CopyGxe (gsl_vector *gxe); + void CopyWeight (gsl_vector *w); void ProcessCvtPhen(); void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag); void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag); void CalcKin (gsl_matrix *matrix_kin); + void CalcS (gsl_matrix *S, gsl_matrix *Svar, gsl_matrix *Q); + void WriteVector (const gsl_vector *q, const gsl_vector *s, const size_t n_total, const string suffix); + void WriteVar (const string suffix); void WriteMatrix (const gsl_matrix *matrix_U, const string suffix); void WriteVector (const gsl_vector *vector_D, const string suffix); void CopyRead (gsl_vector *log_N); |