diff options
author | Peter Carbonetto | 2017-05-04 14:41:32 -0500 |
---|---|---|
committer | Peter Carbonetto | 2017-05-04 14:41:32 -0500 |
commit | c18588b6d00650b9ce742229fdf1eca7133f58fc (patch) | |
tree | 2a7262fc48dbbdca244d0860c8b5167c69f3c553 /src | |
parent | 452f232cb627d7180bf1c845dff9ddd88af6a600 (diff) | |
download | pangemma-c18588b6d00650b9ce742229fdf1eca7133f58fc.tar.gz |
Local updates made by Xiang---shared via email on May 4, 2017, subject: gemma on expression data.
Diffstat (limited to 'src')
-rw-r--r-- | src/gemma.cpp | 241 | ||||
-rw-r--r-- | src/io.cpp | 156 | ||||
-rw-r--r-- | src/io.h | 3 | ||||
-rw-r--r-- | src/lmm.cpp | 24 | ||||
-rw-r--r-- | src/mathfunc.cpp | 27 | ||||
-rw-r--r-- | src/mathfunc.h | 1 | ||||
-rw-r--r-- | src/param.cpp | 84 | ||||
-rw-r--r-- | src/param.h | 11 | ||||
-rw-r--r-- | src/vc.cpp | 300 | ||||
-rw-r--r-- | src/vc.h | 1 |
10 files changed, 797 insertions, 51 deletions
diff --git a/src/gemma.cpp b/src/gemma.cpp index 682835f..ca9c4aa 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -39,9 +39,12 @@ #include "vc_float.h" #include "lm_float.h" //for LM class #include "bslmm_float.h" //for BSLMM class +#include "bslmmdap_float.h" //for BSLMMDAP class +#include "ldr_float.h" //for LDR class #include "lmm_float.h" //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe #include "mvlmm_float.h" //for MVLMM class #include "prdt_float.h" //for PRDT class +#include "varcov_float.h" //for MVLMM class #include "mathfunc_float.h" //for a few functions #else #include "io.h" @@ -49,9 +52,12 @@ #include "vc.h" #include "lm.h" #include "bslmm.h" +#include "bslmmdap.h" +#include "ldr.h" #include "lmm.h" #include "mvlmm.h" #include "prdt.h" +#include "varcov.h" #include "mathfunc.h" #endif @@ -365,6 +371,8 @@ void GEMMA::PrintHelp(size_t option) cout<<" options: 1: BSLMM"<<endl; cout<<" 2: standard ridge regression/GBLUP (no mcmc)"<<endl; cout<<" 3: probit BSLMM (requires 0/1 phenotypes)"<<endl; + cout<<" 4: BSLMM with DAP for Hyper Parameter Estimation"<<endl; + cout<<" 5: BSLMM with DAP for Fine Mapping"<<endl; cout<<" -ldr [num] "<<" specify analysis options (default 1)."<<endl; cout<<" options: 1: LDR"<<endl; @@ -433,7 +441,7 @@ void GEMMA::PrintHelp(size_t option) //gq: 27-28 //eigen: 31-32 //lmm: 1-5 -//bslmm: 11-13 +//bslmm: 11-15 //predict: 41-43 //lm: 51 //vc: 61 @@ -576,6 +584,20 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.file_mcat=str; } + else if (strcmp(argv[i], "-catc")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_catc=str; + } + else if (strcmp(argv[i], "-mcatc")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_mcatc=str; + } else if (strcmp(argv[i], "-beta")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -583,6 +605,20 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.file_beta=str; } + else if (strcmp(argv[i], "-bf")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_bf=str; + } + else if (strcmp(argv[i], "-hyp")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_hyp=str; + } else if (strcmp(argv[i], "-cor")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -920,6 +956,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.a_mode=10+atoi(str.c_str()); } + /* else if (strcmp(argv[i], "-ldr")==0) { if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=14; continue;} @@ -928,6 +965,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.a_mode=13+atoi(str.c_str()); } + */ else if (strcmp(argv[i], "-hmin")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -1347,7 +1385,6 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_matrix_free (G); } - /* //Compute the LDSC weights (not implemented yet) if (cPar.a_mode==72) { cout<<"Calculating Weights ... "<<endl; @@ -1363,7 +1400,7 @@ void GEMMA::BatchRun (PARAM &cPar) cVarcov.CopyToParam(cPar); } - */ + //Compute the S matrix (and its variance), that is used for variance component estimation using summary statistics if (cPar.a_mode==25 || cPar.a_mode==26) { @@ -1471,7 +1508,7 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector_free (s); } - /* + //Calculate SNP covariance if (cPar.a_mode==71) { VARCOV cVarcov; @@ -1485,7 +1522,7 @@ void GEMMA::BatchRun (PARAM &cPar) cVarcov.CopyToParam(cPar); } - */ + //LM if (cPar.a_mode==51 || cPar.a_mode==52 || cPar.a_mode==53 || cPar.a_mode==54) { //Fit LM @@ -1541,7 +1578,7 @@ void GEMMA::BatchRun (PARAM &cPar) //REML approach only //if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp //for one phenotype only; - if (cPar.a_mode==61 || cPar.a_mode==62) { + if (cPar.a_mode==61 || cPar.a_mode==62 || cPar.a_mode==63) { if (!cPar.file_beta.empty() ) { //need to obtain a common set of SNPs between beta file and the genotype file; these are saved in mapRS2wA and mapRS2wK //normalize the weight in mapRS2wK to have an average of one; each element of mapRS2wA is 1 @@ -1866,8 +1903,10 @@ void GEMMA::BatchRun (PARAM &cPar) cVc.CopyFromParam(cPar); if (cPar.a_mode==61) { cVc.CalcVChe (G, W, &Y_col.vector); - } else { + } else if (cPar.a_mode==62) { cVc.CalcVCreml (cPar.noconstrain, G, W, &Y_col.vector); + } else { + cVc.CalcVCacl (G, W, &Y_col.vector); } cVc.CopyToParam(cPar); //obtain pve from sigma2 @@ -2310,7 +2349,7 @@ void GEMMA::BatchRun (PARAM &cPar) //center y, even for case/control data cPar.pheno_mean=CenterVector(y); - //run bslmm if rho==1 + //run bvsr if rho==1 if (cPar.rho_min==1 && cPar.rho_max==1) { //read genotypes X (not UtX) cPar.ReadGenotypes (UtX, G, false); @@ -2329,7 +2368,6 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2); gsl_vector *Uty=gsl_vector_alloc (y->size); - //read relatedness matrix G if (!(cPar.file_kin).empty()) { cPar.ReadGenotypes (UtX, G, false); @@ -2373,17 +2411,20 @@ void GEMMA::BatchRun (PARAM &cPar) CalcUtX (U, UtX); cPar.time_UtX=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //perform BSLMM analysis - BSLMM cBslmm; - cBslmm.CopyFromParam(cPar); - time_start=clock(); - if (cPar.a_mode==12) { //ridge regression - cBslmm.RidgeR(U, UtX, Uty, eval, cPar.l_remle_null); - } else { //Run MCMC - cBslmm.MCMC(U, UtX, Uty, eval, y); + //perform BSLMM or BSLMMDAP analysis + if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { + BSLMM cBslmm; + cBslmm.CopyFromParam(cPar); + time_start=clock(); + if (cPar.a_mode==12) { //ridge regression + cBslmm.RidgeR(U, UtX, Uty, eval, cPar.l_remle_null); + } else { //Run MCMC + cBslmm.MCMC(U, UtX, Uty, eval, y); + } + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cBslmm.CopyToParam(cPar); + } else { } - cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - cBslmm.CopyToParam(cPar); //release all matrices and vectors gsl_matrix_free (G); @@ -2399,8 +2440,157 @@ void GEMMA::BatchRun (PARAM &cPar) } + + //BSLMM-DAP + if (cPar.a_mode==14 || cPar.a_mode==15 || cPar.a_mode==16) { + if (cPar.a_mode==14) { + gsl_vector *y=gsl_vector_alloc (cPar.ni_test); + gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (y->size, y->size); + gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test); + + //set covariates matrix W and phenotype vector y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, y, 0); + + //center y, even for case/control data + cPar.pheno_mean=CenterVector(y); + + //run bvsr if rho==1 + if (cPar.rho_min==1 && cPar.rho_max==1) { + //read genotypes X (not UtX) + cPar.ReadGenotypes (UtX, G, false); + + //perform BSLMM analysis + BSLMM cBslmm; + cBslmm.CopyFromParam(cPar); + time_start=clock(); + cBslmm.MCMC(UtX, y); + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cBslmm.CopyToParam(cPar); + //else, if rho!=1 + } else { + gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); + gsl_vector *eval=gsl_vector_alloc (y->size); + gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2); + gsl_vector *Uty=gsl_vector_alloc (y->size); + + //read relatedness matrix G + if (!(cPar.file_kin).empty()) { + cPar.ReadGenotypes (UtX, G, false); + + //read relatedness matrix G + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;} + + //center matrix G + CenterMatrix (G); + } else { + cPar.ReadGenotypes (UtX, G, true); + } + + //eigen-decomposition and calculate trace_G + cout<<"Start Eigen-Decomposition..."<<endl; + time_start=clock(); + cPar.trace_G=EigenDecomp (G, U, eval, 0); + cPar.trace_G=0.0; + for (size_t i=0; i<eval->size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, y, Uty); + + //calculate REMLE/MLE estimate and pve + CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); + CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); + CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); + + cPar.PrintSummary(); + + //Creat and calcualte UtX, use a large memory + cout<<"Calculating UtX..."<<endl; + time_start=clock(); + CalcUtX (U, UtX); + cPar.time_UtX=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //perform analysis; assume X and y are already centered + BSLMMDAP cBslmmDap; + cBslmmDap.CopyFromParam(cPar); + time_start=clock(); + cBslmmDap.DAP_CalcBF (U, UtX, Uty, eval, y); + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cBslmmDap.CopyToParam(cPar); + + //release all matrices and vectors + gsl_matrix_free (G); + gsl_matrix_free (U); + gsl_matrix_free (UtW); + gsl_vector_free (eval); + gsl_vector_free (Uty); + } + + gsl_matrix_free (W); + gsl_vector_free (y); + gsl_matrix_free (UtX); + } else if (cPar.a_mode==15) { + //perform EM algorithm and estimate parameters + vector<string> vec_rs; + vector<double> vec_sa2, vec_sb2, wab; + vector<vector<vector<double> > > BF; + + //read hyp and bf files (functions defined in BSLMMDAP) + ReadFile_hyb (cPar.file_hyp, vec_sa2, vec_sb2, wab); + ReadFile_bf (cPar.file_bf, vec_rs, BF); + + cPar.ns_test=vec_rs.size(); + if (wab.size()!=BF[0][0].size()) {cout<<"error! hyp and bf files dimension do not match"<<endl;} + + //load annotations + gsl_matrix *Ac; + gsl_matrix_int *Ad; + gsl_vector_int *dlevel; + size_t kc, kd; + if (!cPar.file_cat.empty()) { + ReadFile_cat (cPar.file_cat, vec_rs, Ac, Ad, dlevel, kc, kd); + } else { + kc=0; kd=0; + } + + cout<<"## number of blocks = "<<BF.size()<<endl; + cout<<"## number of analyzed SNPs = "<<vec_rs.size()<<endl; + cout<<"## grid size for hyperparameters = "<<wab.size()<<endl; + cout<<"## number of continuous annotations = "<<kc<<endl; + cout<<"## number of discrete annotations = "<<kd<<endl; + + //DAP_EstimateHyper (const size_t kc, const size_t kd, const vector<string> &vec_rs, const vector<double> &vec_sa2, const vector<double> &vec_sb2, const vector<double> &wab, const vector<vector<vector<double> > > &BF, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel); + + //perform analysis + BSLMMDAP cBslmmDap; + cBslmmDap.CopyFromParam(cPar); + time_start=clock(); + cBslmmDap.DAP_EstimateHyper (kc, kd, vec_rs, vec_sa2, vec_sb2, wab, BF, Ac, Ad, dlevel); + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cBslmmDap.CopyToParam(cPar); + + gsl_matrix_free(Ac); + gsl_matrix_int_free(Ad); + gsl_vector_int_free(dlevel); + } else { + // + } + + } + + + + /* - //LDR + //LDR (change 14 to 16?) if (cPar.a_mode==14) { gsl_vector *y=gsl_vector_alloc (cPar.ni_test); gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); @@ -2428,6 +2618,7 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_matrix_free (G); } */ + cPar.time_total=(clock()-time_begin)/(double(CLOCKS_PER_SEC)*60.0); return; @@ -2584,7 +2775,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) outfile<<"## number of observed data = "<<cPar.np_obs<<endl; outfile<<"## number of missing data = "<<cPar.np_miss<<endl; } - if (cPar.a_mode==25 || cPar.a_mode==26 || cPar.a_mode==27 || cPar.a_mode==28 || cPar.a_mode==61 || cPar.a_mode==62 || cPar.a_mode==66 || cPar.a_mode==67) { + if (cPar.a_mode==25 || cPar.a_mode==26 || cPar.a_mode==27 || cPar.a_mode==28 || cPar.a_mode==61 || cPar.a_mode==62 || cPar.a_mode==63 || cPar.a_mode==66 || cPar.a_mode==67) { outfile<<"## number of variance components = "<<cPar.n_vc<<endl; } @@ -2604,7 +2795,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } } - if ( (cPar.a_mode==61 || cPar.a_mode==62) && cPar.file_cor.empty() && cPar.file_study.empty() && cPar.file_mstudy.empty() ) { + if ( (cPar.a_mode==61 || cPar.a_mode==62 || cPar.a_mode==63) && cPar.file_cor.empty() && cPar.file_study.empty() && cPar.file_mstudy.empty() ) { // outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl; if (cPar.n_ph==1) { outfile<<"## pve estimates = "; @@ -2799,7 +2990,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) */ - if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { + if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13 || cPar.a_mode==14 || cPar.a_mode==16) { outfile<<"## estimated mean = "<<cPar.pheno_mean<<endl; } @@ -2818,13 +3009,13 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) outfile<<"## Computation Time:"<<endl; outfile<<"## total computation time = "<<cPar.time_total<<" min "<<endl; outfile<<"## computation time break down: "<<endl; - if (cPar.a_mode==21 || cPar.a_mode==22 || cPar.a_mode==11 || cPar.a_mode==13) { + if (cPar.a_mode==21 || cPar.a_mode==22 || cPar.a_mode==11 || cPar.a_mode==13 || cPar.a_mode==14 || cPar.a_mode==16) { outfile<<"## time on calculating relatedness matrix = "<<cPar.time_G<<" min "<<endl; } if (cPar.a_mode==31) { outfile<<"## time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl; } - if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { + if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13 || cPar.a_mode==14 || cPar.a_mode==16) { outfile<<"## time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl; outfile<<"## time on calculating UtX = "<<cPar.time_UtX<<" min "<<endl; } @@ -47,9 +47,10 @@ #include "io.h" #endif + using namespace std; -bool ReadHeader_io (const string &line, HEADER &header); + //Print process bar void ProgressBar (string str, double p, double total) @@ -180,7 +181,7 @@ bool ReadFile_snps_header (const string &file_snps, set<string> &setSnps) //read header HEADER header; !safeGetline(infile, line).eof(); - ReadHeader_io (line, header); + ReadHeader (line, header); if (header.rs_col==0 && (header.chr_col==0 || header.pos_col==0) ) { cout<<"missing rs id in the hearder"<<endl; @@ -2500,7 +2501,7 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k //read header to determine which column contains which item -bool ReadHeader_io (const string &line, HEADER &header) +bool ReadHeader (const string &line, HEADER &header) { string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID","MarkerName"}; set<string> rs_set(rs_ptr, rs_ptr+11); @@ -2594,8 +2595,17 @@ bool ReadHeader_io (const string &line, HEADER &header) if (header.af_col==0) {header.af_col=header.coln+1;} else {cout<<"error! more than two af columns in the file."<<endl; n_error++;} } else if (cor_set.count(type)!=0) { if (header.cor_col==0) {header.cor_col=header.coln+1;} else {cout<<"error! more than two cor columns in the file."<<endl; n_error++;} - } else {} - + } else { + string str = ch_ptr; + string cat = str.substr(str.size()-2, 2); + // continuous + if(cat == "_c" || cat =="_C"){ + header.catc_col.insert(header.coln+1); + } else { //discrete + header.catd_col.insert(header.coln+1); + } + } + ch_ptr=strtok (NULL, " , \t"); header.coln++; } @@ -2635,7 +2645,7 @@ bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_ //read header HEADER header; !safeGetline(infile, line).eof(); - ReadHeader_io (line, header); + ReadHeader (line, header); //use the header to count the number of categories n_vc=header.coln; @@ -2715,6 +2725,118 @@ bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, siz } + + + +/* +//read the continuous category file, record mapR2catc +bool ReadFile_catc (const string &file_cat, map<string, vector<double> > &mapRS2catc, size_t &n_cat) +{ + mapRS2catc.clear(); + + igzstream infile (file_cat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open category file: "<<file_cat<<endl; return false;} + + string line; + char *ch_ptr; + + string rs, chr, a1, a0, pos, cm; + size_t i_cat;// ns_vc=0; + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + //use the header to count the number of categories + n_cat=header.coln; + if (header.rs_col!=0) {n_cat--;} + if (header.chr_col!=0) {n_cat--;} + if (header.pos_col!=0) {n_cat--;} + if (header.cm_col!=0) {n_cat--;} + if (header.a1_col!=0) {n_cat--;} + if (header.a0_col!=0) {n_cat--;} + + //set up continous category + vector<double> catc; + for (size_t i=0; i<n_cat; i++) { + catc.push_back(0); + } + + //read the following lines to record mapRS2cat + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + i_cat=0; + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + for (size_t i=0; i<header.coln; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) { + rs=ch_ptr; + } else if (header.chr_col!=0 && header.chr_col==i+1) { + chr=ch_ptr; + } else if (header.pos_col!=0 && header.pos_col==i+1) { + pos=ch_ptr; + } else if (header.cm_col!=0 && header.cm_col==i+1) { + cm=ch_ptr; + } else if (header.a1_col!=0 && header.a1_col==i+1) { + a1=ch_ptr; + } else if (header.a0_col!=0 && header.a0_col==i+1) { + a0=ch_ptr; + } else { + catc[i_cat]=atof(ch_ptr); + i_cat++; + } + + ch_ptr=strtok (NULL, " , \t"); + } + + if (mapRS2catc.count(rs)==0) {mapRS2catc[rs]=catc;} + + //if (mapRS2cat.count(rs)==0) {mapRS2cat[rs]=n_vc+1; ns_vc++;} + } + + //if (ns_vc>0) {n_vc++;} + + infile.clear(); + infile.close(); + + return true; +} + + + + +bool ReadFile_mcatc (const string &file_mcat, map<string, vector<double> > &mapRS2catc, size_t &n_cat) +{ + mapRS2catc.clear(); + + igzstream infile (file_mcat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mcategory file: "<<file_mcat<<endl; return false;} + + string file_name; + map<string, vector<double> > mapRS2catc_tmp; + size_t n_cat_tmp, t=0; + + while (!safeGetline(infile, file_name).eof()) { + mapRS2catc_tmp.clear(); + ReadFile_catc (file_name, mapRS2catc_tmp, n_cat_tmp); + mapRS2catc.insert(mapRS2catc_tmp.begin(), mapRS2catc_tmp.end()); + if (t==0) {n_cat=n_cat_tmp;} + if (n_cat!=n_cat_tmp) {cout<<"number of category differs in different mcatc files."<<endl;;} + + t++; + } + + return true; +} +*/ + + + + //read bimbam mean genotype file and calculate kinship matrix; this time, the kinship matrix is not centered, and can contain multiple K matrix bool BimbamKin (const string &file_geno, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) { @@ -3208,7 +3330,7 @@ bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vect //read header HEADER header; !safeGetline(infile, line).eof(); - ReadHeader_io (line, header); + ReadHeader (line, header); while (!safeGetline(infile, line).eof()) { if (isBlankLine(line)) {continue;} @@ -3281,7 +3403,7 @@ void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2ca //read header HEADER header; !safeGetline(infile, line).eof(); - ReadHeader_io (line, header); + ReadHeader (line, header); if (header.n_col==0 ) { if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) { @@ -3412,7 +3534,7 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA //read header HEADER header; !safeGetline(infile, line).eof(); - ReadHeader_io (line, header); + ReadHeader (line, header); if (header.n_col==0 ) { if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) { @@ -3503,7 +3625,6 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA - void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<size_t> &vec_ni, const vector<double> &vec_weight, const vector<double> &vec_z2, gsl_matrix *Vq, gsl_vector *q, gsl_vector *s) { gsl_matrix_set_zero (Vq); @@ -3845,6 +3966,21 @@ void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector return; } + +//copied from lmm.cpp; is used in the following function compKtoV +//map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 +size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { + if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<<endl; return 0;} + size_t index; + size_t l, h; + if (b>a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + //read reference file void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) { @@ -83,6 +83,9 @@ bool ReadHeader (const string &line, HEADER &header); bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_t &n_vc); bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, size_t &n_vc); +bool ReadFile_catc (const string &file_cat, map<string, vector<double> > &mapRS2catc, size_t &n_cat); +bool ReadFile_mcatc (const string &file_mcat, map<string, vector<double> > &mapRS2catc, size_t &n_cat); + bool BimbamKin (const string &file_geno, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int display_pace, const vector<int> &indicator_idv, const vector<vector<int> > &mindicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<vector<SNPINFO> > &msnpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); diff --git a/src/lmm.cpp b/src/lmm.cpp index a707534..044f33c 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -183,6 +183,30 @@ void LMM::WriteFiles () return; } + + + + + + + + + + +//map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 +size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { + if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<<endl; return 0;} + size_t index; + size_t l, h; + if (b>a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + + void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval, const gsl_matrix *Uab, const gsl_vector *ab, gsl_matrix *Pab) { size_t index_ab, index_aw, index_bw, index_ww; diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp index 915245b..4417f8a 100644 --- a/src/mathfunc.cpp +++ b/src/mathfunc.cpp @@ -165,6 +165,33 @@ void CenterMatrix (gsl_matrix *G, const gsl_matrix *W) } +//standardize the matrix G such that all diagonal elements = 1 +void StandardizeMatrix (gsl_matrix *G) +{ + double d=0.0; + vector<double> vec_d; + + for (size_t i=0; i<G->size1; ++i) { + vec_d.push_back(gsl_matrix_get(G, i, i)); + } + for (size_t i=0; i<G->size1; ++i) { + for (size_t j=i; j<G->size2; ++j) { + if (j==i) { + gsl_matrix_set(G, i, j, 1); + } else { + d=gsl_matrix_get(G, i, j); + d/=sqrt(vec_d[i]*vec_d[j]); + gsl_matrix_set(G, i, j, d); + gsl_matrix_set(G, j, i, d); + } + } + } + + return; +} + + + //scale the matrix G such that the mean diagonal = 1 double ScaleMatrix (gsl_matrix *G) { diff --git a/src/mathfunc.h b/src/mathfunc.h index 98c0e35..84f3186 100644 --- a/src/mathfunc.h +++ b/src/mathfunc.h @@ -30,6 +30,7 @@ double VectorVar (const gsl_vector *v); void CenterMatrix (gsl_matrix *G); void CenterMatrix (gsl_matrix *G, const gsl_vector *w); void CenterMatrix (gsl_matrix *G, const gsl_matrix *W); +void StandardizeMatrix (gsl_matrix *G); double ScaleMatrix (gsl_matrix *G); double CenterVector (gsl_vector *y); void CenterVector (gsl_vector *y, const gsl_matrix *W); diff --git a/src/param.cpp b/src/param.cpp index 0a63a16..4b8c3a4 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -57,6 +57,7 @@ pheno_mean(0), noconstrain (false), h_min(-1), h_max(-1), h_scale(-1), rho_min(0.0), rho_max(1.0), rho_scale(-1), logp_min(0.0), logp_max(0.0), logp_scale(-1), +h_ngrid(10), rho_ngrid(10), s_min(0), s_max(300), w_step(100000), s_step(1000000), r_pace(10), w_pace(1000), @@ -66,7 +67,7 @@ geo_mean(2000.0), randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200), error(false), -ni_subsample(0), n_cvt(1), n_vc(1), +ni_subsample(0), n_cvt(1), n_vc(1), n_cat(0), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} @@ -76,7 +77,14 @@ time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), tim void PARAM::ReadFiles (void) { string file_str; - + /* + //read continuous cat file + if (!file_mcatc.empty()) { + if (ReadFile_mcatc (file_mcatc, mapRS2catc, n_cat)==false) {error=true;} + } else if (!file_catc.empty()) { + if (ReadFile_catc (file_catc, mapRS2catc, n_cat)==false) {error=true;} + } + */ //read cat file if (!file_mcat.empty()) { if (ReadFile_mcat (file_mcat, mapRS2cat, n_vc)==false) {error=true;} @@ -398,7 +406,7 @@ void PARAM::CheckParam (void) //check parameters if (k_mode!=1 && k_mode!=2) {cout<<"error! unknown kinship/relatedness input mode: "<<k_mode<<endl; error=true;} - if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=14 && a_mode!=21 && a_mode!=22 && a_mode!=25 && a_mode!=26 && a_mode!=27 && a_mode!=28 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61 && a_mode!=62 && a_mode!=66 && a_mode!=67 && a_mode!=71) + if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=14 && a_mode!=15 && a_mode!=21 && a_mode!=22 && a_mode!=25 && a_mode!=26 && a_mode!=27 && a_mode!=28 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61 && a_mode!=62 && a_mode!=63 && a_mode!=66 && a_mode!=67 && a_mode!=71) {cout<<"error! unknown analysis mode: "<<a_mode<<". make sure -gk or -eigen or -lmm or -bslmm -predict or -calccov is sepcified correctly."<<endl; error=true;} if (miss_level>1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<<miss_level<<endl; error=true;} if (maf_level>0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<<maf_level<<endl; error=true;} @@ -550,7 +558,7 @@ void PARAM::CheckParam (void) // WJA added if (!file_oxford.empty()) {flag++;} - if (flag!=1 && a_mode!=27 && a_mode!=28 && a_mode!=43 && a_mode!=5 && a_mode!=61 && a_mode!=62 && a_mode!=66 && a_mode!=67) { + if (flag!=1 && a_mode!=15 && a_mode!=27 && a_mode!=28 && a_mode!=43 && a_mode!=5 && a_mode!=61 && a_mode!=62 && a_mode!=63 && a_mode!=66 && a_mode!=67) { cout<<"error! either plink binary files, or bimbam mean genotype files, or gene expression files are required."<<endl; error=true; } @@ -578,6 +586,16 @@ void PARAM::CheckParam (void) } } + + if (a_mode==63) { + if (file_kin.empty() && (file_ku.empty()||file_kd.empty()) && file_mk.empty() ) { + cout<<"error! missing relatedness file. "<<endl; error=true; + } + if ( file_pheno.empty() ) { + cout<<"error! missing phenotype file."<<endl; error=true; + } + } + if (a_mode==66 || a_mode==67) { if (file_beta.empty() || ( file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() && file_geno.empty()) ) { cout<<"error! missing beta file or genotype file."<<endl; error=true; @@ -630,7 +648,7 @@ void PARAM::CheckParam (void) if ((a_mode==43) && file_kin.empty()) {cout<<"error! missing relatedness file. -predict option requires -k option to provide a relatedness file."<<endl; error=true;} - if ((a_mode==11 || a_mode==12 || a_mode==13) && !file_cvt.empty() ) {cout<<"error! -bslmm option does not support covariates files."<<endl; error=true;} + if ((a_mode==11 || a_mode==12 || a_mode==13 || a_mode==14 || a_mode==16) && !file_cvt.empty() ) {cout<<"error! -bslmm option does not support covariates files."<<endl; error=true;} if (a_mode==41 || a_mode==42) { if (!file_cvt.empty() ) {cout<<"error! -predict option does not support covariates files."<<endl; error=true;} @@ -738,7 +756,7 @@ void PARAM::CheckData (void) { } } */ - if (ni_test==0 && file_cor.empty() && file_mstudy.empty() && file_study.empty() && file_beta.empty() ) { + if (ni_test==0 && file_cor.empty() && file_mstudy.empty() && file_study.empty() && file_beta.empty() && file_bf.empty() ) { error=true; cout<<"error! number of analyzed individuals equals 0. "<<endl; return; @@ -759,7 +777,7 @@ void PARAM::CheckData (void) { } //output some information - if (file_cor.empty() && file_mstudy.empty() && file_study.empty() && a_mode!=27 && a_mode!=28) { + if (file_cor.empty() && file_mstudy.empty() && file_study.empty() && a_mode!=15 && a_mode!=27 && a_mode!=28) { cout<<"## number of total individuals = "<<ni_total<<endl; if (a_mode==43) { cout<<"## number of analyzed individuals = "<<ni_cvt<<endl; @@ -806,7 +824,7 @@ void PARAM::CheckData (void) { //set parameters for BSLMM //and check for predict - if (a_mode==11 || a_mode==12 || a_mode==13) { + if (a_mode==11 || a_mode==12 || a_mode==13 || a_mode==14) { if (a_mode==11) {n_mh=1;} if (logp_min==0) {logp_min=-1.0*log((double)ns_test);} @@ -1598,7 +1616,7 @@ void PARAM::ProcessCvtPhen () } //check ni_test - if (ni_test==0) { + if (ni_test==0 && a_mode!=15) { error=true; cout<<"error! number of analyzed individuals equals 0. "<<endl; return; @@ -1641,6 +1659,54 @@ void PARAM::CopyCvt (gsl_matrix *W) return; } +/* +//if there is a column contains all 1's, then flag==1; otherwise flag=0 +void PARAM::CopyA (size_t flag, gsl_matrix *A) +{ + size_t ci_test=0; + string rs; + vector<size_t> flag_vec; + vector<double> catc; + + for (size_t j=0; j<n_cat; j++) { + flag_vec.push_back(0); + } + + for (vector<int>::size_type i=0; i<indicator_snp.size(); ++i) { + if (indicator_snp[i]==0) {continue;} + rs=snpInfo[i].rs_number; + + if (mapRS2catc.count(rs)==0) { + for (size_t j=0; j<n_cat; j++) { + gsl_matrix_set (A, ci_test, j, 0); + } + } else { + for (size_t j=0; j<n_cat; j++) { + gsl_matrix_set (A, ci_test, j, mapRS2catc[rs][j]); + } + } + + if (ci_test==0) { + for (size_t j=0; j<n_cat; j++) { + catc.push_back(mapRS2catc[rs][j]); + } + } else { + for (size_t j=0; j<n_cat; j++) { + if (catc[j]==mapRS2catc[rs][j]) {flag_vec[j]++;}; + } + } + + ci_test++; + } + + flag=0; + for (size_t j=0; j<n_cat; j++) { + if (flag_vec[j]==0) {flag++;} + } + + return; +} +*/ void PARAM::CopyGxe (gsl_vector *env) { diff --git a/src/param.h b/src/param.h index 8db3590..6cb0d97 100644 --- a/src/param.h +++ b/src/param.h @@ -109,6 +109,8 @@ public: size_t ws_col; size_t cor_col; size_t coln;//number of columns + set<size_t> catc_col; + set<size_t> catd_col; }; @@ -129,6 +131,7 @@ public: string file_gxe; //optional string file_cvt; //optional string file_cat, file_mcat; + string file_catc, file_mcatc; string file_var; string file_beta; string file_cor; @@ -138,6 +141,7 @@ public: string file_ref, file_mref; string file_weight, file_wsnp, file_wcat; string file_out; + string file_bf, file_hyp; string path_out; @@ -194,6 +198,7 @@ public: double h_min, h_max, h_scale; //priors for h double rho_min, rho_max, rho_scale; //priors for rho double logp_min, logp_max, logp_scale; //priors for log(pi) + size_t h_ngrid, rho_ngrid; size_t s_min, s_max; //minimum and maximum number of gammas size_t w_step; //number of warm up/burn in iterations size_t s_step; //number of sampling iterations @@ -225,6 +230,7 @@ public: size_t ni_subsample; //number of subsampled individuals //size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel size_t n_cvt; //number of covariates + size_t n_cat; //number of continuous categories size_t n_ph; //number of phenotypes size_t n_vc; //number of variance components (including the diagonal matrix) double time_total; //record total time @@ -262,6 +268,7 @@ public: map<string, double> mapRS2cM; //map rs# to cM map<string, double> mapRS2est; //map rs# to parameters map<string, size_t> mapRS2cat; //map rs# to category number + map<string, vector<double> > mapRS2catc; //map rs# to continuous categories map<string, double> mapRS2wsnp; //map rs# to snp weights map<string, vector<double> > mapRS2wcat; //map rs# to snp cat weights @@ -281,6 +288,7 @@ public: void ReadGenotypes (vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K); void CheckCvt (); void CopyCvt (gsl_matrix *W); + void CopyA (size_t flag, gsl_matrix *A); void CopyGxe (gsl_vector *gxe); void CopyWeight (gsl_vector *w); void ProcessCvtPhen(); @@ -299,7 +307,6 @@ public: void UpdateSNP (const map<string, double> &mapRS2wA); }; -size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt); - + #endif @@ -453,7 +453,7 @@ int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, g //read header to determine which column contains which item -bool ReadHeader_vc (const string &line, HEADER &header) +bool ReadHeader (const string &line, HEADER &header) { string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID"}; set<string> rs_set(rs_ptr, rs_ptr+10); @@ -586,7 +586,7 @@ void ReadFile_cor (const string &file_cor, const set<string> &setSnps, vector<st //header !safeGetline(infile, line).eof(); - ReadHeader_vc (line, header); + ReadHeader (line, header); if (header.n_col==0 ) { if (header.nobs_col==0 && header.nmis_col==0) { @@ -700,7 +700,7 @@ void ReadFile_beta (const bool flag_priorscale, const string &file_beta, const m //read header HEADER header; !safeGetline(infile, line).eof(); - ReadHeader_vc (line, header); + ReadHeader (line, header); if (header.n_col==0 ) { if (header.nobs_col==0 && header.nmis_col==0) { @@ -869,7 +869,7 @@ void ReadFile_cor (const string &file_cor, const vector<string> &vec_rs, const v HEADER header; !safeGetline(infile, line).eof(); - ReadHeader_vc (line, header); + ReadHeader (line, header); while (!safeGetline(infile, line).eof()) { //do not read cor values this time; upto col_n-1 @@ -1059,6 +1059,25 @@ void ReadFile_cor (const string &file_cor, const vector<string> &vec_rs, const v return; } + + + + +//copied from lmm.cpp; is used in the following function VCss +//map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 +size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { + if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<<endl; return 0;} + size_t index; + size_t l, h; + if (b>a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + + //use the new method to calculate variance components with summary statistics //first, use a function CalcS to compute S matrix (where the diagonal elements are part of V(q) ), and then use bootstrap to compute the variance for S, use a set of genotypes, phenotypes, and individual ids, and snp category label void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich) { @@ -1336,7 +1355,7 @@ void VC::CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y gsl_vector_set(q_vec, i, d); } - //compuate yKrKKry, which is used later for confidence interval + //compute yKrKKry, which is used later for confidence interval for (size_t i=0; i<n_vc; i++) { gsl_vector_const_view Kry_coli=gsl_matrix_const_column (Kry, i); for (size_t j=i; j<n_vc; j++) { @@ -1842,6 +1861,277 @@ void VC::CalcVCreml (bool noconstrain, const gsl_matrix *K, const gsl_matrix *W, +//Ks are not scaled; +void VC::CalcVCacl (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) +{ + size_t n1=K->size1, n2=K->size2; + size_t n_vc=n2/n1; + + double d, y2_sum, tau_inv, se_tau_inv; + + //new matrices/vectors + gsl_matrix *K_scale=gsl_matrix_alloc (n1, n2); + gsl_vector *y_scale=gsl_vector_alloc (n1); + gsl_vector *y2=gsl_vector_alloc (n1); + gsl_vector *n1_vec=gsl_vector_alloc (n1); + gsl_matrix *Ay=gsl_matrix_alloc (n1, n_vc); + gsl_matrix *K2=gsl_matrix_alloc (n1, n_vc*n_vc); + gsl_matrix *K_tmp=gsl_matrix_alloc (n1, n1); + gsl_matrix *V_mat=gsl_matrix_alloc (n1, n1); + + //old matrices/vectors + gsl_vector *pve=gsl_vector_alloc (n_vc); + gsl_vector *se_pve=gsl_vector_alloc (n_vc); + gsl_vector *q_vec=gsl_vector_alloc (n_vc); + gsl_matrix *S1=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *S2=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *S_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *J_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); + + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (n_vc); + + //center and scale K by W + //and standardize K further so that all diagonal elements are 1 + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_view Kscale_sub = gsl_matrix_submatrix (K_scale, 0, n1*i, n1, n1); + gsl_matrix_const_view K_sub = gsl_matrix_const_submatrix (K, 0, n1*i, n1, n1); + gsl_matrix_memcpy (&Kscale_sub.matrix, &K_sub.matrix); + + CenterMatrix (&Kscale_sub.matrix, W); + StandardizeMatrix (&Kscale_sub.matrix); + } + + //center y by W, and standardize it to have variance 1 (t(y)%*%y/n=1) + gsl_vector_memcpy (y_scale, y); + CenterVector (y_scale, W); + // StandardizeVector (y_scale); + + //compute y^2 and sum(y^2), which is also the variance of y*n1 + gsl_vector_memcpy (y2, y_scale); + gsl_vector_mul (y2, y_scale); + + y2_sum=0; + for (size_t i=0; i<y2->size; i++) { + y2_sum+=gsl_vector_get(y2, i); + } + + //compute the n_vc size q vector + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_const_view Kscale_sub = gsl_matrix_const_submatrix (K_scale, 0, n1*i, n1, n1); + + gsl_blas_dgemv(CblasNoTrans, 1.0, &Kscale_sub.matrix, y_scale, 0.0, n1_vec); + + gsl_blas_ddot (n1_vec, y_scale, &d); + gsl_vector_set(q_vec, i, d-y2_sum); + } + + //compute the n_vc by n_vc S1 and S2 matrix (and eventually S=S1-\tau^{-1}S2) + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_const_view Kscale_sub1 = gsl_matrix_const_submatrix (K_scale, 0, n1*i, n1, n1); + + for (size_t j=i; j<n_vc; j++) { + gsl_matrix_const_view Kscale_sub2 = gsl_matrix_const_submatrix (K_scale, 0, n1*j, n1, n1); + + gsl_matrix_memcpy (K_tmp, &Kscale_sub1.matrix); + gsl_matrix_mul_elements (K_tmp, &Kscale_sub2.matrix); + + gsl_vector_set_zero(n1_vec); + for (size_t t=0; t<K_tmp->size1; t++) { + gsl_vector_view Ktmp_col=gsl_matrix_column (K_tmp, t); + gsl_vector_add (n1_vec, &Ktmp_col.vector); + } + gsl_vector_add_constant (n1_vec, -1.0); + + //compute S1 + gsl_blas_ddot (n1_vec, y2, &d); + gsl_matrix_set (S1, i, j, 2*d); + if (i!=j) {gsl_matrix_set (S1, j, i, 2*d);} + + //compute S2 + d=0; + for (size_t t=0; t<n1_vec->size; t++) { + d+=gsl_vector_get (n1_vec, t); + } + gsl_matrix_set (S2, i, j, d); + if (i!=j) {gsl_matrix_set (S2, j, i, d);} + + //save information to compute J + gsl_vector_view K2col1=gsl_matrix_column (K2, n_vc*i+j); + gsl_vector_view K2col2=gsl_matrix_column (K2, n_vc*j+i); + + gsl_vector_memcpy(&K2col1.vector, n1_vec); + if (i!=j) {gsl_vector_memcpy(&K2col2.vector, n1_vec);} + } + } + + //iterate to solve tau and h's + size_t it=0; + double s=1; + while (abs(s)>1e-3 && it<100) { + //update tau_inv + gsl_blas_ddot (q_vec, pve, &d); + if (it>0) {s=y2_sum/(double)n1-d/((double)n1*((double)n1-1))-tau_inv;} + tau_inv=y2_sum/(double)n1-d/((double)n1*((double)n1-1)); + if (it>0) {s/=tau_inv;} + + //update S + gsl_matrix_memcpy (S_mat, S2); + gsl_matrix_scale (S_mat, -1*tau_inv); + gsl_matrix_add (S_mat, S1); + + //update h=S^{-1}q + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (n_vc); + LUDecomp (S_mat, pmt, &sig); + LUInvert (S_mat, pmt, Si_mat); + gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve); + + //cout<<tau_inv<<endl; + it++; + } + + //compute V matrix and A matrix (K_scale is destroyed, so need to compute V first) + gsl_matrix_set_zero (V_mat); + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_view Kscale_sub = gsl_matrix_submatrix (K_scale, 0, n1*i, n1, n1); + + //compute V + gsl_matrix_memcpy (K_tmp, &Kscale_sub.matrix); + gsl_matrix_scale (K_tmp, gsl_vector_get(pve, i)); + gsl_matrix_add (V_mat, K_tmp); + + //compute A; the corresponding Kscale is destroyed + gsl_matrix_const_view K2_sub = gsl_matrix_const_submatrix (K2, 0, n_vc*i, n1, n_vc); + gsl_blas_dgemv (CblasNoTrans, 1.0, &K2_sub.matrix, pve, 0.0, n1_vec); + + for (size_t t=0; t<n1; t++) { + gsl_matrix_set (K_scale, t, n1*i+t, gsl_vector_get(n1_vec, t) ); + } + + //compute Ay + gsl_vector_view Ay_col=gsl_matrix_column (Ay, i); + gsl_blas_dgemv(CblasNoTrans, 1.0, &Kscale_sub.matrix, y_scale, 0.0, &Ay_col.vector); + } + gsl_matrix_scale (V_mat, tau_inv); + + //compute J matrix + for (size_t i=0; i<n_vc; i++) { + gsl_vector_view Ay_col1=gsl_matrix_column (Ay, i); + gsl_blas_dgemv(CblasNoTrans, 1.0, V_mat, &Ay_col1.vector, 0.0, n1_vec); + + for (size_t j=i; j<n_vc; j++) { + gsl_vector_view Ay_col2=gsl_matrix_column (Ay, j); + + gsl_blas_ddot (&Ay_col2.vector, n1_vec, &d); + gsl_matrix_set (J_mat, i, j, 2.0*d); + if (i!=j) {gsl_matrix_set (J_mat, j, i, 2.0*d);} + } + } + + //compute H^{-1}JH^{-1} as V(\hat h), where H=S2*tau_inv; this is stored in Var_mat + gsl_matrix_memcpy (S_mat, S2); + gsl_matrix_scale (S_mat, tau_inv); + + LUDecomp (S_mat, pmt, &sig); + LUInvert (S_mat, pmt, Si_mat); + + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, J_mat, 0.0, S_mat); + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, S_mat, Si_mat, 0.0, Var_mat); + + //compute variance for tau_inv + gsl_blas_dgemv(CblasNoTrans, 1.0, V_mat, y_scale, 0.0, n1_vec); + gsl_blas_ddot (y_scale, n1_vec, &d); + se_tau_inv=sqrt(2*d)/(double)n1; + + //transform pve back to the original scale and save data + v_pve.clear(); v_se_pve.clear(); + v_sigma2.clear(); v_se_sigma2.clear(); + + pve_total=0, se_pve_total=0; + for (size_t i=0; i<n_vc; i++) { + d=gsl_vector_get (pve, i); + pve_total+=d; + + v_pve.push_back(d); + v_sigma2.push_back(d*tau_inv/v_traceG[i] ); + + d=sqrt(gsl_matrix_get (Var_mat, i, i)); + v_se_pve.push_back(d); + v_se_sigma2.push_back(d*tau_inv/v_traceG[i]); + + //d*=sqrt(var_y/v_traceG[i]-v_sigma2[i]); + //v_se_pve.push_back(d/var_y); + + for (size_t j=0; j<n_vc; j++) { + se_pve_total+=gsl_matrix_get(Var_mat, i, j); + } + } + v_sigma2.push_back( (1-pve_total)*tau_inv ); + v_se_sigma2.push_back(sqrt(se_pve_total)*tau_inv ); + se_pve_total=sqrt(se_pve_total); + + cout<<"sigma2 = "; + for (size_t i=0; i<n_vc+1; i++) { + cout<<v_sigma2[i]<<" "; + } + cout<<endl; + + cout<<"se(sigma2) = "; + for (size_t i=0; i<n_vc+1; i++) { + cout<<v_se_sigma2[i]<<" "; + } + cout<<endl; + + cout<<"pve = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_pve[i]<<" "; + } + cout<<endl; + + cout<<"se(pve) = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_se_pve[i]<<" "; + } + cout<<endl; + + if (n_vc>1) { + cout<<"total pve = "<<pve_total<<endl; + cout<<"se(total pve) = "<<se_pve_total<<endl; + } + + gsl_permutation_free(pmt); + + gsl_matrix_free(K_scale); + gsl_vector_free(y_scale); + gsl_vector_free(y2); + gsl_vector_free(n1_vec); + gsl_matrix_free(Ay); + gsl_matrix_free(K2); + gsl_matrix_free(K_tmp); + gsl_matrix_free(V_mat); + + gsl_vector_free(pve); + gsl_vector_free(se_pve); + gsl_vector_free(q_vec); + gsl_matrix_free(S1); + gsl_matrix_free(S2); + gsl_matrix_free(S_mat); + gsl_matrix_free(Si_mat); + gsl_matrix_free(J_mat); + gsl_matrix_free(Var_mat); + + return; +} + + + + + + + //read bimbam mean genotype file and compute XWz bool BimbamXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz) { @@ -95,6 +95,7 @@ public: void WriteFile_qs (const gsl_vector *s_vec, const gsl_vector *q_vec, const gsl_vector *qvar_vec, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat); void CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); void CalcVCreml (const bool noconstrain, const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); + void CalcVCacl (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); }; void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich); |