From 943e970c9cbc184dcca679fbe455f48c32242cdc Mon Sep 17 00:00:00 2001 From: xiangzhou Date: Mon, 23 May 2016 17:05:35 -0400 Subject: version 0.95alpha --- src/gemma.cpp | 1840 ++++++++++++++++++++++++++++++++---------- src/io.cpp | 1224 +++++++++++++++++++++++----- src/io.h | 24 +- src/lm.cpp | 24 +- src/lmm.cpp | 267 ++++-- src/mathfunc.cpp | 18 +- src/mvlmm.cpp | 451 +++++++---- src/param.cpp | 878 ++++++++++++++------ src/param.h | 42 +- src/vc.cpp | 2376 ++++++++++++++++++++++++++++++++++++++++++++++++++---- src/vc.h | 41 +- 11 files changed, 5854 insertions(+), 1331 deletions(-) (limited to 'src') diff --git a/src/gemma.cpp b/src/gemma.cpp index b8693a8..3b9fe29 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -39,9 +39,11 @@ #include "vc_float.h" #include "lm_float.h" //for LM class #include "bslmm_float.h" //for BSLMM class +#include "ldr_float.h" //for LDR class #include "lmm_float.h" //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe #include "mvlmm_float.h" //for MVLMM class #include "prdt_float.h" //for PRDT class +#include "varcov_float.h" //for MVLMM class #include "mathfunc_float.h" //for a few functions #else #include "io.h" @@ -49,9 +51,11 @@ #include "vc.h" #include "lm.h" #include "bslmm.h" +#include "ldr.h" #include "lmm.h" #include "mvlmm.h" #include "prdt.h" +#include "varcov.h" #include "mathfunc.h" #endif @@ -60,26 +64,23 @@ using namespace std; -GEMMA::GEMMA(void): -version("0.95alpha"), date("08/08/2014"), year("2011") +GEMMA::GEMMA(void): +version("0.95alpha"), date("07/11/2015"), year("2011") {} void GEMMA::PrintHeader (void) { cout< indicator_all; size_t c_bv=0; @@ -854,13 +1136,13 @@ void GEMMA::BatchRun (PARAM &cPar) indicator_all.push_back(1); if (cPar.indicator_bv[i]==1) {gsl_vector_set(u_hat, c_bv, cPar.vec_bv[i]); c_bv++;} } - + ReadFile_kin (cPar.file_kin, indicator_all, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1, cPar.n_cvt); + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); - gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); gsl_vector *eval=gsl_vector_alloc (Y->size1); - + gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph); gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt); //set covariates matrix W and phenotype matrix Y - //an intercept should be included in W, + //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); cPar.CopyCvtPhen (W_full, Y_full, 1); - - gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph); - gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1); + + gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph); + gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1); gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2); - + //read relatedness matrix G, and matrix G_full ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { @@ -937,8 +1219,8 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.trace_G+=gsl_vector_get (eval, i); } cPar.trace_G/=(double)eval->size; - cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, Y, UtY); @@ -948,7 +1230,7 @@ void GEMMA::BatchRun (PARAM &cPar) if (cPar.n_ph==1) { gsl_vector *beta=gsl_vector_alloc (W->size2); gsl_vector *se_beta=gsl_vector_alloc (W->size2); - + double lambda, logl, vg, ve; gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); @@ -959,29 +1241,29 @@ void GEMMA::BatchRun (PARAM &cPar) cout<<"REMLE estimate for vg in the null model = "<size1; i++) { for (size_t j=0; j<=i; j++) { @@ -1004,110 +1286,250 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) ); } } - + //obtain Y_hat from fixed effects gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat); - + //obtain H KroneckerSym(G_full, Vg, H_full); for (size_t i=0; isize1; i++) { gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2); gsl_matrix_add (&H_sub.matrix, Ve); } - - //free matrices + + //free matrices gsl_matrix_free (Vg); gsl_matrix_free (Ve); gsl_matrix_free (B); gsl_matrix_free (se_B); } - + PRDT cPRDT; - + cPRDT.CopyFromParam(cPar); - + cout<<"Predicting Missing Phentypes ... "< setSnps_beta; + map mapRS2wA, mapRS2wK; + + cPar.ObtainWeight(setSnps_beta, mapRS2wK); + + time_start=clock(); + cPar.CalcS (mapRS2wA, mapRS2wK, W, A, K, &S_mat.matrix, &Svar_mat.matrix, &ns_vec.vector); + cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + if (cPar.error==true) {cout<<"error! fail to calculate the S matrix. "< vec_cat, vec_ni; + vector vec_weight, vec_z2; + map mapRS2weight; + mapRS2weight.clear(); + + time_start=clock(); + ReadFile_beta (cPar.file_beta, cPar.mapRS2cat, mapRS2weight, vec_cat, vec_ni, vec_weight, vec_z2, cPar.ni_total, cPar.ns_total, cPar.ns_test); + cout<<"## number of total individuals = "<size1, cPar.n_cvt); - - //set covariates matrix W and phenotype matrix Y - //an intercept should be included in W, + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); - + //Fit LM or mvLM - if (cPar.n_ph==1) { + if (cPar.n_ph==1) { LM cLm; cLm.CopyFromParam(cPar); - + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); - - if (!cPar.file_gene.empty()) { + + if (!cPar.file_gene.empty()) { cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype } else if (!cPar.file_bfile.empty()) { cLm.AnalyzePlink (W, &Y_col.vector); + } else if (!cPar.file_oxford.empty()) { + cLm.Analyzebgen (W, &Y_col.vector); } else { cLm.AnalyzeBimbam (W, &Y_col.vector); } - + cLm.WriteFiles(); cLm.CopyToParam(cPar); } /* - else { + else { MVLM cMvlm; - cMvlm.CopyFromParam(cPar); - + cMvlm.CopyFromParam(cPar); + if (!cPar.file_bfile.empty()) { cMvlm.AnalyzePlink (W, Y); } else { cMvlm.AnalyzeBimbam (W, Y); } - + cMvlm.WriteFiles(); cMvlm.CopyToParam(cPar); } @@ -1115,27 +1537,202 @@ void GEMMA::BatchRun (PARAM &cPar) //release all matrices and vectors gsl_matrix_free (Y); gsl_matrix_free (W); - } + } //VC estimation with one or multiple kinship matrices //REML approach only //if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp - //for one phenotype only; - if (cPar.a_mode==61) { + //for one phenotype only; + if (cPar.a_mode==61 || cPar.a_mode==62) { + if (!cPar.file_beta.empty() ) { + //need to obtain a common set of SNPs between beta file and the genotype file; these are saved in mapRS2wA and mapRS2wK + //normalize the weight in mapRS2wK to have an average of one; each element of mapRS2wA is 1 + //update indicator_snps, so that the numbers are in accordance with mapRS2wK + set setSnps_beta; + ReadFile_snps_header (cPar.file_beta, setSnps_beta); + + map mapRS2wA, mapRS2wK; + cPar.ObtainWeight(setSnps_beta, mapRS2wK); + + cPar.UpdateSNP (mapRS2wK); + + //setup matrices and vectors + gsl_matrix *S=gsl_matrix_alloc (cPar.n_vc*2, cPar.n_vc); + gsl_matrix *Vq=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc); + gsl_vector *q=gsl_vector_alloc (cPar.n_vc); + gsl_vector *s=gsl_vector_alloc (cPar.n_vc+1); + + gsl_matrix *K=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test); + gsl_matrix *A=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test); + + gsl_vector *y=gsl_vector_alloc (cPar.ni_test); + gsl_matrix *W=gsl_matrix_alloc (cPar.ni_test, cPar.n_cvt); + + gsl_matrix_set_zero (K); + gsl_matrix_set_zero (A); + + gsl_matrix_set_zero(S); + gsl_matrix_set_zero(Vq); + gsl_vector_set_zero (q); + gsl_vector_set_zero (s); + + cPar.CopyCvtPhen (W, y, 0); + + gsl_matrix_view S_mat=gsl_matrix_submatrix(S, 0, 0, cPar.n_vc, cPar.n_vc); + gsl_matrix_view Svar_mat=gsl_matrix_submatrix (S, cPar.n_vc, 0, cPar.n_vc, cPar.n_vc); + gsl_vector_view s_vec=gsl_vector_subvector(s, 0, cPar.n_vc); + + vector vec_cat, vec_ni; + vector vec_weight, vec_z2; + + //read beta, based on the mapRS2wK + ReadFile_beta (cPar.file_beta, cPar.mapRS2cat, mapRS2wK, vec_cat, vec_ni, vec_weight, vec_z2, cPar.ni_study, cPar.ns_study, cPar.ns_test); + + cout<<"Study Panel: "<size1, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc ); - //set covariates matrix W and phenotype matrix Y - //an intercept should be included in W, + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); //read kinship matrices if (!(cPar.file_mk).empty()) { ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; - cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } else { ReadFile_eigenU (cPar.file_ku, cPar.error, U); if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) { if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} @@ -1202,7 +1799,7 @@ void GEMMA::BatchRun (PARAM &cPar) if (cPar.n_ph==1) { // if (cPar.n_vc==1) { /* - //calculate UtW and Uty + //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, Y, UtY); @@ -1228,10 +1825,10 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); } - + CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); cPar.PrintSummary(); - + //calculate and output residuals if (cPar.a_mode==5) { gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); @@ -1239,11 +1836,11 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector *u_hat=gsl_vector_alloc (Y->size1); gsl_vector *e_hat=gsl_vector_alloc (Y->size1); gsl_vector *y_hat=gsl_vector_alloc (Y->size1); - + //obtain Utu and Ute gsl_vector_memcpy (y_hat, &UtY_col.vector); gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); - + double d, u, e; for (size_t i=0; isize; i++) { d=gsl_vector_get (eval, i); @@ -1252,37 +1849,210 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector_set (Utu_hat, i, u); gsl_vector_set (Ute_hat, i, e); } - + //obtain u and e gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); - - //output residuals + + //output residuals cPar.WriteVector(u_hat, "residU"); cPar.WriteVector(e_hat, "residE"); - + gsl_vector_free(u_hat); gsl_vector_free(e_hat); gsl_vector_free(y_hat); - } -*/ + } +*/ // } else { gsl_vector_view Y_col=gsl_matrix_column (Y, 0); VC cVc; - cVc.CopyFromParam(cPar); - cVc.CalcVCreml (G, W, &Y_col.vector); + cVc.CopyFromParam(cPar); + if (cPar.a_mode==61) { + cVc.CalcVChe (G, W, &Y_col.vector); + } else { + cVc.CalcVCreml (cPar.noconstrain, G, W, &Y_col.vector); + } cVc.CopyToParam(cPar); - //obtain pve from sigma2 //obtain se_pve from se_sigma2 - + //} - } + } + } + + } + - + //compute confidence intervals with additional summary statistics + //we do not check the sign of z-scores here, but they have to be matched with the genotypes + if (cPar.a_mode==66 || cPar.a_mode==67) { + //read reference file first + gsl_matrix *S=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc); + gsl_matrix *Svar=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc); + gsl_vector *s_ref=gsl_vector_alloc (cPar.n_vc); + + gsl_matrix_set_zero(S); + gsl_matrix_set_zero(Svar); + gsl_vector_set_zero(s_ref); + + if (!cPar.file_ref.empty()) { + ReadFile_ref(cPar.file_ref, S, Svar, s_ref, cPar.ni_ref); + } else { + ReadFile_mref(cPar.file_mref, S, Svar, s_ref, cPar.ni_ref); + } + + //need to obtain a common set of SNPs between beta file and the genotype file; these are saved in mapRS2wA and mapRS2wK + //normalize the weight in mapRS2wK to have an average of one; each element of mapRS2wA is 1 + set setSnps_beta; + ReadFile_snps_header (cPar.file_beta, setSnps_beta); + + //obtain the weights for wA, which contains the SNP weights for SNPs used in the model + map mapRS2wK; + cPar.ObtainWeight(setSnps_beta, mapRS2wK); + + //set up matrices and vector + gsl_matrix *Xz=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc); + gsl_matrix *XWz=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc); + gsl_matrix *XtXWz=gsl_matrix_alloc (mapRS2wK.size(), cPar.n_vc*cPar.n_vc); + gsl_vector *w=gsl_vector_alloc (mapRS2wK.size()); + gsl_vector *w1=gsl_vector_alloc (mapRS2wK.size()); + gsl_vector *z=gsl_vector_alloc (mapRS2wK.size()); + gsl_vector *s_vec=gsl_vector_alloc (cPar.n_vc); + + vector vec_cat, vec_size; + vector vec_z; + + map mapRS2z, mapRS2wA; + map mapRS2A1; + string file_str; + + //update s_vec, the number of snps in each category + for (size_t i=0; i::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) { + vec_size[cPar.mapRS2cat[it->first]]++; + } + + for (size_t i=0; i::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) { + mapRS2wA[it->first]=1; + } + } else { + cPar.UpdateWeight (0, mapRS2wK, cPar.ni_test, s_vec, mapRS2wA); + } + + //read in z-scores based on allele 0, and save that into a vector + ReadFile_beta (cPar.file_beta, mapRS2wA, mapRS2A1, mapRS2z); + + //update snp indicator, save weights to w, save z-scores to vec_z, save category label to vec_cat + //sign of z is determined by matching alleles + cPar.UpdateSNPnZ (mapRS2wA, mapRS2A1, mapRS2z, w, z, vec_cat); + + //compute an n by k matrix of X_iWz + cout<<"Calculating Xz ... "<size2, W->size2); //B is a d by c matrix gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); - gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); gsl_vector *eval=gsl_vector_alloc (Y->size1); - - //set covariates matrix W and phenotype matrix Y - //an intercept should be included in W, + gsl_vector *env=gsl_vector_alloc (Y->size1); + gsl_vector *weight=gsl_vector_alloc (Y->size1); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); - - //read relatedness matrix G + if (!cPar.file_gxe.empty()) {cPar.CopyGxe (env);} + + //read relatedness matrix G if (!(cPar.file_kin).empty()) { ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1; i++) { + wi=gsl_vector_get(weight, i); + for (size_t j=i; jsize2; j++) { + wj=gsl_vector_get(weight, j); + d=gsl_matrix_get(G, i, j); + if (wi<=0 || wj<=0) {d=0;} else {d/=sqrt(wi*wj);} + gsl_matrix_set(G, i, j, d); + if (j!=i) {gsl_matrix_set(G, j, i, d);} + } + } + } + //eigen-decomposition and calculate trace_G cout<<"Start Eigen-Decomposition..."<size1; i++) { + wi=gsl_vector_get(weight, i); + if (wi<=0) {wi=0;} else {wi=sqrt(wi);} + gsl_vector_view Urow=gsl_matrix_row (U, i); + gsl_vector_scale (&Urow.vector, wi); + } + } + cPar.trace_G=0.0; for (size_t i=0; isize; i++) { if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} @@ -1324,14 +2123,14 @@ void GEMMA::BatchRun (PARAM &cPar) } cPar.trace_G/=(double)eval->size; - cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } else { ReadFile_eigenU (cPar.file_ku, cPar.error, U); if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) { if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} @@ -1339,14 +2138,29 @@ void GEMMA::BatchRun (PARAM &cPar) } cPar.trace_G/=(double)eval->size; } - + if (cPar.a_mode==31) { cPar.WriteMatrix(U, "eigenU"); cPar.WriteVector(eval, "eigenD"); - } else { - //calculate UtW and Uty + } else if (!cPar.file_gene.empty() ) { + //calculate UtW and Uty CalcUtX (U, W, UtW); - CalcUtX (U, Y, UtY); + CalcUtX (U, Y, UtY); + + LMM cLmm; + cLmm.CopyFromParam(cPar); + + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype + + cLmm.WriteFiles(); + cLmm.CopyToParam(cPar); + } else { + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, Y, UtY); //calculate REMLE/MLE estimate and pve for univariate model if (cPar.n_ph==1) { @@ -1372,10 +2186,10 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); } - + CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); cPar.PrintSummary(); - + //calculate and output residuals if (cPar.a_mode==5) { gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); @@ -1383,11 +2197,11 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector *u_hat=gsl_vector_alloc (Y->size1); gsl_vector *e_hat=gsl_vector_alloc (Y->size1); gsl_vector *y_hat=gsl_vector_alloc (Y->size1); - + //obtain Utu and Ute gsl_vector_memcpy (y_hat, &UtY_col.vector); gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); - + double d, u, e; for (size_t i=0; isize; i++) { d=gsl_vector_get (eval, i); @@ -1396,81 +2210,106 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector_set (Utu_hat, i, u); gsl_vector_set (Ute_hat, i, e); } - + //obtain u and e gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); - - //output residuals + + //output residuals cPar.WriteVector(u_hat, "residU"); cPar.WriteVector(e_hat, "residE"); - + gsl_vector_free(u_hat); gsl_vector_free(e_hat); gsl_vector_free(y_hat); - } - } - + } + } + //Fit LMM or mvLMM if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) { - if (cPar.n_ph==1) { + if (cPar.n_ph==1) { LMM cLmm; cLmm.CopyFromParam(cPar); - + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); - - if (!cPar.file_gene.empty()) { - cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype - } else if (!cPar.file_bfile.empty()) { - cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); - } else { - cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); - } - + + if (!cPar.file_bfile.empty()) { + if (cPar.file_gxe.empty()) { + cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } else { + cLmm.AnalyzePlinkGXE (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, env); + } + } + // WJA added + else if(!cPar.file_oxford.empty()) { + cLmm.Analyzebgen (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } + else { + if (cPar.file_gxe.empty()) { + cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } else { + cLmm.AnalyzeBimbamGXE (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, env); + } + } + cLmm.WriteFiles(); cLmm.CopyToParam(cPar); - } else { + } else { MVLMM cMvlmm; - cMvlmm.CopyFromParam(cPar); - + cMvlmm.CopyFromParam(cPar); + if (!cPar.file_bfile.empty()) { - cMvlmm.AnalyzePlink (U, eval, UtW, UtY); - } else { - cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY); + if (cPar.file_gxe.empty()) { + cMvlmm.AnalyzePlink (U, eval, UtW, UtY); + } else { + cMvlmm.AnalyzePlinkGXE (U, eval, UtW, UtY, env); + } + } + else if(!cPar.file_oxford.empty()) + { + cMvlmm.Analyzebgen (U, eval, UtW, UtY); + } + else { + if (cPar.file_gxe.empty()) { + cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY); + } else { + cMvlmm.AnalyzeBimbamGXE (U, eval, UtW, UtY, env); + } } - + cMvlmm.WriteFiles(); cMvlmm.CopyToParam(cPar); } } } - - + + //release all matrices and vectors gsl_matrix_free (Y); gsl_matrix_free (W); gsl_matrix_free(B); gsl_matrix_free(se_B); - gsl_matrix_free (G); + gsl_matrix_free (G); gsl_matrix_free (U); gsl_matrix_free (UtW); gsl_matrix_free (UtY); gsl_vector_free (eval); - } - - + gsl_vector_free (env); + } + + //BSLMM if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { gsl_vector *y=gsl_vector_alloc (cPar.ni_test); - gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); + gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (y->size, y->size); - gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test); - - //set covariates matrix W and phenotype vector y - //an intercept should be included in W, + gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test); + + //set covariates matrix W and phenotype vector y + //an intercept should be included in W, cPar.CopyCvtPhen (W, y, 0); - + //center y, even for case/control data cPar.pheno_mean=CenterVector(y); @@ -1482,32 +2321,32 @@ void GEMMA::BatchRun (PARAM &cPar) //perform BSLMM analysis BSLMM cBslmm; cBslmm.CopyFromParam(cPar); - time_start=clock(); + time_start=clock(); cBslmm.MCMC(UtX, y); cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); cBslmm.CopyToParam(cPar); //else, if rho!=1 } else { - gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); + gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); gsl_vector *eval=gsl_vector_alloc (y->size); gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2); gsl_vector *Uty=gsl_vector_alloc (y->size); - - //read relatedness matrix G - if (!(cPar.file_kin).empty()) { + + //read relatedness matrix G + if (!(cPar.file_kin).empty()) { cPar.ReadGenotypes (UtX, G, false); - + //read relatedness matrix G ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; - cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - - //calculate UtW and Uty + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, y, Uty); - + //calculate REMLE/MLE estimate and pve CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); - + cPar.PrintSummary(); - + //Creat and calcualte UtX, use a large memory cout<<"Calculating UtX..."<size, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (1, 1); + vector > Xt; + + //set covariates matrix W and phenotype vector y + //an intercept is included in W + cPar.CopyCvtPhen (W, y, 0); + + //read in genotype matrix X + cPar.ReadGenotypes (Xt, G, false); + + LDR cLdr; + cLdr.CopyFromParam(cPar); + time_start=clock(); + + cLdr.VB(Xt, W, y); + + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cLdr.CopyToParam(cPar); + + gsl_vector_free (y); + gsl_matrix_free (W); + gsl_matrix_free (G); + } + cPar.time_total=(clock()-time_begin)/(double(CLOCKS_PER_SEC)*60.0); - + return; } -void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) +void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) { string file_str; file_str=cPar.path_out+"/"+cPar.file_out; file_str+=".log.txt"; - + ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) {cout<<"error writing log file: "<tm_month<<":"<tm_day":"<tm_hour<<":"<tm_min<1) { + outfile<<"## total pve = "<1) { + outfile<<"## total pve = "<1) { + outfile<<"## total pve = "<=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) { outfile<<"## time on optimization = "< &setSnps) { setSnps.clear(); - ifstream infile (file_snps.c_str(), ifstream::in); + //ifstream infile (file_snps.c_str(), ifstream::in); + //if (!infile) {cout<<"error! fail to open snps file: "< &setSnps) } +bool ReadFile_snps_header (const string &file_snps, set &setSnps) +{ + setSnps.clear(); + + //ifstream infile (file_snps.c_str(), ifstream::in); + //if (!infile) {cout<<"error! fail to open snps file: "< &indicator_cvt, vector &snpInfo) { - snpInfo.clear(); + snpInfo.clear(); ifstream infile (file_bim.c_str(), ifstream::in); if (!infile) {cout<<"error opening .bim file: "< &setSnps, const gsl //start reading snps and doing association test for (size_t t=0; t &setSnps, const gsl if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;} - if (hwe_level!=1 && maf_level!=-1) { + if (hwe_level!=0 && maf_level!=-1) { if (CalcHWE(n_0, n_2, n_1)size; ++i) { @@ -1054,6 +1119,11 @@ bool BimbamKin (const string &file_geno, vector &indicator_snp, const int k gsl_vector *geno=gsl_vector_alloc (ni_total); gsl_vector *geno_miss=gsl_vector_alloc (ni_total); + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (ni_total, msize); + gsl_matrix_set_zero(Xlarge); + size_t ns_test=0; for (size_t t=0; t &indicator_snp, const int k gsl_vector_add_constant (geno, -1.0*geno_mean); + /* if (geno_var!=0) { if (k_mode==1) { gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); @@ -1101,8 +1172,23 @@ bool BimbamKin (const string &file_geno, vector &indicator_snp, const int k cout<<"Unknown kinship mode."< &indicator_snp, const int k gsl_vector_free (geno); gsl_vector_free (geno_miss); + gsl_matrix_free (Xlarge); infile.close(); infile.clear(); @@ -1146,11 +1233,16 @@ bool PlinkKin (const string &file_bed, vector &indicator_snp, const int k_m size_t ns_test=0; int n_bit; + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (ni_total, msize); + gsl_matrix_set_zero(Xlarge); + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } - //print the first three majic numbers + //print the first three magic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; @@ -1196,14 +1288,30 @@ bool PlinkKin (const string &file_bed, vector &indicator_snp, const int k_m gsl_vector_add_constant (geno, -1.0*geno_mean); + /* if (geno_var!=0) { if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);} else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);} else {cout<<"Unknown kinship mode."< &indicator_snp, const int k_m } gsl_vector_free (geno); + gsl_matrix_free (Xlarge); infile.close(); infile.clear(); @@ -2053,7 +2162,7 @@ bool ReadFile_bgen(const string &file_bgen, const set &setSnps, const gs uint16_t unzipped_data[3*bgen_N]; if (setSnps.size()!=0 && setSnps.count(rs)==0) { - SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, -9}; + SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, (long int) -9}; snpInfo.push_back(sInfo); indicator_snp.push_back(0); if(CompressedSNPBlocks) @@ -2394,18 +2503,18 @@ bool bgenKin (const string &file_oxford, vector &indicator_snp, const int k //read header to determine which column contains which item bool ReadHeader (const string &line, HEADER &header) { - string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID"}; - set rs_set(rs_ptr, rs_ptr+10); + string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID","MarkerName"}; + set rs_set(rs_ptr, rs_ptr+11); string chr_ptr[]={"chr","CHR"}; set chr_set(chr_ptr, chr_ptr+2); string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"}; set pos_set(pos_ptr, pos_ptr+8); string cm_ptr[]={"cm","CM"}; set cm_set(cm_ptr, cm_ptr+2); - string a1_ptr[]={"a1","A1","allele1","ALLELE1"}; - set a1_set(a1_ptr, a1_ptr+4); - string a0_ptr[]={"a0","A0","allele0","ALLELE0"}; - set a0_set(a0_ptr, a0_ptr+4); + string a1_ptr[]={"a1","A1","allele1","ALLELE1","Allele1","INC_ALLELE"}; + set a1_set(a1_ptr, a1_ptr+5); + string a0_ptr[]={"a0","A0","allele0","ALLELE0","Allele0","a2","A2","allele2","ALLELE2","Allele2","DEC_ALLELE"}; + set a0_set(a0_ptr, a0_ptr+10); string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"}; set z_set(z_ptr, z_ptr+6); @@ -2424,9 +2533,13 @@ bool ReadHeader (const string &line, HEADER &header) set nmis_set(nmis_ptr, nmis_ptr+6); string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"}; set nobs_set(nobs_ptr, nobs_ptr+4); + string ncase_ptr[]={"ncase","NCASE","n_case","N_CASE"}; + set ncase_set(ncase_ptr, ncase_ptr+4); + string ncontrol_ptr[]={"ncontrol","NCONTROL","n_control","N_CONTROL"}; + set ncontrol_set(ncontrol_ptr, ncontrol_ptr+4); - string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"}; - set af_set(af_ptr, af_ptr+10); + string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY","Freq.Allele1.HapMapCEU","FreqAllele1HapMapCEU", "Freq1.Hapmap"}; + set af_set(af_ptr, af_ptr+13); string var_ptr[]={"var","VAR"}; set var_set(var_ptr, var_ptr+2); @@ -2435,7 +2548,7 @@ bool ReadHeader (const string &line, HEADER &header) string cor_ptr[]={"cor","COR","r","R"}; set cor_set(cor_ptr, cor_ptr+4); - header.rs_col=0; header.chr_col=0; header.pos_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; + header.rs_col=0; header.chr_col=0; header.pos_col=0; header.cm_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.ncase_col=0; header.ncontrol_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; char *ch_ptr; string type; @@ -2472,6 +2585,10 @@ bool ReadHeader (const string &line, HEADER &header) if (header.nmis_col==0) {header.nmis_col=header.coln+1;} else {cout<<"error! more than two n_mis columns in the file."< &mapRS2cat, size_ +bool ReadFile_mcat (const string &file_mcat, map &mapRS2cat, size_t &n_vc) +{ + mapRS2cat.clear(); + + igzstream infile (file_mcat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mcategory file: "< mapRS2cat_tmp; + size_t n_vc_tmp, t=0; + + while (!safeGetline(infile, file_name).eof()) { + mapRS2cat_tmp.clear(); + ReadFile_cat (file_name, mapRS2cat_tmp, n_vc_tmp); + mapRS2cat.insert(mapRS2cat_tmp.begin(), mapRS2cat_tmp.end()); + if (t==0) {n_vc=n_vc_tmp;} else {n_vc=max(n_vc, n_vc_tmp);} + t++; + } + + return true; +} + + //read bimbam mean genotype file and calculate kinship matrix; this time, the kinship matrix is not centered, and can contain multiple K matrix -bool BimbamKin (const string &file_geno, vector &indicator_idv, vector &indicator_snp, const int k_mode, const int display_pace, const map &mapRS2cat, map &mapRS2var, vector &snpInfo, gsl_matrix *matrix_kin) +bool BimbamKin (const string &file_geno, const int display_pace, const vector &indicator_idv, const vector &indicator_snp, const map &mapRS2weight, const map &mapRS2cat, const vector &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) { igzstream infile (file_geno.c_str(), igzstream::in); //ifstream infile (file_geno.c_str(), ifstream::in); @@ -2593,6 +2733,17 @@ bool BimbamKin (const string &file_geno, vector &indicator_idv, vector gsl_vector *geno=gsl_vector_alloc (ni_test); gsl_vector *geno_miss=gsl_vector_alloc (ni_test); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + size_t n_vc=matrix_kin->size2/ni_test, i_vc; string rs; vector ns_vec; @@ -2600,6 +2751,11 @@ bool BimbamKin (const string &file_geno, vector &indicator_idv, vector ns_vec.push_back(0); } + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (ni_test, msize*n_vc); + gsl_matrix_set_zero(Xlarge); + size_t ns_test=0; for (size_t t=0; t &indicator_idv, vector if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} } - //this line is new; removed - //gsl_vector_add_constant (geno, -1.0*geno_mean); + gsl_vector_add_constant (geno, -1.0*geno_mean); - if (geno_var!=0) { - mapRS2var[rs]=geno_var; + gsl_blas_dgemv (CblasTrans, 1.0, W, geno, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_dgemv (CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno); + gsl_blas_ddot (geno, geno, &geno_var); + geno_var/=(double)ni_test; - if (k_mode==1) { - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { + if (geno_var!=0 && (mapRS2weight.size()==0 || mapRS2weight.count(rs)!=0) ) { + if (mapRS2weight.size()==0) { + d=1.0/geno_var; + } else { + d=mapRS2weight.at(rs)/geno_var; + } + + /* + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_blas_dsyr (CblasUpper, d, geno, matrix_kin); + ns_vec[0]++; + } else if (mapRS2cat.count(rs)!=0) { i_vc=mapRS2cat.at(rs); ns_vec[i_vc]++; gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, 1.0, geno, &kin_sub.matrix); + gsl_blas_dsyr (CblasUpper, d, geno, &kin_sub.matrix); + //eigenlib_dsyr (1.0, geno, matrix_kin); + } + */ + + gsl_vector_scale (geno, sqrt(d)); + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_vec[0]%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + ns_vec[0]++; + + if (ns_vec[0]%msize==0) { + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); } + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); - //eigenlib_dsyr (1.0, geno, matrix_kin); - } else if (k_mode==2) { - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { - i_vc=mapRS2cat.at(rs); - ns_vec[i_vc]++; + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, msize*i_vc+ns_vec[i_vc]%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + + ns_vec[i_vc]++; + + if (ns_vec[i_vc]%msize==0) { + gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, &kin_sub.matrix); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + + gsl_matrix_set_zero(&X_sub.matrix); } - } else { - cout<<"Unknown kinship mode."< &indicator_idv, vector gsl_vector_free (geno); gsl_vector_free (geno_miss); + gsl_vector_free (Wtx); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); + + gsl_matrix_free (Xlarge); + infile.close(); infile.clear(); @@ -2702,7 +2902,7 @@ bool BimbamKin (const string &file_geno, vector &indicator_idv, vector -bool PlinkKin (const string &file_bed, vector &indicator_idv, vector &indicator_snp, const int k_mode, const int display_pace, const map &mapRS2cat, map &mapRS2var, vector &snpInfo, gsl_matrix *matrix_kin) +bool PlinkKin (const string &file_bed, const int display_pace, const vector &indicator_idv, const vector &indicator_snp, const map &mapRS2weight, const map &mapRS2cat, const vector &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) { ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"< &indicator_idv, vector & size_t ni_total=indicator_idv.size(); gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + size_t ns_test=0; int n_bit; @@ -2727,6 +2938,11 @@ bool PlinkKin (const string &file_bed, vector &indicator_idv, vector & ns_vec.push_back(0); } + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (ni_test, msize*n_vc); + gsl_matrix_set_zero(Xlarge); + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } @@ -2780,65 +2996,97 @@ bool PlinkKin (const string &file_bed, vector &indicator_idv, vector & if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);} } - //this line is new; removed - //gsl_vector_add_constant (geno, -1.0*geno_mean); + gsl_vector_add_constant (geno, -1.0*geno_mean); + + gsl_blas_dgemv (CblasTrans, 1.0, W, geno, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_dgemv (CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno); + gsl_blas_ddot (geno, geno, &geno_var); + geno_var/=(double)ni_test; + + if (geno_var!=0 && (mapRS2weight.size()==0 || mapRS2weight.count(rs)!=0) ) { + if (mapRS2weight.size()==0) { + d=1.0/geno_var; + } else { + d=mapRS2weight.at(rs)/geno_var; + } + + /* + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_blas_dsyr (CblasUpper, d, geno, matrix_kin); + ns_vec[0]++; + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); + ns_vec[i_vc]++; + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + gsl_blas_dsyr (CblasUpper, d, geno, &kin_sub.matrix); + } + */ + + gsl_vector_scale (geno, sqrt(d)); + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_vec[0]%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + ns_vec[0]++; + + if (ns_vec[0]%msize==0) { + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + } + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); + + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, msize*i_vc+ns_vec[i_vc]%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + + ns_vec[i_vc]++; + + if (ns_vec[i_vc]%msize==0) { + gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + + gsl_matrix_set_zero(&X_sub.matrix); + } + } - if (geno_var!=0) { - mapRS2var[rs]=geno_var; - if (k_mode==1) { - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { - i_vc=mapRS2cat.at(rs); - ns_vec[i_vc]++; - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, 1.0, geno, &kin_sub.matrix); - } - } else if (k_mode==2) { - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { - i_vc=mapRS2cat.at(rs); - ns_vec[i_vc]++; - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, &kin_sub.matrix); - } - } else { - cout<<"Unknown kinship mode."<size1, matrix_kin->size2); + gsl_vector *ns_tmp=gsl_vector_alloc (vector_ns->size); + + size_t l=0; + double d; + while (!safeGetline(infile, file_name).eof()) { + gsl_matrix_set_zero(kin_tmp); + gsl_vector_set_zero(ns_tmp); + + if (mfile_mode==1) { + file_name+=".bed"; + PlinkKin (file_name, display_pace, indicator_idv, mindicator_snp[l], mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); + } else { + BimbamKin (file_name, display_pace, indicator_idv, mindicator_snp[l], mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); + } + + //add ns + gsl_vector_add(vector_ns, ns_tmp); + + //add kin + for (size_t t=0; t &mapRS2weight) +{ + mapRS2weight.clear(); + + igzstream infile (file_wsnp.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open snp weight file: "< > &mapRS2wvector) +{ + mapRS2wvector.clear(); + + igzstream infile (file_wcat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open snp weight file: "< weight; + for (size_t i=0; in_vc) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;} + } + + ch_ptr=strtok (NULL, " , \t"); + } + + if (t!=n_vc) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;} + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + mapRS2wvector[rs]=weight; + } + + return true; +} + + + + + + + -//read beta file, use the mapRS2var to select snps (and to provide var if maf/var is not provided in the beta file), calculate q -void ReadFile_beta (const string &file_beta, const int k_mode, const map &mapRS2cat, const map &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test) +//read the beta file, save snp z scores in to z2_score, and save category into indicator_snp based on mapRS2var and set, and indicator_snp record the category number (from 1 to n_vc), and provide var if maf/var is not provided in the beta file +//notice that indicator_snp contains ns_test snps, instead of ns_total snps +//read the beta file for the second time, compute q, and Vq based on block jacknife +//use the mapRS2var to select snps (and to ), calculate q +//do a block-wise jacknife, and compute Vq +void ReadFile_beta (const string &file_beta, const map &mapRS2cat, const map &mapRS2wA, vector &vec_cat, vector &vec_ni, vector &vec_weight, vector &vec_z2, size_t &ni_total, size_t &ns_total, size_t &ns_test) { - gsl_vector_set_zero(q); + vec_cat.clear(); vec_ni.clear(); vec_weight.clear(); vec_z2.clear(); ni_total=0; ns_total=0; ns_test=0; igzstream infile (file_beta.c_str(), igzstream::in); @@ -2887,13 +3277,7 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map vec_q, vec_s; - for (size_t i=0; isize; i++) { - vec_q.push_back(0.0); - vec_s.push_back(0.0); - } + size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0; //read header HEADER header; @@ -2901,7 +3285,7 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const mapsize; i++) { - if (vec_s[i]!=0) { - gsl_vector_set(q, i, vec_q[i]/vec_s[i]); - } - gsl_vector_set(s, i, vec_s[i]); - } - infile.clear(); infile.close(); @@ -3013,34 +3392,108 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map &mapRS2wA, map &mapRS2A1, map &mapRS2z) { - igzstream infile (file_s.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open s file: "<size1; i++) { - !safeGetline(infile, line).eof(); - ch_ptr=strtok ((char *)line.c_str(), " , \t"); - for (size_t j=0; jsize2; j++) { - d=gsl_matrix_get(S, i, j)+atof(ch_ptr); - gsl_matrix_set(S, i, j, d); - ch_ptr=strtok (NULL, " , \t"); + string rs, chr, a1, a0, pos, cm; + double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, af=0, var_x=0; + size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0; + size_t ni_total=0, ns_total=0, ns_test=0; + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + if (header.n_col==0 ) { + if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) { + cout<<"error! missing sample size in the beta file."<size1; i++) { - !safeGetline(infile, line).eof(); + if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0)) { + cout<<"error! missing z scores in the beta file."<size2; j++) { - d=gsl_matrix_get(Svar, i, j)+atof(ch_ptr); - gsl_matrix_set(Svar, i, j, d); + + z=0; beta=0; se_beta=0; chisq=0; pvalue=0; + n_total=0; n_mis=0; n_obs=0; n_case=0; n_control=0; af=0; var_x=0; + for (size_t i=0; i &vec_cat, const vector &vec_ni, const vector &vec_weight, const vector &vec_z2, gsl_matrix *Vq, gsl_vector *q, gsl_vector *s) { - gsl_matrix_set_zero(S); - gsl_matrix_set_zero(Svar); + gsl_matrix_set_zero (Vq); + gsl_vector_set_zero (q); + gsl_vector_set_zero (s); - string file_name; + size_t cat, n_total; + double w, zsquare; - igzstream infile (file_ms.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open ms file: "< vec_q, vec_s, n_snps; + for (size_t i=0; isize; i++) { + vec_q.push_back(0.0); + vec_s.push_back(0.0); + n_snps.push_back(0.0); + } - while (!safeGetline(infile, file_name).eof()) { - ReadFile_s(file_name, S, Svar); + vector > mat_q, mat_s; + for (size_t i=0; isize; i++) { + if (vec_s[i]!=0) { + gsl_vector_set(q, i, vec_q[i]/vec_s[i]); + } + gsl_vector_set(s, i, vec_s[i]); + } + + //compute Vq; divide SNPs in each category into evenly distributed blocks + size_t t=0, b=0, n_snp=0; + double d, m, n; + for (size_t l=0; lsize; l++) { + n_snp=floor(n_snps[l]/n_block); t=0; b=0; + if (n_snp==0) {continue;} + + //initiate everything to zero + for (size_t i=0; isize; j++) { + mat_q[i][j]=0; + mat_s[i][j]=0; + } + } + + //record values + for (size_t i=0; isize; i++) { + m=0; n=0; + for (size_t k=0; ksize; i++) { + d=0; n=0; + for (size_t k=0; ksize; i++) { + for (size_t j=i; jsize; j++) { + if (i==j) {continue;} + d=gsl_matrix_get(Vq, i, j); + gsl_matrix_set(Vq, i, j, d/2); + gsl_matrix_set(Vq, j, i, d/2); + } + } return; } @@ -3075,24 +3641,19 @@ void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar) -//read V file: V (i.e. Q) -void ReadFile_v (const string &file_v, gsl_matrix *V) +//read vector file +void ReadFile_vector (const string &file_vec, gsl_vector *vec) { - igzstream infile (file_v.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open v file: "<size1; i++) { + for (size_t i=0; isize; i++) { !safeGetline(infile, line).eof(); ch_ptr=strtok ((char *)line.c_str(), " , \t"); - for (size_t j=0; jsize2; j++) { - d=gsl_matrix_get(V, i, j)+atof(ch_ptr); - gsl_matrix_set(V, i, j, d); - ch_ptr=strtok (NULL, " , \t"); - } + gsl_vector_set(vec, i, atof(ch_ptr)); } infile.clear(); @@ -3102,17 +3663,21 @@ void ReadFile_v (const string &file_v, gsl_matrix *V) } -void ReadFile_mv (const string &file_mv, gsl_matrix *V) +void ReadFile_matrix (const string &file_mat, gsl_matrix *mat) { - gsl_matrix_set_zero(V); - - string file_name; + igzstream infile (file_mat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open matrix file: "<size1; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t j=0; jsize2; j++) { + gsl_matrix_set(mat, i, j, atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } } infile.clear(); @@ -3121,35 +3686,32 @@ void ReadFile_mv (const string &file_mv, gsl_matrix *V) return; } - -//read q file: q, s and ni_test -void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df) +void ReadFile_matrix (const string &file_mat, gsl_matrix *mat1, gsl_matrix *mat2) { - igzstream infile (file_s.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open s file: "<size; i++) { + for (size_t i=0; isize1; i++) { !safeGetline(infile, line).eof(); ch_ptr=strtok ((char *)line.c_str(), " , \t"); - d=gsl_vector_get(q_vec, i)+atof(ch_ptr); - gsl_vector_set(q_vec, i, d); + for (size_t j=0; jsize2; j++) { + gsl_matrix_set(mat1, i, j, atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } } - for (size_t i=0; isize; i++) { + for (size_t i=0; isize1; i++) { !safeGetline(infile, line).eof(); ch_ptr=strtok ((char *)line.c_str(), " , \t"); - d=gsl_vector_get(s_vec, i)+atof(ch_ptr); - gsl_vector_set(s_vec, i, d); + for (size_t j=0; jsize2; j++) { + gsl_matrix_set(mat2, i, j, atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } } - !safeGetline(infile, line).eof(); - ch_ptr=strtok ((char *)line.c_str(), " , \t"); - df=atof(ch_ptr); - infile.clear(); infile.close(); @@ -3158,22 +3720,274 @@ void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, dou -void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df) +//read study file +void ReadFile_study (const string &file_study, gsl_matrix *Vq_mat, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) { + string Vqfile=file_study+".Vq.txt"; + string sfile=file_study+".size.txt"; + string qfile=file_study+".q.txt"; + + gsl_vector *s=gsl_vector_alloc (s_vec->size+1); + + ReadFile_matrix(Vqfile, Vq_mat); + ReadFile_vector(sfile, s); + ReadFile_vector(qfile, q_vec); + + double d; + for (size_t i=0; isize; i++) { + d=gsl_vector_get (s, i); + gsl_vector_set (s_vec, i, d); + } + ni=gsl_vector_get (s, s_vec->size); + + gsl_vector_free(s); + + return; +} + + +//read reference file +void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) +{ + string sfile=file_ref+".size.txt"; + string Sfile=file_ref+".S.txt"; + //string Vfile=file_ref+".V.txt"; + + gsl_vector *s=gsl_vector_alloc (s_vec->size+1); + + ReadFile_vector(sfile, s); + ReadFile_matrix(Sfile, S_mat, Svar_mat); + //ReadFile_matrix(Vfile, V_mat); + + double d; + for (size_t i=0; isize; i++) { + d=gsl_vector_get (s, i); + gsl_vector_set (s_vec, i, d); + } + ni=gsl_vector_get (s, s_vec->size); + + gsl_vector_free(s); + + return; +} + + +//read mstudy file +void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) +{ + gsl_matrix_set_zero(Vq_mat); gsl_vector_set_zero(q_vec); gsl_vector_set_zero(s_vec); + ni=0; + + gsl_matrix *Vq_sub=gsl_matrix_alloc(Vq_mat->size1, Vq_mat->size2); + gsl_vector *q_sub=gsl_vector_alloc(q_vec->size); + gsl_vector *s=gsl_vector_alloc (s_vec->size+1); + + igzstream infile (file_mstudy.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mstudy file: "<size)); + + for (size_t i=0; isize; i++) { + d1=gsl_vector_get (s, i); + if (d1==0) {continue;} + + d=gsl_vector_get(q_vec, i)+gsl_vector_get(q_sub, i)*d1; + gsl_vector_set(q_vec, i, d); + + d=gsl_vector_get(s_vec, i)+d1; + gsl_vector_set(s_vec, i, d); + + for (size_t j=i; jsize; j++) { + d2=gsl_vector_get (s, j); + if (d2==0) {continue;} + + d=gsl_matrix_get(Vq_mat, i, j)+gsl_matrix_get(Vq_sub, i, j)*d1*d2; + gsl_matrix_set(Vq_mat, i, j, d); + if (i!=j) {gsl_matrix_set(Vq_mat, j, i, d);} + } + } + } - igzstream infile (file_mq.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open mq file: "<size; i++) { + d1=gsl_vector_get (s_vec, i); + if (d1==0) {continue;} + + d=gsl_vector_get (q_vec, i); + gsl_vector_set (q_vec, i, d/d1); + + for (size_t j=i; jsize; j++) { + d2=gsl_vector_get (s_vec, j); + if (d2==0) {continue;} + + d=gsl_matrix_get (Vq_mat, i, j)/(d1*d2); + gsl_matrix_set (Vq_mat, i, j, d); + if (i!=j) {gsl_matrix_set(Vq_mat, j, i, d);} + } + } + + gsl_matrix_free(Vq_sub); + gsl_vector_free(q_sub); + gsl_vector_free(s); + + return; +} + + +//copied from lmm.cpp; is used in the following function compKtoV +//map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 +size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { + if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + +//read reference file +void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) +{ + gsl_matrix_set_zero(S_mat); + gsl_matrix_set_zero(Svar_mat); + // gsl_matrix_set_zero(V_mat); + gsl_vector_set_zero(s_vec); + ni=0; + + //size_t n_vc=S_mat->size1; + gsl_matrix *S_sub=gsl_matrix_alloc (S_mat->size1, S_mat->size2); + gsl_matrix *Svar_sub=gsl_matrix_alloc (Svar_mat->size1, Svar_mat->size2); + //gsl_matrix *V_sub=gsl_matrix_alloc (V_mat->size1, V_mat->size2); + gsl_vector *s=gsl_vector_alloc (s_vec->size+1); + + igzstream infile (file_mref.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mref file: "<size; i++) { + d=gsl_vector_get (s, i)+gsl_vector_get (s_vec, i); + gsl_vector_set (s_vec, i, d); + } + ni=max(ni, (size_t)gsl_vector_get (s, s_vec->size)); + + //update S and Svar from each file + for (size_t i=0; isize1; i++) { + d1=gsl_vector_get(s, i); + for (size_t j=0; jsize2; j++) { + d2=gsl_vector_get(s, j); + + d=gsl_matrix_get(S_sub, i, j)*d1*d2; + gsl_matrix_set(S_sub, i, j, d); + d=gsl_matrix_get(Svar_sub, i, j)*d1*d2*d1*d2; + gsl_matrix_set(Svar_sub, i, j, d); + } + } + + gsl_matrix_add (S_mat, S_sub); + gsl_matrix_add (Svar_mat, Svar_sub); + /* + //update V from each file + for (size_t i=0; isize1; i++) { + d1=gsl_vector_get(s_vec, i); + if (d1==0) {continue;} + for (size_t j=i; jsize2; j++) { + d2=gsl_vector_get(s_vec, j); + if (d2==0) {continue;} + + d=gsl_matrix_get(S_mat, i, j)/(d1*d2); + gsl_matrix_set(S_mat, i, j, d); + if (i!=j) {gsl_matrix_set(S_mat, j, i, d);} + + d=gsl_matrix_get(Svar_mat, i, j)/(d1*d2*d1*d2); + gsl_matrix_set(Svar_mat, i, j, d); + if (i!=j) {gsl_matrix_set(Svar_mat, j, i, d);} + } + } + /* + //final: update V + for (size_t i=0; i &setSnps); +bool ReadFile_snps_header (const string &file_snps, set &setSnps); bool ReadFile_log (const string &file_log, double &pheno_mean); bool ReadFile_bim (const string &file_bim, vector &snpInfo); @@ -80,20 +81,23 @@ bool ReadFile_gene (const string &file_gene, vector &vec_read, vector &mapRS2cat, size_t &n_vc); +bool ReadFile_mcat (const string &file_mcat, map &mapRS2cat, size_t &n_vc); -bool BimbamKin (const string &file_geno, vector &indicator_idv, vector &indicator_snp, const int k_mode, const int display_pace, const map &mapRS2cat, map &mapRS2var, vector &snpInfo, gsl_matrix *matrix_kin); -bool PlinkKin (const string &file_bed, vector &indicator_idv, vector &indicator_snp, const int k_mode, const int display_pace, const map &mapRS2cat, map &mapRS2var, vector &snpInfo, gsl_matrix *matrix_kin); +bool BimbamKin (const string &file_geno, const int display_pace, const vector &indicator_idv, const vector &indicator_snp, const map &mapRS2weight, const map &mapRS2cat, const vector &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); +bool PlinkKin (const string &file_bed, const int display_pace, const vector &indicator_idv, const vector &indicator_snp, const map &mapRS2weight, const map &mapRS2cat, const vector &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); +bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int display_pace, const vector &indicator_idv, const vector > &mindicator_snp, const map &mapRS2weight, const map &mapRS2cat, const vector > &msnpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); -bool ReadFile_var (const string &file_var, map &mapRS2var); -void ReadFile_beta (const string &file_beta, const int k_mode, const map &mapRS2cat, const map &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test); +bool ReadFile_wsnp (const string &file_wsnp, map &mapRS2double); +bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map > &mapRS2vector); +void ReadFile_beta (const string &file_beta, const map &mapRS2cat, const map &mapRS2wA, vector &vec_cat, vector &vec_ni, vector &vec_weight, vector &vec_z2, size_t &ni_total, size_t &ns_total, size_t &ns_test); +void ReadFile_beta (const string &file_beta, const map &mapRS2wA, map &mapRS2A1, map &mapRS2z); +void Calcq (const size_t n_block, const vector &vec_cat, const vector &vec_ni, const vector &vec_weight, const vector &vec_z2, gsl_matrix *Vq, gsl_vector *q, gsl_vector *s); -void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar); -void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar); -void ReadFile_v (const string &file_v, gsl_matrix *V); -void ReadFile_mv (const string &file_mq, gsl_matrix *V); -void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df); -void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df); +void ReadFile_study (const string &file_study, gsl_matrix *Vq, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni); +void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); +void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni); +void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); // WJA added bool bgenKin (const string &file_geno, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); diff --git a/src/lm.cpp b/src/lm.cpp index b4bc010..f8cb974 100644 --- a/src/lm.cpp +++ b/src/lm.cpp @@ -41,6 +41,7 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_integration.h" +#include "eigenlib.h" #include "gzstream.h" #include "lapack.h" @@ -519,9 +520,9 @@ void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y) for (size_t i=0; i1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } @@ -626,9 +627,9 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) for (size_t i=0; i1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } //calculate statistics @@ -712,7 +713,6 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) b=ch[0]; } - for (vector::size_type t=0; t1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } //calculate statistics @@ -759,11 +759,11 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } cout<size2, n_index); gsl_vector *ab=gsl_vector_alloc (n_index); + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { @@ -1236,6 +1243,7 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ // } //start reading genotypes and analyze + size_t c=0; for (size_t t=0; t1) {break;} !safeGetline(infile, line).eof(); @@ -1268,48 +1276,72 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ for (size_t i=0; i1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + // gsl_vector_set(x, i, 2-geno); + //} } - + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - CalcUab(UtW, Uty, Utx, Uab); -// if (e_mode!=0) { -// Calcab (W, y, x, ab); -// } + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + c++; - time_start=clock(); - FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + if (c%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (c%msize==0) {l=msize;} else {l=c%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - CalcRLScore (l_mle_null, param1, beta, se, p_score); - } + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); - CalcRLWald (lambda_remle, param1, beta, se, p_wald); - } + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - if (a_mode==2 || a_mode==4) { - CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } + gsl_matrix_set_zero (Xlarge); - if (x_mean>1) {beta*=-1;} + for (size_t i=0; i1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + } + } cout<size2, n_index); gsl_vector *ab=gsl_vector_alloc (n_index); + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { @@ -1371,7 +1412,7 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m b=ch[0]; } - + size_t c=0; for (vector::size_type t=0; t1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - CalcUab(UtW, Uty, Utx, Uab); -// if (e_mode!=0) { -// Calcab (W, y, x, ab); -// } + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + c++; - time_start=clock(); - FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + if (c%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (c%msize==0) {l=msize;} else {l=c%msize;} - //3 is before 1, for beta - if (a_mode==3 || a_mode==4) { - CalcRLScore (l_mle_null, param1, beta, se, p_score); - } + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); - CalcRLWald (lambda_remle, param1, beta, se, p_wald); - } + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - if (a_mode==2 || a_mode==4) { - CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } + gsl_matrix_set_zero (Xlarge); - if (x_mean>1) {beta*=-1;} + for (size_t i=0; i1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + } } cout<size2, n_index); gsl_vector *ab=gsl_vector_alloc (n_index); + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { @@ -1537,6 +1612,7 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma //start reading genotypes and analyze + size_t c=0; for (size_t t=0; t1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } - + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - CalcUab(UtW, Uty, Utx, Uab); -// if (e_mode!=0) { -// Calcab (W, y, x, ab); -// } + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + c++; - time_start=clock(); - FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + if (c%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (c%msize==0) {l=msize;} else {l=c%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - CalcRLScore (l_mle_null, param1, beta, se, p_score); - } + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); - CalcRLWald (lambda_remle, param1, beta, se, p_wald); - } + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - if (a_mode==2 || a_mode==4) { - CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } + gsl_matrix_set_zero (Xlarge); - if (x_mean>1) {beta*=-1;} + for (size_t i=0; i1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + } } cout<size1); for (size_t i=0; isize2; ++i) { gsl_vector_view UtX_col=gsl_matrix_column (UtX, i); @@ -254,17 +256,28 @@ void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX) gsl_vector_memcpy (&UtX_col.vector, Utx_vec); } gsl_vector_free (Utx_vec); + */ + + gsl_matrix *X=gsl_matrix_alloc (UtX->size1, UtX->size2); + gsl_matrix_memcpy (X, UtX); + eigenlib_dgemm ("T", "N", 1.0, U, X, 0.0, UtX); + gsl_matrix_free (X); + return; } void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX) { + /* for (size_t i=0; isize2; ++i) { gsl_vector_const_view X_col=gsl_matrix_const_column (X, i); gsl_vector_view UtX_col=gsl_matrix_column (UtX, i); gsl_blas_dgemv (CblasTrans, 1.0, U, &X_col.vector, 0.0, &UtX_col.vector); } + */ + eigenlib_dgemm ("T", "N", 1.0, U, X, 0.0, UtX); + return; } @@ -329,7 +342,8 @@ double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab) het_probs[i] = 0.0; /* start at midpoint */ - int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); + //XZ modified to add (long int) + int mid = ((long int)rare_copies * (2 * (long int)genotypes - (long int)rare_copies)) / (2 * (long int)genotypes); /* check to ensure that midpoint and rare alleles have same parity */ if ((rare_copies & 1) ^ (mid & 1)) @@ -390,7 +404,7 @@ double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab) p_hwe += het_probs[i]; } - p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; free(het_probs); diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index 5826a1f..7655b50 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -42,6 +42,7 @@ #include "io.h" #include "lapack.h" +#include "eigenlib.h" #include "gzstream.h" #ifdef FORCE_FLOAT @@ -2935,12 +2936,17 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ ifstream infile (file_bgen.c_str(), ios::binary); if (!infile) {cout<<"error reading bgen file:"<size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; double crt_a, crt_b, crt_c; @@ -3179,6 +3185,7 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ //start reading genotypes and analyze + size_t csnp=0; for (size_t t=0; t1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - //initial values - gsl_matrix_memcpy (V_g, V_g_null); - gsl_matrix_memcpy (V_e, V_e_null); - gsl_matrix_memcpy (B, B_null); + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + csnp++; - time_start=clock(); + if (csnp%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (csnp%msize==0) {l=msize;} else {l=csnp%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); - if (p_scoresize1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==2 || a_mode==4) { - logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Xlarge); + + for (size_t i=0; i1) {gsl_vector_scale(beta, -1.0);} + //if (x_mean>1) {gsl_vector_scale(beta, -1.0);} - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //store summary data - //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - for (size_t i=0; isize1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -3615,6 +3656,7 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs gsl_matrix_memcpy (B_null, B); //start reading genotypes and analyze + size_t csnp=0; for (size_t t=0; t=1) {break;} !safeGetline(infile, line).eof(); @@ -3647,86 +3689,111 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs for (size_t i=0; i1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + // gsl_vector_set(x, i, 2-geno); + //} } + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - //initial values - gsl_matrix_memcpy (V_g, V_g_null); - gsl_matrix_memcpy (V_e, V_e_null); - gsl_matrix_memcpy (B, B_null); + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + csnp++; - time_start=clock(); + if (csnp%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (csnp%msize==0) {l=msize;} else {l=csnp%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); - if (p_scoresize1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==2 || a_mode==4) { - logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Xlarge); + + for (size_t i=0; i1) {gsl_vector_scale(beta, -1.0);} + //if (x_mean>1) {gsl_vector_scale(beta, -1.0);} - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //store summary data - //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - for (size_t i=0; isize1, d_size=UtY->size2, c_size=UtW->size2; size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -3992,6 +4068,7 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl b=ch[0]; } + size_t csnp=0; for (vector::size_type t=0; t1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + // gsl_vector_set(x, i, 2-geno); + //} } /* @@ -4047,85 +4124,110 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } */ + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - //initial values - gsl_matrix_memcpy (V_g, V_g_null); - gsl_matrix_memcpy (V_e, V_e_null); - gsl_matrix_memcpy (B, B_null); + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + csnp++; - time_start=clock(); + if (csnp%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (csnp%msize==0) {l=msize;} else {l=csnp%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (p_score1) {gsl_vector_scale(beta, -1.0);} + //if (x_mean>1) {gsl_vector_scale(beta, -1.0);} - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //store summary data - //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - for (size_t i=0; i1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<size1, ni_test=G->size1; - double di, dj, tr_KiKj, sum_Ki, sum_Kj, s_Ki, s_Kj, s_KiKj, si, sj, d; +//from an existing n by nd A and K matrices, compute the d by d S matrix (which is not necessary symmetric) +void compAKtoS (const gsl_matrix *A, const gsl_matrix *K, const size_t n_cvt, gsl_matrix *S) { + size_t n_vc=S->size1, ni_test=A->size1; + double di, dj, tr_AK, sum_A, sum_K, s_A, s_K, sum_AK, tr_A, tr_K, d; for (size_t i=0; in_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<size2/G->size1, ni_test=G->size1; - gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, n_vc*(n_vc+1)/2*ni_test); - gsl_vector *trKiKjKi=gsl_vector_alloc ( n_vc*n_vc ); + gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, (n_vc*(n_vc+1))/2*ni_test); gsl_vector *trKiKj=gsl_vector_alloc( n_vc*(n_vc+1)/2 ); gsl_vector *trKi=gsl_vector_alloc(n_vc); double d, tr, r=(double)ni_test/(double)(ni_test-1); - size_t t, t_ij, t_il, t_jl, t_ii; + size_t t, t_il, t_jm, t_lm, t_im, t_jl, t_ij; - //compute KiKj for all pairs of i and j (including the identity matrix) + //compute KiKj for all pairs of i and j (not including the identity matrix) t=0; for (size_t i=0; im) { + gsl_blas_ddot (&KiKl_row.vector, &KjKm_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Km_row.vector, &KiKl_col.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KjKm_row.vector, &d); + tr-=r*d; + } else if (i>l && j<=m) { + gsl_blas_ddot (&KiKl_col.vector, &KjKm_col.vector, &d); + tr+=d; + gsl_blas_ddot (&Km_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KjKm_col.vector, &d); + tr-=r*d; + } else { + gsl_blas_ddot (&KiKl_col.vector, &KjKm_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Km_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KjKm_row.vector, &d); + tr-=r*d; + } + } - //compute Q - for (size_t i=0; il) { - gsl_blas_ddot (&KiKj_row.vector, &KiKl_row.vector, &d); - tr+=d; - gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); - tr-=r*d; - gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d); - tr-=r*d; - } else if (i>j && i<=l) { - gsl_blas_ddot (&KiKj_col.vector, &KiKl_col.vector, &d); - tr+=d; - gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d); - tr-=r*d; - gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); - tr-=r*d; - } else { - gsl_blas_ddot (&KiKj_col.vector, &KiKl_row.vector, &d); - tr+=d; - gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); - tr-=r*d; - gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); - tr-=r*d; + tr+=r*r*gsl_vector_get (trKiKj, t_lm); + } else if (l!=n_vc && m==n_vc) { + t_il=GetabIndex (i+1, l+1, n_vc-2); + t_jl=GetabIndex (j+1, l+1, n_vc-2); + tr=0; + for (size_t k=0; ksize1, ni_test=G->size1; - vector > > tr_KiKj, s_KiKj; - vector > sum_Ki, s_Ki, si; +void JackknifeAKtoS (const gsl_matrix *W, const gsl_matrix *A, const gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar) { + size_t n_vc=Svar->size1, ni_test=A->size1, n_cvt=W->size2; + + vector > > trAK, sumAK; + vector > sumA, sumK, trA, trK, sA, sK; vector vec_tmp; double di, dj, d, m, v; + //gsl_matrix *Stmp=gsl_matrix_alloc (n_vc, ni_test*n_vc); + //gsl_matrix *Stmp_sub=gsl_matrix_alloc (n_vc, n_vc); + //initialize and set all elements to zero for (size_t i=0; i &mapRS2wA, const map &mapRS2wK, const gsl_matrix *W, gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, gsl_vector *ns) { string file_str; gsl_matrix_set_zero (S); gsl_matrix_set_zero (Svar); - gsl_matrix_set_zero (Q); + gsl_vector_set_zero (ns); //compute the kinship matrix G for multiple categories; these matrices are not centered, for convienence of Jacknife sampling - gsl_matrix *G=gsl_matrix_alloc (ni_test, n_vc*ni_test); - gsl_matrix_set_zero (G); - if (!file_bfile.empty() ) { file_str=file_bfile+".bed"; - if (PlinkKin (file_str, indicator_idv, indicator_snp, a_mode-24, d_pace, mapRS2cat, mapRS2var, snpInfo, G)==false) {error=true;} - } else { + if (mapRS2wA.size()==0) { + if (PlinkKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns)==false) {error=true;} + } else { + if (PlinkKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns)==false) {error=true;} + } + } else if (!file_geno.empty()) { file_str=file_geno; - if (BimbamKin (file_str, indicator_idv, indicator_snp, a_mode-24, d_pace, mapRS2cat, mapRS2var, snpInfo, G)==false) {error=true;} + if (mapRS2wA.size()==0) { + if (BimbamKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns)==false) {error=true;} + } else { + if (BimbamKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns)==false) {error=true;} + } + } else if (!file_mbfile.empty() ){ + if (mapRS2wA.size()==0) { + if (MFILEKin (1, file_mbfile, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns)==false) {error=true;} + } else { + if (MFILEKin (1, file_mbfile, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns)==false) {error=true;} + } + } else if (!file_mgeno.empty()) { + if (mapRS2wA.size()==0) { + if (MFILEKin (0, file_mgeno, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns)==false) {error=true;} + } else { + if (MFILEKin (0, file_mgeno, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns)==false) {error=true;} + } } - //center and scale every kinship matrix inside G - double d; - for (size_t i=0; itm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); - } - gsl_r = gsl_rng_alloc(gslType); - gsl_rng_set(gsl_r, randseed); - - //bootstrap: in each iteration, sample individuals and compute S_pmt - size_t n_pmt=100; - vector idv_order, idv_remove; - for (size_t i=0; i(&idv_remove[0]), n_pmt, static_cast(&idv_order[0]), ni_test, sizeof(size_t)); + //center and scale every kinship matrix inside G + for (size_t i=0; isize2, S); //compute Svar and update S with Jacknife - JacknifeGtoS (G, S, Svar); + JackknifeAKtoS (W, A, K, S, Svar); + + //based on G, compute a matrix Q that can be used to calculate the variance of q + //compKtoV (G, V); - gsl_matrix_free(G); return; } @@ -1223,11 +1388,20 @@ void PARAM::WriteVar (const string suffix) outfile.precision(10); - for (size_t i=0; i &setSnps_beta, map &mapRS2wK) +{ + mapRS2wK.clear(); + + vector wsum, wcount; + + for (size_t i=0; i::iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) { + if (mapRS2cat.size()==0) { + it->second/=wsum[0]; + } else { + it->second/=wsum[mapRS2cat[it->first]]; + } + } + } + return; +} + + +//pve_flag=0 then do not change pve; pve_flag==1, then change pve to 0 if pve < 0 and pve to 1 if pve > 1 +void PARAM::UpdateWeight (const size_t pve_flag, const map &mapRS2wK, const size_t ni_test, const gsl_vector *ns, map &mapRS2wA) +{ + double d; + vector wsum, wcount; + + for (size_t i=0; i::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) { + d=1; + for (size_t i=0; i=1 && pve_flag==1) { + d+=(double)ni_test/gsl_vector_get(ns, i)*mapRS2wcat[it->first][i]; + } else if (v_pve[i]<=0 && pve_flag==1) { + d+=0; + } else { + d+=(double)ni_test/gsl_vector_get(ns, i)*mapRS2wcat[it->first][i]*v_pve[i]; + } + } + mapRS2wA[it->first]=1/(d*d); + + if (mapRS2cat.size()==0) { + wsum[0]+=mapRS2wA[it->first]; + wcount[0]++; + } else { + wsum[mapRS2cat[it->first]]+=mapRS2wA[it->first]; + wcount[mapRS2cat[it->first]]++; + } + } + + for (size_t i=0; i::iterator it=mapRS2wA.begin(); it!=mapRS2wA.end(); ++it) { + if (mapRS2cat.size()==0) { + it->second/=wsum[0]; + } else { + it->second/=wsum[mapRS2cat[it->first]]; + } + } + return; +} + +// this function updates indicator_snp, and save z-scores and other values into vectors +void PARAM::UpdateSNPnZ (const map &mapRS2wA, const map &mapRS2A1, const map &mapRS2z, gsl_vector *w, gsl_vector *z, vector &vec_cat) +{ + gsl_vector_set_zero (w); + gsl_vector_set_zero (z); + vec_cat.clear(); + + string rs, a1; + size_t c=0; + if (msnpInfo.size()==0) { + for (size_t i=0; i &mapRS2wA) +{ + string rs; + if (msnpInfo.size()==0) { + for (size_t i=0; i p_column; //which phenotype column needs analysis size_t d_pace; //display pace - string file_bfile; - string file_geno; + string file_bfile, file_mbfile; + string file_geno, file_mgeno; string file_pheno; string file_anno; //optional string file_gxe; //optional string file_cvt; //optional - string file_cat; + string file_cat, file_mcat; string file_var; string file_beta; string file_cor; - string file_kin; + string file_kin, file_mk; string file_ku, file_kd; - string file_mk; - string file_q, file_mq; - string file_s, file_ms; - string file_v, file_mv; - string file_weight; + string file_study, file_mstudy; + string file_ref, file_mref; + string file_weight, file_wsnp, file_wcat; string file_out; string path_out; @@ -165,7 +165,7 @@ public: size_t n_region; double l_mle_null, l_remle_null; double logl_mle_H0, logl_remle_H0; - double pve_null, pve_se_null; + double pve_null, pve_se_null, pve_total, se_pve_total; double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null; vector Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; vector VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; @@ -185,6 +185,8 @@ public: vector v_sigma2; vector v_se_sigma2; + vector v_enrich; + vector v_se_enrich; vector v_beta; vector v_se_beta; @@ -210,15 +212,18 @@ public: size_t window_bp; size_t window_ns; + //vc related parameters + size_t n_block; + // Summary statistics bool error; - size_t ni_total, ni_test, ni_cvt; //number of individuals + size_t ni_total, ni_test, ni_cvt, ni_study, ni_ref; //number of individuals size_t np_obs, np_miss; //number of observed and missing phenotypes - size_t ns_total, ns_test; //number of snps + size_t ns_total, ns_test, ns_study, ns_ref; //number of snps size_t ng_total, ng_test; //number of genes size_t ni_control, ni_case; //number of controls and number of cases size_t ni_subsample; //number of subsampled individuals - size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel + //size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel size_t n_cvt; //number of covariates size_t n_ph; //number of phenotypes size_t n_vc; //number of variance components (including the diagonal matrix) @@ -240,6 +245,7 @@ public: vector > indicator_pheno; //a matrix record when a phenotype is missing for an individual; 0 missing, 1 available vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + vector< vector > mindicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis vector indicator_cvt; //indicator for covariates, 0 missing, 1 available for analysis vector indicator_gxe; //indicator for gxe, 0 missing, 1 available for analysis vector indicator_weight; //indicator for weight, 0 missing, 1 available for analysis @@ -256,9 +262,11 @@ public: map mapRS2cM; //map rs# to cM map mapRS2est; //map rs# to parameters map mapRS2cat; //map rs# to category number - map mapRS2var; //map rs# to category number + map mapRS2wsnp; //map rs# to snp weights + map > mapRS2wcat; //map rs# to snp cat weights vector snpInfo; //record SNP information + vector< vector > msnpInfo; //record SNP information set setSnps; //a set of snps for analysis //constructor @@ -279,12 +287,16 @@ public: void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag); void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag); void CalcKin (gsl_matrix *matrix_kin); - void CalcS (gsl_matrix *S, gsl_matrix *Svar, gsl_matrix *Q); + void CalcS (const map &mapRS2wA, const map &mapRS2wK, const gsl_matrix *W, gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, gsl_vector *ns); void WriteVector (const gsl_vector *q, const gsl_vector *s, const size_t n_total, const string suffix); void WriteVar (const string suffix); void WriteMatrix (const gsl_matrix *matrix_U, const string suffix); void WriteVector (const gsl_vector *vector_D, const string suffix); void CopyRead (gsl_vector *log_N); + void ObtainWeight (const set &setSnps_beta, map &mapRS2wK); + void UpdateWeight (const size_t pve_flag, const map &mapRS2wK, const size_t ni_test, const gsl_vector *ns, map &mapRS2wA); + void UpdateSNPnZ (const map &mapRS2wA, const map &mapRS2A1, const map &mapRS2z, gsl_vector *w, gsl_vector *z, vector &vec_cat); + void UpdateSNP (const map &mapRS2wA); }; diff --git a/src/vc.cpp b/src/vc.cpp index 77cf746..94bf931 100644 --- a/src/vc.cpp +++ b/src/vc.cpp @@ -1,17 +1,17 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see . */ @@ -26,8 +26,12 @@ #include #include #include -#include +#include #include +#include +#include +#include +#include #include #include "gsl/gsl_vector.h" @@ -39,9 +43,14 @@ #include "gsl/gsl_multiroots.h" #include "gsl/gsl_min.h" +#include "Eigen/Dense" + +#include "param.h" #include "io.h" #include "lapack.h" +#include "eigenlib.h" #include "gzstream.h" +#include "mathfunc.h" #ifdef FORCE_FLOAT #include "lmm_float.h" @@ -54,95 +63,194 @@ using namespace std; - +using namespace Eigen; //in this file, X, Y are already transformed (i.e. UtX and UtY) -void VC::CopyFromParam (PARAM &cPar) -{ - file_out=cPar.file_out; - - // v_sigma2=cPar.v_sigma2; - - time_UtX=0.0; - time_opt=0.0; +void VC::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + + file_cat=cPar.file_cat; + file_beta=cPar.file_beta; + file_cor=cPar.file_cor; - v_traceG=cPar.v_traceG; - - return; + setSnps=cPar.setSnps; + + file_out=cPar.file_out; + path_out=cPar.path_out; + + //v_sigma2=cPar.v_sigma2; + + time_UtX=0.0; + time_opt=0.0; + + v_traceG=cPar.v_traceG; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ns_test=cPar.ns_test; + + crt=cPar.crt; + window_cm=cPar.window_cm; + window_bp=cPar.window_bp; + window_ns=cPar.window_ns; + + n_vc=cPar.n_vc; + + return; } -void VC::CopyToParam (PARAM &cPar) +void VC::CopyToParam (PARAM &cPar) { cPar.time_UtX=time_UtX; - cPar.time_opt=time_opt; - - cPar.v_sigma2=v_sigma2; - cPar.v_se_sigma2=v_se_sigma2; + cPar.time_opt=time_opt; + cPar.v_pve=v_pve; cPar.v_se_pve=v_se_pve; + cPar.v_sigma2=v_sigma2; + cPar.v_se_sigma2=v_se_sigma2; + cPar.pve_total=pve_total; + cPar.se_pve_total=se_pve_total; cPar.v_traceG=v_traceG; - + cPar.v_beta=v_beta; cPar.v_se_beta=v_se_beta; - + + cPar.ni_total=ni_total; + cPar.ns_total=ns_total; + cPar.ns_test=ns_test; + + cPar.n_vc=n_vc; + + return; +} + + + +void VC::WriteFile_qs (const gsl_vector *s_vec, const gsl_vector *q_vec, const gsl_vector *qvar_vec, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".qvec.txt"; + + ofstream outfile_q (file_str.c_str(), ofstream::out); + if (!outfile_q) {cout<<"error writing file: "<size; i++) { + outfile_q<size; i++) { + outfile_q<size; i++) { + outfile_q<size1; i++) { + for (size_t j=0; jsize2; j++) { + outfile_s<size1; i++) { + for (size_t j=0; jsize2; j++) { + outfile_s<K)->size1, n_vc=log_sigma2->size-1, n_cvt=(p->W)->size2; - + gsl_matrix *K_temp=gsl_matrix_alloc(n1, n1); gsl_matrix *HiW=gsl_matrix_alloc(n1, n_cvt); gsl_matrix *WtHiW=gsl_matrix_alloc(n_cvt, n_cvt); gsl_matrix *WtHiWi=gsl_matrix_alloc(n_cvt, n_cvt); gsl_matrix *WtHiWiWtHi=gsl_matrix_alloc(n_cvt, n1); - double sigma2; + double sigma2; //calculate H=\sum_i^{k+1} \sigma_i^2 K_i gsl_matrix_set_zero (p->P); for (size_t i=0; iK, 0, n1*i, n1, n1); gsl_matrix_memcpy (K_temp, &K_sub.matrix); } - sigma2=exp(gsl_vector_get (log_sigma2, i) ); + //when unconstrained, update on sigma2 instead of log_sigma2 + if (p->noconstrain) { + sigma2=gsl_vector_get (log_sigma2, i); + } else { + sigma2=exp(gsl_vector_get (log_sigma2, i) ); + } gsl_matrix_scale(K_temp, sigma2); gsl_matrix_add (p->P, K_temp); } //calculate H^{-1} + /* int sig; gsl_permutation * pmt1=gsl_permutation_alloc (n1); - LUDecomp (p->P, pmt1, &sig); + LUDecomp (p->P, pmt1, &sig); LUInvert (p->P, pmt1, K_temp); gsl_permutation_free(pmt1); gsl_matrix_memcpy (p->P, K_temp); + */ + eigenlib_invert(p->P); //calculate P=H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1} - gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW); - gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW); + //gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW); + //gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW); + + eigenlib_dgemm ("N", "N", 1.0, p->P, p->W, 0.0, HiW); + eigenlib_dgemm ("T", "N", 1.0, p->W, HiW, 0.0, WtHiW); - gsl_permutation * pmt2=gsl_permutation_alloc (n_cvt); - LUDecomp (WtHiW, pmt2, &sig); - LUInvert (WtHiW, pmt2, WtHiWi); - gsl_permutation_free(pmt2); + //gsl_permutation * pmt2=gsl_permutation_alloc (n_cvt); + //LUDecomp (WtHiW, pmt2, &sig); + //LUInvert (WtHiW, pmt2, WtHiWi); + //gsl_permutation_free(pmt2); + eigenlib_invert(WtHiW); + gsl_matrix_memcpy(WtHiWi, WtHiW); + + //gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); + //gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P); + eigenlib_dgemm ("N", "T", 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); + eigenlib_dgemm ("N", "N", -1.0, HiW, WtHiWiWtHi, 1.0, p->P); - gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); - gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P); - //calculate Py, KPy, PKPy - gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py); + gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py); + //eigenlib_dgemv("N", 1.0, p->P, p->y, 0.0, p->Py); + double d; for (size_t i=0; iKPy_mat, i); gsl_vector_view PKPy=gsl_matrix_column (p->PKPy_mat, i); @@ -150,11 +258,22 @@ void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p) if (i==n_vc) { gsl_vector_memcpy (&KPy.vector, p->Py); } else { - gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); + gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); + //seems to be important to use gsl dgemv here instead of eigenlib_dgemv; otherwise gsl_blas_dgemv(CblasNoTrans, 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector); + //eigenlib_dgemv("N", 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector); } - + gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector); + //eigenlib_dgemv("N", 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector); + + //when phenotypes are not normalized well, then some values in the following matrix maybe nan; change that to 0; this seems to only happen when eigenlib_dgemv was used above + for (size_t j=0; jKPy_mat->size1; j++) { + d=gsl_matrix_get (p->KPy_mat, j, i); + if (std::isnan(d)) {gsl_matrix_set (p->KPy_mat, j, i, 0); cout<<"nan appears in "<PKPy_mat, j, i); + if (std::isnan(d)) {gsl_matrix_set (p->PKPy_mat, j, i, 0); cout<<"nan appears in "<K)->size1, n_vc=log_sigma2->size-1; - + double tr, d; //update parameters @@ -199,8 +318,12 @@ int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1) gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); gsl_blas_ddot(p->Py, &KPy_i.vector, &d); - d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i)); - + if (p->noconstrain) { + d=(-0.5*tr+0.5*d); + } else { + d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i)); + } + gsl_vector_set(dev1, i, d); } @@ -214,32 +337,47 @@ int LogRL_dev2 (const gsl_vector *log_sigma2, void *params, gsl_matrix *dev2) VC_PARAM *p=(VC_PARAM *) params; size_t n_vc=log_sigma2->size-1; - + double d, sigma2_i, sigma2_j; //update parameters UpdateParam (log_sigma2, p); - + //calculate dev2=0.5(yPKPKPy) for (size_t i=0; iKPy_mat, i); - sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + if (p->noconstrain) { + sigma2_i=gsl_vector_get(log_sigma2, i); + } else { + sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + } for (size_t j=i; jPKPy_mat, j); gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); - sigma2_j=exp(gsl_vector_get(log_sigma2, j)); - - d*=-0.5*sigma2_i*sigma2_j; + if (p->noconstrain) { + sigma2_j=gsl_vector_get(log_sigma2, j); + d*=-0.5; + } else { + sigma2_j=exp(gsl_vector_get(log_sigma2, j)); + d*=-0.5*sigma2_i*sigma2_j; + } gsl_matrix_set(dev2, i, j, d); if (j!=i) {gsl_matrix_set(dev2, j, i, d);} - } + } } gsl_matrix_memcpy (p->Hessian, dev2); - + /* + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + cout<K)->size1, n_vc=log_sigma2->size-1; - + double tr, d, sigma2_i, sigma2_j; //update parameters UpdateParam (log_sigma2, p); - //calculate dev1=-0.5*trace(PK_i)+0.5*yPKPy - //calculate dev2=0.5(yPKPKPy) + //calculate dev1=(-0.5*trace(PK_i)+0.5*yPK_iPy)*sigma2_i + //calculate dev2=0.5(yPK_iPK_jPy)*sigma2_i*sigma2_j for (size_t i=0; iKPy_mat, i); gsl_blas_ddot(p->Py, &KPy_i.vector, &d); - sigma2_i=exp(gsl_vector_get(log_sigma2, i)); - d=(-0.5*tr+0.5*d)*sigma2_i; - + if (p->noconstrain) { + sigma2_i=gsl_vector_get(log_sigma2, i); + d=(-0.5*tr+0.5*d); + } else { + sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + d=(-0.5*tr+0.5*d)*sigma2_i; + } + gsl_vector_set(dev1, i, d); - + for (size_t j=i; jPKPy_mat, j); gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); - sigma2_j=exp(gsl_vector_get(log_sigma2, j)); - d*=-0.5*sigma2_i*sigma2_j; + if (p->noconstrain) { + sigma2_j=gsl_vector_get(log_sigma2, j); + d*=-0.5; + } else { + sigma2_j=exp(gsl_vector_get(log_sigma2, j)); + d*=-0.5*sigma2_i*sigma2_j; + } gsl_matrix_set(dev2, i, j, d); if (j!=i) {gsl_matrix_set(dev2, j, i, d);} - } + } } @@ -303,141 +451,2049 @@ int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, g -void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) + +//read header to determine which column contains which item +bool ReadHeader (const string &line, HEADER &header) { - size_t n1=K->size1, n2=K->size2; - size_t n_vc=n2/n1; - gsl_vector *log_sigma2=gsl_vector_alloc (n_vc+1); - double d, s; + string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID"}; + set rs_set(rs_ptr, rs_ptr+10); + string chr_ptr[]={"chr","CHR"}; + set chr_set(chr_ptr, chr_ptr+2); + string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"}; + set pos_set(pos_ptr, pos_ptr+8); + string cm_ptr[]={"cm","CM"}; + set cm_set(cm_ptr, cm_ptr+2); + string a1_ptr[]={"a1","A1","allele1","ALLELE1"}; + set a1_set(a1_ptr, a1_ptr+4); + string a0_ptr[]={"a0","A0","allele0","ALLELE0"}; + set a0_set(a0_ptr, a0_ptr+4); + + string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"}; + set z_set(z_ptr, z_ptr+6); + string beta_ptr[]={"beta","BETA","b","B"}; + set beta_set(beta_ptr, beta_ptr+4); + string sebeta_ptr[]={"se_beta","SE_BETA","se","SE"}; + set sebeta_set(sebeta_ptr, sebeta_ptr+4); + string chisq_ptr[]={"chisq","CHISQ","chisquare","CHISQUARE"}; + set chisq_set(chisq_ptr, chisq_ptr+4); + string p_ptr[]={"p","P","pvalue","PVALUE","p-value","P-VALUE"}; + set p_set(p_ptr, p_ptr+6); + + string n_ptr[]={"n","N","ntotal","NTOTAL","n_total","N_TOTAL"}; + set n_set(n_ptr, n_ptr+6); + string nmis_ptr[]={"nmis","NMIS","n_mis","N_MIS","n_miss","N_MISS"}; + set nmis_set(nmis_ptr, nmis_ptr+6); + string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"}; + set nobs_set(nobs_ptr, nobs_ptr+4); + + string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"}; + set af_set(af_ptr, af_ptr+10); + string var_ptr[]={"var","VAR"}; + set var_set(var_ptr, var_ptr+2); + + string ws_ptr[]={"window_size","WINDOW_SIZE","ws","WS"}; + set ws_set(ws_ptr, ws_ptr+4); + string cor_ptr[]={"cor","COR","r","R"}; + set cor_set(cor_ptr, cor_ptr+4); + + header.rs_col=0; header.chr_col=0; header.pos_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; + + char *ch_ptr; + string type; + size_t n_error=0; + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + while (ch_ptr!=NULL) { + type=ch_ptr; + if (rs_set.count(type)!=0) { + if (header.rs_col==0) {header.rs_col=header.coln+1;} else {cout<<"error! more than two rs columns in the file."< &setSnps, vector &vec_rs, vector &vec_n, vector &vec_cm, vector &vec_bp, map &mapRS2in, map &mapRS2var) +{ + vec_rs.clear(); + vec_n.clear(); + mapRS2in.clear(); + mapRS2var.clear(); + + igzstream infile (file_cor.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open cov file: "<0) {vec_cm.push_back(d_cm);} else {vec_cm.push_back(d_cm);} + if (d_pos>0) {vec_bp.push_back(d_pos);} else {vec_bp.push_back(d_pos);} + + //record mapRS2in and mapRS2var + if (setSnps.size()==0 || setSnps.count(rs)!=0) { + if (mapRS2in.count(rs)==0) { + mapRS2in[rs]=1; + + if (header.var_col!=0) { + mapRS2var[rs]=var_x; + } else if (header.af_col!=0) { + var_x=2.0*af*(1.0-af); + mapRS2var[rs]=var_x; + } else {} + + ns_test++; + + } else { + cout<<"error! more than one snp has the same id "< &mapRS2cat, map &mapRS2in, map &mapRS2var, map &mapRS2nsamp, gsl_vector *q_vec, gsl_vector *qvar_vec, gsl_vector *s_vec, size_t &ni_total, size_t &ns_total) +{ + mapRS2nsamp.clear(); + + igzstream infile (file_beta.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open beta file: "< vec_q, vec_qvar, vec_s; + for (size_t i=0; isize; i++) { + vec_q.push_back(0.0); + vec_qvar.push_back(0.0); + vec_s.push_back(0.0); + } + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + if (header.n_col==0 ) { + if (header.nobs_col==0 && header.nmis_col==0) { + cout<<"error! missing sample size in the beta file."<f, 1e-3); + if (header.n_col==0) { + n_total=n_mis+n_obs; + } + + //both z values and beta/se_beta have directions, while chisq/pvalue do not + if (header.z_col!=0) { + zsquare=z*z; + } else if (header.beta_col!=0 && header.sebeta_col!=0) { + z=beta/se_beta; + zsquare=z*z; + } else if (header.chisq_col!=0) { + zsquare=chisq; + } else if (header.p_col!=0) { + zsquare=gsl_cdf_chisq_Qinv (pvalue, 1); + } else {zsquare=0;} + + //if the snp is also present in cor file, then do calculations + if ((header.var_col!=0 || header.af_col!=0 || mapRS2var.count(rs)!=0) && mapRS2in.count(rs)!=0 && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) { + if (mapRS2in.at(rs)>1) { + cout<<"error! more than one snp has the same id "<f, ¶ms, dev1, dev2); + for (size_t i=0; isize; i++) { + gsl_vector_set(q_vec, i, vec_q[i]); + gsl_vector_set(qvar_vec, i, 2.0*vec_qvar[i]); + gsl_vector_set(s_vec, i, vec_s[i]); + } - gsl_permutation * pmt=gsl_permutation_alloc (n_vc+1); - LUDecomp (dev2, pmt, &sig); - LUInvert (dev2, pmt, Hessian); - gsl_permutation_free(pmt); - //save data - v_sigma2.clear(); - for (size_t i=0; ix, i)); - v_sigma2.push_back(d); + infile.clear(); + infile.close(); + + return; +} + + + + + +//read covariance file the second time +//look for rs, n_mis+n_obs, var, window_size, cov +//if window_cm/bp/ns is provided, then use these max values to calibrate estimates +void ReadFile_cor (const string &file_cor, const vector &vec_rs, const vector &vec_n, const vector &vec_cm, const vector &vec_bp, const map &mapRS2cat, const map &mapRS2in, const map &mapRS2var, const map &mapRS2nsamp, const size_t crt, const double &window_cm, const double &window_bp, const double &window_ns, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *qvar_vec, size_t &ni_total, size_t &ns_total, size_t &ns_test, size_t &ns_pair) +{ + igzstream infile (file_cor.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open cov file: "< > mat_S, mat_Svar, mat_tmp; + vector vec_qvar, vec_tmp; + vector > > mat3d_Sbin; + + for (size_t i=0; isize1; i++) { + vec_qvar.push_back(0.0); } - v_se_sigma2.clear(); - for (size_t i=0; isize1; i++) { + mat_S.push_back(vec_qvar); + mat_Svar.push_back(vec_qvar); } - s=0; - for (size_t i=0; isize1; i++) { + mat_tmp.push_back(vec_tmp); + } + for (size_t i=0; isize1; i++) { + mat3d_Sbin.push_back(mat_tmp); } - v_se_pve.clear(); - for (size_t i=0; isize1; i++) { + for (size_t j=i; jsize2; j++) { + + //correct mat_S + n=0; var_y=0; var_x=0; mean_y=0; mean_x=0; cov_xy=0; + for (size_t k=0; k0) { + y=1/sqrt(y); + mean_x+=x; mean_y+=y; var_x+=x*x; var_y+=y*y; cov_xy+=x*y; + n++; + } + } + cout<=5) { + mean_x/=n; mean_y/=n; var_x/=n; var_y/=n; cov_xy/=n; + var_x-=mean_x*mean_x; var_y-=mean_y*mean_y; cov_xy-=mean_x*mean_y; + b=cov_xy/var_x; + a=mean_y-b*mean_x; + crt_factor=a/(b*(bin_size+0.5))+1; + if (i==j) { + mat_S[i][j]*=crt_factor; + } else { + mat_S[i][j]*=crt_factor; mat_S[j][i]*=crt_factor; + } + cout<size1; i++) { + d1=gsl_vector_get(qvar_vec, i)+2*vec_qvar[i]; + gsl_vector_set(qvar_vec, i, d1); + for (size_t j=0; jsize2; j++) { + if (i==j) { + gsl_matrix_set(S_mat, i, j, mat_S[i][i]); + gsl_matrix_set(Svar_mat, i, j, 2.0*mat_Svar[i][i]*ns_test*ns_test/(2.0*ns_pair) ); + } else { + gsl_matrix_set(S_mat, i, j, mat_S[i][j]+mat_S[j][i]); + gsl_matrix_set(Svar_mat, i, j, 2.0*(mat_Svar[i][j]+mat_Svar[j][i])*ns_test*ns_test/(2.0*ns_pair) ); + } + } + } + + + + infile.clear(); + infile.close(); return; } - +//copied from lmm.cpp; is used in the following function VCss +//map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 +size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { + if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + + +//use the new method to calculate variance components with summary statistics +//first, use a function CalcS to compute S matrix (where the diagonal elements are part of V(q) ), and then use bootstrap to compute the variance for S, use a set of genotypes, phenotypes, and individual ids, and snp category label +void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector &v_pve, vector &v_se_pve, double &pve_total, double &se_pve_total, vector &v_sigma2, vector &v_se_sigma2, vector &v_enrich, vector &v_se_enrich) { + size_t n_vc=S_mat->size1; + + gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat1=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *VarEnrich_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); + + gsl_vector *pve=gsl_vector_alloc (n_vc); + gsl_vector *pve_plus=gsl_vector_alloc (n_vc+1); + gsl_vector *tmp=gsl_vector_alloc (n_vc+1); + gsl_vector *sigma2persnp=gsl_vector_alloc (n_vc); + gsl_vector *enrich=gsl_vector_alloc (n_vc); + gsl_vector *se_pve=gsl_vector_alloc (n_vc); + gsl_vector *se_sigma2persnp=gsl_vector_alloc (n_vc); + gsl_vector *se_enrich=gsl_vector_alloc (n_vc); + + double d; + + //calculate S^{-1}q + gsl_matrix_memcpy (tmp_mat, S_mat); + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (n_vc); + LUDecomp (tmp_mat, pmt, &sig); + LUInvert (tmp_mat, pmt, Si_mat); + + //calculate sigma2snp and pve + gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve); + gsl_vector_memcpy(sigma2persnp, pve); + gsl_vector_div(sigma2persnp, s_vec); + + //get qvar_mat + /* + if (n_block==0 || n_block==1) { + double s=1.0; + for (size_t i=0; isize1, n2=K->size2; + size_t n_vc=n2/n1; + + double r=(double)n1/(double)(n1 - W->size2); + double var_y, var_y_new; + double d, tr, s, v; + vector traceG_new; + + //new matrices/vectors + gsl_matrix *K_scale=gsl_matrix_alloc (n1, n2); + gsl_vector *y_scale=gsl_vector_alloc (n1); + gsl_matrix *Kry=gsl_matrix_alloc (n1, n_vc); + gsl_matrix *yKrKKry=gsl_matrix_alloc (n_vc, n_vc*(n_vc+1) ); + gsl_vector *KKry=gsl_vector_alloc (n1); + + //old matrices/vectors + gsl_vector *pve=gsl_vector_alloc (n_vc); + gsl_vector *se_pve=gsl_vector_alloc (n_vc); + gsl_vector *q_vec=gsl_vector_alloc (n_vc); + gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *S_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); + + //center and scale K by W + for (size_t i=0; i1) { + cout<<"total pve = "<size1); + time_start=clock(); + for (size_t i=0; i<1000; i++) { + gsl_blas_dgemv(CblasNoTrans, 1.0, K2, &W_col.vector, 0.0, v); + } + cout<<"standard time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<size1, K->size1); + gsl_matrix *K3=gsl_matrix_alloc(K->size1, K->size1); + + gsl_matrix_memcpy(K2copy, K2); + time_start=clock(); + EigenDecomp(K2copy, K3, v, 0); + cout<<"standard time 0: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<size1); + LUDecomp (K2copy, pmt1, &sigcopy); + LUInvert (K2copy, pmt1, K3); + gsl_permutation_free(pmt1); + cout<<"standard time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<x, i))<<" "; + } + } + cout<f, 1e-3); + } + while (status==GSL_CONTINUE && iterx, ¶ms, dev1, dev2); + /* + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + cout<x, i); + } else { + d=exp(gsl_vector_get(s_fdf->x, i)); + } + v_sigma2.push_back(d); + + if (noconstrain) { + d=-1.0*gsl_matrix_get(Hessian, i, i); + } else { + d=-1.0*d*d*gsl_matrix_get(Hessian, i, i); + } + v_se_sigma2.push_back(sqrt(d)); + } + + s=0; + for (size_t i=0; ix, i); + d1=1; + } else { + d1=exp(gsl_vector_get(s_fdf->x, i)); + } + + if (kx, j); + d2=1; + } else { + d2=exp(gsl_vector_get(s_fdf->x, j)); + } + + if (k &indicator_idv, vector &indicator_snp, const vector &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz) +{ + igzstream infile (file_geno.c_str(), igzstream::in); + //ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1; + gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *geno_miss=gsl_vector_alloc (ni_test); + gsl_vector *wz=gsl_vector_alloc (w->size); + gsl_vector_memcpy (wz, z); + gsl_vector_mul(wz, w); + + for (size_t t=0; t &indicator_idv, vector &indicator_snp, const vector &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + size_t n_miss, ci_total, ci_test; + double d, geno_mean, geno_var; + + size_t ni_test=XWz->size1; + size_t ni_total=indicator_idv.size(); + gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *wz=gsl_vector_alloc (w->size); + gsl_vector_memcpy (wz, z); + gsl_vector_mul(wz, w); + + int n_bit; + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (size_t t=0; t &indicator_idv, vector > &mindicator_snp, const vector &vec_cat, const gsl_vector *w, const gsl_vector *z, gsl_matrix *XWz) +{ + gsl_matrix_set_zero(XWz); + + igzstream infile (file_mfile.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mfile file: "< &indicator_idv, vector &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz) +{ + igzstream infile (file_geno.c_str(), igzstream::in); + //ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1; + gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *geno_miss=gsl_vector_alloc (ni_test); + + for (size_t t=0; tsize2; i++) { + gsl_vector_const_view XWz_col=gsl_matrix_const_column(XWz, i); + gsl_blas_ddot (geno, &XWz_col.vector, &d); + gsl_matrix_set (XtXWz, ns_test, i, d/sqrt(geno_var)); + } + + ns_test++; + } + + cout< &indicator_idv, vector &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + size_t n_miss, ci_total, ci_test; + double d, geno_mean, geno_var; + + size_t ni_test=XWz->size1; + size_t ni_total=indicator_idv.size(); + gsl_vector *geno=gsl_vector_alloc (ni_test); + + int n_bit; + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (size_t t=0; tsize2; i++) { + gsl_vector_const_view XWz_col=gsl_matrix_const_column(XWz, i); + gsl_blas_ddot (geno, &XWz_col.vector, &d); + gsl_matrix_set (XtXWz, ns_test, i, d/sqrt(geno_var)); + } + + ns_test++; + } + cout< &indicator_idv, vector > &mindicator_snp, const gsl_matrix *XWz, gsl_matrix *XtXWz) +{ + gsl_matrix_set_zero(XtXWz); + + igzstream infile (file_mfile.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mfile file: "< &vec_cat, const vector &v_pve, vector &v_se_pve, double &pve_total, double &se_pve_total, vector &v_sigma2, vector &v_se_sigma2, vector &v_enrich, vector &v_se_enrich) { + size_t n_vc=XWz->size2, ns_test=w->size, ni_test=XWz->size1; + + //set up matrices + gsl_vector *w_pve=gsl_vector_alloc (ns_test); + gsl_vector *wz=gsl_vector_alloc (ns_test); + gsl_vector *zwz=gsl_vector_alloc (n_vc); + gsl_vector *zz=gsl_vector_alloc (n_vc); + gsl_vector *Xz_pve=gsl_vector_alloc (ni_test); + gsl_vector *WXtXWz=gsl_vector_alloc (ns_test); + + gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat1=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *VarEnrich_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); + + double d, s0, s1, s, s_pve, s_snp; + + //compute wz and zwz + gsl_vector_memcpy (wz, z); + gsl_vector_mul (wz, w); + + gsl_vector_set_zero (zwz); + gsl_vector_set_zero (zz); + for (size_t i=0; isize; i++) { + d=gsl_vector_get (wz, i)*gsl_vector_get (z, i); + d+=gsl_vector_get (zwz, vec_cat[i]); + gsl_vector_set (zwz, vec_cat[i], d); + + d=gsl_vector_get (z, i)*gsl_vector_get (z, i); + d+=gsl_vector_get (zz, vec_cat[i]); + gsl_vector_set (zz, vec_cat[i], d); + } + + //compute wz, ve and Xz_pve + gsl_vector_set_zero (Xz_pve); s_pve=0; s_snp=0; + for (size_t i=0; isize; i++) { + d=v_pve[vec_cat[i]]/gsl_vector_get(s_vec, vec_cat[i]); + gsl_vector_set (w_pve, i, d); + } + + //compute Vq (in qvar_mat) + s0=1-s_pve; + for (size_t i=0; i. */ -#ifndef __VC_H__ +#ifndef __VC_H__ #define __VC_H__ #include "gsl/gsl_vector.h" @@ -38,7 +38,7 @@ using namespace std; class VC_PARAM { -public: +public: const gsl_matrix *K; const gsl_matrix *W; const gsl_vector *y; @@ -47,18 +47,34 @@ public: gsl_matrix *KPy_mat; gsl_matrix *PKPy_mat; gsl_matrix *Hessian; + bool noconstrain; }; + class VC { public: // IO related parameters + size_t a_mode; + string file_cat; + string file_beta; + string file_cor; + string file_mq; + string file_ms; + string file_out; string path_out; + set setSnps; + + size_t ni_total_ref, ns_total_ref, ns_pair; + size_t ni_total, ns_total, ns_test; + size_t n_vc; + + double pve_total, se_pve_total; vector v_sigma2; vector v_se_sigma2; vector v_pve; @@ -67,16 +83,33 @@ public: vector v_beta; vector v_se_beta; + size_t crt; + double window_cm, window_bp, window_ns; + double time_UtX; double time_opt; - + // Main functions void CopyFromParam (PARAM &cPar); void CopyToParam (PARAM &cPar); + void WriteFile_qs (const gsl_vector *s_vec, const gsl_vector *q_vec, const gsl_vector *qvar_vec, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat); void CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); - void CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); + void CalcVCreml (const bool noconstrain, const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); }; +void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector &v_pve, vector &v_se_pve, double &pve_total, double &se_pve_total, vector &v_sigma2, vector &v_se_sigma2, vector &v_enrich, vector &v_se_enrich); + + +bool BimbamXwz (const string &file_geno, const int display_pace, vector &indicator_idv, vector &indicator_snp, const vector &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz); +bool PlinkXwz (const string &file_bed, const int display_pace, vector &indicator_idv, vector &indicator_snp, const vector &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz); +bool MFILEXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector &indicator_idv, vector > &mindicator_snp, const vector &vec_cat, const gsl_vector *w, const gsl_vector *z, gsl_matrix *XWz); + +bool BimbamXtXwz (const string &file_geno, const int display_pace, vector &indicator_idv, vector &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz); +bool PlinkXtXwz (const string &file_bed, const int display_pace, vector &indicator_idv, vector &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz); +bool MFILEXtXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector &indicator_idv, vector > &mindicator_snp, const gsl_matrix *XWz, gsl_matrix *XtXWz); + +void CalcCIss(const gsl_matrix *Xz, const gsl_matrix *XWz, const gsl_matrix *XtXWz, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *w, const gsl_vector *z, const gsl_vector *s_vec, const vector &vec_cat, const vector &v_pve, vector &v_se_pve, double &pve_total, double &se_pve_total, vector &v_sigma2, vector &v_se_sigma2, vector &v_enrich, vector &v_se_enrich); + #endif -- cgit v1.2.3