/* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_linalg.h" #include "gsl/gsl_blas.h" #include "gsl/gsl_eigen.h" #include "gsl/gsl_cdf.h" #include "lapack.h" //for functions EigenDecomp #ifdef FORCE_FLOAT #include "io_float.h" //for function ReadFile_kin #include "gemma_float.h" #include "vc_float.h" #include "lm_float.h" //for LM class #include "bslmm_float.h" //for BSLMM class #include "lmm_float.h" //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe #include "mvlmm_float.h" //for MVLMM class #include "prdt_float.h" //for PRDT class #include "mathfunc_float.h" //for a few functions #else #include "io.h" #include "gemma.h" #include "vc.h" #include "lm.h" #include "bslmm.h" #include "lmm.h" #include "mvlmm.h" #include "prdt.h" #include "mathfunc.h" #endif using namespace std; GEMMA::GEMMA(void): version("0.95alpha"), date("08/08/2014"), year("2011") {} void GEMMA::PrintHeader (void) { cout< indicator_all; size_t c_bv=0; for (size_t i=0; isize; i++) { d=gsl_vector_get(y_prdt, i); d=gsl_cdf_gaussian_P(d, 1.0); gsl_vector_set(y_prdt, i, d); } } cPRDT.CopyToParam(cPar); cPRDT.WriteFiles(y_prdt); gsl_vector_free(y_prdt); } //Prediction with kinship matrix only; for one or more phenotypes if (cPar.a_mode==43) { //first, use individuals with full phenotypes to obtain estimates of Vg and Ve gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); gsl_vector *eval=gsl_vector_alloc (Y->size1); gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph); gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt); //set covariates matrix W and phenotype matrix Y //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); cPar.CopyCvtPhen (W_full, Y_full, 1); gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph); gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1); gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2); //read relatedness matrix G, and matrix G_full ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} cPar.trace_G+=gsl_vector_get (eval, i); } cPar.trace_G/=(double)eval->size; cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, Y, UtY); //calculate variance component and beta estimates //and then obtain predicted values if (cPar.n_ph==1) { gsl_vector *beta=gsl_vector_alloc (W->size2); gsl_vector *se_beta=gsl_vector_alloc (W->size2); double lambda, logl, vg, ve; gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); //obtain estimates CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, lambda, logl); CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, lambda, vg, ve, beta, se_beta); cout<<"REMLE estimate for vg in the null model = "<size1; i++) { for (size_t j=0; j<=i; j++) { cout<size1; i++) { for (size_t j=0; j<=i; j++) { cout<size1; i++) { for (size_t j=i; jsize2; j++) { cPar.Vg_remle_null.push_back(gsl_matrix_get (Vg, i, j) ); cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) ); } } //obtain Y_hat from fixed effects gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat); //obtain H KroneckerSym(G_full, Vg, H_full); for (size_t i=0; isize1; i++) { gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2); gsl_matrix_add (&H_sub.matrix, Ve); } //free matrices gsl_matrix_free (Vg); gsl_matrix_free (Ve); gsl_matrix_free (B); gsl_matrix_free (se_B); } PRDT cPRDT; cPRDT.CopyFromParam(cPar); cout<<"Predicting Missing Phentypes ... "<size1, cPar.n_cvt); //set covariates matrix W and phenotype matrix Y //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); //Fit LM or mvLM if (cPar.n_ph==1) { LM cLm; cLm.CopyFromParam(cPar); gsl_vector_view Y_col=gsl_matrix_column (Y, 0); if (!cPar.file_gene.empty()) { cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype } else if (!cPar.file_bfile.empty()) { cLm.AnalyzePlink (W, &Y_col.vector); } else { cLm.AnalyzeBimbam (W, &Y_col.vector); } cLm.WriteFiles(); cLm.CopyToParam(cPar); } /* else { MVLM cMvlm; cMvlm.CopyFromParam(cPar); if (!cPar.file_bfile.empty()) { cMvlm.AnalyzePlink (W, Y); } else { cMvlm.AnalyzeBimbam (W, Y); } cMvlm.WriteFiles(); cMvlm.CopyToParam(cPar); } */ //release all matrices and vectors gsl_matrix_free (Y); gsl_matrix_free (W); } //VC estimation with one or multiple kinship matrices //REML approach only //if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp //for one phenotype only; if (cPar.a_mode==61) { gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc ); //set covariates matrix W and phenotype matrix Y //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); //read kinship matrices if (!(cPar.file_mk).empty()) { ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1, G->size1, G->size1); CenterMatrix (&G_sub.matrix); d=0; for (size_t j=0; jsize1; j++) { d+=gsl_matrix_get (&G_sub.matrix, j, j); } d/=(double)G->size1; (cPar.v_traceG).push_back(d); } } else if (!(cPar.file_kin).empty()) { ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1; j++) { d+=gsl_matrix_get (G, j, j); } d/=(double)G->size1; (cPar.v_traceG).push_back(d); } /* //eigen-decomposition and calculate trace_G cout<<"Start Eigen-Decomposition..."<size; i++) { if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} cPar.trace_G+=gsl_vector_get (eval, i); } cPar.trace_G/=(double)eval->size; cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } else { ReadFile_eigenU (cPar.file_ku, cPar.error, U); if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) { if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} cPar.trace_G+=gsl_vector_get(eval, i); } cPar.trace_G/=(double)eval->size; } */ //fit multiple variance components if (cPar.n_ph==1) { // if (cPar.n_vc==1) { /* //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, Y, UtY); gsl_vector_view beta=gsl_matrix_row (B, 0); gsl_vector_view se_beta=gsl_matrix_row (se_B, 0); gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector); cPar.beta_mle_null.clear(); cPar.se_beta_mle_null.clear(); for (size_t i=0; isize2; i++) { cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) ); cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) ); } CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector); cPar.beta_remle_null.clear(); cPar.se_beta_remle_null.clear(); for (size_t i=0; isize2; i++) { cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); } CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); cPar.PrintSummary(); //calculate and output residuals if (cPar.a_mode==5) { gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1); gsl_vector *u_hat=gsl_vector_alloc (Y->size1); gsl_vector *e_hat=gsl_vector_alloc (Y->size1); gsl_vector *y_hat=gsl_vector_alloc (Y->size1); //obtain Utu and Ute gsl_vector_memcpy (y_hat, &UtY_col.vector); gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); double d, u, e; for (size_t i=0; isize; i++) { d=gsl_vector_get (eval, i); u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); gsl_vector_set (Utu_hat, i, u); gsl_vector_set (Ute_hat, i, e); } //obtain u and e gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); //output residuals cPar.WriteVector(u_hat, "residU"); cPar.WriteVector(e_hat, "residE"); gsl_vector_free(u_hat); gsl_vector_free(e_hat); gsl_vector_free(y_hat); } */ // } else { gsl_vector_view Y_col=gsl_matrix_column (Y, 0); VC cVc; cVc.CopyFromParam(cPar); cVc.CalcVCreml (G, W, &Y_col.vector); cVc.CopyToParam(cPar); //obtain pve from sigma2 //obtain se_pve from se_sigma2 //} } } //LMM or mvLMM or Eigen-Decomposition if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==31) { //Fit LMM or mvLMM or eigen gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); gsl_matrix *B=gsl_matrix_alloc (Y->size2, W->size2); //B is a d by c matrix gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); gsl_vector *eval=gsl_vector_alloc (Y->size1); //set covariates matrix W and phenotype matrix Y //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); //read relatedness matrix G if (!(cPar.file_kin).empty()) { ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} cPar.trace_G+=gsl_vector_get (eval, i); } cPar.trace_G/=(double)eval->size; cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } else { ReadFile_eigenU (cPar.file_ku, cPar.error, U); if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) { if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} cPar.trace_G+=gsl_vector_get(eval, i); } cPar.trace_G/=(double)eval->size; } if (cPar.a_mode==31) { cPar.WriteMatrix(U, "eigenU"); cPar.WriteVector(eval, "eigenD"); } else { //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, Y, UtY); //calculate REMLE/MLE estimate and pve for univariate model if (cPar.n_ph==1) { gsl_vector_view beta=gsl_matrix_row (B, 0); gsl_vector_view se_beta=gsl_matrix_row (se_B, 0); gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector); cPar.beta_mle_null.clear(); cPar.se_beta_mle_null.clear(); for (size_t i=0; isize2; i++) { cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) ); cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) ); } CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector); cPar.beta_remle_null.clear(); cPar.se_beta_remle_null.clear(); for (size_t i=0; isize2; i++) { cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); } CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); cPar.PrintSummary(); //calculate and output residuals if (cPar.a_mode==5) { gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1); gsl_vector *u_hat=gsl_vector_alloc (Y->size1); gsl_vector *e_hat=gsl_vector_alloc (Y->size1); gsl_vector *y_hat=gsl_vector_alloc (Y->size1); //obtain Utu and Ute gsl_vector_memcpy (y_hat, &UtY_col.vector); gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); double d, u, e; for (size_t i=0; isize; i++) { d=gsl_vector_get (eval, i); u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); gsl_vector_set (Utu_hat, i, u); gsl_vector_set (Ute_hat, i, e); } //obtain u and e gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); //output residuals cPar.WriteVector(u_hat, "residU"); cPar.WriteVector(e_hat, "residE"); gsl_vector_free(u_hat); gsl_vector_free(e_hat); gsl_vector_free(y_hat); } } //Fit LMM or mvLMM if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) { if (cPar.n_ph==1) { LMM cLmm; cLmm.CopyFromParam(cPar); gsl_vector_view Y_col=gsl_matrix_column (Y, 0); gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); if (!cPar.file_gene.empty()) { cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype } else if (!cPar.file_bfile.empty()) { cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); } else { cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); } cLmm.WriteFiles(); cLmm.CopyToParam(cPar); } else { MVLMM cMvlmm; cMvlmm.CopyFromParam(cPar); if (!cPar.file_bfile.empty()) { cMvlmm.AnalyzePlink (U, eval, UtW, UtY); } else { cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY); } cMvlmm.WriteFiles(); cMvlmm.CopyToParam(cPar); } } } //release all matrices and vectors gsl_matrix_free (Y); gsl_matrix_free (W); gsl_matrix_free(B); gsl_matrix_free(se_B); gsl_matrix_free (G); gsl_matrix_free (U); gsl_matrix_free (UtW); gsl_matrix_free (UtY); gsl_vector_free (eval); } //BSLMM if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { gsl_vector *y=gsl_vector_alloc (cPar.ni_test); gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (y->size, y->size); gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test); //set covariates matrix W and phenotype vector y //an intercept should be included in W, cPar.CopyCvtPhen (W, y, 0); //center y, even for case/control data cPar.pheno_mean=CenterVector(y); //run bslmm if rho==1 if (cPar.rho_min==1 && cPar.rho_max==1) { //read genotypes X (not UtX) cPar.ReadGenotypes (UtX, G, false); //perform BSLMM analysis BSLMM cBslmm; cBslmm.CopyFromParam(cPar); time_start=clock(); cBslmm.MCMC(UtX, y); cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); cBslmm.CopyToParam(cPar); //else, if rho!=1 } else { gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); gsl_vector *eval=gsl_vector_alloc (y->size); gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2); gsl_vector *Uty=gsl_vector_alloc (y->size); //read relatedness matrix G if (!(cPar.file_kin).empty()) { cPar.ReadGenotypes (UtX, G, false); //read relatedness matrix G ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} cPar.trace_G+=gsl_vector_get (eval, i); } cPar.trace_G/=(double)eval->size; cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, y, Uty); //calculate REMLE/MLE estimate and pve CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); cPar.PrintSummary(); //Creat and calcualte UtX, use a large memory cout<<"Calculating UtX..."<tm_month<<":"<tm_day":"<tm_hour<<":"<tm_min<=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) { outfile<<"## time on optimization = "<