/*
Genome-wide Efficient Mixed Model Association (GEMMA)
Copyright (C) 2011 Xiang Zhou
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include
#include
#include
#include
#include "gsl/gsl_vector.h"
#include "gsl/gsl_matrix.h"
#include "gsl/gsl_linalg.h"
#include "gsl/gsl_blas.h"
#include "gsl/gsl_eigen.h"
#include "gsl/gsl_cdf.h"
#include "lapack.h" //for functions EigenDecomp
#ifdef FORCE_FLOAT
#include "io_float.h" //for function ReadFile_kin
#include "gemma_float.h"
#include "vc_float.h"
#include "lm_float.h" //for LM class
#include "bslmm_float.h" //for BSLMM class
#include "lmm_float.h" //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe
#include "mvlmm_float.h" //for MVLMM class
#include "prdt_float.h" //for PRDT class
#include "mathfunc_float.h" //for a few functions
#else
#include "io.h"
#include "gemma.h"
#include "vc.h"
#include "lm.h"
#include "bslmm.h"
#include "lmm.h"
#include "mvlmm.h"
#include "prdt.h"
#include "mathfunc.h"
#endif
using namespace std;
GEMMA::GEMMA(void):
version("0.95alpha"), date("08/08/2014"), year("2011")
{}
void GEMMA::PrintHeader (void)
{
cout< indicator_all;
size_t c_bv=0;
for (size_t i=0; isize; i++) {
d=gsl_vector_get(y_prdt, i);
d=gsl_cdf_gaussian_P(d, 1.0);
gsl_vector_set(y_prdt, i, d);
}
}
cPRDT.CopyToParam(cPar);
cPRDT.WriteFiles(y_prdt);
gsl_vector_free(y_prdt);
}
//Prediction with kinship matrix only; for one or more phenotypes
if (cPar.a_mode==43) {
//first, use individuals with full phenotypes to obtain estimates of Vg and Ve
gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1);
gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1);
gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2);
gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2);
gsl_vector *eval=gsl_vector_alloc (Y->size1);
gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph);
gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt);
//set covariates matrix W and phenotype matrix Y
//an intercept should be included in W,
cPar.CopyCvtPhen (W, Y, 0);
cPar.CopyCvtPhen (W_full, Y_full, 1);
gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph);
gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1);
gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2);
//read relatedness matrix G, and matrix G_full
ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) {
if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
cPar.trace_G+=gsl_vector_get (eval, i);
}
cPar.trace_G/=(double)eval->size;
cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
//calculate UtW and Uty
CalcUtX (U, W, UtW);
CalcUtX (U, Y, UtY);
//calculate variance component and beta estimates
//and then obtain predicted values
if (cPar.n_ph==1) {
gsl_vector *beta=gsl_vector_alloc (W->size2);
gsl_vector *se_beta=gsl_vector_alloc (W->size2);
double lambda, logl, vg, ve;
gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
//obtain estimates
CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, lambda, logl);
CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, lambda, vg, ve, beta, se_beta);
cout<<"REMLE estimate for vg in the null model = "<size1; i++) {
for (size_t j=0; j<=i; j++) {
cout<size1; i++) {
for (size_t j=0; j<=i; j++) {
cout<size1; i++) {
for (size_t j=i; jsize2; j++) {
cPar.Vg_remle_null.push_back(gsl_matrix_get (Vg, i, j) );
cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) );
}
}
//obtain Y_hat from fixed effects
gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat);
//obtain H
KroneckerSym(G_full, Vg, H_full);
for (size_t i=0; isize1; i++) {
gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2);
gsl_matrix_add (&H_sub.matrix, Ve);
}
//free matrices
gsl_matrix_free (Vg);
gsl_matrix_free (Ve);
gsl_matrix_free (B);
gsl_matrix_free (se_B);
}
PRDT cPRDT;
cPRDT.CopyFromParam(cPar);
cout<<"Predicting Missing Phentypes ... "<size1, cPar.n_cvt);
//set covariates matrix W and phenotype matrix Y
//an intercept should be included in W,
cPar.CopyCvtPhen (W, Y, 0);
//Fit LM or mvLM
if (cPar.n_ph==1) {
LM cLm;
cLm.CopyFromParam(cPar);
gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
if (!cPar.file_gene.empty()) {
cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype
} else if (!cPar.file_bfile.empty()) {
cLm.AnalyzePlink (W, &Y_col.vector);
} else {
cLm.AnalyzeBimbam (W, &Y_col.vector);
}
cLm.WriteFiles();
cLm.CopyToParam(cPar);
}
/*
else {
MVLM cMvlm;
cMvlm.CopyFromParam(cPar);
if (!cPar.file_bfile.empty()) {
cMvlm.AnalyzePlink (W, Y);
} else {
cMvlm.AnalyzeBimbam (W, Y);
}
cMvlm.WriteFiles();
cMvlm.CopyToParam(cPar);
}
*/
//release all matrices and vectors
gsl_matrix_free (Y);
gsl_matrix_free (W);
}
//VC estimation with one or multiple kinship matrices
//REML approach only
//if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp
//for one phenotype only;
if (cPar.a_mode==61) {
gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc );
//set covariates matrix W and phenotype matrix Y
//an intercept should be included in W,
cPar.CopyCvtPhen (W, Y, 0);
//read kinship matrices
if (!(cPar.file_mk).empty()) {
ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1, G->size1, G->size1);
CenterMatrix (&G_sub.matrix);
d=0;
for (size_t j=0; jsize1; j++) {
d+=gsl_matrix_get (&G_sub.matrix, j, j);
}
d/=(double)G->size1;
(cPar.v_traceG).push_back(d);
}
} else if (!(cPar.file_kin).empty()) {
ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1; j++) {
d+=gsl_matrix_get (G, j, j);
}
d/=(double)G->size1;
(cPar.v_traceG).push_back(d);
}
/*
//eigen-decomposition and calculate trace_G
cout<<"Start Eigen-Decomposition..."<size; i++) {
if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
cPar.trace_G+=gsl_vector_get (eval, i);
}
cPar.trace_G/=(double)eval->size;
cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
} else {
ReadFile_eigenU (cPar.file_ku, cPar.error, U);
if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) {
if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);}
cPar.trace_G+=gsl_vector_get(eval, i);
}
cPar.trace_G/=(double)eval->size;
}
*/
//fit multiple variance components
if (cPar.n_ph==1) {
// if (cPar.n_vc==1) {
/*
//calculate UtW and Uty
CalcUtX (U, W, UtW);
CalcUtX (U, Y, UtY);
gsl_vector_view beta=gsl_matrix_row (B, 0);
gsl_vector_view se_beta=gsl_matrix_row (se_B, 0);
gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector);
cPar.beta_mle_null.clear();
cPar.se_beta_mle_null.clear();
for (size_t i=0; isize2; i++) {
cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) );
cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) );
}
CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector);
cPar.beta_remle_null.clear();
cPar.se_beta_remle_null.clear();
for (size_t i=0; isize2; i++) {
cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) );
cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) );
}
CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
cPar.PrintSummary();
//calculate and output residuals
if (cPar.a_mode==5) {
gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1);
gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1);
gsl_vector *u_hat=gsl_vector_alloc (Y->size1);
gsl_vector *e_hat=gsl_vector_alloc (Y->size1);
gsl_vector *y_hat=gsl_vector_alloc (Y->size1);
//obtain Utu and Ute
gsl_vector_memcpy (y_hat, &UtY_col.vector);
gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat);
double d, u, e;
for (size_t i=0; isize; i++) {
d=gsl_vector_get (eval, i);
u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
gsl_vector_set (Utu_hat, i, u);
gsl_vector_set (Ute_hat, i, e);
}
//obtain u and e
gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat);
gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat);
//output residuals
cPar.WriteVector(u_hat, "residU");
cPar.WriteVector(e_hat, "residE");
gsl_vector_free(u_hat);
gsl_vector_free(e_hat);
gsl_vector_free(y_hat);
}
*/
// } else {
gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
VC cVc;
cVc.CopyFromParam(cPar);
cVc.CalcVCreml (G, W, &Y_col.vector);
cVc.CopyToParam(cPar);
//obtain pve from sigma2
//obtain se_pve from se_sigma2
//}
}
}
//LMM or mvLMM or Eigen-Decomposition
if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==31) { //Fit LMM or mvLMM or eigen
gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
gsl_matrix *B=gsl_matrix_alloc (Y->size2, W->size2); //B is a d by c matrix
gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2);
gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1);
gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1);
gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2);
gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2);
gsl_vector *eval=gsl_vector_alloc (Y->size1);
//set covariates matrix W and phenotype matrix Y
//an intercept should be included in W,
cPar.CopyCvtPhen (W, Y, 0);
//read relatedness matrix G
if (!(cPar.file_kin).empty()) {
ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) {
if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
cPar.trace_G+=gsl_vector_get (eval, i);
}
cPar.trace_G/=(double)eval->size;
cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
} else {
ReadFile_eigenU (cPar.file_ku, cPar.error, U);
if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) {
if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);}
cPar.trace_G+=gsl_vector_get(eval, i);
}
cPar.trace_G/=(double)eval->size;
}
if (cPar.a_mode==31) {
cPar.WriteMatrix(U, "eigenU");
cPar.WriteVector(eval, "eigenD");
} else {
//calculate UtW and Uty
CalcUtX (U, W, UtW);
CalcUtX (U, Y, UtY);
//calculate REMLE/MLE estimate and pve for univariate model
if (cPar.n_ph==1) {
gsl_vector_view beta=gsl_matrix_row (B, 0);
gsl_vector_view se_beta=gsl_matrix_row (se_B, 0);
gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector);
cPar.beta_mle_null.clear();
cPar.se_beta_mle_null.clear();
for (size_t i=0; isize2; i++) {
cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) );
cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) );
}
CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector);
cPar.beta_remle_null.clear();
cPar.se_beta_remle_null.clear();
for (size_t i=0; isize2; i++) {
cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) );
cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) );
}
CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
cPar.PrintSummary();
//calculate and output residuals
if (cPar.a_mode==5) {
gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1);
gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1);
gsl_vector *u_hat=gsl_vector_alloc (Y->size1);
gsl_vector *e_hat=gsl_vector_alloc (Y->size1);
gsl_vector *y_hat=gsl_vector_alloc (Y->size1);
//obtain Utu and Ute
gsl_vector_memcpy (y_hat, &UtY_col.vector);
gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat);
double d, u, e;
for (size_t i=0; isize; i++) {
d=gsl_vector_get (eval, i);
u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
gsl_vector_set (Utu_hat, i, u);
gsl_vector_set (Ute_hat, i, e);
}
//obtain u and e
gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat);
gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat);
//output residuals
cPar.WriteVector(u_hat, "residU");
cPar.WriteVector(e_hat, "residE");
gsl_vector_free(u_hat);
gsl_vector_free(e_hat);
gsl_vector_free(y_hat);
}
}
//Fit LMM or mvLMM
if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) {
if (cPar.n_ph==1) {
LMM cLmm;
cLmm.CopyFromParam(cPar);
gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
if (!cPar.file_gene.empty()) {
cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype
} else if (!cPar.file_bfile.empty()) {
cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
} else {
cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
}
cLmm.WriteFiles();
cLmm.CopyToParam(cPar);
} else {
MVLMM cMvlmm;
cMvlmm.CopyFromParam(cPar);
if (!cPar.file_bfile.empty()) {
cMvlmm.AnalyzePlink (U, eval, UtW, UtY);
} else {
cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY);
}
cMvlmm.WriteFiles();
cMvlmm.CopyToParam(cPar);
}
}
}
//release all matrices and vectors
gsl_matrix_free (Y);
gsl_matrix_free (W);
gsl_matrix_free(B);
gsl_matrix_free(se_B);
gsl_matrix_free (G);
gsl_matrix_free (U);
gsl_matrix_free (UtW);
gsl_matrix_free (UtY);
gsl_vector_free (eval);
}
//BSLMM
if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
gsl_vector *y=gsl_vector_alloc (cPar.ni_test);
gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt);
gsl_matrix *G=gsl_matrix_alloc (y->size, y->size);
gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test);
//set covariates matrix W and phenotype vector y
//an intercept should be included in W,
cPar.CopyCvtPhen (W, y, 0);
//center y, even for case/control data
cPar.pheno_mean=CenterVector(y);
//run bslmm if rho==1
if (cPar.rho_min==1 && cPar.rho_max==1) {
//read genotypes X (not UtX)
cPar.ReadGenotypes (UtX, G, false);
//perform BSLMM analysis
BSLMM cBslmm;
cBslmm.CopyFromParam(cPar);
time_start=clock();
cBslmm.MCMC(UtX, y);
cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
cBslmm.CopyToParam(cPar);
//else, if rho!=1
} else {
gsl_matrix *U=gsl_matrix_alloc (y->size, y->size);
gsl_vector *eval=gsl_vector_alloc (y->size);
gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2);
gsl_vector *Uty=gsl_vector_alloc (y->size);
//read relatedness matrix G
if (!(cPar.file_kin).empty()) {
cPar.ReadGenotypes (UtX, G, false);
//read relatedness matrix G
ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) {
if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
cPar.trace_G+=gsl_vector_get (eval, i);
}
cPar.trace_G/=(double)eval->size;
cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
//calculate UtW and Uty
CalcUtX (U, W, UtW);
CalcUtX (U, y, Uty);
//calculate REMLE/MLE estimate and pve
CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
cPar.PrintSummary();
//Creat and calcualte UtX, use a large memory
cout<<"Calculating UtX..."<tm_month<<":"<tm_day":"<tm_hour<<":"<tm_min<=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) {
outfile<<"## time on optimization = "<