From 7762722f264adc402ea3b0f21923b18f072253ba Mon Sep 17 00:00:00 2001 From: xiangzhou Date: Mon, 22 Sep 2014 11:06:02 -0400 Subject: version 0.95alpha --- bin/gemma | Bin 0 -> 4823751 bytes src/bslmm.cpp | 1928 ++++++++++++++++++++++++++++ src/bslmm.h | 146 +++ src/gemma.cpp | 1864 +++++++++++++++++++++++++++ src/gemma.h | 52 + src/gzstream.cpp | 165 +++ src/gzstream.h | 121 ++ src/io.cpp | 1396 ++++++++++++++++++++ src/io.h | 79 ++ src/lapack.cpp | 609 +++++++++ src/lapack.h | 53 + src/lm.cpp | 572 +++++++++ src/lm.h | 75 ++ src/lmm.cpp | 1771 ++++++++++++++++++++++++++ src/lmm.h | 111 ++ src/main.cpp | 86 ++ src/mathfunc.cpp | 310 +++++ src/mathfunc.h | 41 + src/mvlmm.cpp | 3749 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/mvlmm.h | 94 ++ src/param.cpp | 849 +++++++++++++ src/param.h | 232 ++++ src/prdt.cpp | 544 ++++++++ src/prdt.h | 81 ++ src/vc.cpp | 443 +++++++ src/vc.h | 82 ++ 26 files changed, 15453 insertions(+) create mode 100755 bin/gemma create mode 100644 src/bslmm.cpp create mode 100644 src/bslmm.h create mode 100644 src/gemma.cpp create mode 100644 src/gemma.h create mode 100644 src/gzstream.cpp create mode 100644 src/gzstream.h create mode 100644 src/io.cpp create mode 100644 src/io.h create mode 100644 src/lapack.cpp create mode 100644 src/lapack.h create mode 100644 src/lm.cpp create mode 100644 src/lm.h create mode 100644 src/lmm.cpp create mode 100644 src/lmm.h create mode 100644 src/main.cpp create mode 100644 src/mathfunc.cpp create mode 100644 src/mathfunc.h create mode 100644 src/mvlmm.cpp create mode 100644 src/mvlmm.h create mode 100644 src/param.cpp create mode 100644 src/param.h create mode 100644 src/prdt.cpp create mode 100644 src/prdt.h create mode 100644 src/vc.cpp create mode 100644 src/vc.h diff --git a/bin/gemma b/bin/gemma new file mode 100755 index 0000000..6734240 Binary files /dev/null and b/bin/gemma differ diff --git a/src/bslmm.cpp b/src/bslmm.cpp new file mode 100644 index 0000000..55a05ca --- /dev/null +++ b/src/bslmm.cpp @@ -0,0 +1,1928 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_randist.h" +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" + + + + +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "bslmm_float.h" +#include "lmm_float.h" //for class FUNC_PARAM and MatrixCalcLR +#include "lm_float.h" +#include "mathfunc_float.h" //for function CenterVector +#else +#include "param.h" +#include "bslmm.h" +#include "lmm.h" +#include "lm.h" +#include "mathfunc.h" +#endif + +using namespace std; + + + + +void BSLMM::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + + l_min=cPar.h_min; + l_max=cPar.h_max; + n_region=cPar.n_region; + pve_null=cPar.pve_null; + pheno_mean=cPar.pheno_mean; + + time_UtZ=0.0; + time_Omega=0.0; + n_accept=0; + + h_min=cPar.h_min; + h_max=cPar.h_max; + h_scale=cPar.h_scale; + rho_min=cPar.rho_min; + rho_max=cPar.rho_max; + rho_scale=cPar.rho_scale; + logp_min=cPar.logp_min; + logp_max=cPar.logp_max; + logp_scale=cPar.logp_scale; + + s_min=cPar.s_min; + s_max=cPar.s_max; + w_step=cPar.w_step; + s_step=cPar.s_step; + r_pace=cPar.r_pace; + w_pace=cPar.w_pace; + n_mh=cPar.n_mh; + geo_mean=cPar.geo_mean; + randseed=cPar.randseed; + trace_G=cPar.trace_G; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void BSLMM::CopyToParam (PARAM &cPar) +{ + cPar.time_UtZ=time_UtZ; + cPar.time_Omega=time_Omega; + cPar.time_Proposal=time_Proposal; + cPar.cHyp_initial=cHyp_initial; + cPar.n_accept=n_accept; + cPar.pheno_mean=pheno_mean; + cPar.randseed=randseed; + + return; +} + + + +void BSLMM::WriteBV (const gsl_vector *bv) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".bv.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "< > &beta_g, const gsl_vector *alpha, const size_t w) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".param.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "< &rank) +{ + size_t pos; + for (size_t i=0; isize2, UtXgamma->size2); + gsl_vector *Xty=gsl_vector_alloc (UtXgamma->size2); + gsl_vector *OiXty=gsl_vector_alloc (UtXgamma->size2); + + gsl_matrix_set_identity (Omega); + gsl_matrix_scale (Omega, 1.0/sigma_a2); + +#ifdef WITH_LAPACK + lapack_dgemm ((char *)"T", (char *)"N", 1.0, UtXgamma, UtXgamma, 1.0, Omega); +#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, UtXgamma, UtXgamma, 1.0, Omega); +#endif + gsl_blas_dgemv (CblasTrans, 1.0, UtXgamma, Uty, 0.0, Xty); + + CholeskySolve(Omega, Xty, OiXty); + + gsl_blas_ddot (Xty, OiXty, &pve); + gsl_blas_ddot (Uty, Uty, &var_y); + + pve/=var_y; + + gsl_matrix_free (Omega); + gsl_vector_free (Xty); + gsl_vector_free (OiXty); + + return pve; +} + + +void BSLMM::InitialMCMC (const gsl_matrix *UtX, const gsl_vector *Uty, vector &rank, class HYPBSLMM &cHyp, vector > &pos_loglr) +{ + double q_genome=gsl_cdf_chisq_Qinv(0.05/(double)ns_test, 1); + + cHyp.n_gamma=0; + for (size_t i=0; iq_genome) {cHyp.n_gamma++;} + } + if (cHyp.n_gamma<10) {cHyp.n_gamma=10;} + + if (cHyp.n_gamma>s_max) {cHyp.n_gamma=s_max;} + if (cHyp.n_gamma1.0) {cHyp.rho=1.0;} + + if (cHyp.hh_max) {cHyp.h=h_max;} + if (cHyp.rhorho_max) {cHyp.rho=rho_max;} + if (cHyp.logplogp_max) {cHyp.logp=logp_max;} + + +// if (fix_sigma>=0) { +// fix_sigma=cHyp.h; +// rho_max=1-cHyp.h; +// cHyp.rho=rho_max/2.0; +// } + + //Initial for grid sampling: +// cHyp.h=0.225; +// cHyp.rho=1.0; +// cHyp.logp=-4.835429; + + cout<<"initial value of h = "<size; i++) + { + d=gsl_ran_gaussian(gsl_r, 1); + gsl_vector_set(beta, i, d); + } + gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, beta); + + + //it compuates inv(L^T(Omega)) %*% beta; + gsl_vector_scale(beta, sqrt(sigma_a2/tau)); + gsl_vector_add(beta, beta_hat); + gsl_blas_dgemv (CblasNoTrans, 1.0, UtXgamma, beta, 0.0, UtXb); + + //sample alpha + gsl_vector_memcpy (alpha_prime, Uty); + gsl_vector_sub (alpha_prime, UtXb); + gsl_vector_mul (alpha_prime, weight_Hi); + gsl_vector_scale (alpha_prime, sigma_b2); + + //sample u + gsl_vector_memcpy (Utu, alpha_prime); + gsl_vector_mul (Utu, K_eval); + + if (a_mode==11) {gsl_vector_scale (Utu_rand, sqrt(1.0/tau));} + gsl_vector_add (Utu, Utu_rand); + + + //for quantitative traits, calculate pve and pge + if (a_mode==11) { + gsl_blas_ddot (UtXb, UtXb, &d); + cHyp.pge=d/(double)ni_test; + + gsl_blas_ddot (Utu, Utu, &d); + cHyp.pve=cHyp.pge+d/(double)ni_test; + + if (cHyp.pve==0) {cHyp.pge=0.0;} + else {cHyp.pge/=cHyp.pve;} + cHyp.pve/=cHyp.pve+1.0/tau; + } + + + gsl_matrix_free (UtXgamma_eval); + gsl_matrix_free (Omega); + gsl_vector_free (XtHiy); + gsl_vector_free (beta_hat); + gsl_vector_free (Utu_rand); + gsl_vector_free (weight_Hi); + + logpost=-0.5*logdet_H-0.5*logdet_O; + if (a_mode==11) {logpost-=0.5*(double)ni_test*log(P_yy);} + else {logpost-=0.5*P_yy;} +// else {logpost+=-0.5*P_yy*tau+0.5*(double)ni_test*log(tau);} + logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1.0-exp(cHyp.logp)); + + return logpost; +} + + + +//calculate pve and pge, and calculate z_hat for case-control data +void BSLMM::CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp) +{ + double d; + + gsl_blas_ddot (Utu, Utu, &d); + cHyp.pve=d/(double)ni_test; + + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, z_hat); + + cHyp.pve/=cHyp.pve+1.0; + cHyp.pge=0.0; + + return; +} + + +//calculate pve and pge, and calculate z_hat for case-control data +void BSLMM::CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *UtXb, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp) +{ + double d; + gsl_vector *UtXbU=gsl_vector_alloc (Utu->size); + + gsl_blas_ddot (UtXb, UtXb, &d); + cHyp.pge=d/(double)ni_test; + + gsl_blas_ddot (Utu, Utu, &d); + cHyp.pve=cHyp.pge+d/(double)ni_test; + + gsl_vector_memcpy (UtXbU, Utu); + gsl_vector_add (UtXbU, UtXb); + gsl_blas_dgemv (CblasNoTrans, 1.0, U, UtXbU, 0.0, z_hat); + + if (cHyp.pve==0) {cHyp.pge=0.0;} + else {cHyp.pge/=cHyp.pve;} + + cHyp.pve/=cHyp.pve+1.0; + + gsl_vector_free(UtXbU); + return; +} + + + + +void BSLMM::SampleZ (const gsl_vector *y, const gsl_vector *z_hat, gsl_vector *z) +{ + double d1, d2, z_rand=0.0; + for (size_t i=0; isize; ++i) { + d1=gsl_vector_get (y, i); + d2=gsl_vector_get (z_hat, i); + //y is centerred for case control studies + if (d1<=0.0) { + //control, right truncated + do { + z_rand=d2+gsl_ran_gaussian(gsl_r, 1.0); + } while (z_rand>0.0); + } + else { + do { + z_rand=d2+gsl_ran_gaussian(gsl_r, 1.0); + } while (z_rand<0.0); + } + + gsl_vector_set (z, i, z_rand); + } + + return; +} + + + + + +double BSLMM::ProposeHnRho (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat) +{ + + double h=cHyp_old.h, rho=cHyp_old.rho; + + double d_h=(h_max-h_min)*h_scale, d_rho=(rho_max-rho_min)*rho_scale; + + for (size_t i=0; ih_max) {h=2*h_max-h;} + + rho=rho+(gsl_rng_uniform(gsl_r)-0.5)*d_rho; + if (rhorho_max) {rho=2*rho_max-rho;} + } + /* + //Grid Sampling + for (size_t i=0; ih_max) {h=h_min;} + } + + for (size_t i=0; irho_max) {rho=rho_min;} + } + */ + cHyp_new.h=h; + cHyp_new.rho=rho; + return 0.0; +} + + +double BSLMM::ProposePi (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat) +{ + double logp_old=cHyp_old.logp, logp_new=cHyp_old.logp; + double log_ratio=0.0; + + double d_logp=min(0.1, (logp_max-logp_min)*logp_scale); + + for (size_t i=0; ilogp_max) {logp_new=2*logp_max-logp_new;} + + log_ratio+=logp_new-logp_old; + logp_old=logp_new; + } + /* + //Grid Sampling + for (size_t i=0; ilogp_max) {logp_new=logp_min;} + + log_ratio+=logp_new-logp_old; + logp_old=logp_new; + } + */ + cHyp_new.logp=logp_new; + + return log_ratio; +} + +bool comp_vec (size_t a, size_t b) +{ + return (a < b); +} + + +double BSLMM::ProposeGamma (const vector &rank_old, vector &rank_new, const double *p_gamma, const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat) +{ + map mapRank2in; + size_t r; + double unif, logp=0.0; + int flag_gamma; + size_t r_add, r_remove, col_id; + + rank_new.clear(); + if (cHyp_old.n_gamma!=rank_old.size()) {cout<<"size wrong"<=0.40 && unif < 0.80 && cHyp_new.n_gamma>s_min) {flag_gamma=2;} + else if (unif>=0.80 && cHyp_new.n_gamma>0 && cHyp_new.n_gamma a, pair b) +{ + return (a.second > b.second); +} + + + + + + + +//if a_mode==13 then Uty==y +void BSLMM::MCMC (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y) { + clock_t time_start; + + class HYPBSLMM cHyp_old, cHyp_new; + + gsl_matrix *Result_hyp=gsl_matrix_alloc (w_pace, 6); + gsl_matrix *Result_gamma=gsl_matrix_alloc (w_pace, s_max); + + gsl_vector *alpha_prime=gsl_vector_alloc (ni_test); + gsl_vector *alpha_new=gsl_vector_alloc (ni_test); + gsl_vector *alpha_old=gsl_vector_alloc (ni_test); + gsl_vector *Utu=gsl_vector_alloc (ni_test); + gsl_vector *Utu_new=gsl_vector_alloc (ni_test); + gsl_vector *Utu_old=gsl_vector_alloc (ni_test); + + gsl_vector *UtXb_new=gsl_vector_alloc (ni_test); + gsl_vector *UtXb_old=gsl_vector_alloc (ni_test); + + gsl_vector *z_hat=gsl_vector_alloc (ni_test); + gsl_vector *z=gsl_vector_alloc (ni_test); + gsl_vector *Utz=gsl_vector_alloc (ni_test); + + gsl_vector_memcpy (Utz, Uty); + + double logPost_new, logPost_old; + double logMHratio; + double mean_z=0.0; + + gsl_matrix_set_zero (Result_gamma); + gsl_vector_set_zero (Utu); + gsl_vector_set_zero (alpha_prime); + if (a_mode==13) { + pheno_mean=0.0; + } + + vector > beta_g; + for (size_t i=0; i rank_new, rank_old; + vector beta_new, beta_old; + + vector > pos_loglr; + + time_start=clock(); + MatrixCalcLR (U, UtX, Utz, K_eval, l_min, l_max, n_region, pos_loglr); + time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + stable_sort (pos_loglr.begin(), pos_loglr.end(), comp_lr); + for (size_t i=0; itm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + double *p_gamma = new double[ns_test]; + CalcPgamma (p_gamma); + + gsl_t=gsl_ran_discrete_preproc (ns_test, p_gamma); + + //initial parameters + InitialMCMC (UtX, Utz, rank_old, cHyp_old, pos_loglr); +// if (fix_sigma>=0) { +// rho_max=1-fix_sigma; +// cHyp_old.h=fix_sigma/(1-cHyp_old.rho); +// } + + cHyp_initial=cHyp_old; + + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old); + + beta_old.clear(); + for (size_t i=0; isize; ++i) { + beta_old.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + + //calculate centered z_hat, and pve + if (a_mode==13) { + time_start=clock(); + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + CalcCC_PVEnZ (U, Utu_old, z_hat, cHyp_old); + } + else { + CalcCC_PVEnZ (U, UtXb_old, Utu_old, z_hat, cHyp_old); + } + time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + } + + //start MCMC + int accept; + size_t total_step=w_step+s_step; + size_t w=0, w_col, pos; + size_t repeat=0; + + for (size_t t=0; t10) {break;} + + if (a_mode==13) { + SampleZ (y, z_hat, z); + mean_z=CenterVector (z); + + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, z, 0.0, Utz); + time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //First proposal + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old); + beta_old.clear(); + for (size_t i=0; isize; ++i) { + beta_old.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + } + + //MH steps + for (size_t i=0; i=0) { +// cHyp_new.h=fix_sigma/(1-cHyp_new.rho); +// } + + if (cHyp_new.n_gamma==0 || cHyp_new.rho==0) { + logPost_new=CalcPosterior(Utz, K_eval, Utu_new, alpha_new, cHyp_new); + beta_new.clear(); + for (size_t i=0; isize; ++i) { + beta_new.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + + logMHratio+=logPost_new-logPost_old; + + if (logMHratio>0 || log(gsl_rng_uniform(gsl_r))size2); + gsl_vector *H_eval=gsl_vector_alloc (Uty->size); + gsl_vector *bv=gsl_vector_alloc (Uty->size); + + gsl_vector_memcpy (H_eval, eval); + gsl_vector_scale (H_eval, lambda); + gsl_vector_add_constant (H_eval, 1.0); + + gsl_vector_memcpy (bv, Uty); + gsl_vector_div (bv, H_eval); + + gsl_blas_dgemv (CblasTrans, lambda/(double)UtX->size2, UtX, bv, 0.0, beta); + gsl_vector_add_constant (H_eval, -1.0); + gsl_vector_mul (H_eval, bv); + gsl_blas_dgemv (CblasNoTrans, 1.0, U, H_eval, 0.0, bv); + + WriteParam (beta); + WriteBV(bv); + + gsl_vector_free (H_eval); + gsl_vector_free (beta); + gsl_vector_free (bv); + + return; +} + + + + + + + + + + + + + + + + + +//below fits MCMC for rho=1 +void BSLMM::CalcXtX (const gsl_matrix *X, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX, gsl_vector *Xty) +{ + time_t time_start=clock(); + gsl_matrix_const_view X_sub=gsl_matrix_const_submatrix(X, 0, 0, X->size1, s_size); + gsl_matrix_view XtX_sub=gsl_matrix_submatrix(XtX, 0, 0, s_size, s_size); + gsl_vector_view Xty_sub=gsl_vector_subvector(Xty, 0, s_size); + +#ifdef WITH_LAPACK + lapack_dgemm ((char *)"T", (char *)"N", 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#endif + gsl_blas_dgemv(CblasTrans, 1.0, &X_sub.matrix, y, 0.0, &Xty_sub.vector); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + return; +} + + +void BSLMM::SetXgamma (const gsl_matrix *X, const gsl_matrix *X_old, const gsl_matrix *XtX_old, const gsl_vector *Xty_old, const gsl_vector *y, const vector &rank_old, const vector &rank_new, gsl_matrix *X_new, gsl_matrix *XtX_new, gsl_vector *Xty_new) +{ + double d; + + //rank_old and rank_new are sorted already inside PorposeGamma + //calculate vectors rank_remove and rank_add + // size_t v_size=max(rank_old.size(), rank_new.size()); + //make sure that v_size is larger than repeat + size_t v_size=20; + vector rank_remove(v_size), rank_add(v_size), rank_union(s_max+v_size); + vector::iterator it; + + it=set_difference (rank_old.begin(), rank_old.end(), rank_new.begin(), rank_new.end(), rank_remove.begin()); + rank_remove.resize(it-rank_remove.begin()); + + it=set_difference (rank_new.begin(), rank_new.end(), rank_old.begin(), rank_old.end(), rank_add.begin()); + rank_add.resize(it-rank_add.begin()); + + it=set_union (rank_new.begin(), rank_new.end(), rank_old.begin(), rank_old.end(), rank_union.begin()); + rank_union.resize(it-rank_union.begin()); + + //map rank_remove and rank_add + map mapRank2in_remove, mapRank2in_add; + for (size_t i=0; isize1, rank_old.size()); + gsl_matrix_const_view XtXold_sub=gsl_matrix_const_submatrix(XtX_old, 0, 0, rank_old.size(), rank_old.size()); + gsl_vector_const_view Xtyold_sub=gsl_vector_const_subvector(Xty_old, 0, rank_old.size()); + + gsl_matrix_view Xnew_sub=gsl_matrix_submatrix(X_new, 0, 0, X_new->size1, rank_new.size()); + gsl_matrix_view XtXnew_sub=gsl_matrix_submatrix(XtX_new, 0, 0, rank_new.size(), rank_new.size()); + gsl_vector_view Xtynew_sub=gsl_vector_subvector(Xty_new, 0, rank_new.size()); + + //get X_new and calculate XtX_new + /* + if (rank_remove.size()==0 && rank_add.size()==0) { + gsl_matrix_memcpy(&Xnew_sub.matrix, &Xold_sub.matrix); + gsl_matrix_memcpy(&XtXnew_sub.matrix, &XtXold_sub.matrix); + gsl_vector_memcpy(&Xtynew_sub.vector, &Xtyold_sub.vector); + } else { + gsl_matrix *X_temp=gsl_matrix_alloc(X_old->size1, rank_old.size()-rank_remove.size() ); + gsl_matrix *XtX_temp=gsl_matrix_alloc(X_temp->size2, X_temp->size2); + gsl_vector *Xty_temp=gsl_vector_alloc(X_temp->size2); + + if (rank_remove.size()==0) { + gsl_matrix_memcpy (X_temp, &Xold_sub.matrix); + gsl_matrix_memcpy (XtX_temp, &XtXold_sub.matrix); + gsl_vector_memcpy (Xty_temp, &Xtyold_sub.vector); + } else { + size_t i_temp=0, j_temp; + for (size_t i=0; isize1, rank_add.size() ); + gsl_matrix *XtX_aa=gsl_matrix_alloc(X_add->size2, X_add->size2); + gsl_matrix *XtX_at=gsl_matrix_alloc(X_add->size2, X_temp->size2); + gsl_vector *Xty_add=gsl_vector_alloc(X_add->size2); + + //get X_add + SetXgamma (X_add, X, rank_add); + + //get t(X_add)X_add and t(X_add)X_temp + clock_t time_start=clock(); + + //somehow the lapack_dgemm does not work here + //#ifdef WITH_LAPACK + //lapack_dgemm ((char *)"T", (char *)"N", 1.0, X_add, X_add, 0.0, XtX_aa); + //lapack_dgemm ((char *)"T", (char *)"N", 1.0, X_add, X_temp, 0.0, XtX_at); + + //#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_add, 0.0, XtX_aa); + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_temp, 0.0, XtX_at); + //#endif + gsl_blas_dgemv(CblasTrans, 1.0, X_add, y, 0.0, Xty_add); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //save to X_new, XtX_new and Xty_new + size_t i_temp=0, j_temp, i_flag=0, j_flag=0; + for (size_t i=0; isize1, rank_add.size() ); + gsl_matrix *XtX_aa=gsl_matrix_alloc(X_add->size2, X_add->size2); + gsl_matrix *XtX_ao=gsl_matrix_alloc(X_add->size2, X_old->size2); + gsl_vector *Xty_add=gsl_vector_alloc(X_add->size2); + + //get X_add + SetXgamma (X_add, X, rank_add); + + //get t(X_add)X_add and t(X_add)X_temp + clock_t time_start=clock(); + + //somehow the lapack_dgemm does not work here + //#ifdef WITH_LAPACK + //lapack_dgemm ((char *)"T", (char *)"N", 1.0, X_add, X_add, 0.0, XtX_aa); + //lapack_dgemm ((char *)"T", (char *)"N", 1.0, X_add, X_temp, 0.0, XtX_at); + + //#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_add, 0.0, XtX_aa); + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_old, 0.0, XtX_ao); + //#endif + gsl_blas_dgemv(CblasTrans, 1.0, X_add, y, 0.0, Xty_add); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //save to X_new, XtX_new and Xty_new + i_old=0; i_new=0; i_add=0; + for (size_t i=0; isize1, s_size); + gsl_matrix_const_view XtX_sub=gsl_matrix_const_submatrix (XtX, 0, 0, s_size, s_size); + gsl_vector_const_view Xty_sub=gsl_vector_const_subvector (Xty, 0, s_size); + + gsl_matrix *Omega=gsl_matrix_alloc (s_size, s_size); + gsl_matrix *M_temp=gsl_matrix_alloc (s_size, s_size); + gsl_vector *beta_hat=gsl_vector_alloc (s_size); + gsl_vector *Xty_temp=gsl_vector_alloc (s_size); + + gsl_vector_memcpy (Xty_temp, &Xty_sub.vector); + + //calculate Omega + gsl_matrix_memcpy (Omega, &XtX_sub.matrix); + gsl_matrix_scale (Omega, sigma_a2); + gsl_matrix_set_identity (M_temp); + gsl_matrix_add (Omega, M_temp); + + //calculate beta_hat + logdet_O=CholeskySolve(Omega, Xty_temp, beta_hat); + gsl_vector_scale (beta_hat, sigma_a2); + + gsl_blas_ddot (Xty_temp, beta_hat, &d); + P_yy-=d; + + //sample tau + double tau=1.0; + if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0, 2.0/P_yy); } + + //sample beta + for (size_t i=0; i > beta_g; + for (size_t i=0; i rank_new, rank_old; + vector > pos_loglr; + + time_start=clock(); + MatrixCalcLmLR (X, z, pos_loglr); + time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + stable_sort (pos_loglr.begin(), pos_loglr.end(), comp_lr); + for (size_t i=0; itm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + double *p_gamma = new double[ns_test]; + CalcPgamma (p_gamma); + + gsl_t=gsl_ran_discrete_preproc (ns_test, p_gamma); + + //initial parameters + InitialMCMC (X, z, rank_old, cHyp_old, pos_loglr); + + cHyp_initial=cHyp_old; + + if (cHyp_old.n_gamma==0) { + logPost_old=CalcPosterior (ztz, cHyp_old); + } + else { + SetXgamma (Xgamma_old, X, rank_old); + CalcXtX (Xgamma_old, z, rank_old.size(), XtX_old, Xtz_old); + logPost_old=CalcPosterior (Xgamma_old, XtX_old, Xtz_old, ztz, rank_old.size(), Xb_old, beta_old, cHyp_old); + } + + //calculate centered z_hat, and pve + if (a_mode==13) { + if (cHyp_old.n_gamma==0) { + CalcCC_PVEnZ (z_hat, cHyp_old); + } + else { + CalcCC_PVEnZ (Xb_old, z_hat, cHyp_old); + } + } + + //start MCMC + int accept; + size_t total_step=w_step+s_step; + size_t w=0, w_col, pos; + size_t repeat=0; + + for (size_t t=0; t10) {break;} + if (a_mode==13) { + SampleZ (y, z_hat, z); + mean_z=CenterVector (z); + gsl_blas_ddot(z,z,&ztz); + + //First proposal + if (cHyp_old.n_gamma==0) { + logPost_old=CalcPosterior (ztz, cHyp_old); + } else { + gsl_matrix_view Xold_sub=gsl_matrix_submatrix(Xgamma_old, 0, 0, ni_test, rank_old.size()); + gsl_vector_view Xtz_sub=gsl_vector_subvector(Xtz_old, 0, rank_old.size()); + gsl_blas_dgemv (CblasTrans, 1.0, &Xold_sub.matrix, z, 0.0, &Xtz_sub.vector); + logPost_old=CalcPosterior (Xgamma_old, XtX_old, Xtz_old, ztz, rank_old.size(), Xb_old, beta_old, cHyp_old); + } + } + + //MH steps + for (size_t i=0; i0 || log(gsl_rng_uniform(gsl_r)). + */ + + +#ifndef __BSLMM_H__ +#define __BSLMM_H__ + +#include +#include +#include +#include + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + + +using namespace std; + + + + + + +class BSLMM { + +public: + // IO related parameters + int a_mode; + size_t d_pace; + + string file_bfile; + string file_geno; + string file_out; + string path_out; + + // LMM related parameters + double l_min; + double l_max; + size_t n_region; + double pve_null; + double pheno_mean; + + // BSLMM MCMC related parameters + double h_min, h_max, h_scale; //priors for h + double rho_min, rho_max, rho_scale; //priors for rho + double logp_min, logp_max, logp_scale; //priors for log(pi) + size_t s_min, s_max; //minimum and maximum number of gammas + size_t w_step; //number of warm up/burn in iterations + size_t s_step; //number of sampling iterations + size_t r_pace; //record pace + size_t w_pace; //write pace + size_t n_accept; //number of acceptance + size_t n_mh; //number of MH steps within each iteration + double geo_mean; //mean of the geometric distribution + long int randseed; + double trace_G; + + HYPBSLMM cHyp_initial; + + // Summary statistics + size_t ni_total, ns_total; //number of total individuals and snps + size_t ni_test, ns_test; //number of individuals and snps used for analysis + size_t n_cvt; //number of covariates + double time_UtZ; + double time_Omega; //time spent on optimization iterations + double time_Proposal; //time spent on constructing the proposal distribution for gamma (i.e. lmm or lm analysis) + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + gsl_rng *gsl_r; + gsl_ran_discrete_t *gsl_t; + map mapRank2pos; + + // Main Functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + + void RidgeR(const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *eval, const double lambda); + + void MCMC (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y); + void WriteLog (); + void WriteLR (); + void WriteBV (const gsl_vector *bv); + void WriteParam (vector > &beta_g, const gsl_vector *alpha, const size_t w); + void WriteParam (const gsl_vector *alpha); + void WriteResult (const int flag, const gsl_matrix *Result_hyp, const gsl_matrix *Result_gamma, const size_t w_col); + + //Subfunctions inside MCMC + void CalcPgamma (double *p_gammar); + + double CalcPveLM (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const double sigma_a2); + void InitialMCMC (const gsl_matrix *UtX, const gsl_vector *Uty, vector &rank_old, class HYPBSLMM &cHyp, vector > &pos_loglr); + double CalcPosterior (const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *Utu, gsl_vector *alpha_prime, class HYPBSLMM &cHyp); + double CalcPosterior (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *UtXb, gsl_vector *Utu, gsl_vector *alpha_prime, gsl_vector *beta, class HYPBSLMM &cHyp); + void CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp); + void CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *UtXb, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp); + double CalcREMLE (const gsl_matrix *Utw, const gsl_vector *Uty, const gsl_vector *K_eval); + double CalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, vector > &loglr_sort); //calculate the maximum marginal likelihood ratio for each analyzed SNPs with gemma, use it to rank SNPs + void SampleZ (const gsl_vector *y, const gsl_vector *z_hat, gsl_vector *z); + double ProposeHnRho (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat); + double ProposePi (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat); + double ProposeGamma (const vector &rank_old, vector &rank_new, const double *p_gamma, const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat); + void SetXgamma (gsl_matrix *Xgamma, const gsl_matrix *X, vector &rank); + + void CalcXtX (const gsl_matrix *X_new, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX_new, gsl_vector *Xty_new); + void SetXgamma (const gsl_matrix *X, const gsl_matrix *X_old, const gsl_matrix *XtX_old, const gsl_vector *Xty_old, const gsl_vector *y, const vector &rank_old, const vector &rank_new, gsl_matrix *X_new, gsl_matrix *XtX_new, gsl_vector *Xty_new); + double CalcPosterior (const double yty, class HYPBSLMM &cHyp); + double CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp); + void CalcCC_PVEnZ (gsl_vector *z_hat, class HYPBSLMM &cHyp); + void CalcCC_PVEnZ (const gsl_vector *Xb, gsl_vector *z_hat, class HYPBSLMM &cHyp); + void MCMC (const gsl_matrix *X, const gsl_vector *y); + + //utility functions +// double vec_sum (gsl_vector *v); +// void vec_center (gsl_vector *v); +// double calc_var (gsl_vector *v); +// void calc_sigma (MCMC &cMcmc); +// bool comp_lr (pair a, pair b); +}; + + + +#endif + + diff --git a/src/gemma.cpp b/src/gemma.cpp new file mode 100644 index 0000000..b8693a8 --- /dev/null +++ b/src/gemma.cpp @@ -0,0 +1,1864 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_cdf.h" + +#include "lapack.h" //for functions EigenDecomp + +#ifdef FORCE_FLOAT +#include "io_float.h" //for function ReadFile_kin +#include "gemma_float.h" +#include "vc_float.h" +#include "lm_float.h" //for LM class +#include "bslmm_float.h" //for BSLMM class +#include "lmm_float.h" //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe +#include "mvlmm_float.h" //for MVLMM class +#include "prdt_float.h" //for PRDT class +#include "mathfunc_float.h" //for a few functions +#else +#include "io.h" +#include "gemma.h" +#include "vc.h" +#include "lm.h" +#include "bslmm.h" +#include "lmm.h" +#include "mvlmm.h" +#include "prdt.h" +#include "mathfunc.h" +#endif + + +using namespace std; + + + +GEMMA::GEMMA(void): +version("0.95alpha"), date("08/08/2014"), year("2011") +{} + +void GEMMA::PrintHeader (void) +{ + cout< indicator_all; + size_t c_bv=0; + for (size_t i=0; isize; i++) { + d=gsl_vector_get(y_prdt, i); + d=gsl_cdf_gaussian_P(d, 1.0); + gsl_vector_set(y_prdt, i, d); + } + } + + + cPRDT.CopyToParam(cPar); + + cPRDT.WriteFiles(y_prdt); + + gsl_vector_free(y_prdt); + } + + + //Prediction with kinship matrix only; for one or more phenotypes + if (cPar.a_mode==43) { + //first, use individuals with full phenotypes to obtain estimates of Vg and Ve + gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); + gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); + gsl_vector *eval=gsl_vector_alloc (Y->size1); + + gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph); + gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt); + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, Y, 0); + cPar.CopyCvtPhen (W_full, Y_full, 1); + + gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph); + gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1); + gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2); + + //read relatedness matrix G, and matrix G_full + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, Y, UtY); + + //calculate variance component and beta estimates + //and then obtain predicted values + if (cPar.n_ph==1) { + gsl_vector *beta=gsl_vector_alloc (W->size2); + gsl_vector *se_beta=gsl_vector_alloc (W->size2); + + double lambda, logl, vg, ve; + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + //obtain estimates + CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, lambda, logl); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, lambda, vg, ve, beta, se_beta); + + cout<<"REMLE estimate for vg in the null model = "<size1; i++) { + for (size_t j=0; j<=i; j++) { + cout<size1; i++) { + for (size_t j=0; j<=i; j++) { + cout<size1; i++) { + for (size_t j=i; jsize2; j++) { + cPar.Vg_remle_null.push_back(gsl_matrix_get (Vg, i, j) ); + cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) ); + } + } + + //obtain Y_hat from fixed effects + gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat); + + //obtain H + KroneckerSym(G_full, Vg, H_full); + for (size_t i=0; isize1; i++) { + gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2); + gsl_matrix_add (&H_sub.matrix, Ve); + } + + //free matrices + gsl_matrix_free (Vg); + gsl_matrix_free (Ve); + gsl_matrix_free (B); + gsl_matrix_free (se_B); + } + + PRDT cPRDT; + + cPRDT.CopyFromParam(cPar); + + cout<<"Predicting Missing Phentypes ... "<size1, cPar.n_cvt); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, Y, 0); + + //Fit LM or mvLM + if (cPar.n_ph==1) { + LM cLm; + cLm.CopyFromParam(cPar); + + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); + + if (!cPar.file_gene.empty()) { + cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype + } else if (!cPar.file_bfile.empty()) { + cLm.AnalyzePlink (W, &Y_col.vector); + } else { + cLm.AnalyzeBimbam (W, &Y_col.vector); + } + + cLm.WriteFiles(); + cLm.CopyToParam(cPar); + } + /* + else { + MVLM cMvlm; + cMvlm.CopyFromParam(cPar); + + if (!cPar.file_bfile.empty()) { + cMvlm.AnalyzePlink (W, Y); + } else { + cMvlm.AnalyzeBimbam (W, Y); + } + + cMvlm.WriteFiles(); + cMvlm.CopyToParam(cPar); + } + */ + //release all matrices and vectors + gsl_matrix_free (Y); + gsl_matrix_free (W); + } + + + //VC estimation with one or multiple kinship matrices + //REML approach only + //if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp + //for one phenotype only; + if (cPar.a_mode==61) { + gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc ); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, Y, 0); + + //read kinship matrices + if (!(cPar.file_mk).empty()) { + ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1, G->size1, G->size1); + CenterMatrix (&G_sub.matrix); + d=0; + for (size_t j=0; jsize1; j++) { + d+=gsl_matrix_get (&G_sub.matrix, j, j); + } + d/=(double)G->size1; + (cPar.v_traceG).push_back(d); + } + } else if (!(cPar.file_kin).empty()) { + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1; j++) { + d+=gsl_matrix_get (G, j, j); + } + d/=(double)G->size1; + (cPar.v_traceG).push_back(d); + } + /* + //eigen-decomposition and calculate trace_G + cout<<"Start Eigen-Decomposition..."<size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + } else { + ReadFile_eigenU (cPar.file_ku, cPar.error, U); + if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) { + if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} + cPar.trace_G+=gsl_vector_get(eval, i); + } + cPar.trace_G/=(double)eval->size; + } + */ + //fit multiple variance components + if (cPar.n_ph==1) { + // if (cPar.n_vc==1) { + /* + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, Y, UtY); + + gsl_vector_view beta=gsl_matrix_row (B, 0); + gsl_vector_view se_beta=gsl_matrix_row (se_B, 0); + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector); + + cPar.beta_mle_null.clear(); + cPar.se_beta_mle_null.clear(); + for (size_t i=0; isize2; i++) { + cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) ); + cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) ); + } + + CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector); + cPar.beta_remle_null.clear(); + cPar.se_beta_remle_null.clear(); + for (size_t i=0; isize2; i++) { + cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); + cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); + } + + CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); + cPar.PrintSummary(); + + //calculate and output residuals + if (cPar.a_mode==5) { + gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); + gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1); + gsl_vector *u_hat=gsl_vector_alloc (Y->size1); + gsl_vector *e_hat=gsl_vector_alloc (Y->size1); + gsl_vector *y_hat=gsl_vector_alloc (Y->size1); + + //obtain Utu and Ute + gsl_vector_memcpy (y_hat, &UtY_col.vector); + gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); + + double d, u, e; + for (size_t i=0; isize; i++) { + d=gsl_vector_get (eval, i); + u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); + e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); + gsl_vector_set (Utu_hat, i, u); + gsl_vector_set (Ute_hat, i, e); + } + + //obtain u and e + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); + + //output residuals + cPar.WriteVector(u_hat, "residU"); + cPar.WriteVector(e_hat, "residE"); + + gsl_vector_free(u_hat); + gsl_vector_free(e_hat); + gsl_vector_free(y_hat); + } +*/ + // } else { + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); + VC cVc; + cVc.CopyFromParam(cPar); + cVc.CalcVCreml (G, W, &Y_col.vector); + cVc.CopyToParam(cPar); + + //obtain pve from sigma2 + //obtain se_pve from se_sigma2 + + //} + } + + + } + + + //LMM or mvLMM or Eigen-Decomposition + if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==31) { //Fit LMM or mvLMM or eigen + gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + gsl_matrix *B=gsl_matrix_alloc (Y->size2, W->size2); //B is a d by c matrix + gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2); + gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); + gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); + gsl_vector *eval=gsl_vector_alloc (Y->size1); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, Y, 0); + + //read relatedness matrix G + if (!(cPar.file_kin).empty()) { + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + } else { + ReadFile_eigenU (cPar.file_ku, cPar.error, U); + if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) { + if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} + cPar.trace_G+=gsl_vector_get(eval, i); + } + cPar.trace_G/=(double)eval->size; + } + + if (cPar.a_mode==31) { + cPar.WriteMatrix(U, "eigenU"); + cPar.WriteVector(eval, "eigenD"); + } else { + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, Y, UtY); + + //calculate REMLE/MLE estimate and pve for univariate model + if (cPar.n_ph==1) { + gsl_vector_view beta=gsl_matrix_row (B, 0); + gsl_vector_view se_beta=gsl_matrix_row (se_B, 0); + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector); + + cPar.beta_mle_null.clear(); + cPar.se_beta_mle_null.clear(); + for (size_t i=0; isize2; i++) { + cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) ); + cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) ); + } + + CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector); + cPar.beta_remle_null.clear(); + cPar.se_beta_remle_null.clear(); + for (size_t i=0; isize2; i++) { + cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); + cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); + } + + CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); + cPar.PrintSummary(); + + //calculate and output residuals + if (cPar.a_mode==5) { + gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); + gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1); + gsl_vector *u_hat=gsl_vector_alloc (Y->size1); + gsl_vector *e_hat=gsl_vector_alloc (Y->size1); + gsl_vector *y_hat=gsl_vector_alloc (Y->size1); + + //obtain Utu and Ute + gsl_vector_memcpy (y_hat, &UtY_col.vector); + gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); + + double d, u, e; + for (size_t i=0; isize; i++) { + d=gsl_vector_get (eval, i); + u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); + e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); + gsl_vector_set (Utu_hat, i, u); + gsl_vector_set (Ute_hat, i, e); + } + + //obtain u and e + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); + + //output residuals + cPar.WriteVector(u_hat, "residU"); + cPar.WriteVector(e_hat, "residE"); + + gsl_vector_free(u_hat); + gsl_vector_free(e_hat); + gsl_vector_free(y_hat); + } + } + + //Fit LMM or mvLMM + if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) { + if (cPar.n_ph==1) { + LMM cLmm; + cLmm.CopyFromParam(cPar); + + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + if (!cPar.file_gene.empty()) { + cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype + } else if (!cPar.file_bfile.empty()) { + cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } else { + cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } + + cLmm.WriteFiles(); + cLmm.CopyToParam(cPar); + } else { + MVLMM cMvlmm; + cMvlmm.CopyFromParam(cPar); + + if (!cPar.file_bfile.empty()) { + cMvlmm.AnalyzePlink (U, eval, UtW, UtY); + } else { + cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY); + } + + cMvlmm.WriteFiles(); + cMvlmm.CopyToParam(cPar); + } + } + } + + + //release all matrices and vectors + gsl_matrix_free (Y); + gsl_matrix_free (W); + gsl_matrix_free(B); + gsl_matrix_free(se_B); + gsl_matrix_free (G); + gsl_matrix_free (U); + gsl_matrix_free (UtW); + gsl_matrix_free (UtY); + gsl_vector_free (eval); + } + + + //BSLMM + if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { + gsl_vector *y=gsl_vector_alloc (cPar.ni_test); + gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (y->size, y->size); + gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test); + + //set covariates matrix W and phenotype vector y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, y, 0); + + //center y, even for case/control data + cPar.pheno_mean=CenterVector(y); + + //run bslmm if rho==1 + if (cPar.rho_min==1 && cPar.rho_max==1) { + //read genotypes X (not UtX) + cPar.ReadGenotypes (UtX, G, false); + + //perform BSLMM analysis + BSLMM cBslmm; + cBslmm.CopyFromParam(cPar); + time_start=clock(); + cBslmm.MCMC(UtX, y); + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cBslmm.CopyToParam(cPar); + //else, if rho!=1 + } else { + gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); + gsl_vector *eval=gsl_vector_alloc (y->size); + gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2); + gsl_vector *Uty=gsl_vector_alloc (y->size); + + + //read relatedness matrix G + if (!(cPar.file_kin).empty()) { + cPar.ReadGenotypes (UtX, G, false); + + //read relatedness matrix G + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, y, Uty); + + //calculate REMLE/MLE estimate and pve + CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); + CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); + CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); + + cPar.PrintSummary(); + + //Creat and calcualte UtX, use a large memory + cout<<"Calculating UtX..."<tm_month<<":"<tm_day":"<tm_hour<<":"<tm_min<=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) { + outfile<<"## time on optimization = "<. +*/ + +#ifndef __GEMMA_H__ +#define __GEMMA_H__ + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + +using namespace std; + +class GEMMA { + +public: + //parameters + string version; + string date; + string year; + + //constructor + GEMMA(void); + + //functions + void PrintHeader (void); + void PrintHelp (size_t option); + void PrintLicense (void); + void Assign (int argc, char **argv, PARAM &cPar); + void BatchRun (PARAM &cPar); + void WriteLog (int argc, char **argv, PARAM &cPar); +}; + + +#endif + diff --git a/src/gzstream.cpp b/src/gzstream.cpp new file mode 100644 index 0000000..bbb4ba8 --- /dev/null +++ b/src/gzstream.cpp @@ -0,0 +1,165 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.C +// Revision : $Revision: 1.7 $ +// Revision_date : $Date: 2003/01/08 14:41:27 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#include "gzstream.h" +#include +#include // for memcpy + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { + if ( is_open()) + return (gzstreambuf*)0; + mode = open_mode; + // no append nor read/write mode + if ((mode & std::ios::ate) || (mode & std::ios::app) + || ((mode & std::ios::in) && (mode & std::ios::out))) + return (gzstreambuf*)0; + char fmode[10]; + char* fmodeptr = fmode; + if ( mode & std::ios::in) + *fmodeptr++ = 'r'; + else if ( mode & std::ios::out) + *fmodeptr++ = 'w'; + *fmodeptr++ = 'b'; + *fmodeptr = '\0'; + file = gzopen( name, fmode); + if (file == 0) + return (gzstreambuf*)0; + opened = 1; + return this; +} + +gzstreambuf * gzstreambuf::close() { + if ( is_open()) { + sync(); + opened = 0; + if ( gzclose( file) == Z_OK) + return this; + } + return (gzstreambuf*)0; +} + +int gzstreambuf::underflow() { // used for input buffer only + if ( gptr() && ( gptr() < egptr())) + return * reinterpret_cast( gptr()); + + if ( ! (mode & std::ios::in) || ! opened) + return EOF; + // Josuttis' implementation of inbuf + int n_putback = gptr() - eback(); + if ( n_putback > 4) + n_putback = 4; + memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + + int num = gzread( file, buffer+4, bufferSize-4); + if (num <= 0) // ERROR or EOF + return EOF; + + // reset buffer pointers + setg( buffer + (4 - n_putback), // beginning of putback area + buffer + 4, // read position + buffer + 4 + num); // end of buffer + + // return next character + return * reinterpret_cast( gptr()); +} + +int gzstreambuf::flush_buffer() { + // Separate the writing of the buffer from overflow() and + // sync() operation. + int w = pptr() - pbase(); + if ( gzwrite( file, pbase(), w) != w) + return EOF; + pbump( -w); + return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only + if ( ! ( mode & std::ios::out) || ! opened) + return EOF; + if (c != EOF) { + *pptr() = c; + pbump(1); + } + if ( flush_buffer() == EOF) + return EOF; + return c; +} + +int gzstreambuf::sync() { + // Changed to use flush_buffer() instead of overflow( EOF) + // which caused improper behavior with std::endl and flush(), + // bug reported by Vincent Ricard. + if ( pptr() && pptr() > pbase()) { + if ( flush_buffer() == EOF) + return -1; + } + return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { + init( &buf); + open( name, mode); +} + +gzstreambase::~gzstreambase() { + buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { + if ( ! buf.open( name, open_mode)) + clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { + if ( buf.is_open()) + if ( ! buf.close()) + clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff --git a/src/gzstream.h b/src/gzstream.h new file mode 100644 index 0000000..861653f --- /dev/null +++ b/src/gzstream.h @@ -0,0 +1,121 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+256; // size of data buff + // totals 512 bytes under g++ for igzstream at the end. + + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); +public: + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/src/io.cpp b/src/io.cpp new file mode 100644 index 0000000..c22f668 --- /dev/null +++ b/src/io.cpp @@ -0,0 +1,1396 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_cdf.h" + +#include "lapack.h" +#include "gzstream.h" +#include "mathfunc.h" + +#ifdef FORCE_FLOAT +#include "io_float.h" +#else +#include "io.h" +#endif + + +using namespace std; + + + +//Print process bar +void ProgressBar (string str, double p, double total) +{ + double progress = (100.0 * p / total); + int barsize = (int) (progress / 2.0); + char bar[51]; + + cout<sbumpc(); + switch (c) { + case '\n': + return is; + case '\r': + if(sb->sgetc() == '\n') + sb->sbumpc(); + return is; + case EOF: + // Also handle the case when the last line has no line ending + if(t.empty()) + is.setstate(std::ios::eofbit); + return is; + default: + t += (char)c; + } + } +} + +//Read snp file +bool ReadFile_snps (const string &file_snps, set &setSnps) +{ + setSnps.clear(); + + ifstream infile (file_snps.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open snps file: "< &mapRS2chr, map &mapRS2bp, map &mapRS2cM) +{ + mapRS2chr.clear(); + mapRS2bp.clear(); + + ifstream infile (file_anno.c_str(), ifstream::in); + if (!infile) {cout<<"error opening annotation file: "< &indicator_idv, vector &pheno, const int &p_column) +{ + indicator_idv.clear(); + pheno.clear(); + + igzstream infile (file_pheno.c_str(), igzstream::in); +// ifstream infile (file_pheno.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open phenotype file: "< > &indicator_pheno, vector > &pheno, const vector &p_column) +{ + indicator_pheno.clear(); + pheno.clear(); + + igzstream infile (file_pheno.c_str(), igzstream::in); +// ifstream infile (file_pheno.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open phenotype file: "< pheno_row; + vector ind_pheno_row; + + size_t p_max=*max_element(p_column.begin(), p_column.end() ); + map mapP2c; + for (size_t i=0; i &indicator_cvt, vector > &cvt, size_t &n_cvt) +{ + indicator_cvt.clear(); + + ifstream infile (file_cvt.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open covariates file: "< v_d; flag_na=0; + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + while (ch_ptr!=NULL) { + if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;} + else {d=atof(ch_ptr);} + + v_d.push_back(d); + ch_ptr=strtok (NULL, " , \t"); + } + if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} + cvt.push_back(v_d); + } + + if (indicator_cvt.empty()) {n_cvt=0;} + else { + flag_na=0; + for (vector::size_type i=0; i &snpInfo) +{ + snpInfo.clear(); + + ifstream infile (file_bim.c_str(), ifstream::in); + if (!infile) {cout<<"error opening .bim file: "< > &indicator_pheno, vector > &pheno, map &mapID2num, const vector &p_column) +{ + indicator_pheno.clear(); + pheno.clear(); + mapID2num.clear(); + + igzstream infile (file_fam.c_str(), igzstream::in); + //ifstream infile (file_fam.c_str(), ifstream::in); + if (!infile) {cout<<"error opening .fam file: "< pheno_row; + vector ind_pheno_row; + + size_t p_max=*max_element(p_column.begin(), p_column.end() ); + map mapP2c; + for (size_t i=0; i &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map &mapRS2chr, map &mapRS2bp, map &mapRS2cM, vector &snpInfo, size_t &ns_test) +{ + indicator_snp.clear(); + snpInfo.clear(); + + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + double v_x, v_w; + int c_idv=0; + + string line; + char *ch_ptr; + + string rs; + long int b_pos; + string chr; + string major; + string minor; + double cM; + + double maf, geno, geno_old; + size_t n_miss; + size_t n_0, n_1, n_2; + int flag_poly; + + int ni_total=indicator_idv.size(); + int ni_test=0; + for (int i=0; i=0 && geno<=0.5) {n_0++;} + if (geno>0.5 && geno<1.5) {n_1++;} + if (geno>=1.5 && geno<=2.0) {n_2++;} + + gsl_vector_set (genotype, c_idv, geno); + +// if (geno<0) {n_miss++; continue;} + + if (flag_poly==0) {geno_old=geno; flag_poly=2;} + if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} + + maf+=geno; + + c_idv++; + } + maf/=2.0*(double)(ni_test-n_miss); + + SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf}; + snpInfo.push_back(sInfo); + + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} + + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + + if (flag_poly!=1) {indicator_snp.push_back(0); continue;} + + if (hwe_level!=0) { + if (CalcHWE(n_0, n_2, n_1)size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + } + + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot (genotype, genotype, &v_x); + gsl_blas_ddot (Wtx, WtWiWtx, &v_w); + + if (v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} + + indicator_snp.push_back(1); + ns_test++; + } + + gsl_vector_free (genotype); + gsl_vector_free (genotype_miss); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (Wtx); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); + + infile.close(); + infile.clear(); + + return true; +} + + + + + + +//Read bed file, the first time +bool ReadFile_bed (const string &file_bed, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) +{ + indicator_snp.clear(); + size_t ns_total=snpInfo.size(); + + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + double v_x, v_w, geno; + size_t c_idv=0; + + char ch[1]; + bitset<8> b; + + size_t ni_total=indicator_idv.size(); + size_t ni_test=0; + for (size_t i=0; i miss_level) {indicator_snp.push_back(0); continue;} + + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + + if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;} + + if (hwe_level!=1) { + if (CalcHWE(n_0, n_2, n_1)size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + } + + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot (genotype, genotype, &v_x); + gsl_blas_ddot (Wtx, WtWiWtx, &v_w); + + if (v_w/v_x > r2_level) {indicator_snp.push_back(0); continue;} + + indicator_snp.push_back(1); + ns_test++; + } + + gsl_vector_free (genotype); + gsl_vector_free (genotype_miss); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (Wtx); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); + + infile.close(); + infile.clear(); + + return true; +} + + + +void ReadFile_kin (const string &file_kin, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) +{ + igzstream infile (file_kin.c_str(), igzstream::in); +// ifstream infile (file_kin.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open kinship file: "< mapID2ID; + size_t c=0; + for (size_t i=0; isize1, G->size1, G->size1); + ReadFile_kin (file_kin, indicator_idv, mapID2num, k_mode, error, &G_sub.matrix); + i++; + } + + infile.close(); + infile.clear(); + return; +} + + +void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) +{ + igzstream infile (file_ku.c_str(), igzstream::in); +// ifstream infile (file_ku.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open the U file: "<size1, n_col=U->size2, i_row=0, i_col=0; + + gsl_matrix_set_zero (U); + + string line; + char *ch_ptr; + double d; + + while (getline(infile, line)) { + if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<size, i_row=0; + + gsl_vector_set_zero (eval); + + string line; + char *ch_ptr; + double d; + + while (getline(infile, line)) { + if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<size1; + gsl_vector *geno=gsl_vector_alloc (ni_total); + gsl_vector *geno_miss=gsl_vector_alloc (ni_total); + + size_t ns_test=0; + for (size_t t=0; t &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + size_t n_miss, ci_total; + double d, geno_mean, geno_var; + + size_t ni_total=matrix_kin->size1; + gsl_vector *geno=gsl_vector_alloc (ni_total); + + size_t ns_test=0; + int n_bit; + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (size_t t=0; t &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) +{ + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (UtX->size1); + double geno, geno_mean; + size_t n_miss; + + int ni_total=(int)indicator_idv.size(); + int ns_total=(int)indicator_snp.size(); + int ni_test=UtX->size1; + int ns_test=UtX->size2; + + int c_idv=0, c_snp=0; + + for (int i=0; isize; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=0;} + else {geno=gsl_vector_get (genotype, i); geno-=geno_mean;} + + gsl_vector_set (genotype, i, geno); + gsl_matrix_set (UtX, i, c_snp, geno); + } + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; isize; ++i) { + for (size_t j=0; j &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + int ni_total=(int)indicator_idv.size(); + int ns_total=(int)indicator_snp.size(); + int ni_test=UtX->size1; + int ns_test=UtX->size2; + int n_bit; + + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1;} + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + if (calc_K==true) {gsl_matrix_set_zero (K);} + + gsl_vector *genotype=gsl_vector_alloc (UtX->size1); + + double geno, geno_mean; + size_t n_miss; + int c_idv=0, c_snp=0, c=0; + + //start reading snps and doing association test + for (int t=0; tsize; ++i) { + geno=gsl_vector_get (genotype, i); + if (geno==-9) {geno=0;} + else {geno-=geno_mean;} + + gsl_vector_set (genotype, i, geno); + gsl_matrix_set (UtX, i, c_snp, geno); + } + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; isize; ++i) { + for (size_t j=0; j &est_column, map &mapRS2est) +{ + mapRS2est.clear(); + + ifstream infile (file_est.c_str(), ifstream::in); + if (!infile) {cout<<"error opening estimated parameter file: "<(infile), istreambuf_iterator(), '\n'); + infile.seekg (0, ios::beg); + + return true; +} + + + +//Read gene expression file +bool ReadFile_gene (const string &file_gene, vector &vec_read, vector &snpInfo, size_t &ng_total) +{ + vec_read.clear(); + ng_total=0; + + ifstream infile (file_gene.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open gene expression file: "<. +*/ + +#ifndef __IO_H__ +#define __IO_H__ + + +#include +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + +using namespace std; + +void ProgressBar (string str, double p, double total); +void ProgressBar (string str, double p, double total, double ratio); +std::istream& safeGetline(std::istream& is, std::string& t); + +bool ReadFile_snps (const string &file_snps, set &setSnps); +bool ReadFile_log (const string &file_log, double &pheno_mean); + +bool ReadFile_bim (const string &file_bim, vector &snpInfo); +bool ReadFile_fam (const string &file_fam, vector > &indicator_pheno, vector > &pheno, map &mapID2num, const vector &p_column); + +bool ReadFile_cvt (const string &file_cvt, vector &indicator_cvt, vector > &cvt, size_t &n_cvt); +bool ReadFile_anno (const string &file_bim, map &mapRS2chr, map &mapRS2bp, map &mapRS2cM); +bool ReadFile_pheno (const string &file_pheno, vector > &indicator_pheno, vector > &pheno, const vector &p_column); +bool ReadFile_column (const string &file_pheno, vector &indicator_idv, vector &pheno, const int &p_column); + +bool ReadFile_geno (const string &file_geno, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map &mapRS2chr, map &mapRS2bp, map &mapRS2cM, vector &snpInfo, size_t &ns_test); +bool ReadFile_bed (const string &file_bed, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test); + +void ReadFile_kin (const string &file_kin, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); +void ReadFile_mk (const string &file_mk, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); +void ReadFile_eigenU (const string &file_u, bool &error, gsl_matrix *U); +void ReadFile_eigenD (const string &file_d, bool &error, gsl_vector *eval); + +bool BimbamKin (const string &file_geno, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); +bool PlinkKin (const string &file_bed, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); + +bool ReadFile_geno (const string &file_geno, vector &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); +bool ReadFile_bed (const string &file_bed, vector &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + +bool ReadFile_est (const string &file_est, const vector &est_column, map &mapRS2est); + +bool CountFileLines (const string &file_input, size_t &n_lines); + +bool ReadFile_gene (const string &file_gene, vector &vec_read, vector &snpInfo, size_t &ng_total); + +#endif + + + + + + + diff --git a/src/lapack.cpp b/src/lapack.cpp new file mode 100644 index 0000000..83d5290 --- /dev/null +++ b/src/lapack.cpp @@ -0,0 +1,609 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" + +using namespace std; + +extern "C" void sgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, float *ALPHA, float *A, int *LDA, float *B, int *LDB, float *BETA, float *C, int *LDC); +extern "C" void spotrf_(char *UPLO, int *N, float *A, int *LDA, int *INFO); +extern "C" void spotrs_(char *UPLO, int *N, int *NRHS, float *A, int *LDA, float *B, int *LDB, int *INFO); +extern "C" void ssyev_(char* JOBZ, char* UPLO, int *N, float *A, int *LDA, float *W, float *WORK, int *LWORK, int *INFO); +extern "C" void ssyevr_(char* JOBZ, char *RANGE, char* UPLO, int *N, float *A, int *LDA, float *VL, float *VU, int *IL, int *IU, float *ABSTOL, int *M, float *W, float *Z, int *LDZ, int *ISUPPZ, float *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO); + +extern "C" void dgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double *ALPHA, double *A, int *LDA, double *B, int *LDB, double *BETA, double *C, int *LDC); +extern "C" void dpotrf_(char *UPLO, int *N, double *A, int *LDA, int *INFO); +extern "C" void dpotrs_(char *UPLO, int *N, int *NRHS, double *A, int *LDA, double *B, int *LDB, int *INFO); +extern "C" void dsyev_(char* JOBZ, char* UPLO, int *N, double *A, int *LDA, double *W, double *WORK, int *LWORK, int *INFO); +extern "C" void dsyevr_(char* JOBZ, char *RANGE, char* UPLO, int *N, double *A, int *LDA, double *VL, double *VU, int *IL, int *IU, double *ABSTOL, int *M, double *W, double *Z, int *LDZ, int *ISUPPZ, double *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO); + + +//cholesky decomposition, A is distroyed +void lapack_float_cholesky_decomp (gsl_matrix_float *A) +{ + int N=A->size1, LDA=A->size1, INFO; + char UPLO='L'; + + if (N!=(int)A->size2) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_decomp."<data, &LDA, &INFO); + if (INFO!=0) {cout<<"Cholesky decomposition unsuccessful in lapack_cholesky_decomp."<size1, LDA=A->size1, INFO; + char UPLO='L'; + + if (N!=(int)A->size2) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_decomp."<data, &LDA, &INFO); + if (INFO!=0) {cout<<"Cholesky decomposition unsuccessful in lapack_cholesky_decomp."<size1, NRHS=1, LDA=A->size1, LDB=b->size, INFO; + char UPLO='L'; + + if (N!=(int)A->size2 || N!=LDB) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_solve."<data, &LDA, x->data, &LDB, &INFO); + if (INFO!=0) {cout<<"Cholesky solve unsuccessful in lapack_cholesky_solve."<size1, NRHS=1, LDA=A->size1, LDB=b->size, INFO; + char UPLO='L'; + + if (N!=(int)A->size2 || N!=LDB) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_solve."<data, &LDA, x->data, &LDB, &INFO); + if (INFO!=0) {cout<<"Cholesky solve unsuccessful in lapack_cholesky_solve."<size1, LDB=B->size1, LDC=C->size2; + + if (*TransA=='N' || *TransA=='n') {M=A->size1; K1=A->size2;} + else if (*TransA=='T' || *TransA=='t') {M=A->size2; K1=A->size1;} + else {cout<<"need 'N' or 'T' in lapack_sgemm"<size2; K2=B->size1;} + else if (*TransB=='T' || *TransB=='t') {N=B->size1; K2=B->size2;} + else {cout<<"need 'N' or 'T' in lapack_sgemm"<size1!=(size_t)M || C->size2!=(size_t)N) {cout<<"C not compatible in lapack_sgemm"<size2, A->size1); + gsl_matrix_float_transpose_memcpy (A_t, A); + gsl_matrix_float *B_t=gsl_matrix_float_alloc (B->size2, B->size1); + gsl_matrix_float_transpose_memcpy (B_t, B); + gsl_matrix_float *C_t=gsl_matrix_float_alloc (C->size2, C->size1); + gsl_matrix_float_transpose_memcpy (C_t, C); + + sgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC); + gsl_matrix_float_transpose_memcpy (C, C_t); + + gsl_matrix_float_free (A_t); + gsl_matrix_float_free (B_t); + gsl_matrix_float_free (C_t); + return; +} + + + +void lapack_dgemm (char *TransA, char *TransB, double alpha, const gsl_matrix *A, const gsl_matrix *B, double beta, gsl_matrix *C) +{ + int M, N, K1, K2, LDA=A->size1, LDB=B->size1, LDC=C->size2; + + if (*TransA=='N' || *TransA=='n') {M=A->size1; K1=A->size2;} + else if (*TransA=='T' || *TransA=='t') {M=A->size2; K1=A->size1;} + else {cout<<"need 'N' or 'T' in lapack_dgemm"<size2; K2=B->size1;} + else if (*TransB=='T' || *TransB=='t') {N=B->size1; K2=B->size2;} + else {cout<<"need 'N' or 'T' in lapack_dgemm"<size1!=(size_t)M || C->size2!=(size_t)N) {cout<<"C not compatible in lapack_dgemm"<size2, A->size1); + gsl_matrix_transpose_memcpy (A_t, A); + gsl_matrix *B_t=gsl_matrix_alloc (B->size2, B->size1); + gsl_matrix_transpose_memcpy (B_t, B); + gsl_matrix *C_t=gsl_matrix_alloc (C->size2, C->size1); + gsl_matrix_transpose_memcpy (C_t, C); + + dgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC); + + gsl_matrix_transpose_memcpy (C, C_t); + + gsl_matrix_free (A_t); + gsl_matrix_free (B_t); + gsl_matrix_free (C_t); + return; +} + + + +//eigen value decomposition, matrix A is destroyed, float seems to have problem with large matrices (in mac) +void lapack_float_eigen_symmv (gsl_matrix_float *A, gsl_vector_float *eval, gsl_matrix_float *evec, const size_t flag_largematrix) +{ + if (flag_largematrix==1) { + int N=A->size1, LDA=A->size1, INFO, LWORK=-1; + char JOBZ='V', UPLO='L'; + + if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<data, &LDA, eval->data, temp, &LWORK, &INFO); + // if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_eigen_symmv."<data, &LDA, eval->data, WORK, &LWORK, &INFO); + if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<size1, LDA=A->size1, LDZ=A->size1, INFO, LWORK=-1, LIWORK=-1; + char JOBZ='V', UPLO='L', RANGE='A'; + float ABSTOL=1.0E-7; + + //VL, VU, IL, IU are not referenced; M equals N if RANGE='A' + float VL=0.0, VU=0.0; + int IL=0, IU=0, M; + + if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_float_eigen_symmv."<data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK_temp, &LWORK, IWORK_temp, &LIWORK, &INFO); + if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_float_eigen_symmv."<data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK, &LWORK, IWORK, &LIWORK, &INFO); + if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_float_eigen_symmv."<size1, LDA=A->size1, INFO, LWORK=-1; + char JOBZ='V', UPLO='L'; + + if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<data, &LDA, eval->data, temp, &LWORK, &INFO); + // if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_eigen_symmv."<data, &LDA, eval->data, WORK, &LWORK, &INFO); + if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<size1, LDA=A->size1, LDZ=A->size1, INFO, LWORK=-1, LIWORK=-1; + char JOBZ='V', UPLO='L', RANGE='A'; + double ABSTOL=1.0E-7; + + //VL, VU, IL, IU are not referenced; M equals N if RANGE='A' + double VL=0.0, VU=0.0; + int IL=0, IU=0, M; + + if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK_temp, &LWORK, IWORK_temp, &LIWORK, &INFO); + if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_eigen_symmv."<data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK, &LWORK, IWORK, &LIWORK, &INFO); + if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<size1); + gsl_eigen_symmv (G, eval, U, w); + gsl_eigen_symmv_free (w); +#endif + /* + for (size_t i=0; isize; ++i) { + if (gsl_vector_get (eval, i)<1e-10) { +// cout<size; ++i) { + d+=gsl_vector_get(eval, i); + } + d/=(double)eval->size; + + return d; +} + + +//DO NOT set eigen values to be positive +double EigenDecomp (gsl_matrix_float *G, gsl_matrix_float *U, gsl_vector_float *eval, const size_t flag_largematrix) +{ +#ifdef WITH_LAPACK + lapack_float_eigen_symmv (G, eval, U, flag_largematrix); +#else + //gsl doesn't provide float precision eigen decomposition; plus, float precision eigen decomposition in lapack may not work on OS 10.4 + //first change to double precision + gsl_matrix *G_double=gsl_matrix_alloc (G->size1, G->size2); + gsl_matrix *U_double=gsl_matrix_alloc (U->size1, U->size2); + gsl_vector *eval_double=gsl_vector_alloc (eval->size); + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set(G_double, i, j, gsl_matrix_float_get(G, i, j)); + } + } + gsl_eigen_symmv_workspace *w_space=gsl_eigen_symmv_alloc (G->size1); + gsl_eigen_symmv (G_double, eval_double, U_double, w_space); + gsl_eigen_symmv_free (w_space); + + //change back to float precision + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set(K, i, j, gsl_matrix_get(G_double, i, j)); + } + } + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set(U, i, j, gsl_matrix_get(U_double, i, j)); + } + } + for (size_t i=0; isize; i++) { + gsl_vector_float_set(eval, i, gsl_vector_get(eval_double, i)); + } + + //delete double precision matrices + gsl_matrix_free (G_double); + gsl_matrix_free (U_double); + gsl_vector_free (eval_double); +#endif + /* + for (size_t i=0; isize; ++i) { + if (gsl_vector_float_get (eval, i)<1e-10) { + gsl_vector_float_set (eval, i, 0); + } + } + */ + //calculate track_G=mean(diag(G)) + double d=0.0; + for (size_t i=0; isize; ++i) { + d+=gsl_vector_float_get(eval, i); + } + d/=(double)eval->size; + + return d; +} + + +double CholeskySolve(gsl_matrix *Omega, gsl_vector *Xty, gsl_vector *OiXty) +{ + double logdet_O=0.0; + +#ifdef WITH_LAPACK + lapack_cholesky_decomp(Omega); + for (size_t i=0; isize1; ++i) { + logdet_O+=log(gsl_matrix_get (Omega, i, i)); + } + logdet_O*=2.0; + lapack_cholesky_solve(Omega, Xty, OiXty); +#else + int status = gsl_linalg_cholesky_decomp(Omega); + if(status == GSL_EDOM) { + cout << "## non-positive definite matrix" << endl; + // exit(0); + } + + for (size_t i=0; isize1; ++i) { + logdet_O+=log(gsl_matrix_get (Omega, i, i)); + } + logdet_O*=2.0; + + gsl_vector_memcpy (OiXty, Xty); + gsl_blas_dtrsv(CblasLower, CblasNoTrans, CblasNonUnit, Omega, OiXty); + gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, OiXty); + // gsl_linalg_cholesky_solve(XtX, Xty, iXty); +#endif + + return logdet_O; +} + + +double CholeskySolve(gsl_matrix_float *Omega, gsl_vector_float *Xty, gsl_vector_float *OiXty) +{ + double logdet_O=0.0; + +#ifdef WITH_LAPACK + lapack_float_cholesky_decomp(Omega); + for (size_t i=0; isize1; ++i) { + logdet_O+=log(gsl_matrix_float_get (Omega, i, i)); + } + logdet_O*=2.0; + lapack_float_cholesky_solve(Omega, Xty, OiXty); +#else + gsl_matrix *Omega_double=gsl_matrix_alloc (Omega->size1, Omega->size2); + double d; + for (size_t i=0; isize1; ++i) { + for (size_t j=0; jsize2; ++j) { + d=(double)gsl_matrix_float_get (Omega, i, j); + gsl_matrix_set (Omega_double, i, j, d); + } + } + + int status = gsl_linalg_cholesky_decomp(Omega_double); + if(status == GSL_EDOM) { + cout << "## non-positive definite matrix" << endl; + // exit(0); + } + + for (size_t i=0; isize1; ++i) { + for (size_t j=0; jsize2; ++j) { + d=gsl_matrix_get (Omega_double, i, j); + if (j==i) {logdet_O+=log(d);} + gsl_matrix_float_set (Omega, i, j, (float)d); + } + } + logdet_O*=2.0; + + gsl_vector_float_memcpy (OiXty, Xty); + gsl_blas_strsv(CblasLower, CblasNoTrans, CblasNonUnit, Omega, OiXty); + gsl_blas_strsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, OiXty); + // gsl_linalg_cholesky_solve(XtX, Xty, iXty); + + gsl_matrix_free (Omega_double); +#endif + + return logdet_O; +} + + +//LU decomposition +void LUDecomp (gsl_matrix *LU, gsl_permutation *p, int *signum) +{ + gsl_linalg_LU_decomp (LU, p, signum); + return; +} + +void LUDecomp (gsl_matrix_float *LU, gsl_permutation *p, int *signum) +{ + gsl_matrix *LU_double=gsl_matrix_alloc (LU->size1, LU->size2); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set (LU_double, i, j, gsl_matrix_float_get(LU, i, j)); + } + } + + //LU decomposition + gsl_linalg_LU_decomp (LU_double, p, signum); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set (LU, i, j, gsl_matrix_get(LU_double, i, j)); + } + } + + //free matrix + gsl_matrix_free (LU_double); + return; +} + + +//LU invert +void LUInvert (const gsl_matrix *LU, const gsl_permutation *p, gsl_matrix *inverse) +{ + gsl_linalg_LU_invert (LU, p, inverse); + return; +} + +void LUInvert (const gsl_matrix_float *LU, const gsl_permutation *p, gsl_matrix_float *inverse) +{ + gsl_matrix *LU_double=gsl_matrix_alloc (LU->size1, LU->size2); + gsl_matrix *inverse_double=gsl_matrix_alloc (inverse->size1, inverse->size2); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set (LU_double, i, j, gsl_matrix_float_get(LU, i, j)); + } + } + + //LU decomposition + gsl_linalg_LU_invert (LU_double, p, inverse_double); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set (inverse, i, j, gsl_matrix_get(inverse_double, i, j)); + } + } + + //free matrix + gsl_matrix_free (LU_double); + gsl_matrix_free (inverse_double); + return; +} + +//LU lndet +double LULndet (gsl_matrix *LU) +{ + double d; + d=gsl_linalg_LU_lndet (LU); + return d; +} + +double LULndet (gsl_matrix_float *LU) +{ + gsl_matrix *LU_double=gsl_matrix_alloc (LU->size1, LU->size2); + double d; + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set (LU_double, i, j, gsl_matrix_float_get(LU, i, j)); + } + } + + //LU decomposition + d=gsl_linalg_LU_lndet (LU_double); + + //copy float matrix to double + /* + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set (LU, i, j, gsl_matrix_get(LU_double, i, j)); + } + } + */ + //free matrix + gsl_matrix_free (LU_double); + return d; +} + + +//LU solve +void LUSolve (const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x) +{ + gsl_linalg_LU_solve (LU, p, b, x); + return; +} + +void LUSolve (const gsl_matrix_float *LU, const gsl_permutation *p, const gsl_vector_float *b, gsl_vector_float *x) +{ + gsl_matrix *LU_double=gsl_matrix_alloc (LU->size1, LU->size2); + gsl_vector *b_double=gsl_vector_alloc (b->size); + gsl_vector *x_double=gsl_vector_alloc (x->size); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set (LU_double, i, j, gsl_matrix_float_get(LU, i, j)); + } + } + + for (size_t i=0; isize; i++) { + gsl_vector_set (b_double, i, gsl_vector_float_get(b, i)); + } + + for (size_t i=0; isize; i++) { + gsl_vector_set (x_double, i, gsl_vector_float_get(x, i)); + } + + //LU decomposition + gsl_linalg_LU_solve (LU_double, p, b_double, x_double); + + //copy float matrix to double + for (size_t i=0; isize; i++) { + gsl_vector_float_set (x, i, gsl_vector_get(x_double, i)); + } + + //free matrix + gsl_matrix_free (LU_double); + gsl_vector_free (b_double); + gsl_vector_free (x_double); + return; +} + + diff --git a/src/lapack.h b/src/lapack.h new file mode 100644 index 0000000..cb7b156 --- /dev/null +++ b/src/lapack.h @@ -0,0 +1,53 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __LAPACK_H__ +#define __LAPACK_H__ + + + +using namespace std; + + +void lapack_float_cholesky_decomp (gsl_matrix_float *A); +void lapack_cholesky_decomp (gsl_matrix *A); +void lapack_float_cholesky_solve (gsl_matrix_float *A, const gsl_vector_float *b, gsl_vector_float *x); +void lapack_cholesky_solve (gsl_matrix *A, const gsl_vector *b, gsl_vector *x); +void lapack_sgemm (char *TransA, char *TransB, float alpha, const gsl_matrix_float *A, const gsl_matrix_float *B, float beta, gsl_matrix_float *C); +void lapack_dgemm (char *TransA, char *TransB, double alpha, const gsl_matrix *A, const gsl_matrix *B, double beta, gsl_matrix *C); +void lapack_float_eigen_symmv (gsl_matrix_float *A, gsl_vector_float *eval, gsl_matrix_float *evec, const size_t flag_largematrix); +void lapack_eigen_symmv (gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec, const size_t flag_largematrix); + +double EigenDecomp (gsl_matrix *G, gsl_matrix *U, gsl_vector *eval, const size_t flag_largematrix); +double EigenDecomp (gsl_matrix_float *G, gsl_matrix_float *U, gsl_vector_float *eval, const size_t flag_largematrix); + +double CholeskySolve(gsl_matrix *Omega, gsl_vector *Xty, gsl_vector *OiXty); +double CholeskySolve(gsl_matrix_float *Omega, gsl_vector_float *Xty, gsl_vector_float *OiXty); + +void LUDecomp (gsl_matrix *LU, gsl_permutation *p, int *signum); +void LUDecomp (gsl_matrix_float *LU, gsl_permutation *p, int *signum); +void LUInvert (const gsl_matrix *LU, const gsl_permutation *p, gsl_matrix *inverse); +void LUInvert (const gsl_matrix_float *LU, const gsl_permutation *p, gsl_matrix_float *inverse); +double LULndet (gsl_matrix *LU); +double LULndet (gsl_matrix_float *LU); +void LUSolve (const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x); +void LUSolve (const gsl_matrix_float *LU, const gsl_permutation *p, const gsl_vector_float *b, gsl_vector_float *x); +#endif + + + diff --git a/src/lm.cpp b/src/lm.cpp new file mode 100644 index 0000000..7577d0a --- /dev/null +++ b/src/lm.cpp @@ -0,0 +1,572 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + + +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" +#include "gsl/gsl_min.h" +#include "gsl/gsl_integration.h" + +#include "gzstream.h" +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "lm_float.h" +#else +#include "lm.h" +#endif + + +using namespace std; + + + + + +void LM::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + file_gene=cPar.file_gene; + + time_opt=0.0; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + ng_total=cPar.ng_total; + ng_test=0; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void LM::CopyToParam (PARAM &cPar) +{ + cPar.time_opt=time_opt; + + cPar.ng_test=ng_test; + + return; +} + + + +void LM::WriteFiles () +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".assoc.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<::size_type t=0; tsize; + double d; + + gsl_vector *WtWiWtx=gsl_vector_alloc (c_size); + + gsl_blas_ddot (x, x, &xPwx); + gsl_blas_ddot (x, y, &xPwy); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + + gsl_blas_ddot (WtWiWtx, Wtx, &d); + xPwx-=d; + + gsl_blas_ddot (WtWiWtx, Wty, &d); + xPwy-=d; + + gsl_vector_free (WtWiWtx); + + return; +} + + +void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, double &yPwy) +{ + size_t c_size=Wty->size; + double d; + + gsl_vector *WtWiWty=gsl_vector_alloc (c_size); + + gsl_blas_ddot (y, y, &yPwy); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); + + gsl_blas_ddot (WtWiWty, Wty, &d); + yPwy-=d; + + gsl_vector_free (WtWiWty); + + return; +} + + + +//calculate p values and beta/se in a linear model +void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, const double xPwx, const double df, const size_t n_size, double &beta, double &se, double &p_wald, double &p_lrt, double &p_score) +{ + double yPxy=yPwy-xPwy*xPwy/xPwx; + double se_wald, se_score; + + beta=xPwy/xPwx; + se_wald=sqrt(yPxy/(df*xPwx) ); + se_score=sqrt(yPwy/((double)n_size*xPwx) ); + + p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df); + p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df); + p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1); + + if (test_mode==3) {se=se_score;} else {se=se_wald;} + + return; +} + + + + +void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) +{ + ifstream infile (file_gene.c_str(), ifstream::in); + if (!infile) {cout<<"error reading gene expression file:"<size1-(double)W->size2-1.0; + + gsl_vector *y=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wtx, x, xPwx); + + //header + getline(infile, line); + + for (size_t t=0; tsize1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<size1-(double)W->size2-1.0; + + gsl_vector *x=gsl_vector_alloc (W->size1); + gsl_vector *x_miss=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + //start reading genotypes and analyze + for (size_t t=0; t1) {break;} + getline(infile, line); + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< b; + + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + + //calculate some basic quantities + double yPwy, xPwy, xPwx; + double df=(double)W->size1-(double)W->size2-1.0; + + gsl_vector *x=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + + for (vector::size_type t=0; t1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< > &pos_loglr) +{ + double yty, xty, xtx, log_lr; + gsl_blas_ddot(y, y, &yty); + + for (size_t i=0; isize2; ++i) { + gsl_vector_const_view X_col=gsl_matrix_const_column (X, i); + gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx); + gsl_blas_ddot(&X_col.vector, y, &xty); + + log_lr=0.5*(double)y->size*(log(yty)-log(yty-xty*xty/xtx)); + pos_loglr.push_back(make_pair(i,log_lr) ); + } + + return; +} diff --git a/src/lm.h b/src/lm.h new file mode 100644 index 0000000..ceec060 --- /dev/null +++ b/src/lm.h @@ -0,0 +1,75 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#ifndef __LM_H__ +#define __LM_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + +class LM { + +public: + // IO related parameters + int a_mode; //analysis mode, 50+1/2/3/4 for Frequentist tests + size_t d_pace; //display pace + + string file_bfile; + string file_geno; + string file_out; + string path_out; + + string file_gene; + + // Summary statistics + size_t ni_total, ni_test; //number of individuals + size_t ns_total, ns_test; //number of snps + size_t ng_total, ng_test; //number of genes + size_t n_cvt; + double time_opt; //time spent + + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + vector sumStat; //Output SNPSummary Data + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void AnalyzeGene (const gsl_matrix *W, const gsl_vector *x); + void AnalyzePlink (const gsl_matrix *W, const gsl_vector *y); + void AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y); + void WriteFiles (); +}; +void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector > &pos_loglr); +#endif diff --git a/src/lmm.cpp b/src/lmm.cpp new file mode 100644 index 0000000..e0b4160 --- /dev/null +++ b/src/lmm.cpp @@ -0,0 +1,1771 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + + +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" +#include "gsl/gsl_min.h" +#include "gsl/gsl_integration.h" + +#include "io.h" +#include "lapack.h" +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "lmm_float.h" +#else +#include "lmm.h" +#endif + + +using namespace std; + + + + + +void LMM::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + file_gene=cPar.file_gene; + + l_min=cPar.l_min; + l_max=cPar.l_max; + n_region=cPar.n_region; + l_mle_null=cPar.l_mle_null; + logl_mle_H0=cPar.logl_mle_H0; + + time_UtX=0.0; + time_opt=0.0; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + ng_total=cPar.ng_total; + ng_test=0; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void LMM::CopyToParam (PARAM &cPar) +{ + cPar.time_UtX=time_UtX; + cPar.time_opt=time_opt; + + cPar.ng_test=ng_test; + + return; +} + + + +void LMM::WriteFiles () +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".assoc.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<::size_type t=0; tn_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + + +void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval, const gsl_matrix *Uab, const gsl_vector *ab, gsl_matrix *Pab) +{ + size_t index_ab, index_aw, index_bw, index_ww; + double p_ab; + double ps_ab, ps_aw, ps_bw, ps_ww; + + for (size_t p=0; p<=n_cvt+1; ++p) { + for (size_t a=p+1; a<=n_cvt+2; ++a) { + for (size_t b=a; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (a, b, n_cvt); + if (p==0) { + gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); + gsl_blas_ddot (Hi_eval, &Uab_col.vector, &p_ab); + if (e_mode!=0) {p_ab=gsl_vector_get (ab, index_ab)-p_ab;} + gsl_matrix_set (Pab, 0, index_ab, p_ab); + } + else { + index_aw=GetabIndex (a, p, n_cvt); + index_bw=GetabIndex (b, p, n_cvt); + index_ww=GetabIndex (p, p, n_cvt); + + ps_ab=gsl_matrix_get (Pab, p-1, index_ab); + ps_aw=gsl_matrix_get (Pab, p-1, index_aw); + ps_bw=gsl_matrix_get (Pab, p-1, index_bw); + ps_ww=gsl_matrix_get (Pab, p-1, index_ww); + + p_ab=ps_ab-ps_aw*ps_bw/ps_ww; + gsl_matrix_set (Pab, p, index_ab, p_ab); + } + } + } + } + return; +} + + +void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_eval, const gsl_matrix *Uab, const gsl_vector *ab, const gsl_matrix *Pab, gsl_matrix *PPab) +{ + size_t index_ab, index_aw, index_bw, index_ww; + double p2_ab; + double ps2_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww; + + for (size_t p=0; p<=n_cvt+1; ++p) { + for (size_t a=p+1; a<=n_cvt+2; ++a) { + for (size_t b=a; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (a, b, n_cvt); + if (p==0) { + gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); + gsl_blas_ddot (HiHi_eval, &Uab_col.vector, &p2_ab); + if (e_mode!=0) {p2_ab=p2_ab-gsl_vector_get (ab, index_ab)+2.0*gsl_matrix_get (Pab, 0, index_ab);} + gsl_matrix_set (PPab, 0, index_ab, p2_ab); + } + else { + index_aw=GetabIndex (a, p, n_cvt); + index_bw=GetabIndex (b, p, n_cvt); + index_ww=GetabIndex (p, p, n_cvt); + + ps2_ab=gsl_matrix_get (PPab, p-1, index_ab); + ps_aw=gsl_matrix_get (Pab, p-1, index_aw); + ps_bw=gsl_matrix_get (Pab, p-1, index_bw); + ps_ww=gsl_matrix_get (Pab, p-1, index_ww); + ps2_aw=gsl_matrix_get (PPab, p-1, index_aw); + ps2_bw=gsl_matrix_get (PPab, p-1, index_bw); + ps2_ww=gsl_matrix_get (PPab, p-1, index_ww); + + p2_ab=ps2_ab+ps_aw*ps_bw*ps2_ww/(ps_ww*ps_ww); + p2_ab-=(ps_aw*ps2_bw+ps_bw*ps2_aw)/ps_ww; + gsl_matrix_set (PPab, p, index_ab, p2_ab); + + } + } + } + } + return; +} + + +void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiHi_eval, const gsl_matrix *Uab, const gsl_vector *ab, const gsl_matrix *Pab, const gsl_matrix *PPab, gsl_matrix *PPPab) +{ + size_t index_ab, index_aw, index_bw, index_ww; + double p3_ab; + double ps3_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww, ps3_aw, ps3_bw, ps3_ww; + + for (size_t p=0; p<=n_cvt+1; ++p) { + for (size_t a=p+1; a<=n_cvt+2; ++a) { + for (size_t b=a; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (a, b, n_cvt); + if (p==0) { + gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); + gsl_blas_ddot (HiHiHi_eval, &Uab_col.vector, &p3_ab); + if (e_mode!=0) {p3_ab=gsl_vector_get (ab, index_ab)-p3_ab+3.0*gsl_matrix_get (PPab, 0, index_ab)-3.0*gsl_matrix_get (Pab, 0, index_ab);} + gsl_matrix_set (PPPab, 0, index_ab, p3_ab); + } + else { + index_aw=GetabIndex (a, p, n_cvt); + index_bw=GetabIndex (b, p, n_cvt); + index_ww=GetabIndex (p, p, n_cvt); + + ps3_ab=gsl_matrix_get (PPPab, p-1, index_ab); + ps_aw=gsl_matrix_get (Pab, p-1, index_aw); + ps_bw=gsl_matrix_get (Pab, p-1, index_bw); + ps_ww=gsl_matrix_get (Pab, p-1, index_ww); + ps2_aw=gsl_matrix_get (PPab, p-1, index_aw); + ps2_bw=gsl_matrix_get (PPab, p-1, index_bw); + ps2_ww=gsl_matrix_get (PPab, p-1, index_ww); + ps3_aw=gsl_matrix_get (PPPab, p-1, index_aw); + ps3_bw=gsl_matrix_get (PPPab, p-1, index_bw); + ps3_ww=gsl_matrix_get (PPPab, p-1, index_ww); + + p3_ab=ps3_ab-ps_aw*ps_bw*ps2_ww*ps2_ww/(ps_ww*ps_ww*ps_ww); + p3_ab-=(ps_aw*ps3_bw+ps_bw*ps3_aw+ps2_aw*ps2_bw)/ps_ww; + p3_ab+=(ps_aw*ps2_bw*ps2_ww+ps_bw*ps2_aw*ps2_ww+ps_aw*ps_bw*ps3_ww)/(ps_ww*ps_ww); + + gsl_matrix_set (PPPab, p, index_ab, p3_ab); + } + } + } + } + return; +} + + + +double LogL_f (double l, void *params) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} + + double f=0.0, logdet_h=0.0, d; + size_t index_yy; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + for (size_t i=0; i<(p->eval)->size; ++i) { + d=gsl_vector_get (v_temp, i); + logdet_h+=log(fabs(d)); + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + + double c=0.5*(double)ni_test*(log((double)ni_test)-log(2*M_PI)-1.0); + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); + f=c-0.5*logdet_h-0.5*(double)ni_test*log(P_yy); + + gsl_matrix_free (Pab); + gsl_vector_free (Hi_eval); + gsl_vector_free (v_temp); + return f; +} + + + + + + +double LogL_dev1 (double l, void *params) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} + + double dev1=0.0, trace_Hi=0.0; + size_t index_yy; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + + if (p->e_mode!=0) {trace_Hi=(double)ni_test-trace_Hi;} + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + + double trace_HiK=((double)ni_test-trace_Hi)/l; + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); + double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); + double yPKPy=(P_yy-PP_yy)/l; + dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy; + + gsl_matrix_free (Pab); + gsl_matrix_free (PPab); + gsl_vector_free (Hi_eval); + gsl_vector_free (HiHi_eval); + gsl_vector_free (v_temp); + + return dev1; +} + + + + +double LogL_dev2 (double l, void *params) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} + + double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0; + size_t index_yy; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); + gsl_vector_mul (HiHiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + + double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l); + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); + double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); + + double yPKPy=(P_yy-PP_yy)/l; + double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); + + dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); + + gsl_matrix_free (Pab); + gsl_matrix_free (PPab); + gsl_matrix_free (PPPab); + gsl_vector_free (Hi_eval); + gsl_vector_free (HiHi_eval); + gsl_vector_free (HiHiHi_eval); + gsl_vector_free (v_temp); + + return dev2; +} + + + + + +void LogL_dev12 (double l, void *params, double *dev1, double *dev2) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} + + double trace_Hi=0.0, trace_HiHi=0.0; + size_t index_yy; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); + gsl_vector_mul (HiHiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + + double trace_HiK=((double)ni_test-trace_Hi)/l; + double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l); + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); + double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); + + double yPKPy=(P_yy-PP_yy)/l; + double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); + + *dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy; + *dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); + + gsl_matrix_free (Pab); + gsl_matrix_free (PPab); + gsl_matrix_free (PPPab); + gsl_vector_free (Hi_eval); + gsl_vector_free (HiHi_eval); + gsl_vector_free (HiHiHi_eval); + gsl_vector_free (v_temp); + + return; +} + + + +double LogRL_f (double l, void *params) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + double df; + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } + else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} + + double f=0.0, logdet_h=0.0, logdet_hiw=0.0, d; + size_t index_ww; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *Iab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + for (size_t i=0; i<(p->eval)->size; ++i) { + d=gsl_vector_get (v_temp, i); + logdet_h+=log(fabs(d)); + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + gsl_vector_set_all (v_temp, 1.0); + CalcPab (n_cvt, p->e_mode, v_temp, p->Uab, p->ab, Iab); + + //calculate |WHiW|-|WW| + logdet_hiw=0.0; + for (size_t i=0; in_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + double df; + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } + else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} + + double dev1=0.0, trace_Hi=0.0; + size_t index_ww; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + + //calculate tracePK and trace PKPK + double trace_P=trace_Hi; + double ps_ww, ps2_ww; + for (size_t i=0; in_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + double df; + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } + else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} + + double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0; + size_t index_ww; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); + gsl_vector_mul (HiHiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + + //calculate tracePK and trace PKPK + double trace_P=trace_Hi, trace_PP=trace_HiHi; + double ps_ww, ps2_ww, ps3_ww; + for (size_t i=0; in_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + double df; + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } + else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} + + double trace_Hi=0.0, trace_HiHi=0.0; + size_t index_ww; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); + gsl_vector_mul (HiHiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + + //calculate tracePK and trace PKPK + double trace_P=trace_Hi, trace_PP=trace_HiHi; + double ps_ww, ps2_ww, ps3_ww; + for (size_t i=0; isize); + gsl_vector *v_temp=gsl_vector_alloc(params.eval->size); + + gsl_vector_memcpy (v_temp, params.eval); + gsl_vector_scale (v_temp, l); + if (params.e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + size_t index_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt); + size_t index_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt); + double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); + double P_xx=gsl_matrix_get (Pab, n_cvt, index_xx); + double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); + double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); + + beta=P_xy/P_xx; + double tau=(double)df/Px_yy; + se=sqrt(1.0/(tau*P_xx)); + p_wald=gsl_cdf_fdist_Q ((P_yy-Px_yy)*tau, 1.0, df); +// p_wald=gsl_cdf_chisq_Q ((P_yy-Px_yy)*tau, 1); + + gsl_matrix_free (Pab); + gsl_vector_free (Hi_eval); + gsl_vector_free (v_temp); + return ; +} + + +void LMM::CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_score) +{ + size_t n_cvt=params.n_cvt; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + int df=(int)ni_test-(int)n_cvt-1; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc(params.eval->size); + gsl_vector *v_temp=gsl_vector_alloc(params.eval->size); + + gsl_vector_memcpy (v_temp, params.eval); + gsl_vector_scale (v_temp, l); + if (params.e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + size_t index_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt); + size_t index_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt); + double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); + double P_xx=gsl_matrix_get (Pab, n_cvt, index_xx); + double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); + double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); + + beta=P_xy/P_xx; + double tau=(double)df/Px_yy; + se=sqrt(1.0/(tau*P_xx)); + + p_score=gsl_cdf_fdist_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1.0, df); +// p_score=gsl_cdf_chisq_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1); + + gsl_matrix_free (Pab); + gsl_vector_free (Hi_eval); + gsl_vector_free (v_temp); + return ; +} + + + + + + + + +void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) +{ + size_t index_ab; + size_t n_cvt=UtW->size2; + + gsl_vector *u_a=gsl_vector_alloc (Uty->size); + + for (size_t a=1; a<=n_cvt+2; ++a) { + if (a==n_cvt+1) {continue;} + + if (a==n_cvt+2) {gsl_vector_memcpy (u_a, Uty);} + else { + gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, a-1); + gsl_vector_memcpy (u_a, &UtW_col.vector); + } + + for (size_t b=a; b>=1; --b) { + if (b==n_cvt+1) {continue;} + + index_ab=GetabIndex (a, b, n_cvt); + gsl_vector_view Uab_col=gsl_matrix_column (Uab, index_ab); + + if (b==n_cvt+2) {gsl_vector_memcpy (&Uab_col.vector, Uty);} + else { + gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, b-1); + gsl_vector_memcpy (&Uab_col.vector, &UtW_col.vector); + } + + gsl_vector_mul(&Uab_col.vector, u_a); + } + } + + gsl_vector_free (u_a); + return; +} + + +void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_vector *Utx, gsl_matrix *Uab) +{ + size_t index_ab; + size_t n_cvt=UtW->size2; + + for (size_t b=1; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (n_cvt+1, b, n_cvt); + gsl_vector_view Uab_col=gsl_matrix_column (Uab, index_ab); + + if (b==n_cvt+2) {gsl_vector_memcpy (&Uab_col.vector, Uty);} + else if (b==n_cvt+1) {gsl_vector_memcpy (&Uab_col.vector, Utx);} + else { + gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, b-1); + gsl_vector_memcpy (&Uab_col.vector, &UtW_col.vector); + } + + gsl_vector_mul(&Uab_col.vector, Utx); + } + + return; +} + + + +void Calcab (const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) +{ + size_t index_ab; + size_t n_cvt=W->size2; + + double d; + gsl_vector *v_a=gsl_vector_alloc (y->size); + gsl_vector *v_b=gsl_vector_alloc (y->size); + + for (size_t a=1; a<=n_cvt+2; ++a) { + if (a==n_cvt+1) {continue;} + + if (a==n_cvt+2) {gsl_vector_memcpy (v_a, y);} + else { + gsl_vector_const_view W_col=gsl_matrix_const_column (W, a-1); + gsl_vector_memcpy (v_a, &W_col.vector); + } + + for (size_t b=a; b>=1; --b) { + if (b==n_cvt+1) {continue;} + + index_ab=GetabIndex (a, b, n_cvt); + + if (b==n_cvt+2) {gsl_vector_memcpy (v_b, y);} + else { + gsl_vector_const_view W_col=gsl_matrix_const_column (W, b-1); + gsl_vector_memcpy (v_b, &W_col.vector); + } + + gsl_blas_ddot (v_a, v_b, &d); + gsl_vector_set(ab, index_ab, d); + } + } + + gsl_vector_free (v_a); + gsl_vector_free (v_b); + return; +} + + +void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_vector *ab) +{ + size_t index_ab; + size_t n_cvt=W->size2; + + double d; + gsl_vector *v_b=gsl_vector_alloc (y->size); + + for (size_t b=1; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (n_cvt+1, b, n_cvt); + + if (b==n_cvt+2) {gsl_vector_memcpy (v_b, y);} + else if (b==n_cvt+1) {gsl_vector_memcpy (v_b, x);} + else { + gsl_vector_const_view W_col=gsl_matrix_const_column (W, b-1); + gsl_vector_memcpy (v_b, &W_col.vector); + } + + gsl_blas_ddot (x, v_b, &d); + gsl_vector_set(ab, index_ab, d); + } + + gsl_vector_free (v_b); + + return; +} + + + + + +void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) +{ + ifstream infile (file_gene.c_str(), ifstream::in); + if (!infile) {cout<<"error reading gene expression file:"<size1); + gsl_vector *Uty=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + //header + getline(infile, line); + + for (size_t t=0; tsize1); + gsl_vector *x_miss=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + //start reading genotypes and analyze + for (size_t t=0; t1) {break;} + !safeGetline(infile, line).eof(); + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + CalcUab(UtW, Uty, Utx, Uab); +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< b; + + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + double logl_H1=0.0; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + + //Calculate basic quantities + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_vector *x=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + + for (vector::size_type t=0; t1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + CalcUab(UtW, Uty, Utx, Uab); +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + + //3 is before 1, for beta + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< > &pos_loglr) +{ + double logl_H0, logl_H1, log_lr, lambda0, lambda1; + + gsl_vector *w=gsl_vector_alloc (Uty->size); + gsl_matrix *Utw=gsl_matrix_alloc (Uty->size, 1); + gsl_matrix *Uab=gsl_matrix_alloc (Uty->size, 6); + gsl_vector *ab=gsl_vector_alloc (6); + + gsl_vector_set_zero(ab); + gsl_vector_set_all (w, 1.0); + gsl_vector_view Utw_col=gsl_matrix_column (Utw, 0); + gsl_blas_dgemv (CblasTrans, 1.0, U, w, 0.0, &Utw_col.vector); + + CalcUab (Utw, Uty, Uab) ; + FUNC_PARAM param0={true, Uty->size, 1, K_eval, Uab, ab, 0}; + + CalcLambda('L', param0, l_min, l_max, n_region, lambda0, logl_H0); + + for (size_t i=0; isize2; ++i) { + gsl_vector_const_view UtX_col=gsl_matrix_const_column (UtX, i); + CalcUab(Utw, Uty, &UtX_col.vector, Uab); + FUNC_PARAM param1={false, UtX->size1, 1, K_eval, Uab, ab, 0}; + + CalcLambda ('L', param1, l_min, l_max, n_region, lambda1, logl_H1); + log_lr=logl_H1-logl_H0; + + pos_loglr.push_back(make_pair(i,log_lr) ); + } + + gsl_vector_free (w); + gsl_matrix_free (Utw); + gsl_matrix_free (Uab); + gsl_vector_free (ab); + + return; +} + + + + +void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf) +{ + if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."< > lambda_lh; + + //evaluate first order derivates in different intervals + double lambda_l, lambda_h, lambda_interval=log(l_max/l_min)/(double)n_region; + double dev1_l, dev1_h, logf_l, logf_h; + + for (size_t i=0; i=logf_h) {lambda=l_min; logf=logf_l;} else {lambda=l_max; logf=logf_h;} + } + else { + //if derivates change signs + int status; + int iter=0, max_iter=100; + double l, l_temp; + + gsl_function F; + gsl_function_fdf FDF; + + F.params=¶ms; + FDF.params=¶ms; + + if (func_name=='R' || func_name=='r') { + F.function=&LogRL_dev1; + FDF.f=&LogRL_dev1; + FDF.df=&LogRL_dev2; + FDF.fdf=&LogRL_dev12; + } + else { + F.function=&LogL_dev1; + FDF.f=&LogL_dev1; + FDF.df=&LogL_dev2; + FDF.fdf=&LogL_dev12; + } + + const gsl_root_fsolver_type *T_f; + gsl_root_fsolver *s_f; + T_f=gsl_root_fsolver_brent; + s_f=gsl_root_fsolver_alloc (T_f); + + const gsl_root_fdfsolver_type *T_fdf; + gsl_root_fdfsolver *s_fdf; + T_fdf=gsl_root_fdfsolver_newton; + s_fdf=gsl_root_fdfsolver_alloc(T_fdf); + + for (vector::size_type i=0; il_min && ll_max) {l=l_max;} + if (func_name=='R' || func_name=='r') {logf_l=LogRL_f (l, ¶ms);} else {logf_l=LogL_f (l, ¶ms);} + + if (i==0) {logf=logf_l; lambda=l;} + else if (logflogf) {lambda=l_min; logf=logf_l;} + if (logf_h>logf) {lambda=l_max; logf=logf_h;} + } + + return; +} + + + + + +//calculate lambda in the null model +void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logl_H0) +{ + if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<size2, ni_test=UtW->size1; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0}; + + CalcLambda(func_name, param0, l_min, l_max, n_region, lambda, logl_H0); + + gsl_matrix_free(Uab); + gsl_vector_free(ab); + + return; +} + + +//obtain REMLE estimate for PVE using lambda_remle +void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se) +{ + size_t n_cvt=UtW->size2, ni_test=UtW->size1; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); + // if (e_mode!=0) { + // gsl_vector_set_zero (ab); + // Calcab (W, y, ab); + // } + + FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0}; + + double se=sqrt(-1.0/LogRL_dev2 (lambda, ¶m0)); + + pve=trace_G*lambda/(trace_G*lambda+1.0); + pve_se=trace_G/((trace_G*lambda+1.0)*(trace_G*lambda+1.0))*se; + + gsl_matrix_free (Uab); + gsl_vector_free (ab); + return; +} + +//obtain REML estimate for Vg and Ve using lambda_remle +//obtain beta and se(beta) for coefficients +//ab is not used when e_mode==0 +void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, double &vg, double &ve, gsl_vector *beta, gsl_vector *se_beta) +{ + size_t n_cvt=UtW->size2, ni_test=UtW->size1; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc(eval->size); + gsl_vector *v_temp=gsl_vector_alloc(eval->size); + gsl_matrix *HiW=gsl_matrix_alloc(eval->size, UtW->size2); + gsl_matrix *WHiW=gsl_matrix_alloc(UtW->size2, UtW->size2); + gsl_vector *WHiy=gsl_vector_alloc(UtW->size2); + gsl_matrix *Vbeta=gsl_matrix_alloc(UtW->size2, UtW->size2); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); + + gsl_vector_memcpy (v_temp, eval); + gsl_vector_scale (v_temp, lambda); + gsl_vector_set_all (Hi_eval, 1.0); + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + //calculate beta + gsl_matrix_memcpy (HiW, UtW); + for (size_t i=0; isize2; i++) { + gsl_vector_view HiW_col=gsl_matrix_column(HiW, i); + gsl_vector_mul(&HiW_col.vector, Hi_eval); + } + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, HiW, UtW, 0.0, WHiW); + gsl_blas_dgemv (CblasTrans, 1.0, HiW, Uty, 0.0, WHiy); + + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (UtW->size2); + LUDecomp (WHiW, pmt, &sig); + LUSolve (WHiW, pmt, WHiy, beta); + LUInvert (WHiW, pmt, Vbeta); + + //calculate vg and ve + CalcPab (n_cvt, 0, Hi_eval, Uab, ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); + + ve=P_yy/(double)(ni_test-n_cvt); + vg=ve*lambda; + + //with ve, calculate se(beta) + gsl_matrix_scale(Vbeta, ve); + + //obtain se_beta + for (size_t i=0; isize1; i++) { + gsl_vector_set (se_beta, i, sqrt(gsl_matrix_get(Vbeta, i, i) ) ); + } + + gsl_matrix_free(Uab); + gsl_matrix_free(Pab); + gsl_vector_free(ab); + gsl_vector_free(Hi_eval); + gsl_vector_free(v_temp); + gsl_matrix_free(HiW); + gsl_matrix_free(WHiW); + gsl_vector_free(WHiy); + gsl_matrix_free(Vbeta); + + gsl_permutation_free(pmt); + return; +} + diff --git a/src/lmm.h b/src/lmm.h new file mode 100644 index 0000000..45f9b72 --- /dev/null +++ b/src/lmm.h @@ -0,0 +1,111 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __LMM_H__ +#define __LMM_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + + +class FUNC_PARAM +{ + +public: + bool calc_null; + size_t ni_test; + size_t n_cvt; + const gsl_vector *eval; + const gsl_matrix *Uab; + const gsl_vector *ab; + size_t e_mode; +}; + + + + +class LMM { + +public: + // IO related parameters + int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests + size_t d_pace; //display pace + + string file_bfile; + string file_geno; + string file_out; + string path_out; + + string file_gene; + + // LMM related parameters + double l_min; + double l_max; + size_t n_region; + double l_mle_null; + double logl_mle_H0; + + // Summary statistics + size_t ni_total, ni_test; //number of individuals + size_t ns_total, ns_test; //number of snps + size_t ng_total, ng_test; //number of genes + size_t n_cvt; + double time_UtX; //time spent on optimization iterations + double time_opt; //time spent on optimization iterations + + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + vector sumStat; //Output SNPSummary Data + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x); + void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); + void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); + void WriteFiles (); + + void CalcRLWald (const double &lambda, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_wald); + void CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_score); +}; + +void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector > &pos_loglr); +void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf); +void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logl_H0); +void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se); +void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, double &vg, double &ve, gsl_vector *beta, gsl_vector *se_beta); + +#endif + + diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..e1fb336 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,86 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include + +#include "param.h" + +#ifdef FORCE_FLOAT +#include "gemma_float.h" +#else +#include "gemma.h" +#endif + +using namespace std; + + + +int main(int argc, char * argv[]) +{ + GEMMA cGemma; + PARAM cPar; + + if (argc <= 1) { + cGemma.PrintHeader(); + return EXIT_SUCCESS; + } + if (argc==2 && argv[1][0] == '-' && argv[1][1] == 'h') { + cGemma.PrintHelp(0); + return EXIT_SUCCESS; + } + if (argc==3 && argv[1][0] == '-' && argv[1][1] == 'h') { + string str; + str.assign(argv[2]); + cGemma.PrintHelp(atoi(str.c_str())); + return EXIT_SUCCESS; + } + if (argc==2 && argv[1][0] == '-' && argv[1][1] == 'l') { + cGemma.PrintLicense(); + return EXIT_SUCCESS; + } + + cGemma.Assign(argc, argv, cPar); + + ifstream check_dir((cPar.path_out).c_str()); + if (!check_dir) { + mkdir((cPar.path_out).c_str(), S_IRWXU|S_IRGRP|S_IROTH); + } + + if (cPar.error==true) {return EXIT_FAILURE;} + + if (cPar.mode_silence) {stringstream ss; cout.rdbuf (ss.rdbuf());} + + cPar.CheckParam(); + + if (cPar.error==true) {return EXIT_FAILURE;} + + cGemma.BatchRun(cPar); + + if (cPar.error==true) {return EXIT_FAILURE;} + + cGemma.WriteLog(argc, argv, cPar); + + return EXIT_SUCCESS; +} + + + diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp new file mode 100644 index 0000000..09e58dc --- /dev/null +++ b/src/mathfunc.cpp @@ -0,0 +1,310 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_cdf.h" + +#ifdef FORCE_FLOAT +#include "mathfunc_float.h" +#else +#include "mathfunc.h" +#endif + + +using namespace std; + + + +//calculate variance of a vector +double VectorVar (const gsl_vector *v) +{ + double d, m=0.0, m2=0.0; + for (size_t i=0; isize; ++i) { + d=gsl_vector_get (v, i); + m+=d; + m2+=d*d; + } + m/=(double)v->size; + m2/=(double)v->size; + return m2-m*m; +} + + + +//center the matrix G +void CenterMatrix (gsl_matrix *G) +{ + double d; + gsl_vector *w=gsl_vector_alloc (G->size1); + gsl_vector *Gw=gsl_vector_alloc (G->size1); + gsl_vector_set_all (w, 1.0); + + gsl_blas_dgemv (CblasNoTrans, 1.0, G, w, 0.0, Gw); + gsl_blas_dsyr2 (CblasUpper, -1.0/(double)G->size1, Gw, w, G); + gsl_blas_ddot (w, Gw, &d); + gsl_blas_dsyr (CblasUpper, d/((double)G->size1*(double)G->size1), w, G); + + for (size_t i=0; isize1; ++i) { + for (size_t j=0; jsize1); + + gsl_blas_ddot (w, w, &wtw); + gsl_blas_dgemv (CblasNoTrans, 1.0, G, w, 0.0, Gw); + gsl_blas_dsyr2 (CblasUpper, -1.0/wtw, Gw, w, G); + gsl_blas_ddot (w, Gw, &d); + gsl_blas_dsyr (CblasUpper, d/(wtw*wtw), w, G); + + for (size_t i=0; isize1; ++i) { + for (size_t j=0; jsize1; ++i) { + d+=gsl_matrix_get(G, i, i); + } + d/=(double)G->size1; + + gsl_matrix_scale (G, 1.0/d); + + return; +} + + +//center the vector y +double CenterVector (gsl_vector *y) +{ + double d=0.0; + + for (size_t i=0; isize; ++i) { + d+=gsl_vector_get (y, i); + } + d/=(double)y->size; + + gsl_vector_add_constant (y, -1.0*d); + + return d; +} + + +//calculate UtX +void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX) +{ + gsl_vector *Utx_vec=gsl_vector_alloc (UtX->size1); + for (size_t i=0; isize2; ++i) { + gsl_vector_view UtX_col=gsl_matrix_column (UtX, i); + gsl_blas_dgemv (CblasTrans, 1.0, U, &UtX_col.vector, 0.0, Utx_vec); + gsl_vector_memcpy (&UtX_col.vector, Utx_vec); + } + gsl_vector_free (Utx_vec); + return; +} + + +void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX) +{ + for (size_t i=0; isize2; ++i) { + gsl_vector_const_view X_col=gsl_matrix_const_column (X, i); + gsl_vector_view UtX_col=gsl_matrix_column (UtX, i); + gsl_blas_dgemv (CblasTrans, 1.0, U, &X_col.vector, 0.0, &UtX_col.vector); + } + return; +} + +void CalcUtX (const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx) +{ + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + return; +} + + +//Kronecker product +void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H) +{ + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_view H_sub=gsl_matrix_submatrix (H, i*V->size1, j*V->size2, V->size1, V->size2); + gsl_matrix_memcpy (&H_sub.matrix, V); + gsl_matrix_scale (&H_sub.matrix, gsl_matrix_get (K, i, j)); + } + } + return; +} + +//symmetric K matrix +void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H) +{ + for (size_t i=0; isize1; i++) { + for (size_t j=i; jsize2; j++) { + gsl_matrix_view H_sub=gsl_matrix_submatrix (H, i*V->size1, j*V->size2, V->size1, V->size2); + gsl_matrix_memcpy (&H_sub.matrix, V); + gsl_matrix_scale (&H_sub.matrix, gsl_matrix_get (K, i, j)); + + if (i!=j) { + gsl_matrix_view H_sub_sym=gsl_matrix_submatrix (H, j*V->size1, i*V->size2, V->size1, V->size2); + gsl_matrix_memcpy (&H_sub_sym.matrix, &H_sub.matrix); + } + } + } + return; +} + + +// this function calculates HWE p value with methods described in Wigginton et al., 2005 AJHG; +// it is based on the code in plink 1.07 +double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab) +{ + if ( (n_hom1+n_hom2+n_ab)==0 ) {return 1;} + + //aa is the rare allele + int n_aa=n_hom1 < n_hom2 ? n_hom1 : n_hom2; + int n_bb=n_hom1 < n_hom2 ? n_hom2 : n_hom1; + + int rare_copies = 2 * n_aa + n_ab; + int genotypes = n_ab + n_bb + n_aa; + + double * het_probs = (double *) malloc( (rare_copies + 1) * sizeof(double)); + if (het_probs == NULL) + cout<<"Internal error: SNP-HWE: Unable to allocate array"< 1; curr_hets -= 2) + { + het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) + / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); + sum += het_probs[curr_hets - 2]; + + /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ + curr_homr++; + curr_homc++; + } + + curr_hets = mid; + curr_homr = (rare_copies - mid) / 2; + curr_homc = genotypes - curr_hets - curr_homr; + for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) + { + het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc + /((curr_hets + 2.0) * (curr_hets + 1.0)); + sum += het_probs[curr_hets + 2]; + + /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ + curr_homr--; + curr_homc--; + } + + for (i = 0; i <= rare_copies; i++) + het_probs[i] /= sum; + + /* alternate p-value calculation for p_hi/p_lo + double p_hi = het_probs[n_ab]; + for (i = n_ab + 1; i <= rare_copies; i++) + p_hi += het_probs[i]; + + double p_lo = het_probs[n_ab]; + for (i = n_ab - 1; i >= 0; i--) + p_lo += het_probs[i]; + + double p_hi_lo = p_hi < p_lo ? 2.0 * p_hi : 2.0 * p_lo; + */ + + double p_hwe = 0.0; + /* p-value calculation for p_hwe */ + for (i = 0; i <= rare_copies; i++) + { + if (het_probs[i] > het_probs[n_ab]) + continue; + p_hwe += het_probs[i]; + } + + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + + free(het_probs); + + return p_hwe; +} + + + + + + + + diff --git a/src/mathfunc.h b/src/mathfunc.h new file mode 100644 index 0000000..d0e1696 --- /dev/null +++ b/src/mathfunc.h @@ -0,0 +1,41 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#ifndef __MATHFUNC_H__ +#define __MATHFUNC_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +using namespace std; + +double VectorVar (const gsl_vector *v); +void CenterMatrix (gsl_matrix *G); +void CenterMatrix (gsl_matrix *G, gsl_vector *w); +void ScaleMatrix (gsl_matrix *G); +double CenterVector (gsl_vector *y); +void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX); +void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX); +void CalcUtX (const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx); +double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab); +void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H); +void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H); + + +#endif diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp new file mode 100644 index 0000000..4b910ee --- /dev/null +++ b/src/mvlmm.cpp @@ -0,0 +1,3749 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" +#include "gsl/gsl_min.h" +#include "gsl/gsl_integration.h" + +#include "io.h" +#include "lapack.h" +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "lmm_float.h" +#include "mvlmm_float.h" +#else +#include "lmm.h" +#include "mvlmm.h" +#endif + + + +using namespace std; + + +//in this file, X, Y are already transformed (i.e. UtX and UtY) + + +void MVLMM::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + + l_min=cPar.l_min; + l_max=cPar.l_max; + n_region=cPar.n_region; + p_nr=cPar.p_nr; + em_iter=cPar.em_iter; + nr_iter=cPar.nr_iter; + em_prec=cPar.em_prec; + nr_prec=cPar.nr_prec; + crt=cPar.crt; + + Vg_remle_null=cPar.Vg_remle_null; + Ve_remle_null=cPar.Ve_remle_null; + Vg_mle_null=cPar.Vg_mle_null; + Ve_mle_null=cPar.Ve_mle_null; + + time_UtX=0.0; + time_opt=0.0; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + n_ph=cPar.n_ph; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void MVLMM::CopyToParam (PARAM &cPar) +{ + cPar.time_UtX=time_UtX; + cPar.time_opt=time_opt; + + cPar.Vg_remle_null=Vg_remle_null; + cPar.Ve_remle_null=Ve_remle_null; + cPar.Vg_mle_null=Vg_mle_null; + cPar.Ve_mle_null=Ve_mle_null; + + cPar.VVg_remle_null=VVg_remle_null; + cPar.VVe_remle_null=VVe_remle_null; + cPar.VVg_mle_null=VVg_mle_null; + cPar.VVe_mle_null=VVe_mle_null; + + cPar.beta_remle_null=beta_remle_null; + cPar.se_beta_remle_null=se_beta_remle_null; + cPar.beta_mle_null=beta_mle_null; + cPar.se_beta_mle_null=se_beta_mle_null; + + cPar.logl_remle_H0=logl_remle_H0; + cPar.logl_mle_H0=logl_mle_H0; + return; +} + + +void MVLMM::WriteFiles () +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".assoc.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<size1; + double d, logdet_Ve=0.0; + + //eigen decomposition of V_e + gsl_matrix *Lambda=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_temp=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_h=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_hi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix_memcpy(V_e_temp, V_e); + EigenDecomp(V_e_temp, U_l, D_l, 0); + + //calculate V_e_h and V_e_hi + gsl_matrix_set_zero(V_e_h); + gsl_matrix_set_zero(V_e_hi); + for (size_t i=0; isize, d_size=D_l->size, dc_size=Qi->size1; + size_t c_size=dc_size/d_size; + + double delta, dl, d1, d2, d, logdet_Q; + + gsl_matrix *Q=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix_set_zero (Q); + + for (size_t i=0; isize, c_size=X->size1, d_size=D_l->size; + + gsl_vector_set_zero (xHiy); + + double x, delta, dl, y, d; + for (size_t i=0; isize, d_size=D_l->size; + double delta, dl, d_u, d_e; + + for (size_t k=0; ksize1, d_size=UltVehiY->size1; + + gsl_matrix *YUX=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_memcpy (UltVehiBX, UltVehiY); + gsl_matrix_sub (UltVehiBX, UltVehiU); + + gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, UltVehiBX, X, 0.0, YUX); + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, YUX, XXti, 0.0, UltVehiB); + + gsl_matrix_free(YUX); + + return; +} + +void UpdateRL_B (const gsl_vector *xHiy, const gsl_matrix *Qi, gsl_matrix *UltVehiB) +{ + size_t d_size=UltVehiB->size1, c_size=UltVehiB->size2, dc_size=Qi->size1; + + gsl_vector *b=gsl_vector_alloc (dc_size); + + //calculate b=Qiv + gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, b); + + //copy b to UltVehiB + for (size_t i=0; isize, d_size=U->size1; + + gsl_matrix_set_zero (V_g); + gsl_matrix_set_zero (V_e); + + double delta; + + //calculate the first part: UD^{-1}U^T and EE^T + for (size_t k=0; ksize, c_size=X->size1, d_size=D_l->size, dc_size=Qi->size1; + + gsl_matrix_set_zero(Sigma_uu); + gsl_matrix_set_zero(Sigma_ee); + + double delta, dl, x, d; + + //calculate the first diagonal term + gsl_vector_view Suu_diag=gsl_matrix_diagonal (Sigma_uu); + gsl_vector_view See_diag=gsl_matrix_diagonal (Sigma_ee); + + for (size_t k=0; ksize, d_size=D_l->size, dc_size=Qi->size1; + double logl=0.0, delta, dl, y, d; + + //calculate yHiy+log|H_k| + for (size_t k=0; ksize, c_size=X->size1, d_size=Y->size1; + size_t dc_size=d_size*c_size; + + gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size); + gsl_matrix *XXti=gsl_matrix_alloc (c_size, c_size); + gsl_vector *D_l=gsl_vector_alloc (d_size); + gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehiB=gsl_matrix_alloc (d_size, c_size); + gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *Sigma_uu=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *Sigma_ee=gsl_matrix_alloc (d_size, d_size); + gsl_vector *xHiy=gsl_vector_alloc (dc_size); + gsl_permutation * pmt=gsl_permutation_alloc (c_size); + + double logl_const=0.0, logl_old=0.0, logl_new=0.0, logdet_Q, logdet_Ve; + int sig; + + //calculate |XXt| and (XXt)^{-1} + gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt); + for (size_t i=0; isize, c_size=W->size1, d_size=V_g->size1; + size_t dc_size=d_size*c_size; + double delta, dl, d, d1, d2, dy, dx, dw, logdet_Ve, logdet_Q, p_value; + + gsl_vector *D_l=gsl_vector_alloc (d_size); + gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size); + gsl_matrix *QiWHix=gsl_matrix_alloc(dc_size, d_size); + + gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size); + gsl_vector *xPy=gsl_vector_alloc (d_size); + //gsl_vector *UltVehiy=gsl_vector_alloc (d_size); + gsl_vector *WHiy=gsl_vector_alloc (dc_size); + + gsl_matrix_set_zero (xPx); + gsl_matrix_set_zero (WHix); + gsl_vector_set_zero (xPy); + gsl_vector_set_zero (WHiy); + + //eigen decomposition and calculate log|Ve| + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + + //calculate Qi and log|Q| + logdet_Q=CalcQi (eval, D_l, W, Qi); + + //calculate UltVehiY + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); + + //calculate WHix, WHiy, xHiy, xHix + for (size_t i=0; isize, c_size=W->size1, d_size=V_g->size1; + size_t dc_size=d_size*c_size; + double delta, dl, d, dy, dw, logdet_Ve, logdet_Q; + + gsl_vector *D_l=gsl_vector_alloc (d_size); + gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *Qi_temp=gsl_matrix_alloc (dc_size, dc_size); + //gsl_vector *UltVehiy=gsl_vector_alloc (d_size); + gsl_vector *WHiy=gsl_vector_alloc (dc_size); + gsl_vector *QiWHiy=gsl_vector_alloc (dc_size); + gsl_vector *beta=gsl_vector_alloc (dc_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (dc_size, dc_size); + + gsl_vector_set_zero (WHiy); + + //eigen decomposition and calculate log|Ve| + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + + //calculate Qi and log|Q| + logdet_Q=CalcQi (eval, D_l, W, Qi); + + //calculate UltVehiY + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); + + //calculate WHiy + for (size_t i=0; isize2; j++) { + for (size_t i=0; isize1; i++) { + gsl_matrix_set(B, i, j, gsl_vector_get(beta, j*d_size+i)); + gsl_matrix_set(se_B, i, j, sqrt(gsl_matrix_get(Vbeta, j*d_size+i, j*d_size+i))); + } + } + + //free matrices + gsl_vector_free(D_l); + gsl_matrix_free(UltVeh); + gsl_matrix_free(UltVehi); + gsl_matrix_free(Qi); + gsl_matrix_free(Qi_temp); + gsl_vector_free(WHiy); + gsl_vector_free(QiWHiy); + gsl_vector_free(beta); + gsl_matrix_free(Vbeta); + + return; +} + + + +//below are functions for Newton-Raphson's algorithm + + + + + +//calculate all Hi and return logdet_H=\sum_{k=1}^{n}log|H_k| +//and calculate Qi and return logdet_Q +//and calculate yPy +void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *Hi_all, gsl_matrix *Qi, double &logdet_H, double &logdet_Q) +{ + gsl_matrix_set_zero (Hi_all); + gsl_matrix_set_zero (Qi); + logdet_H=0.0; logdet_Q=0.0; + + size_t n_size=eval->size, c_size=X->size1, d_size=V_g->size1; + double logdet_Ve=0.0, delta, dl, d; + + gsl_matrix *mat_dd=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); + gsl_vector *D_l=gsl_vector_alloc (d_size); + + //calculate D_l, UltVeh and UltVehi + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + + //calculate each Hi and log|H_k| + logdet_H=(double)n_size*logdet_Ve; + for (size_t k=0; ksize2, d_size=Y->size1; + + for (size_t k=0; ksize2, c_size=X->size1, d_size=Hi_all->size1; + + double d; + + for (size_t k=0; ksize2; + + for (size_t k=0; ksize2, d_size=Y->size1, dc_size=xHi->size1; + + for (size_t k=0; k=d_size || j>=d_size) {cout<<"error in GetIndex."<size; + + double delta, d1, d2; + + for (size_t k=0; ksize, d_size=Hiy->size1; + + double delta, d; + + for (size_t k=0; ksize, dc_size=xHi->size1; + size_t d_size=xHi->size2/n_size; + + double delta; + + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *mat_dcdc_t=gsl_matrix_alloc (dc_size, dc_size); + + for (size_t k=0; ksize, d_size=Hiy->size1; + + double delta, d_Hiy_i1, d_Hiy_j1, d_Hiy_i2, d_Hiy_j2, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; + + for (size_t k=0; ksize, d_size=Hiy->size1; + + double delta, d_Hiy_i, d_Hiy_j, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; + + for (size_t k=0; ksize, d_size=Hi->size1, dc_size=xHi->size1; + + double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; + + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); + + for (size_t k=0; ksize, d_size=Hi->size1; + double delta, d; + + for (size_t k=0; ksize, d_size=Hi->size1; + double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; + + for (size_t k=0; ksize1, d_size=Hi->size1; + size_t v=GetIndex(i, j, d_size); + + double d; + + //calculate the first part: trace(HiD) + Calc_traceHiD (eval, Hi, i, j, tPD_g, tPD_e); + + //calculate the second part: -trace(HixQixHiD) + for (size_t k=0; ksize1, d_size=Hi->size1; + size_t v_size=d_size*(d_size+1)/2; + size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size); + + double d; + + //calculate the first part: trace(HiDHiD) + Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge); + + //calculate the second and third parts: -trace(HixQixHiDHiD)-trace(HiDHixQixHiD) + for (size_t i=0; isize1; + size_t v; + + for (size_t i=0; isize2/eval->size, dc_size=xHi->size1; + size_t v; + + for (size_t i=0; isize1; + size_t v1, v2; + + for (size_t i1=0; i1size2/eval->size, dc_size=xHi->size1; + size_t v1, v2; + + for (size_t i1=0; i1size; + double delta, d_Hi_ij; + + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *mat_dcdc_temp=gsl_matrix_alloc (dc_size, dc_size); + + for (size_t k=0; ksize1; + size_t v_size=xHiDHix_all_g->size2/dc_size; + + for (size_t i=0; isize2; i++) { + gsl_vector_const_view vec_g=gsl_matrix_const_column (vec_all_g, i); + gsl_vector_const_view vec_e=gsl_matrix_const_column (vec_all_e, i); + + gsl_vector_view Qivec_g=gsl_matrix_column (Qivec_all_g, i); + gsl_vector_view Qivec_e=gsl_matrix_column (Qivec_all_e, i); + + gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_g.vector, 0.0, &Qivec_g.vector); + gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_e.vector, 0.0, &Qivec_e.vector); + } + + return; +} + + +//calculate Qi(xHiDHix) for each pair of i j (i<=j) +void Calc_QiMat_all (const gsl_matrix *Qi, const gsl_matrix *mat_all_g, const gsl_matrix *mat_all_e, gsl_matrix *Qimat_all_g, gsl_matrix *Qimat_all_e) +{ + size_t dc_size=Qi->size1; + size_t v_size=mat_all_g->size2/mat_all_g->size1; + + for (size_t i=0; isize1; + size_t v=GetIndex(i, j, d_size); + + double d; + + //first part: ytHiDHiy + Calc_yHiDHiy (eval, Hiy, i, j, yPDPy_g, yPDPy_e); + + //second and third parts: -(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy) + gsl_vector_const_view xHiDHiy_g=gsl_matrix_const_column (xHiDHiy_all_g, v); + gsl_vector_const_view xHiDHiy_e=gsl_matrix_const_column (xHiDHiy_all_e, v); + + gsl_blas_ddot(QixHiy, &xHiDHiy_g.vector, &d); + yPDPy_g-=d*2.0; + gsl_blas_ddot(QixHiy, &xHiDHiy_e.vector, &d); + yPDPy_e-=d*2.0; + + //fourth part: +(yHix)Qi(xHiDHix)Qi(xHiy) + gsl_vector_const_view xHiDHixQixHiy_g=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v); + gsl_vector_const_view xHiDHixQixHiy_e=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v); + + gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_g.vector, &d); + yPDPy_g+=d; + gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_e.vector, &d); + yPDPy_e+=d; + + return; +} + +//calculate yPDPDPy=y(Hi-HixQixHi)D(Hi-HixQixHi)D(Hi-HixQixHi)y +//yPDPDPy=yHiDHiDHiy +//-(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy) +//-(yHiDHix)Qi(xHiDHiy) +//+(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy) +//+(yHix)Qi(xHiDHiDHix)Qi(xHiy) +//-(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy) +void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *QixHiDHiy_all_g, const gsl_matrix *QixHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const gsl_matrix *QixHiDHixQixHiy_all_g, const gsl_matrix *QixHiDHixQixHiy_all_e, const gsl_matrix *xHiDHiDHiy_all_gg, const gsl_matrix *xHiDHiDHiy_all_ee, const gsl_matrix *xHiDHiDHiy_all_ge, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yPDPDPy_gg, double &yPDPDPy_ee, double &yPDPDPy_ge) +{ + size_t d_size=Hi->size1, dc_size=xHi->size1; + size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size); + size_t v_size=d_size*(d_size+1)/2; + + double d; + + gsl_vector *xHiDHiDHixQixHiy=gsl_vector_alloc (dc_size); + + //first part: yHiDHiDHiy + Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge); + + //second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy) + gsl_vector_const_view xHiDHiDHiy_gg1=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v1*v_size+v2); + gsl_vector_const_view xHiDHiDHiy_ee1=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v1*v_size+v2); + gsl_vector_const_view xHiDHiDHiy_ge1=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v1*v_size+v2); + + gsl_vector_const_view xHiDHiDHiy_gg2=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v2*v_size+v1); + gsl_vector_const_view xHiDHiDHiy_ee2=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v2*v_size+v1); + gsl_vector_const_view xHiDHiDHiy_ge2=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v2*v_size+v1); + + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d); + yPDPDPy_gg-=d; + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d); + yPDPDPy_ee-=d; + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d); + yPDPDPy_ge-=d; + + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d); + yPDPDPy_gg-=d; + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d); + yPDPDPy_ee-=d; + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d); + yPDPDPy_ge-=d; + + //fourth part: -(yHiDHix)Qi(xHiDHiy) + gsl_vector_const_view xHiDHiy_g1=gsl_matrix_const_column (xHiDHiy_all_g, v1); + gsl_vector_const_view xHiDHiy_e1=gsl_matrix_const_column (xHiDHiy_all_e, v1); + gsl_vector_const_view QixHiDHiy_g2=gsl_matrix_const_column (QixHiDHiy_all_g, v2); + gsl_vector_const_view QixHiDHiy_e2=gsl_matrix_const_column (QixHiDHiy_all_e, v2); + + gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_g2.vector, &d); + yPDPDPy_gg-=d; + gsl_blas_ddot(&xHiDHiy_e1.vector, &QixHiDHiy_e2.vector, &d); + yPDPDPy_ee-=d; + gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_e2.vector, &d); + yPDPDPy_ge-=d; + + //fifth and sixth parts: +(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy) + gsl_vector_const_view QixHiDHiy_g1=gsl_matrix_const_column (QixHiDHiy_all_g, v1); + gsl_vector_const_view QixHiDHiy_e1=gsl_matrix_const_column (QixHiDHiy_all_e, v1); + + gsl_vector_const_view xHiDHixQixHiy_g1=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v1); + gsl_vector_const_view xHiDHixQixHiy_e1=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v1); + gsl_vector_const_view xHiDHixQixHiy_g2=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v2); + gsl_vector_const_view xHiDHixQixHiy_e2=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v2); + + gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_g2.vector, &d); + yPDPDPy_gg+=d; + gsl_blas_ddot(&xHiDHixQixHiy_g2.vector, &QixHiDHiy_g1.vector, &d); + yPDPDPy_gg+=d; + + gsl_blas_ddot(&xHiDHixQixHiy_e1.vector, &QixHiDHiy_e2.vector, &d); + yPDPDPy_ee+=d; + gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_e1.vector, &d); + yPDPDPy_ee+=d; + + gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_e2.vector, &d); + yPDPDPy_ge+=d; + gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_g1.vector, &d); + yPDPDPy_ge+=d; + + //seventh part: +(yHix)Qi(xHiDHiDHix)Qi(xHiy) + gsl_matrix_const_view xHiDHiDHix_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); + gsl_matrix_const_view xHiDHiDHix_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); + gsl_matrix_const_view xHiDHiDHix_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); + + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_gg.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); + gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); + yPDPDPy_gg+=d; + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ee.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); + gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); + yPDPDPy_ee+=d; + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ge.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); + gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); + yPDPDPy_ge+=d; + + //eighth part: -(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy) + gsl_vector_const_view QixHiDHixQixHiy_g1=gsl_matrix_const_column (QixHiDHixQixHiy_all_g, v1); + gsl_vector_const_view QixHiDHixQixHiy_e1=gsl_matrix_const_column (QixHiDHixQixHiy_all_e, v1); + + gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_g2.vector, &d); + yPDPDPy_gg-=d; + gsl_blas_ddot(&QixHiDHixQixHiy_e1.vector, &xHiDHixQixHiy_e2.vector, &d); + yPDPDPy_ee-=d; + gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_e2.vector, &d); + yPDPDPy_ge-=d; + + //free memory + gsl_vector_free(xHiDHiDHixQixHiy); + + return; +} + + +//calculate Edgeworth correctation factors for small samples +//notation and method follows Thomas J. Rothenberg, Econometirca 1984; 52 (4) +//M=xHiDHix +void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t d_size, double &crt_a, double &crt_b, double &crt_c) +{ + crt_a=0.0; crt_b=0.0; crt_c=0.0; + + size_t dc_size=Qi->size1, v_size=Hessian_inv->size1/2; + size_t c_size=dc_size/d_size; + double h_gg, h_ge, h_ee, d, B=0.0, C=0.0, D=0.0; + double trCg1, trCe1, trCg2, trCe2, trB_gg, trB_ge, trB_ee, trCC_gg, trCC_ge, trCC_ee, trD_gg=0.0, trD_ge=0.0, trD_ee=0.0; + + gsl_matrix *QiMQi_g1=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQi_e1=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQi_g2=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQi_e2=gsl_matrix_alloc (dc_size, dc_size); + + gsl_matrix *QiMQisQisi_g1=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *QiMQisQisi_e1=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *QiMQisQisi_g2=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *QiMQisQisi_e2=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix *QiMQiMQi_gg=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQiMQi_ge=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQiMQi_ee=gsl_matrix_alloc (dc_size, dc_size); + + gsl_matrix *QiMMQi_gg=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMMQi_ge=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMMQi_ee=gsl_matrix_alloc (dc_size, dc_size); + + gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix *M_dd=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *M_dcdc=gsl_matrix_alloc (dc_size, dc_size); + + //invert Qi_sub to Qi_si + gsl_matrix *Qi_sub=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix_const_view Qi_s=gsl_matrix_const_submatrix (Qi, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); + + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (d_size); + + gsl_matrix_memcpy (Qi_sub, &Qi_s.matrix); + LUDecomp (Qi_sub, pmt, &sig); + LUInvert (Qi_sub, pmt, Qi_si); + + gsl_permutation_free(pmt); + gsl_matrix_free(Qi_sub); + + //calculate correctation factors + for (size_t v1=0; v1size1, d_size=Hi->size1; + size_t c_size=dc_size/d_size; + size_t v_size=d_size*(d_size+1)/2; + size_t v1, v2; + double dev1_g, dev1_e, dev2_gg, dev2_ee, dev2_ge; + + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_matrix *xHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *xHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *xHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *xHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *xHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size); + + gsl_matrix *QixHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *QixHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *QixHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *QixHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *QixHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size); + + gsl_matrix *xHiDHiDHiy_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size); + gsl_matrix *xHiDHiDHiy_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size); + gsl_matrix *xHiDHiDHiy_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size); + gsl_matrix *xHiDHiDHix_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); + gsl_matrix *xHiDHiDHix_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); + gsl_matrix *xHiDHiDHix_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); + + //calculate xHiDHiy_all, xHiDHix_all and xHiDHixQixHiy_all + Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e); + Calc_xHiDHix_all (eval, xHi, xHiDHix_all_g, xHiDHix_all_e); + Calc_xHiDHixQixHiy_all (xHiDHix_all_g, xHiDHix_all_e, QixHiy, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e); + + Calc_xHiDHiDHiy_all (v_size, eval, Hi, xHi, Hiy, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge); + Calc_xHiDHiDHix_all (v_size, eval, Hi, xHi, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge); + + //calculate QixHiDHiy_all, QixHiDHix_all and QixHiDHixQixHiy_all + Calc_QiVec_all (Qi, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e); + Calc_QiVec_all (Qi, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e); + Calc_QiMat_all (Qi, xHiDHix_all_g, xHiDHix_all_e, QixHiDHix_all_g, QixHiDHix_all_e); + + double tHiD_g, tHiD_e, tPD_g, tPD_e, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge, tPDPD_gg, tPDPD_ee, tPDPD_ge; + double yPDPy_g, yPDPy_e, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge; + + //calculate gradient and Hessian for Vg + for (size_t i1=0; i11) { + CalcCRT (Hessian_inv, Qi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, d_size, crt_a, crt_b, crt_c); + } else { + crt_a=0.0; crt_b=0.0; crt_c=0.0; + } + + gsl_matrix_free(xHiDHiy_all_g); + gsl_matrix_free(xHiDHiy_all_e); + gsl_matrix_free(xHiDHix_all_g); + gsl_matrix_free(xHiDHix_all_e); + gsl_matrix_free(xHiDHixQixHiy_all_g); + gsl_matrix_free(xHiDHixQixHiy_all_e); + + gsl_matrix_free(QixHiDHiy_all_g); + gsl_matrix_free(QixHiDHiy_all_e); + gsl_matrix_free(QixHiDHix_all_g); + gsl_matrix_free(QixHiDHix_all_e); + gsl_matrix_free(QixHiDHixQixHiy_all_g); + gsl_matrix_free(QixHiDHixQixHiy_all_e); + + gsl_matrix_free(xHiDHiDHiy_all_gg); + gsl_matrix_free(xHiDHiDHiy_all_ee); + gsl_matrix_free(xHiDHiDHiy_all_ge); + gsl_matrix_free(xHiDHiDHix_all_gg); + gsl_matrix_free(xHiDHiDHix_all_ee); + gsl_matrix_free(xHiDHiDHix_all_ge); + + return; +} + + +//update Vg, Ve +void UpdateVgVe (const gsl_matrix *Hessian_inv, const gsl_vector *gradient, const double step_scale, gsl_matrix *V_g, gsl_matrix *V_e) +{ + size_t v_size=gradient->size/2, d_size=V_g->size1; + size_t v; + + gsl_vector *vec_v=gsl_vector_alloc (v_size*2); + + double d; + + //vectorize Vg and Ve + for (size_t i=0; isize, c_size=X->size1, d_size=Y->size1; + size_t dc_size=d_size*c_size; + size_t v_size=d_size*(d_size+1)/2; + + double logdet_H, logdet_Q, yPy, logl_const, logl_old=0.0, logl_new=0.0, step_scale; + int sig; + size_t step_iter, flag_pd; + + gsl_matrix *Vg_save=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *Ve_save=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_temp=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *U_temp=gsl_matrix_alloc (d_size, d_size); + gsl_vector *D_temp=gsl_vector_alloc (d_size); + gsl_vector *xHiy=gsl_vector_alloc (dc_size); + gsl_vector *QixHiy=gsl_vector_alloc (dc_size); + gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size); + + gsl_vector *gradient=gsl_vector_alloc (v_size*2); + + //calculate |XXt| and (XXt)^{-1} + gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt); + for (size_t i=0; i10 ) && step_iter<10 && t!=0); + + //terminate if change is small + if (t!=0) { + if (logl_newsize1; i++) { + for (size_t j=0; jsize2; j++) { + cout<size, c_size=X->size1, d_size=Y->size1; + double a, b, c; + double lambda, logl, vg, ve; + + //Initial the diagonal elements of Vg and Ve using univariate LMM and REML estimates + gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size); + gsl_vector *beta_temp=gsl_vector_alloc(c_size); + gsl_vector *se_beta_temp=gsl_vector_alloc(c_size); + + gsl_matrix_transpose_memcpy (Xt, X); + + for (size_t i=0; i4) { + //first obtain good initial values + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (2, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (2, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (2, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (2, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (2, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (2, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (2, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (2, 2*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (2, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (2*c_size, 2*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (6, 6); + + //2 by n matrix of Y + gsl_matrix *Y_sub=gsl_matrix_alloc (2, n_size); + gsl_matrix *Vg_sub=gsl_matrix_alloc (2, 2); + gsl_matrix *Ve_sub=gsl_matrix_alloc (2, 2); + gsl_matrix *B_sub=gsl_matrix_alloc (2, c_size); + + for (size_t i=0; isize1, d_size=UtY->size2, c_size=UtW->size2; + + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + gsl_vector *x_miss=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + + gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); + + gsl_vector_view X_row=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row.vector); + gsl_vector_view B_col=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col.vector); + + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); + + c=0; + Vg_remle_null.clear(); + Ve_remle_null.clear(); + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + + cout<<"REMLE estimate for Vg in the null model: "<1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + if (p_score1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i b; + + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; + double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; + double crt_a, crt_b, crt_c; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + size_t c=0; + // double s=0.0; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); + + gsl_vector_view X_row=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row.vector); + gsl_vector_view B_col=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col.vector); + + //time_start=clock(); + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); + + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); + //cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<size1; i++) { + for (size_t j=0; jsize2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + cout<<"REMLE estimate for Vg in the null model: "<=0) {break;} + //if (snpInfo[t].rs_number!="MAG18140902") {continue;} + //cout<1) { + gsl_vector_set(x, i, 2-geno); + } + } + + /* + if (t==0) { + ofstream outfile ("./snp1.txt", ofstream::out); + if (!outfile) {cout<<"error writing file: "<size; i++) { + outfile<1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; + vector VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; + vector beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null; + double p_nr; + size_t em_iter, nr_iter; + double em_prec, nr_prec; + size_t crt; + + // Summary statistics + size_t ni_total, ni_test; //number of individuals + size_t ns_total, ns_test; //number of snps + size_t n_cvt; + size_t n_ph; + double time_UtX; //time spent on optimization iterations + double time_opt; //time spent on optimization iterations + + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + vector sumStat; //Output SNPSummary Data + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); + void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); + void WriteFiles (); + +}; + +void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B); + +#endif + + diff --git a/src/param.cpp b/src/param.cpp new file mode 100644 index 0000000..7a89ff8 --- /dev/null +++ b/src/param.cpp @@ -0,0 +1,849 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + + + + +PARAM::PARAM(void): +mode_silence (false), a_mode (0), k_mode(1), d_pace (100000), +file_out("result"), path_out("./output/"), +miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999), +l_min(1e-5), l_max(1e5), n_region(10),p_nr(0.001),em_prec(0.0001),nr_prec(0.0001),em_iter(10000),nr_iter(100),crt(0), +pheno_mean(0), +h_min(-1), h_max(-1), h_scale(-1), +rho_min(0.0), rho_max(1.0), rho_scale(-1), +logp_min(0.0), logp_max(0.0), logp_scale(-1), +s_min(0), s_max(300), +w_step(100000), s_step(1000000), +r_pace(10), w_pace(1000), +n_accept(0), +n_mh(10), +geo_mean(2000.0), +randseed(-1), +error(false), + n_cvt(1), n_vc(1), +time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) +{} + + +//read files +//obtain ns_total, ng_total, ns_test, ni_test +void PARAM::ReadFiles (void) +{ + string file_str; + if (!file_mk.empty()) { + if (CountFileLines (file_mk, n_vc)==false) {error=true;} + } + + if (!file_snps.empty()) { + if (ReadFile_snps (file_snps, setSnps)==false) {error=true;} + } else { + setSnps.clear(); + } + + //for prediction + if (!file_epm.empty()) { + if (ReadFile_est (file_epm, est_column, mapRS2est)==false) {error=true;} + + if (!file_bfile.empty()) { + file_str=file_bfile+".bim"; + if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} + + file_str=file_bfile+".fam"; + if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} + } + + if (!file_geno.empty()) { + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + + if (CountFileLines (file_geno, ns_total)==false) {error=true;} + } + + if (!file_ebv.empty() ) { + if (ReadFile_column (file_ebv, indicator_bv, vec_bv, 1)==false) {error=true;} + } + + if (!file_log.empty() ) { + if (ReadFile_log (file_log, pheno_mean)==false) {error=true;} + } + + //convert indicator_pheno to indicator_idv + int k=1; + for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_read[i]; + ni_test+=indicator_idv[i]; + } + + if (ni_test==0) { + error=true; + cout<<"error! number of analyzed individuals equals 0. "<1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<1) {cout<<"error! hwe level needs to be between 0 and 1. current value = "<1) {cout<<"error! r2 level needs to be between 0 and 1. current value = "<1) {cout<<"error! h values must be bewtween 0 and 1. current values = "<1) {cout<<"error! rho values must be between 0 and 1. current values = "<0) {cout<<"error! maximum logp value must be smaller than 0. current values = "<1.0) {cout<<"error! hscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! rscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! pscale value must be between 0 and 1. current value = "<1 && a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=43) { + cout<<"error! the current analysis mode "<1 && !file_gene.empty() ) { + cout<<"error! multiple phenotype analysis option not allowed with gene expression files. "<1) { + cout<<"error! pnr value must be between 0 and 1. current value = "<::size_type i=0; i<(indicator_idv).size(); ++i) { + if (indicator_idv[i]==0) {continue;} + ni_test++; + } + + ni_cvt=0; + for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_cvt[i]; + ni_test+=indicator_idv[i]; + } + } + + if ((indicator_read).size()!=0) { + ni_test=0; + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_read[i]; + ni_test+=indicator_idv[i]; + } + } + */ + if (ni_test==0) { + error=true; + cout<<"error! number of analyzed individuals equals 0. "<ns_test) {s_max=ns_test; cout<<"s_max is re-set to the number of analyzed SNPs."<size1; ++i) { + for (size_t j=0; jsize2; ++j) { + outfile<size; ++i) { + outfile<::size_type i=0; i set_remove; + + //check if any columns is an intercept + for (size_t i=0; isize2; i++) { + gsl_vector_view w_col=gsl_matrix_column (W, i); + gsl_vector_minmax (&w_col.vector, &v_min, &v_max); + if (v_min==v_max) {flag_ipt=1; set_remove.insert (i);} + } + + //add an intecept term if needed + if (n_cvt==set_remove.size()) { + indicator_cvt.clear(); + n_cvt=1; + } else if (flag_ipt==0) { + cout<<"no intecept term is found in the cvt file. a column of 1s is added."<::size_type i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_cvt[i]; + } + } + + //obtain ni_test + ni_test=0; + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + if (indicator_idv[i]==0) {continue;} + ni_test++; + } + + if (ni_test==0) { + error=true; + cout<<"error! number of analyzed individuals equals 0. "< cvt_row; + cvt_row.push_back(1); + + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_cvt.push_back(1); + + cvt.push_back(cvt_row); + } + } + + return; +} + + + + +void PARAM::CopyCvt (gsl_matrix *W) +{ + size_t ci_test=0; + + for (vector::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i. +*/ + +#ifndef __PARAM_H__ +#define __PARAM_H__ + +#include +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + +using namespace std; + + + +class SNPINFO { +public: + string chr; + string rs_number; + double cM; + long int base_position; + string a_minor; + string a_major; + size_t n_miss; + double missingness; + double maf; +}; + +//results for lmm +class SUMSTAT { +public: + double beta; //REML estimator for beta + double se; //SE for beta + double lambda_remle; //REML estimator for lambda + double lambda_mle; //MLE estimator for lambda + double p_wald; //p value from a Wald test + double p_lrt; //p value from a likelihood ratio test + double p_score; //p value from a score test +}; + +//results for mvlmm +class MPHSUMSTAT { +public: + vector v_beta; //REML estimator for beta + double p_wald; //p value from a Wald test + double p_lrt; //p value from a likelihood ratio test + double p_score; //p value from a score test + vector v_Vg; //estimator for Vg, right half + vector v_Ve; //estimator for Ve, right half + vector v_Vbeta; //estimator for Vbeta, right half +}; + + +//hyper-parameters for bslmm +class HYPBSLMM { +public: + double h; + double pve; + double rho; + double pge; + double logp; + + size_t n_gamma; +}; + + + + +class PARAM { +public: + // IO related parameters + bool mode_silence; + int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests + int k_mode; //kinship read mode: 1: n by n matrix, 2: id/id/k_value; + vector p_column; //which phenotype column needs analysis + size_t d_pace; //display pace + + string file_bfile; + string file_geno; + string file_pheno; + string file_anno; //optional + string file_cvt; //optional + string file_kin; + string file_ku, file_kd; + string file_mk; + string file_out; + string path_out; + + string file_epm; //estimated parameter file + string file_ebv; //estimated breeding value file + string file_log; //log file containing mean estimate + + string file_read; //file containing total number of reads + string file_gene; //gene expression file + + string file_snps; //file containing analyzed snps or genes + + + + // QC related parameters + double miss_level; + double maf_level; + double hwe_level; + double r2_level; + + // LMM related parameters + double l_min; + double l_max; + size_t n_region; + double l_mle_null, l_remle_null; + double logl_mle_H0, logl_remle_H0; + double pve_null, pve_se_null; + double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null; + vector Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; + vector VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; + vector beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null; + double p_nr; + double em_prec, nr_prec; + size_t em_iter, nr_iter; + size_t crt; + double pheno_mean; //phenotype mean from bslmm fitting or for prediction + + //for fitting multiple variance components + //the first three are of size n_vc, and the next two are of size n_vc+1 + vector v_traceG; + vector v_pve; + vector v_se_pve; + + vector v_sigma2; + vector v_se_sigma2; + vector v_beta; + vector v_se_beta; + + // BSLMM MCMC related parameters + double h_min, h_max, h_scale; //priors for h + double rho_min, rho_max, rho_scale; //priors for rho + double logp_min, logp_max, logp_scale; //priors for log(pi) + size_t s_min, s_max; //minimum and maximum number of gammas + size_t w_step; //number of warm up/burn in iterations + size_t s_step; //number of sampling iterations + size_t r_pace; //record pace + size_t w_pace; //write pace + size_t n_accept; //number of acceptance + size_t n_mh; //number of MH steps within each iteration + double geo_mean; //mean of the geometric distribution + long int randseed; + double trace_G; + + HYPBSLMM cHyp_initial; + + // Summary statistics + bool error; + size_t ni_total, ni_test, ni_cvt; //number of individuals + size_t np_obs, np_miss; //number of observed and missing phenotypes + size_t ns_total, ns_test; //number of snps + size_t ng_total, ng_test; //number of genes + size_t ni_control, ni_case; //number of controls and number of cases + size_t n_cvt; //number of covariates + size_t n_ph; //number of phenotypes + size_t n_vc; //number of variance components (including the diagonal matrix) + double time_total; //record total time + double time_G; //time spent on reading files the second time and calculate K + double time_eigen; //time spent on eigen-decomposition + double time_UtX; //time spent on calculating UX and Uy + double time_UtZ; //time spent on calculating UtZ, for probit BSLMM + double time_opt; //time spent on optimization iterations/or mcmc + double time_Omega; //time spent on calculating Omega + double time_hyp; //time spent on sampling hyper-parameters, in PMM + double time_Proposal; //time spend on constructing the proposal distribution (i.e. the initial lmm or lm analysis) + + // Data + vector > pheno; //a vector record all phenotypes, NA replaced with -9 + vector > cvt; //a vector record all covariates, NA replaced with -9 + vector > indicator_pheno; //a matrix record when a phenotype is missing for an individual; 0 missing, 1 available + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + vector indicator_cvt; //indicator for covariates, 0 missing, 1 available for analysis + + vector indicator_bv; //indicator for estimated breeding value file, 0 missing, 1 available for analysis + vector indicator_read; //indicator for read file, 0 missing, 1 available for analysis + vector vec_read; //total number of reads + vector vec_bv; //breeding values + vector est_column; + + map mapID2num; //map small ID number to number, from 0 to n-1 + map mapRS2chr; //map rs# to chromosome location + map mapRS2bp; //map rs# to base position + map mapRS2cM; //map rs# to cM + map mapRS2est; //map rs# to parameters + + vector snpInfo; //record SNP information + set setSnps; //a set of snps for analysis + + //constructor + PARAM(); + + //functions + void ReadFiles (); + void CheckParam (); + void CheckData (); + void PrintSummary (); + void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + void CheckCvt (); + void CopyCvt (gsl_matrix *W); + void ProcessCvtPhen(); + void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag); + void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag); + void CalcKin (gsl_matrix *matrix_kin); + void WriteMatrix (const gsl_matrix *matrix_U, const string suffix); + void WriteVector (const gsl_vector *vector_D, const string suffix); + void CopyRead (gsl_vector *log_N); +}; + + +#endif + diff --git a/src/prdt.cpp b/src/prdt.cpp new file mode 100644 index 0000000..2875119 --- /dev/null +++ b/src/prdt.cpp @@ -0,0 +1,544 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + + +#include "io.h" +#include "lapack.h" //for functions EigenDecomp +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "io_float.h" +#include "prdt_float.h" +#include "mathfunc_float.h" +#else +#include "io.h" +#include "prdt.h" +#include "mathfunc.h" +#endif + +using namespace std; + + + + +void PRDT::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + + indicator_pheno=cPar.indicator_pheno; + indicator_cvt=cPar.indicator_cvt; + indicator_idv=cPar.indicator_idv; + + snpInfo=cPar.snpInfo; + mapRS2est=cPar.mapRS2est; + + time_eigen=0; + + n_ph=cPar.n_ph; + np_obs=cPar.np_obs; + np_miss=cPar.np_miss; + ns_total=cPar.ns_total; + ns_test=0; + + return; +} + +void PRDT::CopyToParam (PARAM &cPar) +{ + cPar.ns_test=ns_test; + cPar.time_eigen=time_eigen; + + return; +} + + + + +void PRDT::WriteFiles (gsl_vector *y_prdt) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+="."; + file_str+="prdt"; + file_str+=".txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<size2; j++) { + outfile<size, ni_total=G->size1; + + gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test); + gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test); + gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test); + gsl_vector *eval=gsl_vector_alloc (ni_test); + gsl_vector *Utu=gsl_vector_alloc (ni_test); + gsl_vector *w=gsl_vector_alloc (ni_total); + gsl_permutation *pmt=gsl_permutation_alloc (ni_test); + + //center matrix G based on indicator_idv + for (size_t i=0; isize; i++) { + if (gsl_vector_get(eval,i)<1e-10) {gsl_vector_set(eval, i, 0);} + } + + time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu); + for (size_t i=0; isize; i++) { + d=gsl_vector_get(eval, i); + if (d!=0) {d=gsl_vector_get(Utu, i)/d; gsl_vector_set(Utu, i, d);} + } + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval); + gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt); + + //free matrices + gsl_matrix_free(Goo); + gsl_matrix_free(Gfo); + gsl_matrix_free(U); + gsl_vector_free(eval); + gsl_vector_free(Utu); + gsl_vector_free(w); + gsl_permutation_free(pmt); + + return; +} + + + +void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) +{ + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size); + gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size); + + ns_test=0; + + //start reading genotypes and analyze + for (size_t t=0; tsize==n_miss) {cout<<"snp "<size-n_miss); + x_train_mean/=(double)(n_train_nomiss); + + + for (size_t i=0; isize; ++i) { + geno=gsl_vector_get(x, i); + if (gsl_vector_get (x_miss, i)==0) { + gsl_vector_set(x, i, x_mean-x_train_mean); + } else { + gsl_vector_set(x, i, geno-x_train_mean); + } + } + + gsl_vector_scale (x, effect_size); + gsl_vector_add (y_prdt, x); + + ns_test++; + } + cout< b; + string rs; + + size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss; + double geno, x_mean, x_train_mean, effect_size; + + gsl_vector *x=gsl_vector_alloc (y_prdt->size); + + //calculate n_bit and c, the number of bit for each snp + if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;} + else {n_bit=indicator_idv.size()/4+1; } + + //print the first three majic numbers + for (size_t i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + ns_test=0; + + for (vector::size_type t=0; tsize==n_miss) {cout<<"snp "<size-n_miss); + x_train_mean/=(double)(n_train_nomiss); + + for (size_t i=0; isize; ++i) { + geno=gsl_vector_get(x, i); + if (geno==-9) { + gsl_vector_set(x, i, x_mean-x_train_mean); + } else { + gsl_vector_set(x, i, geno-x_train_mean); + } + } + + gsl_vector_scale (x, effect_size); + gsl_vector_add (y_prdt, x); + + ns_test++; + } + cout<::size_type i1=0; i1::size_type j1=0; j1::size_type i2=0; i2::size_type j2=0; j2::size_type i=0; i::size_type j=0; j::size_type i=0; i::size_type j=0; j::size_type i=0; i::size_type j=0; j<2; ++j) { + if (indicator_pheno[i][j]==1) { + gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j+k*2)-gsl_matrix_get (Y_hat, i, j) ); + c_obs1++; + } else { + gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, i, j) ); + c_miss1++; + } + } + } + + LUSolve (H_oo, pmt, y_obs, Hiy); + + gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss); + + //put back predicted y_miss to Y_full + c_miss1=0; + for (vector::size_type i=0; i::size_type j=0; j<2; ++j) { + if (indicator_pheno[i][j]==0) { + gsl_matrix_set (Y_full, i, j+k*2, gsl_vector_get (y_miss, c_miss1) ); + c_miss1++; + } + } + } + } + } +*/ + //free matrices + gsl_vector_free(y_obs); + gsl_vector_free(y_miss); + gsl_matrix_free(H_oo); + gsl_matrix_free(H_mo); + gsl_vector_free(Hiy); + + return; +} + + diff --git a/src/prdt.h b/src/prdt.h new file mode 100644 index 0000000..8af2cee --- /dev/null +++ b/src/prdt.h @@ -0,0 +1,81 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __PRDT_H__ +#define __PRDT_H__ + + +#include +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + +using namespace std; + +class PRDT { + +public: + // IO related parameters + size_t a_mode; + size_t d_pace; + + string file_bfile; + string file_geno; + string file_out; + string path_out; + + vector > indicator_pheno; + vector indicator_cvt; + vector indicator_idv; + vector snpInfo; + map mapRS2est; + + size_t n_ph; + size_t np_obs, np_miss; + size_t ns_total; + size_t ns_test; + + double time_eigen; + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void WriteFiles (gsl_vector *y_prdt); + void WriteFiles (gsl_matrix *Y_full); + void AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt); + void AnalyzeBimbam (gsl_vector *y_prdt); + void AnalyzePlink (gsl_vector *y_prdt); + void MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, gsl_matrix *Y_full); +}; + + +#endif + + + + + + + diff --git a/src/vc.cpp b/src/vc.cpp new file mode 100644 index 0000000..77cf746 --- /dev/null +++ b/src/vc.cpp @@ -0,0 +1,443 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_multiroots.h" +#include "gsl/gsl_min.h" + +#include "io.h" +#include "lapack.h" +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "lmm_float.h" +#include "vc_float.h" +#else +#include "lmm.h" +#include "vc.h" +#endif + + + +using namespace std; + + +//in this file, X, Y are already transformed (i.e. UtX and UtY) + + +void VC::CopyFromParam (PARAM &cPar) +{ + file_out=cPar.file_out; + + // v_sigma2=cPar.v_sigma2; + + time_UtX=0.0; + time_opt=0.0; + + v_traceG=cPar.v_traceG; + + return; +} + + +void VC::CopyToParam (PARAM &cPar) +{ + cPar.time_UtX=time_UtX; + cPar.time_opt=time_opt; + + cPar.v_sigma2=v_sigma2; + cPar.v_se_sigma2=v_se_sigma2; + cPar.v_pve=v_pve; + cPar.v_se_pve=v_se_pve; + cPar.v_traceG=v_traceG; + + cPar.v_beta=v_beta; + cPar.v_se_beta=v_se_beta; + + return; +} + + + +void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p) +{ + size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1, n_cvt=(p->W)->size2; + + gsl_matrix *K_temp=gsl_matrix_alloc(n1, n1); + gsl_matrix *HiW=gsl_matrix_alloc(n1, n_cvt); + gsl_matrix *WtHiW=gsl_matrix_alloc(n_cvt, n_cvt); + gsl_matrix *WtHiWi=gsl_matrix_alloc(n_cvt, n_cvt); + gsl_matrix *WtHiWiWtHi=gsl_matrix_alloc(n_cvt, n1); + + double sigma2; + //calculate H=\sum_i^{k+1} \sigma_i^2 K_i + gsl_matrix_set_zero (p->P); + for (size_t i=0; iK, 0, n1*i, n1, n1); + gsl_matrix_memcpy (K_temp, &K_sub.matrix); + } + + sigma2=exp(gsl_vector_get (log_sigma2, i) ); + gsl_matrix_scale(K_temp, sigma2); + gsl_matrix_add (p->P, K_temp); + } + + //calculate H^{-1} + int sig; + gsl_permutation * pmt1=gsl_permutation_alloc (n1); + LUDecomp (p->P, pmt1, &sig); + LUInvert (p->P, pmt1, K_temp); + gsl_permutation_free(pmt1); + + gsl_matrix_memcpy (p->P, K_temp); + + //calculate P=H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1} + gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW); + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW); + + gsl_permutation * pmt2=gsl_permutation_alloc (n_cvt); + LUDecomp (WtHiW, pmt2, &sig); + LUInvert (WtHiW, pmt2, WtHiWi); + gsl_permutation_free(pmt2); + + gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); + gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P); + + //calculate Py, KPy, PKPy + gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py); + + for (size_t i=0; iKPy_mat, i); + gsl_vector_view PKPy=gsl_matrix_column (p->PKPy_mat, i); + + if (i==n_vc) { + gsl_vector_memcpy (&KPy.vector, p->Py); + } else { + gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); + gsl_blas_dgemv(CblasNoTrans, 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector); + } + + gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector); + } + + gsl_matrix_free (K_temp); + gsl_matrix_free (HiW); + gsl_matrix_free (WtHiW); + gsl_matrix_free (WtHiWi); + gsl_matrix_free (WtHiWiWtHi); + + return; +} + + +//below are functions for AI algorithm +int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1) +{ + VC_PARAM *p=(VC_PARAM *) params; + + size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1; + + double tr, d; + + //update parameters + UpdateParam (log_sigma2, p); + + //calculate dev1=-0.5*trace(PK_i)+0.5*yPKPy + for (size_t i=0; iP, l, l); + } + } else { + tr=0; + for (size_t l=0; lP, l); + gsl_vector_const_view K_col=gsl_matrix_const_column (p->K, n1*i+l); + gsl_blas_ddot(&P_row.vector, &K_col.vector, &d); + tr+=d; + } + } + + gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); + gsl_blas_ddot(p->Py, &KPy_i.vector, &d); + + d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i)); + + gsl_vector_set(dev1, i, d); + } + + return GSL_SUCCESS; +} + + + +int LogRL_dev2 (const gsl_vector *log_sigma2, void *params, gsl_matrix *dev2) +{ + VC_PARAM *p=(VC_PARAM *) params; + + size_t n_vc=log_sigma2->size-1; + + double d, sigma2_i, sigma2_j; + + //update parameters + UpdateParam (log_sigma2, p); + + //calculate dev2=0.5(yPKPKPy) + for (size_t i=0; iKPy_mat, i); + sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + + for (size_t j=i; jPKPy_mat, j); + + gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); + sigma2_j=exp(gsl_vector_get(log_sigma2, j)); + + d*=-0.5*sigma2_i*sigma2_j; + + gsl_matrix_set(dev2, i, j, d); + if (j!=i) {gsl_matrix_set(dev2, j, i, d);} + } + } + + gsl_matrix_memcpy (p->Hessian, dev2); + + return GSL_SUCCESS; +} + + + +int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, gsl_matrix *dev2) +{ + VC_PARAM *p=(VC_PARAM *) params; + + size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1; + + double tr, d, sigma2_i, sigma2_j; + + //update parameters + UpdateParam (log_sigma2, p); + + //calculate dev1=-0.5*trace(PK_i)+0.5*yPKPy + //calculate dev2=0.5(yPKPKPy) + for (size_t i=0; iP, l, l); + } + } else { + tr=0; + for (size_t l=0; lP, l); + gsl_vector_const_view K_col=gsl_matrix_const_column (p->K, n1*i+l); + gsl_blas_ddot(&P_row.vector, &K_col.vector, &d); + tr+=d; + } + } + + gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); + gsl_blas_ddot(p->Py, &KPy_i.vector, &d); + + sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + d=(-0.5*tr+0.5*d)*sigma2_i; + + gsl_vector_set(dev1, i, d); + + for (size_t j=i; jPKPy_mat, j); + gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); + + sigma2_j=exp(gsl_vector_get(log_sigma2, j)); + d*=-0.5*sigma2_i*sigma2_j; + + gsl_matrix_set(dev2, i, j, d); + if (j!=i) {gsl_matrix_set(dev2, j, i, d);} + } + + } + + gsl_matrix_memcpy (p->Hessian, dev2); + + return GSL_SUCCESS; +} + + + + +void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) +{ + size_t n1=K->size1, n2=K->size2; + size_t n_vc=n2/n1; + gsl_vector *log_sigma2=gsl_vector_alloc (n_vc+1); + double d, s; + + //set up params + gsl_matrix *P=gsl_matrix_alloc (n1, n1); + gsl_vector *Py=gsl_vector_alloc (n1); + gsl_matrix *KPy_mat=gsl_matrix_alloc (n1, n_vc+1); + gsl_matrix *PKPy_mat=gsl_matrix_alloc (n1, n_vc+1); + gsl_vector *dev1=gsl_vector_alloc (n_vc+1); + gsl_matrix *dev2=gsl_matrix_alloc (n_vc+1, n_vc+1); + gsl_matrix *Hessian=gsl_matrix_alloc (n_vc+1, n_vc+1); + VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian}; + + //initialize sigma2/log_sigma2 + gsl_blas_ddot (y, y, &s); + s/=(double)n1; + for (size_t i=0; if, 1e-3); + } + while (status==GSL_CONTINUE && iterf, ¶ms, dev1, dev2); + + gsl_permutation * pmt=gsl_permutation_alloc (n_vc+1); + LUDecomp (dev2, pmt, &sig); + LUInvert (dev2, pmt, Hessian); + gsl_permutation_free(pmt); + + //save data + v_sigma2.clear(); + for (size_t i=0; ix, i)); + v_sigma2.push_back(d); + } + + v_se_sigma2.clear(); + for (size_t i=0; i. +*/ + +#ifndef __VC_H__ +#define __VC_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + + +class VC_PARAM +{ + +public: + const gsl_matrix *K; + const gsl_matrix *W; + const gsl_vector *y; + gsl_matrix *P; + gsl_vector *Py; + gsl_matrix *KPy_mat; + gsl_matrix *PKPy_mat; + gsl_matrix *Hessian; +}; + + + + +class VC { + +public: + // IO related parameters + string file_out; + string path_out; + + vector v_sigma2; + vector v_se_sigma2; + vector v_pve; + vector v_se_pve; + vector v_traceG; + vector v_beta; + vector v_se_beta; + + double time_UtX; + double time_opt; + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); + void CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); +}; + +#endif + + -- cgit v1.2.3