From 17deca2d54827a00df3ea4d98df700fc2b8ed777 Mon Sep 17 00:00:00 2001 From: xiangzhou Date: Sat, 20 Sep 2014 10:17:34 -0400 Subject: initial upload, version 0.95alpha --- bslmm.cpp | 1927 ++++++++++++++++++++++++++++++ bslmm.h | 145 +++ gemma.cpp | 1856 +++++++++++++++++++++++++++++ gemma.h | 52 + gzstream.cpp | 165 +++ gzstream.h | 121 ++ io.cpp | 1396 ++++++++++++++++++++++ io.h | 79 ++ lapack.cpp | 609 ++++++++++ lapack.h | 53 + lm.cpp | 571 +++++++++ lm.h | 74 ++ lmm.cpp | 1770 +++++++++++++++++++++++++++ lmm.h | 110 ++ main.cpp | 86 ++ mathfunc.cpp | 310 +++++ mathfunc.h | 41 + mvlmm.cpp | 3748 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mvlmm.h | 93 ++ param.cpp | 849 +++++++++++++ param.h | 231 ++++ prdt.cpp | 543 +++++++++ prdt.h | 80 ++ vc.cpp | 443 +++++++ vc.h | 80 ++ 25 files changed, 15432 insertions(+) create mode 100644 bslmm.cpp create mode 100644 bslmm.h create mode 100644 gemma.cpp create mode 100644 gemma.h create mode 100644 gzstream.cpp create mode 100644 gzstream.h create mode 100644 io.cpp create mode 100644 io.h create mode 100644 lapack.cpp create mode 100644 lapack.h create mode 100644 lm.cpp create mode 100644 lm.h create mode 100644 lmm.cpp create mode 100644 lmm.h create mode 100644 main.cpp create mode 100644 mathfunc.cpp create mode 100644 mathfunc.h create mode 100644 mvlmm.cpp create mode 100644 mvlmm.h create mode 100644 param.cpp create mode 100644 param.h create mode 100644 prdt.cpp create mode 100644 prdt.h create mode 100644 vc.cpp create mode 100644 vc.h diff --git a/bslmm.cpp b/bslmm.cpp new file mode 100644 index 0000000..ff9618d --- /dev/null +++ b/bslmm.cpp @@ -0,0 +1,1927 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_randist.h" +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" + + + + +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "bslmm_float.h" +#include "lmm_float.h" //for class FUNC_PARAM and MatrixCalcLR +#include "lm_float.h" +#include "mathfunc_float.h" //for function CenterVector +#else +#include "param.h" +#include "bslmm.h" +#include "lmm.h" +#include "lm.h" +#include "mathfunc.h" +#endif + +using namespace std; + + + + +void BSLMM::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + + l_min=cPar.h_min; + l_max=cPar.h_max; + n_region=cPar.n_region; + pve_null=cPar.pve_null; + pheno_mean=cPar.pheno_mean; + + time_UtZ=0.0; + time_Omega=0.0; + n_accept=0; + + h_min=cPar.h_min; + h_max=cPar.h_max; + h_scale=cPar.h_scale; + rho_min=cPar.rho_min; + rho_max=cPar.rho_max; + rho_scale=cPar.rho_scale; + logp_min=cPar.logp_min; + logp_max=cPar.logp_max; + logp_scale=cPar.logp_scale; + + s_min=cPar.s_min; + s_max=cPar.s_max; + w_step=cPar.w_step; + s_step=cPar.s_step; + r_pace=cPar.r_pace; + w_pace=cPar.w_pace; + n_mh=cPar.n_mh; + geo_mean=cPar.geo_mean; + randseed=cPar.randseed; + trace_G=cPar.trace_G; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void BSLMM::CopyToParam (PARAM &cPar) +{ + cPar.time_UtZ=time_UtZ; + cPar.time_Omega=time_Omega; + cPar.time_Proposal=time_Proposal; + cPar.cHyp_initial=cHyp_initial; + cPar.n_accept=n_accept; + cPar.pheno_mean=pheno_mean; + cPar.randseed=randseed; + + return; +} + + + +void BSLMM::WriteBV (const gsl_vector *bv) +{ + string file_str; + file_str="./output/"+file_out; + file_str+=".bv.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "< > &beta_g, const gsl_vector *alpha, const size_t w) +{ + string file_str; + file_str="./output/"+file_out; + file_str+=".param.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "< &rank) +{ + size_t pos; + for (size_t i=0; isize2, UtXgamma->size2); + gsl_vector *Xty=gsl_vector_alloc (UtXgamma->size2); + gsl_vector *OiXty=gsl_vector_alloc (UtXgamma->size2); + + gsl_matrix_set_identity (Omega); + gsl_matrix_scale (Omega, 1.0/sigma_a2); + +#ifdef WITH_LAPACK + lapack_dgemm ((char *)"T", (char *)"N", 1.0, UtXgamma, UtXgamma, 1.0, Omega); +#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, UtXgamma, UtXgamma, 1.0, Omega); +#endif + gsl_blas_dgemv (CblasTrans, 1.0, UtXgamma, Uty, 0.0, Xty); + + CholeskySolve(Omega, Xty, OiXty); + + gsl_blas_ddot (Xty, OiXty, &pve); + gsl_blas_ddot (Uty, Uty, &var_y); + + pve/=var_y; + + gsl_matrix_free (Omega); + gsl_vector_free (Xty); + gsl_vector_free (OiXty); + + return pve; +} + + +void BSLMM::InitialMCMC (const gsl_matrix *UtX, const gsl_vector *Uty, vector &rank, class HYPBSLMM &cHyp, vector > &pos_loglr) +{ + double q_genome=gsl_cdf_chisq_Qinv(0.05/(double)ns_test, 1); + + cHyp.n_gamma=0; + for (size_t i=0; iq_genome) {cHyp.n_gamma++;} + } + if (cHyp.n_gamma<10) {cHyp.n_gamma=10;} + + if (cHyp.n_gamma>s_max) {cHyp.n_gamma=s_max;} + if (cHyp.n_gamma1.0) {cHyp.rho=1.0;} + + if (cHyp.hh_max) {cHyp.h=h_max;} + if (cHyp.rhorho_max) {cHyp.rho=rho_max;} + if (cHyp.logplogp_max) {cHyp.logp=logp_max;} + + +// if (fix_sigma>=0) { +// fix_sigma=cHyp.h; +// rho_max=1-cHyp.h; +// cHyp.rho=rho_max/2.0; +// } + + //Initial for grid sampling: +// cHyp.h=0.225; +// cHyp.rho=1.0; +// cHyp.logp=-4.835429; + + cout<<"initial value of h = "<size; i++) + { + d=gsl_ran_gaussian(gsl_r, 1); + gsl_vector_set(beta, i, d); + } + gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, beta); + + + //it compuates inv(L^T(Omega)) %*% beta; + gsl_vector_scale(beta, sqrt(sigma_a2/tau)); + gsl_vector_add(beta, beta_hat); + gsl_blas_dgemv (CblasNoTrans, 1.0, UtXgamma, beta, 0.0, UtXb); + + //sample alpha + gsl_vector_memcpy (alpha_prime, Uty); + gsl_vector_sub (alpha_prime, UtXb); + gsl_vector_mul (alpha_prime, weight_Hi); + gsl_vector_scale (alpha_prime, sigma_b2); + + //sample u + gsl_vector_memcpy (Utu, alpha_prime); + gsl_vector_mul (Utu, K_eval); + + if (a_mode==11) {gsl_vector_scale (Utu_rand, sqrt(1.0/tau));} + gsl_vector_add (Utu, Utu_rand); + + + //for quantitative traits, calculate pve and pge + if (a_mode==11) { + gsl_blas_ddot (UtXb, UtXb, &d); + cHyp.pge=d/(double)ni_test; + + gsl_blas_ddot (Utu, Utu, &d); + cHyp.pve=cHyp.pge+d/(double)ni_test; + + if (cHyp.pve==0) {cHyp.pge=0.0;} + else {cHyp.pge/=cHyp.pve;} + cHyp.pve/=cHyp.pve+1.0/tau; + } + + + gsl_matrix_free (UtXgamma_eval); + gsl_matrix_free (Omega); + gsl_vector_free (XtHiy); + gsl_vector_free (beta_hat); + gsl_vector_free (Utu_rand); + gsl_vector_free (weight_Hi); + + logpost=-0.5*logdet_H-0.5*logdet_O; + if (a_mode==11) {logpost-=0.5*(double)ni_test*log(P_yy);} + else {logpost-=0.5*P_yy;} +// else {logpost+=-0.5*P_yy*tau+0.5*(double)ni_test*log(tau);} + logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1.0-exp(cHyp.logp)); + + return logpost; +} + + + +//calculate pve and pge, and calculate z_hat for case-control data +void BSLMM::CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp) +{ + double d; + + gsl_blas_ddot (Utu, Utu, &d); + cHyp.pve=d/(double)ni_test; + + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, z_hat); + + cHyp.pve/=cHyp.pve+1.0; + cHyp.pge=0.0; + + return; +} + + +//calculate pve and pge, and calculate z_hat for case-control data +void BSLMM::CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *UtXb, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp) +{ + double d; + gsl_vector *UtXbU=gsl_vector_alloc (Utu->size); + + gsl_blas_ddot (UtXb, UtXb, &d); + cHyp.pge=d/(double)ni_test; + + gsl_blas_ddot (Utu, Utu, &d); + cHyp.pve=cHyp.pge+d/(double)ni_test; + + gsl_vector_memcpy (UtXbU, Utu); + gsl_vector_add (UtXbU, UtXb); + gsl_blas_dgemv (CblasNoTrans, 1.0, U, UtXbU, 0.0, z_hat); + + if (cHyp.pve==0) {cHyp.pge=0.0;} + else {cHyp.pge/=cHyp.pve;} + + cHyp.pve/=cHyp.pve+1.0; + + gsl_vector_free(UtXbU); + return; +} + + + + +void BSLMM::SampleZ (const gsl_vector *y, const gsl_vector *z_hat, gsl_vector *z) +{ + double d1, d2, z_rand=0.0; + for (size_t i=0; isize; ++i) { + d1=gsl_vector_get (y, i); + d2=gsl_vector_get (z_hat, i); + //y is centerred for case control studies + if (d1<=0.0) { + //control, right truncated + do { + z_rand=d2+gsl_ran_gaussian(gsl_r, 1.0); + } while (z_rand>0.0); + } + else { + do { + z_rand=d2+gsl_ran_gaussian(gsl_r, 1.0); + } while (z_rand<0.0); + } + + gsl_vector_set (z, i, z_rand); + } + + return; +} + + + + + +double BSLMM::ProposeHnRho (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat) +{ + + double h=cHyp_old.h, rho=cHyp_old.rho; + + double d_h=(h_max-h_min)*h_scale, d_rho=(rho_max-rho_min)*rho_scale; + + for (size_t i=0; ih_max) {h=2*h_max-h;} + + rho=rho+(gsl_rng_uniform(gsl_r)-0.5)*d_rho; + if (rhorho_max) {rho=2*rho_max-rho;} + } + /* + //Grid Sampling + for (size_t i=0; ih_max) {h=h_min;} + } + + for (size_t i=0; irho_max) {rho=rho_min;} + } + */ + cHyp_new.h=h; + cHyp_new.rho=rho; + return 0.0; +} + + +double BSLMM::ProposePi (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat) +{ + double logp_old=cHyp_old.logp, logp_new=cHyp_old.logp; + double log_ratio=0.0; + + double d_logp=min(0.1, (logp_max-logp_min)*logp_scale); + + for (size_t i=0; ilogp_max) {logp_new=2*logp_max-logp_new;} + + log_ratio+=logp_new-logp_old; + logp_old=logp_new; + } + /* + //Grid Sampling + for (size_t i=0; ilogp_max) {logp_new=logp_min;} + + log_ratio+=logp_new-logp_old; + logp_old=logp_new; + } + */ + cHyp_new.logp=logp_new; + + return log_ratio; +} + +bool comp_vec (size_t a, size_t b) +{ + return (a < b); +} + + +double BSLMM::ProposeGamma (const vector &rank_old, vector &rank_new, const double *p_gamma, const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat) +{ + map mapRank2in; + size_t r; + double unif, logp=0.0; + int flag_gamma; + size_t r_add, r_remove, col_id; + + rank_new.clear(); + if (cHyp_old.n_gamma!=rank_old.size()) {cout<<"size wrong"<=0.40 && unif < 0.80 && cHyp_new.n_gamma>s_min) {flag_gamma=2;} + else if (unif>=0.80 && cHyp_new.n_gamma>0 && cHyp_new.n_gamma a, pair b) +{ + return (a.second > b.second); +} + + + + + + + +//if a_mode==13 then Uty==y +void BSLMM::MCMC (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y) { + clock_t time_start; + + class HYPBSLMM cHyp_old, cHyp_new; + + gsl_matrix *Result_hyp=gsl_matrix_alloc (w_pace, 6); + gsl_matrix *Result_gamma=gsl_matrix_alloc (w_pace, s_max); + + gsl_vector *alpha_prime=gsl_vector_alloc (ni_test); + gsl_vector *alpha_new=gsl_vector_alloc (ni_test); + gsl_vector *alpha_old=gsl_vector_alloc (ni_test); + gsl_vector *Utu=gsl_vector_alloc (ni_test); + gsl_vector *Utu_new=gsl_vector_alloc (ni_test); + gsl_vector *Utu_old=gsl_vector_alloc (ni_test); + + gsl_vector *UtXb_new=gsl_vector_alloc (ni_test); + gsl_vector *UtXb_old=gsl_vector_alloc (ni_test); + + gsl_vector *z_hat=gsl_vector_alloc (ni_test); + gsl_vector *z=gsl_vector_alloc (ni_test); + gsl_vector *Utz=gsl_vector_alloc (ni_test); + + gsl_vector_memcpy (Utz, Uty); + + double logPost_new, logPost_old; + double logMHratio; + double mean_z=0.0; + + gsl_matrix_set_zero (Result_gamma); + gsl_vector_set_zero (Utu); + gsl_vector_set_zero (alpha_prime); + if (a_mode==13) { + pheno_mean=0.0; + } + + vector > beta_g; + for (size_t i=0; i rank_new, rank_old; + vector beta_new, beta_old; + + vector > pos_loglr; + + time_start=clock(); + MatrixCalcLR (U, UtX, Utz, K_eval, l_min, l_max, n_region, pos_loglr); + time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + stable_sort (pos_loglr.begin(), pos_loglr.end(), comp_lr); + for (size_t i=0; itm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + double *p_gamma = new double[ns_test]; + CalcPgamma (p_gamma); + + gsl_t=gsl_ran_discrete_preproc (ns_test, p_gamma); + + //initial parameters + InitialMCMC (UtX, Utz, rank_old, cHyp_old, pos_loglr); +// if (fix_sigma>=0) { +// rho_max=1-fix_sigma; +// cHyp_old.h=fix_sigma/(1-cHyp_old.rho); +// } + + cHyp_initial=cHyp_old; + + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old); + + beta_old.clear(); + for (size_t i=0; isize; ++i) { + beta_old.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + + //calculate centered z_hat, and pve + if (a_mode==13) { + time_start=clock(); + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + CalcCC_PVEnZ (U, Utu_old, z_hat, cHyp_old); + } + else { + CalcCC_PVEnZ (U, UtXb_old, Utu_old, z_hat, cHyp_old); + } + time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + } + + //start MCMC + int accept; + size_t total_step=w_step+s_step; + size_t w=0, w_col, pos; + size_t repeat=0; + + for (size_t t=0; t10) {break;} + + if (a_mode==13) { + SampleZ (y, z_hat, z); + mean_z=CenterVector (z); + + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, z, 0.0, Utz); + time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //First proposal + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old); + beta_old.clear(); + for (size_t i=0; isize; ++i) { + beta_old.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + } + + //MH steps + for (size_t i=0; i=0) { +// cHyp_new.h=fix_sigma/(1-cHyp_new.rho); +// } + + if (cHyp_new.n_gamma==0 || cHyp_new.rho==0) { + logPost_new=CalcPosterior(Utz, K_eval, Utu_new, alpha_new, cHyp_new); + beta_new.clear(); + for (size_t i=0; isize; ++i) { + beta_new.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + + logMHratio+=logPost_new-logPost_old; + + if (logMHratio>0 || log(gsl_rng_uniform(gsl_r))size2); + gsl_vector *H_eval=gsl_vector_alloc (Uty->size); + gsl_vector *bv=gsl_vector_alloc (Uty->size); + + gsl_vector_memcpy (H_eval, eval); + gsl_vector_scale (H_eval, lambda); + gsl_vector_add_constant (H_eval, 1.0); + + gsl_vector_memcpy (bv, Uty); + gsl_vector_div (bv, H_eval); + + gsl_blas_dgemv (CblasTrans, lambda/(double)UtX->size2, UtX, bv, 0.0, beta); + gsl_vector_add_constant (H_eval, -1.0); + gsl_vector_mul (H_eval, bv); + gsl_blas_dgemv (CblasNoTrans, 1.0, U, H_eval, 0.0, bv); + + WriteParam (beta); + WriteBV(bv); + + gsl_vector_free (H_eval); + gsl_vector_free (beta); + gsl_vector_free (bv); + + return; +} + + + + + + + + + + + + + + + + + +//below fits MCMC for rho=1 +void BSLMM::CalcXtX (const gsl_matrix *X, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX, gsl_vector *Xty) +{ + time_t time_start=clock(); + gsl_matrix_const_view X_sub=gsl_matrix_const_submatrix(X, 0, 0, X->size1, s_size); + gsl_matrix_view XtX_sub=gsl_matrix_submatrix(XtX, 0, 0, s_size, s_size); + gsl_vector_view Xty_sub=gsl_vector_subvector(Xty, 0, s_size); + +#ifdef WITH_LAPACK + lapack_dgemm ((char *)"T", (char *)"N", 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#endif + gsl_blas_dgemv(CblasTrans, 1.0, &X_sub.matrix, y, 0.0, &Xty_sub.vector); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + return; +} + + +void BSLMM::SetXgamma (const gsl_matrix *X, const gsl_matrix *X_old, const gsl_matrix *XtX_old, const gsl_vector *Xty_old, const gsl_vector *y, const vector &rank_old, const vector &rank_new, gsl_matrix *X_new, gsl_matrix *XtX_new, gsl_vector *Xty_new) +{ + double d; + + //rank_old and rank_new are sorted already inside PorposeGamma + //calculate vectors rank_remove and rank_add + // size_t v_size=max(rank_old.size(), rank_new.size()); + //make sure that v_size is larger than repeat + size_t v_size=20; + vector rank_remove(v_size), rank_add(v_size), rank_union(s_max+v_size); + vector::iterator it; + + it=set_difference (rank_old.begin(), rank_old.end(), rank_new.begin(), rank_new.end(), rank_remove.begin()); + rank_remove.resize(it-rank_remove.begin()); + + it=set_difference (rank_new.begin(), rank_new.end(), rank_old.begin(), rank_old.end(), rank_add.begin()); + rank_add.resize(it-rank_add.begin()); + + it=set_union (rank_new.begin(), rank_new.end(), rank_old.begin(), rank_old.end(), rank_union.begin()); + rank_union.resize(it-rank_union.begin()); + + //map rank_remove and rank_add + map mapRank2in_remove, mapRank2in_add; + for (size_t i=0; isize1, rank_old.size()); + gsl_matrix_const_view XtXold_sub=gsl_matrix_const_submatrix(XtX_old, 0, 0, rank_old.size(), rank_old.size()); + gsl_vector_const_view Xtyold_sub=gsl_vector_const_subvector(Xty_old, 0, rank_old.size()); + + gsl_matrix_view Xnew_sub=gsl_matrix_submatrix(X_new, 0, 0, X_new->size1, rank_new.size()); + gsl_matrix_view XtXnew_sub=gsl_matrix_submatrix(XtX_new, 0, 0, rank_new.size(), rank_new.size()); + gsl_vector_view Xtynew_sub=gsl_vector_subvector(Xty_new, 0, rank_new.size()); + + //get X_new and calculate XtX_new + /* + if (rank_remove.size()==0 && rank_add.size()==0) { + gsl_matrix_memcpy(&Xnew_sub.matrix, &Xold_sub.matrix); + gsl_matrix_memcpy(&XtXnew_sub.matrix, &XtXold_sub.matrix); + gsl_vector_memcpy(&Xtynew_sub.vector, &Xtyold_sub.vector); + } else { + gsl_matrix *X_temp=gsl_matrix_alloc(X_old->size1, rank_old.size()-rank_remove.size() ); + gsl_matrix *XtX_temp=gsl_matrix_alloc(X_temp->size2, X_temp->size2); + gsl_vector *Xty_temp=gsl_vector_alloc(X_temp->size2); + + if (rank_remove.size()==0) { + gsl_matrix_memcpy (X_temp, &Xold_sub.matrix); + gsl_matrix_memcpy (XtX_temp, &XtXold_sub.matrix); + gsl_vector_memcpy (Xty_temp, &Xtyold_sub.vector); + } else { + size_t i_temp=0, j_temp; + for (size_t i=0; isize1, rank_add.size() ); + gsl_matrix *XtX_aa=gsl_matrix_alloc(X_add->size2, X_add->size2); + gsl_matrix *XtX_at=gsl_matrix_alloc(X_add->size2, X_temp->size2); + gsl_vector *Xty_add=gsl_vector_alloc(X_add->size2); + + //get X_add + SetXgamma (X_add, X, rank_add); + + //get t(X_add)X_add and t(X_add)X_temp + clock_t time_start=clock(); + + //somehow the lapack_dgemm does not work here + //#ifdef WITH_LAPACK + //lapack_dgemm ((char *)"T", (char *)"N", 1.0, X_add, X_add, 0.0, XtX_aa); + //lapack_dgemm ((char *)"T", (char *)"N", 1.0, X_add, X_temp, 0.0, XtX_at); + + //#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_add, 0.0, XtX_aa); + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_temp, 0.0, XtX_at); + //#endif + gsl_blas_dgemv(CblasTrans, 1.0, X_add, y, 0.0, Xty_add); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //save to X_new, XtX_new and Xty_new + size_t i_temp=0, j_temp, i_flag=0, j_flag=0; + for (size_t i=0; isize1, rank_add.size() ); + gsl_matrix *XtX_aa=gsl_matrix_alloc(X_add->size2, X_add->size2); + gsl_matrix *XtX_ao=gsl_matrix_alloc(X_add->size2, X_old->size2); + gsl_vector *Xty_add=gsl_vector_alloc(X_add->size2); + + //get X_add + SetXgamma (X_add, X, rank_add); + + //get t(X_add)X_add and t(X_add)X_temp + clock_t time_start=clock(); + + //somehow the lapack_dgemm does not work here + //#ifdef WITH_LAPACK + //lapack_dgemm ((char *)"T", (char *)"N", 1.0, X_add, X_add, 0.0, XtX_aa); + //lapack_dgemm ((char *)"T", (char *)"N", 1.0, X_add, X_old, 0.0, XtX_ao); + + //#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_add, 0.0, XtX_aa); + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_old, 0.0, XtX_ao); + //#endif + gsl_blas_dgemv(CblasTrans, 1.0, X_add, y, 0.0, Xty_add); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //save to X_new, XtX_new and Xty_new + i_old=0; i_new=0; i_add=0; + for (size_t i=0; isize1, s_size); + gsl_matrix_const_view XtX_sub=gsl_matrix_const_submatrix (XtX, 0, 0, s_size, s_size); + gsl_vector_const_view Xty_sub=gsl_vector_const_subvector (Xty, 0, s_size); + + gsl_matrix *Omega=gsl_matrix_alloc (s_size, s_size); + gsl_matrix *M_temp=gsl_matrix_alloc (s_size, s_size); + gsl_vector *beta_hat=gsl_vector_alloc (s_size); + gsl_vector *Xty_temp=gsl_vector_alloc (s_size); + + gsl_vector_memcpy (Xty_temp, &Xty_sub.vector); + + //calculate Omega + gsl_matrix_memcpy (Omega, &XtX_sub.matrix); + gsl_matrix_scale (Omega, sigma_a2); + gsl_matrix_set_identity (M_temp); + gsl_matrix_add (Omega, M_temp); + + //calculate beta_hat + logdet_O=CholeskySolve(Omega, Xty_temp, beta_hat); + gsl_vector_scale (beta_hat, sigma_a2); + + gsl_blas_ddot (Xty_temp, beta_hat, &d); + P_yy-=d; + + //sample tau + double tau=1.0; + if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0, 2.0/P_yy); } + + //sample beta + for (size_t i=0; i > beta_g; + for (size_t i=0; i rank_new, rank_old; + vector > pos_loglr; + + time_start=clock(); + MatrixCalcLmLR (X, z, pos_loglr); + time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + stable_sort (pos_loglr.begin(), pos_loglr.end(), comp_lr); + for (size_t i=0; itm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + double *p_gamma = new double[ns_test]; + CalcPgamma (p_gamma); + + gsl_t=gsl_ran_discrete_preproc (ns_test, p_gamma); + + //initial parameters + InitialMCMC (X, z, rank_old, cHyp_old, pos_loglr); + + cHyp_initial=cHyp_old; + + if (cHyp_old.n_gamma==0) { + logPost_old=CalcPosterior (ztz, cHyp_old); + } + else { + SetXgamma (Xgamma_old, X, rank_old); + CalcXtX (Xgamma_old, z, rank_old.size(), XtX_old, Xtz_old); + logPost_old=CalcPosterior (Xgamma_old, XtX_old, Xtz_old, ztz, rank_old.size(), Xb_old, beta_old, cHyp_old); + } + + //calculate centered z_hat, and pve + if (a_mode==13) { + if (cHyp_old.n_gamma==0) { + CalcCC_PVEnZ (z_hat, cHyp_old); + } + else { + CalcCC_PVEnZ (Xb_old, z_hat, cHyp_old); + } + } + + //start MCMC + int accept; + size_t total_step=w_step+s_step; + size_t w=0, w_col, pos; + size_t repeat=0; + + for (size_t t=0; t10) {break;} + if (a_mode==13) { + SampleZ (y, z_hat, z); + mean_z=CenterVector (z); + gsl_blas_ddot(z,z,&ztz); + + //First proposal + if (cHyp_old.n_gamma==0) { + logPost_old=CalcPosterior (ztz, cHyp_old); + } else { + gsl_matrix_view Xold_sub=gsl_matrix_submatrix(Xgamma_old, 0, 0, ni_test, rank_old.size()); + gsl_vector_view Xtz_sub=gsl_vector_subvector(Xtz_old, 0, rank_old.size()); + gsl_blas_dgemv (CblasTrans, 1.0, &Xold_sub.matrix, z, 0.0, &Xtz_sub.vector); + logPost_old=CalcPosterior (Xgamma_old, XtX_old, Xtz_old, ztz, rank_old.size(), Xb_old, beta_old, cHyp_old); + } + } + + //MH steps + for (size_t i=0; i0 || log(gsl_rng_uniform(gsl_r)). + */ + + +#ifndef __BSLMM_H__ +#define __BSLMM_H__ + +#include +#include +#include +#include + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + + +using namespace std; + + + + + + +class BSLMM { + +public: + // IO related parameters + int a_mode; + size_t d_pace; + + string file_bfile; + string file_geno; + string file_out; + + // LMM related parameters + double l_min; + double l_max; + size_t n_region; + double pve_null; + double pheno_mean; + + // BSLMM MCMC related parameters + double h_min, h_max, h_scale; //priors for h + double rho_min, rho_max, rho_scale; //priors for rho + double logp_min, logp_max, logp_scale; //priors for log(pi) + size_t s_min, s_max; //minimum and maximum number of gammas + size_t w_step; //number of warm up/burn in iterations + size_t s_step; //number of sampling iterations + size_t r_pace; //record pace + size_t w_pace; //write pace + size_t n_accept; //number of acceptance + size_t n_mh; //number of MH steps within each iteration + double geo_mean; //mean of the geometric distribution + long int randseed; + double trace_G; + + HYPBSLMM cHyp_initial; + + // Summary statistics + size_t ni_total, ns_total; //number of total individuals and snps + size_t ni_test, ns_test; //number of individuals and snps used for analysis + size_t n_cvt; //number of covariates + double time_UtZ; + double time_Omega; //time spent on optimization iterations + double time_Proposal; //time spent on constructing the proposal distribution for gamma (i.e. lmm or lm analysis) + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + gsl_rng *gsl_r; + gsl_ran_discrete_t *gsl_t; + map mapRank2pos; + + // Main Functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + + void RidgeR(const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *eval, const double lambda); + + void MCMC (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y); + void WriteLog (); + void WriteLR (); + void WriteBV (const gsl_vector *bv); + void WriteParam (vector > &beta_g, const gsl_vector *alpha, const size_t w); + void WriteParam (const gsl_vector *alpha); + void WriteResult (const int flag, const gsl_matrix *Result_hyp, const gsl_matrix *Result_gamma, const size_t w_col); + + //Subfunctions inside MCMC + void CalcPgamma (double *p_gammar); + + double CalcPveLM (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const double sigma_a2); + void InitialMCMC (const gsl_matrix *UtX, const gsl_vector *Uty, vector &rank_old, class HYPBSLMM &cHyp, vector > &pos_loglr); + double CalcPosterior (const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *Utu, gsl_vector *alpha_prime, class HYPBSLMM &cHyp); + double CalcPosterior (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *UtXb, gsl_vector *Utu, gsl_vector *alpha_prime, gsl_vector *beta, class HYPBSLMM &cHyp); + void CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp); + void CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *UtXb, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp); + double CalcREMLE (const gsl_matrix *Utw, const gsl_vector *Uty, const gsl_vector *K_eval); + double CalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, vector > &loglr_sort); //calculate the maximum marginal likelihood ratio for each analyzed SNPs with gemma, use it to rank SNPs + void SampleZ (const gsl_vector *y, const gsl_vector *z_hat, gsl_vector *z); + double ProposeHnRho (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat); + double ProposePi (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat); + double ProposeGamma (const vector &rank_old, vector &rank_new, const double *p_gamma, const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat); + void SetXgamma (gsl_matrix *Xgamma, const gsl_matrix *X, vector &rank); + + void CalcXtX (const gsl_matrix *X_new, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX_new, gsl_vector *Xty_new); + void SetXgamma (const gsl_matrix *X, const gsl_matrix *X_old, const gsl_matrix *XtX_old, const gsl_vector *Xty_old, const gsl_vector *y, const vector &rank_old, const vector &rank_new, gsl_matrix *X_new, gsl_matrix *XtX_new, gsl_vector *Xty_new); + double CalcPosterior (const double yty, class HYPBSLMM &cHyp); + double CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp); + void CalcCC_PVEnZ (gsl_vector *z_hat, class HYPBSLMM &cHyp); + void CalcCC_PVEnZ (const gsl_vector *Xb, gsl_vector *z_hat, class HYPBSLMM &cHyp); + void MCMC (const gsl_matrix *X, const gsl_vector *y); + + //utility functions +// double vec_sum (gsl_vector *v); +// void vec_center (gsl_vector *v); +// double calc_var (gsl_vector *v); +// void calc_sigma (MCMC &cMcmc); +// bool comp_lr (pair a, pair b); +}; + + + +#endif + + diff --git a/gemma.cpp b/gemma.cpp new file mode 100644 index 0000000..093cd05 --- /dev/null +++ b/gemma.cpp @@ -0,0 +1,1856 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_cdf.h" + +#include "lapack.h" //for functions EigenDecomp + +#ifdef FORCE_FLOAT +#include "io_float.h" //for function ReadFile_kin +#include "gemma_float.h" +#include "vc_float.h" +#include "lm_float.h" //for LM class +#include "bslmm_float.h" //for BSLMM class +#include "lmm_float.h" //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe +#include "mvlmm_float.h" //for MVLMM class +#include "prdt_float.h" //for PRDT class +#include "mathfunc_float.h" //for a few functions +#else +#include "io.h" +#include "gemma.h" +#include "vc.h" +#include "lm.h" +#include "bslmm.h" +#include "lmm.h" +#include "mvlmm.h" +#include "prdt.h" +#include "mathfunc.h" +#endif + + +using namespace std; + + + +GEMMA::GEMMA(void): +version("0.95"), date("08/08/2014"), year("2011") +{} + +void GEMMA::PrintHeader (void) +{ + cout< indicator_all; + size_t c_bv=0; + for (size_t i=0; isize; i++) { + d=gsl_vector_get(y_prdt, i); + d=gsl_cdf_gaussian_P(d, 1.0); + gsl_vector_set(y_prdt, i, d); + } + } + + + cPRDT.CopyToParam(cPar); + + cPRDT.WriteFiles(y_prdt); + + gsl_vector_free(y_prdt); + } + + + //Prediction with kinship matrix only; for one or more phenotypes + if (cPar.a_mode==43) { + //first, use individuals with full phenotypes to obtain estimates of Vg and Ve + gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); + gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); + gsl_vector *eval=gsl_vector_alloc (Y->size1); + + gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph); + gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt); + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, Y, 0); + cPar.CopyCvtPhen (W_full, Y_full, 1); + + gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph); + gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1); + gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2); + + //read relatedness matrix G, and matrix G_full + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, Y, UtY); + + //calculate variance component and beta estimates + //and then obtain predicted values + if (cPar.n_ph==1) { + gsl_vector *beta=gsl_vector_alloc (W->size2); + gsl_vector *se_beta=gsl_vector_alloc (W->size2); + + double lambda, logl, vg, ve; + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + //obtain estimates + CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, lambda, logl); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, lambda, vg, ve, beta, se_beta); + + cout<<"REMLE estimate for vg in the null model = "<size1; i++) { + for (size_t j=0; j<=i; j++) { + cout<size1; i++) { + for (size_t j=0; j<=i; j++) { + cout<size1; i++) { + for (size_t j=i; jsize2; j++) { + cPar.Vg_remle_null.push_back(gsl_matrix_get (Vg, i, j) ); + cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) ); + } + } + + //obtain Y_hat from fixed effects + gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat); + + //obtain H + KroneckerSym(G_full, Vg, H_full); + for (size_t i=0; isize1; i++) { + gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2); + gsl_matrix_add (&H_sub.matrix, Ve); + } + + //free matrices + gsl_matrix_free (Vg); + gsl_matrix_free (Ve); + gsl_matrix_free (B); + gsl_matrix_free (se_B); + } + + PRDT cPRDT; + + cPRDT.CopyFromParam(cPar); + + cout<<"Predicting Missing Phentypes ... "<size1, cPar.n_cvt); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, Y, 0); + + //Fit LM or mvLM + if (cPar.n_ph==1) { + LM cLm; + cLm.CopyFromParam(cPar); + + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); + + if (!cPar.file_gene.empty()) { + cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype + } else if (!cPar.file_bfile.empty()) { + cLm.AnalyzePlink (W, &Y_col.vector); + } else { + cLm.AnalyzeBimbam (W, &Y_col.vector); + } + + cLm.WriteFiles(); + cLm.CopyToParam(cPar); + } + /* + else { + MVLM cMvlm; + cMvlm.CopyFromParam(cPar); + + if (!cPar.file_bfile.empty()) { + cMvlm.AnalyzePlink (W, Y); + } else { + cMvlm.AnalyzeBimbam (W, Y); + } + + cMvlm.WriteFiles(); + cMvlm.CopyToParam(cPar); + } + */ + //release all matrices and vectors + gsl_matrix_free (Y); + gsl_matrix_free (W); + } + + + //VC estimation with one or multiple kinship matrices + //REML approach only + //if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp + //for one phenotype only; + if (cPar.a_mode==61) { + gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc ); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, Y, 0); + + //read kinship matrices + if (!(cPar.file_mk).empty()) { + ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1, G->size1, G->size1); + CenterMatrix (&G_sub.matrix); + d=0; + for (size_t j=0; jsize1; j++) { + d+=gsl_matrix_get (&G_sub.matrix, j, j); + } + d/=(double)G->size1; + (cPar.v_traceG).push_back(d); + } + } else if (!(cPar.file_kin).empty()) { + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size1; j++) { + d+=gsl_matrix_get (G, j, j); + } + d/=(double)G->size1; + (cPar.v_traceG).push_back(d); + } + /* + //eigen-decomposition and calculate trace_G + cout<<"Start Eigen-Decomposition..."<size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + } else { + ReadFile_eigenU (cPar.file_ku, cPar.error, U); + if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) { + if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} + cPar.trace_G+=gsl_vector_get(eval, i); + } + cPar.trace_G/=(double)eval->size; + } + */ + //fit multiple variance components + if (cPar.n_ph==1) { + // if (cPar.n_vc==1) { + /* + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, Y, UtY); + + gsl_vector_view beta=gsl_matrix_row (B, 0); + gsl_vector_view se_beta=gsl_matrix_row (se_B, 0); + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector); + + cPar.beta_mle_null.clear(); + cPar.se_beta_mle_null.clear(); + for (size_t i=0; isize2; i++) { + cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) ); + cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) ); + } + + CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector); + cPar.beta_remle_null.clear(); + cPar.se_beta_remle_null.clear(); + for (size_t i=0; isize2; i++) { + cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); + cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); + } + + CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); + cPar.PrintSummary(); + + //calculate and output residuals + if (cPar.a_mode==5) { + gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); + gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1); + gsl_vector *u_hat=gsl_vector_alloc (Y->size1); + gsl_vector *e_hat=gsl_vector_alloc (Y->size1); + gsl_vector *y_hat=gsl_vector_alloc (Y->size1); + + //obtain Utu and Ute + gsl_vector_memcpy (y_hat, &UtY_col.vector); + gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); + + double d, u, e; + for (size_t i=0; isize; i++) { + d=gsl_vector_get (eval, i); + u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); + e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); + gsl_vector_set (Utu_hat, i, u); + gsl_vector_set (Ute_hat, i, e); + } + + //obtain u and e + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); + + //output residuals + cPar.WriteVector(u_hat, "residU"); + cPar.WriteVector(e_hat, "residE"); + + gsl_vector_free(u_hat); + gsl_vector_free(e_hat); + gsl_vector_free(y_hat); + } +*/ + // } else { + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); + VC cVc; + cVc.CopyFromParam(cPar); + cVc.CalcVCreml (G, W, &Y_col.vector); + cVc.CopyToParam(cPar); + + //obtain pve from sigma2 + //obtain se_pve from se_sigma2 + + //} + } + + + } + + + //LMM or mvLMM or Eigen-Decomposition + if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==31) { //Fit LMM or mvLMM or eigen + gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + gsl_matrix *B=gsl_matrix_alloc (Y->size2, W->size2); //B is a d by c matrix + gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2); + gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); + gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); + gsl_vector *eval=gsl_vector_alloc (Y->size1); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, Y, 0); + + //read relatedness matrix G + if (!(cPar.file_kin).empty()) { + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + } else { + ReadFile_eigenU (cPar.file_ku, cPar.error, U); + if (cPar.error==true) {cout<<"error! fail to read the U file. "<size; i++) { + if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} + cPar.trace_G+=gsl_vector_get(eval, i); + } + cPar.trace_G/=(double)eval->size; + } + + if (cPar.a_mode==31) { + cPar.WriteMatrix(U, "eigenU"); + cPar.WriteVector(eval, "eigenD"); + } else { + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, Y, UtY); + + //calculate REMLE/MLE estimate and pve for univariate model + if (cPar.n_ph==1) { + gsl_vector_view beta=gsl_matrix_row (B, 0); + gsl_vector_view se_beta=gsl_matrix_row (se_B, 0); + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector); + + cPar.beta_mle_null.clear(); + cPar.se_beta_mle_null.clear(); + for (size_t i=0; isize2; i++) { + cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) ); + cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) ); + } + + CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); + CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector); + cPar.beta_remle_null.clear(); + cPar.se_beta_remle_null.clear(); + for (size_t i=0; isize2; i++) { + cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); + cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); + } + + CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); + cPar.PrintSummary(); + + //calculate and output residuals + if (cPar.a_mode==5) { + gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); + gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1); + gsl_vector *u_hat=gsl_vector_alloc (Y->size1); + gsl_vector *e_hat=gsl_vector_alloc (Y->size1); + gsl_vector *y_hat=gsl_vector_alloc (Y->size1); + + //obtain Utu and Ute + gsl_vector_memcpy (y_hat, &UtY_col.vector); + gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); + + double d, u, e; + for (size_t i=0; isize; i++) { + d=gsl_vector_get (eval, i); + u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); + e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i); + gsl_vector_set (Utu_hat, i, u); + gsl_vector_set (Ute_hat, i, e); + } + + //obtain u and e + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); + + //output residuals + cPar.WriteVector(u_hat, "residU"); + cPar.WriteVector(e_hat, "residE"); + + gsl_vector_free(u_hat); + gsl_vector_free(e_hat); + gsl_vector_free(y_hat); + } + } + + //Fit LMM or mvLMM + if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) { + if (cPar.n_ph==1) { + LMM cLmm; + cLmm.CopyFromParam(cPar); + + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + if (!cPar.file_gene.empty()) { + cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype + } else if (!cPar.file_bfile.empty()) { + cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } else { + cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } + + cLmm.WriteFiles(); + cLmm.CopyToParam(cPar); + } else { + MVLMM cMvlmm; + cMvlmm.CopyFromParam(cPar); + + if (!cPar.file_bfile.empty()) { + cMvlmm.AnalyzePlink (U, eval, UtW, UtY); + } else { + cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY); + } + + cMvlmm.WriteFiles(); + cMvlmm.CopyToParam(cPar); + } + } + } + + + //release all matrices and vectors + gsl_matrix_free (Y); + gsl_matrix_free (W); + gsl_matrix_free(B); + gsl_matrix_free(se_B); + gsl_matrix_free (G); + gsl_matrix_free (U); + gsl_matrix_free (UtW); + gsl_matrix_free (UtY); + gsl_vector_free (eval); + } + + + //BSLMM + if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { + gsl_vector *y=gsl_vector_alloc (cPar.ni_test); + gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (y->size, y->size); + gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test); + + //set covariates matrix W and phenotype vector y + //an intercept should be included in W, + cPar.CopyCvtPhen (W, y, 0); + + //center y, even for case/control data + cPar.pheno_mean=CenterVector(y); + + //run bslmm if rho==1 + if (cPar.rho_min==1 && cPar.rho_max==1) { + //read genotypes X (not UtX) + cPar.ReadGenotypes (UtX, G, false); + + //perform BSLMM analysis + BSLMM cBslmm; + cBslmm.CopyFromParam(cPar); + time_start=clock(); + cBslmm.MCMC(UtX, y); + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cBslmm.CopyToParam(cPar); + //else, if rho!=1 + } else { + gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); + gsl_vector *eval=gsl_vector_alloc (y->size); + gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2); + gsl_vector *Uty=gsl_vector_alloc (y->size); + + + //read relatedness matrix G + if (!(cPar.file_kin).empty()) { + cPar.ReadGenotypes (UtX, G, false); + + //read relatedness matrix G + ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); + if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<size; i++) { + if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} + cPar.trace_G+=gsl_vector_get (eval, i); + } + cPar.trace_G/=(double)eval->size; + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, y, Uty); + + //calculate REMLE/MLE estimate and pve + CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); + CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); + CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); + + cPar.PrintSummary(); + + //Creat and calcualte UtX, use a large memory + cout<<"Calculating UtX..."<tm_month<<":"<tm_day":"<tm_hour<<":"<tm_min<=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) { + outfile<<"## time on optimization = "<. +*/ + +#ifndef __GEMMA_H__ +#define __GEMMA_H__ + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + +using namespace std; + +class GEMMA { + +public: + //parameters + string version; + string date; + string year; + + //constructor + GEMMA(void); + + //functions + void PrintHeader (void); + void PrintHelp (size_t option); + void PrintLicense (void); + void Assign (int argc, char **argv, PARAM &cPar); + void BatchRun (PARAM &cPar); + void WriteLog (int argc, char **argv, PARAM &cPar); +}; + + +#endif + diff --git a/gzstream.cpp b/gzstream.cpp new file mode 100644 index 0000000..bbb4ba8 --- /dev/null +++ b/gzstream.cpp @@ -0,0 +1,165 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.C +// Revision : $Revision: 1.7 $ +// Revision_date : $Date: 2003/01/08 14:41:27 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#include "gzstream.h" +#include +#include // for memcpy + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { + if ( is_open()) + return (gzstreambuf*)0; + mode = open_mode; + // no append nor read/write mode + if ((mode & std::ios::ate) || (mode & std::ios::app) + || ((mode & std::ios::in) && (mode & std::ios::out))) + return (gzstreambuf*)0; + char fmode[10]; + char* fmodeptr = fmode; + if ( mode & std::ios::in) + *fmodeptr++ = 'r'; + else if ( mode & std::ios::out) + *fmodeptr++ = 'w'; + *fmodeptr++ = 'b'; + *fmodeptr = '\0'; + file = gzopen( name, fmode); + if (file == 0) + return (gzstreambuf*)0; + opened = 1; + return this; +} + +gzstreambuf * gzstreambuf::close() { + if ( is_open()) { + sync(); + opened = 0; + if ( gzclose( file) == Z_OK) + return this; + } + return (gzstreambuf*)0; +} + +int gzstreambuf::underflow() { // used for input buffer only + if ( gptr() && ( gptr() < egptr())) + return * reinterpret_cast( gptr()); + + if ( ! (mode & std::ios::in) || ! opened) + return EOF; + // Josuttis' implementation of inbuf + int n_putback = gptr() - eback(); + if ( n_putback > 4) + n_putback = 4; + memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + + int num = gzread( file, buffer+4, bufferSize-4); + if (num <= 0) // ERROR or EOF + return EOF; + + // reset buffer pointers + setg( buffer + (4 - n_putback), // beginning of putback area + buffer + 4, // read position + buffer + 4 + num); // end of buffer + + // return next character + return * reinterpret_cast( gptr()); +} + +int gzstreambuf::flush_buffer() { + // Separate the writing of the buffer from overflow() and + // sync() operation. + int w = pptr() - pbase(); + if ( gzwrite( file, pbase(), w) != w) + return EOF; + pbump( -w); + return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only + if ( ! ( mode & std::ios::out) || ! opened) + return EOF; + if (c != EOF) { + *pptr() = c; + pbump(1); + } + if ( flush_buffer() == EOF) + return EOF; + return c; +} + +int gzstreambuf::sync() { + // Changed to use flush_buffer() instead of overflow( EOF) + // which caused improper behavior with std::endl and flush(), + // bug reported by Vincent Ricard. + if ( pptr() && pptr() > pbase()) { + if ( flush_buffer() == EOF) + return -1; + } + return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { + init( &buf); + open( name, mode); +} + +gzstreambase::~gzstreambase() { + buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { + if ( ! buf.open( name, open_mode)) + clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { + if ( buf.is_open()) + if ( ! buf.close()) + clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff --git a/gzstream.h b/gzstream.h new file mode 100644 index 0000000..861653f --- /dev/null +++ b/gzstream.h @@ -0,0 +1,121 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+256; // size of data buff + // totals 512 bytes under g++ for igzstream at the end. + + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); +public: + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/io.cpp b/io.cpp new file mode 100644 index 0000000..c22f668 --- /dev/null +++ b/io.cpp @@ -0,0 +1,1396 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_cdf.h" + +#include "lapack.h" +#include "gzstream.h" +#include "mathfunc.h" + +#ifdef FORCE_FLOAT +#include "io_float.h" +#else +#include "io.h" +#endif + + +using namespace std; + + + +//Print process bar +void ProgressBar (string str, double p, double total) +{ + double progress = (100.0 * p / total); + int barsize = (int) (progress / 2.0); + char bar[51]; + + cout<sbumpc(); + switch (c) { + case '\n': + return is; + case '\r': + if(sb->sgetc() == '\n') + sb->sbumpc(); + return is; + case EOF: + // Also handle the case when the last line has no line ending + if(t.empty()) + is.setstate(std::ios::eofbit); + return is; + default: + t += (char)c; + } + } +} + +//Read snp file +bool ReadFile_snps (const string &file_snps, set &setSnps) +{ + setSnps.clear(); + + ifstream infile (file_snps.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open snps file: "< &mapRS2chr, map &mapRS2bp, map &mapRS2cM) +{ + mapRS2chr.clear(); + mapRS2bp.clear(); + + ifstream infile (file_anno.c_str(), ifstream::in); + if (!infile) {cout<<"error opening annotation file: "< &indicator_idv, vector &pheno, const int &p_column) +{ + indicator_idv.clear(); + pheno.clear(); + + igzstream infile (file_pheno.c_str(), igzstream::in); +// ifstream infile (file_pheno.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open phenotype file: "< > &indicator_pheno, vector > &pheno, const vector &p_column) +{ + indicator_pheno.clear(); + pheno.clear(); + + igzstream infile (file_pheno.c_str(), igzstream::in); +// ifstream infile (file_pheno.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open phenotype file: "< pheno_row; + vector ind_pheno_row; + + size_t p_max=*max_element(p_column.begin(), p_column.end() ); + map mapP2c; + for (size_t i=0; i &indicator_cvt, vector > &cvt, size_t &n_cvt) +{ + indicator_cvt.clear(); + + ifstream infile (file_cvt.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open covariates file: "< v_d; flag_na=0; + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + while (ch_ptr!=NULL) { + if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;} + else {d=atof(ch_ptr);} + + v_d.push_back(d); + ch_ptr=strtok (NULL, " , \t"); + } + if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} + cvt.push_back(v_d); + } + + if (indicator_cvt.empty()) {n_cvt=0;} + else { + flag_na=0; + for (vector::size_type i=0; i &snpInfo) +{ + snpInfo.clear(); + + ifstream infile (file_bim.c_str(), ifstream::in); + if (!infile) {cout<<"error opening .bim file: "< > &indicator_pheno, vector > &pheno, map &mapID2num, const vector &p_column) +{ + indicator_pheno.clear(); + pheno.clear(); + mapID2num.clear(); + + igzstream infile (file_fam.c_str(), igzstream::in); + //ifstream infile (file_fam.c_str(), ifstream::in); + if (!infile) {cout<<"error opening .fam file: "< pheno_row; + vector ind_pheno_row; + + size_t p_max=*max_element(p_column.begin(), p_column.end() ); + map mapP2c; + for (size_t i=0; i &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map &mapRS2chr, map &mapRS2bp, map &mapRS2cM, vector &snpInfo, size_t &ns_test) +{ + indicator_snp.clear(); + snpInfo.clear(); + + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + double v_x, v_w; + int c_idv=0; + + string line; + char *ch_ptr; + + string rs; + long int b_pos; + string chr; + string major; + string minor; + double cM; + + double maf, geno, geno_old; + size_t n_miss; + size_t n_0, n_1, n_2; + int flag_poly; + + int ni_total=indicator_idv.size(); + int ni_test=0; + for (int i=0; i=0 && geno<=0.5) {n_0++;} + if (geno>0.5 && geno<1.5) {n_1++;} + if (geno>=1.5 && geno<=2.0) {n_2++;} + + gsl_vector_set (genotype, c_idv, geno); + +// if (geno<0) {n_miss++; continue;} + + if (flag_poly==0) {geno_old=geno; flag_poly=2;} + if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} + + maf+=geno; + + c_idv++; + } + maf/=2.0*(double)(ni_test-n_miss); + + SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf}; + snpInfo.push_back(sInfo); + + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} + + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + + if (flag_poly!=1) {indicator_snp.push_back(0); continue;} + + if (hwe_level!=0) { + if (CalcHWE(n_0, n_2, n_1)size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + } + + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot (genotype, genotype, &v_x); + gsl_blas_ddot (Wtx, WtWiWtx, &v_w); + + if (v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} + + indicator_snp.push_back(1); + ns_test++; + } + + gsl_vector_free (genotype); + gsl_vector_free (genotype_miss); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (Wtx); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); + + infile.close(); + infile.clear(); + + return true; +} + + + + + + +//Read bed file, the first time +bool ReadFile_bed (const string &file_bed, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) +{ + indicator_snp.clear(); + size_t ns_total=snpInfo.size(); + + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + double v_x, v_w, geno; + size_t c_idv=0; + + char ch[1]; + bitset<8> b; + + size_t ni_total=indicator_idv.size(); + size_t ni_test=0; + for (size_t i=0; i miss_level) {indicator_snp.push_back(0); continue;} + + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + + if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;} + + if (hwe_level!=1) { + if (CalcHWE(n_0, n_2, n_1)size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + } + + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot (genotype, genotype, &v_x); + gsl_blas_ddot (Wtx, WtWiWtx, &v_w); + + if (v_w/v_x > r2_level) {indicator_snp.push_back(0); continue;} + + indicator_snp.push_back(1); + ns_test++; + } + + gsl_vector_free (genotype); + gsl_vector_free (genotype_miss); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (Wtx); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); + + infile.close(); + infile.clear(); + + return true; +} + + + +void ReadFile_kin (const string &file_kin, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) +{ + igzstream infile (file_kin.c_str(), igzstream::in); +// ifstream infile (file_kin.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open kinship file: "< mapID2ID; + size_t c=0; + for (size_t i=0; isize1, G->size1, G->size1); + ReadFile_kin (file_kin, indicator_idv, mapID2num, k_mode, error, &G_sub.matrix); + i++; + } + + infile.close(); + infile.clear(); + return; +} + + +void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) +{ + igzstream infile (file_ku.c_str(), igzstream::in); +// ifstream infile (file_ku.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open the U file: "<size1, n_col=U->size2, i_row=0, i_col=0; + + gsl_matrix_set_zero (U); + + string line; + char *ch_ptr; + double d; + + while (getline(infile, line)) { + if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<size, i_row=0; + + gsl_vector_set_zero (eval); + + string line; + char *ch_ptr; + double d; + + while (getline(infile, line)) { + if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<size1; + gsl_vector *geno=gsl_vector_alloc (ni_total); + gsl_vector *geno_miss=gsl_vector_alloc (ni_total); + + size_t ns_test=0; + for (size_t t=0; t &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + size_t n_miss, ci_total; + double d, geno_mean, geno_var; + + size_t ni_total=matrix_kin->size1; + gsl_vector *geno=gsl_vector_alloc (ni_total); + + size_t ns_test=0; + int n_bit; + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (size_t t=0; t &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) +{ + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (UtX->size1); + double geno, geno_mean; + size_t n_miss; + + int ni_total=(int)indicator_idv.size(); + int ns_total=(int)indicator_snp.size(); + int ni_test=UtX->size1; + int ns_test=UtX->size2; + + int c_idv=0, c_snp=0; + + for (int i=0; isize; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=0;} + else {geno=gsl_vector_get (genotype, i); geno-=geno_mean;} + + gsl_vector_set (genotype, i, geno); + gsl_matrix_set (UtX, i, c_snp, geno); + } + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; isize; ++i) { + for (size_t j=0; j &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + int ni_total=(int)indicator_idv.size(); + int ns_total=(int)indicator_snp.size(); + int ni_test=UtX->size1; + int ns_test=UtX->size2; + int n_bit; + + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1;} + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + if (calc_K==true) {gsl_matrix_set_zero (K);} + + gsl_vector *genotype=gsl_vector_alloc (UtX->size1); + + double geno, geno_mean; + size_t n_miss; + int c_idv=0, c_snp=0, c=0; + + //start reading snps and doing association test + for (int t=0; tsize; ++i) { + geno=gsl_vector_get (genotype, i); + if (geno==-9) {geno=0;} + else {geno-=geno_mean;} + + gsl_vector_set (genotype, i, geno); + gsl_matrix_set (UtX, i, c_snp, geno); + } + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; isize; ++i) { + for (size_t j=0; j &est_column, map &mapRS2est) +{ + mapRS2est.clear(); + + ifstream infile (file_est.c_str(), ifstream::in); + if (!infile) {cout<<"error opening estimated parameter file: "<(infile), istreambuf_iterator(), '\n'); + infile.seekg (0, ios::beg); + + return true; +} + + + +//Read gene expression file +bool ReadFile_gene (const string &file_gene, vector &vec_read, vector &snpInfo, size_t &ng_total) +{ + vec_read.clear(); + ng_total=0; + + ifstream infile (file_gene.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open gene expression file: "<. +*/ + +#ifndef __IO_H__ +#define __IO_H__ + + +#include +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + +using namespace std; + +void ProgressBar (string str, double p, double total); +void ProgressBar (string str, double p, double total, double ratio); +std::istream& safeGetline(std::istream& is, std::string& t); + +bool ReadFile_snps (const string &file_snps, set &setSnps); +bool ReadFile_log (const string &file_log, double &pheno_mean); + +bool ReadFile_bim (const string &file_bim, vector &snpInfo); +bool ReadFile_fam (const string &file_fam, vector > &indicator_pheno, vector > &pheno, map &mapID2num, const vector &p_column); + +bool ReadFile_cvt (const string &file_cvt, vector &indicator_cvt, vector > &cvt, size_t &n_cvt); +bool ReadFile_anno (const string &file_bim, map &mapRS2chr, map &mapRS2bp, map &mapRS2cM); +bool ReadFile_pheno (const string &file_pheno, vector > &indicator_pheno, vector > &pheno, const vector &p_column); +bool ReadFile_column (const string &file_pheno, vector &indicator_idv, vector &pheno, const int &p_column); + +bool ReadFile_geno (const string &file_geno, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map &mapRS2chr, map &mapRS2bp, map &mapRS2cM, vector &snpInfo, size_t &ns_test); +bool ReadFile_bed (const string &file_bed, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test); + +void ReadFile_kin (const string &file_kin, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); +void ReadFile_mk (const string &file_mk, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); +void ReadFile_eigenU (const string &file_u, bool &error, gsl_matrix *U); +void ReadFile_eigenD (const string &file_d, bool &error, gsl_vector *eval); + +bool BimbamKin (const string &file_geno, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); +bool PlinkKin (const string &file_bed, vector &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); + +bool ReadFile_geno (const string &file_geno, vector &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); +bool ReadFile_bed (const string &file_bed, vector &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + +bool ReadFile_est (const string &file_est, const vector &est_column, map &mapRS2est); + +bool CountFileLines (const string &file_input, size_t &n_lines); + +bool ReadFile_gene (const string &file_gene, vector &vec_read, vector &snpInfo, size_t &ng_total); + +#endif + + + + + + + diff --git a/lapack.cpp b/lapack.cpp new file mode 100644 index 0000000..83d5290 --- /dev/null +++ b/lapack.cpp @@ -0,0 +1,609 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" + +using namespace std; + +extern "C" void sgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, float *ALPHA, float *A, int *LDA, float *B, int *LDB, float *BETA, float *C, int *LDC); +extern "C" void spotrf_(char *UPLO, int *N, float *A, int *LDA, int *INFO); +extern "C" void spotrs_(char *UPLO, int *N, int *NRHS, float *A, int *LDA, float *B, int *LDB, int *INFO); +extern "C" void ssyev_(char* JOBZ, char* UPLO, int *N, float *A, int *LDA, float *W, float *WORK, int *LWORK, int *INFO); +extern "C" void ssyevr_(char* JOBZ, char *RANGE, char* UPLO, int *N, float *A, int *LDA, float *VL, float *VU, int *IL, int *IU, float *ABSTOL, int *M, float *W, float *Z, int *LDZ, int *ISUPPZ, float *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO); + +extern "C" void dgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double *ALPHA, double *A, int *LDA, double *B, int *LDB, double *BETA, double *C, int *LDC); +extern "C" void dpotrf_(char *UPLO, int *N, double *A, int *LDA, int *INFO); +extern "C" void dpotrs_(char *UPLO, int *N, int *NRHS, double *A, int *LDA, double *B, int *LDB, int *INFO); +extern "C" void dsyev_(char* JOBZ, char* UPLO, int *N, double *A, int *LDA, double *W, double *WORK, int *LWORK, int *INFO); +extern "C" void dsyevr_(char* JOBZ, char *RANGE, char* UPLO, int *N, double *A, int *LDA, double *VL, double *VU, int *IL, int *IU, double *ABSTOL, int *M, double *W, double *Z, int *LDZ, int *ISUPPZ, double *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO); + + +//cholesky decomposition, A is distroyed +void lapack_float_cholesky_decomp (gsl_matrix_float *A) +{ + int N=A->size1, LDA=A->size1, INFO; + char UPLO='L'; + + if (N!=(int)A->size2) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_decomp."<data, &LDA, &INFO); + if (INFO!=0) {cout<<"Cholesky decomposition unsuccessful in lapack_cholesky_decomp."<size1, LDA=A->size1, INFO; + char UPLO='L'; + + if (N!=(int)A->size2) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_decomp."<data, &LDA, &INFO); + if (INFO!=0) {cout<<"Cholesky decomposition unsuccessful in lapack_cholesky_decomp."<size1, NRHS=1, LDA=A->size1, LDB=b->size, INFO; + char UPLO='L'; + + if (N!=(int)A->size2 || N!=LDB) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_solve."<data, &LDA, x->data, &LDB, &INFO); + if (INFO!=0) {cout<<"Cholesky solve unsuccessful in lapack_cholesky_solve."<size1, NRHS=1, LDA=A->size1, LDB=b->size, INFO; + char UPLO='L'; + + if (N!=(int)A->size2 || N!=LDB) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_solve."<data, &LDA, x->data, &LDB, &INFO); + if (INFO!=0) {cout<<"Cholesky solve unsuccessful in lapack_cholesky_solve."<size1, LDB=B->size1, LDC=C->size2; + + if (*TransA=='N' || *TransA=='n') {M=A->size1; K1=A->size2;} + else if (*TransA=='T' || *TransA=='t') {M=A->size2; K1=A->size1;} + else {cout<<"need 'N' or 'T' in lapack_sgemm"<size2; K2=B->size1;} + else if (*TransB=='T' || *TransB=='t') {N=B->size1; K2=B->size2;} + else {cout<<"need 'N' or 'T' in lapack_sgemm"<size1!=(size_t)M || C->size2!=(size_t)N) {cout<<"C not compatible in lapack_sgemm"<size2, A->size1); + gsl_matrix_float_transpose_memcpy (A_t, A); + gsl_matrix_float *B_t=gsl_matrix_float_alloc (B->size2, B->size1); + gsl_matrix_float_transpose_memcpy (B_t, B); + gsl_matrix_float *C_t=gsl_matrix_float_alloc (C->size2, C->size1); + gsl_matrix_float_transpose_memcpy (C_t, C); + + sgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC); + gsl_matrix_float_transpose_memcpy (C, C_t); + + gsl_matrix_float_free (A_t); + gsl_matrix_float_free (B_t); + gsl_matrix_float_free (C_t); + return; +} + + + +void lapack_dgemm (char *TransA, char *TransB, double alpha, const gsl_matrix *A, const gsl_matrix *B, double beta, gsl_matrix *C) +{ + int M, N, K1, K2, LDA=A->size1, LDB=B->size1, LDC=C->size2; + + if (*TransA=='N' || *TransA=='n') {M=A->size1; K1=A->size2;} + else if (*TransA=='T' || *TransA=='t') {M=A->size2; K1=A->size1;} + else {cout<<"need 'N' or 'T' in lapack_dgemm"<size2; K2=B->size1;} + else if (*TransB=='T' || *TransB=='t') {N=B->size1; K2=B->size2;} + else {cout<<"need 'N' or 'T' in lapack_dgemm"<size1!=(size_t)M || C->size2!=(size_t)N) {cout<<"C not compatible in lapack_dgemm"<size2, A->size1); + gsl_matrix_transpose_memcpy (A_t, A); + gsl_matrix *B_t=gsl_matrix_alloc (B->size2, B->size1); + gsl_matrix_transpose_memcpy (B_t, B); + gsl_matrix *C_t=gsl_matrix_alloc (C->size2, C->size1); + gsl_matrix_transpose_memcpy (C_t, C); + + dgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC); + + gsl_matrix_transpose_memcpy (C, C_t); + + gsl_matrix_free (A_t); + gsl_matrix_free (B_t); + gsl_matrix_free (C_t); + return; +} + + + +//eigen value decomposition, matrix A is destroyed, float seems to have problem with large matrices (in mac) +void lapack_float_eigen_symmv (gsl_matrix_float *A, gsl_vector_float *eval, gsl_matrix_float *evec, const size_t flag_largematrix) +{ + if (flag_largematrix==1) { + int N=A->size1, LDA=A->size1, INFO, LWORK=-1; + char JOBZ='V', UPLO='L'; + + if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<data, &LDA, eval->data, temp, &LWORK, &INFO); + // if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_eigen_symmv."<data, &LDA, eval->data, WORK, &LWORK, &INFO); + if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<size1, LDA=A->size1, LDZ=A->size1, INFO, LWORK=-1, LIWORK=-1; + char JOBZ='V', UPLO='L', RANGE='A'; + float ABSTOL=1.0E-7; + + //VL, VU, IL, IU are not referenced; M equals N if RANGE='A' + float VL=0.0, VU=0.0; + int IL=0, IU=0, M; + + if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_float_eigen_symmv."<data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK_temp, &LWORK, IWORK_temp, &LIWORK, &INFO); + if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_float_eigen_symmv."<data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK, &LWORK, IWORK, &LIWORK, &INFO); + if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_float_eigen_symmv."<size1, LDA=A->size1, INFO, LWORK=-1; + char JOBZ='V', UPLO='L'; + + if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<data, &LDA, eval->data, temp, &LWORK, &INFO); + // if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_eigen_symmv."<data, &LDA, eval->data, WORK, &LWORK, &INFO); + if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<size1, LDA=A->size1, LDZ=A->size1, INFO, LWORK=-1, LIWORK=-1; + char JOBZ='V', UPLO='L', RANGE='A'; + double ABSTOL=1.0E-7; + + //VL, VU, IL, IU are not referenced; M equals N if RANGE='A' + double VL=0.0, VU=0.0; + int IL=0, IU=0, M; + + if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK_temp, &LWORK, IWORK_temp, &LIWORK, &INFO); + if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_eigen_symmv."<data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK, &LWORK, IWORK, &LIWORK, &INFO); + if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<size1); + gsl_eigen_symmv (G, eval, U, w); + gsl_eigen_symmv_free (w); +#endif + /* + for (size_t i=0; isize; ++i) { + if (gsl_vector_get (eval, i)<1e-10) { +// cout<size; ++i) { + d+=gsl_vector_get(eval, i); + } + d/=(double)eval->size; + + return d; +} + + +//DO NOT set eigen values to be positive +double EigenDecomp (gsl_matrix_float *G, gsl_matrix_float *U, gsl_vector_float *eval, const size_t flag_largematrix) +{ +#ifdef WITH_LAPACK + lapack_float_eigen_symmv (G, eval, U, flag_largematrix); +#else + //gsl doesn't provide float precision eigen decomposition; plus, float precision eigen decomposition in lapack may not work on OS 10.4 + //first change to double precision + gsl_matrix *G_double=gsl_matrix_alloc (G->size1, G->size2); + gsl_matrix *U_double=gsl_matrix_alloc (U->size1, U->size2); + gsl_vector *eval_double=gsl_vector_alloc (eval->size); + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set(G_double, i, j, gsl_matrix_float_get(G, i, j)); + } + } + gsl_eigen_symmv_workspace *w_space=gsl_eigen_symmv_alloc (G->size1); + gsl_eigen_symmv (G_double, eval_double, U_double, w_space); + gsl_eigen_symmv_free (w_space); + + //change back to float precision + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set(K, i, j, gsl_matrix_get(G_double, i, j)); + } + } + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set(U, i, j, gsl_matrix_get(U_double, i, j)); + } + } + for (size_t i=0; isize; i++) { + gsl_vector_float_set(eval, i, gsl_vector_get(eval_double, i)); + } + + //delete double precision matrices + gsl_matrix_free (G_double); + gsl_matrix_free (U_double); + gsl_vector_free (eval_double); +#endif + /* + for (size_t i=0; isize; ++i) { + if (gsl_vector_float_get (eval, i)<1e-10) { + gsl_vector_float_set (eval, i, 0); + } + } + */ + //calculate track_G=mean(diag(G)) + double d=0.0; + for (size_t i=0; isize; ++i) { + d+=gsl_vector_float_get(eval, i); + } + d/=(double)eval->size; + + return d; +} + + +double CholeskySolve(gsl_matrix *Omega, gsl_vector *Xty, gsl_vector *OiXty) +{ + double logdet_O=0.0; + +#ifdef WITH_LAPACK + lapack_cholesky_decomp(Omega); + for (size_t i=0; isize1; ++i) { + logdet_O+=log(gsl_matrix_get (Omega, i, i)); + } + logdet_O*=2.0; + lapack_cholesky_solve(Omega, Xty, OiXty); +#else + int status = gsl_linalg_cholesky_decomp(Omega); + if(status == GSL_EDOM) { + cout << "## non-positive definite matrix" << endl; + // exit(0); + } + + for (size_t i=0; isize1; ++i) { + logdet_O+=log(gsl_matrix_get (Omega, i, i)); + } + logdet_O*=2.0; + + gsl_vector_memcpy (OiXty, Xty); + gsl_blas_dtrsv(CblasLower, CblasNoTrans, CblasNonUnit, Omega, OiXty); + gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, OiXty); + // gsl_linalg_cholesky_solve(XtX, Xty, iXty); +#endif + + return logdet_O; +} + + +double CholeskySolve(gsl_matrix_float *Omega, gsl_vector_float *Xty, gsl_vector_float *OiXty) +{ + double logdet_O=0.0; + +#ifdef WITH_LAPACK + lapack_float_cholesky_decomp(Omega); + for (size_t i=0; isize1; ++i) { + logdet_O+=log(gsl_matrix_float_get (Omega, i, i)); + } + logdet_O*=2.0; + lapack_float_cholesky_solve(Omega, Xty, OiXty); +#else + gsl_matrix *Omega_double=gsl_matrix_alloc (Omega->size1, Omega->size2); + double d; + for (size_t i=0; isize1; ++i) { + for (size_t j=0; jsize2; ++j) { + d=(double)gsl_matrix_float_get (Omega, i, j); + gsl_matrix_set (Omega_double, i, j, d); + } + } + + int status = gsl_linalg_cholesky_decomp(Omega_double); + if(status == GSL_EDOM) { + cout << "## non-positive definite matrix" << endl; + // exit(0); + } + + for (size_t i=0; isize1; ++i) { + for (size_t j=0; jsize2; ++j) { + d=gsl_matrix_get (Omega_double, i, j); + if (j==i) {logdet_O+=log(d);} + gsl_matrix_float_set (Omega, i, j, (float)d); + } + } + logdet_O*=2.0; + + gsl_vector_float_memcpy (OiXty, Xty); + gsl_blas_strsv(CblasLower, CblasNoTrans, CblasNonUnit, Omega, OiXty); + gsl_blas_strsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, OiXty); + // gsl_linalg_cholesky_solve(XtX, Xty, iXty); + + gsl_matrix_free (Omega_double); +#endif + + return logdet_O; +} + + +//LU decomposition +void LUDecomp (gsl_matrix *LU, gsl_permutation *p, int *signum) +{ + gsl_linalg_LU_decomp (LU, p, signum); + return; +} + +void LUDecomp (gsl_matrix_float *LU, gsl_permutation *p, int *signum) +{ + gsl_matrix *LU_double=gsl_matrix_alloc (LU->size1, LU->size2); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set (LU_double, i, j, gsl_matrix_float_get(LU, i, j)); + } + } + + //LU decomposition + gsl_linalg_LU_decomp (LU_double, p, signum); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set (LU, i, j, gsl_matrix_get(LU_double, i, j)); + } + } + + //free matrix + gsl_matrix_free (LU_double); + return; +} + + +//LU invert +void LUInvert (const gsl_matrix *LU, const gsl_permutation *p, gsl_matrix *inverse) +{ + gsl_linalg_LU_invert (LU, p, inverse); + return; +} + +void LUInvert (const gsl_matrix_float *LU, const gsl_permutation *p, gsl_matrix_float *inverse) +{ + gsl_matrix *LU_double=gsl_matrix_alloc (LU->size1, LU->size2); + gsl_matrix *inverse_double=gsl_matrix_alloc (inverse->size1, inverse->size2); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set (LU_double, i, j, gsl_matrix_float_get(LU, i, j)); + } + } + + //LU decomposition + gsl_linalg_LU_invert (LU_double, p, inverse_double); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set (inverse, i, j, gsl_matrix_get(inverse_double, i, j)); + } + } + + //free matrix + gsl_matrix_free (LU_double); + gsl_matrix_free (inverse_double); + return; +} + +//LU lndet +double LULndet (gsl_matrix *LU) +{ + double d; + d=gsl_linalg_LU_lndet (LU); + return d; +} + +double LULndet (gsl_matrix_float *LU) +{ + gsl_matrix *LU_double=gsl_matrix_alloc (LU->size1, LU->size2); + double d; + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set (LU_double, i, j, gsl_matrix_float_get(LU, i, j)); + } + } + + //LU decomposition + d=gsl_linalg_LU_lndet (LU_double); + + //copy float matrix to double + /* + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_float_set (LU, i, j, gsl_matrix_get(LU_double, i, j)); + } + } + */ + //free matrix + gsl_matrix_free (LU_double); + return d; +} + + +//LU solve +void LUSolve (const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x) +{ + gsl_linalg_LU_solve (LU, p, b, x); + return; +} + +void LUSolve (const gsl_matrix_float *LU, const gsl_permutation *p, const gsl_vector_float *b, gsl_vector_float *x) +{ + gsl_matrix *LU_double=gsl_matrix_alloc (LU->size1, LU->size2); + gsl_vector *b_double=gsl_vector_alloc (b->size); + gsl_vector *x_double=gsl_vector_alloc (x->size); + + //copy float matrix to double + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_set (LU_double, i, j, gsl_matrix_float_get(LU, i, j)); + } + } + + for (size_t i=0; isize; i++) { + gsl_vector_set (b_double, i, gsl_vector_float_get(b, i)); + } + + for (size_t i=0; isize; i++) { + gsl_vector_set (x_double, i, gsl_vector_float_get(x, i)); + } + + //LU decomposition + gsl_linalg_LU_solve (LU_double, p, b_double, x_double); + + //copy float matrix to double + for (size_t i=0; isize; i++) { + gsl_vector_float_set (x, i, gsl_vector_get(x_double, i)); + } + + //free matrix + gsl_matrix_free (LU_double); + gsl_vector_free (b_double); + gsl_vector_free (x_double); + return; +} + + diff --git a/lapack.h b/lapack.h new file mode 100644 index 0000000..cb7b156 --- /dev/null +++ b/lapack.h @@ -0,0 +1,53 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __LAPACK_H__ +#define __LAPACK_H__ + + + +using namespace std; + + +void lapack_float_cholesky_decomp (gsl_matrix_float *A); +void lapack_cholesky_decomp (gsl_matrix *A); +void lapack_float_cholesky_solve (gsl_matrix_float *A, const gsl_vector_float *b, gsl_vector_float *x); +void lapack_cholesky_solve (gsl_matrix *A, const gsl_vector *b, gsl_vector *x); +void lapack_sgemm (char *TransA, char *TransB, float alpha, const gsl_matrix_float *A, const gsl_matrix_float *B, float beta, gsl_matrix_float *C); +void lapack_dgemm (char *TransA, char *TransB, double alpha, const gsl_matrix *A, const gsl_matrix *B, double beta, gsl_matrix *C); +void lapack_float_eigen_symmv (gsl_matrix_float *A, gsl_vector_float *eval, gsl_matrix_float *evec, const size_t flag_largematrix); +void lapack_eigen_symmv (gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec, const size_t flag_largematrix); + +double EigenDecomp (gsl_matrix *G, gsl_matrix *U, gsl_vector *eval, const size_t flag_largematrix); +double EigenDecomp (gsl_matrix_float *G, gsl_matrix_float *U, gsl_vector_float *eval, const size_t flag_largematrix); + +double CholeskySolve(gsl_matrix *Omega, gsl_vector *Xty, gsl_vector *OiXty); +double CholeskySolve(gsl_matrix_float *Omega, gsl_vector_float *Xty, gsl_vector_float *OiXty); + +void LUDecomp (gsl_matrix *LU, gsl_permutation *p, int *signum); +void LUDecomp (gsl_matrix_float *LU, gsl_permutation *p, int *signum); +void LUInvert (const gsl_matrix *LU, const gsl_permutation *p, gsl_matrix *inverse); +void LUInvert (const gsl_matrix_float *LU, const gsl_permutation *p, gsl_matrix_float *inverse); +double LULndet (gsl_matrix *LU); +double LULndet (gsl_matrix_float *LU); +void LUSolve (const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x); +void LUSolve (const gsl_matrix_float *LU, const gsl_permutation *p, const gsl_vector_float *b, gsl_vector_float *x); +#endif + + + diff --git a/lm.cpp b/lm.cpp new file mode 100644 index 0000000..c983253 --- /dev/null +++ b/lm.cpp @@ -0,0 +1,571 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + + +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" +#include "gsl/gsl_min.h" +#include "gsl/gsl_integration.h" + +#include "gzstream.h" +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "lm_float.h" +#else +#include "lm.h" +#endif + + +using namespace std; + + + + + +void LM::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + file_gene=cPar.file_gene; + + time_opt=0.0; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + ng_total=cPar.ng_total; + ng_test=0; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void LM::CopyToParam (PARAM &cPar) +{ + cPar.time_opt=time_opt; + + cPar.ng_test=ng_test; + + return; +} + + + +void LM::WriteFiles () +{ + string file_str; + file_str="./output/"+file_out; + file_str+=".assoc.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<::size_type t=0; tsize; + double d; + + gsl_vector *WtWiWtx=gsl_vector_alloc (c_size); + + gsl_blas_ddot (x, x, &xPwx); + gsl_blas_ddot (x, y, &xPwy); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + + gsl_blas_ddot (WtWiWtx, Wtx, &d); + xPwx-=d; + + gsl_blas_ddot (WtWiWtx, Wty, &d); + xPwy-=d; + + gsl_vector_free (WtWiWtx); + + return; +} + + +void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, double &yPwy) +{ + size_t c_size=Wty->size; + double d; + + gsl_vector *WtWiWty=gsl_vector_alloc (c_size); + + gsl_blas_ddot (y, y, &yPwy); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty); + + gsl_blas_ddot (WtWiWty, Wty, &d); + yPwy-=d; + + gsl_vector_free (WtWiWty); + + return; +} + + + +//calculate p values and beta/se in a linear model +void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, const double xPwx, const double df, const size_t n_size, double &beta, double &se, double &p_wald, double &p_lrt, double &p_score) +{ + double yPxy=yPwy-xPwy*xPwy/xPwx; + double se_wald, se_score; + + beta=xPwy/xPwx; + se_wald=sqrt(yPxy/(df*xPwx) ); + se_score=sqrt(yPwy/((double)n_size*xPwx) ); + + p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df); + p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df); + p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1); + + if (test_mode==3) {se=se_score;} else {se=se_wald;} + + return; +} + + + + +void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) +{ + ifstream infile (file_gene.c_str(), ifstream::in); + if (!infile) {cout<<"error reading gene expression file:"<size1-(double)W->size2-1.0; + + gsl_vector *y=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wtx, x, xPwx); + + //header + getline(infile, line); + + for (size_t t=0; tsize1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout<size1-(double)W->size2-1.0; + + gsl_vector *x=gsl_vector_alloc (W->size1); + gsl_vector *x_miss=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + //start reading genotypes and analyze + for (size_t t=0; t1) {break;} + getline(infile, line); + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< b; + + double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + + //calculate some basic quantities + double yPwy, xPwy, xPwx; + double df=(double)W->size1-(double)W->size2-1.0; + + gsl_vector *x=gsl_vector_alloc (W->size1); + + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wty=gsl_vector_alloc (W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty); + CalcvPv(WtWi, Wty, y, yPwy); + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + + for (vector::size_type t=0; t1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + + gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx); + CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); + LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< > &pos_loglr) +{ + double yty, xty, xtx, log_lr; + gsl_blas_ddot(y, y, &yty); + + for (size_t i=0; isize2; ++i) { + gsl_vector_const_view X_col=gsl_matrix_const_column (X, i); + gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx); + gsl_blas_ddot(&X_col.vector, y, &xty); + + log_lr=0.5*(double)y->size*(log(yty)-log(yty-xty*xty/xtx)); + pos_loglr.push_back(make_pair(i,log_lr) ); + } + + return; +} diff --git a/lm.h b/lm.h new file mode 100644 index 0000000..84a0322 --- /dev/null +++ b/lm.h @@ -0,0 +1,74 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#ifndef __LM_H__ +#define __LM_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + +class LM { + +public: + // IO related parameters + int a_mode; //analysis mode, 50+1/2/3/4 for Frequentist tests + size_t d_pace; //display pace + + string file_bfile; + string file_geno; + string file_out; + + string file_gene; + + // Summary statistics + size_t ni_total, ni_test; //number of individuals + size_t ns_total, ns_test; //number of snps + size_t ng_total, ng_test; //number of genes + size_t n_cvt; + double time_opt; //time spent + + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + vector sumStat; //Output SNPSummary Data + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void AnalyzeGene (const gsl_matrix *W, const gsl_vector *x); + void AnalyzePlink (const gsl_matrix *W, const gsl_vector *y); + void AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y); + void WriteFiles (); +}; +void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector > &pos_loglr); +#endif diff --git a/lmm.cpp b/lmm.cpp new file mode 100644 index 0000000..fed94ee --- /dev/null +++ b/lmm.cpp @@ -0,0 +1,1770 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + + +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" +#include "gsl/gsl_min.h" +#include "gsl/gsl_integration.h" + +#include "io.h" +#include "lapack.h" +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "lmm_float.h" +#else +#include "lmm.h" +#endif + + +using namespace std; + + + + + +void LMM::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + file_gene=cPar.file_gene; + + l_min=cPar.l_min; + l_max=cPar.l_max; + n_region=cPar.n_region; + l_mle_null=cPar.l_mle_null; + logl_mle_H0=cPar.logl_mle_H0; + + time_UtX=0.0; + time_opt=0.0; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + ng_total=cPar.ng_total; + ng_test=0; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void LMM::CopyToParam (PARAM &cPar) +{ + cPar.time_UtX=time_UtX; + cPar.time_opt=time_opt; + + cPar.ng_test=ng_test; + + return; +} + + + +void LMM::WriteFiles () +{ + string file_str; + file_str="./output/"+file_out; + file_str+=".assoc.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<::size_type t=0; tn_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + + +void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval, const gsl_matrix *Uab, const gsl_vector *ab, gsl_matrix *Pab) +{ + size_t index_ab, index_aw, index_bw, index_ww; + double p_ab; + double ps_ab, ps_aw, ps_bw, ps_ww; + + for (size_t p=0; p<=n_cvt+1; ++p) { + for (size_t a=p+1; a<=n_cvt+2; ++a) { + for (size_t b=a; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (a, b, n_cvt); + if (p==0) { + gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); + gsl_blas_ddot (Hi_eval, &Uab_col.vector, &p_ab); + if (e_mode!=0) {p_ab=gsl_vector_get (ab, index_ab)-p_ab;} + gsl_matrix_set (Pab, 0, index_ab, p_ab); + } + else { + index_aw=GetabIndex (a, p, n_cvt); + index_bw=GetabIndex (b, p, n_cvt); + index_ww=GetabIndex (p, p, n_cvt); + + ps_ab=gsl_matrix_get (Pab, p-1, index_ab); + ps_aw=gsl_matrix_get (Pab, p-1, index_aw); + ps_bw=gsl_matrix_get (Pab, p-1, index_bw); + ps_ww=gsl_matrix_get (Pab, p-1, index_ww); + + p_ab=ps_ab-ps_aw*ps_bw/ps_ww; + gsl_matrix_set (Pab, p, index_ab, p_ab); + } + } + } + } + return; +} + + +void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_eval, const gsl_matrix *Uab, const gsl_vector *ab, const gsl_matrix *Pab, gsl_matrix *PPab) +{ + size_t index_ab, index_aw, index_bw, index_ww; + double p2_ab; + double ps2_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww; + + for (size_t p=0; p<=n_cvt+1; ++p) { + for (size_t a=p+1; a<=n_cvt+2; ++a) { + for (size_t b=a; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (a, b, n_cvt); + if (p==0) { + gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); + gsl_blas_ddot (HiHi_eval, &Uab_col.vector, &p2_ab); + if (e_mode!=0) {p2_ab=p2_ab-gsl_vector_get (ab, index_ab)+2.0*gsl_matrix_get (Pab, 0, index_ab);} + gsl_matrix_set (PPab, 0, index_ab, p2_ab); + } + else { + index_aw=GetabIndex (a, p, n_cvt); + index_bw=GetabIndex (b, p, n_cvt); + index_ww=GetabIndex (p, p, n_cvt); + + ps2_ab=gsl_matrix_get (PPab, p-1, index_ab); + ps_aw=gsl_matrix_get (Pab, p-1, index_aw); + ps_bw=gsl_matrix_get (Pab, p-1, index_bw); + ps_ww=gsl_matrix_get (Pab, p-1, index_ww); + ps2_aw=gsl_matrix_get (PPab, p-1, index_aw); + ps2_bw=gsl_matrix_get (PPab, p-1, index_bw); + ps2_ww=gsl_matrix_get (PPab, p-1, index_ww); + + p2_ab=ps2_ab+ps_aw*ps_bw*ps2_ww/(ps_ww*ps_ww); + p2_ab-=(ps_aw*ps2_bw+ps_bw*ps2_aw)/ps_ww; + gsl_matrix_set (PPab, p, index_ab, p2_ab); + + } + } + } + } + return; +} + + +void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiHi_eval, const gsl_matrix *Uab, const gsl_vector *ab, const gsl_matrix *Pab, const gsl_matrix *PPab, gsl_matrix *PPPab) +{ + size_t index_ab, index_aw, index_bw, index_ww; + double p3_ab; + double ps3_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww, ps3_aw, ps3_bw, ps3_ww; + + for (size_t p=0; p<=n_cvt+1; ++p) { + for (size_t a=p+1; a<=n_cvt+2; ++a) { + for (size_t b=a; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (a, b, n_cvt); + if (p==0) { + gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab); + gsl_blas_ddot (HiHiHi_eval, &Uab_col.vector, &p3_ab); + if (e_mode!=0) {p3_ab=gsl_vector_get (ab, index_ab)-p3_ab+3.0*gsl_matrix_get (PPab, 0, index_ab)-3.0*gsl_matrix_get (Pab, 0, index_ab);} + gsl_matrix_set (PPPab, 0, index_ab, p3_ab); + } + else { + index_aw=GetabIndex (a, p, n_cvt); + index_bw=GetabIndex (b, p, n_cvt); + index_ww=GetabIndex (p, p, n_cvt); + + ps3_ab=gsl_matrix_get (PPPab, p-1, index_ab); + ps_aw=gsl_matrix_get (Pab, p-1, index_aw); + ps_bw=gsl_matrix_get (Pab, p-1, index_bw); + ps_ww=gsl_matrix_get (Pab, p-1, index_ww); + ps2_aw=gsl_matrix_get (PPab, p-1, index_aw); + ps2_bw=gsl_matrix_get (PPab, p-1, index_bw); + ps2_ww=gsl_matrix_get (PPab, p-1, index_ww); + ps3_aw=gsl_matrix_get (PPPab, p-1, index_aw); + ps3_bw=gsl_matrix_get (PPPab, p-1, index_bw); + ps3_ww=gsl_matrix_get (PPPab, p-1, index_ww); + + p3_ab=ps3_ab-ps_aw*ps_bw*ps2_ww*ps2_ww/(ps_ww*ps_ww*ps_ww); + p3_ab-=(ps_aw*ps3_bw+ps_bw*ps3_aw+ps2_aw*ps2_bw)/ps_ww; + p3_ab+=(ps_aw*ps2_bw*ps2_ww+ps_bw*ps2_aw*ps2_ww+ps_aw*ps_bw*ps3_ww)/(ps_ww*ps_ww); + + gsl_matrix_set (PPPab, p, index_ab, p3_ab); + } + } + } + } + return; +} + + + +double LogL_f (double l, void *params) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} + + double f=0.0, logdet_h=0.0, d; + size_t index_yy; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + for (size_t i=0; i<(p->eval)->size; ++i) { + d=gsl_vector_get (v_temp, i); + logdet_h+=log(fabs(d)); + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + + double c=0.5*(double)ni_test*(log((double)ni_test)-log(2*M_PI)-1.0); + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); + f=c-0.5*logdet_h-0.5*(double)ni_test*log(P_yy); + + gsl_matrix_free (Pab); + gsl_vector_free (Hi_eval); + gsl_vector_free (v_temp); + return f; +} + + + + + + +double LogL_dev1 (double l, void *params) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} + + double dev1=0.0, trace_Hi=0.0; + size_t index_yy; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + + if (p->e_mode!=0) {trace_Hi=(double)ni_test-trace_Hi;} + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + + double trace_HiK=((double)ni_test-trace_Hi)/l; + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); + double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); + double yPKPy=(P_yy-PP_yy)/l; + dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy; + + gsl_matrix_free (Pab); + gsl_matrix_free (PPab); + gsl_vector_free (Hi_eval); + gsl_vector_free (HiHi_eval); + gsl_vector_free (v_temp); + + return dev1; +} + + + + +double LogL_dev2 (double l, void *params) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} + + double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0; + size_t index_yy; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); + gsl_vector_mul (HiHiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + + double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l); + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); + double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); + + double yPKPy=(P_yy-PP_yy)/l; + double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); + + dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); + + gsl_matrix_free (Pab); + gsl_matrix_free (PPab); + gsl_matrix_free (PPPab); + gsl_vector_free (Hi_eval); + gsl_vector_free (HiHi_eval); + gsl_vector_free (HiHiHi_eval); + gsl_vector_free (v_temp); + + return dev2; +} + + + + + +void LogL_dev12 (double l, void *params, double *dev1, double *dev2) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;} + + double trace_Hi=0.0, trace_HiHi=0.0; + size_t index_yy; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); + gsl_vector_mul (HiHiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + + double trace_HiK=((double)ni_test-trace_Hi)/l; + double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l); + + index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + + double P_yy=gsl_matrix_get (Pab, nc_total, index_yy); + double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy); + double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy); + + double yPKPy=(P_yy-PP_yy)/l; + double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l); + + *dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy; + *dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy); + + gsl_matrix_free (Pab); + gsl_matrix_free (PPab); + gsl_matrix_free (PPPab); + gsl_vector_free (Hi_eval); + gsl_vector_free (HiHi_eval); + gsl_vector_free (HiHiHi_eval); + gsl_vector_free (v_temp); + + return; +} + + + +double LogRL_f (double l, void *params) +{ + FUNC_PARAM *p=(FUNC_PARAM *) params; + size_t n_cvt=p->n_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + double df; + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } + else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} + + double f=0.0, logdet_h=0.0, logdet_hiw=0.0, d; + size_t index_ww; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *Iab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + for (size_t i=0; i<(p->eval)->size; ++i) { + d=gsl_vector_get (v_temp, i); + logdet_h+=log(fabs(d)); + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + gsl_vector_set_all (v_temp, 1.0); + CalcPab (n_cvt, p->e_mode, v_temp, p->Uab, p->ab, Iab); + + //calculate |WHiW|-|WW| + logdet_hiw=0.0; + for (size_t i=0; in_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + double df; + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } + else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} + + double dev1=0.0, trace_Hi=0.0; + size_t index_ww; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + + //calculate tracePK and trace PKPK + double trace_P=trace_Hi; + double ps_ww, ps2_ww; + for (size_t i=0; in_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + double df; + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } + else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} + + double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0; + size_t index_ww; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); + gsl_vector_mul (HiHiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + + //calculate tracePK and trace PKPK + double trace_P=trace_Hi, trace_PP=trace_HiHi; + double ps_ww, ps2_ww, ps3_ww; + for (size_t i=0; in_cvt; + size_t ni_test=p->ni_test; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + double df; + size_t nc_total; + if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; } + else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;} + + double trace_Hi=0.0, trace_HiHi=0.0; + size_t index_ww; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size); + gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size); + + gsl_vector_memcpy (v_temp, p->eval); + gsl_vector_scale (v_temp, l); + if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + gsl_vector_memcpy (HiHi_eval, Hi_eval); + gsl_vector_mul (HiHi_eval, Hi_eval); + gsl_vector_memcpy (HiHiHi_eval, HiHi_eval); + gsl_vector_mul (HiHiHi_eval, Hi_eval); + + gsl_vector_set_all (v_temp, 1.0); + gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi); + gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi); + + if (p->e_mode!=0) { + trace_Hi=(double)ni_test-trace_Hi; + trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test; + } + + CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab); + CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab); + CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab); + + //calculate tracePK and trace PKPK + double trace_P=trace_Hi, trace_PP=trace_HiHi; + double ps_ww, ps2_ww, ps3_ww; + for (size_t i=0; isize); + gsl_vector *v_temp=gsl_vector_alloc(params.eval->size); + + gsl_vector_memcpy (v_temp, params.eval); + gsl_vector_scale (v_temp, l); + if (params.e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + size_t index_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt); + size_t index_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt); + double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); + double P_xx=gsl_matrix_get (Pab, n_cvt, index_xx); + double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); + double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); + + beta=P_xy/P_xx; + double tau=(double)df/Px_yy; + se=sqrt(1.0/(tau*P_xx)); + p_wald=gsl_cdf_fdist_Q ((P_yy-Px_yy)*tau, 1.0, df); +// p_wald=gsl_cdf_chisq_Q ((P_yy-Px_yy)*tau, 1); + + gsl_matrix_free (Pab); + gsl_vector_free (Hi_eval); + gsl_vector_free (v_temp); + return ; +} + + +void LMM::CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_score) +{ + size_t n_cvt=params.n_cvt; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + int df=(int)ni_test-(int)n_cvt-1; + + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc(params.eval->size); + gsl_vector *v_temp=gsl_vector_alloc(params.eval->size); + + gsl_vector_memcpy (v_temp, params.eval); + gsl_vector_scale (v_temp, l); + if (params.e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);} + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + size_t index_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt); + size_t index_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt); + double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); + double P_xx=gsl_matrix_get (Pab, n_cvt, index_xx); + double P_xy=gsl_matrix_get (Pab, n_cvt, index_xy); + double Px_yy=gsl_matrix_get (Pab, n_cvt+1, index_yy); + + beta=P_xy/P_xx; + double tau=(double)df/Px_yy; + se=sqrt(1.0/(tau*P_xx)); + + p_score=gsl_cdf_fdist_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1.0, df); +// p_score=gsl_cdf_chisq_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1); + + gsl_matrix_free (Pab); + gsl_vector_free (Hi_eval); + gsl_vector_free (v_temp); + return ; +} + + + + + + + + +void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) +{ + size_t index_ab; + size_t n_cvt=UtW->size2; + + gsl_vector *u_a=gsl_vector_alloc (Uty->size); + + for (size_t a=1; a<=n_cvt+2; ++a) { + if (a==n_cvt+1) {continue;} + + if (a==n_cvt+2) {gsl_vector_memcpy (u_a, Uty);} + else { + gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, a-1); + gsl_vector_memcpy (u_a, &UtW_col.vector); + } + + for (size_t b=a; b>=1; --b) { + if (b==n_cvt+1) {continue;} + + index_ab=GetabIndex (a, b, n_cvt); + gsl_vector_view Uab_col=gsl_matrix_column (Uab, index_ab); + + if (b==n_cvt+2) {gsl_vector_memcpy (&Uab_col.vector, Uty);} + else { + gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, b-1); + gsl_vector_memcpy (&Uab_col.vector, &UtW_col.vector); + } + + gsl_vector_mul(&Uab_col.vector, u_a); + } + } + + gsl_vector_free (u_a); + return; +} + + +void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_vector *Utx, gsl_matrix *Uab) +{ + size_t index_ab; + size_t n_cvt=UtW->size2; + + for (size_t b=1; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (n_cvt+1, b, n_cvt); + gsl_vector_view Uab_col=gsl_matrix_column (Uab, index_ab); + + if (b==n_cvt+2) {gsl_vector_memcpy (&Uab_col.vector, Uty);} + else if (b==n_cvt+1) {gsl_vector_memcpy (&Uab_col.vector, Utx);} + else { + gsl_vector_const_view UtW_col=gsl_matrix_const_column (UtW, b-1); + gsl_vector_memcpy (&Uab_col.vector, &UtW_col.vector); + } + + gsl_vector_mul(&Uab_col.vector, Utx); + } + + return; +} + + + +void Calcab (const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) +{ + size_t index_ab; + size_t n_cvt=W->size2; + + double d; + gsl_vector *v_a=gsl_vector_alloc (y->size); + gsl_vector *v_b=gsl_vector_alloc (y->size); + + for (size_t a=1; a<=n_cvt+2; ++a) { + if (a==n_cvt+1) {continue;} + + if (a==n_cvt+2) {gsl_vector_memcpy (v_a, y);} + else { + gsl_vector_const_view W_col=gsl_matrix_const_column (W, a-1); + gsl_vector_memcpy (v_a, &W_col.vector); + } + + for (size_t b=a; b>=1; --b) { + if (b==n_cvt+1) {continue;} + + index_ab=GetabIndex (a, b, n_cvt); + + if (b==n_cvt+2) {gsl_vector_memcpy (v_b, y);} + else { + gsl_vector_const_view W_col=gsl_matrix_const_column (W, b-1); + gsl_vector_memcpy (v_b, &W_col.vector); + } + + gsl_blas_ddot (v_a, v_b, &d); + gsl_vector_set(ab, index_ab, d); + } + } + + gsl_vector_free (v_a); + gsl_vector_free (v_b); + return; +} + + +void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_vector *ab) +{ + size_t index_ab; + size_t n_cvt=W->size2; + + double d; + gsl_vector *v_b=gsl_vector_alloc (y->size); + + for (size_t b=1; b<=n_cvt+2; ++b) { + index_ab=GetabIndex (n_cvt+1, b, n_cvt); + + if (b==n_cvt+2) {gsl_vector_memcpy (v_b, y);} + else if (b==n_cvt+1) {gsl_vector_memcpy (v_b, x);} + else { + gsl_vector_const_view W_col=gsl_matrix_const_column (W, b-1); + gsl_vector_memcpy (v_b, &W_col.vector); + } + + gsl_blas_ddot (x, v_b, &d); + gsl_vector_set(ab, index_ab, d); + } + + gsl_vector_free (v_b); + + return; +} + + + + + +void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) +{ + ifstream infile (file_gene.c_str(), ifstream::in); + if (!infile) {cout<<"error reading gene expression file:"<size1); + gsl_vector *Uty=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + //header + getline(infile, line); + + for (size_t t=0; tsize1); + gsl_vector *x_miss=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + //start reading genotypes and analyze + for (size_t t=0; t1) {break;} + !safeGetline(infile, line).eof(); + if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + x_mean=0.0; c_phen=0; n_miss=0; + gsl_vector_set_zero(x_miss); + for (size_t i=0; i1) { + gsl_vector_set(x, i, 2-geno); + } + } + + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + CalcUab(UtW, Uty, Utx, Uab); +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< b; + + double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0; + double logl_H1=0.0; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + + //Calculate basic quantities + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_vector *x=gsl_vector_alloc (U->size1); + gsl_vector *Utx=gsl_vector_alloc (U->size2); + gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + + for (vector::size_type t=0; t1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + CalcUab(UtW, Uty, Utx, Uab); +// if (e_mode!=0) { +// Calcab (W, y, x, ab); +// } + + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + + //3 is before 1, for beta + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + cout< > &pos_loglr) +{ + double logl_H0, logl_H1, log_lr, lambda0, lambda1; + + gsl_vector *w=gsl_vector_alloc (Uty->size); + gsl_matrix *Utw=gsl_matrix_alloc (Uty->size, 1); + gsl_matrix *Uab=gsl_matrix_alloc (Uty->size, 6); + gsl_vector *ab=gsl_vector_alloc (6); + + gsl_vector_set_zero(ab); + gsl_vector_set_all (w, 1.0); + gsl_vector_view Utw_col=gsl_matrix_column (Utw, 0); + gsl_blas_dgemv (CblasTrans, 1.0, U, w, 0.0, &Utw_col.vector); + + CalcUab (Utw, Uty, Uab) ; + FUNC_PARAM param0={true, Uty->size, 1, K_eval, Uab, ab, 0}; + + CalcLambda('L', param0, l_min, l_max, n_region, lambda0, logl_H0); + + for (size_t i=0; isize2; ++i) { + gsl_vector_const_view UtX_col=gsl_matrix_const_column (UtX, i); + CalcUab(Utw, Uty, &UtX_col.vector, Uab); + FUNC_PARAM param1={false, UtX->size1, 1, K_eval, Uab, ab, 0}; + + CalcLambda ('L', param1, l_min, l_max, n_region, lambda1, logl_H1); + log_lr=logl_H1-logl_H0; + + pos_loglr.push_back(make_pair(i,log_lr) ); + } + + gsl_vector_free (w); + gsl_matrix_free (Utw); + gsl_matrix_free (Uab); + gsl_vector_free (ab); + + return; +} + + + + +void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf) +{ + if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."< > lambda_lh; + + //evaluate first order derivates in different intervals + double lambda_l, lambda_h, lambda_interval=log(l_max/l_min)/(double)n_region; + double dev1_l, dev1_h, logf_l, logf_h; + + for (size_t i=0; i=logf_h) {lambda=l_min; logf=logf_l;} else {lambda=l_max; logf=logf_h;} + } + else { + //if derivates change signs + int status; + int iter=0, max_iter=100; + double l, l_temp; + + gsl_function F; + gsl_function_fdf FDF; + + F.params=¶ms; + FDF.params=¶ms; + + if (func_name=='R' || func_name=='r') { + F.function=&LogRL_dev1; + FDF.f=&LogRL_dev1; + FDF.df=&LogRL_dev2; + FDF.fdf=&LogRL_dev12; + } + else { + F.function=&LogL_dev1; + FDF.f=&LogL_dev1; + FDF.df=&LogL_dev2; + FDF.fdf=&LogL_dev12; + } + + const gsl_root_fsolver_type *T_f; + gsl_root_fsolver *s_f; + T_f=gsl_root_fsolver_brent; + s_f=gsl_root_fsolver_alloc (T_f); + + const gsl_root_fdfsolver_type *T_fdf; + gsl_root_fdfsolver *s_fdf; + T_fdf=gsl_root_fdfsolver_newton; + s_fdf=gsl_root_fdfsolver_alloc(T_fdf); + + for (vector::size_type i=0; il_min && ll_max) {l=l_max;} + if (func_name=='R' || func_name=='r') {logf_l=LogRL_f (l, ¶ms);} else {logf_l=LogL_f (l, ¶ms);} + + if (i==0) {logf=logf_l; lambda=l;} + else if (logflogf) {lambda=l_min; logf=logf_l;} + if (logf_h>logf) {lambda=l_max; logf=logf_h;} + } + + return; +} + + + + + +//calculate lambda in the null model +void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logl_H0) +{ + if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<size2, ni_test=UtW->size1; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); +// if (e_mode!=0) { +// gsl_vector_set_zero (ab); +// Calcab (W, y, ab); +// } + + FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0}; + + CalcLambda(func_name, param0, l_min, l_max, n_region, lambda, logl_H0); + + gsl_matrix_free(Uab); + gsl_vector_free(ab); + + return; +} + + +//obtain REMLE estimate for PVE using lambda_remle +void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se) +{ + size_t n_cvt=UtW->size2, ni_test=UtW->size1; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); + // if (e_mode!=0) { + // gsl_vector_set_zero (ab); + // Calcab (W, y, ab); + // } + + FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0}; + + double se=sqrt(-1.0/LogRL_dev2 (lambda, ¶m0)); + + pve=trace_G*lambda/(trace_G*lambda+1.0); + pve_se=trace_G/((trace_G*lambda+1.0)*(trace_G*lambda+1.0))*se; + + gsl_matrix_free (Uab); + gsl_vector_free (ab); + return; +} + +//obtain REML estimate for Vg and Ve using lambda_remle +//obtain beta and se(beta) for coefficients +//ab is not used when e_mode==0 +void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, double &vg, double &ve, gsl_vector *beta, gsl_vector *se_beta) +{ + size_t n_cvt=UtW->size2, ni_test=UtW->size1; + size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2; + + gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index); + gsl_vector *ab=gsl_vector_alloc (n_index); + gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index); + gsl_vector *Hi_eval=gsl_vector_alloc(eval->size); + gsl_vector *v_temp=gsl_vector_alloc(eval->size); + gsl_matrix *HiW=gsl_matrix_alloc(eval->size, UtW->size2); + gsl_matrix *WHiW=gsl_matrix_alloc(UtW->size2, UtW->size2); + gsl_vector *WHiy=gsl_vector_alloc(UtW->size2); + gsl_matrix *Vbeta=gsl_matrix_alloc(UtW->size2, UtW->size2); + + gsl_matrix_set_zero (Uab); + CalcUab (UtW, Uty, Uab); + + gsl_vector_memcpy (v_temp, eval); + gsl_vector_scale (v_temp, lambda); + gsl_vector_set_all (Hi_eval, 1.0); + gsl_vector_add_constant (v_temp, 1.0); + gsl_vector_div (Hi_eval, v_temp); + + //calculate beta + gsl_matrix_memcpy (HiW, UtW); + for (size_t i=0; isize2; i++) { + gsl_vector_view HiW_col=gsl_matrix_column(HiW, i); + gsl_vector_mul(&HiW_col.vector, Hi_eval); + } + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, HiW, UtW, 0.0, WHiW); + gsl_blas_dgemv (CblasTrans, 1.0, HiW, Uty, 0.0, WHiy); + + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (UtW->size2); + LUDecomp (WHiW, pmt, &sig); + LUSolve (WHiW, pmt, WHiy, beta); + LUInvert (WHiW, pmt, Vbeta); + + //calculate vg and ve + CalcPab (n_cvt, 0, Hi_eval, Uab, ab, Pab); + + size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt); + double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy); + + ve=P_yy/(double)(ni_test-n_cvt); + vg=ve*lambda; + + //with ve, calculate se(beta) + gsl_matrix_scale(Vbeta, ve); + + //obtain se_beta + for (size_t i=0; isize1; i++) { + gsl_vector_set (se_beta, i, sqrt(gsl_matrix_get(Vbeta, i, i) ) ); + } + + gsl_matrix_free(Uab); + gsl_matrix_free(Pab); + gsl_vector_free(ab); + gsl_vector_free(Hi_eval); + gsl_vector_free(v_temp); + gsl_matrix_free(HiW); + gsl_matrix_free(WHiW); + gsl_vector_free(WHiy); + gsl_matrix_free(Vbeta); + + gsl_permutation_free(pmt); + return; +} + diff --git a/lmm.h b/lmm.h new file mode 100644 index 0000000..d65b785 --- /dev/null +++ b/lmm.h @@ -0,0 +1,110 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __LMM_H__ +#define __LMM_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + + +class FUNC_PARAM +{ + +public: + bool calc_null; + size_t ni_test; + size_t n_cvt; + const gsl_vector *eval; + const gsl_matrix *Uab; + const gsl_vector *ab; + size_t e_mode; +}; + + + + +class LMM { + +public: + // IO related parameters + int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests + size_t d_pace; //display pace + + string file_bfile; + string file_geno; + string file_out; + + string file_gene; + + // LMM related parameters + double l_min; + double l_max; + size_t n_region; + double l_mle_null; + double logl_mle_H0; + + // Summary statistics + size_t ni_total, ni_test; //number of individuals + size_t ns_total, ns_test; //number of snps + size_t ng_total, ng_test; //number of genes + size_t n_cvt; + double time_UtX; //time spent on optimization iterations + double time_opt; //time spent on optimization iterations + + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + vector sumStat; //Output SNPSummary Data + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x); + void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); + void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y); + void WriteFiles (); + + void CalcRLWald (const double &lambda, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_wald); + void CalcRLScore (const double &l, const FUNC_PARAM ¶ms, double &beta, double &se, double &p_score); +}; + +void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector > &pos_loglr); +void CalcLambda (const char func_name, FUNC_PARAM ¶ms, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf); +void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logl_H0); +void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se); +void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, double &vg, double &ve, gsl_vector *beta, gsl_vector *se_beta); + +#endif + + diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..9ab98ea --- /dev/null +++ b/main.cpp @@ -0,0 +1,86 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include + +#include "param.h" + +#ifdef FORCE_FLOAT +#include "gemma_float.h" +#else +#include "gemma.h" +#endif + +using namespace std; + + + +int main(int argc, char * argv[]) +{ + GEMMA cGemma; + PARAM cPar; + + if (argc <= 1) { + cGemma.PrintHeader(); + return EXIT_SUCCESS; + } + if (argc==2 && argv[1][0] == '-' && argv[1][1] == 'h') { + cGemma.PrintHelp(0); + return EXIT_SUCCESS; + } + if (argc==3 && argv[1][0] == '-' && argv[1][1] == 'h') { + string str; + str.assign(argv[2]); + cGemma.PrintHelp(atoi(str.c_str())); + return EXIT_SUCCESS; + } + if (argc==2 && argv[1][0] == '-' && argv[1][1] == 'l') { + cGemma.PrintLicense(); + return EXIT_SUCCESS; + } + + ifstream check_dir("output/"); + if (!check_dir) { + mkdir("output", S_IRWXU|S_IRGRP|S_IROTH); + } + + cGemma.Assign(argc, argv, cPar); + + if (cPar.error==true) {return EXIT_FAILURE;} + + if (cPar.mode_silence) {stringstream ss; cout.rdbuf (ss.rdbuf());} + + cPar.CheckParam(); + + if (cPar.error==true) {return EXIT_FAILURE;} + + cGemma.BatchRun(cPar); + + if (cPar.error==true) {return EXIT_FAILURE;} + + cGemma.WriteLog(argc, argv, cPar); + + return EXIT_SUCCESS; +} + + + diff --git a/mathfunc.cpp b/mathfunc.cpp new file mode 100644 index 0000000..09e58dc --- /dev/null +++ b/mathfunc.cpp @@ -0,0 +1,310 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_cdf.h" + +#ifdef FORCE_FLOAT +#include "mathfunc_float.h" +#else +#include "mathfunc.h" +#endif + + +using namespace std; + + + +//calculate variance of a vector +double VectorVar (const gsl_vector *v) +{ + double d, m=0.0, m2=0.0; + for (size_t i=0; isize; ++i) { + d=gsl_vector_get (v, i); + m+=d; + m2+=d*d; + } + m/=(double)v->size; + m2/=(double)v->size; + return m2-m*m; +} + + + +//center the matrix G +void CenterMatrix (gsl_matrix *G) +{ + double d; + gsl_vector *w=gsl_vector_alloc (G->size1); + gsl_vector *Gw=gsl_vector_alloc (G->size1); + gsl_vector_set_all (w, 1.0); + + gsl_blas_dgemv (CblasNoTrans, 1.0, G, w, 0.0, Gw); + gsl_blas_dsyr2 (CblasUpper, -1.0/(double)G->size1, Gw, w, G); + gsl_blas_ddot (w, Gw, &d); + gsl_blas_dsyr (CblasUpper, d/((double)G->size1*(double)G->size1), w, G); + + for (size_t i=0; isize1; ++i) { + for (size_t j=0; jsize1); + + gsl_blas_ddot (w, w, &wtw); + gsl_blas_dgemv (CblasNoTrans, 1.0, G, w, 0.0, Gw); + gsl_blas_dsyr2 (CblasUpper, -1.0/wtw, Gw, w, G); + gsl_blas_ddot (w, Gw, &d); + gsl_blas_dsyr (CblasUpper, d/(wtw*wtw), w, G); + + for (size_t i=0; isize1; ++i) { + for (size_t j=0; jsize1; ++i) { + d+=gsl_matrix_get(G, i, i); + } + d/=(double)G->size1; + + gsl_matrix_scale (G, 1.0/d); + + return; +} + + +//center the vector y +double CenterVector (gsl_vector *y) +{ + double d=0.0; + + for (size_t i=0; isize; ++i) { + d+=gsl_vector_get (y, i); + } + d/=(double)y->size; + + gsl_vector_add_constant (y, -1.0*d); + + return d; +} + + +//calculate UtX +void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX) +{ + gsl_vector *Utx_vec=gsl_vector_alloc (UtX->size1); + for (size_t i=0; isize2; ++i) { + gsl_vector_view UtX_col=gsl_matrix_column (UtX, i); + gsl_blas_dgemv (CblasTrans, 1.0, U, &UtX_col.vector, 0.0, Utx_vec); + gsl_vector_memcpy (&UtX_col.vector, Utx_vec); + } + gsl_vector_free (Utx_vec); + return; +} + + +void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX) +{ + for (size_t i=0; isize2; ++i) { + gsl_vector_const_view X_col=gsl_matrix_const_column (X, i); + gsl_vector_view UtX_col=gsl_matrix_column (UtX, i); + gsl_blas_dgemv (CblasTrans, 1.0, U, &X_col.vector, 0.0, &UtX_col.vector); + } + return; +} + +void CalcUtX (const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx) +{ + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); + return; +} + + +//Kronecker product +void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H) +{ + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + gsl_matrix_view H_sub=gsl_matrix_submatrix (H, i*V->size1, j*V->size2, V->size1, V->size2); + gsl_matrix_memcpy (&H_sub.matrix, V); + gsl_matrix_scale (&H_sub.matrix, gsl_matrix_get (K, i, j)); + } + } + return; +} + +//symmetric K matrix +void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H) +{ + for (size_t i=0; isize1; i++) { + for (size_t j=i; jsize2; j++) { + gsl_matrix_view H_sub=gsl_matrix_submatrix (H, i*V->size1, j*V->size2, V->size1, V->size2); + gsl_matrix_memcpy (&H_sub.matrix, V); + gsl_matrix_scale (&H_sub.matrix, gsl_matrix_get (K, i, j)); + + if (i!=j) { + gsl_matrix_view H_sub_sym=gsl_matrix_submatrix (H, j*V->size1, i*V->size2, V->size1, V->size2); + gsl_matrix_memcpy (&H_sub_sym.matrix, &H_sub.matrix); + } + } + } + return; +} + + +// this function calculates HWE p value with methods described in Wigginton et al., 2005 AJHG; +// it is based on the code in plink 1.07 +double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab) +{ + if ( (n_hom1+n_hom2+n_ab)==0 ) {return 1;} + + //aa is the rare allele + int n_aa=n_hom1 < n_hom2 ? n_hom1 : n_hom2; + int n_bb=n_hom1 < n_hom2 ? n_hom2 : n_hom1; + + int rare_copies = 2 * n_aa + n_ab; + int genotypes = n_ab + n_bb + n_aa; + + double * het_probs = (double *) malloc( (rare_copies + 1) * sizeof(double)); + if (het_probs == NULL) + cout<<"Internal error: SNP-HWE: Unable to allocate array"< 1; curr_hets -= 2) + { + het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) + / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); + sum += het_probs[curr_hets - 2]; + + /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ + curr_homr++; + curr_homc++; + } + + curr_hets = mid; + curr_homr = (rare_copies - mid) / 2; + curr_homc = genotypes - curr_hets - curr_homr; + for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) + { + het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc + /((curr_hets + 2.0) * (curr_hets + 1.0)); + sum += het_probs[curr_hets + 2]; + + /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ + curr_homr--; + curr_homc--; + } + + for (i = 0; i <= rare_copies; i++) + het_probs[i] /= sum; + + /* alternate p-value calculation for p_hi/p_lo + double p_hi = het_probs[n_ab]; + for (i = n_ab + 1; i <= rare_copies; i++) + p_hi += het_probs[i]; + + double p_lo = het_probs[n_ab]; + for (i = n_ab - 1; i >= 0; i--) + p_lo += het_probs[i]; + + double p_hi_lo = p_hi < p_lo ? 2.0 * p_hi : 2.0 * p_lo; + */ + + double p_hwe = 0.0; + /* p-value calculation for p_hwe */ + for (i = 0; i <= rare_copies; i++) + { + if (het_probs[i] > het_probs[n_ab]) + continue; + p_hwe += het_probs[i]; + } + + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + + free(het_probs); + + return p_hwe; +} + + + + + + + + diff --git a/mathfunc.h b/mathfunc.h new file mode 100644 index 0000000..d0e1696 --- /dev/null +++ b/mathfunc.h @@ -0,0 +1,41 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#ifndef __MATHFUNC_H__ +#define __MATHFUNC_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +using namespace std; + +double VectorVar (const gsl_vector *v); +void CenterMatrix (gsl_matrix *G); +void CenterMatrix (gsl_matrix *G, gsl_vector *w); +void ScaleMatrix (gsl_matrix *G); +double CenterVector (gsl_vector *y); +void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX); +void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX); +void CalcUtX (const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx); +double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab); +void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H); +void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H); + + +#endif diff --git a/mvlmm.cpp b/mvlmm.cpp new file mode 100644 index 0000000..56540d8 --- /dev/null +++ b/mvlmm.cpp @@ -0,0 +1,3748 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" +#include "gsl/gsl_min.h" +#include "gsl/gsl_integration.h" + +#include "io.h" +#include "lapack.h" +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "lmm_float.h" +#include "mvlmm_float.h" +#else +#include "lmm.h" +#include "mvlmm.h" +#endif + + + +using namespace std; + + +//in this file, X, Y are already transformed (i.e. UtX and UtY) + + +void MVLMM::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + + l_min=cPar.l_min; + l_max=cPar.l_max; + n_region=cPar.n_region; + p_nr=cPar.p_nr; + em_iter=cPar.em_iter; + nr_iter=cPar.nr_iter; + em_prec=cPar.em_prec; + nr_prec=cPar.nr_prec; + crt=cPar.crt; + + Vg_remle_null=cPar.Vg_remle_null; + Ve_remle_null=cPar.Ve_remle_null; + Vg_mle_null=cPar.Vg_mle_null; + Ve_mle_null=cPar.Ve_mle_null; + + time_UtX=0.0; + time_opt=0.0; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + n_ph=cPar.n_ph; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void MVLMM::CopyToParam (PARAM &cPar) +{ + cPar.time_UtX=time_UtX; + cPar.time_opt=time_opt; + + cPar.Vg_remle_null=Vg_remle_null; + cPar.Ve_remle_null=Ve_remle_null; + cPar.Vg_mle_null=Vg_mle_null; + cPar.Ve_mle_null=Ve_mle_null; + + cPar.VVg_remle_null=VVg_remle_null; + cPar.VVe_remle_null=VVe_remle_null; + cPar.VVg_mle_null=VVg_mle_null; + cPar.VVe_mle_null=VVe_mle_null; + + cPar.beta_remle_null=beta_remle_null; + cPar.se_beta_remle_null=se_beta_remle_null; + cPar.beta_mle_null=beta_mle_null; + cPar.se_beta_mle_null=se_beta_mle_null; + + cPar.logl_remle_H0=logl_remle_H0; + cPar.logl_mle_H0=logl_mle_H0; + return; +} + + +void MVLMM::WriteFiles () +{ + string file_str; + file_str="./output/"+file_out; + file_str+=".assoc.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<size1; + double d, logdet_Ve=0.0; + + //eigen decomposition of V_e + gsl_matrix *Lambda=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_temp=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_h=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_hi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix_memcpy(V_e_temp, V_e); + EigenDecomp(V_e_temp, U_l, D_l, 0); + + //calculate V_e_h and V_e_hi + gsl_matrix_set_zero(V_e_h); + gsl_matrix_set_zero(V_e_hi); + for (size_t i=0; isize, d_size=D_l->size, dc_size=Qi->size1; + size_t c_size=dc_size/d_size; + + double delta, dl, d1, d2, d, logdet_Q; + + gsl_matrix *Q=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix_set_zero (Q); + + for (size_t i=0; isize, c_size=X->size1, d_size=D_l->size; + + gsl_vector_set_zero (xHiy); + + double x, delta, dl, y, d; + for (size_t i=0; isize, d_size=D_l->size; + double delta, dl, d_u, d_e; + + for (size_t k=0; ksize1, d_size=UltVehiY->size1; + + gsl_matrix *YUX=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_memcpy (UltVehiBX, UltVehiY); + gsl_matrix_sub (UltVehiBX, UltVehiU); + + gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, UltVehiBX, X, 0.0, YUX); + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, YUX, XXti, 0.0, UltVehiB); + + gsl_matrix_free(YUX); + + return; +} + +void UpdateRL_B (const gsl_vector *xHiy, const gsl_matrix *Qi, gsl_matrix *UltVehiB) +{ + size_t d_size=UltVehiB->size1, c_size=UltVehiB->size2, dc_size=Qi->size1; + + gsl_vector *b=gsl_vector_alloc (dc_size); + + //calculate b=Qiv + gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, b); + + //copy b to UltVehiB + for (size_t i=0; isize, d_size=U->size1; + + gsl_matrix_set_zero (V_g); + gsl_matrix_set_zero (V_e); + + double delta; + + //calculate the first part: UD^{-1}U^T and EE^T + for (size_t k=0; ksize, c_size=X->size1, d_size=D_l->size, dc_size=Qi->size1; + + gsl_matrix_set_zero(Sigma_uu); + gsl_matrix_set_zero(Sigma_ee); + + double delta, dl, x, d; + + //calculate the first diagonal term + gsl_vector_view Suu_diag=gsl_matrix_diagonal (Sigma_uu); + gsl_vector_view See_diag=gsl_matrix_diagonal (Sigma_ee); + + for (size_t k=0; ksize, d_size=D_l->size, dc_size=Qi->size1; + double logl=0.0, delta, dl, y, d; + + //calculate yHiy+log|H_k| + for (size_t k=0; ksize, c_size=X->size1, d_size=Y->size1; + size_t dc_size=d_size*c_size; + + gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size); + gsl_matrix *XXti=gsl_matrix_alloc (c_size, c_size); + gsl_vector *D_l=gsl_vector_alloc (d_size); + gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehiB=gsl_matrix_alloc (d_size, c_size); + gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *Sigma_uu=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *Sigma_ee=gsl_matrix_alloc (d_size, d_size); + gsl_vector *xHiy=gsl_vector_alloc (dc_size); + gsl_permutation * pmt=gsl_permutation_alloc (c_size); + + double logl_const=0.0, logl_old=0.0, logl_new=0.0, logdet_Q, logdet_Ve; + int sig; + + //calculate |XXt| and (XXt)^{-1} + gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt); + for (size_t i=0; isize, c_size=W->size1, d_size=V_g->size1; + size_t dc_size=d_size*c_size; + double delta, dl, d, d1, d2, dy, dx, dw, logdet_Ve, logdet_Q, p_value; + + gsl_vector *D_l=gsl_vector_alloc (d_size); + gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size); + gsl_matrix *QiWHix=gsl_matrix_alloc(dc_size, d_size); + + gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size); + gsl_vector *xPy=gsl_vector_alloc (d_size); + //gsl_vector *UltVehiy=gsl_vector_alloc (d_size); + gsl_vector *WHiy=gsl_vector_alloc (dc_size); + + gsl_matrix_set_zero (xPx); + gsl_matrix_set_zero (WHix); + gsl_vector_set_zero (xPy); + gsl_vector_set_zero (WHiy); + + //eigen decomposition and calculate log|Ve| + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + + //calculate Qi and log|Q| + logdet_Q=CalcQi (eval, D_l, W, Qi); + + //calculate UltVehiY + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); + + //calculate WHix, WHiy, xHiy, xHix + for (size_t i=0; isize, c_size=W->size1, d_size=V_g->size1; + size_t dc_size=d_size*c_size; + double delta, dl, d, dy, dw, logdet_Ve, logdet_Q; + + gsl_vector *D_l=gsl_vector_alloc (d_size); + gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *Qi_temp=gsl_matrix_alloc (dc_size, dc_size); + //gsl_vector *UltVehiy=gsl_vector_alloc (d_size); + gsl_vector *WHiy=gsl_vector_alloc (dc_size); + gsl_vector *QiWHiy=gsl_vector_alloc (dc_size); + gsl_vector *beta=gsl_vector_alloc (dc_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (dc_size, dc_size); + + gsl_vector_set_zero (WHiy); + + //eigen decomposition and calculate log|Ve| + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + + //calculate Qi and log|Q| + logdet_Q=CalcQi (eval, D_l, W, Qi); + + //calculate UltVehiY + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); + + //calculate WHiy + for (size_t i=0; isize2; j++) { + for (size_t i=0; isize1; i++) { + gsl_matrix_set(B, i, j, gsl_vector_get(beta, j*d_size+i)); + gsl_matrix_set(se_B, i, j, sqrt(gsl_matrix_get(Vbeta, j*d_size+i, j*d_size+i))); + } + } + + //free matrices + gsl_vector_free(D_l); + gsl_matrix_free(UltVeh); + gsl_matrix_free(UltVehi); + gsl_matrix_free(Qi); + gsl_matrix_free(Qi_temp); + gsl_vector_free(WHiy); + gsl_vector_free(QiWHiy); + gsl_vector_free(beta); + gsl_matrix_free(Vbeta); + + return; +} + + + +//below are functions for Newton-Raphson's algorithm + + + + + +//calculate all Hi and return logdet_H=\sum_{k=1}^{n}log|H_k| +//and calculate Qi and return logdet_Q +//and calculate yPy +void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *Hi_all, gsl_matrix *Qi, double &logdet_H, double &logdet_Q) +{ + gsl_matrix_set_zero (Hi_all); + gsl_matrix_set_zero (Qi); + logdet_H=0.0; logdet_Q=0.0; + + size_t n_size=eval->size, c_size=X->size1, d_size=V_g->size1; + double logdet_Ve=0.0, delta, dl, d; + + gsl_matrix *mat_dd=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size); + gsl_vector *D_l=gsl_vector_alloc (d_size); + + //calculate D_l, UltVeh and UltVehi + logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi); + + //calculate each Hi and log|H_k| + logdet_H=(double)n_size*logdet_Ve; + for (size_t k=0; ksize2, d_size=Y->size1; + + for (size_t k=0; ksize2, c_size=X->size1, d_size=Hi_all->size1; + + double d; + + for (size_t k=0; ksize2; + + for (size_t k=0; ksize2, d_size=Y->size1, dc_size=xHi->size1; + + for (size_t k=0; k=d_size || j>=d_size) {cout<<"error in GetIndex."<size; + + double delta, d1, d2; + + for (size_t k=0; ksize, d_size=Hiy->size1; + + double delta, d; + + for (size_t k=0; ksize, dc_size=xHi->size1; + size_t d_size=xHi->size2/n_size; + + double delta; + + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *mat_dcdc_t=gsl_matrix_alloc (dc_size, dc_size); + + for (size_t k=0; ksize, d_size=Hiy->size1; + + double delta, d_Hiy_i1, d_Hiy_j1, d_Hiy_i2, d_Hiy_j2, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; + + for (size_t k=0; ksize, d_size=Hiy->size1; + + double delta, d_Hiy_i, d_Hiy_j, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; + + for (size_t k=0; ksize, d_size=Hi->size1, dc_size=xHi->size1; + + double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; + + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); + + for (size_t k=0; ksize, d_size=Hi->size1; + double delta, d; + + for (size_t k=0; ksize, d_size=Hi->size1; + double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2; + + for (size_t k=0; ksize1, d_size=Hi->size1; + size_t v=GetIndex(i, j, d_size); + + double d; + + //calculate the first part: trace(HiD) + Calc_traceHiD (eval, Hi, i, j, tPD_g, tPD_e); + + //calculate the second part: -trace(HixQixHiD) + for (size_t k=0; ksize1, d_size=Hi->size1; + size_t v_size=d_size*(d_size+1)/2; + size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size); + + double d; + + //calculate the first part: trace(HiDHiD) + Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge); + + //calculate the second and third parts: -trace(HixQixHiDHiD)-trace(HiDHixQixHiD) + for (size_t i=0; isize1; + size_t v; + + for (size_t i=0; isize2/eval->size, dc_size=xHi->size1; + size_t v; + + for (size_t i=0; isize1; + size_t v1, v2; + + for (size_t i1=0; i1size2/eval->size, dc_size=xHi->size1; + size_t v1, v2; + + for (size_t i1=0; i1size; + double delta, d_Hi_ij; + + gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *mat_dcdc_temp=gsl_matrix_alloc (dc_size, dc_size); + + for (size_t k=0; ksize1; + size_t v_size=xHiDHix_all_g->size2/dc_size; + + for (size_t i=0; isize2; i++) { + gsl_vector_const_view vec_g=gsl_matrix_const_column (vec_all_g, i); + gsl_vector_const_view vec_e=gsl_matrix_const_column (vec_all_e, i); + + gsl_vector_view Qivec_g=gsl_matrix_column (Qivec_all_g, i); + gsl_vector_view Qivec_e=gsl_matrix_column (Qivec_all_e, i); + + gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_g.vector, 0.0, &Qivec_g.vector); + gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_e.vector, 0.0, &Qivec_e.vector); + } + + return; +} + + +//calculate Qi(xHiDHix) for each pair of i j (i<=j) +void Calc_QiMat_all (const gsl_matrix *Qi, const gsl_matrix *mat_all_g, const gsl_matrix *mat_all_e, gsl_matrix *Qimat_all_g, gsl_matrix *Qimat_all_e) +{ + size_t dc_size=Qi->size1; + size_t v_size=mat_all_g->size2/mat_all_g->size1; + + for (size_t i=0; isize1; + size_t v=GetIndex(i, j, d_size); + + double d; + + //first part: ytHiDHiy + Calc_yHiDHiy (eval, Hiy, i, j, yPDPy_g, yPDPy_e); + + //second and third parts: -(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy) + gsl_vector_const_view xHiDHiy_g=gsl_matrix_const_column (xHiDHiy_all_g, v); + gsl_vector_const_view xHiDHiy_e=gsl_matrix_const_column (xHiDHiy_all_e, v); + + gsl_blas_ddot(QixHiy, &xHiDHiy_g.vector, &d); + yPDPy_g-=d*2.0; + gsl_blas_ddot(QixHiy, &xHiDHiy_e.vector, &d); + yPDPy_e-=d*2.0; + + //fourth part: +(yHix)Qi(xHiDHix)Qi(xHiy) + gsl_vector_const_view xHiDHixQixHiy_g=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v); + gsl_vector_const_view xHiDHixQixHiy_e=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v); + + gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_g.vector, &d); + yPDPy_g+=d; + gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_e.vector, &d); + yPDPy_e+=d; + + return; +} + +//calculate yPDPDPy=y(Hi-HixQixHi)D(Hi-HixQixHi)D(Hi-HixQixHi)y +//yPDPDPy=yHiDHiDHiy +//-(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy) +//-(yHiDHix)Qi(xHiDHiy) +//+(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy) +//+(yHix)Qi(xHiDHiDHix)Qi(xHiy) +//-(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy) +void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *QixHiDHiy_all_g, const gsl_matrix *QixHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const gsl_matrix *QixHiDHixQixHiy_all_g, const gsl_matrix *QixHiDHixQixHiy_all_e, const gsl_matrix *xHiDHiDHiy_all_gg, const gsl_matrix *xHiDHiDHiy_all_ee, const gsl_matrix *xHiDHiDHiy_all_ge, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yPDPDPy_gg, double &yPDPDPy_ee, double &yPDPDPy_ge) +{ + size_t d_size=Hi->size1, dc_size=xHi->size1; + size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size); + size_t v_size=d_size*(d_size+1)/2; + + double d; + + gsl_vector *xHiDHiDHixQixHiy=gsl_vector_alloc (dc_size); + + //first part: yHiDHiDHiy + Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge); + + //second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy) + gsl_vector_const_view xHiDHiDHiy_gg1=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v1*v_size+v2); + gsl_vector_const_view xHiDHiDHiy_ee1=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v1*v_size+v2); + gsl_vector_const_view xHiDHiDHiy_ge1=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v1*v_size+v2); + + gsl_vector_const_view xHiDHiDHiy_gg2=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v2*v_size+v1); + gsl_vector_const_view xHiDHiDHiy_ee2=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v2*v_size+v1); + gsl_vector_const_view xHiDHiDHiy_ge2=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v2*v_size+v1); + + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d); + yPDPDPy_gg-=d; + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d); + yPDPDPy_ee-=d; + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d); + yPDPDPy_ge-=d; + + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d); + yPDPDPy_gg-=d; + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d); + yPDPDPy_ee-=d; + gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d); + yPDPDPy_ge-=d; + + //fourth part: -(yHiDHix)Qi(xHiDHiy) + gsl_vector_const_view xHiDHiy_g1=gsl_matrix_const_column (xHiDHiy_all_g, v1); + gsl_vector_const_view xHiDHiy_e1=gsl_matrix_const_column (xHiDHiy_all_e, v1); + gsl_vector_const_view QixHiDHiy_g2=gsl_matrix_const_column (QixHiDHiy_all_g, v2); + gsl_vector_const_view QixHiDHiy_e2=gsl_matrix_const_column (QixHiDHiy_all_e, v2); + + gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_g2.vector, &d); + yPDPDPy_gg-=d; + gsl_blas_ddot(&xHiDHiy_e1.vector, &QixHiDHiy_e2.vector, &d); + yPDPDPy_ee-=d; + gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_e2.vector, &d); + yPDPDPy_ge-=d; + + //fifth and sixth parts: +(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy) + gsl_vector_const_view QixHiDHiy_g1=gsl_matrix_const_column (QixHiDHiy_all_g, v1); + gsl_vector_const_view QixHiDHiy_e1=gsl_matrix_const_column (QixHiDHiy_all_e, v1); + + gsl_vector_const_view xHiDHixQixHiy_g1=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v1); + gsl_vector_const_view xHiDHixQixHiy_e1=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v1); + gsl_vector_const_view xHiDHixQixHiy_g2=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v2); + gsl_vector_const_view xHiDHixQixHiy_e2=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v2); + + gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_g2.vector, &d); + yPDPDPy_gg+=d; + gsl_blas_ddot(&xHiDHixQixHiy_g2.vector, &QixHiDHiy_g1.vector, &d); + yPDPDPy_gg+=d; + + gsl_blas_ddot(&xHiDHixQixHiy_e1.vector, &QixHiDHiy_e2.vector, &d); + yPDPDPy_ee+=d; + gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_e1.vector, &d); + yPDPDPy_ee+=d; + + gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_e2.vector, &d); + yPDPDPy_ge+=d; + gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_g1.vector, &d); + yPDPDPy_ge+=d; + + //seventh part: +(yHix)Qi(xHiDHiDHix)Qi(xHiy) + gsl_matrix_const_view xHiDHiDHix_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); + gsl_matrix_const_view xHiDHiDHix_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); + gsl_matrix_const_view xHiDHiDHix_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size); + + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_gg.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); + gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); + yPDPDPy_gg+=d; + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ee.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); + gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); + yPDPDPy_ee+=d; + gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ge.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy); + gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d); + yPDPDPy_ge+=d; + + //eighth part: -(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy) + gsl_vector_const_view QixHiDHixQixHiy_g1=gsl_matrix_const_column (QixHiDHixQixHiy_all_g, v1); + gsl_vector_const_view QixHiDHixQixHiy_e1=gsl_matrix_const_column (QixHiDHixQixHiy_all_e, v1); + + gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_g2.vector, &d); + yPDPDPy_gg-=d; + gsl_blas_ddot(&QixHiDHixQixHiy_e1.vector, &xHiDHixQixHiy_e2.vector, &d); + yPDPDPy_ee-=d; + gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_e2.vector, &d); + yPDPDPy_ge-=d; + + //free memory + gsl_vector_free(xHiDHiDHixQixHiy); + + return; +} + + +//calculate Edgeworth correctation factors for small samples +//notation and method follows Thomas J. Rothenberg, Econometirca 1984; 52 (4) +//M=xHiDHix +void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t d_size, double &crt_a, double &crt_b, double &crt_c) +{ + crt_a=0.0; crt_b=0.0; crt_c=0.0; + + size_t dc_size=Qi->size1, v_size=Hessian_inv->size1/2; + size_t c_size=dc_size/d_size; + double h_gg, h_ge, h_ee, d, B=0.0, C=0.0, D=0.0; + double trCg1, trCe1, trCg2, trCe2, trB_gg, trB_ge, trB_ee, trCC_gg, trCC_ge, trCC_ee, trD_gg=0.0, trD_ge=0.0, trD_ee=0.0; + + gsl_matrix *QiMQi_g1=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQi_e1=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQi_g2=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQi_e2=gsl_matrix_alloc (dc_size, dc_size); + + gsl_matrix *QiMQisQisi_g1=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *QiMQisQisi_e1=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *QiMQisQisi_g2=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *QiMQisQisi_e2=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix *QiMQiMQi_gg=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQiMQi_ge=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMQiMQi_ee=gsl_matrix_alloc (dc_size, dc_size); + + gsl_matrix *QiMMQi_gg=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMMQi_ge=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *QiMMQi_ee=gsl_matrix_alloc (dc_size, dc_size); + + gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix *M_dd=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *M_dcdc=gsl_matrix_alloc (dc_size, dc_size); + + //invert Qi_sub to Qi_si + gsl_matrix *Qi_sub=gsl_matrix_alloc (d_size, d_size); + + gsl_matrix_const_view Qi_s=gsl_matrix_const_submatrix (Qi, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size); + + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (d_size); + + gsl_matrix_memcpy (Qi_sub, &Qi_s.matrix); + LUDecomp (Qi_sub, pmt, &sig); + LUInvert (Qi_sub, pmt, Qi_si); + + gsl_permutation_free(pmt); + gsl_matrix_free(Qi_sub); + + //calculate correctation factors + for (size_t v1=0; v1size1, d_size=Hi->size1; + size_t c_size=dc_size/d_size; + size_t v_size=d_size*(d_size+1)/2; + size_t v1, v2; + double dev1_g, dev1_e, dev2_gg, dev2_ee, dev2_ge; + + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_matrix *xHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *xHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *xHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *xHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *xHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size); + + gsl_matrix *QixHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *QixHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *QixHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size); + gsl_matrix *QixHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size); + gsl_matrix *QixHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size); + + gsl_matrix *xHiDHiDHiy_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size); + gsl_matrix *xHiDHiDHiy_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size); + gsl_matrix *xHiDHiDHiy_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size); + gsl_matrix *xHiDHiDHix_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); + gsl_matrix *xHiDHiDHix_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); + gsl_matrix *xHiDHiDHix_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size); + + //calculate xHiDHiy_all, xHiDHix_all and xHiDHixQixHiy_all + Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e); + Calc_xHiDHix_all (eval, xHi, xHiDHix_all_g, xHiDHix_all_e); + Calc_xHiDHixQixHiy_all (xHiDHix_all_g, xHiDHix_all_e, QixHiy, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e); + + Calc_xHiDHiDHiy_all (v_size, eval, Hi, xHi, Hiy, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge); + Calc_xHiDHiDHix_all (v_size, eval, Hi, xHi, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge); + + //calculate QixHiDHiy_all, QixHiDHix_all and QixHiDHixQixHiy_all + Calc_QiVec_all (Qi, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e); + Calc_QiVec_all (Qi, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e); + Calc_QiMat_all (Qi, xHiDHix_all_g, xHiDHix_all_e, QixHiDHix_all_g, QixHiDHix_all_e); + + double tHiD_g, tHiD_e, tPD_g, tPD_e, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge, tPDPD_gg, tPDPD_ee, tPDPD_ge; + double yPDPy_g, yPDPy_e, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge; + + //calculate gradient and Hessian for Vg + for (size_t i1=0; i11) { + CalcCRT (Hessian_inv, Qi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, d_size, crt_a, crt_b, crt_c); + } else { + crt_a=0.0; crt_b=0.0; crt_c=0.0; + } + + gsl_matrix_free(xHiDHiy_all_g); + gsl_matrix_free(xHiDHiy_all_e); + gsl_matrix_free(xHiDHix_all_g); + gsl_matrix_free(xHiDHix_all_e); + gsl_matrix_free(xHiDHixQixHiy_all_g); + gsl_matrix_free(xHiDHixQixHiy_all_e); + + gsl_matrix_free(QixHiDHiy_all_g); + gsl_matrix_free(QixHiDHiy_all_e); + gsl_matrix_free(QixHiDHix_all_g); + gsl_matrix_free(QixHiDHix_all_e); + gsl_matrix_free(QixHiDHixQixHiy_all_g); + gsl_matrix_free(QixHiDHixQixHiy_all_e); + + gsl_matrix_free(xHiDHiDHiy_all_gg); + gsl_matrix_free(xHiDHiDHiy_all_ee); + gsl_matrix_free(xHiDHiDHiy_all_ge); + gsl_matrix_free(xHiDHiDHix_all_gg); + gsl_matrix_free(xHiDHiDHix_all_ee); + gsl_matrix_free(xHiDHiDHix_all_ge); + + return; +} + + +//update Vg, Ve +void UpdateVgVe (const gsl_matrix *Hessian_inv, const gsl_vector *gradient, const double step_scale, gsl_matrix *V_g, gsl_matrix *V_e) +{ + size_t v_size=gradient->size/2, d_size=V_g->size1; + size_t v; + + gsl_vector *vec_v=gsl_vector_alloc (v_size*2); + + double d; + + //vectorize Vg and Ve + for (size_t i=0; isize, c_size=X->size1, d_size=Y->size1; + size_t dc_size=d_size*c_size; + size_t v_size=d_size*(d_size+1)/2; + + double logdet_H, logdet_Q, yPy, logl_const, logl_old=0.0, logl_new=0.0, step_scale; + int sig; + size_t step_iter, flag_pd; + + gsl_matrix *Vg_save=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *Ve_save=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_temp=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *U_temp=gsl_matrix_alloc (d_size, d_size); + gsl_vector *D_temp=gsl_vector_alloc (d_size); + gsl_vector *xHiy=gsl_vector_alloc (dc_size); + gsl_vector *QixHiy=gsl_vector_alloc (dc_size); + gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size); + gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size); + + gsl_vector *gradient=gsl_vector_alloc (v_size*2); + + //calculate |XXt| and (XXt)^{-1} + gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt); + for (size_t i=0; i10 ) && step_iter<10 && t!=0); + + //terminate if change is small + if (t!=0) { + if (logl_newsize1; i++) { + for (size_t j=0; jsize2; j++) { + cout<size, c_size=X->size1, d_size=Y->size1; + double a, b, c; + double lambda, logl, vg, ve; + + //Initial the diagonal elements of Vg and Ve using univariate LMM and REML estimates + gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size); + gsl_vector *beta_temp=gsl_vector_alloc(c_size); + gsl_vector *se_beta_temp=gsl_vector_alloc(c_size); + + gsl_matrix_transpose_memcpy (Xt, X); + + for (size_t i=0; i4) { + //first obtain good initial values + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (2, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (2, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (2, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (2, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (2, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (2, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (2, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (2, 2*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (2, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (2*c_size, 2*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (6, 6); + + //2 by n matrix of Y + gsl_matrix *Y_sub=gsl_matrix_alloc (2, n_size); + gsl_matrix *Vg_sub=gsl_matrix_alloc (2, 2); + gsl_matrix *Ve_sub=gsl_matrix_alloc (2, 2); + gsl_matrix *B_sub=gsl_matrix_alloc (2, c_size); + + for (size_t i=0; isize1, d_size=UtY->size2, c_size=UtW->size2; + + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + gsl_vector *x_miss=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + + gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); + + gsl_vector_view X_row=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row.vector); + gsl_vector_view B_col=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col.vector); + + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); + + c=0; + Vg_remle_null.clear(); + Ve_remle_null.clear(); + for (size_t i=0; isize1; i++) { + for (size_t j=0; jsize2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + + cout<<"REMLE estimate for Vg in the null model: "<1) { + gsl_vector_set(x, i, 2-geno); + } + } + + //calculate statistics + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + if (p_score1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i b; + + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; + double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; + double crt_a, crt_b, crt_c; + int n_bit, n_miss, ci_total, ci_test; + double geno, x_mean; + size_t c=0; + // double s=0.0; + size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; + size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + + //large matrices for EM + gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size); + + //large matrices for NR + gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size); //each dxd block is H_k^{-1} + gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size); //each column is H_k^{-1}y_k + gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size); //each dcxdc block is x_k\otimes H_k^{-1} + gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2); + + gsl_vector *x=gsl_vector_alloc (n_size); + + gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size); + gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size); + gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1); + gsl_vector *beta=gsl_vector_alloc (d_size); + gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size); + + //null estimates for initial values + gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size); + gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1); + gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size); + + gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size); + gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size); + gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size); + + gsl_matrix_transpose_memcpy (Y, UtY); + gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW); + + gsl_vector_view X_row=gsl_matrix_row(X, c_size); + gsl_vector_set_zero(&X_row.vector); + gsl_vector_view B_col=gsl_matrix_column(B, c_size); + gsl_vector_set_zero(&B_col.vector); + + //time_start=clock(); + MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix); + + logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix); + logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null); + //cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<size1; i++) { + for (size_t j=0; jsize2; j++) { + beta_remle_null.push_back(gsl_matrix_get(B, i, j) ); + se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) ); + } + } + logl_remle_H0=logl_H0; + + cout.setf(std::ios_base::fixed, std::ios_base::floatfield); + cout.precision(4); + cout<<"REMLE estimate for Vg in the null model: "<=0) {break;} + //if (snpInfo[t].rs_number!="MAG18140902") {continue;} + //cout<1) { + gsl_vector_set(x, i, 2-geno); + } + } + + /* + if (t==0) { + ofstream outfile ("./snp1.txt", ofstream::out); + if (!outfile) {cout<<"error writing file: "<size; i++) { + outfile<1) {gsl_vector_scale(beta, -1.0);} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; + vector VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; + vector beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null; + double p_nr; + size_t em_iter, nr_iter; + double em_prec, nr_prec; + size_t crt; + + // Summary statistics + size_t ni_total, ni_test; //number of individuals + size_t ns_total, ns_test; //number of snps + size_t n_cvt; + size_t n_ph; + double time_UtX; //time spent on optimization iterations + double time_opt; //time spent on optimization iterations + + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + vector sumStat; //Output SNPSummary Data + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); + void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY); + void WriteFiles (); + +}; + +void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B); + +#endif + + diff --git a/param.cpp b/param.cpp new file mode 100644 index 0000000..edacc42 --- /dev/null +++ b/param.cpp @@ -0,0 +1,849 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + + + + +PARAM::PARAM(void): +mode_silence (false), a_mode (0), k_mode(1), d_pace (100000), +file_out("result"), +miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999), +l_min(1e-5), l_max(1e5), n_region(10),p_nr(0.001),em_prec(0.0001),nr_prec(0.0001),em_iter(10000),nr_iter(100),crt(0), +pheno_mean(0), +h_min(-1), h_max(-1), h_scale(-1), +rho_min(0.0), rho_max(1.0), rho_scale(-1), +logp_min(0.0), logp_max(0.0), logp_scale(-1), +s_min(0), s_max(300), +w_step(100000), s_step(1000000), +r_pace(10), w_pace(1000), +n_accept(0), +n_mh(10), +geo_mean(2000.0), +randseed(-1), +error(false), + n_cvt(1), n_vc(1), +time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) +{} + + +//read files +//obtain ns_total, ng_total, ns_test, ni_test +void PARAM::ReadFiles (void) +{ + string file_str; + if (!file_mk.empty()) { + if (CountFileLines (file_mk, n_vc)==false) {error=true;} + } + + if (!file_snps.empty()) { + if (ReadFile_snps (file_snps, setSnps)==false) {error=true;} + } else { + setSnps.clear(); + } + + //for prediction + if (!file_epm.empty()) { + if (ReadFile_est (file_epm, est_column, mapRS2est)==false) {error=true;} + + if (!file_bfile.empty()) { + file_str=file_bfile+".bim"; + if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} + + file_str=file_bfile+".fam"; + if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} + } + + if (!file_geno.empty()) { + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + + if (CountFileLines (file_geno, ns_total)==false) {error=true;} + } + + if (!file_ebv.empty() ) { + if (ReadFile_column (file_ebv, indicator_bv, vec_bv, 1)==false) {error=true;} + } + + if (!file_log.empty() ) { + if (ReadFile_log (file_log, pheno_mean)==false) {error=true;} + } + + //convert indicator_pheno to indicator_idv + int k=1; + for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_read[i]; + ni_test+=indicator_idv[i]; + } + + if (ni_test==0) { + error=true; + cout<<"error! number of analyzed individuals equals 0. "<1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<1) {cout<<"error! hwe level needs to be between 0 and 1. current value = "<1) {cout<<"error! r2 level needs to be between 0 and 1. current value = "<1) {cout<<"error! h values must be bewtween 0 and 1. current values = "<1) {cout<<"error! rho values must be between 0 and 1. current values = "<0) {cout<<"error! maximum logp value must be smaller than 0. current values = "<1.0) {cout<<"error! hscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! rscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! pscale value must be between 0 and 1. current value = "<1 && a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=43) { + cout<<"error! the current analysis mode "<1 && !file_gene.empty() ) { + cout<<"error! multiple phenotype analysis option not allowed with gene expression files. "<1) { + cout<<"error! pnr value must be between 0 and 1. current value = "<::size_type i=0; i<(indicator_idv).size(); ++i) { + if (indicator_idv[i]==0) {continue;} + ni_test++; + } + + ni_cvt=0; + for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_cvt[i]; + ni_test+=indicator_idv[i]; + } + } + + if ((indicator_read).size()!=0) { + ni_test=0; + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_read[i]; + ni_test+=indicator_idv[i]; + } + } + */ + if (ni_test==0) { + error=true; + cout<<"error! number of analyzed individuals equals 0. "<ns_test) {s_max=ns_test; cout<<"s_max is re-set to the number of analyzed SNPs."<size1; ++i) { + for (size_t j=0; jsize2; ++j) { + outfile<size; ++i) { + outfile<::size_type i=0; i set_remove; + + //check if any columns is an intercept + for (size_t i=0; isize2; i++) { + gsl_vector_view w_col=gsl_matrix_column (W, i); + gsl_vector_minmax (&w_col.vector, &v_min, &v_max); + if (v_min==v_max) {flag_ipt=1; set_remove.insert (i);} + } + + //add an intecept term if needed + if (n_cvt==set_remove.size()) { + indicator_cvt.clear(); + n_cvt=1; + } else if (flag_ipt==0) { + cout<<"no intecept term is found in the cvt file. a column of 1s is added."<::size_type i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_cvt[i]; + } + } + + //obtain ni_test + ni_test=0; + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + if (indicator_idv[i]==0) {continue;} + ni_test++; + } + + if (ni_test==0) { + error=true; + cout<<"error! number of analyzed individuals equals 0. "< cvt_row; + cvt_row.push_back(1); + + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_cvt.push_back(1); + + cvt.push_back(cvt_row); + } + } + + return; +} + + + + +void PARAM::CopyCvt (gsl_matrix *W) +{ + size_t ci_test=0; + + for (vector::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i. +*/ + +#ifndef __PARAM_H__ +#define __PARAM_H__ + +#include +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + +using namespace std; + + + +class SNPINFO { +public: + string chr; + string rs_number; + double cM; + long int base_position; + string a_minor; + string a_major; + size_t n_miss; + double missingness; + double maf; +}; + +//results for lmm +class SUMSTAT { +public: + double beta; //REML estimator for beta + double se; //SE for beta + double lambda_remle; //REML estimator for lambda + double lambda_mle; //MLE estimator for lambda + double p_wald; //p value from a Wald test + double p_lrt; //p value from a likelihood ratio test + double p_score; //p value from a score test +}; + +//results for mvlmm +class MPHSUMSTAT { +public: + vector v_beta; //REML estimator for beta + double p_wald; //p value from a Wald test + double p_lrt; //p value from a likelihood ratio test + double p_score; //p value from a score test + vector v_Vg; //estimator for Vg, right half + vector v_Ve; //estimator for Ve, right half + vector v_Vbeta; //estimator for Vbeta, right half +}; + + +//hyper-parameters for bslmm +class HYPBSLMM { +public: + double h; + double pve; + double rho; + double pge; + double logp; + + size_t n_gamma; +}; + + + + +class PARAM { +public: + // IO related parameters + bool mode_silence; + int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests + int k_mode; //kinship read mode: 1: n by n matrix, 2: id/id/k_value; + vector p_column; //which phenotype column needs analysis + size_t d_pace; //display pace + + string file_bfile; + string file_geno; + string file_pheno; + string file_anno; //optional + string file_cvt; //optional + string file_kin; + string file_ku, file_kd; + string file_mk; + string file_out; + + string file_epm; //estimated parameter file + string file_ebv; //estimated breeding value file + string file_log; //log file containing mean estimate + + string file_read; //file containing total number of reads + string file_gene; //gene expression file + + string file_snps; //file containing analyzed snps or genes + + + + // QC related parameters + double miss_level; + double maf_level; + double hwe_level; + double r2_level; + + // LMM related parameters + double l_min; + double l_max; + size_t n_region; + double l_mle_null, l_remle_null; + double logl_mle_H0, logl_remle_H0; + double pve_null, pve_se_null; + double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null; + vector Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; + vector VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; + vector beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null; + double p_nr; + double em_prec, nr_prec; + size_t em_iter, nr_iter; + size_t crt; + double pheno_mean; //phenotype mean from bslmm fitting or for prediction + + //for fitting multiple variance components + //the first three are of size n_vc, and the next two are of size n_vc+1 + vector v_traceG; + vector v_pve; + vector v_se_pve; + + vector v_sigma2; + vector v_se_sigma2; + vector v_beta; + vector v_se_beta; + + // BSLMM MCMC related parameters + double h_min, h_max, h_scale; //priors for h + double rho_min, rho_max, rho_scale; //priors for rho + double logp_min, logp_max, logp_scale; //priors for log(pi) + size_t s_min, s_max; //minimum and maximum number of gammas + size_t w_step; //number of warm up/burn in iterations + size_t s_step; //number of sampling iterations + size_t r_pace; //record pace + size_t w_pace; //write pace + size_t n_accept; //number of acceptance + size_t n_mh; //number of MH steps within each iteration + double geo_mean; //mean of the geometric distribution + long int randseed; + double trace_G; + + HYPBSLMM cHyp_initial; + + // Summary statistics + bool error; + size_t ni_total, ni_test, ni_cvt; //number of individuals + size_t np_obs, np_miss; //number of observed and missing phenotypes + size_t ns_total, ns_test; //number of snps + size_t ng_total, ng_test; //number of genes + size_t ni_control, ni_case; //number of controls and number of cases + size_t n_cvt; //number of covariates + size_t n_ph; //number of phenotypes + size_t n_vc; //number of variance components (including the diagonal matrix) + double time_total; //record total time + double time_G; //time spent on reading files the second time and calculate K + double time_eigen; //time spent on eigen-decomposition + double time_UtX; //time spent on calculating UX and Uy + double time_UtZ; //time spent on calculating UtZ, for probit BSLMM + double time_opt; //time spent on optimization iterations/or mcmc + double time_Omega; //time spent on calculating Omega + double time_hyp; //time spent on sampling hyper-parameters, in PMM + double time_Proposal; //time spend on constructing the proposal distribution (i.e. the initial lmm or lm analysis) + + // Data + vector > pheno; //a vector record all phenotypes, NA replaced with -9 + vector > cvt; //a vector record all covariates, NA replaced with -9 + vector > indicator_pheno; //a matrix record when a phenotype is missing for an individual; 0 missing, 1 available + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + vector indicator_cvt; //indicator for covariates, 0 missing, 1 available for analysis + + vector indicator_bv; //indicator for estimated breeding value file, 0 missing, 1 available for analysis + vector indicator_read; //indicator for read file, 0 missing, 1 available for analysis + vector vec_read; //total number of reads + vector vec_bv; //breeding values + vector est_column; + + map mapID2num; //map small ID number to number, from 0 to n-1 + map mapRS2chr; //map rs# to chromosome location + map mapRS2bp; //map rs# to base position + map mapRS2cM; //map rs# to cM + map mapRS2est; //map rs# to parameters + + vector snpInfo; //record SNP information + set setSnps; //a set of snps for analysis + + //constructor + PARAM(); + + //functions + void ReadFiles (); + void CheckParam (); + void CheckData (); + void PrintSummary (); + void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + void CheckCvt (); + void CopyCvt (gsl_matrix *W); + void ProcessCvtPhen(); + void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag); + void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag); + void CalcKin (gsl_matrix *matrix_kin); + void WriteMatrix (const gsl_matrix *matrix_U, const string suffix); + void WriteVector (const gsl_vector *vector_D, const string suffix); + void CopyRead (gsl_vector *log_N); +}; + + +#endif + diff --git a/prdt.cpp b/prdt.cpp new file mode 100644 index 0000000..7570d36 --- /dev/null +++ b/prdt.cpp @@ -0,0 +1,543 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + + +#include "io.h" +#include "lapack.h" //for functions EigenDecomp +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "io_float.h" +#include "prdt_float.h" +#include "mathfunc_float.h" +#else +#include "io.h" +#include "prdt.h" +#include "mathfunc.h" +#endif + +using namespace std; + + + + +void PRDT::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + + indicator_pheno=cPar.indicator_pheno; + indicator_cvt=cPar.indicator_cvt; + indicator_idv=cPar.indicator_idv; + + snpInfo=cPar.snpInfo; + mapRS2est=cPar.mapRS2est; + + time_eigen=0; + + n_ph=cPar.n_ph; + np_obs=cPar.np_obs; + np_miss=cPar.np_miss; + ns_total=cPar.ns_total; + ns_test=0; + + return; +} + +void PRDT::CopyToParam (PARAM &cPar) +{ + cPar.ns_test=ns_test; + cPar.time_eigen=time_eigen; + + return; +} + + + + +void PRDT::WriteFiles (gsl_vector *y_prdt) +{ + string file_str; + file_str="./output/"+file_out; + file_str+="."; + file_str+="prdt"; + file_str+=".txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<size2; j++) { + outfile<size, ni_total=G->size1; + + gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test); + gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test); + gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test); + gsl_vector *eval=gsl_vector_alloc (ni_test); + gsl_vector *Utu=gsl_vector_alloc (ni_test); + gsl_vector *w=gsl_vector_alloc (ni_total); + gsl_permutation *pmt=gsl_permutation_alloc (ni_test); + + //center matrix G based on indicator_idv + for (size_t i=0; isize; i++) { + if (gsl_vector_get(eval,i)<1e-10) {gsl_vector_set(eval, i, 0);} + } + + time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu); + for (size_t i=0; isize; i++) { + d=gsl_vector_get(eval, i); + if (d!=0) {d=gsl_vector_get(Utu, i)/d; gsl_vector_set(Utu, i, d);} + } + gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval); + gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt); + + //free matrices + gsl_matrix_free(Goo); + gsl_matrix_free(Gfo); + gsl_matrix_free(U); + gsl_vector_free(eval); + gsl_vector_free(Utu); + gsl_vector_free(w); + gsl_permutation_free(pmt); + + return; +} + + + +void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) +{ + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size); + gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size); + + ns_test=0; + + //start reading genotypes and analyze + for (size_t t=0; tsize==n_miss) {cout<<"snp "<size-n_miss); + x_train_mean/=(double)(n_train_nomiss); + + + for (size_t i=0; isize; ++i) { + geno=gsl_vector_get(x, i); + if (gsl_vector_get (x_miss, i)==0) { + gsl_vector_set(x, i, x_mean-x_train_mean); + } else { + gsl_vector_set(x, i, geno-x_train_mean); + } + } + + gsl_vector_scale (x, effect_size); + gsl_vector_add (y_prdt, x); + + ns_test++; + } + cout< b; + string rs; + + size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss; + double geno, x_mean, x_train_mean, effect_size; + + gsl_vector *x=gsl_vector_alloc (y_prdt->size); + + //calculate n_bit and c, the number of bit for each snp + if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;} + else {n_bit=indicator_idv.size()/4+1; } + + //print the first three majic numbers + for (size_t i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + ns_test=0; + + for (vector::size_type t=0; tsize==n_miss) {cout<<"snp "<size-n_miss); + x_train_mean/=(double)(n_train_nomiss); + + for (size_t i=0; isize; ++i) { + geno=gsl_vector_get(x, i); + if (geno==-9) { + gsl_vector_set(x, i, x_mean-x_train_mean); + } else { + gsl_vector_set(x, i, geno-x_train_mean); + } + } + + gsl_vector_scale (x, effect_size); + gsl_vector_add (y_prdt, x); + + ns_test++; + } + cout<::size_type i1=0; i1::size_type j1=0; j1::size_type i2=0; i2::size_type j2=0; j2::size_type i=0; i::size_type j=0; j::size_type i=0; i::size_type j=0; j::size_type i=0; i::size_type j=0; j<2; ++j) { + if (indicator_pheno[i][j]==1) { + gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j+k*2)-gsl_matrix_get (Y_hat, i, j) ); + c_obs1++; + } else { + gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, i, j) ); + c_miss1++; + } + } + } + + LUSolve (H_oo, pmt, y_obs, Hiy); + + gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss); + + //put back predicted y_miss to Y_full + c_miss1=0; + for (vector::size_type i=0; i::size_type j=0; j<2; ++j) { + if (indicator_pheno[i][j]==0) { + gsl_matrix_set (Y_full, i, j+k*2, gsl_vector_get (y_miss, c_miss1) ); + c_miss1++; + } + } + } + } + } +*/ + //free matrices + gsl_vector_free(y_obs); + gsl_vector_free(y_miss); + gsl_matrix_free(H_oo); + gsl_matrix_free(H_mo); + gsl_vector_free(Hiy); + + return; +} + + diff --git a/prdt.h b/prdt.h new file mode 100644 index 0000000..69043df --- /dev/null +++ b/prdt.h @@ -0,0 +1,80 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __PRDT_H__ +#define __PRDT_H__ + + +#include +#include +#include +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + +using namespace std; + +class PRDT { + +public: + // IO related parameters + size_t a_mode; + size_t d_pace; + + string file_bfile; + string file_geno; + string file_out; + + vector > indicator_pheno; + vector indicator_cvt; + vector indicator_idv; + vector snpInfo; + map mapRS2est; + + size_t n_ph; + size_t np_obs, np_miss; + size_t ns_total; + size_t ns_test; + + double time_eigen; + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void WriteFiles (gsl_vector *y_prdt); + void WriteFiles (gsl_matrix *Y_full); + void AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt); + void AnalyzeBimbam (gsl_vector *y_prdt); + void AnalyzePlink (gsl_vector *y_prdt); + void MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, gsl_matrix *Y_full); +}; + + +#endif + + + + + + + diff --git a/vc.cpp b/vc.cpp new file mode 100644 index 0000000..77cf746 --- /dev/null +++ b/vc.cpp @@ -0,0 +1,443 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + + + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_multiroots.h" +#include "gsl/gsl_min.h" + +#include "io.h" +#include "lapack.h" +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "lmm_float.h" +#include "vc_float.h" +#else +#include "lmm.h" +#include "vc.h" +#endif + + + +using namespace std; + + +//in this file, X, Y are already transformed (i.e. UtX and UtY) + + +void VC::CopyFromParam (PARAM &cPar) +{ + file_out=cPar.file_out; + + // v_sigma2=cPar.v_sigma2; + + time_UtX=0.0; + time_opt=0.0; + + v_traceG=cPar.v_traceG; + + return; +} + + +void VC::CopyToParam (PARAM &cPar) +{ + cPar.time_UtX=time_UtX; + cPar.time_opt=time_opt; + + cPar.v_sigma2=v_sigma2; + cPar.v_se_sigma2=v_se_sigma2; + cPar.v_pve=v_pve; + cPar.v_se_pve=v_se_pve; + cPar.v_traceG=v_traceG; + + cPar.v_beta=v_beta; + cPar.v_se_beta=v_se_beta; + + return; +} + + + +void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p) +{ + size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1, n_cvt=(p->W)->size2; + + gsl_matrix *K_temp=gsl_matrix_alloc(n1, n1); + gsl_matrix *HiW=gsl_matrix_alloc(n1, n_cvt); + gsl_matrix *WtHiW=gsl_matrix_alloc(n_cvt, n_cvt); + gsl_matrix *WtHiWi=gsl_matrix_alloc(n_cvt, n_cvt); + gsl_matrix *WtHiWiWtHi=gsl_matrix_alloc(n_cvt, n1); + + double sigma2; + //calculate H=\sum_i^{k+1} \sigma_i^2 K_i + gsl_matrix_set_zero (p->P); + for (size_t i=0; iK, 0, n1*i, n1, n1); + gsl_matrix_memcpy (K_temp, &K_sub.matrix); + } + + sigma2=exp(gsl_vector_get (log_sigma2, i) ); + gsl_matrix_scale(K_temp, sigma2); + gsl_matrix_add (p->P, K_temp); + } + + //calculate H^{-1} + int sig; + gsl_permutation * pmt1=gsl_permutation_alloc (n1); + LUDecomp (p->P, pmt1, &sig); + LUInvert (p->P, pmt1, K_temp); + gsl_permutation_free(pmt1); + + gsl_matrix_memcpy (p->P, K_temp); + + //calculate P=H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1} + gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW); + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW); + + gsl_permutation * pmt2=gsl_permutation_alloc (n_cvt); + LUDecomp (WtHiW, pmt2, &sig); + LUInvert (WtHiW, pmt2, WtHiWi); + gsl_permutation_free(pmt2); + + gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); + gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P); + + //calculate Py, KPy, PKPy + gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py); + + for (size_t i=0; iKPy_mat, i); + gsl_vector_view PKPy=gsl_matrix_column (p->PKPy_mat, i); + + if (i==n_vc) { + gsl_vector_memcpy (&KPy.vector, p->Py); + } else { + gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); + gsl_blas_dgemv(CblasNoTrans, 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector); + } + + gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector); + } + + gsl_matrix_free (K_temp); + gsl_matrix_free (HiW); + gsl_matrix_free (WtHiW); + gsl_matrix_free (WtHiWi); + gsl_matrix_free (WtHiWiWtHi); + + return; +} + + +//below are functions for AI algorithm +int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1) +{ + VC_PARAM *p=(VC_PARAM *) params; + + size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1; + + double tr, d; + + //update parameters + UpdateParam (log_sigma2, p); + + //calculate dev1=-0.5*trace(PK_i)+0.5*yPKPy + for (size_t i=0; iP, l, l); + } + } else { + tr=0; + for (size_t l=0; lP, l); + gsl_vector_const_view K_col=gsl_matrix_const_column (p->K, n1*i+l); + gsl_blas_ddot(&P_row.vector, &K_col.vector, &d); + tr+=d; + } + } + + gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); + gsl_blas_ddot(p->Py, &KPy_i.vector, &d); + + d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i)); + + gsl_vector_set(dev1, i, d); + } + + return GSL_SUCCESS; +} + + + +int LogRL_dev2 (const gsl_vector *log_sigma2, void *params, gsl_matrix *dev2) +{ + VC_PARAM *p=(VC_PARAM *) params; + + size_t n_vc=log_sigma2->size-1; + + double d, sigma2_i, sigma2_j; + + //update parameters + UpdateParam (log_sigma2, p); + + //calculate dev2=0.5(yPKPKPy) + for (size_t i=0; iKPy_mat, i); + sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + + for (size_t j=i; jPKPy_mat, j); + + gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); + sigma2_j=exp(gsl_vector_get(log_sigma2, j)); + + d*=-0.5*sigma2_i*sigma2_j; + + gsl_matrix_set(dev2, i, j, d); + if (j!=i) {gsl_matrix_set(dev2, j, i, d);} + } + } + + gsl_matrix_memcpy (p->Hessian, dev2); + + return GSL_SUCCESS; +} + + + +int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, gsl_matrix *dev2) +{ + VC_PARAM *p=(VC_PARAM *) params; + + size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1; + + double tr, d, sigma2_i, sigma2_j; + + //update parameters + UpdateParam (log_sigma2, p); + + //calculate dev1=-0.5*trace(PK_i)+0.5*yPKPy + //calculate dev2=0.5(yPKPKPy) + for (size_t i=0; iP, l, l); + } + } else { + tr=0; + for (size_t l=0; lP, l); + gsl_vector_const_view K_col=gsl_matrix_const_column (p->K, n1*i+l); + gsl_blas_ddot(&P_row.vector, &K_col.vector, &d); + tr+=d; + } + } + + gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); + gsl_blas_ddot(p->Py, &KPy_i.vector, &d); + + sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + d=(-0.5*tr+0.5*d)*sigma2_i; + + gsl_vector_set(dev1, i, d); + + for (size_t j=i; jPKPy_mat, j); + gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); + + sigma2_j=exp(gsl_vector_get(log_sigma2, j)); + d*=-0.5*sigma2_i*sigma2_j; + + gsl_matrix_set(dev2, i, j, d); + if (j!=i) {gsl_matrix_set(dev2, j, i, d);} + } + + } + + gsl_matrix_memcpy (p->Hessian, dev2); + + return GSL_SUCCESS; +} + + + + +void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) +{ + size_t n1=K->size1, n2=K->size2; + size_t n_vc=n2/n1; + gsl_vector *log_sigma2=gsl_vector_alloc (n_vc+1); + double d, s; + + //set up params + gsl_matrix *P=gsl_matrix_alloc (n1, n1); + gsl_vector *Py=gsl_vector_alloc (n1); + gsl_matrix *KPy_mat=gsl_matrix_alloc (n1, n_vc+1); + gsl_matrix *PKPy_mat=gsl_matrix_alloc (n1, n_vc+1); + gsl_vector *dev1=gsl_vector_alloc (n_vc+1); + gsl_matrix *dev2=gsl_matrix_alloc (n_vc+1, n_vc+1); + gsl_matrix *Hessian=gsl_matrix_alloc (n_vc+1, n_vc+1); + VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian}; + + //initialize sigma2/log_sigma2 + gsl_blas_ddot (y, y, &s); + s/=(double)n1; + for (size_t i=0; if, 1e-3); + } + while (status==GSL_CONTINUE && iterf, ¶ms, dev1, dev2); + + gsl_permutation * pmt=gsl_permutation_alloc (n_vc+1); + LUDecomp (dev2, pmt, &sig); + LUInvert (dev2, pmt, Hessian); + gsl_permutation_free(pmt); + + //save data + v_sigma2.clear(); + for (size_t i=0; ix, i)); + v_sigma2.push_back(d); + } + + v_se_sigma2.clear(); + for (size_t i=0; i. +*/ + +#ifndef __VC_H__ +#define __VC_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + + +class VC_PARAM +{ + +public: + const gsl_matrix *K; + const gsl_matrix *W; + const gsl_vector *y; + gsl_matrix *P; + gsl_vector *Py; + gsl_matrix *KPy_mat; + gsl_matrix *PKPy_mat; + gsl_matrix *Hessian; +}; + + + + +class VC { + +public: + // IO related parameters + string file_out; + + vector v_sigma2; + vector v_se_sigma2; + vector v_pve; + vector v_se_pve; + vector v_traceG; + vector v_beta; + vector v_se_beta; + + double time_UtX; + double time_opt; + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); +}; + +#endif + + -- cgit v1.2.3