From 0dd4e05fc8babc1517de1d7981a99ad0a5241a5e Mon Sep 17 00:00:00 2001 From: Peter Carbonetto Date: Thu, 4 May 2017 14:43:12 -0500 Subject: Added new files shared by Xiang via email on May 4, 2017. --- src/bslmmdap.cpp | 1015 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1015 insertions(+) create mode 100644 src/bslmmdap.cpp (limited to 'src/bslmmdap.cpp') diff --git a/src/bslmmdap.cpp b/src/bslmmdap.cpp new file mode 100644 index 0000000..0bf0e7b --- /dev/null +++ b/src/bslmmdap.cpp @@ -0,0 +1,1015 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_randist.h" +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" + + + +#include "logistic.h" +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "bslmmdap_float.h" +#include "lmm_float.h" //for class FUNC_PARAM and MatrixCalcLR +#include "lm_float.h" +#include "mathfunc_float.h" //for function CenterVector +#else +#include "param.h" +#include "bslmmdap.h" +#include "lmm.h" +#include "lm.h" +#include "mathfunc.h" +#endif + +using namespace std; + + + + +void BSLMMDAP::CopyFromParam (PARAM &cPar) +{ + file_out=cPar.file_out; + path_out=cPar.path_out; + + time_UtZ=0.0; + time_Omega=0.0; + + h_min=cPar.h_min; + h_max=cPar.h_max; + h_ngrid=cPar.h_ngrid; + rho_min=cPar.rho_min; + rho_max=cPar.rho_max; + rho_ngrid=cPar.rho_ngrid; + + if (h_min<=0) {h_min=0.01;} + if (h_max>=1) {h_max=0.99;} + if (rho_min<=0) {rho_min=0.01;} + if (rho_max>=1) {rho_max=0.99;} + + trace_G=cPar.trace_G; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void BSLMMDAP::CopyToParam (PARAM &cPar) +{ + cPar.time_UtZ=time_UtZ; + cPar.time_Omega=time_Omega; + + return; +} + + + +//read hyp file +void ReadFile_hyb (const string &file_hyp, vector &vec_sa2, vector &vec_sb2, vector &vec_wab) +{ + vec_sa2.clear(); vec_sb2.clear(); vec_wab.clear(); + + igzstream infile (file_hyp.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open hyp file: "< &vec_rs, vector > > &BF) +{ + BF.clear(); vec_rs.clear(); + + igzstream infile (file_bf.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open bf file: "< vec_bf; + vector > mat_bf; + char *ch_ptr; + + size_t bf_size, flag_block; + + getline(infile, line); + + size_t t=0; + while (!safeGetline(infile, line).eof()) { + flag_block=0; + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + rs=ch_ptr; + vec_rs.push_back(rs); + + ch_ptr=strtok (NULL, " , \t"); + if (t==0) { + block=ch_ptr; + } else { + if (strcmp(ch_ptr, block.c_str() )!=0) { + flag_block=1; + block=ch_ptr; + } + } + + ch_ptr=strtok (NULL, " , \t"); + while (ch_ptr!=NULL) { + vec_bf.push_back(atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } + + if (t==0) { + bf_size=vec_bf.size(); + } else { + if (bf_size!=vec_bf.size()) {cout<<"error! unequal row size in bf file."< &vec_rs, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel, size_t &kc, size_t &kd) +{ + igzstream infile (file_cat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open category file: "< > mapRS2catc; + map > mapRS2catd; + vector catc; + vector catd; + + //read the following lines to record mapRS2cat + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + catc.clear(); catd.clear(); + + for (size_t i=0; i0) {mapRS2catc[rs]=catc;} + if (mapRS2catd.count(rs)==0 && kd>0) {mapRS2catd[rs]=catd;} + } + + //load into Ad and Ac + if (kc>0) { + Ac=gsl_matrix_alloc(vec_rs.size(), kc); + for (size_t i=0; i0) { + Ad=gsl_matrix_int_alloc(vec_rs.size(), kd); + + for (size_t i=0; i rcd; + int val; + for (size_t j=0; jsize1; i++) { + val = gsl_matrix_int_get(Ad, i, j); + rcd[val] = 1; + } + gsl_vector_int_set (dlevel, j, rcd.size()); + } + } + + infile.clear(); + infile.close(); + + return; +} + + + + + + + + +void BSLMMDAP::WriteResult (const gsl_matrix *Hyper, const gsl_matrix *BF) +{ + string file_bf, file_hyp; + file_bf=path_out+"/"+file_out; + file_bf+=".bf.txt"; + file_hyp=path_out+"/"+file_out; + file_hyp+=".hyp.txt"; + + ofstream outfile_bf, outfile_hyp; + + outfile_bf.open (file_bf.c_str(), ofstream::out); + outfile_hyp.open (file_hyp.c_str(), ofstream::out); + + if (!outfile_bf) {cout<<"error writing file: "<size1; i++) { + for (size_t j=0; jsize2; j++) { + outfile_hyp<size2; i++) { + outfile_bf<<"\t"<<"BF"<size2; j++) { + outfile_bf<<"\t"< &vec_rs, const gsl_matrix *Hyper, const gsl_vector *pip, const gsl_vector *coef) +{ + string file_gamma, file_hyp, file_coef; + file_gamma=path_out+"/"+file_out; + file_gamma+=".gamma.txt"; + file_hyp=path_out+"/"+file_out; + file_hyp+=".hyp.txt"; + file_coef=path_out+"/"+file_out; + file_coef+=".coef.txt"; + + ofstream outfile_gamma, outfile_hyp, outfile_coef; + + outfile_gamma.open (file_gamma.c_str(), ofstream::out); + outfile_hyp.open (file_hyp.c_str(), ofstream::out); + outfile_coef.open (file_coef.c_str(), ofstream::out); + + if (!outfile_gamma) {cout<<"error writing file: "<size1; i++) { + for (size_t j=0; jsize2; j++) { + outfile_hyp<size; i++) { + outfile_coef< &rank) +{ + size_t pos; + for (size_t i=0; isize); + + double logm=0.0; + double d, uy, Hi_yy=0, logdet_H=0.0; + for (size_t i=0; isize1, UtXgamma->size2); + gsl_matrix *Omega=gsl_matrix_alloc (UtXgamma->size2, UtXgamma->size2); + gsl_vector *XtHiy=gsl_vector_alloc (UtXgamma->size2); + gsl_vector *beta_hat=gsl_vector_alloc (UtXgamma->size2); + gsl_vector *weight_Hi=gsl_vector_alloc (UtXgamma->size1); + + gsl_matrix_memcpy (UtXgamma_eval, UtXgamma); + + logdet_H=0.0; P_yy=0.0; + for (size_t i=0; i vec_sa2, vec_sb2, logm_null; + + gsl_matrix *BF=gsl_matrix_alloc(ns_test, n_grid); + gsl_matrix *Xgamma=gsl_matrix_alloc(ni_test, 1); + gsl_matrix *Hyper=gsl_matrix_alloc(n_grid, 5); + + //compute tau by using yty + gsl_blas_ddot (Uty, Uty, &tau); + tau=(double)ni_test/tau; + + //set up grid values for sigma_a2 and sigma_b2 based on an approximately even grid for h and rho, and a fixed number of causals + size_t ij=0; + for (size_t i=0; i sum_pip; + map sum; + + int levels = gsl_vector_int_get(dlevel,0); + + for(int i=0;isize1;i++){ + int cat = gsl_matrix_int_get(Xd,i,0); + sum_pip[cat] += gsl_vector_get(pip_vec,i); + sum[cat] += 1; + } + + for(int i=0;isize1;i++){ + int cat = gsl_matrix_int_get(Xd,i,0); + gsl_vector_set(prior_vec,i,sum_pip[cat]/sum[cat]); + } + + //double baseline=0; + for(int i=0;i &vec_rs, const vector &vec_sa2, const vector &vec_sb2, const vector &wab, const vector > > &BF, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel) { + clock_t time_start; + + //set up BF + double h, rho, sigma_a2, sigma_b2, d, s, logm, logm_save; + size_t t1, t2; + size_t n_grid=wab.size(), ns_test=vec_rs.size(); + + gsl_vector *prior_vec=gsl_vector_alloc(ns_test); + gsl_matrix *Hyper=gsl_matrix_alloc(n_grid, 5); + gsl_vector *pip=gsl_vector_alloc(ns_test); + gsl_vector *coef=gsl_vector_alloc(kc+kd+1); + + //perform the EM algorithm + vector vec_wab, vec_wab_new; + + //initial values + for (size_t t=0; t1e-3) { + //update E_gamma + t1=0, t2=0; + for (size_t b=0; bsize; t++) { + s+=gsl_vector_get(pip, t); + } + s=s/(double)pip->size; + for (size_t t=0; tsize; t++) { + gsl_vector_set(prior_vec, t, s); + } + + gsl_vector_set (coef, 0, log(s/(1-s))); + } else if(kc==0 && kd!=0){//only discrete annotations + if(kd == 1){ + single_ct_regression(Ad, dlevel, pip, coef, prior_vec); + }else{ + logistic_cat_fit(coef, Ad, dlevel, pip, 0, 0); + logistic_cat_pred(coef, Ad, dlevel, prior_vec); + } + } else if (kc!=0 && kd==0) {//only continuous annotations + logistic_cont_fit(coef, Ac, pip, 0, 0); + logistic_cont_pred(coef, Ac, prior_vec); + } else if (kc!=0 && kd!=0) {//both continuous and categorical annotations + logistic_mixed_fit(coef, Ad, dlevel, Ac, pip, 0, 0); + logistic_mixed_pred(coef, Ad, dlevel, Ac, prior_vec); + } + + //compute marginal likelihood + logm=0; + + t1=0; + for (size_t b=0; b0) { + dif=logm-logm_save; + } + logm_save=logm; + it++; + + cout<<"iteration = "<10) {break;} + + if (a_mode==13) { + SampleZ (y, z_hat, z); + mean_z=CenterVector (z); + + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, z, 0.0, Utz); + time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //First proposal + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old); + beta_old.clear(); + for (size_t i=0; isize; ++i) { + beta_old.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + } + + + delete [] p_gamma; + beta_g.clear(); + + return; +} + +*/ + + + + + + +/* +//below fits MCMC for rho=1 +void BSLMM::CalcXtX (const gsl_matrix *X, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX, gsl_vector *Xty) +{ + time_t time_start=clock(); + gsl_matrix_const_view X_sub=gsl_matrix_const_submatrix(X, 0, 0, X->size1, s_size); + gsl_matrix_view XtX_sub=gsl_matrix_submatrix(XtX, 0, 0, s_size, s_size); + gsl_vector_view Xty_sub=gsl_vector_subvector(Xty, 0, s_size); + +#ifdef WITH_LAPACK + lapack_dgemm ((char *)"T", (char *)"N", 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#endif + gsl_blas_dgemv(CblasTrans, 1.0, &X_sub.matrix, y, 0.0, &Xty_sub.vector); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + return; +} + + + +double BSLMM::CalcPosterior (const double yty, class HYPBSLMM &cHyp) +{ + double logpost=0.0; + + //for quantitative traits, calculate pve and pge + //pve and pge for case/control data are calculted in CalcCC_PVEnZ + if (a_mode==11) { + cHyp.pve=0.0; + cHyp.pge=1.0; + } + + //calculate likelihood + if (a_mode==11) {logpost-=0.5*(double)ni_test*log(yty);} + else {logpost-=0.5*yty;} + + logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1-exp(cHyp.logp)); + + return logpost; +} + + +double BSLMM::CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp) +{ + double sigma_a2=cHyp.h/( (1-cHyp.h)*exp(cHyp.logp)*(double)ns_test); + double logpost=0.0; + double d, P_yy=yty, logdet_O=0.0; + + gsl_matrix_const_view Xgamma_sub=gsl_matrix_const_submatrix (Xgamma, 0, 0, Xgamma->size1, s_size); + gsl_matrix_const_view XtX_sub=gsl_matrix_const_submatrix (XtX, 0, 0, s_size, s_size); + gsl_vector_const_view Xty_sub=gsl_vector_const_subvector (Xty, 0, s_size); + + gsl_matrix *Omega=gsl_matrix_alloc (s_size, s_size); + gsl_matrix *M_temp=gsl_matrix_alloc (s_size, s_size); + gsl_vector *beta_hat=gsl_vector_alloc (s_size); + gsl_vector *Xty_temp=gsl_vector_alloc (s_size); + + gsl_vector_memcpy (Xty_temp, &Xty_sub.vector); + + //calculate Omega + gsl_matrix_memcpy (Omega, &XtX_sub.matrix); + gsl_matrix_scale (Omega, sigma_a2); + gsl_matrix_set_identity (M_temp); + gsl_matrix_add (Omega, M_temp); + + //calculate beta_hat + logdet_O=CholeskySolve(Omega, Xty_temp, beta_hat); + gsl_vector_scale (beta_hat, sigma_a2); + + gsl_blas_ddot (Xty_temp, beta_hat, &d); + P_yy-=d; + + //sample tau + double tau=1.0; + if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0, 2.0/P_yy); } + + //sample beta + for (size_t i=0; i