diff options
author | Peter Carbonetto | 2017-05-04 14:43:12 -0500 |
---|---|---|
committer | Peter Carbonetto | 2017-05-04 14:43:12 -0500 |
commit | 0dd4e05fc8babc1517de1d7981a99ad0a5241a5e (patch) | |
tree | 759b47320ed404951ecb745e228c1fcc0a2200d5 | |
parent | c18588b6d00650b9ce742229fdf1eca7133f58fc (diff) | |
download | pangemma-0dd4e05fc8babc1517de1d7981a99ad0a5241a5e.tar.gz |
Added new files shared by Xiang via email on May 4, 2017.
-rw-r--r-- | src/bslmmdap.cpp | 1015 | ||||
-rw-r--r-- | src/bslmmdap.h | 101 | ||||
-rw-r--r-- | src/ldr.cpp | 285 | ||||
-rw-r--r-- | src/ldr.h | 77 | ||||
-rw-r--r-- | src/logistic.cpp | 783 | ||||
-rw-r--r-- | src/logistic.h | 70 | ||||
-rw-r--r-- | src/varcov.cpp | 482 | ||||
-rw-r--r-- | src/varcov.h | 72 |
8 files changed, 2885 insertions, 0 deletions
diff --git a/src/bslmmdap.cpp b/src/bslmmdap.cpp new file mode 100644 index 0000000..0bf0e7b --- /dev/null +++ b/src/bslmmdap.cpp @@ -0,0 +1,1015 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <iostream> +#include <fstream> +#include <sstream> + +#include <iomanip> +#include <cmath> +#include <iostream> +#include <stdio.h> +#include <stdlib.h> +#include <ctime> +#include <cstring> +#include <algorithm> + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_randist.h" +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" + + + +#include "logistic.h" +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "bslmmdap_float.h" +#include "lmm_float.h" //for class FUNC_PARAM and MatrixCalcLR +#include "lm_float.h" +#include "mathfunc_float.h" //for function CenterVector +#else +#include "param.h" +#include "bslmmdap.h" +#include "lmm.h" +#include "lm.h" +#include "mathfunc.h" +#endif + +using namespace std; + + + + +void BSLMMDAP::CopyFromParam (PARAM &cPar) +{ + file_out=cPar.file_out; + path_out=cPar.path_out; + + time_UtZ=0.0; + time_Omega=0.0; + + h_min=cPar.h_min; + h_max=cPar.h_max; + h_ngrid=cPar.h_ngrid; + rho_min=cPar.rho_min; + rho_max=cPar.rho_max; + rho_ngrid=cPar.rho_ngrid; + + if (h_min<=0) {h_min=0.01;} + if (h_max>=1) {h_max=0.99;} + if (rho_min<=0) {rho_min=0.01;} + if (rho_max>=1) {rho_max=0.99;} + + trace_G=cPar.trace_G; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void BSLMMDAP::CopyToParam (PARAM &cPar) +{ + cPar.time_UtZ=time_UtZ; + cPar.time_Omega=time_Omega; + + return; +} + + + +//read hyp file +void ReadFile_hyb (const string &file_hyp, vector<double> &vec_sa2, vector<double> &vec_sb2, vector<double> &vec_wab) +{ + vec_sa2.clear(); vec_sb2.clear(); vec_wab.clear(); + + igzstream infile (file_hyp.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open hyp file: "<<file_hyp<<endl; return;} + + string line; + char *ch_ptr; + + getline(infile, line); + + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + ch_ptr=strtok (NULL, " , \t"); + vec_sa2.push_back(atof(ch_ptr)); + + ch_ptr=strtok (NULL, " , \t"); + vec_sb2.push_back(atof(ch_ptr)); + + ch_ptr=strtok (NULL, " , \t"); + vec_wab.push_back(atof(ch_ptr)); + } + + infile.close(); + infile.clear(); + + return; +} + + +//read bf file +void ReadFile_bf (const string &file_bf, vector<string> &vec_rs, vector<vector<vector<double> > > &BF) +{ + BF.clear(); vec_rs.clear(); + + igzstream infile (file_bf.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open bf file: "<<file_bf<<endl; return;} + + string line, rs, block; + vector<double> vec_bf; + vector<vector<double> > mat_bf; + char *ch_ptr; + + size_t bf_size, flag_block; + + getline(infile, line); + + size_t t=0; + while (!safeGetline(infile, line).eof()) { + flag_block=0; + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + rs=ch_ptr; + vec_rs.push_back(rs); + + ch_ptr=strtok (NULL, " , \t"); + if (t==0) { + block=ch_ptr; + } else { + if (strcmp(ch_ptr, block.c_str() )!=0) { + flag_block=1; + block=ch_ptr; + } + } + + ch_ptr=strtok (NULL, " , \t"); + while (ch_ptr!=NULL) { + vec_bf.push_back(atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } + + if (t==0) { + bf_size=vec_bf.size(); + } else { + if (bf_size!=vec_bf.size()) {cout<<"error! unequal row size in bf file."<<endl;} + } + + if (flag_block==0) { + mat_bf.push_back(vec_bf); + } else { + BF.push_back(mat_bf); + mat_bf.clear(); + } + vec_bf.clear(); + + t++; + } + + infile.close(); + infile.clear(); + + return; +} + + +//read category files +//read both continuous and discrete category file, record mapRS2catc +void ReadFile_cat (const string &file_cat, const vector<string> &vec_rs, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel, size_t &kc, size_t &kd) +{ + igzstream infile (file_cat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open category file: "<<file_cat<<endl; return;} + + string line; + char *ch_ptr; + + string rs, chr, a1, a0, pos, cm; + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + //use the header to determine the number of categories + kc=header.catc_col.size(); kd=header.catd_col.size(); + + //set up storage and mapper + map<string, vector<double> > mapRS2catc; + map<string, vector<int> > mapRS2catd; + vector<double> catc; + vector<int> catd; + + //read the following lines to record mapRS2cat + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + catc.clear(); catd.clear(); + + for (size_t i=0; i<header.coln; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) { + rs=ch_ptr; + } else if (header.chr_col!=0 && header.chr_col==i+1) { + chr=ch_ptr; + } else if (header.pos_col!=0 && header.pos_col==i+1) { + pos=ch_ptr; + } else if (header.cm_col!=0 && header.cm_col==i+1) { + cm=ch_ptr; + } else if (header.a1_col!=0 && header.a1_col==i+1) { + a1=ch_ptr; + } else if (header.a0_col!=0 && header.a0_col==i+1) { + a0=ch_ptr; + } else if (header.catc_col.size()!=0 && header.catc_col.count(i+1)!=0 ) { + catc.push_back(atof(ch_ptr)); + } else if (header.catd_col.size()!=0 && header.catd_col.count(i+1)!=0 ) { + catd.push_back(atoi(ch_ptr)); + } else {} + + ch_ptr=strtok (NULL, " , \t"); + } + + if (mapRS2catc.count(rs)==0 && kc>0) {mapRS2catc[rs]=catc;} + if (mapRS2catd.count(rs)==0 && kd>0) {mapRS2catd[rs]=catd;} + } + + //load into Ad and Ac + if (kc>0) { + Ac=gsl_matrix_alloc(vec_rs.size(), kc); + for (size_t i=0; i<vec_rs.size(); i++) { + if (mapRS2catc.count(vec_rs[i])!=0) { + for (size_t j=0; j<kc; j++) { + gsl_matrix_set(Ac, i, j, mapRS2catc[vec_rs[i]][j]); + } + } else { + for (size_t j=0; j<kc; j++) { + gsl_matrix_set(Ac, i, j, 0); + } + } + } + } + + if (kd>0) { + Ad=gsl_matrix_int_alloc(vec_rs.size(), kd); + + for (size_t i=0; i<vec_rs.size(); i++) { + if (mapRS2catd.count(vec_rs[i])!=0) { + for (size_t j=0; j<kd; j++) { + gsl_matrix_int_set(Ad, i, j, mapRS2catd[vec_rs[i]][j]); + } + } else { + for (size_t j=0; j<kd; j++) { + gsl_matrix_int_set(Ad, i, j, 0); + } + } + } + + dlevel=gsl_vector_int_alloc(kd); + map<int, int> rcd; + int val; + for (size_t j=0; j<kd; j++) { + rcd.clear(); + for (size_t i=0; i<Ad->size1; i++) { + val = gsl_matrix_int_get(Ad, i, j); + rcd[val] = 1; + } + gsl_vector_int_set (dlevel, j, rcd.size()); + } + } + + infile.clear(); + infile.close(); + + return; +} + + + + + + + + +void BSLMMDAP::WriteResult (const gsl_matrix *Hyper, const gsl_matrix *BF) +{ + string file_bf, file_hyp; + file_bf=path_out+"/"+file_out; + file_bf+=".bf.txt"; + file_hyp=path_out+"/"+file_out; + file_hyp+=".hyp.txt"; + + ofstream outfile_bf, outfile_hyp; + + outfile_bf.open (file_bf.c_str(), ofstream::out); + outfile_hyp.open (file_hyp.c_str(), ofstream::out); + + if (!outfile_bf) {cout<<"error writing file: "<<file_bf<<endl; return;} + if (!outfile_hyp) {cout<<"error writing file: "<<file_hyp<<endl; return;} + + outfile_hyp<<"h"<<"\t"<<"rho"<<"\t"<<"sa2"<<"\t"<<"sb2"<<"\t"<<"weight"<<endl; + outfile_hyp<<scientific; + for (size_t i=0; i<Hyper->size1; i++) { + for (size_t j=0; j<Hyper->size2; j++) { + outfile_hyp<<setprecision(6)<<gsl_matrix_get (Hyper, i, j)<<"\t"; + } + outfile_hyp<<endl; + } + + outfile_bf<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"; + for (size_t i=0; i<BF->size2; i++) { + outfile_bf<<"\t"<<"BF"<<i+1; + } + outfile_bf<<endl; + + size_t t=0; + for (size_t i=0; i<ns_total; ++i) { + if (indicator_snp[i]==0) {continue;} + + outfile_bf<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t" + <<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss; + + outfile_bf<<scientific; + for (size_t j=0; j<BF->size2; j++) { + outfile_bf<<"\t"<<setprecision(6)<<gsl_matrix_get (BF, t, j); + } + outfile_bf<<endl; + + t++; + } + + outfile_hyp.close(); + outfile_hyp.clear(); + outfile_bf.close(); + outfile_bf.clear(); + return; +} + + + +void BSLMMDAP::WriteResult (const vector<string> &vec_rs, const gsl_matrix *Hyper, const gsl_vector *pip, const gsl_vector *coef) +{ + string file_gamma, file_hyp, file_coef; + file_gamma=path_out+"/"+file_out; + file_gamma+=".gamma.txt"; + file_hyp=path_out+"/"+file_out; + file_hyp+=".hyp.txt"; + file_coef=path_out+"/"+file_out; + file_coef+=".coef.txt"; + + ofstream outfile_gamma, outfile_hyp, outfile_coef; + + outfile_gamma.open (file_gamma.c_str(), ofstream::out); + outfile_hyp.open (file_hyp.c_str(), ofstream::out); + outfile_coef.open (file_coef.c_str(), ofstream::out); + + if (!outfile_gamma) {cout<<"error writing file: "<<file_gamma<<endl; return;} + if (!outfile_hyp) {cout<<"error writing file: "<<file_hyp<<endl; return;} + if (!outfile_coef) {cout<<"error writing file: "<<file_coef<<endl; return;} + + outfile_hyp<<"h"<<"\t"<<"rho"<<"\t"<<"sa2"<<"\t"<<"sb2"<<"\t"<<"weight"<<endl; + outfile_hyp<<scientific; + for (size_t i=0; i<Hyper->size1; i++) { + for (size_t j=0; j<Hyper->size2; j++) { + outfile_hyp<<setprecision(6)<<gsl_matrix_get (Hyper, i, j)<<"\t"; + } + outfile_hyp<<endl; + } + + + outfile_gamma<<"rs"<<"\t"<<"gamma"<<endl; + for (size_t i=0; i<vec_rs.size(); ++i) { + outfile_gamma<<vec_rs[i]<<"\t"<<scientific<<setprecision(6)<<gsl_vector_get(pip, i)<<endl; + } + + outfile_coef<<"coef"<<endl; + outfile_coef<<scientific; + for (size_t i=0; i<coef->size; i++) { + outfile_coef<<setprecision(6)<<gsl_vector_get (coef, i)<<endl; + } + + outfile_coef.close(); + outfile_coef.clear(); + outfile_hyp.close(); + outfile_hyp.clear(); + outfile_gamma.close(); + outfile_gamma.clear(); + return; +} + + + + +/* +void BSLMMDAP::SetXgamma (gsl_matrix *Xgamma, const gsl_matrix *X, vector<size_t> &rank) +{ + size_t pos; + for (size_t i=0; i<rank.size(); ++i) { + pos=mapRank2pos[rank[i]]; + gsl_vector_view Xgamma_col=gsl_matrix_column (Xgamma, i); + gsl_vector_const_view X_col=gsl_matrix_const_column (X, pos); + gsl_vector_memcpy (&Xgamma_col.vector, &X_col.vector); + } + + return; +} +*/ + +double BSLMMDAP::CalcMarginal (const gsl_vector *Uty, const gsl_vector *K_eval, const double sigma_b2, const double tau) +{ + gsl_vector *weight_Hi=gsl_vector_alloc (Uty->size); + + double logm=0.0; + double d, uy, Hi_yy=0, logdet_H=0.0; + for (size_t i=0; i<ni_test; ++i) { + d=gsl_vector_get (K_eval, i)*sigma_b2; + d=1.0/(d+1.0); + gsl_vector_set (weight_Hi, i, d); + + logdet_H-=log(d); + uy=gsl_vector_get (Uty, i); + Hi_yy+=d*uy*uy; + } + + //calculate likelihood + logm=-0.5*logdet_H-0.5*tau*Hi_yy+0.5*log(tau)*(double)ni_test; + + gsl_vector_free (weight_Hi); + + return logm; +} + + +double BSLMMDAP::CalcMarginal (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, const double sigma_a2, const double sigma_b2, const double tau) +{ + clock_t time_start; + double logm=0.0; + double d, uy, P_yy=0, logdet_O=0.0, logdet_H=0.0; + + gsl_matrix *UtXgamma_eval=gsl_matrix_alloc (UtXgamma->size1, UtXgamma->size2); + gsl_matrix *Omega=gsl_matrix_alloc (UtXgamma->size2, UtXgamma->size2); + gsl_vector *XtHiy=gsl_vector_alloc (UtXgamma->size2); + gsl_vector *beta_hat=gsl_vector_alloc (UtXgamma->size2); + gsl_vector *weight_Hi=gsl_vector_alloc (UtXgamma->size1); + + gsl_matrix_memcpy (UtXgamma_eval, UtXgamma); + + logdet_H=0.0; P_yy=0.0; + for (size_t i=0; i<ni_test; ++i) { + gsl_vector_view UtXgamma_row=gsl_matrix_row (UtXgamma_eval, i); + d=gsl_vector_get (K_eval, i)*sigma_b2; + d=1.0/(d+1.0); + gsl_vector_set (weight_Hi, i, d); + + logdet_H-=log(d); + uy=gsl_vector_get (Uty, i); + P_yy+=d*uy*uy; + gsl_vector_scale (&UtXgamma_row.vector, d); + } + + //calculate Omega + gsl_matrix_set_identity (Omega); + + time_start=clock(); +#ifdef WITH_LAPACK + lapack_dgemm ((char *)"T", (char *)"N", sigma_a2, UtXgamma_eval, UtXgamma, 1.0, Omega); +#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, sigma_a2, UtXgamma_eval, UtXgamma, 1.0, Omega); +#endif + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //calculate beta_hat + gsl_blas_dgemv (CblasTrans, 1.0, UtXgamma_eval, Uty, 0.0, XtHiy); + + logdet_O=CholeskySolve(Omega, XtHiy, beta_hat); + + gsl_vector_scale (beta_hat, sigma_a2); + + gsl_blas_ddot (XtHiy, beta_hat, &d); + P_yy-=d; + + gsl_matrix_free (UtXgamma_eval); + gsl_matrix_free (Omega); + gsl_vector_free (XtHiy); + gsl_vector_free (beta_hat); + gsl_vector_free (weight_Hi); + + logm=-0.5*logdet_H-0.5*logdet_O-0.5*tau*P_yy+0.5*log(tau)*(double)ni_test; + + return logm; +} + + +double BSLMMDAP::CalcPrior (class HYPBSLMM &cHyp) { + double logprior=0; + logprior=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1.0-exp(cHyp.logp)); + return logprior; +} + + +//where A is the ni_test by n_cat matrix of annotations +void BSLMMDAP::DAP_CalcBF (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y) { + clock_t time_start; + + //set up BF + double tau, h, rho, sigma_a2, sigma_b2, d; + size_t ns_causal=10; + size_t n_grid=h_ngrid*rho_ngrid; + vector<double> vec_sa2, vec_sb2, logm_null; + + gsl_matrix *BF=gsl_matrix_alloc(ns_test, n_grid); + gsl_matrix *Xgamma=gsl_matrix_alloc(ni_test, 1); + gsl_matrix *Hyper=gsl_matrix_alloc(n_grid, 5); + + //compute tau by using yty + gsl_blas_ddot (Uty, Uty, &tau); + tau=(double)ni_test/tau; + + //set up grid values for sigma_a2 and sigma_b2 based on an approximately even grid for h and rho, and a fixed number of causals + size_t ij=0; + for (size_t i=0; i<h_ngrid; i++) { + h=h_min+(h_max-h_min)*(double)i/((double)h_ngrid-1); + for (size_t j=0; j<rho_ngrid; j++) { + rho=rho_min+(rho_max-rho_min)*(double)j/((double)rho_ngrid-1); + + sigma_a2=h*rho/((1-h)*(double)ns_causal); + sigma_b2=h*(1.0-rho)/(trace_G*(1-h)); + + vec_sa2.push_back(sigma_a2); + vec_sb2.push_back(sigma_b2); + logm_null.push_back(CalcMarginal (Uty, K_eval, 0.0, tau)); + + gsl_matrix_set (Hyper, ij, 0, h); + gsl_matrix_set (Hyper, ij, 1, rho); + gsl_matrix_set (Hyper, ij, 2, sigma_a2); + gsl_matrix_set (Hyper, ij, 3, sigma_b2); + gsl_matrix_set (Hyper, ij, 4, 1/(double)n_grid); + ij++; + } + } + + //compute BF factors + time_start=clock(); + cout<<"Calculating BF..."<<endl; + for (size_t t=0; t<ns_test; t++) { + gsl_vector_view Xgamma_col=gsl_matrix_column (Xgamma, 0); + gsl_vector_const_view X_col=gsl_matrix_const_column (UtX, t); + gsl_vector_memcpy (&Xgamma_col.vector, &X_col.vector); + + for (size_t ij=0; ij<n_grid; ij++) { + sigma_a2=vec_sa2[ij]; + sigma_b2=vec_sb2[ij]; + + d=CalcMarginal (Xgamma, Uty, K_eval, sigma_a2, sigma_b2, tau); + d-=logm_null[ij]; + d=exp(d); + + gsl_matrix_set(BF, t, ij, d); + } + } + time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //save results + WriteResult (Hyper, BF); + + //free matrices and vectors + gsl_matrix_free(BF); + gsl_matrix_free(Xgamma); + gsl_matrix_free(Hyper); + return; +} + + + + + +void single_ct_regression(const gsl_matrix_int *Xd, const gsl_vector_int *dlevel, const gsl_vector *pip_vec, gsl_vector *coef, gsl_vector *prior_vec) { + + map<int,double> sum_pip; + map<int,double> sum; + + int levels = gsl_vector_int_get(dlevel,0); + + for(int i=0;i<levels;i++){ + sum_pip[i] = sum[i] = 0; + } + + for(int i=0;i<Xd->size1;i++){ + int cat = gsl_matrix_int_get(Xd,i,0); + sum_pip[cat] += gsl_vector_get(pip_vec,i); + sum[cat] += 1; + } + + for(int i=0;i<Xd->size1;i++){ + int cat = gsl_matrix_int_get(Xd,i,0); + gsl_vector_set(prior_vec,i,sum_pip[cat]/sum[cat]); + } + + //double baseline=0; + for(int i=0;i<levels;i++){ + double new_prior = sum_pip[i]/sum[i]; + //gsl_vector_set(coef, i, log(new_prior/(1-new_prior))-baseline); + //if(i==0){ + //baseline = log(new_prior/(1-new_prior)); + //} + gsl_vector_set(coef, i, log(new_prior/(1-new_prior)) ); + } + + return; +} + + + + +//where A is the ni_test by n_cat matrix of annotations +void BSLMMDAP::DAP_EstimateHyper (const size_t kc, const size_t kd, const vector<string> &vec_rs, const vector<double> &vec_sa2, const vector<double> &vec_sb2, const vector<double> &wab, const vector<vector<vector<double> > > &BF, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel) { + clock_t time_start; + + //set up BF + double h, rho, sigma_a2, sigma_b2, d, s, logm, logm_save; + size_t t1, t2; + size_t n_grid=wab.size(), ns_test=vec_rs.size(); + + gsl_vector *prior_vec=gsl_vector_alloc(ns_test); + gsl_matrix *Hyper=gsl_matrix_alloc(n_grid, 5); + gsl_vector *pip=gsl_vector_alloc(ns_test); + gsl_vector *coef=gsl_vector_alloc(kc+kd+1); + + //perform the EM algorithm + vector<double> vec_wab, vec_wab_new; + + //initial values + for (size_t t=0; t<ns_test; t++) { + gsl_vector_set (prior_vec, t, (double)BF.size()/(double)ns_test); + } + for (size_t ij=0; ij<n_grid; ij++) { + vec_wab.push_back(wab[ij]); + vec_wab_new.push_back(wab[ij]); + } + + //EM iteration + size_t it=0; + double dif=1; + while (it<100 && dif>1e-3) { + //update E_gamma + t1=0, t2=0; + for (size_t b=0; b<BF.size(); b++) { + s=1; + for (size_t m=0; m<BF[b].size(); m++) { + d=0; + for (size_t ij=0; ij<n_grid; ij++) { + d+=vec_wab_new[ij]*BF[b][m][ij]; + } + d*=gsl_vector_get(prior_vec, t1)/(1-gsl_vector_get(prior_vec, t1)); + + gsl_vector_set(pip, t1, d); + s+=d; + t1++; + } + + for (size_t m=0; m<BF[b].size(); m++) { + d=gsl_vector_get(pip, t2)/s; + gsl_vector_set(pip, t2, d); + t2++; + } + } + + //update E_wab + s=0; + for (size_t ij=0; ij<n_grid; ij++) { + vec_wab_new[ij]=0; + + t1=0; + for (size_t b=0; b<BF.size(); b++) { + d=1; + for (size_t m=0; m<BF[b].size(); m++) { + d+=gsl_vector_get(prior_vec, t1)/(1-gsl_vector_get(prior_vec, t1))*vec_wab[ij]*BF[b][m][ij]; + t1++; + } + vec_wab_new[ij]+=log(d); + } + + s=max(s, vec_wab_new[ij]); + } + + d=0; + for (size_t ij=0; ij<n_grid; ij++) { + vec_wab_new[ij]=exp(vec_wab_new[ij]-s); + d+=vec_wab_new[ij]; + } + + for (size_t ij=0; ij<n_grid; ij++) { + vec_wab_new[ij]/=d; + // vec_wab[ij]=vec_wab_new[ij]; + } + + //update coef, and pi + if(kc==0 && kd==0){//no annotation + s=0; + for (size_t t=0; t<pip->size; t++) { + s+=gsl_vector_get(pip, t); + } + s=s/(double)pip->size; + for (size_t t=0; t<pip->size; t++) { + gsl_vector_set(prior_vec, t, s); + } + + gsl_vector_set (coef, 0, log(s/(1-s))); + } else if(kc==0 && kd!=0){//only discrete annotations + if(kd == 1){ + single_ct_regression(Ad, dlevel, pip, coef, prior_vec); + }else{ + logistic_cat_fit(coef, Ad, dlevel, pip, 0, 0); + logistic_cat_pred(coef, Ad, dlevel, prior_vec); + } + } else if (kc!=0 && kd==0) {//only continuous annotations + logistic_cont_fit(coef, Ac, pip, 0, 0); + logistic_cont_pred(coef, Ac, prior_vec); + } else if (kc!=0 && kd!=0) {//both continuous and categorical annotations + logistic_mixed_fit(coef, Ad, dlevel, Ac, pip, 0, 0); + logistic_mixed_pred(coef, Ad, dlevel, Ac, prior_vec); + } + + //compute marginal likelihood + logm=0; + + t1=0; + for (size_t b=0; b<BF.size(); b++) { + d=1; s=0; + for (size_t m=0; m<BF[b].size(); m++) { + s+=log(1-gsl_vector_get(prior_vec, t1)); + for (size_t ij=0; ij<n_grid; ij++) { + d+=gsl_vector_get(prior_vec, t1)/(1-gsl_vector_get(prior_vec, t1))*vec_wab[ij]*BF[b][m][ij]; + } + } + logm+=log(d)+s; + t1++; + } + + if (it>0) { + dif=logm-logm_save; + } + logm_save=logm; + it++; + + cout<<"iteration = "<<it<<"; marginal likelihood = "<<logm<<endl; + } + + //update h and rho that correspond to w_ab + for (size_t ij=0; ij<n_grid; ij++) { + sigma_a2=vec_sa2[ij]; + sigma_b2=vec_sb2[ij]; + + d=exp(gsl_vector_get(coef, coef->size-1))/(1+exp(gsl_vector_get(coef, coef->size-1))); + h=(d*(double)ns_test*sigma_a2+1*sigma_b2)/(1+d*(double)ns_test*sigma_a2+1*sigma_b2); + rho=d*(double)ns_test*sigma_a2/(d*(double)ns_test*sigma_a2+1*sigma_b2); + + gsl_matrix_set (Hyper, ij, 0, h); + gsl_matrix_set (Hyper, ij, 1, rho); + gsl_matrix_set (Hyper, ij, 2, sigma_a2); + gsl_matrix_set (Hyper, ij, 3, sigma_b2); + gsl_matrix_set (Hyper, ij, 4, vec_wab_new[ij]); + } + + //obtain beta and alpha parameters + + + //save results + WriteResult (vec_rs, Hyper, pip, coef); + + //free matrices and vectors + gsl_vector_free(prior_vec); + gsl_matrix_free(Hyper); + gsl_vector_free(pip); + gsl_vector_free(coef); + return; +} + +/* +//readin the estimated hyper-parameters and perform fine mapping for each region +void BSLMM::DAP_FineMapping (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_matrix *A, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y, gsl_matrix *Hyper, gsl_vector *alpha, gsl_vector *pip) { + clock_t time_start; + + //two priority sets: S_1 contains all candidate causal SNPs; S_2 contains the prioritized combintion of them + //two marginal probability sets: P_1 contains marginals for S_1; P_2 contains marginals for S_2; + set<size_t> S1set, S2set; + vector<size_t> S1vec; + vector<set<size_t> > S2vec; + vector<double> P1, P2; + + //calculate P0 (null) and P1 (for every SNP) + + + + //loop through the number of combinations + for (size_t s=0; s<p; s++) { + //if (s==0), set up S_1: compute marginal of the null model, then compute P_1, then compute BF_1 and use them to select S_1; compute C_1 + + + + //if (s==1), set up S_2: compute pair-wise P_2 and use them to select S_2; compute C_2 + + //otherwise, match each combination of S_2 with each SNP from S_1, select into S_3; and replace S_2 with S_3; compute C_s + + + //stop when the stopping critieria are reached (if S_2 is empty; if t; if kappa); add the residual component R + + for (size_t t=0; t<total_step; ++t) { + if (t%d_pace==0 || t==total_step-1) {ProgressBar ("Running MCMC ", t, total_step-1, (double)n_accept/(double)(t*n_mh+1));} +// if (t>10) {break;} + + if (a_mode==13) { + SampleZ (y, z_hat, z); + mean_z=CenterVector (z); + + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, z, 0.0, Utz); + time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //First proposal + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old); + beta_old.clear(); + for (size_t i=0; i<cHyp_old.n_gamma; ++i) { + beta_old.push_back(0); + } + } + else { + gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp_old.n_gamma); + gsl_vector *beta=gsl_vector_alloc (cHyp_old.n_gamma); + SetXgamma (UtXgamma, UtX, rank_old); + logPost_old=CalcPosterior(UtXgamma, Utz, K_eval, UtXb_old, Utu_old, alpha_old, beta, cHyp_old); + + beta_old.clear(); + for (size_t i=0; i<beta->size; ++i) { + beta_old.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + } + + + delete [] p_gamma; + beta_g.clear(); + + return; +} + +*/ + + + + + + +/* +//below fits MCMC for rho=1 +void BSLMM::CalcXtX (const gsl_matrix *X, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX, gsl_vector *Xty) +{ + time_t time_start=clock(); + gsl_matrix_const_view X_sub=gsl_matrix_const_submatrix(X, 0, 0, X->size1, s_size); + gsl_matrix_view XtX_sub=gsl_matrix_submatrix(XtX, 0, 0, s_size, s_size); + gsl_vector_view Xty_sub=gsl_vector_subvector(Xty, 0, s_size); + +#ifdef WITH_LAPACK + lapack_dgemm ((char *)"T", (char *)"N", 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#endif + gsl_blas_dgemv(CblasTrans, 1.0, &X_sub.matrix, y, 0.0, &Xty_sub.vector); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + return; +} + + + +double BSLMM::CalcPosterior (const double yty, class HYPBSLMM &cHyp) +{ + double logpost=0.0; + + //for quantitative traits, calculate pve and pge + //pve and pge for case/control data are calculted in CalcCC_PVEnZ + if (a_mode==11) { + cHyp.pve=0.0; + cHyp.pge=1.0; + } + + //calculate likelihood + if (a_mode==11) {logpost-=0.5*(double)ni_test*log(yty);} + else {logpost-=0.5*yty;} + + logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1-exp(cHyp.logp)); + + return logpost; +} + + +double BSLMM::CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp) +{ + double sigma_a2=cHyp.h/( (1-cHyp.h)*exp(cHyp.logp)*(double)ns_test); + double logpost=0.0; + double d, P_yy=yty, logdet_O=0.0; + + gsl_matrix_const_view Xgamma_sub=gsl_matrix_const_submatrix (Xgamma, 0, 0, Xgamma->size1, s_size); + gsl_matrix_const_view XtX_sub=gsl_matrix_const_submatrix (XtX, 0, 0, s_size, s_size); + gsl_vector_const_view Xty_sub=gsl_vector_const_subvector (Xty, 0, s_size); + + gsl_matrix *Omega=gsl_matrix_alloc (s_size, s_size); + gsl_matrix *M_temp=gsl_matrix_alloc (s_size, s_size); + gsl_vector *beta_hat=gsl_vector_alloc (s_size); + gsl_vector *Xty_temp=gsl_vector_alloc (s_size); + + gsl_vector_memcpy (Xty_temp, &Xty_sub.vector); + + //calculate Omega + gsl_matrix_memcpy (Omega, &XtX_sub.matrix); + gsl_matrix_scale (Omega, sigma_a2); + gsl_matrix_set_identity (M_temp); + gsl_matrix_add (Omega, M_temp); + + //calculate beta_hat + logdet_O=CholeskySolve(Omega, Xty_temp, beta_hat); + gsl_vector_scale (beta_hat, sigma_a2); + + gsl_blas_ddot (Xty_temp, beta_hat, &d); + P_yy-=d; + + //sample tau + double tau=1.0; + if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0, 2.0/P_yy); } + + //sample beta + for (size_t i=0; i<s_size; i++) + { + d=gsl_ran_gaussian(gsl_r, 1); + gsl_vector_set(beta, i, d); + } + gsl_vector_view beta_sub=gsl_vector_subvector(beta, 0, s_size); + gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, &beta_sub.vector); + + //it compuates inv(L^T(Omega)) %*% beta; + gsl_vector_scale(&beta_sub.vector, sqrt(sigma_a2/tau)); + gsl_vector_add(&beta_sub.vector, beta_hat); + gsl_blas_dgemv (CblasNoTrans, 1.0, &Xgamma_sub.matrix, &beta_sub.vector, 0.0, Xb); + + //for quantitative traits, calculate pve and pge + if (a_mode==11) { + gsl_blas_ddot (Xb, Xb, &d); + cHyp.pve=d/(double)ni_test; + cHyp.pve/=cHyp.pve+1.0/tau; + cHyp.pge=1.0; + } + + logpost=-0.5*logdet_O; + if (a_mode==11) {logpost-=0.5*(double)ni_test*log(P_yy);} + else {logpost-=0.5*P_yy;} + + logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1.0-exp(cHyp.logp)); + + gsl_matrix_free (Omega); + gsl_matrix_free (M_temp); + gsl_vector_free (beta_hat); + gsl_vector_free (Xty_temp); + + return logpost; +} +*/ + + diff --git a/src/bslmmdap.h b/src/bslmmdap.h new file mode 100644 index 0000000..ac78f97 --- /dev/null +++ b/src/bslmmdap.h @@ -0,0 +1,101 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#ifndef __BSLMMDAP_H__ +#define __BSLMMDAP_H__ + +#include <vector> +#include <map> +#include <gsl/gsl_rng.h> +#include <gsl/gsl_randist.h> + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + + +using namespace std; + + + + + + +class BSLMMDAP { + +public: + // IO related parameters + int a_mode; + size_t d_pace; + + string file_bfile; + string file_geno; + string file_out; + string path_out; + + // LMM related parameters + double pve_null; + double pheno_mean; + + // BSLMM MCMC related parameters + long int randseed; + double trace_G; + + HYPBSLMM cHyp_initial; + + // Summary statistics + size_t ni_total, ns_total; //number of total individuals and snps + size_t ni_test, ns_test; //number of individuals and snps used for analysis + + double h_min, h_max, rho_min, rho_max; + size_t h_ngrid, rho_ngrid; + + double time_UtZ; + double time_Omega; //time spent on optimization iterations + double time_Proposal; //time spent on constructing the proposal distribution for gamma (i.e. lmm or lm analysis) + vector<int> indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector<int> indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector<SNPINFO> snpInfo; //record SNP information + + // Main Functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + + void WriteResult (const gsl_matrix *Hyper, const gsl_matrix *BF); + void WriteResult (const vector<string> &vec_rs, const gsl_matrix *Hyper, const gsl_vector *pip, const gsl_vector *coef); + double CalcMarginal (const gsl_vector *Uty, const gsl_vector *K_eval, const double sigma_b2, const double tau); + double CalcMarginal (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, const double sigma_a2, const double sigma_b2, const double tau); + double CalcPrior (class HYPBSLMM &cHyp); + + void DAP_CalcBF (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y); + void DAP_EstimateHyper (const size_t kc, const size_t kd, const vector<string> &vec_rs, const vector<double> &vec_sa2, const vector<double> &vec_sb2, const vector<double> &wab, const vector<vector<vector<double> > > &BF, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel); + +}; + +void ReadFile_hyb (const string &file_hyp, vector<double> &vec_sa2, vector<double> &vec_sb2, vector<double> &vec_wab); +void ReadFile_bf (const string &file_bf, vector<string> &vec_rs, vector<vector<vector<double> > > &BF); +void ReadFile_cat (const string &file_cat, const vector<string> &vec_rs, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel, size_t &kc, size_t &kd); + + +#endif + + diff --git a/src/ldr.cpp b/src/ldr.cpp new file mode 100644 index 0000000..e28490a --- /dev/null +++ b/src/ldr.cpp @@ -0,0 +1,285 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <iostream> +#include <fstream> +#include <sstream> + +#include <iomanip> +#include <cmath> +#include <iostream> +#include <stdio.h> +#include <stdlib.h> +#include <ctime> +#include <cstring> +#include <algorithm> + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_randist.h" +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" +#include "Eigen/Dense" + + + +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "ldr_float.h" +#include "lm_float.h" +#include "mathfunc_float.h" //for function CenterVector +#else +#include "param.h" +#include "ldr.h" +#include "lm.h" +#include "mathfunc.h" +#endif + +using namespace std; +using namespace Eigen; + + + +void LDR::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void LDR::CopyToParam (PARAM &cPar) +{ + //cPar.pheno_mean=pheno_mean; + //cPar.randseed=randseed; + + return; +} + + +/* +void BSLMM::WriteBV (const gsl_vector *bv) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".bv.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} + + size_t t=0; + for (size_t i=0; i<ni_total; ++i) { + if (indicator_idv[i]==0) { + outfile<<"NA"<<endl; + } + else { + outfile<<scientific<<setprecision(6)<<gsl_vector_get(bv, t)<<endl; + t++; + } + } + + outfile.clear(); + outfile.close(); + return; +} + + + + +void BSLMM::WriteParam (vector<pair<double, double> > &beta_g, const gsl_vector *alpha, const size_t w) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".param.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} + + outfile<<"chr"<<"\t"<<"rs"<<"\t" + <<"ps"<<"\t"<<"n_miss"<<"\t"<<"alpha"<<"\t" + <<"beta"<<"\t"<<"gamma"<<endl; + + size_t t=0; + for (size_t i=0; i<ns_total; ++i) { + if (indicator_snp[i]==0) {continue;} + + outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t" + <<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"; + + outfile<<scientific<<setprecision(6)<<gsl_vector_get(alpha, t)<<"\t"; + if (beta_g[t].second!=0) { + outfile<<beta_g[t].first/beta_g[t].second<<"\t"<<beta_g[t].second/(double)w<<endl; + } + else { + outfile<<0.0<<"\t"<<0.0<<endl; + } + t++; + } + + outfile.clear(); + outfile.close(); + return; +} + + +void BSLMM::WriteParam (const gsl_vector *alpha) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".param.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} + + outfile<<"chr"<<"\t"<<"rs"<<"\t" + <<"ps"<<"\t"<<"n_miss"<<"\t"<<"alpha"<<"\t" + <<"beta"<<"\t"<<"gamma"<<endl; + + size_t t=0; + for (size_t i=0; i<ns_total; ++i) { + if (indicator_snp[i]==0) {continue;} + + outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t" + <<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"; + outfile<<scientific<<setprecision(6)<<gsl_vector_get(alpha, t)<<"\t"; + outfile<<0.0<<"\t"<<0.0<<endl; + t++; + } + + outfile.clear(); + outfile.close(); + return; +} + + +void BSLMM::WriteResult (const int flag, const gsl_matrix *Result_hyp, const gsl_matrix *Result_gamma, const size_t w_col) +{ + string file_gamma, file_hyp; + file_gamma=path_out+"/"+file_out; + file_gamma+=".gamma.txt"; + file_hyp=path_out+"/"+file_out; + file_hyp+=".hyp.txt"; + + ofstream outfile_gamma, outfile_hyp; + + if (flag==0) { + outfile_gamma.open (file_gamma.c_str(), ofstream::out); + outfile_hyp.open (file_hyp.c_str(), ofstream::out); + if (!outfile_gamma) {cout<<"error writing file: "<<file_gamma<<endl; return;} + if (!outfile_hyp) {cout<<"error writing file: "<<file_hyp<<endl; return;} + + outfile_hyp<<"h \t pve \t rho \t pge \t pi \t n_gamma"<<endl; + + for (size_t i=0; i<s_max; ++i) { + outfile_gamma<<"s"<<i<<"\t"; + } + outfile_gamma<<endl; + } + else { + outfile_gamma.open (file_gamma.c_str(), ofstream::app); + outfile_hyp.open (file_hyp.c_str(), ofstream::app); + if (!outfile_gamma) {cout<<"error writing file: "<<file_gamma<<endl; return;} + if (!outfile_hyp) {cout<<"error writing file: "<<file_hyp<<endl; return;} + + size_t w; + if (w_col==0) {w=w_pace;} + else {w=w_col;} + + for (size_t i=0; i<w; ++i) { + outfile_hyp<<scientific; + for (size_t j=0; j<4; ++j) { + outfile_hyp<<setprecision(6)<<gsl_matrix_get (Result_hyp, i, j)<<"\t"; + } + outfile_hyp<<setprecision(6)<<exp(gsl_matrix_get (Result_hyp, i, 4))<<"\t"; + outfile_hyp<<(int)gsl_matrix_get (Result_hyp, i, 5)<<"\t"; + outfile_hyp<<endl; + } + + for (size_t i=0; i<w; ++i) { + for (size_t j=0; j<s_max; ++j) { + outfile_gamma<<(int)gsl_matrix_get (Result_gamma, i, j)<<"\t"; + } + outfile_gamma<<endl; + } + + } + + outfile_hyp.close(); + outfile_hyp.clear(); + outfile_gamma.close(); + outfile_gamma.clear(); + return; +} + +*/ + +//X is a p by n matrix +void LDR::VB (const vector<vector<unsigned char> > &Xt, const gsl_matrix *W_gsl, const gsl_vector *y_gsl) +{ + //save gsl_vector and gsl_matrix into eigen library formats + MatrixXd W(W_gsl->size1, W_gsl->size2); + VectorXd y(y_gsl->size); + VectorXd x_col(y_gsl->size); + + double d; + for (size_t i=0; i<W_gsl->size1; i++) { + d=gsl_vector_get(y_gsl, i); + y(i)=d; + for (size_t j=0; j<W_gsl->size2; j++) { + W(i,j)=gsl_matrix_get(W_gsl, i, j); + } + } + + //initial VB values by lm + cout<<indicator_snp[0]<<" "<<indicator_snp[1]<<" "<<indicator_snp[2]<<endl; + uchar_matrix_get_row (Xt, 0, x_col); + + for (size_t j=0; j<10; j++) { + cout<<x_col(j)<<endl; + } + + + //run VB iterations + + + + //save results + + return; +} diff --git a/src/ldr.h b/src/ldr.h new file mode 100644 index 0000000..d81048c --- /dev/null +++ b/src/ldr.h @@ -0,0 +1,77 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#ifndef __LDR_H__ +#define __LDR_H__ + +#include <vector> +#include <map> +#include <gsl/gsl_rng.h> +#include <gsl/gsl_randist.h> + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + + +using namespace std; + + + + + + +class LDR { + +public: + // IO related parameters + int a_mode; + size_t d_pace; + + string file_bfile; + string file_geno; + string file_out; + string path_out; + + // Summary statistics + size_t ni_total, ns_total; //number of total individuals and snps + size_t ni_test, ns_test; //number of individuals and snps used for analysis + size_t n_cvt; //number of covariates + vector<int> indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector<int> indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector<SNPINFO> snpInfo; //record SNP information + + // Not included in PARAM + gsl_rng *gsl_r; + + // Main Functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + + void VB(const vector<vector<unsigned char> > &Xt, const gsl_matrix *W_gsl, const gsl_vector *y_gsl); +}; + + + +#endif + + diff --git a/src/logistic.cpp b/src/logistic.cpp new file mode 100644 index 0000000..1b47946 --- /dev/null +++ b/src/logistic.cpp @@ -0,0 +1,783 @@ +#include <stdio.h>
+#include <math.h>
+#include <gsl/gsl_matrix.h>
+#include <gsl/gsl_rng.h>
+#include <gsl/gsl_multimin.h>
+#include <gsl/gsl_sf.h>
+#include <gsl/gsl_linalg.h>
+#include "logistic.h"
+
+// I need to bundle all the data that goes to the function to optimze together.
+typedef struct{
+ gsl_matrix_int *X;
+ gsl_vector_int *nlev;
+ gsl_vector *y;
+ gsl_matrix *Xc; // continuous covariates Matrix Nobs x Kc (NULL if not used)
+ double lambdaL1;
+ double lambdaL2;
+}fix_parm_mixed_T;
+
+
+
+
+
+
+double fLogit_mixed(gsl_vector *beta
+ ,gsl_matrix_int *X
+ ,gsl_vector_int *nlev
+ ,gsl_matrix *Xc
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2)
+{
+ int n = y->size;
+ // int k = X->size2;
+ int npar = beta->size;
+ double total = 0;
+ double aux = 0;
+ /* omp_set_num_threads(ompthr); */
+ /* /\* Changed loop start at 1 instead of 0 to avoid regularization of beta_0*\/ */
+ /* /\*#pragma omp parallel for reduction (+:total)*\/ */
+ for(int i = 1; i < npar; ++i)
+ total += beta->data[i]*beta->data[i];
+ total = (-total*lambdaL2/2);
+ /* /\*#pragma omp parallel for reduction (+:aux)*\/ */
+ for(int i = 1; i < npar; ++i)
+ aux += (beta->data[i]>0 ? beta->data[i] : -beta->data[i]);
+ total = total-aux*lambdaL1;
+ /* #pragma omp parallel for schedule(static) shared(n,beta,X,nlev,y) reduction (+:total) */
+ for(int i = 0; i < n; ++i) {
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < X->size2; ++k) {
+ if(gsl_matrix_int_get(X,i,k)>0)
+ Xbetai+=beta->data[gsl_matrix_int_get(X,i,k)-1+iParm];
+ iParm+=nlev->data[k]-1;
+ }
+ for(int k = 0; k < (Xc->size2); ++k)
+ Xbetai+= gsl_matrix_get(Xc,i,k)*beta->data[iParm++];
+ total += y->data[i]*Xbetai-gsl_sf_log_1plusx(gsl_sf_exp(Xbetai));
+ }
+ return -total;
+}
+
+
+void logistic_mixed_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1)
+ ,gsl_matrix_int *X //Matrix Nobs x K
+ ,gsl_vector_int *nlev // Vector with number categories
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc (NULL if not used)
+ ,gsl_vector *yhat //Vector of prob. predicted by the logistic
+ )
+{
+ for(int i = 0; i < X->size1; ++i) {
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < X->size2; ++k) {
+ if(gsl_matrix_int_get(X,i,k)>0)
+ Xbetai+=beta->data[gsl_matrix_int_get(X,i,k)-1+iParm];
+ iParm+=nlev->data[k]-1;
+ }
+ // Adding the continuous
+ for(int k = 0; k < (Xc->size2); ++k)
+ Xbetai+= gsl_matrix_get(Xc,i,k)*beta->data[iParm++];
+ yhat->data[i]=1/(1 + gsl_sf_exp(-Xbetai));
+ }
+}
+
+
+/* The gradient of f, df = (df/dx, df/dy). */
+void
+wgsl_mixed_optim_df (const gsl_vector *beta, void *params,
+ gsl_vector *out)
+{
+ fix_parm_mixed_T *p = (fix_parm_mixed_T *)params;
+ int n = p->y->size;
+ int K = p->X->size2;
+ int Kc = p->Xc->size2;
+ int npar = beta->size;
+ // Intitialize gradient out necessary?
+ for(int i = 0; i < npar; ++i)
+ out->data[i]= 0;
+ /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0 */
+ for(int i = 1; i < npar; ++i)
+ out->data[i]= p->lambdaL2*beta->data[i];
+ for(int i = 1; i < npar; ++i)
+ out->data[i]+= p->lambdaL1*((beta->data[i]>0)-(beta->data[i]<0));
+
+ for(int i = 0; i < n; ++i) {
+ double pn=0;
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < K; ++k) {
+ if(gsl_matrix_int_get(p->X,i,k)>0)
+ Xbetai+=beta->data[gsl_matrix_int_get(p->X,i,k)-1+iParm];
+ iParm+=p->nlev->data[k]-1;
+ }
+ // Adding the continuous
+ for(int k = 0; k < Kc; ++k)
+ Xbetai+= gsl_matrix_get(p->Xc,i,k)*beta->data[iParm++];
+
+ pn= -( p->y->data[i] - 1/(1 + gsl_sf_exp(-Xbetai)) );
+
+ out->data[0]+= pn;
+ iParm=1;
+ for(int k = 0; k < K; ++k) {
+ if(gsl_matrix_int_get(p->X,i,k)>0)
+ out->data[gsl_matrix_int_get(p->X,i,k)-1+iParm]+=pn;
+ iParm+=p->nlev->data[k]-1;
+ }
+ // Adding the continuous
+ for(int k = 0; k < Kc; ++k) {
+ out->data[iParm++] += gsl_matrix_get(p->Xc,i,k)*pn;
+ }
+ }
+
+}
+
+
+/* The Hessian of f */
+void
+wgsl_mixed_optim_hessian (const gsl_vector *beta, void *params,
+ gsl_matrix *out)
+{
+ fix_parm_mixed_T *p = (fix_parm_mixed_T *)params;
+ int n = p->y->size;
+ int K = p->X->size2;
+ int Kc = p->Xc->size2;
+ int npar = beta->size;
+ gsl_vector *gn = gsl_vector_alloc(npar); /* gn */
+ // Intitialize Hessian out necessary ???
+ gsl_matrix_set_zero(out);
+ /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0*/
+ for(int i = 1; i < npar; ++i)
+ gsl_matrix_set(out,i,i,(p->lambdaL2)); // Double check this
+ // L1 penalty not working yet, as not differentiable, I may need to do coordinate descent (as in glm_net)
+
+ for(int i = 0; i < n; ++i) {
+ double pn=0;
+ double aux=0;
+ double Xbetai=beta->data[0];
+ int iParm1=1;
+ for(int k = 0; k < K; ++k) {
+ if(gsl_matrix_int_get(p->X,i,k)>0)
+ Xbetai+=beta->data[gsl_matrix_int_get(p->X,i,k)-1+iParm1];
+ iParm1+=p->nlev->data[k]-1; //-1?
+ }
+ // Adding the continuous
+ for(int k = 0; k < Kc; ++k)
+ Xbetai+= gsl_matrix_get(p->Xc,i,k)*beta->data[iParm1++];
+
+ pn= 1/(1 + gsl_sf_exp(-Xbetai));
+ // Add a protection for pn very close to 0 or 1?
+ aux=pn*(1-pn);
+
+ // Calculate sub-gradient vector gn
+ gsl_vector_set_zero(gn);
+ gn->data[0]= 1;
+ iParm1=1;
+ for(int k = 0; k < K; ++k) {
+ if(gsl_matrix_int_get(p->X,i,k)>0)
+ gn->data[gsl_matrix_int_get(p->X,i,k)-1+iParm1]=1;
+ iParm1+=p->nlev->data[k]-1;
+ }
+ // Adding the continuous
+ for(int k = 0; k < Kc; ++k) {
+ gn->data[iParm1++] = gsl_matrix_get(p->Xc,i,k);
+ }
+
+ for(int k1=0;k1<npar; ++k1)
+ if(gn->data[k1]!=0)
+ for(int k2=0;k2<npar; ++k2)
+ if(gn->data[k2]!=0)
+ *gsl_matrix_ptr(out,k1,k2) += (aux * gn->data[k1] * gn->data[k2]);
+ }
+ gsl_vector_free(gn);
+}
+
+
+double wgsl_mixed_optim_f(gsl_vector *v, void *params)
+{
+ double mLogLik=0;
+ fix_parm_mixed_T *p = (fix_parm_mixed_T *)params;
+ mLogLik = fLogit_mixed(v,p->X,p->nlev,p->Xc,p->y,p->lambdaL1,p->lambdaL2);
+ return mLogLik;
+}
+
+
+/* Compute both f and df together. */
+void
+wgsl_mixed_optim_fdf (gsl_vector *x, void *params,
+ double *f, gsl_vector *df)
+{
+ *f = wgsl_mixed_optim_f(x, params);
+ wgsl_mixed_optim_df(x, params, df);
+}
+
+
+int logistic_mixed_fit(gsl_vector *beta
+ ,gsl_matrix_int *X
+ ,gsl_vector_int *nlev
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc (NULL if not used)
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2)
+{
+
+ double mLogLik=0;
+ fix_parm_mixed_T p;
+ int npar = beta->size;
+ int iter=0;
+ double maxchange=0;
+
+ //Intializing fix parameters
+ p.X=X;
+ p.Xc=Xc;
+ p.nlev=nlev;
+ p.y=y;
+ p.lambdaL1=lambdaL1;
+ p.lambdaL2=lambdaL2;
+
+ //Initial fit
+ //#ifdef _RPR_DEBUG_
+ mLogLik = wgsl_mixed_optim_f(beta,&p);
+ //fprintf(stderr,"#Initial -log(Lik(0))=%lf\n",mLogLik);
+ //#endif //_RPR_DEBUG
+
+ gsl_matrix *myH = gsl_matrix_alloc(npar,npar); /* Hessian matrix*/
+ gsl_vector *stBeta = gsl_vector_alloc(npar); /* Direction to move */
+
+ gsl_vector *myG = gsl_vector_alloc(npar); /* Gradient*/
+ gsl_vector *tau = gsl_vector_alloc(npar); /* tau for QR*/
+
+ for(iter=0;iter<100;iter++){
+ wgsl_mixed_optim_hessian(beta,&p,myH); //Calculate Hessian
+ wgsl_mixed_optim_df(beta,&p,myG); //Calculate Gradient
+ gsl_linalg_QR_decomp(myH,tau); //Calculate next beta
+ gsl_linalg_QR_solve(myH,tau,myG,stBeta);
+ gsl_vector_sub(beta,stBeta);
+
+ //Monitor convergence
+ maxchange=0;
+ for(int i=0;i<npar; i++)
+ if(maxchange<fabs(stBeta->data[i]))
+ maxchange=fabs(stBeta->data[i]);
+
+#ifdef _RPR_DEBUG_
+ //mLogLik = wgsl_mixed_optim_f(beta,&p);
+ //fprintf(stderr,"#iter %d, -log(Lik(0))=%lf,%lf\n",(int)iter,mLogLik,maxchange);
+#endif //_RPR_DEBUG
+
+ if(maxchange<1E-4)
+ break;
+ }
+
+#ifdef _RPR_DEBUG_
+ //for (int i = 0; i < npar; i++)
+ // fprintf(stderr,"#par_%d= %lf\n",i,beta->data[i]);
+#endif //_RPR_DEBUG
+
+ //Final fit
+ mLogLik = wgsl_mixed_optim_f(beta,&p);
+ //fprintf(stderr,"#Final %d) -log(Lik(0))=%lf, maxchange %lf\n",iter,mLogLik,maxchange);
+
+ gsl_vector_free (tau);
+ gsl_vector_free (stBeta);
+ gsl_vector_free (myG);
+ gsl_matrix_free (myH);
+
+ return 0;
+}
+
+/***************/
+/* Categorical */
+/***************/
+
+// I need to bundle all the data that goes to the function to optimze together.
+typedef struct{
+ gsl_matrix_int *X;
+ gsl_vector_int *nlev;
+ gsl_vector *y;
+ double lambdaL1;
+ double lambdaL2;
+}fix_parm_cat_T;
+
+
+double fLogit_cat(gsl_vector *beta
+ ,gsl_matrix_int *X
+ ,gsl_vector_int *nlev
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2)
+{
+ int n = y->size;
+ // int k = X->size2;
+ int npar = beta->size;
+ double total = 0;
+ double aux = 0;
+ /* omp_set_num_threads(ompthr); */
+ /* /\* Changed loop start at 1 instead of 0 to avoid regularization of beta 0*\/ */
+ /* /\*#pragma omp parallel for reduction (+:total)*\/ */
+ for(int i = 1; i < npar; ++i)
+ total += beta->data[i]*beta->data[i];
+ total = (-total*lambdaL2/2);
+ /* /\*#pragma omp parallel for reduction (+:aux)*\/ */
+ for(int i = 1; i < npar; ++i)
+ aux += (beta->data[i]>0 ? beta->data[i] : -beta->data[i]);
+ total = total-aux*lambdaL1;
+ /* #pragma omp parallel for schedule(static) shared(n,beta,X,nlev,y) reduction (+:total) */
+ for(int i = 0; i < n; ++i) {
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < X->size2; ++k) {
+ if(gsl_matrix_int_get(X,i,k)>0)
+ Xbetai+=beta->data[gsl_matrix_int_get(X,i,k)-1+iParm];
+ iParm+=nlev->data[k]-1;
+ }
+ total += y->data[i]*Xbetai-gsl_sf_log_1plusx(gsl_sf_exp(Xbetai));
+ }
+ return -total;
+}
+
+
+void logistic_cat_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1)
+ ,gsl_matrix_int *X //Matrix Nobs x K
+ ,gsl_vector_int *nlev // Vector with number categories
+ ,gsl_vector *yhat //Vector of prob. predicted by the logistic
+ )
+{
+ for(int i = 0; i < X->size1; ++i) {
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < X->size2; ++k) {
+ if(gsl_matrix_int_get(X,i,k)>0)
+ Xbetai+=beta->data[gsl_matrix_int_get(X,i,k)-1+iParm];
+ iParm+=nlev->data[k]-1;
+ }
+ yhat->data[i]=1/(1 + gsl_sf_exp(-Xbetai));
+ }
+}
+
+
+/* The gradient of f, df = (df/dx, df/dy). */
+void
+wgsl_cat_optim_df (const gsl_vector *beta, void *params,
+ gsl_vector *out)
+{
+ fix_parm_cat_T *p = (fix_parm_cat_T *)params;
+ int n = p->y->size;
+ int K = p->X->size2;
+ int npar = beta->size;
+ // Intitialize gradient out necessary?
+ for(int i = 0; i < npar; ++i)
+ out->data[i]= 0;
+ /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0 */
+ for(int i = 1; i < npar; ++i)
+ out->data[i]= p->lambdaL2*beta->data[i];
+ for(int i = 1; i < npar; ++i)
+ out->data[i]+= p->lambdaL1*((beta->data[i]>0)-(beta->data[i]<0));
+
+ for(int i = 0; i < n; ++i) {
+ double pn=0;
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < K; ++k) {
+ if(gsl_matrix_int_get(p->X,i,k)>0)
+ Xbetai+=beta->data[gsl_matrix_int_get(p->X,i,k)-1+iParm];
+ iParm+=p->nlev->data[k]-1;
+ }
+ // total += y->data[i]*Xbetai-log(1+gsl_sf_exp(Xbetai));
+ pn= -( p->y->data[i] - 1/(1 + gsl_sf_exp(-Xbetai)) );
+
+ out->data[0]+= pn;
+ iParm=1;
+ for(int k = 0; k < K; ++k) {
+ if(gsl_matrix_int_get(p->X,i,k)>0)
+ out->data[gsl_matrix_int_get(p->X,i,k)-1+iParm]+=pn;
+ iParm+=p->nlev->data[k]-1;
+ }
+ }
+}
+
+
+/* The Hessian of f */
+void
+wgsl_cat_optim_hessian (const gsl_vector *beta, void *params,
+ gsl_matrix *out)
+{
+ fix_parm_cat_T *p = (fix_parm_cat_T *)params;
+ int n = p->y->size;
+ int K = p->X->size2;
+ int npar = beta->size;
+ // Intitialize Hessian out necessary ???
+ gsl_matrix_set_zero(out);
+ /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0*/
+ for(int i = 1; i < npar; ++i)
+ gsl_matrix_set(out,i,i,(p->lambdaL2)); // Double check this
+ // L1 penalty not working yet, as not differentiable, I may need to do coordinate descent (as in glm_net)
+
+
+ for(int i = 0; i < n; ++i) {
+ double pn=0;
+ double aux=0;
+ double Xbetai=beta->data[0];
+ int iParm2=1;
+ int iParm1=1;
+ for(int k = 0; k < K; ++k) {
+ if(gsl_matrix_int_get(p->X,i,k)>0)
+ Xbetai+=beta->data[gsl_matrix_int_get(p->X,i,k)-1+iParm1];
+ iParm1+=p->nlev->data[k]-1; //-1?
+ }
+ // total += y->data[i]*Xbetai-log(1+gsl_sf_exp(Xbetai));
+ pn= 1/(1 + gsl_sf_exp(-Xbetai));
+ // Add a protection for pn very close to 0 or 1?
+ aux=pn*(1-pn);
+ *gsl_matrix_ptr(out,0,0)+=aux;
+ iParm2=1;
+ for(int k2 = 0; k2 < K; ++k2) {
+ if(gsl_matrix_int_get(p->X,i,k2)>0)
+ *gsl_matrix_ptr(out,0,gsl_matrix_int_get(p->X,i,k2)-1+iParm2)+=aux;
+ iParm2+=p->nlev->data[k2]-1; //-1?
+ }
+ iParm1=1;
+ for(int k1 = 0; k1 < K; ++k1) {
+ if(gsl_matrix_int_get(p->X,i,k1)>0)
+ *gsl_matrix_ptr(out,gsl_matrix_int_get(p->X,i,k1)-1+iParm1,0)+=aux;
+ iParm2=1;
+ for(int k2 = 0; k2 < K; ++k2) {
+ if((gsl_matrix_int_get(p->X,i,k1)>0) && (gsl_matrix_int_get(p->X,i,k2)>0))
+ *gsl_matrix_ptr(out
+ ,gsl_matrix_int_get(p->X,i,k1)-1+iParm1
+ ,gsl_matrix_int_get(p->X,i,k2)-1+iParm2
+ )+=aux;
+ iParm2+=p->nlev->data[k2]-1; //-1?
+ }
+ iParm1+=p->nlev->data[k1]-1; //-1?
+ }
+ }
+}
+
+
+double wgsl_cat_optim_f(gsl_vector *v, void *params)
+{
+ double mLogLik=0;
+ fix_parm_cat_T *p = (fix_parm_cat_T *)params;
+ mLogLik = fLogit_cat(v,p->X,p->nlev,p->y,p->lambdaL1,p->lambdaL2);
+ return mLogLik;
+}
+
+
+/* Compute both f and df together. */
+void
+wgsl_cat_optim_fdf (gsl_vector *x, void *params,
+ double *f, gsl_vector *df)
+{
+ *f = wgsl_cat_optim_f(x, params);
+ wgsl_cat_optim_df(x, params, df);
+}
+
+
+int logistic_cat_fit(gsl_vector *beta
+ ,gsl_matrix_int *X
+ ,gsl_vector_int *nlev
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2)
+{
+ double mLogLik=0;
+ fix_parm_cat_T p;
+ int npar = beta->size;
+ int iter=0;
+ double maxchange=0;
+
+ //Intializing fix parameters
+ p.X=X;
+ p.nlev=nlev;
+ p.y=y;
+ p.lambdaL1=lambdaL1;
+ p.lambdaL2=lambdaL2;
+
+ //Initial fit
+ //#ifdef _RPR_DEBUG_
+ mLogLik = wgsl_cat_optim_f(beta,&p);
+ //fprintf(stderr,"#Initial -log(Lik(0))=%lf\n",mLogLik);
+ //#endif //_RPR_DEBUG
+
+ gsl_matrix *myH = gsl_matrix_alloc(npar,npar); /* Hessian matrix*/
+ gsl_vector *stBeta = gsl_vector_alloc(npar); /* Direction to move */
+
+ gsl_vector *myG = gsl_vector_alloc(npar); /* Gradient*/
+ gsl_vector *tau = gsl_vector_alloc(npar); /* tau for QR*/
+
+ for(iter=0;iter<100;iter++){
+ wgsl_cat_optim_hessian(beta,&p,myH); //Calculate Hessian
+ wgsl_cat_optim_df(beta,&p,myG); //Calculate Gradient
+ gsl_linalg_QR_decomp(myH,tau); //Calculate next beta
+ gsl_linalg_QR_solve(myH,tau,myG,stBeta);
+ gsl_vector_sub(beta,stBeta);
+
+ //Monitor convergence
+ maxchange=0;
+ for(int i=0;i<npar; i++)
+ if(maxchange<fabs(stBeta->data[i]))
+ maxchange=fabs(stBeta->data[i]);
+
+#ifdef _RPR_DEBUG_
+ mLogLik = wgsl_cat_optim_f(beta,&p);
+ //fprintf(stderr,"#iter %d, -log(Lik(0))=%lf,%lf\n",(int)iter,mLogLik,maxchange);
+#endif //_RPR_DEBUG
+
+ if(maxchange<1E-4)
+ break;
+ }
+
+#ifdef _RPR_DEBUG_
+ //for (int i = 0; i < npar; i++)
+ // fprintf(stderr,"#par_%d= %lf\n",i,beta->data[i]);
+#endif //_RPR_DEBUG
+
+ //Final fit
+ mLogLik = wgsl_cat_optim_f(beta,&p);
+ //fprintf(stderr,"#Final %d) -log(Lik(0))=%lf, maxchange %lf\n",iter,mLogLik,maxchange);
+
+ gsl_vector_free (tau);
+ gsl_vector_free (stBeta);
+ gsl_vector_free (myG);
+ gsl_matrix_free (myH);
+
+ return 0;
+}
+
+
+/***************/
+/* Continuous */
+/***************/
+
+// I need to bundle all the data that goes to the function to optimze together.
+typedef struct{
+ gsl_matrix *Xc; // continuous covariates Matrix Nobs x Kc
+ gsl_vector *y;
+ double lambdaL1;
+ double lambdaL2;
+}fix_parm_cont_T;
+
+
+double fLogit_cont(gsl_vector *beta
+ ,gsl_matrix *Xc
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2)
+{
+ int n = y->size;
+ int npar = beta->size;
+ double total = 0;
+ double aux = 0;
+ /* omp_set_num_threads(ompthr); */
+ /* /\* Changed loop start at 1 instead of 0 to avoid regularization of beta_0*\/ */
+ /* /\*#pragma omp parallel for reduction (+:total)*\/ */
+ for(int i = 1; i < npar; ++i)
+ total += beta->data[i]*beta->data[i];
+ total = (-total*lambdaL2/2);
+ /* /\*#pragma omp parallel for reduction (+:aux)*\/ */
+ for(int i = 1; i < npar; ++i)
+ aux += (beta->data[i]>0 ? beta->data[i] : -beta->data[i]);
+ total = total-aux*lambdaL1;
+ /* #pragma omp parallel for schedule(static) shared(n,beta,X,nlev,y) reduction (+:total) */
+ for(int i = 0; i < n; ++i) {
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < (Xc->size2); ++k)
+ Xbetai+= gsl_matrix_get(Xc,i,k)*beta->data[iParm++];
+ total += y->data[i]*Xbetai-gsl_sf_log_1plusx(gsl_sf_exp(Xbetai));
+ }
+ return -total;
+}
+
+
+void logistic_cont_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1)
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc (NULL if not used)
+ ,gsl_vector *yhat //Vector of prob. predicted by the logistic
+ )
+{
+ for(int i = 0; i < Xc->size1; ++i) {
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < (Xc->size2); ++k)
+ Xbetai+= gsl_matrix_get(Xc,i,k)*beta->data[iParm++];
+ yhat->data[i]=1/(1 + gsl_sf_exp(-Xbetai));
+ }
+}
+
+
+/* The gradient of f, df = (df/dx, df/dy). */
+void
+wgsl_cont_optim_df (const gsl_vector *beta, void *params,
+ gsl_vector *out)
+{
+ fix_parm_cont_T *p = (fix_parm_cont_T *)params;
+ int n = p->y->size;
+ int Kc = p->Xc->size2;
+ int npar = beta->size;
+ // Intitialize gradient out necessary?
+ for(int i = 0; i < npar; ++i)
+ out->data[i]= 0;
+ /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0 */
+ for(int i = 1; i < npar; ++i)
+ out->data[i]= p->lambdaL2*beta->data[i];
+ for(int i = 1; i < npar; ++i)
+ out->data[i]+= p->lambdaL1*((beta->data[i]>0)-(beta->data[i]<0));
+
+ for(int i = 0; i < n; ++i) {
+ double pn=0;
+ double Xbetai=beta->data[0];
+ int iParm=1;
+ for(int k = 0; k < Kc; ++k)
+ Xbetai+= gsl_matrix_get(p->Xc,i,k)*beta->data[iParm++];
+
+ pn= -( p->y->data[i] - 1/(1 + gsl_sf_exp(-Xbetai)) );
+
+ out->data[0]+= pn;
+ iParm=1;
+ // Adding the continuous
+ for(int k = 0; k < Kc; ++k) {
+ out->data[iParm++] += gsl_matrix_get(p->Xc,i,k)*pn;
+ }
+ }
+}
+
+
+/* The Hessian of f */
+void
+wgsl_cont_optim_hessian (const gsl_vector *beta, void *params,
+ gsl_matrix *out)
+{
+ fix_parm_cont_T *p = (fix_parm_cont_T *)params;
+ int n = p->y->size;
+ int Kc = p->Xc->size2;
+ int npar = beta->size;
+ gsl_vector *gn = gsl_vector_alloc(npar); /* gn */
+ // Intitialize Hessian out necessary ???
+ gsl_matrix_set_zero(out);
+ /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0*/
+ for(int i = 1; i < npar; ++i)
+ gsl_matrix_set(out,i,i,(p->lambdaL2)); // Double check this
+ // L1 penalty not working yet, as not differentiable, I may need to do coordinate descent (as in glm_net)
+
+ for(int i = 0; i < n; ++i) {
+ double pn=0;
+ double aux=0;
+ double Xbetai=beta->data[0];
+ int iParm1=1;
+ for(int k = 0; k < Kc; ++k)
+ Xbetai+= gsl_matrix_get(p->Xc,i,k)*beta->data[iParm1++];
+
+ pn= 1/(1 + gsl_sf_exp(-Xbetai));
+ // Add a protection for pn very close to 0 or 1?
+ aux=pn*(1-pn);
+
+ // Calculate sub-gradient vector gn
+ gsl_vector_set_zero(gn);
+ gn->data[0]= 1;
+ iParm1=1;
+ for(int k = 0; k < Kc; ++k) {
+ gn->data[iParm1++] = gsl_matrix_get(p->Xc,i,k);
+ }
+
+ for(int k1=0;k1<npar; ++k1)
+ if(gn->data[k1]!=0)
+ for(int k2=0;k2<npar; ++k2)
+ if(gn->data[k2]!=0)
+ *gsl_matrix_ptr(out,k1,k2) += (aux * gn->data[k1] * gn->data[k2]);
+ }
+ gsl_vector_free(gn);
+}
+
+
+double wgsl_cont_optim_f(gsl_vector *v, void *params)
+{
+ double mLogLik=0;
+ fix_parm_cont_T *p = (fix_parm_cont_T *)params;
+ mLogLik = fLogit_cont(v,p->Xc,p->y,p->lambdaL1,p->lambdaL2);
+ return mLogLik;
+}
+
+
+/* Compute both f and df together. */
+void
+wgsl_cont_optim_fdf (gsl_vector *x, void *params,
+ double *f, gsl_vector *df)
+{
+ *f = wgsl_cont_optim_f(x, params);
+ wgsl_cont_optim_df(x, params, df);
+}
+
+
+int logistic_cont_fit(gsl_vector *beta
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc (NULL if not used)
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2)
+{
+
+ double mLogLik=0;
+ fix_parm_cont_T p;
+ int npar = beta->size;
+ int iter=0;
+ double maxchange=0;
+
+ //Intializing fix parameters
+ p.Xc=Xc;
+ p.y=y;
+ p.lambdaL1=lambdaL1;
+ p.lambdaL2=lambdaL2;
+
+ //Initial fit
+ //#ifdef _RPR_DEBUG_
+ mLogLik = wgsl_cont_optim_f(beta,&p);
+ //fprintf(stderr,"#Initial -log(Lik(0))=%lf\n",mLogLik);
+ //#endif //_RPR_DEBUG
+
+ gsl_matrix *myH = gsl_matrix_alloc(npar,npar); /* Hessian matrix*/
+ gsl_vector *stBeta = gsl_vector_alloc(npar); /* Direction to move */
+
+ gsl_vector *myG = gsl_vector_alloc(npar); /* Gradient*/
+ gsl_vector *tau = gsl_vector_alloc(npar); /* tau for QR*/
+
+ for(iter=0;iter<100;iter++){
+ wgsl_cont_optim_hessian(beta,&p,myH); //Calculate Hessian
+ wgsl_cont_optim_df(beta,&p,myG); //Calculate Gradient
+ gsl_linalg_QR_decomp(myH,tau); //Calculate next beta
+ gsl_linalg_QR_solve(myH,tau,myG,stBeta);
+ gsl_vector_sub(beta,stBeta);
+
+ //Monitor convergence
+ maxchange=0;
+ for(int i=0;i<npar; i++)
+ if(maxchange<fabs(stBeta->data[i]))
+ maxchange=fabs(stBeta->data[i]);
+
+#ifdef _RPR_DEBUG_
+ mLogLik = wgsl_cont_optim_f(beta,&p);
+ //fprintf(stderr,"#iter %d, -log(Lik(0))=%lf,%lf\n",(int)iter,mLogLik,maxchange);
+#endif //_RPR_DEBUG
+
+ if(maxchange<1E-4)
+ break;
+ }
+
+#ifdef _RPR_DEBUG_
+ //for (int i = 0; i < npar; i++)
+ // fprintf(stderr,"#par_%d= %lf\n",i,beta->data[i]);
+#endif //_RPR_DEBUG
+
+ //Final fit
+ mLogLik = wgsl_cont_optim_f(beta,&p);
+ //fprintf(stderr,"#Final %d) -log(Lik(0))=%lf, maxchange %lf\n",iter,mLogLik,maxchange);
+
+ gsl_vector_free (tau);
+ gsl_vector_free (stBeta);
+ gsl_vector_free (myG);
+ gsl_matrix_free (myH);
+
+ return 0;
+}
+
diff --git a/src/logistic.h b/src/logistic.h new file mode 100644 index 0000000..a68ee09 --- /dev/null +++ b/src/logistic.h @@ -0,0 +1,70 @@ +#ifndef LOGISTIC_H_ /* Include guard */
+#define LOGISTIC_H_
+
+/* Mixed interface */
+void logistic_mixed_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc
+ ,gsl_matrix_int *X //Matrix Nobs x K
+ ,gsl_vector_int *nlev // Vector with number categories
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc
+ ,gsl_vector *yhat //Vector of prob. predicted by the logistic
+ );
+
+int logistic_mixed_fit(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc
+ ,gsl_matrix_int *X //Matrix Nobs x K
+ ,gsl_vector_int *nlev // Vector with number categories
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc
+ ,gsl_vector *y //Vector of prob. to predict
+ ,double lambdaL1 // Regularization L1 0.0 if not used
+ ,double lambdaL2); // Regularization L2 0.0 if not used
+
+double fLogit_mixed(gsl_vector *beta
+ ,gsl_matrix_int *X
+ ,gsl_vector_int *nlev
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2);
+
+
+/* Categorical only interface */
+void logistic_cat_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc
+ ,gsl_matrix_int *X //Matrix Nobs x K
+ ,gsl_vector_int *nlev // Vector with number categories
+ ,gsl_vector *yhat //Vector of prob. predicted by the logistic
+ );
+
+int logistic_cat_fit(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc
+ ,gsl_matrix_int *X //Matrix Nobs x K
+ ,gsl_vector_int *nlev // Vector with number categories
+ ,gsl_vector *y //Vector of prob. to predict
+ ,double lambdaL1 // Regularization L1 0.0 if not used
+ ,double lambdaL2); // Regularization L2 0.0 if not used
+
+double fLogit_cat(gsl_vector *beta
+ ,gsl_matrix_int *X
+ ,gsl_vector_int *nlev
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2);
+
+
+/* Continuous only interface */
+void logistic_cont_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc
+ ,gsl_vector *yhat //Vector of prob. predicted by the logistic
+ );
+
+int logistic_cont_fit(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc
+ ,gsl_vector *y //Vector of prob. to predict
+ ,double lambdaL1 // Regularization L1 0.0 if not used
+ ,double lambdaL2); // Regularization L2 0.0 if not used
+
+double fLogit_cont(gsl_vector *beta
+ ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc
+ ,gsl_vector *y
+ ,double lambdaL1
+ ,double lambdaL2);
+
+
+#endif // LOGISTIC_H_
diff --git a/src/varcov.cpp b/src/varcov.cpp new file mode 100644 index 0000000..fdc6f10 --- /dev/null +++ b/src/varcov.cpp @@ -0,0 +1,482 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <iostream> +#include <fstream> +#include <sstream> +#include <string> +#include <iomanip> +#include <bitset> +#include <vector> +#include <map> +#include <set> +#include <cstring> +#include <cmath> +#include <stdio.h> +#include <stdlib.h> + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_cdf.h" + +#include "lapack.h" +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "varcov_float.h" +#include "io_float.h" +#include "mathfunc_float.h" +#else +#include "param.h" +#include "varcov.h" +#include "io.h" +#include "mathfunc.h" +#endif + + +using namespace std; + + + + +void VARCOV::CopyFromParam (PARAM &cPar) +{ + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + + time_opt=0.0; + + window_cm=cPar.window_cm; + window_bp=cPar.window_bp; + window_ns=cPar.window_ns; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void VARCOV::CopyToParam (PARAM &cPar) +{ + cPar.time_opt=time_opt; + return; +} + + + +//chr rs ps n_idv allele1 allele0 af var window_size r2 (r2_11,n_11,r2_12,n_12...r2_1m,n_1m) +void VARCOV::WriteCov (const int flag, const vector<SNPINFO> &snpInfo_sub, const vector<vector<double> > &Cov_mat) +{ + string file_cov; + file_cov=path_out+"/"+file_out; + file_cov+=".cor.txt"; + + ofstream outfile; + + if (flag==0) { + outfile.open (file_cov.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<<file_cov<<endl; return;} + + outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_mis" + <<"\t"<<"n_obs"<<"\t"<<"allele1"<<"\t"<<"allele0" + <<"\t"<<"af"<<"\t"<<"window_size" + <<"\t"<<"var"<<"\t"<<"cor"<<endl; + } else { + outfile.open (file_cov.c_str(), ofstream::app); + if (!outfile) {cout<<"error writing file: "<<file_cov<<endl; return;} + + for (size_t i=0; i<Cov_mat.size(); i++) { + outfile<<snpInfo_sub[i].chr<<"\t"<<snpInfo_sub[i].rs_number<<"\t"<<snpInfo_sub[i].base_position<<"\t"<<snpInfo_sub[i].n_miss<<"\t"<<snpInfo_sub[i].n_idv<<"\t"<<snpInfo_sub[i].a_minor<<"\t"<<snpInfo_sub[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo_sub[i].maf<<"\t"<<Cov_mat[i].size()-1<<"\t"; + outfile<<scientific<<setprecision(6)<<Cov_mat[i][0]<<"\t"; + + if (Cov_mat[i].size()==1) { + outfile<<"NA"; + } else { + for (size_t j=1; j<Cov_mat[i].size(); j++) { + if (j==(Cov_mat[i].size()-1)) { + outfile<<Cov_mat[i][j]; + } else { + outfile<<Cov_mat[i][j]<<","; + } + } + } + + outfile<<endl; + } + } + + outfile.close(); + outfile.clear(); + return; +} + + + + +// sort SNPs first based on chromosomes then based on bp (or cm, if bp is not available) +//if chr1=chr2 and bp1=bp2, then return with the same order +bool CompareSNPinfo (const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) +{ + int c_chr=snpInfo1.chr.compare(snpInfo2.chr); + long int c_bp=snpInfo1.base_position-snpInfo2.base_position; + + if(c_chr<0) { + return true; + } else if (c_chr>0) { + return false; + } else { + if (c_bp<0) { + return true; + } else if (c_bp>0) { + return false; + } else { + return true; + } + } +} + + +// do not sort SNPs (because gzip files do not support random access) +// then calculate n_nb, the number of neighbours, for each snp +void VARCOV::CalcNB (vector<SNPINFO> &snpInfo_sort) +{ + // stable_sort(snpInfo_sort.begin(), snpInfo_sort.end(), CompareSNPinfo); + + size_t t2=0, n_nb=0; + for (size_t t=0; t<indicator_snp.size(); ++t) { + if (indicator_snp[t]==0) {continue;} + + if (snpInfo_sort[t].chr=="-9" || (snpInfo_sort[t].cM==-9 && window_cm!=0) || (snpInfo_sort[t].base_position==-9 && window_bp!=0) ) { + snpInfo_sort[t].n_nb=0; continue; + } + + if (t==indicator_snp.size()-1) {snpInfo_sort[t].n_nb=0; continue;} + + t2=t+1; n_nb=0; + + while (t2<indicator_snp.size() && snpInfo_sort[t2].chr==snpInfo_sort[t].chr && indicator_snp[t2]==0) {t2++;} + + while (t2<indicator_snp.size() && snpInfo_sort[t2].chr==snpInfo_sort[t].chr && (snpInfo_sort[t2].cM-snpInfo_sort[t].cM<window_cm || window_cm==0) && (snpInfo_sort[t2].base_position-snpInfo_sort[t].base_position<window_bp || window_bp==0) && (n_nb<window_ns|| window_ns==0) ) { + t2++; n_nb++; + while (t2<indicator_snp.size() && snpInfo_sort[t2].chr==snpInfo_sort[t].chr && indicator_snp[t2]==0) {t2++;} + } + + snpInfo_sort[t].n_nb=n_nb; + } + + return; +} + + + +//vector double is centered to have mean 0 +void Calc_Cor(vector<vector<double> > &X_mat, vector<double> &cov_vec) +{ + cov_vec.clear(); + + double v1, v2, r; + vector<double> x_vec=X_mat[0]; + + lapack_ddot(x_vec, x_vec, v1); + cov_vec.push_back(v1/(double)x_vec.size() ); + + for (size_t i=1; i<X_mat.size(); i++) { + lapack_ddot(X_mat[i], x_vec, r); + lapack_ddot(X_mat[i], X_mat[i], v2); + r/=sqrt(v1*v2); + + cov_vec.push_back(r); + } + + return; +} + +/* +//somehow this can produce nan for some snps; perhaps due to missing values? +//vector int is not centered, and have 0/1/2 values only +//missing value is -9; to calculate covariance, these values are replaced by +void Calc_Cor(const vector<vector<int> > &X_mat, vector<double> &cov_vec) +{ + cov_vec.clear(); + + int d1, d2, m1, m2, n1, n2, n12; + double m1d, m2d, m12d, v1d, v2d, v; + + vector<int> x_vec=X_mat[0]; + + //calculate m2 + m2=0; n2=0; + for (size_t j=0; j<x_vec.size(); j++) { + d2=x_vec[j]; + if (d2==-9) {continue;} + m2+=d2; n2++; + } + m2d=(double)m2/(double)n2; + + for (size_t i=0; i<X_mat.size(); i++) { + //calculate m1 + m1=0; n1=0; + for (size_t j=0; j<x_vec.size(); j++) { + d1=X_mat[i][j]; + if (d1==-9) {continue;} + m1+=d1; n1++; + } + m1d=(double)m1/(double)n1; + + //calculate v1 + m1=0; m2=0; m12d=0; n12=0; + for (size_t j=0; j<x_vec.size(); j++) { + d1=X_mat[i][j]; + if (d1==-9) { + n12++; + } else if (d1!=0) { + if (d1==1) {m12d+=1;} else if (d1==2) {m12d+=4;} else {m12d+=(double)(d1*d1);} + } + } + + v1d=((double)m12d+m1d*(double)m2+m2d*(double)m1+(double)n12*m1d*m2d)/(double)x_vec.size()-m1d*m2d; + + //calculate covariance + if (i!=0) { + m1=0; m2=0; m12d=0; n12=0; + for (size_t j=0; j<x_vec.size(); j++) { + d1=X_mat[i][j]; d2=x_vec[j]; + if (d1==-9 && d2==-9) { + n12++; + } else if (d1==-9) { + m2+=d2; + } else if (d2==-9) { + m1+=d1; + } else if (d1!=0 && d2!=0) { + if (d1==1) {m12d+=d2;} else if (d1==2) {m12d+=d2+d2;} else {m12d+=(double)(d1*d2);} + } + } + + v=((double)m12d+m1d*(double)m2+m2d*(double)m1+(double)n12*m1d*m2d)/(double)x_vec.size()-m1d*m2d; + v/=sqrt(v1d*v2d); + } else { + v2d=v1d; + v=v1d/(double)x_vec.size(); + } + + cov_vec.push_back(v); + } + return; +} +*/ + + +//read the genotype file again, calculate r2 between pairs of SNPs within a window, output the file every 10K SNPs +//the output results are sorted based on chr and bp +//output format similar to assoc.txt files (remember n_miss is replaced by n_idv) + +//r2 between the current SNP and every following SNPs within the window_size (which can vary if cM was used) +//read bimbam mean genotype file and calculate the covariance matrix for neighboring SNPs +//output values at 10000-SNP-interval +void VARCOV::AnalyzeBimbam () +{ + igzstream infile (file_geno.c_str(), igzstream::in); + if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;} + + //calculate the number of right-hand-side neighbours for each SNP + vector<SNPINFO> snpInfo_sub; + CalcNB(snpInfo); + + size_t ni_test=0; + for (size_t i=0; i<indicator_idv.size(); i++) { + ni_test+=indicator_idv[i]; + } + + gsl_vector *geno=gsl_vector_alloc (ni_test); + double geno_mean; + + vector<double> x_vec, cov_vec; + vector<vector<double> > X_mat, Cov_mat; + + for (size_t i=0; i<ni_test; i++) { + x_vec.push_back(0); + } + + WriteCov (0, snpInfo_sub, Cov_mat); + + size_t t2=0, inc; + int n_nb=0; + + for (size_t t=0; t<indicator_snp.size(); ++t) { + if (t%d_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (indicator_snp[t]==0) {continue;} + // if (t>=2) {break;} + + if (X_mat.size()==0) { + n_nb=snpInfo[t].n_nb+1; + } else { + n_nb=snpInfo[t].n_nb-n_nb+1; + } + + for (int i=0; i<n_nb; i++) { + if (X_mat.size()==0) {t2=t;} + + //read a line of the snp is filtered out + inc=0; + while (t2<indicator_snp.size() && indicator_snp[t2]==0) { + t2++; inc++; + } + + Bimbam_ReadOneSNP (inc, indicator_idv, infile, geno, geno_mean); + gsl_vector_add_constant (geno, -1.0*geno_mean); + + for (size_t j=0; j<geno->size; j++) { + x_vec[j]=gsl_vector_get(geno, j); + } + X_mat.push_back(x_vec); + + t2++; + } + + n_nb=snpInfo[t].n_nb; + + Calc_Cor(X_mat, cov_vec); + Cov_mat.push_back(cov_vec); + snpInfo_sub.push_back(snpInfo[t]); + + X_mat.erase(X_mat.begin()); + + //write out var/cov values + if (Cov_mat.size()==10000) { + WriteCov (1, snpInfo_sub, Cov_mat); + Cov_mat.clear(); + snpInfo_sub.clear(); + } + } + + if (Cov_mat.size()!=0) { + WriteCov (1, snpInfo_sub, Cov_mat); + Cov_mat.clear(); + snpInfo_sub.clear(); + } + + gsl_vector_free(geno); + + infile.close(); + infile.clear(); + + return; +} + + + + +void VARCOV::AnalyzePlink () +{ + string file_bed=file_bfile+".bed"; + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;} + + //calculate the number of right-hand-side neighbours for each SNP + vector<SNPINFO> snpInfo_sub; + CalcNB(snpInfo); + + size_t ni_test=0; + for (size_t i=0; i<indicator_idv.size(); i++) { + ni_test+=indicator_idv[i]; + } + + gsl_vector *geno=gsl_vector_alloc (ni_test); + double geno_mean; + + vector<double> x_vec, cov_vec; + vector<vector<double> > X_mat, Cov_mat; + + for (size_t i=0; i<ni_test; i++) { + x_vec.push_back(0); + } + + WriteCov (0, snpInfo_sub, Cov_mat); + + size_t t2=0, inc; + int n_nb=0; + + for (size_t t=0; t<indicator_snp.size(); ++t) { + if (t%d_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (indicator_snp[t]==0) {continue;} + // if (t>=2) {break;} + + if (X_mat.size()==0) { + n_nb=snpInfo[t].n_nb+1; + } else { + n_nb=snpInfo[t].n_nb-n_nb+1; + } + + for (int i=0; i<n_nb; i++) { + if (X_mat.size()==0) {t2=t;} + + //read a line of the snp is filtered out + inc=0; + while (t2<indicator_snp.size() && indicator_snp[t2]==0) { + t2++; inc++; + } + + Plink_ReadOneSNP (t2, indicator_idv, infile, geno, geno_mean); + gsl_vector_add_constant (geno, -1.0*geno_mean); + + for (size_t j=0; j<geno->size; j++) { + x_vec[j]=gsl_vector_get(geno, j); + } + X_mat.push_back(x_vec); + + t2++; + } + + n_nb=snpInfo[t].n_nb; + + Calc_Cor(X_mat, cov_vec); + Cov_mat.push_back(cov_vec); + snpInfo_sub.push_back(snpInfo[t]); + + X_mat.erase(X_mat.begin()); + + //write out var/cov values + if (Cov_mat.size()==10000) { + WriteCov (1, snpInfo_sub, Cov_mat); + Cov_mat.clear(); + snpInfo_sub.clear(); + } + } + + if (Cov_mat.size()!=0) { + WriteCov (1, snpInfo_sub, Cov_mat); + Cov_mat.clear(); + snpInfo_sub.clear(); + } + + gsl_vector_free(geno); + + infile.close(); + infile.clear(); + + return; +} diff --git a/src/varcov.h b/src/varcov.h new file mode 100644 index 0000000..b380b8c --- /dev/null +++ b/src/varcov.h @@ -0,0 +1,72 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef __VARCOV_H__ +#define __VARCOV_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + + + +class VARCOV { + +public: + // IO related parameters + string file_out; + string path_out; + string file_geno; + string file_bfile; + int d_pace; + + vector<int> indicator_idv; + vector<int> indicator_snp; + + vector<SNPINFO> snpInfo; + + double time_opt; + + // Class specific parameters + double window_cm; + size_t window_bp; + size_t window_ns; + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void CalcNB (vector<SNPINFO> &snpInfo_sort); + void WriteCov (const int flag, const vector<SNPINFO> &snpInfo_sub, const vector<vector<double> > &Cov_mat); + void AnalyzeBimbam (); + void AnalyzePlink (); +}; + +#endif + + |