/* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011-2017, Xiang Zhou This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <iostream> #include <fstream> #include <sstream> #include <iomanip> #include <cmath> #include <iostream> #include <stdio.h> #include <stdlib.h> #include <bitset> #include <vector> #include <set> #include <map> #include <string> #include <cstring> #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_linalg.h" #include "gsl/gsl_blas.h" #include "gsl/gsl_cdf.h" #include "gsl/gsl_multiroots.h" #include "gsl/gsl_min.h" #include "Eigen/Dense" #include "param.h" #include "io.h" #include "lapack.h" #include "eigenlib.h" #include "gzstream.h" #include "mathfunc.h" #include "lmm.h" #include "vc.h" using namespace std; using namespace Eigen; // In this file, X, Y are already transformed (i.e. UtX and UtY). void VC::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; file_cat=cPar.file_cat; file_beta=cPar.file_beta; file_cor=cPar.file_cor; setSnps=cPar.setSnps; file_out=cPar.file_out; path_out=cPar.path_out; time_UtX=0.0; time_opt=0.0; v_traceG=cPar.v_traceG; ni_total=cPar.ni_total; ns_total=cPar.ns_total; ns_test=cPar.ns_test; crt=cPar.crt; window_cm=cPar.window_cm; window_bp=cPar.window_bp; window_ns=cPar.window_ns; n_vc=cPar.n_vc; return; } void VC::CopyToParam (PARAM &cPar) { cPar.time_UtX=time_UtX; cPar.time_opt=time_opt; cPar.v_pve=v_pve; cPar.v_se_pve=v_se_pve; cPar.v_sigma2=v_sigma2; cPar.v_se_sigma2=v_se_sigma2; cPar.pve_total=pve_total; cPar.se_pve_total=se_pve_total; cPar.v_traceG=v_traceG; cPar.v_beta=v_beta; cPar.v_se_beta=v_se_beta; cPar.ni_total=ni_total; cPar.ns_total=ns_total; cPar.ns_test=ns_test; cPar.n_vc=n_vc; return; } void VC::WriteFile_qs (const gsl_vector *s_vec, const gsl_vector *q_vec, const gsl_vector *qvar_vec, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat) { string file_str; file_str=path_out+"/"+file_out; file_str+=".qvec.txt"; ofstream outfile_q (file_str.c_str(), ofstream::out); if (!outfile_q) { cout<<"error writing file: "<<file_str.c_str()<<endl; return; } for (size_t i=0; i<s_vec->size; i++) { outfile_q<<gsl_vector_get(s_vec, i)<<endl; } for (size_t i=0; i<q_vec->size; i++) { outfile_q<<gsl_vector_get(q_vec, i)<<endl; } for (size_t i=0; i<qvar_vec->size; i++) { outfile_q<<gsl_vector_get(qvar_vec, i)<<endl; } outfile_q.clear(); outfile_q.close(); file_str=path_out+"/"+file_out; file_str+=".smat.txt"; ofstream outfile_s (file_str.c_str(), ofstream::out); if (!outfile_s) { cout<<"error writing file: "<<file_str.c_str()<<endl; return; } for (size_t i=0; i<S_mat->size1; i++) { for (size_t j=0; j<S_mat->size2; j++) { outfile_s<<gsl_matrix_get(S_mat, i, j)<<"\t"; } outfile_s<<endl; } for (size_t i=0; i<Svar_mat->size1; i++) { for (size_t j=0; j<Svar_mat->size2; j++) { outfile_s<<gsl_matrix_get(Svar_mat, i, j)<<"\t"; } outfile_s<<endl; } outfile_s.clear(); outfile_s.close(); return; } void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p) { size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1, n_cvt=(p->W)->size2; gsl_matrix *K_temp=gsl_matrix_alloc(n1, n1); gsl_matrix *HiW=gsl_matrix_alloc(n1, n_cvt); gsl_matrix *WtHiW=gsl_matrix_alloc(n_cvt, n_cvt); gsl_matrix *WtHiWi=gsl_matrix_alloc(n_cvt, n_cvt); gsl_matrix *WtHiWiWtHi=gsl_matrix_alloc(n_cvt, n1); double sigma2; // Calculate H = \sum_i^{k+1} \sigma_i^2 K_i. gsl_matrix_set_zero (p->P); for (size_t i=0; i<n_vc+1; i++) { if (i==n_vc) { gsl_matrix_set_identity (K_temp); } else { gsl_matrix_const_view K_sub= gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); gsl_matrix_memcpy (K_temp, &K_sub.matrix); } // When unconstrained, update on sigma2 instead of log_sigma2. if (p->noconstrain) { sigma2=gsl_vector_get (log_sigma2, i); } else { sigma2=exp(gsl_vector_get (log_sigma2, i) ); } gsl_matrix_scale(K_temp, sigma2); gsl_matrix_add (p->P, K_temp); } // Calculate H^{-1}. eigenlib_invert(p->P); eigenlib_dgemm ("N", "N", 1.0, p->P, p->W, 0.0, HiW); eigenlib_dgemm ("T", "N", 1.0, p->W, HiW, 0.0, WtHiW); eigenlib_invert(WtHiW); gsl_matrix_memcpy(WtHiWi, WtHiW); eigenlib_dgemm ("N", "T", 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); eigenlib_dgemm ("N", "N", -1.0, HiW, WtHiWiWtHi, 1.0, p->P); // Calculate Py, KPy, PKPy. gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py); double d; for (size_t i=0; i<n_vc+1; i++) { gsl_vector_view KPy=gsl_matrix_column (p->KPy_mat, i); gsl_vector_view PKPy=gsl_matrix_column (p->PKPy_mat, i); if (i==n_vc) { gsl_vector_memcpy (&KPy.vector, p->Py); } else { gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); // Seems to be important to use gsl dgemv here instead of // eigenlib_dgemv; otherwise. gsl_blas_dgemv(CblasNoTrans, 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector); } gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector); // When phenotypes are not normalized well, then some values in // the following matrix maybe NaN; change that to 0; this seems to // only happen when eigenlib_dgemv was used above. for (size_t j=0; j<p->KPy_mat->size1; j++) { d=gsl_matrix_get (p->KPy_mat, j, i); if (std::isnan(d)) { gsl_matrix_set (p->KPy_mat, j, i, 0); cout<<"nan appears in "<<i<<" "<<j<<endl; } d=gsl_matrix_get (p->PKPy_mat, j, i); if (std::isnan(d)) { gsl_matrix_set (p->PKPy_mat, j, i, 0); cout<<"nan appears in "<<i<<" "<<j<<endl; } } } gsl_matrix_free (K_temp); gsl_matrix_free (HiW); gsl_matrix_free (WtHiW); gsl_matrix_free (WtHiWi); gsl_matrix_free (WtHiWiWtHi); return; } // Below are functions for AI algorithm. int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1) { VC_PARAM *p=(VC_PARAM *) params; size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1; double tr, d; // Update parameters. UpdateParam (log_sigma2, p); // Calculate dev1=-0.5*trace(PK_i)+0.5*yPKPy. for (size_t i=0; i<n_vc+1; i++) { if (i==n_vc) { tr=0; for (size_t l=0; l<n1; l++) { tr+=gsl_matrix_get (p->P, l, l); } } else { tr=0; for (size_t l=0; l<n1; l++) { gsl_vector_view P_row=gsl_matrix_row (p->P, l); gsl_vector_const_view K_col=gsl_matrix_const_column (p->K, n1*i+l); gsl_blas_ddot(&P_row.vector, &K_col.vector, &d); tr+=d; } } gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); gsl_blas_ddot(p->Py, &KPy_i.vector, &d); if (p->noconstrain) { d=(-0.5*tr+0.5*d); } else { d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i)); } gsl_vector_set(dev1, i, d); } return GSL_SUCCESS; } int LogRL_dev2 (const gsl_vector *log_sigma2, void *params, gsl_matrix *dev2) { VC_PARAM *p=(VC_PARAM *) params; size_t n_vc=log_sigma2->size-1; double d, sigma2_i, sigma2_j; // Update parameters. UpdateParam (log_sigma2, p); // Calculate dev2 = 0.5(yPKPKPy). for (size_t i=0; i<n_vc+1; i++) { gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); if (p->noconstrain) { sigma2_i=gsl_vector_get(log_sigma2, i); } else { sigma2_i=exp(gsl_vector_get(log_sigma2, i)); } for (size_t j=i; j<n_vc+1; j++) { gsl_vector_view PKPy_j=gsl_matrix_column (p->PKPy_mat, j); gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); if (p->noconstrain) { sigma2_j=gsl_vector_get(log_sigma2, j); d*=-0.5; } else { sigma2_j=exp(gsl_vector_get(log_sigma2, j)); d*=-0.5*sigma2_i*sigma2_j; } gsl_matrix_set(dev2, i, j, d); if (j!=i) {gsl_matrix_set(dev2, j, i, d);} } } gsl_matrix_memcpy (p->Hessian, dev2); return GSL_SUCCESS; } int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, gsl_matrix *dev2) { VC_PARAM *p=(VC_PARAM *) params; size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1; double tr, d, sigma2_i, sigma2_j; // Update parameters. UpdateParam (log_sigma2, p); for (size_t i=0; i<n_vc+1; i++) { if (i==n_vc) { tr=0; for (size_t l=0; l<n1; l++) { tr+=gsl_matrix_get (p->P, l, l); } } else { tr=0; for (size_t l=0; l<n1; l++) { gsl_vector_view P_row=gsl_matrix_row (p->P, l); gsl_vector_const_view K_col=gsl_matrix_const_column (p->K, n1*i+l); gsl_blas_ddot(&P_row.vector, &K_col.vector, &d); tr+=d; } } gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); gsl_blas_ddot(p->Py, &KPy_i.vector, &d); if (p->noconstrain) { sigma2_i=gsl_vector_get(log_sigma2, i); d=(-0.5*tr+0.5*d); } else { sigma2_i=exp(gsl_vector_get(log_sigma2, i)); d=(-0.5*tr+0.5*d)*sigma2_i; } gsl_vector_set(dev1, i, d); for (size_t j=i; j<n_vc+1; j++) { gsl_vector_view PKPy_j=gsl_matrix_column (p->PKPy_mat, j); gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); if (p->noconstrain) { sigma2_j=gsl_vector_get(log_sigma2, j); d*=-0.5; } else { sigma2_j=exp(gsl_vector_get(log_sigma2, j)); d*=-0.5*sigma2_i*sigma2_j; } gsl_matrix_set(dev2, i, j, d); if (j!=i) {gsl_matrix_set(dev2, j, i, d);} } } gsl_matrix_memcpy (p->Hessian, dev2); return GSL_SUCCESS; } // Read header to determine which column contains which item. bool ReadHeader_vc (const string &line, HEADER &header) { string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID", "rsid","RSID"}; set<string> rs_set(rs_ptr, rs_ptr+10); string chr_ptr[]={"chr","CHR"}; set<string> chr_set(chr_ptr, chr_ptr+2); string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"}; set<string> pos_set(pos_ptr, pos_ptr+8); string cm_ptr[]={"cm","CM"}; set<string> cm_set(cm_ptr, cm_ptr+2); string a1_ptr[]={"a1","A1","allele1","ALLELE1"}; set<string> a1_set(a1_ptr, a1_ptr+4); string a0_ptr[]={"a0","A0","allele0","ALLELE0"}; set<string> a0_set(a0_ptr, a0_ptr+4); string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"}; set<string> z_set(z_ptr, z_ptr+6); string beta_ptr[]={"beta","BETA","b","B"}; set<string> beta_set(beta_ptr, beta_ptr+4); string sebeta_ptr[]={"se_beta","SE_BETA","se","SE"}; set<string> sebeta_set(sebeta_ptr, sebeta_ptr+4); string chisq_ptr[]={"chisq","CHISQ","chisquare","CHISQUARE"}; set<string> chisq_set(chisq_ptr, chisq_ptr+4); string p_ptr[]={"p","P","pvalue","PVALUE","p-value","P-VALUE"}; set<string> p_set(p_ptr, p_ptr+6); string n_ptr[]={"n","N","ntotal","NTOTAL","n_total","N_TOTAL"}; set<string> n_set(n_ptr, n_ptr+6); string nmis_ptr[]={"nmis","NMIS","n_mis","N_MIS","n_miss","N_MISS"}; set<string> nmis_set(nmis_ptr, nmis_ptr+6); string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"}; set<string> nobs_set(nobs_ptr, nobs_ptr+4); string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq", "ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"}; set<string> af_set(af_ptr, af_ptr+10); string var_ptr[]={"var","VAR"}; set<string> var_set(var_ptr, var_ptr+2); string ws_ptr[]={"window_size","WINDOW_SIZE","ws","WS"}; set<string> ws_set(ws_ptr, ws_ptr+4); string cor_ptr[]={"cor","COR","r","R"}; set<string> cor_set(cor_ptr, cor_ptr+4); header.rs_col=0; header.chr_col=0; header.pos_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; char *ch_ptr; string type; size_t n_error=0; ch_ptr=strtok ((char *)line.c_str(), " , \t"); while (ch_ptr!=NULL) { type=ch_ptr; if (rs_set.count(type)!=0) { if (header.rs_col==0) { header.rs_col=header.coln+1; } else { cout<<"error! more than two rs columns in the file."<<endl; n_error++; } } else if (chr_set.count(type)!=0) { if (header.chr_col==0) { header.chr_col=header.coln+1; } else { cout<<"error! more than two chr columns in the file."<<endl; n_error++; } } else if (pos_set.count(type)!=0) { if (header.pos_col==0) { header.pos_col=header.coln+1; } else { cout<<"error! more than two pos columns in the file."<<endl; n_error++; } } else if (cm_set.count(type)!=0) { if (header.cm_col==0) { header.cm_col=header.coln+1; } else { cout<<"error! more than two cm columns in the file."<<endl; n_error++; } } else if (a1_set.count(type)!=0) { if (header.a1_col==0) { header.a1_col=header.coln+1; } else { cout<<"error! more than two allele1 columns in the file."<<endl; n_error++; } } else if (a0_set.count(type)!=0) { if (header.a0_col==0) { header.a0_col=header.coln+1; } else { cout<<"error! more than two allele0 columns in the file."<<endl; n_error++; } } else if (z_set.count(type)!=0) { if (header.z_col==0) { header.z_col=header.coln+1; } else { cout<<"error! more than two z columns in the file."<<endl; n_error++; } } else if (beta_set.count(type)!=0) { if (header.beta_col==0) { header.beta_col=header.coln+1; } else { cout<<"error! more than two beta columns in the file."<<endl; n_error++; } } else if (sebeta_set.count(type)!=0) { if (header.sebeta_col==0) { header.sebeta_col=header.coln+1; } else { cout<<"error! more than two se_beta columns in the file."<<endl; n_error++; } } else if (chisq_set.count(type)!=0) { if (header.chisq_col==0) { header.chisq_col=header.coln+1; } else { cout<<"error! more than two z columns in the file."<<endl; n_error++; } } else if (p_set.count(type)!=0) { if (header.p_col==0) { header.p_col=header.coln+1; } else { cout<<"error! more than two p columns in the file."<<endl; n_error++; } } else if (n_set.count(type)!=0) { if (header.n_col==0) { header.n_col=header.coln+1; } else { cout<<"error! more than two n_total columns in the file."<<endl; n_error++; } } else if (nmis_set.count(type)!=0) { if (header.nmis_col==0) { header.nmis_col=header.coln+1; } else { cout<<"error! more than two n_mis columns in the file."<<endl; n_error++; } } else if (nobs_set.count(type)!=0) { if (header.nobs_col==0) { header.nobs_col=header.coln+1; } else { cout<<"error! more than two n_obs columns in the file."<<endl; n_error++; } } else if (ws_set.count(type)!=0) { if (header.ws_col==0) { header.ws_col=header.coln+1; } else { cout<<"error! more than two window_size columns in the file."<<endl; n_error++; } } else if (af_set.count(type)!=0) { if (header.af_col==0) { header.af_col=header.coln+1; } else { cout<<"error! more than two af columns in the file."<<endl; n_error++; } } else if (cor_set.count(type)!=0) { if (header.cor_col==0) { header.cor_col=header.coln+1; } else { cout<<"error! more than two cor columns in the file."<<endl; n_error++; } } else {} ch_ptr=strtok (NULL, " , \t"); header.coln++; } if (header.cor_col!=0 && header.cor_col!=header.coln) { cout<<"error! the cor column should be the last column."<<endl; n_error++; } if (header.rs_col==0) { if (header.chr_col!=0 && header.pos_col!=0) { cout<<"missing an rs column. rs id will be replaced by chr:pos"<<endl; } else { cout<<"error! missing an rs column."<<endl; n_error++; } } if (n_error==0) {return true;} else {return false;} } // Read cov file the first time, record mapRS2in, mapRS2var (in case // var is not provided in the z file), store vec_n and vec_rs. void ReadFile_cor (const string &file_cor, const set<string> &setSnps, vector<string> &vec_rs, vector<size_t> &vec_n, vector<double> &vec_cm, vector<double> &vec_bp, map<string, size_t> &mapRS2in, map<string, double> &mapRS2var) { vec_rs.clear(); vec_n.clear(); mapRS2in.clear(); mapRS2var.clear(); igzstream infile (file_cor.c_str(), igzstream::in); if (!infile) { cout<<"error! fail to open cov file: "<<file_cor<<endl; return; } string line; char *ch_ptr; string rs, chr, a1, a0, pos, cm; double af=0, var_x=0, d_pos, d_cm; size_t n_total=0, n_mis=0, n_obs=0, ni_total=0; size_t ns_test=0, ns_total=0; HEADER header; // Header. !safeGetline(infile, line).eof(); ReadHeader_vc (line, header); if (header.n_col==0 ) { if (header.nobs_col==0 && header.nmis_col==0) { cout<<"error! missing sample size in the cor file."<<endl; } else { cout<<"total sample size will be replaced by obs/mis sample size."<<endl; } } while (!safeGetline(infile, line).eof()) { //do not read cor values this time; upto col_n-1. ch_ptr=strtok ((char *)line.c_str(), " , \t"); n_total=0; n_mis=0; n_obs=0; af=0; var_x=0; d_cm=0; d_pos=0; for (size_t i=0; i<header.coln-1; i++) { if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} if (header.pos_col!=0 && header.pos_col==i+1) { pos=ch_ptr; d_pos=atof(ch_ptr); } if (header.cm_col!=0 && header.cm_col==i+1) { cm=ch_ptr; d_cm=atof(ch_ptr); } if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;} if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;} if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} ch_ptr=strtok (NULL, " , \t"); } if (header.rs_col==0) { rs=chr+":"+pos; } if (header.n_col==0) { n_total=n_mis+n_obs; } // Record rs, n. vec_rs.push_back(rs); vec_n.push_back(n_total); if (d_cm>0) {vec_cm.push_back(d_cm);} else {vec_cm.push_back(d_cm);} if (d_pos>0) {vec_bp.push_back(d_pos);} else {vec_bp.push_back(d_pos);} // Record mapRS2in and mapRS2var. if (setSnps.size()==0 || setSnps.count(rs)!=0) { if (mapRS2in.count(rs)==0) { mapRS2in[rs]=1; if (header.var_col!=0) { mapRS2var[rs]=var_x; } else if (header.af_col!=0) { var_x=2.0*af*(1.0-af); mapRS2var[rs]=var_x; } else {} ns_test++; } else { cout<<"error! more than one snp has the same id "<<rs<< " in cor file?"<<endl; } } // Record max pos. ni_total=max(ni_total, n_total); ns_total++; } infile.close(); infile.clear(); return; } // Read beta file, store mapRS2var if var is provided here, calculate // q and var_y. void ReadFile_beta (const bool flag_priorscale, const string &file_beta, const map<string, size_t> &mapRS2cat, map<string, size_t> &mapRS2in, map<string, double> &mapRS2var, map<string, size_t> &mapRS2nsamp, gsl_vector *q_vec, gsl_vector *qvar_vec, gsl_vector *s_vec, size_t &ni_total, size_t &ns_total) { mapRS2nsamp.clear(); igzstream infile (file_beta.c_str(), igzstream::in); if (!infile) { cout<<"error! fail to open beta file: "<<file_beta<<endl; return; } string line; char *ch_ptr; string type; string rs, chr, a1, a0, pos, cm; double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, zsquare=0, af=0, var_x=0; size_t n_total=0, n_mis=0, n_obs=0; size_t ns_test=0; ns_total=0; ni_total=0; vector<double> vec_q, vec_qvar, vec_s; for (size_t i=0; i<q_vec->size; i++) { vec_q.push_back(0.0); vec_qvar.push_back(0.0); vec_s.push_back(0.0); } // Read header. HEADER header; !safeGetline(infile, line).eof(); ReadHeader_vc (line, header); if (header.n_col==0 ) { if (header.nobs_col==0 && header.nmis_col==0) { cout<<"error! missing sample size in the beta file."<<endl; } else { cout<<"total sample size will be replaced by obs/mis sample size."<<endl; } } if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0) && header.chisq_col==0 && header.p_col==0) { cout<<"error! missing z scores in the beta file."<<endl; } if (header.af_col==0 && header.var_col==0 && mapRS2var.size()==0) { cout<<"error! missing allele frequency in the beta file."<<endl; } while (!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); z=0; beta=0; se_beta=0; chisq=0; pvalue=0; n_total=0; n_mis=0; n_obs=0; af=0; var_x=0; for (size_t i=0; i<header.coln; i++) { if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr;} if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr;} if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;} if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;} if (header.z_col!=0 && header.z_col==i+1) {z=atof(ch_ptr);} if (header.beta_col!=0 && header.beta_col==i+1) {beta=atof(ch_ptr);} if (header.sebeta_col!=0 && header.sebeta_col==i+1) { se_beta=atof(ch_ptr); } if (header.chisq_col!=0 && header.chisq_col==i+1) {chisq=atof(ch_ptr);} if (header.p_col!=0 && header.p_col==i+1) {pvalue=atof(ch_ptr);} if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} ch_ptr=strtok (NULL, " , \t"); } if (header.rs_col==0) { rs=chr+":"+pos; } if (header.n_col==0) { n_total=n_mis+n_obs; } // Both z values and beta/se_beta have directions, while // chisq/pvalue do not. if (header.z_col!=0) { zsquare=z*z; } else if (header.beta_col!=0 && header.sebeta_col!=0) { z=beta/se_beta; zsquare=z*z; } else if (header.chisq_col!=0) { zsquare=chisq; } else if (header.p_col!=0) { zsquare=gsl_cdf_chisq_Qinv (pvalue, 1); } else {zsquare=0;} // If the snp is also present in cor file, then do calculations. if ((header.var_col!=0 || header.af_col!=0 || mapRS2var.count(rs)!=0) && mapRS2in.count(rs)!=0 && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) { if (mapRS2in.at(rs)>1) { cout<<"error! more than one snp has the same id "<<rs<< " in beta file?"<<endl; break; } if (header.var_col==0) { if (header.af_col!=0) { var_x=2.0*af*(1.0-af); } else { var_x=mapRS2var.at(rs); } } if (flag_priorscale) {var_x=1;} mapRS2in[rs]++; mapRS2var[rs]=var_x; mapRS2nsamp[rs]=n_total; if (mapRS2cat.size()!=0) { vec_q[mapRS2cat.at(rs) ]+=(zsquare-1.0)*var_x/(double)n_total; vec_s[mapRS2cat.at(rs) ]+=var_x; vec_qvar[mapRS2cat.at(rs) ]+= var_x*var_x/((double)n_total*(double)n_total); } else { vec_q[0]+=(zsquare-1.0)*var_x/(double)n_total; vec_s[0]+=var_x; vec_qvar[0]+=var_x*var_x/((double)n_total*(double)n_total); } ni_total=max(ni_total, n_total); ns_test++; } ns_total++; } for (size_t i=0; i<q_vec->size; i++) { gsl_vector_set(q_vec, i, vec_q[i]); gsl_vector_set(qvar_vec, i, 2.0*vec_qvar[i]); gsl_vector_set(s_vec, i, vec_s[i]); } infile.clear(); infile.close(); return; } // Read covariance file the second time. // Look for rs, n_mis+n_obs, var, window_size, cov. // If window_cm/bp/ns is provided, then use these max values to // calibrate estimates. void ReadFile_cor (const string &file_cor, const vector<string> &vec_rs, const vector<size_t> &vec_n, const vector<double> &vec_cm, const vector<double> &vec_bp, const map<string, size_t> &mapRS2cat, const map<string, size_t> &mapRS2in, const map<string, double> &mapRS2var, const map<string, size_t> &mapRS2nsamp, const size_t crt, const double &window_cm, const double &window_bp, const double &window_ns, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *qvar_vec, size_t &ni_total, size_t &ns_total, size_t &ns_test, size_t &ns_pair) { igzstream infile (file_cor.c_str(), igzstream::in); if (!infile) { cout<<"error! fail to open cov file: "<<file_cor<<endl; return; } string line; char *ch_ptr; string rs1, rs2; double d1, d2, d3, cor, var1, var2; size_t n_nb, nsamp1, nsamp2, n12, bin_size=10, bin; vector<vector<double> > mat_S, mat_Svar, mat_tmp; vector<double> vec_qvar, vec_tmp; vector<vector<vector<double> > > mat3d_Sbin; for (size_t i=0; i<S_mat->size1; i++) { vec_qvar.push_back(0.0); } for (size_t i=0; i<S_mat->size1; i++) { mat_S.push_back(vec_qvar); mat_Svar.push_back(vec_qvar); } for (size_t k=0; k<bin_size; k++) { vec_tmp.push_back(0.0); } for (size_t i=0; i<S_mat->size1; i++) { mat_tmp.push_back(vec_tmp); } for (size_t i=0; i<S_mat->size1; i++) { mat3d_Sbin.push_back(mat_tmp); } string rs, chr, a1, a0, type, pos, cm; size_t n_total=0, n_mis=0, n_obs=0; double d_pos1, d_pos2, d_pos, d_cm1, d_cm2, d_cm; ns_test=0; ns_total=0; ns_pair=0; ni_total=0; // Header. HEADER header; !safeGetline(infile, line).eof(); ReadHeader_vc (line, header); while (!safeGetline(infile, line).eof()) { // Do not read cor values this time; upto col_n-1. d_pos1=0; d_cm1=0; ch_ptr=strtok ((char *)line.c_str(), " , \t"); for (size_t i=0; i<header.coln-1; i++) { if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} if (header.pos_col!=0 && header.pos_col==i+1) { pos=ch_ptr; d_pos1=atof(ch_ptr); } if (header.cm_col!=0 && header.cm_col==i+1) { cm=ch_ptr; d_cm1=atof(ch_ptr); } if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;} if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;} if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} ch_ptr=strtok (NULL, " , \t"); } if (header.rs_col==0) { rs=chr+":"+pos; } if (header.n_col==0) { n_total=n_mis+n_obs; } rs1=rs; if ( (mapRS2cat.size()==0 || mapRS2cat.count(rs1)!=0) && mapRS2in.count(rs1)!=0 && mapRS2in.at(rs1)==2) { var1=mapRS2var.at(rs1); nsamp1=mapRS2nsamp.at(rs1); d2=var1*var1; if (mapRS2cat.size()!=0) { mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ]+= (1-1.0/(double)vec_n[ns_total])*d2; mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ]+= d2*d2/((double)vec_n[ns_total]*(double)vec_n[ns_total]); if (crt==1) { mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ][0]+= (1-1.0/(double)vec_n[ns_total])*d2; } } else { mat_S[0][0]+=(1-1.0/(double)vec_n[ns_total])*d2; mat_Svar[0][0]+= d2*d2/((double)vec_n[ns_total]*(double)vec_n[ns_total]); if (crt==1) { mat3d_Sbin[0][0][0]+=(1-1.0/(double)vec_n[ns_total])*d2; } } n_nb=0; while(ch_ptr!=NULL) { type=ch_ptr; if (type.compare("NA")!=0 && type.compare("na")!=0 && type.compare("nan")!=0 && type.compare("-nan")!=0) { cor=atof(ch_ptr); rs2=vec_rs[ns_total+n_nb+1]; d_pos2=vec_bp[ns_total+n_nb+1]; d_cm2=vec_cm[ns_total+n_nb+1]; d_pos=abs(d_pos2-d_pos1); d_cm=abs(d_cm2-d_cm1); if ( (mapRS2cat.size()==0 || mapRS2cat.count(rs2)!=0) && mapRS2in.count(rs2)!=0 && mapRS2in.at(rs2)==2) { var2=mapRS2var.at(rs2); nsamp2=mapRS2nsamp.at(rs2); d1=cor*cor-1.0/(double)min(vec_n[ns_total], vec_n[ns_total+n_nb+1]); d2=var1*var2; d3=cor*cor/((double)nsamp1*(double)nsamp2); n12=min(vec_n[ns_total], vec_n[ns_total+n_nb+1]); // Compute bin. if (crt==1) { if (window_cm!=0 && d_cm1!=0 && d_cm2!=0) { bin=min( (int)floor(d_cm/window_cm*bin_size), (int)bin_size); } else if (window_bp!=0 && d_pos1!=0 && d_pos2!=0) { bin=min( (int)floor(d_pos/window_bp*bin_size), (int)bin_size); } else if (window_ns!=0) { bin=min( (int)floor(((double)n_nb+1)/window_ns*bin_size), (int)bin_size); } } if (mapRS2cat.size()!=0) { if (mapRS2cat.at(rs1)==mapRS2cat.at(rs2)) { vec_qvar[mapRS2cat.at(rs1)]+=2*d3*d2; mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=2*d1*d2; mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+= 2*d2*d2/((double)n12*(double)n12); if (crt==1) { mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ][bin]+= 2*d1*d2; } } else { mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=d1*d2; mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+= d2*d2/((double)n12*(double)n12); if (crt==1) { mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ][bin]+= d1*d2; } } } else { vec_qvar[0]+=2*d3*d2; mat_S[0][0]+=2*d1*d2; mat_Svar[0][0]+=2*d2*d2/((double)n12*(double)n12); if (crt==1) { mat3d_Sbin[0][0][bin]+=2*d1*d2; } } ns_pair++; } } ch_ptr=strtok (NULL, " , \t"); n_nb++; } ni_total=max(ni_total, n_total); ns_test++; } ns_total++; } // Use S_bin to fit a rational function y=1/(a+bx)^2, where // x=seq(0.5,bin_size-0.5,by=1) and then compute a correlation // factor as a percentage. double a, b, x, y, n, var_y, var_x, mean_y, mean_x, cov_xy, crt_factor; if (crt==1) { for (size_t i=0; i<S_mat->size1; i++) { for (size_t j=i; j<S_mat->size2; j++) { // Correct mat_S. n=0; var_y=0; var_x=0; mean_y=0; mean_x=0; cov_xy=0; for (size_t k=0; k<bin_size; k++) { if (j==i) { y=mat3d_Sbin[i][j][k]; } else { y=mat3d_Sbin[i][j][k]+mat3d_Sbin[j][i][k]; } x=k+0.5; cout<<y<<", "; if (y>0) { y=1/sqrt(y); mean_x+=x; mean_y+=y; var_x+=x*x; var_y+=y*y; cov_xy+=x*y; n++; } } cout<<endl; if (n>=5) { mean_x/=n; mean_y/=n; var_x/=n; var_y/=n; cov_xy/=n; var_x-=mean_x*mean_x; var_y-=mean_y*mean_y; cov_xy-=mean_x*mean_y; b=cov_xy/var_x; a=mean_y-b*mean_x; crt_factor=a/(b*(bin_size+0.5))+1; if (i==j) { mat_S[i][j]*=crt_factor; } else { mat_S[i][j]*=crt_factor; mat_S[j][i]*=crt_factor; } cout<<crt_factor<<endl; // Correct qvar. if (i==j) { vec_qvar[i]*=crt_factor; } } } } } // Save to gsl_vector and gsl_matrix: qvar_vec, S_mat, Svar_mat. for (size_t i=0; i<S_mat->size1; i++) { d1=gsl_vector_get(qvar_vec, i)+2*vec_qvar[i]; gsl_vector_set(qvar_vec, i, d1); for (size_t j=0; j<S_mat->size2; j++) { if (i==j) { gsl_matrix_set(S_mat, i, j, mat_S[i][i]); gsl_matrix_set(Svar_mat, i, j, 2.0*mat_Svar[i][i]*ns_test*ns_test/(2.0*ns_pair) ); } else { gsl_matrix_set(S_mat, i, j, mat_S[i][j]+mat_S[j][i]); gsl_matrix_set(Svar_mat, i, j, 2.0*(mat_Svar[i][j]+mat_Svar[j][i])* ns_test*ns_test/(2.0*ns_pair) ); } } } infile.clear(); infile.close(); return; } // Use the new method to calculate variance components with summary // statistics first, use a function CalcS to compute S matrix (where // the diagonal elements are part of V(q) ), and then use bootstrap to // compute the variance for S, use a set of genotypes, phenotypes, and // individual ids, and snp category label. void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich) { size_t n_vc=S_mat->size1; gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *tmp_mat1=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *VarEnrich_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_vector *pve=gsl_vector_alloc (n_vc); gsl_vector *pve_plus=gsl_vector_alloc (n_vc+1); gsl_vector *tmp=gsl_vector_alloc (n_vc+1); gsl_vector *sigma2persnp=gsl_vector_alloc (n_vc); gsl_vector *enrich=gsl_vector_alloc (n_vc); gsl_vector *se_pve=gsl_vector_alloc (n_vc); gsl_vector *se_sigma2persnp=gsl_vector_alloc (n_vc); gsl_vector *se_enrich=gsl_vector_alloc (n_vc); double d; // Calculate S^{-1}q. gsl_matrix_memcpy (tmp_mat, S_mat); int sig; gsl_permutation * pmt=gsl_permutation_alloc (n_vc); LUDecomp (tmp_mat, pmt, &sig); LUInvert (tmp_mat, pmt, Si_mat); // Calculate sigma2snp and pve. gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve); gsl_vector_memcpy(sigma2persnp, pve); gsl_vector_div(sigma2persnp, s_vec); // Get qvar_mat. gsl_matrix_memcpy (qvar_mat, Vq); gsl_matrix_scale (qvar_mat, 1.0/(df*df)); // Calculate variance for these estimates. for (size_t i=0; i<n_vc; i++) { for (size_t j=i; j<n_vc; j++) { d=gsl_matrix_get(Svar_mat, i, j); d*=gsl_vector_get(pve, i)*gsl_vector_get(pve, j); d+=gsl_matrix_get(qvar_mat, i, j); gsl_matrix_set(Var_mat, i, j, d); if (i!=j) {gsl_matrix_set(Var_mat, j, i, d);} } } gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, Var_mat, 0.0, tmp_mat); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Si_mat, 0.0, Var_mat); for (size_t i=0; i<n_vc; i++) { d=sqrt(gsl_matrix_get(Var_mat, i, i)); gsl_vector_set(se_pve, i, d); d/=gsl_vector_get(s_vec, i); gsl_vector_set(se_sigma2persnp, i, d); } // Compute pve_total, se_pve_total. pve_total=0; se_pve_total=0; for (size_t i=0; i<n_vc; i++) { pve_total+=gsl_vector_get(pve, i); for (size_t j=0; j<n_vc; j++) { se_pve_total+=gsl_matrix_get(Var_mat, i, j); } } se_pve_total=sqrt(se_pve_total); // Compute enrichment and its variance. double s_pve=0, s_snp=0; for (size_t i=0; i<n_vc; i++) { s_pve+=gsl_vector_get(pve, i); s_snp+=gsl_vector_get(s_vec, i); } gsl_vector_memcpy (enrich, sigma2persnp); gsl_vector_scale (enrich, s_snp/s_pve); gsl_matrix_set_identity(tmp_mat); double d1; for (size_t i=0; i<n_vc; i++) { d=gsl_vector_get(pve, i)/s_pve; d1=gsl_vector_get(s_vec, i); for (size_t j=0; j<n_vc; j++) { if (i==j) { gsl_matrix_set(tmp_mat, i, j, (1-d)/d1*s_snp/s_pve); } else { gsl_matrix_set(tmp_mat, i, j, -1*d/d1*s_snp/s_pve); } } } gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Var_mat, 0.0, tmp_mat1); gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, tmp_mat1, tmp_mat, 0.0, VarEnrich_mat); for (size_t i=0; i<n_vc; i++) { d=sqrt(gsl_matrix_get(VarEnrich_mat, i, i)); gsl_vector_set(se_enrich, i, d); } cout<<"pve = "; for (size_t i=0; i<n_vc; i++) { cout<<gsl_vector_get(pve, i)<<" "; } cout<<endl; cout<<"se(pve) = "; for (size_t i=0; i<n_vc; i++) { cout<<gsl_vector_get(se_pve, i)<<" "; } cout<<endl; cout<<"sigma2 per snp = "; for (size_t i=0; i<n_vc; i++) { cout<<gsl_vector_get(sigma2persnp, i)<<" "; } cout<<endl; cout<<"se(sigma2 per snp) = "; for (size_t i=0; i<n_vc; i++) { cout<<gsl_vector_get(se_sigma2persnp, i)<<" "; } cout<<endl; cout<<"enrichment = "; for (size_t i=0; i<n_vc; i++) { cout<<gsl_vector_get(enrich, i)<<" "; } cout<<endl; cout<<"se(enrichment) = "; for (size_t i=0; i<n_vc; i++) { cout<<gsl_vector_get(se_enrich, i)<<" "; } cout<<endl; // Save data. v_pve.clear(); v_se_pve.clear(); v_sigma2.clear(); v_se_sigma2.clear(); v_enrich.clear(); v_se_enrich.clear(); for (size_t i=0; i<n_vc; i++) { d=gsl_vector_get(pve, i); v_pve.push_back(d); d=gsl_vector_get(se_pve, i); v_se_pve.push_back(d); d=gsl_vector_get(sigma2persnp, i); v_sigma2.push_back(d); d=gsl_vector_get(se_sigma2persnp, i); v_se_sigma2.push_back(d); d=gsl_vector_get(enrich, i); v_enrich.push_back(d); d=gsl_vector_get(se_enrich, i); v_se_enrich.push_back(d); } // Delete matrices. gsl_matrix_free(Si_mat); gsl_matrix_free(Var_mat); gsl_matrix_free(VarEnrich_mat); gsl_matrix_free(tmp_mat); gsl_matrix_free(tmp_mat1); gsl_matrix_free(qvar_mat); gsl_vector_free(pve); gsl_vector_free(pve_plus); gsl_vector_free(tmp); gsl_vector_free(sigma2persnp); gsl_vector_free(enrich); gsl_vector_free(se_pve); gsl_vector_free(se_sigma2persnp); gsl_vector_free(se_enrich); return; } // Ks are not scaled. void VC::CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) { size_t n1=K->size1, n2=K->size2; size_t n_vc=n2/n1; double r=(double)n1/(double)(n1 - W->size2); double var_y, var_y_new; double d, tr, s, v; vector<double> traceG_new; // New matrices/vectors. gsl_matrix *K_scale=gsl_matrix_alloc (n1, n2); gsl_vector *y_scale=gsl_vector_alloc (n1); gsl_matrix *Kry=gsl_matrix_alloc (n1, n_vc); gsl_matrix *yKrKKry=gsl_matrix_alloc (n_vc, n_vc*(n_vc+1) ); gsl_vector *KKry=gsl_vector_alloc (n1); // Old matrices/vectors. gsl_vector *pve=gsl_vector_alloc (n_vc); gsl_vector *se_pve=gsl_vector_alloc (n_vc); gsl_vector *q_vec=gsl_vector_alloc (n_vc); gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *S_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); // Center and scale K by W. for (size_t i=0; i<n_vc; i++) { gsl_matrix_view Kscale_sub = gsl_matrix_submatrix (K_scale, 0, n1*i, n1, n1); gsl_matrix_const_view K_sub = gsl_matrix_const_submatrix (K, 0, n1*i, n1, n1); gsl_matrix_memcpy (&Kscale_sub.matrix, &K_sub.matrix); CenterMatrix (&Kscale_sub.matrix, W); d=ScaleMatrix (&Kscale_sub.matrix); traceG_new.push_back(d); } // Center y by W, and standardize it to have variance 1 (t(y)%*%y/n=1). gsl_vector_memcpy (y_scale, y); CenterVector (y_scale, W); var_y=VectorVar (y); var_y_new=VectorVar (y_scale); StandardizeVector (y_scale); // Compute Kry, which is used for confidence interval; also compute // q_vec (*n^2). for (size_t i=0; i<n_vc; i++) { gsl_matrix_const_view Kscale_sub = gsl_matrix_const_submatrix (K_scale, 0, n1*i, n1, n1); gsl_vector_view Kry_col=gsl_matrix_column (Kry, i); gsl_vector_memcpy (&Kry_col.vector, y_scale); gsl_blas_dgemv(CblasNoTrans, 1.0, &Kscale_sub.matrix, y_scale, -1.0*r, &Kry_col.vector); gsl_blas_ddot (&Kry_col.vector, y_scale, &d); gsl_vector_set(q_vec, i, d); } // Compute yKrKKry, which is used later for confidence interval. for (size_t i=0; i<n_vc; i++) { gsl_vector_const_view Kry_coli=gsl_matrix_const_column (Kry, i); for (size_t j=i; j<n_vc; j++) { gsl_vector_const_view Kry_colj=gsl_matrix_const_column (Kry, j); for (size_t l=0; l<n_vc; l++) { gsl_matrix_const_view Kscale_sub = gsl_matrix_const_submatrix (K_scale, 0, n1*l, n1, n1); gsl_blas_dgemv (CblasNoTrans, 1.0, &Kscale_sub.matrix, &Kry_coli.vector, 0.0, KKry); gsl_blas_ddot (&Kry_colj.vector, KKry, &d); gsl_matrix_set(yKrKKry, i, l*n_vc+j, d); if (i!=j) {gsl_matrix_set(yKrKKry, j, l*n_vc+i, d);} } gsl_blas_ddot (&Kry_coli.vector, &Kry_colj.vector, &d); gsl_matrix_set(yKrKKry, i, n_vc*n_vc+j, d); if (i!=j) {gsl_matrix_set(yKrKKry, j, n_vc*n_vc+i, d);} } } // Compute Sij (*n^2). for (size_t i=0; i<n_vc; i++) { for (size_t j=i; j<n_vc; j++) { tr=0; for (size_t l=0; l<n1; l++) { gsl_vector_const_view Ki_col=gsl_matrix_const_column (K_scale, i*n1+l); gsl_vector_const_view Kj_col=gsl_matrix_const_column (K_scale, j*n1+l); gsl_blas_ddot (&Ki_col.vector, &Kj_col.vector, &d); tr+=d; } tr=tr-r*(double)n1; gsl_matrix_set (S_mat, i, j, tr); if (i!=j) {gsl_matrix_set (S_mat, j, i, tr);} } } // Compute S^{-1}q. int sig; gsl_permutation * pmt=gsl_permutation_alloc (n_vc); LUDecomp (S_mat, pmt, &sig); LUInvert (S_mat, pmt, Si_mat); // Compute pve (on the transformed scale). gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve); // Compute q_var (*n^4). gsl_matrix_set_zero (qvar_mat); s=1; for (size_t i=0; i<n_vc; i++) { d=gsl_vector_get(pve, i); gsl_matrix_view yKrKKry_sub= gsl_matrix_submatrix(yKrKKry, 0, i*n_vc, n_vc, n_vc); gsl_matrix_memcpy (tmp_mat, &yKrKKry_sub.matrix); gsl_matrix_scale(tmp_mat, d); gsl_matrix_add (qvar_mat, tmp_mat); s-=d; } gsl_matrix_view yKrKKry_sub=gsl_matrix_submatrix(yKrKKry, 0, n_vc*n_vc, n_vc, n_vc); gsl_matrix_memcpy (tmp_mat, &yKrKKry_sub.matrix); gsl_matrix_scale(tmp_mat, s); gsl_matrix_add (qvar_mat, tmp_mat); gsl_matrix_scale(qvar_mat, 2.0); // Compute S^{-1}var_qS^{-1}. gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, qvar_mat, 0.0, tmp_mat); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Si_mat, 0.0, Var_mat); // Transform pve back to the original scale and save data. v_pve.clear(); v_se_pve.clear(); v_sigma2.clear(); v_se_sigma2.clear(); s=1.0, v=0, pve_total=0, se_pve_total=0; for (size_t i=0; i<n_vc; i++) { d=gsl_vector_get (pve, i); v_sigma2.push_back(d*var_y_new/traceG_new[i]); v_pve.push_back(d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y)); s-=d; pve_total+=d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y); d=sqrt(gsl_matrix_get (Var_mat, i, i)); v_se_sigma2.push_back(d*var_y_new/traceG_new[i]); v_se_pve.push_back(d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y)); for (size_t j=0; j<n_vc; j++) { v+=gsl_matrix_get(Var_mat, i, j); se_pve_total+=gsl_matrix_get(Var_mat, i, j)* (var_y_new/traceG_new[i])*(v_traceG[i]/var_y)* (var_y_new/traceG_new[j])*(v_traceG[j]/var_y); } } v_sigma2.push_back(s*r*var_y_new); v_se_sigma2.push_back(sqrt(v)*r*var_y_new); se_pve_total=sqrt(se_pve_total); cout<<"sigma2 = "; for (size_t i=0; i<n_vc+1; i++) { cout<<v_sigma2[i]<<" "; } cout<<endl; cout<<"se(sigma2) = "; for (size_t i=0; i<n_vc+1; i++) { cout<<v_se_sigma2[i]<<" "; } cout<<endl; cout<<"pve = "; for (size_t i=0; i<n_vc; i++) { cout<<v_pve[i]<<" "; } cout<<endl; cout<<"se(pve) = "; for (size_t i=0; i<n_vc; i++) { cout<<v_se_pve[i]<<" "; } cout<<endl; if (n_vc>1) { cout<<"total pve = "<<pve_total<<endl; cout<<"se(total pve) = "<<se_pve_total<<endl; } gsl_permutation_free(pmt); gsl_matrix_free(K_scale); gsl_vector_free(y_scale); gsl_matrix_free(Kry); gsl_matrix_free(yKrKKry); gsl_vector_free(KKry); // Old matrices/vectors. gsl_vector_free(pve); gsl_vector_free(se_pve); gsl_vector_free(q_vec); gsl_matrix_free(qvar_mat); gsl_matrix_free(tmp_mat); gsl_matrix_free(S_mat); gsl_matrix_free(Si_mat); gsl_matrix_free(Var_mat); return; } // REML for log(sigma2) based on the AI algorithm. void VC::CalcVCreml (bool noconstrain, const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) { size_t n1=K->size1, n2=K->size2; size_t n_vc=n2/n1; gsl_vector *log_sigma2=gsl_vector_alloc (n_vc+1); double d, s; // Set up params. gsl_matrix *P=gsl_matrix_alloc (n1, n1); gsl_vector *Py=gsl_vector_alloc (n1); gsl_matrix *KPy_mat=gsl_matrix_alloc (n1, n_vc+1); gsl_matrix *PKPy_mat=gsl_matrix_alloc (n1, n_vc+1); gsl_vector *dev1=gsl_vector_alloc (n_vc+1); gsl_matrix *dev2=gsl_matrix_alloc (n_vc+1, n_vc+1); gsl_matrix *Hessian=gsl_matrix_alloc (n_vc+1, n_vc+1); VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian, noconstrain}; // Initialize sigma2/log_sigma2. CalcVChe (K, W, y); gsl_blas_ddot (y, y, &s); s/=(double)n1; for (size_t i=0; i<n_vc+1; i++) { if (noconstrain) { d=v_sigma2[i]; } else { if (v_sigma2[i]<=0) {d=log(0.1);} else {d=log(v_sigma2[i]);} } gsl_vector_set (log_sigma2, i, d); } cout<<"iteration "<<0<<endl; cout<<"sigma2 = "; for (size_t i=0; i<n_vc+1; i++) { if (noconstrain) { cout<<gsl_vector_get(log_sigma2, i)<<" "; } else { cout<<exp(gsl_vector_get(log_sigma2, i))<<" "; } } cout<<endl; // Set up fdf. gsl_multiroot_function_fdf FDF; FDF.n=n_vc+1; FDF.params=¶ms; FDF.f=&LogRL_dev1; FDF.df=&LogRL_dev2; FDF.fdf=&LogRL_dev12; // Set up solver. int status; int iter=0, max_iter=100; const gsl_multiroot_fdfsolver_type *T_fdf; gsl_multiroot_fdfsolver *s_fdf; T_fdf=gsl_multiroot_fdfsolver_hybridsj; s_fdf=gsl_multiroot_fdfsolver_alloc (T_fdf, n_vc+1); gsl_multiroot_fdfsolver_set (s_fdf, &FDF, log_sigma2); do { iter++; status=gsl_multiroot_fdfsolver_iterate (s_fdf); if (status) break; cout<<"iteration "<<iter<<endl; cout<<"sigma2 = "; for (size_t i=0; i<n_vc+1; i++) { if (noconstrain) { cout<<gsl_vector_get(s_fdf->x, i)<<" "; } else { cout<<exp(gsl_vector_get(s_fdf->x, i))<<" "; } } cout<<endl; status=gsl_multiroot_test_residual (s_fdf->f, 1e-3); } while (status==GSL_CONTINUE && iter<max_iter); // Obtain Hessian and Hessian inverse. int sig=LogRL_dev12 (s_fdf->x, ¶ms, dev1, dev2); gsl_permutation * pmt=gsl_permutation_alloc (n_vc+1); LUDecomp (dev2, pmt, &sig); LUInvert (dev2, pmt, Hessian); gsl_permutation_free(pmt); // Save sigma2 and se_sigma2. v_sigma2.clear(); v_se_sigma2.clear(); for (size_t i=0; i<n_vc+1; i++) { if (noconstrain) { d=gsl_vector_get(s_fdf->x, i); } else { d=exp(gsl_vector_get(s_fdf->x, i)); } v_sigma2.push_back(d); if (noconstrain) { d=-1.0*gsl_matrix_get(Hessian, i, i); } else { d=-1.0*d*d*gsl_matrix_get(Hessian, i, i); } v_se_sigma2.push_back(sqrt(d)); } s=0; for (size_t i=0; i<n_vc; i++) { s+=v_traceG[i]*v_sigma2[i]; } s+=v_sigma2[n_vc]; // Compute pve. v_pve.clear(); pve_total=0; for (size_t i=0; i<n_vc; i++) { d=v_traceG[i]*v_sigma2[i]/s; v_pve.push_back(d); pve_total+=d; } // Compute se_pve; k=n_vc+1: total. double d1, d2; v_se_pve.clear(); se_pve_total=0; for (size_t k=0; k<n_vc+1; k++) { d=0; for (size_t i=0; i<n_vc+1; i++) { if (noconstrain) { d1=gsl_vector_get(s_fdf->x, i); d1=1; } else { d1=exp(gsl_vector_get(s_fdf->x, i)); } if (k<n_vc) { if (i==k) { d1*=v_traceG[k]*(s-v_sigma2[k]*v_traceG[k])/(s*s); } else if (i==n_vc) { d1*=-1*v_traceG[k]*v_sigma2[k]/(s*s); } else { d1*=-1*v_traceG[i]*v_traceG[k]*v_sigma2[k]/(s*s); } } else { if (i==k) { d1*=-1*(s-v_sigma2[n_vc])/(s*s); } else { d1*=v_traceG[i]*v_sigma2[n_vc]/(s*s); } } for (size_t j=0; j<n_vc+1; j++) { if (noconstrain) { d2=gsl_vector_get(s_fdf->x, j); d2=1; } else { d2=exp(gsl_vector_get(s_fdf->x, j)); } if (k<n_vc) { if (j==k) { d2*=v_traceG[k]*(s-v_sigma2[k]*v_traceG[k])/(s*s); } else if (j==n_vc) { d2*=-1*v_traceG[k]*v_sigma2[k]/(s*s); } else { d2*=-1*v_traceG[j]*v_traceG[k]*v_sigma2[k]/(s*s); } } else { if (j==k) { d2*=-1*(s-v_sigma2[n_vc])/(s*s); } else { d2*=v_traceG[j]*v_sigma2[n_vc]/(s*s); } } d+=-1.0*d1*d2*gsl_matrix_get(Hessian, i, j); } } if (k<n_vc) { v_se_pve.push_back(sqrt(d) ); } else { se_pve_total=sqrt(d); } } gsl_multiroot_fdfsolver_free(s_fdf); gsl_vector_free(log_sigma2); gsl_matrix_free(P); gsl_vector_free(Py); gsl_matrix_free(KPy_mat); gsl_matrix_free(PKPy_mat); gsl_vector_free(dev1); gsl_matrix_free(dev2); gsl_matrix_free(Hessian); return; } // Ks are not scaled. void VC::CalcVCacl (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) { size_t n1=K->size1, n2=K->size2; size_t n_vc=n2/n1; double d, y2_sum, tau_inv, se_tau_inv; // New matrices/vectors. gsl_matrix *K_scale=gsl_matrix_alloc (n1, n2); gsl_vector *y_scale=gsl_vector_alloc (n1); gsl_vector *y2=gsl_vector_alloc (n1); gsl_vector *n1_vec=gsl_vector_alloc (n1); gsl_matrix *Ay=gsl_matrix_alloc (n1, n_vc); gsl_matrix *K2=gsl_matrix_alloc (n1, n_vc*n_vc); gsl_matrix *K_tmp=gsl_matrix_alloc (n1, n1); gsl_matrix *V_mat=gsl_matrix_alloc (n1, n1); // Old matrices/vectors. gsl_vector *pve=gsl_vector_alloc (n_vc); gsl_vector *se_pve=gsl_vector_alloc (n_vc); gsl_vector *q_vec=gsl_vector_alloc (n_vc); gsl_matrix *S1=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *S2=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *S_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *J_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); int sig; gsl_permutation * pmt=gsl_permutation_alloc (n_vc); // Center and scale K by W, and standardize K further so that all // diagonal elements are 1 for (size_t i=0; i<n_vc; i++) { gsl_matrix_view Kscale_sub = gsl_matrix_submatrix (K_scale, 0, n1*i, n1, n1); gsl_matrix_const_view K_sub = gsl_matrix_const_submatrix (K, 0, n1*i, n1, n1); gsl_matrix_memcpy (&Kscale_sub.matrix, &K_sub.matrix); CenterMatrix (&Kscale_sub.matrix, W); StandardizeMatrix (&Kscale_sub.matrix); } // Center y by W, and standardize it to have variance 1 (t(y)%*%y/n=1) gsl_vector_memcpy (y_scale, y); CenterVector (y_scale, W); // Compute y^2 and sum(y^2), which is also the variance of y*n1. gsl_vector_memcpy (y2, y_scale); gsl_vector_mul (y2, y_scale); y2_sum=0; for (size_t i=0; i<y2->size; i++) { y2_sum+=gsl_vector_get(y2, i); } // Compute the n_vc size q vector. for (size_t i=0; i<n_vc; i++) { gsl_matrix_const_view Kscale_sub = gsl_matrix_const_submatrix (K_scale, 0, n1*i, n1, n1); gsl_blas_dgemv(CblasNoTrans, 1.0, &Kscale_sub.matrix, y_scale, 0.0, n1_vec); gsl_blas_ddot (n1_vec, y_scale, &d); gsl_vector_set(q_vec, i, d-y2_sum); } // Compute the n_vc by n_vc S1 and S2 matrix (and eventually // S=S1-\tau^{-1}S2). for (size_t i=0; i<n_vc; i++) { gsl_matrix_const_view Kscale_sub1 = gsl_matrix_const_submatrix (K_scale, 0, n1*i, n1, n1); for (size_t j=i; j<n_vc; j++) { gsl_matrix_const_view Kscale_sub2 = gsl_matrix_const_submatrix (K_scale, 0, n1*j, n1, n1); gsl_matrix_memcpy (K_tmp, &Kscale_sub1.matrix); gsl_matrix_mul_elements (K_tmp, &Kscale_sub2.matrix); gsl_vector_set_zero(n1_vec); for (size_t t=0; t<K_tmp->size1; t++) { gsl_vector_view Ktmp_col=gsl_matrix_column (K_tmp, t); gsl_vector_add (n1_vec, &Ktmp_col.vector); } gsl_vector_add_constant (n1_vec, -1.0); // Compute S1. gsl_blas_ddot (n1_vec, y2, &d); gsl_matrix_set (S1, i, j, 2*d); if (i!=j) {gsl_matrix_set (S1, j, i, 2*d);} // Compute S2. d=0; for (size_t t=0; t<n1_vec->size; t++) { d+=gsl_vector_get (n1_vec, t); } gsl_matrix_set (S2, i, j, d); if (i!=j) {gsl_matrix_set (S2, j, i, d);} // Save information to compute J. gsl_vector_view K2col1=gsl_matrix_column (K2, n_vc*i+j); gsl_vector_view K2col2=gsl_matrix_column (K2, n_vc*j+i); gsl_vector_memcpy(&K2col1.vector, n1_vec); if (i!=j) {gsl_vector_memcpy(&K2col2.vector, n1_vec);} } } // Iterate to solve tau and h's. size_t it=0; double s=1; while (abs(s)>1e-3 && it<100) { // Update tau_inv. gsl_blas_ddot (q_vec, pve, &d); if (it>0) {s=y2_sum/(double)n1-d/((double)n1*((double)n1-1))-tau_inv;} tau_inv=y2_sum/(double)n1-d/((double)n1*((double)n1-1)); if (it>0) {s/=tau_inv;} // Update S. gsl_matrix_memcpy (S_mat, S2); gsl_matrix_scale (S_mat, -1*tau_inv); gsl_matrix_add (S_mat, S1); // Update h=S^{-1}q. int sig; gsl_permutation * pmt=gsl_permutation_alloc (n_vc); LUDecomp (S_mat, pmt, &sig); LUInvert (S_mat, pmt, Si_mat); gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve); it++; } // Compute V matrix and A matrix (K_scale is destroyed, so need to // compute V first). gsl_matrix_set_zero (V_mat); for (size_t i=0; i<n_vc; i++) { gsl_matrix_view Kscale_sub = gsl_matrix_submatrix (K_scale, 0, n1*i, n1, n1); // Compute V. gsl_matrix_memcpy (K_tmp, &Kscale_sub.matrix); gsl_matrix_scale (K_tmp, gsl_vector_get(pve, i)); gsl_matrix_add (V_mat, K_tmp); // Compute A; the corresponding Kscale is destroyed. gsl_matrix_const_view K2_sub = gsl_matrix_const_submatrix (K2, 0, n_vc*i, n1, n_vc); gsl_blas_dgemv (CblasNoTrans, 1.0, &K2_sub.matrix, pve, 0.0, n1_vec); for (size_t t=0; t<n1; t++) { gsl_matrix_set (K_scale, t, n1*i+t, gsl_vector_get(n1_vec, t) ); } // Compute Ay. gsl_vector_view Ay_col=gsl_matrix_column (Ay, i); gsl_blas_dgemv(CblasNoTrans, 1.0, &Kscale_sub.matrix, y_scale, 0.0, &Ay_col.vector); } gsl_matrix_scale (V_mat, tau_inv); // Compute J matrix. for (size_t i=0; i<n_vc; i++) { gsl_vector_view Ay_col1=gsl_matrix_column (Ay, i); gsl_blas_dgemv(CblasNoTrans, 1.0, V_mat, &Ay_col1.vector, 0.0, n1_vec); for (size_t j=i; j<n_vc; j++) { gsl_vector_view Ay_col2=gsl_matrix_column (Ay, j); gsl_blas_ddot (&Ay_col2.vector, n1_vec, &d); gsl_matrix_set (J_mat, i, j, 2.0*d); if (i!=j) {gsl_matrix_set (J_mat, j, i, 2.0*d);} } } // Compute H^{-1}JH^{-1} as V(\hat h), where H=S2*tau_inv; this is // stored in Var_mat. gsl_matrix_memcpy (S_mat, S2); gsl_matrix_scale (S_mat, tau_inv); LUDecomp (S_mat, pmt, &sig); LUInvert (S_mat, pmt, Si_mat); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, J_mat, 0.0, S_mat); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, S_mat, Si_mat, 0.0, Var_mat); // Compute variance for tau_inv. gsl_blas_dgemv(CblasNoTrans, 1.0, V_mat, y_scale, 0.0, n1_vec); gsl_blas_ddot (y_scale, n1_vec, &d); se_tau_inv=sqrt(2*d)/(double)n1; // Transform pve back to the original scale and save data. v_pve.clear(); v_se_pve.clear(); v_sigma2.clear(); v_se_sigma2.clear(); pve_total=0, se_pve_total=0; for (size_t i=0; i<n_vc; i++) { d=gsl_vector_get (pve, i); pve_total+=d; v_pve.push_back(d); v_sigma2.push_back(d*tau_inv/v_traceG[i] ); d=sqrt(gsl_matrix_get (Var_mat, i, i)); v_se_pve.push_back(d); v_se_sigma2.push_back(d*tau_inv/v_traceG[i]); for (size_t j=0; j<n_vc; j++) { se_pve_total+=gsl_matrix_get(Var_mat, i, j); } } v_sigma2.push_back( (1-pve_total)*tau_inv ); v_se_sigma2.push_back(sqrt(se_pve_total)*tau_inv ); se_pve_total=sqrt(se_pve_total); cout<<"sigma2 = "; for (size_t i=0; i<n_vc+1; i++) { cout<<v_sigma2[i]<<" "; } cout<<endl; cout<<"se(sigma2) = "; for (size_t i=0; i<n_vc+1; i++) { cout<<v_se_sigma2[i]<<" "; } cout<<endl; cout<<"pve = "; for (size_t i=0; i<n_vc; i++) { cout<<v_pve[i]<<" "; } cout<<endl; cout<<"se(pve) = "; for (size_t i=0; i<n_vc; i++) { cout<<v_se_pve[i]<<" "; } cout<<endl; if (n_vc>1) { cout<<"total pve = "<<pve_total<<endl; cout<<"se(total pve) = "<<se_pve_total<<endl; } gsl_permutation_free(pmt); gsl_matrix_free(K_scale); gsl_vector_free(y_scale); gsl_vector_free(y2); gsl_vector_free(n1_vec); gsl_matrix_free(Ay); gsl_matrix_free(K2); gsl_matrix_free(K_tmp); gsl_matrix_free(V_mat); gsl_vector_free(pve); gsl_vector_free(se_pve); gsl_vector_free(q_vec); gsl_matrix_free(S1); gsl_matrix_free(S2); gsl_matrix_free(S_mat); gsl_matrix_free(Si_mat); gsl_matrix_free(J_mat); gsl_matrix_free(Var_mat); return; } // Read bimbam mean genotype file and compute XWz. bool BimbamXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz) { igzstream infile (file_geno.c_str(), igzstream::in); if (!infile) { cout<<"error reading genotype file:"<<file_geno<<endl; return false; } string line; char *ch_ptr; size_t n_miss; double d, geno_mean, geno_var; size_t ni_test=XWz->size1; gsl_vector *geno=gsl_vector_alloc (ni_test); gsl_vector *geno_miss=gsl_vector_alloc (ni_test); gsl_vector *wz=gsl_vector_alloc (w->size); gsl_vector_memcpy (wz, z); gsl_vector_mul(wz, w); for (size_t t=0; t<indicator_snp.size(); ++t) { !safeGetline(infile, line).eof(); if (t%display_pace==0 || t==(indicator_snp.size()-1)) { ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1); } if (indicator_snp[t]==0) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); geno_mean=0.0; n_miss=0; geno_var=0.0; gsl_vector_set_all(geno_miss, 0); size_t j=0; for (size_t i=0; i<indicator_idv.size(); ++i) { if (indicator_idv[i]==0) {continue;} ch_ptr=strtok (NULL, " , \t"); if (strcmp(ch_ptr, "NA")==0) { gsl_vector_set(geno_miss, i, 0); n_miss++; } else { d=atof(ch_ptr); gsl_vector_set (geno, j, d); gsl_vector_set (geno_miss, j, 1); geno_mean+=d; geno_var+=d*d; } j++; } geno_mean/=(double)(ni_test-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_test; geno_var-=geno_mean*geno_mean; for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (geno_miss, i)==0) { gsl_vector_set(geno, i, geno_mean); } } gsl_vector_add_constant (geno, -1.0*geno_mean); gsl_vector_view XWz_col= gsl_matrix_column(XWz, vec_cat[ns_test]); d=gsl_vector_get (wz, ns_test); gsl_blas_daxpy (d/sqrt(geno_var), geno, &XWz_col.vector); ns_test++; } cout<<endl; gsl_vector_free (geno); gsl_vector_free (geno_miss); gsl_vector_free (wz); infile.close(); infile.clear(); return true; } // Read PLINK bed file and compute XWz. bool PlinkXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz) { ifstream infile (file_bed.c_str(), ios::binary); if (!infile) { cout<<"error reading bed file:"<<file_bed<<endl; return false; } char ch[1]; bitset<8> b; size_t n_miss, ci_total, ci_test; double d, geno_mean, geno_var; size_t ni_test=XWz->size1; size_t ni_total=indicator_idv.size(); gsl_vector *geno=gsl_vector_alloc (ni_test); gsl_vector *wz=gsl_vector_alloc (w->size); gsl_vector_memcpy (wz, z); gsl_vector_mul(wz, w); int n_bit; // Calculate n_bit and c, the number of bit for each snp. if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } // Print the first three magic numbers. for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } for (size_t t=0; t<indicator_snp.size(); ++t) { if (t%display_pace==0 || t==(indicator_snp.size()-1)) { ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1); } if (indicator_snp[t]==0) {continue;} // n_bit, and 3 is the number of magic numbers. infile.seekg(t*n_bit+3); // Read genotypes. geno_mean=0.0; n_miss=0; ci_total=0; geno_var=0.0; ci_test=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; // Minor allele homozygous: 2.0; major: 0.0. for (size_t j=0; j<4; ++j) { if ((i==(n_bit-1)) && ci_total==ni_total) { break; } if (indicator_idv[ci_total]==0) { ci_total++; continue; } if (b[2*j]==0) { if (b[2*j+1]==0) { gsl_vector_set(geno, ci_test, 2.0); geno_mean+=2.0; geno_var+=4.0; } else { gsl_vector_set(geno, ci_test, 1.0); geno_mean+=1.0; geno_var+=1.0; } } else { if (b[2*j+1]==1) { gsl_vector_set(geno, ci_test, 0.0); } else { gsl_vector_set(geno, ci_test, -9.0); n_miss++; } } ci_test++; ci_total++; } } geno_mean/=(double)(ni_test-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_test; geno_var-=geno_mean*geno_mean; for (size_t i=0; i<ni_test; ++i) { d=gsl_vector_get(geno,i); if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);} } gsl_vector_add_constant (geno, -1.0*geno_mean); gsl_vector_view XWz_col= gsl_matrix_column(XWz, vec_cat[ns_test]); d=gsl_vector_get (wz, ns_test); gsl_blas_daxpy (d/sqrt(geno_var), geno, &XWz_col.vector); ns_test++; } cout<<endl; gsl_vector_free (geno); gsl_vector_free (wz); infile.close(); infile.clear(); return true; } // Read multiple genotype files and compute XWz. bool MFILEXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, gsl_matrix *XWz) { gsl_matrix_set_zero(XWz); igzstream infile (file_mfile.c_str(), igzstream::in); if (!infile) { cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false; } string file_name; size_t l=0, ns_test=0; while (!safeGetline(infile, file_name).eof()) { if (mfile_mode==1) { file_name+=".bed"; PlinkXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], vec_cat, w, z, ns_test, XWz); } else { BimbamXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], vec_cat, w, z, ns_test, XWz); } l++; } infile.close(); infile.clear(); return true; } // Read bimbam mean genotype file and compute X_i^TX_jWz. bool BimbamXtXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz) { igzstream infile (file_geno.c_str(), igzstream::in); if (!infile) { cout<<"error reading genotype file:"<<file_geno<<endl; return false; } string line; char *ch_ptr; size_t n_miss; double d, geno_mean, geno_var; size_t ni_test=XWz->size1; gsl_vector *geno=gsl_vector_alloc (ni_test); gsl_vector *geno_miss=gsl_vector_alloc (ni_test); for (size_t t=0; t<indicator_snp.size(); ++t) { !safeGetline(infile, line).eof(); if (t%display_pace==0 || t==(indicator_snp.size()-1)) { ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1); } if (indicator_snp[t]==0) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); geno_mean=0.0; n_miss=0; geno_var=0.0; gsl_vector_set_all(geno_miss, 0); size_t j=0; for (size_t i=0; i<indicator_idv.size(); ++i) { if (indicator_idv[i]==0) {continue;} ch_ptr=strtok (NULL, " , \t"); if (strcmp(ch_ptr, "NA")==0) { gsl_vector_set(geno_miss, i, 0); n_miss++; } else { d=atof(ch_ptr); gsl_vector_set (geno, j, d); gsl_vector_set (geno_miss, j, 1); geno_mean+=d; geno_var+=d*d; } j++; } geno_mean/=(double)(ni_test-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_test; geno_var-=geno_mean*geno_mean; for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (geno_miss, i)==0) { gsl_vector_set(geno, i, geno_mean); } } gsl_vector_add_constant (geno, -1.0*geno_mean); for (size_t i=0; i<XWz->size2; i++) { gsl_vector_const_view XWz_col= gsl_matrix_const_column(XWz, i); gsl_blas_ddot (geno, &XWz_col.vector, &d); gsl_matrix_set (XtXWz, ns_test, i, d/sqrt(geno_var)); } ns_test++; } cout<<endl; gsl_vector_free (geno); gsl_vector_free (geno_miss); infile.close(); infile.clear(); return true; } // Read PLINK bed file and compute XWz. bool PlinkXtXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz) { ifstream infile (file_bed.c_str(), ios::binary); if (!infile) { cout<<"error reading bed file:"<<file_bed<<endl; return false; } char ch[1]; bitset<8> b; size_t n_miss, ci_total, ci_test; double d, geno_mean, geno_var; size_t ni_test=XWz->size1; size_t ni_total=indicator_idv.size(); gsl_vector *geno=gsl_vector_alloc (ni_test); int n_bit; // Calculate n_bit and c, the number of bit for each snp. if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } // Print the first three magic numbers. for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } for (size_t t=0; t<indicator_snp.size(); ++t) { if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} if (indicator_snp[t]==0) {continue;} // n_bit, and 3 is the number of magic numbers. infile.seekg(t*n_bit+3); // Read genotypes. geno_mean=0.0; n_miss=0; ci_total=0; geno_var=0.0; ci_test=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; // Minor allele homozygous: 2.0; major: 0.0; for (size_t j=0; j<4; ++j) { if ((i==(n_bit-1)) && ci_total==ni_total) { break; } if (indicator_idv[ci_total]==0) { ci_total++; continue; } if (b[2*j]==0) { if (b[2*j+1]==0) { gsl_vector_set(geno, ci_test, 2.0); geno_mean+=2.0; geno_var+=4.0; } else { gsl_vector_set(geno, ci_test, 1.0); geno_mean+=1.0; geno_var+=1.0; } } else { if (b[2*j+1]==1) { gsl_vector_set(geno, ci_test, 0.0); } else { gsl_vector_set(geno, ci_test, -9.0); n_miss++; } } ci_test++; ci_total++; } } geno_mean/=(double)(ni_test-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_test; geno_var-=geno_mean*geno_mean; for (size_t i=0; i<ni_test; ++i) { d=gsl_vector_get(geno,i); if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);} } gsl_vector_add_constant (geno, -1.0*geno_mean); for (size_t i=0; i<XWz->size2; i++) { gsl_vector_const_view XWz_col= gsl_matrix_const_column(XWz, i); gsl_blas_ddot (geno, &XWz_col.vector, &d); gsl_matrix_set (XtXWz, ns_test, i, d/sqrt(geno_var)); } ns_test++; } cout<<endl; gsl_vector_free (geno); infile.close(); infile.clear(); return true; } // Read multiple genotype files and compute XWz. bool MFILEXtXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const gsl_matrix *XWz, gsl_matrix *XtXWz) { gsl_matrix_set_zero(XtXWz); igzstream infile (file_mfile.c_str(), igzstream::in); if (!infile) { cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false; } string file_name; size_t l=0, ns_test=0; while (!safeGetline(infile, file_name).eof()) { if (mfile_mode==1) { file_name+=".bed"; PlinkXtXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], XWz, ns_test, XtXWz); } else { BimbamXtXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], XWz, ns_test, XtXWz); } l++; } infile.close(); infile.clear(); return true; } // Compute confidence intervals from summary statistics. void CalcCIss(const gsl_matrix *Xz, const gsl_matrix *XWz, const gsl_matrix *XtXWz, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *w, const gsl_vector *z, const gsl_vector *s_vec, const vector<size_t> &vec_cat, const vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich) { size_t n_vc=XWz->size2, ns_test=w->size, ni_test=XWz->size1; // Set up matrices. gsl_vector *w_pve=gsl_vector_alloc (ns_test); gsl_vector *wz=gsl_vector_alloc (ns_test); gsl_vector *zwz=gsl_vector_alloc (n_vc); gsl_vector *zz=gsl_vector_alloc (n_vc); gsl_vector *Xz_pve=gsl_vector_alloc (ni_test); gsl_vector *WXtXWz=gsl_vector_alloc (ns_test); gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *tmp_mat1=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *VarEnrich_mat=gsl_matrix_alloc (n_vc, n_vc); gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); double d, s0, s1, s, s_pve, s_snp; // Compute wz and zwz. gsl_vector_memcpy (wz, z); gsl_vector_mul (wz, w); gsl_vector_set_zero (zwz); gsl_vector_set_zero (zz); for (size_t i=0; i<w->size; i++) { d=gsl_vector_get (wz, i)*gsl_vector_get (z, i); d+=gsl_vector_get (zwz, vec_cat[i]); gsl_vector_set (zwz, vec_cat[i], d); d=gsl_vector_get (z, i)*gsl_vector_get (z, i); d+=gsl_vector_get (zz, vec_cat[i]); gsl_vector_set (zz, vec_cat[i], d); } // Compute wz, ve and Xz_pve. gsl_vector_set_zero (Xz_pve); s_pve=0; s_snp=0; for (size_t i=0; i<n_vc; i++) { s_pve+=v_pve[i]; s_snp+=gsl_vector_get(s_vec, i); gsl_vector_const_view Xz_col=gsl_matrix_const_column (Xz, i); gsl_blas_daxpy (v_pve[i]/gsl_vector_get(s_vec, i), &Xz_col.vector, Xz_pve); } // Set up wpve vector. for (size_t i=0; i<w->size; i++) { d=v_pve[vec_cat[i]]/gsl_vector_get(s_vec, vec_cat[i]); gsl_vector_set (w_pve, i, d); } // Compute Vq (in qvar_mat). s0=1-s_pve; for (size_t i=0; i<n_vc; i++) { s0+=gsl_vector_get (zz, i)*v_pve[i]/gsl_vector_get(s_vec, i); } for (size_t i=0; i<n_vc; i++) { s1=s0; s1-=gsl_vector_get (zwz, i)*(1-s_pve)/gsl_vector_get(s_vec, i); gsl_vector_const_view XWz_col1=gsl_matrix_const_column (XWz, i); gsl_vector_const_view XtXWz_col1=gsl_matrix_const_column (XtXWz, i); gsl_vector_memcpy (WXtXWz, &XtXWz_col1.vector); gsl_vector_mul (WXtXWz, w_pve); gsl_blas_ddot (Xz_pve, &XWz_col1.vector, &d); s1-=d/gsl_vector_get(s_vec, i); for (size_t j=0; j<n_vc; j++) { s=s1; s-=gsl_vector_get (zwz, j)*(1-s_pve)/gsl_vector_get(s_vec, j); gsl_vector_const_view XWz_col2=gsl_matrix_const_column (XWz, j); gsl_vector_const_view XtXWz_col2=gsl_matrix_const_column (XtXWz, j); gsl_blas_ddot (WXtXWz, &XtXWz_col2.vector, &d); s+=d/(gsl_vector_get(s_vec, i)*gsl_vector_get(s_vec, j)); gsl_blas_ddot (&XWz_col1.vector, &XWz_col2.vector, &d); s+=d/(gsl_vector_get(s_vec, i)*gsl_vector_get(s_vec, j))*(1-s_pve); gsl_blas_ddot (Xz_pve, &XWz_col2.vector, &d); s-=d/gsl_vector_get(s_vec, j); gsl_matrix_set (qvar_mat, i, j, s); } } d=(double)(ni_test-1); gsl_matrix_scale (qvar_mat, 2.0/(d*d*d)); // Calculate S^{-1}. gsl_matrix_memcpy (tmp_mat, S_mat); int sig; gsl_permutation * pmt=gsl_permutation_alloc (n_vc); LUDecomp (tmp_mat, pmt, &sig); LUInvert (tmp_mat, pmt, Si_mat); // Calculate variance for the estimates. for (size_t i=0; i<n_vc; i++) { for (size_t j=i; j<n_vc; j++) { d=gsl_matrix_get(Svar_mat, i, j); d*=v_pve[i]*v_pve[j]; d+=gsl_matrix_get(qvar_mat, i, j); gsl_matrix_set(Var_mat, i, j, d); if (i!=j) {gsl_matrix_set(Var_mat, j, i, d);} } } gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0,Si_mat,Var_mat,0.0,tmp_mat); gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0,tmp_mat,Si_mat,0.0,Var_mat); // Compute sigma2 per snp, enrich. v_sigma2.clear(); v_enrich.clear(); for (size_t i=0; i<n_vc; i++) { v_sigma2.push_back(v_pve[i]/gsl_vector_get(s_vec, i) ); v_enrich.push_back(v_pve[i]/gsl_vector_get(s_vec, i)*s_snp/s_pve); } // Compute se_pve, se_sigma2. for (size_t i=0; i<n_vc; i++) { d=sqrt(gsl_matrix_get(Var_mat, i, i)); v_se_pve.push_back(d); v_se_sigma2.push_back(d/gsl_vector_get(s_vec, i)); } // Compute pve_total, se_pve_total. pve_total=0; for (size_t i=0; i<n_vc; i++) { pve_total+=v_pve[i]; } se_pve_total=0; for (size_t i=0; i<n_vc; i++) { for (size_t j=0; j<n_vc; j++) { se_pve_total+=gsl_matrix_get(Var_mat, i, j); } } se_pve_total=sqrt(se_pve_total); // Compute se_enrich. gsl_matrix_set_identity(tmp_mat); double d1; for (size_t i=0; i<n_vc; i++) { d=v_pve[i]/s_pve; d1=gsl_vector_get(s_vec, i); for (size_t j=0; j<n_vc; j++) { if (i==j) { gsl_matrix_set(tmp_mat, i, j, (1-d)/d1*s_snp/s_pve); } else { gsl_matrix_set(tmp_mat, i, j, -1*d/d1*s_snp/s_pve); } } } gsl_blas_dgemm(CblasNoTrans,CblasNoTrans,1.0,tmp_mat,Var_mat,0.0,tmp_mat1); gsl_blas_dgemm(CblasNoTrans,CblasTrans,1.0,tmp_mat1,tmp_mat,0.0, VarEnrich_mat); for (size_t i=0; i<n_vc; i++) { d=sqrt(gsl_matrix_get(VarEnrich_mat, i, i)); v_se_enrich.push_back(d); } cout<<"pve = "; for (size_t i=0; i<n_vc; i++) { cout<<v_pve[i]<<" "; } cout<<endl; cout<<"se(pve) = "; for (size_t i=0; i<n_vc; i++) { cout<<v_se_pve[i]<<" "; } cout<<endl; cout<<"sigma2 per snp = "; for (size_t i=0; i<n_vc; i++) { cout<<v_sigma2[i]<<" "; } cout<<endl; cout<<"se(sigma2 per snp) = "; for (size_t i=0; i<n_vc; i++) { cout<<v_se_sigma2[i]<<" "; } cout<<endl; cout<<"enrichment = "; for (size_t i=0; i<n_vc; i++) { cout<<v_enrich[i]<<" "; } cout<<endl; cout<<"se(enrichment) = "; for (size_t i=0; i<n_vc; i++) { cout<<v_se_enrich[i]<<" "; } cout<<endl; // Delete matrices. gsl_matrix_free(Si_mat); gsl_matrix_free(Var_mat); gsl_matrix_free(VarEnrich_mat); gsl_matrix_free(tmp_mat); gsl_matrix_free(tmp_mat1); gsl_matrix_free(qvar_mat); gsl_vector_free(w_pve); gsl_vector_free(wz); gsl_vector_free(zwz); gsl_vector_free(WXtXWz); gsl_vector_free(Xz_pve); return; }