From b3b491cd9143d33bfebd4c5b26629573afcf0970 Mon Sep 17 00:00:00 2001 From: xiangzhou Date: Sat, 11 Jul 2015 12:57:37 -0400 Subject: add GXE test --- src/param.cpp | 1122 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 920 insertions(+), 202 deletions(-) (limited to 'src/param.cpp') diff --git a/src/param.cpp b/src/param.cpp index 7a89ff8..c4b234a 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -24,6 +24,15 @@ #include #include +#include "gsl/gsl_randist.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" + +#include "eigenlib.h" +#include "mathfunc.h" #ifdef FORCE_FLOAT #include "param_float.h" @@ -39,12 +48,12 @@ using namespace std; -PARAM::PARAM(void): +PARAM::PARAM(void): mode_silence (false), a_mode (0), k_mode(1), d_pace (100000), file_out("result"), path_out("./output/"), miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5), n_region(10),p_nr(0.001),em_prec(0.0001),nr_prec(0.0001),em_iter(10000),nr_iter(100),crt(0), -pheno_mean(0), +pheno_mean(0), noconstrain (false), h_min(-1), h_max(-1), h_scale(-1), rho_min(0.0), rho_max(1.0), rho_scale(-1), logp_min(0.0), logp_max(0.0), logp_scale(-1), @@ -55,53 +64,64 @@ n_accept(0), n_mh(10), geo_mean(2000.0), randseed(-1), +window_cm(0), window_bp(0), window_ns(0), error(false), - n_cvt(1), n_vc(1), +ni_subsample(0), n_cvt(1), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} //read files //obtain ns_total, ng_total, ns_test, ni_test -void PARAM::ReadFiles (void) +void PARAM::ReadFiles (void) { string file_str; - if (!file_mk.empty()) { + + + if (!file_cat.empty()) { + if (ReadFile_cat (file_cat, mapRS2cat, n_vc)==false) {error=true;} + } + + if (!file_var.empty()) { + if (ReadFile_var (file_var, mapRS2var)==false) {error=true;} + } + + if (!file_mk.empty()) { if (CountFileLines (file_mk, n_vc)==false) {error=true;} } - + if (!file_snps.empty()) { if (ReadFile_snps (file_snps, setSnps)==false) {error=true;} } else { setSnps.clear(); } - + //for prediction if (!file_epm.empty()) { if (ReadFile_est (file_epm, est_column, mapRS2est)==false) {error=true;} - + if (!file_bfile.empty()) { file_str=file_bfile+".bim"; - if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} - + if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} + file_str=file_bfile+".fam"; - if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} + if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} } - - if (!file_geno.empty()) { - if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} - - if (CountFileLines (file_geno, ns_total)==false) {error=true;} + + if (!file_geno.empty()) { + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + + if (CountFileLines (file_geno, ns_total)==false) {error=true;} } - + if (!file_ebv.empty() ) { if (ReadFile_column (file_ebv, indicator_bv, vec_bv, 1)==false) {error=true;} } - + if (!file_log.empty() ) { if (ReadFile_log (file_log, pheno_mean)==false) {error=true;} } - + //convert indicator_pheno to indicator_idv int k=1; for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_read[i]; ni_test+=indicator_idv[i]; } - + if (ni_test==0) { error=true; cout<<"error! number of analyzed individuals equals 0. "<1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<1) {cout<<"error! hwe level needs to be between 0 and 1. current value = "<1) {cout<<"error! r2 level needs to be between 0 and 1. current value = "<1) {cout<<"error! h values must be bewtween 0 and 1. current values = "<1) {cout<<"error! rho values must be between 0 and 1. current values = "<0) {cout<<"error! maximum logp value must be smaller than 0. current values = "<1.0) {cout<<"error! hscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! rscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! pscale value must be between 0 and 1. current value = "<1) { cout<<"error! pnr value must be between 0 and 1. current value = "<::size_type i=0; i<(indicator_idv).size(); ++i) { if (indicator_idv[i]==0) {continue;} ni_test++; } - + ni_cvt=0; for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_cvt[i]; ni_test+=indicator_idv[i]; } - } - + } + if ((indicator_read).size()!=0) { - ni_test=0; + ni_test=0; for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_read[i]; ni_test+=indicator_idv[i]; } } */ - if (ni_test==0) { + if (ni_test==0 && file_cor.empty() && file_mq.empty() && file_q.empty() && file_beta.empty() ) { error=true; cout<<"error! number of analyzed individuals equals 0. "<ns_test) {s_max=ns_test; cout<<"s_max is re-set to the number of analyzed SNPs."< > &Xt, gsl_matrix *K, const bool calc_K) { + string file_str; + + if (!file_bfile.empty()) { + file_str=file_bfile+".bed"; + if (ReadFile_bed (file_str, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test)==false) {error=true;} + } else { + if (ReadFile_geno (file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test)==false) {error=true;} + } + + return; +} + void PARAM::CalcKin (gsl_matrix *matrix_kin) { string file_str; - + gsl_matrix_set_zero (matrix_kin); - - if (!file_bfile.empty() ) { + + if (!file_bfile.empty() ) { file_str=file_bfile+".bed"; if (PlinkKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} } + else if (!file_oxford.empty() ) { + file_str=file_oxford+".bgen"; + if (bgenKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} + } else { file_str=file_geno; if (BimbamKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;} } - + + return; +} + + + +//from an existing n by nd G matrix, compute the d by d S matrix +void compKtoS (const gsl_matrix *G, gsl_matrix *S) { + size_t n_vc=S->size1, ni_test=G->size1; + double di, dj, tr_KiKj, sum_Ki, sum_Kj, s_Ki, s_Kj, s_KiKj, si, sj, d; + + for (size_t i=0; in_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + +//from an existing n by nd (centered) G matrix, compute the d+1 by d*(d+1) Q matrix +//where inside i'th d+1 by d+1 matrix, each element is tr(KiKjKiKl)-r*tr(KjKiKl)-r*tr(KlKiKj)+r^2*tr(KjKl), where r=n/(n-1) +void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) { + size_t n_vc=G->size2/G->size1, ni_test=G->size1; + + gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, n_vc*(n_vc+1)/2*ni_test); + gsl_vector *trKiKjKi=gsl_vector_alloc ( n_vc*n_vc ); + gsl_vector *trKiKj=gsl_vector_alloc( n_vc*(n_vc+1)/2 ); + gsl_vector *trKi=gsl_vector_alloc(n_vc); + + double d, tr, r=(double)ni_test/(double)(ni_test-1); + size_t t, t_ij, t_il, t_jl, t_ii; + + //compute KiKj for all pairs of i and j (including the identity matrix) + t=0; + for (size_t i=0; il) { + gsl_blas_ddot (&KiKj_row.vector, &KiKl_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d); + tr-=r*d; + } else if (i>j && i<=l) { + gsl_blas_ddot (&KiKj_col.vector, &KiKl_col.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); + tr-=r*d; + } else { + gsl_blas_ddot (&KiKj_col.vector, &KiKl_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); + tr-=r*d; + } + } + + tr+=r*r*gsl_vector_get (trKiKj, t_jl); + } else if (j!=n_vc && l==n_vc) { + t_ij=GetabIndex (i+1, j+1, n_vc-2); + tr=gsl_vector_get (trKiKjKi, i*n_vc+j)-2*r*gsl_vector_get (trKiKj, t_ij)+r*r*gsl_vector_get (trKi, j); + } else if (j==n_vc && l==n_vc) { + t_ii=GetabIndex (i+1, i+1, n_vc-2); + tr=gsl_vector_get (trKiKj, t_ii)-2*r*gsl_vector_get (trKi, i)+r*r*(double)(ni_test-1); + } + + gsl_matrix_set (Q, j, i*(n_vc+1)+l, tr); + if (l!=j) {gsl_matrix_set (Q, l, i*(n_vc+1)+j, tr);} + } + } + } + + gsl_matrix_scale (Q, 1.0/pow((double)ni_test, 2) ); + + gsl_matrix_free(KiKj); + gsl_vector_free(trKiKjKi); + gsl_vector_free(trKiKj); + gsl_vector_free(trKi); + + return; +} + + + +//perform Jacknife sampling for variance of S +void JacknifeGtoS (const gsl_matrix *G, gsl_matrix *S, gsl_matrix *Svar) { + size_t n_vc=Svar->size1, ni_test=G->size1; + vector > > tr_KiKj, s_KiKj; + vector > sum_Ki, s_Ki, si; + vector vec_tmp; + double di, dj, d, m, v; + + //initialize and set all elements to zero + for (size_t i=0; itm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + //bootstrap: in each iteration, sample individuals and compute S_pmt + size_t n_pmt=100; + vector idv_order, idv_remove; + for (size_t i=0; i(&idv_remove[0]), n_pmt, static_cast(&idv_order[0]), ni_test, sizeof(size_t)); + + gsl_matrix *S_pmt=gsl_matrix_alloc(n_vc, n_vc*n_pmt); + for (size_t i=0; isize; ++i) { + outfile<size; ++i) { + outfile<size1; ++i) { for (size_t j=0; jsize2; ++j) { outfile<size; ++i) { outfile<::size_type i=0; i set_remove; - + //check if any columns is an intercept for (size_t i=0; isize2; i++) { gsl_vector_view w_col=gsl_matrix_column (W, i); gsl_vector_minmax (&w_col.vector, &v_min, &v_max); if (v_min==v_max) {flag_ipt=1; set_remove.insert (i);} } - + //add an intecept term if needed if (n_cvt==set_remove.size()) { indicator_cvt.clear(); @@ -697,19 +1326,19 @@ void PARAM::CheckCvt () if (indicator_idv[i]==0 || indicator_cvt[i]==0) {continue;} cvt[i].push_back(1.0); } - + n_cvt++; - } else {} - + } else {} + gsl_matrix_free(W); - + return; } //post-process phentoypes, covariates void PARAM::ProcessCvtPhen () -{ +{ //convert indicator_pheno to indicator_idv int k=1; indicator_idv.clear(); @@ -720,27 +1349,88 @@ void PARAM::ProcessCvtPhen () } indicator_idv.push_back(k); } - + //remove individuals with missing covariates if ((indicator_cvt).size()!=0) { for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_idv[i]*=indicator_cvt[i]; } } - + + //remove individuals with missing gxe variables + if ((indicator_gxe).size()!=0) { + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_gxe[i]; + } + } + + //remove individuals with missing residual weights + if ((indicator_weight).size()!=0) { + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + indicator_idv[i]*=indicator_weight[i]; + } + } + //obtain ni_test - ni_test=0; + ni_test=0; for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { - if (indicator_idv[i]==0) {continue;} + if (indicator_idv[i]==0) {continue;} ni_test++; } - + + + + //if subsample number is set, perform a random sub-sampling to determine the subsampled ids + if (ni_subsample!=0) { + if (ni_testtm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); + } + gsl_r = gsl_rng_alloc(gslType); + gsl_rng_set(gsl_r, randseed); + + //from ni_test, sub-sample ni_subsample + vector a, b; + for (size_t i=0; i(&a[0]), ni_subsample, static_cast(&b[0]), ni_test, sizeof (size_t) ); + + //re-set indicator_idv and ni_test + int j=0; + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { + if (indicator_idv[i]==0) {continue;} + if(find(a.begin(), a.end(), j) == a.end()) { + indicator_idv[i]=0; + } + j++; + } + ni_test=ni_subsample; + } + } + + //check ni_test if (ni_test==0) { error=true; cout<<"error! number of analyzed individuals equals 0. "< cvt_row; cvt_row.push_back(1); - + for (vector::size_type i=0; i<(indicator_idv).size(); ++i) { indicator_cvt.push_back(1); - + cvt.push_back(cvt_row); } } - + return; } -void PARAM::CopyCvt (gsl_matrix *W) +void PARAM::CopyCvt (gsl_matrix *W) { size_t ci_test=0; - + for (vector::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i