/*
Genome-wide Efficient Mixed Model Association (GEMMA)
Copyright (C) 2011-2017 Xiang Zhou
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include
#include
#include
#include
#include
#include
#include
#include "gsl/gsl_randist.h"
#include "gsl/gsl_matrix.h"
#include "gsl/gsl_vector.h"
#include "gsl/gsl_matrix.h"
#include "gsl/gsl_linalg.h"
#include "gsl/gsl_blas.h"
#include "eigenlib.h"
#include "mathfunc.h"
#include "param.h"
#include "io.h"
using namespace std;
PARAM::PARAM(void):
mode_silence (false), a_mode (0), k_mode(1), d_pace (100000),
file_out("result"), path_out("./output/"),
miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999),
l_min(1e-5), l_max(1e5), n_region(10),p_nr(0.001),em_prec(0.0001),
nr_prec(0.0001),em_iter(10000),nr_iter(100),crt(0),
pheno_mean(0), noconstrain (false),
h_min(-1), h_max(-1), h_scale(-1),
rho_min(0.0), rho_max(1.0), rho_scale(-1),
logp_min(0.0), logp_max(0.0), logp_scale(-1),
h_ngrid(10), rho_ngrid(10),
s_min(0), s_max(300),
w_step(100000), s_step(1000000),
r_pace(10), w_pace(1000),
n_accept(0),
n_mh(10),
geo_mean(2000.0),
randseed(-1),
window_cm(0), window_bp(0), window_ns(0), n_block(200),
error(false),
ni_subsample(0), n_cvt(1), n_vc(1), n_cat(0),
time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0),
time_UtZ(0.0), time_opt(0.0), time_Omega(0.0)
{}
// Read files: obtain ns_total, ng_total, ns_test, ni_test.
void PARAM::ReadFiles (void) {
string file_str;
// Read cat file.
if (!file_mcat.empty()) {
if (ReadFile_mcat (file_mcat, mapRS2cat, n_vc)==false) {error=true;}
} else if (!file_cat.empty()) {
if (ReadFile_cat (file_cat, mapRS2cat, n_vc)==false) {error=true;}
}
// Read snp weight files.
if (!file_wcat.empty()) {
if (ReadFile_wsnp (file_wcat, n_vc, mapRS2wcat)==false) {error=true;}
}
if (!file_wsnp.empty()) {
if (ReadFile_wsnp (file_wsnp, mapRS2wsnp)==false) {error=true;}
}
// Count number of kinship files.
if (!file_mk.empty()) {
if (CountFileLines (file_mk, n_vc)==false) {error=true;}
}
// Read SNP set.
if (!file_snps.empty()) {
if (ReadFile_snps (file_snps, setSnps)==false) {error=true;}
} else {
setSnps.clear();
}
// For prediction.
if (!file_epm.empty()) {
if (ReadFile_est (file_epm, est_column, mapRS2est)==false) {
error=true;
}
if (!file_bfile.empty()) {
file_str=file_bfile+".bim";
if (ReadFile_bim (file_str, snpInfo)==false) {
error=true;
}
file_str=file_bfile+".fam";
if (ReadFile_fam (file_str, indicator_pheno, pheno,
mapID2num, p_column)==false) {
error=true;
}
}
if (!file_geno.empty()) {
if (ReadFile_pheno (file_pheno, indicator_pheno,
pheno, p_column)==false) {
error=true;
}
if (CountFileLines (file_geno, ns_total)==false) {
error=true;
}
}
if (!file_ebv.empty() ) {
if (ReadFile_column (file_ebv, indicator_bv,
vec_bv, 1)==false) {
error=true;
}
}
if (!file_log.empty() ) {
if (ReadFile_log (file_log, pheno_mean)==false) {
error=true;
}
}
// Convert indicator_pheno to indicator_idv.
int k=1;
for (size_t i=0; i::size_type i=0;
i<(indicator_idv).size();
++i) {
indicator_idv[i]*=indicator_read[i];
ni_test+=indicator_idv[i];
}
if (ni_test==0) {
error=true;
cout<<"error! number of analyzed individuals equals 0. "<<
endl;
return;
}
}
// For ridge prediction, read phenotype only.
if (file_geno.empty() && file_gene.empty() && !file_pheno.empty()) {
if (ReadFile_pheno (file_pheno, indicator_pheno, pheno,
p_column)==false) {
error=true;
}
// Post-process covariates and phenotypes, obtain
// ni_test, save all useful covariates.
ProcessCvtPhen();
}
return;
}
void PARAM::CheckParam (void)
{
struct stat fileInfo;
string str;
// Check parameters.
if (k_mode!=1 && k_mode!=2) {
cout<<"error! unknown kinship/relatedness input mode: "<<
k_mode<1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<1) {cout<<"error! hwe level needs to be between 0 and 1. current value = "<1) {cout<<"error! r2 level needs to be between 0 and 1. current value = "<1) {cout<<"error! h values must be bewtween 0 and 1. current values = "<1) {cout<<"error! rho values must be between 0 and 1. current values = "<0) {cout<<"error! maximum logp value must be smaller than 0. current values = "<1.0) {cout<<"error! hscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! rscale value must be between 0 and 1. current value = "<1.0) {cout<<"error! pscale value must be between 0 and 1. current value = "<1 && a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=43) {
cout<<"error! the current analysis mode "<1 && !file_gene.empty() ) {
cout<<"error! multiple phenotype analysis option not allowed with gene expression files. "<1) {
cout<<"error! pnr value must be between 0 and 1. current value = "<::size_type i=0; i<(indicator_idv).size(); ++i) {
if (indicator_idv[i]==0) {continue;}
ni_test++;
}
ni_cvt=0;
for (size_t i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) {
indicator_idv[i]*=indicator_cvt[i];
ni_test+=indicator_idv[i];
}
}
if ((indicator_read).size()!=0) {
ni_test=0;
for (vector::size_type i=0; i<(indicator_idv).size(); ++i) {
indicator_idv[i]*=indicator_read[i];
ni_test+=indicator_idv[i];
}
}
*/
if (ni_test==0 && file_cor.empty() && file_mstudy.empty() && file_study.empty() && file_beta.empty() && file_bf.empty() ) {
error=true;
cout<<"error! number of analyzed individuals equals 0. "<ns_test) {s_max=ns_test; cout<<"s_max is re-set to the number of analyzed SNPs."< > &Xt, gsl_matrix *K, const bool calc_K) {
string file_str;
if (!file_bfile.empty()) {
file_str=file_bfile+".bed";
if (ReadFile_bed (file_str, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test)==false) {error=true;}
} else {
if (ReadFile_geno (file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test)==false) {error=true;}
}
return;
}
void PARAM::CalcKin (gsl_matrix *matrix_kin) {
string file_str;
gsl_matrix_set_zero (matrix_kin);
if (!file_bfile.empty() ) {
file_str=file_bfile+".bed";
if (PlinkKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;}
}
else if (!file_oxford.empty() ) {
file_str=file_oxford+".bgen";
if (bgenKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;}
}
else {
file_str=file_geno;
if (BimbamKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin)==false) {error=true;}
}
return;
}
//from an existing n by nd A and K matrices, compute the d by d S matrix (which is not necessary symmetric)
void compAKtoS (const gsl_matrix *A, const gsl_matrix *K, const size_t n_cvt, gsl_matrix *S) {
size_t n_vc=S->size1, ni_test=A->size1;
double di, dj, tr_AK, sum_A, sum_K, s_A, s_K, sum_AK, tr_A, tr_K, d;
for (size_t i=0; in_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<a) {l=a; h=b;} else {l=b; h=a;}
size_t n=n_cvt+2;
index=(2*n-l+2)*(l-1)/2+h-l;
return index;
}
//from an existing n by nd (centered) G matrix, compute the d+1 by d*(d-1)/2*(d+1) Q matrix
//where inside i'th d+1 by d+1 matrix, each element is tr(KiKlKjKm)-r*tr(KmKiKl)-r*tr(KlKjKm)+r^2*tr(KlKm), where r=n/(n-1)
void compKtoV (const gsl_matrix *G, gsl_matrix *V) {
size_t n_vc=G->size2/G->size1, ni_test=G->size1;
gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, (n_vc*(n_vc+1))/2*ni_test);
gsl_vector *trKiKj=gsl_vector_alloc( n_vc*(n_vc+1)/2 );
gsl_vector *trKi=gsl_vector_alloc(n_vc);
double d, tr, r=(double)ni_test/(double)(ni_test-1);
size_t t, t_il, t_jm, t_lm, t_im, t_jl, t_ij;
//compute KiKj for all pairs of i and j (not including the identity matrix)
t=0;
for (size_t i=0; im) {
gsl_blas_ddot (&KiKl_row.vector, &KjKm_row.vector, &d);
tr+=d;
gsl_blas_ddot (&Km_row.vector, &KiKl_col.vector, &d);
tr-=r*d;
gsl_blas_ddot (&Kl_row.vector, &KjKm_row.vector, &d);
tr-=r*d;
} else if (i>l && j<=m) {
gsl_blas_ddot (&KiKl_col.vector, &KjKm_col.vector, &d);
tr+=d;
gsl_blas_ddot (&Km_row.vector, &KiKl_row.vector, &d);
tr-=r*d;
gsl_blas_ddot (&Kl_row.vector, &KjKm_col.vector, &d);
tr-=r*d;
} else {
gsl_blas_ddot (&KiKl_col.vector, &KjKm_row.vector, &d);
tr+=d;
gsl_blas_ddot (&Km_row.vector, &KiKl_row.vector, &d);
tr-=r*d;
gsl_blas_ddot (&Kl_row.vector, &KjKm_row.vector, &d);
tr-=r*d;
}
}
tr+=r*r*gsl_vector_get (trKiKj, t_lm);
} else if (l!=n_vc && m==n_vc) {
t_il=GetabIndex (i+1, l+1, n_vc-2);
t_jl=GetabIndex (j+1, l+1, n_vc-2);
tr=0;
for (size_t k=0; ksize1, ni_test=A->size1, n_cvt=W->size2;
vector > > trAK, sumAK;
vector > sumA, sumK, trA, trK, sA, sK;
vector vec_tmp;
double di, dj, d, m, v;
//gsl_matrix *Stmp=gsl_matrix_alloc (n_vc, ni_test*n_vc);
//gsl_matrix *Stmp_sub=gsl_matrix_alloc (n_vc, n_vc);
//initialize and set all elements to zero
for (size_t i=0; i &mapRS2wA, const map &mapRS2wK, const gsl_matrix *W, gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, gsl_vector *ns) {
string file_str;
gsl_matrix_set_zero (S);
gsl_matrix_set_zero (Svar);
gsl_vector_set_zero (ns);
//compute the kinship matrix G for multiple categories; these matrices are not centered, for convienence of Jacknife sampling
if (!file_bfile.empty() ) {
file_str=file_bfile+".bed";
if (mapRS2wA.size()==0) {
if (PlinkKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns)==false) {error=true;}
} else {
if (PlinkKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns)==false) {error=true;}
}
} else if (!file_geno.empty()) {
file_str=file_geno;
if (mapRS2wA.size()==0) {
if (BimbamKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns)==false) {error=true;}
} else {
if (BimbamKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns)==false) {error=true;}
}
} else if (!file_mbfile.empty() ){
if (mapRS2wA.size()==0) {
if (MFILEKin (1, file_mbfile, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns)==false) {error=true;}
} else {
if (MFILEKin (1, file_mbfile, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns)==false) {error=true;}
}
} else if (!file_mgeno.empty()) {
if (mapRS2wA.size()==0) {
if (MFILEKin (0, file_mgeno, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns)==false) {error=true;}
} else {
if (MFILEKin (0, file_mgeno, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns)==false) {error=true;}
}
}
if (mapRS2wA.size()==0) {
gsl_matrix_memcpy (A, K);
}
//center and scale every kinship matrix inside G
for (size_t i=0; isize2, S);
//compute Svar and update S with Jacknife
JackknifeAKtoS (W, A, K, S, Svar);
//based on G, compute a matrix Q that can be used to calculate the variance of q
//compKtoV (G, V);
return;
}
void PARAM::WriteVector (const gsl_vector *q, const gsl_vector *s, const size_t n_total, const string suffix)
{
string file_str;
file_str=path_out+"/"+file_out;
file_str+=".";
file_str+=suffix;
file_str+=".txt";
ofstream outfile (file_str.c_str(), ofstream::out);
if (!outfile) {cout<<"error writing file: "<size; ++i) {
outfile<size; ++i) {
outfile<size1; ++i) {
for (size_t j=0; jsize2; ++j) {
outfile<size; ++i) {
outfile<::size_type i=0; i set_remove;
//check if any columns is an intercept
for (size_t i=0; isize2; i++) {
gsl_vector_view w_col=gsl_matrix_column (W, i);
gsl_vector_minmax (&w_col.vector, &v_min, &v_max);
if (v_min==v_max) {flag_ipt=1; set_remove.insert (i);}
}
//add an intecept term if needed
if (n_cvt==set_remove.size()) {
indicator_cvt.clear();
n_cvt=1;
} else if (flag_ipt==0) {
cout<<"no intecept term is found in the cvt file. a column of 1s is added."<::size_type i=0; i::size_type i=0; i<(indicator_idv).size(); ++i) {
indicator_idv[i]*=indicator_cvt[i];
}
}
//remove individuals with missing gxe variables
if ((indicator_gxe).size()!=0) {
for (vector::size_type i=0; i<(indicator_idv).size(); ++i) {
indicator_idv[i]*=indicator_gxe[i];
}
}
//remove individuals with missing residual weights
if ((indicator_weight).size()!=0) {
for (vector::size_type i=0; i<(indicator_idv).size(); ++i) {
indicator_idv[i]*=indicator_weight[i];
}
}
//obtain ni_test
ni_test=0;
for (vector::size_type i=0; i<(indicator_idv).size(); ++i) {
if (indicator_idv[i]==0) {continue;}
ni_test++;
}
//if subsample number is set, perform a random sub-sampling to determine the subsampled ids
if (ni_subsample!=0) {
if (ni_testtm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec);
}
gsl_r = gsl_rng_alloc(gslType);
gsl_rng_set(gsl_r, randseed);
//from ni_test, sub-sample ni_subsample
vector a, b;
for (size_t i=0; i(&a[0]), ni_subsample, static_cast(&b[0]), ni_test, sizeof (size_t) );
//re-set indicator_idv and ni_test
int j=0;
for (vector::size_type i=0; i<(indicator_idv).size(); ++i) {
if (indicator_idv[i]==0) {continue;}
if(find(a.begin(), a.end(), j) == a.end()) {
indicator_idv[i]=0;
}
j++;
}
ni_test=ni_subsample;
}
}
//check ni_test
if (ni_test==0 && a_mode!=15) {
error=true;
cout<<"error! number of analyzed individuals equals 0. "< cvt_row;
cvt_row.push_back(1);
for (vector::size_type i=0; i<(indicator_idv).size(); ++i) {
indicator_cvt.push_back(1);
cvt.push_back(cvt_row);
}
}
return;
}
void PARAM::CopyCvt (gsl_matrix *W)
{
size_t ci_test=0;
for (vector::size_type i=0; i flag_vec;
vector catc;
for (size_t j=0; j::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i::size_type i=0; i &setSnps_beta, map &mapRS2wK)
{
mapRS2wK.clear();
vector wsum, wcount;
for (size_t i=0; i::iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) {
if (mapRS2cat.size()==0) {
it->second/=wsum[0];
} else {
it->second/=wsum[mapRS2cat[it->first]];
}
}
}
return;
}
//pve_flag=0 then do not change pve; pve_flag==1, then change pve to 0 if pve < 0 and pve to 1 if pve > 1
void PARAM::UpdateWeight (const size_t pve_flag, const map &mapRS2wK, const size_t ni_test, const gsl_vector *ns, map &mapRS2wA)
{
double d;
vector wsum, wcount;
for (size_t i=0; i::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) {
d=1;
for (size_t i=0; i=1 && pve_flag==1) {
d+=(double)ni_test/gsl_vector_get(ns, i)*mapRS2wcat[it->first][i];
} else if (v_pve[i]<=0 && pve_flag==1) {
d+=0;
} else {
d+=(double)ni_test/gsl_vector_get(ns, i)*mapRS2wcat[it->first][i]*v_pve[i];
}
}
mapRS2wA[it->first]=1/(d*d);
if (mapRS2cat.size()==0) {
wsum[0]+=mapRS2wA[it->first];
wcount[0]++;
} else {
wsum[mapRS2cat[it->first]]+=mapRS2wA[it->first];
wcount[mapRS2cat[it->first]]++;
}
}
for (size_t i=0; i::iterator it=mapRS2wA.begin(); it!=mapRS2wA.end(); ++it) {
if (mapRS2cat.size()==0) {
it->second/=wsum[0];
} else {
it->second/=wsum[mapRS2cat[it->first]];
}
}
return;
}
// this function updates indicator_snp, and save z-scores and other values into vectors
void PARAM::UpdateSNPnZ (const map &mapRS2wA, const map &mapRS2A1, const map &mapRS2z, gsl_vector *w, gsl_vector *z, vector &vec_cat)
{
gsl_vector_set_zero (w);
gsl_vector_set_zero (z);
vec_cat.clear();
string rs, a1;
size_t c=0;
if (msnpInfo.size()==0) {
for (size_t i=0; i &mapRS2wA)
{
string rs;
if (msnpInfo.size()==0) {
for (size_t i=0; i