/*
Genome-wide Efficient Mixed Model Association (GEMMA)
Copyright (C) 2011 Xiang Zhou
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "gsl/gsl_vector.h"
#include "gsl/gsl_matrix.h"
#include "gsl/gsl_linalg.h"
#include "gsl/gsl_blas.h"
#include "gsl/gsl_cdf.h"
#include "gsl/gsl_roots.h"
#include "gsl/gsl_min.h"
#include "gsl/gsl_integration.h"
#include "eigenlib.h"
#include "gzstream.h"
#include "lapack.h"
#ifdef FORCE_FLOAT
#include "lm_float.h"
#else
#include "lm.h"
#endif
using namespace std;
// Load this LM instance from the shared PARAM settings. The analysis mode,
// file locations, sample/SNP/gene counts and the inclusion indicators are
// copied across; the two values LM accumulates itself (time_opt, ng_test)
// start from zero.
void LM::CopyFromParam (PARAM &cPar)
{
	const PARAM &src=cPar;

	// analysis mode and progress-report interval
	this->a_mode=src.a_mode;
	this->d_pace=src.d_pace;

	// input and output locations
	this->file_bfile=src.file_bfile;
	this->file_geno=src.file_geno;
	this->file_gene=src.file_gene;
	this->file_oxford=src.file_oxford;	// WJA added
	this->file_out=src.file_out;
	this->path_out=src.path_out;

	// individual, SNP, covariate and gene counts
	this->ni_total=src.ni_total;
	this->ni_test=src.ni_test;
	this->ns_total=src.ns_total;
	this->ns_test=src.ns_test;
	this->n_cvt=src.n_cvt;
	this->ng_total=src.ng_total;

	// per-individual / per-SNP indicators and SNP annotations
	this->indicator_idv=src.indicator_idv;
	this->indicator_snp=src.indicator_snp;
	this->snpInfo=src.snpInfo;

	// accumulators filled in by the analysis routines
	this->time_opt=0.0;
	this->ng_test=0;
}
// Hand the values accumulated by LM back to the shared PARAM object:
// the time counter time_opt and the tested-gene count ng_test.
void LM::CopyToParam (PARAM &cPar)
{
	cPar.ng_test=this->ng_test;
	cPar.time_opt=this->time_opt;
}
// Opens path_out + "/" + file_out + ".assoc.txt" for writing.
// NOTE(review): this block is not intact. The `if (!outfile)` line below breaks
// off in the middle of its error message and runs straight into the tail of a
// different routine (one that fills xPwx and xPwy from x, y, Wtx, Wty and
// WtWi). The remainder of WriteFiles and the beginning of that routine are
// missing from this copy of the file; restore them before editing this code.
void LM::WriteFiles ()
{
// Build the output file name: <path_out>/<file_out>.assoc.txt
string file_str;
file_str=path_out+"/"+file_out;
file_str+=".assoc.txt";
ofstream outfile (file_str.c_str(), ofstream::out);
// NOTE(review): truncated line -- see the note at the top of this block.
if (!outfile) {cout<<"error writing file: "<::size_type t=0; tsize;
double d;
// Scratch vector that receives WtWi*Wtx; released before returning.
gsl_vector *WtWiWtx=gsl_vector_alloc (c_size);
// Start from the plain dot products x'x and x'y ...
gsl_blas_ddot (x, x, &xPwx);
gsl_blas_ddot (x, y, &xPwy);
// ... then subtract (WtWi*Wtx)'Wtx from xPwx and (WtWi*Wtx)'Wty from xPwy.
gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
gsl_blas_ddot (WtWiWtx, Wtx, &d);
xPwx-=d;
gsl_blas_ddot (WtWiWtx, Wty, &d);
xPwy-=d;
gsl_vector_free (WtWiWtx);
return;
}
void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, double &yPwy)
{
size_t c_size=Wty->size;
double d;
gsl_vector *WtWiWty=gsl_vector_alloc (c_size);
gsl_blas_ddot (y, y, &yPwy);
gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty);
gsl_blas_ddot (WtWiWty, Wty, &d);
yPwy-=d;
gsl_vector_free (WtWiWty);
return;
}
// Effect size, standard error and three p values for one predictor in a
// linear model, from precomputed quadratic forms.
//   test_mode : 3 reports the score-test standard error in `se`; any other
//               value reports the Wald standard error
//   yPwy, xPwy, xPwx : quadratic forms for the phenotype and the predictor
//   df        : second degrees-of-freedom argument of the F distribution
//   n_size    : sample size used by the score and likelihood-ratio tests
//   beta      : receives xPwy/xPwx
//   se        : receives the standard error selected by test_mode
//   p_wald, p_score : upper-tail F(1, df) probabilities
//   p_lrt     : upper-tail chi-square (1 d.f.) probability
void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, const double xPwx, const double df, const size_t n_size, double &beta, double &se, double &p_wald, double &p_lrt, double &p_score)
{
	const double n_obs=static_cast<double>(n_size);

	// yPwy with the contribution of the predictor removed
	const double yPxy=yPwy-xPwy*xPwy/xPwx;

	const double wald_se=sqrt(yPxy/(df*xPwx) );
	const double score_se=sqrt(yPwy/(n_obs*xPwx) );

	beta=xPwy/xPwx;

	p_wald=gsl_cdf_fdist_Q (beta*beta/(wald_se*wald_se), 1.0, df);
	p_score=gsl_cdf_fdist_Q (beta*beta/(score_se*score_se), 1.0, df);
	p_lrt=gsl_cdf_chisq_Q (n_obs*(log(yPwy)-log(yPxy)), 1);

	se=(test_mode==3) ? score_se : wald_se;
}
// Association analysis driven by the file named in file_gene: x is the fixed
// predictor, W the covariate matrix, and one SUMSTAT record is appended to
// sumStat per pass of the main loop.
// NOTE(review): this block is not intact. The `if (!infile)` line, the `for`
// line of the main loop and the final `cout<` line are all cut off
// mid-statement, so local declarations, the loop body that fills y, and the
// clean-up code are missing from this copy. Restore them before editing.
void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x)
{
ifstream infile (file_gene.c_str(), ifstream::in);
// NOTE(review): truncated line -- the error branch runs into the tail of a
// statement that subtracts W->size2 and 1 from W->size1.
if (!infile) {cout<<"error reading gene expression file:"<size1-(double)W->size2-1.0;
// Work space: y has one entry per row of W; the remaining objects are sized
// by the number of columns of W.
gsl_vector *y=gsl_vector_alloc (W->size1);
gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
gsl_vector *Wty=gsl_vector_alloc (W->size2);
gsl_vector *Wtx=gsl_vector_alloc (W->size2);
gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
// WtW = W'W, then WtWi from LUDecomp/LUInvert (presumably the inverse of W'W).
gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
int sig;
LUDecomp (WtW, pmt, &sig);
LUInvert (WtW, pmt, WtWi);
// Wtx = W'x and xPwx are computed once here, before the main loop.
gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx);
CalcvPv(WtWi, Wtx, x, xPwx);
// Read one line before the main loop (presumably the header row).
getline(infile, line);
// NOTE(review): truncated line -- the loop header runs into the argument
// list of what looks like an LmCalcP call.
for (size_t t=0; tsize1, beta, se, p_wald, p_lrt, p_score);
// Add the elapsed CPU time, converted from clock ticks to minutes.
time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
//store summary data
SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
sumStat.push_back(SNPs);
}
// NOTE(review): truncated line -- everything after `cout<` is missing.
cout<
// Association analysis over the binary file file_oxford + ".bgen": y is the
// phenotype, W the covariate matrix. For every SNP kept by indicator_snp the
// genotype probabilities are turned into an expected dosage vector x, the
// statistics are computed with CalcvPv/LmCalcP and a SUMSTAT record is
// appended to sumStat.
// NOTE(review): this block is not intact. The `if (!infile)` line, the
// `std::cout<<"Warning...` line (which should lead into the per-SNP loop) and
// the two inner `for (size_t i=0; ...` lines are cut off mid-statement, so
// some declarations, loop headers and the missing-value imputation step are
// not visible in this copy. Restore them before editing.
void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y)
{
string file_bgen=file_oxford+".bgen";
ifstream infile (file_bgen.c_str(), ios::binary);
// NOTE(review): truncated line -- the error branch runs into the tail of a
// statement that subtracts W->size2 and 1 from W->size1.
if (!infile) {cout<<"error reading bgen file:"<size1-(double)W->size2-1.0;
// Work space: x and x_miss have one entry per row of W; the remaining objects
// are sized by the number of columns of W.
gsl_vector *x=gsl_vector_alloc (W->size1);
gsl_vector *x_miss=gsl_vector_alloc (W->size1);
gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
gsl_vector *Wty=gsl_vector_alloc (W->size2);
gsl_vector *Wtx=gsl_vector_alloc (W->size2);
gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
// WtW = W'W, then WtWi from LUDecomp/LUInvert (presumably the inverse of W'W).
gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
int sig;
LUDecomp (WtW, pmt, &sig);
LUInvert (WtW, pmt, WtWi);
// Wty = W'y and yPwy depend only on the phenotype, so they are computed once.
gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
CalcvPv(WtWi, Wty, y, yPwy);
// Read the file header. bgen_snp_block_offset is read first and then reduced
// by the size of every later field consumed or skipped, so that whatever is
// left can be skipped in one go once the flags have been read.
uint32_t bgen_snp_block_offset;
uint32_t bgen_header_length;
uint32_t bgen_nsamples;
uint32_t bgen_nsnps;
uint32_t bgen_flags;
infile.read(reinterpret_cast(&bgen_snp_block_offset),4);
infile.read(reinterpret_cast(&bgen_header_length),4);
bgen_snp_block_offset-=4;
infile.read(reinterpret_cast(&bgen_nsnps),4);
bgen_snp_block_offset-=4;
infile.read(reinterpret_cast(&bgen_nsamples),4);
bgen_snp_block_offset-=4;
// Skip 4+bgen_header_length-20 bytes to reach the 4-byte flags field.
infile.ignore(4+bgen_header_length-20);
bgen_snp_block_offset-=4+bgen_header_length-20;
infile.read(reinterpret_cast(&bgen_flags),4);
bgen_snp_block_offset-=4;
// Bit 0 of the flags selects compressed SNP blocks.
bool CompressedSNPBlocks=bgen_flags&0x1;
// bool LongIds=bgen_flags&0x4;
// Skip whatever remains before the first SNP block.
infile.ignore(bgen_snp_block_offset);
double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss;
// Per-SNP header fields: each L* value is the byte length of the string read
// right after it (LS: id, LR: rs, LC: chr, LA/LB: the two alleles); bgen_P is
// the byte length of the genotype payload.
uint32_t bgen_N;
uint16_t bgen_LS;
uint16_t bgen_LR;
uint16_t bgen_LC;
uint32_t bgen_SNP_pos;
uint32_t bgen_LA;
std::string bgen_A_allele;
uint32_t bgen_LB;
std::string bgen_B_allele;
uint32_t bgen_P;
size_t unzipped_data_size;
string id;
string rs;
string chr;
// NOTE(review): truncated line -- the warning message runs into the tail of a
// loop-exit test; the header of the per-SNP loop (index t) is missing.
std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<1) {break;}
if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);}
// read SNP header
id.clear();
rs.clear();
chr.clear();
bgen_A_allele.clear();
bgen_B_allele.clear();
infile.read(reinterpret_cast(&bgen_N),4);
infile.read(reinterpret_cast(&bgen_LS),2);
id.resize(bgen_LS);
infile.read(&id[0], bgen_LS);
infile.read(reinterpret_cast(&bgen_LR),2);
rs.resize(bgen_LR);
infile.read(&rs[0], bgen_LR);
infile.read(reinterpret_cast(&bgen_LC),2);
chr.resize(bgen_LC);
infile.read(&chr[0], bgen_LC);
infile.read(reinterpret_cast(&bgen_SNP_pos),4);
infile.read(reinterpret_cast(&bgen_LA),4);
bgen_A_allele.resize(bgen_LA);
infile.read(&bgen_A_allele[0], bgen_LA);
infile.read(reinterpret_cast(&bgen_LB),4);
bgen_B_allele.resize(bgen_LB);
infile.read(&bgen_B_allele[0], bgen_LB);
// Three 16-bit values per sample (6*bgen_N bytes).
// NOTE(review): the array length comes from the file at run time; a
// variable-length array is a compiler extension, not standard C++.
uint16_t unzipped_data[3*bgen_N];
// SNPs excluded by indicator_snp: skip the genotype payload without decoding.
// The payload length is read from the file when blocks are compressed and is
// 6*bgen_N otherwise; the ignore() call runs in both cases.
if (indicator_snp[t]==0) {
if(CompressedSNPBlocks)
infile.read(reinterpret_cast(&bgen_P),4);
else
bgen_P=6*bgen_N;
infile.ignore(static_cast(bgen_P));
continue;
}
if(CompressedSNPBlocks)
{
// Read bgen_P compressed bytes and inflate them into unzipped_data.
infile.read(reinterpret_cast(&bgen_P),4);
uint8_t zipped_data[bgen_P];
unzipped_data_size=6*bgen_N;
infile.read(reinterpret_cast(zipped_data),bgen_P);
int result=uncompress(reinterpret_cast(unzipped_data), reinterpret_cast(&unzipped_data_size), reinterpret_cast(zipped_data), static_cast (bgen_P));
// NOTE(review): the decompression result is only checked through assert.
assert(result == Z_OK);
}
else
{
// Uncompressed payload: read the 6*bgen_N bytes directly.
bgen_P=6*bgen_N;
infile.read(reinterpret_cast(unzipped_data),bgen_P);
}
x_mean=0.0; c_phen=0; n_miss=0;
gsl_vector_set_zero(x_miss);
// NOTE(review): truncated line -- the loop header over samples runs into the
// conversion of the first stored value. Each stored value is divided by
// 32768.0 to obtain a probability.
for (size_t i=0; i(unzipped_data[i*3])/32768.0;
bgen_geno_prob_AB=static_cast(unzipped_data[i*3+1])/32768.0;
bgen_geno_prob_BB=static_cast(unzipped_data[i*3+2])/32768.0;
// WJA
// A genotype whose three probabilities sum to less than 0.9 is counted as
// missing (x_miss entry 0); otherwise the probabilities are rescaled to sum
// to one and the dosage 2*P(BB)+P(AB) is stored in x (x_miss entry 1).
bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB;
if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
else {
bgen_geno_prob_AA/=bgen_geno_prob_non_miss;
bgen_geno_prob_AB/=bgen_geno_prob_non_miss;
bgen_geno_prob_BB/=bgen_geno_prob_non_miss;
geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB;
gsl_vector_set(x, c_phen, geno);
gsl_vector_set(x_miss, c_phen, 1.0);
x_mean+=geno;
}
c_phen++;
}
// Mean dosage over the non-missing genotypes.
x_mean/=static_cast(ni_test-n_miss);
// NOTE(review): truncated line -- the body of this loop over samples is
// missing (presumably it fills in the entries flagged in x_miss).
for (size_t i=0; i1) {
//gsl_vector_set(x, i, 2-geno);
//}
}
//calculate statistics
time_start=clock();
gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
// a_mode-50 is the test_mode argument of LmCalcP.
LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
// Add the elapsed CPU time, converted from clock ticks to minutes.
time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
//store summary data
SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
sumStat.push_back(SNPs);
}
// NOTE(review): this block is not intact. The first line below fuses the end
// of the preceding routine (`cout<`) with the middle of a later one, so the
// function header, the opening of the input file and the local declarations
// of the routine whose body follows are missing from this copy. The two `for`
// lines further down are cut off mid-statement as well. Restore the lost text
// before editing.
// What remains reads one text line per SNP from infile, builds the genotype
// vector x for SNPs kept by indicator_snp, and appends one SUMSTAT record per
// analysed SNP to sumStat.
cout<size1-(double)W->size2-1.0;
// Work space: x and x_miss have one entry per row of W; the remaining objects
// are sized by the number of columns of W.
gsl_vector *x=gsl_vector_alloc (W->size1);
gsl_vector *x_miss=gsl_vector_alloc (W->size1);
gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
gsl_vector *Wty=gsl_vector_alloc (W->size2);
gsl_vector *Wtx=gsl_vector_alloc (W->size2);
gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
// WtW = W'W, then WtWi from LUDecomp/LUInvert (presumably the inverse of W'W).
gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
int sig;
LUDecomp (WtW, pmt, &sig);
LUInvert (WtW, pmt, WtWi);
// Wty = W'y and yPwy depend only on the phenotype, so they are computed once.
gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
CalcvPv(WtWi, Wty, y, yPwy);
//start reading genotypes and analyze
// NOTE(review): truncated line -- the loop header over SNPs (index t) runs
// into the tail of a loop-exit test.
for (size_t t=0; t1) {break;}
getline(infile, line);
if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs ", t, ns_total-1);}
if (indicator_snp[t]==0) {continue;}
// Consume the first three tokens of the line (space, comma or tab separated);
// presumably the SNP id and the two alleles -- confirm against the file format.
// NOTE(review): strtok writes into the buffer returned by line.c_str(),
// whose constness is cast away here.
ch_ptr=strtok ((char *)line.c_str(), " , \t");
ch_ptr=strtok (NULL, " , \t");
ch_ptr=strtok (NULL, " , \t");
x_mean=0.0; c_phen=0; n_miss=0;
gsl_vector_set_zero(x_miss);
// NOTE(review): truncated line -- the per-individual parsing loop and
// whatever followed it are missing.
for (size_t i=0; i1) {
//gsl_vector_set(x, i, 2-geno);
//}
}
//calculate statistics
time_start=clock();
gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
// a_mode-50 is the test_mode argument of LmCalcP.
LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
// Add the elapsed CPU time, converted from clock ticks to minutes.
time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
//store summary data
SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
sumStat.push_back(SNPs);
}
// NOTE(review): this block is not intact. The first line below fuses the end
// of the preceding routine (`cout<`) with the tail of a declaration of `b`,
// so the function header, the opening of the input file and the declarations
// of ch and b are missing from this copy. The `for (vector::size_type t=0; ...`
// line is also cut off, taking the genotype-decoding loop with it. Restore the
// lost text before editing.
// What remains reads a binary genotype file through infile and appends one
// SUMSTAT record per pass of the SNP loop to sumStat.
cout< b;
double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
int n_bit, n_miss, ci_total, ci_test;
double geno, x_mean;
//calculate some basic quantities
double yPwy, xPwy, xPwx;
double df=(double)W->size1-(double)W->size2-1.0;
// Work space: x has one entry per row of W; the remaining objects are sized
// by the number of columns of W.
gsl_vector *x=gsl_vector_alloc (W->size1);
gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
gsl_vector *Wty=gsl_vector_alloc (W->size2);
gsl_vector *Wtx=gsl_vector_alloc (W->size2);
gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
// WtW = W'W, then WtWi from LUDecomp/LUInvert (presumably the inverse of W'W).
gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
int sig;
LUDecomp (WtW, pmt, &sig);
LUInvert (WtW, pmt, WtWi);
// Wty = W'y and yPwy depend only on the phenotype, so they are computed once.
gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
CalcvPv(WtWi, Wty, y, yPwy);
// n_bit = ni_total/4 rounded up (presumably the number of bytes per SNP
// record, at four genotypes per byte -- confirm against the file format).
if (ni_total%4==0) {n_bit=ni_total/4;}
else {n_bit=ni_total/4+1; }
// Read (not print) the first three bytes of the file, one at a time; the
// values are stored in b but not examined in the code visible here.
for (int i=0; i<3; ++i) {
infile.read(ch,1);
b=ch[0];
}
// NOTE(review): truncated line -- the loop header over SNPs and the
// genotype-decoding code are missing.
for (vector::size_type t=0; t1) {
//gsl_vector_set(x, i, 2-geno);
//}
}
//calculate statistics
time_start=clock();
gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx);
CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
// a_mode-50 is the test_mode argument of LmCalcP.
LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
//store summary data
SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
sumStat.push_back(SNPs);
// Add the elapsed CPU time, converted from clock ticks to minutes; unlike
// the sibling routines, the timed span here includes the push_back above.
time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
}
// NOTE(review): this block is not intact. The first line below fuses the end
// of the preceding routine (`cout<`) with the last parameter of this
// routine's signature, so the function name and its other parameters are
// missing from this copy; the `for` line is cut off as well. Restore the lost
// text before editing.
// For every column of the matrix X the body computes
//   log_lr = 0.5 * y->size * ( log(y'y) - log(y'y - (x'y)^2 / x'x) )
// and appends the pair (column index, log_lr) to pos_loglr.
cout< > &pos_loglr)
{
double yty, xty, xtx, log_lr;
// y'y does not depend on the column, so it is computed once.
gsl_blas_ddot(y, y, &yty);
// NOTE(review): truncated line -- the loop bound is presumably X->size2.
for (size_t i=0; isize2; ++i) {
// View of column i of X; no data is copied.
gsl_vector_const_view X_col=gsl_matrix_const_column (X, i);
gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx);
gsl_blas_ddot(&X_col.vector, y, &xty);
log_lr=0.5*(double)y->size*(log(yty)-log(yty-xty*xty/xtx));
pos_loglr.push_back(make_pair(i,log_lr) );
}
return;
}