From 17deca2d54827a00df3ea4d98df700fc2b8ed777 Mon Sep 17 00:00:00 2001
From: xiangzhou
Date: Sat, 20 Sep 2014 10:17:34 -0400
Subject: initial upload, version 0.95alpha

---
 bslmm.cpp    | 1927 ++++++++++++++++++++++++++++++
 bslmm.h      |  145 +++
 gemma.cpp    | 1856 +++++++++++++++++++++++++++++
 gemma.h      |   52 +
 gzstream.cpp |  165 +++
 gzstream.h   |  121 ++
 io.cpp       | 1396 ++++++++++++++++++++++
 io.h         |   79 ++
 lapack.cpp   |  609 ++++++++++
 lapack.h     |   53 +
 lm.cpp       |  571 +++++++++
 lm.h         |   74 ++
 lmm.cpp      | 1770 +++++++++++++++++++++++++++
 lmm.h        |  110 ++
 main.cpp     |   86 ++
 mathfunc.cpp |  310 +++++
 mathfunc.h   |   41 +
 mvlmm.cpp    | 3748 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mvlmm.h      |   93 ++
 param.cpp    |  849 +++++++++++++
 param.h      |  231 ++++
 prdt.cpp     |  543 +++++++++
 prdt.h       |   80 ++
 vc.cpp       |  443 +++++++
 vc.h         |   80 ++
 25 files changed, 15432 insertions(+)
 create mode 100644 bslmm.cpp
 create mode 100644 bslmm.h
 create mode 100644 gemma.cpp
 create mode 100644 gemma.h
 create mode 100644 gzstream.cpp
 create mode 100644 gzstream.h
 create mode 100644 io.cpp
 create mode 100644 io.h
 create mode 100644 lapack.cpp
 create mode 100644 lapack.h
 create mode 100644 lm.cpp
 create mode 100644 lm.h
 create mode 100644 lmm.cpp
 create mode 100644 lmm.h
 create mode 100644 main.cpp
 create mode 100644 mathfunc.cpp
 create mode 100644 mathfunc.h
 create mode 100644 mvlmm.cpp
 create mode 100644 mvlmm.h
 create mode 100644 param.cpp
 create mode 100644 param.h
 create mode 100644 prdt.cpp
 create mode 100644 prdt.h
 create mode 100644 vc.cpp
 create mode 100644 vc.h
diff --git a/bslmm.cpp b/bslmm.cpp
new file mode 100644
index 0000000..ff9618d
--- /dev/null
+++ b/bslmm.cpp
@@ -0,0 +1,1927 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <ctime>
+#include <cstring>
+#include <algorithm>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_eigen.h"
+#include "gsl/gsl_randist.h"
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_roots.h"
+
+
+
+
+#include "lapack.h"
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "bslmm_float.h"
+#include "lmm_float.h"  //for class FUNC_PARAM and MatrixCalcLR
+#include "lm_float.h"
+#include "mathfunc_float.h"  //for function CenterVector
+#else
+#include "param.h"
+#include "bslmm.h"
+#include "lmm.h"
+#include "lm.h"
+#include "mathfunc.h"
+#endif
+
+using namespace std;
+
+// Copy the run settings, data dimensions and SNP/individual bookkeeping from
+// cPar into this object; the UtZ/Omega timers and n_accept restart at zero.
+void BSLMM::CopyFromParam (PARAM &cPar)
+{
+	// analysis mode, progress pace and file names
+	a_mode = cPar.a_mode;
+	d_pace = cPar.d_pace;
+	file_bfile = cPar.file_bfile;
+	file_geno = cPar.file_geno;
+	file_out = cPar.file_out;
+	// NOTE(review): l_min/l_max are filled from cPar.h_min/cPar.h_max rather
+	// than from dedicated l_* fields -- confirm this is intended.
+	l_min = cPar.h_min;
+	l_max = cPar.h_max;
+	n_region = cPar.n_region;
+	pve_null = cPar.pve_null;
+	pheno_mean = cPar.pheno_mean;
+
+	// counters owned by this object always start from zero
+	time_UtZ = 0.0;
+	time_Omega = 0.0;
+	n_accept = 0;
+
+	// bounds and step scales for h, rho and log(pi)
+	h_min = cPar.h_min;
+	h_max = cPar.h_max;
+	h_scale = cPar.h_scale;
+	rho_min = cPar.rho_min;
+	rho_max = cPar.rho_max;
+	rho_scale = cPar.rho_scale;
+	logp_min = cPar.logp_min;
+	logp_max = cPar.logp_max;
+	logp_scale = cPar.logp_scale;
+	// sampler settings
+	s_min = cPar.s_min;
+	s_max = cPar.s_max;
+	w_step = cPar.w_step;
+	s_step = cPar.s_step;
+	r_pace = cPar.r_pace;
+	w_pace = cPar.w_pace;
+	n_mh = cPar.n_mh;
+	geo_mean = cPar.geo_mean;
+	randseed = cPar.randseed;
+	trace_G = cPar.trace_G;
+
+	// sample/SNP counts and inclusion indicators
+	ni_total = cPar.ni_total;
+	ns_total = cPar.ns_total;
+	ni_test = cPar.ni_test;
+	ns_test = cPar.ns_test;
+	n_cvt = cPar.n_cvt;
+	indicator_idv = cPar.indicator_idv;
+	indicator_snp = cPar.indicator_snp;
+	snpInfo = cPar.snpInfo;
+}
+// Report timings, the initial hyper-parameters, the acceptance count, the
+// phenotype mean and the random seed held by this object back into cPar.
+void BSLMM::CopyToParam (PARAM &cPar)
+{
+	// timers (accumulated in minutes by the clock() arithmetic elsewhere)
+	cPar.time_UtZ = time_UtZ;
+	cPar.time_Omega = time_Omega;
+	cPar.time_Proposal = time_Proposal;
+	// sampler state
+	cPar.cHyp_initial = cHyp_initial;
+	cPar.n_accept = n_accept;
+	cPar.pheno_mean = pheno_mean;
+	cPar.randseed = randseed;
+}
+
+// Write one line per individual to ./output/<file_out>.bv.txt: "NA" when
+// indicator_idv[i] is 0, otherwise the next entry of bv in scientific format.
+void BSLMM::WriteBV (const gsl_vector *bv)
+{
+	const string file_str="./output/"+file_out+".bv.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {
+		cout<<"error writing file: "<<file_str.c_str()<<endl;
+		return;
+	}
+
+	// k walks bv, which is only advanced for individuals with a non-zero indicator
+	size_t k=0;
+	for (size_t i=0; i<ni_total; ++i) {
+		if (indicator_idv[i]!=0) {
+			outfile<<scientific<<setprecision(6)<<gsl_vector_get(bv, k)<<endl;
+			++k;
+		} else {
+			outfile<<"NA"<<endl;
+		}
+	}
+
+	outfile.clear();
+	outfile.close();
+}
+
+// Write ./output/<file_out>.param.txt with one row per SNP whose indicator_snp
+// entry is non-zero. beta is beta_g.first divided by beta_g.second, gamma is
+// beta_g.second divided by w; both are written as 0 when beta_g.second is 0.
+void BSLMM::WriteParam (vector<pair<double, double> > &beta_g, const gsl_vector *alpha, const size_t w) 
+{
+	const string file_str="./output/"+file_out+".param.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {
+		cout<<"error writing file: "<<file_str.c_str()<<endl;
+		return;
+	}
+
+	outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"
+			<<"alpha"<<"\t"<<"beta"<<"\t"<<"gamma"<<endl;
+
+	// k indexes alpha and beta_g, which only advance for retained SNPs
+	size_t k=0;
+	for (size_t i=0; i<ns_total; ++i) {
+		if (indicator_snp[i]==0) {continue;}
+
+		outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"
+				<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t";
+		outfile<<scientific<<setprecision(6)<<gsl_vector_get(alpha, k)<<"\t";
+
+		const double n_in=beta_g[k].second;
+		if (n_in==0) {
+			outfile<<0.0<<"\t"<<0.0<<endl;
+		} else {
+			outfile<<beta_g[k].first/n_in<<"\t"<<n_in/(double)w<<endl;
+		}
+		++k;
+	}
+
+	outfile.clear();
+	outfile.close();
+}
+// Write ./output/<file_out>.param.txt from alpha alone; the beta and gamma
+// columns are written as 0.0 for every SNP with a non-zero indicator_snp.
+void BSLMM::WriteParam (const gsl_vector *alpha) 
+{
+	const string file_str="./output/"+file_out+".param.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {
+		cout<<"error writing file: "<<file_str.c_str()<<endl;
+		return;
+	}
+
+	outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"
+			<<"alpha"<<"\t"<<"beta"<<"\t"<<"gamma"<<endl;
+
+	// k indexes alpha, which only advances for retained SNPs
+	size_t k=0;
+	for (size_t i=0; i<ns_total; ++i) {
+		if (indicator_snp[i]!=0) {
+			outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"
+					<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t";
+			outfile<<scientific<<setprecision(6)<<gsl_vector_get(alpha, k)<<"\t";
+			outfile<<0.0<<"\t"<<0.0<<endl;
+			++k;
+		}
+	}
+
+	outfile.clear();
+	outfile.close();
+}
+// Write sampler output to ./output/<file_out>.hyp.txt and .gamma.txt.
+// flag==0 (re)creates both files and writes only their header lines; any other
+// flag appends rows 0..w-1 of the two buffers, where w is w_col, or w_pace when
+// w_col is 0. Column 4 of Result_hyp is passed through exp() before writing.
+void BSLMM::WriteResult (const int flag, const gsl_matrix *Result_hyp, const gsl_matrix *Result_gamma, const size_t w_col) 
+{
+	const string file_gamma="./output/"+file_out+".gamma.txt";
+	const string file_hyp="./output/"+file_out+".hyp.txt";
+	const bool header_only=(flag==0);
+
+	// the header pass truncates both files, every other pass appends to them
+	ofstream::openmode mode=ofstream::app;
+	if (header_only) {mode=ofstream::out;}
+	ofstream outfile_gamma, outfile_hyp;
+	outfile_gamma.open (file_gamma.c_str(), mode);
+	outfile_hyp.open (file_hyp.c_str(), mode);
+	if (!outfile_gamma) {
+		cout<<"error writing file: "<<file_gamma<<endl;
+		return;
+	}
+	if (!outfile_hyp) {
+		cout<<"error writing file: "<<file_hyp<<endl;
+		return;
+	}
+
+	if (header_only) {
+		outfile_hyp<<"h \t pve \t rho \t pge \t pi \t n_gamma"<<endl;
+		for (size_t j=0; j<s_max; ++j) {
+			outfile_gamma<<"s"<<j<<"\t";
+		}
+		outfile_gamma<<endl;
+	}
+	else {
+		// w_col==0 selects all w_pace rows of the buffers
+		const size_t w=(w_col==0 ? w_pace : w_col);
+
+		for (size_t row=0; row<w; ++row) {
+			outfile_hyp<<scientific;
+			for (size_t col=0; col<4; ++col) {
+				outfile_hyp<<setprecision(6)<<gsl_matrix_get (Result_hyp, row, col)<<"\t";
+			}
+			// column 4 is exponentiated (the "pi" column); column 5 is written as int
+			outfile_hyp<<setprecision(6)<<exp(gsl_matrix_get (Result_hyp, row, 4))<<"\t";
+			outfile_hyp<<(int)gsl_matrix_get (Result_hyp, row, 5)<<"\t";
+			outfile_hyp<<endl;
+		}
+
+		for (size_t row=0; row<w; ++row) {
+			for (size_t col=0; col<s_max; ++col) {
+				outfile_gamma<<(int)gsl_matrix_get (Result_gamma, row, col)<<"\t";
+			}
+			outfile_gamma<<endl;
+		}
+	}
+
+	outfile_hyp.close();
+	outfile_hyp.clear();
+	outfile_gamma.close();
+	outfile_gamma.clear();
+}
+
+// Fill p_gamma[0..ns_test-1] with 0.7 times a geometric pmf (p=1/geo_mean, at
+// i+1) plus 0.3/ns_test, then rescale the entries so that they sum to one.
+void BSLMM::CalcPgamma (double *p_gamma)
+{
+	const double geo_p=1.0/geo_mean;
+	const double unif_part=0.3/(double)ns_test;
+
+	double total=0.0;
+	for (size_t i=0; i<ns_test; ++i) {
+		p_gamma[i]=0.7*gsl_ran_geometric_pdf (i+1, geo_p)+unif_part;
+		total+=p_gamma[i];
+	}
+	for (size_t i=0; i<ns_test; ++i) {
+		p_gamma[i]=p_gamma[i]/total;
+	}
+}
+
+// For every entry of rank, copy into column i of Xgamma the column of X whose
+// index is obtained by looking rank[i] up in mapRank2pos.
+void BSLMM::SetXgamma (gsl_matrix *Xgamma, const gsl_matrix *X, vector<size_t> &rank)
+{
+	const size_t n_col=rank.size();
+	for (size_t col=0; col<n_col; ++col) {
+		// translate the rank into a column position of X
+		const size_t src=mapRank2pos[rank[col]];
+
+		gsl_vector_const_view from=gsl_matrix_const_column (X, src);
+		gsl_vector_view to=gsl_matrix_column (Xgamma, col);
+		gsl_vector_memcpy (&to.vector, &from.vector);
+	}
+}
+
+// Returns (Xty . OiXty)/(Uty . Uty) with X=UtXgamma, Xty=X'Uty and OiXty the
+// CholeskySolve solution for Omega = I/sigma_a2 + X'X and right-hand side Xty.
+double BSLMM::CalcPveLM (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const double sigma_a2) 
+{
+	const size_t n_snp=UtXgamma->size2;
+
+	gsl_matrix *Omega=gsl_matrix_alloc (n_snp, n_snp);
+	gsl_vector *Xty=gsl_vector_alloc (n_snp);
+	gsl_vector *OiXty=gsl_vector_alloc (n_snp);
+
+	// Omega = I/sigma_a2, then add X'X on top of it
+	gsl_matrix_set_identity (Omega);
+	gsl_matrix_scale (Omega, 1.0/sigma_a2);
+#ifdef WITH_LAPACK
+	lapack_dgemm ((char *)"T", (char *)"N", 1.0, UtXgamma, UtXgamma, 1.0, Omega);
+#else
+	gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, UtXgamma, UtXgamma, 1.0, Omega);
+#endif
+
+	// Xty = X'Uty; CholeskySolve (defined elsewhere) fills OiXty from Omega and Xty
+	gsl_blas_dgemv (CblasTrans, 1.0, UtXgamma, Uty, 0.0, Xty);
+	CholeskySolve(Omega, Xty, OiXty);
+
+	double explained, total;
+	gsl_blas_ddot (Xty, OiXty, &explained);
+	gsl_blas_ddot (Uty, Uty, &total);
+
+	gsl_matrix_free (Omega);
+	gsl_vector_free (Xty);
+	gsl_vector_free (OiXty);
+
+	return explained/total;
+}
+// Choose the chain's starting state (|gamma|, rank, pi, h, rho) from the per-SNP
+// statistics in pos_loglr, store it in cHyp and rank, and print it to stdout.
+void BSLMM::InitialMCMC (const gsl_matrix *UtX, const gsl_vector *Uty, vector<size_t> &rank, class HYPBSLMM &cHyp, vector<pair<size_t, double> > &pos_loglr)
+{
+	double q_genome=gsl_cdf_chisq_Qinv(0.05/(double)ns_test, 1);
+	// count the SNPs whose 2*pos_loglr[i].second exceeds the chi-square(1) cutoff
+	cHyp.n_gamma=0;
+	for (size_t i=0; i<pos_loglr.size(); ++i) {
+		if (2.0*pos_loglr[i].second>q_genome) {cHyp.n_gamma++;}
+	}
+	if (cHyp.n_gamma<10) {cHyp.n_gamma=10;}
+	// clamp to [s_min, s_max], then never start with more SNPs than are tested
+	if (cHyp.n_gamma>s_max) {cHyp.n_gamma=s_max;}
+	if (cHyp.n_gamma<s_min) {cHyp.n_gamma=s_min;}	
+	if (cHyp.n_gamma>ns_test) {cHyp.n_gamma=ns_test;}
+	rank.clear();
+	for (size_t i=0; i<cHyp.n_gamma; ++i) {
+		rank.push_back(i);
+	}
+	// log(pi)=log(|gamma|/ns_test) and h=pve_null, each nudged off an exact 0
+	cHyp.logp=log((double)cHyp.n_gamma/(double)ns_test);
+	cHyp.h=pve_null; 
+	
+	if (cHyp.logp==0) {cHyp.logp=-0.000001;}
+	if (cHyp.h==0) {cHyp.h=0.1;}
+	// rho = CalcPveLM on the selected columns divided by h, capped at 1 below
+	gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp.n_gamma);
+	SetXgamma (UtXgamma, UtX, rank);
+	double sigma_a2;
+	if (trace_G!=0) {
+	  sigma_a2=cHyp.h*1.0/(trace_G*(1-cHyp.h)*exp(cHyp.logp)*(double)ns_test);
+	} else {
+	  sigma_a2=cHyp.h*1.0/( (1-cHyp.h)*exp(cHyp.logp)*(double)ns_test);
+	}
+	if (sigma_a2==0) {sigma_a2=0.025;}	
+	cHyp.rho=CalcPveLM (UtXgamma, Uty, sigma_a2)/cHyp.h;
+	gsl_matrix_free (UtXgamma);
+	
+	if (cHyp.rho>1.0) {cHyp.rho=1.0;}
+	// force all three values into their configured ranges
+	if (cHyp.h<h_min) {cHyp.h=h_min;}
+	if (cHyp.h>h_max) {cHyp.h=h_max;}
+	if (cHyp.rho<rho_min) {cHyp.rho=rho_min;}
+	if (cHyp.rho>rho_max) {cHyp.rho=rho_max;}
+	if (cHyp.logp<logp_min) {cHyp.logp=logp_min;}
+	if (cHyp.logp>logp_max) {cHyp.logp=logp_max;}
+	
+	
+//	if (fix_sigma>=0) {
+//		fix_sigma=cHyp.h;
+//		rho_max=1-cHyp.h;
+//		cHyp.rho=rho_max/2.0;
+//	}
+	
+	//Initial for grid sampling:
+//	cHyp.h=0.225;
+//	cHyp.rho=1.0;
+//	cHyp.logp=-4.835429;
+	
+	cout<<"initial value of h = "<<cHyp.h<<endl;
+	cout<<"initial value of rho = "<<cHyp.rho<<endl;
+	cout<<"initial value of pi = "<<exp(cHyp.logp)<<endl;
+	cout<<"initial value of |gamma| = "<<cHyp.n_gamma<<endl;
+	
+	return;
+}
+// Posterior step for the model with no sparse effects (MCMC calls this when
+// n_gamma==0 or rho==0). Fills alpha_prime and Utu using draws from gsl_r, sets
+// cHyp.pve/pge when a_mode==11, and returns the value accumulated in logpost.
+double BSLMM::CalcPosterior (const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *Utu, gsl_vector *alpha_prime, class HYPBSLMM &cHyp)
+{
+	double sigma_b2=cHyp.h*(1.0-cHyp.rho)/(trace_G*(1-cHyp.h));
+	// NOTE(review): trace_G is a divisor here without the zero check InitialMCMC has.
+	gsl_vector *Utu_rand=gsl_vector_alloc (Uty->size);	
+	gsl_vector *weight_Hi=gsl_vector_alloc (Uty->size);
+	// per eigenvalue: final d=1/(1+K_eval[i]*sigma_b2) is the weight, ds=1-d
+	double logpost=0.0;
+	double d, ds, uy, Hi_yy=0, logdet_H=0.0;
+	for (size_t i=0; i<ni_test; ++i) {
+		d=gsl_vector_get (K_eval, i)*sigma_b2;
+		ds=d/(d+1.0);
+		d=1.0/(d+1.0);		
+		gsl_vector_set (weight_Hi, i, d);
+		// logdet_H accumulates -log(weight), Hi_yy the weighted squares of Uty
+		logdet_H-=log(d);
+		uy=gsl_vector_get (Uty, i);
+		Hi_yy+=d*uy*uy;
+		// random part of Utu: a unit normal draw scaled by sqrt(ds)
+		gsl_vector_set (Utu_rand, i, gsl_ran_gaussian(gsl_r, 1)*sqrt(ds));
+	}
+	
+	//sample tau (only when a_mode==11; otherwise tau stays 1)
+	double tau=1.0;
+	if (a_mode==11) {tau = gsl_ran_gamma (gsl_r, (double)ni_test/2.0,  2.0/Hi_yy); }
+	
+	//alpha_prime = sigma_b2 * weight_Hi .* Uty (no random draw is involved)
+	gsl_vector_memcpy (alpha_prime, Uty);
+	gsl_vector_mul (alpha_prime, weight_Hi);
+	gsl_vector_scale (alpha_prime, sigma_b2);
+	
+	//sample u: Utu = K_eval .* alpha_prime + Utu_rand (rescaled by sqrt(1/tau) if a_mode==11)
+	gsl_vector_memcpy (Utu, alpha_prime);
+	gsl_vector_mul (Utu, K_eval);
+	if (a_mode==11) {gsl_vector_scale (Utu_rand, sqrt(1.0/tau));}
+	gsl_vector_add (Utu, Utu_rand);	
+	
+	//for quantitative traits, calculate pve and pge
+	if (a_mode==11) {
+		gsl_blas_ddot (Utu, Utu, &d);
+		cHyp.pve=d/(double)ni_test;	
+		cHyp.pve/=cHyp.pve+1.0/tau;
+		cHyp.pge=0.0;	
+	}
+
+	//calculate likelihood
+	logpost=-0.5*logdet_H;
+	if (a_mode==11) {logpost-=0.5*(double)ni_test*log(Hi_yy);}
+	else {logpost-=0.5*Hi_yy;}
+	//add the terms that depend on n_gamma and logp
+	logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1-exp(cHyp.logp));
+	
+	gsl_vector_free (Utu_rand);
+	gsl_vector_free (weight_Hi);
+	
+	return logpost;
+}
+// Posterior step with sparse effects on the columns of UtXgamma: fills UtXb, Utu,
+// alpha_prime and beta using gsl_r, sets cHyp.pve/pge if a_mode==11, returns logpost.
+double BSLMM::CalcPosterior (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *UtXb, gsl_vector *Utu, gsl_vector *alpha_prime, gsl_vector *beta, class HYPBSLMM &cHyp)
+{
+	clock_t time_start;	
+	// NOTE(review): trace_G is used as a divisor below without a zero check -- confirm callers.
+	double sigma_a2=cHyp.h*cHyp.rho/(trace_G*(1-cHyp.h)*exp(cHyp.logp)*(double)ns_test);
+	double sigma_b2=cHyp.h*(1.0-cHyp.rho)/(trace_G*(1-cHyp.h));
+	
+	double logpost=0.0;
+	double d, ds, uy, P_yy=0, logdet_O=0.0, logdet_H=0.0;
+	
+	gsl_matrix *UtXgamma_eval=gsl_matrix_alloc (UtXgamma->size1, UtXgamma->size2);	
+	gsl_matrix *Omega=gsl_matrix_alloc (UtXgamma->size2, UtXgamma->size2);
+	gsl_vector *XtHiy=gsl_vector_alloc (UtXgamma->size2);
+	gsl_vector *beta_hat=gsl_vector_alloc (UtXgamma->size2);
+	gsl_vector *Utu_rand=gsl_vector_alloc (UtXgamma->size1);	
+	gsl_vector *weight_Hi=gsl_vector_alloc (UtXgamma->size1);
+	// UtXgamma_eval starts as a copy of UtXgamma; row i is scaled by its weight in the loop
+	gsl_matrix_memcpy (UtXgamma_eval, UtXgamma);
+	
+	logdet_H=0.0; P_yy=0.0;
+	for (size_t i=0; i<ni_test; ++i) {
+		gsl_vector_view UtXgamma_row=gsl_matrix_row (UtXgamma_eval, i);
+		d=gsl_vector_get (K_eval, i)*sigma_b2;
+		ds=d/(d+1.0);
+		d=1.0/(d+1.0);
+		gsl_vector_set (weight_Hi, i, d);
+		// logdet_H accumulates -log(weight), P_yy the weighted squares of Uty
+		logdet_H-=log(d);
+		uy=gsl_vector_get (Uty, i);
+		P_yy+=d*uy*uy;
+		gsl_vector_scale (&UtXgamma_row.vector, d);
+		// random part of Utu: a unit normal draw scaled by sqrt(ds)
+		gsl_vector_set (Utu_rand, i, gsl_ran_gaussian(gsl_r, 1)*sqrt(ds));
+	}
+	
+	//calculate Omega = I + sigma_a2 * UtXgamma_eval' * UtXgamma
+	gsl_matrix_set_identity (Omega);
+	
+	time_start=clock();
+#ifdef WITH_LAPACK
+	lapack_dgemm ((char *)"T", (char *)"N", sigma_a2, UtXgamma_eval, UtXgamma, 1.0, Omega);
+#else
+	gsl_blas_dgemm (CblasTrans, CblasNoTrans, sigma_a2, UtXgamma_eval, UtXgamma, 1.0, Omega);
+#endif	
+	time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	
+	
+	//calculate beta_hat
+	gsl_blas_dgemv (CblasTrans, 1.0, UtXgamma_eval, Uty, 0.0, XtHiy);	
+	// CholeskySolve is defined elsewhere; its return value is kept as logdet_O
+	logdet_O=CholeskySolve(Omega, XtHiy, beta_hat);
+	
+	gsl_vector_scale (beta_hat, sigma_a2);
+
+	gsl_blas_ddot (XtHiy, beta_hat, &d);
+	P_yy-=d;
+	
+	//sample tau
+	double tau=1.0;
+	if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0,  2.0/P_yy); }
+
+	//sample beta
+	for (size_t i=0; i<beta->size; i++)
+	{
+		d=gsl_ran_gaussian(gsl_r, 1); 
+		gsl_vector_set(beta, i, d); 
+	}
+	gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, beta); 
+	// NOTE(review): the dtrsv above reads Omega's upper triangle, so it presumably
+	// relies on CholeskySolve having left a triangular factor in Omega -- verify.
+	//it computes inv(L^T(Omega)) %*% beta;  
+	gsl_vector_scale(beta, sqrt(sigma_a2/tau));
+	gsl_vector_add(beta, beta_hat); 
+	gsl_blas_dgemv (CblasNoTrans, 1.0, UtXgamma, beta, 0.0, UtXb);
+	
+	//alpha_prime = sigma_b2 * weight_Hi .* (Uty - UtXb)
+	gsl_vector_memcpy (alpha_prime, Uty);
+	gsl_vector_sub (alpha_prime, UtXb);
+	gsl_vector_mul (alpha_prime, weight_Hi);
+	gsl_vector_scale (alpha_prime, sigma_b2);
+	
+	//sample u
+	gsl_vector_memcpy (Utu, alpha_prime);
+	gsl_vector_mul (Utu, K_eval);
+	
+	if (a_mode==11) {gsl_vector_scale (Utu_rand, sqrt(1.0/tau));}
+	gsl_vector_add (Utu, Utu_rand);	
+	
+	
+	//for quantitative traits, calculate pve and pge
+	if (a_mode==11) {
+		gsl_blas_ddot (UtXb, UtXb, &d);
+		cHyp.pge=d/(double)ni_test;
+	
+		gsl_blas_ddot (Utu, Utu, &d);
+		cHyp.pve=cHyp.pge+d/(double)ni_test;
+		
+		if (cHyp.pve==0) {cHyp.pge=0.0;}
+		else {cHyp.pge/=cHyp.pve;}
+		cHyp.pve/=cHyp.pve+1.0/tau;	
+	}	
+	
+
+	gsl_matrix_free (UtXgamma_eval);
+	gsl_matrix_free (Omega);
+	gsl_vector_free (XtHiy);
+	gsl_vector_free (beta_hat);
+	gsl_vector_free (Utu_rand);	
+	gsl_vector_free (weight_Hi);
+	
+	logpost=-0.5*logdet_H-0.5*logdet_O;
+	if (a_mode==11) {logpost-=0.5*(double)ni_test*log(P_yy);}
+	else {logpost-=0.5*P_yy;}
+//	else {logpost+=-0.5*P_yy*tau+0.5*(double)ni_test*log(tau);}
+	logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1.0-exp(cHyp.logp));
+	
+	return logpost;
+}
+
+
+// Case-control helper for a state without a sparse part: sets z_hat=U*Utu,
+// cHyp.pge=0 and cHyp.pve=v/(v+1) with v=(Utu . Utu)/ni_test.
+void BSLMM::CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp) 
+{
+	double utu_ss;
+	gsl_blas_ddot (Utu, Utu, &utu_ss);
+	cHyp.pve=utu_ss/(double)ni_test;
+
+	// rotate Utu back with U to obtain z_hat
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, z_hat);
+
+	// pve is turned into v/(v+1); there is no sparse contribution here
+	cHyp.pve/=cHyp.pve+1.0;
+	cHyp.pge=0.0;
+
+}
+
+// Case-control helper for a state with a sparse part: z_hat=U*(Utu+UtXb),
+// v=(UtXb . UtXb)/ni_test+(Utu . Utu)/ni_test, pge=sparse share of v, pve=v/(v+1).
+void BSLMM::CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *UtXb, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp) 
+{
+	double ss;
+	gsl_vector *sum=gsl_vector_alloc (Utu->size);
+
+	gsl_blas_ddot (UtXb, UtXb, &ss);
+	cHyp.pge=ss/(double)ni_test;
+	gsl_blas_ddot (Utu, Utu, &ss);
+	cHyp.pve=cHyp.pge+ss/(double)ni_test;
+
+	// z_hat = U * (Utu + UtXb), with the sum built in a scratch vector
+	gsl_vector_memcpy (sum, Utu);
+	gsl_vector_add (sum, UtXb);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, sum, 0.0, z_hat);
+
+	// pge becomes a proportion of the total; a zero total gives pge=0
+	if (cHyp.pve!=0) {cHyp.pge/=cHyp.pve;}
+	else {cHyp.pge=0.0;}
+	// finally pve = v/(v+1)
+	cHyp.pve/=cHyp.pve+1.0;
+
+	gsl_vector_free(sum);
+}
+
+
+
+
+// Draw each z[i] as z_hat[i] plus a unit normal from gsl_r, redrawing until the
+// value is <=0 when y[i]<=0 and >=0 otherwise.
+void BSLMM::SampleZ (const gsl_vector *y, const gsl_vector *z_hat, gsl_vector *z)
+{	
+	const size_t n=z->size;
+
+	for (size_t i=0; i<n; ++i) {
+		//y is centered for case control studies; y<=0 is treated as a control
+		const bool is_control=(gsl_vector_get (y, i)<=0.0);
+		const double mean=gsl_vector_get (z_hat, i);
+
+		// controls are right truncated at 0, the others left truncated at 0
+		double draw;
+		bool reject;
+		do {
+			draw=mean+gsl_ran_gaussian(gsl_r, 1.0);
+			if (is_control) {reject=(draw>0.0);}
+			else {reject=(draw<0.0);}
+		} while (reject);
+
+		gsl_vector_set (z, i, draw);
+	}
+}
+
+
+// Random-walk proposal for h and rho: `repeat` uniform steps of width
+// (max-min)*scale centred on the current value, reflected once at each bound.
+// The result is stored in cHyp_new; the returned log proposal ratio is always 0.
+double BSLMM::ProposeHnRho (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat)
+{
+	const double h_width=(h_max-h_min)*h_scale;
+	const double rho_width=(rho_max-rho_min)*rho_scale;
+
+	double h=cHyp_old.h;
+	double rho=cHyp_old.rho;
+	for (size_t step=0; step<repeat; ++step) {
+		// h is drawn before rho in every step, which fixes the use of gsl_r
+		h+=(gsl_rng_uniform(gsl_r)-0.5)*h_width;
+		if (h<h_min) {h=2*h_min-h;}
+		if (h>h_max) {h=2*h_max-h;}
+
+		rho+=(gsl_rng_uniform(gsl_r)-0.5)*rho_width;
+		if (rho<rho_min) {rho=2*rho_min-rho;}
+		if (rho>rho_max) {rho=2*rho_max-rho;}
+	}
+	/*
+	//Grid Sampling
+	for (size_t i=0; i<repeat; ++i) {
+		if (gsl_rng_uniform(gsl_r)<0.66) {continue;}
+		h=h+(gsl_rng_uniform_int(gsl_r, 2)-0.5)*0.1;
+		if (h<h_min) {h=h_max;}
+		if (h>h_max) {h=h_min;}
+	}
+	
+	for (size_t i=0; i<repeat; ++i) {
+		if (gsl_rng_uniform(gsl_r)<0.66) {continue;}
+		rho=rho+(gsl_rng_uniform_int(gsl_r, 2)-0.5)*0.1;
+		if (rho<rho_min) {rho=rho_max;}
+		if (rho>rho_max) {rho=rho_min;}
+	}
+	*/
+	cHyp_new.h=h;
+	cHyp_new.rho=rho;
+	return 0.0;
+}
+
+// Random-walk proposal on logp with reflection at logp_min/logp_max; stores the
+// final value in cHyp_new.logp and returns the summed (new-old) differences.
+double BSLMM::ProposePi (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat)
+{
+	const double width=min(0.1, (logp_max-logp_min)*logp_scale);
+
+	double cur=cHyp_old.logp;
+	double log_ratio=0.0;
+	for (size_t step=0; step<repeat; ++step) {
+		double next=cur+(gsl_rng_uniform(gsl_r)-0.5)*width;
+		if (next<logp_min) {next=2*logp_min-next;}
+		if (next>logp_max) {next=2*logp_max-next;}
+
+		log_ratio+=next-cur;
+		cur=next;
+	}
+	/*
+	//Grid Sampling
+	for (size_t i=0; i<repeat; ++i) {
+		if (gsl_rng_uniform(gsl_r)<0.66) {continue;}
+		next=cur+(gsl_rng_uniform_int(gsl_r, 2)-0.5)*0.5*log(10.0);
+		if (next<logp_min) {next=logp_max;}
+		if (next>logp_max) {next=logp_min;}	
+		
+		log_ratio+=next-cur;
+		cur=next;
+	}
+	*/
+	cHyp_new.logp=cur;
+
+	return log_ratio;
+}
+// Strict ascending order on indices; ProposeGamma passes it to stable_sort.
+bool comp_vec (size_t a, size_t b)
+{
+	return b > a;
+}
+// Propose a new SNP set by applying `repeat` random add/delete/switch moves to
+// rank_old. rank_new comes back sorted ascending; returns the log proposal ratio.
+double BSLMM::ProposeGamma (const vector<size_t> &rank_old, vector<size_t> &rank_new, const double *p_gamma, const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat)
+{
+	map<size_t, int> mapRank2in;
+	size_t r;
+	double unif, logp=0.0;
+	int flag_gamma;
+	size_t r_add, r_remove, col_id;
+	// start from a copy of the old set; mapRank2in marks the ranks currently in it
+	rank_new.clear();
+	if (cHyp_old.n_gamma!=rank_old.size()) {cout<<"size wrong"<<endl;}
+	
+	if (cHyp_old.n_gamma!=0) {
+		for (size_t i=0; i<rank_old.size(); ++i) {
+			r=rank_old[i];
+			rank_new.push_back(r);
+			mapRank2in[r]=1;
+		}
+	}
+	cHyp_new.n_gamma=cHyp_old.n_gamma;	
+	
+	for (size_t i=0; i<repeat; ++i) {
+		unif=gsl_rng_uniform(gsl_r); 
+		// move type: <0.40 add, <0.80 delete, else switch; no change when its bound is hit
+		if (unif < 0.40 && cHyp_new.n_gamma<s_max && cHyp_new.n_gamma<ns_test) {flag_gamma=1;}
+		else if (unif>=0.40 && unif < 0.80 && cHyp_new.n_gamma>s_min) {flag_gamma=2;}
+		else if (unif>=0.80 && cHyp_new.n_gamma>0 && cHyp_new.n_gamma<ns_test) {flag_gamma=3;}
+		else {flag_gamma=4;}
+		// add and switch need n_gamma<ns_test: otherwise no rank is free and the redraw loop cannot end
+		if(flag_gamma==1)  {//add a snp; 
+			do {
+				r_add=gsl_ran_discrete (gsl_r, gsl_t);
+			} while (mapRank2in.count(r_add)!=0); 
+			// prob_total = 1 minus the p_gamma mass of the ranks in the set before the add
+			double prob_total=1.0;
+			for (size_t j=0; j<cHyp_new.n_gamma; ++j) {
+				r=rank_new[j];
+				prob_total-=p_gamma[r];
+			}
+
+			mapRank2in[r_add]=1;
+			rank_new.push_back(r_add);
+			cHyp_new.n_gamma++;
+			logp+=-log(p_gamma[r_add]/prob_total)-log((double)cHyp_new.n_gamma);
+		}
+		else if (flag_gamma==2) {//delete a snp;
+			col_id=gsl_rng_uniform_int(gsl_r, cHyp_new.n_gamma);		
+			r_remove=rank_new[col_id];
+			// prob_total = 1 minus the p_gamma mass of the set once r_remove is taken out
+			double prob_total=1.0;
+			for (size_t j=0; j<cHyp_new.n_gamma; ++j) {
+				r=rank_new[j];
+				prob_total-=p_gamma[r];
+			}
+			prob_total+=p_gamma[r_remove];
+		
+			mapRank2in.erase(r_remove);
+			rank_new.erase(rank_new.begin()+col_id);
+			logp+=log(p_gamma[r_remove]/prob_total)+log((double)cHyp_new.n_gamma);
+			cHyp_new.n_gamma--;
+		}
+		else if (flag_gamma==3) {//switch a snp;
+			col_id=gsl_rng_uniform_int(gsl_r, cHyp_new.n_gamma);		
+			r_remove=rank_new[col_id];
+		//careful with the proposal
+			do {
+				r_add=gsl_ran_discrete (gsl_r, gsl_t);
+			} while (mapRank2in.count(r_add)!=0); 
+			// prob_total = 1 minus the p_gamma mass of the set before the switch
+			double prob_total=1.0;
+			for (size_t j=0; j<cHyp_new.n_gamma; ++j) {
+				r=rank_new[j];
+				prob_total-=p_gamma[r];
+			}
+			
+			logp+=log(p_gamma[r_remove]/(prob_total+p_gamma[r_remove]-p_gamma[r_add]) );
+			logp-=log(p_gamma[r_add]/prob_total);
+			
+			mapRank2in.erase(r_remove);
+			mapRank2in[r_add]=1;
+			rank_new.erase(rank_new.begin()+col_id);
+			rank_new.push_back(r_add);
+		}
+		else {logp+=0;}//do not change
+	}
+	
+	stable_sort (rank_new.begin(), rank_new.end(), comp_vec);
+
+	mapRank2in.clear();
+	return logp;
+}
+
+
+
+
+// Orders (index, value) pairs by descending value; MCMC passes it to
+// stable_sort to rank the entries of pos_loglr.
+bool comp_lr (pair<size_t, double> a, pair<size_t, double> b)
+{
+	return b.second < a.second;
+}
+
+
+
+
+// Run the sampler: rank SNPs via MatrixCalcLR, initialise the chain, then do
+// w_step+s_step iterations of n_mh Metropolis-Hastings updates each. From iteration
+// w_step on, states with t%r_pace==0 are saved; results are written under ./output/.
+//if a_mode==13 then Uty==y
+void BSLMM::MCMC (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y) {
+	clock_t time_start;	
+
+	class HYPBSLMM cHyp_old, cHyp_new;
+	// buffers for w_pace saved states: 6 hyper-parameter columns and s_max SNP slots
+	gsl_matrix *Result_hyp=gsl_matrix_alloc (w_pace, 6);
+	gsl_matrix *Result_gamma=gsl_matrix_alloc (w_pace, s_max);	
+	
+	gsl_vector *alpha_prime=gsl_vector_alloc (ni_test);		
+	gsl_vector *alpha_new=gsl_vector_alloc (ni_test);
+	gsl_vector *alpha_old=gsl_vector_alloc (ni_test);	
+	gsl_vector *Utu=gsl_vector_alloc (ni_test);
+	gsl_vector *Utu_new=gsl_vector_alloc (ni_test);
+	gsl_vector *Utu_old=gsl_vector_alloc (ni_test);
+	
+	gsl_vector *UtXb_new=gsl_vector_alloc (ni_test);
+	gsl_vector *UtXb_old=gsl_vector_alloc (ni_test);
+	
+	gsl_vector *z_hat=gsl_vector_alloc (ni_test);
+	gsl_vector *z=gsl_vector_alloc (ni_test);
+	gsl_vector *Utz=gsl_vector_alloc (ni_test);	
+
+	gsl_vector_memcpy (Utz, Uty);			
+	
+	double logPost_new, logPost_old;
+	double logMHratio;
+	double mean_z=0.0;	
+	
+	gsl_matrix_set_zero (Result_gamma);
+	gsl_vector_set_zero (Utu);
+	gsl_vector_set_zero (alpha_prime);
+	if (a_mode==13) {
+		pheno_mean=0.0;
+	}
+	// beta_g[j] = (sum of saved beta values, number of saved states containing column j)
+	vector<pair<double, double> > beta_g;
+	for (size_t i=0; i<ns_test; i++) {
+		beta_g.push_back(make_pair(0.0, 0.0));
+	}
+	
+	vector<size_t> rank_new, rank_old;
+	vector<double> beta_new, beta_old;	
+
+	vector<pair<size_t, double> > pos_loglr;
+	// fill pos_loglr via MatrixCalcLR, sort it largest first, and record rank -> position
+	time_start=clock();
+	MatrixCalcLR (U, UtX, Utz, K_eval, l_min, l_max, n_region, pos_loglr);
+	time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+	stable_sort (pos_loglr.begin(), pos_loglr.end(), comp_lr);
+	for (size_t i=0; i<ns_test; ++i) {
+		mapRank2pos[i]=pos_loglr[i].first;
+	}
+	
+	//calculate proposal distribution for gamma (unnormalized), and set up gsl_r and gsl_t			
+	gsl_rng_env_setup();                
+	const gsl_rng_type * gslType;                                               
+	gslType = gsl_rng_default; 
+	if (randseed<0)
+	{
+		time_t rawtime;
+		time (&rawtime);
+		tm * ptm = gmtime (&rawtime);
+		// negative seed: use the number of seconds since midnight (UTC) instead
+		randseed = (unsigned) (ptm->tm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec);
+	}
+	gsl_r = gsl_rng_alloc(gslType); 
+	gsl_rng_set(gsl_r, randseed);
+	// NOTE(review): gsl_r and gsl_t are allocated here and not released in this function -- confirm who frees them.
+	double *p_gamma = new double[ns_test]; 
+	CalcPgamma (p_gamma);
+	
+	gsl_t=gsl_ran_discrete_preproc (ns_test, p_gamma);
+	
+	//initial parameters
+	InitialMCMC (UtX, Utz, rank_old, cHyp_old, pos_loglr);
+//	if (fix_sigma>=0) {
+//		rho_max=1-fix_sigma;
+//		cHyp_old.h=fix_sigma/(1-cHyp_old.rho);
+//	}
+	
+	cHyp_initial=cHyp_old;
+	// evaluate the starting state; the short CalcPosterior overload handles an empty sparse part
+	if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) {
+		logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old);
+
+		beta_old.clear();
+		for (size_t i=0; i<cHyp_old.n_gamma; ++i) {
+		  beta_old.push_back(0);
+		}	
+	}
+	else {
+		gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp_old.n_gamma);
+		gsl_vector *beta=gsl_vector_alloc (cHyp_old.n_gamma);
+		SetXgamma (UtXgamma, UtX, rank_old);		
+		logPost_old=CalcPosterior(UtXgamma, Utz, K_eval, UtXb_old, Utu_old, alpha_old, beta, cHyp_old);
+	
+		beta_old.clear();
+		for (size_t i=0; i<beta->size; ++i) {
+			beta_old.push_back(gsl_vector_get(beta, i));
+		}	
+		gsl_matrix_free (UtXgamma);
+		gsl_vector_free (beta);
+	}	
+	
+	//calculate centered z_hat, and pve
+	if (a_mode==13) {
+		time_start=clock();
+		if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) {
+			CalcCC_PVEnZ (U, Utu_old, z_hat, cHyp_old);
+		}
+		else {
+			CalcCC_PVEnZ (U, UtXb_old, Utu_old, z_hat, cHyp_old);
+		}
+		time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	}
+	
+	//start MCMC
+	int accept;
+	size_t total_step=w_step+s_step;
+	size_t w=0, w_col, pos;
+	size_t repeat=0;
+	// w counts the saved states; w_col is the buffer row the next one goes to
+	for (size_t t=0; t<total_step; ++t) {
+		if (t%d_pace==0 || t==total_step-1) {ProgressBar ("Running MCMC ", t, total_step-1, (double)n_accept/(double)(t*n_mh+1));}
+//		if (t>10) {break;}		
+
+		if (a_mode==13) {			
+			SampleZ (y, z_hat, z);		
+			mean_z=CenterVector (z);	
+			
+			time_start=clock();
+			gsl_blas_dgemv (CblasTrans, 1.0, U, z, 0.0, Utz);
+			time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+			//First proposal
+			if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) {				
+				logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old);
+				beta_old.clear();
+				for (size_t i=0; i<cHyp_old.n_gamma; ++i) {
+				  beta_old.push_back(0);
+				}	
+			}
+			else {
+				gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp_old.n_gamma);
+				gsl_vector *beta=gsl_vector_alloc (cHyp_old.n_gamma);
+				SetXgamma (UtXgamma, UtX, rank_old);
+				logPost_old=CalcPosterior(UtXgamma, Utz, K_eval, UtXb_old, Utu_old, alpha_old, beta, cHyp_old);
+				
+				beta_old.clear();
+				for (size_t i=0; i<beta->size; ++i) {
+					beta_old.push_back(gsl_vector_get(beta, i));
+				}
+				gsl_matrix_free (UtXgamma);
+				gsl_vector_free (beta);
+			}
+		}
+		
+		//MH steps
+		for (size_t i=0; i<n_mh; ++i) {
+			if (gsl_rng_uniform(gsl_r)<0.33) {repeat = 1+gsl_rng_uniform_int(gsl_r, 20);}
+			else {repeat=1;}
+			// the three proposals share `repeat`; their log ratios are summed
+			logMHratio=0.0;
+			logMHratio+=ProposeHnRho(cHyp_old, cHyp_new, repeat);		
+			logMHratio+=ProposeGamma (rank_old, rank_new, p_gamma, cHyp_old, cHyp_new, repeat);	
+			logMHratio+=ProposePi(cHyp_old, cHyp_new, repeat);
+			
+//			if (fix_sigma>=0) {
+//				cHyp_new.h=fix_sigma/(1-cHyp_new.rho);
+//			}
+			
+			if (cHyp_new.n_gamma==0 || cHyp_new.rho==0) {
+				logPost_new=CalcPosterior(Utz, K_eval, Utu_new, alpha_new, cHyp_new);
+				beta_new.clear();
+				for (size_t i=0; i<cHyp_new.n_gamma; ++i) {
+				  beta_new.push_back(0);
+				}	
+			}
+			else {
+				gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp_new.n_gamma);
+				gsl_vector *beta=gsl_vector_alloc (cHyp_new.n_gamma);
+				SetXgamma (UtXgamma, UtX, rank_new);
+				logPost_new=CalcPosterior(UtXgamma, Utz, K_eval, UtXb_new, Utu_new, alpha_new, beta, cHyp_new);
+				beta_new.clear();
+				for (size_t i=0; i<beta->size; ++i) {
+					beta_new.push_back(gsl_vector_get(beta, i));
+				}
+				gsl_matrix_free (UtXgamma);
+				gsl_vector_free (beta);
+			}	
+			
+			logMHratio+=logPost_new-logPost_old;		
+			// accept when the log ratio is positive or beats the log of a uniform draw
+			if (logMHratio>0 || log(gsl_rng_uniform(gsl_r))<logMHratio) {accept=1; n_accept++;}
+			else {accept=0;}
+
+			if (accept==1) {			
+				logPost_old=logPost_new;
+				rank_old.clear(); beta_old.clear();
+				if (rank_new.size()!=0) {
+					for (size_t i=0; i<rank_new.size(); ++i) {
+						rank_old.push_back(rank_new[i]);
+						beta_old.push_back(beta_new[i]);
+					}
+				}
+				cHyp_old=cHyp_new;
+				gsl_vector_memcpy (alpha_old, alpha_new);
+				gsl_vector_memcpy (UtXb_old, UtXb_new);
+				gsl_vector_memcpy (Utu_old, Utu_new);
+			}
+			else {cHyp_new=cHyp_old;}
+		}				
+		
+		//calculate z_hat, and pve
+		if (a_mode==13) {
+			time_start=clock();
+			if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) {
+				CalcCC_PVEnZ (U, Utu_old, z_hat, cHyp_old);
+			}
+			else {
+				CalcCC_PVEnZ (U, UtXb_old, Utu_old, z_hat, cHyp_old);
+			}
+			
+			//sample mu and update z hat
+			gsl_vector_sub (z, z_hat);
+			mean_z+=CenterVector(z);
+			mean_z+=gsl_ran_gaussian(gsl_r, sqrt(1.0/(double) ni_test) );			
+			
+			gsl_vector_add_constant (z_hat, mean_z);
+			
+			time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		}
+		
+		//Save data
+		if (t<w_step) {continue;}
+		else {		
+			if (t%r_pace==0) {
+				w_col=w%w_pace;
+				if (w_col==0) {
+					if (w==0) {WriteResult (0, Result_hyp, Result_gamma, w_col);}					
+					else {
+						WriteResult (1, Result_hyp, Result_gamma, w_col);
+						gsl_matrix_set_zero (Result_hyp);
+						gsl_matrix_set_zero (Result_gamma);
+					}
+				}
+				
+				gsl_matrix_set (Result_hyp, w_col, 0, cHyp_old.h);
+				gsl_matrix_set (Result_hyp, w_col, 1, cHyp_old.pve);
+				gsl_matrix_set (Result_hyp, w_col, 2, cHyp_old.rho);
+				gsl_matrix_set (Result_hyp, w_col, 3, cHyp_old.pge);
+				gsl_matrix_set (Result_hyp, w_col, 4, cHyp_old.logp);
+				gsl_matrix_set (Result_hyp, w_col, 5, cHyp_old.n_gamma);
+				// store the selected positions 1-based and accumulate their betas and counts
+				for (size_t i=0; i<cHyp_old.n_gamma; ++i) {
+					pos=mapRank2pos[rank_old[i]]+1;
+
+					gsl_matrix_set (Result_gamma, w_col, i, pos);
+					
+					beta_g[pos-1].first+=beta_old[i];
+					beta_g[pos-1].second+=1.0;	
+				}
+				
+				gsl_vector_add (alpha_prime, alpha_old);
+				gsl_vector_add (Utu, Utu_old);
+				
+				if (a_mode==13) {
+					pheno_mean+=mean_z;
+				}
+				
+				w++;
+				
+			}
+			
+		}
+	}
+	cout<<endl;
+	// flush the rows still buffered; if no state was ever saved only the headers are written
+	w_col=w%w_pace;
+	if (w==0) {WriteResult (0, Result_hyp, Result_gamma, w_col);}
+	else {WriteResult (1, Result_hyp, Result_gamma, w_col);}
+	gsl_matrix_free(Result_hyp);
+	gsl_matrix_free(Result_gamma);	
+	
+	gsl_vector_free(z_hat);
+	gsl_vector_free(z);
+	gsl_vector_free(Utz);	
+	gsl_vector_free(UtXb_new);	
+	gsl_vector_free(UtXb_old);
+	gsl_vector_free(alpha_new);	
+	gsl_vector_free(alpha_old);
+	gsl_vector_free(Utu_new);	
+	gsl_vector_free(Utu_old);	
+	// turn the running sums into means; skipped when w==0 to avoid dividing by zero
+	if (w>0) {gsl_vector_scale (alpha_prime, 1.0/(double)w);}
+	if (w>0) {gsl_vector_scale (Utu, 1.0/(double)w);}
+	if (a_mode==13 && w>0) {
+		pheno_mean/=(double)w;
+	}
+	
+	gsl_vector *alpha=gsl_vector_alloc (ns_test);
+	gsl_blas_dgemv (CblasTrans, 1.0/(double)ns_test, UtX, alpha_prime, 0.0, alpha);	
+	WriteParam (beta_g, alpha, w);
+	gsl_vector_free(alpha);
+	// alpha_prime is reused as the output buffer for U*Utu before WriteBV
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, alpha_prime);
+	WriteBV(alpha_prime);	
+	
+	gsl_vector_free(alpha_prime);				
+	gsl_vector_free(Utu);	
+		
+	delete [] p_gamma;
+	beta_g.clear();
+	
+	return;
+}
+
+
+
+//Computes SNP effects and fitted values from the rotated inputs for a fixed lambda:
+//  weights = lambda*eval + 1 (element-wise)
+//  fitted  = Uty / weights (element-wise)
+//  coef    = (lambda/UtX->size2) * t(UtX) * fitted    -> passed to WriteParam
+//  bv      = U * ((weights-1) * fitted)               -> passed to WriteBV
+void BSLMM::RidgeR(const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *eval, const double lambda)
+{
+	const size_t n_col=UtX->size2;
+	const size_t n_row=Uty->size;
+
+	//weights = lambda*eval + 1
+	gsl_vector *weights=gsl_vector_alloc (n_row);
+	gsl_vector_memcpy (weights, eval);
+	gsl_vector_scale (weights, lambda);
+	gsl_vector_add_constant (weights, 1.0);
+
+	//fitted = Uty / weights
+	gsl_vector *fitted=gsl_vector_alloc (n_row);
+	gsl_vector_memcpy (fitted, Uty);
+	gsl_vector_div (fitted, weights);
+
+	//coef = (lambda/n_col) * t(UtX) * fitted
+	gsl_vector *coef=gsl_vector_alloc (n_col);
+	gsl_blas_dgemv (CblasTrans, lambda/(double)n_col, UtX, fitted, 0.0, coef);
+
+	//remove the +1 again, multiply element-wise by fitted, then rotate with U;
+	//fitted is reused as the output buffer of the last product
+	gsl_vector_add_constant (weights, -1.0);
+	gsl_vector_mul (weights, fitted);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, weights, 0.0, fitted);
+
+	WriteParam (coef);
+	WriteBV(fitted);
+
+	gsl_vector_free (weights);
+	gsl_vector_free (coef);
+	gsl_vector_free (fitted);
+}
+ 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//below fits MCMC for rho=1
+//Compute XtX=t(X)X and Xty=t(X)y using only the first s_size columns of X.
+//The results are written into the leading s_size x s_size block of XtX and
+//the first s_size entries of Xty; all other entries are left untouched.
+//The elapsed time, converted to minutes, is added to time_Omega.
+void BSLMM::CalcXtX (const gsl_matrix *X, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX, gsl_vector *Xty)
+{
+  //clock() returns clock_t (not time_t), matching the other timers in this file
+  clock_t time_start=clock();
+  gsl_matrix_const_view X_sub=gsl_matrix_const_submatrix(X, 0, 0, X->size1, s_size);
+  gsl_matrix_view XtX_sub=gsl_matrix_submatrix(XtX, 0, 0, s_size, s_size);
+  gsl_vector_view Xty_sub=gsl_vector_subvector(Xty, 0, s_size);
+
+  //t(X_sub) X_sub
+#ifdef WITH_LAPACK
+  lapack_dgemm ((char *)"T", (char *)"N", 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix);
+#else
+  gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix);
+#endif
+  //t(X_sub) y
+  gsl_blas_dgemv(CblasTrans, 1.0, &X_sub.matrix, y, 0.0, &Xty_sub.vector);
+
+  time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+  return;
+}
+
+
+//Incrementally build X_new, XtX_new=t(X_new)X_new and Xty_new=t(X_new)y for the SNP set
+//rank_new, reusing what is already stored for rank_old and computing only the
+//products that involve newly added SNPs.
+//  X                       : matrix handed to SetXgamma(X_add, X, rank_add) to extract the added columns
+//  X_old, XtX_old, Xty_old : values for rank_old; only the leading rank_old.size() columns/entries are read
+//  y                       : vector used for t(X_add)y
+//  rank_old, rank_new      : must be sorted ascending (set_difference/set_union require it)
+//  X_new, XtX_new, Xty_new : outputs; the leading rank_new.size() columns/entries are written
+//Time spent in the matrix products of the "add" branch is accumulated in time_Omega (minutes).
+void BSLMM::SetXgamma (const gsl_matrix *X, const gsl_matrix *X_old, const gsl_matrix *XtX_old, const gsl_vector *Xty_old, const gsl_vector *y, const vector<size_t> &rank_old, const vector<size_t> &rank_new, gsl_matrix *X_new, gsl_matrix *XtX_new, gsl_vector *Xty_new)
+{
+  double d;
+
+  //rank_old and rank_new are sorted already inside ProposeGamma
+  //calculate vectors rank_remove, rank_add and rank_union; the buffers are sized from
+  //the inputs (not from a fixed constant) so the set operations can never write past their end
+  vector<size_t> rank_remove(rank_old.size()), rank_add(rank_new.size()), rank_union(rank_old.size()+rank_new.size());
+  vector<size_t>::iterator it;
+
+  it=set_difference (rank_old.begin(), rank_old.end(), rank_new.begin(), rank_new.end(), rank_remove.begin());
+  rank_remove.resize(it-rank_remove.begin());
+
+  it=set_difference (rank_new.begin(), rank_new.end(), rank_old.begin(), rank_old.end(), rank_add.begin());
+  rank_add.resize(it-rank_add.begin());
+
+  it=set_union (rank_new.begin(), rank_new.end(), rank_old.begin(), rank_old.end(), rank_union.begin());
+  rank_union.resize(it-rank_union.begin());
+
+  //map rank_remove and rank_add
+  map<size_t, int> mapRank2in_remove, mapRank2in_add;
+  for (size_t i=0; i<rank_remove.size(); i++) {
+    mapRank2in_remove[rank_remove[i]]=1;
+  }
+  for (size_t i=0; i<rank_add.size(); i++) {
+    mapRank2in_add[rank_add[i]]=1;
+  }
+
+  //obtain the subset of matrix/vector
+  gsl_matrix_const_view Xold_sub=gsl_matrix_const_submatrix(X_old, 0, 0, X_old->size1, rank_old.size());
+  gsl_matrix_const_view XtXold_sub=gsl_matrix_const_submatrix(XtX_old, 0, 0, rank_old.size(), rank_old.size());
+  gsl_vector_const_view Xtyold_sub=gsl_vector_const_subvector(Xty_old, 0, rank_old.size());
+
+  gsl_matrix_view Xnew_sub=gsl_matrix_submatrix(X_new, 0, 0, X_new->size1, rank_new.size());
+  gsl_matrix_view XtXnew_sub=gsl_matrix_submatrix(XtX_new, 0, 0, rank_new.size(), rank_new.size());
+  gsl_vector_view Xtynew_sub=gsl_vector_subvector(Xty_new, 0, rank_new.size());
+
+  //get X_new and calculate XtX_new
+  if (rank_remove.size()==0 && rank_add.size()==0) {
+    //identical sets: plain copy
+    gsl_matrix_memcpy(&Xnew_sub.matrix, &Xold_sub.matrix);
+    gsl_matrix_memcpy(&XtXnew_sub.matrix, &XtXold_sub.matrix);
+    gsl_vector_memcpy(&Xtynew_sub.vector, &Xtyold_sub.vector);
+  } else {
+    size_t i_old, j_old, i_new, j_new, i_add, j_add, i_flag, j_flag;
+    if (rank_add.size()==0) {
+      //removal only: rank_union equals rank_old, so i_old advances once per
+      //iteration and always stays inside rank_old
+      i_old=0; i_new=0;
+      for (size_t i=0; i<rank_union.size(); i++) {
+        if (mapRank2in_remove.count(rank_old[i_old])!=0) {i_old++; continue;}
+
+        gsl_vector_view Xnew_col=gsl_matrix_column(X_new, i_new);
+        gsl_vector_const_view Xcopy_col=gsl_matrix_const_column(X_old, i_old);
+        gsl_vector_memcpy (&Xnew_col.vector, &Xcopy_col.vector);
+
+        d=gsl_vector_get (Xty_old, i_old);
+        gsl_vector_set (Xty_new, i_new, d);
+
+        //fill the upper triangle of row i_new and mirror it
+        j_old=i_old; j_new=i_new;
+        for (size_t j=i; j<rank_union.size(); j++) {
+          if (mapRank2in_remove.count(rank_old[j_old])!=0) {j_old++; continue;}
+
+          d=gsl_matrix_get(XtX_old, i_old, j_old);
+
+          gsl_matrix_set (XtX_new, i_new, j_new, d);
+          if (i_new!=j_new) {gsl_matrix_set (XtX_new, j_new, i_new, d);}
+
+          j_old++; j_new++;
+        }
+        i_old++; i_new++;
+      }
+    } else {
+      gsl_matrix *X_add=gsl_matrix_alloc(X_old->size1, rank_add.size() );
+      gsl_matrix *XtX_aa=gsl_matrix_alloc(X_add->size2, X_add->size2);
+      //only the leading rank_old.size() columns of X_old hold data for rank_old,
+      //so the cross product is restricted to them
+      gsl_matrix *XtX_ao=gsl_matrix_alloc(X_add->size2, rank_old.size());
+      gsl_vector *Xty_add=gsl_vector_alloc(X_add->size2);
+
+      //get X_add
+      SetXgamma (X_add, X, rank_add);
+
+      //get t(X_add)X_add, t(X_add)X_old and t(X_add)y
+      clock_t time_start=clock();
+
+      //somehow the lapack_dgemm does not work here
+      gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_add, 0.0, XtX_aa);
+      gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, &Xold_sub.matrix, 0.0, XtX_ao);
+      gsl_blas_dgemv(CblasTrans, 1.0, X_add, y, 0.0, Xty_add);
+
+      time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+      //save to X_new, XtX_new and Xty_new
+      //each iteration either skips one removed old SNP or emits one new column;
+      //rank_union.size() == (#removed) + rank_new.size(), so the loop count matches
+      i_old=0; i_new=0; i_add=0;
+      for (size_t i=0; i<rank_union.size(); i++) {
+        //guard the index: once all old SNPs are consumed, rank_old[i_old] would read
+        //past the end and a stale value there could wrongly match the remove map
+        if (i_old<rank_old.size() && mapRank2in_remove.count(rank_old[i_old])!=0) {i_old++; continue;}
+        //defensive: cannot trigger when both rank vectors are sorted and duplicate-free
+        if (i_new>=rank_new.size()) {break;}
+        if (mapRank2in_add.count(rank_new[i_new])!=0) {i_flag=1;} else {i_flag=0;}
+
+        gsl_vector_view Xnew_col=gsl_matrix_column(X_new, i_new);
+        if (i_flag==1) {
+          gsl_vector_view Xcopy_col=gsl_matrix_column(X_add, i_add);
+          gsl_vector_memcpy (&Xnew_col.vector, &Xcopy_col.vector);
+        } else {
+          gsl_vector_const_view Xcopy_col=gsl_matrix_const_column(X_old, i_old);
+          gsl_vector_memcpy (&Xnew_col.vector, &Xcopy_col.vector);
+        }
+
+        if (i_flag==1) {
+          d=gsl_vector_get (Xty_add, i_add);
+        } else {
+          d=gsl_vector_get (Xty_old, i_old);
+        }
+        gsl_vector_set (Xty_new, i_new, d);
+
+        //fill the upper triangle of row i_new and mirror it
+        j_old=i_old; j_new=i_new; j_add=i_add;
+        for (size_t j=i; j<rank_union.size(); j++) {
+          if (j_old<rank_old.size() && mapRank2in_remove.count(rank_old[j_old])!=0) {j_old++; continue;}
+          if (j_new>=rank_new.size()) {break;}
+          if (mapRank2in_add.count(rank_new[j_new])!=0) {j_flag=1;} else {j_flag=0;}
+
+          if (i_flag==1 && j_flag==1) {
+            d=gsl_matrix_get(XtX_aa, i_add, j_add);
+          } else if (i_flag==1) {
+            d=gsl_matrix_get(XtX_ao, i_add, j_old);
+          } else if (j_flag==1) {
+            d=gsl_matrix_get(XtX_ao, j_add, i_old);
+          } else {
+            d=gsl_matrix_get(XtX_old, i_old, j_old);
+          }
+
+          gsl_matrix_set (XtX_new, i_new, j_new, d);
+          if (i_new!=j_new) {gsl_matrix_set (XtX_new, j_new, i_new, d);}
+
+          j_new++; if (j_flag==1) {j_add++;} else {j_old++;}
+        }
+        i_new++; if (i_flag==1) {i_add++;} else {i_old++;}
+      }
+
+      gsl_matrix_free(X_add);
+      gsl_matrix_free(XtX_aa);
+      gsl_matrix_free(XtX_ao);
+      gsl_vector_free(Xty_add);
+    }
+
+  }
+
+  return;
+}
+
+
+//Log posterior (up to a constant) when no SNP is included in the model.
+//  yty  : t(y)y of the current response
+//  cHyp : hyper-parameters; for a_mode==11 pve is reset to 0 and pge to 1 here
+//         (pve and pge for case/control data are calculated in CalcCC_PVEnZ)
+//Returns the likelihood term plus the term that depends on logp and n_gamma.
+double BSLMM::CalcPosterior (const double yty, class HYPBSLMM &cHyp)
+{
+	double result=0.0;
+
+	if (a_mode==11) {
+		//quantitative trait
+		cHyp.pve=0.0;
+		cHyp.pge=1.0;
+		result-=0.5*(double)ni_test*log(yty);
+	} else {
+		result-=0.5*yty;
+	}
+
+	//contribution of logp given the number of selected SNPs
+	const double prior_term=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1-exp(cHyp.logp));
+	result+=prior_term;
+
+	return result;
+}
+
+
+//Log posterior (up to a constant) for a model containing the first s_size columns of Xgamma,
+//and a random draw of the corresponding effects.
+//  Xgamma, XtX, Xty : design matrix, t(X)X and t(X)y; only the leading s_size columns/entries are read
+//  yty              : t(y)y
+//  Xb               : output, Xgamma_sub * sampled beta
+//  beta             : output, first s_size entries receive the sampled effects
+//  cHyp             : hyper-parameters; pve and pge are updated here when a_mode==11
+//Random numbers are drawn from gsl_r (one gamma draw when a_mode==11, then s_size gaussian draws).
+double BSLMM::CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp)
+{	
+	double sigma_a2=cHyp.h/( (1-cHyp.h)*exp(cHyp.logp)*(double)ns_test);
+	double logpost=0.0;
+	double d, P_yy=yty, logdet_O=0.0;
+
+	gsl_matrix_const_view Xgamma_sub=gsl_matrix_const_submatrix (Xgamma, 0, 0, Xgamma->size1, s_size);
+	gsl_matrix_const_view XtX_sub=gsl_matrix_const_submatrix (XtX, 0, 0, s_size, s_size);
+	gsl_vector_const_view Xty_sub=gsl_vector_const_subvector (Xty, 0, s_size);
+	
+	gsl_matrix *Omega=gsl_matrix_alloc (s_size, s_size);
+	gsl_matrix *M_temp=gsl_matrix_alloc (s_size, s_size);
+	gsl_vector *beta_hat=gsl_vector_alloc (s_size);	
+	gsl_vector *Xty_temp=gsl_vector_alloc (s_size);
+
+	gsl_vector_memcpy (Xty_temp, &Xty_sub.vector);
+
+	//calculate Omega = sigma_a2*XtX + I
+	gsl_matrix_memcpy (Omega, &XtX_sub.matrix);
+	gsl_matrix_scale (Omega, sigma_a2);
+	gsl_matrix_set_identity (M_temp);
+	gsl_matrix_add (Omega, M_temp);
+	
+	//calculate beta_hat
+	//NOTE(review): the dtrsv call below reads Omega as an upper-triangular factor, so
+	//CholeskySolve presumably leaves the Cholesky factor in Omega - confirm in its definition
+	logdet_O=CholeskySolve(Omega, Xty_temp, beta_hat);	
+	gsl_vector_scale (beta_hat, sigma_a2);
+
+	//P_yy = yty - t(Xty_temp) beta_hat
+	gsl_blas_ddot (Xty_temp, beta_hat, &d);
+	P_yy-=d;
+
+	//sample tau
+	double tau=1.0;
+	if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0,  2.0/P_yy); }
+
+	//sample beta
+	for (size_t i=0; i<s_size; i++)
+	{
+		d=gsl_ran_gaussian(gsl_r, 1); 
+		gsl_vector_set(beta, i, d); 
+	}
+	gsl_vector_view beta_sub=gsl_vector_subvector(beta, 0, s_size);
+	gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, &beta_sub.vector); 
+		
+	//it computes inv(L^T(Omega)) %*% beta;  
+	gsl_vector_scale(&beta_sub.vector, sqrt(sigma_a2/tau));
+	gsl_vector_add(&beta_sub.vector, beta_hat); 
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &Xgamma_sub.matrix, &beta_sub.vector, 0.0, Xb);		
+	
+	//for quantitative traits, calculate pve and pge
+	if (a_mode==11) {
+		gsl_blas_ddot (Xb, Xb, &d);
+		cHyp.pve=d/(double)ni_test;
+		cHyp.pve/=cHyp.pve+1.0/tau;
+		cHyp.pge=1.0;	
+	}	
+	
+	//likelihood part: -0.5*logdet(Omega) minus a term in P_yy that depends on a_mode
+	logpost=-0.5*logdet_O;
+	if (a_mode==11) {logpost-=0.5*(double)ni_test*log(P_yy);}
+	else {logpost-=0.5*P_yy;}
+
+	//term that depends on logp and the number of selected SNPs
+	logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1.0-exp(cHyp.logp));
+
+	gsl_matrix_free (Omega);
+	gsl_matrix_free (M_temp);
+	gsl_vector_free (beta_hat);
+	gsl_vector_free (Xty_temp);
+
+	return logpost;
+}
+
+
+
+//case-control data with no SNP in the model:
+//pve is set to 0, pge to 1 and z_hat to the zero vector
+void BSLMM::CalcCC_PVEnZ (gsl_vector *z_hat, class HYPBSLMM &cHyp) 
+{
+  cHyp.pge=1.0;
+  cHyp.pve=0.0;
+  gsl_vector_set_zero(z_hat);
+}
+
+
+//case-control data with SNPs in the model:
+//z_hat becomes a copy of Xb, pve becomes v/(v+1) with v=t(Xb)Xb/ni_test, pge becomes 1
+void BSLMM::CalcCC_PVEnZ (const gsl_vector *Xb, gsl_vector *z_hat, class HYPBSLMM &cHyp) 
+{
+	gsl_vector_memcpy (z_hat, Xb);
+
+	double xb_sq;
+	gsl_blas_ddot (Xb, Xb, &xb_sq);
+
+	cHyp.pve=xb_sq/(double)ni_test;
+	cHyp.pve=cHyp.pve/(cHyp.pve+1.0);
+	cHyp.pge=1.0;
+}
+
+
+
+//MCMC sampler that works directly on the matrix X and the vector y.
+//if a_mode==13, then run probit model: z is re-sampled with SampleZ at the start of every
+//iteration and z_hat/mean_z are updated at its end; otherwise z stays the centered copy of y.
+//Each of the w_step+s_step iterations performs n_mh Metropolis-Hastings steps. After the first
+//w_step iterations, every iteration with t%r_pace==0 is recorded in Result_hyp/Result_gamma,
+//which are handed to WriteResult in chunks of w_pace records; accumulated per-SNP effects
+//are handed to WriteParam at the end.
+//Side effects visible here: sets mapRank2pos, gsl_r, gsl_t, cHyp_initial, time_Proposal,
+//may set randseed, and updates n_accept and (a_mode==13 only) pheno_mean.
+void BSLMM::MCMC (const gsl_matrix *X, const gsl_vector *y) {
+	clock_t time_start;	
+	double time_set=0, time_post=0;
+
+	class HYPBSLMM cHyp_old, cHyp_new;
+	
+	//buffers for up to w_pace recorded samples (6 hyper-parameter columns, up to s_max SNP positions)
+	gsl_matrix *Result_hyp=gsl_matrix_alloc (w_pace, 6);
+	gsl_matrix *Result_gamma=gsl_matrix_alloc (w_pace, s_max);	
+	
+	gsl_vector *Xb_new=gsl_vector_alloc (ni_test);
+	gsl_vector *Xb_old=gsl_vector_alloc (ni_test);	
+	gsl_vector *z_hat=gsl_vector_alloc (ni_test);
+	gsl_vector *z=gsl_vector_alloc (ni_test);
+
+	//state of the current (old) and the proposed (new) model; all are allocated for s_max SNPs
+	//and only the leading rank_*.size() columns/entries are used
+	gsl_matrix *Xgamma_old=gsl_matrix_alloc (ni_test, s_max);
+	gsl_matrix *XtX_old=gsl_matrix_alloc (s_max, s_max);
+	gsl_vector *Xtz_old=gsl_vector_alloc (s_max);
+	gsl_vector *beta_old=gsl_vector_alloc (s_max);
+
+	gsl_matrix *Xgamma_new=gsl_matrix_alloc (ni_test, s_max);
+	gsl_matrix *XtX_new=gsl_matrix_alloc (s_max, s_max);
+	gsl_vector *Xtz_new=gsl_vector_alloc (s_max);
+	gsl_vector *beta_new=gsl_vector_alloc (s_max);
+
+	double ztz=0.0;
+	gsl_vector_memcpy (z, y);
+	//for quantitative traits, y is centered already in gemma.cpp, but just in case
+	double mean_z=CenterVector (z);				
+	gsl_blas_ddot(z, z, &ztz);
+
+	double logPost_new, logPost_old;
+	double logMHratio;
+	
+	gsl_matrix_set_zero (Result_gamma);
+	if (a_mode==13) {
+		pheno_mean=0.0;
+	}
+	
+	//per SNP: (sum of recorded beta values, number of times the SNP was recorded)
+	vector<pair<double, double> > beta_g;
+	for (size_t i=0; i<ns_test; i++) {
+		beta_g.push_back(make_pair(0.0, 0.0));
+	}
+	
+	vector<size_t> rank_new, rank_old;
+	vector<pair<size_t, double> > pos_loglr;
+	
+	time_start=clock();
+	MatrixCalcLmLR (X, z, pos_loglr);
+	time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+	//sort with comp_lr and remember which SNP position each rank refers to
+	stable_sort (pos_loglr.begin(), pos_loglr.end(), comp_lr);
+	for (size_t i=0; i<ns_test; ++i) {
+		mapRank2pos[i]=pos_loglr[i].first;
+	}
+	
+	//calculate proposal distribution for gamma (unnormalized), and set up gsl_r and gsl_t		
+	//NOTE(review): gsl_r and gsl_t are allocated here and not released in this function -
+	//confirm they are freed elsewhere
+	gsl_rng_env_setup();                
+	const gsl_rng_type * gslType;                                               
+	gslType = gsl_rng_default; 
+	if (randseed<0)
+	{
+		//negative seed: use the number of seconds since midnight (gmtime, i.e. UTC) instead
+		time_t rawtime;
+		time (&rawtime);
+		tm * ptm = gmtime (&rawtime);
+		
+		randseed = (unsigned) (ptm->tm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec);
+	}
+	gsl_r = gsl_rng_alloc(gslType); 
+	gsl_rng_set(gsl_r, randseed);
+	
+	double *p_gamma = new double[ns_test]; 
+	CalcPgamma (p_gamma);
+	
+	gsl_t=gsl_ran_discrete_preproc (ns_test, p_gamma);
+	
+	//initial parameters
+	InitialMCMC (X, z, rank_old, cHyp_old, pos_loglr);
+	
+	cHyp_initial=cHyp_old;
+
+	if (cHyp_old.n_gamma==0) {	  
+	    logPost_old=CalcPosterior (ztz, cHyp_old);
+	}
+	else {	  
+	  SetXgamma (Xgamma_old, X, rank_old);	  
+	  CalcXtX (Xgamma_old, z, rank_old.size(), XtX_old, Xtz_old);
+	  logPost_old=CalcPosterior (Xgamma_old, XtX_old, Xtz_old, ztz, rank_old.size(), Xb_old, beta_old, cHyp_old);
+	}	
+
+	//calculate centered z_hat, and pve
+	if (a_mode==13) {
+		if (cHyp_old.n_gamma==0) {
+			CalcCC_PVEnZ (z_hat, cHyp_old);
+		}
+		else {
+			CalcCC_PVEnZ (Xb_old, z_hat, cHyp_old);
+		}
+	}
+	
+	//start MCMC
+	int accept;
+	size_t total_step=w_step+s_step;
+	//w counts the recorded samples, w_col is the row inside the Result_* buffers
+	size_t w=0, w_col, pos;
+	size_t repeat=0;
+	
+	for (size_t t=0; t<total_step; ++t) {
+		if (t%d_pace==0 || t==total_step-1) {ProgressBar ("Running MCMC ", t, total_step-1, (double)n_accept/(double)(t*n_mh+1));}
+//		if (t>10) {break;}		
+		if (a_mode==13) {			
+			SampleZ (y, z_hat, z);		
+			mean_z=CenterVector (z);
+			gsl_blas_ddot(z,z,&ztz);
+					
+			//First proposal		
+			//z has changed, so Xtz_old and logPost_old of the current model are recomputed
+			if (cHyp_old.n_gamma==0) {	  
+			  logPost_old=CalcPosterior (ztz, cHyp_old);
+			} else {	  
+			  gsl_matrix_view Xold_sub=gsl_matrix_submatrix(Xgamma_old, 0, 0, ni_test, rank_old.size());
+			  gsl_vector_view Xtz_sub=gsl_vector_subvector(Xtz_old, 0, rank_old.size());
+			  gsl_blas_dgemv (CblasTrans, 1.0, &Xold_sub.matrix, z, 0.0, &Xtz_sub.vector);
+			  logPost_old=CalcPosterior (Xgamma_old, XtX_old, Xtz_old, ztz, rank_old.size(), Xb_old, beta_old, cHyp_old);
+			}	
+		}
+
+		//MH steps
+		for (size_t i=0; i<n_mh; ++i) {
+			//with probability 0.33 repeat is drawn from 1..20, otherwise it is 1;
+			//repeat is passed on to the three Propose* functions
+			if (gsl_rng_uniform(gsl_r)<0.33) {repeat = 1+gsl_rng_uniform_int(gsl_r, 20);}
+			else {repeat=1;}
+
+			logMHratio=0.0;
+			logMHratio+=ProposeHnRho(cHyp_old, cHyp_new, repeat);	
+			logMHratio+=ProposeGamma (rank_old, rank_new, p_gamma, cHyp_old, cHyp_new, repeat);	
+			logMHratio+=ProposePi(cHyp_old, cHyp_new, repeat);
+			
+			if (cHyp_new.n_gamma==0) {
+				logPost_new=CalcPosterior (ztz, cHyp_new);
+			} else {
+			  //this if makes sure that rank_old.size()==rank_remove.size() does not happen
+			  //small models are rebuilt from scratch; larger ones are updated incrementally from the old model
+			  if (cHyp_new.n_gamma<=20 || cHyp_old.n_gamma<=20) {
+			    time_start=clock();
+			    SetXgamma (Xgamma_new, X, rank_new);	  
+			    CalcXtX (Xgamma_new, z, rank_new.size(), XtX_new, Xtz_new);	
+			    time_set+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+			  } else {
+			    time_start=clock();
+			    SetXgamma (X, Xgamma_old, XtX_old, Xtz_old, z, rank_old, rank_new, Xgamma_new, XtX_new, Xtz_new);
+			    time_set+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+			  }
+			  time_start=clock();
+			  logPost_new=CalcPosterior (Xgamma_new, XtX_new, Xtz_new, ztz, rank_new.size(), Xb_new, beta_new, cHyp_new);
+			  time_post+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+			}	
+			logMHratio+=logPost_new-logPost_old;	
+		
+			//accept with probability min(1, exp(logMHratio))
+			if (logMHratio>0 || log(gsl_rng_uniform(gsl_r))<logMHratio) {accept=1; n_accept++;}
+			else {accept=0;}
+			
+			//cout<<rank_new.size()<<"\t"<<rank_old.size()<<"\t"<<logPost_new<<"\t"<<logPost_old<<endl;
+			if (accept==1) {			
+				//the proposal becomes the current state
+				logPost_old=logPost_new;
+				cHyp_old=cHyp_new;
+				gsl_vector_memcpy (Xb_old, Xb_new);
+
+				rank_old.clear();
+				if (rank_new.size()!=0) {
+					for (size_t i=0; i<rank_new.size(); ++i) {
+						rank_old.push_back(rank_new[i]);
+					}
+								
+					//copy only the leading rank_new.size() columns/entries of each buffer
+					gsl_matrix_view Xold_sub=gsl_matrix_submatrix(Xgamma_old, 0, 0, ni_test, rank_new.size());
+					gsl_matrix_view XtXold_sub=gsl_matrix_submatrix(XtX_old, 0, 0, rank_new.size(), rank_new.size());
+					gsl_vector_view Xtzold_sub=gsl_vector_subvector(Xtz_old, 0, rank_new.size());
+					gsl_vector_view betaold_sub=gsl_vector_subvector(beta_old, 0, rank_new.size());
+
+					gsl_matrix_view Xnew_sub=gsl_matrix_submatrix(Xgamma_new, 0, 0, ni_test, rank_new.size());
+					gsl_matrix_view XtXnew_sub=gsl_matrix_submatrix(XtX_new, 0, 0, rank_new.size(), rank_new.size());
+					gsl_vector_view Xtznew_sub=gsl_vector_subvector(Xtz_new, 0, rank_new.size());
+					gsl_vector_view betanew_sub=gsl_vector_subvector(beta_new, 0, rank_new.size());
+
+					gsl_matrix_memcpy(&Xold_sub.matrix, &Xnew_sub.matrix);
+					gsl_matrix_memcpy(&XtXold_sub.matrix, &XtXnew_sub.matrix);
+					gsl_vector_memcpy(&Xtzold_sub.vector, &Xtznew_sub.vector);
+					gsl_vector_memcpy(&betaold_sub.vector, &betanew_sub.vector);
+				}
+			} else {
+			  //rejected: reset the proposal's hyper-parameters to the current ones
+			  cHyp_new=cHyp_old;
+			}
+			
+		}				
+
+		//calculate z_hat, and pve
+		if (a_mode==13) {
+			if (cHyp_old.n_gamma==0) {
+				CalcCC_PVEnZ (z_hat, cHyp_old);
+			}
+			else {
+				CalcCC_PVEnZ (Xb_old, z_hat, cHyp_old);
+			}
+			
+			//sample mu and update z hat
+			gsl_vector_sub (z, z_hat);
+			mean_z+=CenterVector(z);
+			mean_z+=gsl_ran_gaussian(gsl_r, sqrt(1.0/(double) ni_test) );			
+			
+			gsl_vector_add_constant (z_hat, mean_z);
+		}
+		
+		//Save data
+		if (t<w_step) {continue;}
+		else {		
+			if (t%r_pace==0) {
+				w_col=w%w_pace;
+				//buffer row 0 is about to be reused: write the header (first record) or
+				//hand the filled buffer to WriteResult and clear it
+				if (w_col==0) {
+					if (w==0) {WriteResult (0, Result_hyp, Result_gamma, w_col);}					
+					else {
+						WriteResult (1, Result_hyp, Result_gamma, w_col);
+						gsl_matrix_set_zero (Result_hyp);
+						gsl_matrix_set_zero (Result_gamma);
+					}
+				}
+
+				gsl_matrix_set (Result_hyp, w_col, 0, cHyp_old.h);
+				gsl_matrix_set (Result_hyp, w_col, 1, cHyp_old.pve);
+				gsl_matrix_set (Result_hyp, w_col, 2, cHyp_old.rho);
+				gsl_matrix_set (Result_hyp, w_col, 3, cHyp_old.pge);
+				gsl_matrix_set (Result_hyp, w_col, 4, cHyp_old.logp);
+				gsl_matrix_set (Result_hyp, w_col, 5, cHyp_old.n_gamma);
+				
+				for (size_t i=0; i<cHyp_old.n_gamma; ++i) {
+					//positions are stored 1-based so that 0 can mean "empty" in Result_gamma
+					pos=mapRank2pos[rank_old[i]]+1;
+
+					gsl_matrix_set (Result_gamma, w_col, i, pos);
+					
+					beta_g[pos-1].first+=gsl_vector_get(beta_old, i);
+					beta_g[pos-1].second+=1.0;	
+				}
+				
+				if (a_mode==13) {
+					pheno_mean+=mean_z;
+				}
+				
+				w++;
+				
+			}
+			
+		}
+	}
+	cout<<endl;
+
+	cout<<"time on selecting Xgamma: "<<time_set<<endl;
+	cout<<"time on calculating posterior: "<<time_post<<endl;
+
+	//hand the remaining buffered records to WriteResult
+	w_col=w%w_pace;
+	WriteResult (1, Result_hyp, Result_gamma, w_col);	
+	
+	//this sampler passes an all-zero alpha vector to WriteParam
+	gsl_vector *alpha=gsl_vector_alloc (ns_test);
+	gsl_vector_set_zero (alpha);
+	WriteParam (beta_g, alpha, w);
+	gsl_vector_free(alpha);
+
+	gsl_matrix_free(Result_hyp);
+	gsl_matrix_free(Result_gamma);	
+	
+	gsl_vector_free(z_hat);
+	gsl_vector_free(z);
+	gsl_vector_free(Xb_new);	
+	gsl_vector_free(Xb_old);
+
+	gsl_matrix_free(Xgamma_old);
+	gsl_matrix_free(XtX_old);
+	gsl_vector_free(Xtz_old);
+	gsl_vector_free(beta_old);
+
+	gsl_matrix_free(Xgamma_new);
+	gsl_matrix_free(XtX_new);
+	gsl_vector_free(Xtz_new);
+	gsl_vector_free(beta_new);
+	
+	delete [] p_gamma;
+	beta_g.clear();
+	
+	return;
+}
diff --git a/bslmm.h b/bslmm.h
new file mode 100644
index 0000000..f407794
--- /dev/null
+++ b/bslmm.h
@@ -0,0 +1,145 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#ifndef __BSLMM_H__                
+#define __BSLMM_H__
+
+#include <vector>
+#include <map>
+#include <gsl/gsl_rng.h>
+#include <gsl/gsl_randist.h>
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#else
+#include "param.h"
+#endif
+
+
+using namespace std;
+
+
+
+
+
+
+//Holds the settings, summary statistics and random-number state used by the BSLMM
+//routines (RidgeR and the two MCMC overloads) together with their output writers.
+//All members are public; values are exchanged with PARAM through CopyFromParam/CopyToParam.
+class BSLMM {
+
+public:	
+	// IO related parameters
+	int a_mode;	
+	size_t d_pace;
+	
+	string file_bfile;
+	string file_geno;
+	string file_out;
+	
+	// LMM related parameters
+	double l_min;
+	double l_max;
+	size_t n_region;
+	double pve_null;
+	double pheno_mean;
+	
+	// BSLMM MCMC related parameters
+	double h_min, h_max, h_scale;			//priors for h
+	double rho_min, rho_max, rho_scale;		//priors for rho
+	double logp_min, logp_max, logp_scale;		//priors for log(pi)
+	size_t s_min, s_max;			//minimum and maximum number of gammas
+	size_t w_step;					//number of warm up/burn in iterations
+	size_t s_step;					//number of sampling iterations
+	size_t r_pace;					//record pace
+	size_t w_pace;					//write pace
+	size_t n_accept;				//number of acceptance
+	size_t n_mh;					//number of MH steps within each iteration
+	double geo_mean;				//mean of the geometric distribution
+	long int randseed;
+	double trace_G;	
+	
+	HYPBSLMM cHyp_initial;
+
+	// Summary statistics
+	size_t ni_total, ns_total;	//number of total individuals and snps
+	size_t ni_test, ns_test;	//number of individuals and snps used for analysis
+	size_t n_cvt;				//number of covariates
+	double time_UtZ;
+	double time_Omega;		//time spent on optimization iterations
+	double time_Proposal;        //time spent on constructing the proposal distribution for gamma (i.e. lmm or lm analysis)
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	
+	// Not included in PARAM
+	gsl_rng *gsl_r; 	//random number generator
+	gsl_ran_discrete_t *gsl_t;	//lookup table for discrete sampling
+	map<size_t, size_t> mapRank2pos;	//maps a SNP rank to its position
+	
+	// Main Functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	
+	void RidgeR(const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *eval, const double lambda);
+	
+	void MCMC (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y);
+	void WriteLog ();
+	void WriteLR ();
+	void WriteBV (const gsl_vector *bv);
+	void WriteParam (vector<pair<double, double> > &beta_g, const gsl_vector *alpha, const size_t w);
+	void WriteParam (const gsl_vector *alpha);
+	void WriteResult (const int flag, const gsl_matrix *Result_hyp, const gsl_matrix *Result_gamma, const size_t w_col);
+	
+	//Subfunctions inside MCMC
+	void CalcPgamma (double *p_gammar);
+	
+	double CalcPveLM (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const double sigma_a2);
+	void InitialMCMC (const gsl_matrix *UtX, const gsl_vector *Uty, vector<size_t> &rank_old, class HYPBSLMM &cHyp, vector<pair<size_t, double> > &pos_loglr);
+	double CalcPosterior (const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *Utu, gsl_vector *alpha_prime, class HYPBSLMM &cHyp);
+	double CalcPosterior (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *UtXb, gsl_vector *Utu, gsl_vector *alpha_prime, gsl_vector *beta, class HYPBSLMM &cHyp);
+	void CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp);
+	void CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *UtXb, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp);
+	double CalcREMLE (const gsl_matrix *Utw, const gsl_vector *Uty, const gsl_vector *K_eval);
+	double CalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, vector<pair<size_t, double> > &loglr_sort);		//calculate the maximum marginal likelihood ratio for each analyzed SNPs with gemma, use it to rank SNPs
+	void SampleZ (const gsl_vector *y, const gsl_vector *z_hat, gsl_vector *z);
+	double ProposeHnRho (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat);
+	double ProposePi (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat);
+	double ProposeGamma (const vector<size_t> &rank_old, vector<size_t> &rank_new, const double *p_gamma, const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat);
+	void SetXgamma (gsl_matrix *Xgamma, const gsl_matrix *X, vector<size_t> &rank);
+
+	//Overloads that take X and y directly instead of U/UtX/Uty/K_eval
+	//(bslmm.cpp introduces their definitions with "below fits MCMC for rho=1")
+	void CalcXtX (const gsl_matrix *X_new, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX_new, gsl_vector *Xty_new);
+	void SetXgamma (const gsl_matrix *X, const gsl_matrix *X_old, const gsl_matrix *XtX_old, const gsl_vector *Xty_old, const gsl_vector *y, const vector<size_t> &rank_old, const vector<size_t> &rank_new, gsl_matrix *X_new, gsl_matrix *XtX_new, gsl_vector *Xty_new);
+	double CalcPosterior (const double yty, class HYPBSLMM &cHyp);
+	double CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp);
+	void CalcCC_PVEnZ (gsl_vector *z_hat, class HYPBSLMM &cHyp);
+	void CalcCC_PVEnZ (const gsl_vector *Xb, gsl_vector *z_hat, class HYPBSLMM &cHyp);
+	void MCMC (const gsl_matrix *X, const gsl_vector *y);
+	
+	//utility functions
+//	double vec_sum (gsl_vector *v);
+//	void vec_center (gsl_vector *v);
+//	double calc_var (gsl_vector *v);
+//	void calc_sigma (MCMC &cMcmc);
+//	bool comp_lr (pair<size_t, double> a, pair<size_t, double> b);
+};
+
+
+
+#endif
+
+
diff --git a/gemma.cpp b/gemma.cpp
new file mode 100644
index 0000000..093cd05
--- /dev/null
+++ b/gemma.cpp
@@ -0,0 +1,1856 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <sys/stat.h>
+#include <ctime>
+#include <cmath>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_eigen.h"
+#include "gsl/gsl_cdf.h"
+
+#include "lapack.h"  //for functions EigenDecomp
+
+#ifdef FORCE_FLOAT
+#include "io_float.h"   //for function ReadFile_kin
+#include "gemma_float.h"
+#include "vc_float.h"
+#include "lm_float.h"  //for LM class
+#include "bslmm_float.h"  //for BSLMM class
+#include "lmm_float.h"  //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe
+#include "mvlmm_float.h"  //for MVLMM class
+#include "prdt_float.h"	//for PRDT class
+#include "mathfunc_float.h"	//for a few functions
+#else
+#include "io.h"
+#include "gemma.h"
+#include "vc.h"
+#include "lm.h"
+#include "bslmm.h"
+#include "lmm.h"
+#include "mvlmm.h"
+#include "prdt.h"
+#include "mathfunc.h"
+#endif
+
+
+using namespace std;
+
+
+
+//default constructor: stores the version, release date and copyright year printed by PrintHeader and PrintHelp
+GEMMA::GEMMA() :
+	version("0.95"), date("08/08/2014"), year("2011") {}
+
+void GEMMA::PrintHeader (void)
+{
+	//write the start-up banner to stdout: program name, version and date members, web links, copyright year
+	//the banner is one chained insertion; every row ends with endl, so each row is flushed as it is written
+	cout << endl
+	     << "*********************************************************" << endl
+	     << "  Genome-wide Efficient Mixed Model Association (GEMMA) " << endl
+	     << "  Version " << version << ", " << date << "                              " << endl
+	     << "  Visit                                                 " << endl
+	     << "     http://stephenslab.uchicago.edu/software.html      " << endl
+	     << "     http://home.uchicago.edu/~xz7/software.html        " << endl
+	     << "  For Possible Updates                                  " << endl
+	     << "  (C) " << year << " Xiang Zhou                                   " << endl
+	     << "  GNU General Public License                            " << endl
+	     << "  For Help, Type ./gemma -h                             " << endl
+	     << "*********************************************************" << endl
+	     << endl;
+}
+
+
+void GEMMA::PrintLicense (void)
+{
+	//print the licensing notice to stdout: the GPL statement first, then the notification text
+	//for the Lapack routines (copyright holders, redistribution conditions, disclaimers)
+	cout << endl
+	     << "The Software Is Distributed Under GNU General Public License, But May Also Require The Following Notifications." << endl
+	     << endl;
+	//copyright holders named in the Lapack notification
+	cout << "Including Lapack Routines In The Software May Require The Following Notification:" << endl
+	     << "Copyright (c) 1992-2010 The University of Tennessee and The University of Tennessee Research Foundation.  All rights reserved." << endl
+	     << "Copyright (c) 2000-2010 The University of California Berkeley. All rights reserved." << endl
+	     << "Copyright (c) 2006-2010 The University of Colorado Denver.  All rights reserved." << endl
+	     << endl;
+	//redistribution conditions; the $COPYRIGHT$ and $HEADER$ markers are printed literally,
+	//exactly as they appear in the string constants below
+	cout << "$COPYRIGHT$" << endl
+	     << "Additional copyrights may follow" << endl
+	     << "$HEADER$" << endl
+	     << "Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:" << endl
+	     << "- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer." << endl
+	     << "- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution." << endl
+	     << "- Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission." << endl;
+	//third-party rights statement followed by the warranty disclaimer, each written as a single output line
+	cout << "The copyright holders provide no reassurances that the source code provided does not infringe any patent, copyright, or any other "
+	     << "intellectual property rights of third parties.  The copyright holders disclaim any liability to any recipient for claims brought against "
+	     << "recipient by any third party for infringement of that parties intellectual property rights. " << endl
+	     << "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT "
+	     << "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT "
+	     << "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT "
+	     << "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY "
+	     << "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE "
+	     << "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE." << endl
+	     << endl;
+}
+
+
+
+void GEMMA::PrintHelp(size_t option)
+{
+	//print one help screen to stdout: option 0 lists the help topics, options 1-12 print the matching topic;
+	//any other value prints nothing. only the version and date members are read (by the option 0 screen)
+	if (option==0) {
+		cout<<endl;
+		cout<<" GEMMA version "<<version<<", released on "<<date<<endl;
+		cout<<" implemented by Xiang Zhou"<<endl;
+		cout<<endl;
+		cout<<" type ./gemma -h [num] for detailed helps"<<endl;
+		cout<<" options: " << endl;
+		cout<<" 1: quick guide"<<endl;
+		cout<<" 2: file I/O related"<<endl;
+		cout<<" 3: SNP QC"<<endl;
+		cout<<" 4: calculate relatedness matrix"<<endl;
+		cout<<" 5: perform eigen decomposition"<<endl;
+		cout<<" 6: perform variance component estimation"<<endl;
+		cout<<" 7: fit a linear model"<<endl;
+		cout<<" 8: fit a linear mixed model"<<endl;
+		cout<<" 9: fit a multivariate linear mixed model"<<endl;
+		cout<<" 10: fit a Bayesian sparse linear mixed model"<<endl;
+		cout<<" 11: obtain predicted values"<<endl;
+		cout<<" 12: note"<<endl;
+		cout<<endl;
+	}
+	//topic 1: example command lines for each analysis
+	if (option==1) {
+		cout<<" QUICK GUIDE" << endl;
+		cout<<" to generate a relatedness matrix: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -gk [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -gk [num] -o [prefix]"<<endl;
+		cout<<" to perform eigen decomposition of the relatedness matrix: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -eigen -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -k [filename] -eigen -o [prefix]"<<endl;
+		cout<<" to estimate variance components: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -vc -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -k [filename] -vc -o [prefix]"<<endl;
+		cout<<"         ./gemma -bfile [prefix] -mk [filename] -vc -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -mk [filename] -vc -o [prefix]"<<endl;
+		cout<<" to fit a linear mixed model: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -lmm [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -o [prefix]"<<endl;
+		cout<<" to fit a multivariate linear mixed model: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl;
+		cout<<" to fit a Bayesian sparse linear mixed model: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -bslmm [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -bslmm [num] -o [prefix]"<<endl;
+		cout<<" to obtain predicted values: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -epm [filename] -emu [filename] -ebv [filename] -k [filename] -predict [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -epm [filename] -emu [filename] -ebv [filename] -k [filename] -predict [num] -o [prefix]"<<endl;
+		cout<<endl;
+	}
+	//topic 2: input/output file options
+	if (option==2) {
+		cout<<" FILE I/O RELATED OPTIONS" << endl;
+		cout<<" -bfile    [prefix]       "<<" specify input PLINK binary ped file prefix."<<endl;
+		cout<<"          requires: *.fam, *.bim and *.bed files"<<endl;
+		cout<<"          missing value: -9"<<endl;
+		cout<<" -g        [filename]     "<<" specify input BIMBAM mean genotype file name"<<endl;
+		cout<<"          format: rs#1, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl;
+		cout<<"                  rs#2, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -p        [filename]     "<<" specify input BIMBAM phenotype file name"<<endl;
+		cout<<"          format: phenotype for individual 1"<<endl;
+		cout<<"                  phenotype for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -a        [filename]     "<<" specify input BIMBAM SNP annotation file name (optional)"<<endl;
+		cout<<"          format: rs#1, base_position, chr_number"<<endl;
+		cout<<"                  rs#2, base_position, chr_number"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<" -k        [filename]     "<<" specify input kinship/relatedness matrix file name"<<endl;
+		cout<<" -mk       [filename]     "<<" specify input file which contains a list of kinship/relatedness matrices"<<endl;
+		cout<<" -u        [filename]     "<<" specify input file containing the eigen vectors of the kinship/relatedness matrix"<<endl;
+		cout<<" -d        [filename]     "<<" specify input file containing the eigen values of the kinship/relatedness matrix"<<endl;
+		cout<<" -c        [filename]     "<<" specify input covariates file name (optional)"<<endl;
+		cout<<"          format: covariate 1 for individual 1, ... , covariate c for individual 1"<<endl;
+		cout<<"                  covariate 1 for individual 2, ... , covariate c for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<"          note: the intercept (a column of 1s) may need to be included"<<endl;
+		cout<<" -epm      [filename]     "<<" specify input estimated parameter file name"<<endl;
+		cout<<" -en [n1] [n2] [n3] [n4]  "<<" specify values for the input estimated parameter file (with a header)"<<endl;
+		cout<<"          options: n1: rs column number"<<endl;
+		cout<<"                   n2: estimated alpha column number (0 to ignore)"<<endl;
+		cout<<"                   n3: estimated beta column number (0 to ignore)"<<endl;
+		cout<<"                   n4: estimated gamma column number (0 to ignore)"<<endl;
+		cout<<"          default: 2 4 5 6 if -ebv is not specified; 2 0 5 6 if -ebv is specified"<<endl;
+		cout<<" -ebv      [filename]     "<<" specify input estimated random effect (breeding value) file name"<<endl;
+		cout<<"          format: value for individual 1"<<endl;
+		cout<<"                  value for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -emu      [filename]     "<<" specify input log file name containing estimated mean"<<endl;
+		cout<<" -mu       [num]          "<<" specify input estimated mean value"<<endl;
+		cout<<" -gene     [filename]     "<<" specify input gene expression file name"<<endl;
+		cout<<"          format: header"<<endl;
+		cout<<"                  gene1, count for individual 1, count for individual 2, ..."<<endl;
+		cout<<"                  gene2, count for individual 1, count for individual 2, ..."<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: not allowed"<<endl;
+		cout<<" -r        [filename]     "<<" specify input total read count file name"<<endl;
+		cout<<"          format: total read count for individual 1"<<endl;
+		cout<<"                  total read count for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -snps     [filename]     "<<" specify input snps file name to only analyze a certain set of snps"<<endl;
+		cout<<"          format: rs#1"<<endl;
+		cout<<"                  rs#2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -silence                 "<<" silent terminal display"<<endl;
+		cout<<" -km       [num]          "<<" specify input kinship/relatedness file type (default 1)."<<endl;
+		cout<<"          options: 1: \"n by n matrix\" format"<<endl;
+		cout<<"                   2: \"id  id  value\" format"<<endl;
+		cout<<" -n        [num]          "<<" specify phenotype column in the phenotype/*.fam file (optional; default 1)"<<endl;
+		cout<<" -pace     [num]          "<<" specify terminal display update pace (default 100000 SNPs or 100000 iterations)."<<endl;
+		cout<<" -o        [prefix]       "<<" specify output file prefix (default \"result\")"<<endl;
+		cout<<"          output: prefix.cXX.txt or prefix.sXX.txt from kinship/relatedness matrix estimation"<<endl;
+		cout<<"          output: prefix.assoc.txt and prefix.log.txt from association tests"<<endl;
+		cout<<endl;
+	}
+	//topic 3: SNP quality-control thresholds
+	if (option==3) {
+		cout<<" SNP QC OPTIONS" << endl;
+		cout<<" -miss     [num]          "<<" specify missingness threshold (default 0.05)" << endl;
+		cout<<" -maf      [num]          "<<" specify minor allele frequency threshold (default 0.01)" << endl;
+		cout<<" -hwe      [num]          "<<" specify HWE test p value threshold (default 0; no test)" << endl;
+		cout<<" -r2       [num]          "<<" specify r-squared threshold (default 0.9999)" << endl;
+		cout<<" -notsnp                  "<<" minor allele frequency cutoff is not used" << endl;
+		cout<<endl;
+	}
+	//topic 4: relatedness matrix calculation
+	if (option==4) {
+		cout<<" RELATEDNESS MATRIX CALCULATION OPTIONS" << endl;
+		cout<<" -gk       [num]          "<<" specify which type of kinship/relatedness matrix to generate (default 1)" << endl;
+		cout<<"          options: 1: centered XX^T/p"<<endl;
+		cout<<"                   2: standardized XX^T/p"<<endl;
+		cout<<"          note: non-polymorphic SNPs are excluded "<<endl;
+		cout<<endl;
+	}
+	//topic 5: eigen decomposition
+	if (option==5) {
+		cout<<" EIGEN-DECOMPOSITION OPTIONS" << endl;
+		cout<<" -eigen                   "<<" specify to perform eigen decomposition of the loaded relatedness matrix" << endl;
+		cout<<endl;
+	}
+	//topic 6: variance component estimation
+	if (option==6) {
+		cout<<" VARIANCE COMPONENT ESTIMATION OPTIONS" << endl;
+		cout<<" -vc                      "<<" specify to perform variance component estimation for the loaded relatedness matrix/matrices" << endl;
+		cout<<endl;
+	}
+	//topic 7: linear model tests
+	if (option==7) {
+		cout<<" LINEAR MODEL OPTIONS" << endl;
+		cout<<" -lm       [num]         "<<" specify analysis options (default 1)."<<endl;
+		cout<<"          options: 1: Wald test"<<endl;
+		cout<<"                   2: Likelihood ratio test"<<endl;
+		cout<<"                   3: Score test"<<endl;
+		cout<<"                   4: 1-3"<<endl;
+		cout<<endl;
+	}
+	//topic 8: linear mixed model tests and the lambda search range
+	if (option==8) {
+		cout<<" LINEAR MIXED MODEL OPTIONS" << endl;
+		cout<<" -lmm      [num]         "<<" specify analysis options (default 1)."<<endl;
+		cout<<"          options: 1: Wald test"<<endl;
+		cout<<"                   2: Likelihood ratio test"<<endl;
+		cout<<"                   3: Score test"<<endl;
+		cout<<"                   4: 1-3"<<endl;
+		cout<<"                   5: Parameter estimation in the null model only"<<endl;
+		cout<<" -lmin     [num]          "<<" specify minimal value for lambda (default 1e-5)" << endl;
+		cout<<" -lmax     [num]          "<<" specify maximum value for lambda (default 1e+5)" << endl;
+		cout<<" -region   [num]          "<<" specify the number of regions used to evaluate lambda (default 10)" << endl;
+		cout<<endl;
+	}
+	//topic 9: multivariate lmm; Assign parses one value after each of -pnr/-emi/-nri/-emp/-nrp, and -crt is a bare flag
+	if (option==9) {
+		cout<<" MULTIVARIATE LINEAR MIXED MODEL OPTIONS" << endl;
+		cout<<" -pnr      [num]          "<<" specify the pvalue threshold to use the Newton-Raphson's method (default 0.001)"<<endl;
+		cout<<" -emi      [num]          "<<" specify the maximum number of iterations for the PX-EM method in the null (default 10000)"<<endl;
+		cout<<" -nri      [num]          "<<" specify the maximum number of iterations for the Newton-Raphson's method in the null (default 100)"<<endl;
+		cout<<" -emp      [num]          "<<" specify the precision for the PX-EM method in the null (default 0.0001)"<<endl;
+		cout<<" -nrp      [num]          "<<" specify the precision for the Newton-Raphson's method in the null (default 0.0001)"<<endl;
+		cout<<" -crt                     "<<" specify to output corrected pvalues for these pvalues that are below the -pnr threshold"<<endl;
+		cout<<endl;
+	}
+	//topic 10: BSLMM analysis modes and MCMC settings
+	if (option==10) {
+		cout<<" MULTI-LOCUS ANALYSIS OPTIONS" << endl;
+		cout<<" -bslmm    [num]          "<<" specify analysis options (default 1)."<<endl;
+		cout<<"          options: 1: BSLMM"<<endl;
+		cout<<"                   2: standard ridge regression/GBLUP (no mcmc)"<<endl;
+		cout<<"                   3: probit BSLMM (requires 0/1 phenotypes)"<<endl;
+		//bounds on the priors
+		cout<<"   MCMC OPTIONS" << endl;
+		cout<<"   Prior" << endl;
+		cout<<" -hmin     [num]          "<<" specify minimum value for h (default 0)" << endl;
+		cout<<" -hmax     [num]          "<<" specify maximum value for h (default 1)" << endl;
+		cout<<" -rmin     [num]          "<<" specify minimum value for rho (default 0)" << endl;
+		cout<<" -rmax     [num]          "<<" specify maximum value for rho (default 1)" << endl;
+		cout<<" -pmin     [num]          "<<" specify minimum value for log10(pi) (default log10(1/p), where p is the number of analyzed SNPs )" << endl;
+		cout<<" -pmax     [num]          "<<" specify maximum value for log10(pi) (default log10(1) )" << endl;
+		cout<<" -smin     [num]          "<<" specify minimum value for |gamma| (default 0)" << endl;
+		cout<<" -smax     [num]          "<<" specify maximum value for |gamma| (default 300)" << endl;
+		//proposal distribution settings
+		cout<<"   Proposal" << endl;
+		cout<<" -gmean    [num]          "<<" specify the mean for the geometric distribution (default: 2000)" << endl;
+		cout<<" -hscale   [num]          "<<" specify the step size scale for the proposal distribution of h (value between 0 and 1, default min(10/sqrt(n),1) )" << endl;
+		cout<<" -rscale   [num]          "<<" specify the step size scale for the proposal distribution of rho (value between 0 and 1, default min(10/sqrt(n),1) )" << endl;
+		cout<<" -pscale   [num]          "<<" specify the step size scale for the proposal distribution of log10(pi) (value between 0 and 1, default min(5/sqrt(n),1) )" << endl;
+		//burn-in and sampling steps, recording/writing pace, seed and MH steps
+		cout<<"   Others" << endl;
+		cout<<" -w        [num]          "<<" specify burn-in steps (default 100,000)" << endl;
+		cout<<" -s        [num]          "<<" specify sampling steps (default 1,000,000)" << endl;
+		cout<<" -rpace    [num]          "<<" specify recording pace, record one state in every [num] steps (default 10)" << endl;
+		cout<<" -wpace    [num]          "<<" specify writing pace, write values down in every [num] recorded steps (default 1000)" << endl;
+		cout<<" -seed     [num]          "<<" specify random seed (a random seed is generated by default)" << endl;
+		cout<<" -mh       [num]          "<<" specify number of MH steps in each iteration (default 10)" << endl;
+		cout<<"          requires: 0/1 phenotypes and -bslmm 3 option"<<endl;
+		cout<<endl;
+	}
+	//topic 11: prediction
+	if (option==11) {
+		cout<<" PREDICTION OPTIONS" << endl;
+		cout<<" -predict  [num]          "<<" specify prediction options (default 1)."<<endl;
+		cout<<"          options: 1: predict for individuals with missing phenotypes"<<endl;
+		cout<<"                   2: predict for individuals with missing phenotypes, and convert the predicted values to probability scale. Use only for files fitted with -bslmm 3 option"<<endl;
+		cout<<endl;
+	}
+	//topic 12: general notes
+	if (option==12) {
+		cout<<" NOTE"<<endl;
+		cout<<" 1. Only individuals with non-missing phenotypes and covariates will be analyzed."<<endl;
+		cout<<" 2. Missing genotypes will be replaced with the mean genotype of that SNP."<<endl;
+		cout<<" 3. For lmm analysis, memory should be large enough to hold the relatedness matrix and to perform eigen decomposition."<<endl;
+		cout<<" 4. For multivariate lmm analysis, use a large -pnr for each snp will increase computation time dramatically."<<endl;
+		cout<<" 5. For bslmm analysis, in addition to 3, memory should be large enough to hold the whole genotype matrix."<<endl;
+		cout<<endl;
+	}
+}
+
+
+
+void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
+{
+	string str;
+	
+	for(int i = 1; i < argc; i++) {		
+		if (strcmp(argv[i], "-bfile")==0 || strcmp(argv[i], "--bfile")==0 || strcmp(argv[i], "-b")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_bfile=str;
+		}
+		else if (strcmp(argv[i], "-silence")==0) {
+			cPar.mode_silence=true;
+		}
+		else if (strcmp(argv[i], "-g")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_geno=str;
+		}
+		else if (strcmp(argv[i], "-p")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_pheno=str;
+		}
+		else if (strcmp(argv[i], "-a")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_anno=str;
+		}
+		else if (strcmp(argv[i], "-k")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_kin=str;
+		}
+		else if (strcmp(argv[i], "-mk")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_mk=str;
+		}
+		else if (strcmp(argv[i], "-u")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_ku=str;
+		}
+		else if (strcmp(argv[i], "-d")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_kd=str;
+		}
+		else if (strcmp(argv[i], "-c")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_cvt=str;
+		}
+		else if (strcmp(argv[i], "-epm")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_epm=str;
+		}
+		else if (strcmp(argv[i], "-en")==0) {			
+			while (argv[i+1] != NULL && argv[i+1][0] != '-') {
+				++i;
+				str.clear();
+				str.assign(argv[i]);
+				cPar.est_column.push_back(atoi(str.c_str()));
+			}
+		}
+		else if (strcmp(argv[i], "-ebv")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_ebv=str;
+		}
+		else if (strcmp(argv[i], "-emu")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_log=str;
+		}
+		else if (strcmp(argv[i], "-mu")==0) {
+			if(argv[i+1] == NULL) {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.pheno_mean=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-gene")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_gene=str;
+		}
+		else if (strcmp(argv[i], "-r")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_read=str;
+		}
+		else if (strcmp(argv[i], "-snps")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_snps=str;
+		}
+		else if (strcmp(argv[i], "-km")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.k_mode=atoi(str.c_str());
+		}		
+		else if (strcmp(argv[i], "-n")==0) {
+			(cPar.p_column).clear();
+			while (argv[i+1] != NULL && argv[i+1][0] != '-') {
+				++i;
+				str.clear();
+				str.assign(argv[i]);
+				(cPar.p_column).push_back(atoi(str.c_str()));
+			}
+		}
+		else if (strcmp(argv[i], "-pace")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.d_pace=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-o")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_out=str;
+		}		
+		else if (strcmp(argv[i], "-miss")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.miss_level=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-maf")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			if (cPar.maf_level!=-1) {cPar.maf_level=atof(str.c_str());}
+		}
+		else if (strcmp(argv[i], "-hwe")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.hwe_level=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-r2")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.r2_level=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-notsnp")==0) {
+			cPar.maf_level=-1;
+		}
+		else if (strcmp(argv[i], "-gk")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=21; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=20+atoi(str.c_str());
+		}	
+		else if (strcmp(argv[i], "-eigen")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=31; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=30+atoi(str.c_str());
+		}	
+		else if (strcmp(argv[i], "-vc")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=61; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=60+atoi(str.c_str());
+		}	
+		else if (strcmp(argv[i], "-lm")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=51; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=50+atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-fa")==0 || strcmp(argv[i], "-lmm")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=1; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-lmin")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.l_min=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-lmax")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.l_max=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-region")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.n_region=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-pnr")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.p_nr=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-emi")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.em_iter=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-nri")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.nr_iter=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-emp")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.em_prec=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-nrp")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.nr_prec=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-crt")==0) {
+			cPar.crt=1;
+		}
+		else if (strcmp(argv[i], "-bslmm")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=11; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=10+atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-hmin")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.h_min=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-hmax")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.h_max=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-rmin")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.rho_min=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-rmax")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.rho_max=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-pmin")==0) {
+			if(argv[i+1] == NULL) {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.logp_min=atof(str.c_str())*log(10.0);
+		}
+		else if (strcmp(argv[i], "-pmax")==0) {
+			if(argv[i+1] == NULL) {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.logp_max=atof(str.c_str())*log(10.0);
+		}
+		else if (strcmp(argv[i], "-smin")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.s_min=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-smax")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.s_max=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-gmean")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.geo_mean=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-hscale")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.h_scale=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-rscale")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.rho_scale=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-pscale")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.logp_scale=atof(str.c_str())*log(10.0);
+		}
+		else if (strcmp(argv[i], "-w")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.w_step=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-s")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.s_step=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-rpace")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.r_pace=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-wpace")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.w_pace=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-seed")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.randseed=atol(str.c_str());
+		}
+		else if (strcmp(argv[i], "-mh")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.n_mh=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-predict")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=41; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=40+atoi(str.c_str());
+		}
+		else {cout<<"error! unrecognized option: "<<argv[i]<<endl; cPar.error=true; continue;}
+	}
+	
+	//change prediction mode to 43, if the epm file is not provided
+	if (cPar.a_mode==41 && cPar.file_epm.empty()) {cPar.a_mode=43;}
+	
+	return;
+}
+
+
+
+void GEMMA::BatchRun (PARAM &cPar) 
+{	//Run the analysis selected by cPar.a_mode: read and check the input, execute the matching block below, and store results/timings back in cPar
+	clock_t time_begin, time_start;	//time_begin spans the whole call; time_start is reused for each timed stage
+	time_begin=clock();
+	//every cPar.time_* value below is (clock() difference)/(CLOCKS_PER_SEC*60), i.e. processor time in minutes
+	//Read input files, then validate them; both steps report failure through cPar.error
+	cout<<"Reading Files ... "<<endl;
+	cPar.ReadFiles();
+	if (cPar.error==true) {cout<<"error! fail to read files. "<<endl; return;}
+	cPar.CheckData();
+	if (cPar.error==true) {cout<<"error! fail to check data. "<<endl; return;}
+	//Prediction for bslmm (a_mode 41 or 42): y_prdt = [optional breeding values] + genotype effects + phenotype mean
+	if (cPar.a_mode==41 || cPar.a_mode==42) {
+		gsl_vector *y_prdt;
+		
+		y_prdt=gsl_vector_alloc (cPar.ni_total-cPar.ni_test);	//one entry per individual that is in ni_total but not in ni_test
+
+		//set to zero
+		gsl_vector_set_zero (y_prdt);
+		
+		PRDT cPRDT;
+		cPRDT.CopyFromParam(cPar);
+		
+		//add breeding value if needed (requires both a kinship file and an estimated-breeding-value file)
+		if (!cPar.file_kin.empty() && !cPar.file_ebv.empty()) {
+			cout<<"Adding Breeding Values ... "<<endl;
+			
+			gsl_matrix *G=gsl_matrix_alloc (cPar.ni_total, cPar.ni_total);
+			gsl_vector *u_hat=gsl_vector_alloc (cPar.ni_test);
+			
+			//read kinship matrix and set u_hat
+			vector<int> indicator_all;	//all ones, so the kinship matrix is read for every individual
+			size_t c_bv=0;	//next free position in u_hat
+			for (size_t i=0; i<cPar.indicator_idv.size(); i++) {
+				indicator_all.push_back(1);
+				if (cPar.indicator_bv[i]==1) {gsl_vector_set(u_hat, c_bv, cPar.vec_bv[i]); c_bv++;}	//NOTE(review): assumes at most ni_test entries of indicator_bv equal 1 -- confirm
+			}
+			
+			ReadFile_kin (cPar.file_kin, indicator_all, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}	//NOTE(review): this return skips the frees of G, u_hat and y_prdt
+			
+			//add the breeding-value contribution computed from G and u_hat into y_prdt
+			cPRDT.AddBV(G, u_hat, y_prdt);					
+			
+			gsl_matrix_free(G);
+			gsl_vector_free(u_hat);
+		}
+
+		//add beta: genotype effects, read from PLINK binary files if file_bfile is given, otherwise from BIMBAM files
+		if (!cPar.file_bfile.empty()) {
+			cPRDT.AnalyzePlink (y_prdt);
+		}
+		else {
+			cPRDT.AnalyzeBimbam (y_prdt);
+		}
+		
+		//add mu (the phenotype mean stored in cPar.pheno_mean)
+		gsl_vector_add_constant(y_prdt, cPar.pheno_mean);
+		
+		//convert y to probability if needed (a_mode 42 only)
+		if (cPar.a_mode==42) {
+			double d;
+			for (size_t i=0; i<y_prdt->size; i++) {
+				d=gsl_vector_get(y_prdt, i);
+				d=gsl_cdf_gaussian_P(d, 1.0);	//lower-tail Gaussian CDF with sigma=1.0
+				gsl_vector_set(y_prdt, i, d);
+			}
+		}
+			
+			
+		cPRDT.CopyToParam(cPar);
+		
+		cPRDT.WriteFiles(y_prdt);
+		
+		gsl_vector_free(y_prdt);
+	}
+	
+	
+	//Prediction with kinship matrix only; for one or more phenotypes
+	if (cPar.a_mode==43) {
+		//first, use individuals with full phenotypes to obtain estimates of Vg and Ve		
+		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);		
+		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1);
+		gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); 
+		gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2);
+		gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2);
+		gsl_vector *eval=gsl_vector_alloc (Y->size1);
+		//the *_full matrices have ni_cvt rows instead of ni_test rows
+		gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph);
+		gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt);
+		//set covariates matrix W and phenotype matrix Y
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, Y, 0);
+		cPar.CopyCvtPhen (W_full, Y_full, 1);	//NOTE(review): last argument presumably selects the ni_cvt set instead of the ni_test set -- confirm in param.cpp
+				
+		gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph);		
+		gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1);		
+		gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2);	//square, side = ni_cvt*n_ph
+				
+		//read relatedness matrix G, and matrix G_full
+		ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+		if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}	//NOTE(review): returns without freeing the matrices allocated above
+		ReadFile_kin (cPar.file_kin, cPar.indicator_cvt, cPar.mapID2num, cPar.k_mode, cPar.error, G_full);
+		if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}	//NOTE(review): same leak on this path
+				
+		//center matrix G
+		CenterMatrix (G);
+		CenterMatrix (G_full);
+		
+		//eigen-decomposition and calculate trace_G
+		cout<<"Start Eigen-Decomposition..."<<endl;
+		time_start=clock();	
+		cPar.trace_G=EigenDecomp (G, U, eval, 0);
+		cPar.trace_G=0.0;	//the value returned by EigenDecomp is discarded; trace_G is recomputed from the thresholded eigenvalues
+		for (size_t i=0; i<eval->size; i++) {
+			if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}	//eigenvalues below 1e-10 (including negative ones) are set to zero
+			cPar.trace_G+=gsl_vector_get (eval, i);
+		}
+		cPar.trace_G/=(double)eval->size;	//trace_G ends up as the mean of the thresholded eigenvalues
+		cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+		
+		//calculate UtW and Uty
+		CalcUtX (U, W, UtW);
+		CalcUtX (U, Y, UtY);
+
+		//calculate variance component and beta estimates
+		//and then obtain predicted values
+		if (cPar.n_ph==1) {
+			gsl_vector *beta=gsl_vector_alloc (W->size2);
+			gsl_vector *se_beta=gsl_vector_alloc (W->size2);
+			
+			double lambda, logl, vg, ve;
+			gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+
+			//obtain estimates ('R' selects the REMLE fit, per the messages printed below)
+			CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, lambda, logl);
+			CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, lambda, vg, ve, beta, se_beta);
+
+			cout<<"REMLE estimate for vg in the null model = "<<vg<<endl;
+			cout<<"REMLE estimate for ve in the null model = "<<ve<<endl;
+			cPar.vg_remle_null=vg; cPar.ve_remle_null=ve;
+			
+			//obtain Y_hat from fixed effects: Y_hat[:,0] = W_full * beta
+			gsl_vector_view Yhat_col=gsl_matrix_column (Y_hat, 0);			
+			gsl_blas_dgemv (CblasNoTrans, 1.0, W_full, beta, 0.0, &Yhat_col.vector);
+			
+			//obtain H = ve*I + vg*G_full (G_full is scaled in place)
+			gsl_matrix_set_identity (H_full);
+			gsl_matrix_scale (H_full, ve);
+			gsl_matrix_scale (G_full, vg);
+			gsl_matrix_add (H_full, G_full);
+			
+			//free matrices			
+			gsl_vector_free(beta);
+			gsl_vector_free(se_beta);
+		} else {			
+			gsl_matrix *Vg=gsl_matrix_alloc (cPar.n_ph, cPar.n_ph);
+			gsl_matrix *Ve=gsl_matrix_alloc (cPar.n_ph, cPar.n_ph);
+			gsl_matrix *B=gsl_matrix_alloc (cPar.n_ph, W->size2);
+			gsl_matrix *se_B=gsl_matrix_alloc (cPar.n_ph, W->size2);
+			
+			//obtain estimates
+			CalcMvLmmVgVeBeta (eval, UtW, UtY, cPar.em_iter, cPar.nr_iter, cPar.em_prec, cPar.nr_prec, cPar.l_min, cPar.l_max, cPar.n_region, Vg, Ve, B, se_B);
+			
+			cout<<"REMLE estimate for Vg in the null model: "<<endl;
+			for (size_t i=0; i<Vg->size1; i++) {
+				for (size_t j=0; j<=i; j++) {	//print the lower triangle only
+					cout<<gsl_matrix_get(Vg, i, j)<<"\t";
+				}
+				cout<<endl;
+			}
+			cout<<"REMLE estimate for Ve in the null model: "<<endl;
+			for (size_t i=0; i<Ve->size1; i++) {
+				for (size_t j=0; j<=i; j++) {
+					cout<<gsl_matrix_get(Ve, i, j)<<"\t";
+				}
+				cout<<endl;
+			}
+			cPar.Vg_remle_null.clear();
+			cPar.Ve_remle_null.clear();
+			for (size_t i=0; i<Vg->size1; i++) {
+				for (size_t j=i; j<Vg->size2; j++) {	//store the upper triangle (diagonal included), row by row
+					cPar.Vg_remle_null.push_back(gsl_matrix_get (Vg, i, j) );
+					cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) );
+				}
+			}
+			
+			//obtain Y_hat from fixed effects: Y_hat = W_full * B^T
+			gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat);
+			
+			//obtain H: KroneckerSym combines G_full and Vg, then Ve is added to each n_ph x n_ph diagonal block
+			KroneckerSym(G_full, Vg, H_full);
+			for (size_t i=0; i<G_full->size1; i++) {
+				gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2);
+				gsl_matrix_add (&H_sub.matrix, Ve);
+			}
+			
+			//free matrices					
+			gsl_matrix_free (Vg);
+			gsl_matrix_free (Ve);
+			gsl_matrix_free (B);
+			gsl_matrix_free (se_B);
+		}
+					
+		PRDT cPRDT;
+		
+		cPRDT.CopyFromParam(cPar);
+		
+		cout<<"Predicting Missing Phentypes ... "<<endl;
+		time_start=clock();	
+		cPRDT.MvnormPrdt(Y_hat, H_full, Y_full);	//NOTE(review): presumably fills the missing entries of Y_full, which is what gets written below -- confirm in prdt.cpp
+		cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+
+		cPRDT.WriteFiles(Y_full);
+		
+		gsl_matrix_free(Y);
+		gsl_matrix_free(W);		
+		gsl_matrix_free(G);
+		gsl_matrix_free(U); 
+		gsl_matrix_free(UtW);
+		gsl_matrix_free(UtY);
+		gsl_vector_free(eval);
+		
+		gsl_matrix_free(Y_full);
+		gsl_matrix_free(Y_hat);
+		gsl_matrix_free(W_full);
+		gsl_matrix_free(G_full);		
+		gsl_matrix_free(H_full);
+	}
+	
+	
+	//Generate Kinship matrix (a_mode 21 writes it with suffix "cXX", a_mode 22 with suffix "sXX")
+	if (cPar.a_mode==21 || cPar.a_mode==22) {  
+		cout<<"Calculating Relatedness Matrix ... "<<endl;
+		
+		gsl_matrix *G=gsl_matrix_alloc (cPar.ni_total, cPar.ni_total);
+		
+		time_start=clock();
+		cPar.CalcKin (G);
+		cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		if (cPar.error==true) {cout<<"error! fail to calculate relatedness matrix. "<<endl; return;}	//NOTE(review): returns without freeing G
+		
+		if (cPar.a_mode==21) {
+			cPar.WriteMatrix (G, "cXX");
+		} else {
+			cPar.WriteMatrix (G, "sXX");
+		}
+		
+		gsl_matrix_free (G);
+	}
+	
+	
+	//LM (a_mode 51-54): linear model without a relatedness matrix
+	if (cPar.a_mode==51 || cPar.a_mode==52 || cPar.a_mode==53 || cPar.a_mode==54) {  //Fit LM
+		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);	
+		
+		//set covariates matrix W and phenotype matrix Y		
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, Y, 0);
+		
+		//Fit LM or mvLM (only the single-phenotype case is active; the mvLM branch below is commented out, so n_ph>1 does nothing here)
+		if (cPar.n_ph==1) {			
+			LM cLm;
+			cLm.CopyFromParam(cPar);
+			
+			gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
+			//input format is chosen by which file name is set: gene expression, then PLINK, otherwise BIMBAM
+			if (!cPar.file_gene.empty()) {		
+				cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype
+			} else if (!cPar.file_bfile.empty()) {
+				cLm.AnalyzePlink (W, &Y_col.vector);
+			} else {
+				cLm.AnalyzeBimbam (W, &Y_col.vector);
+			}
+			
+			cLm.WriteFiles();
+			cLm.CopyToParam(cPar);
+		}
+		/*
+		else {			 
+			MVLM cMvlm;
+			cMvlm.CopyFromParam(cPar);			
+			
+			if (!cPar.file_bfile.empty()) {
+				cMvlm.AnalyzePlink (W, Y);
+			} else {
+				cMvlm.AnalyzeBimbam (W, Y);
+			}
+			
+			cMvlm.WriteFiles();
+			cMvlm.CopyToParam(cPar);
+		}
+		*/
+		//release all matrices and vectors
+		gsl_matrix_free (Y);
+		gsl_matrix_free (W);
+	} 
+
+
+	//VC estimation with one or multiple kinship matrices
+	//REML approach only
+	//if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp (NOTE(review): the test below is a_mode==61, not 5 -- this remark looks stale; confirm)
+	//for one phenotype only; 
+	if (cPar.a_mode==61) {
+		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
+		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc );	//n_vc square kinship matrices stored side by side (column blocks)
+
+		//set covariates matrix W and phenotype matrix Y		
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, Y, 0);
+
+		//read kinship matrices: several via file_mk, or a single one via file_kin
+		if (!(cPar.file_mk).empty()) {
+		  ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+		  if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}	//NOTE(review): returns without freeing Y, W, G
+	
+		  //center matrix G, and obtain v_traceG (mean diagonal of each centered block)
+		  double d=0;
+		  (cPar.v_traceG).clear();
+		  for (size_t i=0; i<cPar.n_vc; i++) {
+		    gsl_matrix_view G_sub=gsl_matrix_submatrix (G, 0, i*G->size1, G->size1, G->size1);	//i-th square block of G
+		    CenterMatrix (&G_sub.matrix);
+		    d=0;
+		    for (size_t j=0; j<G->size1; j++) {
+		      d+=gsl_matrix_get (&G_sub.matrix, j, j);
+		    }
+		    d/=(double)G->size1;
+		    (cPar.v_traceG).push_back(d);
+		  }
+		} else if (!(cPar.file_kin).empty()) {
+			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}	//NOTE(review): returns without freeing Y, W, G
+						
+			//center matrix G
+			CenterMatrix (G);
+
+			(cPar.v_traceG).clear();
+			double d=0;
+			for (size_t j=0; j<G->size1; j++) {
+			  d+=gsl_matrix_get (G, j, j);
+			}
+			d/=(double)G->size1;
+			(cPar.v_traceG).push_back(d);	//single entry: mean diagonal of the centered G
+		}
+			/*
+			//eigen-decomposition and calculate trace_G
+			cout<<"Start Eigen-Decomposition..."<<endl;
+			time_start=clock();	
+	
+			if (cPar.a_mode==31) {
+				cPar.trace_G=EigenDecomp (G, U, eval, 1);
+			} else {
+				cPar.trace_G=EigenDecomp (G, U, eval, 0);
+			}
+
+			cPar.trace_G=0.0;
+			for (size_t i=0; i<eval->size; i++) {
+				if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
+				cPar.trace_G+=gsl_vector_get (eval, i);
+			}
+			cPar.trace_G/=(double)eval->size;
+
+			cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+		} else {
+			ReadFile_eigenU (cPar.file_ku, cPar.error, U);
+			if (cPar.error==true) {cout<<"error! fail to read the U file. "<<endl; return;}
+			
+			ReadFile_eigenD (cPar.file_kd, cPar.error, eval);			
+			if (cPar.error==true) {cout<<"error! fail to read the D file. "<<endl; return;}
+			
+			cPar.trace_G=0.0;
+			for (size_t i=0; i<eval->size; i++) {
+				if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);}
+			  	cPar.trace_G+=gsl_vector_get(eval, i);
+			}
+			cPar.trace_G/=(double)eval->size;
+		}
+		*/
+		//fit multiple variance components
+		if (cPar.n_ph==1) {
+		  //		  if (cPar.n_vc==1) {
+		    /*
+		    //calculate UtW and Uty	
+		    CalcUtX (U, W, UtW);
+		    CalcUtX (U, Y, UtY);
+
+		    gsl_vector_view beta=gsl_matrix_row (B, 0);
+		    gsl_vector_view se_beta=gsl_matrix_row (se_B, 0);
+		    gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+
+		    CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
+		    CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector);
+
+		    cPar.beta_mle_null.clear();
+		    cPar.se_beta_mle_null.clear();
+		    for (size_t i=0; i<B->size2; i++) {
+		      cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) );
+		      cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) );
+		    }
+
+		    CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
+		    CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector);
+		    cPar.beta_remle_null.clear();
+		    cPar.se_beta_remle_null.clear();
+		    for (size_t i=0; i<B->size2; i++) {
+		      cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) );
+		      cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) );
+		    }
+				
+		    CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
+		    cPar.PrintSummary();
+				
+		    //calculate and output residuals
+		    if (cPar.a_mode==5) {
+		      gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1);
+		      gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1);
+		      gsl_vector *u_hat=gsl_vector_alloc (Y->size1);
+		      gsl_vector *e_hat=gsl_vector_alloc (Y->size1);
+		      gsl_vector *y_hat=gsl_vector_alloc (Y->size1);
+					
+		      //obtain Utu and Ute
+		      gsl_vector_memcpy (y_hat, &UtY_col.vector);
+		      gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat);
+		      
+		      double d, u, e;
+		      for (size_t i=0; i<eval->size; i++) {
+			d=gsl_vector_get (eval, i);
+			u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
+			e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
+			gsl_vector_set (Utu_hat, i, u);
+			gsl_vector_set (Ute_hat, i, e);
+		      }
+					
+		      //obtain u and e
+		      gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat);
+		      gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat);
+		      
+		      //output residuals					
+		      cPar.WriteVector(u_hat, "residU");
+		      cPar.WriteVector(e_hat, "residE");
+		      
+		      gsl_vector_free(u_hat);
+		      gsl_vector_free(e_hat);
+		      gsl_vector_free(y_hat);
+		    }	
+*/	
+		  //		  } else {
+		    gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
+		    VC cVc;
+		    cVc.CopyFromParam(cPar); 
+		    cVc.CalcVCreml (G, W, &Y_col.vector);			
+		    cVc.CopyToParam(cPar);
+
+		    //obtain pve from sigma2
+		    //obtain se_pve from se_sigma2
+		    
+		    //}
+		} 
+		//NOTE(review): Y, W and G allocated at the top of this block are never passed to gsl_matrix_free
+		
+	}
+	
+	
+	//LMM or mvLMM or Eigen-Decomposition
+	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==31) {  //Fit LMM or mvLMM or eigen
+		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
+		gsl_matrix *B=gsl_matrix_alloc (Y->size2, W->size2);	//B is a d by c matrix
+		gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2);
+		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1);
+		gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); 
+		gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2);
+		gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2);
+		gsl_vector *eval=gsl_vector_alloc (Y->size1);
+				
+		//set covariates matrix W and phenotype matrix Y		
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, Y, 0);
+				
+		//read relatedness matrix G and decompose it, or (if no kinship file is given) read a precomputed U and eval
+		if (!(cPar.file_kin).empty()) {
+			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}	//NOTE(review): returns without freeing the nine buffers allocated above
+						
+			//center matrix G
+			CenterMatrix (G);
+			
+			//eigen-decomposition and calculate trace_G
+			cout<<"Start Eigen-Decomposition..."<<endl;
+			time_start=clock();	
+			//the last EigenDecomp argument is 1 only for the eigen-output mode (31); its meaning is defined in the EigenDecomp implementation
+			if (cPar.a_mode==31) {
+				cPar.trace_G=EigenDecomp (G, U, eval, 1);
+			} else {
+				cPar.trace_G=EigenDecomp (G, U, eval, 0);
+			}
+			//the value returned by EigenDecomp is discarded; trace_G is recomputed as the mean of the thresholded eigenvalues
+			cPar.trace_G=0.0;
+			for (size_t i=0; i<eval->size; i++) {
+				if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
+				cPar.trace_G+=gsl_vector_get (eval, i);
+			}
+			cPar.trace_G/=(double)eval->size;
+
+			cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+		} else {
+			ReadFile_eigenU (cPar.file_ku, cPar.error, U);
+			if (cPar.error==true) {cout<<"error! fail to read the U file. "<<endl; return;}	//NOTE(review): same leak on early return
+			
+			ReadFile_eigenD (cPar.file_kd, cPar.error, eval);			
+			if (cPar.error==true) {cout<<"error! fail to read the D file. "<<endl; return;}	//NOTE(review): same leak on early return
+			
+			cPar.trace_G=0.0;
+			for (size_t i=0; i<eval->size; i++) {
+				if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);}
+			  	cPar.trace_G+=gsl_vector_get(eval, i);
+			}
+			cPar.trace_G/=(double)eval->size;
+		}
+		
+		if (cPar.a_mode==31) {	//eigen-decomposition only: write U and eval, then skip all model fitting
+			cPar.WriteMatrix(U, "eigenU");
+			cPar.WriteVector(eval, "eigenD");
+		} else {
+			//calculate UtW and Uty	
+			CalcUtX (U, W, UtW);
+			CalcUtX (U, Y, UtY);			
+
+			//calculate REMLE/MLE estimate and pve for univariate model
+			if (cPar.n_ph==1) {
+				gsl_vector_view beta=gsl_matrix_row (B, 0);	//beta and se_beta are views into row 0 of B and se_B
+				gsl_vector_view se_beta=gsl_matrix_row (se_B, 0);
+				gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+				//'L' pass: results go to the *_mle_null fields
+				CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
+				CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector);
+
+				cPar.beta_mle_null.clear();
+				cPar.se_beta_mle_null.clear();
+				for (size_t i=0; i<B->size2; i++) {
+					cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) );
+					cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) );
+				}
+				//'R' pass: results go to the *_remle_null fields; row 0 of B/se_B is reused
+				CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
+				CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector);
+				cPar.beta_remle_null.clear();
+				cPar.se_beta_remle_null.clear();
+				for (size_t i=0; i<B->size2; i++) {
+					cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) );
+					cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) );
+				}
+				
+				CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
+				cPar.PrintSummary();
+				
+				//calculate and output residuals (a_mode 5 only)
+				if (cPar.a_mode==5) {
+					gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1);
+					gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1);
+					gsl_vector *u_hat=gsl_vector_alloc (Y->size1);
+					gsl_vector *e_hat=gsl_vector_alloc (Y->size1);
+					gsl_vector *y_hat=gsl_vector_alloc (Y->size1);
+					
+					//obtain Utu and Ute; first y_hat = UtY - UtW*beta (beta is row 0 of B as left by the 'R' pass above)
+					gsl_vector_memcpy (y_hat, &UtY_col.vector);
+					gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat);
+					
+					double d, u, e;
+					for (size_t i=0; i<eval->size; i++) {
+						d=gsl_vector_get (eval, i);
+						u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);	//fraction lambda*d/(lambda*d+1) of y_hat[i]
+						e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);	//remaining fraction 1/(lambda*d+1), so u+e = y_hat[i]
+						gsl_vector_set (Utu_hat, i, u);
+						gsl_vector_set (Ute_hat, i, e);
+					}
+					
+					//obtain u and e: u_hat = U*Utu_hat, e_hat = U*Ute_hat
+					gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat);
+					gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat);
+					
+					//output residuals					
+					cPar.WriteVector(u_hat, "residU");
+					cPar.WriteVector(e_hat, "residE");
+					//NOTE(review): Utu_hat and Ute_hat are not freed below
+					gsl_vector_free(u_hat);
+					gsl_vector_free(e_hat);
+					gsl_vector_free(y_hat);
+				}							
+			} 
+			
+			//Fit LMM or mvLMM (a_mode 5 stops after the null-model estimates above)
+			if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) {
+				if (cPar.n_ph==1) {			
+					LMM cLmm;
+					cLmm.CopyFromParam(cPar);
+					
+					gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
+					gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+					//input format is chosen by which file name is set: gene expression, then PLINK, otherwise BIMBAM
+					if (!cPar.file_gene.empty()) {		
+						cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype
+					} else if (!cPar.file_bfile.empty()) {
+						cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
+					} else {
+						cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
+					}	
+					
+					cLmm.WriteFiles();
+					cLmm.CopyToParam(cPar);
+				} else {			 
+					MVLMM cMvlmm;
+					cMvlmm.CopyFromParam(cPar);			
+					//multivariate case: PLINK or BIMBAM input only (no gene-file branch here)
+					if (!cPar.file_bfile.empty()) {
+						cMvlmm.AnalyzePlink (U, eval, UtW, UtY);
+					} else {
+						cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY);
+					}
+					
+					cMvlmm.WriteFiles();
+					cMvlmm.CopyToParam(cPar);
+				}
+			}
+		}
+		
+				
+		//release all matrices and vectors
+		gsl_matrix_free (Y);
+		gsl_matrix_free (W);
+		gsl_matrix_free(B);
+		gsl_matrix_free(se_B);
+		gsl_matrix_free (G);	
+		gsl_matrix_free (U);
+		gsl_matrix_free (UtW);
+		gsl_matrix_free (UtY);
+		gsl_vector_free (eval);
+	} 
+	
+	
+	//BSLMM (a_mode 11, 12 or 13)
+	if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		gsl_vector *y=gsl_vector_alloc (cPar.ni_test);
+		gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt);	
+		gsl_matrix *G=gsl_matrix_alloc (y->size, y->size);
+		gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test);	//individuals by SNPs; holds X first and U^T X later in the rho!=1 branch
+		
+		//set covariates matrix W and phenotype vector y		
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, y, 0);
+		
+		//center y, even for case/control data
+		cPar.pheno_mean=CenterVector(y);
+
+		//run bslmm if rho==1 (no eigen-decomposition; MCMC is called directly on X and y)
+		if (cPar.rho_min==1 && cPar.rho_max==1) {
+		  //read genotypes X (not UtX)
+		  cPar.ReadGenotypes (UtX, G, false);
+
+		  //perform BSLMM analysis
+		  BSLMM cBslmm;
+		  cBslmm.CopyFromParam(cPar);
+		  time_start=clock();	
+		  cBslmm.MCMC(UtX, y);
+		  cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		  cBslmm.CopyToParam(cPar);	//NOTE(review): G is freed only in the else branch below, so it is not freed on this path
+		  //else, if rho!=1
+		} else {
+		gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); 
+		gsl_vector *eval=gsl_vector_alloc (y->size);
+		gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2);
+		gsl_vector *Uty=gsl_vector_alloc (y->size);
+
+		
+		//obtain relatedness matrix G: from the kinship file if given, otherwise via ReadGenotypes		
+		if (!(cPar.file_kin).empty()) {		
+			cPar.ReadGenotypes (UtX, G, false);
+			
+			//read relatedness matrix G
+			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}	//NOTE(review): returns without freeing any of the buffers allocated above
+			
+			//center matrix G
+			CenterMatrix (G);
+		} else {
+			cPar.ReadGenotypes (UtX, G, true);	//NOTE(review): the 'true' flag presumably makes ReadGenotypes also compute G from the genotypes -- confirm in param.cpp
+		}
+		
+		//eigen-decomposition and calculate trace_G
+		cout<<"Start Eigen-Decomposition..."<<endl;
+		time_start=clock();
+		cPar.trace_G=EigenDecomp (G, U, eval, 0);
+		cPar.trace_G=0.0;	//the value returned by EigenDecomp is discarded; trace_G is recomputed from the thresholded eigenvalues
+		for (size_t i=0; i<eval->size; i++) {
+			if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
+			cPar.trace_G+=gsl_vector_get (eval, i);
+		}
+		cPar.trace_G/=(double)eval->size;
+		cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);			
+		
+		//calculate UtW and Uty		
+		CalcUtX (U, W, UtW);
+		CalcUtX (U, y, Uty);
+		
+		//calculate REMLE/MLE estimate and pve
+		CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
+		CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
+		CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
+		
+		cPar.PrintSummary();
+				
+		//Create and calculate UtX; this uses a large amount of memory
+		cout<<"Calculating UtX..."<<endl;
+		time_start=clock();							
+		CalcUtX (U, UtX);	//NOTE(review): two-argument form, presumably overwrites UtX (which holds X here) with U^T X -- confirm
+		cPar.time_UtX=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//perform BSLMM analysis
+		BSLMM cBslmm;
+		cBslmm.CopyFromParam(cPar);
+		time_start=clock();	
+		if (cPar.a_mode==12) {  //ridge regression				
+			cBslmm.RidgeR(U, UtX, Uty, eval, cPar.l_remle_null);
+		} else {	//Run MCMC
+			cBslmm.MCMC(U, UtX, Uty, eval, y);
+		}
+		cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		cBslmm.CopyToParam(cPar);
+		
+		//release all matrices and vectors
+		gsl_matrix_free (G);	
+		gsl_matrix_free (U);
+		gsl_matrix_free (UtW);
+		gsl_vector_free (eval);
+		gsl_vector_free (Uty);
+
+		}
+		gsl_matrix_free (W);
+		gsl_vector_free (y);
+		gsl_matrix_free (UtX);
+	} 
+	
+	
+	//total processor time for this call, in minutes
+	cPar.time_total=(clock()-time_begin)/(double(CLOCKS_PER_SEC)*60.0);
+	
+	return;
+}
+
+
+
+
+void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) 
+{
+	string file_str;
+	file_str="./output/"+cPar.file_out;
+	file_str+=".log.txt";
+	
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing log file: "<<file_str.c_str()<<endl; return;}
+	
+	outfile<<"##"<<endl;
+	outfile<<"## GEMMA Version = "<<version<<endl;
+	
+	outfile<<"##"<<endl;
+	outfile<<"## Command Line Input = ";
+	for(int i = 1; i < argc; i++) {	
+		outfile<<argv[i]<<" ";
+	}
+	outfile<<endl;
+
+	outfile<<"##"<<endl;
+	time_t  rawtime; 
+	time(&rawtime);
+	tm *ptm = localtime (&rawtime);
+
+	outfile<<"## Date = "<<asctime(ptm)<<endl;
+	  //ptm->tm_year<<":"<<ptm->tm_month<<":"<<ptm->tm_day":"<<ptm->tm_hour<<":"<<ptm->tm_min<<endl;
+	
+	outfile<<"##"<<endl;
+	outfile<<"## Summary Statistics:"<<endl;
+	outfile<<"## number of total individuals = "<<cPar.ni_total<<endl;	
+	if (cPar.a_mode==43) {
+		outfile<<"## number of analyzed individuals = "<<cPar.ni_cvt<<endl;
+		outfile<<"## number of individuals with full phenotypes = "<<cPar.ni_test<<endl;
+	} else {
+		outfile<<"## number of analyzed individuals = "<<cPar.ni_test<<endl;
+	}
+	outfile<<"## number of covariates = "<<cPar.n_cvt<<endl;
+	outfile<<"## number of phenotypes = "<<cPar.n_ph<<endl;
+	if (cPar.a_mode==43) {
+		outfile<<"## number of observed data = "<<cPar.np_obs<<endl;
+		outfile<<"## number of missing data = "<<cPar.np_miss<<endl;
+	}
+	if (cPar.a_mode==61) {
+		outfile<<"## number of variance components = "<<cPar.n_vc<<endl;
+	}
+		
+	if (!(cPar.file_gene).empty()) {
+		outfile<<"## number of total genes = "<<cPar.ng_total<<endl;
+		outfile<<"## number of analyzed genes = "<<cPar.ng_test<<endl;		
+	} else if (cPar.file_epm.empty()) {	
+		outfile<<"## number of total SNPs = "<<cPar.ns_total<<endl;	
+		outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	} else {
+		outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	}
+	
+	if (cPar.a_mode==13) {
+		outfile<<"## number of cases = "<<cPar.ni_case<<endl;
+		outfile<<"## number of controls = "<<cPar.ni_control<<endl;
+	}
+
+
+	if (cPar.a_mode==61) {
+	  //	        outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl;
+		if (cPar.n_ph==1) {
+		  outfile<<"## pve estimate in the null model = ";
+		  for (size_t i=0; i<cPar.v_pve.size(); i++) {
+		    outfile<<"  "<<cPar.v_pve[i];
+		  }
+		  outfile<<endl;
+
+		  outfile<<"## se(pve) in the null model = ";
+		  for (size_t i=0; i<cPar.v_se_pve.size(); i++) {
+		    outfile<<"  "<<cPar.v_se_pve[i];
+		  }
+		  outfile<<endl;
+
+		  outfile<<"## sigma2 estimate in the null model = ";
+		  for (size_t i=0; i<cPar.v_sigma2.size(); i++) {
+		    outfile<<"  "<<cPar.v_sigma2[i];
+		  }
+		  outfile<<endl;
+
+		  outfile<<"## se(sigma2) in the null model = ";
+		  for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) {
+		    outfile<<"  "<<cPar.v_se_sigma2[i];
+		  }
+		  outfile<<endl;
+		  /*
+			outfile<<"## beta estimate in the null model = ";
+			for (size_t i=0; i<cPar.beta_remle_null.size(); i++) {
+				outfile<<"  "<<cPar.beta_remle_null[i];
+			}
+			outfile<<endl;
+			outfile<<"## se(beta) = ";
+			for (size_t i=0; i<cPar.se_beta_remle_null.size(); i++) {
+				outfile<<"  "<<cPar.se_beta_remle_null[i];
+			}
+			outfile<<endl;
+		  */
+		}
+	}
+	
+	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl;
+		outfile<<"## MLE log-likelihood in the null model = "<<cPar.logl_mle_H0<<endl;
+		if (cPar.n_ph==1) {
+			//outfile<<"## lambda REMLE estimate in the null (linear mixed) model = "<<cPar.l_remle_null<<endl;
+			//outfile<<"## lambda MLE estimate in the null (linear mixed) model = "<<cPar.l_mle_null<<endl;	
+			outfile<<"## pve estimate in the null model = "<<cPar.pve_null<<endl;
+			outfile<<"## se(pve) in the null model = "<<cPar.pve_se_null<<endl;	
+			outfile<<"## vg estimate in the null model = "<<cPar.vg_remle_null<<endl;
+			outfile<<"## ve estimate in the null model = "<<cPar.ve_remle_null<<endl;	
+			outfile<<"## beta estimate in the null model = ";
+			for (size_t i=0; i<cPar.beta_remle_null.size(); i++) {
+				outfile<<"  "<<cPar.beta_remle_null[i];
+			}
+			outfile<<endl;
+			outfile<<"## se(beta) = ";
+			for (size_t i=0; i<cPar.se_beta_remle_null.size(); i++) {
+				outfile<<"  "<<cPar.se_beta_remle_null[i];
+			}
+			outfile<<endl;
+			
+		} else {
+			size_t c;
+			outfile<<"## REMLE estimate for Vg in the null model: "<<endl;			
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Vg_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(Vg): "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<sqrt(cPar.VVg_remle_null[c])<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## REMLE estimate for Ve in the null model: "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Ve_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(Ve): "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<sqrt(cPar.VVe_remle_null[c])<<"\t";
+				}
+				outfile<<endl;
+			}
+			
+			outfile<<"## MLE estimate for Vg in the null model: "<<endl;
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<cPar.n_ph; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Vg_mle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(Vg): "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<sqrt(cPar.VVg_mle_null[c])<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## MLE estimate for Ve in the null model: "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<cPar.n_ph; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Ve_mle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(Ve): "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<sqrt(cPar.VVe_mle_null[c])<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## estimate for B (d by c) in the null model (columns correspond to the covariates provided in the file): "<<endl;
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<cPar.n_cvt; j++) {
+					c=i*cPar.n_cvt+j;
+					outfile<<cPar.beta_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(B): "<<endl;
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<cPar.n_cvt; j++) {
+					c=i*cPar.n_cvt+j;
+					outfile<<cPar.se_beta_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+		}
+	}
+	
+	/*
+	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		if (cPar.n_ph==1) {
+			outfile<<"## REMLE vg estimate in the null model = "<<cPar.vg_remle_null<<endl;
+			outfile<<"## REMLE ve estimate in the null model = "<<cPar.ve_remle_null<<endl;	
+		} else {
+			size_t c;
+			outfile<<"## REMLE estimate for Vg in the null model: "<<endl;			
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Vg_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## REMLE estimate for Ve in the null model: "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Ve_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+		}
+	}
+	 */
+	
+	
+	if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		outfile<<"## estimated mean = "<<cPar.pheno_mean<<endl;
+	}
+	
+	if (cPar.a_mode==11 || cPar.a_mode==13) {	
+		outfile<<"##"<<endl;
+		outfile<<"## MCMC related:"<<endl;	
+		outfile<<"## initial value of h = "<<cPar.cHyp_initial.h<<endl;
+		outfile<<"## initial value of rho = "<<cPar.cHyp_initial.rho<<endl;
+		outfile<<"## initial value of pi = "<<exp(cPar.cHyp_initial.logp)<<endl;
+		outfile<<"## initial value of |gamma| = "<<cPar.cHyp_initial.n_gamma<<endl;
+		outfile<<"## random seed = "<<cPar.randseed<<endl;
+		outfile<<"## acceptance ratio = "<<(double)cPar.n_accept/(double)((cPar.w_step+cPar.s_step)*cPar.n_mh)<<endl;
+	}
+	
+	outfile<<"##"<<endl;
+	outfile<<"## Computation Time:"<<endl;
+	outfile<<"## total computation time = "<<cPar.time_total<<" min "<<endl;
+	outfile<<"## computation time break down: "<<endl;
+	if (cPar.a_mode==21 || cPar.a_mode==22 || cPar.a_mode==11 || cPar.a_mode==13) {
+		outfile<<"##      time on calculating relatedness matrix = "<<cPar.time_G<<" min "<<endl;
+	}
+	if (cPar.a_mode==31) {
+		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
+	}
+	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
+		outfile<<"##      time on calculating UtX = "<<cPar.time_UtX<<" min "<<endl;		
+	}
+	if ((cPar.a_mode>=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) {
+		outfile<<"##      time on optimization = "<<cPar.time_opt<<" min "<<endl;
+	}
+	if (cPar.a_mode==11 || cPar.a_mode==13) {
+		outfile<<"##      time on proposal = "<<cPar.time_Proposal<<" min "<<endl;
+		outfile<<"##      time on mcmc = "<<cPar.time_opt<<" min "<<endl;
+		outfile<<"##      time on Omega = "<<cPar.time_Omega<<" min "<<endl;
+	}
+	if (cPar.a_mode==41 || cPar.a_mode==42) {
+		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
+	}
+	if (cPar.a_mode==43) {
+		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
+		outfile<<"##      time on predicting phenotypes = "<<cPar.time_opt<<" min "<<endl;
+	}
+	outfile<<"##"<<endl;
+	
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
diff --git a/gemma.h b/gemma.h
new file mode 100644
index 0000000..acb1309
--- /dev/null
+++ b/gemma.h
@@ -0,0 +1,52 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __GEMMA_H__                
+#define __GEMMA_H__
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#else
+#include "param.h"
+#endif
+
+using namespace std;
+
+// Driver object of the program. Judging by the member names it prints the
+// banner/help/licence text, parses the command line into a PARAM object,
+// runs the requested analysis and writes the log file -- NOTE(review): only
+// the declarations are visible in this header.
+class GEMMA {
+public:
+	// Release information kept as plain strings.
+	string version;
+	string date;
+	string year;
+
+	// Default constructor (defined outside this header).
+	GEMMA();
+
+	// Console text helpers.
+	void PrintHeader();
+	void PrintHelp(size_t option);
+	void PrintLicense();
+
+	// Operations working on the PARAM object passed in by reference.
+	void Assign(int argc, char **argv, PARAM &cPar);
+	void BatchRun(PARAM &cPar);
+	void WriteLog(int argc, char **argv, PARAM &cPar);
+};
+
+
+#endif
+
diff --git a/gzstream.cpp b/gzstream.cpp
new file mode 100644
index 0000000..bbb4ba8
--- /dev/null
+++ b/gzstream.cpp
@@ -0,0 +1,165 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// ============================================================================
+//
+// File          : gzstream.C
+// Revision      : $Revision: 1.7 $
+// Revision_date : $Date: 2003/01/08 14:41:27 $
+// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
+// 
+// Standard streambuf implementation following Nicolai Josuttis, "The 
+// Standard C++ Library".
+// ============================================================================
+
+#include "gzstream.h"
+#include <iostream>
+#include <string.h>  // for memcpy
+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See header file for user classes.
+// ----------------------------------------------------------------------------
+
+// --------------------------------------
+// class gzstreambuf:
+// --------------------------------------
+
+// Opens the gzip file `name` through gzopen(). The zlib mode string is "rb"
+// when std::ios::in is set in open_mode, "wb" when only std::ios::out is set,
+// and plain "b" when neither is set. Returns `this` on success; returns a
+// null pointer when the buffer is already open, when the mode asks for
+// ate/app or for combined in+out, or when gzopen() yields a null handle.
+gzstreambuf* gzstreambuf::open( const char* name, int open_mode) {
+    if ( is_open())
+        return (gzstreambuf*)0;
+    mode = open_mode;
+    const bool wants_read  = (mode & std::ios::in)  != 0;
+    const bool wants_write = (mode & std::ios::out) != 0;
+    // no append nor read/write mode
+    if ((mode & std::ios::ate) || (mode & std::ios::app)
+        || (wants_read && wants_write))
+        return (gzstreambuf*)0;
+    // assemble the mode string character by character
+    char fmode[10];
+    int  len = 0;
+    if ( wants_read)
+        fmode[len++] = 'r';
+    else if ( wants_write)
+        fmode[len++] = 'w';
+    fmode[len++] = 'b';
+    fmode[len]   = '\0';
+    file = gzopen( name, fmode);
+    if (file == 0)
+        return (gzstreambuf*)0;
+    opened = 1;
+    return this;
+}
+
+// Flushes pending output via sync() and closes the gzip handle. Returns
+// `this` when the buffer was open and gzclose() reported Z_OK, otherwise a
+// null pointer. The buffer is marked closed before gzclose() is called, so
+// it counts as closed even when gzclose() fails.
+gzstreambuf * gzstreambuf::close() {
+    if ( ! is_open())
+        return (gzstreambuf*)0;
+    sync();
+    opened = 0;
+    const int status = gzclose( file);
+    return (status == Z_OK) ? this : (gzstreambuf*)0;
+}
+
+// Refills the get area from the gzip file (input buffers only). Returns the
+// next character as an unsigned char value, or EOF when the buffer is not
+// open for reading or gzread() delivers nothing.
+int gzstreambuf::underflow() {
+    // unread characters are still buffered: just report the current one
+    if ( gptr() && ( gptr() < egptr()))
+        return * reinterpret_cast<unsigned char *>( gptr());
+
+    if ( ! ((mode & std::ios::in) && opened))
+        return EOF;
+
+    // Josuttis' implementation of inbuf: the first 4 bytes of `buffer` are a
+    // putback area that receives up to 4 of the most recently read characters
+    char* const data_start = buffer + 4;
+    int n_putback = gptr() - eback();
+    if ( n_putback > 4)
+        n_putback = 4;
+    memcpy( data_start - n_putback, gptr() - n_putback, n_putback);
+
+    const int num = gzread( file, data_start, bufferSize-4);
+    if (num <= 0) // ERROR or EOF
+        return EOF;
+
+    // reset buffer pointers
+    setg( data_start - n_putback,   // beginning of putback area
+          data_start,               // read position
+          data_start + num);        // end of buffer
+
+    // return next character
+    return * reinterpret_cast<unsigned char *>( gptr());
+}
+
+// Writes everything between pbase() and pptr() to the gzip file, kept
+// separate from overflow() and sync(). Returns the number of bytes written
+// and rewinds the put pointer, or returns EOF (leaving the put area as is)
+// when gzwrite() did not accept the whole range.
+int gzstreambuf::flush_buffer() {
+    const int pending = pptr() - pbase();
+    const int written = gzwrite( file, pbase(), pending);
+    if ( written != pending)
+        return EOF;
+    pbump( -pending);
+    return pending;
+}
+
+// Called when the put area is full (output buffers only). Stores `c` in the
+// put area unless it is EOF, then flushes the put area to the file. Returns
+// `c` on success and EOF when the buffer is not open for writing or the
+// flush fails.
+int gzstreambuf::overflow( int c) {
+    const bool writable = ( mode & std::ios::out) && opened;
+    if ( ! writable)
+        return EOF;
+    if (c != EOF) {
+        *pptr() = c;
+        pbump(1);
+    }
+    return ( flush_buffer() == EOF) ? EOF : c;
+}
+
+// Flushes the put area if it holds any characters. Uses flush_buffer()
+// rather than overflow( EOF), which caused improper behavior with std::endl
+// and flush() (bug reported by Vincent Ricard). Returns 0 on success and -1
+// when the flush fails.
+int gzstreambuf::sync() {
+    const bool has_pending = pptr() && pptr() > pbase();
+    if ( has_pending && flush_buffer() == EOF)
+        return -1;
+    return 0;
+}
+
+// --------------------------------------
+// class gzstreambase:
+// --------------------------------------
+
+// Attaches the embedded gzstreambuf to the std::ios base and immediately
+// tries to open `name`; a failed open sets badbit (see open()).
+gzstreambase::gzstreambase( const char* name, int mode) {
+    this->init( &buf);
+    this->open( name, mode);
+}
+
+// Closes the embedded buffer on destruction; the result of close() is
+// deliberately not inspected here.
+gzstreambase::~gzstreambase() {
+    (void) buf.close();
+}
+
+// Opens `name` on the embedded buffer; sets badbit on this stream when the
+// buffer refuses (null return from gzstreambuf::open).
+void gzstreambase::open( const char* name, int open_mode) {
+    gzstreambuf* const result = buf.open( name, open_mode);
+    if ( result == 0)
+        setstate( std::ios::badbit);   // same as clear( rdstate() | badbit)
+}
+
+// Closes the embedded buffer if it is open; sets badbit when the buffer
+// reports a failed close. Does nothing when the buffer is not open.
+void gzstreambase::close() {
+    if ( ! buf.is_open())
+        return;
+    if ( buf.close() == 0)
+        setstate( std::ios::badbit);   // same as clear( rdstate() | badbit)
+}
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+// ============================================================================
+// EOF //
diff --git a/gzstream.h b/gzstream.h
new file mode 100644
index 0000000..861653f
--- /dev/null
+++ b/gzstream.h
@@ -0,0 +1,121 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// ============================================================================
+//
+// File          : gzstream.h
+// Revision      : $Revision: 1.5 $
+// Revision_date : $Date: 2002/04/26 23:30:15 $
+// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
+// 
+// Standard streambuf implementation following Nicolai Josuttis, "The 
+// Standard C++ Library".
+// ============================================================================
+
+#ifndef GZSTREAM_H
+#define GZSTREAM_H 1
+
+// standard C++ with new header file names and std:: namespace
+#include <iostream>
+#include <fstream>
+#include <zlib.h>
+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See below for user classes.
+// ----------------------------------------------------------------------------
+
+// Stream buffer that reads from or writes to a gzip file through zlib's
+// gz* functions. One fixed array serves as put area (output) or as get area
+// with a 4-byte putback zone (input); the two uses are never combined.
+class gzstreambuf : public std::streambuf {
+private:
+    static const int bufferSize = 47+256;    // size of data buff
+    // totals 512 bytes under g++ for igzstream at the end.
+
+    gzFile           file;               // file handle for compressed file
+    char             buffer[bufferSize]; // data buffer
+    char             opened;             // open/close state of stream
+    int              mode;               // I/O mode
+
+    int flush_buffer();
+public:
+    // All scalar members start in a defined state: underflow()/overflow()
+    // inspect `mode` before `opened`, so `mode` must be readable even when
+    // open() was never called.
+    gzstreambuf() : file(0), opened(0), mode(0) {
+        setp( buffer, buffer + (bufferSize-1));
+        setg( buffer + 4,     // beginning of putback area
+              buffer + 4,     // read position
+              buffer + 4);    // end position
+        // ASSERT: both input & output capabilities will not be used together
+    }
+    // Non-zero while a file is open.
+    int is_open() { return opened; }
+    // Both return `this` on success and a null pointer on failure.
+    gzstreambuf* open( const char* name, int open_mode);
+    gzstreambuf* close();
+    ~gzstreambuf() { close(); }
+
+    virtual int     overflow( int c = EOF);
+    virtual int     underflow();
+    virtual int     sync();
+};
+
+// Common base of igzstream/ogzstream: owns the gzstreambuf and registers it
+// with the (virtual) std::ios base.
+class gzstreambase : virtual public std::ios {
+protected:
+    gzstreambuf buf;   // the buffer every derived stream operates on
+public:
+    // Creates a stream with no file attached.
+    gzstreambase() { this->init( &buf); }
+    // Creates a stream and tries to open `name` right away.
+    gzstreambase( const char* name, int open_mode);
+    ~gzstreambase();
+
+    void open( const char* name, int open_mode);
+    void close();
+
+    // Access to the embedded buffer.
+    gzstreambuf* rdbuf() { return &buf; }
+};
+
+// ----------------------------------------------------------------------------
+// User classes. Use igzstream and ogzstream analogously to ifstream and
+// ofstream respectively. They read and write files based on the gz* 
+// function interface of the zlib. Files are compatible with gzip compression.
+// ----------------------------------------------------------------------------
+
+// Input stream over a gzip file; use it like std::ifstream.
+class igzstream : public gzstreambase, public std::istream {
+public:
+    // No file attached yet; call open() later.
+    igzstream() : gzstreambase(), std::istream( &buf) {}
+    // Opens `name` on construction (read mode by default).
+    igzstream( const char* name, int open_mode = std::ios::in)
+        : gzstreambase( name, open_mode), std::istream( &buf) {}
+
+    // Both bases provide rdbuf(); forward to the gzstreambase one.
+    gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }
+
+    void open( const char* name, int open_mode = std::ios::in) {
+        gzstreambase::open( name, open_mode);
+    }
+};
+
+// Output stream over a gzip file; use it like std::ofstream.
+class ogzstream : public gzstreambase, public std::ostream {
+public:
+    // No file attached yet; call open() later.
+    ogzstream() : gzstreambase(), std::ostream( &buf) {}
+    // Opens `name` on construction (write mode by default).
+    ogzstream( const char* name, int mode = std::ios::out)
+        : gzstreambase( name, mode), std::ostream( &buf) {}
+
+    // Both bases provide rdbuf(); forward to the gzstreambase one.
+    gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }
+
+    void open( const char* name, int open_mode = std::ios::out) {
+        gzstreambase::open( name, open_mode);
+    }
+};
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+#endif // GZSTREAM_H
+// ============================================================================
+// EOF //
+
diff --git a/io.cpp b/io.cpp
new file mode 100644
index 0000000..c22f668
--- /dev/null
+++ b/io.cpp
@@ -0,0 +1,1396 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <iomanip>
+#include <bitset>
+#include <vector>
+#include <map>
+#include <set>
+#include <cstring>
+#include <cmath>
+#include <stdio.h>
+#include <stdlib.h> 
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_cdf.h"
+
+#include "lapack.h"
+#include "gzstream.h"
+#include "mathfunc.h"
+
+#ifdef FORCE_FLOAT
+#include "io_float.h"
+#else
+#include "io.h"
+#endif
+
+
+using namespace std;
+
+
+
+//Print progress bar: writes str, a 50-column bar ('=' for every full 2% of
+//p/total, blanks for the rest), the percentage with two decimals and a
+//carriage return to cout, then flushes. cout is left in fixed notation with
+//precision 2 afterwards.
+void ProgressBar (string str, double p, double total)
+{
+	const double percent = (100.0 * p / total);
+	const int n_filled = (int) (percent / 2.0);
+
+	cout<<str;
+	for (int col = 0; col < 50; ++col) {
+		const char mark = (col < n_filled) ? '=' : ' ';
+		cout<<mark;
+	}
+	cout<<setprecision(2)<<fixed<<percent<<"%\r"<<flush;
+
+	return;
+}
+
+
+//Print progress bar (with acceptance ratio): same bar as the three-argument
+//overload, followed by the percentage, four blanks and ratio (both with two
+//decimals) and a carriage return. cout is left in fixed notation with
+//precision 2 afterwards.
+void ProgressBar (string str, double p, double total, double ratio)
+{
+	const double percent = (100.0 * p / total);
+	const int n_filled = (int) (percent / 2.0);
+
+	cout<<str;
+	for (int col = 0; col < 50; ++col) {
+		const char mark = (col < n_filled) ? '=' : ' ';
+		cout<<mark;
+	}
+	cout<<setprecision(2)<<fixed<<percent<<"%    "<<ratio<<"\r"<<flush;
+
+	return;
+}
+
+// Line reader that accepts "\n", "\r\n" and lone "\r" as line terminators.
+// Stores the line (without terminator) in t and returns the stream. eofbit
+// is set only when end of input is hit before any character of a line was
+// read, so a final line without terminator is still delivered normally.
+std::istream& safeGetline(std::istream& is, std::string& t)
+{
+    t.clear();
+
+    // Characters are pulled straight from the std::streambuf, which is faster
+    // than going through the std::istream one character at a time. Code that
+    // does this must be guarded by a sentry object, which performs tasks such
+    // as thread synchronization and updating the stream state.
+    std::istream::sentry se(is, true);
+    std::streambuf* sb = is.rdbuf();
+
+    while (true) {
+        const int c = sb->sbumpc();
+        if (c == '\n') {
+            return is;
+        }
+        if (c == '\r') {
+            // swallow the '\n' of a "\r\n" pair
+            if (sb->sgetc() == '\n') {
+                sb->sbumpc();
+            }
+            return is;
+        }
+        if (c == EOF) {
+            // Also handle the case when the last line has no line ending
+            if (t.empty()) {
+                is.setstate(std::ios::eofbit);
+            }
+            return is;
+        }
+        t += (char)c;
+    }
+}
+
+//Read snp file
+//Inserts the first token of every line into setSnps (tokens are separated by
+//space, comma, tab or carriage return). Lines without any token are skipped.
+//setSnps is emptied first; returns false only when the file cannot be opened.
+bool ReadFile_snps (const string &file_snps, set<string> &setSnps)
+{
+	setSnps.clear();
+
+	ifstream infile (file_snps.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;}
+
+	//'\r' counts as a separator so that "\r\n" line endings do not leave a
+	//carriage return glued to the SNP id
+	const char *delims=" ,\t\r";
+	string line;
+
+	while (getline(infile, line)) {
+		const string::size_type start=line.find_first_not_of(delims);
+		if (start==string::npos) {continue;}	//blank line: nothing to insert
+
+		const string::size_type stop=line.find_first_of(delims, start);
+		if (stop==string::npos) {setSnps.insert(line.substr(start));}
+		else {setSnps.insert(line.substr(start, stop-start));}
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+//Read log file
+//Scans for the first line whose tokens 2-5 are "estimated", "mean", "=" and
+//a value (tokens are separated by space, comma or tab) and stores that value
+//in pheno_mean. Returns false only when the file cannot be opened; when no
+//such line exists pheno_mean is left untouched and true is still returned.
+bool ReadFile_log (const string &file_log, double &pheno_mean)
+{
+	ifstream infile (file_log.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open log file: "<<file_log<<endl; return false;}
+
+	const char *delims=" , \t";
+	string line;
+	vector<char> buf;	//writable, NUL-terminated copy of the line for strtok
+	char *ch_ptr;
+
+	while (getline(infile, line)) {
+		//strtok overwrites separators, so it must not work on line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], delims);
+		if (ch_ptr==NULL) {continue;}	//blank line
+		ch_ptr=strtok (NULL, delims);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "estimated")!=0) {continue;}
+		ch_ptr=strtok (NULL, delims);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "mean")!=0) {continue;}
+		ch_ptr=strtok (NULL, delims);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "=")!=0) {continue;}
+		ch_ptr=strtok (NULL, delims);
+		if (ch_ptr==NULL) {continue;}	//"=" without a value: keep looking
+
+		pheno_mean=atof(ch_ptr);
+		break;
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+//Read bimbam annotation file
+//Each line is split on space, comma and tab into: SNP id, base-pair
+//position, chromosome, cM. A field that is absent or "NA" is stored as -9
+//("-9" for the chromosome). Lines without any token are skipped. All three
+//maps are emptied first; returns false only when the file cannot be opened.
+bool ReadFile_anno (const string &file_anno, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM)
+{
+	//the three maps are filled in lockstep, so reset all of them together
+	mapRS2chr.clear();
+	mapRS2bp.clear();
+	mapRS2cM.clear();
+
+	ifstream infile (file_anno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error opening annotation file: "<<file_anno<<endl; return false;}
+
+	const char *delims=" , \t";
+	string line;
+	vector<char> buf;	//writable, NUL-terminated copy of the line for strtok
+	char *ch_ptr;
+
+	string rs;
+	long int b_pos;
+	string chr;
+	double cM;
+
+	while (!safeGetline(infile, line).eof()) {
+		//strtok overwrites separators, so it must not work on line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], delims);
+		if (ch_ptr==NULL) {continue;}	//blank line: no SNP id to record
+		rs=ch_ptr;
+		ch_ptr=strtok (NULL, delims);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {b_pos=-9;} else {b_pos=atol(ch_ptr);}
+		ch_ptr=strtok (NULL, delims);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {chr="-9";} else {chr=ch_ptr;}
+		ch_ptr=strtok (NULL, delims);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {cM=-9;} else {cM=atof(ch_ptr);}
+
+		mapRS2chr[rs]=chr;
+		mapRS2bp[rs]=b_pos;
+		mapRS2cM[rs]=cM;
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+//read one column of phenotype
+//For every line, takes token number p_column (1-based; tokens are separated
+//by space, comma or tab). "NA" yields indicator 0 and value -9; anything
+//else yields indicator 1 and atof() of the token. Both output vectors are
+//emptied first. Returns false when the file cannot be opened or when a line
+//has fewer than p_column tokens.
+bool ReadFile_column (const string &file_pheno, vector<int> &indicator_idv, vector<double> &pheno, const int &p_column)
+{
+	indicator_idv.clear();
+	pheno.clear();
+
+	igzstream infile (file_pheno.c_str(), igzstream::in);
+//	ifstream infile (file_pheno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open phenotype file: "<<file_pheno<<endl; return false;}
+
+	const char *delims=" , \t";
+	string line;
+	vector<char> buf;	//writable, NUL-terminated copy of the line for strtok
+	char *ch_ptr;
+	double p;
+
+	while (!safeGetline(infile, line).eof()) {
+		//strtok overwrites separators, so it must not work on line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], delims);
+		for (int i=0; i<(p_column-1) && ch_ptr!=NULL; ++i) {
+			ch_ptr=strtok (NULL, delims);
+		}
+		if (ch_ptr==NULL) {
+			//blank or short row: report it instead of handing NULL to strcmp
+			cout<<"error! row "<<pheno.size()+1<<" of phenotype file "<<file_pheno<<" has fewer than "<<p_column<<" columns."<<endl;
+			return false;
+		}
+
+		if (strcmp(ch_ptr, "NA")==0) {indicator_idv.push_back(0); pheno.push_back(-9);}		//pheno is different from pimass2
+		else {p=atof(ch_ptr); indicator_idv.push_back(1); pheno.push_back(p);}
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+
+//Read bimbam phenotype file, p_column=1, 2 ...
+//For every line, picks the tokens whose 1-based positions are listed in
+//p_column (tokens are separated by space, comma or tab) and appends one row
+//to pheno and to indicator_pheno; entry k of a row belongs to p_column[k].
+//"NA" yields indicator 0 and value -9, anything else indicator 1 and atof()
+//of the token. Returns false when the file cannot be opened, when p_column
+//is empty, or when a line has no token at a requested position.
+bool ReadFile_pheno (const string &file_pheno, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, const vector<size_t> &p_column)
+{
+	indicator_pheno.clear();
+	pheno.clear();
+
+	igzstream infile (file_pheno.c_str(), igzstream::in);
+//	ifstream infile (file_pheno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open phenotype file: "<<file_pheno<<endl; return false;}
+
+	//max_element on an empty range returns end(), which must not be dereferenced
+	if (p_column.empty()) {cout<<"error! no phenotype column is specified."<<endl; return false;}
+
+	const char *delims=" , \t";
+	string line;
+	vector<char> buf;	//writable, NUL-terminated copy of the line for strtok
+	char *ch_ptr;
+	double p;
+
+	//row templates; entries keep their previous value unless overwritten
+	vector<double> pheno_row (p_column.size(), -9);
+	vector<int> ind_pheno_row (p_column.size(), 0);
+
+	size_t p_max=*max_element(p_column.begin(), p_column.end() );
+	//file column (1-based) -> index inside a row
+	map<size_t, size_t> mapP2c;
+	for (size_t i=0; i<p_column.size(); i++) {
+		mapP2c[p_column[i]]=i;
+	}
+
+	while (!safeGetline(infile, line).eof()) {
+		//strtok overwrites separators, so it must not work on line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], delims);
+		for (size_t col=1; col<=p_max; col++) {
+			map<size_t, size_t>::const_iterator it=mapP2c.find(col);
+			if (it!=mapP2c.end()) {
+				if (ch_ptr==NULL) {
+					cout<<"error! row "<<pheno.size()+1<<" of phenotype file "<<file_pheno<<" has fewer than "<<p_max<<" columns."<<endl;
+					return false;
+				}
+				if (strcmp(ch_ptr, "NA")==0) {ind_pheno_row[it->second]=0; pheno_row[it->second]=-9;}
+				else {p=atof(ch_ptr); ind_pheno_row[it->second]=1; pheno_row[it->second]=p;}
+			}
+			ch_ptr=strtok (NULL, delims);
+		}
+
+		indicator_pheno.push_back(ind_pheno_row);
+		pheno.push_back(pheno_row);
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+//Read covariates file
+//Every line becomes one row of cvt (tokens are separated by space, comma or
+//tab; "NA" is stored as -9). indicator_cvt gets 1 for a row without "NA" and
+//0 otherwise. n_cvt is set to 0 when the file has no lines, otherwise to the
+//token count of the first row without "NA" (it is left untouched when every
+//row contains "NA"). Returns false when the file cannot be opened or when
+//rows without "NA" disagree on their token count.
+bool ReadFile_cvt (const string &file_cvt, vector<int> &indicator_cvt, vector<vector<double> > &cvt, size_t &n_cvt)
+{
+	//cvt rows are addressed below with indicator_cvt indices, so both
+	//containers have to start empty
+	indicator_cvt.clear();
+	cvt.clear();
+
+	ifstream infile (file_cvt.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open covariates file: "<<file_cvt<<endl; return false;}
+
+	const char *delims=" , \t";
+	string line;
+	vector<char> buf;	//writable, NUL-terminated copy of the line for strtok
+	char *ch_ptr;
+	double d;
+
+	int flag_na=0;
+
+	while (!safeGetline(infile, line).eof()) {
+		vector<double> v_d; flag_na=0;
+
+		//strtok overwrites separators, so it must not work on line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], delims);
+		while (ch_ptr!=NULL) {
+			if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;}
+			else {d=atof(ch_ptr);}
+
+			v_d.push_back(d);
+			ch_ptr=strtok (NULL, delims);
+		}
+		if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);}
+		cvt.push_back(v_d);
+	}
+
+	if (indicator_cvt.empty()) {n_cvt=0;}
+	else {
+		//flag_na is reused here as "reference row already seen"
+		flag_na=0;
+		for (vector<int>::size_type i=0; i<indicator_cvt.size(); ++i) {
+			if (indicator_cvt[i]==0) {continue;}
+
+			if (flag_na==0) {flag_na=1; n_cvt=cvt[i].size();}
+			if (flag_na!=0 && n_cvt!=cvt[i].size()) {cout<<"error! number of covariates in row "<<i<<" do not match other rows."<<endl; return false;}
+		}
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+
+//Read .bim file
+//Each line must hold six tokens separated by space or tab: chromosome, SNP
+//id, cM, base-pair position, minor allele, major allele. One SNPINFO per
+//line is appended to snpInfo (emptied first), with the three trailing
+//numeric fields set to -9. Lines are read with safeGetline so "\r\n" and
+//"\r" endings do not end up inside the last allele. Returns false when the
+//file cannot be opened or a line has fewer than six tokens.
+bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo)
+{
+	snpInfo.clear();
+
+	ifstream infile (file_bim.c_str(), ifstream::in);
+	if (!infile) {cout<<"error opening .bim file: "<<file_bim<<endl; return false;}
+
+	const char *delims=" \t";
+	string line;
+	vector<char> buf;	//writable, NUL-terminated copy of the line for strtok
+	char *field[6];
+
+	string rs;
+	long int b_pos;
+	string chr;
+	double cM;
+	string major;
+	string minor;
+
+	while (!safeGetline(infile, line).eof()) {
+		//strtok overwrites separators, so it must not work on line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		//collect the six fields first so a short line is detected before use
+		field[0]=strtok (&buf[0], delims);
+		for (int k=1; k<6; ++k) {
+			field[k]=(field[k-1]==NULL) ? NULL : strtok (NULL, delims);
+		}
+		if (field[5]==NULL) {
+			cout<<"error! line "<<snpInfo.size()+1<<" of .bim file "<<file_bim<<" has fewer than 6 fields."<<endl;
+			return false;
+		}
+
+		chr=field[0];
+		rs=field[1];
+		cM=atof(field[2]);
+		b_pos=atol(field[3]);
+		minor=field[4];
+		major=field[5];
+
+		SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, -9, -9, -9};
+		snpInfo.push_back(sInfo);
+	}
+
+	infile.close();
+	infile.clear();
+	return true;
+}
+
+
+//Read .fam file
+//For every line: token 2 is the individual id, tokens 3-5 are skipped and
+//token 6 is phenotype number 1, followed by further phenotypes. The
+//phenotypes listed in p_column (1-based) are appended as one row to pheno
+//and indicator_pheno; "NA" or a value equal to -9 yields indicator 0 and
+//value -9. mapID2num maps each id to its 0-based line index. Returns false
+//when the file cannot be opened, when p_column is empty, or when a line is
+//too short for the id or for a requested phenotype.
+bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, map<string, int> &mapID2num, const vector<size_t> &p_column)
+{
+	indicator_pheno.clear();
+	pheno.clear();
+	mapID2num.clear();
+
+	igzstream infile (file_fam.c_str(), igzstream::in);
+	//ifstream infile (file_fam.c_str(), ifstream::in);
+	if (!infile) {cout<<"error opening .fam file: "<<file_fam<<endl; return false;}
+
+	//max_element on an empty range returns end(), which must not be dereferenced
+	if (p_column.empty()) {cout<<"error! no phenotype column is specified."<<endl; return false;}
+
+	string line;
+	vector<char> buf;	//writable, NUL-terminated copy of the line for strtok
+	char *ch_ptr;
+
+	string id;
+	int c=0;
+	double p;
+
+	//row templates; entries keep their previous value unless overwritten
+	vector<double> pheno_row (p_column.size(), -9);
+	vector<int> ind_pheno_row (p_column.size(), 0);
+
+	size_t p_max=*max_element(p_column.begin(), p_column.end() );
+	//phenotype number (1-based) -> index inside a row
+	map<size_t, size_t> mapP2c;
+	for (size_t i=0; i<p_column.size(); i++) {
+		mapP2c[p_column[i]]=i;
+	}
+
+	while (!safeGetline(infile, line).eof()) {
+		//strtok overwrites separators, so it must not work on line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], " \t");
+		if (ch_ptr!=NULL) {ch_ptr=strtok (NULL, " \t");}
+		if (ch_ptr==NULL) {
+			cout<<"error! line "<<c+1<<" of .fam file "<<file_fam<<" has no individual id."<<endl;
+			return false;
+		}
+		id=ch_ptr;
+		//step over tokens 3-5; afterwards ch_ptr is token 6 (first phenotype)
+		for (int k=0; k<4; ++k) {
+			ch_ptr=strtok (NULL, " \t");
+		}
+
+		for (size_t col=1; col<=p_max; col++) {
+			map<size_t, size_t>::const_iterator it=mapP2c.find(col);
+			if (it!=mapP2c.end()) {
+				if (ch_ptr==NULL) {
+					cout<<"error! line "<<c+1<<" of .fam file "<<file_fam<<" has fewer than "<<p_max<<" phenotypes."<<endl;
+					return false;
+				}
+				if (strcmp(ch_ptr, "NA")==0) {
+					ind_pheno_row[it->second]=0; pheno_row[it->second]=-9;
+				} else {
+					p=atof(ch_ptr);
+
+					if (p==-9) {ind_pheno_row[it->second]=0; pheno_row[it->second]=-9;}
+					else {ind_pheno_row[it->second]=1; pheno_row[it->second]=p;}
+				}
+			}
+			//NOTE(review): phenotypes after the first are also split on commas,
+			//unlike the leading fields -- kept as in the original code
+			ch_ptr=strtok (NULL, " , \t");
+		}
+
+		indicator_pheno.push_back(ind_pheno_row);
+		pheno.push_back(pheno_row);
+
+		mapID2num[id]=c; c++;
+	}
+
+	infile.close();
+	infile.clear();
+	return true;
+}
+
+
+
+
+
+
+//Read bimbam mean genotype file, the first time, to obtain #SNPs for analysis (ns_test) and total #SNP (ns_total)
+//Each line holds: SNP id, minor allele, major allele, then one genotype per
+//individual (tokens separated by space, comma or tab; "NA" = missing). For
+//every line one SNPINFO is appended to snpInfo and one 0/1 flag to
+//indicator_snp; ns_test counts the flags set to 1. Only individuals with
+//indicator_idv[i]!=0 enter the statistics. A SNP gets flag 0 when
+//  - setSnps is non-empty and does not contain its id,
+//  - its missing fraction exceeds miss_level,
+//  - maf_level!=-1 and its allele frequency lies outside [maf_level, 1-maf_level],
+//  - all non-missing genotypes are identical,
+//  - hwe_level!=0 and CalcHWE(n_0, n_2, n_1) is below hwe_level,
+//  - v_w/v_x>=r2_level, where v_x is x'x and v_w is (W'x)' WtWi (W'x) for the
+//    genotype vector x with missing entries replaced by 2*maf
+//    (NOTE(review): WtWi is presumably the inverse of W'W -- LUDecomp and
+//    LUInvert are defined elsewhere).
+//Returns false when the file cannot be opened or when a line is missing the
+//three leading fields or the genotype of an analyzed individual.
+bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, size_t &ns_test)
+{
+	indicator_snp.clear();
+	snpInfo.clear();
+
+	igzstream infile (file_geno.c_str(), igzstream::in);
+//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;}
+
+	gsl_vector *genotype=gsl_vector_alloc (W->size1);
+	gsl_vector *genotype_miss=gsl_vector_alloc (W->size1);
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+
+	double v_x, v_w;
+	int c_idv=0;
+
+	const char *delims=" , \t";
+	string line;
+	vector<char> buf;	//writable, NUL-terminated copy of the line for strtok
+	char *ch_ptr;
+
+	string rs;
+	long int b_pos;
+	string chr;
+	string major;
+	string minor;
+	double cM;
+
+	double maf, geno, geno_old;
+	size_t n_miss;
+	size_t n_0, n_1, n_2;
+	int flag_poly;
+
+	int ni_total=indicator_idv.size();
+	int ni_test=0;
+	for (int i=0; i<ni_total; ++i) {
+		ni_test+=indicator_idv[i];
+	}
+	ns_test=0;
+
+	//cleared on a malformed line; all exits share the cleanup code below
+	bool flag_ok=true;
+
+	while (flag_ok && !safeGetline(infile, line).eof()) {
+		//strtok overwrites separators, so it must not work on line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], delims);
+		if (ch_ptr!=NULL) {rs=ch_ptr; ch_ptr=strtok (NULL, delims);}
+		if (ch_ptr!=NULL) {minor=ch_ptr; ch_ptr=strtok (NULL, delims);}
+		if (ch_ptr==NULL) {
+			//every processed line adds one snpInfo entry, hence the line number
+			cout<<"error! line "<<snpInfo.size()+1<<" of genotype file "<<file_geno<<" has fewer than 3 fields."<<endl;
+			flag_ok=false;
+			break;
+		}
+		major=ch_ptr;
+
+		if (setSnps.size()!=0 && setSnps.count(rs)==0) {
+			SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, -9};
+			snpInfo.push_back(sInfo);
+			indicator_snp.push_back(0);
+			continue;
+		}
+
+		if (mapRS2bp.count(rs)==0) {chr="-9"; b_pos=-9;cM=-9;}
+		else {b_pos=mapRS2bp[rs]; chr=mapRS2chr[rs]; cM=mapRS2cM[rs];}
+
+		maf=0; n_miss=0; flag_poly=0; geno_old=-9;
+		n_0=0; n_1=0; n_2=0;
+		c_idv=0; gsl_vector_set_zero (genotype_miss);
+		for (int i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, delims);
+			if (indicator_idv[i]==0) {continue;}
+
+			if (ch_ptr==NULL) {
+				cout<<"error! genotype file "<<file_geno<<" has too few genotypes for SNP "<<rs<<endl;
+				flag_ok=false;
+				break;
+			}
+
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++; c_idv++; continue;}
+
+			geno=atof(ch_ptr);
+			if (geno>=0 && geno<=0.5) {n_0++;}
+			if (geno>0.5 && geno<1.5) {n_1++;}
+			if (geno>=1.5 && geno<=2.0) {n_2++;}
+
+			gsl_vector_set (genotype, c_idv, geno);
+
+//			if (geno<0) {n_miss++; continue;}
+
+			//flag_poly: 0 = nothing seen yet, 2 = one distinct value so far,
+			//1 = at least two distinct values
+			if (flag_poly==0) {geno_old=geno; flag_poly=2;}
+			if (flag_poly==2 && geno!=geno_old) {flag_poly=1;}
+
+			maf+=geno;
+
+			c_idv++;
+		}
+		if (!flag_ok) {break;}
+		maf/=2.0*(double)(ni_test-n_miss);
+
+		SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf};
+		snpInfo.push_back(sInfo);
+
+		if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;}
+
+		if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;}
+
+		if (flag_poly!=1) {indicator_snp.push_back(0); continue;}
+
+		if (hwe_level!=0) {
+			if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;}
+		}
+
+		//filter SNP if it is correlated with W
+		//missing genotypes are first replaced by 2*maf
+		for (size_t i=0; i<genotype->size; ++i) {
+			if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);}
+		}
+
+		gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
+		gsl_blas_ddot (genotype, genotype, &v_x);
+		gsl_blas_ddot (Wtx, WtWiWtx, &v_w);
+
+		if (v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;}
+
+		indicator_snp.push_back(1);
+		ns_test++;
+	}
+
+	gsl_vector_free (genotype);
+	gsl_vector_free (genotype_miss);
+	gsl_matrix_free (WtW);
+	gsl_matrix_free (WtWi);
+	gsl_vector_free (Wtx);
+	gsl_vector_free (WtWiWtx);
+	gsl_permutation_free (pmt);
+
+	infile.close();
+	infile.clear();
+
+	return flag_ok;
+}
+
+
+
+
+
+      
+//Read a PLINK bed file for the first time: collect per-SNP statistics and
+//flag the SNPs that pass all filters.
+//  file_bed       path of the binary genotype file
+//  setSnps        if non-empty, SNPs whose rs_number is not in the set are flagged 0
+//  W              covariate matrix; W->size1 sizes the genotype buffers
+//  indicator_idv  0/1 flag per individual in the file; only flagged individuals are used
+//  indicator_snp  output, cleared first; one 0/1 flag is appended per entry of snpInfo
+//  snpInfo        n_miss, missingness and maf are overwritten for every entry
+//  maf_level      SNPs with maf outside [maf_level, 1-maf_level] are dropped unless maf_level==-1
+//  miss_level     SNPs with a larger fraction of missing genotypes are dropped
+//  hwe_level      SNPs with CalcHWE() below this value are dropped; 0 disables the test
+//  r2_level       SNPs whose v_w/v_x ratio against W exceeds this value are dropped
+//  ns_test        output, number of SNPs flagged 1
+//Returns false if the file cannot be opened, true otherwise.
+bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test)
+{
+	indicator_snp.clear();
+	size_t ns_total=snpInfo.size();
+
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;}
+
+	gsl_vector *genotype=gsl_vector_alloc (W->size1);
+	gsl_vector *genotype_miss=gsl_vector_alloc (W->size1);
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+
+	//WtW = W^T W; WtWi is presumably its inverse (via LUDecomp/LUInvert).
+	//Computed once and reused for every SNP.
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+
+	double v_x, v_w, geno;
+	size_t c_idv=0;
+
+	char ch[1];
+	bitset<8> b;
+
+	size_t ni_total=indicator_idv.size();
+	size_t ni_test=0;
+	for (size_t i=0; i<ni_total; ++i) {
+		ni_test+=indicator_idv[i];
+	}
+	ns_test=0;
+
+	//number of bytes per SNP: four individuals are packed into each byte
+	size_t n_bit;
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1;}
+
+	//skip the three leading magic bytes
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+	}
+
+	double maf;
+	size_t n_miss;
+	size_t n_0, n_1, n_2, c;
+
+	//one pass over all SNPs listed in snpInfo
+	for (size_t t=0; t<ns_total; ++t) {
+		infile.seekg(t*n_bit+3);		//n_bit bytes per SNP, after the 3 magic bytes
+
+		if (setSnps.size()!=0 && setSnps.count(snpInfo[t].rs_number)==0) {
+			snpInfo[t].n_miss=-9;
+			snpInfo[t].missingness=-9;
+			snpInfo[t].maf=-9;
+			indicator_snp.push_back(0);
+			continue;
+		}
+
+		//read genotypes; c counts individuals in the file, c_idv those actually used
+		c=0; maf=0.0; n_miss=0; n_0=0; n_1=0; n_2=0;
+		c_idv=0; gsl_vector_set_zero (genotype_miss);
+		for (size_t i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			//bit pair (b[2j], b[2j+1]): 00 -> 2.0, 01 -> 1.0, 11 -> 0.0, 10 -> missing
+			for (size_t j=0; j<4; ++j) {
+				if ((i==(n_bit-1)) && c==ni_total) {break;}	//padding bits of the last byte
+				if (indicator_idv[c]==0) {c++; continue;}
+				c++;
+
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); maf+=2.0; n_2++;}
+					else {gsl_vector_set(genotype, c_idv, 1.0); maf+=1.0; n_1++;}
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); maf+=0.0; n_0++;}
+					else {gsl_vector_set(genotype_miss, c_idv, 1); n_miss++; }
+				}
+				c_idv++;
+			}
+		}
+		maf/=2.0*(double)(ni_test-n_miss);
+
+		snpInfo[t].n_miss=n_miss;
+		snpInfo[t].missingness=(double)n_miss/(double)ni_test;
+		snpInfo[t].maf=maf;
+
+		if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;}
+
+		if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;}
+
+		//drop SNPs where fewer than two genotype classes are observed
+		if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;}
+
+		//a threshold of 0 disables the HWE filter, as in the BIMBAM reader
+		//ReadFile_geno; the previous test against 1 skipped the filter
+		//exactly when the strictest threshold was requested
+		if (hwe_level!=0) {
+			if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;}
+		}
+
+		//filter SNP if it is correlated with W;
+		//missing genotypes are first replaced by the mean genotype 2*maf
+		for (size_t i=0; i<genotype->size; ++i) {
+			if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);}
+		}
+
+		//v_x = x^T x and v_w = (W^T x)^T WtWi (W^T x)
+		gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
+		gsl_blas_ddot (genotype, genotype, &v_x);
+		gsl_blas_ddot (Wtx, WtWiWtx, &v_w);
+
+		if (v_w/v_x > r2_level) {indicator_snp.push_back(0); continue;}
+
+		indicator_snp.push_back(1);
+		ns_test++;
+	}
+
+	gsl_vector_free (genotype);
+	gsl_vector_free (genotype_miss);
+	gsl_matrix_free (WtW);
+	gsl_matrix_free (WtWi);
+	gsl_vector_free (Wtx);
+	gsl_vector_free (WtWiWtx);
+	gsl_permutation_free (pmt);
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+
+//Read a kinship matrix from file_kin into G (G is zeroed first).
+//  indicator_idv  0/1 flag per individual; only flagged individuals are copied into G
+//  mapID2num      individual id -> index into indicator_idv (used when k_mode!=1)
+//  k_mode         1: the file is a full matrix with one row and one column per individual;
+//                 otherwise: each line holds "id1 id2 value"
+//  error          set to true on any detected problem; never reset to false here
+//  G              output matrix, indexed by the flagged individuals only
+void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) 
+{
+	igzstream infile (file_kin.c_str(), igzstream::in);
+	if (!infile) {cout<<"error! fail to open kinship file: "<<file_kin<<endl; error=true; return;}
+
+	size_t ni_total=indicator_idv.size();
+
+	gsl_matrix_set_zero (G);
+
+	string line;
+	char *ch_ptr;
+	double d;
+
+	if (k_mode==1) {
+		//i_total/j_total index all individuals, i_test/j_test only the flagged ones
+		size_t i_test=0, i_total=0, j_test=0, j_total=0;
+		while (getline(infile, line)) {
+			//stop at the first surplus row: going on would index indicator_idv out of range
+			if (i_total==ni_total) {cout<<"error! number of rows in the kinship file is larger than the number of phentypes."<<endl; error=true; break;}
+
+			if (indicator_idv[i_total]==0) {i_total++; continue;}
+
+			j_total=0; j_test=0;
+			ch_ptr=strtok ((char *)line.c_str(), " , \t");
+			while (ch_ptr!=NULL) {
+				//surplus columns: report once and drop the rest of the row
+				if (j_total==ni_total) {cout<<"error! number of columns in the kinship file is larger than the number of phentypes for row = "<<i_total<<endl; error=true; break;}
+
+				d=atof(ch_ptr);
+				if (indicator_idv[j_total]==1) {gsl_matrix_set (G, i_test, j_test, d); j_test++;}
+				j_total++;
+
+				ch_ptr=strtok (NULL, " , \t");
+			}
+			if (j_total!=ni_total) {cout<<"error! number of columns in the kinship file do not match the number of phentypes for row = "<<i_total<<endl; error=true;}
+			i_total++; i_test++;
+		}
+		if (i_total!=ni_total) {cout<<"error! number of rows in the kinship file do not match the number of phentypes."<<endl; error=true;}
+	}
+	else {
+		//position of every flagged individual inside G
+		map<size_t, size_t> mapID2ID;
+		size_t c=0;
+		for (size_t i=0; i<indicator_idv.size(); i++) {
+			if (indicator_idv[i]==1) {mapID2ID[i]=c; c++;}
+		}
+
+		string id1, id2;
+		double Cov_d;
+		size_t n_id1, n_id2;
+
+		while (getline(infile, line)) {
+			ch_ptr=strtok ((char *)line.c_str(), " , \t");
+			if (ch_ptr==NULL) {continue;}		//blank line
+			id1=ch_ptr;
+			ch_ptr=strtok (NULL, " , \t");
+			if (ch_ptr==NULL) {cout<<"error! fewer than three fields in a line of the kinship file, for id1 = "<<id1<<endl; error=true; continue;}
+			id2=ch_ptr;
+			ch_ptr=strtok (NULL, " , \t");
+			if (ch_ptr==NULL) {cout<<"error! fewer than three fields in a line of the kinship file, for id1 = "<<id1<<" and id2 = "<<id2<<endl; error=true; continue;}
+			d=atof(ch_ptr);
+
+			//pairs involving unknown or unflagged individuals are ignored
+			if (mapID2num.count(id1)==0 || mapID2num.count(id2)==0) {continue;}
+			if (indicator_idv[mapID2num[id1]]==0 || indicator_idv[mapID2num[id2]]==0) {continue;}
+
+			n_id1=mapID2ID[mapID2num[id1]];
+			n_id2=mapID2ID[mapID2num[id2]];
+
+			//a non-zero entry means the pair was already read; it must agree
+			Cov_d=gsl_matrix_get(G, n_id1, n_id2);
+			if (Cov_d!=0 && Cov_d!=d) {cout<<"error! redundant and unequal terms in the kinship file, for id1 = "<<id1<<" and id2 = "<<id2<<endl;}
+			else {
+				gsl_matrix_set(G, n_id1, n_id2, d);
+				gsl_matrix_set(G, n_id2, n_id1, d);
+			}
+		}
+	}
+
+	infile.close();
+	infile.clear();
+
+	return;
+}
+
+
+//Read several kinship matrices. file_mk lists one kinship file name per line;
+//the i-th listed matrix is read with ReadFile_kin into the i-th square block of G,
+//i.e. columns i*G->size1 .. (i+1)*G->size1-1.
+//error is set to true if file_mk cannot be opened, if it lists more files than
+//G has blocks, or if ReadFile_kin reports a problem.
+void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) 
+{
+	igzstream infile (file_mk.c_str(), igzstream::in);
+	if (!infile) {cout<<"error! fail to open file: "<<file_mk<<endl; error=true; return;}
+
+	string file_kin, line;
+
+	size_t i=0;
+	while (getline(infile, line)) {
+	  //a blank line does not name a file
+	  if (line.empty()) {continue;}
+
+	  //refuse to build a submatrix view that would reach past the last column of G
+	  if ((i+1)*G->size1>G->size2) {cout<<"error! number of kinship files listed in "<<file_mk<<" is larger than expected."<<endl; error=true; break;}
+
+	  file_kin=line;
+	  gsl_matrix_view G_sub=gsl_matrix_submatrix(G, 0, i*G->size1, G->size1, G->size1);
+	  ReadFile_kin (file_kin, indicator_idv, mapID2num, k_mode, error, &G_sub.matrix);
+	  i++;
+	}
+
+	infile.close();
+	infile.clear();
+	return;
+}
+
+
+//Read a whitespace/comma separated matrix from file_ku into U (U is zeroed first).
+//One file line fills one row of U. error is set to true if the file cannot be
+//opened or holds more rows or columns than U; surplus values are discarded.
+void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) 
+{
+	igzstream infile (file_ku.c_str(), igzstream::in);
+	if (!infile) {cout<<"error! fail to open the U file: "<<file_ku<<endl; error=true; return;}
+
+	size_t n_row=U->size1, n_col=U->size2, i_row=0, i_col=0;
+
+	gsl_matrix_set_zero (U);
+
+	string line;
+	char *ch_ptr;
+	double d;
+
+	while (getline(infile, line)) {
+		//stop at the first surplus row instead of writing past the end of U
+		if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<<endl; error=true; break;}
+
+		i_col=0;
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		while (ch_ptr!=NULL) {
+			//surplus columns: report once and drop the rest of the row
+			if (i_col==n_col) {cout<<"error! number of columns in the U file is larger than expected, for row = "<<i_row<<endl; error=true; break;}
+
+			d=atof(ch_ptr);
+			gsl_matrix_set (U, i_row, i_col, d);
+			i_col++;
+
+			ch_ptr=strtok (NULL, " , \t");
+		}
+
+		i_row++;
+	}
+
+	infile.close();
+	infile.clear();
+
+	return;
+}
+
+
+
+
+//Read one value per line from file_kd into eval (eval is zeroed first).
+//Blank lines are skipped. error is set to true if the file cannot be opened,
+//holds more values than eval, or has a line with more than one field.
+void ReadFile_eigenD (const string &file_kd, bool &error, gsl_vector *eval) 
+{
+	igzstream infile (file_kd.c_str(), igzstream::in);
+	if (!infile) {cout<<"error! fail to open the D file: "<<file_kd<<endl; error=true; return;}
+
+	size_t n_row=eval->size, i_row=0;
+
+	gsl_vector_set_zero (eval);
+
+	string line;
+	char *ch_ptr;
+	double d;
+
+	while (getline(infile, line)) {
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		//a blank line has no token; atof must not be handed a null pointer
+		if (ch_ptr==NULL) {continue;}
+
+		//stop at the first surplus value instead of writing past the end of eval
+		if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<<endl; error=true; break;}
+
+		d=atof(ch_ptr);
+
+		ch_ptr=strtok (NULL, " , \t");
+		if (ch_ptr!=NULL) {cout<<"error! number of columns in the D file is larger than expected, for row = "<<i_row<<endl; error=true;}
+
+		gsl_vector_set (eval, i_row, d);
+
+		i_row++;
+	}
+
+	infile.close();
+	infile.clear();
+
+	return;
+}
+
+
+
+//Read a BIMBAM mean genotype file and accumulate a kinship matrix.
+//  file_geno      genotype file; each line has three leading fields followed by
+//                 one value (or "NA") per individual
+//  indicator_snp  0/1 flag per line of the file; lines flagged 0 are read but not used
+//  k_mode         1: add x x^T of the centered genotypes; 2: add x x^T / var(x)
+//  display_pace   the progress bar is refreshed every display_pace lines
+//  matrix_kin     in/out; contributions are added to its upper triangle, then the matrix
+//                 is divided by the number of SNPs used and mirrored to the lower triangle
+//Returns false if the file cannot be opened or a used line has too few fields.
+bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;}
+
+	string line;
+	char *ch_ptr;
+
+	size_t n_miss;
+	double d, geno_mean, geno_var;
+
+	size_t ni_total=matrix_kin->size1;
+	gsl_vector *geno=gsl_vector_alloc (ni_total);
+	gsl_vector *geno_obs=gsl_vector_alloc (ni_total);	//1 where a genotype was observed, 0 where it is missing
+
+	size_t ns_test=0;
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		//every line is consumed so that line t always belongs to SNP t
+		safeGetline(infile, line);
+		if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+
+		//skip the three leading fields
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+
+		geno_mean=0.0; n_miss=0; geno_var=0.0;
+		gsl_vector_set_all(geno_obs, 0);
+		for (size_t i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			//a truncated line yields no token; strcmp/atof must not see a null pointer
+			if (ch_ptr==NULL) {
+				cout<<endl<<"error! too few genotypes in line "<<t+1<<" of genotype file:"<<file_geno<<endl;
+				gsl_vector_free (geno);
+				gsl_vector_free (geno_obs);
+				return false;
+			}
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_obs, i, 0); n_miss++;}
+			else {
+				d=atof(ch_ptr);
+				gsl_vector_set (geno, i, d);
+				gsl_vector_set (geno_obs, i, 1);
+				geno_mean+=d;
+				geno_var+=d*d;
+			}
+		}
+
+		//mean over observed genotypes; variance over all individuals with
+		//missing genotypes counted at the mean
+		geno_mean/=(double)(ni_total-n_miss);
+		geno_var+=geno_mean*geno_mean*(double)n_miss;
+		geno_var/=(double)ni_total;
+		geno_var-=geno_mean*geno_mean;
+
+		//impute missing genotypes by the mean, then center
+		for (size_t i=0; i<ni_total; ++i) {
+			if (gsl_vector_get (geno_obs, i)==0) {gsl_vector_set(geno, i, geno_mean);}
+		}
+
+		gsl_vector_add_constant (geno, -1.0*geno_mean);
+
+		if (geno_var!=0) {
+			if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);}
+			else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);}
+			else {cout<<"Unknown kinship mode."<<endl;}
+		}
+
+		ns_test++;
+	}
+	cout<<endl;
+
+	//with no SNP selected there is nothing to average; avoid dividing by zero
+	if (ns_test>0) {gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test);}
+
+	//copy the upper triangle into the lower one
+	for (size_t i=0; i<ni_total; ++i) {
+		for (size_t j=0; j<i; ++j) {
+			d=gsl_matrix_get (matrix_kin, j, i);
+			gsl_matrix_set (matrix_kin, i, j, d);
+		}
+	}
+
+	gsl_vector_free (geno);
+	gsl_vector_free (geno_obs);
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+
+
+
+
+
+//Read a PLINK bed file and accumulate a kinship matrix.
+//  file_bed       binary genotype file; matrix_kin->size1 individuals per SNP
+//  indicator_snp  0/1 flag per SNP; SNPs flagged 0 are skipped
+//  k_mode         1: add x x^T of the centered genotypes; 2: add x x^T / var(x)
+//  display_pace   the progress bar is refreshed every display_pace SNPs
+//  matrix_kin     in/out; contributions are added to its upper triangle, then the matrix
+//                 is divided by the number of SNPs used and mirrored to the lower triangle
+//Returns false if the file cannot be opened, true otherwise.
+bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) 
+{
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;}
+
+	char ch[1];
+	bitset<8> b;
+
+	size_t n_miss, ci_total;
+	double d, geno_mean, geno_var;
+
+	size_t ni_total=matrix_kin->size1;
+	gsl_vector *geno=gsl_vector_alloc (ni_total);
+
+	size_t ns_test=0;
+
+	//number of bytes per SNP: four individuals are packed into each byte
+	size_t n_bit;
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1; }
+
+	//skip the three leading magic bytes
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+	}
+
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+
+		infile.seekg(t*n_bit+3);		//n_bit bytes per SNP, after the 3 magic bytes
+
+		//read genotypes; missing values are stored as -9.0 until imputed below
+		geno_mean=0.0;	n_miss=0; ci_total=0; geno_var=0.0;
+		for (size_t i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			//bit pair (b[2j], b[2j+1]): 00 -> 2.0, 01 -> 1.0, 11 -> 0.0, 10 -> missing
+			for (size_t j=0; j<4; ++j) {
+				if ((i==(n_bit-1)) && ci_total==ni_total) {break;}	//padding bits of the last byte
+
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(geno, ci_total, 2.0); geno_mean+=2.0; geno_var+=4.0; }
+					else {gsl_vector_set(geno, ci_total, 1.0); geno_mean+=1.0; geno_var+=1.0;}
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(geno, ci_total, 0.0); }
+					else {gsl_vector_set(geno, ci_total, -9.0); n_miss++; }
+				}
+
+				ci_total++;
+			}
+		}
+
+		//mean over observed genotypes; variance over all individuals with
+		//missing genotypes counted at the mean
+		geno_mean/=(double)(ni_total-n_miss);
+		geno_var+=geno_mean*geno_mean*(double)n_miss;
+		geno_var/=(double)ni_total;
+		geno_var-=geno_mean*geno_mean;
+
+		//impute missing genotypes by the mean, then center
+		for (size_t i=0; i<ni_total; ++i) {
+			d=gsl_vector_get(geno,i);
+			if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);}
+		}
+
+		gsl_vector_add_constant (geno, -1.0*geno_mean);
+
+		if (geno_var!=0) {
+			if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);}
+			else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);}
+			else {cout<<"Unknown kinship mode."<<endl;}
+		}
+
+		ns_test++;
+	}
+	cout<<endl;
+
+	//with no SNP selected there is nothing to average; avoid dividing by zero
+	if (ns_test>0) {gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test);}
+
+	//copy the upper triangle into the lower one
+	for (size_t i=0; i<ni_total; ++i) {
+		for (size_t j=0; j<i; ++j) {
+			d=gsl_matrix_get (matrix_kin, j, i);
+			gsl_matrix_set (matrix_kin, i, j, d);
+		}
+	}
+
+	gsl_vector_free (geno);
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+
+
+
+//Read a BIMBAM mean genotype file, the second time: store the centered genotypes
+//of the selected SNPs and individuals in UtX and optionally compute K.
+//  indicator_idv  0/1 flag per individual (column) in the file
+//  indicator_snp  0/1 flag per SNP (line) in the file
+//  UtX            output; column c receives the c-th selected SNP, centered by the mean
+//                 of its observed genotypes, with missing genotypes set to 0
+//  K              output when calc_K is true: sum of x x^T over the selected SNPs
+//                 divided by UtX->size2; untouched otherwise
+//Returns false if the file cannot be opened or a selected line has too few fields.
+bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K)
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;}
+
+	string line;
+	char *ch_ptr;
+
+	if (calc_K==true) {gsl_matrix_set_zero (K);}
+
+	gsl_vector *genotype=gsl_vector_alloc (UtX->size1);
+	gsl_vector *genotype_miss=gsl_vector_alloc (UtX->size1);
+	double geno, geno_mean;
+	size_t n_miss;
+
+	int ni_total=(int)indicator_idv.size();
+	int ns_total=(int)indicator_snp.size();
+	int ni_test=UtX->size1;
+	int ns_test=UtX->size2;
+
+	int c_idv=0, c_snp=0;
+
+	for (int i=0; i<ns_total; ++i) {
+		//every line is consumed so that line i always belongs to SNP i
+		safeGetline(infile, line);
+		if (indicator_snp[i]==0) {continue;}
+
+		//skip the three leading fields
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+
+		c_idv=0; geno_mean=0; n_miss=0;
+		gsl_vector_set_zero (genotype_miss);
+		for (int j=0; j<ni_total; ++j) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[j]==0) {continue;}
+
+			//a truncated line yields no token; strcmp/atof must not see a null pointer
+			if (ch_ptr==NULL) {
+				cout<<"error! too few genotypes in line "<<i+1<<" of genotype file:"<<file_geno<<endl;
+				gsl_vector_free (genotype);
+				gsl_vector_free (genotype_miss);
+				return false;
+			}
+
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++;}
+			else {
+				geno=atof(ch_ptr);
+				gsl_vector_set (genotype, c_idv, geno);
+				geno_mean+=geno;
+			}
+			c_idv++;
+		}
+
+		geno_mean/=(double)(ni_test-n_miss);
+
+		//center observed genotypes; a missing genotype becomes 0, i.e. the mean after centering
+		for (size_t k=0; k<genotype->size; ++k) {
+			if (gsl_vector_get (genotype_miss, k)==1) {geno=0;}
+			else {geno=gsl_vector_get (genotype, k); geno-=geno_mean;}
+
+			gsl_vector_set (genotype, k, geno);
+			gsl_matrix_set (UtX, k, c_snp, geno);
+		}
+
+		if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);}
+
+		c_snp++;
+	}
+
+	if (calc_K==true) {
+		gsl_matrix_scale (K, 1.0/(double)ns_test);
+
+		//copy the upper triangle into the lower one
+		for (size_t k=0; k<genotype->size; ++k) {
+			for (size_t j=0; j<k; ++j) {
+				geno=gsl_matrix_get (K, j, k);
+				gsl_matrix_set (K, k, j, geno);
+			}
+		}
+	}
+
+	gsl_vector_free (genotype);
+	gsl_vector_free (genotype_miss);
+
+	infile.clear();
+	infile.close();
+
+	return true;
+}
+
+
+
+
+
+
+
+//Read a PLINK bed file, the second time: store the centered genotypes of the
+//selected SNPs and individuals in UtX and optionally compute K.
+//  indicator_idv  0/1 flag per individual in the file
+//  indicator_snp  0/1 flag per SNP in the file
+//  UtX            output; column c receives the c-th selected SNP, centered by the mean
+//                 of its observed genotypes, with missing genotypes set to 0
+//  K              output when calc_K is true: sum of x x^T over the selected SNPs
+//                 divided by UtX->size2; untouched otherwise
+//Returns false if the file cannot be opened, true otherwise.
+bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K)
+{
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;}
+
+	char ch[1];
+	bitset<8> b;
+
+	//all counters are size_t: the byte offset t*n_bit+3 passed to seekg exceeds
+	//the range of int once the bed file grows beyond 2 GiB
+	size_t ni_total=indicator_idv.size();
+	size_t ns_total=indicator_snp.size();
+	size_t ni_test=UtX->size1;
+	size_t ns_test=UtX->size2;
+
+	//number of bytes per SNP: four individuals are packed into each byte
+	size_t n_bit;
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1;}
+
+	//skip the three leading magic bytes
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+	}
+
+	if (calc_K==true) {gsl_matrix_set_zero (K);}
+
+	gsl_vector *genotype=gsl_vector_alloc (UtX->size1);
+
+	double geno, geno_mean;
+	size_t n_miss;
+	size_t c_idv=0, c_snp=0, c=0;
+
+	for (size_t t=0; t<ns_total; ++t) {
+		if (indicator_snp[t]==0) {continue;}
+		infile.seekg(t*n_bit+3);		//n_bit bytes per SNP, after the 3 magic bytes
+
+		//read genotypes; c counts individuals in the file, c_idv those actually used;
+		//missing values are stored as -9.0 until replaced below
+		c_idv=0; geno_mean=0.0; n_miss=0; c=0;
+		for (size_t i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			//bit pair (b[2j], b[2j+1]): 00 -> 2.0, 01 -> 1.0, 11 -> 0.0, 10 -> missing
+			for (size_t j=0; j<4; ++j) {
+				if ((i==(n_bit-1)) && c==ni_total) {break;}	//padding bits of the last byte
+				if (indicator_idv[c]==0) {c++; continue;}
+				c++;
+
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); geno_mean+=2.0;}
+					else {gsl_vector_set(genotype, c_idv, 1.0); geno_mean+=1.0;}
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); geno_mean+=0.0;}
+					else {gsl_vector_set(genotype, c_idv, -9.0); n_miss++;}
+				}
+				c_idv++;
+			}
+		}
+
+		geno_mean/=(double)(ni_test-n_miss);
+
+		//center observed genotypes; a missing genotype becomes 0, i.e. the mean after centering
+		for (size_t i=0; i<genotype->size; ++i) {
+			geno=gsl_vector_get (genotype, i);
+			if (geno==-9) {geno=0;}
+			else {geno-=geno_mean;}
+
+			gsl_vector_set (genotype, i, geno);
+			gsl_matrix_set (UtX, i, c_snp, geno);
+		}
+
+		if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);}
+
+		c_snp++;
+	}
+
+	if (calc_K==true) {
+		gsl_matrix_scale (K, 1.0/(double)ns_test);
+
+		//copy the upper triangle into the lower one
+		for (size_t i=0; i<genotype->size; ++i) {
+			for (size_t j=0; j<i; ++j) {
+				geno=gsl_matrix_get (K, j, i);
+				gsl_matrix_set (K, i, j, geno);
+			}
+		}
+	}
+
+	gsl_vector_free (genotype);
+	infile.clear();
+	infile.close();
+
+	return true;
+}
+
+
+
+
+
+//Read per-SNP effect estimates from a space/tab separated file with one header line.
+//  file_est    path of the estimated parameter file
+//  est_column  1-based column numbers of, in this order, the SNP id, alpha, beta and gamma;
+//              a value of 0 matches no column, leaving alpha=0, beta=0 or gamma=1
+//  mapRS2est   output, cleared first; SNP id -> alpha+beta*gamma
+//Blank lines are skipped. Returns false if the file cannot be opened, est_column has
+//fewer than four entries, a line has too few columns, or a SNP id occurs twice.
+bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est)
+{
+	mapRS2est.clear();
+
+	ifstream infile (file_est.c_str(), ifstream::in);
+	if (!infile) {cout<<"error opening estimated parameter file: "<<file_est<<endl; return false;}
+
+	//est_column[0..3] are read below
+	if (est_column.size()<4) {cout<<"error! four column numbers are required to read estimated parameter file: "<<file_est<<endl; return false;}
+
+	string line;
+	char *ch_ptr;
+
+	string rs;
+	double alpha, beta, gamma, d;
+
+	//header
+	getline(infile, line);
+
+	//largest requested column number = number of fields every line must provide
+	size_t n=*max_element(est_column.begin(), est_column.end());
+
+	while (getline(infile, line)) {
+		ch_ptr=strtok ((char *)line.c_str(), " \t");
+		if (ch_ptr==NULL) {continue;}		//blank line
+
+		alpha=0.0; beta=0.0; gamma=1.0;
+		for (size_t i=0; i<n; ++i) {
+			//a short line must be rejected before the token is dereferenced
+			if (ch_ptr==NULL) {cout<<"error! too few columns in estimated parameter file: "<<file_est<<endl; return false;}
+
+			if (i==est_column[0]-1) {rs=ch_ptr;}
+			if (i==est_column[1]-1) {alpha=atof(ch_ptr);}
+			if (i==est_column[2]-1) {beta=atof(ch_ptr);}
+			if (i==est_column[3]-1) {gamma=atof(ch_ptr);}
+
+			ch_ptr=strtok (NULL, " \t");
+		}
+
+		d=alpha+beta*gamma;
+
+		if (mapRS2est.count(rs)==0) {
+			mapRS2est[rs]=d;
+		}
+		else {
+			cout<<"the same SNP occurs more than once in estimated parameter file: "<<rs<<endl; return false;
+		}
+	}
+
+	infile.clear();
+	infile.close();
+	return true;
+}
+
+
+
+//Count the newline characters in file_input and store the total in n_lines.
+//Returns false (leaving n_lines untouched) if the file cannot be opened.
+bool CountFileLines (const string &file_input, size_t &n_lines)
+{
+	igzstream infile (file_input.c_str(), igzstream::in);
+	if (!infile) {
+		cout<<"error! fail to open file: "<<file_input<<endl;
+		return false;
+	}
+
+	//walk the stream one character at a time and tally every '\n'
+	istreambuf_iterator<char> cursor (infile), stream_end;
+	size_t n_newline=0;
+	for (; cursor!=stream_end; ++cursor) {
+		if (*cursor=='\n') {++n_newline;}
+	}
+	n_lines=n_newline;
+
+	infile.seekg (0, ios::beg);
+
+	return true;
+}
+
+
+
+//Read a gene expression file: one header line, then one line per gene holding the
+//gene id followed by one value per individual.
+//  file_gene  path of the gene expression file
+//  vec_read   output, cleared first; one entry per individual, holding the sum of that
+//             individual's values over the second and later data rows
+//  snpInfo    one entry is appended per gene, carrying the gene id as rs_number
+//  ng_total   output, number of genes (data rows) read
+//Blank lines are skipped. Returns false if the file cannot be opened or if a row
+//does not have the same number of values as the first data row.
+bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SNPINFO> &snpInfo, size_t &ng_total)
+{
+	vec_read.clear();
+	ng_total=0;
+
+	ifstream infile (file_gene.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open gene expression file: "<<file_gene<<endl; return false;}
+
+	string line;
+	char *ch_ptr;
+	string rs;
+
+	size_t n_idv=0, t=0;
+
+	//header
+	getline(infile, line);
+
+	while (getline(infile, line)) {
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		if (ch_ptr==NULL) {continue;}		//blank line
+		rs=ch_ptr;
+
+		ch_ptr=strtok (NULL, " , \t");
+
+		t=0;
+		while (ch_ptr!=NULL) {
+			if (ng_total==0) {
+				//the first data row only fixes the number of individuals
+				//NOTE(review): its values are not added to vec_read - confirm this is intended
+				vec_read.push_back(0);
+				t++;
+				n_idv++;
+			} else {
+				//a surplus column must be rejected before vec_read[t] is written
+				if (t>=n_idv) {cout<<"error! number of columns doesn't match in row: "<<ng_total<<endl; return false;}
+				vec_read[t]+=atof(ch_ptr);
+				t++;
+			}
+
+			ch_ptr=strtok (NULL, " , \t");
+		}
+
+		if (t!=n_idv) {cout<<"error! number of columns doesn't match in row: "<<ng_total<<endl; return false;}
+
+		SNPINFO sInfo={"-9", rs, -9, -9, "-9", "-9", -9, -9, -9};
+		snpInfo.push_back(sInfo);
+
+		ng_total++;
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
diff --git a/io.h b/io.h
new file mode 100644
index 0000000..13e3e47
--- /dev/null
+++ b/io.h
@@ -0,0 +1,79 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __IO_H__                
+#define __IO_H__
+
+
+#include <vector>
+#include <map>
+#include <algorithm>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#else
+#include "param.h"
+#endif
+
+using namespace std;
+
+void ProgressBar (string str, double p, double total);
+void ProgressBar (string str, double p, double total, double ratio);
+std::istream& safeGetline(std::istream& is, std::string& t);
+
+bool ReadFile_snps (const string &file_snps, set<string> &setSnps);
+bool ReadFile_log (const string &file_log, double &pheno_mean);
+
+bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo);
+bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, map<string, int> &mapID2num, const vector<size_t> &p_column);
+
+bool ReadFile_cvt (const string &file_cvt, vector<int> &indicator_cvt, vector<vector<double> > &cvt, size_t &n_cvt);
+bool ReadFile_anno (const string &file_bim, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM);
+bool ReadFile_pheno (const string &file_pheno, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, const vector<size_t> &p_column);
+bool ReadFile_column (const string &file_pheno, vector<int> &indicator_idv, vector<double> &pheno, const int &p_column);
+
+bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, size_t &ns_test); //first pass over a BIMBAM genotype file: fills snpInfo and indicator_snp, counts ns_test
+bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test); //first pass over a PLINK bed file: updates snpInfo, fills indicator_snp, counts ns_test
+
+void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); //k_mode==1: full matrix file; otherwise "id1 id2 value" lines; sets error on failure
+void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G); //file_mk lists kinship files; each is read into the next square block of columns of G
+void ReadFile_eigenU (const string &file_u, bool &error, gsl_matrix *U); //one file line per row of U
+void ReadFile_eigenD (const string &file_d, bool &error, gsl_vector *eval); //one value per file line
+
+bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); //kinship matrix from a BIMBAM genotype file
+bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); //kinship matrix from a PLINK bed file
+
+bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); //second pass: centered genotypes into UtX, K computed when calc_K is true
+bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); //second pass: centered genotypes into UtX, K computed when calc_K is true
+
+bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est); //maps each SNP id to alpha+beta*gamma; false on a duplicated id
+
+bool CountFileLines (const string &file_input, size_t &n_lines); //n_lines = number of '\n' characters in the file
+
+bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SNPINFO> &snpInfo, size_t &ng_total); //appends one snpInfo entry per gene row; ng_total = number of rows read
+
+#endif
+
+
+
+
+
+
+
diff --git a/lapack.cpp b/lapack.cpp
new file mode 100644
index 0000000..83d5290
--- /dev/null
+++ b/lapack.cpp
@@ -0,0 +1,609 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <cmath>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+
+using namespace std;
+//Fortran BLAS/LAPACK routines, single precision (every argument is passed by pointer)
+extern "C" void sgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, float *ALPHA, float *A, int *LDA, float *B, int *LDB, float *BETA, float *C, int *LDC);
+extern "C" void spotrf_(char *UPLO, int *N, float *A, int *LDA, int *INFO);
+extern "C" void spotrs_(char *UPLO, int *N, int *NRHS, float *A, int *LDA, float *B, int *LDB, int *INFO);
+extern "C" void ssyev_(char* JOBZ, char* UPLO, int *N, float *A, int *LDA, float *W, float *WORK, int *LWORK, int *INFO);
+extern "C" void ssyevr_(char* JOBZ, char *RANGE, char* UPLO, int *N, float *A, int *LDA, float *VL, float *VU, int *IL, int *IU, float *ABSTOL, int *M, float *W, float *Z, int *LDZ, int *ISUPPZ, float *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO);
+//double-precision counterparts of the routines above
+extern "C" void dgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double *ALPHA, double *A, int *LDA, double *B, int *LDB, double *BETA, double *C, int *LDC);
+extern "C" void dpotrf_(char *UPLO, int *N, double *A, int *LDA, int *INFO);
+extern "C" void dpotrs_(char *UPLO, int *N, int *NRHS, double *A, int *LDA, double *B, int *LDB, int *INFO);
+extern "C" void dsyev_(char* JOBZ, char* UPLO, int *N, double *A, int *LDA, double *W, double *WORK, int *LWORK, int *INFO);
+extern "C" void dsyevr_(char* JOBZ, char *RANGE, char* UPLO, int *N, double *A, int *LDA, double *VL, double *VU, int *IL, int *IU, double *ABSTOL, int *M, double *W, double *Z, int *LDZ, int *ISUPPZ, double *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO);
+
+
+//Cholesky decomposition through LAPACK spotrf; A is overwritten with the factor (destroyed).
+void lapack_float_cholesky_decomp (gsl_matrix_float *A)
+{
+	//tda (the row stride) is the leading dimension LAPACK must see, so matrix views work too
+	int N=A->size1, LDA=A->tda, INFO;
+	char UPLO='L';
+	
+	if (N!=(int)A->size2) {cout<<"Matrix needs to be symmetric and same dimension in lapack_float_cholesky_decomp."<<endl; return;}
+	
+	spotrf_(&UPLO, &N, A->data, &LDA, &INFO);
+	//failure is only reported on stdout; the caller gets no status back
+	if (INFO!=0) {cout<<"Cholesky decomposition unsuccessful in lapack_float_cholesky_decomp."<<endl;}
+}
+
+//Cholesky decomposition through LAPACK dpotrf; A is overwritten with the factor (destroyed).
+void lapack_cholesky_decomp (gsl_matrix *A)
+{
+	//tda (the row stride) is the leading dimension LAPACK must see, so matrix views work too
+	int N=A->size1, LDA=A->tda, INFO;
+	char UPLO='L';
+	
+	if (N!=(int)A->size2) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_decomp."<<endl; return;}
+	
+	dpotrf_(&UPLO, &N, A->data, &LDA, &INFO);
+	//failure is only reported on stdout; the caller gets no status back
+	if (INFO!=0) {cout<<"Cholesky decomposition unsuccessful in lapack_cholesky_decomp."<<endl;}
+}
+
+//Cholesky solve through LAPACK spotrs; A holds the factor written by lapack_float_cholesky_decomp, x receives the solution of A*x=b.
+void lapack_float_cholesky_solve (gsl_matrix_float *A, const gsl_vector_float *b, gsl_vector_float *x)
+{
+	//the leading dimension is the row stride tda, so that matrix views are handled as well
+	int N=A->size1, NRHS=1, LDA=A->tda, LDB=b->size, INFO;
+	char UPLO='L';
+	
+	if (N!=(int)A->size2 || N!=LDB) {cout<<"Matrix needs to be symmetric and same dimension in lapack_float_cholesky_solve."<<endl; return;}
+	
+	//spotrs solves in place: x is seeded with b (x->data is handed over as is, so x must have unit stride)
+	gsl_vector_float_memcpy (x, b);
+	spotrs_(&UPLO, &N, &NRHS, A->data, &LDA, x->data, &LDB, &INFO);
+	if (INFO!=0) {cout<<"Cholesky solve unsuccessful in lapack_float_cholesky_solve."<<endl;}
+}
+
+//Cholesky solve through LAPACK dpotrs; A holds the factor written by lapack_cholesky_decomp, x receives the solution of A*x=b.
+void lapack_cholesky_solve (gsl_matrix *A, const gsl_vector *b, gsl_vector *x)
+{
+	//the leading dimension is the row stride tda, so that matrix views are handled as well
+	int N=A->size1, NRHS=1, LDA=A->tda, LDB=b->size, INFO;
+	char UPLO='L';
+	
+	if (N!=(int)A->size2 || N!=LDB) {cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_solve."<<endl; return;}
+	
+	//dpotrs solves in place: x is seeded with b (x->data is handed over as is, so x must have unit stride)
+	gsl_vector_memcpy (x, b);
+	dpotrs_(&UPLO, &N, &NRHS, A->data, &LDA, x->data, &LDB, &INFO);
+	if (INFO!=0) {cout<<"Cholesky solve unsuccessful in lapack_cholesky_solve."<<endl;}
+}
+
+//C = alpha*op(A)*op(B) + beta*C through the Fortran BLAS routine sgemm; TransA/TransB ('N' or 'T') select op().
+void lapack_sgemm (char *TransA, char *TransB, float alpha, const gsl_matrix_float *A, const gsl_matrix_float *B, float beta, gsl_matrix_float *C)
+{
+	int M, N, K1, K2, LDA=A->size1, LDB=B->size1, LDC=C->size1;
+	
+	if (*TransA=='N' || *TransA=='n') {M=A->size1; K1=A->size2;}
+	else if (*TransA=='T' || *TransA=='t') {M=A->size2; K1=A->size1;}
+	else {cout<<"need 'N' or 'T' in lapack_sgemm"<<endl; return;}
+	
+	if (*TransB=='N' || *TransB=='n') {N=B->size2; K2=B->size1;}
+	else if (*TransB=='T' || *TransB=='t')  {N=B->size1; K2=B->size2;}
+	else {cout<<"need 'N' or 'T' in lapack_sgemm"<<endl;  return;}
+	
+	if (K1!=K2) {cout<<"A and B not compatible in lapack_sgemm"<<endl; return;}
+	if (C->size1!=(size_t)M || C->size2!=(size_t)N) {cout<<"C not compatible in lapack_sgemm"<<endl; return;}
+	//gsl stores matrices row-major, BLAS expects column-major: the transposed copies below are A, B and C in column-major layout,
+	gsl_matrix_float *A_t=gsl_matrix_float_alloc (A->size2, A->size1);
+	gsl_matrix_float_transpose_memcpy (A_t, A);
+	gsl_matrix_float *B_t=gsl_matrix_float_alloc (B->size2, B->size1);
+	gsl_matrix_float_transpose_memcpy (B_t, B);
+	gsl_matrix_float *C_t=gsl_matrix_float_alloc (C->size2, C->size1);
+	gsl_matrix_float_transpose_memcpy (C_t, C);
+	//so every leading dimension is the row count of the original matrix (LDC=C->size1; C->size2 is only right for a square C)
+	sgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC);
+	gsl_matrix_float_transpose_memcpy (C, C_t);
+	
+	gsl_matrix_float_free (A_t);
+	gsl_matrix_float_free (B_t);
+	gsl_matrix_float_free (C_t);
+	return;
+}
+
+
+//C = alpha*op(A)*op(B) + beta*C through the Fortran BLAS routine dgemm; TransA/TransB ('N' or 'T') select op().
+void lapack_dgemm (char *TransA, char *TransB, double alpha, const gsl_matrix *A, const gsl_matrix *B, double beta, gsl_matrix *C)
+{
+	int M, N, K1, K2, LDA=A->size1, LDB=B->size1, LDC=C->size1;
+	
+	if (*TransA=='N' || *TransA=='n') {M=A->size1; K1=A->size2;}
+	else if (*TransA=='T' || *TransA=='t') {M=A->size2; K1=A->size1;}
+	else {cout<<"need 'N' or 'T' in lapack_dgemm"<<endl; return;}
+	
+	if (*TransB=='N' || *TransB=='n') {N=B->size2; K2=B->size1;}
+	else if (*TransB=='T' || *TransB=='t')  {N=B->size1; K2=B->size2;}
+	else {cout<<"need 'N' or 'T' in lapack_dgemm"<<endl;  return;}
+	
+	if (K1!=K2) {cout<<"A and B not compatible in lapack_dgemm"<<endl; return;}
+	if (C->size1!=(size_t)M || C->size2!=(size_t)N) {cout<<"C not compatible in lapack_dgemm"<<endl; return;}
+	//gsl stores matrices row-major, BLAS expects column-major: the transposed copies below are A, B and C in column-major layout,
+	gsl_matrix *A_t=gsl_matrix_alloc (A->size2, A->size1);
+	gsl_matrix_transpose_memcpy (A_t, A);
+	gsl_matrix *B_t=gsl_matrix_alloc (B->size2, B->size1);
+	gsl_matrix_transpose_memcpy (B_t, B);
+	gsl_matrix *C_t=gsl_matrix_alloc (C->size2, C->size1);
+	gsl_matrix_transpose_memcpy (C_t, C);
+	//so every leading dimension is the row count of the original matrix (LDC=C->size1; C->size2 is only right for a square C)
+	dgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC);
+
+	gsl_matrix_transpose_memcpy (C, C_t);
+	
+	gsl_matrix_free (A_t);
+	gsl_matrix_free (B_t);
+	gsl_matrix_free (C_t);
+	return;
+}
+
+
+
+//eigen value decomposition through LAPACK, matrix A is destroyed; eigenvalues are written to eval, eigenvectors to evec. float seems to have problem with large matrices (in mac)
+void lapack_float_eigen_symmv (gsl_matrix_float *A, gsl_vector_float *eval, gsl_matrix_float *evec, const size_t flag_largematrix)
+{
+	if (flag_largematrix==1) {
+		int N=A->size1, LDA=A->size1, INFO, LWORK=-1;
+		char JOBZ='V', UPLO='L';
+				
+		if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_float_eigen_symmv."<<endl; return;}
+		
+		//flag_largematrix==1 selects ssyev, which leaves the eigenvectors in A itself.
+		//No workspace query is made here: a fixed workspace of 3*N floats is handed over
+		//(LAPACK documents max(1,3*N-1) as the minimum LWORK for ssyev).
+		//Every error return below releases what has been allocated so far.
+		
+		LWORK=3*N;
+		float *WORK=new float [LWORK];	
+		ssyev_(&JOBZ, &UPLO, &N, A->data, &LDA, eval->data, WORK, &LWORK, &INFO);
+		if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_float_eigen_symmv."<<endl; delete [] WORK; return;}
+		
+		gsl_matrix_float_view A_sub=gsl_matrix_float_submatrix(A, 0, 0, N, N);
+		gsl_matrix_float_memcpy (evec, &A_sub.matrix);
+		gsl_matrix_float_transpose (evec);
+		
+		delete [] WORK;
+	} else {	
+		int N=A->size1, LDA=A->size1, LDZ=A->size1, INFO, LWORK=-1, LIWORK=-1;
+		char JOBZ='V', UPLO='L', RANGE='A';
+		float ABSTOL=1.0E-7;
+		
+		//VL, VU, IL, IU are not referenced; M equals N if RANGE='A'
+		float VL=0.0, VU=0.0;
+		int IL=0, IU=0, M;
+		
+		if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_float_eigen_symmv."<<endl; return;}
+		
+		int *ISUPPZ=new int [2*N];
+		//first call (LWORK=LIWORK=-1) is a workspace query: the sizes come back in WORK_temp[0] and IWORK_temp[0]
+		float WORK_temp[1];
+		int IWORK_temp[1];
+		ssyevr_(&JOBZ, &RANGE, &UPLO, &N, A->data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK_temp, &LWORK, IWORK_temp, &LIWORK, &INFO);
+		if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_float_eigen_symmv."<<endl; delete [] ISUPPZ; return;}
+		LWORK=(int)WORK_temp[0]; LIWORK=(int)IWORK_temp[0];	
+		 
+		//second call does the actual decomposition with the workspaces sized by the query;
+		//all three buffers are released on its error return as well
+		float *WORK=new float [LWORK];
+		int *IWORK=new int [LIWORK];
+		
+		ssyevr_(&JOBZ, &RANGE, &UPLO, &N, A->data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK, &LWORK, IWORK, &LIWORK, &INFO);
+		if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_float_eigen_symmv."<<endl; delete [] ISUPPZ; delete [] WORK; delete [] IWORK; return;}
+		
+		gsl_matrix_float_transpose (evec);
+		
+		delete [] ISUPPZ;
+		delete [] WORK;
+		delete [] IWORK;
+	}
+	
+	
+	return;
+}
+
+
+
+//eigen value decomposition through LAPACK, matrix A is destroyed; eigenvalues are written to eval, eigenvectors to evec
+void lapack_eigen_symmv (gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec, const size_t flag_largematrix)
+{
+	if (flag_largematrix==1) {
+		int N=A->size1, LDA=A->size1, INFO, LWORK=-1;
+		char JOBZ='V', UPLO='L';		
+		
+		if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<<endl; return;}
+		
+		//flag_largematrix==1 selects dsyev, which leaves the eigenvectors in A itself.
+		//No workspace query is made here: a fixed workspace of 3*N doubles is handed over
+		//(LAPACK documents max(1,3*N-1) as the minimum LWORK for dsyev).
+		//Every error return below releases what has been allocated so far.
+		
+		LWORK=3*N;
+		double *WORK=new double [LWORK];	
+		dsyev_(&JOBZ, &UPLO, &N, A->data, &LDA, eval->data, WORK, &LWORK, &INFO);
+		if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<<endl; delete [] WORK; return;}
+		
+		gsl_matrix_view A_sub=gsl_matrix_submatrix(A, 0, 0, N, N);
+		gsl_matrix_memcpy (evec, &A_sub.matrix);
+		gsl_matrix_transpose (evec);
+		
+		delete [] WORK;
+	} else {	
+		int N=A->size1, LDA=A->size1, LDZ=A->size1, INFO, LWORK=-1, LIWORK=-1;
+		char JOBZ='V', UPLO='L', RANGE='A';
+		double ABSTOL=1.0E-7;
+		
+		//VL, VU, IL, IU are not referenced; M equals N if RANGE='A'
+		double VL=0.0, VU=0.0;
+		int IL=0, IU=0, M;
+		
+		if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<<endl; return;}
+		
+		int *ISUPPZ=new int [2*N];
+		
+		double WORK_temp[1];
+		int IWORK_temp[1];
+		//first call (LWORK=LIWORK=-1) is a workspace query: the sizes come back in WORK_temp[0] and IWORK_temp[0]
+		dsyevr_(&JOBZ, &RANGE, &UPLO, &N, A->data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK_temp, &LWORK, IWORK_temp, &LIWORK, &INFO);
+		if (INFO!=0) {cout<<"Work space estimate unsuccessful in lapack_eigen_symmv."<<endl; delete [] ISUPPZ; return;}
+		LWORK=(int)WORK_temp[0]; LIWORK=(int)IWORK_temp[0];	
+
+		//second call does the actual decomposition with the workspaces sized by the query;
+		//all three buffers are released on its error return as well
+		double *WORK=new double [LWORK];
+		int *IWORK=new int [LIWORK];
+		
+		dsyevr_(&JOBZ, &RANGE, &UPLO, &N, A->data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK, &LWORK, IWORK, &LIWORK, &INFO);
+		if (INFO!=0) {cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<<endl; delete [] ISUPPZ; delete [] WORK; delete [] IWORK; return;}
+
+		gsl_matrix_transpose (evec);
+		
+		delete [] ISUPPZ;
+		delete [] WORK;
+		delete [] IWORK;
+	}
+	
+	return;
+}
+
+//Symmetric eigen decomposition of G: eigenvalues go to eval, eigenvectors to U, and the mean
+//eigenvalue (the trace of G divided by its dimension) is returned.
+//Eigenvalues are deliberately left untouched - DO NOT set them to be positive here.
+double EigenDecomp (gsl_matrix *G, gsl_matrix *U, gsl_vector *eval, const size_t flag_largematrix)
+{
+#ifdef WITH_LAPACK
+	//LAPACK backend; the lapack wrapper destroys G
+	lapack_eigen_symmv (G, eval, U, flag_largematrix);
+#else
+	//plain GSL backend; flag_largematrix is not used on this path
+	gsl_eigen_symmv_workspace *workspace=gsl_eigen_symmv_alloc (G->size1);
+	gsl_eigen_symmv (G, eval, U, workspace);
+	gsl_eigen_symmv_free (workspace);
+#endif
+
+	//accumulate the eigenvalues, then average them
+	const size_t n_eval=eval->size;
+	double eval_sum=0.0;
+	size_t k=0;
+	while (k<n_eval) {
+		eval_sum+=gsl_vector_get(eval, k);
+		++k;
+	}
+
+	const double eval_mean=eval_sum/(double)n_eval;
+	return eval_mean;
+}
+
+
+//Single-precision symmetric eigen decomposition of G.
+//  G    : symmetric input matrix; used as work space and overwritten
+//  U    : receives the eigenvectors
+//  eval : receives the eigenvalues
+//  flag_largematrix : only used by the LAPACK build, where it is forwarded to lapack_float_eigen_symmv
+//Returns the mean of the eigenvalues. DO NOT set eigen values to be positive here.
+double EigenDecomp (gsl_matrix_float *G, gsl_matrix_float *U, gsl_vector_float *eval, const size_t flag_largematrix)
+{
+#ifdef WITH_LAPACK
+	lapack_float_eigen_symmv (G, eval, U, flag_largematrix);
+#else
+	//gsl doesn't provide float precision eigen decomposition; plus, float precision eigen decomposition in lapack may not work on OS 10.4
+	//first change to double precision
+	gsl_matrix *G_double=gsl_matrix_alloc (G->size1, G->size2);
+	gsl_matrix *U_double=gsl_matrix_alloc (U->size1, U->size2);
+	gsl_vector *eval_double=gsl_vector_alloc (eval->size);
+	for (size_t i=0; i<G->size1; i++) {
+		for (size_t j=0; j<G->size2; j++) {
+			gsl_matrix_set(G_double, i, j, gsl_matrix_float_get(G, i, j));
+		}
+	}	
+	gsl_eigen_symmv_workspace *w_space=gsl_eigen_symmv_alloc (G->size1);
+	gsl_eigen_symmv (G_double, eval_double, U_double, w_space);
+	gsl_eigen_symmv_free (w_space);	
+	
+	//change back to float precision (G receives the content gsl_eigen_symmv left in its double work copy)
+	for (size_t i=0; i<G->size1; i++) {
+		for (size_t j=0; j<G->size2; j++) {
+			gsl_matrix_float_set(G, i, j, gsl_matrix_get(G_double, i, j));
+		}
+	}
+	for (size_t i=0; i<U->size1; i++) {
+		for (size_t j=0; j<U->size2; j++) {
+			gsl_matrix_float_set(U, i, j, gsl_matrix_get(U_double, i, j));
+		}
+	}
+	for (size_t i=0; i<eval->size; i++) {
+		gsl_vector_float_set(eval, i, gsl_vector_get(eval_double, i));
+	}	
+	
+	//delete double precision matrices
+	gsl_matrix_free (G_double);
+	gsl_matrix_free (U_double);
+	gsl_vector_free (eval_double);
+#endif
+	
+	//a former step that reset eigenvalues below 1e-10 to zero is intentionally not applied
+	//calculate trace_G=mean(diag(G)), which equals the mean of the eigenvalues
+	double d=0.0;
+	for (size_t i=0; i<eval->size; ++i) {
+		d+=gsl_vector_float_get(eval, i);
+	}
+	d/=(double)eval->size;
+	
+	return d;
+}
+
+//Solves Omega*OiXty=Xty through a Cholesky decomposition and returns log(det(Omega)); Omega is overwritten by the decomposition.
+double CholeskySolve(gsl_matrix *Omega, gsl_vector *Xty, gsl_vector *OiXty)
+{
+	double logdet_O=0.0;
+	//log(det(Omega)) is accumulated as 2*sum(log(diagonal entries of the decomposed matrix))
+#ifdef WITH_LAPACK
+	lapack_cholesky_decomp(Omega);
+	for (size_t i=0; i<Omega->size1; ++i) {
+		logdet_O+=log(gsl_matrix_get (Omega, i, i));
+	}	
+	logdet_O*=2.0;	
+	lapack_cholesky_solve(Omega, Xty, OiXty);	
+#else	
+	int status = gsl_linalg_cholesky_decomp(Omega);
+	if(status == GSL_EDOM) {
+		cout << "## non-positive definite matrix" << endl; 
+		//		exit(0); 
+	}
+	//NOTE(review): after a failed decomposition the message above is all that happens; the code below still runs
+	for (size_t i=0; i<Omega->size1; ++i) {
+		logdet_O+=log(gsl_matrix_get (Omega, i, i));
+	}
+	logdet_O*=2.0;	
+	//two in-place triangular solves on a copy of Xty: lower triangle first, then upper triangle (presumably relies on the upper triangle holding the transposed factor - confirm for the GSL version in use)
+	gsl_vector_memcpy (OiXty, Xty);
+	gsl_blas_dtrsv(CblasLower, CblasNoTrans, CblasNonUnit, Omega, OiXty); 
+	gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, OiXty); 	
+	//	gsl_linalg_cholesky_solve(XtX, Xty, iXty);
+#endif
+	
+	return logdet_O;
+}
+
+//Single-precision variant: solves Omega*OiXty=Xty through a Cholesky decomposition and returns log(det(Omega)); Omega is overwritten by the decomposition.
+double CholeskySolve(gsl_matrix_float *Omega, gsl_vector_float *Xty, gsl_vector_float *OiXty)
+{
+	double logdet_O=0.0;
+	//log(det(Omega)) is accumulated as 2*sum(log(diagonal entries of the decomposed matrix))
+#ifdef WITH_LAPACK
+	lapack_float_cholesky_decomp(Omega);
+	for (size_t i=0; i<Omega->size1; ++i) {
+		logdet_O+=log(gsl_matrix_float_get (Omega, i, i));
+	}	
+	logdet_O*=2.0;	
+	lapack_float_cholesky_solve(Omega, Xty, OiXty);	
+#else
+	gsl_matrix *Omega_double=gsl_matrix_alloc (Omega->size1, Omega->size2);
+	double d;
+	for (size_t i=0; i<Omega->size1; ++i) {
+		for (size_t j=0; j<Omega->size2; ++j) {
+			d=(double)gsl_matrix_float_get (Omega, i, j);
+			gsl_matrix_set (Omega_double, i, j, d);
+		}
+	}
+	//the decomposition itself runs in double precision on the copy made above
+	int status = gsl_linalg_cholesky_decomp(Omega_double);
+	if(status == GSL_EDOM) {
+		cout << "## non-positive definite matrix" << endl; 
+		//		exit(0); 
+	}	
+	//copy the decomposition back into the float matrix and pick up the diagonal for the log determinant on the way
+	for (size_t i=0; i<Omega->size1; ++i) {
+		for (size_t j=0; j<Omega->size2; ++j) {
+			d=gsl_matrix_get (Omega_double, i, j);
+			if (j==i) {logdet_O+=log(d);}
+			gsl_matrix_float_set (Omega, i, j, (float)d);
+		}
+	}
+	logdet_O*=2.0;	
+	//two in-place triangular solves in float precision on a copy of Xty: lower triangle first, then upper triangle
+	gsl_vector_float_memcpy (OiXty, Xty);
+	gsl_blas_strsv(CblasLower, CblasNoTrans, CblasNonUnit, Omega, OiXty); 
+	gsl_blas_strsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, OiXty); 	
+	//	gsl_linalg_cholesky_solve(XtX, Xty, iXty);
+	
+	gsl_matrix_free (Omega_double);
+#endif
+	
+	return logdet_O;
+}	
+
+
+//LU decomposition (double precision): thin wrapper that forwards LU, p and signum unchanged to gsl_linalg_LU_decomp
+void LUDecomp (gsl_matrix *LU, gsl_permutation *p, int *signum)
+{
+	gsl_linalg_LU_decomp (LU, p, signum);
+	return;
+}
+
+//Single-precision LU decomposition, carried out in double precision: LU is widened into a
+//temporary double matrix, factorised by gsl_linalg_LU_decomp and the result is narrowed back into LU.
+void LUDecomp (gsl_matrix_float *LU, gsl_permutation *p, int *signum)
+{
+	const size_t n_row=LU->size1, n_col=LU->size2;
+	gsl_matrix *lu_dbl=gsl_matrix_alloc (n_row, n_col);
+
+	//float -> double
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			gsl_matrix_set (lu_dbl, r, c, gsl_matrix_float_get(LU, r, c));
+		}
+	}
+
+	gsl_linalg_LU_decomp (lu_dbl, p, signum);
+
+	//double -> float
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			gsl_matrix_float_set (LU, r, c, gsl_matrix_get(lu_dbl, r, c));
+		}
+	}
+
+	gsl_matrix_free (lu_dbl);
+}
+
+
+//LU invert (double precision): thin wrapper that forwards the decomposition (LU, p) and the output matrix to gsl_linalg_LU_invert
+void LUInvert (const gsl_matrix *LU, const gsl_permutation *p, gsl_matrix *inverse)
+{
+	gsl_linalg_LU_invert (LU, p, inverse);
+	return;
+}
+
+//Single-precision inverse from an LU decomposition (LU, p), carried out in double precision:
+//LU is widened to double, gsl_linalg_LU_invert is applied and the inverse is narrowed back to float.
+void LUInvert (const gsl_matrix_float *LU, const gsl_permutation *p, gsl_matrix_float *inverse)
+{
+	gsl_matrix *lu_dbl=gsl_matrix_alloc (LU->size1, LU->size2);
+	gsl_matrix *inv_dbl=gsl_matrix_alloc (inverse->size1, inverse->size2);
+
+	//float -> double
+	for (size_t r=0; r<LU->size1; ++r) {
+		for (size_t c=0; c<LU->size2; ++c) {
+			gsl_matrix_set (lu_dbl, r, c, gsl_matrix_float_get(LU, r, c));
+		}
+	}
+
+	gsl_linalg_LU_invert (lu_dbl, p, inv_dbl);
+
+	//double -> float
+	for (size_t r=0; r<inverse->size1; ++r) {
+		for (size_t c=0; c<inverse->size2; ++c) {
+			gsl_matrix_float_set (inverse, r, c, gsl_matrix_get(inv_dbl, r, c));
+		}
+	}
+
+	gsl_matrix_free (inv_dbl);
+	gsl_matrix_free (lu_dbl);
+	return;
+}
+
+//LU lndet (double precision): hands an existing LU decomposition to gsl_linalg_LU_lndet
+//and returns whatever that routine reports
+double LULndet (gsl_matrix *LU)
+{
+	const double lndet=gsl_linalg_LU_lndet (LU);
+	return lndet;
+}
+
+//LU lndet for a single-precision LU decomposition: LU is widened into a temporary double
+//matrix that is passed to gsl_linalg_LU_lndet. LU itself is only read, never written back.
+double LULndet (gsl_matrix_float *LU)
+{
+	const size_t n_row=LU->size1;
+	const size_t n_col=LU->size2;
+	gsl_matrix *lu_dbl=gsl_matrix_alloc (n_row, n_col);
+
+	//float -> double
+	size_t r=0;
+	while (r<n_row) {
+		size_t c=0;
+		while (c<n_col) {
+			gsl_matrix_set (lu_dbl, r, c, gsl_matrix_float_get(LU, r, c));
+			++c;
+		}
+		++r;
+	}
+
+	//evaluate on the double copy
+	const double lndet=gsl_linalg_LU_lndet (lu_dbl);
+
+	//release the temporary before returning
+	gsl_matrix_free (lu_dbl);
+
+	return lndet;
+}
+
+
+//LU solve (double precision): thin wrapper that forwards the decomposition (LU, p), the right-hand side b and the output x to gsl_linalg_LU_solve
+void LUSolve (const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x)
+{
+	gsl_linalg_LU_solve (LU, p, b, x);
+	return;
+}
+
+//Single-precision solve from an LU decomposition (LU, p) with right-hand side b, carried out in double precision:
+//LU, b and x are widened to double, gsl_linalg_LU_solve is applied and the solution is narrowed back into x.
+void LUSolve (const gsl_matrix_float *LU, const gsl_permutation *p, const gsl_vector_float *b, gsl_vector_float *x)
+{
+	gsl_matrix *lu_dbl=gsl_matrix_alloc (LU->size1, LU->size2);
+	gsl_vector *b_dbl=gsl_vector_alloc (b->size);
+	gsl_vector *x_dbl=gsl_vector_alloc (x->size);
+
+	//float -> double for the matrix
+	for (size_t r=0; r<LU->size1; ++r) {
+		for (size_t c=0; c<LU->size2; ++c) {
+			gsl_matrix_set (lu_dbl, r, c, gsl_matrix_float_get(LU, r, c));
+		}
+	}
+
+	//float -> double for both vectors (the current content of x is copied as well)
+	for (size_t k=0; k<b->size; ++k) {
+		gsl_vector_set (b_dbl, k, gsl_vector_float_get(b, k));
+	}
+	for (size_t k=0; k<x->size; ++k) {
+		gsl_vector_set (x_dbl, k, gsl_vector_float_get(x, k));
+	}
+
+	gsl_linalg_LU_solve (lu_dbl, p, b_dbl, x_dbl);
+
+	//double -> float for the solution
+	for (size_t k=0; k<x->size; ++k) {
+		gsl_vector_float_set (x, k, gsl_vector_get(x_dbl, k));
+	}
+
+	//release the temporaries
+	gsl_vector_free (x_dbl);
+	gsl_vector_free (b_dbl);
+	gsl_matrix_free (lu_dbl);
+}
+
+
diff --git a/lapack.h b/lapack.h
new file mode 100644
index 0000000..cb7b156
--- /dev/null
+++ b/lapack.h
@@ -0,0 +1,53 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __LAPACK_H__                
+#define __LAPACK_H__
+
+
+
+using namespace std;
+
+//wrappers around the Fortran LAPACK/BLAS routines (float and double variants); this header includes no GSL headers itself, so the gsl types must already be declared by the including file
+void lapack_float_cholesky_decomp (gsl_matrix_float *A);
+void lapack_cholesky_decomp (gsl_matrix *A);
+void lapack_float_cholesky_solve (gsl_matrix_float *A, const gsl_vector_float *b, gsl_vector_float *x);
+void lapack_cholesky_solve (gsl_matrix *A, const gsl_vector *b, gsl_vector *x);
+void lapack_sgemm (char *TransA, char *TransB, float alpha, const gsl_matrix_float *A, const gsl_matrix_float *B, float beta, gsl_matrix_float *C);
+void lapack_dgemm (char *TransA, char *TransB, double alpha, const gsl_matrix *A, const gsl_matrix *B, double beta, gsl_matrix *C);
+void lapack_float_eigen_symmv (gsl_matrix_float *A, gsl_vector_float *eval, gsl_matrix_float *evec, const size_t flag_largematrix);
+void lapack_eigen_symmv (gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec, const size_t flag_largematrix);
+//symmetric eigen decomposition; the return value is the mean of the eigenvalues
+double EigenDecomp (gsl_matrix *G, gsl_matrix *U, gsl_vector *eval, const size_t flag_largematrix);
+double EigenDecomp (gsl_matrix_float *G, gsl_matrix_float *U, gsl_vector_float *eval, const size_t flag_largematrix);
+//Cholesky based solve; the return value is the log determinant accumulated from the decomposition
+double CholeskySolve(gsl_matrix *Omega, gsl_vector *Xty, gsl_vector *OiXty);
+double CholeskySolve(gsl_matrix_float *Omega, gsl_vector_float *Xty, gsl_vector_float *OiXty);
+//LU helpers; the float variants convert to double, call the GSL routine and convert the result back
+void LUDecomp (gsl_matrix *LU, gsl_permutation *p, int *signum);
+void LUDecomp (gsl_matrix_float *LU, gsl_permutation *p, int *signum);
+void LUInvert (const gsl_matrix *LU, const gsl_permutation *p, gsl_matrix *inverse);
+void LUInvert (const gsl_matrix_float *LU, const gsl_permutation *p, gsl_matrix_float *inverse);
+double LULndet (gsl_matrix *LU);
+double LULndet (gsl_matrix_float *LU);
+void LUSolve (const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x);
+void LUSolve (const gsl_matrix_float *LU, const gsl_permutation *p, const gsl_vector_float *b, gsl_vector_float *x);
+#endif
+
+
+
diff --git a/lm.cpp b/lm.cpp
new file mode 100644
index 0000000..c983253
--- /dev/null
+++ b/lm.cpp
@@ -0,0 +1,571 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <bitset>
+#include <cstring>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_roots.h"
+#include "gsl/gsl_min.h"
+#include "gsl/gsl_integration.h"
+
+#include "gzstream.h"
+#include "lapack.h"
+
+#ifdef FORCE_FLOAT
+#include "lm_float.h"
+#else
+#include "lm.h"
+#endif
+
+
+using namespace std;
+
+
+
+
+
+//Initialises this LM object from the parameter object: mode, file names, dimension counters,
+//inclusion indicators and SNP annotation are copied from cPar; time_opt and ng_test start at zero.
+void LM::CopyFromParam (PARAM &cPar) 
+{
+	//quantities produced by this object start from scratch
+	time_opt=0.0;
+	ng_test=0;
+	//analysis mode, progress display pace and file names
+	d_pace=cPar.d_pace;
+	a_mode=cPar.a_mode;
+	file_out=cPar.file_out;
+	file_gene=cPar.file_gene;
+	file_geno=cPar.file_geno;
+	file_bfile=cPar.file_bfile;
+
+	//dimension counters
+	ni_total=cPar.ni_total;
+	ni_test=cPar.ni_test;
+	ns_total=cPar.ns_total;
+	ns_test=cPar.ns_test;
+	n_cvt=cPar.n_cvt;
+	ng_total=cPar.ng_total;
+	//per-SNP annotation and the 0/1 inclusion indicators
+	snpInfo=cPar.snpInfo;
+	indicator_snp=cPar.indicator_snp;
+	indicator_idv=cPar.indicator_idv;
+}
+
+
+//Hands the quantities produced by this object back to the parameter object:
+//ng_test and time_opt (time_opt is accumulated in minutes by the analysis routines).
+void LM::CopyToParam (PARAM &cPar) 
+{
+	//only these two members of cPar are written
+	cPar.ng_test=ng_test;
+	cPar.time_opt=time_opt;
+}
+
+
+
+//Writes the association results to ./output/<file_out>.assoc.txt: one header line, then one line per
+//entry of sumStat. The statistic columns depend on a_mode (51: Wald, 52: LRT, 53: score, 54: all three).
+//With a gene expression file the rows are labelled by gene id, otherwise by the SNP annotation in snpInfo.
+void LM::WriteFiles () 
+{
+	string file_str;
+	file_str="./output/"+file_out;
+	file_str+=".assoc.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+
+	if (!file_gene.empty()) {
+		outfile<<"geneID"<<"\t";
+		if (a_mode==51) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl;
+		} else if (a_mode==52) {
+			outfile<<"p_lrt"<<endl;
+		} else if (a_mode==53) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl;
+		} else if (a_mode==54) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
+		} else {}
+				
+		for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) {	
+			outfile<<snpInfo[t].rs_number<<"\t";
+			//the tab after the gene id is already written above, so no statistic row may start with another "\t"
+			if (a_mode==51) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl;
+			} else if (a_mode==52) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].p_lrt<<endl;
+			} else if (a_mode==53) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_score<<endl;
+			} else if (a_mode==54) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl;
+			} else {}
+		}	
+	}  else {
+		outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t";
+		
+		if (a_mode==51) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl;
+		} else if (a_mode==52) {
+			outfile<<"p_lrt"<<endl;
+		} else if (a_mode==53) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl;
+		} else if (a_mode==54) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
+		} else {}
+		
+		size_t t=0;
+		for (size_t i=0; i<snpInfo.size(); ++i) {
+			if (indicator_snp[i]==0) {continue;}
+			//t indexes sumStat and only advances for SNPs whose indicator is non-zero
+			outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t";
+			
+			if (a_mode==51) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl;
+			} else if (a_mode==52) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].p_lrt<<endl;
+			} else if (a_mode==53) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_score<<endl;
+			} else if (a_mode==54) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl;
+			} else {}
+			t++;
+		}
+	}
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+
+
+
+//Computes two quadratic forms with the part carried by WtWi removed:
+//  xPwx = x'x - (WtWi*Wtx)'Wtx   and   xPwy = x'y - (WtWi*Wtx)'Wty.
+//Wtx and Wty are supplied by the caller (the callers in this file obtain them as W'x and W'y).
+void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *Wtx, const gsl_vector *y, const gsl_vector *x,  double &xPwy, double &xPwx)
+{
+	//scratch vector for WtWi*Wtx, shared by both corrections
+	gsl_vector *proj_x=gsl_vector_alloc (Wty->size);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, proj_x);
+
+	//x'x and its correction
+	double corr=0.0;
+	gsl_blas_ddot (x, x, &xPwx);
+	gsl_blas_ddot (proj_x, Wtx, &corr);
+	xPwx-=corr;
+
+	//x'y and its correction
+	gsl_blas_ddot (x, y, &xPwy);
+	gsl_blas_ddot (proj_x, Wty, &corr);
+	xPwy-=corr;
+	gsl_vector_free (proj_x);
+}
+
+
+//Single-vector form: yPwy = y'y - (WtWi*Wty)'Wty, with Wty supplied by the caller.
+void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, double &yPwy)
+{
+	//start from the raw sum of squares
+	gsl_blas_ddot (y, y, &yPwy);
+
+	//WtWi*Wty goes into a scratch vector of the same length as Wty
+	gsl_vector *proj_y=gsl_vector_alloc (Wty->size);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, proj_y);
+
+	//subtract the correction term
+	double corr=0.0;
+	gsl_blas_ddot (proj_y, Wty, &corr);
+	yPwy-=corr;
+
+	gsl_vector_free (proj_y);
+}
+
+
+
+//calculate p values and beta/se in a linear model
+void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, const double xPwx, const double df, const size_t n_size, double &beta, double &se, double &p_wald, double &p_lrt, double &p_score)
+{
+	double yPxy=yPwy-xPwy*xPwy/xPwx;
+	double se_wald, se_score;
+	
+	beta=xPwy/xPwx;
+	se_wald=sqrt(yPxy/(df*xPwx) );
+	se_score=sqrt(yPwy/((double)n_size*xPwx) );
+	
+	p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df);
+	p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df);
+	p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1);
+	
+	if (test_mode==3) {se=se_score;} else {se=se_wald;}
+	
+	return;
+}
+
+
+
+
+//Tests every gene in file_gene for association with x: after a header line, each row of the file is one gene
+//(an id followed by one value per individual) and is used as the response y; W is presumably the covariate matrix.
+//One SUMSTAT record per analysed gene is appended to sumStat; the time spent on the statistics is added to time_opt (minutes).
+void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) 
+{
+	ifstream infile (file_gene.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading gene expression file:"<<file_gene<<endl; return;}
+	
+	clock_t time_start=clock();
+	
+	string line;
+	char *ch_ptr;
+	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	int c_phen;
+	string rs; //gene id
+	double d;
+	bool row_ok; //false as soon as a row runs out of values
+	
+	//calculate some basic quantities
+	double yPwy, xPwy, xPwx;
+	double df=(double)W->size1-(double)W->size2-1.0;
+
+	gsl_vector *y=gsl_vector_alloc (W->size1);
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);	
+	gsl_vector *Wty=gsl_vector_alloc (W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+
+	//WtWi is the inverse of W'W, obtained through an LU decomposition
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+
+	//x is the same for every gene, so xPwx is computed only once
+	gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx);
+	CalcvPv(WtWi, Wtx, x, xPwx);
+		
+	//header
+	getline(infile, line);
+	for (size_t t=0; t<ng_total; t++) {
+		if (!getline(infile, line)) {cout<<endl<<"error! gene expression file has fewer rows than expected."<<endl; break;}
+		if (t%d_pace==0 || t==ng_total-1) {ProgressBar ("Performing Analysis ", t, ng_total-1);}
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		if (ch_ptr==NULL) {cout<<endl<<"error! empty row in gene expression file."<<endl; break;}
+		rs=ch_ptr;
+		
+		c_phen=0; row_ok=true;
+		for (size_t i=0; i<indicator_idv.size(); ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==0) {continue;}
+			//a short row would otherwise hand a NULL pointer to atof
+			if (ch_ptr==NULL) {row_ok=false; break;}
+			d=atof(ch_ptr); 			
+			gsl_vector_set(y, c_phen, d);
+			c_phen++;
+		}
+		if (!row_ok) {cout<<endl<<"error! too few values for gene "<<rs<<" in gene expression file."<<endl; break;}
+		//calculate statistics		
+		time_start=clock();	
+		gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
+		//the generic CalcvPv is called with the roles of x and y swapped, so that its last output is yPwy
+		CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy);
+		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);	
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+	}
+	cout<<endl;
+	
+	gsl_vector_free(y);
+	gsl_matrix_free(WtW);
+	gsl_matrix_free(WtWi);
+	gsl_vector_free(Wty);
+	gsl_vector_free(Wtx);
+	gsl_permutation_free(pmt);
+	infile.close();
+	infile.clear();
+	return;
+}
+
+
+
+
+// Runs the single-SNP linear-model association test for every analysed SNP
+// of the BIMBAM mean-genotype file file_geno and appends one SUMSTAT entry
+// per analysed SNP to sumStat.
+//
+// W: covariate matrix. x is allocated with W->size1 entries, so W is expected
+//    to hold one row per tested individual.
+// y: phenotype vector, multiplied against W and against each genotype vector.
+//
+// One line is consumed for every entry of indicator_snp; lines whose
+// indicator is 0 are skipped. Genotypes given as "NA" are replaced by the
+// mean of the observed genotypes of that SNP, and when that mean exceeds 1
+// every genotype g is recoded as 2-g.
+//
+// A line that ends before all ni_total genotype columns have been read is no
+// longer passed to strcmp/atof as a null token: the absent entries are
+// handled like "NA" and a warning is printed for that line.
+void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y)
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+	
+	const char *delim=" , \t";
+	clock_t time_start=clock();
+	
+	string line;
+	vector<char> line_buf;		//writable, NUL-terminated copy of line for strtok
+	char *ch_ptr;
+	
+	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	int n_miss, c_phen;
+	size_t n_short;				//genotype columns absent from the current line
+	double geno, x_mean;
+	
+	//calculate some basic quantities
+	double yPwy, xPwy, xPwx;
+	double df=(double)W->size1-(double)W->size2-1.0;
+
+	gsl_vector *x=gsl_vector_alloc (W->size1);
+	gsl_vector *x_miss=gsl_vector_alloc (W->size1);
+
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *Wty=gsl_vector_alloc (W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+
+	//WtWi = (W^T W)^{-1}, shared by every SNP
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+
+	//the phenotype term does not depend on the SNP, so compute it once
+	gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
+	CalcvPv(WtWi, Wty, y, yPwy);
+	
+	//start reading genotypes and analyze	
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		getline(infile, line);
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		//strtok writes into its argument, so tokenize a private copy instead
+		//of the read-only buffer returned by c_str()
+		line_buf.assign(line.begin(), line.end());
+		line_buf.push_back('\0');
+		
+		//skip the three leading non-genotype columns; stop asking for tokens
+		//as soon as the line is exhausted
+		ch_ptr=strtok (&line_buf[0], delim);
+		for (int k=0; k<2 && ch_ptr!=NULL; ++k) {ch_ptr=strtok (NULL, delim);}
+		
+		x_mean=0.0; c_phen=0; n_miss=0; n_short=0;
+		gsl_vector_set_zero(x_miss);
+		for (size_t i=0; i<ni_total; ++i) {
+			if (ch_ptr!=NULL) {ch_ptr=strtok (NULL, delim);}
+			if (indicator_idv[i]==0) {continue;}
+			
+			if (ch_ptr==NULL) {n_short++;}
+			if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
+			else {
+				geno=atof(ch_ptr);
+				
+				gsl_vector_set(x, c_phen, geno);
+				gsl_vector_set(x_miss, c_phen, 1.0);
+				x_mean+=geno;
+			}
+			c_phen++;
+		}
+		
+		if (n_short>0) {
+			cout<<"warning: line "<<t+1<<" of genotype file "<<file_geno<<" is missing "<<n_short<<" genotype value(s); treated as NA"<<endl;
+		}
+		
+		x_mean/=(double)(ni_test-n_miss);
+		
+		//mean-impute missing entries, then flip the coding when the mean
+		//exceeds 1
+		for (size_t i=0; i<ni_test; ++i) {
+			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
+			geno=gsl_vector_get(x, i);
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}
+		
+		//calculate statistics		
+		time_start=clock();
+
+		gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);
+		CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
+		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
+		
+		//time_opt accumulates minutes
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+	}
+	cout<<endl;
+
+	gsl_vector_free(x);
+	gsl_vector_free(x_miss);
+
+	gsl_matrix_free(WtW);
+	gsl_matrix_free(WtWi);
+	gsl_vector_free(Wty);
+	gsl_vector_free(Wtx);
+	gsl_permutation_free(pmt);
+	
+	infile.close();
+	infile.clear();
+	
+	return;
+}
+
+
+
+
+
+
+
+// Runs the single-SNP linear-model association test for every analysed SNP
+// of the PLINK binary genotype file file_bfile+".bed" and appends one SUMSTAT
+// entry per analysed SNP to sumStat.
+//
+// W: covariate matrix (x is allocated with W->size1 entries).
+// y: phenotype vector.
+//
+// Each SNP occupies n_bit bytes after the three leading bytes of the file,
+// with four 2-bit genotypes per byte. Missing genotypes are replaced by the
+// mean of the observed ones; when that mean exceeds 1 every genotype g is
+// recoded as 2-g.
+void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) 
+{
+	const string bed_path=file_bfile+".bed";
+	ifstream infile (bed_path.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<bed_path<<endl; return;}
+	
+	clock_t time_start=clock();
+	
+	char byte_buf[1];
+	bitset<8> bits;
+	
+	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	int n_miss, ci_total, ci_test;
+	double x_mean;
+	
+	//quantities that are shared by every SNP
+	double yPwy, xPwy, xPwx;
+	const double df=(double)W->size1-(double)W->size2-1.0;
+	
+	gsl_vector *x=gsl_vector_alloc (W->size1);
+	
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *Wty=gsl_vector_alloc (W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_permutation *pmt=gsl_permutation_alloc (W->size2);
+	
+	//WtWi = (W^T W)^{-1}
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+	
+	gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
+	CalcvPv(WtWi, Wty, y, yPwy);
+	
+	//number of bytes per SNP: four individuals are packed into one byte
+	const int n_bit=(ni_total%4==0) ? ni_total/4 : ni_total/4+1;
+	
+	//consume the three leading magic bytes
+	for (int k=0; k<3; ++k) {
+		infile.read(byte_buf,1);
+		bits=byte_buf[0];
+	}
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		//jump to the record of SNP t (3 = number of magic bytes)
+		infile.seekg(t*n_bit+3);
+		
+		//decode genotypes; minor allele homozygous: 2.0; major: 0.0
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0;
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(byte_buf,1);
+			bits=byte_buf[0];
+			for (size_t j=0; j<4; ++j) {
+				//the last byte may be only partially used
+				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}
+				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
+				
+				const bool lo=bits[2*j];
+				const bool hi=bits[2*j+1];
+				if (!lo && !hi) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0;}
+				else if (!lo) {gsl_vector_set(x, ci_test, 1); x_mean+=1.0;}
+				else if (hi) {gsl_vector_set(x, ci_test, 0);}
+				else {gsl_vector_set(x, ci_test, -9); n_miss++;}	//-9 marks a missing genotype
+				
+				ci_total++;
+				ci_test++;
+			}
+		}
+		
+		x_mean/=(double)(ni_test-n_miss);
+		
+		//mean-impute missing entries, then flip the coding if the mean exceeds 1
+		for (size_t i=0; i<ni_test; ++i) {
+			double g=gsl_vector_get(x,i);
+			if (g==-9) {g=x_mean;}
+			if (x_mean>1) {g=2-g;}
+			gsl_vector_set(x, i, g);
+		}
+		
+		//calculate statistics
+		time_start=clock();
+		
+		gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx);
+		CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
+		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
+		
+		//time_opt accumulates minutes
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+	}
+	cout<<endl;
+	
+	gsl_permutation_free(pmt);
+	gsl_vector_free(Wtx);
+	gsl_vector_free(Wty);
+	gsl_matrix_free(WtWi);
+	gsl_matrix_free(WtW);
+	gsl_vector_free(x);
+	
+	infile.close();
+	infile.clear();
+	
+	return;
+}
+
+
+
+//Appends, for every column i of X, the pair (i, log_lr) to pos_loglr, where
+//log_lr = 0.5*n*(log(y'y) - log(y'y - (x'y)^2/(x'x))) and n = y->size.
+//make sure that both y and X are centered already
+void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr) 
+{
+	double yty;
+	gsl_blas_ddot(y, y, &yty);
+
+	const size_t n_col=X->size2;
+	const double half_n=0.5*(double)y->size;
+
+	for (size_t col=0; col<n_col; ++col) {
+		double xtx, xty;
+		gsl_vector_const_view x_view=gsl_matrix_const_column (X, col);
+		gsl_blas_ddot(&x_view.vector, &x_view.vector, &xtx);
+		gsl_blas_ddot(&x_view.vector, y, &xty);
+
+		const double log_lr=half_n*(log(yty)-log(yty-xty*xty/xtx));
+		pos_loglr.push_back(make_pair(col,log_lr) );
+	}
+
+	return;
+}
diff --git a/lm.h b/lm.h
new file mode 100644
index 0000000..84a0322
--- /dev/null
+++ b/lm.h
@@ -0,0 +1,74 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LM_H__                
+#define __LM_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+// Linear-model (no random effect) association analysis.
+// Settings are copied in with CopyFromParam, one of the Analyze* methods
+// fills sumStat, and WriteFiles / CopyToParam hand the results back.
+class LM {
+	
+public:
+	// IO related parameters
+	int a_mode;				//analysis mode, 50+1/2/3/4 for Frequentist tests; the analyses pass a_mode-50 to LmCalcP
+	size_t d_pace;		//display pace: the progress bar is refreshed every d_pace records
+	
+	string file_bfile;		//PLINK file prefix; AnalyzePlink opens file_bfile+".bed"
+	string file_geno;		//BIMBAM genotype file read by AnalyzeBimbam
+	string file_out;		//output file prefix (presumably consumed by WriteFiles, which is defined elsewhere)
+	
+	string file_gene;		//gene expression file (presumably the input of AnalyzeGene)
+	
+	// Summary statistics
+	size_t ni_total, ni_test;	//number of individuals: in the input files / used in the analysis
+	size_t ns_total, ns_test;	//number of snps
+	size_t ng_total, ng_test;	//number of genes
+	size_t n_cvt;				//presumably the number of covariates -- TODO confirm against PARAM
+	double time_opt;		//time spent computing test statistics, accumulated in minutes
+	
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	
+	// Not included in PARAM
+	vector<SUMSTAT> sumStat;		//Output SNPSummary Data, one entry appended per analysed SNP/gene
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void AnalyzeGene (const gsl_matrix *W, const gsl_vector *x);
+	void AnalyzePlink (const gsl_matrix *W, const gsl_vector *y);
+	void AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y);
+	void WriteFiles ();
+};
+// Appends one (column index, log likelihood ratio) pair per column of X to
+// pos_loglr; X and y must already be centered.
+void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr);
+#endif
diff --git a/lmm.cpp b/lmm.cpp
new file mode 100644
index 0000000..fed94ee
--- /dev/null
+++ b/lmm.cpp
@@ -0,0 +1,1770 @@
+/*
+    Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <bitset>
+#include <cstring>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_roots.h"
+#include "gsl/gsl_min.h"
+#include "gsl/gsl_integration.h"
+
+#include "io.h"
+#include "lapack.h"
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "lmm_float.h"
+#else
+#include "lmm.h"
+#endif
+
+
+using namespace std;
+
+
+
+
+
+// Copies the settings this analysis needs out of cPar and resets the
+// counters that the analysis itself fills in (time_UtX, time_opt, ng_test).
+void LMM::CopyFromParam (PARAM &cPar) 
+{
+	//analysis mode and display pace
+	a_mode=cPar.a_mode;
+	d_pace=cPar.d_pace;
+
+	//file names
+	file_bfile=cPar.file_bfile;
+	file_geno=cPar.file_geno;
+	file_gene=cPar.file_gene;
+	file_out=cPar.file_out;
+
+	//lambda search settings and null-model values
+	l_min=cPar.l_min;
+	l_max=cPar.l_max;
+	n_region=cPar.n_region;
+	l_mle_null=cPar.l_mle_null;
+	logl_mle_H0=cPar.logl_mle_H0;
+
+	//sample, SNP, covariate and gene counts
+	ni_total=cPar.ni_total;
+	ni_test=cPar.ni_test;
+	ns_total=cPar.ns_total;
+	ns_test=cPar.ns_test;
+	n_cvt=cPar.n_cvt;
+	ng_total=cPar.ng_total;
+
+	//per-individual / per-SNP indicators and SNP annotation
+	indicator_idv=cPar.indicator_idv;
+	indicator_snp=cPar.indicator_snp;
+	snpInfo=cPar.snpInfo;
+
+	//values produced by this object start from zero
+	time_UtX=0.0;
+	time_opt=0.0;
+	ng_test=0;
+
+	return;
+}
+
+
+// Writes the values produced by the analysis (timers and the number of
+// tested genes) back into cPar.
+void LMM::CopyToParam (PARAM &cPar) 
+{
+	cPar.ng_test=ng_test;
+
+	cPar.time_opt=time_opt;
+	cPar.time_UtX=time_UtX;
+
+	return;
+}
+
+
+
+// Writes the association results to ./output/<file_out>.assoc.txt.
+//
+// When file_gene is set, each row starts with a gene id and there is one row
+// per sumStat entry; otherwise each row starts with the SNP annotation and
+// there is one row per SNP whose indicator_snp is non-zero. The statistic
+// columns that follow depend on a_mode (1: Wald, 2: LRT, 3: score, 4: all);
+// for any other a_mode no statistic columns and no line breaks are written.
+void LMM::WriteFiles () 
+{
+	const string file_str="./output/"+file_out+".assoc.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+
+	const bool gene_output=!file_gene.empty();
+
+	//header: identifier columns first, then the mode-specific columns
+	if (gene_output) {
+		outfile<<"geneID"<<"\t";
+	} else {
+		outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t";
+	}
+
+	switch (a_mode) {
+	case 1:
+		outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"p_wald"<<endl;
+		break;
+	case 2:
+		outfile<<"l_mle"<<"\t"<<"p_lrt"<<endl;
+		break;
+	case 3:
+		outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl;
+		break;
+	case 4:
+		outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"l_mle"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
+		break;
+	default:
+		break;
+	}
+
+	if (gene_output) {
+		//one row per stored result; the id is taken from snpInfo at the same position
+		for (vector<SUMSTAT>::size_type row=0; row<sumStat.size(); ++row) {
+			outfile<<snpInfo[row].rs_number<<"\t";
+
+			switch (a_mode) {
+			case 1:
+				outfile<<scientific<<setprecision(6)<<sumStat[row].beta<<"\t"<<sumStat[row].se<<"\t"<<sumStat[row].lambda_remle<<"\t"<<sumStat[row].p_wald <<endl;
+				break;
+			case 2:
+				outfile<<scientific<<setprecision(6)<<sumStat[row].lambda_mle<<"\t"<<sumStat[row].p_lrt<<endl;
+				break;
+			case 3:
+				outfile<<scientific<<setprecision(6)<<sumStat[row].beta<<"\t"<<sumStat[row].se<<"\t"<<sumStat[row].p_score<<endl;
+				break;
+			case 4:
+				outfile<<scientific<<setprecision(6)<<sumStat[row].beta<<"\t"<<sumStat[row].se<<"\t"<<sumStat[row].lambda_remle<<"\t"<<sumStat[row].lambda_mle<<"\t"<<sumStat[row].p_wald <<"\t"<<sumStat[row].p_lrt<<"\t"<<sumStat[row].p_score<<endl;
+				break;
+			default:
+				break;
+			}
+		}
+	} else {
+		//sumStat only holds analysed SNPs, so it is indexed by a separate counter
+		size_t row=0;
+		for (size_t i=0; i<snpInfo.size(); ++i) {
+			if (indicator_snp[i]==0) {continue;}
+
+			outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t";
+
+			switch (a_mode) {
+			case 1:
+				outfile<<scientific<<setprecision(6)<<sumStat[row].beta<<"\t"<<sumStat[row].se<<"\t"<<sumStat[row].lambda_remle<<"\t"<<sumStat[row].p_wald <<endl;
+				break;
+			case 2:
+				outfile<<scientific<<setprecision(6)<<sumStat[row].lambda_mle<<"\t"<<sumStat[row].p_lrt<<endl;
+				break;
+			case 3:
+				outfile<<scientific<<setprecision(6)<<sumStat[row].beta<<"\t"<<sumStat[row].se<<"\t"<<sumStat[row].p_score<<endl;
+				break;
+			case 4:
+				outfile<<scientific<<setprecision(6)<<sumStat[row].beta<<"\t"<<sumStat[row].se<<"\t"<<sumStat[row].lambda_remle<<"\t"<<sumStat[row].lambda_mle<<"\t"<<sumStat[row].p_wald <<"\t"<<sumStat[row].p_lrt<<"\t"<<sumStat[row].p_score<<endl;
+				break;
+			default:
+				break;
+			}
+			++row;
+		}
+	}
+
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+
+
+
+
+
+
+
+
+
+//Maps an unordered pair (a,b), each in 1..n_cvt+2, to its position in the
+//row-wise upper triangle of an (n_cvt+2)x(n_cvt+2) symmetric matrix, i.e. to
+//an index between 0 and [(n_cvt+2)^2+(n_cvt+2)]/2-1.
+//Out-of-range arguments print an error message and yield 0.
+size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) {
+	const size_t n=n_cvt+2;
+	if (a==0 || b==0 || a>n || b>n) {cout<<"error in GetabIndex."<<endl; return 0;}
+
+	//order the pair so that lo<=hi
+	const size_t lo=(b>a) ? a : b;
+	const size_t hi=(b>a) ? b : a;
+
+	//entries in the rows before lo, plus the offset inside row lo
+	return (2*n-lo+2)*(lo-1)/2+hi-lo;
+}
+
+
+//Fills Pab, one row per level p=0..n_cvt+1, for all pairs 1<=a<=b<=n_cvt+2
+//with a>p; columns are addressed through GetabIndex.
+//Level 0 is the dot product of Hi_eval with column (a,b) of Uab (subtracted
+//from ab[(a,b)] when e_mode!=0). Level p is obtained from level p-1 as
+//P_ab - P_ap*P_bp/P_pp.
+void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval, const gsl_matrix *Uab, const gsl_vector *ab, gsl_matrix *Pab)
+{
+	const size_t n_last=n_cvt+2;
+	
+	//level 0
+	for (size_t a=1; a<=n_last; ++a) {
+		for (size_t b=a; b<=n_last; ++b) {
+			const size_t idx_ab=GetabIndex (a, b, n_cvt);
+			double value;
+			
+			gsl_vector_const_view column=gsl_matrix_const_column (Uab, idx_ab);
+			gsl_blas_ddot (Hi_eval, &column.vector, &value);
+			if (e_mode!=0) {value=gsl_vector_get (ab, idx_ab)-value;}
+			gsl_matrix_set (Pab, 0, idx_ab, value);
+		}
+	}
+	
+	//levels 1..n_cvt+1: only row p-1 is read while row p is written
+	for (size_t p=1; p<=n_cvt+1; ++p) {
+		const double prev_ww=gsl_matrix_get (Pab, p-1, GetabIndex (p, p, n_cvt));
+		
+		for (size_t a=p+1; a<=n_last; ++a) {
+			const double prev_aw=gsl_matrix_get (Pab, p-1, GetabIndex (a, p, n_cvt));
+			
+			for (size_t b=a; b<=n_last; ++b) {
+				const size_t idx_ab=GetabIndex (a, b, n_cvt);
+				const double prev_ab=gsl_matrix_get (Pab, p-1, idx_ab);
+				const double prev_bw=gsl_matrix_get (Pab, p-1, GetabIndex (b, p, n_cvt));
+				
+				gsl_matrix_set (Pab, p, idx_ab, prev_ab-prev_aw*prev_bw/prev_ww);
+			}
+		}
+	}
+	return;
+}
+
+
+//Fills PPab with the same row/column layout as Pab (rows: level
+//p=0..n_cvt+1, columns: GetabIndex(a,b)). Pab must already be filled.
+//Level 0 is the dot product of HiHi_eval with column (a,b) of Uab (adjusted
+//with ab and Pab when e_mode!=0); level p is built from level p-1 of both
+//Pab and PPab.
+void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_eval, const gsl_matrix *Uab, const gsl_vector *ab, const gsl_matrix *Pab, gsl_matrix *PPab)
+{
+	const size_t n_last=n_cvt+2;
+	
+	//level 0
+	for (size_t a=1; a<=n_last; ++a) {
+		for (size_t b=a; b<=n_last; ++b) {
+			const size_t idx_ab=GetabIndex (a, b, n_cvt);
+			double p2_ab;
+			
+			gsl_vector_const_view column=gsl_matrix_const_column (Uab, idx_ab);
+			gsl_blas_ddot (HiHi_eval, &column.vector, &p2_ab);
+			if (e_mode!=0) {p2_ab=p2_ab-gsl_vector_get (ab, idx_ab)+2.0*gsl_matrix_get (Pab, 0, idx_ab);}
+			gsl_matrix_set (PPab, 0, idx_ab, p2_ab);
+		}
+	}
+	
+	//levels 1..n_cvt+1: only row p-1 is read while row p is written
+	for (size_t p=1; p<=n_cvt+1; ++p) {
+		const size_t idx_ww=GetabIndex (p, p, n_cvt);
+		const double ps_ww=gsl_matrix_get (Pab, p-1, idx_ww);
+		const double ps2_ww=gsl_matrix_get (PPab, p-1, idx_ww);
+		
+		for (size_t a=p+1; a<=n_last; ++a) {
+			const size_t idx_aw=GetabIndex (a, p, n_cvt);
+			const double ps_aw=gsl_matrix_get (Pab, p-1, idx_aw);
+			const double ps2_aw=gsl_matrix_get (PPab, p-1, idx_aw);
+			
+			for (size_t b=a; b<=n_last; ++b) {
+				const size_t idx_ab=GetabIndex (a, b, n_cvt);
+				const size_t idx_bw=GetabIndex (b, p, n_cvt);
+				const double ps2_ab=gsl_matrix_get (PPab, p-1, idx_ab);
+				const double ps_bw=gsl_matrix_get (Pab, p-1, idx_bw);
+				const double ps2_bw=gsl_matrix_get (PPab, p-1, idx_bw);
+				
+				double p2_ab=ps2_ab+ps_aw*ps_bw*ps2_ww/(ps_ww*ps_ww);
+				p2_ab-=(ps_aw*ps2_bw+ps_bw*ps2_aw)/ps_ww;
+				gsl_matrix_set (PPab, p, idx_ab, p2_ab);
+			}
+		}
+	}
+	return;
+}
+
+
+//Fills PPPab with the same row/column layout as Pab and PPab (rows: level
+//p=0..n_cvt+1, columns: GetabIndex(a,b)). Pab and PPab must already be
+//filled.
+//Level 0 is the dot product of HiHiHi_eval with column (a,b) of Uab
+//(adjusted with ab, Pab and PPab when e_mode!=0); level p is built from
+//level p-1 of Pab, PPab and PPPab.
+void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiHi_eval, const gsl_matrix *Uab, const gsl_vector *ab, const gsl_matrix *Pab, const gsl_matrix *PPab, gsl_matrix *PPPab)
+{
+	const size_t n_last=n_cvt+2;
+	
+	//level 0
+	for (size_t a=1; a<=n_last; ++a) {
+		for (size_t b=a; b<=n_last; ++b) {
+			const size_t idx_ab=GetabIndex (a, b, n_cvt);
+			double p3_ab;
+			
+			gsl_vector_const_view column=gsl_matrix_const_column (Uab, idx_ab);
+			gsl_blas_ddot (HiHiHi_eval, &column.vector, &p3_ab);
+			if (e_mode!=0) {p3_ab=gsl_vector_get (ab, idx_ab)-p3_ab+3.0*gsl_matrix_get (PPab, 0, idx_ab)-3.0*gsl_matrix_get (Pab, 0, idx_ab);}
+			gsl_matrix_set (PPPab, 0, idx_ab, p3_ab);
+		}
+	}
+	
+	//levels 1..n_cvt+1: only row p-1 is read while row p is written
+	for (size_t p=1; p<=n_cvt+1; ++p) {
+		const size_t idx_ww=GetabIndex (p, p, n_cvt);
+		const double ps_ww=gsl_matrix_get (Pab, p-1, idx_ww);
+		const double ps2_ww=gsl_matrix_get (PPab, p-1, idx_ww);
+		const double ps3_ww=gsl_matrix_get (PPPab, p-1, idx_ww);
+		
+		for (size_t a=p+1; a<=n_last; ++a) {
+			const size_t idx_aw=GetabIndex (a, p, n_cvt);
+			const double ps_aw=gsl_matrix_get (Pab, p-1, idx_aw);
+			const double ps2_aw=gsl_matrix_get (PPab, p-1, idx_aw);
+			const double ps3_aw=gsl_matrix_get (PPPab, p-1, idx_aw);
+			
+			for (size_t b=a; b<=n_last; ++b) {
+				const size_t idx_ab=GetabIndex (a, b, n_cvt);
+				const size_t idx_bw=GetabIndex (b, p, n_cvt);
+				const double ps3_ab=gsl_matrix_get (PPPab, p-1, idx_ab);
+				const double ps_bw=gsl_matrix_get (Pab, p-1, idx_bw);
+				const double ps2_bw=gsl_matrix_get (PPab, p-1, idx_bw);
+				const double ps3_bw=gsl_matrix_get (PPPab, p-1, idx_bw);
+				
+				double p3_ab=ps3_ab-ps_aw*ps_bw*ps2_ww*ps2_ww/(ps_ww*ps_ww*ps_ww);
+				p3_ab-=(ps_aw*ps3_bw+ps_bw*ps3_aw+ps2_aw*ps2_bw)/ps_ww;
+				p3_ab+=(ps_aw*ps2_bw*ps2_ww+ps_bw*ps2_aw*ps2_ww+ps_aw*ps_bw*ps3_ww)/(ps_ww*ps_ww);
+				
+				gsl_matrix_set (PPPab, p, idx_ab, p3_ab);
+			}
+		}
+	}
+	return;
+}
+
+
+
+//Evaluates c - 0.5*sum(log|1+l*eval_i|) - 0.5*ni_test*log(P_yy) at the value
+//l, with c = 0.5*ni_test*(log(ni_test)-log(2*pi)-1) and P_yy read from row
+//nc_total of Pab (presumably the log-likelihood as a function of the
+//variance ratio l).
+//params must point to a FUNC_PARAM; calc_null selects whether the row for
+//n_cvt or for n_cvt+1 is used.
+double LogL_f (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;
+	const size_t n_cvt=p->n_cvt;
+	const size_t ni_test=p->ni_test;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=(p->eval)->size;
+	const size_t nc_total=(p->calc_null==true) ? n_cvt : n_cvt+1;
+	const double n_obs=(double)ni_test;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *work=gsl_vector_alloc(n_eval);
+	
+	//work = l*eval; Hi_eval takes its numerator before work becomes 1+l*eval
+	gsl_vector_memcpy (work, p->eval);
+	gsl_vector_scale (work, l);
+	if (p->e_mode!=0) {gsl_vector_memcpy (Hi_eval, work);} else {gsl_vector_set_all (Hi_eval, 1.0);}
+	gsl_vector_add_constant (work, 1.0);
+	gsl_vector_div (Hi_eval, work);
+	
+	//log determinant from the entries of 1+l*eval
+	double logdet_h=0.0;
+	for (size_t i=0; i<n_eval; ++i) {
+		logdet_h+=log(fabs(gsl_vector_get (work, i)));
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);
+	
+	const double c=0.5*n_obs*(log(n_obs)-log(2*M_PI)-1.0);
+	
+	const size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	const double f=c-0.5*logdet_h-0.5*n_obs*log(P_yy);
+	
+	gsl_vector_free (work);
+	gsl_vector_free (Hi_eval);
+	gsl_matrix_free (Pab);
+	return f;
+}
+
+ 
+ 
+
+
+
+//Evaluates -0.5*trace_HiK + 0.5*ni_test*yPKPy/P_yy at the value l
+//(presumably the first derivative of LogL_f with respect to l).
+//params must point to a FUNC_PARAM; calc_null selects whether the row for
+//n_cvt or for n_cvt+1 of Pab/PPab is used.
+double LogL_dev1 (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;
+	const size_t n_cvt=p->n_cvt;
+	const size_t ni_test=p->ni_test;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=(p->eval)->size;
+	const size_t nc_total=(p->calc_null==true) ? n_cvt : n_cvt+1;
+	const double n_obs=(double)ni_test;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *HiHi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *work=gsl_vector_alloc(n_eval);
+	
+	//work = l*eval; Hi_eval takes its numerator before work becomes 1+l*eval
+	gsl_vector_memcpy (work, p->eval);
+	gsl_vector_scale (work, l);
+	if (p->e_mode!=0) {gsl_vector_memcpy (Hi_eval, work);} else {gsl_vector_set_all (Hi_eval, 1.0);}
+	gsl_vector_add_constant (work, 1.0);
+	gsl_vector_div (Hi_eval, work);
+	
+	//element-wise square
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);
+	
+	//trace as a dot product with a vector of ones (work is reused)
+	double trace_Hi=0.0;
+	gsl_vector_set_all (work, 1.0);
+	gsl_blas_ddot (Hi_eval, work, &trace_Hi);
+	if (p->e_mode!=0) {trace_Hi=n_obs-trace_Hi;}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);
+	
+	const double trace_HiK=(n_obs-trace_Hi)/l;
+	
+	const size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	const double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy);
+	const double yPKPy=(P_yy-PP_yy)/l;
+	
+	const double dev1=-0.5*trace_HiK+0.5*n_obs*yPKPy/P_yy;
+	
+	gsl_vector_free (work);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (Hi_eval);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (Pab);
+	
+	return dev1;
+}
+	
+	
+
+
+//Evaluates 0.5*trace_HiKHiK
+//  - 0.5*ni_test*(2*yPKPKPy*P_yy - yPKPy^2)/P_yy^2 at the value l
+//(presumably the second derivative of LogL_f with respect to l).
+//params must point to a FUNC_PARAM; calc_null selects whether the row for
+//n_cvt or for n_cvt+1 of Pab/PPab/PPPab is used.
+double LogL_dev2 (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;
+	const size_t n_cvt=p->n_cvt;
+	const size_t ni_test=p->ni_test;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=(p->eval)->size;
+	const size_t nc_total=(p->calc_null==true) ? n_cvt : n_cvt+1;
+	const double n_obs=(double)ni_test;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *HiHi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *HiHiHi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *work=gsl_vector_alloc(n_eval);
+	
+	//work = l*eval; Hi_eval takes its numerator before work becomes 1+l*eval
+	gsl_vector_memcpy (work, p->eval);
+	gsl_vector_scale (work, l);
+	if (p->e_mode!=0) {gsl_vector_memcpy (Hi_eval, work);} else {gsl_vector_set_all (Hi_eval, 1.0);}
+	gsl_vector_add_constant (work, 1.0);
+	gsl_vector_div (Hi_eval, work);
+	
+	//element-wise square and cube
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);
+	gsl_vector_memcpy (HiHiHi_eval, HiHi_eval);
+	gsl_vector_mul (HiHiHi_eval, Hi_eval);
+	
+	//traces as dot products with a vector of ones (work is reused)
+	double trace_Hi=0.0, trace_HiHi=0.0;
+	gsl_vector_set_all (work, 1.0);
+	gsl_blas_ddot (Hi_eval, work, &trace_Hi);
+	gsl_blas_ddot (HiHi_eval, work, &trace_HiHi);
+	
+	if (p->e_mode!=0) {
+		//trace_HiHi is corrected with the already corrected trace_Hi
+		trace_Hi=n_obs-trace_Hi;
+		trace_HiHi=2*trace_Hi+trace_HiHi-n_obs;
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);
+	CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab);
+	
+	const double trace_HiKHiK=(n_obs+trace_HiHi-2*trace_Hi)/(l*l);
+	
+	const size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	const double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy);
+	const double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy);
+	
+	const double yPKPy=(P_yy-PP_yy)/l;
+	const double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l);
+	
+	const double dev2=0.5*trace_HiKHiK-0.5*n_obs*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy);
+	
+	gsl_vector_free (work);
+	gsl_vector_free (HiHiHi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (Hi_eval);
+	gsl_matrix_free (PPPab);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (Pab);
+	
+	return dev2;
+}
+	
+	
+	
+	
+	
+//Computes in one pass the two quantities that LogL_dev1 and LogL_dev2
+//return, and stores them in *dev1 and *dev2.
+//params must point to a FUNC_PARAM; calc_null selects whether the row for
+//n_cvt or for n_cvt+1 of Pab/PPab/PPPab is used.
+void LogL_dev12 (double l, void *params, double *dev1, double *dev2)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;
+	const size_t n_cvt=p->n_cvt;
+	const size_t ni_test=p->ni_test;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=(p->eval)->size;
+	const size_t nc_total=(p->calc_null==true) ? n_cvt : n_cvt+1;
+	const double n_obs=(double)ni_test;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *HiHi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *HiHiHi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *work=gsl_vector_alloc(n_eval);
+	
+	//work = l*eval; Hi_eval takes its numerator before work becomes 1+l*eval
+	gsl_vector_memcpy (work, p->eval);
+	gsl_vector_scale (work, l);
+	if (p->e_mode!=0) {gsl_vector_memcpy (Hi_eval, work);} else {gsl_vector_set_all (Hi_eval, 1.0);}
+	gsl_vector_add_constant (work, 1.0);
+	gsl_vector_div (Hi_eval, work);
+	
+	//element-wise square and cube
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);
+	gsl_vector_memcpy (HiHiHi_eval, HiHi_eval);
+	gsl_vector_mul (HiHiHi_eval, Hi_eval);
+	
+	//traces as dot products with a vector of ones (work is reused)
+	double trace_Hi=0.0, trace_HiHi=0.0;
+	gsl_vector_set_all (work, 1.0);
+	gsl_blas_ddot (Hi_eval, work, &trace_Hi);
+	gsl_blas_ddot (HiHi_eval, work, &trace_HiHi);
+	
+	if (p->e_mode!=0) {
+		//trace_HiHi is corrected with the already corrected trace_Hi
+		trace_Hi=n_obs-trace_Hi;
+		trace_HiHi=2*trace_Hi+trace_HiHi-n_obs;
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);
+	CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab);
+	
+	const double trace_HiK=(n_obs-trace_Hi)/l;
+	const double trace_HiKHiK=(n_obs+trace_HiHi-2*trace_Hi)/(l*l);
+	
+	const size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	const double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy);
+	const double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy);
+	
+	const double yPKPy=(P_yy-PP_yy)/l;
+	const double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l);
+	
+	*dev1=-0.5*trace_HiK+0.5*n_obs*yPKPy/P_yy;
+	*dev2=0.5*trace_HiKHiK-0.5*n_obs*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy);
+	
+	gsl_vector_free (work);
+	gsl_vector_free (HiHiHi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (Hi_eval);
+	gsl_matrix_free (PPPab);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (Pab);
+	
+	return;
+}
+
+
+
+//Evaluates c - 0.5*logdet_h - 0.5*logdet_hiw - 0.5*df*log(P_yy) at the value
+//l, with c = 0.5*df*(log(df)-log(2*pi)-1) (presumably the restricted
+//log-likelihood as a function of the variance ratio l).
+//df is ni_test-n_cvt when calc_null is set and ni_test-n_cvt-1 otherwise;
+//logdet_hiw sums log(Pab[i][ww])-log(Iab[i][ww]) over the first nc_total
+//levels, where Iab is CalcPab evaluated with a vector of ones.
+//params must point to a FUNC_PARAM.
+double LogRL_f (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;
+	const size_t n_cvt=p->n_cvt;
+	const size_t ni_test=p->ni_test;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=(p->eval)->size;
+	
+	const bool null_model=(p->calc_null==true);
+	const size_t nc_total=null_model ? n_cvt : n_cvt+1;
+	const double df=null_model ? (double)ni_test-(double)n_cvt : (double)ni_test-(double)n_cvt-1.0;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *Iab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *work=gsl_vector_alloc(n_eval);
+	
+	//work = l*eval; Hi_eval takes its numerator before work becomes 1+l*eval
+	gsl_vector_memcpy (work, p->eval);
+	gsl_vector_scale (work, l);
+	if (p->e_mode!=0) {gsl_vector_memcpy (Hi_eval, work);} else {gsl_vector_set_all (Hi_eval, 1.0);}
+	gsl_vector_add_constant (work, 1.0);
+	gsl_vector_div (Hi_eval, work);
+	
+	//log determinant from the entries of 1+l*eval
+	double logdet_h=0.0;
+	for (size_t i=0; i<n_eval; ++i) {
+		logdet_h+=log(fabs(gsl_vector_get (work, i)));
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);
+	//same recursion with unit weights (work is reused as the vector of ones)
+	gsl_vector_set_all (work, 1.0);
+	CalcPab (n_cvt, p->e_mode, work, p->Uab, p->ab, Iab);
+	
+	//calculate |WHiW|-|WW|
+	double logdet_hiw=0.0;
+	for (size_t i=0; i<nc_total; ++i) {
+		const size_t index_ww=GetabIndex (i+1, i+1, n_cvt);
+		logdet_hiw+=log(gsl_matrix_get (Pab, i, index_ww));
+		logdet_hiw-=log(gsl_matrix_get (Iab, i, index_ww));
+	}
+	
+	const size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	
+	const double c=0.5*df*(log(df)-log(2*M_PI)-1.0);
+	const double f=c-0.5*logdet_h-0.5*logdet_hiw-0.5*df*log(P_yy);
+	
+	gsl_vector_free (work);
+	gsl_vector_free (Hi_eval);
+	gsl_matrix_free (Iab);
+	gsl_matrix_free (Pab);
+	return f;
+}
+
+
+
+//Evaluates -0.5*trace_PK + 0.5*df*yPKPy/P_yy at the value l (presumably the
+//first derivative of LogRL_f with respect to l).
+//df is ni_test-n_cvt when calc_null is set and ni_test-n_cvt-1 otherwise.
+//params must point to a FUNC_PARAM.
+double LogRL_dev1 (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;
+	const size_t n_cvt=p->n_cvt;
+	const size_t ni_test=p->ni_test;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=(p->eval)->size;
+	
+	const bool null_model=(p->calc_null==true);
+	const size_t nc_total=null_model ? n_cvt : n_cvt+1;
+	const double df=null_model ? (double)ni_test-(double)n_cvt : (double)ni_test-(double)n_cvt-1.0;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *HiHi_eval=gsl_vector_alloc(n_eval);
+	gsl_vector *work=gsl_vector_alloc(n_eval);
+	
+	//work = l*eval; Hi_eval takes its numerator before work becomes 1+l*eval
+	gsl_vector_memcpy (work, p->eval);
+	gsl_vector_scale (work, l);
+	if (p->e_mode!=0) {gsl_vector_memcpy (Hi_eval, work);} else {gsl_vector_set_all (Hi_eval, 1.0);}
+	gsl_vector_add_constant (work, 1.0);
+	gsl_vector_div (Hi_eval, work);
+	
+	//element-wise square
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);
+	
+	//trace as a dot product with a vector of ones (work is reused)
+	double trace_Hi=0.0;
+	gsl_vector_set_all (work, 1.0);
+	gsl_blas_ddot (Hi_eval, work, &trace_Hi);
+	if (p->e_mode!=0) {trace_Hi=(double)ni_test-trace_Hi;}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);
+	
+	//calculate trace_P, then trace_PK
+	double trace_P=trace_Hi;
+	for (size_t i=0; i<nc_total; ++i) {
+		const size_t index_ww=GetabIndex (i+1, i+1, n_cvt);
+		const double ps_ww=gsl_matrix_get (Pab, i, index_ww);
+		const double ps2_ww=gsl_matrix_get (PPab, i, index_ww);
+		trace_P-=ps2_ww/ps_ww;
+	}
+	const double trace_PK=(df-trace_P)/l;
+	
+	//calculate yPKPy
+	const size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	const double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy);
+	const double yPKPy=(P_yy-PP_yy)/l;
+	
+	const double dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy;
+	
+	gsl_vector_free (work);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (Hi_eval);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (Pab);
+	
+	return dev1;
+}
+
+// LogRL_dev2: second derivative with respect to l of the log-restricted likelihood. CalcLambda
+// installs it as the df callback of the Newton solver, with LogRL_dev1 as the f callback.
+// params must point to a FUNC_PARAM; calc_null, ni_test, n_cvt, eval, Uab, ab and e_mode are read.
+double LogRL_dev2 (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;	
+	size_t n_cvt=p->n_cvt;
+	size_t ni_test=p->ni_test;	
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//number of index pairs (a,b) with b<=a over n_cvt+2 items
+	//nc_total: how many diagonal terms the trace loop below visits; df equals ni_test-nc_total
+	double df;
+	size_t nc_total;
+	if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; }
+	else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;}
+	
+	double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0;
+	size_t index_ww;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size);
+	//Hi_eval: elementwise 1/(l*eval+1) when e_mode==0, otherwise l*eval/(l*eval+1)
+	gsl_vector_memcpy (v_temp, p->eval);
+	gsl_vector_scale (v_temp, l);
+	if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);}
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);
+	//elementwise square and cube of Hi_eval
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);	
+	gsl_vector_memcpy (HiHiHi_eval, HiHi_eval);
+	gsl_vector_mul (HiHiHi_eval, Hi_eval);
+	//v_temp is reset to all ones, so each dot product below is the plain sum of the entries
+	gsl_vector_set_all (v_temp, 1.0);
+	gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi);
+	gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi);
+	//e_mode!=0: turn the sums over l*eval/(l*eval+1) into sums over 1/(l*eval+1) and its square (taking eval to hold ni_test entries)
+	if (p->e_mode!=0) {	
+		trace_Hi=(double)ni_test-trace_Hi;
+		trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test;
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);	
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);	
+	CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab);	
+	
+	//accumulate trace_P and trace_PP over the first nc_total items, then form trace PKPK
+	double trace_P=trace_Hi, trace_PP=trace_HiHi;
+	double ps_ww, ps2_ww, ps3_ww;
+	for (size_t i=0; i<nc_total; ++i) {
+		index_ww=GetabIndex (i+1, i+1, n_cvt);
+		ps_ww=gsl_matrix_get (Pab, i, index_ww);
+		ps2_ww=gsl_matrix_get (PPab, i, index_ww);
+		ps3_ww=gsl_matrix_get (PPPab, i, index_ww);
+		trace_P-=ps2_ww/ps_ww;
+		trace_PP+=ps2_ww*ps2_ww/(ps_ww*ps_ww)-2.0*ps3_ww/ps_ww;
+	}
+	double trace_PKPK=(df+trace_PP-2.0*trace_P)/(l*l);
+	
+	//calculate yPKPy, yPKPKPy from the (y,y) entries in row nc_total of Pab, PPab and PPPab
+	index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	double P_yy=gsl_matrix_get (Pab, nc_total, index_ww);
+	double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww);
+	double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww);				
+	double yPKPy=(P_yy-PP_yy)/l;	
+	double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l);
+	
+	dev2=0.5*trace_PKPK-0.5*df*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy);
+	//release every work buffer allocated above
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (PPPab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (HiHiHi_eval);
+	gsl_vector_free (v_temp);	
+	
+	return dev2;
+}
+	
+// LogRL_dev12: evaluates the same first- and second-derivative expressions as LogRL_dev1 and
+// LogRL_dev2 in a single pass and stores them through dev1 and dev2. CalcLambda installs it as
+// the fdf callback of the Newton solver. params must point to a FUNC_PARAM.
+void LogRL_dev12 (double l, void *params, double *dev1, double *dev2)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;	
+	size_t n_cvt=p->n_cvt;
+	size_t ni_test=p->ni_test;	
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//number of index pairs (a,b) with b<=a over n_cvt+2 items
+	//nc_total: how many diagonal terms the trace loop below visits; df equals ni_test-nc_total
+	double df;
+	size_t nc_total;
+	if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; }
+	else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;}
+	
+	double trace_Hi=0.0, trace_HiHi=0.0;
+	size_t index_ww;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size);
+	//Hi_eval: elementwise 1/(l*eval+1) when e_mode==0, otherwise l*eval/(l*eval+1)
+	gsl_vector_memcpy (v_temp, p->eval);
+	gsl_vector_scale (v_temp, l);
+	if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);}
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);
+	//elementwise square and cube of Hi_eval
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);	
+	gsl_vector_memcpy (HiHiHi_eval, HiHi_eval);
+	gsl_vector_mul (HiHiHi_eval, Hi_eval);
+	//v_temp is reset to all ones, so each dot product below is the plain sum of the entries
+	gsl_vector_set_all (v_temp, 1.0);
+	gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi);
+	gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi);
+	//e_mode!=0: turn the sums over l*eval/(l*eval+1) into sums over 1/(l*eval+1) and its square (taking eval to hold ni_test entries)
+	if (p->e_mode!=0) {	
+		trace_Hi=(double)ni_test-trace_Hi;
+		trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test;
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);	
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);	
+	CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab);	
+		
+	//accumulate trace_P and trace_PP over the first nc_total items, then form trace PK and trace PKPK
+	double trace_P=trace_Hi, trace_PP=trace_HiHi;
+	double ps_ww, ps2_ww, ps3_ww;
+	for (size_t i=0; i<nc_total; ++i) {
+		index_ww=GetabIndex (i+1, i+1, n_cvt);
+		ps_ww=gsl_matrix_get (Pab, i, index_ww);
+		ps2_ww=gsl_matrix_get (PPab, i, index_ww);
+		ps3_ww=gsl_matrix_get (PPPab, i, index_ww);
+		trace_P-=ps2_ww/ps_ww;
+		trace_PP+=ps2_ww*ps2_ww/(ps_ww*ps_ww)-2.0*ps3_ww/ps_ww;
+	}
+	double trace_PK=(df-trace_P)/l;
+	double trace_PKPK=(df+trace_PP-2.0*trace_P)/(l*l);
+	
+	//calculate yPKPy, yPKPKPy from the (y,y) entries in row nc_total of Pab, PPab and PPPab
+	index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	double P_yy=gsl_matrix_get (Pab, nc_total, index_ww);
+	double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww);
+	double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_ww);				
+	double yPKPy=(P_yy-PP_yy)/l;	
+	double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l);
+	//both results are written through the output pointers; nothing is returned
+	*dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy;
+	*dev2=0.5*trace_PKPK-0.5*df*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy);
+	//release every work buffer allocated above
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (PPPab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (HiHiHi_eval);
+	gsl_vector_free (v_temp);	
+	
+	return ;
+}
+	
+
+
+
+
+
+
+
+//Wald test for the x term at the supplied l: sets beta, se and p_wald, the latter being the
+//upper tail of an F distribution with 1 and ni_test-n_cvt-1 degrees of freedom.
+void LMM::CalcRLWald (const double &l, const FUNC_PARAM &params, double &beta, double &se, double &p_wald)
+{
+	const size_t n_cvt=params.n_cvt;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=params.eval->size;
+	const int df=(int)ni_test-(int)n_cvt-1;
+
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *weight=gsl_vector_alloc (n_eval);
+	gsl_vector *denom=gsl_vector_alloc (n_eval);
+
+	//numerator first: ones when e_mode==0, otherwise l*eval
+	gsl_vector_memcpy (denom, params.eval);
+	gsl_vector_scale (denom, l);
+	if (params.e_mode!=0) {gsl_vector_memcpy (weight, denom);}
+	else {gsl_vector_set_all (weight, 1.0);}
+	//denominator l*eval+1, then the elementwise ratio
+	gsl_vector_add_constant (denom, 1.0);
+	gsl_vector_div (weight, denom);
+
+	CalcPab (n_cvt, params.e_mode, weight, params.Uab, params.ab, Pab);
+
+	//x is item n_cvt+1 and y is item n_cvt+2 in the GetabIndex numbering
+	const size_t idx_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, n_cvt, idx_yy);
+	const double P_xx=gsl_matrix_get (Pab, n_cvt, GetabIndex (n_cvt+1, n_cvt+1, n_cvt));
+	const double P_xy=gsl_matrix_get (Pab, n_cvt, GetabIndex (n_cvt+2, n_cvt+1, n_cvt));
+	const double Px_yy=gsl_matrix_get (Pab, n_cvt+1, idx_yy);
+	const double tau=(double)df/Px_yy;
+
+	beta=P_xy/P_xx;
+	se=sqrt(1.0/(tau*P_xx));
+	p_wald=gsl_cdf_fdist_Q ((P_yy-Px_yy)*tau, 1.0, df);
+
+	gsl_vector_free (denom); gsl_vector_free (weight); gsl_matrix_free (Pab);
+}
+
+
+//Score test for the x term at the supplied l: sets beta, se and p_score, the latter being the
+//upper tail of an F distribution with 1 and ni_test-n_cvt-1 degrees of freedom.
+void LMM::CalcRLScore (const double &l, const FUNC_PARAM &params, double &beta, double &se, double &p_score)
+{
+	const size_t n_cvt=params.n_cvt;
+	const size_t n_pair=(n_cvt+2+1)*(n_cvt+2)/2;
+	const int df=(int)ni_test-(int)n_cvt-1;
+
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_pair);
+	gsl_vector *Hi_eval=gsl_vector_alloc (params.eval->size);
+	gsl_vector *scaled=gsl_vector_alloc (params.eval->size);
+
+	//scaled=l*eval; Hi_eval starts as ones (e_mode==0) or as l*eval, then is divided by l*eval+1
+	gsl_vector_memcpy (scaled, params.eval);
+	gsl_vector_scale (scaled, l);
+	switch (params.e_mode) {
+		case 0: gsl_vector_set_all (Hi_eval, 1.0); break;
+		default: gsl_vector_memcpy (Hi_eval, scaled); break;
+	}
+	gsl_vector_add_constant (scaled, 1.0);
+	gsl_vector_div (Hi_eval, scaled);
+
+	CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab);
+
+	//y is item n_cvt+2 and x is item n_cvt+1 in the GetabIndex numbering
+	const size_t col_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const size_t col_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt);
+	const size_t col_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, n_cvt, col_yy);
+	const double P_xx=gsl_matrix_get (Pab, n_cvt, col_xx);
+	const double P_xy=gsl_matrix_get (Pab, n_cvt, col_xy);
+	const double tau=(double)df/gsl_matrix_get (Pab, n_cvt+1, col_yy);
+
+	beta=P_xy/P_xx;
+	se=sqrt(1.0/(tau*P_xx));
+	p_score=gsl_cdf_fdist_Q ((double)ni_test*P_xy*P_xy/(P_yy*P_xx), 1.0, df);
+
+	gsl_matrix_free (Pab); gsl_vector_free (Hi_eval); gsl_vector_free (scaled);
+}
+
+
+
+
+
+
+
+
+//Fill the columns of Uab that involve only the columns of UtW (items 1..n_cvt) and Uty (item
+//n_cvt+2): for every pair (a,b) with b<=a, column GetabIndex(a,b,n_cvt) receives the elementwise
+//product of item a and item b. Pairs involving item n_cvt+1 are skipped.
+void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) 
+{
+	const size_t n_cvt=UtW->size2;
+	const size_t i_x=n_cvt+1, i_y=n_cvt+2;
+	gsl_vector *left=gsl_vector_alloc (Uty->size);	//scratch copy of item a
+
+	for (size_t a=1; a<=i_y; ++a) {
+		if (a==i_x) {continue;}
+
+		if (a!=i_y) {
+			gsl_vector_const_view w_a=gsl_matrix_const_column (UtW, a-1);
+			gsl_vector_memcpy (left, &w_a.vector);
+		}
+		else {gsl_vector_memcpy (left, Uty);}
+
+		//b runs from a down to 1
+		for (size_t step=0; step<a; ++step) {
+			const size_t b=a-step;
+			if (b==i_x) {continue;}
+
+			gsl_vector_view dest=gsl_matrix_column (Uab, GetabIndex (a, b, n_cvt));
+			if (b!=i_y) {
+				gsl_vector_const_view w_b=gsl_matrix_const_column (UtW, b-1);
+				gsl_vector_memcpy (&dest.vector, &w_b.vector);
+			}
+			else {gsl_vector_memcpy (&dest.vector, Uty);}
+			gsl_vector_mul (&dest.vector, left);
+		}
+	}
+
+	gsl_vector_free (left);
+}
+
+
+//Fill the columns of Uab that involve item n_cvt+1 (Utx): column GetabIndex(n_cvt+1,b,n_cvt)
+//receives Utx times item b elementwise, where item b is a column of UtW, Utx itself, or Uty.
+void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_vector *Utx, gsl_matrix *Uab) 
+{	
+	const size_t n_cvt=UtW->size2;
+	const size_t i_x=n_cvt+1, i_y=n_cvt+2;
+
+	size_t b=1;
+	while (b<=i_y) {
+		gsl_vector_view dest=gsl_matrix_column (Uab, GetabIndex (i_x, b, n_cvt));
+		if (b<i_x) {
+			gsl_vector_const_view w_b=gsl_matrix_const_column (UtW, b-1);
+			gsl_vector_memcpy (&dest.vector, &w_b.vector);
+		}
+		else if (b==i_x) {gsl_vector_memcpy (&dest.vector, Utx);}
+		else {gsl_vector_memcpy (&dest.vector, Uty);}
+
+		gsl_vector_mul (&dest.vector, Utx);
+		++b;
+	}
+}
+
+
+
+//Fill the entries of ab that involve only the columns of W (items 1..n_cvt) and y (item
+//n_cvt+2): for every pair (a,b) with b<=a, ab[GetabIndex(a,b,n_cvt)] receives the dot product
+//of item a and item b. Pairs involving item n_cvt+1 are skipped.
+void Calcab (const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) 
+{
+	const size_t n_cvt=W->size2;
+	const size_t i_x=n_cvt+1, i_y=n_cvt+2;
+	double dot;
+
+	gsl_vector *lhs=gsl_vector_alloc (y->size);
+	gsl_vector *rhs=gsl_vector_alloc (y->size);
+
+	for (size_t a=1; a<=i_y; ++a) {
+		if (a==i_x) {continue;}
+
+		if (a!=i_y) {
+			gsl_vector_const_view w_a=gsl_matrix_const_column (W, a-1);
+			gsl_vector_memcpy (lhs, &w_a.vector);
+		}
+		else {gsl_vector_memcpy (lhs, y);}
+
+		//b runs from a down to 1
+		for (size_t step=0; step<a; ++step) {
+			const size_t b=a-step;
+			if (b==i_x) {continue;}
+
+			if (b!=i_y) {
+				gsl_vector_const_view w_b=gsl_matrix_const_column (W, b-1);
+				gsl_vector_memcpy (rhs, &w_b.vector);
+			}
+			else {gsl_vector_memcpy (rhs, y);}
+
+			gsl_blas_ddot (lhs, rhs, &dot);
+			gsl_vector_set (ab, GetabIndex (a, b, n_cvt), dot);
+		}
+	}
+	gsl_vector_free (lhs); gsl_vector_free (rhs);
+}
+
+
+//Fill the entries of ab that involve x (item n_cvt+1): ab[GetabIndex(n_cvt+1,b,n_cvt)] receives
+//the dot product of x with item b, where item b is a column of W, x itself, or y.
+void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_vector *ab) 
+{	
+	const size_t n_cvt=W->size2;
+	const size_t i_x=n_cvt+1, i_y=n_cvt+2;
+	double dot;
+
+	gsl_vector *other=gsl_vector_alloc (y->size);	//scratch copy of item b
+
+	size_t b=1;
+	while (b<=i_y) {
+		if (b<i_x) {
+			gsl_vector_const_view w_b=gsl_matrix_const_column (W, b-1);
+			gsl_vector_memcpy (other, &w_b.vector);
+		}
+		else if (b==i_x) {gsl_vector_memcpy (other, x);}
+		else {gsl_vector_memcpy (other, y);}
+
+		gsl_blas_ddot (x, other, &dot);
+		gsl_vector_set (ab, GetabIndex (i_x, b, n_cvt), dot);
+		++b;
+	}
+
+	gsl_vector_free (other);
+}
+
+
+
+
+
+//Association scan over a gene expression file. After a header line, each of the ng_total lines
+//holds a gene id followed by one value per individual; the selected individuals' values form the
+//phenotype y, tested against the fixed predictor supplied as Utx. One SUMSTAT is appended to
+//sumStat per gene. Reading stops with a message if a needed field is missing. W and x are unused.
+void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) 
+{
+	ifstream infile (file_gene.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading gene expression file:"<<file_gene<<endl; return;}
+	
+	clock_t time_start=clock();
+	
+	string line;
+	char *ch_ptr;
+	
+	double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	double logl_H1=0.0, logl_H0=0.0, l_H0;
+	int c_phen;
+	bool line_ok;	//false once a needed token is missing from the current line
+	double d;
+	
+	//Calculate basic quantities
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	
+	gsl_vector *y=gsl_vector_alloc (U->size1);
+	gsl_vector *Uty=gsl_vector_alloc (U->size2);
+	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+		
+	//header
+	getline(infile, line);
+	
+	for (size_t t=0; t<ng_total; t++) {
+		safeGetline(infile, line);
+		if (t%d_pace==0 || t==ng_total-1) {ProgressBar ("Performing Analysis ", t, ng_total-1);}
+		//first field is the gene id; strtok yields NULL when the line is empty or exhausted
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		line_ok=(ch_ptr!=NULL);
+		
+		c_phen=0; 
+		for (size_t i=0; line_ok && i<indicator_idv.size(); ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==0) {continue;}
+			if (ch_ptr==NULL) {line_ok=false; break;}	//value needed but absent: never pass NULL to atof
+			
+			d=atof(ch_ptr); 			
+			gsl_vector_set(y, c_phen, d);
+			c_phen++;
+		}
+		if (!line_ok) {cout<<endl<<"error! too few fields for gene "<<t+1<<" in file:"<<file_gene<<endl; break;}
+		
+		time_start=clock();
+		gsl_blas_dgemv (CblasTrans, 1.0, U, y, 0.0, Uty);		
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	
+		//calculate null: the x columns of Uab are all zero at this point, so the model is flagged
+		//as null (calc_null=true), as in the other null-model fits of this file
+		time_start=clock();
+		gsl_matrix_set_zero (Uab);
+		
+		CalcUab (UtW, Uty, Uab);
+		FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0};
+		
+		if (a_mode==2 || a_mode==3 || a_mode==4) {
+			CalcLambda('L', param0, l_min, l_max, n_region, l_H0, logl_H0);
+		}
+		
+		//calculate alternative
+		CalcUab(UtW, Uty, Utx, Uab);
+		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		
+		//3 is before 1
+		if (a_mode==3 || a_mode==4) {
+			CalcRLScore (l_H0, param1, beta, se, p_score);
+		}
+		
+		if (a_mode==1 || a_mode==4) {
+			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		}
+		
+		if (a_mode==2 || a_mode==4) {
+			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), 1);	
+		}
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+    }
+	cout<<endl;
+	
+	gsl_vector_free (y); gsl_vector_free (Uty);
+	gsl_matrix_free (Uab); gsl_vector_free (ab);
+	infile.close(); infile.clear();
+}
+
+
+
+
+
+//Association scan over a BIMBAM mean-genotype file (read through igzstream). Each line is one SNP:
+//three leading fields are skipped, then one genotype per individual follows, "NA" marking a
+//missing value. Missing genotypes are replaced by the SNP mean; when the mean exceeds 1 the
+//genotypes are recoded as 2-g and the sign of beta is flipped back before storing. One SUMSTAT
+//is appended to sumStat per analysed SNP. Reading stops with a message if a needed genotype
+//field is missing. W and y are not used by the active code.
+void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+
+	clock_t time_start=clock();
+	
+	string line;
+	char *ch_ptr;
+	
+	double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	double logl_H1=0.0;
+	int n_miss, c_phen;
+	double geno, x_mean;
+	bool line_ok;	//false once a needed token is missing from the current line
+	
+	//Calculate basic quantities
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+
+	gsl_vector *x=gsl_vector_alloc (U->size1);
+	gsl_vector *x_miss=gsl_vector_alloc (U->size1);
+	gsl_vector *Utx=gsl_vector_alloc (U->size2);
+	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+	
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+//	if (e_mode!=0) {
+//		gsl_vector_set_zero (ab);
+//		Calcab (W, y, ab);
+//	}	
+	
+	//start reading genotypes and analyze	
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		safeGetline(infile, line);
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		//skip the three leading fields; once strtok runs out of tokens it keeps returning NULL
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");		
+		
+		x_mean=0.0; c_phen=0; n_miss=0; line_ok=true;
+		gsl_vector_set_zero(x_miss);
+		for (size_t i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==0) {continue;}
+			if (ch_ptr==NULL) {line_ok=false; break;}	//value needed but absent: never pass NULL to strcmp/atof
+			
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
+			else {
+				geno=atof(ch_ptr); 				
+				
+				gsl_vector_set(x, c_phen, geno); 
+				gsl_vector_set(x_miss, c_phen, 1.0); 
+				x_mean+=geno;
+			}
+			c_phen++;
+		}	
+		if (!line_ok) {cout<<endl<<"error! too few fields for SNP "<<t+1<<" in file:"<<file_geno<<endl; break;}
+		
+		x_mean/=(double)(ni_test-n_miss);
+		
+		for (size_t i=0; i<ni_test; ++i) {
+			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
+			geno=gsl_vector_get(x, i);
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}
+		
+		//calculate statistics
+		time_start=clock();
+		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx);		
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		CalcUab(UtW, Uty, Utx, Uab);
+//		if (e_mode!=0) {
+//			Calcab (W, y, x, ab);
+//		}
+		
+		time_start=clock();
+		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		
+		//3 is before 1
+		if (a_mode==3 || a_mode==4) {
+			CalcRLScore (l_mle_null, param1, beta, se, p_score);
+		}
+		
+		if (a_mode==1 || a_mode==4) {
+			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);	
+			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		}
+		
+		if (a_mode==2 || a_mode==4) {
+			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);	
+		}			
+		
+		if (x_mean>1) {beta*=-1;}
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+    }	
+	cout<<endl;
+	
+	gsl_vector_free (x); gsl_vector_free (x_miss); gsl_vector_free (Utx);
+	gsl_matrix_free (Uab); gsl_vector_free (ab);
+	infile.close(); infile.clear();
+}
+
+
+
+
+// AnalyzePlink: association scan over a PLINK .bed file (file_bfile+".bed"). For every SNP kept by
+// indicator_snp, genotypes are decoded two bits at a time, missing calls are set to the SNP mean,
+// and one SUMSTAT is appended to sumStat. W and y are not used by the active code.
+void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) 
+{
+	string file_bed=file_bfile+".bed";
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+	
+	clock_t time_start=clock();
+	//ch holds the byte just read; b exposes its eight bits
+	char ch[1];
+	bitset<8> b;	
+	
+	double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	double logl_H1=0.0;
+	int n_bit, n_miss, ci_total, ci_test;
+	double geno, x_mean;
+		
+	//Calculate basic quantities
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+
+	gsl_vector *x=gsl_vector_alloc (U->size1);
+	gsl_vector *Utx=gsl_vector_alloc (U->size2);
+	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);	
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+	//the covariate/phenotype columns of Uab are filled once; the x columns are refreshed per SNP
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+//	if (e_mode!=0) {
+//		gsl_vector_set_zero (ab);
+//		Calcab (W, y, ab);
+//	}
+		
+	//n_bit: number of bytes per SNP in the .bed file (four individuals per byte, rounded up)
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1; }
+
+	//read past the first three magic-number bytes
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+	
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+		
+		//read genotypes: ci_total counts all individuals seen, ci_test only those kept by indicator_idv
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; 
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}
+				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
+				//bit pair (b[2j],b[2j+1]): (0,0) gives 2, (0,1) gives 1, (1,1) gives 0, (1,0) is missing (stored as -9)
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
+					else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
+					else {gsl_vector_set(x, ci_test, -9); n_miss++; }
+				}
+
+				ci_total++;
+				ci_test++;
+			}
+		}
+		
+		x_mean/=(double)(ni_test-n_miss);
+		//missing calls take the SNP mean; when the mean exceeds 1 every genotype is recoded as 2-geno
+		for (size_t i=0; i<ni_test; ++i) {			
+			geno=gsl_vector_get(x,i);
+			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}
+		
+		//calculate statistics
+		time_start=clock();
+		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx);
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		CalcUab(UtW, Uty, Utx, Uab);
+//		if (e_mode!=0) {
+//			Calcab (W, y, x, ab);
+//		}
+		
+		time_start=clock();
+		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		
+		//3 is before 1, for beta
+		if (a_mode==3 || a_mode==4) {
+			CalcRLScore (l_mle_null, param1, beta, se, p_score);
+		}
+		
+		if (a_mode==1 || a_mode==4) {
+			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);	
+			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		}
+		
+		if (a_mode==2 || a_mode==4) {
+			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);	
+		}		
+		//undo the 2-geno recoding in the reported effect direction
+		if (x_mean>1) {beta*=-1;}		
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+    }	
+	cout<<endl;
+	
+	gsl_vector_free (x);
+	gsl_vector_free (Utx);
+	gsl_matrix_free (Uab);
+	gsl_vector_free (ab);
+	
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+
+
+
+//Log likelihood ratio of every column of UtX against an intercept-only null model.
+//For each column i, the pair (i, logl_H1-logl_H0) is appended to pos_loglr, both log-likelihoods
+//being maximised over lambda by CalcLambda('L', ...). U only serves to rotate the all-ones covariate.
+void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector<pair<size_t, double> > &pos_loglr) 
+{
+	const size_t n_cvt=1;	//the intercept is the only covariate
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//same pair count as the other routines in this file (6 here)
+	double logl_H0, logl_H1, log_lr, lambda0, lambda1;
+	
+	gsl_vector *w=gsl_vector_alloc (Uty->size);
+	gsl_matrix *Utw=gsl_matrix_alloc (Uty->size, n_cvt);	
+	gsl_matrix *Uab=gsl_matrix_alloc (Uty->size, n_index);
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+	
+	gsl_vector_set_zero(ab);
+	gsl_matrix_set_zero(Uab);	//the x columns are not written until the loop below; keep them defined for the null fit
+	gsl_vector_set_all (w, 1.0);
+	gsl_vector_view Utw_col=gsl_matrix_column (Utw, 0);	
+	gsl_blas_dgemv (CblasTrans, 1.0, U, w, 0.0, &Utw_col.vector);		
+	
+	CalcUab (Utw, Uty, Uab) ;	
+	FUNC_PARAM param0={true, Uty->size, n_cvt, K_eval, Uab, ab, 0};	
+	CalcLambda('L', param0, l_min, l_max, n_region, lambda0, logl_H0);
+	
+	//alternative model: one fit per column of UtX, reusing the covariate/phenotype columns of Uab
+	for (size_t i=0; i<UtX->size2; ++i) {
+		gsl_vector_const_view UtX_col=gsl_matrix_const_column (UtX, i);
+		CalcUab(Utw, Uty, &UtX_col.vector, Uab);
+		FUNC_PARAM param1={false, UtX->size1, n_cvt, K_eval, Uab, ab, 0};
+		CalcLambda ('L', param1, l_min, l_max, n_region, lambda1, logl_H1);
+		log_lr=logl_H1-logl_H0;				
+		pos_loglr.push_back(make_pair(i,log_lr) );
+	}
+	
+	gsl_vector_free (w); gsl_matrix_free (Utw);
+	gsl_matrix_free (Uab); gsl_vector_free (ab);
+}
+
+
+
+
+//Maximise the log-likelihood ('L' or 'l') or the log-restricted likelihood ('R' or 'r') over
+//lambda in [l_min, l_max]. The range is split into n_region intervals equally spaced on the log
+//scale; each interval whose end-point first derivatives do not share a sign is searched for a
+//stationary point (Brent bracketing followed by Newton refinement). lambda and logf receive the
+//best candidate among those points and the two range ends. With any other func_name a message
+//is printed and lambda and logf are left untouched.
+void CalcLambda (const char func_name, FUNC_PARAM &params, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf)
+{
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;}
+	
+	const bool use_reml=(func_name=='R' || func_name=='r');
+	vector<pair<double, double> > lambda_lh;
+	
+	//evaluate first order derivates in different intervals
+	double lambda_l, lambda_h, lambda_interval=log(l_max/l_min)/(double)n_region;
+	double dev1_l=0.0, dev1_h, logf_l, logf_h;
+	
+	for (size_t i=0; i<n_region; ++i) {
+		lambda_l=l_min*exp(lambda_interval*i);
+		lambda_h=l_min*exp(lambda_interval*(i+1.0));
+		
+		//the upper end of interval i is the lower end of interval i+1, so only the first
+		//interval evaluates its lower end; afterwards the previous upper value is reused
+		if (i==0) {dev1_l=use_reml ? LogRL_dev1 (lambda_l, &params) : LogL_dev1 (lambda_l, &params);}
+		dev1_h=use_reml ? LogRL_dev1 (lambda_h, &params) : LogL_dev1 (lambda_h, &params);
+		
+		if (dev1_l*dev1_h<=0) {
+			lambda_lh.push_back(make_pair(lambda_l, lambda_h));
+		}
+		dev1_l=dev1_h;
+	}
+	
+	//if derivates do not change signs in any interval
+	if (lambda_lh.empty()) {
+		if (use_reml) {
+			logf_l=LogRL_f (l_min, &params);
+			logf_h=LogRL_f (l_max, &params);
+		}
+		else {
+			logf_l=LogL_f (l_min, &params);
+			logf_h=LogL_f (l_max, &params);
+		}
+		
+		if (logf_l>=logf_h) {lambda=l_min; logf=logf_l;} else {lambda=l_max; logf=logf_h;}
+	}
+	else {
+		//if derivates change signs
+		int status;
+		int iter=0, max_iter=100;
+		double l, l_temp;	
+		
+		gsl_function F;
+		gsl_function_fdf FDF;
+		
+		F.params=&params;
+		FDF.params=&params;
+		
+		if (use_reml) {
+			F.function=&LogRL_dev1;
+			FDF.f=&LogRL_dev1;
+			FDF.df=&LogRL_dev2;
+			FDF.fdf=&LogRL_dev12;
+		}
+		else {
+			F.function=&LogL_dev1;
+			FDF.f=&LogL_dev1;
+			FDF.df=&LogL_dev2;
+			FDF.fdf=&LogL_dev12;
+		}
+		
+		gsl_root_fsolver *s_f=gsl_root_fsolver_alloc (gsl_root_fsolver_brent);
+		gsl_root_fdfsolver *s_fdf=gsl_root_fdfsolver_alloc (gsl_root_fdfsolver_newton);
+		
+		for (vector<double>::size_type i=0; i<lambda_lh.size(); ++i) {
+			lambda_l=lambda_lh[i].first; lambda_h=lambda_lh[i].second;
+			
+			gsl_root_fsolver_set (s_f, &F, lambda_l, lambda_h);
+			
+			//Brent phase: shrink the bracket to a relative width of 1e-1. The counter is reset
+			//here so that every interval gets the full max_iter budget, not what the previous
+			//interval's Newton phase left over.
+			iter=0;
+			do {
+				iter++;
+				status=gsl_root_fsolver_iterate (s_f);
+				l=gsl_root_fsolver_root (s_f);
+				lambda_l=gsl_root_fsolver_x_lower (s_f);
+				lambda_h=gsl_root_fsolver_x_upper (s_f);
+				status=gsl_root_test_interval (lambda_l, lambda_h, 0, 1e-1);		
+			}
+			while (status==GSL_CONTINUE && iter<max_iter); 				
+			
+			//Newton phase started from the Brent estimate; stops on convergence, on the
+			//iteration cap, or as soon as an iterate leaves (l_min, l_max)
+			iter=0;
+			gsl_root_fdfsolver_set (s_fdf, &FDF, l);	
+			
+			do {
+				iter++;
+				status=gsl_root_fdfsolver_iterate (s_fdf);
+				l_temp=l;
+				l=gsl_root_fdfsolver_root (s_fdf);
+				status=gsl_root_test_delta (l, l_temp, 0, 1e-5);		
+			}
+			while (status==GSL_CONTINUE && iter<max_iter && l>l_min && l<l_max); 
+			
+			//keep the iterate before the last step, clamped to the search range
+			l=l_temp;
+			if (l<l_min) {l=l_min;}
+			if (l>l_max) {l=l_max;}
+			logf_l=use_reml ? LogRL_f (l, &params) : LogL_f (l, &params);
+			
+			if (i==0 || logf<logf_l) {logf=logf_l; lambda=l;}
+		}
+		gsl_root_fsolver_free (s_f);	
+		gsl_root_fdfsolver_free (s_fdf);		
+		
+		//finally compare against both ends of the range
+		if (use_reml) {
+			logf_l=LogRL_f (l_min, &params);
+			logf_h=LogRL_f (l_max, &params);
+		}
+		else {
+			logf_l=LogL_f (l_min, &params);
+			logf_h=LogL_f (l_max, &params);
+		}
+		
+		if (logf_l>logf) {lambda=l_min; logf=logf_l;} 
+		if (logf_h>logf) {lambda=l_max; logf=logf_h;}
+	}
+}
+
+
+
+
+
+//Null-model convenience overload: builds Uab from UtW and Uty, packs a FUNC_PARAM with
+//calc_null=true and e_mode=0, and forwards to the FUNC_PARAM overload of CalcLambda.
+//With a func_name other than 'R', 'r', 'L', 'l' a message is printed and nothing else is done.
+void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logl_H0)
+{
+	const bool name_ok=(func_name=='R' || func_name=='L' || func_name=='r' || func_name=='l');
+	if (!name_ok) {
+		cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl;
+		return;
+	}
+
+	const size_t ni_test=UtW->size1;
+	const size_t n_cvt=UtW->size2;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+
+	gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index);
+	gsl_vector *ab=gsl_vector_alloc (n_index);	//never filled here; only handed on inside the FUNC_PARAM
+
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+
+	FUNC_PARAM null_param={true, ni_test, n_cvt, eval, Uab, ab, 0};
+	CalcLambda (func_name, null_param, l_min, l_max, n_region, lambda, logl_H0);
+
+	gsl_vector_free (ab);
+	gsl_matrix_free (Uab);
+}
+	
+	
+//obtain REMLE estimate for PVE using lambda_remle
+void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se)
+{
+	size_t n_cvt=UtW->size2, ni_test=UtW->size1;
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	
+	gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index);	
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+	
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+	//	if (e_mode!=0) {
+	//		gsl_vector_set_zero (ab);
+	//		Calcab (W, y, ab);
+	//	}
+	
+	FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0};
+	
+	double se=sqrt(-1.0/LogRL_dev2 (lambda, &param0));
+	
+	pve=trace_G*lambda/(trace_G*lambda+1.0);
+	pve_se=trace_G/((trace_G*lambda+1.0)*(trace_G*lambda+1.0))*se;
+	
+	gsl_matrix_free (Uab);
+	gsl_vector_free (ab);	
+	return;
+}
+
+//Given lambda (callers pass a REML estimate), compute ve=P_yy/(ni_test-n_cvt) and vg=ve*lambda,
+//the covariate coefficients beta=(W'HiW)^-1 W'Hiy and their standard errors sqrt(diag(ve*(W'HiW)^-1)).
+//ab is allocated but never filled here (original note: ab is not used when e_mode==0).
+void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, double &vg, double &ve, gsl_vector *beta, gsl_vector *se_beta)
+{
+	size_t n_cvt=UtW->size2, ni_test=UtW->size1;
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	
+	gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index);	
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc(eval->size);
+	gsl_vector *v_temp=gsl_vector_alloc(eval->size);
+	gsl_matrix *HiW=gsl_matrix_alloc(eval->size, UtW->size2);
+	gsl_matrix *WHiW=gsl_matrix_alloc(UtW->size2, UtW->size2);
+	gsl_vector *WHiy=gsl_vector_alloc(UtW->size2);
+	gsl_matrix *Vbeta=gsl_matrix_alloc(UtW->size2, UtW->size2);
+	
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);	
+	//Hi_eval: elementwise 1/(lambda*eval+1)
+	gsl_vector_memcpy (v_temp, eval);
+	gsl_vector_scale (v_temp, lambda);
+	gsl_vector_set_all (Hi_eval, 1.0);
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);
+	
+	//calculate beta: HiW is UtW with every column scaled elementwise by Hi_eval
+	gsl_matrix_memcpy (HiW, UtW);
+	for (size_t i=0; i<UtW->size2; i++) {
+		gsl_vector_view HiW_col=gsl_matrix_column(HiW, i);
+		gsl_vector_mul(&HiW_col.vector, Hi_eval);
+	}
+	gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, HiW, UtW, 0.0, WHiW);
+	gsl_blas_dgemv (CblasTrans, 1.0, HiW, Uty, 0.0, WHiy);
+	//solve WHiW*beta=WHiy through an LU factorisation and reuse the factors for the inverse (Vbeta)
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (UtW->size2);
+	LUDecomp (WHiW, pmt, &sig);
+	LUSolve (WHiW, pmt, WHiy, beta);
+	LUInvert (WHiW, pmt, Vbeta);
+		
+	//calculate vg and ve
+	CalcPab (n_cvt, 0, Hi_eval, Uab, ab, Pab);	
+	//P_yy is the (y,y) entry in row n_cvt of Pab
+	size_t index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);	
+	double P_yy=gsl_matrix_get (Pab, n_cvt, index_yy);	
+	
+	ve=P_yy/(double)(ni_test-n_cvt);
+	vg=ve*lambda;
+	
+	//with ve, calculate se(beta)
+	gsl_matrix_scale(Vbeta, ve);
+	
+	//obtain se_beta
+	for (size_t i=0; i<Vbeta->size1; i++) {
+		gsl_vector_set (se_beta, i, sqrt(gsl_matrix_get(Vbeta, i, i) ) );
+	}
+	//release every work buffer allocated above
+	gsl_matrix_free(Uab);
+	gsl_matrix_free(Pab);
+	gsl_vector_free(ab);
+	gsl_vector_free(Hi_eval);
+	gsl_vector_free(v_temp);
+	gsl_matrix_free(HiW);
+	gsl_matrix_free(WHiW);
+	gsl_vector_free(WHiy);
+	gsl_matrix_free(Vbeta);
+	
+	gsl_permutation_free(pmt);
+	return;
+}
+
diff --git a/lmm.h b/lmm.h
new file mode 100644
index 0000000..d65b785
--- /dev/null
+++ b/lmm.h
@@ -0,0 +1,110 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __LMM_H__                
+#define __LMM_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+//Parameter bundle handed (as void*) to the likelihood and derivative callbacks in lmm.cpp.
+//The pointers are borrowed: the callbacks only read through them.
+class FUNC_PARAM
+{
+
+public:
+	bool calc_null;	//true: the x term is left out (n_cvt items are adjusted for, df=ni_test-n_cvt); false: one more item and df one lower
+	size_t ni_test;	//number of individuals used in the degrees of freedom
+	size_t n_cvt;	//number of covariates
+	const gsl_vector *eval;	//vector that is scaled by l to form 1/(l*eval+1)
+	const gsl_matrix *Uab;	//one column per index pair, as filled by CalcUab
+	const gsl_vector *ab;	//one entry per index pair, as filled by Calcab; passed on to CalcPab
+	size_t e_mode;	//0: weights are 1/(l*eval+1); otherwise l*eval/(l*eval+1)
+};
+
+
+//Driver for the univariate linear mixed model tests: holds the settings copied from PARAM, the
+//per-format analysis routines (gene expression, PLINK, BIMBAM) and the resulting summary statistics.
+class LMM {
+
+public:
+	// IO related parameters
+	int a_mode;				//analysis mode, 1/2/3/4 for Frequentist tests
+	size_t d_pace;		//display pace
+	
+	string file_bfile;
+	string file_geno;
+	string file_out;
+	
+	string file_gene;
+	
+	// LMM related parameters
+	double l_min;
+	double l_max;
+	size_t n_region;
+	double l_mle_null;
+	double logl_mle_H0;	
+	
+	// Summary statistics
+	size_t ni_total, ni_test;	//number of individuals
+	size_t ns_total, ns_test;	//number of snps
+	size_t ng_total, ng_test;	//number of genes
+	size_t n_cvt;
+	double time_UtX;		//time (in minutes) spent on the U^T x / U^T y matrix-vector products
+	double time_opt;		//time spent on optimization iterations
+	
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	
+	// Not included in PARAM
+	vector<SUMSTAT> sumStat;		//Output SNPSummary Data
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x);
+	void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y);
+	void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y);
+	void WriteFiles ();
+	//single-term tests at a given lambda; both fill beta and se in addition to the p value
+	void CalcRLWald (const double &lambda, const FUNC_PARAM &params, double &beta, double &se, double &p_wald);
+	void CalcRLScore (const double &l, const FUNC_PARAM &params, double &beta, double &se, double &p_score);	
+};
+//Free functions defined in lmm.cpp. CalcLambda takes func_name 'R' (log-restricted likelihood) or 'L' (log-likelihood).
+void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector<pair<size_t, double> > &pos_loglr);
+void CalcLambda (const char func_name, FUNC_PARAM &params, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf);
+void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logl_H0);
+void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se);
+void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, double &vg, double &ve, gsl_vector *beta, gsl_vector *se_beta);
+
+#endif
+
+
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000..9ab98ea
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,86 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "param.h"
+
+#ifdef FORCE_FLOAT
+#include "gemma_float.h"
+#else
+#include "gemma.h"
+#endif
+
+using namespace std;
+
+
+
+namespace {
+
+//Keeps cout silent for as long as the object lives: Engage() points cout at a
+//private in-memory buffer and the destructor puts the original buffer back.
+//The buffer has to outlive the redirection, otherwise cout is left holding a
+//pointer to a destroyed streambuf and every later write is undefined behaviour.
+class CoutSilencer {
+public:
+	CoutSilencer () : saved_buf(NULL) {}
+	~CoutSilencer () {if (saved_buf!=NULL) {cout.rdbuf (saved_buf);}}
+	void Engage () {if (saved_buf==NULL) {saved_buf=cout.rdbuf (sink.rdbuf());}}
+private:
+	stringstream sink;		//swallows everything written to cout while engaged
+	streambuf *saved_buf;	//cout's original buffer, NULL while not engaged
+	CoutSilencer (const CoutSilencer &);
+	CoutSilencer &operator= (const CoutSilencer &);
+};
+
+}
+
+
+//Program entry point.
+//Handles the informational invocations (no argument, -h, -h <n>, -l), makes sure
+//the "output" directory exists, then parses the command line into cPar, checks it,
+//runs the analysis and writes the log.
+//Returns EXIT_FAILURE as soon as any stage sets cPar.error, EXIT_SUCCESS otherwise.
+int main(int argc, char * argv[])
+{
+	GEMMA cGemma;
+	PARAM cPar;
+	//function scope on purpose: the sink must stay alive until main returns
+	CoutSilencer silencer;
+
+	if (argc <= 1) {
+		cGemma.PrintHeader();
+		return EXIT_SUCCESS;
+	}
+	if (argc==2 && argv[1][0] == '-' && argv[1][1] == 'h') {
+		cGemma.PrintHelp(0);
+		return EXIT_SUCCESS;
+	}
+	if (argc==3 && argv[1][0] == '-' && argv[1][1] == 'h') {
+		cGemma.PrintHelp(atoi(argv[2]));
+		return EXIT_SUCCESS;
+	}
+	if (argc==2 && argv[1][0] == '-' && argv[1][1] == 'l') {
+		cGemma.PrintLicense();
+		return EXIT_SUCCESS;
+	}
+
+	//create the output directory unless something named "output" is already there;
+	//stat() is used because opening a directory through ifstream is not portable
+	struct stat output_stat;
+	if (stat("output", &output_stat)!=0) {
+		mkdir("output", S_IRWXU|S_IRGRP|S_IROTH);
+	}
+
+	cGemma.Assign(argc, argv, cPar);
+
+	if (cPar.error==true) {return EXIT_FAILURE;}
+
+	if (cPar.mode_silence) {silencer.Engage();}
+
+	cPar.CheckParam();
+
+	if (cPar.error==true) {return EXIT_FAILURE;}
+
+	cGemma.BatchRun(cPar);
+
+	if (cPar.error==true) {return EXIT_FAILURE;}
+
+	cGemma.WriteLog(argc, argv, cPar);
+
+	return EXIT_SUCCESS;
+}
+
+
+ 
diff --git a/mathfunc.cpp b/mathfunc.cpp
new file mode 100644
index 0000000..09e58dc
--- /dev/null
+++ b/mathfunc.cpp
@@ -0,0 +1,310 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <iomanip>
+#include <bitset>
+#include <vector>
+#include <map>
+#include <set>
+#include <cstring>
+#include <cmath>
+#include <stdio.h>
+#include <stdlib.h> 
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_cdf.h"
+
+#ifdef FORCE_FLOAT
+#include "mathfunc_float.h"
+#else
+#include "mathfunc.h"
+#endif
+
+
+using namespace std;
+
+
+
+//population variance of the entries of v, computed as mean(x^2)-mean(x)^2
+//(the divisor is n, not n-1)
+double VectorVar (const gsl_vector *v)
+{
+	const double n=(double)v->size;
+	double sum=0.0, sum_sq=0.0;
+
+	for (size_t idx=0; idx<v->size; ++idx) {
+		const double x=gsl_vector_get (v, idx);
+		sum+=x;
+		sum_sq+=x*x;
+	}
+
+	const double mean=sum/n;
+	const double mean_sq=sum_sq/n;
+	return mean_sq-mean*mean;
+}
+
+
+
+//double-center the matrix G in place with the all-ones vector 1 (n=G->size1):
+//G <- G - (G1 1^T + 1 (G1)^T)/n + (1^T G 1)/n^2 * 1 1^T
+//G*1 is formed from the full matrix; the rank updates touch the upper triangle
+//only, which is then mirrored into the lower triangle.
+void CenterMatrix (gsl_matrix *G)
+{
+	const size_t n=G->size1;
+	const double n_dbl=(double)n;
+	double total;
+
+	gsl_vector *ones=gsl_vector_alloc (n);
+	gsl_vector *row_sum=gsl_vector_alloc (n);
+	gsl_vector_set_all (ones, 1.0);
+
+	//row_sum=G*1 and total=1^T*G*1, both taken from G before it is modified
+	gsl_blas_dgemv (CblasNoTrans, 1.0, G, ones, 0.0, row_sum);
+	gsl_blas_ddot (ones, row_sum, &total);
+
+	gsl_blas_dsyr2 (CblasUpper, -1.0/n_dbl, row_sum, ones, G);
+	gsl_blas_dsyr (CblasUpper, total/(n_dbl*n_dbl), ones, G);
+
+	//copy the upper triangle into the lower triangle
+	for (size_t col=1; col<n; ++col) {
+		for (size_t row=0; row<col; ++row) {
+			gsl_matrix_set (G, col, row, gsl_matrix_get (G, row, col));
+		}
+	}
+
+	gsl_vector_free (ones);
+	gsl_vector_free (row_sum);
+}
+
+
+//center the matrix G in place with respect to the vector w:
+//G <- G - (Gw w^T + w (Gw)^T)/(w^T w) + (w^T G w)/(w^T w)^2 * w w^T
+//G*w is formed from the full matrix; the rank updates touch the upper triangle
+//only, which is then mirrored into the lower triangle.
+void CenterMatrix (gsl_matrix *G, gsl_vector *w)
+{
+	const size_t n=G->size1;
+	double w_norm2, wGw;
+	gsl_vector *G_times_w=gsl_vector_alloc (n);
+
+	//all three quantities are taken from G before it is modified
+	gsl_blas_ddot (w, w, &w_norm2);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, G, w, 0.0, G_times_w);
+	gsl_blas_ddot (w, G_times_w, &wGw);
+
+	gsl_blas_dsyr2 (CblasUpper, -1.0/w_norm2, G_times_w, w, G);
+	gsl_blas_dsyr (CblasUpper, wGw/(w_norm2*w_norm2), w, G);
+
+	//copy the upper triangle into the lower triangle
+	for (size_t col=1; col<n; ++col) {
+		for (size_t row=0; row<col; ++row) {
+			gsl_matrix_set (G, col, row, gsl_matrix_get (G, row, col));
+		}
+	}
+
+	gsl_vector_free (G_times_w);
+}
+
+
+//scale the matrix G in place such that the mean diagonal = 1
+//G is left untouched when it is empty or when its mean diagonal is exactly zero,
+//because dividing by that mean would turn every entry into inf/NaN
+void ScaleMatrix (gsl_matrix *G)
+{
+	if (G->size1==0) {return;}
+
+	double d=0.0;
+
+	for (size_t i=0; i<G->size1; ++i) {
+		d+=gsl_matrix_get(G, i, i);
+	}
+	d/=(double)G->size1;
+
+	if (d!=0.0) {
+		gsl_matrix_scale (G, 1.0/d);
+	}
+
+	return;
+}
+
+
+//subtract the mean of y from every entry of y (in place) and return that mean
+double CenterVector (gsl_vector *y)
+{
+	double mean=0.0;
+	for (size_t idx=0; idx<y->size; ++idx) {mean+=gsl_vector_get (y, idx);}
+	mean/=(double)y->size;
+
+	gsl_vector_add_constant (y, -1.0*mean);
+	return mean;
+}
+
+
+//in-place version: every column x of UtX is replaced by U^T x
+//a scratch vector holds each product because dgemv cannot write over its own input
+void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX)
+{
+	const size_t n_col=UtX->size2;
+	gsl_vector *scratch=gsl_vector_alloc (UtX->size1);
+
+	for (size_t c=0; c<n_col; ++c) {
+		gsl_vector_view col=gsl_matrix_column (UtX, c);
+		gsl_blas_dgemv (CblasTrans, 1.0, U, &col.vector, 0.0, scratch);
+		gsl_vector_memcpy (&col.vector, scratch);
+	}
+
+	gsl_vector_free (scratch);
+}
+
+
+//out-of-place version: column c of UtX is set to U^T times column c of X
+void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX)
+{
+	const size_t n_col=X->size2;
+	for (size_t c=0; c<n_col; ++c) {
+		gsl_vector_const_view src=gsl_matrix_const_column (X, c);
+		gsl_vector_view dst=gsl_matrix_column (UtX, c);
+		gsl_blas_dgemv (CblasTrans, 1.0, U, &src.vector, 0.0, &dst.vector);
+	}
+}
+
+//single-vector version: Utx=U^T x
+void CalcUtX (const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx)
+{
+	const double alpha=1.0, beta=0.0;
+	gsl_blas_dgemv (CblasTrans, alpha, U, x, beta, Utx);
+}
+
+
+//Kronecker product H=K (x) V: the block of H at block position (i,j) is K(i,j)*V
+void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H)
+{
+	const size_t v_rows=V->size1, v_cols=V->size2;
+
+	for (size_t bi=0; bi<K->size1; bi++) {
+		for (size_t bj=0; bj<K->size2; bj++) {
+			const double k_ij=gsl_matrix_get (K, bi, bj);
+			gsl_matrix_view block=gsl_matrix_submatrix (H, bi*v_rows, bj*v_cols, v_rows, v_cols);
+			gsl_matrix_memcpy (&block.matrix, V);
+			gsl_matrix_scale (&block.matrix, k_ij);
+		}
+	}
+}
+
+//Kronecker product for a symmetric K: only K(i,j) with j>=i is read.
+//Block (i,j) of H is K(i,j)*V and is then copied, as is (not transposed), to
+//block position (j,i).
+void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H)
+{
+	const size_t v_rows=V->size1, v_cols=V->size2;
+
+	for (size_t bi=0; bi<K->size1; bi++) {
+		for (size_t bj=bi; bj<K->size2; bj++) {
+			gsl_matrix_view upper=gsl_matrix_submatrix (H, bi*v_rows, bj*v_cols, v_rows, v_cols);
+			gsl_matrix_memcpy (&upper.matrix, V);
+			gsl_matrix_scale (&upper.matrix, gsl_matrix_get (K, bi, bj));
+
+			if (bi==bj) {continue;}
+
+			gsl_matrix_view lower=gsl_matrix_submatrix (H, bj*v_rows, bi*v_cols, v_rows, v_cols);
+			gsl_matrix_memcpy (&lower.matrix, &upper.matrix);
+		}
+	}
+}
+
+
+// this function calculates HWE p value with methods described in Wigginton et al., 2005 AJHG;
+// it is based on the code in plink 1.07
+//
+// n_hom1, n_hom2: counts of the two homozygous genotypes (their order does not matter)
+// n_ab: count of heterozygous genotypes
+// returns the sum, over all heterozygote counts compatible with the observed allele counts,
+// of the probabilities that do not exceed the probability of the observed count;
+// returns 1 when there are no genotypes at all
+//
+// all counts are kept in size_t: the product in the midpoint formula overflows int
+// for large samples
+double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab)
+{
+	if ( (n_hom1+n_hom2+n_ab)==0 ) {return 1;}
+
+	//aa is the rare allele
+	const size_t n_aa=n_hom1 < n_hom2 ? n_hom1 : n_hom2;
+	const size_t n_bb=n_hom1 < n_hom2 ? n_hom2 : n_hom1;
+
+	const size_t rare_copies = 2 * n_aa + n_ab;
+	const size_t genotypes   = n_ab + n_bb + n_aa;
+
+	//het_probs[k]: probability of k heterozygotes, unnormalised until the division by sum below
+	vector<double> het_probs (rare_copies + 1, 0.0);
+
+	/* start at midpoint */
+	size_t mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes);
+
+	/* check to ensure that midpoint and rare alleles have same parity */
+	if ((rare_copies & 1) ^ (mid & 1)) {mid++;}
+
+	het_probs[mid] = 1.0;
+	double sum = het_probs[mid];
+
+	//walk down from the midpoint, two heterozygotes at a time
+	size_t curr_homr = (rare_copies - mid) / 2;
+	size_t curr_homc = genotypes - mid - curr_homr;
+	for (size_t curr_hets = mid; curr_hets > 1; curr_hets -= 2) {
+		het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0)
+		/ (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0));
+		sum += het_probs[curr_hets - 2];
+
+		/* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */
+		curr_homr++;
+		curr_homc++;
+	}
+
+	//walk up from the midpoint; the bound is written as curr_hets+2<=rare_copies so that
+	//it cannot wrap around when rare_copies<2
+	curr_homr = (rare_copies - mid) / 2;
+	curr_homc = genotypes - mid - curr_homr;
+	for (size_t curr_hets = mid; curr_hets + 2 <= rare_copies; curr_hets += 2) {
+		het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc
+		/((curr_hets + 2.0) * (curr_hets + 1.0));
+		sum += het_probs[curr_hets + 2];
+
+		/* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */
+		curr_homr--;
+		curr_homc--;
+	}
+
+	for (size_t i = 0; i <= rare_copies; i++) {
+		het_probs[i] /= sum;
+	}
+
+	/*  p-value calculation for p_hwe  */
+	//n_ab<=rare_copies by construction, so the index is always valid
+	const double p_observed = het_probs[n_ab];
+	double p_hwe = 0.0;
+	for (size_t i = 0; i <= rare_copies; i++) {
+		if (het_probs[i] > p_observed) {continue;}
+		p_hwe += het_probs[i];
+	}
+
+	//rounding can push the sum marginally above 1
+	return p_hwe > 1.0 ? 1.0 : p_hwe;
+}
+
+
+
+
+
+
+	
+
diff --git a/mathfunc.h b/mathfunc.h
new file mode 100644
index 0000000..d0e1696
--- /dev/null
+++ b/mathfunc.h
@@ -0,0 +1,41 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __MATHFUNC_H__                
+#define __MATHFUNC_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+using namespace std;
+
+//population variance (divisor n) of the entries of v
+double VectorVar (const gsl_vector *v);
+//double-center G in place using the all-ones vector
+void CenterMatrix (gsl_matrix *G);
+//same centering with the caller-supplied vector w in place of the all-ones vector
+void CenterMatrix (gsl_matrix *G, gsl_vector *w);
+//divide G by the mean of its diagonal
+void ScaleMatrix (gsl_matrix *G);
+//subtract the mean from y in place; returns that mean
+double CenterVector (gsl_vector *y);
+//replace every column x of UtX by U^T x (in place)
+void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX);
+//column i of UtX = U^T * column i of X
+void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX);
+//Utx = U^T x
+void CalcUtX (const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx);
+//Hardy-Weinberg exact test p value from the two homozygote counts and the heterozygote count
+double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab);
+//H = Kronecker product of K and V: block (i,j) of H is K(i,j)*V
+void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H);
+//as Kronecker, but reads only K(i,j) with j>=i and copies each block to the mirrored block position
+void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H);
+
+
+#endif
diff --git a/mvlmm.cpp b/mvlmm.cpp
new file mode 100644
index 0000000..56540d8
--- /dev/null
+++ b/mvlmm.cpp
@@ -0,0 +1,3748 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <bitset>
+#include <cstring>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_roots.h"
+#include "gsl/gsl_min.h"
+#include "gsl/gsl_integration.h"
+
+#include "io.h"
+#include "lapack.h"
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "lmm_float.h"
+#include "mvlmm_float.h"
+#else
+#include "lmm.h"
+#include "mvlmm.h"
+#endif
+
+
+
+using namespace std;
+
+
+//in this file, X, Y are already transformed (i.e. UtX and UtY)
+
+
+//copy the members MVLMM works with out of cPar; the two timers are not copied
+//but reset to zero
+void MVLMM::CopyFromParam (PARAM &cPar)
+{
+	//timers start from zero for this run
+	time_UtX=0.0;
+	time_opt=0.0;
+
+	//file names
+	file_bfile=cPar.file_bfile;
+	file_geno=cPar.file_geno;
+	file_out=cPar.file_out;
+
+	//mode and pace
+	a_mode=cPar.a_mode;
+	d_pace=cPar.d_pace;
+
+	//lambda range, iteration limits and precisions
+	l_min=cPar.l_min;		l_max=cPar.l_max;		n_region=cPar.n_region;
+	em_iter=cPar.em_iter;	em_prec=cPar.em_prec;
+	nr_iter=cPar.nr_iter;	nr_prec=cPar.nr_prec;
+	p_nr=cPar.p_nr;
+	crt=cPar.crt;
+
+	//null-model variance components
+	Vg_remle_null=cPar.Vg_remle_null;	Ve_remle_null=cPar.Ve_remle_null;
+	Vg_mle_null=cPar.Vg_mle_null;		Ve_mle_null=cPar.Ve_mle_null;
+
+	//counts
+	ni_total=cPar.ni_total;	ni_test=cPar.ni_test;
+	ns_total=cPar.ns_total;	ns_test=cPar.ns_test;
+	n_cvt=cPar.n_cvt;
+	n_ph=cPar.n_ph;
+
+	//indicators and SNP information
+	indicator_idv=cPar.indicator_idv;
+	indicator_snp=cPar.indicator_snp;
+	snpInfo=cPar.snpInfo;
+}
+
+
+//write the timers and the null-model results held by this object back into cPar
+void MVLMM::CopyToParam (PARAM &cPar)
+{
+	//REMLE results
+	cPar.Vg_remle_null=Vg_remle_null;
+	cPar.Ve_remle_null=Ve_remle_null;
+	cPar.VVg_remle_null=VVg_remle_null;
+	cPar.VVe_remle_null=VVe_remle_null;
+	cPar.beta_remle_null=beta_remle_null;
+	cPar.se_beta_remle_null=se_beta_remle_null;
+	cPar.logl_remle_H0=logl_remle_H0;
+
+	//MLE results
+	cPar.Vg_mle_null=Vg_mle_null;
+	cPar.Ve_mle_null=Ve_mle_null;
+	cPar.VVg_mle_null=VVg_mle_null;
+	cPar.VVe_mle_null=VVe_mle_null;
+	cPar.beta_mle_null=beta_mle_null;
+	cPar.se_beta_mle_null=se_beta_mle_null;
+	cPar.logl_mle_H0=logl_mle_H0;
+
+	//timers
+	cPar.time_UtX=time_UtX;
+	cPar.time_opt=time_opt;
+}
+
+
+//write the association results to ./output/<file_out>.assoc.txt, one line per
+//SNP whose indicator_snp entry is non-zero; sumStat is read in that same order
+//which p value columns are written depends on a_mode (1: wald, 2: lrt, 3: score, 4: all three)
+void MVLMM::WriteFiles ()
+{
+	const string file_str="./output/"+file_out+".assoc.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+
+	//number of entries in the upper triangle (diagonal included) of an n_ph x n_ph matrix
+	const size_t n_vbeta=n_ph*(n_ph+1)/2;
+
+	//header line
+	outfile<<"chr\trs\tps\tn_miss\tallele1\tallele0\taf\t";
+
+	for (size_t ph=1; ph<=n_ph; ph++) {
+		outfile<<"beta_"<<ph<<"\t";
+	}
+	for (size_t row=1; row<=n_ph; row++) {
+		for (size_t col=row; col<=n_ph; col++) {
+			outfile<<"Vbeta_"<<row<<"_"<<col<<"\t";
+		}
+	}
+
+	string p_header;
+	if (a_mode==1) {p_header="p_wald";}
+	else if (a_mode==2) {p_header="p_lrt";}
+	else if (a_mode==3) {p_header="p_score";}
+	else if (a_mode==4) {p_header="p_wald\tp_lrt\tp_score";}
+	//any other mode leaves the header line unterminated
+	if (!p_header.empty()) {outfile<<p_header<<endl;}
+
+	//one line per analysed SNP
+	size_t stat_idx=0;
+	for (size_t s=0; s<snpInfo.size(); ++s) {
+		if (indicator_snp[s]==0) {continue;}
+
+		outfile<<snpInfo[s].chr<<"\t"<<snpInfo[s].rs_number<<"\t"<<snpInfo[s].base_position<<"\t"<<snpInfo[s].n_miss<<"\t"<<snpInfo[s].a_minor<<"\t"<<snpInfo[s].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[s].maf<<"\t";
+
+		outfile<<scientific<<setprecision(6);
+
+		for (size_t ph=0; ph<n_ph; ph++) {
+			outfile<<sumStat[stat_idx].v_beta[ph]<<"\t";
+		}
+		for (size_t v=0; v<n_vbeta; v++) {
+			outfile<<sumStat[stat_idx].v_Vbeta[v]<<"\t";
+		}
+
+		if (a_mode==1) {
+			outfile<<sumStat[stat_idx].p_wald <<endl;
+		} else if (a_mode==2) {
+			outfile<<sumStat[stat_idx].p_lrt<<endl;
+		} else if (a_mode==3) {
+			outfile<<sumStat[stat_idx].p_score<<endl;
+		} else if (a_mode==4) {
+			outfile<<sumStat[stat_idx].p_wald <<"\t"<<sumStat[stat_idx].p_lrt<<"\t"<<sumStat[stat_idx].p_score<<endl;
+		}
+
+		stat_idx++;
+	}
+
+	outfile.close();
+	outfile.clear();
+}
+
+
+
+
+//below are functions for EM algorithm
+
+
+
+
+	
+
+//Prepares the transformation used by the EM routines from V_g and V_e (both d x d).
+//Outputs:
+//  D_l     : eigenvalues of Lambda=V_e^{-1/2} V_g V_e^{-1/2}, negative ones replaced by 0
+//  UltVeh  : U_l^T V_e^{1/2}
+//  UltVehi : U_l^T V_e^{-1/2}
+//where U_l is the matrix EigenDecomp returns for Lambda.
+//Returns the sum of log(eigenvalue) over the strictly positive eigenvalues of V_e.
+//NOTE(review): EigenDecomp is defined elsewhere (lapack.h); the code below uses its second argument
+//as eigenvectors stored in columns and its third as eigenvalues -- confirm the meaning of the last argument (0).
+double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, gsl_matrix *UltVeh, gsl_matrix *UltVehi)
+{
+	size_t d_size=V_g->size1;
+	double d, logdet_Ve=0.0;	
+	
+	//eigen decomposition of V_e
+	gsl_matrix *Lambda=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_temp=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_h=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_hi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size);	
+	gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size);	
+	
+	//the decomposition runs on a copy of V_e; D_l and U_l temporarily hold the result for V_e
+	gsl_matrix_memcpy(V_e_temp, V_e);
+	EigenDecomp(V_e_temp, U_l, D_l, 0);
+		
+	//calculate V_e_h and V_e_hi
+	//V_e_h=sum_i sqrt(d_i) u_i u_i^T and V_e_hi=sum_i 1/sqrt(d_i) u_i u_i^T;
+	//eigenvalues d_i<=0 are skipped in both sums and in the log determinant
+	gsl_matrix_set_zero(V_e_h);
+	gsl_matrix_set_zero(V_e_hi);
+	for (size_t i=0; i<d_size; i++) {
+		d=gsl_vector_get (D_l, i);
+		if (d<=0) {continue;}
+		logdet_Ve+=log(d);
+		
+		gsl_vector_view U_col=gsl_matrix_column(U_l, i);
+		d=sqrt(d);
+		gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_h);
+		d=1.0/d;
+		gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_hi);
+	}
+	
+	//copy the upper part to lower part
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<i; j++) {
+			gsl_matrix_set (V_e_h, i, j, gsl_matrix_get(V_e_h, j, i));
+			gsl_matrix_set (V_e_hi, i, j, gsl_matrix_get(V_e_hi, j, i));
+		}
+	}
+	
+	//calculate Lambda=V_ehi V_g V_ehi
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, V_g, V_e_hi, 0.0, VgVehi);
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, V_e_hi, VgVehi, 0.0, Lambda);
+	
+	//eigen decomposition of Lambda
+	//from here on D_l and U_l describe Lambda, no longer V_e
+	EigenDecomp(Lambda, U_l, D_l, 0);
+	
+	//negative eigenvalues are set to zero
+	for (size_t i=0; i<d_size; i++) {
+		d=gsl_vector_get (D_l, i);
+		if (d<0) {gsl_vector_set (D_l, i, 0);}
+	}
+	
+	//calculate UltVeh and UltVehi
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U_l, V_e_h, 0.0, UltVeh);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U_l, V_e_hi, 0.0, UltVehi);
+	/*
+	cout<<"Vg: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<d_size; j++) {
+			cout<<gsl_matrix_get (V_g, i, j)<<" ";
+		}
+		cout<<endl;
+	}
+	cout<<"Ve: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<d_size; j++) {
+			cout<<gsl_matrix_get (V_e, i, j)<<" ";
+		}
+		cout<<endl;
+	}
+	
+	cout<<"Dl: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		cout<<gsl_vector_get (D_l, i)<<endl;
+	}
+	cout<<"UltVeh: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<d_size; j++) {
+			cout<<gsl_matrix_get (UltVeh, i, j)<<" ";
+		}
+		cout<<endl;
+	}
+	*/
+	
+	//free memory
+	gsl_matrix_free (Lambda);
+	gsl_matrix_free (V_e_temp);
+	gsl_matrix_free (V_e_h);
+	gsl_matrix_free (V_e_hi);
+	gsl_matrix_free (VgVehi);
+	gsl_matrix_free (U_l);
+	
+	return logdet_Ve;
+}
+	
+//Qi=(\sum_{k=1}^n x_kx_k^T\otimes(delta_k*Dl+I)^{-1} )^{-1}
+//Q is built block by block: block (a,b) is diagonal with l-th entry
+//\sum_k X(a,k)X(b,k)/(Dl_l*delta_k+1). Each entry is computed once for b>=a
+//and stored at both mirrored positions.
+//Qi receives the inverse; the return value is what LULndet reports for the LU-decomposed Q.
+double CalcQi (const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, gsl_matrix *Qi)
+{
+	const size_t n_size=eval->size, d_size=D_l->size, dc_size=Qi->size1;
+	const size_t c_size=dc_size/d_size;
+
+	gsl_matrix *Q=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix_set_zero (Q);
+
+	for (size_t a=0; a<c_size; a++) {
+		for (size_t b=a; b<c_size; b++) {
+			for (size_t l=0; l<d_size; l++) {
+				const double dl=gsl_vector_get (D_l, l);
+
+				double acc=0.0;
+				for (size_t k=0; k<n_size; k++) {
+					const double xa=gsl_matrix_get (X, a, k);
+					const double xb=gsl_matrix_get (X, b, k);
+					const double delta=gsl_vector_get (eval, k);
+					acc+=xa*xb/(dl*delta+1.0);
+				}
+
+				gsl_matrix_set (Q, a*d_size+l, b*d_size+l, acc);
+				gsl_matrix_set (Q, b*d_size+l, a*d_size+l, acc);
+			}
+		}
+	}
+
+	//LU decomposition of Q (in place), then the inverse and the log determinant
+	int sig;
+	gsl_permutation *pmt=gsl_permutation_alloc (dc_size);
+	LUDecomp (Q, pmt, &sig);
+	LUInvert (Q, pmt, Qi);
+
+	const double logdet_Q=LULndet (Q);
+
+	gsl_matrix_free (Q);
+	gsl_permutation_free (pmt);
+
+	return logdet_Q;
+}
+
+//xHiy=\sum_{k=1}^n x_k\otimes ((delta_k*Dl+I)^{-1}Ul^TVe^{-1/2}y
+//entry j*d_size+i of xHiy is \sum_k X(j,k)*UltVehiY(i,k)/(delta_k*Dl_i+1)
+void CalcXHiY(const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, const gsl_matrix *UltVehiY, gsl_vector *xHiy)
+{
+	const size_t n_size=eval->size, c_size=X->size1, d_size=D_l->size;
+
+	gsl_vector_set_zero (xHiy);
+
+	for (size_t j=0; j<c_size; j++) {
+		for (size_t i=0; i<d_size; i++) {
+			const double dl=gsl_vector_get (D_l, i);
+
+			double acc=0.0;
+			for (size_t k=0; k<n_size; k++) {
+				const double x=gsl_matrix_get (X, j, k);
+				const double y=gsl_matrix_get (UltVehiY, i, k);
+				const double delta=gsl_vector_get (eval, k);
+				acc+=x*y/(delta*dl+1.0);
+			}
+
+			gsl_vector_set (xHiy, j*d_size+i, acc);
+		}
+	}
+}
+
+
+//entry (i,k) of the two output matrices:
+//OmegaU(i,k)=Dl_i/(delta_k*Dl_i+1)
+//OmegaE(i,k)=delta_k*OmegaU(i,k)
+void CalcOmega (const gsl_vector *eval, const gsl_vector *D_l, gsl_matrix *OmegaU, gsl_matrix *OmegaE)
+{
+	const size_t n_size=eval->size, d_size=D_l->size;
+
+	for (size_t i=0; i<d_size; i++) {
+		const double dl=gsl_vector_get (D_l, i);
+		for (size_t k=0; k<n_size; k++) {
+			const double delta=gsl_vector_get (eval, k);
+			const double omega_u=dl/(delta*dl+1.0);
+
+			gsl_matrix_set (OmegaU, i, k, omega_u);
+			gsl_matrix_set (OmegaE, i, k, delta*omega_u);
+		}
+	}
+}
+
+
+//UltVehiU=(UltVehiY-UltVehiBX) multiplied element by element with OmegaE
+void UpdateU (const gsl_matrix *OmegaE, const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiBX, gsl_matrix *UltVehiU)
+{
+	//residual first ...
+	gsl_matrix_memcpy (UltVehiU, UltVehiY);
+	gsl_matrix_sub (UltVehiU, UltVehiBX);
+
+	//... then the element-wise weights
+	gsl_matrix_mul_elements (UltVehiU, OmegaE);
+}
+
+
+//UltVehiE=UltVehiY-UltVehiBX-UltVehiU, subtracted in that order
+void UpdateE (const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiBX, const gsl_matrix *UltVehiU, gsl_matrix *UltVehiE)
+{
+	gsl_matrix_memcpy (UltVehiE, UltVehiY);
+
+	gsl_matrix_sub (UltVehiE, UltVehiBX);
+	gsl_matrix_sub (UltVehiE, UltVehiU);
+}
+
+
+
+//UltVehiB=(UltVehiY-UltVehiU) X^T XXti
+//UltVehiBX is used as scratch space: on return it holds UltVehiY-UltVehiU, not B*X
+void UpdateL_B (const gsl_matrix *X, const gsl_matrix *XXti, const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiU, gsl_matrix *UltVehiBX, gsl_matrix *UltVehiB)
+{
+	const size_t c_size=X->size1, d_size=UltVehiY->size1;
+
+	//scratch: UltVehiBX <- UltVehiY-UltVehiU
+	gsl_matrix_memcpy (UltVehiBX, UltVehiY);
+	gsl_matrix_sub (UltVehiBX, UltVehiU);
+
+	//project onto X^T, then apply XXti
+	gsl_matrix *resid_Xt=gsl_matrix_alloc (d_size, c_size);
+	gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, UltVehiBX, X, 0.0, resid_Xt);
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, resid_Xt, XXti, 0.0, UltVehiB);
+	gsl_matrix_free(resid_Xt);
+}
+
+//b=Qi*xHiy (length dc_size); column c of UltVehiB is filled with entries
+//c*d_size ... c*d_size+d_size-1 of b
+void UpdateRL_B (const gsl_vector *xHiy, const gsl_matrix *Qi, gsl_matrix *UltVehiB)
+{
+	const size_t d_size=UltVehiB->size1, c_size=UltVehiB->size2, dc_size=Qi->size1;
+
+	gsl_vector *b=gsl_vector_alloc (dc_size);
+	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, b);
+
+	//unstack b into the columns of UltVehiB
+	for (size_t c=0; c<c_size; c++) {
+		for (size_t r=0; r<d_size; r++) {
+			gsl_matrix_set (UltVehiB, r, c, gsl_vector_get (b, c*d_size+r));
+		}
+	}
+
+	gsl_vector_free(b);
+}
+
+
+
+//V_g=(\sum_k u_k u_k^T/delta_k + Sigma_uu)/n, where u_k is column k of U and terms with delta_k==0 are left out
+//V_e=(E E^T + Sigma_ee)/n
+//n is the length of eval
+void UpdateV (const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *E, const gsl_matrix *Sigma_uu, const gsl_matrix *Sigma_ee, gsl_matrix *V_g, gsl_matrix *V_e)
+{
+	const size_t n_size=eval->size, d_size=U->size1;
+	const double inv_n=1.0/(double)n_size;
+
+	gsl_matrix_set_zero (V_g);
+	gsl_matrix_set_zero (V_e);
+
+	//upper triangle of U D^{-1} U^T, one rank-one update per non-zero eigenvalue
+	for (size_t k=0; k<n_size; k++) {
+		const double delta=gsl_vector_get (eval, k);
+		if (delta!=0) {
+			gsl_vector_const_view u_k=gsl_matrix_const_column (U, k);
+			gsl_blas_dsyr (CblasUpper, 1.0/delta, &u_k.vector, V_g);
+		}
+	}
+
+	//upper triangle of E E^T
+	gsl_blas_dsyrk(CblasUpper, CblasNoTrans, 1.0, E, 0.0, V_e);
+
+	//mirror both upper triangles into the lower triangles
+	for (size_t col=1; col<d_size; col++) {
+		for (size_t row=0; row<col; row++) {
+			gsl_matrix_set (V_g, col, row, gsl_matrix_get(V_g, row, col));
+			gsl_matrix_set (V_e, col, row, gsl_matrix_get(V_e, row, col));
+		}
+	}
+
+	//add the Sigma terms, then divide by n
+	gsl_matrix_add (V_g, Sigma_uu);
+	gsl_matrix_add (V_e, Sigma_ee);
+
+	gsl_matrix_scale (V_g, inv_n);
+	gsl_matrix_scale (V_e, inv_n);
+}
+
+
+//Fills Sigma_uu and Sigma_ee (both d x d).
+//1. Their diagonals receive the row sums (over k) of OmegaU and OmegaE.
+//2. For func_name 'R'/'r' only, a second term is accumulated for every k:
+//   delta_k*M_u^T Qi M_u is added to Sigma_uu and M_e^T Qi M_e to Sigma_ee, with
+//   M_e(j*d+i, i)=X(j,k)/(delta_k*Dl_i+1) and M_u(j*d+i, i)=Dl_i*M_e(j*d+i, i).
+//3. Both results are finally replaced by UltVeh^T * Sigma * UltVeh.
+//Any func_name other than 'R','r','L','l' prints a message and returns without touching the outputs.
+void CalcSigma (const char func_name, const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, const gsl_matrix *OmegaU, const gsl_matrix *OmegaE, const gsl_matrix *UltVeh, const gsl_matrix *Qi, gsl_matrix *Sigma_uu, gsl_matrix *Sigma_ee)
+{	
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;}
+
+	size_t n_size=eval->size, c_size=X->size1, d_size=D_l->size, dc_size=Qi->size1;
+	
+	gsl_matrix_set_zero(Sigma_uu);
+	gsl_matrix_set_zero(Sigma_ee);
+	
+	double delta, dl, x, d;	
+	
+	//calculate the first diagonal term
+	//the views alias the diagonals, so adding to them writes straight into Sigma_uu/Sigma_ee
+	gsl_vector_view Suu_diag=gsl_matrix_diagonal (Sigma_uu);
+	gsl_vector_view See_diag=gsl_matrix_diagonal (Sigma_ee);
+	
+	for (size_t k=0; k<n_size; k++) {
+		gsl_vector_const_view OmegaU_col=gsl_matrix_const_column (OmegaU, k);
+		gsl_vector_const_view OmegaE_col=gsl_matrix_const_column (OmegaE, k);
+		
+		gsl_vector_add (&Suu_diag.vector, &OmegaU_col.vector);
+		gsl_vector_add (&See_diag.vector, &OmegaE_col.vector);
+	}	
+	
+	//calculate the second term for reml
+	if (func_name=='R' || func_name=='r') {		
+		gsl_matrix *M_u=gsl_matrix_alloc(dc_size, d_size);
+		gsl_matrix *M_e=gsl_matrix_alloc(dc_size, d_size);
+		gsl_matrix *QiM=gsl_matrix_alloc(dc_size, d_size);		
+		
+		//zeroed once: every k rewrites the same set of entries, all others stay zero
+		gsl_matrix_set_zero(M_u);
+		gsl_matrix_set_zero(M_e);
+		
+		for (size_t k=0; k<n_size; k++) {
+			delta=gsl_vector_get(eval, k);
+			//if (delta==0) {continue;}
+			
+			for (size_t i=0; i<d_size; i++) {
+				dl=gsl_vector_get(D_l, i);
+				for (size_t j=0; j<c_size; j++) {				
+					x=gsl_matrix_get(X, j, k);
+					d=x/(delta*dl+1.0);
+					gsl_matrix_set(M_e, j*d_size+i, i, d);
+					gsl_matrix_set(M_u, j*d_size+i, i, d*dl);					
+				}
+			}			
+			//beta=1.0 in the second call of each pair accumulates into Sigma
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_u, 0.0, QiM);
+			gsl_blas_dgemm(CblasTrans, CblasNoTrans, delta, M_u, QiM, 1.0, Sigma_uu);
+		
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_e, 0.0, QiM);
+			gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, M_e, QiM, 1.0, Sigma_ee);
+		}		
+		
+		gsl_matrix_free(M_u);
+		gsl_matrix_free(M_e);
+		gsl_matrix_free(QiM);	
+	}
+	
+	//multiply both sides by VehUl
+	//M is the intermediate product, so Sigma is never read and written by the same dgemm call
+	gsl_matrix *M=gsl_matrix_alloc (d_size, d_size);
+	
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Sigma_uu, UltVeh, 0.0, M);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, M, 0.0, Sigma_uu);
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Sigma_ee, UltVeh, 0.0, M);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, M, 0.0, Sigma_ee);
+	
+	gsl_matrix_free(M);
+	return;
+}
+
+
+//returns -0.5*( \sum_k\sum_i [ y_ik^2/(delta_k*Dl_i+1) + log(delta_k*Dl_i+1) ] - xHiy^T Qi xHiy ),
+//with y_ik=UltVehiY(i,k)
+//the constant and the log-determinant terms of the (restricted) likelihood are left to the caller
+double MphCalcLogL (const gsl_vector *eval, const gsl_vector *xHiy, const gsl_vector *D_l, const gsl_matrix *UltVehiY, const gsl_matrix *Qi)
+{
+	const size_t n_size=eval->size, d_size=D_l->size, dc_size=Qi->size1;
+	double total=0.0;
+
+	//yHiy+log|H_k|, accumulated individual by individual
+	for (size_t k=0; k<n_size; k++) {
+		const double delta=gsl_vector_get(eval, k);
+		for (size_t i=0; i<d_size; i++) {
+			const double y=gsl_matrix_get(UltVehiY, i, k);
+			const double dl=gsl_vector_get(D_l, i);
+			const double h=delta*dl+1.0;
+
+			total+=y*y/h+log(h);
+		}
+	}
+
+	//quadratic form xHiy^T Qi xHiy
+	double quad;
+	gsl_vector *Qiv=gsl_vector_alloc(dc_size);
+	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, Qiv);
+	gsl_blas_ddot(xHiy, Qiv, &quad);
+	gsl_vector_free(Qiv);
+
+	total-=quad;
+
+	return -0.5*total;
+}
+
+
+
+
+
+//Y is a dxn matrix, X is a cxn matrix, B is a dxc matrix, V_g is a dxd matrix, V_e is a dxd matrix, eval is a size n vector
+//'R' for restricted likelihood and 'L' for likelihood
+//EM iterations that update V_g, V_e and B in place, starting from the values passed in.
+//Stops after max_iter iterations, or earlier once the log (restricted) likelihood changes by less than
+//max_prec between two consecutive iterations (never on the first iteration).
+//U_hat, E_hat, OmegaU, OmegaE and the UltVehi* matrices are caller-provided work/output buffers that are overwritten.
+//Returns the last log (restricted) likelihood evaluated; returns 0.0 after printing a message when func_name is invalid.
+double MphEM (const char func_name, const size_t max_iter, const double max_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, gsl_matrix *U_hat, gsl_matrix *E_hat, gsl_matrix *OmegaU, gsl_matrix *OmegaE, gsl_matrix *UltVehiY, gsl_matrix *UltVehiBX, gsl_matrix *UltVehiU, gsl_matrix *UltVehiE, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B)
+{
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return 0.0;}
+	
+	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;
+	size_t dc_size=d_size*c_size;	
+		
+	gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size);
+	gsl_matrix *XXti=gsl_matrix_alloc (c_size, c_size);
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehiB=gsl_matrix_alloc (d_size, c_size);
+	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *Sigma_uu=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Sigma_ee=gsl_matrix_alloc (d_size, d_size);
+	gsl_vector *xHiy=gsl_vector_alloc (dc_size);
+	gsl_permutation * pmt=gsl_permutation_alloc (c_size);	
+		
+	double logl_const=0.0, logl_old=0.0, logl_new=0.0, logdet_Q, logdet_Ve;
+	int sig;
+	
+	//calculate |XXt| and (XXt)^{-1}
+	//dsyrk fills the upper triangle only; the loop mirrors it before the LU decomposition
+	gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt);
+	for (size_t i=0; i<c_size; ++i) {
+		for (size_t j=0; j<i; ++j) {
+			gsl_matrix_set (XXt, i, j, gsl_matrix_get (XXt, j, i));
+		}
+	}
+	
+	//XXt holds its LU factors from here on
+	LUDecomp (XXt, pmt, &sig);
+	LUInvert (XXt, pmt, XXti);
+	
+	//calculate the constant for logl	
+	if (func_name=='R' || func_name=='r') {		
+		logl_const=-0.5*(double)(n_size-c_size)*(double)d_size*log(2.0*M_PI)+0.5*(double)d_size*LULndet (XXt);
+	} else {
+		logl_const=-0.5*(double)n_size*(double)d_size*log(2.0*M_PI);
+	}	
+	
+	//start EM
+	for (size_t t=0; t<max_iter; t++) {
+		//transformation for the current V_g, V_e
+		logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);
+
+		logdet_Q=CalcQi (eval, D_l, X, Qi);
+
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
+		CalcXHiY(eval, D_l, X, UltVehiY, xHiy);
+
+		//calculate log likelihood/restricted likelihood value, and terminate if change is small	
+		logl_new=logl_const+MphCalcLogL (eval, xHiy, D_l, UltVehiY, Qi)-0.5*(double)n_size*logdet_Ve;
+		if (func_name=='R' || func_name=='r') {	
+			logl_new+=-0.5*(logdet_Q-(double)c_size*logdet_Ve);
+		}		
+		//NOTE(review): abs is applied to a double here; this relies on the floating-point overload from <cmath> being the one selected
+		if (t!=0 && abs(logl_new-logl_old)<max_prec) {break;}
+		logl_old=logl_new;
+		
+		/*
+		cout<<"iteration = "<<t<<" log-likelihood = "<<logl_old<<"\t"<<logl_new<<endl;
+		
+		cout<<"Vg: "<<endl;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		cout<<"Ve: "<<endl;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		*/
+		
+		CalcOmega (eval, D_l, OmegaU, OmegaE);
+
+		//Update UltVehiB, UltVehiU
+		//REML: B is recomputed from Qi and xHiy on every iteration
+		//ML: the B passed in is transformed on the first iteration only; later iterations reuse
+		//the UltVehiB/UltVehiBX produced by UpdateL_B at the end of the previous iteration
+		if (func_name=='R' || func_name=='r') {	
+			UpdateRL_B(xHiy, Qi, UltVehiB);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
+		} else if (t==0) {
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, B, 0.0, UltVehiB);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
+		}
+		
+		UpdateU(OmegaE, UltVehiY, UltVehiBX, UltVehiU);
+		
+		if (func_name=='L' || func_name=='l') {	
+			//UltVehiBX is destroyed here
+			//(UpdateL_B uses it as scratch), so it is rebuilt from the new UltVehiB right after
+			UpdateL_B(X, XXti, UltVehiY, UltVehiU, UltVehiBX, UltVehiB);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
+		}
+
+		UpdateE(UltVehiY, UltVehiBX, UltVehiU, UltVehiE);
+		
+		//calculate U_hat, E_hat and B
+		//each is UltVeh^T times its transformed counterpart
+		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiU, 0.0, U_hat);
+		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiE, 0.0, E_hat);
+		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiB, 0.0, B);
+
+		//calculate Sigma_uu and Sigma_ee
+		CalcSigma (func_name, eval, D_l, X, OmegaU, OmegaE, UltVeh, Qi, Sigma_uu, Sigma_ee);
+		
+		//update V_g and V_e
+		UpdateV (eval, U_hat, E_hat, Sigma_uu, Sigma_ee, V_g, V_e);		
+	}
+		
+	gsl_matrix_free(XXt);
+	gsl_matrix_free(XXti);
+	gsl_vector_free(D_l);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_matrix_free(UltVehiB);
+	gsl_matrix_free(Qi);
+	gsl_matrix_free(Sigma_uu);
+	gsl_matrix_free(Sigma_ee);
+	gsl_vector_free(xHiy);
+	gsl_permutation_free(pmt);
+	
+	return logl_new;
+}
+
+
+
+
+
+
+
+//calculate p-value, beta (d by 1 vector) and V(beta) for one predictor x_vec given covariates W; UltVehiY is overwritten with UltVehi*Y and the chi-square (d_size d.f.) upper-tail p-value is returned
+double MphCalcP (const gsl_vector *eval, const gsl_vector *x_vec, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_vector *beta, gsl_matrix *Vbeta) 
+{
+	size_t n_size=eval->size, c_size=W->size1, d_size=V_g->size1;
+	size_t dc_size=d_size*c_size;
+	double delta, dl, d, d1, d2, dy, dx, dw, logdet_Ve, logdet_Q, p_value;
+	//work buffers; every one of them is freed before returning
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size);	
+	gsl_matrix *QiWHix=gsl_matrix_alloc(dc_size, d_size);
+	
+	gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size);	
+	gsl_vector *xPy=gsl_vector_alloc (d_size);
+	//gsl_vector *UltVehiy=gsl_vector_alloc (d_size);
+	gsl_vector *WHiy=gsl_vector_alloc (dc_size);
+	//xPx and WHix must start at zero: the loop below writes only xPx(i,i) and WHix(j*d_size+i, i)
+	gsl_matrix_set_zero (xPx);
+	gsl_matrix_set_zero (WHix);
+	gsl_vector_set_zero (xPy);
+	gsl_vector_set_zero (WHiy);
+	
+	//eigen decomposition and calculate log|Ve| (logdet_Ve is stored but not read again in this function)
+	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);	
+	
+	//calculate Qi and log|Q| (logdet_Q is likewise not read again in this function)
+	logdet_Q=CalcQi (eval, D_l, W, Qi);	
+	
+	//calculate UltVehiY=UltVehi*Y; the caller's UltVehiY buffer is overwritten
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
+		
+	//calculate WHix, WHiy, xHiy, xHix; every term of the sums over k is divided by (delta_k*dl_i+1)
+	for (size_t i=0; i<d_size; i++) {
+		dl=gsl_vector_get(D_l, i);
+		//d1: weighted sum of x*y for row i of UltVehiY, d2: weighted sum of x*x
+		d1=0.0; d2=0.0;
+		for (size_t k=0; k<n_size; k++) {
+			delta=gsl_vector_get(eval, k);
+			dx=gsl_vector_get(x_vec, k);
+			dy=gsl_matrix_get(UltVehiY, i, k);
+			
+			d1+=dx*dy/(delta*dl+1.0);
+			d2+=dx*dx/(delta*dl+1.0);
+		}
+		gsl_vector_set (xPy, i, d1);
+		gsl_matrix_set (xPx, i, i, d2);
+		//cross terms with each covariate row j of W: d1 is the weighted sum of x*w, d2 of y*w
+		for (size_t j=0; j<c_size; j++) {	
+			d1=0.0; d2=0.0;
+			for (size_t k=0; k<n_size; k++) {
+				delta=gsl_vector_get(eval, k);
+				dx=gsl_vector_get(x_vec, k);
+				dw=gsl_matrix_get(W, j, k);
+				dy=gsl_matrix_get(UltVehiY, i, k);
+				
+				//if (delta==0) {continue;}			
+				d1+=dx*dw/(delta*dl+1.0);
+				d2+=dy*dw/(delta*dl+1.0);
+			}
+			gsl_matrix_set(WHix, j*d_size+i, i, d1);
+			gsl_vector_set(WHiy, j*d_size+i, d2);
+		}
+	}
+	//remove the covariate part: xPx-=WHix'*(Qi*WHix) and xPy-=(Qi*WHix)'*WHiy
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, WHix, 0.0, QiWHix);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, -1.0, WHix, QiWHix, 1.0, xPx);
+	gsl_blas_dgemv(CblasTrans, -1.0, QiWHix, WHiy, 1.0, xPy);
+		
+	//calculate V(beta) and beta; D_l is reused as the LUSolve output (presumably the solution of xPx*D_l=xPy; LUSolve is defined elsewhere)
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (d_size);
+	LUDecomp (xPx, pmt, &sig);
+	LUSolve (xPx, pmt, xPy, D_l);
+	LUInvert (xPx, pmt, Vbeta);
+
+	//rotate back with UltVeh: beta=UltVeh'*D_l and Vbeta=UltVeh'*Vbeta*UltVeh (xPx serves as scratch for Vbeta*UltVeh)
+	gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, D_l, 0.0, beta);
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Vbeta, UltVeh, 0.0, xPx);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, xPx, 0.0, Vbeta);	
+
+	//calculate test statistic d=D_l'*xPy and p value
+	gsl_blas_ddot(D_l, xPy, &d);
+	
+	p_value=gsl_cdf_chisq_Q (d, (double)d_size);
+	//d*=(double)(n_size-c_size-d_size)/((double)d_size*(double)(n_size-c_size-1));
+	//p_value=gsl_cdf_fdist_Q (d, (double)d_size, (double)(n_size-c_size-d_size));	
+	
+	gsl_vector_free(D_l);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_matrix_free(Qi);
+	gsl_matrix_free(WHix);	
+	gsl_matrix_free(QiWHix);
+	
+	gsl_matrix_free(xPx);	
+	gsl_vector_free(xPy);
+	gsl_vector_free(WHiy);
+	
+	gsl_permutation_free(pmt);
+	
+	return p_value;
+}
+
+
+
+//calculate B and its standard error (which is a matrix of the same dimension as B); UltVehiY is overwritten with UltVehi*Y as a side effect
+void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_matrix *B, gsl_matrix *se_B) 
+{
+	size_t n_size=eval->size, c_size=W->size1, d_size=V_g->size1;
+	size_t dc_size=d_size*c_size;
+	double delta, dl, d, dy, dw, logdet_Ve, logdet_Q;
+	//work buffers; every one of them is freed before returning
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *Qi_temp=gsl_matrix_alloc (dc_size, dc_size);
+	//gsl_vector *UltVehiy=gsl_vector_alloc (d_size);
+	gsl_vector *WHiy=gsl_vector_alloc (dc_size);
+	gsl_vector *QiWHiy=gsl_vector_alloc (dc_size);
+	gsl_vector *beta=gsl_vector_alloc (dc_size);
+	gsl_matrix *Vbeta=gsl_matrix_alloc (dc_size, dc_size);
+	
+	gsl_vector_set_zero (WHiy);
+	
+	//eigen decomposition and calculate log|Ve| (logdet_Ve is stored but not read again in this function)
+	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);	
+	
+	//calculate Qi and log|Q| (logdet_Q is likewise not read again in this function)
+	logdet_Q=CalcQi (eval, D_l, W, Qi);	
+	
+	//calculate UltVehiY=UltVehi*Y; the caller's UltVehiY buffer is overwritten
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
+	
+	//calculate WHiy: entry j*d_size+i is \sum_k UltVehiY(i,k)*W(j,k)/(delta_k*dl_i+1)
+	for (size_t i=0; i<d_size; i++) {
+		dl=gsl_vector_get(D_l, i);
+				
+		for (size_t j=0; j<c_size; j++) {	
+			d=0.0;
+			for (size_t k=0; k<n_size; k++) {
+				delta=gsl_vector_get(eval, k);
+				dw=gsl_matrix_get(W, j, k);
+				dy=gsl_matrix_get(UltVehiY, i, k);
+				
+				//if (delta==0) {continue;}			
+				d+=dy*dw/(delta*dl+1.0);
+			}
+			gsl_vector_set(WHiy, j*d_size+i, d);
+		}
+	}
+	//QiWHiy=Qi*WHiy
+	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, WHiy, 0.0, QiWHiy);
+	
+	//apply UltVeh blockwise: beta_i=UltVeh'*QiWHiy_i and Vbeta_ij=UltVeh'*Qi_ij*UltVeh, with d_size-sized blocks i,j<c_size
+	for (size_t i=0; i<c_size; i++) {
+		gsl_vector_view QiWHiy_sub=gsl_vector_subvector(QiWHiy, i*d_size, d_size);
+		gsl_vector_view beta_sub=gsl_vector_subvector(beta, i*d_size, d_size);		
+		gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, &QiWHiy_sub.vector, 0.0, &beta_sub.vector);
+	
+		for (size_t j=0; j<c_size; j++) {
+			gsl_matrix_view Qi_sub=gsl_matrix_submatrix (Qi, i*d_size, j*d_size, d_size, d_size);
+			gsl_matrix_view Qitemp_sub=gsl_matrix_submatrix (Qi_temp, i*d_size, j*d_size, d_size, d_size);
+			gsl_matrix_view Vbeta_sub=gsl_matrix_submatrix (Vbeta, i*d_size, j*d_size, d_size, d_size);
+			//blocks with j<i are transposed copies of block (j, i), which was computed in an earlier i iteration
+			if (j<i) {
+				gsl_matrix_view Vbeta_sym=gsl_matrix_submatrix (Vbeta, j*d_size, i*d_size, d_size, d_size);
+				gsl_matrix_transpose_memcpy (&Vbeta_sub.matrix, &Vbeta_sym.matrix);
+			} else {
+				gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &Qi_sub.matrix, UltVeh, 0.0, &Qitemp_sub.matrix);
+				gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, &Qitemp_sub.matrix, 0.0, &Vbeta_sub.matrix);
+			}			
+		}
+	}
+	
+	//copy beta to B, and the square roots of the diagonal of Vbeta to se_B
+	for (size_t j=0; j<B->size2; j++) {
+		for (size_t i=0; i<B->size1; i++) {
+			gsl_matrix_set(B, i, j, gsl_vector_get(beta, j*d_size+i));
+			gsl_matrix_set(se_B, i, j, sqrt(gsl_matrix_get(Vbeta, j*d_size+i, j*d_size+i)));
+		}
+	}	
+	
+	//free matrices
+	gsl_vector_free(D_l);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_matrix_free(Qi);
+	gsl_matrix_free(Qi_temp);
+	gsl_vector_free(WHiy);
+	gsl_vector_free(QiWHiy);
+	gsl_vector_free(beta);
+	gsl_matrix_free(Vbeta);
+		
+	return;
+}
+
+
+
+//below are functions for Newton-Raphson's algorithm
+
+
+
+
+
+//calculate all Hi (block k is stored in columns k*d_size..(k+1)*d_size-1 of Hi_all) and return logdet_H=\sum_{k=1}^{n}log|H_k|
+//and calculate Qi and return logdet_Q
+//(yPy is not computed here; the outputs are Hi_all, Qi, logdet_H and logdet_Q only)
+void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *Hi_all, gsl_matrix *Qi, double &logdet_H, double &logdet_Q)
+{
+	gsl_matrix_set_zero (Hi_all);
+	gsl_matrix_set_zero (Qi);
+	logdet_H=0.0; logdet_Q=0.0;
+	
+	size_t n_size=eval->size, c_size=X->size1, d_size=V_g->size1;
+	double logdet_Ve=0.0, delta, dl, d;	
+	//work buffers; every one of them is freed before returning
+	gsl_matrix *mat_dd=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	
+	//calculate D_l, UltVeh and UltVehi
+	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);
+	
+	//calculate each Hi and log|H_k|: logdet_H starts at n_size*logdet_Ve and gains log(delta_k*dl_i+1) for every k and i
+	logdet_H=(double)n_size*logdet_Ve;
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		//mat_dd is UltVehi with row i divided by (delta*dl_i+1)
+		gsl_matrix_memcpy (mat_dd, UltVehi);
+		for (size_t i=0; i<d_size; i++) {
+			dl=gsl_vector_get(D_l, i);
+			d=delta*dl+1.0;
+			
+			gsl_vector_view mat_row=gsl_matrix_row (mat_dd, i);
+			gsl_vector_scale (&mat_row.vector, 1.0/d);
+			
+			logdet_H+=log(d);
+		}
+		//block k of Hi_all becomes UltVehi'*mat_dd
+		gsl_matrix_view Hi_k=gsl_matrix_submatrix(Hi_all, 0, k*d_size, d_size, d_size);
+		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVehi, mat_dd, 0.0, &Hi_k.matrix);
+	}	
+	
+	//calculate Qi, then replace every d_size by d_size block Qi_ij with UltVeh'*Qi_ij*UltVeh
+	//and calculate logdet_Q, don't forget to subtract c_size*logdet_Ve
+	logdet_Q=CalcQi (eval, D_l, X, Qi)-(double)c_size*logdet_Ve;
+		
+	for (size_t i=0; i<c_size; i++) {
+		for (size_t j=0; j<c_size; j++) {
+			gsl_matrix_view Qi_sub=gsl_matrix_submatrix (Qi, i*d_size, j*d_size, d_size, d_size);
+			if (j<i) {
+				gsl_matrix_view Qi_sym=gsl_matrix_submatrix (Qi, j*d_size, i*d_size, d_size, d_size);
+				gsl_matrix_transpose_memcpy (&Qi_sub.matrix, &Qi_sym.matrix);
+			} else {
+				gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &Qi_sub.matrix, UltVeh, 0.0, mat_dd);
+				gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, mat_dd, 0.0, &Qi_sub.matrix);
+			}
+		}
+	}
+
+	//free memory
+	gsl_matrix_free(mat_dd);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_vector_free(D_l);
+	
+	return;
+}
+
+
+
+
+//Fill Hiy_all column by column: column k is Hi_k*y_k, where y_k is column k of Y and Hi_k is
+//the d_size by d_size block of Hi_all that starts at column k*d_size.
+void Calc_Hiy_all (const gsl_matrix *Y, const gsl_matrix *Hi_all, gsl_matrix *Hiy_all)
+{
+	gsl_matrix_set_zero (Hiy_all);
+
+	const size_t dim=Y->size1;
+	const size_t n_cols=Y->size2;
+
+	for (size_t col=0; col!=n_cols; ++col) {
+		gsl_matrix_const_view blk=gsl_matrix_const_submatrix(Hi_all, 0, col*dim, dim, dim);
+		gsl_vector_const_view src=gsl_matrix_const_column(Y, col);
+		gsl_vector_view dst=gsl_matrix_column(Hiy_all, col);
+
+		gsl_blas_dgemv (CblasNoTrans, 1.0, &blk.matrix, &src.vector, 0.0, &dst.vector);
+	}
+}
+
+
+//Build xHi_all from X and Hi_all: the d_size by d_size block at row block i, column block k
+//is X(i,k) times Hi_k, the k-th d_size by d_size column block of Hi_all.
+void Calc_xHi_all (const gsl_matrix *X, const gsl_matrix *Hi_all, gsl_matrix *xHi_all)
+{
+	gsl_matrix_set_zero (xHi_all);
+
+	const size_t dim=Hi_all->size1;
+	const size_t n_cov=X->size1;
+	const size_t n_cols=X->size2;
+
+	for (size_t col=0; col!=n_cols; ++col) {
+		gsl_matrix_const_view blk=gsl_matrix_const_submatrix(Hi_all, 0, col*dim, dim, dim);
+
+		for (size_t row=0; row!=n_cov; ++row) {
+			const double weight=gsl_matrix_get (X, row, col);
+			gsl_matrix_view target=gsl_matrix_submatrix(xHi_all, row*dim, col*dim, dim, dim);
+			//copy the block, then scale the copy in place
+			gsl_matrix_memcpy(&target.matrix, &blk.matrix);
+			gsl_matrix_scale(&target.matrix, weight);
+		}
+	}
+}
+
+
+//Return the scalar yHiy: the sum over columns k of the dot product of column k of Hiy_all with column k of Y.
+double Calc_yHiy (const gsl_matrix *Y, const gsl_matrix *Hiy_all)
+{
+	const size_t n_cols=Y->size2;
+	double total=0.0;
+
+	for (size_t col=0; col!=n_cols; ++col) {
+		double part;	//written by gsl_blas_ddot below
+		gsl_vector_const_view rhs=gsl_matrix_const_column(Y, col);
+		gsl_vector_const_view lhs=gsl_matrix_const_column(Hiy_all, col);
+
+		gsl_blas_ddot (&lhs.vector, &rhs.vector, &part);
+		total+=part;
+	}
+	return total;
+}
+
+
+//Accumulate xHiy=\sum_k xHi_k*y_k, where xHi_k is the block of xHi (all rows, d_size columns) starting at column k*d_size and y_k is column k of Y.
+void Calc_xHiy (const gsl_matrix *Y, const gsl_matrix *xHi, gsl_vector *xHiy)
+{
+	gsl_vector_set_zero (xHiy);
+
+	const size_t dim=Y->size1;
+	const size_t n_cols=Y->size2;
+	const size_t n_rows=xHi->size1;
+
+	for (size_t col=0; col!=n_cols; ++col) {
+		gsl_matrix_const_view blk=gsl_matrix_const_submatrix(xHi, 0, col*dim, n_rows, dim);
+		gsl_vector_const_view src=gsl_matrix_const_column(Y, col);
+		//the trailing 1.0 keeps the running sum already stored in xHiy
+		gsl_blas_dgemv (CblasNoTrans, 1.0, &blk.matrix, &src.vector, 1.0, xHiy);
+	}
+}
+
+
+
+
+//Map (i, j) with 0<=i,j<d_size to its slot in the row-by-row packed upper triangle; (i, j) and (j, i) share one slot. Out-of-range input prints a message and yields 0.
+size_t GetIndex (const size_t i, const size_t j, const size_t d_size)
+{
+	const bool in_range=(i<d_size && j<d_size);
+	if (!in_range) {cout<<"error in GetIndex."<<endl; return 0;}
+
+	const size_t lo=(j<i) ? j : i;	//smaller of the two indices
+	const size_t hi=(j<i) ? i : j;	//larger of the two indices
+	return (2*d_size-lo+1)*lo/2+hi-lo;
+}
+
+
+
+//Compute two sums over k from rows i and j of Hiy:
+//  yHiDHiy_e=\sum_k Hiy(i,k)*Hiy(j,k)
+//  yHiDHiy_g=\sum_k eval_k*Hiy(i,k)*Hiy(j,k)
+//Both results are doubled when i!=j.
+void Calc_yHiDHiy (const gsl_vector *eval, const gsl_matrix *Hiy, const size_t i, const size_t j, double &yHiDHiy_g, double &yHiDHiy_e)
+{
+	yHiDHiy_g=0.0;
+	yHiDHiy_e=0.0;
+
+	const size_t n_size=eval->size;
+	//off-diagonal pairs are counted twice; the factor 1.0 leaves the i==j case untouched
+	const double mult=(i==j) ? 1.0 : 2.0;
+
+	for (size_t k=0; k!=n_size; ++k) {
+		const double delta=gsl_vector_get (eval, k);
+		const double row_i=gsl_matrix_get (Hiy, i, k);
+		const double row_j=gsl_matrix_get (Hiy, j, k);
+
+		//the pair factor is applied last, after the plain product
+		yHiDHiy_g+=delta*row_i*row_j*mult;
+		yHiDHiy_e+=row_i*row_j*mult;
+	}
+
+	return;
+}
+
+
+
+//Accumulate two vectors over k from columns of xHi and entries of Hiy:
+//  xHiDHiy_e gets Hiy(j,k)*xHi[:, k*d_size+i], plus Hiy(i,k)*xHi[:, k*d_size+j] when i!=j;
+//  xHiDHiy_g gets the same terms multiplied by eval_k.
+void Calc_xHiDHiy (const gsl_vector *eval, const gsl_matrix *xHi, const gsl_matrix *Hiy, const size_t i, const size_t j, gsl_vector *xHiDHiy_g, gsl_vector *xHiDHiy_e)
+{
+	gsl_vector_set_zero(xHiDHiy_g);
+	gsl_vector_set_zero(xHiDHiy_e);
+
+	const size_t n_size=eval->size;
+	const size_t d_size=Hiy->size1;
+	const bool off_diag=(i!=j);
+
+	for (size_t k=0; k!=n_size; ++k) {
+		const double delta=gsl_vector_get (eval, k);
+		//term with column i of block k, weighted by Hiy(j,k)
+		gsl_vector_const_view col_a=gsl_matrix_const_column (xHi, k*d_size+i);
+		const double w_a=gsl_matrix_get (Hiy, j, k);
+		gsl_blas_daxpy (w_a*delta, &col_a.vector, xHiDHiy_g);
+		gsl_blas_daxpy (w_a, &col_a.vector, xHiDHiy_e);
+
+		if (!off_diag) {continue;}
+
+		//mirrored term: column j of block k, weighted by Hiy(i,k)
+		gsl_vector_const_view col_b=gsl_matrix_const_column (xHi, k*d_size+j);
+		const double w_b=gsl_matrix_get (Hiy, i, k);
+		gsl_blas_daxpy (w_b*delta, &col_b.vector, xHiDHiy_g);
+		gsl_blas_daxpy (w_b, &col_b.vector, xHiDHiy_e);
+	}
+}
+
+
+//For each k form the outer product of columns k*d_size+i and k*d_size+j of xHi and add it to
+//xHiDHix_e (unweighted) and xHiDHix_g (weighted by eval_k). When i!=j the transposed outer
+//product is added to both outputs as well.
+void Calc_xHiDHix (const gsl_vector *eval, const gsl_matrix *xHi, const size_t i, const size_t j, gsl_matrix *xHiDHix_g, gsl_matrix *xHiDHix_e)
+{
+	gsl_matrix_set_zero(xHiDHix_g);
+	gsl_matrix_set_zero(xHiDHix_e);
+
+	const size_t n_size=eval->size;
+	const size_t dc_size=xHi->size1;
+	const size_t d_size=xHi->size2/n_size;	//assumes xHi consists of n_size column blocks of equal width
+	const bool off_diag=(i!=j);
+
+	//scratch for the outer product and its transpose
+	gsl_matrix *outer=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *outer_t=gsl_matrix_alloc (dc_size, dc_size);
+
+	for (size_t k=0; k!=n_size; ++k) {
+		const double delta=gsl_vector_get (eval, k);
+		gsl_vector_const_view col_i=gsl_matrix_const_column (xHi, k*d_size+i);
+		gsl_vector_const_view col_j=gsl_matrix_const_column (xHi, k*d_size+j);
+
+		//outer=col_i*col_j'; the transpose is taken now, before outer is rescaled
+		gsl_matrix_set_zero (outer);
+		gsl_blas_dger (1.0, &col_i.vector, &col_j.vector, outer);
+		gsl_matrix_transpose_memcpy (outer_t, outer);
+
+		gsl_matrix_add (xHiDHix_e, outer);
+		gsl_matrix_scale (outer, delta);
+		gsl_matrix_add (xHiDHix_g, outer);
+
+		//mirror term for off-diagonal pairs
+		if (off_diag) {
+			gsl_matrix_add (xHiDHix_e, outer_t);
+			gsl_matrix_scale (outer_t, delta);
+			gsl_matrix_add (xHiDHix_g, outer_t);
+		}
+	}
+
+	gsl_matrix_free(outer);
+	gsl_matrix_free(outer_t);
+}
+
+
+
+//Accumulate over k scalar sums of products Hiy(a,k)*Hi_k(b,c)*Hiy(d,k), where Hi_k is the
+//block of Hi starting at column k*d_size. The base term is Hiy(i1,k)*Hi_k(j1,i2)*Hiy(j2,k);
+//the term with i1 and j1 exchanged is added when i1!=j1, and the terms with i2 and j2
+//exchanged are added when i2!=j2.
+//Outputs: _ee is the plain sum, _ge is weighted by eval_k, _gg by eval_k squared.
+void Calc_yHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *Hiy, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yHiDHiDHiy_gg, double &yHiDHiDHiy_ee, double &yHiDHiDHiy_ge)
+{
+	yHiDHiDHiy_gg=0.0;
+	yHiDHiDHiy_ee=0.0;
+	yHiDHiDHiy_ge=0.0;
+
+	const size_t n_size=eval->size;
+	const size_t d_size=Hiy->size1;
+	const bool swap1=(i1!=j1);
+	const bool swap2=(i2!=j2);
+
+	for (size_t k=0; k!=n_size; ++k) {
+		const double delta=gsl_vector_get (eval, k);
+
+		const double y_i1=gsl_matrix_get (Hiy, i1, k);
+		const double y_j1=gsl_matrix_get (Hiy, j1, k);
+		const double y_i2=gsl_matrix_get (Hiy, i2, k);
+		const double y_j2=gsl_matrix_get (Hiy, j2, k);
+
+		const double h_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2);
+		const double h_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2);
+		const double h_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2);
+		const double h_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2);
+
+		//terms that end in Hiy(j2,k)
+		double term=y_i1*h_j1i2*y_j2;
+		if (swap1) {term=y_i1*h_j1i2*y_j2+y_j1*h_i1i2*y_j2;}
+		yHiDHiDHiy_gg+=delta*delta*term;
+		yHiDHiDHiy_ee+=term;
+		yHiDHiDHiy_ge+=delta*term;
+
+		if (!swap2) {continue;}
+
+		//terms that end in Hiy(i2,k)
+		term=y_i1*h_j1j2*y_i2;
+		if (swap1) {term=y_i1*h_j1j2*y_i2+y_j1*h_i1j2*y_i2;}
+		yHiDHiDHiy_gg+=delta*delta*term;
+		yHiDHiDHiy_ee+=term;
+		yHiDHiDHiy_ge+=delta*term;
+	}
+
+	return;
+}
+
+
+//Accumulate over k multiples of xHi columns. The base term is
+//Hi_k(j1,i2)*Hiy(j2,k)*xHi[:, k*d_size+i1], where Hi_k is the block of Hi starting at column
+//k*d_size. The term with i1 and j1 exchanged is added when i1!=j1, and the terms with i2 and
+//j2 exchanged are added when i2!=j2.
+//Outputs: _ee is the plain sum, _ge is weighted by eval_k, _gg by eval_k squared.
+void Calc_xHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const size_t i1, const size_t j1, const size_t i2, const size_t j2, gsl_vector *xHiDHiDHiy_gg, gsl_vector *xHiDHiDHiy_ee, gsl_vector *xHiDHiDHiy_ge)
+{
+	gsl_vector_set_zero(xHiDHiDHiy_gg);
+	gsl_vector_set_zero(xHiDHiDHiy_ee);
+	gsl_vector_set_zero(xHiDHiDHiy_ge);
+
+	const size_t n_size=eval->size;
+	const size_t d_size=Hiy->size1;
+	const bool swap1=(i1!=j1);
+	const bool swap2=(i2!=j2);
+
+	for (size_t k=0; k!=n_size; ++k) {
+		const double delta=gsl_vector_get (eval, k);
+		const double dd=delta*delta;
+
+		gsl_vector_const_view col_i=gsl_matrix_const_column (xHi, k*d_size+i1);
+		gsl_vector_const_view col_j=gsl_matrix_const_column (xHi, k*d_size+j1);
+
+		const double y_i2=gsl_matrix_get (Hiy, i2, k);
+		const double y_j2=gsl_matrix_get (Hiy, j2, k);
+
+		const double h_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2);
+		const double h_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2);
+		const double h_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2);
+		const double h_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2);
+
+		//multipliers built with Hiy(j2,k)
+		gsl_blas_daxpy (dd*h_j1i2*y_j2, &col_i.vector, xHiDHiDHiy_gg);
+		gsl_blas_daxpy (h_j1i2*y_j2, &col_i.vector, xHiDHiDHiy_ee);
+		gsl_blas_daxpy (delta*h_j1i2*y_j2, &col_i.vector, xHiDHiDHiy_ge);
+		if (swap1) {
+			gsl_blas_daxpy (dd*h_i1i2*y_j2, &col_j.vector, xHiDHiDHiy_gg);
+			gsl_blas_daxpy (h_i1i2*y_j2, &col_j.vector, xHiDHiDHiy_ee);
+			gsl_blas_daxpy (delta*h_i1i2*y_j2, &col_j.vector, xHiDHiDHiy_ge);
+		}
+
+		//nothing more to add when the second pair is diagonal
+		if (!swap2) {continue;}
+
+		//multipliers built with Hiy(i2,k)
+		gsl_blas_daxpy (dd*h_j1j2*y_i2, &col_i.vector, xHiDHiDHiy_gg);
+		gsl_blas_daxpy (h_j1j2*y_i2, &col_i.vector, xHiDHiDHiy_ee);
+		gsl_blas_daxpy (delta*h_j1j2*y_i2, &col_i.vector, xHiDHiDHiy_ge);
+		if (swap1) {
+			gsl_blas_daxpy (dd*h_i1j2*y_i2, &col_j.vector, xHiDHiDHiy_gg);
+			gsl_blas_daxpy (h_i1j2*y_i2, &col_j.vector, xHiDHiDHiy_ee);
+			gsl_blas_daxpy (delta*h_i1j2*y_i2, &col_j.vector, xHiDHiDHiy_ge);
+		}
+	}
+
+	return;
+}
+//Accumulate over k scaled outer products of xHi columns; the base term is Hi_k(j1,i2)*xHi[:,k*d_size+i1]*xHi[:,k*d_size+j2]', with index-swapped terms added when i1!=j1 and/or i2!=j2.
+//Every term is added to _ee as is, to _ge multiplied by eval_k and to _gg multiplied by eval_k squared.
+void Calc_xHiDHiDHix (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const size_t i1, const size_t j1, const size_t i2, const size_t j2, gsl_matrix *xHiDHiDHix_gg, gsl_matrix *xHiDHiDHix_ee, gsl_matrix *xHiDHiDHix_ge)
+{
+	gsl_matrix_set_zero(xHiDHiDHix_gg);
+	gsl_matrix_set_zero(xHiDHiDHix_ee);
+	gsl_matrix_set_zero(xHiDHiDHix_ge);
+	
+	size_t n_size=eval->size, d_size=Hi->size1, dc_size=xHi->size1;
+	
+	double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+	//scratch matrix reused for every term; freed at the end
+	gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size);
+	
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		//columns of block k of xHi and entries of block k of Hi for the four index combinations
+		gsl_vector_const_view xHi_col_i1=gsl_matrix_const_column (xHi, k*d_size+i1);
+		gsl_vector_const_view xHi_col_j1=gsl_matrix_const_column (xHi, k*d_size+j1);
+		gsl_vector_const_view xHi_col_i2=gsl_matrix_const_column (xHi, k*d_size+i2);
+		gsl_vector_const_view xHi_col_j2=gsl_matrix_const_column (xHi, k*d_size+j2);	
+		
+		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); 
+		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); 
+		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); 
+		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); 
+		
+		if (i1==j1) {
+			gsl_matrix_set_zero (mat_dcdc);
+			gsl_blas_dger (d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector, mat_dcdc);
+			//add the term three times, scaling by delta in between: ee gets it as is, ge times delta, gg times delta squared
+			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+
+			if (i2!=j2) {
+				gsl_matrix_set_zero (mat_dcdc);
+				gsl_blas_dger (d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector, mat_dcdc);
+				
+				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+			}
+		} else {
+			gsl_matrix_set_zero (mat_dcdc);
+			gsl_blas_dger (d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector, mat_dcdc);
+			
+			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+			//same term with i1 and j1 exchanged
+			gsl_matrix_set_zero (mat_dcdc);
+			gsl_blas_dger (d_Hi_i1i2, &xHi_col_j1.vector, &xHi_col_j2.vector, mat_dcdc);
+			
+			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+			
+			if (i2!=j2) {
+				gsl_matrix_set_zero (mat_dcdc);
+				gsl_blas_dger (d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector, mat_dcdc);
+				
+				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+				//both pairs exchanged
+				gsl_matrix_set_zero (mat_dcdc);
+				gsl_blas_dger (d_Hi_i1j2, &xHi_col_j1.vector, &xHi_col_i2.vector, mat_dcdc);
+				
+				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+			}
+		}
+	}
+	
+	gsl_matrix_free(mat_dcdc);
+	
+	return;
+}
+
+
+
+//Sum over k of Hi_k(j,i), the (j,i) entry of the block of Hi that starts at column k*d_size:
+//tHiD_e is the plain sum and tHiD_g the sum weighted by eval_k; both are doubled when i!=j.
+void Calc_traceHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i, const size_t j, double &tHiD_g, double &tHiD_e) 
+{
+	tHiD_g=0.0;
+	tHiD_e=0.0;
+
+	const size_t n_size=eval->size;
+	const size_t d_size=Hi->size1;
+	//off-diagonal pairs are counted twice; the factor 1.0 leaves the i==j case untouched
+	const double mult=(i==j) ? 1.0 : 2.0;
+
+	for (size_t k=0; k!=n_size; ++k) {
+		const double delta=gsl_vector_get (eval, k);
+		//note the index order: row j, column i of block k
+		const double entry=gsl_matrix_get (Hi, j, k*d_size+i);
+
+		tHiD_g+=delta*entry*mult;
+		tHiD_e+=entry*mult;
+	}
+
+	return;
+}
+
+
+//Sum over k of products of two entries of Hi_k (the block of Hi starting at column k*d_size), taken over the index
+//combinations of (i1,j1) and (i2,j2) and their swaps. Outputs: _ee plain, _ge weighted by eval_k, _gg by eval_k squared.
+void Calc_traceHiDHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tHiDHiD_gg, double &tHiDHiD_ee, double &tHiDHiD_ge) 
+{
+	tHiDHiD_gg=0.0;
+	tHiDHiD_ee=0.0;
+	tHiDHiD_ge=0.0;
+
+	const size_t n_size=eval->size, d_size=Hi->size1;
+
+	for (size_t k=0; k!=n_size; ++k) {
+		const double delta=gsl_vector_get (eval, k);
+		const double dd=delta*delta;
+		const double h_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2);
+		const double h_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2);
+		const double h_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2);
+		const double h_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2);
+
+		if (i1==j1) {
+			tHiDHiD_gg+=dd*h_i1j2*h_j1i2;
+			tHiDHiD_ee+=h_i1j2*h_j1i2;
+			tHiDHiD_ge+=delta*h_i1j2*h_j1i2;
+			if (i2!=j2) {
+				tHiDHiD_gg+=dd*h_i1i2*h_j1j2;
+				tHiDHiD_ee+=h_i1i2*h_j1j2;
+				tHiDHiD_ge+=delta*h_i1i2*h_j1j2;
+			}
+			continue;
+		}
+		//i1!=j1: each contribution is the sum of a product and its (i1,j1)-swapped counterpart
+		const double first=h_i1j2*h_j1i2+h_j1j2*h_i1i2;
+		tHiDHiD_gg+=dd*first;
+		tHiDHiD_ee+=first;
+		tHiDHiD_ge+=delta*first;
+		if (i2!=j2) {
+			const double second=h_i1i2*h_j1j2+h_j1i2*h_i1j2;
+			tHiDHiD_gg+=dd*second;
+			tHiDHiD_ee+=second;
+			tHiDHiD_ge+=delta*second;
+		}
+	}
+}
+
+
+//trace(PD)=trace((Hi-HixQixHi)D)=trace(HiD)-trace(HixQixHiD)
+//The pair (i, j) selects block GetIndex(i, j, d_size) of xHiDHix_all_g/_e, read from column GetIndex(i, j, d_size)*dc_size onwards.
+void Calc_tracePD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHiDHix_all_g, const gsl_matrix *xHiDHix_all_e, const size_t i, const size_t j, double &tPD_g, double &tPD_e) 
+{
+	const size_t dc_size=Qi->size1;
+	const size_t d_size=Hi->size1;
+	const size_t blk_start=GetIndex(i, j, d_size)*dc_size;
+
+	//first part: trace(HiD) initialises both outputs
+	Calc_traceHiD (eval, Hi, i, j, tPD_g, tPD_e);
+
+	//second part: subtract, for every r, the dot product of row r of Qi with column r of the selected block
+	for (size_t r=0; r!=dc_size; ++r) {
+		double dot;
+		gsl_vector_const_view q_row=gsl_matrix_const_row (Qi, r);
+		gsl_vector_const_view g_col=gsl_matrix_const_column (xHiDHix_all_g, blk_start+r);
+		gsl_vector_const_view e_col=gsl_matrix_const_column (xHiDHix_all_e, blk_start+r);
+
+		gsl_blas_ddot(&q_row.vector, &g_col.vector, &dot);
+		tPD_g-=dot;
+		gsl_blas_ddot(&q_row.vector, &e_col.vector, &dot);
+		tPD_e-=dot;
+	}
+	return;
+}
+
+
+
+//trace(PDPD)=trace((Hi-HixQixHi)D(Hi-HixQixHi)D)
+//=trace(HiDHiD)-trace(HixQixHiDHiD)-trace(HiDHixQixHiD)+trace(HixQixHiDHixQixHiD)
+//The pairs (i1, j1) and (i2, j2) are mapped to packed indices v1 and v2 with GetIndex. Block v of
+//QixHiDHix_all_g/_e is read from column v*dc_size onwards, and block (v1, v2) of the
+//xHiDHiDHix_all_* matrices from column (v1*v_size+v2)*dc_size onwards.
+//xHi is part of the signature but is not read in this function.
+void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tPDPD_gg, double &tPDPD_ee, double &tPDPD_ge) 
+{
+	const size_t dc_size=Qi->size1;
+	const size_t d_size=Hi->size1;
+	const size_t v_size=d_size*(d_size+1)/2;
+	const size_t v1=GetIndex(i1, j1, d_size);
+	const size_t v2=GetIndex(i2, j2, d_size);
+	//starting columns of the blocks that are read below
+	const size_t pair_start=(v1*v_size+v2)*dc_size;
+	const size_t start1=v1*dc_size;
+	const size_t start2=v2*dc_size;
+	double dot;
+
+	//calculate the first part: trace(HiDHiD); this initialises the three outputs
+	Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge);
+
+	//calculate the second and third parts: -trace(HixQixHiDHiD)-trace(HiDHixQixHiD)
+	//one dot product per row is computed and subtracted twice (factor 2.0) to cover both traces
+	for (size_t r=0; r!=dc_size; ++r) {
+		//row r of Qi against column r of the (v1, v2) block
+		gsl_vector_const_view q_row=gsl_matrix_const_row (Qi, r);
+		gsl_vector_const_view gg_col=gsl_matrix_const_column (xHiDHiDHix_all_gg, pair_start+r);
+		gsl_vector_const_view ee_col=gsl_matrix_const_column (xHiDHiDHix_all_ee, pair_start+r);
+		gsl_vector_const_view ge_col=gsl_matrix_const_column (xHiDHiDHix_all_ge, pair_start+r);
+
+		gsl_blas_ddot(&q_row.vector, &gg_col.vector, &dot);
+		tPDPD_gg-=dot*2.0;
+		gsl_blas_ddot(&q_row.vector, &ee_col.vector, &dot);
+		tPDPD_ee-=dot*2.0;
+		gsl_blas_ddot(&q_row.vector, &ge_col.vector, &dot);
+		tPDPD_ge-=dot*2.0;
+	}
+
+	//calculate the fourth part: trace(HixQixHiDHixQixHiD)
+	for (size_t r=0; r!=dc_size; ++r) {
+		//row r of block v1: take the full row first, then cut out the dc_size entries of the block
+		gsl_vector_const_view g_full=gsl_matrix_const_row (QixHiDHix_all_g, r);
+		gsl_vector_const_view e_full=gsl_matrix_const_row (QixHiDHix_all_e, r);
+		gsl_vector_const_view g_row1=gsl_vector_const_subvector (&g_full.vector, start1, dc_size);
+		gsl_vector_const_view e_row1=gsl_vector_const_subvector (&e_full.vector, start1, dc_size);
+
+		//column r of block v2
+		gsl_vector_const_view g_col2=gsl_matrix_const_column (QixHiDHix_all_g, start2+r);
+		gsl_vector_const_view e_col2=gsl_matrix_const_column (QixHiDHix_all_e, start2+r);
+
+		//gg and ee pair like with like; ge pairs the g row with the e column
+		gsl_blas_ddot(&g_row1.vector, &g_col2.vector, &dot);
+		tPDPD_gg+=dot;
+		gsl_blas_ddot(&e_row1.vector, &e_col2.vector, &dot);
+		tPDPD_ee+=dot;
+		gsl_blas_ddot(&g_row1.vector, &e_col2.vector, &dot);
+		tPDPD_ge+=dot;
+	}
+
+	return;
+}
+
+
+
+//calculate (xHiDHiy) for every pair of i j with j>=i;
+//the result for pair (i, j) is written to column GetIndex(i, j, d_size) of xHiDHiy_all_g and xHiDHiy_all_e
+void Calc_xHiDHiy_all (const gsl_vector *eval, const gsl_matrix *xHi, const gsl_matrix *Hiy, gsl_matrix *xHiDHiy_all_g, gsl_matrix *xHiDHiy_all_e)
+{
+	gsl_matrix_set_zero(xHiDHiy_all_g);
+	gsl_matrix_set_zero(xHiDHiy_all_e);
+
+	const size_t d_size=Hiy->size1;
+
+	//only the upper triangle (j>=i) is visited
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			const size_t v=GetIndex(i, j, d_size);
+
+			gsl_vector_view out_g=gsl_matrix_column (xHiDHiy_all_g, v);
+			gsl_vector_view out_e=gsl_matrix_column (xHiDHiy_all_e, v);
+
+			Calc_xHiDHiy (eval, xHi, Hiy, i, j, &out_g.vector, &out_e.vector);
+		}
+	}
+	return;
+}
+
+
+//calculate (xHiDHix) for every pair of i j with j>=i;
+//the dc_size by dc_size result for pair (i, j) is written to the block starting at column GetIndex(i, j, d_size)*dc_size
+void Calc_xHiDHix_all (const gsl_vector *eval, const gsl_matrix *xHi, gsl_matrix *xHiDHix_all_g, gsl_matrix *xHiDHix_all_e)
+{
+	gsl_matrix_set_zero(xHiDHix_all_g);
+	gsl_matrix_set_zero(xHiDHix_all_e);
+
+	const size_t d_size=xHi->size2/eval->size;
+	const size_t dc_size=xHi->size1;
+
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			const size_t start=GetIndex(i, j, d_size)*dc_size;
+
+			gsl_matrix_view out_g=gsl_matrix_submatrix (xHiDHix_all_g, 0, start, dc_size, dc_size);
+			gsl_matrix_view out_e=gsl_matrix_submatrix (xHiDHix_all_e, 0, start, dc_size, dc_size);
+
+			Calc_xHiDHix (eval, xHi, i, j, &out_g.matrix, &out_e.matrix);
+		}
+	}
+	return;
+}
+
+
+
+//calculate (xHiDHiDHiy) for every combination of a pair (i1, j1), j1>=i1, with a pair (i2, j2), j2>=i2;
+//with v1 and v2 the GetIndex values of the two pairs, the result goes to column v1*v_size+v2 of each output
+void Calc_xHiDHiDHiy_all (const size_t v_size, const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, gsl_matrix *xHiDHiDHiy_all_gg, gsl_matrix *xHiDHiDHiy_all_ee, gsl_matrix *xHiDHiDHiy_all_ge)
+{
+	gsl_matrix_set_zero(xHiDHiDHiy_all_gg);
+	gsl_matrix_set_zero(xHiDHiDHiy_all_ee);
+	gsl_matrix_set_zero(xHiDHiDHiy_all_ge);
+
+	const size_t d_size=Hiy->size1;
+
+	for (size_t i1=0; i1<d_size; i1++) {
+		for (size_t j1=i1; j1<d_size; j1++) {
+			const size_t v1=GetIndex(i1, j1, d_size);
+
+			for (size_t i2=0; i2<d_size; i2++) {
+				for (size_t j2=i2; j2<d_size; j2++) {
+					const size_t v2=GetIndex(i2, j2, d_size);
+					const size_t col=v1*v_size+v2;
+
+					//all (v1, v2) combinations are computed here, including v2<v1
+					gsl_vector_view out_gg=gsl_matrix_column (xHiDHiDHiy_all_gg, col);
+					gsl_vector_view out_ee=gsl_matrix_column (xHiDHiDHiy_all_ee, col);
+					gsl_vector_view out_ge=gsl_matrix_column (xHiDHiDHiy_all_ge, col);
+
+					Calc_xHiDHiDHiy (eval, Hi, xHi, Hiy, i1, j1, i2, j2, &out_gg.vector, &out_ee.vector, &out_ge.vector);
+				}
+			}
+		}
+	}
+	return;
+}
+
+
+//Fill the three outputs with the xHiDHiDHix blocks of every ordered pair (v1, v2) of vectorized index
+//pairs; the dc_size x dc_size block of (v1, v2) starts at column (v1*v_size+v2)*dc_size.
+//Calc_xHiDHiDHix is called only for v2>=v1 and the result is copied as is into the (v2, v1) slot.
+void Calc_xHiDHiDHix_all (const size_t v_size, const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, gsl_matrix *xHiDHiDHix_all_gg, gsl_matrix *xHiDHiDHix_all_ee, gsl_matrix *xHiDHiDHix_all_ge)
+{
+	gsl_matrix_set_zero(xHiDHiDHix_all_gg);
+	gsl_matrix_set_zero(xHiDHiDHix_all_ee);
+	gsl_matrix_set_zero(xHiDHiDHix_all_ge);
+
+	const size_t dc_size=xHi->size1, d_size=xHi->size2/eval->size;
+	size_t v1, v2;
+
+	for (size_t i1=0; i1<d_size; i1++) {
+		for (size_t j1=i1; j1<d_size; j1++) {
+			v1=GetIndex(i1, j1, d_size);
+
+			for (size_t i2=0; i2<d_size; i2++) {
+				for (size_t j2=i2; j2<d_size; j2++) {
+					v2=GetIndex(i2, j2, d_size);
+					if (v2<v1) {continue;}
+
+					const size_t src=(v1*v_size+v2)*dc_size;
+					gsl_matrix_view src_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, src, dc_size, dc_size);
+					gsl_matrix_view src_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, src, dc_size, dc_size);
+					gsl_matrix_view src_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, src, dc_size, dc_size);
+
+					Calc_xHiDHiDHix (eval, Hi, xHi, i1, j1, i2, j2, &src_gg.matrix, &src_ee.matrix, &src_ge.matrix);
+
+					//diagonal combinations have no mirrored slot to fill
+					if (v2==v1) {continue;}
+					const size_t dst=(v2*v_size+v1)*dc_size;
+					gsl_matrix_view dst_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, dst, dc_size, dc_size);
+					gsl_matrix_view dst_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, dst, dc_size, dc_size);
+					gsl_matrix_view dst_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, dst, dc_size, dc_size);
+
+					gsl_matrix_memcpy (&dst_gg.matrix, &src_gg.matrix);
+					gsl_matrix_memcpy (&dst_ee.matrix, &src_ee.matrix);
+					gsl_matrix_memcpy (&dst_ge.matrix, &src_ge.matrix);
+				}
+			}
+		}
+	}
+
+	/* commented-out code, not compiled:
+	size_t n_size=eval->size;
+	double delta, d_Hi_ij;
+	
+	gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *mat_dcdc_temp=gsl_matrix_alloc (dc_size, dc_size);
+	
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		
+		for (size_t i1=0; i1<d_size; i1++) {
+			for (size_t j2=0; j2<d_size; j2++) {				
+				gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i1);
+				gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j2);
+		
+				gsl_matrix_set_zero (mat_dcdc);
+				gsl_blas_dger (1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc);	
+				
+				for (size_t j1=0; j1<d_size; j1++) {
+					for (size_t i2=0; i2<d_size; i2++) {
+						d_Hi_ij=gsl_matrix_get (Hi, j1, k*d_size+i2); 
+						
+						v1=GetIndex(i1, j1, d_size);
+						v2=GetIndex(i2, j2, d_size);						
+						
+						gsl_matrix_view xHiDHiDHix_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+						gsl_matrix_view xHiDHiDHix_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+						gsl_matrix_view xHiDHiDHix_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+												
+						gsl_matrix_memcpy (mat_dcdc_temp, mat_dcdc);
+						
+						gsl_matrix_scale (mat_dcdc_temp, d_Hi_ij);
+						gsl_matrix_add(&xHiDHiDHix_ee.matrix, mat_dcdc_temp);
+						gsl_matrix_scale(mat_dcdc_temp, delta);
+						gsl_matrix_add(&xHiDHiDHix_ge.matrix, mat_dcdc_temp);
+						gsl_matrix_scale(mat_dcdc_temp, delta);
+						gsl_matrix_add(&xHiDHiDHix_gg.matrix, mat_dcdc_temp);
+					}
+				}
+			}
+		}
+	}
+	
+	for (size_t i1=0; i1<d_size; i1++) {
+		for (size_t j1=0; j1<d_size; j1++) {
+			v1=GetIndex(i1, j1, d_size);
+			
+			for (size_t i2=0; i2<d_size; i2++) {
+				for (size_t j2=0; j2<d_size; j2++) {
+					v2=GetIndex(i2, j2, d_size);
+					
+					if (i1!=j1 && i2!=j2) {continue;}
+					
+					gsl_matrix_view xHiDHiDHix_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					gsl_matrix_view xHiDHiDHix_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					gsl_matrix_view xHiDHiDHix_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					
+					if ( (i1==j1 && i2!=j2) || (i1!=j1 && i2==j2) ) {
+						gsl_matrix_scale (&xHiDHiDHix_gg.matrix, 0.5);
+						gsl_matrix_scale (&xHiDHiDHix_ee.matrix, 0.5);
+						gsl_matrix_scale (&xHiDHiDHix_ge.matrix, 0.5);
+					} else {
+						gsl_matrix_scale (&xHiDHiDHix_gg.matrix, 0.25);
+						gsl_matrix_scale (&xHiDHiDHix_ee.matrix, 0.25);
+						gsl_matrix_scale (&xHiDHiDHix_ge.matrix, 0.25);
+					}
+				}
+			}
+		}
+	}
+	
+	gsl_matrix_free (mat_dcdc);
+	gsl_matrix_free (mat_dcdc_temp);	
+	*/
+	
+	return;
+}
+
+
+
+//For every block index k, multiply the k-th dc_size x dc_size block of xHiDHix_all_g/_e by QixHiy and
+//store the product, i.e. (xHiDHix)Qi(xHiy), in column k of xHiDHixQixHiy_all_g/_e.
+void Calc_xHiDHixQixHiy_all (const gsl_matrix *xHiDHix_all_g, const gsl_matrix *xHiDHix_all_e, const gsl_vector *QixHiy, gsl_matrix *xHiDHixQixHiy_all_g, gsl_matrix *xHiDHixQixHiy_all_e)
+{
+	const size_t dc_size=xHiDHix_all_g->size1;
+	const size_t n_blocks=xHiDHix_all_g->size2/dc_size;
+
+	for (size_t k=0; k<n_blocks; k++) {
+		const size_t offset=k*dc_size;
+		//_g part
+		gsl_matrix_const_view M_g=gsl_matrix_const_submatrix (xHiDHix_all_g, 0, offset, dc_size, dc_size);
+		gsl_vector_view out_g=gsl_matrix_column (xHiDHixQixHiy_all_g, k);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, &M_g.matrix, QixHiy, 0.0, &out_g.vector);
+		//_e part
+		gsl_matrix_const_view M_e=gsl_matrix_const_submatrix (xHiDHix_all_e, 0, offset, dc_size, dc_size);
+		gsl_vector_view out_e=gsl_matrix_column (xHiDHixQixHiy_all_e, k);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, &M_e.matrix, QixHiy, 0.0, &out_e.vector);
+	}
+}
+
+//Left-multiply every column of vec_all_g and vec_all_e by Qi and store the product in the same column
+//of Qivec_all_g and Qivec_all_e; used to form Qi(xHiDHiy) and Qi(xHiDHix)Qi(xHiy) for each pair i j (i<=j).
+void Calc_QiVec_all (const gsl_matrix *Qi, const gsl_matrix *vec_all_g, const gsl_matrix *vec_all_e, gsl_matrix *Qivec_all_g, gsl_matrix *Qivec_all_e)
+{
+	const size_t n_cols=vec_all_g->size2;
+
+	for (size_t k=0; k<n_cols; k++) {
+		gsl_vector_const_view in_g=gsl_matrix_const_column (vec_all_g, k);
+		gsl_vector_view out_g=gsl_matrix_column (Qivec_all_g, k);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &in_g.vector, 0.0, &out_g.vector);
+
+		gsl_vector_const_view in_e=gsl_matrix_const_column (vec_all_e, k);
+		gsl_vector_view out_e=gsl_matrix_column (Qivec_all_e, k);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &in_e.vector, 0.0, &out_e.vector);
+	}
+}
+
+
+//Left-multiply every dc_size x dc_size block of mat_all_g and mat_all_e by Qi and store the product in the
+//matching block of Qimat_all_g and Qimat_all_e; used to form Qi(xHiDHix) for each pair i j (i<=j).
+void Calc_QiMat_all (const gsl_matrix *Qi, const gsl_matrix *mat_all_g, const gsl_matrix *mat_all_e, gsl_matrix *Qimat_all_g, gsl_matrix *Qimat_all_e)
+{
+	const size_t dc_size=Qi->size1;
+	const size_t n_blocks=mat_all_g->size2/mat_all_g->size1;
+
+	for (size_t k=0; k<n_blocks; k++) {
+		const size_t offset=k*dc_size;
+		//_g part
+		gsl_matrix_const_view in_g=gsl_matrix_const_submatrix (mat_all_g, 0, offset, dc_size, dc_size);
+		gsl_matrix_view out_g=gsl_matrix_submatrix (Qimat_all_g, 0, offset, dc_size, dc_size);
+		gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, Qi, &in_g.matrix, 0.0, &out_g.matrix);
+		//_e part
+		gsl_matrix_const_view in_e=gsl_matrix_const_submatrix (mat_all_e, 0, offset, dc_size, dc_size);
+		gsl_matrix_view out_e=gsl_matrix_submatrix (Qimat_all_e, 0, offset, dc_size, dc_size);
+		gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, Qi, &in_e.matrix, 0.0, &out_e.matrix);
+	}
+}
+
+
+
+//calculate yPDPy for the index pair (i, j); the results are returned through yPDPy_g and yPDPy_e
+//yPDPy=y(Hi-HixQixHi)D(Hi-HixQixHi)y
+//=yHiDHiy
+//-(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy)
+//+(yHix)Qi(xHiDHix)Qi(xHiy)
+//xHiDHiy and (xHiDHix)Qi(xHiy) are read from column GetIndex(i, j, d_size) of the precomputed *_all inputs
+void Calc_yPDPy (const gsl_vector *eval, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const size_t i, const size_t j, double &yPDPy_g, double &yPDPy_e)
+{
+	const size_t d_size=Hiy->size1;
+	const size_t col=GetIndex(i, j, d_size);
+	double dot;
+
+	//term 1: yHiDHiy, computed by Calc_yHiDHiy into yPDPy_g and yPDPy_e
+	Calc_yHiDHiy (eval, Hiy, i, j, yPDPy_g, yPDPy_e);
+
+	//terms 2 and 3: -(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy), evaluated as one dot product counted twice
+	gsl_vector_const_view cross_g=gsl_matrix_const_column (xHiDHiy_all_g, col);
+	gsl_blas_ddot(QixHiy, &cross_g.vector, &dot);
+	yPDPy_g-=2.0*dot;
+
+	gsl_vector_const_view cross_e=gsl_matrix_const_column (xHiDHiy_all_e, col);
+	gsl_blas_ddot(QixHiy, &cross_e.vector, &dot);
+	yPDPy_e-=2.0*dot;
+
+	//term 4: +(yHix)Qi(xHiDHix)Qi(xHiy)
+	gsl_vector_const_view quad_g=gsl_matrix_const_column (xHiDHixQixHiy_all_g, col);
+	gsl_blas_ddot(QixHiy, &quad_g.vector, &dot);
+	yPDPy_g+=dot;
+
+	gsl_vector_const_view quad_e=gsl_matrix_const_column (xHiDHixQixHiy_all_e, col);
+	gsl_blas_ddot(QixHiy, &quad_e.vector, &dot);
+	yPDPy_e+=dot;
+
+	return;
+}
+
+//calculate yPDPDPy=y(Hi-HixQixHi)D(Hi-HixQixHi)D(Hi-HixQixHi)y for the index pairs (i1, j1) and (i2, j2); the results are returned through yPDPDPy_gg, yPDPDPy_ee and yPDPDPy_ge
+//yPDPDPy=yHiDHiDHiy
+//-(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy)
+//-(yHiDHix)Qi(xHiDHiy)
+//+(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy)
+//+(yHix)Qi(xHiDHiDHix)Qi(xHiy)
+//-(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy)
+void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *QixHiDHiy_all_g, const gsl_matrix *QixHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const gsl_matrix *QixHiDHixQixHiy_all_g, const gsl_matrix *QixHiDHixQixHiy_all_e, const gsl_matrix *xHiDHiDHiy_all_gg, const gsl_matrix *xHiDHiDHiy_all_ee, const gsl_matrix *xHiDHiDHiy_all_ge, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yPDPDPy_gg, double &yPDPDPy_ee, double &yPDPDPy_ge)
+{	
+	size_t d_size=Hi->size1, dc_size=xHi->size1;
+	size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size);
+	size_t v_size=d_size*(d_size+1)/2;	
+	//v1 and v2 select the columns/blocks that belong to (i1, j1) and (i2, j2) in the precomputed *_all inputs
+	double d;
+	//scratch vector for the seventh part; released at the end of the function
+	gsl_vector *xHiDHiDHixQixHiy=gsl_vector_alloc (dc_size);
+	
+	//first part: yHiDHiDHiy
+	Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge);	
+	
+	//second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy); read at column v1*v_size+v2 and at the swapped column v2*v_size+v1
+	gsl_vector_const_view xHiDHiDHiy_gg1=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v1*v_size+v2);
+	gsl_vector_const_view xHiDHiDHiy_ee1=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v1*v_size+v2);
+	gsl_vector_const_view xHiDHiDHiy_ge1=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v1*v_size+v2);
+	
+	gsl_vector_const_view xHiDHiDHiy_gg2=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v2*v_size+v1);
+	gsl_vector_const_view xHiDHiDHiy_ee2=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v2*v_size+v1);
+	gsl_vector_const_view xHiDHiDHiy_ge2=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v2*v_size+v1);
+	
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d); 
+	yPDPDPy_gg-=d;
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d); 
+	yPDPDPy_ee-=d;
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d); 
+	yPDPDPy_ge-=d;
+	
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d); 
+	yPDPDPy_gg-=d;
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d); 
+	yPDPDPy_ee-=d;
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d); 
+	yPDPDPy_ge-=d;
+	
+	//fourth part: -(yHiDHix)Qi(xHiDHiy); in the dot products formed here for the _ge result, the _g vector belongs to (i1, j1) and the _e vector to (i2, j2)
+	gsl_vector_const_view xHiDHiy_g1=gsl_matrix_const_column (xHiDHiy_all_g, v1);
+	gsl_vector_const_view xHiDHiy_e1=gsl_matrix_const_column (xHiDHiy_all_e, v1);
+	gsl_vector_const_view QixHiDHiy_g2=gsl_matrix_const_column (QixHiDHiy_all_g, v2);
+	gsl_vector_const_view QixHiDHiy_e2=gsl_matrix_const_column (QixHiDHiy_all_e, v2);
+	
+	gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_g2.vector, &d);
+	yPDPDPy_gg-=d;
+	gsl_blas_ddot(&xHiDHiy_e1.vector, &QixHiDHiy_e2.vector, &d);
+	yPDPDPy_ee-=d;
+	gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_e2.vector, &d);
+	yPDPDPy_ge-=d;
+	
+	//fifth and sixth parts: +(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy)
+	gsl_vector_const_view QixHiDHiy_g1=gsl_matrix_const_column (QixHiDHiy_all_g, v1);
+	gsl_vector_const_view QixHiDHiy_e1=gsl_matrix_const_column (QixHiDHiy_all_e, v1);
+	
+	gsl_vector_const_view xHiDHixQixHiy_g1=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v1);
+	gsl_vector_const_view xHiDHixQixHiy_e1=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v1);
+	gsl_vector_const_view xHiDHixQixHiy_g2=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v2);
+	gsl_vector_const_view xHiDHixQixHiy_e2=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v2);
+	
+	gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_g2.vector, &d);
+	yPDPDPy_gg+=d;
+	gsl_blas_ddot(&xHiDHixQixHiy_g2.vector, &QixHiDHiy_g1.vector, &d);
+	yPDPDPy_gg+=d;
+	
+	gsl_blas_ddot(&xHiDHixQixHiy_e1.vector, &QixHiDHiy_e2.vector, &d);
+	yPDPDPy_ee+=d;
+	gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_e1.vector, &d);
+	yPDPDPy_ee+=d;
+	
+	gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_e2.vector, &d);
+	yPDPDPy_ge+=d;
+	gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_g1.vector, &d);
+	yPDPDPy_ge+=d;
+
+	//seventh part: +(yHix)Qi(xHiDHiDHix)Qi(xHiy); uses the dc_size x dc_size block starting at column (v1*v_size+v2)*dc_size
+	gsl_matrix_const_view xHiDHiDHix_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+	gsl_matrix_const_view xHiDHiDHix_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+	gsl_matrix_const_view xHiDHiDHix_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+	
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_gg.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
+	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
+	yPDPDPy_gg+=d;
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ee.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
+	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
+	yPDPDPy_ee+=d;
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ge.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
+	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
+	yPDPDPy_ge+=d;
+		
+	//eighth part: -(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy)
+	gsl_vector_const_view QixHiDHixQixHiy_g1=gsl_matrix_const_column (QixHiDHixQixHiy_all_g, v1);
+	gsl_vector_const_view QixHiDHixQixHiy_e1=gsl_matrix_const_column (QixHiDHixQixHiy_all_e, v1);
+	
+	gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_g2.vector, &d);
+	yPDPDPy_gg-=d;
+	gsl_blas_ddot(&QixHiDHixQixHiy_e1.vector, &xHiDHixQixHiy_e2.vector, &d);
+	yPDPDPy_ee-=d;
+	gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_e2.vector, &d);
+	yPDPDPy_ge-=d;
+	
+	//free memory	
+	gsl_vector_free(xHiDHiDHixQixHiy);	
+	
+	return;
+}
+
+
+//calculate Edgeworth correction factors for small samples; the results are returned through crt_a, crt_b and crt_c
+//notation and method follow Thomas J. Rothenberg, Econometrica 1984; 52 (4)
+//M=xHiDHix
+void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t d_size, double &crt_a, double &crt_b, double &crt_c)
+{
+	crt_a=0.0; crt_b=0.0; crt_c=0.0;
+	
+	size_t dc_size=Qi->size1, v_size=Hessian_inv->size1/2;	//Hessian_inv is indexed below as a 2*v_size x 2*v_size matrix
+	size_t c_size=dc_size/d_size;
+	double h_gg, h_ge, h_ee, d, B=0.0, C=0.0, D=0.0;
+	double trCg1, trCe1, trCg2, trCe2, trB_gg, trB_ge, trB_ee, trCC_gg, trCC_ge, trCC_ee, trD_gg=0.0, trD_ge=0.0, trD_ee=0.0;
+	//work matrices; all of them are freed before the function returns
+	gsl_matrix *QiMQi_g1=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQi_e1=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQi_g2=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQi_e2=gsl_matrix_alloc (dc_size, dc_size);
+	
+	gsl_matrix *QiMQisQisi_g1=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *QiMQisQisi_e1=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *QiMQisQisi_g2=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *QiMQisQisi_e2=gsl_matrix_alloc (d_size, d_size);
+	
+	gsl_matrix *QiMQiMQi_gg=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQiMQi_ge=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQiMQi_ee=gsl_matrix_alloc (dc_size, dc_size);
+	
+	gsl_matrix *QiMMQi_gg=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMMQi_ge=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMMQi_ee=gsl_matrix_alloc (dc_size, dc_size);
+	
+	gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size);	
+	
+	gsl_matrix *M_dd=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *M_dcdc=gsl_matrix_alloc (dc_size, dc_size);
+		
+	//copy the last d_size x d_size diagonal block of Qi into Qi_sub and invert it into Qi_si
+	gsl_matrix *Qi_sub=gsl_matrix_alloc (d_size, d_size);
+	
+	gsl_matrix_const_view Qi_s=gsl_matrix_const_submatrix (Qi, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+	
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (d_size);
+	
+	gsl_matrix_memcpy (Qi_sub, &Qi_s.matrix);
+	LUDecomp (Qi_sub, pmt, &sig);
+	LUInvert (Qi_sub, pmt, Qi_si);
+	
+	gsl_permutation_free(pmt);
+	gsl_matrix_free(Qi_sub);
+			
+	//calculate correction factors
+	for (size_t v1=0; v1<v_size; v1++) {
+		//calculate Qi(xHiDHix)Qi, and subpart of it
+		gsl_matrix_const_view QiM_g1=gsl_matrix_const_submatrix (QixHiDHix_all_g, 0, v1*dc_size, dc_size, dc_size);
+		gsl_matrix_const_view QiM_e1=gsl_matrix_const_submatrix (QixHiDHix_all_e, 0, v1*dc_size, dc_size, dc_size);
+				
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, Qi, 0.0, QiMQi_g1);
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, Qi, 0.0, QiMQi_e1);
+		
+		gsl_matrix_view QiMQi_g1_s=gsl_matrix_submatrix (QiMQi_g1, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+		gsl_matrix_view QiMQi_e1_s=gsl_matrix_submatrix (QiMQi_e1, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+		
+		/*
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<setprecision(6)<<gsl_matrix_get(&QiMQi_g1_s.matrix, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+*/
+		//calculate trCg1 and trCe1 (negative traces of the sub-block times Qi_si)
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_g1_s.matrix, Qi_si, 0.0, QiMQisQisi_g1);
+		trCg1=0.0;
+		for (size_t k=0; k<d_size; k++) {
+			trCg1-=gsl_matrix_get (QiMQisQisi_g1, k, k);
+		}
+		
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_e1_s.matrix, Qi_si, 0.0, QiMQisQisi_e1);
+		trCe1=0.0;
+		for (size_t k=0; k<d_size; k++) {
+			trCe1-=gsl_matrix_get (QiMQisQisi_e1, k, k);
+		}
+		/*
+		cout<<v1<<endl;
+		cout<<"trCg1 = "<<trCg1<<", trCe1 = "<<trCe1<<endl;	
+		*/
+		for (size_t v2=0; v2<v_size; v2++) {
+			if (v2<v1) {continue;}	//only v2>=v1 is computed; contributions with v1!=v2 are added twice below
+			
+			//calculate Qi(xHiDHix)Qi, and subpart of it
+			gsl_matrix_const_view QiM_g2=gsl_matrix_const_submatrix (QixHiDHix_all_g, 0, v2*dc_size, dc_size, dc_size);
+			gsl_matrix_const_view QiM_e2=gsl_matrix_const_submatrix (QixHiDHix_all_e, 0, v2*dc_size, dc_size, dc_size);
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g2.matrix, Qi, 0.0, QiMQi_g2);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e2.matrix, Qi, 0.0, QiMQi_e2);
+			
+			gsl_matrix_view QiMQi_g2_s=gsl_matrix_submatrix (QiMQi_g2, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMQi_e2_s=gsl_matrix_submatrix (QiMQi_e2, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			
+			//calculate trCg2 and trCe2
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_g2_s.matrix, Qi_si, 0.0, QiMQisQisi_g2);
+			trCg2=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCg2-=gsl_matrix_get (QiMQisQisi_g2, k, k);
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_e2_s.matrix, Qi_si, 0.0, QiMQisQisi_e2);
+			trCe2=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCe2-=gsl_matrix_get (QiMQisQisi_e2, k, k);
+			}
+			
+			//calculate trCC_gg, trCC_ge, trCC_ee; trCC_ge sums the g1*e2 and e1*g2 products
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_g1, QiMQisQisi_g2, 0.0, M_dd);
+			trCC_gg=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCC_gg+=gsl_matrix_get (M_dd, k, k);
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_g1, QiMQisQisi_e2, 0.0, M_dd);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_e1, QiMQisQisi_g2, 1.0, M_dd);
+			trCC_ge=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCC_ge+=gsl_matrix_get (M_dd, k, k);
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_e1, QiMQisQisi_e2, 0.0, M_dd);
+			trCC_ee=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCC_ee+=gsl_matrix_get (M_dd, k, k);
+			}
+						
+			//calculate Qi(xHiDHix)Qi(xHiDHix)Qi, and subpart of it			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, QiMQi_g2, 0.0, QiMQiMQi_gg);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, QiMQi_e2, 0.0, QiMQiMQi_ge);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, QiMQi_g2, 1.0, QiMQiMQi_ge);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, QiMQi_e2, 0.0, QiMQiMQi_ee);
+			
+			gsl_matrix_view QiMQiMQi_gg_s=gsl_matrix_submatrix (QiMQiMQi_gg, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMQiMQi_ge_s=gsl_matrix_submatrix (QiMQiMQi_ge, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMQiMQi_ee_s=gsl_matrix_submatrix (QiMQiMQi_ee, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+						
+			//and part of trB_gg, trB_ge, trB_ee
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_gg_s.matrix, Qi_si, 0.0, M_dd);
+			trB_gg=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				d=gsl_matrix_get (M_dd, k, k);
+				trB_gg-=d;
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_ge_s.matrix, Qi_si, 0.0, M_dd);
+			trB_ge=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				d=gsl_matrix_get (M_dd, k, k);
+				trB_ge-=d;
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_ee_s.matrix, Qi_si, 0.0, M_dd);
+			trB_ee=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				d=gsl_matrix_get (M_dd, k, k);
+				trB_ee-=d;
+			}
+			
+			//calculate Qi(xHiDHiDHix)Qi, and subpart of it	
+			gsl_matrix_const_view MM_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+			gsl_matrix_const_view MM_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+			gsl_matrix_const_view MM_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+						
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_gg.matrix, 0.0, M_dcdc);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_gg);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_ge.matrix, 0.0, M_dcdc);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_ge);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_ee.matrix, 0.0, M_dcdc);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_ee);
+			
+			gsl_matrix_view QiMMQi_gg_s=gsl_matrix_submatrix (QiMMQi_gg, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMMQi_ge_s=gsl_matrix_submatrix (QiMMQi_ge, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMMQi_ee_s=gsl_matrix_submatrix (QiMMQi_ee, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+												
+			//calculate the other part of trB_gg, trB_ge, trB_ee (the _ge trace is counted twice)
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_gg_s.matrix, Qi_si, 0.0, M_dd);
+			for (size_t k=0; k<d_size; k++) {
+				trB_gg+=gsl_matrix_get (M_dd, k, k);
+			}
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_ge_s.matrix, Qi_si, 0.0, M_dd);
+			for (size_t k=0; k<d_size; k++) {
+				trB_ge+=2.0*gsl_matrix_get (M_dd, k, k);
+			}
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_ee_s.matrix, Qi_si, 0.0, M_dd);
+			for (size_t k=0; k<d_size; k++) {
+				trB_ee+=gsl_matrix_get (M_dd, k, k);
+			}
+			
+			
+			//calculate trD_gg, trD_ge, trD_ee (twice the corresponding trB value)
+			trD_gg=2.0*trB_gg;
+			trD_ge=2.0*trB_ge;
+			trD_ee=2.0*trB_ee;
+			
+			//calculate B, C and D; h_gg, h_ge, h_ee are negated entries of Hessian_inv
+			h_gg=-1.0*gsl_matrix_get (Hessian_inv, v1, v2);
+			h_ge=-1.0*gsl_matrix_get (Hessian_inv, v1, v2+v_size);
+			h_ee=-1.0*gsl_matrix_get (Hessian_inv, v1+v_size, v2+v_size);
+			
+			B+=h_gg*trB_gg+h_ge*trB_ge+h_ee*trB_ee;
+			C+=h_gg*(trCC_gg+0.5*trCg1*trCg2)+h_ge*(trCC_ge+0.5*trCg1*trCe2+0.5*trCe1*trCg2)+h_ee*(trCC_ee+0.5*trCe1*trCe2);
+			D+=h_gg*(trCC_gg+0.5*trD_gg)+h_ge*(trCC_ge+0.5*trD_ge)+h_ee*(trCC_ee+0.5*trD_ee);
+			//pairs with v1!=v2 are added a second time because the loop skips v2<v1
+			if (v1!=v2) {
+				B+=h_gg*trB_gg+h_ge*trB_ge+h_ee*trB_ee;
+				C+=h_gg*(trCC_gg+0.5*trCg1*trCg2)+h_ge*(trCC_ge+0.5*trCg1*trCe2+0.5*trCe1*trCg2)+h_ee*(trCC_ee+0.5*trCe1*trCe2);
+				D+=h_gg*(trCC_gg+0.5*trD_gg)+h_ge*(trCC_ge+0.5*trD_ge)+h_ee*(trCC_ee+0.5*trD_ee);
+			}
+			
+			/*
+			cout<<v1<<"\t"<<v2<<endl;
+			cout<<h_gg<<"\t"<<h_ge<<"\t"<<h_ee<<endl;
+			cout<<trB_gg<<"\t"<<trB_ge<<"\t"<<trB_ee<<endl;
+			cout<<trCg1<<"\t"<<trCe1<<"\t"<<trCg2<<"\t"<<trCe2<<endl;
+			cout<<trCC_gg<<"\t"<<trCC_ge<<"\t"<<trCC_ee<<endl;
+			cout<<trD_gg<<"\t"<<trD_ge<<"\t"<<trD_ee<<endl;
+			*/
+		}
+	}
+	
+	//calculate a, b, c from B C D
+	crt_a=2.0*D-C;
+	crt_b=2.0*B;
+	crt_c=C;
+	/*
+	cout<<B<<"\t"<<C<<"\t"<<D<<endl;
+	cout<<setprecision(6)<<crt_a<<"\t"<<crt_b<<"\t"<<crt_c<<endl;
+	*/
+	//free matrix memory
+	gsl_matrix_free(QiMQi_g1);
+	gsl_matrix_free(QiMQi_e1);
+	gsl_matrix_free(QiMQi_g2);
+	gsl_matrix_free(QiMQi_e2);
+	
+	gsl_matrix_free(QiMQisQisi_g1);
+	gsl_matrix_free(QiMQisQisi_e1);
+	gsl_matrix_free(QiMQisQisi_g2);
+	gsl_matrix_free(QiMQisQisi_e2);
+	
+	gsl_matrix_free(QiMQiMQi_gg);
+	gsl_matrix_free(QiMQiMQi_ge);
+	gsl_matrix_free(QiMQiMQi_ee);
+	
+	gsl_matrix_free(QiMMQi_gg);
+	gsl_matrix_free(QiMMQi_ge);
+	gsl_matrix_free(QiMMQi_ee);
+	
+	gsl_matrix_free(Qi_si);
+	
+	gsl_matrix_free(M_dd);
+	gsl_matrix_free(M_dcdc);
+	
+	return;
+}
+
+
+
+
+
+//calculate first-order and second-order derivatives for func_name 'R'/'r' (log-restricted likelihood) or 'L'/'l' (log-likelihood): writes gradient (the _g entry of pair v at index v, the _e entry at v+v_size), the inverse of the 2*v_size x 2*v_size Hessian into Hessian_inv, and the correction factors crt_a, crt_b, crt_c
+void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, gsl_vector *gradient, gsl_matrix *Hessian_inv, double &crt_a, double &crt_b, double &crt_c)
+{	
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;}
+	//NOTE: on an invalid func_name the function returns above without writing gradient, Hessian_inv or crt_a/crt_b/crt_c
+	size_t dc_size=Qi->size1, d_size=Hi->size1;
+	size_t c_size=dc_size/d_size;
+	size_t v_size=d_size*(d_size+1)/2;
+	size_t v1, v2;
+	double dev1_g, dev1_e, dev2_gg, dev2_ee, dev2_ge;
+
+	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);
+	//work matrices: one column (vectors) or one dc_size-wide block (matrices) per index pair, or per ordered pair of index pairs (v_size*v_size)
+	gsl_matrix *xHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *xHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *xHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size);
+	gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size);		
+	gsl_matrix *xHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *xHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
+	
+	gsl_matrix *QixHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *QixHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *QixHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size);
+	gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size);	
+	gsl_matrix *QixHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *QixHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
+	
+	gsl_matrix *xHiDHiDHiy_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size);
+	gsl_matrix *xHiDHiDHiy_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size);
+	gsl_matrix *xHiDHiDHiy_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size);
+	gsl_matrix *xHiDHiDHix_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);
+	gsl_matrix *xHiDHiDHix_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);
+	gsl_matrix *xHiDHiDHix_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);
+	
+	//calculate xHiDHiy_all, xHiDHix_all and xHiDHixQixHiy_all
+	Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e);	
+	Calc_xHiDHix_all (eval, xHi, xHiDHix_all_g, xHiDHix_all_e);
+	Calc_xHiDHixQixHiy_all (xHiDHix_all_g, xHiDHix_all_e, QixHiy, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e);
+	
+	Calc_xHiDHiDHiy_all (v_size, eval, Hi, xHi, Hiy, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge);
+	Calc_xHiDHiDHix_all (v_size, eval, Hi, xHi, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge);
+	
+	//calculate QixHiDHiy_all, QixHiDHix_all and QixHiDHixQixHiy_all
+	Calc_QiVec_all (Qi, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e);
+	Calc_QiVec_all (Qi, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e);
+	Calc_QiMat_all (Qi, xHiDHix_all_g, xHiDHix_all_e, QixHiDHix_all_g, QixHiDHix_all_e);
+		
+	double tHiD_g, tHiD_e, tPD_g, tPD_e, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge, tPDPD_gg, tPDPD_ee, tPDPD_ge;
+	double yPDPy_g, yPDPy_e, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge;
+
+	//calculate the gradient entries for every index pair (i1<=j1) and the Hessian entries for every second pair with v2>=v1
+	for (size_t i1=0; i1<d_size; i1++) {
+		for (size_t j1=0; j1<d_size; j1++) {
+			if (j1<i1) {continue;}
+			v1=GetIndex (i1, j1, d_size);
+
+			Calc_yPDPy (eval, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, i1, j1, yPDPy_g, yPDPy_e);
+			
+			if (func_name=='R' || func_name=='r') {				
+				Calc_tracePD (eval, Qi, Hi, xHiDHix_all_g, xHiDHix_all_e, i1, j1, tPD_g, tPD_e);				
+				//cout<<i1<<" "<<j1<<" "<<yPDPy_g<<" "<<yPDPy_e<<" "<<tPD_g<<" "<<tPD_e<<endl;
+				
+				dev1_g=-0.5*tPD_g+0.5*yPDPy_g;
+				dev1_e=-0.5*tPD_e+0.5*yPDPy_e;
+			} else {
+				Calc_traceHiD (eval, Hi, i1, j1, tHiD_g, tHiD_e);
+								
+				dev1_g=-0.5*tHiD_g+0.5*yPDPy_g;
+				dev1_e=-0.5*tHiD_e+0.5*yPDPy_e;
+			}
+
+			gsl_vector_set (gradient, v1, dev1_g);
+			gsl_vector_set (gradient, v1+v_size, dev1_e);
+			
+			for (size_t i2=0; i2<d_size; i2++) {
+				for (size_t j2=0; j2<d_size; j2++) {
+					if (j2<i2) {continue;}
+					v2=GetIndex (i2, j2, d_size);
+					
+					if (v2<v1) {continue;}
+
+					Calc_yPDPDPy (eval, Hi, xHi, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge);
+
+					//cout<<i1<<" "<<j1<<" "<<i2<<" "<<j2<<" "<<yPDPDPy_gg<<" "<<yPDPDPy_ee<<" "<<yPDPDPy_ge<<endl;
+					//AI for reml
+					if (func_name=='R' || func_name=='r') {
+						Calc_tracePDPD (eval, Qi, Hi, xHi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge);
+						
+						dev2_gg=0.5*tPDPD_gg-yPDPDPy_gg; 
+						dev2_ee=0.5*tPDPD_ee-yPDPDPy_ee; 
+						dev2_ge=0.5*tPDPD_ge-yPDPDPy_ge; 		
+						/*
+						dev2_gg=-0.5*yPDPDPy_gg; 
+						dev2_ee=-0.5*yPDPDPy_ee; 
+						dev2_ge=-0.5*yPDPDPy_ge; 
+						*/
+					} else {
+						Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge);
+						
+						dev2_gg=0.5*tHiDHiD_gg-yPDPDPy_gg; 
+						dev2_ee=0.5*tHiDHiD_ee-yPDPDPy_ee; 
+						dev2_ge=0.5*tHiDHiD_ge-yPDPDPy_ge; 
+					}
+
+					//set up Hessian; the same dev2_ge value is stored for (v1, v2) and (v2, v1) in both off-diagonal blocks
+					gsl_matrix_set (Hessian, v1, v2, dev2_gg);
+					gsl_matrix_set (Hessian, v1+v_size, v2+v_size, dev2_ee);
+					gsl_matrix_set (Hessian, v1, v2+v_size, dev2_ge);
+					gsl_matrix_set (Hessian, v2+v_size, v1, dev2_ge);
+					
+					if (v1!=v2) {
+						gsl_matrix_set (Hessian, v2, v1, dev2_gg);
+						gsl_matrix_set (Hessian, v2+v_size, v1+v_size, dev2_ee);
+						gsl_matrix_set (Hessian, v2, v1+v_size, dev2_ge);
+						gsl_matrix_set (Hessian, v1+v_size, v2, dev2_ge);
+					}
+				}
+			}
+		}
+	}
+	
+	/*
+	cout<<"Hessian: "<<endl;
+	for (size_t i=0; i<2*v_size; i++) {
+		for (size_t j=0; j<2*v_size; j++) {
+			cout<<gsl_matrix_get(Hessian, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	*/
+	
+	
+	//Invert Hessian into Hessian_inv; Hessian itself is freed right after
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (v_size*2);
+	
+	LUDecomp (Hessian, pmt, &sig);
+	LUInvert (Hessian, pmt, Hessian_inv);
+	/*
+	cout<<"Hessian Inverse: "<<endl;
+	for (size_t i=0; i<2*v_size; i++) {
+		for (size_t j=0; j<2*v_size; j++) {
+			cout<<gsl_matrix_get(Hessian_inv, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	*/
+	gsl_permutation_free(pmt);	
+	gsl_matrix_free(Hessian);
+	
+	//calculate Edgeworth correction factors
+	//after inverting Hessian; when c_size is not greater than 1 the factors are set to 0 instead
+	if (c_size>1) {
+		CalcCRT (Hessian_inv, Qi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, d_size, crt_a, crt_b, crt_c);
+	} else {
+		crt_a=0.0; crt_b=0.0; crt_c=0.0; 
+	}	
+	
+	gsl_matrix_free(xHiDHiy_all_g);
+	gsl_matrix_free(xHiDHiy_all_e);
+	gsl_matrix_free(xHiDHix_all_g);
+	gsl_matrix_free(xHiDHix_all_e);		
+	gsl_matrix_free(xHiDHixQixHiy_all_g);
+	gsl_matrix_free(xHiDHixQixHiy_all_e);
+	
+	gsl_matrix_free(QixHiDHiy_all_g);
+	gsl_matrix_free(QixHiDHiy_all_e);
+	gsl_matrix_free(QixHiDHix_all_g);
+	gsl_matrix_free(QixHiDHix_all_e);	
+	gsl_matrix_free(QixHiDHixQixHiy_all_g);
+	gsl_matrix_free(QixHiDHixQixHiy_all_e);
+	
+	gsl_matrix_free(xHiDHiDHiy_all_gg);
+	gsl_matrix_free(xHiDHiDHiy_all_ee);
+	gsl_matrix_free(xHiDHiDHiy_all_ge);
+	gsl_matrix_free(xHiDHiDHix_all_gg);
+	gsl_matrix_free(xHiDHiDHix_all_ee);
+	gsl_matrix_free(xHiDHiDHix_all_ge);
+	
+	return;
+}
+
+
+//Apply one scaled Newton-Raphson step to the variance components.
+//The upper triangles of V_g and V_e are packed into one vector: the (i,j)
+//entry of V_g goes to position GetIndex(i,j,d_size) and the matching entry
+//of V_e to that position plus gradient->size/2. step_scale*Hessian_inv*gradient
+//is then subtracted and the result is written back to both triangles, so
+//V_g and V_e stay symmetric.
+void UpdateVgVe (const gsl_matrix *Hessian_inv, const gsl_vector *gradient, const double step_scale, gsl_matrix *V_g, gsl_matrix *V_e)
+{
+	const size_t n_dim=V_g->size1;
+	const size_t half=gradient->size/2;
+	
+	gsl_vector *params=gsl_vector_alloc (half*2);
+	
+	//pack: only entries with col>=row are read
+	for (size_t row=0; row<n_dim; row++) {
+		for (size_t col=row; col<n_dim; col++) {
+			const size_t pos=GetIndex(row, col, n_dim);
+			
+			gsl_vector_set (params, pos, gsl_matrix_get (V_g, row, col));
+			gsl_vector_set (params, pos+half, gsl_matrix_get (V_e, row, col));
+		}
+	}
+	
+	//params <- params - step_scale * Hessian_inv * gradient
+	gsl_blas_dgemv (CblasNoTrans, -1.0*step_scale, Hessian_inv, gradient, 1.0, params);
+	
+	//unpack: mirror every updated entry across the diagonal
+	for (size_t row=0; row<n_dim; row++) {
+		for (size_t col=row; col<n_dim; col++) {
+			const size_t pos=GetIndex(row, col, n_dim);
+			const double vg_new=gsl_vector_get (params, pos);
+			const double ve_new=gsl_vector_get (params, pos+half);
+			
+			gsl_matrix_set (V_g, row, col, vg_new);
+			gsl_matrix_set (V_g, col, row, vg_new);
+			
+			gsl_matrix_set (V_e, row, col, ve_new);
+			gsl_matrix_set (V_e, col, row, ve_new);
+		}
+	}
+	
+	//the scratch vector is local to this call
+	gsl_vector_free(params);
+	
+	return;
+}
+//Newton-Raphson refinement of V_g and V_e.
+//func_name 'R'/'r' selects the restricted log-likelihood and 'L'/'l' the log-likelihood;
+//any other value prints a message and returns 0.0 before anything is allocated.
+//At most max_iter iterations are run; iteration stops once the gain in logl is below max_prec.
+//Hi_all, xHi_all and Hiy_all are passed to the Calc* helpers as buffers; V_g and V_e are updated in place.
+//Returns the last logl computed; on exit Hessian_inv has been multiplied by -1.0.
+double MphNR (const char func_name, const size_t max_iter, const double max_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, gsl_matrix *Hi_all, gsl_matrix *xHi_all, gsl_matrix *Hiy_all, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *Hessian_inv, double &crt_a, double &crt_b, double &crt_c)
+{
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return 0.0;}
+	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;
+	size_t dc_size=d_size*c_size;
+	size_t v_size=d_size*(d_size+1)/2;	//number of distinct entries of a symmetric d_size x d_size matrix
+	
+	double logdet_H, logdet_Q, yPy, logl_const, logl_old=0.0, logl_new=0.0, step_scale;
+	int sig;
+	size_t step_iter, flag_pd;
+	
+	gsl_matrix *Vg_save=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Ve_save=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_temp=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *U_temp=gsl_matrix_alloc (d_size, d_size);
+	gsl_vector *D_temp=gsl_vector_alloc (d_size);
+	gsl_vector *xHiy=gsl_vector_alloc (dc_size);
+	gsl_vector *QixHiy=gsl_vector_alloc (dc_size);	
+	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size);
+	
+	gsl_vector *gradient=gsl_vector_alloc (v_size*2);	
+	
+	//form XXt and LU-decompose it for LULndet below; the inverse is not computed (LUInvert is commented out)
+	gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt);
+	for (size_t i=0; i<c_size; ++i) {
+		for (size_t j=0; j<i; ++j) {
+			gsl_matrix_set (XXt, i, j, gsl_matrix_get (XXt, j, i));
+		}
+	}
+
+	gsl_permutation * pmt=gsl_permutation_alloc (c_size);
+	LUDecomp (XXt, pmt, &sig);
+	gsl_permutation_free (pmt);
+//	LUInvert (XXt, pmt, XXti);	
+	
+	//calculate the constant for logl	
+	if (func_name=='R' || func_name=='r') {		
+		logl_const=-0.5*(double)(n_size-c_size)*(double)d_size*log(2.0*M_PI)+0.5*(double)d_size*LULndet (XXt);
+	} else {
+		logl_const=-0.5*(double)n_size*(double)d_size*log(2.0*M_PI);
+	}
+	//optimization iterations
+	//NOTE(review): with max_iter==0 the loop never runs, so Hessian_inv and crt_* are not computed and 0.0 is returned
+	for (size_t t=0; t<max_iter; t++) {		
+		gsl_matrix_memcpy (Vg_save, V_g);
+		gsl_matrix_memcpy (Ve_save, V_e);
+
+		step_scale=1.0; step_iter=0;
+		do {
+			gsl_matrix_memcpy (V_g, Vg_save);
+			gsl_matrix_memcpy (V_e, Ve_save);
+			
+			//take a Newton step from the saved Vg, Ve; skipped at t==0, where the input values are evaluated as given
+			if (t!=0) {UpdateVgVe (Hessian_inv, gradient, step_scale, V_g, V_e);}
+			
+			//check if both Vg and Ve are positive definite
+			flag_pd=1;
+			gsl_matrix_memcpy (V_temp, V_e);
+			EigenDecomp(V_temp, U_temp, D_temp, 0);
+			for (size_t i=0; i<d_size; i++) {
+				if (gsl_vector_get (D_temp, i)<=0) {flag_pd=0;}
+			}
+			gsl_matrix_memcpy (V_temp, V_g);
+			EigenDecomp(V_temp, U_temp, D_temp, 0);
+			for (size_t i=0; i<d_size; i++) {
+				if (gsl_vector_get (D_temp, i)<=0) {flag_pd=0;}	
+			}
+			//NOTE(review): if flag_pd==0 at t==0, logl_new stays 0.0 and CalcDev below still runs on whatever Qi, Hi_all, ... hold
+			//if flag_pd==1 continue to calculate quantities and logl
+			if (flag_pd==1) {				
+				CalcHiQi (eval, X, V_g, V_e, Hi_all, Qi, logdet_H, logdet_Q);
+				Calc_Hiy_all (Y, Hi_all, Hiy_all);
+				Calc_xHi_all (X, Hi_all, xHi_all);
+				
+				//calculate QixHiy and yPy
+				Calc_xHiy (Y, xHi_all, xHiy);
+				gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, xHiy, 0.0, QixHiy);
+				
+				gsl_blas_ddot (QixHiy, xHiy, &yPy);
+				yPy=Calc_yHiy (Y, Hiy_all)-yPy;
+				
+				//calculate log likelihood/restricted likelihood value
+				if (func_name=='R' || func_name=='r') {	
+					logl_new=logl_const-0.5*logdet_H-0.5*logdet_Q-0.5*yPy;
+				} else {
+					logl_new=logl_const-0.5*logdet_H-0.5*yPy;
+				}				
+			}
+
+			step_scale/=2.0; //a retry uses half of the step just tried
+			step_iter++;
+									
+			//cout<<t<<"\t"<<step_iter<<"\t"<<logl_old<<"\t"<<logl_new<<"\t"<<flag_pd<<endl;
+		} while ( (flag_pd==0 || logl_new<logl_old || logl_new-logl_old>10 ) && step_iter<10 && t!=0);
+		//the loop above retries (at most 10 tries, never at t==0) while Vg/Ve are not PD, logl dropped, or logl rose by more than 10
+		//terminate: restore the saved Vg, Ve if the step still failed, or stop once the gain is below max_prec
+		if (t!=0) {
+			if (logl_new<logl_old || flag_pd==0) {
+				gsl_matrix_memcpy (V_g, Vg_save);
+				gsl_matrix_memcpy (V_e, Ve_save);
+				break;
+			}
+			
+			if (logl_new-logl_old<max_prec) {
+				break;
+			}
+		}
+
+		logl_old=logl_new;
+		
+		CalcDev (func_name, eval, Qi, Hi_all, xHi_all, Hiy_all, QixHiy, gradient, Hessian_inv, crt_a, crt_b, crt_c);
+		
+		
+		//output estimates in each iteration
+		/*
+		cout<<func_name<<" iteration = "<<t<<" log-likelihood = "<<logl_old<<"\t"<<logl_new<<endl;
+		
+		cout<<"Vg: "<<endl;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		cout<<"Ve: "<<endl;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		cout<<"Hessian: "<<endl;
+		for (size_t i=0; i<Hessian_inv->size1; i++) {
+			for (size_t j=0; j<Hessian_inv->size2; j++) {
+				cout<<gsl_matrix_get(Hessian_inv, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		*/
+	}
+	
+	//multiply Hessian_inv with -1.0
+	//now Hessian_inv is the variance matrix
+	gsl_matrix_scale (Hessian_inv, -1.0);
+	
+	gsl_matrix_free(Vg_save);
+	gsl_matrix_free(Ve_save);
+	gsl_matrix_free(V_temp);
+	gsl_matrix_free(U_temp);
+	gsl_vector_free(D_temp);
+	gsl_vector_free(xHiy);
+	gsl_vector_free(QixHiy);	
+	
+	gsl_matrix_free(Qi);
+	gsl_matrix_free(XXt);
+	
+	gsl_vector_free(gradient);
+	
+	return logl_new;
+}
+//initialize Vg, Ve and B (all three are zeroed first)
+//1. diagonals of V_g, V_e: per row of Y, CalcLambda ('R') followed by CalcLmmVgVeBeta
+//2. off-diagonals, only when d_size>4: MphEM then MphNR on every pair of rows of Y;
+//   for d_size<=4 the off-diagonals stay zero
+//3. B: computed from the resulting V_g, V_e with EigenProc, CalcQi and two dgemv products
+//X is c_size x n_size and Y is d_size x n_size, with n_size=eval->size
+void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter, const double nr_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B)
+{
+	gsl_matrix_set_zero (V_g);
+	gsl_matrix_set_zero (V_e);
+	gsl_matrix_set_zero (B);
+	
+	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;	
+	double a, b, c;
+	double lambda, logl, vg, ve;
+	
+	//Initial the diagonal elements of Vg and Ve using univariate LMM and REML estimates
+	gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size);	
+	gsl_vector *beta_temp=gsl_vector_alloc(c_size);
+	gsl_vector *se_beta_temp=gsl_vector_alloc(c_size);
+	
+	gsl_matrix_transpose_memcpy (Xt, X);	
+	
+	for (size_t i=0; i<d_size; i++) {
+		gsl_vector_const_view Y_row=gsl_matrix_const_row (Y, i);
+		CalcLambda ('R', eval, Xt, &Y_row.vector, l_min, l_max, n_region, lambda, logl);
+		CalcLmmVgVeBeta (eval, Xt, &Y_row.vector, lambda, vg, ve, beta_temp, se_beta_temp);
+		
+		gsl_matrix_set(V_g, i, i, vg);
+		gsl_matrix_set(V_e, i, i, ve);
+	}
+
+	gsl_matrix_free (Xt);
+	gsl_vector_free (beta_temp);
+	gsl_vector_free (se_beta_temp);
+	
+	//if number of phenotypes is above four, then obtain the off diagonal elements with two trait models
+	if (d_size>4) {
+		//first obtain good initial values
+		//large matrices for EM
+		gsl_matrix *U_hat=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *E_hat=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *OmegaU=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *OmegaE=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *UltVehiY=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *UltVehiBX=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *UltVehiU=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size);	
+		
+		//large matrices for NR
+		gsl_matrix *Hi_all=gsl_matrix_alloc (2, 2*n_size);		//each dxd block is H_k^{-1}
+		gsl_matrix *Hiy_all=gsl_matrix_alloc (2, n_size);				//each column is H_k^{-1}y_k
+		gsl_matrix *xHi_all=gsl_matrix_alloc (2*c_size, 2*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
+		gsl_matrix *Hessian=gsl_matrix_alloc (6, 6);
+		
+		//2 by n matrix of Y
+		gsl_matrix *Y_sub=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *Vg_sub=gsl_matrix_alloc (2, 2);
+		gsl_matrix *Ve_sub=gsl_matrix_alloc (2, 2);
+		gsl_matrix *B_sub=gsl_matrix_alloc (2, c_size);
+				
+		for (size_t i=0; i<d_size; i++) {
+			gsl_vector_view Y_sub1=gsl_matrix_row (Y_sub, 0);
+			gsl_vector_const_view Y_1=gsl_matrix_const_row (Y, i);
+			gsl_vector_memcpy (&Y_sub1.vector, &Y_1.vector);
+			
+			for (size_t j=i+1; j<d_size; j++) {
+				gsl_vector_view Y_sub2=gsl_matrix_row (Y_sub, 1);
+				gsl_vector_const_view Y_2=gsl_matrix_const_row (Y, j);
+				gsl_vector_memcpy (&Y_sub2.vector, &Y_2.vector);
+				//start the pair fit from the univariate diagonals, with zero covariance
+				gsl_matrix_set_zero (Vg_sub);
+				gsl_matrix_set_zero (Ve_sub);
+				gsl_matrix_set (Vg_sub, 0, 0, gsl_matrix_get (V_g, i, i));
+				gsl_matrix_set (Ve_sub, 0, 0, gsl_matrix_get (V_e, i, i));
+				gsl_matrix_set (Vg_sub, 1, 1, gsl_matrix_get (V_g, j, j));
+				gsl_matrix_set (Ve_sub, 1, 1, gsl_matrix_get (V_e, j, j));
+				
+				logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub);	
+				logl=MphNR ('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all, Hiy_all, Vg_sub, Ve_sub, Hessian, a, b, c);
+				//only the fitted (0,1) entries are copied back; the diagonals of V_g, V_e are left as they were
+				gsl_matrix_set(V_g, i, j, gsl_matrix_get (Vg_sub, 0, 1));
+				gsl_matrix_set(V_g, j, i, gsl_matrix_get (Vg_sub, 0, 1));
+				//the ve= inside the two calls below only overwrites the local ve, which is not read again
+				gsl_matrix_set(V_e, i, j, ve=gsl_matrix_get (Ve_sub, 0, 1));
+				gsl_matrix_set(V_e, j, i, ve=gsl_matrix_get (Ve_sub, 0, 1));
+			}
+		}
+		
+		//free matrices
+		gsl_matrix_free(U_hat);
+		gsl_matrix_free(E_hat);
+		gsl_matrix_free(OmegaU);
+		gsl_matrix_free(OmegaE);
+		gsl_matrix_free(UltVehiY);
+		gsl_matrix_free(UltVehiBX);
+		gsl_matrix_free(UltVehiU);
+		gsl_matrix_free(UltVehiE);	
+		
+		gsl_matrix_free(Hi_all);
+		gsl_matrix_free(Hiy_all);
+		gsl_matrix_free(xHi_all);
+		gsl_matrix_free(Hessian);
+		
+		gsl_matrix_free(Y_sub);
+		gsl_matrix_free(Vg_sub);
+		gsl_matrix_free(Ve_sub);
+		gsl_matrix_free(B_sub);
+		
+		/*
+		//second, maximize a increasingly large matrix
+		for (size_t i=1; i<d_size; i++) {		
+			//large matrices for EM
+			gsl_matrix *U_hat=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *E_hat=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *OmegaU=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *OmegaE=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *UltVehiY=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *UltVehiBX=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *UltVehiU=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *UltVehiE=gsl_matrix_alloc (i+1, n_size);	
+			
+			//large matrices for NR
+			gsl_matrix *Hi_all=gsl_matrix_alloc (i+1, (i+1)*n_size);		//each dxd block is H_k^{-1}
+			gsl_matrix *Hiy_all=gsl_matrix_alloc (i+1, n_size);				//each column is H_k^{-1}y_k
+			gsl_matrix *xHi_all=gsl_matrix_alloc ((i+1)*c_size, (i+1)*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
+			gsl_matrix *Hessian=gsl_matrix_alloc ((i+1)*(i+2), (i+1)*(i+2));
+			
+			//(i+1) by n matrix of Y
+			gsl_matrix *Y_sub=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *Vg_sub=gsl_matrix_alloc (i+1, i+1);
+			gsl_matrix *Ve_sub=gsl_matrix_alloc (i+1, i+1);
+			gsl_matrix *B_sub=gsl_matrix_alloc (i+1, c_size);
+			
+			gsl_matrix_const_view Y_sub_view=gsl_matrix_const_submatrix (Y, 0, 0, i+1, n_size);
+			gsl_matrix_view Vg_sub_view=gsl_matrix_submatrix (V_g, 0, 0, i+1, i+1);
+			gsl_matrix_view Ve_sub_view=gsl_matrix_submatrix (V_e, 0, 0, i+1, i+1);
+			
+			gsl_matrix_memcpy (Y_sub, &Y_sub_view.matrix);
+			gsl_matrix_memcpy (Vg_sub, &Vg_sub_view.matrix);
+			gsl_matrix_memcpy (Ve_sub, &Ve_sub_view.matrix);
+			
+			logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub);	
+			logl=MphNR ('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all, Hiy_all, Vg_sub, Ve_sub, Hessian, crt_a, crt_b, crt_c);
+			
+			gsl_matrix_memcpy (&Vg_sub_view.matrix, Vg_sub);
+			gsl_matrix_memcpy (&Ve_sub_view.matrix, Ve_sub);
+						
+			//free matrices
+			gsl_matrix_free(U_hat);
+			gsl_matrix_free(E_hat);
+			gsl_matrix_free(OmegaU);
+			gsl_matrix_free(OmegaE);
+			gsl_matrix_free(UltVehiY);
+			gsl_matrix_free(UltVehiBX);
+			gsl_matrix_free(UltVehiU);
+			gsl_matrix_free(UltVehiE);	
+			
+			gsl_matrix_free(Hi_all);
+			gsl_matrix_free(Hiy_all);
+			gsl_matrix_free(xHi_all);
+			gsl_matrix_free(Hessian);
+			
+			gsl_matrix_free(Y_sub);
+			gsl_matrix_free(Vg_sub);
+			gsl_matrix_free(Ve_sub);
+			gsl_matrix_free(B_sub);
+		}
+		 */
+	}
+	
+	//calculate B hat using GLS estimate (the original comment read "GSL"; presumably generalized least squares - TODO confirm)
+	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
+	
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Qi=gsl_matrix_alloc (d_size*c_size, d_size*c_size);
+	gsl_vector *XHiy=gsl_vector_alloc (d_size*c_size);
+	gsl_vector *beta=gsl_vector_alloc (d_size*c_size);
+	
+	gsl_vector_set_zero (XHiy);
+	
+	double logdet_Ve, logdet_Q, dl, d, delta, dx, dy;
+	//logdet_Ve and logdet_Q are assigned below but not read again in this function
+	//eigen decomposition and calculate log|Ve|
+	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);	
+	
+	//calculate Qi and log|Q|
+	logdet_Q=CalcQi (eval, D_l, X, Qi);	
+	
+	//calculate UltVehiY
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
+
+	//calculate XHiy: entry j*d_size+i is the sum over k of X(j,k)*UltVehiY(i,k)/(eval(k)*D_l(i)+1)
+	for (size_t i=0; i<d_size; i++) {
+		dl=gsl_vector_get(D_l, i);
+		
+		for (size_t j=0; j<c_size; j++) {	
+			d=0.0;
+			for (size_t k=0; k<n_size; k++) {
+				delta=gsl_vector_get(eval, k);
+				dx=gsl_matrix_get(X, j, k);
+				dy=gsl_matrix_get(UltVehiY, i, k);
+				
+				//if (delta==0) {continue;}			
+				d+=dy*dx/(delta*dl+1.0);
+			}
+			gsl_vector_set(XHiy, j*d_size+i, d);
+		}
+	}
+
+	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, XHiy, 0.0, beta);
+
+	//multiply beta by UltVeh and save to B: column i of B is UltVeh^T times the i-th length-d_size segment of beta
+	for (size_t i=0; i<c_size; i++) {
+		gsl_vector_view B_col=gsl_matrix_column (B, i);
+		gsl_vector_view beta_sub=gsl_vector_subvector (beta, i*d_size, d_size);		
+		gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, &beta_sub.vector, 0.0, &B_col.vector);
+	}
+
+	//free memory
+	gsl_matrix_free(UltVehiY);
+	
+	gsl_vector_free(D_l);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_matrix_free(Qi);
+	gsl_vector_free(XHiy);
+	gsl_vector_free(beta);
+		
+	return;
+}
+//p value correction
+//mode=1 Wald; mode=2 LRT; mode=3 SCORE;
+//p_value is mapped to a chi-square statistic with d_size degrees of freedom,
+//the statistic is rescaled using crt_a, crt_b, crt_c, and mapped back to a p value.
+//For any mode other than 1 or 2 the statistic is passed through unchanged.
+double PCRT (const size_t mode, const size_t d_size, const double p_value, const double crt_a, const double crt_b, const double crt_c)
+{
+	double p_crt=0.0, chisq_crt=0.0, q=(double)d_size;
+	double chisq=gsl_cdf_chisq_Qinv(p_value, (double)d_size );
+		
+	if (mode==1) {		
+		double a=crt_c/(2.0*q*(q+2.0));
+		double b=1.0+(crt_a+crt_b)/(2.0*q);		
+		//chisq_crt solves a*x^2+b*x=chisq; when a==0 (crt_c==0) the equation is
+		//linear and the quadratic root formula would divide by zero
+		chisq_crt=(a==0.0) ? chisq/b : (-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a);
+	} else if (mode==2) {
+		chisq_crt=chisq/(1.0+crt_a/(2.0*q) );		
+	} else {
+		//no correction is applied here; the commented-out formula below is not used
+		/*
+		double a=-1.0*crt_c/(2.0*q*(q+2.0));
+		double b=1.0+(crt_a-crt_b)/(2.0*q);	
+		chisq_crt=(-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a);
+		*/
+		chisq_crt=chisq;
+	}
+	
+	p_crt=gsl_cdf_chisq_Q (chisq_crt, (double)d_size );	
+	
+	return p_crt;
+}
+//Association scan over the BIMBAM genotype file file_geno. The null model (covariates only) is fitted by
+//REML and by MLE and its estimates are stored and printed; then, for every SNP with indicator_snp[t]!=0,
+//the tests selected by a_mode (1 or 4: Wald, 2 or 4: LRT, 3 or 4: score) are run and one MPHSUMSTAT is appended to sumStat.
+void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+
+	clock_t time_start=clock();
+	time_UtX=0; time_opt=0;
+	
+	string line;
+	char *ch_ptr;
+	
+	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
+	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
+	double crt_a, crt_b, crt_c;
+	int n_miss, c_phen;
+	double geno, x_mean;
+	size_t c=0;
+	//	double s=0.0;
+	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;	
+	//UtY is n_size x d_size and UtW has c_size columns; X, B and xHi_all get room for one extra covariate (the SNP)
+	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;
+		
+	//large matrices for EM
+	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);	
+	
+	//large matrices for NR
+	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
+	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
+	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
+	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);
+	
+	gsl_vector *x=gsl_vector_alloc (n_size);
+	gsl_vector *x_miss=gsl_vector_alloc (n_size);
+	
+	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);
+	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
+	gsl_vector *beta=gsl_vector_alloc (d_size);
+	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);
+	
+	//null estimates for initial values
+	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);
+	gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size);
+	//views on the covariate-only parts of X, B and xHi_all, used for the null model fits
+	gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);	
+	gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
+	gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);
+	
+	gsl_matrix_transpose_memcpy (Y, UtY);
+
+	gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW);
+	//the last row of X and the last column of B belong to the SNP and start at zero
+	gsl_vector_view X_row=gsl_matrix_row(X, c_size);
+	gsl_vector_set_zero(&X_row.vector);
+	gsl_vector_view B_col=gsl_matrix_column(B, c_size);
+	gsl_vector_set_zero(&B_col.vector);		
+	//null model, REML: initial values, then EM, then Newton-Raphson
+	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix);
+	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);	
+	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
+	//store the REML null estimates; all four vectors are reset so that a repeated call does not append to stale entries
+	c=0;
+	Vg_remle_null.clear(); VVg_remle_null.clear();
+	Ve_remle_null.clear(); VVe_remle_null.clear();
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
+			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
+			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
+			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
+			c++;
+		}
+	}
+	beta_remle_null.clear(); 
+	se_beta_remle_null.clear();
+	for (size_t i=0; i<se_B_null->size1; i++) {
+		for (size_t j=0; j<se_B_null->size2; j++) {
+			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
+			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
+		}
+	}
+	logl_remle_H0=logl_H0;
+	
+	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+	cout.precision(4);
+	
+	cout<<"REMLE estimate for Vg in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Vg): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"REMLE estimate for Ve in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Ve): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"REMLE likelihood = "<<logl_H0<<endl;
+	
+	//null model, MLE: EM then Newton-Raphson, continuing from the current V_g, V_e
+	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
+	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
+	//store the MLE null estimates; the variance vectors are reset together with the estimate vectors
+	c=0;
+	Vg_mle_null.clear(); VVg_mle_null.clear();
+	Ve_mle_null.clear(); VVe_mle_null.clear();
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
+			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
+			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
+			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
+			c++;
+		}
+	}
+	beta_mle_null.clear(); 
+	se_beta_mle_null.clear();
+	for (size_t i=0; i<se_B_null->size1; i++) {
+		for (size_t j=0; j<se_B_null->size2; j++) {
+			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
+			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
+		}
+	}
+	logl_mle_H0=logl_H0;
+	
+	cout<<"MLE estimate for Vg in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Vg): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"MLE estimate for Ve in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Ve): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"MLE likelihood = "<<logl_H0<<endl;
+
+	//per-SNP output buffers, sized once and overwritten for every SNP
+	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
+	for (size_t i=0; i<d_size; i++) {
+		v_beta.push_back(0.0);
+	}
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			v_Vg.push_back(0.0);
+			v_Ve.push_back(0.0);
+			v_Vbeta.push_back(0.0);
+		}
+	}
+	//keep the null estimates (from the MLE fit above) as the starting values for every SNP
+	gsl_matrix_memcpy (V_g_null, V_g);
+	gsl_matrix_memcpy (V_e_null, V_e);
+	gsl_matrix_memcpy (B_null, B);
+	
+	//start reading genotypes and analyze		
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		//if (t>=1) {break;}
+		safeGetline(infile, line); //NOTE(review): end-of-file is not checked; a short file leaves line without the expected tokens
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		if (indicator_snp[t]==0) {continue;}
+		//tokenize the line in place; the first three tokens are skipped
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");		
+
+		x_mean=0.0; c_phen=0; n_miss=0;
+		gsl_vector_set_zero(x_miss);
+		for (size_t i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==0) {continue;}
+			//NOTE(review): ch_ptr is NULL when the line has too few tokens; strcmp/atof on it would be undefined
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
+			else {
+				geno=atof(ch_ptr); 				
+				
+				gsl_vector_set(x, c_phen, geno); 
+				gsl_vector_set(x_miss, c_phen, 1.0); 
+				x_mean+=geno;
+			}
+			c_phen++;
+		}
+
+		x_mean/=(double)(ni_test-n_miss);
+		//missing entries get the mean; when x_mean>1 every entry is recoded as 2-geno
+		for (size_t i=0; i<ni_test; ++i) {
+			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
+			geno=gsl_vector_get(x, i);
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}
+
+		//calculate statistics
+		time_start=clock();		
+		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//initial values
+		gsl_matrix_memcpy (V_g, V_g_null);
+		gsl_matrix_memcpy (V_e, V_e_null);
+		gsl_matrix_memcpy (B, B_null);
+		
+		time_start=clock();
+		
+		//3 is before 1
+		if (a_mode==3 || a_mode==4) {			
+			p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
+			if (p_score<p_nr && crt==1) {
+				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
+			}
+		}		
+
+		if (a_mode==2 || a_mode==4) {
+			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+			//calculate beta and Vbeta
+			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );	
+			
+			if (p_lrt<p_nr) {
+				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				//calculate beta and Vbeta
+				p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );	
+				
+				if (crt==1) {
+					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
+				}
+			}			
+		}			
+
+		if (a_mode==1 || a_mode==4) {
+			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+			
+			if (p_wald<p_nr) {
+				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+				
+				if (crt==1) {
+					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
+				}
+			}			
+		}		
+
+		if (x_mean>1) {gsl_vector_scale(beta, -1.0);} //genotypes were recoded as 2-geno above, so the sign of beta is flipped
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		for (size_t i=0; i<d_size; i++) {
+			v_beta[i]=gsl_vector_get (beta, i);			
+		}
+		
+		c=0;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=i; j<d_size; j++) {
+				v_Vg[c]=gsl_matrix_get (V_g, i, j);
+				v_Ve[c]=gsl_matrix_get (V_e, i, j);
+				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
+				c++;
+			}
+		}
+		
+		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
+		sumStat.push_back(SNPs);
+    }	
+	cout<<endl;
+	
+	
+	infile.close();
+	infile.clear();
+	
+	gsl_matrix_free(U_hat);
+	gsl_matrix_free(E_hat);
+	gsl_matrix_free(OmegaU);
+	gsl_matrix_free(OmegaE);
+	gsl_matrix_free(UltVehiY);
+	gsl_matrix_free(UltVehiBX);
+	gsl_matrix_free(UltVehiU);
+	gsl_matrix_free(UltVehiE);
+	
+	gsl_matrix_free(Hi_all);
+	gsl_matrix_free(Hiy_all);
+	gsl_matrix_free(xHi_all);
+	gsl_matrix_free(Hessian);
+	
+	gsl_vector_free(x);
+	gsl_vector_free(x_miss);
+	
+	gsl_matrix_free(Y);
+	gsl_matrix_free(X);	
+	gsl_matrix_free(V_g);
+	gsl_matrix_free(V_e);
+	gsl_matrix_free(B);
+	gsl_vector_free(beta);
+	gsl_matrix_free(Vbeta);
+	
+	gsl_matrix_free(V_g_null);
+	gsl_matrix_free(V_e_null);
+	gsl_matrix_free(B_null);	
+	gsl_matrix_free(se_B_null);
+	
+	return;
+}
+
+
+
+
+
+
+
+void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) 
+{
+	string file_bed=file_bfile+".bed";
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+	
+	clock_t time_start=clock();
+	time_UtX=0; time_opt=0;
+	
+	char ch[1];
+	bitset<8> b;
+	
+	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
+	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
+	double crt_a, crt_b, crt_c;
+	int n_bit, n_miss, ci_total, ci_test;
+	double geno, x_mean;
+	size_t c=0;
+	//	double s=0.0;
+	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;	
+	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;
+			
+	//large matrices for EM
+	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);	
+	
+	//large matrices for NR
+	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
+	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
+	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
+	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);
+	
+	gsl_vector *x=gsl_vector_alloc (n_size);
+	
+	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);		
+	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
+	gsl_vector *beta=gsl_vector_alloc (d_size);
+	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);
+		
+	//null estimates for initial values
+	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);	
+	gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size);
+	
+	gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);	
+	gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
+	gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);
+	
+	gsl_matrix_transpose_memcpy (Y, UtY);
+	gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW);
+	
+	gsl_vector_view X_row=gsl_matrix_row(X, c_size);
+	gsl_vector_set_zero(&X_row.vector);
+	gsl_vector_view B_col=gsl_matrix_column(B, c_size);
+	gsl_vector_set_zero(&B_col.vector);		
+	
+	//time_start=clock();			
+	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix);
+		
+	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
+	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
+	//cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+	
+	c=0;
+	Vg_remle_null.clear();
+	Ve_remle_null.clear();
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
+			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
+			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
+			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
+			c++;
+		}
+	}
+	beta_remle_null.clear(); 
+	se_beta_remle_null.clear();
+	for (size_t i=0; i<se_B_null->size1; i++) {
+		for (size_t j=0; j<se_B_null->size2; j++) {
+			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
+			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
+		}
+	}
+	logl_remle_H0=logl_H0;
+	
+	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+	cout.precision(4);
+	cout<<"REMLE estimate for Vg in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Vg): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"REMLE estimate for Ve in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Ve): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"REMLE likelihood = "<<logl_H0<<endl;
+	
+	//time_start=clock();	
+	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
+	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
+	//cout<<"time for MLE in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+		
+	c=0;
+	Vg_mle_null.clear();
+	Ve_mle_null.clear();
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
+			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
+			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
+			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
+			c++;
+		}
+	}
+	beta_mle_null.clear(); 
+	se_beta_mle_null.clear();
+	for (size_t i=0; i<se_B_null->size1; i++) {
+		for (size_t j=0; j<se_B_null->size2; j++) {
+			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
+			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
+		}
+	}
+	logl_mle_H0=logl_H0;
+	
+	cout<<"MLE estimate for Vg in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Vg): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"MLE estimate for Ve in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Ve): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"MLE likelihood = "<<logl_H0<<endl;
+	
+	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
+	for (size_t i=0; i<d_size; i++) {
+		v_beta.push_back(0.0);
+	}
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			v_Vg.push_back(0.0);
+			v_Ve.push_back(0.0);
+			v_Vbeta.push_back(0.0);
+		}
+	}
+	
+	gsl_matrix_memcpy (V_g_null, V_g);
+	gsl_matrix_memcpy (V_e_null, V_e);
+	gsl_matrix_memcpy (B_null, B);	
+	
+	
+	//start reading genotypes and analyze	
+	
+	//calculate n_bit and c, the number of bit for each snp
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1; }
+	
+	//read past the first three magic numbers
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		//if (t>=0) {break;}
+		//if (snpInfo[t].rs_number!="MAG18140902") {continue;}
+		//cout<<t<<endl;
+		
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+				
+		//read genotypes
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; 
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}
+				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
+				
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
+					else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
+					else {gsl_vector_set(x, ci_test, -9); n_miss++; }
+				}
+				
+				ci_total++;
+				ci_test++;
+			}
+		}
+		
+		x_mean/=(double)(ni_test-n_miss);
+		
+		for (size_t i=0; i<ni_test; ++i) {			
+			geno=gsl_vector_get(x,i);
+			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}		
+		
+		/*
+		if (t==0) {			
+			ofstream outfile ("./snp1.txt", ofstream::out);
+			if (!outfile) {cout<<"error writing file: "<<endl; return;}
+			for (size_t i=0; i<x->size; i++) {
+				outfile<<gsl_vector_get(x, i)<<endl;
+			}
+			outfile.clear();
+			outfile.close();			
+		}
+		*/
+	
+		//calculate statistics
+		time_start=clock();		
+		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//initial values
+		gsl_matrix_memcpy (V_g, V_g_null);
+		gsl_matrix_memcpy (V_e, V_e_null);
+		gsl_matrix_memcpy (B, B_null);
+		
+		time_start=clock();
+		
+		//3 is before 1
+		if (a_mode==3 || a_mode==4) {
+			p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
+			
+			if (p_score<p_nr && crt==1) {
+				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
+			}
+		}		
+		
+		if (a_mode==2 || a_mode==4) {
+			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+			//calculate beta and Vbeta
+			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );	
+			
+			if (p_lrt<p_nr) {
+				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+
+				//calculate beta and Vbeta
+				p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );	
+				if (crt==1) {
+					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
+				}				
+			}
+		}			
+		
+		if (a_mode==1 || a_mode==4) {
+			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+			
+			if (p_wald<p_nr) {
+				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+				
+				if (crt==1) {
+					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
+				}
+			}
+		}
+		
+		//cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl;
+		
+		if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		for (size_t i=0; i<d_size; i++) {
+			v_beta[i]=gsl_vector_get (beta, i);			
+		}
+
+		c=0;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=i; j<d_size; j++) {
+				v_Vg[c]=gsl_matrix_get (V_g, i, j);
+				v_Ve[c]=gsl_matrix_get (V_e, i, j);
+				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
+				c++;
+			}
+		}
+		
+		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
+		sumStat.push_back(SNPs);
+    }	
+	cout<<endl;	
+	
+	//cout<<"time_opt = "<<time_opt<<endl;
+	
+	infile.close();
+	infile.clear();
+	
+	gsl_matrix_free(U_hat);
+	gsl_matrix_free(E_hat);
+	gsl_matrix_free(OmegaU);
+	gsl_matrix_free(OmegaE);
+	gsl_matrix_free(UltVehiY);
+	gsl_matrix_free(UltVehiBX);
+	gsl_matrix_free(UltVehiU);
+	gsl_matrix_free(UltVehiE);
+	
+	gsl_matrix_free(Hi_all);
+	gsl_matrix_free(Hiy_all);
+	gsl_matrix_free(xHi_all);
+	gsl_matrix_free(Hessian);
+	
+	gsl_vector_free(x);
+	
+	gsl_matrix_free(Y);
+	gsl_matrix_free(X);	
+	gsl_matrix_free(V_g);
+	gsl_matrix_free(V_e);
+	gsl_matrix_free(B);
+	gsl_vector_free(beta);
+	gsl_matrix_free(Vbeta);
+	
+	gsl_matrix_free(V_g_null);
+	gsl_matrix_free(V_e_null);
+	gsl_matrix_free(B_null);
+	gsl_matrix_free(se_B_null);
+	
+	return;
+}
+
+
+
+
+//fit the null mvLMM: MphInitial for starting values, then REML through MphEM and MphNR, then MphCalcBeta
+//the results are written to V_g, V_e, B and se_B; both B and se_B are d by c matrices
+void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B)
+{
+	const size_t n=UtY->size1;		//rows of UtY
+	const size_t d=UtY->size2;		//columns of UtY
+	const size_t c=UtW->size2;		//columns of UtW
+	const size_t v=d*(d+1)/2;		//d(d+1)/2; the Hessian below is 2v by 2v
+
+	double logl_reml, crt_a, crt_b, crt_c;	//exchanged with MphEM and MphNR, not read afterwards
+
+	//d by n work matrices handed to MphEM
+	gsl_matrix *U_hat=gsl_matrix_alloc (d, n);
+	gsl_matrix *E_hat=gsl_matrix_alloc (d, n);
+	gsl_matrix *OmegaU=gsl_matrix_alloc (d, n);
+	gsl_matrix *OmegaE=gsl_matrix_alloc (d, n);
+	gsl_matrix *UltVehiY=gsl_matrix_alloc (d, n);
+	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d, n);
+	gsl_matrix *UltVehiU=gsl_matrix_alloc (d, n);
+	gsl_matrix *UltVehiE=gsl_matrix_alloc (d, n);
+
+	//work matrices handed to MphNR
+	gsl_matrix *Hi_all=gsl_matrix_alloc (d, d*n);			//each dxd block is H_k^{-1}
+	gsl_matrix *Hiy_all=gsl_matrix_alloc (d, n);			//each column is H_k^{-1}y_k
+	gsl_matrix *xHi_all=gsl_matrix_alloc (d*c, d*n);		//blocks of x_k\otimes H_k^{-1}
+	gsl_matrix *Hessian=gsl_matrix_alloc (2*v, 2*v);
+
+	//Y_t (d by n) and W_t (c by n) are transposed copies of UtY and UtW
+	gsl_matrix *Y_t=gsl_matrix_alloc (d, n);
+	gsl_matrix *W_t=gsl_matrix_alloc (c, n);
+	gsl_matrix_transpose_memcpy (Y_t, UtY);
+	gsl_matrix_transpose_memcpy (W_t, UtW);
+
+	//starting values, EM, NR, and finally B together with se_B
+	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, W_t, Y_t, l_min, l_max, n_region, V_g, V_e, B);
+	logl_reml=MphEM ('R', em_iter, em_prec, eval, W_t, Y_t, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+	logl_reml=MphNR ('R', nr_iter, nr_prec, eval, W_t, Y_t, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, W_t, Y_t, V_g, V_e, UltVehiY, B, se_B);
+
+	//release every matrix allocated above, newest first
+	gsl_matrix_free(W_t);
+	gsl_matrix_free(Y_t);
+
+	gsl_matrix_free(Hessian);
+	gsl_matrix_free(xHi_all);
+	gsl_matrix_free(Hiy_all);
+	gsl_matrix_free(Hi_all);
+
+	gsl_matrix_free(UltVehiE);
+	gsl_matrix_free(UltVehiU);
+	gsl_matrix_free(UltVehiBX);
+	gsl_matrix_free(UltVehiY);
+	gsl_matrix_free(OmegaE);
+	gsl_matrix_free(OmegaU);
+	gsl_matrix_free(E_hat);
+	gsl_matrix_free(U_hat);
+}
+
diff --git a/mvlmm.h b/mvlmm.h
new file mode 100644
index 0000000..5aa9bbf
--- /dev/null
+++ b/mvlmm.h
@@ -0,0 +1,93 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __MVLMM_H__                
+#define __MVLMM_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+
+
+
+class MVLMM {
+	//settings, inputs and results shared by the mvLMM analysis routines declared below
+public:
+	// IO related parameters
+	int a_mode;				//analysis mode for Frequentist tests: 1 Wald, 2 likelihood ratio, 3 score, 4 all three
+	size_t d_pace;		//display pace: the progress bar is refreshed every d_pace SNPs
+	
+	string file_bfile;
+	string file_geno;
+	string file_out;
+	
+	// MVLMM related parameters
+	double l_min;		//l_min, l_max and n_region are handed to MphInitial
+	double l_max;
+	size_t n_region;
+	double logl_remle_H0, logl_mle_H0;	//null-model log likelihoods from the REML and the ML fit
+	vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null;	//entries (i,j) with j>=i of V_g and V_e from the null fits
+	vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null;	//matching diagonal entries of the Hessian argument filled by MphNR
+	vector<double> beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null;	//null-model B and se(B), stored row by row
+	double p_nr;		//a SNP gets the MphNR refinement only when its p value is below p_nr
+	size_t em_iter, nr_iter;	//iteration limits handed to MphEM and MphNR
+	double em_prec, nr_prec;	//precision settings handed to MphEM and MphNR
+	size_t crt;		//1 to pass p values through PCRT
+		
+	// Summary statistics
+	size_t ni_total, ni_test;	//number of individuals
+	size_t ns_total, ns_test;	//number of snps
+	size_t n_cvt;
+	size_t n_ph;
+	double time_UtX;		//time spent on computing U^T x for the SNPs, in minutes
+	double time_opt;		//time spent on optimization iterations, in minutes
+	
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	
+	// Not included in PARAM
+	vector<MPHSUMSTAT> sumStat;		//Output SNPSummary Data, one entry per analyzed SNP
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY);
+	void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY);
+	void WriteFiles ();
+	
+};
+
+void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B);	//null mvLMM fit; V_g, V_e, B and se_B are outputs, B and se_B are both d by c
+
+#endif
+
+
diff --git a/param.cpp b/param.cpp
new file mode 100644
index 0000000..edacc42
--- /dev/null
+++ b/param.cpp
@@ -0,0 +1,849 @@
+/*
+    Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <sys/stat.h>
+#include <cmath>
+#include <algorithm>
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+
+
+
+PARAM::PARAM(void):	//default value of every option; presumably overridden later by the command line parser -- TODO confirm
+mode_silence (false), a_mode (0), k_mode(1), d_pace (100000),	//a_mode 0 is rejected by CheckParam, so an analysis mode has to be chosen
+file_out("result"), 
+miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999),	//SNP filter thresholds handed to ReadFile_bed and ReadFile_geno
+l_min(1e-5), l_max(1e5), n_region(10),p_nr(0.001),em_prec(0.0001),nr_prec(0.0001),em_iter(10000),nr_iter(100),crt(0),	//lambda range, then the EM/NR settings
+pheno_mean(0),
+h_min(-1), h_max(-1),	h_scale(-1),	//NOTE(review): -1 looks like a "not set" marker here and for the other *_scale values -- confirm
+rho_min(0.0), rho_max(1.0),	rho_scale(-1),
+logp_min(0.0), logp_max(0.0), logp_scale(-1),
+s_min(0), s_max(300),
+w_step(100000),	s_step(1000000),
+r_pace(10), w_pace(1000),
+n_accept(0),
+n_mh(10),
+geo_mean(2000.0),
+randseed(-1),
+error(false),
+  n_cvt(1), n_vc(1),	//n_cvt 1 is also what ReadFiles sets when no covariates file is given
+time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0)	//all timers start at zero
+{}
+
+
+//read files
+//obtain ns_total, ng_total, ns_test, ni_test
+void PARAM::ReadFiles (void) 
+{
+	string file_str;
+	if (!file_mk.empty()) {				
+	  if (CountFileLines (file_mk, n_vc)==false) {error=true;}
+	}
+	
+	if (!file_snps.empty()) {
+		if (ReadFile_snps (file_snps, setSnps)==false) {error=true;}
+	} else {
+		setSnps.clear();
+	}
+	
+	//for prediction
+	if (!file_epm.empty()) {
+		if (ReadFile_est (file_epm, est_column, mapRS2est)==false) {error=true;}
+		
+		if (!file_bfile.empty()) {
+			file_str=file_bfile+".bim";
+			if (ReadFile_bim (file_str, snpInfo)==false) {error=true;}		
+			
+			file_str=file_bfile+".fam";
+			if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;}			
+		}
+		
+		if (!file_geno.empty()) {			
+			if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}		
+			
+			if (CountFileLines (file_geno, ns_total)==false) {error=true;}	
+		}
+		
+		if (!file_ebv.empty() ) {
+			if (ReadFile_column (file_ebv, indicator_bv, vec_bv, 1)==false) {error=true;}
+		}
+		
+		if (!file_log.empty() ) {
+			if (ReadFile_log (file_log, pheno_mean)==false) {error=true;}
+		}
+		
+		//convert indicator_pheno to indicator_idv
+		int k=1;
+		for (size_t i=0; i<indicator_pheno.size(); i++) {
+			k=1;
+			for (size_t j=0; j<indicator_pheno[i].size(); j++) {
+				if (indicator_pheno[i][j]==0) {k=0;}
+			}
+			indicator_idv.push_back(k);
+		}
+		
+		ns_test=0;
+		
+		return;
+	}
+	
+	//read covariates before the genotype files
+	if (!file_cvt.empty() ) {
+		if (ReadFile_cvt (file_cvt, indicator_cvt, cvt, n_cvt)==false) {error=true;}
+
+		if ((indicator_cvt).size()==0) {
+			n_cvt=1;
+		} 		
+	} else {
+		n_cvt=1;
+	}
+
+	//read genotype and phenotype file for plink format
+	if (!file_bfile.empty()) {
+		file_str=file_bfile+".bim";
+		if (ReadFile_bim (file_str, snpInfo)==false) {error=true;}		
+		
+		file_str=file_bfile+".fam";
+		if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;}
+		
+		//post-process covariates and phenotypes, obtain ni_test, save all useful covariates
+		ProcessCvtPhen();
+		
+		//obtain covariate matrix
+		gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt);
+		CopyCvt (W);
+		
+		file_str=file_bfile+".bed";
+		if (ReadFile_bed (file_str, setSnps, W, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test)==false) {error=true;}
+		
+		gsl_matrix_free(W);
+		
+		ns_total=indicator_snp.size();
+	}
+	
+	//read genotype and phenotype file for bimbam format
+	if (!file_geno.empty()) {
+		//annotation file before genotype file
+		if (!file_anno.empty() ) {
+			if (ReadFile_anno (file_anno, mapRS2chr, mapRS2bp, mapRS2cM)==false) {error=true;}
+		}
+
+		//phenotype file before genotype file
+		if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}
+
+		//post-process covariates and phenotypes, obtain ni_test, save all useful covariates
+		ProcessCvtPhen();
+		
+		//obtain covariate matrix
+		gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt);
+		CopyCvt (W);
+
+		if (ReadFile_geno (file_geno, setSnps, W, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test)==false) {error=true;}
+
+		gsl_matrix_free(W);
+		
+		ns_total=indicator_snp.size();
+	}
+	
+	if (!file_gene.empty()) {
+		if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}
+		
+		//convert indicator_pheno to indicator_idv
+		int k=1;
+		for (size_t i=0; i<indicator_pheno.size(); i++) {
+			k=1;
+			for (size_t j=0; j<indicator_pheno[i].size(); j++) {
+				if (indicator_pheno[i][j]==0) {k=0;}
+			}
+			indicator_idv.push_back(k);
+		}
+		
+		if (ReadFile_gene (file_gene, vec_read, snpInfo, ng_total)==false) {error=true;}	
+	}
+	
+				
+	//read is after gene file
+	if (!file_read.empty() ) {
+		if (ReadFile_column (file_read, indicator_read, vec_read, 1)==false) {error=true;}
+		
+		ni_test=0; 
+		for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+			indicator_idv[i]*=indicator_read[i];
+			ni_test+=indicator_idv[i];
+		}
+		
+		if (ni_test==0) {
+			error=true;
+			cout<<"error! number of analyzed individuals equals 0. "<<endl;
+			return;
+		}
+	}
+	
+	//for ridge prediction, read phenotype only
+	if (file_geno.empty() && file_gene.empty() && !file_pheno.empty()) {
+		if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}	
+				
+		//post-process covariates and phenotypes, obtain ni_test, save all useful covariates
+		ProcessCvtPhen();
+	}
+
+	return;
+}
+
+
+
+
+
+
+void PARAM::CheckParam (void) 
+{	//validate option values and input file availability; every problem prints a message and sets error=true
+	struct stat fileInfo;
+	string str;
+	
+	//check parameters
+	if (k_mode!=1 && k_mode!=2) {cout<<"error! unknown kinship/relatedness input mode: "<<k_mode<<endl; error=true;}
+	if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=21 && a_mode!=22 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61)   
+	{cout<<"error! unknown analysis mode: "<<a_mode<<". make sure -gk or -eigen or -lmm or -bslmm or -predict is sepcified correctly."<<endl; error=true;}
+	if (miss_level>1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<<miss_level<<endl; error=true;}
+	if (maf_level>0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<<maf_level<<endl; error=true;}
+	if (hwe_level>1) {cout<<"error! hwe level needs to be between 0 and 1. current value = "<<hwe_level<<endl; error=true;}
+	if (r2_level>1) {cout<<"error! r2 level needs to be between 0 and 1. current value = "<<r2_level<<endl; error=true;}
+	
+	if (l_max<l_min) {cout<<"error! maximum lambda value must be larger than the minimal value. current values = "<<l_max<<" and "<<l_min<<endl; error=true;}	
+	if (h_max<h_min) {cout<<"error! maximum h value must be larger than the minimal value. current values = "<<h_max<<" and "<<h_min<<endl; error=true;}
+	if (s_max<s_min) {cout<<"error! maximum s value must be larger than the minimal value. current values = "<<s_max<<" and "<<s_min<<endl; error=true;}
+	if (rho_max<rho_min) {cout<<"error! maximum rho value must be larger than the minimal value. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;}
+	if (logp_max<logp_min) {cout<<"error! maximum logp value must be larger than the minimal value. current values = "<<logp_max/log(10)<<" and "<<logp_min/log(10)<<endl; error=true;}
+	
+	if (h_max>1) {cout<<"error! h values must be bewtween 0 and 1. current values = "<<h_max<<" and "<<h_min<<endl; error=true;}
+	if (rho_max>1) {cout<<"error! rho values must be between 0 and 1. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;}
+	if (logp_max>0) {cout<<"error! maximum logp value must be smaller than 0. current values = "<<logp_max/log(10)<<" and "<<logp_min/log(10)<<endl; error=true;}
+	//the lambda range (l_min, l_max) is validated once, with the other ranges above, so its error is reported a single time
+		
+	if (h_scale>1.0) {cout<<"error! hscale value must be between 0 and 1. current value = "<<h_scale<<endl; error=true;}
+	if (rho_scale>1.0) {cout<<"error! rscale value must be between 0 and 1. current value = "<<rho_scale<<endl; error=true;}
+	if (logp_scale>1.0) {cout<<"error! pscale value must be between 0 and 1. current value = "<<logp_scale<<endl; error=true;}
+
+	if (rho_max==1 && rho_min==1 && a_mode==12) {cout<<"error! ridge regression does not support a rho parameter. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;}
+		
+	//check p_column, and (no need to) sort p_column into ascending order
+	if (p_column.size()==0) {
+		p_column.push_back(1);
+	} else {
+		for (size_t i=0; i<p_column.size(); i++) {
+			for (size_t j=0; j<i; j++) {
+				if (p_column[i]==p_column[j]) {cout<<"error! identical phenotype columns: "<<p_column[i]<<endl; error=true;}
+			}
+		}
+	}
+	
+	//sort (p_column.begin(), p_column.end() );
+	n_ph=p_column.size();
+	
+		
+	
+	//only lmm option (and one prediction option) can deal with multiple phenotypes
+	//and no gene expression files
+	if (n_ph>1 && a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=43) {
+		cout<<"error! the current analysis mode "<<a_mode<<" can not deal with multiple phenotypes."<<endl; error=true;
+	}
+	if (n_ph>1 && !file_gene.empty() ) {
+		cout<<"error! multiple phenotype analysis option not allowed with gene expression files. "<<endl; error=true;
+	}
+	
+	if (p_nr>1) {
+		cout<<"error! pnr value must be between 0 and 1. current value = "<<p_nr<<endl; error=true;
+	}
+	
+	//check est_column; when none were given, fill in four default columns (the second differs when file_ebv is set)
+	if (est_column.size()==0) {
+		if (file_ebv.empty()) {
+			est_column.push_back(2);
+			est_column.push_back(5);
+			est_column.push_back(6);
+			est_column.push_back(7);
+		} else {
+			est_column.push_back(2);
+			est_column.push_back(0);
+			est_column.push_back(6);
+			est_column.push_back(7);
+		}
+	}
+	
+	if (est_column.size()!=4) {cout<<"error! -en not followed by four numbers. current number = "<<est_column.size()<<endl; error=true;}	
+	if (est_column[0]==0) {cout<<"error! -en rs column can not be zero. current number = "<<est_column[0]<<endl; error=true;}	//est_column is never empty here; report the column value, not the vector size
+	
+	//check if files are compatible with each other, and if files exist
+	if (!file_bfile.empty()) {
+		str=file_bfile+".bim";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .bim file: "<<str<<endl; error=true;}
+		str=file_bfile+".bed";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .bed file: "<<str<<endl; error=true;}
+		str=file_bfile+".fam";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .fam file: "<<str<<endl; error=true;}			
+	}
+	
+	if ((!file_geno.empty() || !file_gene.empty()) ) {
+		str=file_pheno;
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open phenotype file: "<<str<<endl; error=true;}
+	}	
+	
+	str=file_geno;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mean genotype file: "<<str<<endl; error=true;}
+	
+	str=file_gene;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open gene expression file: "<<str<<endl; error=true;}
+	
+	size_t flag=0;	//counts how many of the three genotype-like inputs were supplied
+	if (!file_bfile.empty()) {flag++;}
+	if (!file_geno.empty()) {flag++;}
+	if (!file_gene.empty()) {flag++;}
+	
+	if (flag!=1 && a_mode!=43 && a_mode!=5 && a_mode!=61) {
+		cout<<"error! either plink binary files, or bimbam mean genotype files, or gene expression files are required."<<endl; error=true;
+	}
+	
+	if (file_pheno.empty() && (a_mode==43 || a_mode==5 || a_mode==61) ) {
+		cout<<"error! phenotype file is required."<<endl; error=true;
+	}
+	
+	if (!file_epm.empty() && file_bfile.empty() && file_geno.empty() ) {cout<<"error! estimated parameter file also requires genotype file."<<endl; error=true;}
+	if (!file_ebv.empty() && file_kin.empty()) {cout<<"error! estimated breeding value file also requires relatedness file."<<endl; error=true;}
+	
+	if (!file_log.empty() && pheno_mean!=0) {cout<<"error! either log file or mu value can be provide."<<endl; error=true;}
+	
+	str=file_snps;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open snps file: "<<str<<endl; error=true;}
+	
+	str=file_log;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open log file: "<<str<<endl; error=true;}
+	
+	str=file_anno;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open annotation file: "<<str<<endl; error=true;}
+
+	str=file_kin;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open relatedness matrix file: "<<str<<endl; error=true;}
+
+	str=file_mk;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open relatedness matrix file: "<<str<<endl; error=true;}
+	
+	str=file_cvt;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open covariates file: "<<str<<endl; error=true;}
+	
+	str=file_epm;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open estimated parameter file: "<<str<<endl; error=true;}
+	
+	str=file_ebv;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open estimated breeding value file: "<<str<<endl; error=true;}
+	
+	str=file_read;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open total read file: "<<str<<endl; error=true;}
+		
+	//check if files are compatible with analysis mode
+	if (k_mode==2 && !file_geno.empty() ) {cout<<"error! use \"-km 1\" when using bimbam mean genotype file. "<<endl; error=true;}
+	
+	if ((a_mode==1 || a_mode==2 || a_mode==3 || a_mode==4 || a_mode==5 || a_mode==31) && (file_kin.empty() && (file_ku.empty()||file_kd.empty())) )  {cout<<"error! missing relatedness file. "<<endl;  error=true;}
+
+	if (a_mode==61 && (file_kin.empty() && (file_ku.empty()||file_kd.empty()) && file_mk.empty() ) )  {cout<<"error! missing relatedness file. "<<endl;  error=true;}
+
+	if ((a_mode==43) && file_kin.empty())  {cout<<"error! missing relatedness file. -predict option requires -k option to provide a relatedness file."<<endl;  error=true;}
+	
+	if ((a_mode==11 || a_mode==12 || a_mode==13) && !file_cvt.empty() ) {cout<<"error! -bslmm option does not support covariates files."<<endl; error=true;}
+		
+	if (a_mode==41 || a_mode==42) {
+		if (!file_cvt.empty() ) {cout<<"error! -predict option does not support covariates files."<<endl; error=true;}	
+		if (file_epm.empty() ) {cout<<"error! -predict option requires estimated parameter files."<<endl; error=true;}		
+	}
+
+	return;
+}
+
+
+		
+
+
+void PARAM::CheckData (void) {	//validate the loaded data, derive the sample counts, print a summary and set BSLMM defaults; error=true on failure
+	if ((file_cvt).empty() || (indicator_cvt).size()==0) {
+		n_cvt=1;	//no covariates loaded: one covariate is assumed (presumably the intercept)
+	}
+	if ( (indicator_cvt).size()!=0 && (indicator_cvt).size()!=(indicator_idv).size()) {
+		error=true;
+		cout<<"error! number of rows in the covariates file do not match the number of individuals. "<<endl;
+		return;
+	}
+	
+	if ( (indicator_read).size()!=0 && (indicator_read).size()!=(indicator_idv).size()) {
+		error=true;
+		cout<<"error! number of rows in the total read file do not match the number of individuals. "<<endl;
+		return;
+	}
+
+	//calculate ni_total, ni_test and ni_cvt; indicator_idv itself is not modified here
+	//(the masking code further down is commented out), then calculate np_obs and np_miss
+	ni_total=(indicator_idv).size();
+	
+	ni_test=0; 
+	for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+		if (indicator_idv[i]==0) {continue;}
+		ni_test++;
+	}
+	
+	ni_cvt=0;
+	for (size_t i=0; i<indicator_cvt.size(); i++) {
+		if (indicator_cvt[i]==0) {continue;}
+		ni_cvt++;
+	}
+
+	np_obs=0; np_miss=0;
+	for (size_t i=0; i<indicator_pheno.size(); i++) {
+		if (indicator_cvt.size()!=0) {
+			if (indicator_cvt[i]==0) {continue;}
+		}
+		//individuals with indicator_cvt==0 are left out of np_obs and np_miss
+		for (size_t j=0; j<indicator_pheno[i].size(); j++) {					
+			if (indicator_pheno[i][j]==0) {
+				np_miss++;
+			} else {
+				np_obs++;
+			}
+		}
+	}
+
+	/*
+	if ((indicator_cvt).size()!=0) {
+		ni_test=0; 
+		for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+			indicator_idv[i]*=indicator_cvt[i];
+			ni_test+=indicator_idv[i];
+		}
+	}	
+	
+	if ((indicator_read).size()!=0) {
+		ni_test=0; 
+		for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+			indicator_idv[i]*=indicator_read[i];
+			ni_test+=indicator_idv[i];
+		}
+	}
+	*/
+	if (ni_test==0) {
+		error=true;
+		cout<<"error! number of analyzed individuals equals 0. "<<endl;
+		return;
+	}
+	
+	if (a_mode==43) {
+		if (ni_cvt==ni_test) {
+			error=true;
+			cout<<"error! no individual has missing phenotypes."<<endl; 
+			return;
+		}
+		if ((np_obs+np_miss)!=(ni_cvt*n_ph)) {
+			error=true;
+			//cout<<ni_cvt<<"\t"<<ni_test<<"\t"<<ni_total<<"\t"<<np_obs<<"\t"<<np_miss<<"\t"<<indicator_cvt.size()<<endl;
+			cout<<"error! number of phenotypes do not match the summation of missing and observed phenotypes."<<endl; 
+			return;
+		}
+	}
+
+	//output some information
+	cout<<"## number of total individuals = "<<ni_total<<endl;
+	if (a_mode==43) {
+		cout<<"## number of analyzed individuals = "<<ni_cvt<<endl;
+		cout<<"## number of individuals with full phenotypes = "<<ni_test<<endl;
+	} else {
+		cout<<"## number of analyzed individuals = "<<ni_test<<endl;
+	}
+	cout<<"## number of covariates = "<<n_cvt<<endl;
+	cout<<"## number of phenotypes = "<<n_ph<<endl;
+	if (a_mode==43) {
+		cout<<"## number of observed data = "<<np_obs<<endl;
+		cout<<"## number of missing data = "<<np_miss<<endl;
+	}
+	if (!file_gene.empty()) {		
+		cout<<"## number of total genes = "<<ng_total<<endl;
+	} else if (file_epm.empty() && a_mode!=43 && a_mode!=5) {
+		cout<<"## number of total SNPs = "<<ns_total<<endl;	
+		cout<<"## number of analyzed SNPs = "<<ns_test<<endl;
+	} else {}
+	
+	//set d_pace to 1000 for gene expression, but only while d_pace still equals 100000 (presumably the default -- TODO confirm)
+	if (!file_gene.empty() && d_pace==100000) {
+		d_pace=1000;
+	}
+	
+	//for case-control studies (a_mode 13), count #cases (phenotype 1) and #controls (phenotype 0)
+	int flag_cc=0;
+	if (a_mode==13) {	
+		ni_case=0;
+		ni_control=0;
+		for (size_t i=0; i<indicator_idv.size(); i++) {
+			if (indicator_idv[i]==0) {continue;}
+		
+			if (pheno[i][0]==0) {ni_control++;}
+			else if (pheno[i][0]==1) {ni_case++;}
+			else {flag_cc=1;}
+		}
+		cout<<"## number of cases = "<<ni_case<<endl;	
+		cout<<"## number of controls = "<<ni_control<<endl;	
+	}	
+	//any phenotype other than 0/1 switches the analysis to a_mode 11
+	if (flag_cc==1) {cout<<"Unexpected non-binary phenotypes for case/control analysis. Use default (BSLMM) analysis instead."<<endl; a_mode=11;}
+	
+	//set parameters for BSLMM: 0 (logp_min) and -1 (the other fields) are treated as "not set"
+	//and check for predict
+	if (a_mode==11 || a_mode==12 || a_mode==13) {
+		if (a_mode==11) {n_mh=1;}	
+		if (logp_min==0) {logp_min=-1.0*log((double)ns_test);}
+	
+		if (h_scale==-1) {h_scale=min(1.0, 10.0/sqrt((double)ni_test) );}
+		if (rho_scale==-1) {rho_scale=min(1.0, 10.0/sqrt((double)ni_test) );}
+		if (logp_scale==-1) {logp_scale=min(1.0, 5.0/sqrt((double)ni_test) );}
+		
+		if (h_min==-1) {h_min=0.0;}
+		if (h_max==-1) {h_max=1.0;}
+		
+		if (s_max>ns_test) {s_max=ns_test; cout<<"s_max is re-set to the number of analyzed SNPs."<<endl;}
+		if (s_max<s_min) {cout<<"error! maximum s value must be larger than the minimal value. current values = "<<s_max<<" and "<<s_min<<endl; error=true;}
+	} else if (a_mode==41 || a_mode==42) {		
+		if (indicator_bv.size()!=0) {
+			if (indicator_idv.size()!=indicator_bv.size()) {
+				cout<<"error! number of rows in the phenotype file does not match that in the estimated breeding value file: "<<indicator_idv.size()<<"\t"<<indicator_bv.size()<<endl;
+				error=true;
+			} else {
+				size_t flag_bv=0;	//number of individuals whose two indicators disagree
+				for (size_t i=0; i<(indicator_bv).size(); ++i) {
+					if (indicator_idv[i]!=indicator_bv[i]) {flag_bv++;}
+				}
+				if (flag_bv!=0) {
+					cout<<"error! individuals with missing value in the phenotype file does not match that in the estimated breeding value file: "<<flag_bv<<endl;
+					error=true;
+				}
+			}
+		}
+	}
+
+	//file_mk needs to contain more than one line
+	if (n_vc==1 && !file_mk.empty()) {cout<<"error! -mk file should contain more than one line."<<endl; error=true;}
+	
+	return;
+}
+
+
+//print pve_null and pve_se_null when a single phenotype is analyzed;
+//nothing is printed for any other n_ph
+void PARAM::PrintSummary () 
+{
+	if (n_ph!=1) {return;}
+
+	cout<<"pve estimate ="<<pve_null<<endl;
+	cout<<"se(pve) ="<<pve_se_null<<endl;
+	return;
+}
+
+
+
+//load genotypes through ReadFile_bed (when file_bfile is set) or
+//ReadFile_geno (otherwise); error is set if the reader returns false
+void PARAM::ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
+	bool ok;
+	if (file_bfile.empty()) {
+		ok=ReadFile_geno (file_geno, indicator_idv, indicator_snp, UtX, K, calc_K);
+	} else {
+		string file_str=file_bfile+".bed";
+		ok=ReadFile_bed (file_str, indicator_idv, indicator_snp, UtX, K, calc_K);
+	}
+	if (!ok) {error=true;}
+	return;
+}
+		
+
+
+
+//zero matrix_kin and fill it through PlinkKin (when file_bfile is set) or
+//BimbamKin (otherwise); a_mode-20 is forwarded as their third argument
+void PARAM::CalcKin (gsl_matrix *matrix_kin)  {
+	gsl_matrix_set_zero (matrix_kin);
+	
+	const bool use_plink=!file_bfile.empty();
+	string file_str=(use_plink ? file_bfile+".bed" : file_geno);
+	bool ok;
+	if (use_plink) {
+		ok=PlinkKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin);
+	} else {
+		ok=BimbamKin (file_str, indicator_snp, a_mode-20, d_pace, matrix_kin);
+	}
+	if (!ok) {error=true;}
+	return;
+}
+		
+
+
+
+
+//write matrix_U as text to ./output/<file_out>.<suffix>.txt: one matrix
+//row per line, every entry followed by a tab, stream precision 10
+void PARAM::WriteMatrix (const gsl_matrix *matrix_U, const string suffix) 
+{
+	const string file_str="./output/"+file_out+"."+suffix+".txt";
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {
+		cout<<"error writing file: "<<file_str.c_str()<<endl;
+		return;
+	}
+	
+	outfile.precision(10);
+	const size_t n_row=matrix_U->size1, n_col=matrix_U->size2;
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			outfile<<gsl_matrix_get (matrix_U, r, c)<<"\t";
+		}
+		outfile<<endl;
+	}
+	
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+//write vector_D as text to ./output/<file_out>.<suffix>.txt: one element
+//per line, stream precision 10
+void PARAM::WriteVector (const gsl_vector *vector_D, const string suffix) 
+{
+	const string file_str="./output/"+file_out+"."+suffix+".txt";
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {
+		cout<<"error writing file: "<<file_str.c_str()<<endl;
+		return;
+	}
+	
+	outfile.precision(10);
+	const size_t n_elem=vector_D->size;
+	for (size_t k=0; k<n_elem; ++k) {
+		outfile<<gsl_vector_get (vector_D, k)<<endl;
+	}
+	
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+//inspect the covariates of the analyzed individuals. if every column is
+//constant the covariates are dropped (indicator_cvt cleared, n_cvt=1); if
+//no column is constant a column of 1s is appended to cvt as intercept
+void PARAM::CheckCvt () 
+{
+	if (indicator_cvt.size()==0) {return;}
+	size_t ci_test=0;
+	gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt);
+	
+	for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) {
+		if (indicator_idv[i]==0 || indicator_cvt[i]==0) {continue;}
+		for (size_t j=0; j<n_cvt; ++j) {
+			gsl_matrix_set (W, ci_test, j, (cvt)[i][j]);
+		}
+		ci_test++;
+	}
+
+	size_t flag_ipt=0;
+	double v_min, v_max;
+	set<size_t> set_remove;
+	
+	//a column whose minimum equals its maximum is constant, i.e. an intercept
+	for (size_t i=0; i<W->size2; i++) {
+		gsl_vector_view w_col=gsl_matrix_column (W, i);
+		gsl_vector_minmax (&w_col.vector, &v_min, &v_max);
+		if (v_min==v_max) {flag_ipt=1; set_remove.insert (i);}
+	}
+	
+	//add an intercept term if needed
+	if (n_cvt==set_remove.size()) {
+		indicator_cvt.clear();
+		n_cvt=1;
+	} else if (flag_ipt==0) {
+		cout<<"no intecept term is found in the cvt file. a column of 1s is added."<<endl;
+		//extend every row, not only the analyzed ones: CopyCvtPhen with flag!=0
+		//reads n_cvt columns from all rows selected by indicator_cvt alone
+		for (vector<vector<double> >::size_type i=0; i<cvt.size(); ++i) {
+			cvt[i].push_back(1.0);
+		}
+		n_cvt++;
+	} else {}	
+	
+	gsl_matrix_free(W);
+	return;
+}
+
+
+//post-process phenotypes and covariates: rebuild indicator_idv, recount
+//ni_test and make sure cvt/indicator_cvt hold at least an intercept column
+void PARAM::ProcessCvtPhen ()
+{	
+	//convert indicator_pheno to indicator_idv: an individual is kept only
+	//if none of its phenotypes is missing
+	int k=1;
+	indicator_idv.clear();
+	for (size_t i=0; i<indicator_pheno.size(); i++) {
+		k=1;
+		for (size_t j=0; j<indicator_pheno[i].size(); j++) {
+			if (indicator_pheno[i][j]==0) {k=0;}
+		}
+		indicator_idv.push_back(k);
+	}
+	
+	//remove individuals with missing covariates
+	if ((indicator_cvt).size()!=0) {
+		for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+			indicator_idv[i]*=indicator_cvt[i];
+		}
+	}
+	
+	//obtain ni_test
+	ni_test=0; 
+	for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+		if (indicator_idv[i]==0) {continue;}
+		ni_test++;
+	}
+	
+	if (ni_test==0) {
+		error=true;
+		cout<<"error! number of analyzed individuals equals 0. "<<endl;
+		return;
+	}
+	
+	//check the covariates (CheckCvt needs ni_test): constant columns are
+	//detected and an intercept is added when none is present
+	if (indicator_cvt.size()!=0) {CheckCvt();}
+	//no covariates were read, or CheckCvt dropped them because every column
+	//was constant (it clears indicator_cvt): fall back to one intercept column
+	//so that the Copy* functions can index indicator_cvt and cvt safely
+	if (indicator_cvt.size()==0) {
+		cvt.clear();
+		vector<double> cvt_row (1, 1.0);
+		for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+			indicator_cvt.push_back(1);
+			cvt.push_back(cvt_row);
+		}
+	}
+	return;
+}
+
+
+
+
+//copy the n_cvt covariates of every individual kept by both indicator_idv
+//and indicator_cvt into consecutive rows of W
+void PARAM::CopyCvt (gsl_matrix *W) 
+{
+	size_t row=0;
+	const size_t n_idv=indicator_idv.size();
+	for (size_t idv=0; idv<n_idv; idv++) {
+		const bool keep=(indicator_idv[idv]!=0 && indicator_cvt[idv]!=0);
+		if (!keep) {continue;}
+		for (size_t col=0; col<n_cvt; col++) {gsl_matrix_set (W, row, col, cvt[idv][col]);}
+		row++;
+	}
+	return;
+}
+
+
+//copy the first phenotype into y and the n_cvt covariates into W, one row
+//per selected individual. individuals are selected by indicator_idv when
+//flag==0 and by indicator_cvt otherwise
+void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag) 
+{
+	const vector<int> &keep=(flag==0 ? indicator_idv : indicator_cvt);
+	size_t row=0;
+	
+	//the loop bound is indicator_idv.size() for either choice of flag
+	for (size_t idv=0; idv<indicator_idv.size(); idv++) {
+		if (keep[idv]==0) {continue;}
+		
+		//only phenotype column 0 is used by this overload
+		gsl_vector_set (y, row, pheno[idv][0]);
+		for (size_t col=0; col<n_cvt; col++) {
+			gsl_matrix_set (W, row, col, cvt[idv][col]);
+		}
+		
+		row++;
+	}
+	
+	return;
+}
+
+//copy all n_ph phenotypes into Y and the n_cvt covariates into W, one row
+//per selected individual. individuals are selected by indicator_idv when
+//flag==0 and by indicator_cvt otherwise
+void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag) 
+{
+	const vector<int> &keep=(flag==0 ? indicator_idv : indicator_cvt);
+	size_t row=0;
+	
+	//the loop bound is indicator_idv.size() for either choice of flag
+	for (size_t idv=0; idv<indicator_idv.size(); idv++) {
+		if (keep[idv]==0) {continue;}
+		
+		for (size_t ph=0; ph<n_ph; ph++) {
+			gsl_matrix_set (Y, row, ph, pheno[idv][ph]);
+		}
+		for (size_t col=0; col<n_cvt; col++) {
+			gsl_matrix_set (W, row, col, cvt[idv][col]);
+		}
+		
+		row++;
+	}
+	
+	return;
+}
+
+
+
+
+
+//store log(vec_read[i]) of every individual kept by indicator_idv into
+//consecutive entries of log_N
+void PARAM::CopyRead (gsl_vector *log_N) 
+{
+	size_t pos=0;
+	for (size_t idv=0; idv<indicator_idv.size(); idv++) {
+		if (indicator_idv[idv]!=0) {
+			gsl_vector_set (log_N, pos++, log(vec_read[idv]) );
+		}
+	}
+	return;
+}
+		
+		
+
diff --git a/param.h b/param.h
new file mode 100644
index 0000000..5d58c4c
--- /dev/null
+++ b/param.h
@@ -0,0 +1,231 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __PARAM_H__                
+#define __PARAM_H__
+
+#include <vector>
+#include <map>
+#include <set>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+using namespace std;
+
+
+
+class SNPINFO {	//per-SNP record; PARAM keeps one per SNP in snpInfo
+public:
+	string chr;				//chromosome
+	string rs_number;		//SNP id (rs#)
+	double cM;				//position in cM
+	long int base_position;	//base-pair position
+	string a_minor;			//presumably the minor allele -- TODO confirm
+	string a_major;			//presumably the major allele -- TODO confirm
+	size_t n_miss;			//presumably the number of missing genotypes -- TODO confirm
+	double missingness;		//presumably the fraction of missing genotypes -- TODO confirm
+	double maf;				//presumably the minor allele frequency -- TODO confirm
+};
+
+//results for lmm (one set of test statistics; presumably one object per tested SNP -- TODO confirm)
+class SUMSTAT {
+public:
+	double beta;				//REML estimator for beta
+	double se;				//standard error for beta
+	double lambda_remle;		//REML estimator for lambda
+	double lambda_mle;		//MLE estimator for lambda
+	double p_wald;			//p value from a Wald test
+	double p_lrt;				//p value from a likelihood ratio test
+	double p_score;			//p value from a score test
+};
+
+//results for mvlmm (multi-phenotype counterpart of SUMSTAT)
+class MPHSUMSTAT {
+public:
+	vector<double> v_beta;	//REML estimator for beta (presumably one entry per phenotype -- TODO confirm)
+	double p_wald;			//p value from a Wald test
+	double p_lrt;				//p value from a likelihood ratio test
+	double p_score;			//p value from a score test
+	vector<double> v_Vg;	//estimator for Vg, right half (presumably the upper triangle, flattened -- TODO confirm)
+	vector<double> v_Ve;	//estimator for Ve, right half (same layout as v_Vg)
+	vector<double> v_Vbeta;	//estimator for Vbeta, right half (same layout as v_Vg)
+};
+
+
+//hyper-parameters for bslmm (PARAM stores the starting values in cHyp_initial)
+class HYPBSLMM {
+public:
+	double h;		//priors are set through PARAM::h_min/h_max/h_scale
+	double pve;		//NOTE(review): meaning not visible here; presumably proportion of variance explained
+	double rho;		//priors are set through PARAM::rho_min/rho_max/rho_scale
+	double pge;		//NOTE(review): meaning not visible here -- TODO document
+	double logp;	//log(pi); priors are set through PARAM::logp_min/logp_max/logp_scale
+	
+	size_t n_gamma;	//number of gammas (bounded by PARAM::s_min and s_max)
+};
+
+
+
+
+class PARAM {	//holds all run options, loaded data and summary counts; the analysis classes copy what they need from it
+public:	
+	// IO related parameters
+	bool mode_silence;
+	int a_mode;				//analysis mode, 1/2/3/4 for Frequentist tests; 11/12/13 are -bslmm modes and 41/42/43 -predict modes (see the CheckParam messages)
+	int k_mode;				//kinship read mode: 1: n by n matrix, 2: id/id/k_value; 		
+	vector<size_t> p_column;			//which phenotype column needs analysis
+	size_t d_pace;		//display pace
+	
+	string file_bfile;		//PLINK file prefix (".bed" is appended when reading)
+	string file_geno;		//BIMBAM mean genotype file
+	string file_pheno;
+	string file_anno;		//optional
+	string file_cvt;		//optional
+	string file_kin;		//relatedness matrix file
+	string file_ku, file_kd;	//alternative to file_kin: both must be given (see CheckParam)
+	string file_mk;			//file given with -mk; must contain more than one line (see CheckData)
+	string file_out;		//output prefix: files are written to ./output/<file_out>.*
+	
+	string file_epm;		//estimated parameter file
+	string file_ebv;		//estimated breeding value file
+	string file_log;		//log file containing mean estimate
+	
+	string file_read;		//file containing total number of reads
+	string file_gene;		//gene expression file
+	
+	string file_snps;		//file containing analyzed snps or genes
+	
+	
+	
+	// QC related parameters	
+	double miss_level;
+	double maf_level;	
+	double hwe_level;
+	double r2_level;
+	
+	// LMM related parameters
+	double l_min;
+	double l_max;
+	size_t n_region;
+	double l_mle_null, l_remle_null;
+	double logl_mle_H0, logl_remle_H0;
+	double pve_null, pve_se_null;	//printed by PrintSummary when n_ph==1
+	double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null;
+	vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null;
+	vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null;
+	vector<double> beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null;
+	double p_nr;	
+	double em_prec, nr_prec;
+	size_t em_iter, nr_iter;
+	size_t crt;
+	double pheno_mean;		//phenotype mean from bslmm fitting or for prediction
+
+	//for fitting multiple variance components
+	//the first three are of size n_vc, and the next two are of size n_vc+1
+	vector<double> v_traceG;
+	vector<double> v_pve;
+	vector<double> v_se_pve;
+
+	vector<double> v_sigma2;
+	vector<double> v_se_sigma2;	
+	vector<double> v_beta;
+	vector<double> v_se_beta;	
+	
+	// BSLMM MCMC related parameters
+	double h_min, h_max, h_scale;			//priors for h
+	double rho_min, rho_max, rho_scale;		//priors for rho
+	double logp_min, logp_max, logp_scale;		//priors for log(pi)
+	size_t s_min, s_max;			//minimum and maximum number of gammas
+	size_t w_step;					//number of warm up/burn in iterations
+	size_t s_step;					//number of sampling iterations
+	size_t r_pace;					//record pace
+	size_t w_pace;					//write pace
+	size_t n_accept;				//number of acceptance
+	size_t n_mh;					//number of MH steps within each iteration
+	double geo_mean;				//mean of the geometric distribution
+	long int randseed;
+	double trace_G;
+
+	HYPBSLMM cHyp_initial;
+		
+	// Summary statistics
+	bool error;				//set to true by the member functions when a check or a file read fails
+	size_t ni_total, ni_test, ni_cvt;	//number of individuals: all, with indicator_idv!=0, with indicator_cvt!=0
+	size_t np_obs, np_miss;		//number of observed and missing phenotypes
+	size_t ns_total, ns_test;	//number of snps
+	size_t ng_total, ng_test;	//number of genes
+	size_t ni_control, ni_case;	//number of controls and number of cases
+	size_t n_cvt;			//number of covariates
+	size_t n_ph;			//number of phenotypes
+	size_t n_vc;			//number of variance components (including the diagonal matrix)
+	double time_total;		//record total time
+	double time_G;			//time spent on reading files the second time and calculate K
+	double time_eigen;		//time spent on eigen-decomposition
+	double time_UtX;		//time spent on calculating UX and Uy
+	double time_UtZ;		//time spent on calculating UtZ, for probit BSLMM
+	double time_opt;		//time spent on optimization iterations/or mcmc
+	double time_Omega;		//time spent on calculating Omega
+	double time_hyp;		//time spent on sampling hyper-parameters, in PMM
+	double time_Proposal;  //time spent on constructing the proposal distribution (i.e. the initial lmm or lm analysis)
+
+	// Data
+	vector<vector<double> > pheno;			//all phenotypes, one row per individual, NA replaced with -9
+	vector<vector<double> > cvt;			//all covariates, one row per individual, NA replaced with -9	
+	vector<vector<int> > indicator_pheno;			//records whether a phenotype is missing for an individual; 0 missing, 1 available
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	vector<int> indicator_cvt;				//indicator for covariates, 0 missing, 1 available for analysis
+	
+	vector<int> indicator_bv;				//indicator for estimated breeding value file, 0 missing, 1 available for analysis
+	vector<int> indicator_read;				//indicator for read file, 0 missing, 1 available for analysis
+	vector<double> vec_read;				//total number of reads
+	vector<double> vec_bv;					//breeding values
+	vector<size_t> est_column;
+	
+	map<string, int> mapID2num;		//map small ID number to number, from 0 to n-1
+	map<string, string> mapRS2chr;		//map rs# to chromosome location
+	map<string, long int> mapRS2bp;		//map rs# to base position
+	map<string, double> mapRS2cM;		//map rs# to cM
+	map<string, double> mapRS2est;			//map rs# to parameters
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	set<string> setSnps;			//a set of snps for analysis
+	
+	//constructor
+	PARAM();
+	
+	//functions
+	void ReadFiles ();		
+	void CheckParam (); 
+	void CheckData ();	
+	void PrintSummary ();
+	void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K);	
+	void CheckCvt ();
+	void CopyCvt (gsl_matrix *W);
+	void ProcessCvtPhen();
+	void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag);
+	void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag);
+	void CalcKin (gsl_matrix *matrix_kin);
+	void WriteMatrix (const gsl_matrix *matrix_U, const string suffix);
+	void WriteVector (const gsl_vector *vector_D, const string suffix);
+	void CopyRead (gsl_vector *log_N);
+};
+
+
+#endif
+
diff --git a/prdt.cpp b/prdt.cpp
new file mode 100644
index 0000000..7570d36
--- /dev/null
+++ b/prdt.cpp
@@ -0,0 +1,543 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <iomanip>
+#include <bitset>
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <cmath>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+
+#include "io.h"
+#include "lapack.h"  //for functions EigenDecomp
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "io_float.h"
+#include "prdt_float.h"
+#include "mathfunc_float.h"
+#else
+#include "io.h"
+#include "prdt.h"
+#include "mathfunc.h"
+#endif
+
+using namespace std;
+
+
+
+
+//take over the settings and data needed for prediction from cPar and
+//reset the two results (ns_test, time_eigen) that CopyToParam reports back
+void PRDT::CopyFromParam (PARAM &cPar) 
+{
+	//results start from zero
+	ns_test=0;
+	time_eigen=0;
+	//run settings and file names
+	d_pace=cPar.d_pace;
+	a_mode=cPar.a_mode;
+	file_out=cPar.file_out;
+	file_geno=cPar.file_geno;
+	file_bfile=cPar.file_bfile;
+	//dimensions
+	ns_total=cPar.ns_total;
+	n_ph=cPar.n_ph;
+	np_miss=cPar.np_miss;
+	np_obs=cPar.np_obs;
+	//per-individual indicators, SNP records and per-SNP estimates
+	indicator_idv=cPar.indicator_idv;
+	indicator_cvt=cPar.indicator_cvt;
+	indicator_pheno=cPar.indicator_pheno;
+	mapRS2est=cPar.mapRS2est;
+	snpInfo=cPar.snpInfo;
+	return;
+}
+
+//report ns_test and the eigen-decomposition time back to cPar
+void PRDT::CopyToParam (PARAM &cPar) 
+{
+	cPar.time_eigen=time_eigen;
+	cPar.ns_test=ns_test;
+	return;
+}               
+
+
+
+
+//write one line per individual to ./output/<file_out>.prdt.txt: "NA" for
+//individuals with indicator_idv==1, otherwise the next entry of y_prdt
+void PRDT::WriteFiles (gsl_vector *y_prdt) 
+{
+	const string file_str="./output/"+file_out+"."+"prdt"+".txt";
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {
+		cout<<"error writing file: "<<file_str.c_str()<<endl;
+		return;
+	}
+	
+	size_t next=0;
+	const size_t n_idv=indicator_idv.size();
+	for (size_t idv=0; idv<n_idv; ++idv) {
+		if (indicator_idv[idv]!=1) {
+			outfile<<gsl_vector_get (y_prdt, next)<<endl;
+			++next;
+			continue;
+		}
+		outfile<<"NA"<<endl;
+	}
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+//write one line per individual to ./output/<file_out>.prdt.txt: "NA" when
+//indicator_cvt is 0, otherwise the next row of Y_full, tab after each entry
+void PRDT::WriteFiles (gsl_matrix *Y_full) 
+{
+	const string file_str="./output/"+file_out+".prdt.txt";
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {
+		cout<<"error writing file: "<<file_str.c_str()<<endl;
+		return;
+	}
+	
+	size_t row=0;
+	const size_t n_col=Y_full->size2;
+	for (size_t idv=0; idv<indicator_cvt.size(); ++idv) {
+		if (indicator_cvt[idv]==0) {outfile<<"NA"<<endl; continue;}
+		for (size_t col=0; col<n_col; ++col) {
+			outfile<<gsl_matrix_get (Y_full, row, col)<<"\t";
+		}
+		outfile<<endl;
+		++row;
+	}
+	
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+
+
+//add predicted breeding values for the individuals with indicator_idv==0:
+//y_prdt += Gfo*U*D*U^T*u_hat, where (U, eval) come from EigenDecomp of Goo
+//and D holds 1/eval. CenterMatrix(G, w) is applied to G before it is split
+void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) 
+{
+	size_t ni_test=u_hat->size, ni_total=G->size1;
+	
+	gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test);
+	gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test);
+	gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test); 
+	gsl_vector *eval=gsl_vector_alloc (ni_test);
+	gsl_vector *Utu=gsl_vector_alloc (ni_test);
+	gsl_vector *w=gsl_vector_alloc (ni_total);
+	
+	//center matrix G based on indicator_idv
+	for (size_t i=0; i<ni_total; i++) {
+		gsl_vector_set(w, i, indicator_idv[i]);
+	}
+	CenterMatrix(G, w);
+		
+	//obtain Goo (rows and columns with indicator 1) and Gfo (rows with indicator 0, columns with indicator 1)
+	size_t o_i=0, o_j=0;
+	double d;
+	for (size_t i=0; i<indicator_idv.size(); i++) {
+		o_j=0;
+		for (size_t j=0; j<indicator_idv.size(); j++) {
+			d=gsl_matrix_get(G, i, j);
+			if (indicator_idv[i]==1 && indicator_idv[j]==1) {
+				gsl_matrix_set(Goo, o_i, o_j, d);
+			}
+			if (indicator_idv[i]==0 && indicator_idv[j]==1) {
+				gsl_matrix_set(Gfo, i-o_i, o_j, d);
+			}
+			if (indicator_idv[j]==1) {o_j++;}
+		}
+		if (indicator_idv[i]==1) {o_i++;}
+	}
+		
+	//matrix operations to get u_prdt
+	cout<<"Start Eigen-Decomposition..."<<endl;
+	clock_t time_start=clock();
+	EigenDecomp (Goo, U, eval, 0);
+	for (size_t i=0; i<eval->size; i++) {
+		if (gsl_vector_get(eval,i)<1e-10) {gsl_vector_set(eval, i, 0);}
+	}
+	time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+	gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu);
+	//NOTE(review): entries whose eigenvalue was zeroed above keep their Utu value (they are not set to 0) -- confirm this is intended
+	for (size_t i=0; i<eval->size; i++) {
+		d=gsl_vector_get(eval, i);
+		if (d!=0) {d=gsl_vector_get(Utu, i)/d; gsl_vector_set(Utu, i, d);}
+	}
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
+	
+	//free matrices
+	gsl_matrix_free(Goo);
+	gsl_matrix_free(Gfo);
+	gsl_matrix_free(U);
+	gsl_vector_free(eval);
+	gsl_vector_free(Utu);
+	gsl_vector_free(w);
+
+	return;	
+}
+
+
+
+//for every SNP in the BIMBAM mean genotype file that has an effect size in
+//mapRS2est, add effect_size*(genotype - training-sample mean) to y_prdt
+void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+	
+	string line;
+	char *ch_ptr;
+	string rs;
+	bool flag_short=false;	//set when a line holds fewer fields than expected
+	
+	size_t n_miss, n_train_nomiss, c_phen;
+	double geno, x_mean, x_train_mean, effect_size;
+	
+	gsl_vector *x=gsl_vector_alloc (y_prdt->size);
+	gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size);
+	
+	ns_test=0;
+
+	//start reading genotypes and analyze	
+	for (size_t t=0; t<ns_total; ++t) {
+		safeGetline(infile, line);
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		//the first field is the SNP id, the next two fields are skipped
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		if (ch_ptr==NULL) {flag_short=true; break;}
+		rs=ch_ptr;
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");		
+		if (mapRS2est.count(rs)==0) {continue;} else {effect_size=mapRS2est[rs];}
+		x_mean=0.0; c_phen=0; n_miss=0; x_train_mean=0; n_train_nomiss=0;
+		gsl_vector_set_zero(x_miss);
+
+		for (size_t i=0; i<indicator_idv.size(); ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (ch_ptr==NULL) {flag_short=true; break;}
+			if (indicator_idv[i]==1) {
+				if (strcmp(ch_ptr, "NA")!=0) {
+					geno=atof(ch_ptr); 			
+					x_train_mean+=geno;
+					n_train_nomiss++;
+				}
+			} else {
+				if (strcmp(ch_ptr, "NA")==0) {
+					gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;
+				} else {
+					geno=atof(ch_ptr); 	
+					gsl_vector_set(x, c_phen, geno); 
+					gsl_vector_set(x_miss, c_phen, 1.0); 
+					x_mean+=geno;
+				}
+				c_phen++;
+			}
+		}
+		if (flag_short) {break;}
+
+		if (x->size==n_miss) {cout<<"snp "<<rs<<" has missing genotype for all individuals and will be ignored."<<endl; continue;}
+
+		//NOTE(review): x_train_mean is NaN if n_train_nomiss is 0 -- confirm this cannot happen
+		x_mean/=(double)(x->size-n_miss);
+		x_train_mean/=(double)(n_train_nomiss);
+		
+		for (size_t i=0; i<x->size; ++i) {
+			geno=gsl_vector_get(x, i);
+			if (gsl_vector_get (x_miss, i)==0) {
+				gsl_vector_set(x, i, x_mean-x_train_mean);
+			} else {
+				gsl_vector_set(x, i, geno-x_train_mean);
+			}
+		}
+
+		gsl_vector_scale (x, effect_size);
+		gsl_vector_add (y_prdt, x);
+		ns_test++;
+	}	
+	cout<<endl;
+	if (flag_short) {cout<<"error! genotype file has a line with too few fields: "<<file_geno<<endl;}
+	
+	gsl_vector_free (x);
+	gsl_vector_free (x_miss);
+	infile.close();
+	infile.clear();
+	return;
+}
+
+
+
+
+
+
+
+void PRDT::AnalyzePlink (gsl_vector *y_prdt) 	//for every SNP of the PLINK bed file with an estimate in mapRS2est, add effect_size*(genotype - training mean) to y_prdt
+{
+	string file_bed=file_bfile+".bed";
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+	
+	char ch[1];
+	bitset<8> b;	
+	string rs;
+	
+	size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
+	double geno, x_mean, x_train_mean, effect_size;
+	
+	gsl_vector *x=gsl_vector_alloc (y_prdt->size);
+	
+	//calculate n_bit, the number of bytes stored per snp (four individuals per byte, rounded up)
+	if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;}
+	else {n_bit=indicator_idv.size()/4+1; }
+	
+	//read the three leading magic-number bytes; their values are not used
+	for (size_t i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}	
+	
+	ns_test=0;
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		//if (indicator_snp[t]==0) {continue;}
+		
+		rs=snpInfo[t].rs_number;
+		
+		if (mapRS2est.count(rs)==0) {continue;} else {effect_size=mapRS2est[rs];}
+		
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+		
+		//read genotypes: individuals with indicator_idv==1 only feed the training mean, all others are stored in x
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; x_train_mean=0; n_train_nomiss=0;
+		for (size_t i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && ci_total==indicator_idv.size() ) {break;}
+				if (indicator_idv[ci_total]==1) {
+					if (b[2*j]==0) {
+						if (b[2*j+1]==0) {x_train_mean+=2.0; n_train_nomiss++;}
+						else {x_train_mean+=1.0; n_train_nomiss++;}
+					}
+					else {
+						if (b[2*j+1]==1) {n_train_nomiss++;}                                  
+						else {}
+					}
+				} else {
+					if (b[2*j]==0) {
+						if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
+						else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
+					}
+					else {
+						if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
+						else {gsl_vector_set(x, ci_test, -9); n_miss++; }
+					}
+					ci_test++;
+				}
+				ci_total++;
+				
+			}
+		}
+		
+		if (x->size==n_miss) {cout<<"snp "<<rs<<" has missing genotype for all individuals and will be ignored."<<endl; continue;}
+		//NOTE(review): x_train_mean becomes NaN if n_train_nomiss is 0 -- confirm this cannot happen
+		x_mean/=(double)(x->size-n_miss);
+		x_train_mean/=(double)(n_train_nomiss);
+		//missing genotypes (marked -9 above) are replaced by x_mean before centering on the training mean
+		for (size_t i=0; i<x->size; ++i) {
+			geno=gsl_vector_get(x, i);
+			if (geno==-9) {
+				gsl_vector_set(x, i, x_mean-x_train_mean);
+			} else {
+				gsl_vector_set(x, i, geno-x_train_mean);
+			}
+		}
+		
+		gsl_vector_scale (x, effect_size);
+		gsl_vector_add (y_prdt, x);
+		
+		ns_test++;
+	}	
+	cout<<endl;
+	
+	gsl_vector_free (x);
+	
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+
+
+//predict missing phenotypes: y_miss = y_hat_miss + H_mo*Hiy, where Hiy is obtained from LUSolve on H_oo and (y_obs - y_hat_obs)
+//Y_hat contains fixed effects; Y_hat and Y_full hold one row per individual with indicator_cvt!=0 (the layout WriteFiles reads)
+void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, gsl_matrix *Y_full) 
+{	
+	gsl_vector *y_obs=gsl_vector_alloc (np_obs);
+	gsl_vector *y_miss=gsl_vector_alloc (np_miss);
+	gsl_matrix *H_oo=gsl_matrix_alloc (np_obs, np_obs);
+	gsl_matrix *H_mo=gsl_matrix_alloc (np_miss, np_obs);
+	gsl_vector *Hiy=gsl_vector_alloc (np_obs);
+	
+	size_t c_obs1=0, c_obs2=0, c_miss1=0, c_miss2=0, c_row=0;
+	
+	//obtain H_oo, H_mo
+	c_obs1=0; c_miss1=0; 
+	for (vector<int>::size_type i1=0; i1<indicator_pheno.size(); ++i1) {
+		if (indicator_cvt[i1]==0) {continue;}
+		for (vector<int>::size_type j1=0; j1<n_ph; ++j1) {
+			
+			c_obs2=0; c_miss2=0;
+			for (vector<int>::size_type i2=0; i2<indicator_pheno.size(); ++i2) {
+				if (indicator_cvt[i2]==0) {continue;}
+				for (vector<int>::size_type j2=0; j2<n_ph; j2++) {
+					
+					if (indicator_pheno[i2][j2]==1) {
+						if (indicator_pheno[i1][j1]==1) {
+							gsl_matrix_set (H_oo, c_obs1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) );
+						} else {
+							gsl_matrix_set (H_mo, c_miss1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) );
+						}
+						c_obs2++;
+					} else {
+						c_miss2++;
+					}
+				}				
+			}
+			
+			if (indicator_pheno[i1][j1]==1) {
+				c_obs1++;
+			} else {
+				c_miss1++;
+			}
+		}
+		
+	}	
+	
+	//do LU decomposition of H_oo
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (np_obs);
+	LUDecomp (H_oo, pmt, &sig);
+	
+//	if (mode_temp==0) {
+		//obtain y_obs=y_full-y_hat
+		//add the fixed effects part to y_miss: y_miss=y_hat
+		c_obs1=0; c_miss1=0; c_row=0;
+		for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+			if (indicator_cvt[i]==0) {continue;}
+			//c_row, not i, addresses Y_full and Y_hat: skipped individuals have no row in them
+			for (vector<int>::size_type j=0; j<n_ph; ++j) {
+				if (indicator_pheno[i][j]==1) {
+					gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, c_row, j)-gsl_matrix_get (Y_hat, c_row, j) );
+					c_obs1++;
+				} else {
+					gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, c_row, j) );
+					c_miss1++;
+				}
+			}
+			c_row++;
+		}	
+		LUSolve (H_oo, pmt, y_obs, Hiy);
+		
+		gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
+		
+		//put back predicted y_miss to Y_full
+		c_miss1=0; c_row=0;
+		for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+			if (indicator_cvt[i]==0) {continue;}
+			for (vector<int>::size_type j=0; j<n_ph; ++j) {
+				if (indicator_pheno[i][j]==0) {
+					gsl_matrix_set (Y_full, c_row, j, gsl_vector_get (y_miss, c_miss1) );
+					c_miss1++;
+				}
+			}
+			c_row++;
+		}
+/*
+	} else {
+		for (size_t k=0; k<mode_temp; k++) {
+			c_obs1=0; c_miss1=0;
+			for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+				if (indicator_cvt[i]==0) {continue;}
+				
+				for (vector<int>::size_type j=0; j<2; ++j) {
+					if (indicator_pheno[i][j]==1) {
+						gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j+k*2)-gsl_matrix_get (Y_hat, i, j) );
+						c_obs1++;
+					} else {
+						gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, i, j) );
+						c_miss1++;
+					}
+				}
+			}	
+			
+			LUSolve (H_oo, pmt, y_obs, Hiy);
+			
+			gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
+			
+			//put back predicted y_miss to Y_full
+			c_miss1=0;
+			for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+				if (indicator_cvt[i]==0) {continue;}
+				
+				for (vector<int>::size_type j=0; j<2; ++j) {
+					if (indicator_pheno[i][j]==0) {
+						gsl_matrix_set (Y_full, i, j+k*2, gsl_vector_get (y_miss, c_miss1) );
+						c_miss1++;
+					}
+				}
+			}
+		}
+	}
+*/
+	//free matrices
+	gsl_vector_free(y_obs);
+	gsl_vector_free(y_miss);
+	gsl_matrix_free(H_oo);
+	gsl_matrix_free(H_mo);
+	gsl_vector_free(Hiy);
+	gsl_permutation_free(pmt);
+	return;
+}
+
+
diff --git a/prdt.h b/prdt.h
new file mode 100644
index 0000000..69043df
--- /dev/null
+++ b/prdt.h
@@ -0,0 +1,80 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __PRDT_H__                
+#define __PRDT_H__
+
+
+#include <vector>
+#include <map>
+#include <string.h>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#else
+#include "param.h"
+#endif
+
+using namespace std;
+
+class PRDT {
+	// Phenotype-prediction state and routines; MvnormPrdt writes predicted values for missing phenotypes into Y_full.
+public:
+	// IO related parameters
+	size_t a_mode;
+	size_t d_pace;
+	
+	string file_bfile;
+	string file_geno;
+	string file_out;
+	// Indicators: MvnormPrdt skips rows with indicator_cvt[i]==0 and treats indicator_pheno[i][j]==1 as observed, 0 as missing.
+	vector<vector<int> > indicator_pheno;
+	vector<int> indicator_cvt;
+	vector<int> indicator_idv;
+	vector<SNPINFO> snpInfo;
+	map<string, double> mapRS2est;
+	// Sizes: n_ph bounds the per-row phenotype loop in MvnormPrdt.
+	size_t n_ph;
+	size_t np_obs, np_miss;	// presumably counts of observed / missing phenotype entries -- TODO confirm
+	size_t ns_total;
+	size_t ns_test;
+	
+	double time_eigen;
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void WriteFiles (gsl_vector *y_prdt);
+	void WriteFiles (gsl_matrix *Y_full);
+	void AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt);
+	void AnalyzeBimbam (gsl_vector *y_prdt);
+	void AnalyzePlink (gsl_vector *y_prdt);
+	void MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, gsl_matrix *Y_full);	// overwrites the missing entries of Y_full
+};
+
+
+#endif
+
+
+
+
+
+
+
diff --git a/vc.cpp b/vc.cpp
new file mode 100644
index 0000000..77cf746
--- /dev/null
+++ b/vc.cpp
@@ -0,0 +1,443 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <bitset>
+#include <cstring>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_multiroots.h"
+#include "gsl/gsl_min.h"
+
+#include "io.h"
+#include "lapack.h"
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "lmm_float.h"
+#include "vc_float.h"
+#else
+#include "lmm.h"
+#include "vc.h"
+#endif
+
+
+
+using namespace std;
+
+
+//in this file, X, Y are already transformed (i.e. UtX and UtY)
+
+
+//Take file_out and v_traceG from cPar and start both timers at zero.
+//v_sigma2 is deliberately not copied (assignment left disabled below).
+void VC::CopyFromParam (PARAM &cPar)
+{
+	//values read from the parameter object
+	this->file_out = cPar.file_out;
+	this->v_traceG = cPar.v_traceG;
+	//	v_sigma2=cPar.v_sigma2;
+
+	//timing fields
+	this->time_opt = 0.0;
+	this->time_UtX = 0.0;
+}
+
+
+//Write this object's results back into cPar: the sigma2/pve/beta vectors
+//with their standard errors, v_traceG, and the two timing fields.
+void VC::CopyToParam (PARAM &cPar)
+{
+	//estimates and their standard errors
+	cPar.v_sigma2    = this->v_sigma2;
+	cPar.v_se_sigma2 = this->v_se_sigma2;
+	cPar.v_pve       = this->v_pve;
+	cPar.v_se_pve    = this->v_se_pve;
+	cPar.v_beta      = this->v_beta;
+	cPar.v_se_beta   = this->v_se_beta;
+	cPar.v_traceG    = this->v_traceG;
+	//timing fields
+	cPar.time_opt = this->time_opt;
+	cPar.time_UtX = this->time_UtX;
+}
+
+
+
+//Recompute the cached quantities in *p for the variance components exp(log_sigma2):
+//p->P, p->Py and every column of p->KPy_mat and p->PKPy_mat are overwritten.
+//The last entry of log_sigma2 belongs to the identity term.
+void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p)
+{
+  const size_t n_vc=log_sigma2->size-1;
+  const size_t n1=(p->K)->size1;
+  const size_t n_cvt=(p->W)->size2;
+
+  gsl_matrix *scratch=gsl_matrix_alloc(n1, n1);
+  gsl_matrix *HiW=gsl_matrix_alloc(n1, n_cvt);
+  gsl_matrix *WtHiW=gsl_matrix_alloc(n_cvt, n_cvt);
+  gsl_matrix *WtHiW_inv=gsl_matrix_alloc(n_cvt, n_cvt);
+  gsl_matrix *proj=gsl_matrix_alloc(n_cvt, n1);
+
+  //accumulate H=\sum_k \sigma_k^2 K_k in p->P; the final term uses the identity
+  gsl_matrix_set_zero (p->P);
+  for (size_t k=0; k<n_vc+1; ++k) {
+    if (k<n_vc) {
+      gsl_matrix_const_view K_k=gsl_matrix_const_submatrix (p->K, 0, n1*k, n1, n1);
+      gsl_matrix_memcpy (scratch, &K_k.matrix);
+    } else {
+      gsl_matrix_set_identity (scratch);
+    }
+    const double sigma2_k=exp(gsl_vector_get (log_sigma2, k) );
+    gsl_matrix_scale (scratch, sigma2_k);
+    gsl_matrix_add (p->P, scratch);
+  }
+
+  //H^{-1} by LU: the inverse is written to scratch, then copied back into p->P
+  int signum;
+  gsl_permutation *perm_H=gsl_permutation_alloc (n1);
+  LUDecomp (p->P, perm_H, &signum);
+  LUInvert (p->P, perm_H, scratch);
+  gsl_permutation_free (perm_H);
+  gsl_matrix_memcpy (p->P, scratch);
+
+  //P=H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1}
+  gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW);
+  gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW);
+
+  gsl_permutation *perm_W=gsl_permutation_alloc (n_cvt);
+  LUDecomp (WtHiW, perm_W, &signum);
+  LUInvert (WtHiW, perm_W, WtHiW_inv);
+  gsl_permutation_free (perm_W);
+
+  gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiW_inv, HiW, 0.0, proj);
+  gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, proj, 1.0, p->P);
+
+  //Py, then column k of KPy_mat (K_k*Py) and of PKPy_mat (P*K_k*Py)
+  gsl_blas_dgemv (CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py);
+  for (size_t k=0; k<n_vc+1; ++k) {
+    gsl_vector_view KPy_k=gsl_matrix_column (p->KPy_mat, k);
+    gsl_vector_view PKPy_k=gsl_matrix_column (p->PKPy_mat, k);
+
+    if (k<n_vc) {
+      gsl_matrix_const_view K_k=gsl_matrix_const_submatrix (p->K, 0, n1*k, n1, n1);
+      gsl_blas_dgemv (CblasNoTrans, 1.0, &K_k.matrix, p->Py, 0.0, &KPy_k.vector);
+    } else {
+      //identity term: K_k*Py is Py itself
+      gsl_vector_memcpy (&KPy_k.vector, p->Py);
+    }
+
+    gsl_blas_dgemv (CblasNoTrans, 1.0, p->P, &KPy_k.vector, 0.0, &PKPy_k.vector);
+  }
+
+  gsl_matrix_free (proj);
+  gsl_matrix_free (WtHiW_inv);
+  gsl_matrix_free (WtHiW);
+  gsl_matrix_free (HiW);
+  gsl_matrix_free (scratch);
+}
+
+
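+//below are the callbacks for the AI (average information) algorithm: first and second derivatives of the log restricted likelihood w.r.t. log(sigma2)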
+//First-derivative callback: dev1[k]=sigma2_k*(-0.5*trace(P K_k)+0.5*(Py)'(K_k Py)),
+//where K_k is the identity for the last index. Calls UpdateParam on params first.
+int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1)
+{
+  VC_PARAM *vc=static_cast<VC_PARAM *>(params);
+  const size_t n_vc=log_sigma2->size-1;
+  const size_t n1=(vc->K)->size1;
+
+  //bring P, Py and KPy_mat in line with the current log_sigma2
+  UpdateParam (log_sigma2, vc);
+
+  for (size_t k=0; k<n_vc+1; ++k) {
+    //trace(P K_k): diagonal sum of P for the identity term, otherwise
+    //the sum over r of (row r of P).(column r of K_k)
+    double trace_PK=0;
+    for (size_t r=0; r<n1; ++r) {
+      if (k==n_vc) {
+        trace_PK+=gsl_matrix_get (vc->P, r, r);
+        continue;
+      }
+      double dot;
+      gsl_vector_view P_r=gsl_matrix_row (vc->P, r);
+      gsl_vector_const_view K_r=gsl_matrix_const_column (vc->K, n1*k+r);
+      gsl_blas_ddot (&P_r.vector, &K_r.vector, &dot);
+      trace_PK+=dot;
+    }
+
+    //quadratic form (Py)'(K_k Py), using the cached column of KPy_mat
+    double quad;
+    gsl_vector_view KPy_k=gsl_matrix_column (vc->KPy_mat, k);
+    gsl_blas_ddot (vc->Py, &KPy_k.vector, &quad);
+
+    const double sigma2_k=exp(gsl_vector_get (log_sigma2, k));
+    gsl_vector_set (dev1, k, (-0.5*trace_PK+0.5*quad)*sigma2_k);
+  }
+
+  return GSL_SUCCESS;
+}
+
+
+
+//Second-derivative callback: dev2(r,c)=-0.5*sigma2_r*sigma2_c*(K_r Py)'(P K_c Py),
+//filled symmetrically; the result is also copied into the Hessian member of params.
+int LogRL_dev2 (const gsl_vector *log_sigma2, void *params, gsl_matrix *dev2)
+{
+  VC_PARAM *vc=static_cast<VC_PARAM *>(params);
+  const size_t n_vc=log_sigma2->size-1;
+
+  //bring KPy_mat and PKPy_mat in line with the current log_sigma2
+  UpdateParam (log_sigma2, vc);
+
+  for (size_t r=0; r<n_vc+1; ++r) {
+    const double sigma2_r=exp(gsl_vector_get (log_sigma2, r));
+    gsl_vector_view KPy_r=gsl_matrix_column (vc->KPy_mat, r);
+
+    //upper triangle (c>=r); each off-diagonal value is written to both halves
+    for (size_t c=r; c<n_vc+1; ++c) {
+      const double sigma2_c=exp(gsl_vector_get (log_sigma2, c));
+      gsl_vector_view PKPy_c=gsl_matrix_column (vc->PKPy_mat, c);
+
+      double entry;
+      gsl_blas_ddot (&KPy_r.vector, &PKPy_c.vector, &entry);
+      entry*=-0.5*sigma2_r*sigma2_c;
+
+      gsl_matrix_set (dev2, r, c, entry);
+      if (c>r) {
+        gsl_matrix_set (dev2, c, r, entry);
+      }
+    }
+  }
+
+  gsl_matrix_memcpy (vc->Hessian, dev2);
+  return GSL_SUCCESS;
+}
+
+
+
+//Combined first/second-derivative callback sharing a single UpdateParam call:
+//  dev1[r]  =sigma2_r*(-0.5*trace(P K_r)+0.5*(Py)'(K_r Py))
+//  dev2(r,c)=-0.5*sigma2_r*sigma2_c*(K_r Py)'(P K_c Py), also copied to params->Hessian
+int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, gsl_matrix *dev2)
+{
+  VC_PARAM *vc=static_cast<VC_PARAM *>(params);
+  const size_t n_vc=log_sigma2->size-1;
+  const size_t n1=(vc->K)->size1;
+
+  UpdateParam (log_sigma2, vc);
+
+  for (size_t r=0; r<n_vc+1; ++r) {
+    //trace(P K_r); K_r is the identity when r==n_vc
+    double trace_PK=0;
+    for (size_t l=0; l<n1; ++l) {
+      if (r==n_vc) {
+        trace_PK+=gsl_matrix_get (vc->P, l, l);
+        continue;
+      }
+      double dot;
+      gsl_vector_view P_l=gsl_matrix_row (vc->P, l);
+      gsl_vector_const_view K_l=gsl_matrix_const_column (vc->K, n1*r+l);
+      gsl_blas_ddot (&P_l.vector, &K_l.vector, &dot);
+      trace_PK+=dot;
+    }
+
+    //first derivative for component r
+    double quad;
+    gsl_vector_view KPy_r=gsl_matrix_column (vc->KPy_mat, r);
+    gsl_blas_ddot (vc->Py, &KPy_r.vector, &quad);
+
+    const double sigma2_r=exp(gsl_vector_get (log_sigma2, r));
+    gsl_vector_set (dev1, r, (-0.5*trace_PK+0.5*quad)*sigma2_r);
+
+    //row r of the symmetric second-derivative matrix, columns c>=r
+    for (size_t c=r; c<n_vc+1; ++c) {
+      const double sigma2_c=exp(gsl_vector_get (log_sigma2, c));
+      gsl_vector_view PKPy_c=gsl_matrix_column (vc->PKPy_mat, c);
+
+      double entry;
+      gsl_blas_ddot (&KPy_r.vector, &PKPy_c.vector, &entry);
+      entry*=-0.5*sigma2_r*sigma2_c;
+
+      gsl_matrix_set (dev2, r, c, entry);
+      if (c>r) {
+        gsl_matrix_set (dev2, c, r, entry);
+      }
+    }
+  }
+
+  gsl_matrix_memcpy (vc->Hessian, dev2);
+
+  return GSL_SUCCESS;
+}
+
+
+
+
+//Estimate variance components by REML: root-find the first derivatives of the
+//log restricted likelihood over log(sigma2) with gsl_multiroot (hybridsj).
+//K: n1 x (n1*n_vc) matrix holding the n_vc kernels side by side; W: covariates;
+//y: phenotypes. Fills v_sigma2, v_se_sigma2, v_pve, v_se_pve; logs iterations to cout.
+void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y)
+{
+  size_t n1=K->size1, n2=K->size2;
+  size_t n_vc=n2/n1;
+  gsl_vector *log_sigma2=gsl_vector_alloc (n_vc+1);
+  double d, s;
+
+  //set up params: work space handed to the LogRL_* callbacks
+  gsl_matrix *P=gsl_matrix_alloc (n1, n1);
+  gsl_vector *Py=gsl_vector_alloc (n1);
+  gsl_matrix *KPy_mat=gsl_matrix_alloc (n1, n_vc+1);
+  gsl_matrix *PKPy_mat=gsl_matrix_alloc (n1, n_vc+1);
+  gsl_vector *dev1=gsl_vector_alloc (n_vc+1);
+  gsl_matrix *dev2=gsl_matrix_alloc (n_vc+1, n_vc+1);
+  gsl_matrix *Hessian=gsl_matrix_alloc (n_vc+1, n_vc+1);
+  VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian};
+
+  //initial guess: split mean(y^2) evenly over the n_vc+1 components, with the
+  //kernel components additionally divided by v_traceG[i]
+  gsl_blas_ddot (y, y, &s);
+  s/=(double)n1;
+  for (size_t i=0; i<n_vc+1; i++) {
+    if (i==n_vc) {
+      d=s/((double)n_vc+1.0);
+    } else {
+      d=s/( ((double)n_vc+1.0)*v_traceG[i]);
+    }
+    //the solver iterates on the log scale, so store log(d) rather than d
+    gsl_vector_set (log_sigma2, i, log(d));
+  }
+
+  cout<<"iteration "<<0<<endl;
+  cout<<"sigma2 = ";
+  for (size_t i=0; i<n_vc+1; i++) {
+    cout<<exp(gsl_vector_get(log_sigma2, i))<<" ";
+  }
+  cout<<endl;
+
+  //set up fdf
+  gsl_multiroot_function_fdf FDF;
+  FDF.n=n_vc+1;
+  FDF.params=&params;
+  FDF.f=&LogRL_dev1;
+  FDF.df=&LogRL_dev2;
+  FDF.fdf=&LogRL_dev12;
+
+  //set up solver
+  int status, iter=0, max_iter=100;
+  gsl_multiroot_fdfsolver *s_fdf=gsl_multiroot_fdfsolver_alloc (gsl_multiroot_fdfsolver_hybridsj, n_vc+1);
+  gsl_multiroot_fdfsolver_set (s_fdf, &FDF, log_sigma2);
+
+  do {
+    iter++;
+    status=gsl_multiroot_fdfsolver_iterate (s_fdf);
+    if (status) break;
+
+    cout<<"iteration "<<iter<<endl;
+    cout<<"sigma2 = ";
+    for (size_t i=0; i<n_vc+1; i++) {
+      cout<<exp(gsl_vector_get(s_fdf->x, i))<<" ";
+    }
+    cout<<endl;
+    cout<<"derivatives = ";
+    for (size_t i=0; i<n_vc+1; i++) {
+      cout<<gsl_vector_get(s_fdf->f, i)<<" ";
+    }
+    cout<<endl;
+
+    status=gsl_multiroot_test_residual (s_fdf->f, 1e-3);
+  }
+  while (status==GSL_CONTINUE && iter<max_iter);
+
+  if (status!=GSL_SUCCESS) {
+    cerr<<"warning: REML solver stopped without converging: "<<gsl_strerror(status)<<endl;
+  }
+
+  //obtain Hessian inverse: second derivatives are evaluated at the estimate
+  //s_fdf->x (s_fdf->f holds the first derivatives, not the parameters)
+  LogRL_dev12 (s_fdf->x, &params, dev1, dev2);
+  int sig;
+  gsl_permutation * pmt=gsl_permutation_alloc (n_vc+1);
+  LUDecomp (dev2, pmt, &sig);
+  LUInvert (dev2, pmt, Hessian);
+  gsl_permutation_free(pmt);
+
+  //save data
+  v_sigma2.clear();
+  for (size_t i=0; i<n_vc+1; i++) {
+    d=exp(gsl_vector_get(s_fdf->x, i));
+    v_sigma2.push_back(d);
+  }
+
+  v_se_sigma2.clear();
+  for (size_t i=0; i<n_vc+1; i++) {
+    d=-1.0*v_sigma2[i]*v_sigma2[i]*gsl_matrix_get(Hessian, i, i);
+    v_se_sigma2.push_back(sqrt(d));
+  }
+
+  s=0;
+  for (size_t i=0; i<n_vc; i++) {
+    s+=v_traceG[i]*v_sigma2[i];
+  }
+  s+=v_sigma2[n_vc];
+
+  v_pve.clear();
+  for (size_t i=0; i<n_vc; i++) {
+    d=v_traceG[i]*v_sigma2[i]/s;
+    v_pve.push_back(d);
+  }
+
+  //NOTE(review): d is (d pve_i/d sigma2_i)*se^2, so sqrt(d) is not the usual
+  //delta-method |d pve_i/d sigma2_i|*se; formula kept as uploaded -- confirm
+  v_se_pve.clear();
+  for (size_t i=0; i<n_vc; i++) {
+    d=v_traceG[i]*(s-v_sigma2[i]*v_traceG[i])/(s*s)*v_se_sigma2[i]*v_se_sigma2[i];
+    v_se_pve.push_back(sqrt(d) );
+  }
+
+  gsl_multiroot_fdfsolver_free(s_fdf);
+  gsl_vector_free(log_sigma2);
+  gsl_matrix_free(P);
+  gsl_vector_free(Py);
+  gsl_matrix_free(KPy_mat);
+  gsl_matrix_free(PKPy_mat);
+  gsl_vector_free(dev1);
+  gsl_matrix_free(dev2);
+  gsl_matrix_free(Hessian);
+}
+
+
+	
+
+
+
diff --git a/vc.h b/vc.h
new file mode 100644
index 0000000..aed5247
--- /dev/null
+++ b/vc.h
@@ -0,0 +1,80 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __VC_H__                
+#define __VC_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+
+class VC_PARAM
+{
+	// Passed as the void* params of the LogRL_* callbacks. Keep the member order: VC::CalcVCreml brace-initializes this class positionally.
+public:	
+	const gsl_matrix *K;	// kernels side by side; UpdateParam reads the i-th as the n1 x n1 submatrix starting at column n1*i
+	const gsl_matrix *W;	// covariate matrix; UpdateParam takes n_cvt from its column count
+	const gsl_vector *y;	// response vector multiplied by P to form Py
+	gsl_matrix *P;	// work space: overwritten by UpdateParam, ends up holding P=H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1}
+	gsl_vector *Py;	// P*y, refreshed by UpdateParam
+	gsl_matrix *KPy_mat;	// column i = K_i*Py (last column: Py itself)
+	gsl_matrix *PKPy_mat;	// column i = P*(column i of KPy_mat)
+	gsl_matrix *Hessian;	// receives a copy of dev2 in LogRL_dev2 / LogRL_dev12
+};
+
+
+
+
+class VC {
+	// Variance-component estimation; inputs come from PARAM via CopyFromParam, results go back via CopyToParam.
+public:
+	// IO related parameters
+	string file_out;
+	// Estimates filled by CalcVCreml, except v_traceG (copied in from PARAM) and v_beta/v_se_beta
+	vector<double> v_sigma2;	// n_vc+1 entries; the last belongs to the identity term
+	vector<double> v_se_sigma2;	// n_vc+1 entries, derived from the diagonal of the inverted dev2 matrix
+	vector<double> v_pve;	// n_vc entries: v_traceG[i]*v_sigma2[i] divided by the total s
+	vector<double> v_se_pve;	// n_vc entries
+	vector<double> v_traceG;	// one value per kernel; used to scale the initial guess and the pve
+	vector<double> v_beta;	// not assigned in the visible vc.cpp code; only copied out by CopyToParam
+	vector<double> v_se_beta;	// same remark as v_beta
+	// Timing fields, zeroed in CopyFromParam
+	double time_UtX;
+	double time_opt;
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y);
+};
+
+#endif
+
+
-- 
cgit 1.4.1