version 0.95alpha

author: xiangzhou 2014-09-22 11:06:02 -0400
committer: xiangzhou 2014-09-22 11:06:02 -0400
commit: 7762722f264adc402ea3b0f21923b18f072253ba (patch)
tree: 879ed22943d424b52bd04b4ee6fbdf51616dc9a9
parent: 44faf98d2c6fe56c916cace02fe498fc1271bd9d (diff)
download: pangemma-7762722f264adc402ea3b0f21923b18f072253ba.tar.gz
26 files changed, 15453 insertions, 0 deletions
diff --git a/bin/gemma b/bin/gemma
new file mode 100755
index 0000000..6734240
--- /dev/null
+++ b/bin/gemma
Binary files differ
diff --git a/src/bslmm.cpp b/src/bslmm.cpp
new file mode 100644
index 0000000..55a05ca
--- /dev/null
+++ b/src/bslmm.cpp
@@ -0,0 +1,1928 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <ctime>
+#include <cstring>
+#include <algorithm>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_eigen.h"
+#include "gsl/gsl_randist.h"
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_roots.h"
+
+
+
+
+#include "lapack.h"
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "bslmm_float.h"
+#include "lmm_float.h"  //for class FUNC_PARAM and MatrixCalcLR
+#include "lm_float.h"
+#include "mathfunc_float.h"  //for function CenterVector
+#else
+#include "param.h"
+#include "bslmm.h"
+#include "lmm.h"
+#include "lm.h"
+#include "mathfunc.h"
+#endif
+
+using namespace std;
+
+
+
+
+// Copies the settings this class needs out of cPar and resets the
+// per-run timers and the acceptance counter.
+void BSLMM::CopyFromParam (PARAM &cPar) 
+{
+	// Analysis mode and progress-report interval.
+	a_mode=cPar.a_mode;
+	d_pace=cPar.d_pace;
+
+	// Input files and output location.
+	file_bfile=cPar.file_bfile;
+	file_geno=cPar.file_geno;
+	file_out=cPar.file_out;
+	path_out=cPar.path_out;
+
+	// Sample/SNP counts, inclusion indicators and SNP annotation.
+	ni_total=cPar.ni_total;
+	ns_total=cPar.ns_total;
+	ni_test=cPar.ni_test;
+	ns_test=cPar.ns_test;
+	n_cvt=cPar.n_cvt;
+	indicator_idv=cPar.indicator_idv;
+	indicator_snp=cPar.indicator_snp;
+	snpInfo=cPar.snpInfo;
+
+	// Range and grid passed to MatrixCalcLR.
+	// NOTE(review): l_min/l_max are filled from cPar.h_min/cPar.h_max here;
+	// confirm this is intended rather than dedicated lambda bounds.
+	l_min=cPar.h_min;
+	l_max=cPar.h_max;
+	n_region=cPar.n_region;
+	pve_null=cPar.pve_null;
+	pheno_mean=cPar.pheno_mean;
+
+	// Bounds and step scales for the h, rho and log(pi) proposals.
+	h_min=cPar.h_min;
+	h_max=cPar.h_max;
+	h_scale=cPar.h_scale;
+	rho_min=cPar.rho_min;
+	rho_max=cPar.rho_max;
+	rho_scale=cPar.rho_scale;
+	logp_min=cPar.logp_min;
+	logp_max=cPar.logp_max;
+	logp_scale=cPar.logp_scale;
+
+	// Chain settings: bounds on the number of selected SNPs, step counts,
+	// recording/writing intervals, proposals per iteration, proposal shape,
+	// random seed and the trace_G scaling constant.
+	s_min=cPar.s_min;
+	s_max=cPar.s_max;
+	w_step=cPar.w_step;
+	s_step=cPar.s_step;
+	r_pace=cPar.r_pace;
+	w_pace=cPar.w_pace;
+	n_mh=cPar.n_mh;
+	geo_mean=cPar.geo_mean;
+	randseed=cPar.randseed;
+	trace_G=cPar.trace_G;
+
+	// Fresh run: zero the accumulated timers and the acceptance counter.
+	time_UtZ=0.0;
+	time_Omega=0.0;
+	n_accept=0;
+}
+
+
+// Reports run results back to cPar: timers, the initial hyper-parameter
+// state, the acceptance count, the phenotype mean and the seed in use.
+void BSLMM::CopyToParam (PARAM &cPar) 
+{
+	// Timing totals.
+	cPar.time_UtZ=time_UtZ;
+	cPar.time_Omega=time_Omega;
+	cPar.time_Proposal=time_Proposal;
+
+	// Sampler state and summary values.
+	cPar.cHyp_initial=cHyp_initial;
+	cPar.n_accept=n_accept;
+	cPar.pheno_mean=pheno_mean;
+	cPar.randseed=randseed;
+}
+
+
+
+// Writes <path_out>/<file_out>.bv.txt with one line per individual:
+// "NA" where indicator_idv is 0, otherwise the next entry of bv.
+void BSLMM::WriteBV (const gsl_vector *bv) 
+{
+	const string file_str=path_out+"/"+file_out+".bv.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+
+	// Number formatting only affects the numeric rows, so it is set once.
+	outfile<<scientific<<setprecision(6);
+
+	// bv holds values for the included individuals only; i_bv walks through it.
+	size_t i_bv=0;
+	for (size_t i=0; i<ni_total; ++i) {
+		if (indicator_idv[i]!=0) {
+			outfile<<gsl_vector_get(bv, i_bv)<<endl;
+			++i_bv;
+		}
+		else {
+			outfile<<"NA"<<endl;
+		}
+	}
+
+	outfile.clear();
+	outfile.close();
+}
+
+
+
+
+// Writes <path_out>/<file_out>.param.txt: one row per SNP whose
+// indicator_snp entry is non-zero.
+//   alpha  : value of alpha for that SNP
+//   beta   : beta_g[t].first/beta_g[t].second (0 when .second is 0)
+//   gamma  : beta_g[t].second/w (0 when .second is 0)
+// beta_g and alpha are indexed by position among the included SNPs; w is
+// the divisor applied to beta_g[t].second.
+void BSLMM::WriteParam (vector<pair<double, double> > &beta_g, const gsl_vector *alpha, const size_t w) 
+{
+	string file_str;
+	file_str=path_out+"/"+file_out;
+	file_str+=".param.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+
+	outfile<<"chr"<<"\t"<<"rs"<<"\t"
+			<<"ps"<<"\t"<<"n_miss"<<"\t"<<"alpha"<<"\t"
+			<<"beta"<<"\t"<<"gamma"<<"\n";
+
+	// The header and SNP annotation columns are not floating point, so the
+	// number format can be fixed once instead of on every row.
+	outfile<<scientific<<setprecision(6);
+
+	// Rows end with "\n" rather than endl: endl would flush the stream once
+	// per SNP. close() below flushes everything.
+	size_t t=0;
+	for (size_t i=0; i<ns_total; ++i) {
+		if (indicator_snp[i]==0) {continue;}
+
+		outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"
+		<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t";
+
+		outfile<<gsl_vector_get(alpha, t)<<"\t";
+		if (beta_g[t].second!=0) {
+			outfile<<beta_g[t].first/beta_g[t].second<<"\t"<<beta_g[t].second/(double)w<<"\n";
+		}
+		else {
+			outfile<<0.0<<"\t"<<0.0<<"\n";
+		}
+		t++;
+	}
+
+	outfile.clear();
+	outfile.close();
+	return;
+}
+
+
+// Writes <path_out>/<file_out>.param.txt for a fit that has only alpha:
+// one row per SNP whose indicator_snp entry is non-zero, with the beta and
+// gamma columns written as 0. alpha is indexed by position among the
+// included SNPs.
+void BSLMM::WriteParam (const gsl_vector *alpha) 
+{
+	string file_str;
+	file_str=path_out+"/"+file_out;
+	file_str+=".param.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+
+	outfile<<"chr"<<"\t"<<"rs"<<"\t"
+			<<"ps"<<"\t"<<"n_miss"<<"\t"<<"alpha"<<"\t"
+			<<"beta"<<"\t"<<"gamma"<<"\n";
+
+	// Only the alpha/beta/gamma columns are floating point, so the number
+	// format can be fixed once instead of on every row.
+	outfile<<scientific<<setprecision(6);
+
+	// Rows end with "\n" rather than endl: endl would flush the stream once
+	// per SNP. close() below flushes everything.
+	size_t t=0;
+	for (size_t i=0; i<ns_total; ++i) {
+		if (indicator_snp[i]==0) {continue;}
+
+		outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"
+				<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t";
+		outfile<<gsl_vector_get(alpha, t)<<"\t";
+		outfile<<0.0<<"\t"<<0.0<<"\n";
+		t++;
+	}
+
+	outfile.clear();
+	outfile.close();
+	return;
+}
+
+
+// Writes sampler output to <path_out>/<file_out>.hyp.txt and .gamma.txt.
+//   flag==0 : (re)creates both files and writes only their header lines.
+//   flag!=0 : appends rows from Result_hyp/Result_gamma; the number of rows
+//             is w_col, or w_pace when w_col is 0.
+void BSLMM::WriteResult (const int flag, const gsl_matrix *Result_hyp, const gsl_matrix *Result_gamma, const size_t w_col) 
+{
+	const string file_prefix=path_out+"/"+file_out;
+	const string file_gamma=file_prefix+".gamma.txt";
+	const string file_hyp=file_prefix+".hyp.txt";
+
+	const bool header_only=(flag==0);
+	ofstream::openmode mode=ofstream::app;
+	if (header_only) {mode=ofstream::out;}
+
+	ofstream outfile_gamma, outfile_hyp;
+	outfile_gamma.open (file_gamma.c_str(), mode);
+	outfile_hyp.open (file_hyp.c_str(), mode);
+	if (!outfile_gamma) {cout<<"error writing file: "<<file_gamma<<endl; return;}
+	if (!outfile_hyp) {cout<<"error writing file: "<<file_hyp<<endl; return;}
+
+	if (header_only) {
+		outfile_hyp<<"h \t pve \t rho \t pge \t pi \t n_gamma"<<endl;
+
+		for (size_t j=0; j<s_max; ++j) {
+			outfile_gamma<<"s"<<j<<"\t";
+		}
+		outfile_gamma<<endl;
+	}
+	else {
+		const size_t n_rows=(w_col==0) ? w_pace : w_col;
+
+		outfile_hyp<<scientific<<setprecision(6);
+		for (size_t row=0; row<n_rows; ++row) {
+			//hyp row: columns 0-3 as stored, column 4 holds log(pi), column 5 a count
+			for (size_t col=0; col<4; ++col) {
+				outfile_hyp<<gsl_matrix_get (Result_hyp, row, col)<<"\t";
+			}
+			outfile_hyp<<exp(gsl_matrix_get (Result_hyp, row, 4))<<"\t";
+			outfile_hyp<<(int)gsl_matrix_get (Result_hyp, row, 5)<<"\t";
+			outfile_hyp<<endl;
+
+			//gamma row: s_max integer entries
+			for (size_t col=0; col<s_max; ++col) {
+				outfile_gamma<<(int)gsl_matrix_get (Result_gamma, row, col)<<"\t";
+			}
+			outfile_gamma<<endl;
+		}
+	}
+
+	outfile_hyp.close();
+	outfile_hyp.clear();
+	outfile_gamma.close();
+	outfile_gamma.clear();
+}
+
+
+
+// Fills p_gamma[0..ns_test) with the proposal weights over SNP ranks:
+// 0.7 * geometric pdf (success probability 1/geo_mean, evaluated at
+// rank+1) plus 0.3 * uniform, then normalized to sum to one.
+void BSLMM::CalcPgamma (double *p_gamma)
+{
+	const double geo_p=1.0/geo_mean;
+	const double unif_part=0.3/(double)ns_test;
+
+	double total=0.0;
+	for (size_t k=0; k<ns_test; ++k) {
+		p_gamma[k]=0.7*gsl_ran_geometric_pdf (k+1, geo_p)+unif_part;
+		total+=p_gamma[k];
+	}
+
+	//normalize
+	for (size_t k=0; k<ns_test; ++k) {
+		p_gamma[k]/=total;
+	}
+}
+
+
+
+// Copies into column j of Xgamma the column of X found at position
+// mapRank2pos[rank[j]], for every j in [0, rank.size()).
+void BSLMM::SetXgamma (gsl_matrix *Xgamma, const gsl_matrix *X, vector<size_t> &rank)
+{
+	const size_t n_sel=rank.size();
+	for (size_t j=0; j<n_sel; ++j) {
+		const size_t src_col=mapRank2pos[rank[j]];
+		gsl_vector_view dst=gsl_matrix_column (Xgamma, j);
+		gsl_vector_const_view src=gsl_matrix_const_column (X, src_col);
+		gsl_vector_memcpy (&dst.vector, &src.vector);
+	}
+}
+
+
+
+// Returns (X'y)' * Omega^{-1} * (X'y) / (y'y), with X=UtXgamma, y=Uty and
+// Omega = I/sigma_a2 + X'X. The linear system is solved by CholeskySolve.
+double BSLMM::CalcPveLM (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const double sigma_a2) 
+{
+	const size_t n_sel=UtXgamma->size2;
+
+	gsl_matrix *Omega=gsl_matrix_alloc (n_sel, n_sel);
+	gsl_vector *Xty=gsl_vector_alloc (n_sel);
+	gsl_vector *OiXty=gsl_vector_alloc (n_sel);
+
+	//Omega = I/sigma_a2, then Omega += X'X
+	gsl_matrix_set_identity (Omega);
+	gsl_matrix_scale (Omega, 1.0/sigma_a2); 
+#ifdef WITH_LAPACK
+	lapack_dgemm ((char *)"T", (char *)"N", 1.0, UtXgamma, UtXgamma, 1.0, Omega);
+#else
+	gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, UtXgamma, UtXgamma, 1.0, Omega);	
+#endif
+
+	//Xty = X'y and OiXty = Omega^{-1} X'y
+	gsl_blas_dgemv (CblasTrans, 1.0, UtXgamma, Uty, 0.0, Xty);
+	CholeskySolve(Omega, Xty, OiXty);
+
+	double explained, total;
+	gsl_blas_ddot (Xty, OiXty, &explained);
+	gsl_blas_ddot (Uty, Uty, &total);
+
+	gsl_matrix_free (Omega);
+	gsl_vector_free (Xty);
+	gsl_vector_free (OiXty);
+
+	return explained/total;
+}
+
+
+/* Chooses the starting state of the chain.
+ *
+ * UtX, Uty  : used to build the design matrix of the initially selected
+ *             SNPs and to evaluate CalcPveLM on it.
+ * rank      : output; set to 0..n_gamma-1, i.e. the first n_gamma ranks.
+ * cHyp      : output; n_gamma, logp, h and rho are set, then clamped to
+ *             the configured [min,max] ranges.
+ * pos_loglr : (position, log likelihood ratio) pairs; entries whose
+ *             2*loglr exceeds the chi-square(1) critical value for an
+ *             upper tail of 0.05/ns_test are counted for n_gamma.
+ */
+void BSLMM::InitialMCMC (const gsl_matrix *UtX, const gsl_vector *Uty, vector<size_t> &rank, class HYPBSLMM &cHyp, vector<pair<size_t, double> > &pos_loglr)
+{
+	double q_genome=gsl_cdf_chisq_Qinv(0.05/(double)ns_test, 1);
+	
+	cHyp.n_gamma=0;
+	for (size_t i=0; i<pos_loglr.size(); ++i) {
+		if (2.0*pos_loglr[i].second>q_genome) {cHyp.n_gamma++;}
+	}
+	if (cHyp.n_gamma<10) {cHyp.n_gamma=10;}
+	
+	if (cHyp.n_gamma>s_max) {cHyp.n_gamma=s_max;}
+	if (cHyp.n_gamma<s_min) {cHyp.n_gamma=s_min;}	
+	//never start with more SNPs than there are ranks: the floor of 10 and
+	//the s_min clamp could otherwise request ranks that mapRank2pos does
+	//not cover, and would make log(n_gamma/ns_test) positive
+	if (cHyp.n_gamma>ns_test) {cHyp.n_gamma=ns_test;}
+	
+	rank.clear();
+	for (size_t i=0; i<cHyp.n_gamma; ++i) {
+		rank.push_back(i);
+	}
+	
+	cHyp.logp=log((double)cHyp.n_gamma/(double)ns_test);
+	cHyp.h=pve_null; 
+	
+	//avoid the boundary values pi==1 and h==0
+	if (cHyp.logp==0) {cHyp.logp=-0.000001;}
+	if (cHyp.h==0) {cHyp.h=0.1;}
+
+	gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp.n_gamma);
+	SetXgamma (UtXgamma, UtX, rank);
+	//sigma_a2 = h / ((1-h) * pi * ns_test), additionally divided by trace_G when it is non-zero
+	double sigma_a2;
+	if (trace_G!=0) {
+	  sigma_a2=cHyp.h*1.0/(trace_G*(1-cHyp.h)*exp(cHyp.logp)*(double)ns_test);
+	} else {
+	  sigma_a2=cHyp.h*1.0/( (1-cHyp.h)*exp(cHyp.logp)*(double)ns_test);
+	}
+	if (sigma_a2==0) {sigma_a2=0.025;}	
+	cHyp.rho=CalcPveLM (UtXgamma, Uty, sigma_a2)/cHyp.h;
+	gsl_matrix_free (UtXgamma);
+	
+	if (cHyp.rho>1.0) {cHyp.rho=1.0;}
+	
+	//keep every hyper-parameter inside its configured range
+	if (cHyp.h<h_min) {cHyp.h=h_min;}
+	if (cHyp.h>h_max) {cHyp.h=h_max;}
+	if (cHyp.rho<rho_min) {cHyp.rho=rho_min;}
+	if (cHyp.rho>rho_max) {cHyp.rho=rho_max;}
+	if (cHyp.logp<logp_min) {cHyp.logp=logp_min;}
+	if (cHyp.logp>logp_max) {cHyp.logp=logp_max;}
+	
+	
+//	if (fix_sigma>=0) {
+//		fix_sigma=cHyp.h;
+//		rho_max=1-cHyp.h;
+//		cHyp.rho=rho_max/2.0;
+//	}
+	
+	//Initial for grid sampling:
+//	cHyp.h=0.225;
+//	cHyp.rho=1.0;
+//	cHyp.logp=-4.835429;
+	
+	cout<<"initial value of h = "<<cHyp.h<<endl;
+	cout<<"initial value of rho = "<<cHyp.rho<<endl;
+	cout<<"initial value of pi = "<<exp(cHyp.logp)<<endl;
+	cout<<"initial value of |gamma| = "<<cHyp.n_gamma<<endl;
+	
+	return;
+}
+
+
+/* Posterior evaluation for the model without any selected-SNP term.
+ *
+ * Uty, K_eval : read for i in [0, ni_test).
+ * Utu         : output; overwritten with alpha_prime*K_eval (element-wise)
+ *               plus Gaussian noise.
+ * alpha_prime : output; overwritten with Uty*weight_Hi*sigma_b2
+ *               (element-wise), where weight_Hi[i]=1/(1+K_eval[i]*sigma_b2).
+ * cHyp        : h, rho, logp and n_gamma are read; pve and pge are written
+ *               only when a_mode==11.
+ * Returns the log posterior value assembled at the end of the function.
+ * Draws ni_test Gaussians from gsl_r, plus one gamma variate when
+ * a_mode==11.
+ * NOTE(review): sigma_b2 divides by trace_G without the trace_G!=0 check
+ * that InitialMCMC performs -- confirm trace_G is never 0 here.
+ */
+double BSLMM::CalcPosterior (const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *Utu, gsl_vector *alpha_prime, class HYPBSLMM &cHyp)
+{
+	double sigma_b2=cHyp.h*(1.0-cHyp.rho)/(trace_G*(1-cHyp.h));
+	//scratch vectors: noise part of Utu, and the per-element weights 1/(1+K_eval[i]*sigma_b2)
+	gsl_vector *Utu_rand=gsl_vector_alloc (Uty->size);	
+	gsl_vector *weight_Hi=gsl_vector_alloc (Uty->size);
+	
+	double logpost=0.0;
+	double d, ds, uy, Hi_yy=0, logdet_H=0.0;
+	for (size_t i=0; i<ni_test; ++i) {
+		d=gsl_vector_get (K_eval, i)*sigma_b2;
+		ds=d/(d+1.0);
+		d=1.0/(d+1.0);		
+		gsl_vector_set (weight_Hi, i, d);
+		//accumulate -sum(log weight) and the weighted sum of squares of Uty
+		logdet_H-=log(d);
+		uy=gsl_vector_get (Uty, i);
+		Hi_yy+=d*uy*uy;
+		//noise for element i has variance ds
+		gsl_vector_set (Utu_rand, i, gsl_ran_gaussian(gsl_r, 1)*sqrt(ds));
+	}
+	
+	//sample tau (stays 1 unless a_mode==11)
+	double tau=1.0;
+	if (a_mode==11) {tau = gsl_ran_gamma (gsl_r, (double)ni_test/2.0,  2.0/Hi_yy); }
+	
+	//alpha_prime = Uty * weight_Hi * sigma_b2 (no randomness in this step)
+	gsl_vector_memcpy (alpha_prime, Uty);
+	gsl_vector_mul (alpha_prime, weight_Hi);
+	gsl_vector_scale (alpha_prime, sigma_b2);
+	
+	//sample u: Utu = alpha_prime * K_eval + noise (noise scaled by sqrt(1/tau) when a_mode==11)
+	gsl_vector_memcpy (Utu, alpha_prime);
+	gsl_vector_mul (Utu, K_eval);
+	if (a_mode==11) {gsl_vector_scale (Utu_rand, sqrt(1.0/tau));}
+	gsl_vector_add (Utu, Utu_rand);	
+	
+	//for quantitative traits, calculate pve; pge is 0 because no SNP term is present
+	if (a_mode==11) {
+		gsl_blas_ddot (Utu, Utu, &d);
+		cHyp.pve=d/(double)ni_test;	
+		cHyp.pve/=cHyp.pve+1.0/tau;
+		cHyp.pge=0.0;	
+	}
+
+	//calculate likelihood
+	logpost=-0.5*logdet_H;
+	if (a_mode==11) {logpost-=0.5*(double)ni_test*log(Hi_yy);}
+	else {logpost-=0.5*Hi_yy;}
+	//add the term in logp: (n_gamma-1)*logp + (ns_test-n_gamma)*log(1-exp(logp))
+	logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1-exp(cHyp.logp));
+	
+	gsl_vector_free (Utu_rand);
+	gsl_vector_free (weight_Hi);
+	
+	return logpost;
+}
+
+/* Posterior evaluation for the model that includes the selected SNPs.
+ *
+ * UtXgamma    : design matrix of the selected SNPs (one column per SNP).
+ * Uty, K_eval : read for i in [0, ni_test).
+ * UtXb        : output; UtXgamma * beta.
+ * Utu         : output; alpha_prime*K_eval (element-wise) plus Gaussian noise.
+ * alpha_prime : output; (Uty-UtXb)*weight_Hi*sigma_b2 (element-wise).
+ * beta        : output; a random draw centred on beta_hat.
+ * cHyp        : h, rho, logp and n_gamma are read; pve and pge are written
+ *               only when a_mode==11.
+ * Returns the log posterior value assembled at the end of the function.
+ * Draws from gsl_r in this order: ni_test Gaussians, one gamma variate when
+ * a_mode==11, then beta->size Gaussians.
+ * NOTE(review): sigma_a2 and sigma_b2 divide by trace_G without the
+ * trace_G!=0 check that InitialMCMC performs -- confirm trace_G is never 0.
+ */
+double BSLMM::CalcPosterior (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *UtXb, gsl_vector *Utu, gsl_vector *alpha_prime, gsl_vector *beta, class HYPBSLMM &cHyp)
+{
+	clock_t time_start;	
+	//variance scales: sigma_a2 for the selected-SNP effects, sigma_b2 for the K_eval-weighted term
+	double sigma_a2=cHyp.h*cHyp.rho/(trace_G*(1-cHyp.h)*exp(cHyp.logp)*(double)ns_test);
+	double sigma_b2=cHyp.h*(1.0-cHyp.rho)/(trace_G*(1-cHyp.h));
+	
+	double logpost=0.0;
+	double d, ds, uy, P_yy=0, logdet_O=0.0, logdet_H=0.0;
+	
+	gsl_matrix *UtXgamma_eval=gsl_matrix_alloc (UtXgamma->size1, UtXgamma->size2);	
+	gsl_matrix *Omega=gsl_matrix_alloc (UtXgamma->size2, UtXgamma->size2);
+	gsl_vector *XtHiy=gsl_vector_alloc (UtXgamma->size2);
+	gsl_vector *beta_hat=gsl_vector_alloc (UtXgamma->size2);
+	gsl_vector *Utu_rand=gsl_vector_alloc (UtXgamma->size1);	
+	gsl_vector *weight_Hi=gsl_vector_alloc (UtXgamma->size1);
+	
+	gsl_matrix_memcpy (UtXgamma_eval, UtXgamma);
+	//per row i: weight d=1/(1+K_eval[i]*sigma_b2); row i of UtXgamma_eval is scaled by d
+	logdet_H=0.0; P_yy=0.0;
+	for (size_t i=0; i<ni_test; ++i) {
+		gsl_vector_view UtXgamma_row=gsl_matrix_row (UtXgamma_eval, i);
+		d=gsl_vector_get (K_eval, i)*sigma_b2;
+		ds=d/(d+1.0);
+		d=1.0/(d+1.0);
+		gsl_vector_set (weight_Hi, i, d);
+		
+		logdet_H-=log(d);
+		uy=gsl_vector_get (Uty, i);
+		P_yy+=d*uy*uy;
+		gsl_vector_scale (&UtXgamma_row.vector, d);
+		//noise for element i of Utu has variance ds
+		gsl_vector_set (Utu_rand, i, gsl_ran_gaussian(gsl_r, 1)*sqrt(ds));
+	}
+	
+	//calculate Omega = I + sigma_a2 * UtXgamma_eval^T * UtXgamma
+	gsl_matrix_set_identity (Omega);
+	
+	time_start=clock();
+#ifdef WITH_LAPACK
+	lapack_dgemm ((char *)"T", (char *)"N", sigma_a2, UtXgamma_eval, UtXgamma, 1.0, Omega);
+#else
+	gsl_blas_dgemm (CblasTrans, CblasNoTrans, sigma_a2, UtXgamma_eval, UtXgamma, 1.0, Omega);
+#endif	
+	time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	
+	
+	//calculate beta_hat = sigma_a2 * Omega^{-1} * XtHiy, with XtHiy = UtXgamma_eval^T * Uty
+	gsl_blas_dgemv (CblasTrans, 1.0, UtXgamma_eval, Uty, 0.0, XtHiy);	
+
+	logdet_O=CholeskySolve(Omega, XtHiy, beta_hat);	//return value is used as log|Omega| below
+	
+	gsl_vector_scale (beta_hat, sigma_a2);
+
+	gsl_blas_ddot (XtHiy, beta_hat, &d);
+	P_yy-=d;
+	
+	//sample tau (stays 1 unless a_mode==11)
+	double tau=1.0;
+	if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0,  2.0/P_yy); }
+
+	//sample beta: start from independent standard normal draws
+	for (size_t i=0; i<beta->size; i++)
+	{
+		d=gsl_ran_gaussian(gsl_r, 1); 
+		gsl_vector_set(beta, i, d); 
+	}
+	gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, beta); 
+	
+	
+	//it computes inv(L^T(Omega)) %*% beta; presumably CholeskySolve left the factor in Omega -- TODO confirm
+	gsl_vector_scale(beta, sqrt(sigma_a2/tau));
+	gsl_vector_add(beta, beta_hat); 
+	gsl_blas_dgemv (CblasNoTrans, 1.0, UtXgamma, beta, 0.0, UtXb);
+	
+	//alpha_prime = (Uty - UtXb) * weight_Hi * sigma_b2
+	gsl_vector_memcpy (alpha_prime, Uty);
+	gsl_vector_sub (alpha_prime, UtXb);
+	gsl_vector_mul (alpha_prime, weight_Hi);
+	gsl_vector_scale (alpha_prime, sigma_b2);
+	
+	//sample u: Utu = alpha_prime * K_eval + noise (noise scaled by sqrt(1/tau) when a_mode==11)
+	gsl_vector_memcpy (Utu, alpha_prime);
+	gsl_vector_mul (Utu, K_eval);
+	
+	if (a_mode==11) {gsl_vector_scale (Utu_rand, sqrt(1.0/tau));}
+	gsl_vector_add (Utu, Utu_rand);	
+	
+	
+	//for quantitative traits, calculate pve and pge
+	if (a_mode==11) {
+		gsl_blas_ddot (UtXb, UtXb, &d);
+		cHyp.pge=d/(double)ni_test;
+	
+		gsl_blas_ddot (Utu, Utu, &d);
+		cHyp.pve=cHyp.pge+d/(double)ni_test;
+		//pge becomes the share of UtXb in the total; pve the total relative to total+1/tau
+		if (cHyp.pve==0) {cHyp.pge=0.0;}
+		else {cHyp.pge/=cHyp.pve;}
+		cHyp.pve/=cHyp.pve+1.0/tau;	
+	}	
+	
+
+	gsl_matrix_free (UtXgamma_eval);
+	gsl_matrix_free (Omega);
+	gsl_vector_free (XtHiy);
+	gsl_vector_free (beta_hat);
+	gsl_vector_free (Utu_rand);	
+	gsl_vector_free (weight_Hi);
+	//log posterior: determinant terms, the P_yy term, then the term in logp
+	logpost=-0.5*logdet_H-0.5*logdet_O;
+	if (a_mode==11) {logpost-=0.5*(double)ni_test*log(P_yy);}
+	else {logpost-=0.5*P_yy;}
+//	else {logpost+=-0.5*P_yy*tau+0.5*(double)ni_test*log(tau);}
+	logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1.0-exp(cHyp.logp));
+	
+	return logpost;
+}
+
+
+
+//case-control data, model without selected SNPs:
+//sets cHyp.pve from Utu, sets cHyp.pge to 0 and computes z_hat = U * Utu
+void BSLMM::CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp) 
+{
+	double utu_ss;
+	gsl_blas_ddot (Utu, Utu, &utu_ss);
+	const double var_u=utu_ss/(double)ni_test;
+
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, z_hat);
+
+	//variance of u relative to (variance of u + 1)
+	cHyp.pve=var_u/(var_u+1.0);
+	cHyp.pge=0.0;
+}
+
+
+//case-control data, model with selected SNPs:
+//sets cHyp.pve and cHyp.pge from UtXb and Utu and computes z_hat = U * (Utu + UtXb)
+void BSLMM::CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *UtXb, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp) 
+{
+	double xb_ss, utu_ss;
+	gsl_blas_ddot (UtXb, UtXb, &xb_ss);
+	gsl_blas_ddot (Utu, Utu, &utu_ss);
+
+	const double var_xb=xb_ss/(double)ni_test;
+	const double var_total=var_xb+utu_ss/(double)ni_test;
+
+	//z_hat = U * (Utu + UtXb), built in a temporary
+	gsl_vector *Ut_sum=gsl_vector_alloc (Utu->size);
+	gsl_vector_memcpy (Ut_sum, Utu);
+	gsl_vector_add (Ut_sum, UtXb);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ut_sum, 0.0, z_hat);	
+	gsl_vector_free(Ut_sum);
+
+	//pge: share of the UtXb part in the total; pve: total relative to total+1
+	if (var_total==0) {cHyp.pge=0.0;}
+	else {cHyp.pge=var_xb/var_total;}
+	cHyp.pve=var_total/(var_total+1.0);
+}
+
+
+
+
+// For every element, draws z[i] = z_hat[i] + N(0,1) by rejection until the
+// sign constraint implied by y[i] holds: z[i]<=0 when y[i]<=0 (control),
+// z[i]>=0 otherwise.
+void BSLMM::SampleZ (const gsl_vector *y, const gsl_vector *z_hat, gsl_vector *z)
+{	
+	const size_t n=z->size;
+	for (size_t i=0; i<n; ++i) {
+		//y is centered for case control studies, so y<=0 marks a control
+		const bool is_control=(gsl_vector_get (y, i)<=0.0);
+		const double mu=gsl_vector_get (z_hat, i);
+
+		double draw=0.0;
+		bool rejected;
+		do {
+			draw=mu+gsl_ran_gaussian(gsl_r, 1.0);
+			//controls are truncated on the right at 0, the rest on the left
+			rejected=is_control ? (draw>0.0) : (draw<0.0);
+		} while (rejected);
+
+		gsl_vector_set (z, i, draw);
+	}
+}
+
+
+
+
+
+// Random-walk proposal for h and rho: starting from cHyp_old, takes
+// `repeat` uniform steps of width d_h (resp. d_rho), reflecting at the
+// [min,max] bounds, and stores the result in cHyp_new. Always returns 0.0
+// as the contribution to the log proposal ratio.
+double BSLMM::ProposeHnRho (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat)
+{
+	const double d_h=(h_max-h_min)*h_scale;
+	const double d_rho=(rho_max-rho_min)*rho_scale;
+
+	double h=cHyp_old.h;
+	double rho=cHyp_old.rho;
+
+	for (size_t step=0; step<repeat; ++step) {
+		//one step for h, then one for rho (this order fixes the RNG stream)
+		h+=(gsl_rng_uniform(gsl_r)-0.5)*d_h;
+		if (h<h_min) {h=2*h_min-h;}
+		if (h>h_max) {h=2*h_max-h;}
+
+		rho+=(gsl_rng_uniform(gsl_r)-0.5)*d_rho;
+		if (rho<rho_min) {rho=2*rho_min-rho;}
+		if (rho>rho_max) {rho=2*rho_max-rho;}
+	}
+	/*
+	//Grid Sampling
+	for (size_t i=0; i<repeat; ++i) {
+		if (gsl_rng_uniform(gsl_r)<0.66) {continue;}
+		h=h+(gsl_rng_uniform_int(gsl_r, 2)-0.5)*0.1;
+		if (h<h_min) {h=h_max;}
+		if (h>h_max) {h=h_min;}
+	}
+	
+	for (size_t i=0; i<repeat; ++i) {
+		if (gsl_rng_uniform(gsl_r)<0.66) {continue;}
+		rho=rho+(gsl_rng_uniform_int(gsl_r, 2)-0.5)*0.1;
+		if (rho<rho_min) {rho=rho_max;}
+		if (rho>rho_max) {rho=rho_min;}
+	}
+	*/
+	cHyp_new.h=h;
+	cHyp_new.rho=rho;
+	return 0.0;
+}
+
+
+// Random-walk proposal for logp: starting from cHyp_old.logp, takes
+// `repeat` uniform steps of width d_logp, reflecting at
+// [logp_min, logp_max], and stores the result in cHyp_new.logp.
+// Returns the sum over steps of (new logp - previous logp).
+double BSLMM::ProposePi (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat)
+{
+	//step width is capped at 0.1
+	const double d_logp=min(0.1, (logp_max-logp_min)*logp_scale);
+
+	double logp_old=cHyp_old.logp;
+	double logp_new=cHyp_old.logp;
+	double log_ratio=0.0;
+
+	for (size_t step=0; step<repeat; ++step) {
+		logp_new=logp_old+(gsl_rng_uniform(gsl_r)-0.5)*d_logp;
+		if (logp_new<logp_min) {logp_new=2*logp_min-logp_new;}
+		if (logp_new>logp_max) {logp_new=2*logp_max-logp_new;}
+
+		log_ratio+=logp_new-logp_old;
+		logp_old=logp_new;
+	}
+	/*
+	//Grid Sampling
+	for (size_t i=0; i<repeat; ++i) {
+		if (gsl_rng_uniform(gsl_r)<0.66) {continue;}
+		logp_new=logp_old+(gsl_rng_uniform_int(gsl_r, 2)-0.5)*0.5*log(10.0);
+		if (logp_new<logp_min) {logp_new=logp_max;}
+		if (logp_new>logp_max) {logp_new=logp_min;}	
+		
+		log_ratio+=logp_new-logp_old;
+		logp_old=logp_new;
+	}
+	*/
+	cHyp_new.logp=logp_new;
+
+	return log_ratio;
+}
+
+//ordering predicate for ascending sorts of size_t values
+bool comp_vec (size_t a, size_t b)
+{
+	return b > a;
+}
+
+
+/* Proposes a new set of selected SNP ranks by applying `repeat` elementary
+ * moves to rank_old.
+ *
+ * Each move draws one uniform number and then either
+ *   adds a rank drawn from gsl_t      (unif<0.40, n_gamma<s_max and n_gamma<ns_test),
+ *   deletes a uniformly chosen rank   (0.40<=unif<0.80, n_gamma>s_min),
+ *   switches one rank for another     (unif>=0.80, 0<n_gamma<ns_test),
+ * or leaves the set unchanged.
+ *
+ * rank_old, cHyp_old : current state (only n_gamma is read from cHyp_old).
+ * rank_new, cHyp_new : output; rank_new is sorted ascending and
+ *                      cHyp_new.n_gamma is its size.
+ * p_gamma            : proposal weight of every rank (see CalcPgamma).
+ * Returns the accumulated log proposal-ratio term of the moves.
+ *
+ * The n_gamma<ns_test condition on the add move keeps the rejection loop
+ * from spinning forever once every rank is already selected.
+ */
+double BSLMM::ProposeGamma (const vector<size_t> &rank_old, vector<size_t> &rank_new, const double *p_gamma, const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat)
+{
+	map<size_t, int> mapRank2in;	//ranks currently in rank_new
+	size_t r;
+	double unif, logp=0.0;
+	int flag_gamma;
+	size_t r_add, r_remove, col_id;
+	
+	rank_new.clear();
+	if (cHyp_old.n_gamma!=rank_old.size()) {cout<<"size wrong"<<endl;}
+	
+	if (cHyp_old.n_gamma!=0) {
+		for (size_t i=0; i<rank_old.size(); ++i) {
+			r=rank_old[i];
+			rank_new.push_back(r);
+			mapRank2in[r]=1;
+		}
+	}
+	cHyp_new.n_gamma=cHyp_old.n_gamma;	
+	
+	for (size_t i=0; i<repeat; ++i) {
+		unif=gsl_rng_uniform(gsl_r); 
+	
+		if (unif < 0.40 && cHyp_new.n_gamma<s_max && cHyp_new.n_gamma<ns_test) {flag_gamma=1;}
+		else if (unif>=0.40 && unif < 0.80 && cHyp_new.n_gamma>s_min) {flag_gamma=2;}
+		else if (unif>=0.80 && cHyp_new.n_gamma>0 && cHyp_new.n_gamma<ns_test) {flag_gamma=3;}
+		else {flag_gamma=4;}
+	
+		if (flag_gamma==4) {continue;}	//do not change
+	
+		//proposal mass of the ranks not selected before this move
+		double prob_total=1.0;
+		for (size_t j=0; j<cHyp_new.n_gamma; ++j) {
+			prob_total-=p_gamma[rank_new[j]];
+		}
+	
+		if(flag_gamma==1)  {//add a snp; 
+			do {
+				r_add=gsl_ran_discrete (gsl_r, gsl_t);
+			} while (mapRank2in.count(r_add)!=0); 
+		
+			mapRank2in[r_add]=1;
+			rank_new.push_back(r_add);
+			cHyp_new.n_gamma++;
+			logp+=-log(p_gamma[r_add]/prob_total)-log((double)cHyp_new.n_gamma);
+		}
+		else if (flag_gamma==2) {//delete a snp;
+			col_id=gsl_rng_uniform_int(gsl_r, cHyp_new.n_gamma);		
+			r_remove=rank_new[col_id];
+		
+			//after the deletion r_remove is available again
+			prob_total+=p_gamma[r_remove];
+		
+			mapRank2in.erase(r_remove);
+			rank_new.erase(rank_new.begin()+col_id);
+			logp+=log(p_gamma[r_remove]/prob_total)+log((double)cHyp_new.n_gamma);
+			cHyp_new.n_gamma--;
+		}
+		else {//switch a snp;
+			col_id=gsl_rng_uniform_int(gsl_r, cHyp_new.n_gamma);		
+			r_remove=rank_new[col_id];
+			//careful with the proposal
+			do {
+				r_add=gsl_ran_discrete (gsl_r, gsl_t);
+			} while (mapRank2in.count(r_add)!=0); 
+			
+			logp+=log(p_gamma[r_remove]/(prob_total+p_gamma[r_remove]-p_gamma[r_add]) );
+			logp-=log(p_gamma[r_add]/prob_total);
+			
+			mapRank2in.erase(r_remove);
+			mapRank2in[r_add]=1;
+			rank_new.erase(rank_new.begin()+col_id);
+			rank_new.push_back(r_add);
+		}
+	}
+	
+	stable_sort (rank_new.begin(), rank_new.end(), comp_vec);
+
+	mapRank2in.clear();
+	return logp;
+}
+
+
+
+
+
+
+//ordering predicate for sorting (index, value) pairs by decreasing value
+bool comp_lr (pair<size_t, double> a, pair<size_t, double> b)
+{
+	return b.second < a.second;
+}
+
+
+
+
+
+
+
+/* Runs the MCMC sampler and writes all of its output files.
+ * Original author note: if a_mode==13 then Uty==y.
+ *
+ * Outline, as implemented below:
+ *  1. MatrixCalcLR fills pos_loglr; the entries are sorted by decreasing
+ *     value (comp_lr) and mapRank2pos records rank -> original position.
+ *  2. gsl_r is seeded (from the clock when randseed<0) and gsl_t is built
+ *     from the rank-based proposal weights of CalcPgamma.
+ *  3. InitialMCMC picks the starting state, whose posterior is evaluated.
+ *  4. For each of w_step+s_step iterations: when a_mode==13 the latent z
+ *     is re-sampled and the current posterior recomputed; n_mh
+ *     Metropolis-Hastings proposals are made; once t>=w_step, iterations
+ *     with t%r_pace==0 are saved, and Result_hyp/Result_gamma are flushed
+ *     through WriteResult every w_pace saved samples.
+ *  5. Averages over the saved samples are written by WriteParam and WriteBV.
+ *
+ * NOTE(review): gsl_r and gsl_t are allocated here and not freed in this
+ * function -- confirm they are released elsewhere.
+ * NOTE(review): if no sample is saved (w stays 0) the final scaling by
+ * 1.0/(double)w divides by zero -- confirm callers guarantee saved samples.
+ */
+void BSLMM::MCMC (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y) {
+	clock_t time_start;	
+	//current (old) and proposed (new) hyper-parameter states
+	class HYPBSLMM cHyp_old, cHyp_new;
+	//buffers for up to w_pace saved samples between two writes to disk
+	gsl_matrix *Result_hyp=gsl_matrix_alloc (w_pace, 6);
+	gsl_matrix *Result_gamma=gsl_matrix_alloc (w_pace, s_max);	
+	//alpha_prime and Utu accumulate sums over saved samples; *_new/*_old hold the proposed/current draws
+	gsl_vector *alpha_prime=gsl_vector_alloc (ni_test);		
+	gsl_vector *alpha_new=gsl_vector_alloc (ni_test);
+	gsl_vector *alpha_old=gsl_vector_alloc (ni_test);	
+	gsl_vector *Utu=gsl_vector_alloc (ni_test);
+	gsl_vector *Utu_new=gsl_vector_alloc (ni_test);
+	gsl_vector *Utu_old=gsl_vector_alloc (ni_test);
+	
+	gsl_vector *UtXb_new=gsl_vector_alloc (ni_test);
+	gsl_vector *UtXb_old=gsl_vector_alloc (ni_test);
+	//z_hat and z are only used when a_mode==13; Utz starts as a copy of Uty
+	gsl_vector *z_hat=gsl_vector_alloc (ni_test);
+	gsl_vector *z=gsl_vector_alloc (ni_test);
+	gsl_vector *Utz=gsl_vector_alloc (ni_test);	
+
+	gsl_vector_memcpy (Utz, Uty);			
+	
+	double logPost_new, logPost_old;
+	double logMHratio;
+	double mean_z=0.0;	
+	
+	gsl_matrix_set_zero (Result_gamma);
+	gsl_vector_set_zero (Utu);
+	gsl_vector_set_zero (alpha_prime);
+	if (a_mode==13) {
+		pheno_mean=0.0;
+	}
+	//beta_g[pos]: (sum of sampled beta, number of saved samples that include the SNP)
+	vector<pair<double, double> > beta_g;
+	for (size_t i=0; i<ns_test; i++) {
+		beta_g.push_back(make_pair(0.0, 0.0));
+	}
+	
+	vector<size_t> rank_new, rank_old;
+	vector<double> beta_new, beta_old;	
+
+	vector<pair<size_t, double> > pos_loglr;
+
+	time_start=clock();
+	MatrixCalcLR (U, UtX, Utz, K_eval, l_min, l_max, n_region, pos_loglr);
+	time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+	stable_sort (pos_loglr.begin(), pos_loglr.end(), comp_lr);	//largest value first
+	for (size_t i=0; i<ns_test; ++i) {
+		mapRank2pos[i]=pos_loglr[i].first;
+	}
+	
+	//calculate proposal distribution for gamma (unnormalized), and set up gsl_r and gsl_t			
+	gsl_rng_env_setup();                
+	const gsl_rng_type * gslType;                                               
+	gslType = gsl_rng_default; 
+	if (randseed<0)
+	{
+		time_t rawtime;
+		time (&rawtime);
+		tm * ptm = gmtime (&rawtime);
+		//seed = seconds since midnight (UTC)
+		randseed = (unsigned) (ptm->tm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec);
+	}
+	gsl_r = gsl_rng_alloc(gslType); 
+	gsl_rng_set(gsl_r, randseed);
+	
+	double *p_gamma = new double[ns_test]; 
+	CalcPgamma (p_gamma);
+	
+	gsl_t=gsl_ran_discrete_preproc (ns_test, p_gamma);
+	
+	//initial parameters
+	InitialMCMC (UtX, Utz, rank_old, cHyp_old, pos_loglr);
+//	if (fix_sigma>=0) {
+//		rho_max=1-fix_sigma;
+//		cHyp_old.h=fix_sigma/(1-cHyp_old.rho);
+//	}
+	
+	cHyp_initial=cHyp_old;
+	//posterior of the initial state: the overload without SNPs is used when nothing is selected or rho==0
+	if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) {
+		logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old);
+
+		beta_old.clear();
+		for (size_t i=0; i<cHyp_old.n_gamma; ++i) {
+		  beta_old.push_back(0);
+		}	
+	}
+	else {
+		gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp_old.n_gamma);
+		gsl_vector *beta=gsl_vector_alloc (cHyp_old.n_gamma);
+		SetXgamma (UtXgamma, UtX, rank_old);		
+		logPost_old=CalcPosterior(UtXgamma, Utz, K_eval, UtXb_old, Utu_old, alpha_old, beta, cHyp_old);
+	
+		beta_old.clear();
+		for (size_t i=0; i<beta->size; ++i) {
+			beta_old.push_back(gsl_vector_get(beta, i));
+		}	
+		gsl_matrix_free (UtXgamma);
+		gsl_vector_free (beta);
+	}	
+	
+	//calculate centered z_hat, and pve
+	if (a_mode==13) {
+		time_start=clock();
+		if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) {
+			CalcCC_PVEnZ (U, Utu_old, z_hat, cHyp_old);
+		}
+		else {
+			CalcCC_PVEnZ (U, UtXb_old, Utu_old, z_hat, cHyp_old);
+		}
+		time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	}
+	
+	//start MCMC: w counts saved samples, w_col is the row of the next sample in the Result buffers
+	int accept;
+	size_t total_step=w_step+s_step;
+	size_t w=0, w_col, pos;
+	size_t repeat=0;
+	
+	for (size_t t=0; t<total_step; ++t) {
+		if (t%d_pace==0 || t==total_step-1) {ProgressBar ("Running MCMC ", t, total_step-1, (double)n_accept/(double)(t*n_mh+1));}
+//		if (t>10) {break;}		
+
+		if (a_mode==13) {			
+			SampleZ (y, z_hat, z);		
+			mean_z=CenterVector (z);	
+			//Utz = U^T z
+			time_start=clock();
+			gsl_blas_dgemv (CblasTrans, 1.0, U, z, 0.0, Utz);
+			time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+			//First proposal: re-evaluate the current state against the new Utz
+			if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) {				
+				logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old);
+				beta_old.clear();
+				for (size_t i=0; i<cHyp_old.n_gamma; ++i) {
+				  beta_old.push_back(0);
+				}	
+			}
+			else {
+				gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp_old.n_gamma);
+				gsl_vector *beta=gsl_vector_alloc (cHyp_old.n_gamma);
+				SetXgamma (UtXgamma, UtX, rank_old);
+				logPost_old=CalcPosterior(UtXgamma, Utz, K_eval, UtXb_old, Utu_old, alpha_old, beta, cHyp_old);
+				
+				beta_old.clear();
+				for (size_t i=0; i<beta->size; ++i) {
+					beta_old.push_back(gsl_vector_get(beta, i));
+				}
+				gsl_matrix_free (UtXgamma);
+				gsl_vector_free (beta);
+			}
+		}
+		
+		//MH steps
+		for (size_t i=0; i<n_mh; ++i) {
+			if (gsl_rng_uniform(gsl_r)<0.33) {repeat = 1+gsl_rng_uniform_int(gsl_r, 20);}
+			else {repeat=1;}
+			//the three proposals are applied jointly; each adds its log proposal-ratio term
+			logMHratio=0.0;
+			logMHratio+=ProposeHnRho(cHyp_old, cHyp_new, repeat);		
+			logMHratio+=ProposeGamma (rank_old, rank_new, p_gamma, cHyp_old, cHyp_new, repeat);	
+			logMHratio+=ProposePi(cHyp_old, cHyp_new, repeat);
+			
+//			if (fix_sigma>=0) {
+//				cHyp_new.h=fix_sigma/(1-cHyp_new.rho);
+//			}
+			
+			if (cHyp_new.n_gamma==0 || cHyp_new.rho==0) {
+				logPost_new=CalcPosterior(Utz, K_eval, Utu_new, alpha_new, cHyp_new);
+				beta_new.clear();
+				for (size_t i=0; i<cHyp_new.n_gamma; ++i) {
+				  beta_new.push_back(0);
+				}	
+			}
+			else {
+				gsl_matrix *UtXgamma=gsl_matrix_alloc (ni_test, cHyp_new.n_gamma);
+				gsl_vector *beta=gsl_vector_alloc (cHyp_new.n_gamma);
+				SetXgamma (UtXgamma, UtX, rank_new);
+				logPost_new=CalcPosterior(UtXgamma, Utz, K_eval, UtXb_new, Utu_new, alpha_new, beta, cHyp_new);
+				beta_new.clear();
+				for (size_t i=0; i<beta->size; ++i) {
+					beta_new.push_back(gsl_vector_get(beta, i));
+				}
+				gsl_matrix_free (UtXgamma);
+				gsl_vector_free (beta);
+			}	
+			
+			logMHratio+=logPost_new-logPost_old;		
+			//accept with probability min(1, exp(logMHratio))
+			if (logMHratio>0 || log(gsl_rng_uniform(gsl_r))<logMHratio) {accept=1; n_accept++;}
+			else {accept=0;}
+
+			if (accept==1) {			
+				logPost_old=logPost_new;
+				rank_old.clear(); beta_old.clear();
+				if (rank_new.size()!=0) {
+					for (size_t i=0; i<rank_new.size(); ++i) {
+						rank_old.push_back(rank_new[i]);
+						beta_old.push_back(beta_new[i]);
+					}
+				}
+				cHyp_old=cHyp_new;
+				gsl_vector_memcpy (alpha_old, alpha_new);
+				gsl_vector_memcpy (UtXb_old, UtXb_new);
+				gsl_vector_memcpy (Utu_old, Utu_new);
+			}
+			else {cHyp_new=cHyp_old;}
+		}				
+		
+		//calculate z_hat, and pve
+		if (a_mode==13) {
+			time_start=clock();
+			if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) {
+				CalcCC_PVEnZ (U, Utu_old, z_hat, cHyp_old);
+			}
+			else {
+				CalcCC_PVEnZ (U, UtXb_old, Utu_old, z_hat, cHyp_old);
+			}
+			
+			//sample mu and update z hat
+			gsl_vector_sub (z, z_hat);
+			mean_z+=CenterVector(z);
+			mean_z+=gsl_ran_gaussian(gsl_r, sqrt(1.0/(double) ni_test) );			
+			
+			gsl_vector_add_constant (z_hat, mean_z);
+			
+			time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		}
+		
+		//Save data (nothing is saved during the first w_step iterations)
+		if (t<w_step) {continue;}
+		else {		
+			if (t%r_pace==0) {
+				w_col=w%w_pace;
+				if (w_col==0) {
+					if (w==0) {WriteResult (0, Result_hyp, Result_gamma, w_col);}					
+					else {
+						WriteResult (1, Result_hyp, Result_gamma, w_col);
+						gsl_matrix_set_zero (Result_hyp);
+						gsl_matrix_set_zero (Result_gamma);
+					}
+				}
+				//columns of Result_hyp: h, pve, rho, pge, logp, n_gamma
+				gsl_matrix_set (Result_hyp, w_col, 0, cHyp_old.h);
+				gsl_matrix_set (Result_hyp, w_col, 1, cHyp_old.pve);
+				gsl_matrix_set (Result_hyp, w_col, 2, cHyp_old.rho);
+				gsl_matrix_set (Result_hyp, w_col, 3, cHyp_old.pge);
+				gsl_matrix_set (Result_hyp, w_col, 4, cHyp_old.logp);
+				gsl_matrix_set (Result_hyp, w_col, 5, cHyp_old.n_gamma);
+				
+				for (size_t i=0; i<cHyp_old.n_gamma; ++i) {
+					pos=mapRank2pos[rank_old[i]]+1;	//stored 1-based; Result_gamma was zero-filled
+
+					gsl_matrix_set (Result_gamma, w_col, i, pos);
+					
+					beta_g[pos-1].first+=beta_old[i];
+					beta_g[pos-1].second+=1.0;	
+				}
+				
+				gsl_vector_add (alpha_prime, alpha_old);
+				gsl_vector_add (Utu, Utu_old);
+				
+				if (a_mode==13) {
+					pheno_mean+=mean_z;
+				}
+				
+				w++;
+				
+			}
+			
+		}
+	}
+	cout<<endl;
+	//write the samples still held in the buffers
+	w_col=w%w_pace;
+	WriteResult (1, Result_hyp, Result_gamma, w_col);	
+	
+	gsl_matrix_free(Result_hyp);
+	gsl_matrix_free(Result_gamma);	
+	
+	gsl_vector_free(z_hat);
+	gsl_vector_free(z);
+	gsl_vector_free(Utz);	
+	gsl_vector_free(UtXb_new);	
+	gsl_vector_free(UtXb_old);
+	gsl_vector_free(alpha_new);	
+	gsl_vector_free(alpha_old);
+	gsl_vector_free(Utu_new);	
+	gsl_vector_free(Utu_old);	
+	//turn the accumulated sums into averages over the w saved samples
+	gsl_vector_scale (alpha_prime, 1.0/(double)w);	
+	gsl_vector_scale (Utu, 1.0/(double)w);	
+	if (a_mode==13) {
+		pheno_mean/=(double)w;
+	}
+	//alpha = UtX^T * alpha_prime / ns_test
+	gsl_vector *alpha=gsl_vector_alloc (ns_test);
+	gsl_blas_dgemv (CblasTrans, 1.0/(double)ns_test, UtX, alpha_prime, 0.0, alpha);	
+	WriteParam (beta_g, alpha, w);
+	gsl_vector_free(alpha);
+	//alpha_prime is reused to hold U * Utu, which is written by WriteBV
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, alpha_prime);
+	WriteBV(alpha_prime);	
+	
+	gsl_vector_free(alpha_prime);				
+	gsl_vector_free(Utu);	
+		
+	delete [] p_gamma;
+	beta_g.clear();
+	
+	return;
+}
+
+
+
+// Ridge regression evaluated from eigen-rotated inputs.
+// With d = lambda*eval + 1 and r = Uty / d (both element-wise) this computes
+//   coefficients  = (lambda / p) * UtX^T r      (p = number of columns of UtX)
+//   fitted values = U * ((d - 1) .* r)
+// and hands them to WriteParam and WriteBV respectively.
+void BSLMM::RidgeR(const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *eval, const double lambda)
+{
+	const size_t n_col=UtX->size2;
+	const size_t n_row=Uty->size;
+
+	gsl_vector *denom=gsl_vector_alloc (n_row);
+	gsl_vector *ratio=gsl_vector_alloc (n_row);
+	gsl_vector *coef=gsl_vector_alloc (n_col);
+
+	// denom = lambda*eval + 1
+	gsl_vector_memcpy (denom, eval);
+	gsl_vector_scale (denom, lambda);
+	gsl_vector_add_constant (denom, 1.0);
+
+	// ratio = Uty / denom (element-wise)
+	gsl_vector_memcpy (ratio, Uty);
+	gsl_vector_div (ratio, denom);
+
+	// coef = (lambda / n_col) * UtX^T ratio
+	const double coef_scale=lambda/(double)n_col;
+	gsl_blas_dgemv (CblasTrans, coef_scale, UtX, ratio, 0.0, coef);
+
+	// denom doubles as scratch space: (denom - 1) .* ratio, which U then maps into ratio
+	gsl_vector_add_constant (denom, -1.0);
+	gsl_vector_mul (denom, ratio);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, denom, 0.0, ratio);
+
+	WriteParam (coef);
+	WriteBV (ratio);
+
+	gsl_vector_free (coef);
+	gsl_vector_free (ratio);
+	gsl_vector_free (denom);
+}
+ 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//below fits MCMC for rho=1
+// Computes the Gram matrix and cross-product for the first s_size columns of X:
+//   XtX[0:s_size, 0:s_size] = Xs^T Xs   and   Xty[0:s_size] = Xs^T y,
+// where Xs is the leading X->size1 x s_size block of X. Only those leading
+// sub-blocks of XtX and Xty are written; the remaining entries are untouched.
+// The elapsed CPU time, in minutes, is added to time_Omega.
+void BSLMM::CalcXtX (const gsl_matrix *X, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX, gsl_vector *Xty)
+{
+  // clock() returns clock_t (the original stored it in a time_t)
+  const clock_t time_start=clock();
+  gsl_matrix_const_view X_sub=gsl_matrix_const_submatrix(X, 0, 0, X->size1, s_size);
+  gsl_matrix_view XtX_sub=gsl_matrix_submatrix(XtX, 0, 0, s_size, s_size);
+  gsl_vector_view Xty_sub=gsl_vector_subvector(Xty, 0, s_size);
+
+#ifdef WITH_LAPACK
+  lapack_dgemm ((char *)"T", (char *)"N", 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix);
+#else
+  gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix);
+#endif
+  gsl_blas_dgemv(CblasTrans, 1.0, &X_sub.matrix, y, 0.0, &Xty_sub.vector);
+
+  time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+  return;
+}
+
+
+// Incrementally builds the design matrix and its cross-products for the SNP set
+// rank_new, reusing what is already known for rank_old:
+//   X_new   <- columns for rank_new (leading rank_new.size() columns)
+//   XtX_new <- X_new^T X_new        (leading square block)
+//   Xty_new <- X_new^T y            (leading entries)
+// Columns shared by both sets are copied from X_old/XtX_old/Xty_old; only the
+// columns that were added are read from X (via the 3-argument SetXgamma) and
+// multiplied out. rank_old and rank_new must be sorted ascending, because the
+// std set algorithms below require sorted ranges.
+//
+// Changes relative to the previous revision:
+//  - the scratch vectors are sized from the inputs instead of a fixed 20, so
+//    set_difference/set_union can never write past their end;
+//  - rank_old is no longer indexed once all of its entries have been consumed
+//    (previously an out-of-bounds read whenever added columns came last);
+//  - X_add^T X_old is only computed for the rank_old.size() columns in use;
+//  - a superseded, commented-out implementation was removed.
+void BSLMM::SetXgamma (const gsl_matrix *X, const gsl_matrix *X_old, const gsl_matrix *XtX_old, const gsl_vector *Xty_old, const gsl_vector *y, const vector<size_t> &rank_old, const vector<size_t> &rank_new, gsl_matrix *X_new, gsl_matrix *XtX_new, gsl_vector *Xty_new)
+{
+  double d;
+  const size_t n_old=rank_old.size();
+  const size_t n_new=rank_new.size();
+
+  //rank_old and rank_new are sorted already inside ProposeGamma
+  //calculate vectors rank_remove, rank_add and rank_union;
+  //each output buffer is allocated at its worst-case size and trimmed afterwards
+  vector<size_t> rank_remove(n_old), rank_add(n_new), rank_union(n_old+n_new);
+  vector<size_t>::iterator it;
+
+  it=set_difference (rank_old.begin(), rank_old.end(), rank_new.begin(), rank_new.end(), rank_remove.begin());
+  rank_remove.resize(it-rank_remove.begin());
+
+  it=set_difference (rank_new.begin(), rank_new.end(), rank_old.begin(), rank_old.end(), rank_add.begin());
+  rank_add.resize(it-rank_add.begin());
+
+  it=set_union (rank_new.begin(), rank_new.end(), rank_old.begin(), rank_old.end(), rank_union.begin());
+  rank_union.resize(it-rank_union.begin());
+
+  //map rank_remove and rank_add for membership tests
+  map<size_t, int> mapRank2in_remove, mapRank2in_add;
+  for (size_t i=0; i<rank_remove.size(); i++) {
+    mapRank2in_remove[rank_remove[i]]=1;
+  }
+  for (size_t i=0; i<rank_add.size(); i++) {
+    mapRank2in_add[rank_add[i]]=1;
+  }
+
+  //obtain the subset of matrix/vector
+  gsl_matrix_const_view Xold_sub=gsl_matrix_const_submatrix(X_old, 0, 0, X_old->size1, n_old);
+  gsl_matrix_const_view XtXold_sub=gsl_matrix_const_submatrix(XtX_old, 0, 0, n_old, n_old);
+  gsl_vector_const_view Xtyold_sub=gsl_vector_const_subvector(Xty_old, 0, n_old);
+
+  gsl_matrix_view Xnew_sub=gsl_matrix_submatrix(X_new, 0, 0, X_new->size1, n_new);
+  gsl_matrix_view XtXnew_sub=gsl_matrix_submatrix(XtX_new, 0, 0, n_new, n_new);
+  gsl_vector_view Xtynew_sub=gsl_vector_subvector(Xty_new, 0, n_new);
+
+  if (rank_remove.size()==0 && rank_add.size()==0) {
+    //identical sets: plain copy
+    gsl_matrix_memcpy(&Xnew_sub.matrix, &Xold_sub.matrix);
+    gsl_matrix_memcpy(&XtXnew_sub.matrix, &XtXold_sub.matrix);
+    gsl_vector_memcpy(&Xtynew_sub.vector, &Xtyold_sub.vector);
+  } else if (rank_add.size()==0) {
+    //pure removal: rank_union equals rank_old here, so walk rank_old directly
+    //and compact the surviving rows/columns
+    size_t i_new=0;
+    for (size_t i_old=0; i_old<n_old; i_old++) {
+      if (mapRank2in_remove.count(rank_old[i_old])!=0) {continue;}
+
+      gsl_vector_view Xnew_col=gsl_matrix_column(X_new, i_new);
+      gsl_vector_const_view Xcopy_col=gsl_matrix_const_column(X_old, i_old);
+      gsl_vector_memcpy (&Xnew_col.vector, &Xcopy_col.vector);
+
+      d=gsl_vector_get (Xty_old, i_old);
+      gsl_vector_set (Xty_new, i_new, d);
+
+      //upper triangle only; the symmetric entry is mirrored
+      size_t j_new=i_new;
+      for (size_t j_old=i_old; j_old<n_old; j_old++) {
+        if (mapRank2in_remove.count(rank_old[j_old])!=0) {continue;}
+
+        d=gsl_matrix_get(XtX_old, i_old, j_old);
+
+        gsl_matrix_set (XtX_new, i_new, j_new, d);
+        if (i_new!=j_new) {gsl_matrix_set (XtX_new, j_new, i_new, d);}
+
+        j_new++;
+      }
+      i_new++;
+    }
+  } else {
+    gsl_matrix *X_add=gsl_matrix_alloc(X_old->size1, rank_add.size() );
+    gsl_matrix *XtX_aa=gsl_matrix_alloc(X_add->size2, X_add->size2);
+    gsl_matrix *XtX_ao=gsl_matrix_alloc(X_add->size2, n_old);
+    gsl_vector *Xty_add=gsl_vector_alloc(X_add->size2);
+
+    //get X_add
+    SetXgamma (X_add, X, rank_add);
+
+    //get t(X_add)X_add, t(X_add)X_old and t(X_add)y
+    clock_t time_start=clock();
+
+    //somehow the lapack_dgemm does not work here, so gsl_blas_dgemm is used unconditionally
+    gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, X_add, 0.0, XtX_aa);
+    gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, X_add, &Xold_sub.matrix, 0.0, XtX_ao);
+    gsl_blas_dgemv(CblasTrans, 1.0, X_add, y, 0.0, Xty_add);
+
+    time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+    //save to X_new, XtX_new and Xty_new
+    //each pass over rank_union consumes exactly one item: either a removed old
+    //column (skipped) or the next column of rank_new (copied from X_add or X_old)
+    size_t i_old=0, i_new=0, i_add=0;
+    for (size_t i=0; i<rank_union.size(); i++) {
+      //the bound check keeps rank_old from being indexed after its last entry
+      if (i_old<n_old && mapRank2in_remove.count(rank_old[i_old])!=0) {i_old++; continue;}
+      const bool i_added=(mapRank2in_add.count(rank_new[i_new])!=0);
+
+      gsl_vector_view Xnew_col=gsl_matrix_column(X_new, i_new);
+      if (i_added) {
+        gsl_vector_view Xcopy_col=gsl_matrix_column(X_add, i_add);
+        gsl_vector_memcpy (&Xnew_col.vector, &Xcopy_col.vector);
+        d=gsl_vector_get (Xty_add, i_add);
+      } else {
+        gsl_vector_const_view Xcopy_col=gsl_matrix_const_column(X_old, i_old);
+        gsl_vector_memcpy (&Xnew_col.vector, &Xcopy_col.vector);
+        d=gsl_vector_get (Xty_old, i_old);
+      }
+      gsl_vector_set (Xty_new, i_new, d);
+
+      //upper triangle only; the symmetric entry is mirrored
+      size_t j_old=i_old, j_new=i_new, j_add=i_add;
+      for (size_t j=i; j<rank_union.size(); j++) {
+        if (j_old<n_old && mapRank2in_remove.count(rank_old[j_old])!=0) {j_old++; continue;}
+        const bool j_added=(mapRank2in_add.count(rank_new[j_new])!=0);
+
+        if (i_added && j_added) {
+          d=gsl_matrix_get(XtX_aa, i_add, j_add);
+        } else if (i_added) {
+          d=gsl_matrix_get(XtX_ao, i_add, j_old);
+        } else if (j_added) {
+          d=gsl_matrix_get(XtX_ao, j_add, i_old);
+        } else {
+          d=gsl_matrix_get(XtX_old, i_old, j_old);
+        }
+
+        gsl_matrix_set (XtX_new, i_new, j_new, d);
+        if (i_new!=j_new) {gsl_matrix_set (XtX_new, j_new, i_new, d);}
+
+        j_new++; if (j_added) {j_add++;} else {j_old++;}
+      }
+      i_new++; if (i_added) {i_add++;} else {i_old++;}
+    }
+
+    gsl_matrix_free(X_add);
+    gsl_matrix_free(XtX_aa);
+    gsl_matrix_free(XtX_ao);
+    gsl_vector_free(Xty_add);
+  }
+
+  return;
+}
+
+
+// Log posterior value for a model with no selected SNP columns.
+// yty is the squared norm of the response. For a_mode==11 (quantitative
+// traits) cHyp.pve and cHyp.pge are reset here; for case/control data they
+// are calculated in CalcCC_PVEnZ instead.
+double BSLMM::CalcPosterior (const double yty, class HYPBSLMM &cHyp)
+{
+	const bool quantitative=(a_mode==11);
+
+	if (quantitative) {
+		cHyp.pve=0.0;
+		cHyp.pge=1.0;
+	}
+
+	//likelihood part
+	const double half_fit=quantitative ? 0.5*(double)ni_test*log(yty) : 0.5*yty;
+
+	//prior part contributed by the inclusion probability exp(logp)
+	const double n_in=(double)cHyp.n_gamma;
+	const double n_out=(double)ns_test-n_in;
+	const double prior_in=(n_in-1.0)*cHyp.logp;
+	const double prior_out=n_out*log(1-exp(cHyp.logp));
+
+	double logpost=0.0;
+	logpost-=half_fit;
+	logpost+=prior_in+prior_out;
+
+	return logpost;
+}
+
+
+double BSLMM::CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp)
+{	/* Returns the log posterior value that MCMC compares between proposals, for
+	 * the model made of the first s_size columns of Xgamma (XtX and Xty are the
+	 * matching cross-products, yty the squared norm of the response).
+	 * Side effects: draws the first s_size entries of beta using the member
+	 * generator gsl_r, writes Xb = Xgamma*beta, and for a_mode==11 updates
+	 * cHyp.pve and cHyp.pge. The order of the random draws (tau, then beta) is
+	 * part of the behaviour; do not reorder statements.
+	 */
+	double sigma_a2=cHyp.h/( (1-cHyp.h)*exp(cHyp.logp)*(double)ns_test);	//h/((1-h)*exp(logp)*ns_test); scales XtX in Omega and the sampled beta
+	double logpost=0.0;
+	double d, P_yy=yty, logdet_O=0.0;
+
+	gsl_matrix_const_view Xgamma_sub=gsl_matrix_const_submatrix (Xgamma, 0, 0, Xgamma->size1, s_size);
+	gsl_matrix_const_view XtX_sub=gsl_matrix_const_submatrix (XtX, 0, 0, s_size, s_size);
+	gsl_vector_const_view Xty_sub=gsl_vector_const_subvector (Xty, 0, s_size);
+	
+	gsl_matrix *Omega=gsl_matrix_alloc (s_size, s_size);
+	gsl_matrix *M_temp=gsl_matrix_alloc (s_size, s_size);	//only used to hold the identity added to Omega
+	gsl_vector *beta_hat=gsl_vector_alloc (s_size);	
+	gsl_vector *Xty_temp=gsl_vector_alloc (s_size);
+
+	gsl_vector_memcpy (Xty_temp, &Xty_sub.vector);
+
+	//calculate Omega = sigma_a2 * XtX + I
+	gsl_matrix_memcpy (Omega, &XtX_sub.matrix);
+	gsl_matrix_scale (Omega, sigma_a2);
+	gsl_matrix_set_identity (M_temp);
+	gsl_matrix_add (Omega, M_temp);
+	
+	//calculate beta_hat = sigma_a2 * (solution of Omega x = Xty)
+	logdet_O=CholeskySolve(Omega, Xty_temp, beta_hat);	//helper defined elsewhere; NOTE(review): presumably overwrites Omega with its Cholesky factor, which the dtrsv below relies on - confirm
+	gsl_vector_scale (beta_hat, sigma_a2);
+
+	gsl_blas_ddot (Xty_temp, beta_hat, &d);
+	P_yy-=d;	//P_yy = yty - Xty' beta_hat
+
+	//sample tau (only for a_mode==11; otherwise tau stays at 1)
+	double tau=1.0;
+	if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0,  2.0/P_yy); }
+
+	//sample beta: start from independent standard normal draws
+	for (size_t i=0; i<s_size; i++)
+	{
+		d=gsl_ran_gaussian(gsl_r, 1); 
+		gsl_vector_set(beta, i, d); 
+	}
+	gsl_vector_view beta_sub=gsl_vector_subvector(beta, 0, s_size);
+	gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, &beta_sub.vector); 
+		
+	//it computes inv(L^T(Omega)) %*% beta; the result is then scaled by sqrt(sigma_a2/tau) and shifted by beta_hat
+	gsl_vector_scale(&beta_sub.vector, sqrt(sigma_a2/tau));
+	gsl_vector_add(&beta_sub.vector, beta_hat); 
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &Xgamma_sub.matrix, &beta_sub.vector, 0.0, Xb);		
+	
+	//for quantitative traits, calculate pve and pge: pve = v/(v+1/tau) with v = Xb'Xb/ni_test
+	if (a_mode==11) {
+		gsl_blas_ddot (Xb, Xb, &d);
+		cHyp.pve=d/(double)ni_test;
+		cHyp.pve/=cHyp.pve+1.0/tau;
+		cHyp.pge=1.0;	
+	}	
+	
+	logpost=-0.5*logdet_O;
+	if (a_mode==11) {logpost-=0.5*(double)ni_test*log(P_yy);}
+	else {logpost-=0.5*P_yy;}
+
+	logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1.0-exp(cHyp.logp));	//same inclusion-prior term as the overload without Xgamma
+
+	gsl_matrix_free (Omega);
+	gsl_matrix_free (M_temp);
+	gsl_vector_free (beta_hat);
+	gsl_vector_free (Xty_temp);
+
+	return logpost;
+}
+
+
+
+//case-control data, no SNP selected: pve is 0, pge is 1 and z_hat is all zeros
+void BSLMM::CalcCC_PVEnZ (gsl_vector *z_hat, class HYPBSLMM &cHyp) 
+{
+  cHyp.pve=0.0;
+  cHyp.pge=1.0;
+
+  //nothing is fitted, so the predicted latent values are zero
+  gsl_vector_set_zero(z_hat);
+}
+
+
+//case-control data with selected SNPs: z_hat becomes a copy of Xb,
+//pve = v/(v+1) with v = Xb'Xb/ni_test, and pge is set to 1
+void BSLMM::CalcCC_PVEnZ (const gsl_vector *Xb, gsl_vector *z_hat, class HYPBSLMM &cHyp) 
+{
+	double xb_sumsq;
+	gsl_blas_ddot (Xb, Xb, &xb_sumsq);
+
+	cHyp.pve=xb_sumsq/(double)ni_test;
+	cHyp.pve=cHyp.pve/(cHyp.pve+1.0);
+	cHyp.pge=1.0;
+
+	gsl_vector_memcpy (z_hat, Xb);
+}
+
+
+
+//if a_mode==13, then run probit model
+//
+//Metropolis-Hastings sampler that works directly on the genotype matrix X and
+//the response y (this overload belongs to the rho=1 group of functions).
+//Outline:
+//  1. rank the SNPs with MatrixCalcLmLR and fill mapRank2pos (rank -> position);
+//  2. seed gsl_r and build the discrete proposal table gsl_t from CalcPgamma;
+//  3. initialise the state with InitialMCMC and evaluate its posterior;
+//  4. run w_step warm-up plus s_step sampling iterations, each with n_mh MH
+//     proposals (ProposeHnRho, ProposeGamma, ProposePi); for a_mode==13 the
+//     latent response z is re-sampled with SampleZ at the start of every iteration;
+//  5. after warm-up, every r_pace-th iteration is recorded and flushed through
+//     WriteResult in batches of w_pace rows; WriteParam is called at the end.
+//
+//Changes relative to the previous revision: gsl_r and gsl_t are now released
+//before returning (they were leaked), Result_hyp is zeroed like Result_gamma,
+//and the accepted rank set is copied by vector assignment instead of a loop
+//whose index shadowed the MH loop counter.
+void BSLMM::MCMC (const gsl_matrix *X, const gsl_vector *y) {
+	clock_t time_start;	
+	double time_set=0, time_post=0;
+
+	class HYPBSLMM cHyp_old, cHyp_new;
+	
+	//buffers holding up to w_pace recorded samples between two WriteResult calls
+	gsl_matrix *Result_hyp=gsl_matrix_alloc (w_pace, 6);
+	gsl_matrix *Result_gamma=gsl_matrix_alloc (w_pace, s_max);	
+	
+	gsl_vector *Xb_new=gsl_vector_alloc (ni_test);
+	gsl_vector *Xb_old=gsl_vector_alloc (ni_test);	
+	gsl_vector *z_hat=gsl_vector_alloc (ni_test);
+	gsl_vector *z=gsl_vector_alloc (ni_test);
+
+	//current ("old") and proposed ("new") state; only the leading rank.size() columns/entries are meaningful
+	gsl_matrix *Xgamma_old=gsl_matrix_alloc (ni_test, s_max);
+	gsl_matrix *XtX_old=gsl_matrix_alloc (s_max, s_max);
+	gsl_vector *Xtz_old=gsl_vector_alloc (s_max);
+	gsl_vector *beta_old=gsl_vector_alloc (s_max);
+
+	gsl_matrix *Xgamma_new=gsl_matrix_alloc (ni_test, s_max);
+	gsl_matrix *XtX_new=gsl_matrix_alloc (s_max, s_max);
+	gsl_vector *Xtz_new=gsl_vector_alloc (s_max);
+	gsl_vector *beta_new=gsl_vector_alloc (s_max);
+
+	double ztz=0.0;
+	gsl_vector_memcpy (z, y);
+	//for quantitative traits, y is centered already in gemma.cpp, but just in case
+	double mean_z=CenterVector (z);				
+	gsl_blas_ddot(z, z, &ztz);
+
+	double logPost_new, logPost_old;
+	double logMHratio;
+	
+	//gsl_matrix_alloc does not initialise memory, so clear both buffers
+	gsl_matrix_set_zero (Result_hyp);
+	gsl_matrix_set_zero (Result_gamma);
+	if (a_mode==13) {
+		pheno_mean=0.0;
+	}
+	
+	//per-SNP accumulators: (sum of sampled beta, number of recorded samples containing the SNP)
+	vector<pair<double, double> > beta_g(ns_test, make_pair(0.0, 0.0));
+	
+	vector<size_t> rank_new, rank_old;
+	vector<pair<size_t, double> > pos_loglr;
+	
+	time_start=clock();
+	MatrixCalcLmLR (X, z, pos_loglr);
+	time_Proposal=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+	stable_sort (pos_loglr.begin(), pos_loglr.end(), comp_lr);
+	for (size_t i=0; i<ns_test; ++i) {
+		mapRank2pos[i]=pos_loglr[i].first;
+	}
+	
+	//calculate proposal distribution for gamma (unnormalized), and set up gsl_r and gsl_t		
+	gsl_rng_env_setup();                
+	const gsl_rng_type * gslType;                                               
+	gslType = gsl_rng_default; 
+	if (randseed<0)
+	{
+		//no seed supplied: derive one from the current UTC time of day in seconds
+		time_t rawtime;
+		time (&rawtime);
+		tm * ptm = gmtime (&rawtime);
+		
+		randseed = (unsigned) (ptm->tm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec);
+	}
+	gsl_r = gsl_rng_alloc(gslType); 
+	gsl_rng_set(gsl_r, randseed);
+	
+	double *p_gamma = new double[ns_test]; 
+	CalcPgamma (p_gamma);
+	
+	gsl_t=gsl_ran_discrete_preproc (ns_test, p_gamma);
+	
+	//initial parameters
+	InitialMCMC (X, z, rank_old, cHyp_old, pos_loglr);
+	
+	cHyp_initial=cHyp_old;
+
+	if (cHyp_old.n_gamma==0) {	  
+	    logPost_old=CalcPosterior (ztz, cHyp_old);
+	}
+	else {	  
+	  SetXgamma (Xgamma_old, X, rank_old);	  
+	  CalcXtX (Xgamma_old, z, rank_old.size(), XtX_old, Xtz_old);
+	  logPost_old=CalcPosterior (Xgamma_old, XtX_old, Xtz_old, ztz, rank_old.size(), Xb_old, beta_old, cHyp_old);
+	}	
+
+	//calculate centered z_hat, and pve
+	if (a_mode==13) {
+		if (cHyp_old.n_gamma==0) {
+			CalcCC_PVEnZ (z_hat, cHyp_old);
+		}
+		else {
+			CalcCC_PVEnZ (Xb_old, z_hat, cHyp_old);
+		}
+	}
+	
+	//start MCMC
+	int accept;
+	size_t total_step=w_step+s_step;
+	size_t w=0, w_col, pos;
+	size_t repeat=0;
+	
+	for (size_t t=0; t<total_step; ++t) {
+		if (t%d_pace==0 || t==total_step-1) {ProgressBar ("Running MCMC ", t, total_step-1, (double)n_accept/(double)(t*n_mh+1));}
+
+		if (a_mode==13) {			
+			//probit model: re-sample the latent response and refresh everything that depends on it
+			SampleZ (y, z_hat, z);		
+			mean_z=CenterVector (z);
+			gsl_blas_ddot(z,z,&ztz);
+					
+			//First proposal		
+			if (cHyp_old.n_gamma==0) {	  
+			  logPost_old=CalcPosterior (ztz, cHyp_old);
+			} else {	  
+			  gsl_matrix_view Xold_sub=gsl_matrix_submatrix(Xgamma_old, 0, 0, ni_test, rank_old.size());
+			  gsl_vector_view Xtz_sub=gsl_vector_subvector(Xtz_old, 0, rank_old.size());
+			  gsl_blas_dgemv (CblasTrans, 1.0, &Xold_sub.matrix, z, 0.0, &Xtz_sub.vector);
+			  logPost_old=CalcPosterior (Xgamma_old, XtX_old, Xtz_old, ztz, rank_old.size(), Xb_old, beta_old, cHyp_old);
+			}	
+		}
+
+		//MH steps
+		for (size_t i=0; i<n_mh; ++i) {
+			//roughly one third of the proposals are compound moves of 1..20 sub-steps
+			if (gsl_rng_uniform(gsl_r)<0.33) {repeat = 1+gsl_rng_uniform_int(gsl_r, 20);}
+			else {repeat=1;}
+
+			logMHratio=0.0;
+			logMHratio+=ProposeHnRho(cHyp_old, cHyp_new, repeat);	
+			logMHratio+=ProposeGamma (rank_old, rank_new, p_gamma, cHyp_old, cHyp_new, repeat);	
+			logMHratio+=ProposePi(cHyp_old, cHyp_new, repeat);
+			
+			if (cHyp_new.n_gamma==0) {
+				logPost_new=CalcPosterior (ztz, cHyp_new);
+			} else {
+			  //this if makes sure that rank_old.size()==rank_remove.size() does not happen
+			  if (cHyp_new.n_gamma<=20 || cHyp_old.n_gamma<=20) {
+			    //small models: rebuild from scratch
+			    time_start=clock();
+			    SetXgamma (Xgamma_new, X, rank_new);	  
+			    CalcXtX (Xgamma_new, z, rank_new.size(), XtX_new, Xtz_new);	
+			    time_set+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+			  } else {
+			    //larger models: update incrementally from the current state
+			    time_start=clock();
+			    SetXgamma (X, Xgamma_old, XtX_old, Xtz_old, z, rank_old, rank_new, Xgamma_new, XtX_new, Xtz_new);
+			    time_set+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+			  }
+			  time_start=clock();
+			  logPost_new=CalcPosterior (Xgamma_new, XtX_new, Xtz_new, ztz, rank_new.size(), Xb_new, beta_new, cHyp_new);
+			  time_post+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+			}	
+			logMHratio+=logPost_new-logPost_old;	
+		
+			if (logMHratio>0 || log(gsl_rng_uniform(gsl_r))<logMHratio) {accept=1; n_accept++;}
+			else {accept=0;}
+			
+			if (accept==1) {			
+				logPost_old=logPost_new;
+				cHyp_old=cHyp_new;
+				gsl_vector_memcpy (Xb_old, Xb_new);
+
+				//the proposed state becomes the current one
+				rank_old=rank_new;
+				if (rank_new.size()!=0) {
+					gsl_matrix_view Xold_sub=gsl_matrix_submatrix(Xgamma_old, 0, 0, ni_test, rank_new.size());
+					gsl_matrix_view XtXold_sub=gsl_matrix_submatrix(XtX_old, 0, 0, rank_new.size(), rank_new.size());
+					gsl_vector_view Xtzold_sub=gsl_vector_subvector(Xtz_old, 0, rank_new.size());
+					gsl_vector_view betaold_sub=gsl_vector_subvector(beta_old, 0, rank_new.size());
+
+					gsl_matrix_view Xnew_sub=gsl_matrix_submatrix(Xgamma_new, 0, 0, ni_test, rank_new.size());
+					gsl_matrix_view XtXnew_sub=gsl_matrix_submatrix(XtX_new, 0, 0, rank_new.size(), rank_new.size());
+					gsl_vector_view Xtznew_sub=gsl_vector_subvector(Xtz_new, 0, rank_new.size());
+					gsl_vector_view betanew_sub=gsl_vector_subvector(beta_new, 0, rank_new.size());
+
+					gsl_matrix_memcpy(&Xold_sub.matrix, &Xnew_sub.matrix);
+					gsl_matrix_memcpy(&XtXold_sub.matrix, &XtXnew_sub.matrix);
+					gsl_vector_memcpy(&Xtzold_sub.vector, &Xtznew_sub.vector);
+					gsl_vector_memcpy(&betaold_sub.vector, &betanew_sub.vector);
+				}
+			} else {
+			  cHyp_new=cHyp_old;
+			}
+			
+		}				
+
+		//calculate z_hat, and pve
+		if (a_mode==13) {
+			if (cHyp_old.n_gamma==0) {
+				CalcCC_PVEnZ (z_hat, cHyp_old);
+			}
+			else {
+				CalcCC_PVEnZ (Xb_old, z_hat, cHyp_old);
+			}
+			
+			//sample mu and update z hat
+			gsl_vector_sub (z, z_hat);
+			mean_z+=CenterVector(z);
+			mean_z+=gsl_ran_gaussian(gsl_r, sqrt(1.0/(double) ni_test) );			
+			
+			gsl_vector_add_constant (z_hat, mean_z);
+		}
+		
+		//Save data
+		if (t<w_step) {continue;}
+		else {		
+			if (t%r_pace==0) {
+				w_col=w%w_pace;
+				if (w_col==0) {
+					//buffer is empty (first sample) or full: write the header or flush the full batch
+					if (w==0) {WriteResult (0, Result_hyp, Result_gamma, w_col);}					
+					else {
+						WriteResult (1, Result_hyp, Result_gamma, w_col);
+						gsl_matrix_set_zero (Result_hyp);
+						gsl_matrix_set_zero (Result_gamma);
+					}
+				}
+
+				gsl_matrix_set (Result_hyp, w_col, 0, cHyp_old.h);
+				gsl_matrix_set (Result_hyp, w_col, 1, cHyp_old.pve);
+				gsl_matrix_set (Result_hyp, w_col, 2, cHyp_old.rho);
+				gsl_matrix_set (Result_hyp, w_col, 3, cHyp_old.pge);
+				gsl_matrix_set (Result_hyp, w_col, 4, cHyp_old.logp);
+				gsl_matrix_set (Result_hyp, w_col, 5, cHyp_old.n_gamma);
+				
+				for (size_t i=0; i<cHyp_old.n_gamma; ++i) {
+					//positions are stored 1-based so that 0 marks an unused slot in Result_gamma
+					pos=mapRank2pos[rank_old[i]]+1;
+
+					gsl_matrix_set (Result_gamma, w_col, i, pos);
+					
+					beta_g[pos-1].first+=gsl_vector_get(beta_old, i);
+					beta_g[pos-1].second+=1.0;	
+				}
+				
+				if (a_mode==13) {
+					pheno_mean+=mean_z;
+				}
+				
+				w++;
+				
+			}
+			
+		}
+	}
+	cout<<endl;
+
+	cout<<"time on selecting Xgamma: "<<time_set<<endl;
+	cout<<"time on calculating posterior: "<<time_post<<endl;
+
+	//flush whatever is left in the buffers
+	w_col=w%w_pace;
+	WriteResult (1, Result_hyp, Result_gamma, w_col);	
+	
+	//this overload has no alpha estimates, so an all-zero vector is passed to WriteParam
+	gsl_vector *alpha=gsl_vector_alloc (ns_test);
+	gsl_vector_set_zero (alpha);
+	WriteParam (beta_g, alpha, w);
+	gsl_vector_free(alpha);
+
+	gsl_matrix_free(Result_hyp);
+	gsl_matrix_free(Result_gamma);	
+	
+	gsl_vector_free(z_hat);
+	gsl_vector_free(z);
+	gsl_vector_free(Xb_new);	
+	gsl_vector_free(Xb_old);
+
+	gsl_matrix_free(Xgamma_old);
+	gsl_matrix_free(XtX_old);
+	gsl_vector_free(Xtz_old);
+	gsl_vector_free(beta_old);
+
+	gsl_matrix_free(Xgamma_new);
+	gsl_matrix_free(XtX_new);
+	gsl_vector_free(Xtz_new);
+	gsl_vector_free(beta_new);
+	
+	delete [] p_gamma;
+	beta_g.clear();
+
+	//release the sampler state allocated above; the members are reset so that
+	//no dangling pointers are left behind
+	gsl_ran_discrete_free (gsl_t);
+	gsl_rng_free (gsl_r);
+	gsl_t=NULL;
+	gsl_r=NULL;
+	
+	return;
+}
diff --git a/src/bslmm.h b/src/bslmm.h
new file mode 100644
index 0000000..8b5edc7
--- /dev/null
+++ b/src/bslmm.h
@@ -0,0 +1,146 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#ifndef __BSLMM_H__                
+#define __BSLMM_H__
+
+#include <vector>
+#include <map>
+#include <gsl/gsl_rng.h>
+#include <gsl/gsl_randist.h>
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#else
+#include "param.h"
+#endif
+
+
+using namespace std;
+
+
+
+
+
+
+class BSLMM {	// settings, working state and routines of the BSLMM sampler and ridge regression implemented in bslmm.cpp
+
+public:	
+	// IO related parameters
+	int a_mode;	// analysis mode; the rho=1 MCMC code treats 11 as the quantitative-trait model and 13 as the probit model
+	size_t d_pace;	// the MCMC progress bar is refreshed every d_pace iterations
+	
+	string file_bfile;
+	string file_geno;
+	string file_out;
+	string path_out;
+	
+	// LMM related parameters
+	double l_min;
+	double l_max;
+	size_t n_region;
+	double pve_null;
+	double pheno_mean;	// for a_mode==13, MCMC accumulates the sampled mean of z here over the recorded iterations
+	
+	// BSLMM MCMC related parameters
+	double h_min, h_max, h_scale;			//priors for h
+	double rho_min, rho_max, rho_scale;		//priors for rho
+	double logp_min, logp_max, logp_scale;		//priors for log(pi)
+	size_t s_min, s_max;			//minimum and maximum number of gammas; s_max also sizes the per-state matrices in MCMC
+	size_t w_step;					//number of warm up/burn in iterations
+	size_t s_step;					//number of sampling iterations
+	size_t r_pace;					//record pace
+	size_t w_pace;					//write pace (rows buffered between two WriteResult calls)
+	size_t n_accept;				//number of acceptance
+	size_t n_mh;					//number of MH steps within each iteration
+	double geo_mean;				//mean of the geometric distribution
+	long int randseed;	// seed for gsl_r; a negative value makes MCMC derive one from the current time
+	double trace_G;	
+	
+	HYPBSLMM cHyp_initial;	// copy of the hyper-parameters produced by InitialMCMC
+
+	// Summary statistics
+	size_t ni_total, ns_total;	//number of total individuals and snps
+	size_t ni_test, ns_test;	//number of individuals and snps used for analysis
+	size_t n_cvt;				//number of covariates
+	double time_UtZ;	// accumulated in minutes by the MCMC code in bslmm.cpp
+	double time_Omega;		//time spent on optimization iterations; CalcXtX and SetXgamma add the time of their matrix products here (minutes)
+	double time_Proposal;        //time spent on constructing the proposal distribution for gamma (i.e. lmm or lm analysis)
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	
+	// Not included in PARAM
+	gsl_rng *gsl_r; 	// random number generator, allocated and seeded inside MCMC
+	gsl_ran_discrete_t *gsl_t;	// discrete sampling table built in MCMC from the CalcPgamma weights
+	map<size_t, size_t> mapRank2pos;	// rank in the sorted per-SNP statistics -> SNP position, filled in MCMC
+	
+	// Main Functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	
+	void RidgeR(const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *eval, const double lambda);
+	
+	void MCMC (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y);
+	void WriteLog ();
+	void WriteLR ();
+	void WriteBV (const gsl_vector *bv);
+	void WriteParam (vector<pair<double, double> > &beta_g, const gsl_vector *alpha, const size_t w);
+	void WriteParam (const gsl_vector *alpha);
+	void WriteResult (const int flag, const gsl_matrix *Result_hyp, const gsl_matrix *Result_gamma, const size_t w_col);
+	
+	//Subfunctions inside MCMC
+	void CalcPgamma (double *p_gammar);
+	
+	double CalcPveLM (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const double sigma_a2);
+	void InitialMCMC (const gsl_matrix *UtX, const gsl_vector *Uty, vector<size_t> &rank_old, class HYPBSLMM &cHyp, vector<pair<size_t, double> > &pos_loglr);
+	double CalcPosterior (const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *Utu, gsl_vector *alpha_prime, class HYPBSLMM &cHyp);
+	double CalcPosterior (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, gsl_vector *UtXb, gsl_vector *Utu, gsl_vector *alpha_prime, gsl_vector *beta, class HYPBSLMM &cHyp);
+	void CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp);
+	void CalcCC_PVEnZ (const gsl_matrix *U, const gsl_vector *UtXb, const gsl_vector *Utu, gsl_vector *z_hat, class HYPBSLMM &cHyp);
+	double CalcREMLE (const gsl_matrix *Utw, const gsl_vector *Uty, const gsl_vector *K_eval);
+	double CalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, vector<pair<size_t, double> > &loglr_sort);		//calculate the maximum marginal likelihood ratio for each analyzed SNPs with gemma, use it to rank SNPs
+	void SampleZ (const gsl_vector *y, const gsl_vector *z_hat, gsl_vector *z);
+	double ProposeHnRho (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat);
+	double ProposePi (const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat);
+	double ProposeGamma (const vector<size_t> &rank_old, vector<size_t> &rank_new, const double *p_gamma, const class HYPBSLMM &cHyp_old, class HYPBSLMM &cHyp_new, const size_t &repeat);
+	void SetXgamma (gsl_matrix *Xgamma, const gsl_matrix *X, vector<size_t> &rank);
+
+	void CalcXtX (const gsl_matrix *X_new, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX_new, gsl_vector *Xty_new);	// this and the overloads below work on X and y directly; bslmm.cpp groups them as the rho=1 MCMC
+	void SetXgamma (const gsl_matrix *X, const gsl_matrix *X_old, const gsl_matrix *XtX_old, const gsl_vector *Xty_old, const gsl_vector *y, const vector<size_t> &rank_old, const vector<size_t> &rank_new, gsl_matrix *X_new, gsl_matrix *XtX_new, gsl_vector *Xty_new);
+	double CalcPosterior (const double yty, class HYPBSLMM &cHyp);
+	double CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp);
+	void CalcCC_PVEnZ (gsl_vector *z_hat, class HYPBSLMM &cHyp);
+	void CalcCC_PVEnZ (const gsl_vector *Xb, gsl_vector *z_hat, class HYPBSLMM &cHyp);
+	void MCMC (const gsl_matrix *X, const gsl_vector *y);
+	
+	//utility functions
+//	double vec_sum (gsl_vector *v);
+//	void vec_center (gsl_vector *v);
+//	double calc_var (gsl_vector *v);
+//	void calc_sigma (MCMC &cMcmc);
+//	bool comp_lr (pair<size_t, double> a, pair<size_t, double> b);
+};
+
+
+
+#endif
+
+
diff --git a/src/gemma.cpp b/src/gemma.cpp
new file mode 100644
index 0000000..b8693a8
--- /dev/null
+++ b/src/gemma.cpp
@@ -0,0 +1,1864 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <sys/stat.h>
+#include <ctime>
+#include <cmath>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_eigen.h"
+#include "gsl/gsl_cdf.h"
+
+#include "lapack.h"  //for functions EigenDecomp
+
+#ifdef FORCE_FLOAT
+#include "io_float.h"   //for function ReadFile_kin
+#include "gemma_float.h"
+#include "vc_float.h"
+#include "lm_float.h"  //for LM class
+#include "bslmm_float.h"  //for BSLMM class
+#include "lmm_float.h"  //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe
+#include "mvlmm_float.h"  //for MVLMM class
+#include "prdt_float.h"	//for PRDT class
+#include "mathfunc_float.h"	//for a few functions
+#else
+#include "io.h"
+#include "gemma.h"
+#include "vc.h"
+#include "lm.h"
+#include "bslmm.h"
+#include "lmm.h"
+#include "mvlmm.h"
+#include "prdt.h"
+#include "mathfunc.h"
+#endif
+
+
+using namespace std;
+
+
+
+// Default constructor: sets the version, date and year members to this release's hard-coded strings.
+GEMMA::GEMMA() : version("0.95alpha"), date("08/08/2014"), year("2011")
+{ }
+
+// Print the start-up banner to stdout: program name, version and date members,
+// project URLs, the year member in the copyright line, license name and a help hint.
+void GEMMA::PrintHeader ()
+{
+	std::cout<<std::endl
+		<<"*********************************************************"<<std::endl
+		<<"  Genome-wide Efficient Mixed Model Association (GEMMA) "<<std::endl
+		<<"  Version "<<version<<", "<<date<<"                              "<<std::endl
+		<<"  Visit                                                 "<<std::endl
+		<<"     http://stephenslab.uchicago.edu/software.html      "<<std::endl
+		<<"     http://home.uchicago.edu/~xz7/software.html        "<<std::endl
+		<<"  For Possible Updates                                  "<<std::endl
+		<<"  (C) "<<year<<" Xiang Zhou                                   "<<std::endl
+		<<"  GNU General Public License                            "<<std::endl
+		<<"  For Help, Type ./gemma -h                             "<<std::endl
+		<<"*********************************************************"<<std::endl
+		<<std::endl;
+}
+
+
+// Print the license notices to stdout: a statement that the software is distributed
+// under the GNU General Public License, the copyright lines shown for the Lapack
+// routines, and the redistribution conditions and warranty disclaimer that follow.
+void GEMMA::PrintLicense ()
+{
+	// Opening notice, framed by blank lines.
+	std::cout<<std::endl
+		<<"The Software Is Distributed Under GNU General Public License, But May Also Require The Following Notifications."<<std::endl
+		<<std::endl;
+	// Lapack copyright holders, followed by a blank line.
+	std::cout<<"Including Lapack Routines In The Software May Require The Following Notification:"<<std::endl
+		<<"Copyright (c) 1992-2010 The University of Tennessee and The University of Tennessee Research Foundation.  All rights reserved."<<std::endl
+		<<"Copyright (c) 2000-2010 The University of California Berkeley. All rights reserved."<<std::endl
+		<<"Copyright (c) 2006-2010 The University of Colorado Denver.  All rights reserved."<<std::endl
+		<<std::endl;
+	// Redistribution terms and disclaimers; the $COPYRIGHT$ and $HEADER$ placeholders are printed literally.
+	std::cout<<"$COPYRIGHT$"<<std::endl
+		<<"Additional copyrights may follow"<<std::endl
+		<<"$HEADER$"<<std::endl
+		<<"Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:"<<std::endl
+		<<"- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer."<<std::endl
+		<<"- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution."<<std::endl
+		<<"- Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission."<<std::endl
+		<<"The copyright holders provide no reassurances that the source code provided does not infringe any patent, copyright, or any other "
+		<<"intellectual property rights of third parties.  The copyright holders disclaim any liability to any recipient for claims brought against "
+		<<"recipient by any third party for infringement of that parties intellectual property rights. "<<std::endl
+		<<"THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT "
+		<<"LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT "
+		<<"OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT "
+		<<"LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY "
+		<<"THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE "
+		<<"OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."<<std::endl
+		<<std::endl;
+}
+
+
+
+// Print help text to stdout for the requested topic: option 0 lists the available topics,
+// options 1-12 print the matching section; any other value prints nothing.
+void GEMMA::PrintHelp(size_t option)
+{
+	if (option==0) {	// overview and topic index
+		cout<<endl;
+		cout<<" GEMMA version "<<version<<", released on "<<date<<endl;
+		cout<<" implemented by Xiang Zhou"<<endl;
+		cout<<endl;
+		cout<<" type ./gemma -h [num] for detailed helps"<<endl;
+		cout<<" options: " << endl;
+		cout<<" 1: quick guide"<<endl;
+		cout<<" 2: file I/O related"<<endl;
+		cout<<" 3: SNP QC"<<endl;
+		cout<<" 4: calculate relatedness matrix"<<endl;
+		cout<<" 5: perform eigen decomposition"<<endl;
+		cout<<" 6: perform variance component estimation"<<endl;
+		cout<<" 7: fit a linear model"<<endl;
+		cout<<" 8: fit a linear mixed model"<<endl;
+		cout<<" 9: fit a multivariate linear mixed model"<<endl;
+		cout<<" 10: fit a Bayesian sparse linear mixed model"<<endl;
+		cout<<" 11: obtain predicted values"<<endl;
+		cout<<" 12: note"<<endl;
+		cout<<endl;
+	}
+	
+	if (option==1) {	// example command lines
+		cout<<" QUICK GUIDE" << endl;
+		cout<<" to generate a relatedness matrix: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -gk [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -gk [num] -o [prefix]"<<endl;
+		cout<<" to perform eigen decomposition of the relatedness matrix: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -eigen -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -k [filename] -eigen -o [prefix]"<<endl;
+		cout<<" to estimate variance components: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -vc -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -k [filename] -vc -o [prefix]"<<endl;
+		cout<<"         ./gemma -bfile [prefix] -mk [filename] -vc -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -mk [filename] -vc -o [prefix]"<<endl;
+		cout<<" to fit a linear mixed model: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -lmm [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -o [prefix]"<<endl;
+		cout<<" to fit a multivariate linear mixed model: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl;
+		cout<<" to fit a Bayesian sparse linear mixed model: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -bslmm [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -bslmm [num] -o [prefix]"<<endl;
+		cout<<" to obtain predicted values: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -epm [filename] -emu [filename] -ebv [filename] -k [filename] -predict [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -epm [filename] -emu [filename] -ebv [filename] -k [filename] -predict [num] -o [prefix]"<<endl;
+		cout<<endl;
+	}
+	
+	if (option==2) {	// input/output file options
+		cout<<" FILE I/O RELATED OPTIONS" << endl;
+		cout<<" -bfile    [prefix]       "<<" specify input PLINK binary ped file prefix."<<endl;
+		cout<<"          requires: *.fam, *.bim and *.bed files"<<endl;
+		cout<<"          missing value: -9"<<endl;
+		cout<<" -g        [filename]     "<<" specify input BIMBAM mean genotype file name"<<endl;
+		cout<<"          format: rs#1, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl;
+		cout<<"                  rs#2, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -p        [filename]     "<<" specify input BIMBAM phenotype file name"<<endl;
+		cout<<"          format: phenotype for individual 1"<<endl;
+		cout<<"                  phenotype for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -a        [filename]     "<<" specify input BIMBAM SNP annotation file name (optional)"<<endl;
+		cout<<"          format: rs#1, base_position, chr_number"<<endl;
+		cout<<"                  rs#2, base_position, chr_number"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<" -k        [filename]     "<<" specify input kinship/relatedness matrix file name"<<endl;
+		cout<<" -mk       [filename]     "<<" specify input file which contains a list of kinship/relatedness matrices"<<endl;
+		cout<<" -u        [filename]     "<<" specify input file containing the eigen vectors of the kinship/relatedness matrix"<<endl;
+		cout<<" -d        [filename]     "<<" specify input file containing the eigen values of the kinship/relatedness matrix"<<endl;
+		cout<<" -c        [filename]     "<<" specify input covariates file name (optional)"<<endl;
+		cout<<"          format: covariate 1 for individual 1, ... , covariate c for individual 1"<<endl;
+		cout<<"                  covariate 1 for individual 2, ... , covariate c for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<"          note: the intercept (a column of 1s) may need to be included"<<endl;
+		cout<<" -epm      [filename]     "<<" specify input estimated parameter file name"<<endl;
+		cout<<" -en [n1] [n2] [n3] [n4]  "<<" specify values for the input estimated parameter file (with a header)"<<endl;
+		cout<<"          options: n1: rs column number"<<endl;
+		cout<<"                   n2: estimated alpha column number (0 to ignore)"<<endl;
+		cout<<"                   n3: estimated beta column number (0 to ignore)"<<endl;
+		cout<<"                   n4: estimated gamma column number (0 to ignore)"<<endl;
+		cout<<"          default: 2 4 5 6 if -ebv is not specified; 2 0 5 6 if -ebv is specified"<<endl;
+		cout<<" -ebv      [filename]     "<<" specify input estimated random effect (breeding value) file name"<<endl;
+		cout<<"          format: value for individual 1"<<endl;
+		cout<<"                  value for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -emu      [filename]     "<<" specify input log file name containing estimated mean"<<endl;
+		cout<<" -mu       [num]          "<<" specify input estimated mean value"<<endl;
+		cout<<" -gene     [filename]     "<<" specify input gene expression file name"<<endl;
+		cout<<"          format: header"<<endl;
+		cout<<"                  gene1, count for individual 1, count for individual 2, ..."<<endl;
+		cout<<"                  gene2, count for individual 1, count for individual 2, ..."<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: not allowed"<<endl;
+		cout<<" -r        [filename]     "<<" specify input total read count file name"<<endl;
+		cout<<"          format: total read count for individual 1"<<endl;
+		cout<<"                  total read count for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -snps     [filename]     "<<" specify input snps file name to only analyze a certain set of snps"<<endl;
+		cout<<"          format: rs#1"<<endl;
+		cout<<"                  rs#2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -silence                 "<<" silent terminal display"<<endl;
+		cout<<" -km       [num]          "<<" specify input kinship/relatedness file type (default 1)."<<endl;
+		cout<<"          options: 1: \"n by n matrix\" format"<<endl;
+		cout<<"                   2: \"id  id  value\" format"<<endl;
+		cout<<" -n        [num]          "<<" specify phenotype column in the phenotype/*.fam file (optional; default 1)"<<endl;
+		cout<<" -pace     [num]          "<<" specify terminal display update pace (default 100000 SNPs or 100000 iterations)."<<endl;
+		cout<<" -outdir   [path]         "<<" specify output directory path (default \"./output/\")"<<endl;
+		cout<<" -o        [prefix]       "<<" specify output file prefix (default \"result\")"<<endl;
+		cout<<"          output: prefix.cXX.txt or prefix.sXX.txt from kinship/relatedness matrix estimation"<<endl;
+		cout<<"          output: prefix.assoc.txt and prefix.log.txt from association tests"<<endl;
+		cout<<endl;
+	}
+	
+	if (option==3) {	// SNP quality-control thresholds
+		cout<<" SNP QC OPTIONS" << endl;
+		cout<<" -miss     [num]          "<<" specify missingness threshold (default 0.05)" << endl;
+		cout<<" -maf      [num]          "<<" specify minor allele frequency threshold (default 0.01)" << endl;
+		cout<<" -hwe      [num]          "<<" specify HWE test p value threshold (default 0; no test)" << endl;
+		cout<<" -r2       [num]          "<<" specify r-squared threshold (default 0.9999)" << endl;
+		cout<<" -notsnp                  "<<" minor allele frequency cutoff is not used" << endl;
+		cout<<endl;
+	}
+	
+	if (option==4) {	// relatedness matrix calculation
+		cout<<" RELATEDNESS MATRIX CALCULATION OPTIONS" << endl;
+		cout<<" -gk       [num]          "<<" specify which type of kinship/relatedness matrix to generate (default 1)" << endl;
+		cout<<"          options: 1: centered XX^T/p"<<endl;
+		cout<<"                   2: standardized XX^T/p"<<endl;
+		cout<<"          note: non-polymorphic SNPs are excluded "<<endl;
+		cout<<endl;
+	}
+	
+	if (option==5) {	// eigen decomposition
+		cout<<" EIGEN-DECOMPOSITION OPTIONS" << endl;
+		cout<<" -eigen                   "<<" specify to perform eigen decomposition of the loaded relatedness matrix" << endl;
+		cout<<endl;
+	}
+
+	if (option==6) {	// variance component estimation
+		cout<<" VARIANCE COMPONENT ESTIMATION OPTIONS" << endl;
+		cout<<" -vc                      "<<" specify to perform variance component estimation for the loaded relatedness matrix/matrices" << endl;
+		cout<<endl;
+	}
+	
+	if (option==7) {	// linear model
+		cout<<" LINEAR MODEL OPTIONS" << endl;
+		cout<<" -lm       [num]         "<<" specify analysis options (default 1)."<<endl;
+		cout<<"          options: 1: Wald test"<<endl;
+		cout<<"                   2: Likelihood ratio test"<<endl;
+		cout<<"                   3: Score test"<<endl;
+		cout<<"                   4: 1-3"<<endl;
+		cout<<endl;
+	}
+	
+	if (option==8) {	// linear mixed model
+		cout<<" LINEAR MIXED MODEL OPTIONS" << endl;
+		cout<<" -lmm      [num]         "<<" specify analysis options (default 1)."<<endl;
+		cout<<"          options: 1: Wald test"<<endl;
+		cout<<"                   2: Likelihood ratio test"<<endl;
+		cout<<"                   3: Score test"<<endl;
+		cout<<"                   4: 1-3"<<endl;
+		cout<<"                   5: Parameter estimation in the null model only"<<endl;
+		cout<<" -lmin     [num]          "<<" specify minimal value for lambda (default 1e-5)" << endl;
+		cout<<" -lmax     [num]          "<<" specify maximum value for lambda (default 1e+5)" << endl;
+		cout<<" -region   [num]          "<<" specify the number of regions used to evaluate lambda (default 10)" << endl;
+		cout<<endl;
+	}
+	
+	if (option==9) {	// multivariate linear mixed model
+		cout<<" MULTIVARIATE LINEAR MIXED MODEL OPTIONS" << endl;
+		cout<<" -pnr				     "<<" specify the pvalue threshold to use the Newton-Raphson's method (default 0.001)"<<endl;
+		cout<<" -emi				     "<<" specify the maximum number of iterations for the PX-EM method in the null (default 10000)"<<endl;
+		cout<<" -nri				     "<<" specify the maximum number of iterations for the Newton-Raphson's method in the null (default 100)"<<endl;
+		cout<<" -emp				     "<<" specify the precision for the PX-EM method in the null (default 0.0001)"<<endl;
+		cout<<" -nrp				     "<<" specify the precision for the Newton-Raphson's method in the null (default 0.0001)"<<endl;
+		cout<<" -crt				     "<<" specify to output corrected pvalues for these pvalues that are below the -pnr threshold"<<endl;
+		cout<<endl;
+	}
+	
+	if (option==10) {	// Bayesian sparse linear mixed model and its MCMC settings
+		cout<<" MULTI-LOCUS ANALYSIS OPTIONS" << endl;
+		cout<<" -bslmm	  [num]			 "<<" specify analysis options (default 1)."<<endl;
+		cout<<"          options: 1: BSLMM"<<endl;
+		cout<<"                   2: standard ridge regression/GBLUP (no mcmc)"<<endl;
+		cout<<"                   3: probit BSLMM (requires 0/1 phenotypes)"<<endl;
+		
+		cout<<"   MCMC OPTIONS" << endl;
+		cout<<"   Prior" << endl;
+		cout<<" -hmin     [num]          "<<" specify minimum value for h (default 0)" << endl;
+		cout<<" -hmax     [num]          "<<" specify maximum value for h (default 1)" << endl;
+		cout<<" -rmin     [num]          "<<" specify minimum value for rho (default 0)" << endl;
+		cout<<" -rmax     [num]          "<<" specify maximum value for rho (default 1)" << endl;
+		cout<<" -pmin     [num]          "<<" specify minimum value for log10(pi) (default log10(1/p), where p is the number of analyzed SNPs )" << endl;
+		cout<<" -pmax     [num]          "<<" specify maximum value for log10(pi) (default log10(1) )" << endl;
+		cout<<" -smin     [num]          "<<" specify minimum value for |gamma| (default 0)" << endl;
+		cout<<" -smax     [num]          "<<" specify maximum value for |gamma| (default 300)" << endl;
+		
+		cout<<"   Proposal" << endl;
+		cout<<" -gmean    [num]          "<<" specify the mean for the geometric distribution (default: 2000)" << endl;
+		cout<<" -hscale   [num]          "<<" specify the step size scale for the proposal distribution of h (value between 0 and 1, default min(10/sqrt(n),1) )" << endl;
+		cout<<" -rscale   [num]          "<<" specify the step size scale for the proposal distribution of rho (value between 0 and 1, default min(10/sqrt(n),1) )" << endl;
+		cout<<" -pscale   [num]          "<<" specify the step size scale for the proposal distribution of log10(pi) (value between 0 and 1, default min(5/sqrt(n),1) )" << endl;
+		
+		cout<<"   Others" << endl;
+		cout<<" -w        [num]          "<<" specify burn-in steps (default 100,000)" << endl;
+		cout<<" -s        [num]          "<<" specify sampling steps (default 1,000,000)" << endl;
+		cout<<" -rpace    [num]          "<<" specify recording pace, record one state in every [num] steps (default 10)" << endl;
+		cout<<" -wpace    [num]          "<<" specify writing pace, write values down in every [num] recorded steps (default 1000)" << endl;
+		cout<<" -seed     [num]          "<<" specify random seed (a random seed is generated by default)" << endl;
+		cout<<" -mh       [num]          "<<" specify number of MH steps in each iteration (default 10)" << endl;
+		cout<<"          requires: 0/1 phenotypes and -bslmm 3 option"<<endl;
+		cout<<endl;
+	}
+	
+	if (option==11) {	// prediction
+		cout<<" PREDICTION OPTIONS" << endl;
+		cout<<" -predict  [num]			 "<<" specify prediction options (default 1)."<<endl;
+		cout<<"          options: 1: predict for individuals with missing phenotypes"<<endl;
+		cout<<"                   2: predict for individuals with missing phenotypes, and convert the predicted values to probability scale. Use only for files fitted with -bslmm 3 option"<<endl;
+		cout<<endl;
+	}
+	
+	if (option==12) {	// general usage notes
+		cout<<" NOTE"<<endl;
+		cout<<" 1. Only individuals with non-missing phenotypes and covariates will be analyzed."<<endl;
+		cout<<" 2. Missing genotypes will be replaced with the mean genotype of that SNP."<<endl;
+		cout<<" 3. For lmm analysis, memory should be large enough to hold the relatedness matrix and to perform eigen decomposition."<<endl;
+		cout<<" 4. For multivariate lmm analysis, use a large -pnr for each snp will increase computation time dramatically."<<endl;
+		cout<<" 5. For bslmm analysis, in addition to 3, memory should be large enough to hold the whole genotype matrix."<<endl;
+		cout<<endl;
+	}
+}
+
+
+
+void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
+{
+	string str;
+	
+	for(int i = 1; i < argc; i++) {		
+		if (strcmp(argv[i], "-bfile")==0 || strcmp(argv[i], "--bfile")==0 || strcmp(argv[i], "-b")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_bfile=str;
+		}
+		else if (strcmp(argv[i], "-silence")==0) {
+			cPar.mode_silence=true;
+		}
+		else if (strcmp(argv[i], "-g")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_geno=str;
+		}
+		else if (strcmp(argv[i], "-p")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_pheno=str;
+		}
+		else if (strcmp(argv[i], "-a")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_anno=str;
+		}
+		else if (strcmp(argv[i], "-k")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_kin=str;
+		}
+		else if (strcmp(argv[i], "-mk")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_mk=str;
+		}
+		else if (strcmp(argv[i], "-u")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_ku=str;
+		}
+		else if (strcmp(argv[i], "-d")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_kd=str;
+		}
+		else if (strcmp(argv[i], "-c")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_cvt=str;
+		}
+		else if (strcmp(argv[i], "-epm")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_epm=str;
+		}
+		else if (strcmp(argv[i], "-en")==0) {			
+			while (argv[i+1] != NULL && argv[i+1][0] != '-') {
+				++i;
+				str.clear();
+				str.assign(argv[i]);
+				cPar.est_column.push_back(atoi(str.c_str()));
+			}
+		}
+		else if (strcmp(argv[i], "-ebv")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_ebv=str;
+		}
+		else if (strcmp(argv[i], "-emu")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_log=str;
+		}
+		else if (strcmp(argv[i], "-mu")==0) {
+			if(argv[i+1] == NULL) {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.pheno_mean=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-gene")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_gene=str;
+		}
+		else if (strcmp(argv[i], "-r")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_read=str;
+		}
+		else if (strcmp(argv[i], "-snps")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_snps=str;
+		}
+		else if (strcmp(argv[i], "-km")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.k_mode=atoi(str.c_str());
+		}		
+		else if (strcmp(argv[i], "-n")==0) {
+			(cPar.p_column).clear();
+			while (argv[i+1] != NULL && argv[i+1][0] != '-') {
+				++i;
+				str.clear();
+				str.assign(argv[i]);
+				(cPar.p_column).push_back(atoi(str.c_str()));
+			}
+		}
+		else if (strcmp(argv[i], "-pace")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.d_pace=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-outdir")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.path_out=str;
+		}
+		else if (strcmp(argv[i], "-o")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_out=str;
+		}		
+		else if (strcmp(argv[i], "-miss")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.miss_level=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-maf")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			if (cPar.maf_level!=-1) {cPar.maf_level=atof(str.c_str());}
+		}
+		else if (strcmp(argv[i], "-hwe")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.hwe_level=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-r2")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.r2_level=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-notsnp")==0) {
+			cPar.maf_level=-1;
+		}
+		else if (strcmp(argv[i], "-gk")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=21; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=20+atoi(str.c_str());
+		}	
+		else if (strcmp(argv[i], "-eigen")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=31; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=30+atoi(str.c_str());
+		}	
+		else if (strcmp(argv[i], "-vc")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=61; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=60+atoi(str.c_str());
+		}	
+		else if (strcmp(argv[i], "-lm")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=51; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=50+atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-fa")==0 || strcmp(argv[i], "-lmm")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=1; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-lmin")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.l_min=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-lmax")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.l_max=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-region")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.n_region=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-pnr")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.p_nr=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-emi")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.em_iter=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-nri")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.nr_iter=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-emp")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.em_prec=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-nrp")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.nr_prec=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-crt")==0) {
+			cPar.crt=1;
+		}
+		else if (strcmp(argv[i], "-bslmm")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=11; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=10+atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-hmin")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.h_min=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-hmax")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.h_max=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-rmin")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.rho_min=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-rmax")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.rho_max=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-pmin")==0) {
+			if(argv[i+1] == NULL) {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.logp_min=atof(str.c_str())*log(10.0);
+		}
+		else if (strcmp(argv[i], "-pmax")==0) {
+			if(argv[i+1] == NULL) {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.logp_max=atof(str.c_str())*log(10.0);
+		}
+		else if (strcmp(argv[i], "-smin")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.s_min=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-smax")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.s_max=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-gmean")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.geo_mean=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-hscale")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.h_scale=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-rscale")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.rho_scale=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-pscale")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.logp_scale=atof(str.c_str())*log(10.0);
+		}
+		else if (strcmp(argv[i], "-w")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.w_step=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-s")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.s_step=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-rpace")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.r_pace=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-wpace")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.w_pace=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-seed")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.randseed=atol(str.c_str());
+		}
+		else if (strcmp(argv[i], "-mh")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.n_mh=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-predict")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=41; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=40+atoi(str.c_str());
+		}
+		else {cout<<"error! unrecognized option: "<<argv[i]<<endl; cPar.error=true; continue;}
+	}
+	
+	//change prediction mode to 43, if the epm file is not provided
+	if (cPar.a_mode==41 && cPar.file_epm.empty()) {cPar.a_mode=43;}
+	
+	return;
+}
+
+
+
+//Run the analysis selected by cPar.a_mode, after reading and checking the input data.
+//Each branch below handles one group of a_mode values:
+//  41, 42   prediction from estimated effects, optionally adding breeding values
+//           (42 additionally maps the prediction through the standard normal CDF)
+//  43       prediction of missing phenotypes with the relatedness matrix only
+//  21, 22   relatedness matrix calculation, written out as "cXX" (21) or "sXX" (22)
+//  51-54    linear model (single phenotype only; the multi-phenotype path is commented out)
+//  61       variance component estimation (single phenotype only)
+//  1-5, 31  LMM / mvLMM fitting; 5 also writes residuals; 31 only writes the eigen-decomposition
+//  11-13    BSLMM (12 = ridge regression, otherwise MCMC)
+//All timings stored in cPar (time_eigen, time_G, time_UtX, time_opt, time_total)
+//are clock() differences converted to minutes.
+//On a read/check failure an error is printed and the function returns early.
+//NOTE(review): those early returns do not release the matrices allocated up to that point.
+void GEMMA::BatchRun (PARAM &cPar) 
+{
+	clock_t time_begin, time_start;
+	time_begin=clock();
+
+	//Read Files
+	cout<<"Reading Files ... "<<endl;
+	cPar.ReadFiles();
+	if (cPar.error==true) {cout<<"error! fail to read files. "<<endl; return;}
+	cPar.CheckData();
+	if (cPar.error==true) {cout<<"error! fail to check data. "<<endl; return;}
+	//Prediction for bslmm	
+	if (cPar.a_mode==41 || cPar.a_mode==42) {
+		gsl_vector *y_prdt;
+		
+		//one predicted value per entry; length is ni_total-ni_test
+		y_prdt=gsl_vector_alloc (cPar.ni_total-cPar.ni_test);
+
+		//set to zero
+		gsl_vector_set_zero (y_prdt);
+		
+		PRDT cPRDT;
+		cPRDT.CopyFromParam(cPar);
+		
+		//add breeding value if needed
+		if (!cPar.file_kin.empty() && !cPar.file_ebv.empty()) {
+			cout<<"Adding Breeding Values ... "<<endl;
+			
+			gsl_matrix *G=gsl_matrix_alloc (cPar.ni_total, cPar.ni_total);
+			gsl_vector *u_hat=gsl_vector_alloc (cPar.ni_test);
+			
+			//read kinship matrix and set u_hat
+			//indicator_all selects every individual, so the full kinship matrix is read
+			vector<int> indicator_all;
+			size_t c_bv=0;
+			for (size_t i=0; i<cPar.indicator_idv.size(); i++) {
+				indicator_all.push_back(1);
+				if (cPar.indicator_bv[i]==1) {gsl_vector_set(u_hat, c_bv, cPar.vec_bv[i]); c_bv++;}
+			}
+			
+			ReadFile_kin (cPar.file_kin, indicator_all, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
+			
+			//read u			
+			cPRDT.AddBV(G, u_hat, y_prdt);					
+			
+			gsl_matrix_free(G);
+			gsl_vector_free(u_hat);
+		}
+
+		//add beta
+		if (!cPar.file_bfile.empty()) {
+			cPRDT.AnalyzePlink (y_prdt);
+		}
+		else {
+			cPRDT.AnalyzeBimbam (y_prdt);
+		}
+		
+		//add mu
+		gsl_vector_add_constant(y_prdt, cPar.pheno_mean);
+		
+		//convert y to probability if needed
+		if (cPar.a_mode==42) {
+			double d;
+			for (size_t i=0; i<y_prdt->size; i++) {
+				d=gsl_vector_get(y_prdt, i);
+				d=gsl_cdf_gaussian_P(d, 1.0);
+				gsl_vector_set(y_prdt, i, d);
+			}
+		}
+			
+			
+		cPRDT.CopyToParam(cPar);
+		
+		cPRDT.WriteFiles(y_prdt);
+		
+		gsl_vector_free(y_prdt);
+	}
+	
+	
+	//Prediction with kinship matrix only; for one or more phenotypes
+	if (cPar.a_mode==43) {
+		//first, use individuals with full phenotypes to obtain estimates of Vg and Ve		
+		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);		
+		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1);
+		gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); 
+		gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2);
+		gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2);
+		gsl_vector *eval=gsl_vector_alloc (Y->size1);
+		
+		//the "_full" matrices have ni_cvt rows instead of ni_test rows
+		gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph);
+		gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt);
+		//set covariates matrix W and phenotype matrix Y
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, Y, 0);
+		cPar.CopyCvtPhen (W_full, Y_full, 1);
+				
+		gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph);		
+		gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1);		
+		gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2);
+				
+		//read relatedness matrix G, and matrix G_full
+		ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+		if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
+		ReadFile_kin (cPar.file_kin, cPar.indicator_cvt, cPar.mapID2num, cPar.k_mode, cPar.error, G_full);
+		if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
+				
+		//center matrix G
+		CenterMatrix (G);
+		CenterMatrix (G_full);
+		
+		//eigen-decomposition and calculate trace_G
+		cout<<"Start Eigen-Decomposition..."<<endl;
+		time_start=clock();	
+		//the value returned by EigenDecomp is overwritten below: trace_G is recomputed
+		//as the mean of the eigenvalues after those below 1e-10 are set to zero
+		cPar.trace_G=EigenDecomp (G, U, eval, 0);
+		cPar.trace_G=0.0;
+		for (size_t i=0; i<eval->size; i++) {
+			if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
+			cPar.trace_G+=gsl_vector_get (eval, i);
+		}
+		cPar.trace_G/=(double)eval->size;
+		cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+		
+		//calculate UtW and Uty
+		CalcUtX (U, W, UtW);
+		CalcUtX (U, Y, UtY);
+
+		//calculate variance component and beta estimates
+		//and then obtain predicted values
+		if (cPar.n_ph==1) {
+			gsl_vector *beta=gsl_vector_alloc (W->size2);
+			gsl_vector *se_beta=gsl_vector_alloc (W->size2);
+			
+			double lambda, logl, vg, ve;
+			gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+
+			//obtain estimates
+			CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, lambda, logl);
+			CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, lambda, vg, ve, beta, se_beta);
+
+			cout<<"REMLE estimate for vg in the null model = "<<vg<<endl;
+			cout<<"REMLE estimate for ve in the null model = "<<ve<<endl;
+			cPar.vg_remle_null=vg; cPar.ve_remle_null=ve;
+			
+			//obtain Y_hat from fixed effects
+			gsl_vector_view Yhat_col=gsl_matrix_column (Y_hat, 0);			
+			gsl_blas_dgemv (CblasNoTrans, 1.0, W_full, beta, 0.0, &Yhat_col.vector);
+			
+			//obtain H = ve*I + vg*G_full (G_full is scaled in place)
+			gsl_matrix_set_identity (H_full);
+			gsl_matrix_scale (H_full, ve);
+			gsl_matrix_scale (G_full, vg);
+			gsl_matrix_add (H_full, G_full);
+			
+			//free matrices			
+			gsl_vector_free(beta);
+			gsl_vector_free(se_beta);
+		} else {			
+			gsl_matrix *Vg=gsl_matrix_alloc (cPar.n_ph, cPar.n_ph);
+			gsl_matrix *Ve=gsl_matrix_alloc (cPar.n_ph, cPar.n_ph);
+			gsl_matrix *B=gsl_matrix_alloc (cPar.n_ph, W->size2);
+			gsl_matrix *se_B=gsl_matrix_alloc (cPar.n_ph, W->size2);
+			
+			//obtain estimates
+			CalcMvLmmVgVeBeta (eval, UtW, UtY, cPar.em_iter, cPar.nr_iter, cPar.em_prec, cPar.nr_prec, cPar.l_min, cPar.l_max, cPar.n_region, Vg, Ve, B, se_B);
+			
+			//print the lower triangle of Vg and Ve
+			cout<<"REMLE estimate for Vg in the null model: "<<endl;
+			for (size_t i=0; i<Vg->size1; i++) {
+				for (size_t j=0; j<=i; j++) {
+					cout<<gsl_matrix_get(Vg, i, j)<<"\t";
+				}
+				cout<<endl;
+			}
+			cout<<"REMLE estimate for Ve in the null model: "<<endl;
+			for (size_t i=0; i<Ve->size1; i++) {
+				for (size_t j=0; j<=i; j++) {
+					cout<<gsl_matrix_get(Ve, i, j)<<"\t";
+				}
+				cout<<endl;
+			}
+			//store the upper triangle (including the diagonal) row by row
+			cPar.Vg_remle_null.clear();
+			cPar.Ve_remle_null.clear();
+			for (size_t i=0; i<Vg->size1; i++) {
+				for (size_t j=i; j<Vg->size2; j++) {
+					cPar.Vg_remle_null.push_back(gsl_matrix_get (Vg, i, j) );
+					cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) );
+				}
+			}
+			
+			//obtain Y_hat from fixed effects
+			gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat);
+			
+			//obtain H: Kronecker product of G_full and Vg, plus Ve added to each diagonal block
+			KroneckerSym(G_full, Vg, H_full);
+			for (size_t i=0; i<G_full->size1; i++) {
+				gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2);
+				gsl_matrix_add (&H_sub.matrix, Ve);
+			}
+			
+			//free matrices					
+			gsl_matrix_free (Vg);
+			gsl_matrix_free (Ve);
+			gsl_matrix_free (B);
+			gsl_matrix_free (se_B);
+		}
+					
+		PRDT cPRDT;
+		
+		cPRDT.CopyFromParam(cPar);
+		
+		cout<<"Predicting Missing Phentypes ... "<<endl;
+		time_start=clock();	
+		cPRDT.MvnormPrdt(Y_hat, H_full, Y_full);
+		cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+
+		cPRDT.WriteFiles(Y_full);
+		
+		gsl_matrix_free(Y);
+		gsl_matrix_free(W);		
+		gsl_matrix_free(G);
+		gsl_matrix_free(U); 
+		gsl_matrix_free(UtW);
+		gsl_matrix_free(UtY);
+		gsl_vector_free(eval);
+		
+		gsl_matrix_free(Y_full);
+		gsl_matrix_free(Y_hat);
+		gsl_matrix_free(W_full);
+		gsl_matrix_free(G_full);		
+		gsl_matrix_free(H_full);
+	}
+	
+	
+	//Generate Kinship matrix
+	if (cPar.a_mode==21 || cPar.a_mode==22) {  
+		cout<<"Calculating Relatedness Matrix ... "<<endl;
+		
+		gsl_matrix *G=gsl_matrix_alloc (cPar.ni_total, cPar.ni_total);
+		
+		time_start=clock();
+		cPar.CalcKin (G);
+		cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		if (cPar.error==true) {cout<<"error! fail to calculate relatedness matrix. "<<endl; return;}
+		
+		if (cPar.a_mode==21) {
+			cPar.WriteMatrix (G, "cXX");
+		} else {
+			cPar.WriteMatrix (G, "sXX");
+		}
+		
+		gsl_matrix_free (G);
+	}
+	
+	
+	//LM
+	if (cPar.a_mode==51 || cPar.a_mode==52 || cPar.a_mode==53 || cPar.a_mode==54) {  //Fit LM
+		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);	
+		
+		//set covariates matrix W and phenotype matrix Y		
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, Y, 0);
+		
+		//Fit LM or mvLM
+		//only the single-phenotype case is active; nothing is fitted when n_ph!=1
+		if (cPar.n_ph==1) {			
+			LM cLm;
+			cLm.CopyFromParam(cPar);
+			
+			gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
+			
+			if (!cPar.file_gene.empty()) {		
+				cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype
+			} else if (!cPar.file_bfile.empty()) {
+				cLm.AnalyzePlink (W, &Y_col.vector);
+			} else {
+				cLm.AnalyzeBimbam (W, &Y_col.vector);
+			}
+			
+			cLm.WriteFiles();
+			cLm.CopyToParam(cPar);
+		}
+		/*
+		else {			 
+			MVLM cMvlm;
+			cMvlm.CopyFromParam(cPar);			
+			
+			if (!cPar.file_bfile.empty()) {
+				cMvlm.AnalyzePlink (W, Y);
+			} else {
+				cMvlm.AnalyzeBimbam (W, Y);
+			}
+			
+			cMvlm.WriteFiles();
+			cMvlm.CopyToParam(cPar);
+		}
+		*/
+		//release all matrices and vectors
+		gsl_matrix_free (Y);
+		gsl_matrix_free (W);
+	} 
+
+
+	//VC estimation with one or multiple kinship matrices
+	//REML approach only
+	//if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp
+	//for one phenotype only; 
+	if (cPar.a_mode==61) {
+		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
+		//G holds n_vc square matrices side by side, one column block per variance component
+		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc );
+
+		//set covariates matrix W and phenotype matrix Y		
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, Y, 0);
+
+		//read kinship matrices
+		if (!(cPar.file_mk).empty()) {
+		  ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+		  if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
+	
+		  //center matrix G, and obtain v_traceG
+		  //v_traceG[i] is the mean diagonal element of the i-th centered block
+		  double d=0;
+		  (cPar.v_traceG).clear();
+		  for (size_t i=0; i<cPar.n_vc; i++) {
+		    gsl_matrix_view G_sub=gsl_matrix_submatrix (G, 0, i*G->size1, G->size1, G->size1);
+		    CenterMatrix (&G_sub.matrix);
+		    d=0;
+		    for (size_t j=0; j<G->size1; j++) {
+		      d+=gsl_matrix_get (&G_sub.matrix, j, j);
+		    }
+		    d/=(double)G->size1;
+		    (cPar.v_traceG).push_back(d);
+		  }
+		} else if (!(cPar.file_kin).empty()) {
+			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
+						
+			//center matrix G
+			CenterMatrix (G);
+
+			(cPar.v_traceG).clear();
+			double d=0;
+			for (size_t j=0; j<G->size1; j++) {
+			  d+=gsl_matrix_get (G, j, j);
+			}
+			d/=(double)G->size1;
+			(cPar.v_traceG).push_back(d);
+		}
+			/*
+			//eigen-decomposition and calculate trace_G
+			cout<<"Start Eigen-Decomposition..."<<endl;
+			time_start=clock();	
+	
+			if (cPar.a_mode==31) {
+				cPar.trace_G=EigenDecomp (G, U, eval, 1);
+			} else {
+				cPar.trace_G=EigenDecomp (G, U, eval, 0);
+			}
+
+			cPar.trace_G=0.0;
+			for (size_t i=0; i<eval->size; i++) {
+				if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
+				cPar.trace_G+=gsl_vector_get (eval, i);
+			}
+			cPar.trace_G/=(double)eval->size;
+
+			cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+		} else {
+			ReadFile_eigenU (cPar.file_ku, cPar.error, U);
+			if (cPar.error==true) {cout<<"error! fail to read the U file. "<<endl; return;}
+			
+			ReadFile_eigenD (cPar.file_kd, cPar.error, eval);			
+			if (cPar.error==true) {cout<<"error! fail to read the D file. "<<endl; return;}
+			
+			cPar.trace_G=0.0;
+			for (size_t i=0; i<eval->size; i++) {
+				if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);}
+			  	cPar.trace_G+=gsl_vector_get(eval, i);
+			}
+			cPar.trace_G/=(double)eval->size;
+		}
+		*/
+		//fit multiple variance components
+		if (cPar.n_ph==1) {
+		  //		  if (cPar.n_vc==1) {
+		    /*
+		    //calculate UtW and Uty	
+		    CalcUtX (U, W, UtW);
+		    CalcUtX (U, Y, UtY);
+
+		    gsl_vector_view beta=gsl_matrix_row (B, 0);
+		    gsl_vector_view se_beta=gsl_matrix_row (se_B, 0);
+		    gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+
+		    CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
+		    CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector);
+
+		    cPar.beta_mle_null.clear();
+		    cPar.se_beta_mle_null.clear();
+		    for (size_t i=0; i<B->size2; i++) {
+		      cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) );
+		      cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) );
+		    }
+
+		    CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
+		    CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector);
+		    cPar.beta_remle_null.clear();
+		    cPar.se_beta_remle_null.clear();
+		    for (size_t i=0; i<B->size2; i++) {
+		      cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) );
+		      cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) );
+		    }
+				
+		    CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
+		    cPar.PrintSummary();
+				
+		    //calculate and output residuals
+		    if (cPar.a_mode==5) {
+		      gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1);
+		      gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1);
+		      gsl_vector *u_hat=gsl_vector_alloc (Y->size1);
+		      gsl_vector *e_hat=gsl_vector_alloc (Y->size1);
+		      gsl_vector *y_hat=gsl_vector_alloc (Y->size1);
+					
+		      //obtain Utu and Ute
+		      gsl_vector_memcpy (y_hat, &UtY_col.vector);
+		      gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat);
+		      
+		      double d, u, e;
+		      for (size_t i=0; i<eval->size; i++) {
+			d=gsl_vector_get (eval, i);
+			u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
+			e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
+			gsl_vector_set (Utu_hat, i, u);
+			gsl_vector_set (Ute_hat, i, e);
+		      }
+					
+		      //obtain u and e
+		      gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat);
+		      gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat);
+		      
+		      //output residuals					
+		      cPar.WriteVector(u_hat, "residU");
+		      cPar.WriteVector(e_hat, "residE");
+		      
+		      gsl_vector_free(u_hat);
+		      gsl_vector_free(e_hat);
+		      gsl_vector_free(y_hat);
+		    }	
+*/	
+		  //		  } else {
+		    gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
+		    VC cVc;
+		    cVc.CopyFromParam(cPar); 
+		    cVc.CalcVCreml (G, W, &Y_col.vector);			
+		    cVc.CopyToParam(cPar);
+
+		    //obtain pve from sigma2
+		    //obtain se_pve from se_sigma2
+		    
+		    //}
+		} 
+
+		//release all matrices; previously Y, W and G were never freed in this branch
+		gsl_matrix_free (Y);
+		gsl_matrix_free (W);
+		gsl_matrix_free (G);
+	}
+	
+	
+	//LMM or mvLMM or Eigen-Decomposition
+	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==31) {  //Fit LMM or mvLMM or eigen
+		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
+		gsl_matrix *B=gsl_matrix_alloc (Y->size2, W->size2);	//B is a d by c matrix
+		gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2);
+		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1);
+		gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); 
+		gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2);
+		gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2);
+		gsl_vector *eval=gsl_vector_alloc (Y->size1);
+				
+		//set covariates matrix W and phenotype matrix Y		
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, Y, 0);
+				
+		//read relatedness matrix G	
+		if (!(cPar.file_kin).empty()) {
+			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
+						
+			//center matrix G
+			CenterMatrix (G);
+			
+			//eigen-decomposition and calculate trace_G
+			cout<<"Start Eigen-Decomposition..."<<endl;
+			time_start=clock();	
+	
+			if (cPar.a_mode==31) {
+				cPar.trace_G=EigenDecomp (G, U, eval, 1);
+			} else {
+				cPar.trace_G=EigenDecomp (G, U, eval, 0);
+			}
+
+			//the value returned by EigenDecomp is overwritten: trace_G is recomputed
+			//as the mean of the eigenvalues after those below 1e-10 are set to zero
+			cPar.trace_G=0.0;
+			for (size_t i=0; i<eval->size; i++) {
+				if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
+				cPar.trace_G+=gsl_vector_get (eval, i);
+			}
+			cPar.trace_G/=(double)eval->size;
+
+			cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+		} else {
+			//no kinship file: read a precomputed decomposition from file_ku and file_kd
+			ReadFile_eigenU (cPar.file_ku, cPar.error, U);
+			if (cPar.error==true) {cout<<"error! fail to read the U file. "<<endl; return;}
+			
+			ReadFile_eigenD (cPar.file_kd, cPar.error, eval);			
+			if (cPar.error==true) {cout<<"error! fail to read the D file. "<<endl; return;}
+			
+			cPar.trace_G=0.0;
+			for (size_t i=0; i<eval->size; i++) {
+				if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);}
+			  	cPar.trace_G+=gsl_vector_get(eval, i);
+			}
+			cPar.trace_G/=(double)eval->size;
+		}
+		
+		if (cPar.a_mode==31) {
+			cPar.WriteMatrix(U, "eigenU");
+			cPar.WriteVector(eval, "eigenD");
+		} else {
+			//calculate UtW and Uty	
+			CalcUtX (U, W, UtW);
+			CalcUtX (U, Y, UtY);			
+
+			//calculate REMLE/MLE estimate and pve for univariate model
+			if (cPar.n_ph==1) {
+				//beta and se_beta are views of row 0 of B and se_B,
+				//so each CalcLmmVgVeBeta call below writes into B and se_B
+				gsl_vector_view beta=gsl_matrix_row (B, 0);
+				gsl_vector_view se_beta=gsl_matrix_row (se_B, 0);
+				gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+
+				CalcLambda ('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
+				CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector);
+
+				cPar.beta_mle_null.clear();
+				cPar.se_beta_mle_null.clear();
+				for (size_t i=0; i<B->size2; i++) {
+					cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i) );
+					cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i) );
+				}
+
+				CalcLambda ('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
+				CalcLmmVgVeBeta (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector, &se_beta.vector);
+				cPar.beta_remle_null.clear();
+				cPar.se_beta_remle_null.clear();
+				for (size_t i=0; i<B->size2; i++) {
+					cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) );
+					cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) );
+				}
+				
+				CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
+				cPar.PrintSummary();
+				
+				//calculate and output residuals
+				if (cPar.a_mode==5) {
+					gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1);
+					gsl_vector *Ute_hat=gsl_vector_alloc (Y->size1);
+					gsl_vector *u_hat=gsl_vector_alloc (Y->size1);
+					gsl_vector *e_hat=gsl_vector_alloc (Y->size1);
+					gsl_vector *y_hat=gsl_vector_alloc (Y->size1);
+					
+					//obtain Utu and Ute
+					//y_hat = UtY - UtW*beta, using the REMLE beta computed above
+					gsl_vector_memcpy (y_hat, &UtY_col.vector);
+					gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat);
+					
+					//split each element of y_hat into the fractions
+					//lambda*d/(lambda*d+1) and 1/(lambda*d+1), with lambda=l_remle_null
+					double d, u, e;
+					for (size_t i=0; i<eval->size; i++) {
+						d=gsl_vector_get (eval, i);
+						u=cPar.l_remle_null*d/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
+						e=1.0/(cPar.l_remle_null*d+1.0)*gsl_vector_get(y_hat, i);
+						gsl_vector_set (Utu_hat, i, u);
+						gsl_vector_set (Ute_hat, i, e);
+					}
+					
+					//obtain u and e
+					gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat);
+					gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat);
+					
+					//output residuals					
+					cPar.WriteVector(u_hat, "residU");
+					cPar.WriteVector(e_hat, "residE");
+					
+					//release all five work vectors; Utu_hat and Ute_hat were previously leaked
+					gsl_vector_free(Utu_hat);
+					gsl_vector_free(Ute_hat);
+					gsl_vector_free(u_hat);
+					gsl_vector_free(e_hat);
+					gsl_vector_free(y_hat);
+				}							
+			} 
+			
+			//Fit LMM or mvLMM
+			if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) {
+				if (cPar.n_ph==1) {			
+					LMM cLmm;
+					cLmm.CopyFromParam(cPar);
+					
+					gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
+					gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+					
+					if (!cPar.file_gene.empty()) {		
+						cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype
+					} else if (!cPar.file_bfile.empty()) {
+						cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
+					} else {
+						cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
+					}	
+					
+					cLmm.WriteFiles();
+					cLmm.CopyToParam(cPar);
+				} else {			 
+					MVLMM cMvlmm;
+					cMvlmm.CopyFromParam(cPar);			
+					
+					if (!cPar.file_bfile.empty()) {
+						cMvlmm.AnalyzePlink (U, eval, UtW, UtY);
+					} else {
+						cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY);
+					}
+					
+					cMvlmm.WriteFiles();
+					cMvlmm.CopyToParam(cPar);
+				}
+			}
+		}
+		
+				
+		//release all matrices and vectors
+		gsl_matrix_free (Y);
+		gsl_matrix_free (W);
+		gsl_matrix_free(B);
+		gsl_matrix_free(se_B);
+		gsl_matrix_free (G);	
+		gsl_matrix_free (U);
+		gsl_matrix_free (UtW);
+		gsl_matrix_free (UtY);
+		gsl_vector_free (eval);
+	} 
+	
+	
+	//BSLMM
+	if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		gsl_vector *y=gsl_vector_alloc (cPar.ni_test);
+		gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt);	
+		gsl_matrix *G=gsl_matrix_alloc (y->size, y->size);
+		gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test);	
+		
+		//set covariates matrix W and phenotype vector y		
+		//an intercept should be included in W, 
+		cPar.CopyCvtPhen (W, y, 0);
+		
+		//center y, even for case/control data
+		cPar.pheno_mean=CenterVector(y);
+
+		//run bslmm if rho==1
+		if (cPar.rho_min==1 && cPar.rho_max==1) {
+		  //read genotypes X (not UtX)
+		  cPar.ReadGenotypes (UtX, G, false);
+
+		  //perform BSLMM analysis
+		  BSLMM cBslmm;
+		  cBslmm.CopyFromParam(cPar);
+		  time_start=clock();	
+		  cBslmm.MCMC(UtX, y);
+		  cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		  cBslmm.CopyToParam(cPar);
+		  //else, if rho!=1
+		} else {
+		gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); 
+		gsl_vector *eval=gsl_vector_alloc (y->size);
+		gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2);
+		gsl_vector *Uty=gsl_vector_alloc (y->size);
+
+		
+		//read relatedness matrix G		
+		if (!(cPar.file_kin).empty()) {		
+			cPar.ReadGenotypes (UtX, G, false);
+			
+			//read relatedness matrix G
+			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
+			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
+			
+			//center matrix G
+			CenterMatrix (G);
+		} else {
+			//no kinship file: the third argument is true here and false in the other ReadGenotypes calls
+			//NOTE(review): presumably this asks ReadGenotypes to fill G from the genotypes -- confirm
+			cPar.ReadGenotypes (UtX, G, true);
+		}
+		
+		//eigen-decomposition and calculate trace_G
+		cout<<"Start Eigen-Decomposition..."<<endl;
+		time_start=clock();
+		//the value returned by EigenDecomp is overwritten below: trace_G is recomputed
+		//as the mean of the eigenvalues after those below 1e-10 are set to zero
+		cPar.trace_G=EigenDecomp (G, U, eval, 0);
+		cPar.trace_G=0.0;
+		for (size_t i=0; i<eval->size; i++) {
+			if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
+			cPar.trace_G+=gsl_vector_get (eval, i);
+		}
+		cPar.trace_G/=(double)eval->size;
+		cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);			
+		
+		//calculate UtW and Uty		
+		CalcUtX (U, W, UtW);
+		CalcUtX (U, y, Uty);
+		
+		//calculate REMLE/MLE estimate and pve
+		CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
+		CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
+		CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
+		
+		cPar.PrintSummary();
+				
+		//Create and calculate UtX, use a large memory
+		cout<<"Calculating UtX..."<<endl;
+		time_start=clock();							
+		CalcUtX (U, UtX);
+		cPar.time_UtX=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//perform BSLMM analysis
+		BSLMM cBslmm;
+		cBslmm.CopyFromParam(cPar);
+		time_start=clock();	
+		if (cPar.a_mode==12) {  //ridge regression				
+			cBslmm.RidgeR(U, UtX, Uty, eval, cPar.l_remle_null);
+		} else {	//Run MCMC
+			cBslmm.MCMC(U, UtX, Uty, eval, y);
+		}
+		cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		cBslmm.CopyToParam(cPar);
+		
+		//release matrices and vectors used only when rho!=1
+		gsl_matrix_free (U);
+		gsl_matrix_free (UtW);
+		gsl_vector_free (eval);
+		gsl_vector_free (Uty);
+
+		}
+		//release objects shared by both paths; G is freed here so that
+		//the rho==1 path no longer leaks it
+		gsl_matrix_free (G);
+		gsl_matrix_free (W);
+		gsl_vector_free (y);
+		gsl_matrix_free (UtX);
+	} 
+	
+	
+		
+	cPar.time_total=(clock()-time_begin)/(double(CLOCKS_PER_SEC)*60.0);
+	
+	return;
+}
+
+
+
+
+void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) 
+{
+	string file_str;
+	file_str=cPar.path_out+"/"+cPar.file_out;
+	file_str+=".log.txt";
+	
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing log file: "<<file_str.c_str()<<endl; return;}
+	
+	outfile<<"##"<<endl;
+	outfile<<"## GEMMA Version = "<<version<<endl;
+	
+	outfile<<"##"<<endl;
+	outfile<<"## Command Line Input = ";
+	for(int i = 1; i < argc; i++) {	
+		outfile<<argv[i]<<" ";
+	}
+	outfile<<endl;
+
+	outfile<<"##"<<endl;
+	time_t  rawtime; 
+	time(&rawtime);
+	tm *ptm = localtime (&rawtime);
+
+	outfile<<"## Date = "<<asctime(ptm)<<endl;
+	  //ptm->tm_year<<":"<<ptm->tm_month<<":"<<ptm->tm_day":"<<ptm->tm_hour<<":"<<ptm->tm_min<<endl;
+	
+	outfile<<"##"<<endl;
+	outfile<<"## Summary Statistics:"<<endl;
+	outfile<<"## number of total individuals = "<<cPar.ni_total<<endl;	
+	if (cPar.a_mode==43) {
+		outfile<<"## number of analyzed individuals = "<<cPar.ni_cvt<<endl;
+		outfile<<"## number of individuals with full phenotypes = "<<cPar.ni_test<<endl;
+	} else {
+		outfile<<"## number of analyzed individuals = "<<cPar.ni_test<<endl;
+	}
+	outfile<<"## number of covariates = "<<cPar.n_cvt<<endl;
+	outfile<<"## number of phenotypes = "<<cPar.n_ph<<endl;
+	if (cPar.a_mode==43) {
+		outfile<<"## number of observed data = "<<cPar.np_obs<<endl;
+		outfile<<"## number of missing data = "<<cPar.np_miss<<endl;
+	}
+	if (cPar.a_mode==61) {
+		outfile<<"## number of variance components = "<<cPar.n_vc<<endl;
+	}
+		
+	if (!(cPar.file_gene).empty()) {
+		outfile<<"## number of total genes = "<<cPar.ng_total<<endl;
+		outfile<<"## number of analyzed genes = "<<cPar.ng_test<<endl;		
+	} else if (cPar.file_epm.empty()) {	
+		outfile<<"## number of total SNPs = "<<cPar.ns_total<<endl;	
+		outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	} else {
+		outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	}
+	
+	if (cPar.a_mode==13) {
+		outfile<<"## number of cases = "<<cPar.ni_case<<endl;
+		outfile<<"## number of controls = "<<cPar.ni_control<<endl;
+	}
+
+
+	if (cPar.a_mode==61) {
+	  //	        outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl;
+		if (cPar.n_ph==1) {
+		  outfile<<"## pve estimate in the null model = ";
+		  for (size_t i=0; i<cPar.v_pve.size(); i++) {
+		    outfile<<"  "<<cPar.v_pve[i];
+		  }
+		  outfile<<endl;
+
+		  outfile<<"## se(pve) in the null model = ";
+		  for (size_t i=0; i<cPar.v_se_pve.size(); i++) {
+		    outfile<<"  "<<cPar.v_se_pve[i];
+		  }
+		  outfile<<endl;
+
+		  outfile<<"## sigma2 estimate in the null model = ";
+		  for (size_t i=0; i<cPar.v_sigma2.size(); i++) {
+		    outfile<<"  "<<cPar.v_sigma2[i];
+		  }
+		  outfile<<endl;
+
+		  outfile<<"## se(sigma2) in the null model = ";
+		  for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) {
+		    outfile<<"  "<<cPar.v_se_sigma2[i];
+		  }
+		  outfile<<endl;
+		  /*
+			outfile<<"## beta estimate in the null model = ";
+			for (size_t i=0; i<cPar.beta_remle_null.size(); i++) {
+				outfile<<"  "<<cPar.beta_remle_null[i];
+			}
+			outfile<<endl;
+			outfile<<"## se(beta) = ";
+			for (size_t i=0; i<cPar.se_beta_remle_null.size(); i++) {
+				outfile<<"  "<<cPar.se_beta_remle_null[i];
+			}
+			outfile<<endl;
+		  */
+		}
+	}
+	
+	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl;
+		outfile<<"## MLE log-likelihood in the null model = "<<cPar.logl_mle_H0<<endl;
+		if (cPar.n_ph==1) {
+			//outfile<<"## lambda REMLE estimate in the null (linear mixed) model = "<<cPar.l_remle_null<<endl;
+			//outfile<<"## lambda MLE estimate in the null (linear mixed) model = "<<cPar.l_mle_null<<endl;	
+			outfile<<"## pve estimate in the null model = "<<cPar.pve_null<<endl;
+			outfile<<"## se(pve) in the null model = "<<cPar.pve_se_null<<endl;	
+			outfile<<"## vg estimate in the null model = "<<cPar.vg_remle_null<<endl;
+			outfile<<"## ve estimate in the null model = "<<cPar.ve_remle_null<<endl;	
+			outfile<<"## beta estimate in the null model = ";
+			for (size_t i=0; i<cPar.beta_remle_null.size(); i++) {
+				outfile<<"  "<<cPar.beta_remle_null[i];
+			}
+			outfile<<endl;
+			outfile<<"## se(beta) = ";
+			for (size_t i=0; i<cPar.se_beta_remle_null.size(); i++) {
+				outfile<<"  "<<cPar.se_beta_remle_null[i];
+			}
+			outfile<<endl;
+			
+		} else {
+			size_t c;
+			outfile<<"## REMLE estimate for Vg in the null model: "<<endl;			
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Vg_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(Vg): "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<sqrt(cPar.VVg_remle_null[c])<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## REMLE estimate for Ve in the null model: "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Ve_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(Ve): "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<sqrt(cPar.VVe_remle_null[c])<<"\t";
+				}
+				outfile<<endl;
+			}
+			
+			outfile<<"## MLE estimate for Vg in the null model: "<<endl;
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<cPar.n_ph; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Vg_mle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(Vg): "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<sqrt(cPar.VVg_mle_null[c])<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## MLE estimate for Ve in the null model: "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<cPar.n_ph; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Ve_mle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(Ve): "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<sqrt(cPar.VVe_mle_null[c])<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## estimate for B (d by c) in the null model (columns correspond to the covariates provided in the file): "<<endl;
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<cPar.n_cvt; j++) {
+					c=i*cPar.n_cvt+j;
+					outfile<<cPar.beta_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## se(B): "<<endl;
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<cPar.n_cvt; j++) {
+					c=i*cPar.n_cvt+j;
+					outfile<<cPar.se_beta_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+		}
+	}
+	
+	/*
+	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		if (cPar.n_ph==1) {
+			outfile<<"## REMLE vg estimate in the null model = "<<cPar.vg_remle_null<<endl;
+			outfile<<"## REMLE ve estimate in the null model = "<<cPar.ve_remle_null<<endl;	
+		} else {
+			size_t c;
+			outfile<<"## REMLE estimate for Vg in the null model: "<<endl;			
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Vg_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+			outfile<<"## REMLE estimate for Ve in the null model: "<<endl;	
+			for (size_t i=0; i<cPar.n_ph; i++) {
+				for (size_t j=0; j<=i; j++) {
+					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
+					outfile<<cPar.Ve_remle_null[c]<<"\t";
+				}
+				outfile<<endl;
+			}
+		}
+	}
+	 */
+	
+	
+	if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		outfile<<"## estimated mean = "<<cPar.pheno_mean<<endl;
+	}
+	
+	if (cPar.a_mode==11 || cPar.a_mode==13) {	
+		outfile<<"##"<<endl;
+		outfile<<"## MCMC related:"<<endl;	
+		outfile<<"## initial value of h = "<<cPar.cHyp_initial.h<<endl;
+		outfile<<"## initial value of rho = "<<cPar.cHyp_initial.rho<<endl;
+		outfile<<"## initial value of pi = "<<exp(cPar.cHyp_initial.logp)<<endl;
+		outfile<<"## initial value of |gamma| = "<<cPar.cHyp_initial.n_gamma<<endl;
+		outfile<<"## random seed = "<<cPar.randseed<<endl;
+		outfile<<"## acceptance ratio = "<<(double)cPar.n_accept/(double)((cPar.w_step+cPar.s_step)*cPar.n_mh)<<endl;
+	}
+	
+	outfile<<"##"<<endl;
+	outfile<<"## Computation Time:"<<endl;
+	outfile<<"## total computation time = "<<cPar.time_total<<" min "<<endl;
+	outfile<<"## computation time break down: "<<endl;
+	if (cPar.a_mode==21 || cPar.a_mode==22 || cPar.a_mode==11 || cPar.a_mode==13) {
+		outfile<<"##      time on calculating relatedness matrix = "<<cPar.time_G<<" min "<<endl;
+	}
+	if (cPar.a_mode==31) {
+		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
+	}
+	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
+		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
+		outfile<<"##      time on calculating UtX = "<<cPar.time_UtX<<" min "<<endl;		
+	}
+	if ((cPar.a_mode>=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) {
+		outfile<<"##      time on optimization = "<<cPar.time_opt<<" min "<<endl;
+	}
+	if (cPar.a_mode==11 || cPar.a_mode==13) {
+		outfile<<"##      time on proposal = "<<cPar.time_Proposal<<" min "<<endl;
+		outfile<<"##      time on mcmc = "<<cPar.time_opt<<" min "<<endl;
+		outfile<<"##      time on Omega = "<<cPar.time_Omega<<" min "<<endl;
+	}
+	if (cPar.a_mode==41 || cPar.a_mode==42) {
+		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
+	}
+	if (cPar.a_mode==43) {
+		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
+		outfile<<"##      time on predicting phenotypes = "<<cPar.time_opt<<" min "<<endl;
+	}
+	outfile<<"##"<<endl;
+	
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
diff --git a/src/gemma.h b/src/gemma.h
new file mode 100644
index 0000000..acb1309
--- /dev/null
+++ b/src/gemma.h
@@ -0,0 +1,52 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __GEMMA_H__                
+#define __GEMMA_H__
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#else
+#include "param.h"
+#endif
+
+using namespace std;
+
+// Top-level driver object: carries the version/date/year strings and
+// declares the member functions defined in the matching source file.
+class GEMMA {
+public:
+	// version, date and year strings
+	string version;
+	string date;
+	string year;
+
+	GEMMA();
+
+	// printing helpers
+	void PrintHeader();
+	void PrintHelp(size_t option);
+	void PrintLicense();
+
+	// each of these receives the PARAM object by reference
+	void Assign(int argc, char **argv, PARAM &cPar);
+	void BatchRun(PARAM &cPar);
+	void WriteLog(int argc, char **argv, PARAM &cPar);
+};
+
+
+#endif
+
diff --git a/src/gzstream.cpp b/src/gzstream.cpp
new file mode 100644
index 0000000..bbb4ba8
--- /dev/null
+++ b/src/gzstream.cpp
@@ -0,0 +1,165 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// ============================================================================
+//
+// File          : gzstream.C
+// Revision      : $Revision: 1.7 $
+// Revision_date : $Date: 2003/01/08 14:41:27 $
+// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
+// 
+// Standard streambuf implementation following Nicolai Josuttis, "The 
+// Standard C++ Library".
+// ============================================================================
+
+#include "gzstream.h"
+#include <iostream>
+#include <string.h>  // for memcpy
+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See header file for user classes.
+// ----------------------------------------------------------------------------
+
+// --------------------------------------
+// class gzstreambuf:
+// --------------------------------------
+
+// Opens the gzip file `name` for reading or for writing and returns `this`
+// on success. A null pointer is returned when the buffer is already open,
+// when `open_mode` asks for ate/app or for both in and out, or when gzopen()
+// fails.
+gzstreambuf* gzstreambuf::open( const char* name, int open_mode) {
+    if ( is_open())
+        return (gzstreambuf*)0;
+    mode = open_mode;
+
+    const bool wants_in  = (mode & std::ios::in)  != 0;
+    const bool wants_out = (mode & std::ios::out) != 0;
+    const bool wants_end = (mode & std::ios::ate) || (mode & std::ios::app);
+    // no append nor read/write mode
+    if ( wants_end || (wants_in && wants_out))
+        return (gzstreambuf*)0;
+
+    // Build the gzopen() mode string: an optional 'r' or 'w', then 'b'.
+    char fmode[10];
+    int  len = 0;
+    if ( wants_in)
+        fmode[len++] = 'r';
+    else if ( wants_out)
+        fmode[len++] = 'w';
+    fmode[len++] = 'b';
+    fmode[len]   = '\0';
+
+    file = gzopen( name, fmode);
+    if ( file == 0)
+        return (gzstreambuf*)0;
+    opened = 1;
+    return this;
+}
+
+// Flushes pending output through sync() and closes the gzip file. Returns
+// `this` when gzclose() reports Z_OK; returns a null pointer when the buffer
+// was not open or gzclose() reported anything else.
+gzstreambuf * gzstreambuf::close() {
+    if ( ! is_open())
+        return (gzstreambuf*)0;
+    sync();
+    // The buffer is marked closed before gzclose(), so the flag is reset
+    // whatever gzclose() returns.
+    opened = 0;
+    const int status = gzclose( file);
+    return ( status == Z_OK) ? this : (gzstreambuf*)0;
+}
+
+// Refills the get area from the gzip file (used for input buffers only).
+// Returns the next character as an unsigned value without consuming it, or
+// EOF when the buffer is not open for reading or gzread() yields no data.
+// Up to four already-read characters are kept in front of the new data as a
+// putback area.
+int gzstreambuf::underflow() { // used for input buffer only
+    if ( gptr() && ( gptr() < egptr()))
+        return * reinterpret_cast<unsigned char *>( gptr());
+
+    // `opened` is tested first: `mode` is only assigned by open().
+    if ( ! opened || ! (mode & std::ios::in))
+        return EOF;
+    // Josuttis' implementation of inbuf
+    int n_putback = gptr() - eback();
+    if ( n_putback > 4)
+        n_putback = 4;
+    // Source and destination can overlap when the previous read delivered
+    // fewer than four bytes, so memmove() is required here, not memcpy().
+    memmove( buffer + (4 - n_putback), gptr() - n_putback, n_putback);
+
+    int num = gzread( file, buffer+4, bufferSize-4);
+    if (num <= 0) // ERROR or EOF
+        return EOF;
+
+    // reset buffer pointers
+    setg( buffer + (4 - n_putback),   // beginning of putback area
+          buffer + 4,                 // read position
+          buffer + 4 + num);          // end of buffer
+
+    // return next character
+    return * reinterpret_cast<unsigned char *>( gptr());    
+}
+
+// Writes the bytes pending in the put area to the gzip file; kept separate
+// from overflow() and sync(). Returns the number of bytes written after
+// rewinding the put pointer, or EOF when gzwrite() does not report the full
+// count.
+int gzstreambuf::flush_buffer() {
+    const int pending = pptr() - pbase();
+    const int written = gzwrite( file, pbase(), pending);
+    if ( written != pending)
+        return EOF;
+    pbump( -pending);
+    return pending;
+}
+
+// Stores `c` (unless it is EOF) in the put area and flushes the area to the
+// gzip file (used for output buffers only). Returns EOF on failure; on
+// success returns a value different from EOF, as the streambuf contract
+// requires, even when `c` itself is EOF.
+int gzstreambuf::overflow( int c) { // used for output buffer only
+    // `opened` is tested first: `mode` is only assigned by open().
+    if ( ! opened || ! ( mode & std::ios::out))
+        return EOF;
+    if (c != EOF) {
+        // One slot past epptr() is reserved by the constructor's setp() call.
+        *pptr() = c;
+        pbump(1);
+    }
+    if ( flush_buffer() == EOF)
+        return EOF;
+    // Returning plain `c` would report failure for a successful overflow(EOF).
+    return traits_type::not_eof( c);
+}
+
+// Flushes any pending output. Returns 0 on success (or when there is
+// nothing to write) and -1 when flush_buffer() fails.
+int gzstreambuf::sync() {
+    // flush_buffer() is used instead of overflow( EOF), which caused improper
+    // behavior with std::endl and flush() (bug reported by Vincent Ricard).
+    const bool has_pending = pptr() && pptr() > pbase();
+    if ( has_pending && flush_buffer() == EOF)
+        return -1;
+    return 0;
+}
+
+// --------------------------------------
+// class gzstreambase:
+// --------------------------------------
+
+// Attaches the embedded gzstreambuf to this stream and then opens `name`
+// with `mode`; a failed open is recorded by open() as badbit.
+gzstreambase::gzstreambase( const char* name, int mode) {
+    this->init( &buf);
+    this->open( name, mode);
+}
+
+// Closes the embedded buffer; the result of close() is deliberately ignored.
+gzstreambase::~gzstreambase() {
+    (void) buf.close();
+}
+
+// Opens `name` through the embedded buffer and sets badbit on this stream
+// when the buffer reports failure.
+void gzstreambase::open( const char* name, int open_mode) {
+    gzstreambuf* const result = buf.open( name, open_mode);
+    if ( result == 0)
+        setstate( std::ios::badbit);
+}
+
+// Closes the embedded buffer if it is open and sets badbit on this stream
+// when the buffer reports failure. Does nothing for a buffer that is not open.
+void gzstreambase::close() {
+    if ( buf.is_open() && ! buf.close())
+        setstate( std::ios::badbit);
+}
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+// ============================================================================
+// EOF //
diff --git a/src/gzstream.h b/src/gzstream.h
new file mode 100644
index 0000000..861653f
--- /dev/null
+++ b/src/gzstream.h
@@ -0,0 +1,121 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// ============================================================================
+//
+// File          : gzstream.h
+// Revision      : $Revision: 1.5 $
+// Revision_date : $Date: 2002/04/26 23:30:15 $
+// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
+// 
+// Standard streambuf implementation following Nicolai Josuttis, "The 
+// Standard C++ Library".
+// ============================================================================
+
+#ifndef GZSTREAM_H
+#define GZSTREAM_H 1
+
+// standard C++ with new header file names and std:: namespace
+#include <iostream>
+#include <fstream>
+#include <zlib.h>
+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See below for user classes.
+// ----------------------------------------------------------------------------
+
+// Stream buffer that reads from or writes to a gzip file through zlib's
+// gz* functions. One fixed-size array serves as either the get area (with a
+// four-byte putback zone in front) or the put area; the two are not meant to
+// be used together on the same object.
+class gzstreambuf : public std::streambuf {
+private:
+    static const int bufferSize = 47+256;    // size of data buff
+    // totals 512 bytes under g++ for igzstream at the end.
+
+    gzFile           file;               // file handle for compressed file
+    char             buffer[bufferSize]; // data buffer
+    char             opened;             // open/close state of stream
+    int              mode;               // I/O mode
+
+    int flush_buffer();
+public:
+    // `file` and `mode` are given defined values as well: underflow() and
+    // overflow() read `mode` even on a buffer that was never opened.
+    gzstreambuf() : file(0), opened(0), mode(0) {
+        // The last byte is kept outside the put area so that overflow() can
+        // always store one more character before flushing.
+        setp( buffer, buffer + (bufferSize-1));
+        setg( buffer + 4,     // beginning of putback area
+              buffer + 4,     // read position
+              buffer + 4);    // end position      
+        // ASSERT: both input & output capabilities will not be used together
+    }
+    // Non-zero while a file is open.
+    int is_open() { return opened; }
+    // Both return `this` on success and a null pointer on failure.
+    gzstreambuf* open( const char* name, int open_mode);
+    gzstreambuf* close();
+    ~gzstreambuf() { close(); }
+    
+    virtual int     overflow( int c = EOF);
+    virtual int     underflow();
+    virtual int     sync();
+};
+
+// Shared base of the gzip stream classes: owns the gzstreambuf and exposes
+// open()/close() on it.
+class gzstreambase : virtual public std::ios {
+protected:
+    gzstreambuf buf;    // buffer attached to this stream via init()
+public:
+    gzstreambase() {
+        init( &buf);
+    }
+    gzstreambase( const char* name, int open_mode);
+    ~gzstreambase();
+
+    void open( const char* name, int open_mode);
+    void close();
+
+    // Direct access to the embedded buffer.
+    gzstreambuf* rdbuf() {
+        return &buf;
+    }
+};
+
+// ----------------------------------------------------------------------------
+// User classes. Use igzstream and ogzstream analogously to ifstream and
+// ofstream respectively. They read and write files based on the gz* 
+// function interface of the zlib. Files are compatible with gzip compression.
+// ----------------------------------------------------------------------------
+
+// Input stream over the embedded gzstreambuf; the open mode defaults to
+// std::ios::in.
+class igzstream : public gzstreambase, public std::istream {
+public:
+    igzstream()
+        : std::istream( &buf) {}
+    igzstream( const char* name, int open_mode = std::ios::in)
+        : gzstreambase( name, open_mode), std::istream( &buf) {}
+
+    // Selects the gzstreambase accessor over the one inherited from std::ios.
+    gzstreambuf* rdbuf() {
+        return gzstreambase::rdbuf();
+    }
+    void open( const char* name, int open_mode = std::ios::in) {
+        gzstreambase::open( name, open_mode);
+    }
+};
+
+// Output stream over the embedded gzstreambuf; the open mode defaults to
+// std::ios::out.
+class ogzstream : public gzstreambase, public std::ostream {
+public:
+    ogzstream()
+        : std::ostream( &buf) {}
+    ogzstream( const char* name, int mode = std::ios::out)
+        : gzstreambase( name, mode), std::ostream( &buf) {}
+
+    // Selects the gzstreambase accessor over the one inherited from std::ios.
+    gzstreambuf* rdbuf() {
+        return gzstreambase::rdbuf();
+    }
+    void open( const char* name, int open_mode = std::ios::out) {
+        gzstreambase::open( name, open_mode);
+    }
+};
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+#endif // GZSTREAM_H
+// ============================================================================
+// EOF //
+
diff --git a/src/io.cpp b/src/io.cpp
new file mode 100644
index 0000000..c22f668
--- /dev/null
+++ b/src/io.cpp
@@ -0,0 +1,1396 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <iomanip>
+#include <bitset>
+#include <vector>
+#include <map>
+#include <set>
+#include <cstring>
+#include <cmath>
+#include <stdio.h>
+#include <stdlib.h> 
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_cdf.h"
+
+#include "lapack.h"
+#include "gzstream.h"
+#include "mathfunc.h"
+
+#ifdef FORCE_FLOAT
+#include "io_float.h"
+#else
+#include "io.h"
+#endif
+
+
+using namespace std;
+
+
+
+//Print progress bar
+// Writes `str`, a 50-character bar ('=' for the completed share, blanks for
+// the rest) and the percentage 100*p/total with two decimals to cout, then a
+// carriage return and a flush.
+// Note: setprecision(2) and fixed remain set on cout after the call.
+void ProgressBar (string str, double p, double total)
+{
+	const double percent = (100.0 * p / total);
+	const int n_filled = (int) (percent / 2.0);	// one bar cell per 2%
+
+	cout<<str;
+	for (int pos = 0; pos < 50; ++pos) {
+		cout<<(pos < n_filled ? '=' : ' ');
+	}
+	cout<<setprecision(2)<<fixed<<percent<<"%\r"<<flush;
+
+	return;
+}
+
+
+//Print progress bar (with acceptance ratio)
+// Same output as the three-argument overload, followed by four blanks and
+// `ratio` before the carriage return: `str`, a 50-character bar, the
+// percentage 100*p/total and `ratio`, both with two decimals, then a flush.
+// Note: setprecision(2) and fixed remain set on cout after the call.
+void ProgressBar (string str, double p, double total, double ratio)
+{
+	const double percent = (100.0 * p / total);
+	const int n_filled = (int) (percent / 2.0);	// one bar cell per 2%
+
+	// build the bar first, then emit everything in one statement
+	string cells (50, ' ');
+	for (int pos = 0; pos < 50 && pos < n_filled; ++pos) {
+		cells[pos] = '=';
+	}
+
+	cout<<str;
+	cout<<cells<<setprecision(2)<<fixed<<percent<<"%    "<<ratio<<"\r"<<flush;
+
+	return;
+}
+
+// Line reader for files whose lines end with "\n", "\r" or "\r\n".
+// Reads one line from `is` into `t`; the terminator is consumed but not
+// stored. eofbit is set only when the end of the stream is reached before any
+// character was read, so a last line without a terminator is still returned
+// with eofbit clear. Returns `is`.
+std::istream& safeGetline(std::istream& is, std::string& t)
+{
+    t.clear();
+
+    // Characters are taken directly from the stream buffer, which is faster
+    // than going through the istream one character at a time. Code that uses
+    // the streambuf this way must be guarded by a sentry object.
+    std::istream::sentry guard(is, true);
+    std::streambuf* buf = is.rdbuf();
+
+    while (true) {
+        const int ch = buf->sbumpc();
+        if (ch == '\n') {
+            return is;
+        }
+        if (ch == '\r') {
+            // swallow the '\n' of a "\r\n" pair
+            if (buf->sgetc() == '\n') {
+                buf->sbumpc();
+            }
+            return is;
+        }
+        if (ch == EOF) {
+            // Also handle the case when the last line has no line ending
+            if (t.empty()) {
+                is.setstate(std::ios::eofbit);
+            }
+            return is;
+        }
+        t += (char)ch;
+    }
+}
+
+//Read snp file
+// Fills `setSnps` (cleared first) with the first field of every line of
+// `file_snps`. Fields are delimited by space, comma, tab or carriage return;
+// lines without any field are skipped. Returns false, after printing a
+// message, when the file cannot be opened, and true otherwise.
+bool ReadFile_snps (const string &file_snps, set<string> &setSnps)
+{
+	setSnps.clear();
+
+	ifstream infile (file_snps.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;}
+
+	// '\r' is a delimiter so that a CRLF line ending does not end up in the id
+	const char *delim=" ,\t\r";
+	string line;
+
+	while (getline(infile, line)) {
+		// The field is located with string searches; the string's own buffer
+		// is never written to.
+		const string::size_type first=line.find_first_not_of(delim);
+		if (first==string::npos) {continue;}	// blank line: nothing to insert
+
+		const string::size_type last=line.find_first_of(delim, first);
+		if (last==string::npos) {setSnps.insert(line.substr(first));}
+		else {setSnps.insert(line.substr(first, last-first));}
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+//Read log file
+// Scans `file_log` for the first line whose 2nd to 4th fields are
+// "estimated", "mean" and "=" followed by a value, and stores that value
+// (parsed with atof) in `pheno_mean`. Fields are delimited by space, comma
+// or tab. Lines that do not match, including lines where the value is
+// missing, are skipped. Returns false, after printing a message, when the
+// file cannot be opened; returns true otherwise, leaving `pheno_mean`
+// untouched when no matching line exists.
+bool ReadFile_log (const string &file_log, double &pheno_mean)
+{
+	ifstream infile (file_log.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open log file: "<<file_log<<endl; return false;}
+
+	const char *delim=" , \t";
+	string line;
+	vector<char> buf;
+	char *ch_ptr;
+	bool found=false;
+
+	while (!found && getline(infile, line)) {
+		// strtok writes into its argument, so tokenize a private copy
+		// rather than the buffer returned by c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		// the first field is not inspected
+		ch_ptr=strtok (&buf[0], delim);
+		if (ch_ptr==NULL) {continue;}
+
+		ch_ptr=strtok (NULL, delim);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "estimated")!=0) {continue;}
+		ch_ptr=strtok (NULL, delim);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "mean")!=0) {continue;}
+		ch_ptr=strtok (NULL, delim);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "=")!=0) {continue;}
+
+		// the value itself may be absent; atof must not be given NULL
+		ch_ptr=strtok (NULL, delim);
+		if (ch_ptr==NULL) {continue;}
+
+		pheno_mean=atof(ch_ptr);
+		found=true;
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+//Read bimbam annotation file
+// Fills the three maps (all cleared first), keyed by the first field of each
+// line, from `file_anno`. The 2nd, 3rd and 4th fields give the position
+// (atol), the chromosome string and the cM value (atof); a field that is
+// absent or equal to "NA" is stored as -9 ("-9" for the chromosome). Fields
+// are delimited by space, comma or tab; lines without any field are skipped.
+// Returns false, after printing a message, when the file cannot be opened.
+bool ReadFile_anno (const string &file_anno, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM)
+{
+	mapRS2chr.clear();
+	mapRS2bp.clear();
+	mapRS2cM.clear();	// cleared like the other two output maps
+
+	ifstream infile (file_anno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error opening annotation file: "<<file_anno<<endl; return false;}
+
+	const char *delim=" , \t";
+	string line;
+	vector<char> buf;
+	char *ch_ptr;
+
+	string rs;
+	long int b_pos;
+	string chr;
+	double cM;
+
+	while (!safeGetline(infile, line).eof()) {
+		// strtok writes into its argument, so tokenize a private copy
+		// rather than the buffer returned by c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], delim);
+		if (ch_ptr==NULL) {continue;}	// blank line: no key to store
+		rs=ch_ptr;
+
+		// every later field may be missing, so each one is NULL-checked
+		ch_ptr=strtok (NULL, delim);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {b_pos=-9;} else {b_pos=atol(ch_ptr);}
+		ch_ptr=strtok (NULL, delim);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {chr="-9";} else {chr=ch_ptr;}
+		ch_ptr=strtok (NULL, delim);
+		if (ch_ptr==NULL || strcmp(ch_ptr, "NA")==0) {cM=-9;} else {cM=atof(ch_ptr);}
+
+		mapRS2chr[rs]=chr;
+		mapRS2bp[rs]=b_pos;
+		mapRS2cM[rs]=cM;
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+//read one column of phenotype
+// Reads field number `p_column` (1-based) of every line of `file_pheno`.
+// For each line one entry is appended to both vectors (cleared first):
+// "NA" gives indicator 0 and value -9, anything else gives indicator 1 and
+// the atof() value. Fields are delimited by space, comma or tab.
+// Returns false, after printing a message, when the file cannot be opened or
+// when a line has fewer than `p_column` fields; in the latter case the
+// vectors hold the rows read so far.
+bool ReadFile_column (const string &file_pheno, vector<int> &indicator_idv, vector<double> &pheno, const int &p_column)
+{
+	indicator_idv.clear();
+	pheno.clear();
+
+	igzstream infile (file_pheno.c_str(), igzstream::in);
+//	ifstream infile (file_pheno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open phenotype file: "<<file_pheno<<endl; return false;}
+
+	const char *delim=" , \t";
+	string line;
+	vector<char> buf;
+	char *ch_ptr;
+
+	double p;
+	while (!safeGetline(infile, line).eof()) {
+		// strtok writes into its argument, so tokenize a private copy
+		// rather than the buffer returned by c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		// advance to the requested field, stopping early if the line is short
+		ch_ptr=strtok (&buf[0], delim);
+		for (int i=0; i<(p_column-1) && ch_ptr!=NULL; ++i) {
+			ch_ptr=strtok (NULL, delim);
+		}
+		if (ch_ptr==NULL) {
+			cout<<"error! row "<<pheno.size()+1<<" of phenotype file has fewer than "<<p_column<<" columns: "<<file_pheno<<endl;
+			return false;
+		}
+
+		if (strcmp(ch_ptr, "NA")==0) {indicator_idv.push_back(0); pheno.push_back(-9);}		//pheno is different from pimass2
+		else {p=atof(ch_ptr); indicator_idv.push_back(1); pheno.push_back(p);}
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+
+//Read bimbam phenotype file, p_column=1, 2 ...
+// Reads the fields listed in `p_column` (1-based field numbers) from every
+// line of `file_pheno`. One row is appended per line to `indicator_pheno`
+// and `pheno` (both cleared first); entry k of a row belongs to p_column[k].
+// "NA" gives indicator 0 and value -9, anything else gives indicator 1 and
+// the atof() value. Fields are delimited by space, comma or tab.
+// Returns false, after printing a message, when the file cannot be opened or
+// when a line has fewer fields than the largest requested field number; in
+// the latter case the vectors hold the rows read so far.
+bool ReadFile_pheno (const string &file_pheno, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, const vector<size_t> &p_column)
+{
+	indicator_pheno.clear();
+	pheno.clear();
+
+	igzstream infile (file_pheno.c_str(), igzstream::in);
+//	ifstream infile (file_pheno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open phenotype file: "<<file_pheno<<endl; return false;}
+
+	const char *delim=" , \t";
+	string line;
+	vector<char> buf;
+	char *ch_ptr;
+
+	double p;
+
+	vector<double> pheno_row;
+	vector<int> ind_pheno_row;
+
+	// max_element on an empty range returns end(), which must not be dereferenced
+	size_t p_max=0;
+	if (!p_column.empty()) {p_max=*max_element(p_column.begin(), p_column.end() );}
+
+	// field number -> position inside a row
+	map<size_t, size_t> mapP2c;
+	for (size_t i=0; i<p_column.size(); i++) {
+		mapP2c[p_column[i]]=i;
+		pheno_row.push_back(-9);
+		ind_pheno_row.push_back(0);
+	}
+
+	while (!safeGetline(infile, line).eof()) {
+		// strtok writes into its argument, so tokenize a private copy
+		// rather than the buffer returned by c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		ch_ptr=strtok (&buf[0], delim);
+
+		for (size_t i=0; i<p_max; i++) {
+			// a line shorter than p_max lacks at least the largest requested field
+			if (ch_ptr==NULL) {
+				cout<<"error! row "<<pheno.size()+1<<" of phenotype file has fewer than "<<p_max<<" columns: "<<file_pheno<<endl;
+				return false;
+			}
+
+			const map<size_t, size_t>::const_iterator it=mapP2c.find(i+1);
+			if (it!=mapP2c.end()) {
+				const size_t c=it->second;
+				if (strcmp(ch_ptr, "NA")==0) {ind_pheno_row[c]=0; pheno_row[c]=-9;}
+				else {p=atof(ch_ptr); ind_pheno_row[c]=1; pheno_row[c]=p;}
+			}
+			ch_ptr=strtok (NULL, delim);
+		}
+
+		indicator_pheno.push_back(ind_pheno_row);
+		pheno.push_back(pheno_row);
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+// Reads the covariates file `file_cvt`. One row is appended per line to
+// `cvt` and one flag to `indicator_cvt` (both cleared first, so that entry i
+// of one always matches entry i of the other). Every field of a line becomes
+// one value: "NA" is stored as -9 and sets the row's flag to 0, anything else
+// is parsed with atof. Fields are delimited by space, comma or tab.
+// `n_cvt` is set to 0 when the file has no lines, otherwise to the number of
+// values in the first row whose flag is 1 (it is left untouched when no row
+// has flag 1).
+// Returns false, after printing a message, when the file cannot be opened or
+// when two rows with flag 1 differ in their number of values.
+bool ReadFile_cvt (const string &file_cvt, vector<int> &indicator_cvt, vector<vector<double> > &cvt, size_t &n_cvt)
+{
+	indicator_cvt.clear();
+	cvt.clear();	// rows are appended below and indexed in step with indicator_cvt
+
+	ifstream infile (file_cvt.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open covariates file: "<<file_cvt<<endl; return false;}
+
+	const char *delim=" , \t";
+	string line;
+	vector<char> buf;
+	char *ch_ptr;
+	double d;
+
+	int flag_na=0;
+
+	while (!safeGetline(infile, line).eof()) {
+		// strtok writes into its argument, so tokenize a private copy
+		// rather than the buffer returned by c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		vector<double> v_d; flag_na=0;
+		for (ch_ptr=strtok (&buf[0], delim); ch_ptr!=NULL; ch_ptr=strtok (NULL, delim)) {
+			if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;}
+			else {d=atof(ch_ptr);}
+
+			v_d.push_back(d);
+		}
+		if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);}
+		cvt.push_back(v_d);
+	}
+
+	if (indicator_cvt.empty()) {n_cvt=0;}
+	else {
+		// the first complete row fixes n_cvt; later complete rows must agree
+		bool have_width=false;
+		for (vector<int>::size_type i=0; i<indicator_cvt.size(); ++i) {
+			if (indicator_cvt[i]==0) {continue;}
+
+			if (!have_width) {have_width=true; n_cvt=cvt[i].size();}
+			if (n_cvt!=cvt[i].size()) {cout<<"error! number of covariates in row "<<i<<" do not match other rows."<<endl; return false;}
+		}
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+
+//Read .bim file
+// Appends one SNPINFO to `snpInfo` (cleared first) per line of `file_bim`.
+// The six space/tab-delimited fields of a line are read, in order, into chr,
+// rs, cM (atof), b_pos (atol), minor and major; the last three SNPINFO
+// members are set to -9. Lines are read with safeGetline, like the other
+// readers in this file, so "\r\n" and "\r" endings do not leak into the last
+// field. Lines without any field are skipped.
+// Returns false, after printing a message, when the file cannot be opened or
+// when a line has between one and five fields; in the latter case `snpInfo`
+// holds the rows read so far.
+bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo)
+{
+	snpInfo.clear();
+
+	ifstream infile (file_bim.c_str(), ifstream::in);
+	if (!infile) {cout<<"error opening .bim file: "<<file_bim<<endl; return false;}
+
+	const char *delim=" \t";
+	const size_t n_required=6;
+	string line;
+	vector<char> buf;
+	char *ch_ptr;
+	char *field[6];
+
+	string rs;
+	long int b_pos;
+	string chr;
+	double cM;
+	string major;
+	string minor;
+
+	while (!safeGetline(infile, line).eof()) {
+		// strtok writes into its argument, so tokenize a private copy
+		// rather than the buffer returned by c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		// collect the first six fields before converting any of them
+		size_t n_field=0;
+		for (ch_ptr=strtok (&buf[0], delim); ch_ptr!=NULL && n_field<n_required; ch_ptr=strtok (NULL, delim)) {
+			field[n_field++]=ch_ptr;
+		}
+		if (n_field==0) {continue;}	// blank line
+		if (n_field<n_required) {
+			cout<<"error! row "<<snpInfo.size()+1<<" of .bim file has fewer than "<<n_required<<" columns: "<<file_bim<<endl;
+			return false;
+		}
+
+		chr=field[0];
+		rs=field[1];
+		cM=atof(field[2]);
+		b_pos=atol(field[3]);
+		minor=field[4];
+		major=field[5];
+
+		SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, -9, -9, -9};
+		snpInfo.push_back(sInfo);
+	}
+
+	infile.close();
+	infile.clear();
+	return true;
+}
+
+
+//Read .fam file
+// Reads `file_fam` line by line. The 2nd space/tab-delimited field of a line
+// is the id, mapped in `mapID2num` to the 0-based index of the line among
+// the lines stored; the 6th field is phenotype number 1, and later
+// phenotypes follow, delimited by space, comma or tab. For every line one
+// row is appended to `indicator_pheno` and `pheno`; entry k of a row belongs
+// to p_column[k]. "NA" or a value equal to -9 gives indicator 0 and value
+// -9, anything else gives indicator 1 and the atof() value. All three
+// outputs are cleared first. Lines without any field are skipped.
+// Returns false, after printing a message, when the file cannot be opened or
+// when a line is too short to supply the id or a requested phenotype; in the
+// latter case the outputs hold the rows read so far.
+bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, map<string, int> &mapID2num, const vector<size_t> &p_column)
+{
+	indicator_pheno.clear();
+	pheno.clear();
+	mapID2num.clear();
+
+	igzstream infile (file_fam.c_str(), igzstream::in);
+	//ifstream infile (file_fam.c_str(), ifstream::in);
+	if (!infile) {cout<<"error opening .fam file: "<<file_fam<<endl; return false;}
+
+	string line;
+	vector<char> buf;
+	char *ch_ptr;
+
+	string id;
+	int c=0;
+	double p;
+
+	vector<double> pheno_row;
+	vector<int> ind_pheno_row;
+
+	// max_element on an empty range returns end(), which must not be dereferenced
+	size_t p_max=0;
+	if (!p_column.empty()) {p_max=*max_element(p_column.begin(), p_column.end() );}
+
+	// phenotype number -> position inside a row
+	map<size_t, size_t> mapP2c;
+	for (size_t i=0; i<p_column.size(); i++) {
+		mapP2c[p_column[i]]=i;
+		pheno_row.push_back(-9);
+		ind_pheno_row.push_back(0);
+	}
+
+	while (!safeGetline(infile, line).eof()) {
+		// strtok writes into its argument, so tokenize a private copy
+		// rather than the buffer returned by c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+
+		// field 1 is not used; field 2 is the id
+		ch_ptr=strtok (&buf[0], " \t");
+		if (ch_ptr==NULL) {continue;}	// blank line
+		ch_ptr=strtok (NULL, " \t");
+		if (ch_ptr==NULL) {
+			cout<<"error! row "<<c+1<<" of .fam file has no individual id: "<<file_fam<<endl;
+			return false;
+		}
+		id=ch_ptr;
+
+		// fields 3-5 are skipped; afterwards ch_ptr is field 6 or NULL
+		for (int k=0; k<4 && ch_ptr!=NULL; ++k) {
+			ch_ptr=strtok (NULL, " \t");
+		}
+
+		for (size_t i=0; i<p_max; i++) {
+			// a line that ends here lacks at least the largest requested phenotype
+			if (ch_ptr==NULL) {
+				cout<<"error! row "<<c+1<<" of .fam file has fewer than "<<p_max<<" phenotype columns: "<<file_fam<<endl;
+				return false;
+			}
+
+			const map<size_t, size_t>::const_iterator it=mapP2c.find(i+1);
+			if (it!=mapP2c.end()) {
+				const size_t col=it->second;
+				if (strcmp(ch_ptr, "NA")==0) {
+					ind_pheno_row[col]=0; pheno_row[col]=-9;
+				} else {
+					p=atof(ch_ptr);
+
+					if (p==-9) {ind_pheno_row[col]=0; pheno_row[col]=-9;}
+					else {ind_pheno_row[col]=1; pheno_row[col]=p;}
+				}
+			}
+			ch_ptr=strtok (NULL, " , \t");
+		}
+
+		indicator_pheno.push_back(ind_pheno_row);
+		pheno.push_back(pheno_row);
+
+		mapID2num[id]=c; c++;
+	}
+
+	infile.close();
+	infile.clear();
+	return true;
+}
+
+
+
+
+
+
+//Read bimbam mean genotype file, the first time, to obtain #SNPs for analysis (ns_test) and total #SNP (ns_total)
+bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, size_t &ns_test)
+{	//First pass over the mean genotype file: appends one snpInfo record and one indicator_snp flag (1 = keep, 0 = filtered out) per line read, counts the kept SNPs in ns_test, and returns false only when the file cannot be opened.
+	indicator_snp.clear();
+	snpInfo.clear();
+	
+	igzstream infile (file_geno.c_str(), igzstream::in);
+//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;}
+
+	gsl_vector *genotype=gsl_vector_alloc (W->size1);	//genotypes of the current SNP, one entry per row of W (presumably the included individuals)
+	gsl_vector *genotype_miss=gsl_vector_alloc (W->size1);	//1 where the genotype is missing, 0 otherwise
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+	
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);	//WtW = W^T W
+	int sig;	
+	LUDecomp (WtW, pmt, &sig);	//LUDecomp/LUInvert are defined elsewhere; presumably they leave the inverse of W^T W in WtWi
+	LUInvert (WtW, pmt, WtWi);
+	
+	double v_x, v_w;	//v_x = x^T x and v_w = x^T W (W^T W)^-1 W^T x, filled by the ddot calls below
+	int c_idv=0;	//position of the current individual among the included ones
+	
+	string line;
+	char *ch_ptr;
+		
+	string rs;
+	long int b_pos;
+	string chr;
+	string major;
+	string minor;
+	double cM;
+  
+	double maf, geno, geno_old;
+	size_t n_miss;
+	size_t n_0, n_1, n_2;
+	int flag_poly;	//0: no genotype seen yet, 2: a single distinct value seen so far, 1: at least two distinct values seen
+	
+	int ni_total=indicator_idv.size();
+	int ni_test=0;
+	for (int i=0; i<ni_total; ++i) {
+		ni_test+=indicator_idv[i];	//sum of the flags, i.e. the number of included individuals when the flags are 0/1
+	}
+	ns_test=0;
+	
+	while (!safeGetline(infile, line).eof()) {		//one SNP per line
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");	//tokenises the string's own buffer in place; separators are space, comma and tab
+		rs=ch_ptr;	//first field: SNP id
+		ch_ptr=strtok (NULL, " , \t");
+		minor=ch_ptr;	//second field
+		ch_ptr=strtok (NULL, " , \t");
+		major=ch_ptr;	//third field
+		
+		if (setSnps.size()!=0 && setSnps.count(rs)==0) {	//a non-empty setSnps restricts the analysis to the listed SNPs; others get a placeholder record and are excluded
+			SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, -9};
+			snpInfo.push_back(sInfo);
+			indicator_snp.push_back(0);
+			continue;
+		}
+				
+		if (mapRS2bp.count(rs)==0) {chr="-9"; b_pos=-9;cM=-9;}	//no annotation for this SNP: -9 marks unknown chromosome / position / cM
+		else {b_pos=mapRS2bp[rs]; chr=mapRS2chr[rs]; cM=mapRS2cM[rs];}		
+				
+		maf=0; n_miss=0; flag_poly=0; geno_old=-9;
+		n_0=0; n_1=0; n_2=0;
+		c_idv=0; gsl_vector_set_zero (genotype_miss);
+		for (int i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, " , \t");	//always consume one token per individual so the columns stay aligned
+			if (indicator_idv[i]==0) {continue;}		
+
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++; c_idv++; continue;}	//"NA" marks a missing genotype
+			
+			geno=atof(ch_ptr);
+			if (geno>=0 && geno<=0.5) {n_0++;}	//genotype-class counts, used by the HWE filter below
+			if (geno>0.5 && geno<1.5) {n_1++;}
+			if (geno>=1.5 && geno<=2.0) {n_2++;}
+			
+			gsl_vector_set (genotype, c_idv, geno); 
+			
+//			if (geno<0) {n_miss++; continue;}
+			
+			if (flag_poly==0) {geno_old=geno; flag_poly=2;}	//remember the first observed value
+			if (flag_poly==2 && geno!=geno_old) {flag_poly=1;}	//a second distinct value makes the SNP polymorphic
+			
+			maf+=geno;
+			
+			c_idv++;
+		}
+		maf/=2.0*(double)(ni_test-n_miss);	//mean genotype divided by 2, over the non-missing included individuals
+		
+		SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf};
+		snpInfo.push_back(sInfo);
+		
+		if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;}	//missingness filter
+		
+		if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;}	//MAF filter, switched off when maf_level is -1
+		
+		if (flag_poly!=1) {indicator_snp.push_back(0); continue;}	//drop SNPs for which fewer than two distinct genotype values were seen
+		
+		if (hwe_level!=0) {	//HWE filter, switched off when hwe_level is 0
+			if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;}
+		}
+		
+		//filter SNP if it is correlated with W
+		for (size_t i=0; i<genotype->size; ++i) {			
+			if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);}		//missing entries are replaced by the mean genotype 2*maf
+		}
+		
+		gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx);	//Wtx = W^T x
+		gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);	//WtWiWtx = WtWi * Wtx
+		gsl_blas_ddot (genotype, genotype, &v_x);
+		gsl_blas_ddot (Wtx, WtWiWtx, &v_w);
+		
+		if (v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;}	//the ratio v_w/v_x at or above r2_level excludes the SNP
+		
+		indicator_snp.push_back(1); 
+		ns_test++;
+	}
+	
+	gsl_vector_free (genotype);
+	gsl_vector_free (genotype_miss);
+	gsl_matrix_free (WtW);
+	gsl_matrix_free (WtWi);
+	gsl_vector_free (Wtx);
+	gsl_vector_free (WtWiWtx);
+	gsl_permutation_free (pmt);
+	
+	infile.close();
+	infile.clear();	
+	
+	return true;
+}
+
+
+
+
+
+      
+//Read bed file, the first time: fills n_miss/missingness/maf of the existing snpInfo entries, pushes one indicator_snp flag per entry (1 = keep, 0 = filtered out), counts the kept SNPs in ns_test; returns false only when the file cannot be opened
+bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test)
+{
+	indicator_snp.clear();
+	size_t ns_total=snpInfo.size();	//snpInfo must already hold one entry per SNP in the bed file
+	
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;}
+
+	gsl_vector *genotype=gsl_vector_alloc (W->size1);	//genotypes of the current SNP, one entry per row of W (presumably the included individuals)
+	gsl_vector *genotype_miss=gsl_vector_alloc (W->size1);	//1 where the genotype is missing, 0 otherwise
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+	
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);	//WtW = W^T W
+	int sig;	
+	LUDecomp (WtW, pmt, &sig);	//LUDecomp/LUInvert are defined elsewhere; presumably they leave the inverse of W^T W in WtWi
+	LUInvert (WtW, pmt, WtWi);
+	
+	double v_x, v_w, geno;
+	size_t c_idv=0;	//position of the current individual among the included ones
+	
+	char ch[1];
+	bitset<8> b;
+  	
+	size_t ni_total=indicator_idv.size();
+	size_t ni_test=0;
+	for (size_t i=0; i<ni_total; ++i) {
+		ni_test+=indicator_idv[i];	//sum of the flags, i.e. the number of included individuals when the flags are 0/1
+	}
+	ns_test=0;
+	
+	//calculate n_bit, the number of bytes used by each snp (four individuals per byte)
+	size_t n_bit;
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1;}
+
+	//ignore the first three magic numbers
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+	
+	double maf;
+	size_t n_miss;
+	size_t n_0, n_1, n_2, c;	//c indexes all individuals, included or not
+	
+	//start reading snps and doing association test
+	for (size_t t=0; t<ns_total; ++t) {
+		infile.seekg(t*n_bit+3);		//jump to SNP t: 3 magic-number bytes, then n_bit bytes per SNP
+		
+		if (setSnps.size()!=0 && setSnps.count(snpInfo[t].rs_number)==0) {	//a non-empty setSnps restricts the analysis to the listed SNPs
+			snpInfo[t].n_miss=-9;
+			snpInfo[t].missingness=-9;
+			snpInfo[t].maf=-9;
+			indicator_snp.push_back(0);
+			continue;
+		}
+
+		//read genotypes
+		c=0; maf=0.0; n_miss=0; n_0=0; n_1=0; n_2=0;
+		c_idv=0; gsl_vector_set_zero (genotype_miss);
+		for (size_t i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && c==ni_total) {break;}	//the last byte may be only partly used
+				if (indicator_idv[c]==0) {c++; continue;}
+				c++;
+				
+				if (b[2*j]==0) {	//bit pair (b[2j], b[2j+1]): 00 -> 2.0, 01 -> 1.0, 11 -> 0.0, 10 -> missing
+					if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); maf+=2.0; n_2++;}
+					else {gsl_vector_set(genotype, c_idv, 1.0); maf+=1.0; n_1++;}
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); maf+=0.0; n_0++;}                                  
+					else {gsl_vector_set(genotype_miss, c_idv, 1); n_miss++; }	//genotype entry is filled in by the imputation step below
+				}
+				c_idv++;
+			}
+		}
+		maf/=2.0*(double)(ni_test-n_miss);	//mean genotype divided by 2, over the non-missing included individuals
+		
+		snpInfo[t].n_miss=n_miss;
+		snpInfo[t].missingness=(double)n_miss/(double)ni_test;
+		snpInfo[t].maf=maf;
+		
+		if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;}	//missingness filter
+		
+		if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;}	//MAF filter, switched off when maf_level is -1
+		
+		if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;}	//only one genotype class observed: monomorphic, drop
+		
+		if (hwe_level!=0) {	//HWE filter, switched off when hwe_level is 0 (same convention as the mean-genotype reader ReadFile_geno)
+			if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;}
+		}
+			
+		
+		//filter SNP if it is correlated with W
+		for (size_t i=0; i<genotype->size; ++i) {			
+			if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);}		//missing entries are replaced by the mean genotype 2*maf
+		}
+		
+		gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx);	//Wtx = W^T x
+		gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);	//WtWiWtx = WtWi * Wtx
+		gsl_blas_ddot (genotype, genotype, &v_x);
+		gsl_blas_ddot (Wtx, WtWiWtx, &v_w);
+		
+		if (v_w/v_x > r2_level) {indicator_snp.push_back(0); continue;}	//the ratio v_w/v_x above r2_level excludes the SNP
+		
+		indicator_snp.push_back(1); 
+		ns_test++;
+	}
+	
+	gsl_vector_free (genotype);
+	gsl_vector_free (genotype_miss);
+	gsl_matrix_free (WtW);
+	gsl_matrix_free (WtWi);
+	gsl_vector_free (Wtx);
+	gsl_vector_free (WtWiWtx);
+	gsl_permutation_free (pmt);
+		  
+	infile.close();
+	infile.clear();	
+	
+	return true;
+}
+
+
+
+void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) 
+{	//Fills G with the kinship values read from file_kin, keeping only individuals whose indicator_idv flag is 1; sets error to true when the file cannot be opened or its dimensions do not match. k_mode==1 reads a full square table, any other k_mode reads one "id1 id2 value" triple per line.
+	igzstream infile (file_kin.c_str(), igzstream::in);
+//	ifstream infile (file_kin.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open kinship file: "<<file_kin<<endl; error=true; return;}
+	
+	size_t ni_total=indicator_idv.size();
+	
+	gsl_matrix_set_zero (G);
+	
+	string line;
+	char *ch_ptr;	
+	double d;
+	
+	if (k_mode==1) {
+		size_t i_test=0, i_total=0, j_test=0, j_total=0;	//*_total index all individuals, *_test only the included ones
+		while (getline(infile, line)) {
+			if (i_total==ni_total) {cout<<"error! number of rows in the kinship file is larger than the number of phentypes."<<endl; error=true; break;}			//stop: reading on would index indicator_idv past its end
+			
+			if (indicator_idv[i_total]==0) {i_total++; continue;}	//row of an excluded individual
+			
+			j_total=0; j_test=0;
+			ch_ptr=strtok ((char *)line.c_str(), " , \t");	//tokenises the string's own buffer in place; separators are space, comma and tab
+			while (ch_ptr!=NULL) {
+				if (j_total==ni_total) {cout<<"error! number of columns in the kinship file is larger than the number of phentypes for row = "<<i_total<<endl; error=true; break;}	//stop: the extra column has no slot in indicator_idv
+				
+				d=atof(ch_ptr);
+				if (indicator_idv[j_total]==1) {gsl_matrix_set (G, i_test, j_test, d); j_test++;}				
+				j_total++;
+				
+				ch_ptr=strtok (NULL, " , \t");
+			}
+			if (j_total!=ni_total) {cout<<"error! number of columns in the kinship file do not match the number of phentypes for row = "<<i_total<<endl; error=true;}
+			i_total++; i_test++;			
+		}
+		if (i_total!=ni_total) {cout<<"error! number of rows in the kinship file do not match the number of phentypes."<<endl; error=true;}
+	}	
+	else {  
+		map<size_t, size_t> mapID2ID;	//index among all individuals -> index among the included ones
+		size_t c=0;
+		for (size_t i=0; i<indicator_idv.size(); i++) {
+			if (indicator_idv[i]==1) {mapID2ID[i]=c; c++;}
+		}
+		
+		string id1, id2;
+		double Cov_d;
+		size_t n_id1, n_id2;
+		
+		while (getline(infile, line)) {
+			ch_ptr=strtok ((char *)line.c_str(), " , \t");
+			id1=ch_ptr;
+			ch_ptr=strtok (NULL, " , \t");
+			id2=ch_ptr;
+			ch_ptr=strtok (NULL, " , \t");
+			d=atof(ch_ptr);
+			if (mapID2num.count(id1)==0 || mapID2num.count(id2)==0) {continue;}	//pairs with an unknown id are ignored
+			if (indicator_idv[mapID2num[id1]]==0 || indicator_idv[mapID2num[id2]]==0) {continue;}	//pairs with an excluded individual are ignored
+			
+			n_id1=mapID2ID[mapID2num[id1]];
+			n_id2=mapID2ID[mapID2num[id2]];
+			
+			Cov_d=gsl_matrix_get(G, n_id1, n_id2);
+			if (Cov_d!=0 && Cov_d!=d) {cout<<"error! redundant and unequal terms in the kinship file, for id1 = "<<id1<<" and id2 = "<<id2<<endl;}	//reported only: the earlier value is kept and the error flag is not set
+			else {
+				gsl_matrix_set(G, n_id1, n_id2, d);	//store the value symmetrically
+				gsl_matrix_set(G, n_id2, n_id1, d);
+			}
+		}
+	}
+	
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) 
+{	//Reads several kinship matrices: every line of file_mk is taken as the path of one kinship file, and the i-th file is loaded by ReadFile_kin into the i-th square block of columns of G; error is set when file_mk cannot be opened (ReadFile_kin may also set it).
+	igzstream infile (file_mk.c_str(), igzstream::in);
+	if (!infile) {cout<<"error! fail to open file: "<<file_mk<<endl; error=true; return;}
+
+	string file_kin, line;
+
+	size_t i=0;	//index of the kinship file being read
+	while (getline(infile, line)) {
+	  file_kin=line.c_str();
+	  gsl_matrix_view G_sub=gsl_matrix_submatrix(G, 0, i*G->size1, G->size1, G->size1);	//size1 x size1 block starting at column i*size1; G presumably has size1 times the number of files columns
+	  ReadFile_kin (file_kin, indicator_idv, mapID2num, k_mode, error, &G_sub.matrix);
+	  i++;
+	}
+
+	infile.close();
+	infile.clear();	
+	return;
+}
+
+
+void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) 
+{	//Reads a whitespace/comma separated numeric table from file_ku into the pre-allocated matrix U, row by row; cells not present in the file stay 0. error is set to true when the file cannot be opened or has more rows or columns than U.
+	igzstream infile (file_ku.c_str(), igzstream::in);
+//	ifstream infile (file_ku.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open the U file: "<<file_ku<<endl; error=true; return;}
+	
+	size_t n_row=U->size1, n_col=U->size2, i_row=0, i_col=0;
+	
+	gsl_matrix_set_zero (U);
+	
+	string line;
+	char *ch_ptr;	
+	double d;
+	
+	while (getline(infile, line)) {
+		if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<<endl; error=true; break;}			//stop: the extra row has no place in U
+				
+		i_col=0;
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");	//tokenises the string's own buffer in place; separators are space, comma and tab
+		while (ch_ptr!=NULL) {
+			if (i_col==n_col) {cout<<"error! number of columns in the U file is larger than expected, for row = "<<i_row<<endl; error=true; break;}	//stop: the extra column has no place in U
+			
+			d=atof(ch_ptr);
+			gsl_matrix_set (U, i_row, i_col, d);			
+			i_col++;
+			
+			ch_ptr=strtok (NULL, " , \t");
+		}
+		
+		i_row++;
+	}
+		
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+
+
+void ReadFile_eigenD (const string &file_kd, bool &error, gsl_vector *eval) 
+{	//Reads one number per line from file_kd into the pre-allocated vector eval; entries not present in the file stay 0. error is set to true when the file cannot be opened, has more lines than eval has entries, or a line holds more than one field.
+	igzstream infile (file_kd.c_str(), igzstream::in);
+//	ifstream infile (file_kd.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open the D file: "<<file_kd<<endl; error=true; return;}
+	
+	size_t n_row=eval->size, i_row=0;
+	
+	gsl_vector_set_zero (eval);
+	
+	string line;
+	char *ch_ptr;	
+	double d;
+	
+	while (getline(infile, line)) {
+		if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<<endl; error=true; break;}			//stop: the extra row has no place in eval
+		
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");	//tokenises the string's own buffer in place; separators are space, comma and tab
+		d=atof(ch_ptr);
+		
+		ch_ptr=strtok (NULL, " , \t");
+		if (ch_ptr!=NULL) {cout<<"error! number of columns in the D file is larger than expected, for row = "<<i_row<<endl; error=true;}	//the first field is still stored below
+		
+		gsl_vector_set (eval, i_row, d);
+		
+		i_row++;
+	}
+	
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+
+//read bimbam mean genotype file and calculate kinship matrix: adds x x^T of every SNP flagged in indicator_snp to matrix_kin (x = mean-centred genotypes; for k_mode 2 additionally divided by the genotype variance), then divides by the number of SNPs used; matrix_kin is not cleared here; returns false only when the file cannot be opened
+bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	//ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;}
+	
+	string line;
+	char *ch_ptr;
+	
+	size_t n_miss;
+	double d, geno_mean, geno_var;
+	
+	size_t ni_total=matrix_kin->size1;	//every individual in the file is used; there is no individual filter here
+	gsl_vector *geno=gsl_vector_alloc (ni_total);
+	gsl_vector *geno_miss=gsl_vector_alloc (ni_total);	//note the convention: 0 = missing, 1 = observed
+
+	size_t ns_test=0;
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		!safeGetline(infile, line).eof();	//reads the next line; the negated eof() result is discarded
+		if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");	//skip the three leading non-genotype fields
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		
+		geno_mean=0.0; n_miss=0; geno_var=0.0;
+		gsl_vector_set_all(geno_miss, 0);
+		for (size_t i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_miss, i, 0); n_miss++;}	//"NA" marks a missing genotype
+			else {
+				d=atof(ch_ptr);
+				gsl_vector_set (geno, i, d);
+				gsl_vector_set (geno_miss, i, 1);
+				geno_mean+=d;	//running sum, turned into the mean below
+				geno_var+=d*d;	//running sum of squares, turned into the variance below
+			}
+		}
+		
+		geno_mean/=(double)(ni_total-n_miss);	//mean over the observed genotypes
+		geno_var+=geno_mean*geno_mean*(double)n_miss;	//missing entries count as the mean in the sum of squares
+		geno_var/=(double)ni_total;
+		geno_var-=geno_mean*geno_mean;	//variance = mean of squares - square of mean
+//		geno_var=geno_mean*(1-geno_mean*0.5);
+		
+		for (size_t i=0; i<ni_total; ++i) {
+			if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);}	//impute missing genotypes with the mean
+		}		
+		
+		gsl_vector_add_constant (geno, -1.0*geno_mean);	//centre the genotypes
+		
+		if (geno_var!=0) {	//SNPs with zero variance add nothing but are still counted in ns_test
+			if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);}	//rank-one update of the upper triangle only
+			else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);}
+			else {cout<<"Unknown kinship mode."<<endl;}
+		}
+		
+		ns_test++;
+    }	
+	cout<<endl;
+	
+	gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test);	//average over the SNPs used
+	
+	for (size_t i=0; i<ni_total; ++i) {	//mirror the upper triangle into the lower one
+		for (size_t j=0; j<i; ++j) {
+			d=gsl_matrix_get (matrix_kin, j, i);
+			gsl_matrix_set (matrix_kin, i, j, d);
+		}
+	}
+	
+	gsl_vector_free (geno);
+	gsl_vector_free (geno_miss);
+	
+	infile.close();
+	infile.clear();	
+	
+	return true;
+}
+
+
+
+
+
+
+
+bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) 
+{	//Bed-file counterpart of the kinship calculation: adds x x^T of every SNP flagged in indicator_snp to matrix_kin (x = mean-centred genotypes; for k_mode 2 additionally divided by the genotype variance), then divides by the number of SNPs used. matrix_kin is not cleared here. Returns false only when the file cannot be opened.
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;}
+		
+	char ch[1];
+	bitset<8> b;
+	
+	size_t n_miss, ci_total;
+	double d, geno_mean, geno_var;
+	
+	size_t ni_total=matrix_kin->size1;	//every individual in the file is used; there is no individual filter here
+	gsl_vector *geno=gsl_vector_alloc (ni_total);
+
+	size_t ns_test=0;
+	int n_bit;
+	
+	//calculate n_bit, the number of bytes used by each snp (four individuals per byte)
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1; }
+
+	//skip the first three magic numbers
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}	
+	
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		infile.seekg(t*n_bit+3);		//jump to SNP t: 3 magic-number bytes, then n_bit bytes per SNP
+		
+		//read genotypes
+		geno_mean=0.0;	n_miss=0; ci_total=0; geno_var=0.0;
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && ci_total==ni_total) {break;}	//the last byte may be only partly used
+
+				if (b[2*j]==0) {	//bit pair (b[2j], b[2j+1]): 00 -> 2.0, 01 -> 1.0, 11 -> 0.0, 10 -> missing
+					if (b[2*j+1]==0) {gsl_vector_set(geno, ci_total, 2.0); geno_mean+=2.0; geno_var+=4.0; }
+					else {gsl_vector_set(geno, ci_total, 1.0); geno_mean+=1.0; geno_var+=1.0;}
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(geno, ci_total, 0.0); }      
+					else {gsl_vector_set(geno, ci_total, -9.0); n_miss++; }	//-9 is the missing marker, replaced by the mean below
+				}
+
+				ci_total++;
+			}
+		}
+				
+		geno_mean/=(double)(ni_total-n_miss);	//mean over the observed genotypes
+		geno_var+=geno_mean*geno_mean*(double)n_miss;	//missing entries count as the mean in the sum of squares
+		geno_var/=(double)ni_total;
+		geno_var-=geno_mean*geno_mean;	//variance = mean of squares - square of mean
+//		geno_var=geno_mean*(1-geno_mean*0.5);
+		
+		for (size_t i=0; i<ni_total; ++i) {
+			d=gsl_vector_get(geno,i);
+			if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);}	//impute missing genotypes with the mean
+		}		
+		
+		gsl_vector_add_constant (geno, -1.0*geno_mean);	//centre the genotypes
+		
+		if (geno_var!=0) {	//SNPs with zero variance add nothing but are still counted in ns_test
+			if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);}	//rank-one update of the upper triangle only
+			else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);}
+			else {cout<<"Unknown kinship mode."<<endl;}
+		}
+		
+		ns_test++;
+    }	
+	cout<<endl;
+	
+	gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test);	//average over the SNPs used
+	
+	for (size_t i=0; i<ni_total; ++i) {	//mirror the upper triangle into the lower one
+		for (size_t j=0; j<i; ++j) {
+			d=gsl_matrix_get (matrix_kin, j, i);
+			gsl_matrix_set (matrix_kin, i, j, d);
+		}
+	}
+	
+	gsl_vector_free (geno);
+	
+	infile.close();
+	infile.clear();	
+	
+	return true;
+}
+
+
+
+
+
+//Read bimbam mean genotype file, the second time, recode "mean" genotype and calculate K: writes the mean-centred genotypes of the kept SNPs and individuals into the columns of UtX and, when calc_K is true, sets K to the average of x x^T over those SNPs; returns false only when the file cannot be opened
+bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K)
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;}
+	
+	string line;
+	char *ch_ptr;
+	
+	if (calc_K==true) {gsl_matrix_set_zero (K);}
+	
+	gsl_vector *genotype=gsl_vector_alloc (UtX->size1);
+	gsl_vector *genotype_miss=gsl_vector_alloc (UtX->size1);	//1 where the genotype is missing, 0 otherwise
+	double geno, geno_mean;
+	size_t n_miss;
+	
+	int ni_total=(int)indicator_idv.size();
+	int ns_total=(int)indicator_snp.size();
+	int ni_test=UtX->size1;	//rows of UtX: presumably the number of included individuals
+	int ns_test=UtX->size2;	//columns of UtX: presumably the number of kept SNPs
+	
+	int c_idv=0, c_snp=0;	//running indices among included individuals / kept SNPs
+	
+	for (int i=0; i<ns_total; ++i) {
+		!safeGetline(infile, line).eof();	//reads the next line; the negated eof() result is discarded
+		if (indicator_snp[i]==0) {continue;}	
+		
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");	//skip the three leading non-genotype fields
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		
+		c_idv=0; geno_mean=0; n_miss=0;
+		gsl_vector_set_zero (genotype_miss);
+		for (int j=0; j<ni_total; ++j) {
+			ch_ptr=strtok (NULL, " , \t");	//always consume one token per individual so the columns stay aligned
+			if (indicator_idv[j]==0) {continue;}			
+			
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++;}	//"NA" marks a missing genotype
+			else {			
+				geno=atof(ch_ptr);
+				gsl_vector_set (genotype, c_idv, geno); 
+				geno_mean+=geno;
+			}
+			c_idv++;
+		}
+		
+		geno_mean/=(double)(ni_test-n_miss);	//mean over the observed genotypes
+		
+		for (size_t i=0; i<genotype->size; ++i) {			
+			if (gsl_vector_get (genotype_miss, i)==1) {geno=0;}	//missing: 0 after centring, i.e. imputed with the mean
+			else {geno=gsl_vector_get (genotype, i); geno-=geno_mean;}
+			
+			gsl_vector_set (genotype, i, geno);
+			gsl_matrix_set (UtX, i, c_snp, geno);	//column c_snp of UtX receives the centred genotypes
+		}
+		
+		if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);}	//rank-one update of the upper triangle only
+		
+		c_snp++;
+	}	
+	
+	if (calc_K==true) {
+		gsl_matrix_scale (K, 1.0/(double)ns_test);	//average over the columns of UtX
+		
+		for (size_t i=0; i<genotype->size; ++i) {	//mirror the upper triangle into the lower one
+			for (size_t j=0; j<i; ++j) {
+				geno=gsl_matrix_get (K, j, i);
+				gsl_matrix_set (K, i, j, geno);
+			}
+		}
+	}
+	
+	gsl_vector_free (genotype);
+	gsl_vector_free (genotype_miss);
+	
+	infile.clear();
+	infile.close();
+	
+	return true;
+}
+
+
+
+
+
+
+
+//Read bed file, the second time, recode genotype and calculate K: writes the mean-centred genotypes of the kept SNPs and individuals into the columns of UtX and, when calc_K is true, sets K to the average of x x^T over those SNPs; returns false only when the file cannot be opened
+bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K)
+{
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;}
+	
+	char ch[1];
+	bitset<8> b;
+	
+	int ni_total=(int)indicator_idv.size();
+	int ns_total=(int)indicator_snp.size();
+	int ni_test=UtX->size1;	//rows of UtX: presumably the number of included individuals
+	int ns_test=UtX->size2;	//columns of UtX: presumably the number of kept SNPs
+	int n_bit;	//number of bytes used by each snp (four individuals per byte)
+	
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1;}
+	
+	//skip the first three magic numbers
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+	
+	if (calc_K==true) {gsl_matrix_set_zero (K);}
+	
+	gsl_vector *genotype=gsl_vector_alloc (UtX->size1);	
+	
+	double geno, geno_mean;
+	size_t n_miss;	
+	int c_idv=0, c_snp=0, c=0;	//c indexes all individuals, c_idv the included ones, c_snp the kept SNPs
+	
+	//start reading snps and doing association test
+	for (int t=0; t<ns_total; ++t) {
+		if (indicator_snp[t]==0) {continue;}	
+		infile.seekg((std::streamoff)t*n_bit+3);		//jump to SNP t: 3 magic-number bytes, then n_bit bytes per SNP; computed in streamoff because int*int overflows for large files
+		
+		//read genotypes
+		c_idv=0; geno_mean=0.0; n_miss=0; c=0;
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && c==ni_total) {break;}				//the last byte may be only partly used
+				if (indicator_idv[c]==0) {c++; continue;}
+				c++;
+				
+				if (b[2*j]==0) {	//bit pair (b[2j], b[2j+1]): 00 -> 2.0, 01 -> 1.0, 11 -> 0.0, 10 -> missing
+					if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); geno_mean+=2.0;}
+					else {gsl_vector_set(genotype, c_idv, 1.0); geno_mean+=1.0;}
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); geno_mean+=0.0;}                               
+					else {gsl_vector_set(genotype, c_idv, -9.0); n_miss++;}	//-9 is the missing marker, handled below
+				}
+				c_idv++;
+			}
+		}
+		
+		geno_mean/=(double)(ni_test-n_miss);	//mean over the observed genotypes
+		
+		for (size_t i=0; i<genotype->size; ++i) {		
+			geno=gsl_vector_get (genotype, i);
+			if (geno==-9) {geno=0;}	//missing: 0 after centring, i.e. imputed with the mean
+			else {geno-=geno_mean;}
+			
+			gsl_vector_set (genotype, i, geno);
+			gsl_matrix_set (UtX, i, c_snp, geno);	//column c_snp of UtX receives the centred genotypes
+		}
+		
+		if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);}	//rank-one update of the upper triangle only
+		
+		c_snp++;
+	}	
+	
+	if (calc_K==true) {
+		gsl_matrix_scale (K, 1.0/(double)ns_test);	//average over the columns of UtX
+		
+		for (size_t i=0; i<genotype->size; ++i) {	//mirror the upper triangle into the lower one
+			for (size_t j=0; j<i; ++j) {
+				geno=gsl_matrix_get (K, j, i);
+				gsl_matrix_set (K, i, j, geno);
+			}
+		}
+	}
+	
+	gsl_vector_free (genotype);		  
+	infile.clear();
+	infile.close();
+	
+	return true;
+}
+
+
+
+
+
+bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est)
+{	//Reads a table of per-SNP estimates (after one header line) and stores alpha+beta*gamma for every SNP id in mapRS2est; est_column[0..3] select the columns of the id, alpha, beta and gamma. Returns false when the file cannot be opened or an id occurs twice.
+	mapRS2est.clear();
+	
+	ifstream infile (file_est.c_str(), ifstream::in);
+	if (!infile) {cout<<"error opening estimated parameter file: "<<file_est<<endl; return false;}
+	
+	string line;
+	char *ch_ptr;
+	
+	string rs;
+	double alpha, beta, gamma, d;
+	
+	//header
+	getline(infile, line);
+	
+	size_t n=*max_element(est_column.begin(), est_column.end());	//largest requested column number; est_column must not be empty
+	
+	while (getline(infile, line)) {
+		ch_ptr=strtok ((char *)line.c_str(), " \t");		//tokenises the string's own buffer in place; separators are space and tab
+		
+		alpha=0.0; beta=0.0; gamma=1.0;	//defaults used for columns that are never matched
+		for (size_t i=0; i<n+1; ++i) {
+			if (i==est_column[0]-1) {rs=ch_ptr;}	//column numbers are compared against i+1, so they are 1-based; an entry of 0 wraps around and never matches
+			if (i==est_column[1]-1) {alpha=atof(ch_ptr);}
+			if (i==est_column[2]-1) {beta=atof(ch_ptr);}
+			if (i==est_column[3]-1) {gamma=atof(ch_ptr);}
+			if (i<n) {ch_ptr=strtok (NULL, " \t");}
+		}
+		
+		d=alpha+beta*gamma;
+		
+		if (mapRS2est.count(rs)==0) {
+			mapRS2est[rs]=d;
+		}
+		else {
+			cout<<"the same SNP occurs more than once in estimated parameter file: "<<rs<<endl; return false;
+		}
+	}
+	
+	infile.clear();
+	infile.close();
+	return true;
+}
+
+
+
+bool CountFileLines (const string &file_input, size_t &n_lines)
+{	//Stores the number of '\n' characters found in file_input in n_lines; returns false when the file cannot be opened.
+	igzstream infile (file_input.c_str(), igzstream::in);
+	//ifstream infile (file_input.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open file: "<<file_input<<endl; return false;}
+
+	n_lines=count(istreambuf_iterator<char>(infile), istreambuf_iterator<char>(), '\n');	//a last line without a trailing newline is not counted
+	infile.seekg (0, ios::beg);	//rewinds the local stream, which goes out of scope right after
+	
+	return true;
+}
+
+
+
+//Read gene expression file: after one header line, every row gives an id followed by one value per individual; appends a placeholder SNPINFO per row to snpInfo, counts the rows in ng_total and accumulates the per-column sums in vec_read; returns false when the file cannot be opened or a row has a different number of columns than the first one
+bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SNPINFO> &snpInfo, size_t &ng_total)
+{
+	vec_read.clear();
+	ng_total=0;
+	
+	ifstream infile (file_gene.c_str(), ifstream::in);
+	if (!infile) {cout<<"error! fail to open gene expression file: "<<file_gene<<endl; return false;}
+	
+	string line;
+	char *ch_ptr;
+	string rs;
+	
+	size_t n_idv=0, t=0;	//n_idv: number of value columns in the first data row; t: column index within the current row
+	
+	//header
+	getline(infile, line);
+	
+	while (getline(infile, line)) {
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");	//tokenises the string's own buffer in place; separators are space, comma and tab
+		rs=ch_ptr;	//first field: row id
+		
+		ch_ptr=strtok (NULL, " , \t");	
+		
+		t=0;
+		while (ch_ptr!=NULL) {
+			if (ng_total==0) {	//first data row: sizes vec_read and fixes the expected column count
+				vec_read.push_back(0);	//NOTE(review): the parsed value of the first row is not added here - confirm this is intended
+				t++;
+				n_idv++;
+			} else {
+				vec_read[t]+=atof(ch_ptr);		//NOTE(review): not bounds-checked; a row longer than the first one writes past the end before the check below
+				t++;
+			}
+			
+			ch_ptr=strtok (NULL, " , \t");	
+		}
+		
+		if (t!=n_idv) {cout<<"error! number of columns doesn't match in row: "<<ng_total<<endl; return false;}
+		
+		SNPINFO sInfo={"-9", rs, -9, -9, "-9", "-9", -9, -9, -9};	//only the id is known; every other field gets the -9 placeholder
+		snpInfo.push_back(sInfo);
+		
+		ng_total++;
+	}
+	
+	infile.close();
+	infile.clear();	
+	
+	return true;
+}
+
+
diff --git a/src/io.h b/src/io.h
new file mode 100644
index 0000000..13e3e47
--- /dev/null
+++ b/src/io.h
@@ -0,0 +1,79 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __IO_H__                
+#define __IO_H__
+
+
+#include <vector>
+#include <map>
+#include <algorithm>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#else
+#include "param.h"
+#endif
+
+using namespace std;
+
+void ProgressBar (string str, double p, double total);
+void ProgressBar (string str, double p, double total, double ratio);
+std::istream& safeGetline(std::istream& is, std::string& t);
+
+bool ReadFile_snps (const string &file_snps, set<string> &setSnps);
+bool ReadFile_log (const string &file_log, double &pheno_mean);
+
+bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo);
+bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, map<string, int> &mapID2num, const vector<size_t> &p_column);
+
+bool ReadFile_cvt (const string &file_cvt, vector<int> &indicator_cvt, vector<vector<double> > &cvt, size_t &n_cvt);
+bool ReadFile_anno (const string &file_bim, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM);
+bool ReadFile_pheno (const string &file_pheno, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, const vector<size_t> &p_column);
+bool ReadFile_column (const string &file_pheno, vector<int> &indicator_idv, vector<double> &pheno, const int &p_column);
+//first pass over the genotype data: fill indicator_snp (1 = keep) and the per-SNP statistics in snpInfo, count kept SNPs in ns_test
+bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, size_t &ns_test);
+bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test);
+//readers for kinship matrices and eigen-decomposition files; failures are reported through the error flag
+void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G);
+void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G);
+void ReadFile_eigenU (const string &file_u, bool &error, gsl_matrix *U);
+void ReadFile_eigenD (const string &file_d, bool &error, gsl_vector *eval); 
+//accumulate a kinship matrix from the SNPs flagged in indicator_snp (mean genotype file / bed file)
+bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin);
+bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin);
+//second pass over the genotype data: write centred genotypes of the kept SNPs into UtX and optionally compute K
+bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K);
+bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K);
+//map each SNP id to alpha+beta*gamma read from the columns listed in est_column
+bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est);
+//count the newline characters of a file
+bool CountFileLines (const string &file_input, size_t &n_lines);
+//read a gene expression table: one placeholder snpInfo entry per row, per-column sums in vec_read
+bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SNPINFO> &snpInfo, size_t &ng_total);
+
+#endif
+
+
+
+
+
+
+
diff --git a/src/lapack.cpp b/src/lapack.cpp
new file mode 100644
index 0000000..83d5290
--- /dev/null
+++ b/src/lapack.cpp
@@ -0,0 +1,609 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <cmath>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+
+using namespace std;
+
+extern "C" void sgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, float *ALPHA, float *A, int *LDA, float *B, int *LDB, float *BETA, float *C, int *LDC);
+extern "C" void spotrf_(char *UPLO, int *N, float *A, int *LDA, int *INFO);
+extern "C" void spotrs_(char *UPLO, int *N, int *NRHS, float *A, int *LDA, float *B, int *LDB, int *INFO);
+extern "C" void ssyev_(char* JOBZ, char* UPLO, int *N, float *A, int *LDA, float *W, float *WORK, int *LWORK, int *INFO);
+extern "C" void ssyevr_(char* JOBZ, char *RANGE, char* UPLO, int *N, float *A, int *LDA, float *VL, float *VU, int *IL, int *IU, float *ABSTOL, int *M, float *W, float *Z, int *LDZ, int *ISUPPZ, float *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO);
+
+extern "C" void dgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double *ALPHA, double *A, int *LDA, double *B, int *LDB, double *BETA, double *C, int *LDC);
+extern "C" void dpotrf_(char *UPLO, int *N, double *A, int *LDA, int *INFO);
+extern "C" void dpotrs_(char *UPLO, int *N, int *NRHS, double *A, int *LDA, double *B, int *LDB, int *INFO);
+extern "C" void dsyev_(char* JOBZ, char* UPLO, int *N, double *A, int *LDA, double *W, double *WORK, int *LWORK, int *INFO);
+extern "C" void dsyevr_(char* JOBZ, char *RANGE, char* UPLO, int *N, double *A, int *LDA, double *VL, double *VU, int *IL, int *IU, double *ABSTOL, int *M, double *W, double *Z, int *LDZ, int *ISUPPZ, double *WORK, int *LWORK, int *IWORK, int *LIWORK, int *INFO);
+
+
+// Single-precision Cholesky factorisation through LAPACK spotrf_.
+// A is overwritten by the routine, so its original contents are destroyed.
+// On a non-square input or a non-zero LAPACK status a message is printed and
+// the function simply returns.
+void lapack_float_cholesky_decomp (gsl_matrix_float *A)
+{
+	char tri='L';
+	int order=A->size1;
+	int lead_dim=A->size1;
+	int status;
+
+	// Reject non-square input before handing the buffer to LAPACK.
+	if ((int)A->size2!=order) {
+		cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_decomp."<<endl;
+		return;
+	}
+
+	spotrf_(&tri, &order, A->data, &lead_dim, &status);
+	if (status!=0) {
+		cout<<"Cholesky decomposition unsuccessful in lapack_cholesky_decomp."<<endl;
+	}
+}
+
+// Double-precision Cholesky factorisation through LAPACK dpotrf_.
+// A is overwritten by the routine, so its original contents are destroyed.
+// On a non-square input or a non-zero LAPACK status a message is printed and
+// the function simply returns.
+void lapack_cholesky_decomp (gsl_matrix *A)
+{
+	char tri='L';
+	int order=A->size1;
+	int lead_dim=A->size1;
+	int status;
+
+	// Reject non-square input before handing the buffer to LAPACK.
+	if ((int)A->size2!=order) {
+		cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_decomp."<<endl;
+		return;
+	}
+
+	dpotrf_(&tri, &order, A->data, &lead_dim, &status);
+	if (status!=0) {
+		cout<<"Cholesky decomposition unsuccessful in lapack_cholesky_decomp."<<endl;
+	}
+}
+
+// Single-precision solve of A x = b through LAPACK spotrs_, where A already
+// holds a Cholesky factorisation.  b is first copied into x and the solve is
+// carried out in place on x; b itself is not modified.
+void lapack_float_cholesky_solve (gsl_matrix_float *A, const gsl_vector_float *b, gsl_vector_float *x)
+{
+	char tri='L';
+	int order=A->size1;
+	int n_rhs=1;
+	int lead_a=A->size1;
+	int lead_b=b->size;
+	int status;
+
+	// A must be square and b must have one entry per row of A.
+	const bool square=(order==(int)A->size2);
+	const bool rhs_fits=(order==lead_b);
+	if (!(square && rhs_fits)) {
+		cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_solve."<<endl;
+		return;
+	}
+
+	gsl_vector_float_memcpy (x, b);
+	spotrs_(&tri, &order, &n_rhs, A->data, &lead_a, x->data, &lead_b, &status);
+	if (status!=0) {
+		cout<<"Cholesky solve unsuccessful in lapack_cholesky_solve."<<endl;
+	}
+}
+
+// Double-precision solve of A x = b through LAPACK dpotrs_, where A already
+// holds a Cholesky factorisation.  b is first copied into x and the solve is
+// carried out in place on x; b itself is not modified.
+void lapack_cholesky_solve (gsl_matrix *A, const gsl_vector *b, gsl_vector *x)
+{
+	char tri='L';
+	int order=A->size1;
+	int n_rhs=1;
+	int lead_a=A->size1;
+	int lead_b=b->size;
+	int status;
+
+	// A must be square and b must have one entry per row of A.
+	const bool square=(order==(int)A->size2);
+	const bool rhs_fits=(order==lead_b);
+	if (!(square && rhs_fits)) {
+		cout<<"Matrix needs to be symmetric and same dimension in lapack_cholesky_solve."<<endl;
+		return;
+	}
+
+	gsl_vector_memcpy (x, b);
+	dpotrs_(&tri, &order, &n_rhs, A->data, &lead_a, x->data, &lead_b, &status);
+	if (status!=0) {
+		cout<<"Cholesky solve unsuccessful in lapack_cholesky_solve."<<endl;
+	}
+}
+
+
+// Computes C = alpha*op(A)*op(B) + beta*C with BLAS sgemm_, where op(X) is X
+// for 'N'/'n' and X^T for 'T'/'t'.  GSL matrices are row-major while sgemm_
+// works on column-major storage, so every operand is transposed into a scratch
+// matrix first and the result is transposed back into C.
+// On an unknown flag or incompatible dimensions a message is printed and C is
+// left untouched.
+void lapack_sgemm (char *TransA, char *TransB, float alpha, const gsl_matrix_float *A, const gsl_matrix_float *B, float beta, gsl_matrix_float *C)
+{
+	// The row-major transpose of an (r x c) matrix is, read column-major, the
+	// original matrix with leading dimension r.  That applies to C as well, so
+	// LDC has to be C->size1 (= M); C->size2 is only correct for a square C.
+	int M, N, K1, K2, LDA=A->size1, LDB=B->size1, LDC=C->size1;
+
+	if (*TransA=='N' || *TransA=='n') {M=A->size1; K1=A->size2;}
+	else if (*TransA=='T' || *TransA=='t') {M=A->size2; K1=A->size1;}
+	else {cout<<"need 'N' or 'T' in lapack_sgemm"<<endl; return;}
+
+	if (*TransB=='N' || *TransB=='n') {N=B->size2; K2=B->size1;}
+	else if (*TransB=='T' || *TransB=='t')  {N=B->size1; K2=B->size2;}
+	else {cout<<"need 'N' or 'T' in lapack_sgemm"<<endl;  return;}
+
+	if (K1!=K2) {cout<<"A and B not compatible in lapack_sgemm"<<endl; return;}
+	if (C->size1!=(size_t)M || C->size2!=(size_t)N) {cout<<"C not compatible in lapack_sgemm"<<endl; return;}
+
+	// Column-major copies of the three operands.
+	gsl_matrix_float *A_t=gsl_matrix_float_alloc (A->size2, A->size1);
+	gsl_matrix_float_transpose_memcpy (A_t, A);
+	gsl_matrix_float *B_t=gsl_matrix_float_alloc (B->size2, B->size1);
+	gsl_matrix_float_transpose_memcpy (B_t, B);
+	gsl_matrix_float *C_t=gsl_matrix_float_alloc (C->size2, C->size1);
+	gsl_matrix_float_transpose_memcpy (C_t, C);
+
+	sgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC);
+
+	// Back to GSL's row-major layout.
+	gsl_matrix_float_transpose_memcpy (C, C_t);
+
+	gsl_matrix_float_free (A_t);
+	gsl_matrix_float_free (B_t);
+	gsl_matrix_float_free (C_t);
+	return;
+}
+
+
+
+// Computes C = alpha*op(A)*op(B) + beta*C with BLAS dgemm_, where op(X) is X
+// for 'N'/'n' and X^T for 'T'/'t'.  GSL matrices are row-major while dgemm_
+// works on column-major storage, so every operand is transposed into a scratch
+// matrix first and the result is transposed back into C.
+// On an unknown flag or incompatible dimensions a message is printed and C is
+// left untouched.
+void lapack_dgemm (char *TransA, char *TransB, double alpha, const gsl_matrix *A, const gsl_matrix *B, double beta, gsl_matrix *C)
+{
+	// The row-major transpose of an (r x c) matrix is, read column-major, the
+	// original matrix with leading dimension r.  That applies to C as well, so
+	// LDC has to be C->size1 (= M); C->size2 is only correct for a square C.
+	int M, N, K1, K2, LDA=A->size1, LDB=B->size1, LDC=C->size1;
+
+	if (*TransA=='N' || *TransA=='n') {M=A->size1; K1=A->size2;}
+	else if (*TransA=='T' || *TransA=='t') {M=A->size2; K1=A->size1;}
+	else {cout<<"need 'N' or 'T' in lapack_dgemm"<<endl; return;}
+
+	if (*TransB=='N' || *TransB=='n') {N=B->size2; K2=B->size1;}
+	else if (*TransB=='T' || *TransB=='t')  {N=B->size1; K2=B->size2;}
+	else {cout<<"need 'N' or 'T' in lapack_dgemm"<<endl;  return;}
+
+	if (K1!=K2) {cout<<"A and B not compatible in lapack_dgemm"<<endl; return;}
+	if (C->size1!=(size_t)M || C->size2!=(size_t)N) {cout<<"C not compatible in lapack_dgemm"<<endl; return;}
+
+	// Column-major copies of the three operands.
+	gsl_matrix *A_t=gsl_matrix_alloc (A->size2, A->size1);
+	gsl_matrix_transpose_memcpy (A_t, A);
+	gsl_matrix *B_t=gsl_matrix_alloc (B->size2, B->size1);
+	gsl_matrix_transpose_memcpy (B_t, B);
+	gsl_matrix *C_t=gsl_matrix_alloc (C->size2, C->size1);
+	gsl_matrix_transpose_memcpy (C_t, C);
+
+	dgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC);
+
+	// Back to GSL's row-major layout.
+	gsl_matrix_transpose_memcpy (C, C_t);
+
+	gsl_matrix_free (A_t);
+	gsl_matrix_free (B_t);
+	gsl_matrix_free (C_t);
+	return;
+}
+
+
+
+// Single-precision symmetric eigen-decomposition through LAPACK; matrix A is
+// destroyed.  (Original author's note: float seems to have problems with large
+// matrices on mac.)
+//   flag_largematrix==1 : ssyev_ with a fixed workspace of 3*N floats; the
+//                         eigenvectors come back in A and are copied to evec.
+//   otherwise           : ssyevr_ (RANGE='A'), preceded by a workspace-size
+//                         query (LWORK=LIWORK=-1); eigenvectors are written
+//                         straight into evec.
+// In both cases eigenvalues go to eval and evec is transposed in place after
+// the LAPACK call.  On any failure a message is printed and the function
+// returns; every workspace buffer is released on those paths as well.
+void lapack_float_eigen_symmv (gsl_matrix_float *A, gsl_vector_float *eval, gsl_matrix_float *evec, const size_t flag_largematrix)
+{
+	if (flag_largematrix==1) {
+		int N=A->size1, LDA=A->size1, INFO, LWORK=-1;
+		char JOBZ='V', UPLO='L';
+
+		if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<<endl; return;}
+
+		// Workspace size is fixed rather than queried from ssyev_.
+		LWORK=3*N;
+		float *WORK=new float [LWORK];
+		ssyev_(&JOBZ, &UPLO, &N, A->data, &LDA, eval->data, WORK, &LWORK, &INFO);
+		if (INFO!=0) {
+			cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<<endl;
+			delete [] WORK;
+			return;
+		}
+
+		gsl_matrix_float_view A_sub=gsl_matrix_float_submatrix(A, 0, 0, N, N);
+		gsl_matrix_float_memcpy (evec, &A_sub.matrix);
+		gsl_matrix_float_transpose (evec);
+
+		delete [] WORK;
+	} else {
+		int N=A->size1, LDA=A->size1, LDZ=A->size1, INFO, LWORK=-1, LIWORK=-1;
+		char JOBZ='V', UPLO='L', RANGE='A';
+		float ABSTOL=1.0E-7;
+
+		//VL, VU, IL, IU are not referenced; M equals N if RANGE='A'
+		float VL=0.0, VU=0.0;
+		int IL=0, IU=0, M;
+
+		if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_float_eigen_symmv."<<endl; return;}
+
+		int *ISUPPZ=new int [2*N];
+
+		// First call only asks LAPACK for the optimal workspace sizes.
+		float WORK_temp[1];
+		int IWORK_temp[1];
+		ssyevr_(&JOBZ, &RANGE, &UPLO, &N, A->data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK_temp, &LWORK, IWORK_temp, &LIWORK, &INFO);
+		if (INFO!=0) {
+			cout<<"Work space estimate unsuccessful in lapack_float_eigen_symmv."<<endl;
+			delete [] ISUPPZ;
+			return;
+		}
+		LWORK=(int)WORK_temp[0]; LIWORK=(int)IWORK_temp[0];
+
+		float *WORK=new float [LWORK];
+		int *IWORK=new int [LIWORK];
+
+		ssyevr_(&JOBZ, &RANGE, &UPLO, &N, A->data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK, &LWORK, IWORK, &LIWORK, &INFO);
+		if (INFO!=0) {
+			cout<<"Eigen decomposition unsuccessful in lapack_float_eigen_symmv."<<endl;
+			delete [] ISUPPZ;
+			delete [] WORK;
+			delete [] IWORK;
+			return;
+		}
+
+		gsl_matrix_float_transpose (evec);
+
+		delete [] ISUPPZ;
+		delete [] WORK;
+		delete [] IWORK;
+	}
+
+	return;
+}
+
+
+
+// Double-precision symmetric eigen-decomposition through LAPACK; matrix A is
+// destroyed.
+//   flag_largematrix==1 : dsyev_ with a fixed workspace of 3*N doubles; the
+//                         eigenvectors come back in A and are copied to evec.
+//   otherwise           : dsyevr_ (RANGE='A'), preceded by a workspace-size
+//                         query (LWORK=LIWORK=-1); eigenvectors are written
+//                         straight into evec.
+// In both cases eigenvalues go to eval and evec is transposed in place after
+// the LAPACK call.  On any failure a message is printed and the function
+// returns; every workspace buffer is released on those paths as well.
+void lapack_eigen_symmv (gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec, const size_t flag_largematrix)
+{
+	if (flag_largematrix==1) {
+		int N=A->size1, LDA=A->size1, INFO, LWORK=-1;
+		char JOBZ='V', UPLO='L';
+
+		if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<<endl; return;}
+
+		// Workspace size is fixed rather than queried from dsyev_.
+		LWORK=3*N;
+		double *WORK=new double [LWORK];
+		dsyev_(&JOBZ, &UPLO, &N, A->data, &LDA, eval->data, WORK, &LWORK, &INFO);
+		if (INFO!=0) {
+			cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<<endl;
+			delete [] WORK;
+			return;
+		}
+
+		gsl_matrix_view A_sub=gsl_matrix_submatrix(A, 0, 0, N, N);
+		gsl_matrix_memcpy (evec, &A_sub.matrix);
+		gsl_matrix_transpose (evec);
+
+		delete [] WORK;
+	} else {
+		int N=A->size1, LDA=A->size1, LDZ=A->size1, INFO, LWORK=-1, LIWORK=-1;
+		char JOBZ='V', UPLO='L', RANGE='A';
+		double ABSTOL=1.0E-7;
+
+		//VL, VU, IL, IU are not referenced; M equals N if RANGE='A'
+		double VL=0.0, VU=0.0;
+		int IL=0, IU=0, M;
+
+		if (N!=(int)A->size2 || N!=(int)eval->size) {cout<<"Matrix needs to be symmetric and same dimension in lapack_eigen_symmv."<<endl; return;}
+
+		int *ISUPPZ=new int [2*N];
+
+		// First call only asks LAPACK for the optimal workspace sizes.
+		double WORK_temp[1];
+		int IWORK_temp[1];
+		dsyevr_(&JOBZ, &RANGE, &UPLO, &N, A->data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK_temp, &LWORK, IWORK_temp, &LIWORK, &INFO);
+		if (INFO!=0) {
+			cout<<"Work space estimate unsuccessful in lapack_eigen_symmv."<<endl;
+			delete [] ISUPPZ;
+			return;
+		}
+		LWORK=(int)WORK_temp[0]; LIWORK=(int)IWORK_temp[0];
+
+		double *WORK=new double [LWORK];
+		int *IWORK=new int [LIWORK];
+
+		dsyevr_(&JOBZ, &RANGE, &UPLO, &N, A->data, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, &M, eval->data, evec->data, &LDZ, ISUPPZ, WORK, &LWORK, IWORK, &LIWORK, &INFO);
+		if (INFO!=0) {
+			cout<<"Eigen decomposition unsuccessful in lapack_eigen_symmv."<<endl;
+			delete [] ISUPPZ;
+			delete [] WORK;
+			delete [] IWORK;
+			return;
+		}
+
+		gsl_matrix_transpose (evec);
+
+		delete [] ISUPPZ;
+		delete [] WORK;
+		delete [] IWORK;
+	}
+
+	return;
+}
+
+// Eigen-decomposes the symmetric matrix G into eigenvectors U and eigenvalues
+// eval, using LAPACK when WITH_LAPACK is defined and GSL otherwise.
+// Eigenvalues are stored exactly as computed: small or negative values are
+// deliberately NOT clamped to zero.
+// Returns the arithmetic mean of the eigenvalues.
+double EigenDecomp (gsl_matrix *G, gsl_matrix *U, gsl_vector *eval, const size_t flag_largematrix)
+{
+#ifdef WITH_LAPACK
+	lapack_eigen_symmv (G, eval, U, flag_largematrix);
+#else
+	gsl_eigen_symmv_workspace *workspace=gsl_eigen_symmv_alloc (G->size1);
+	gsl_eigen_symmv (G, eval, U, workspace);
+	gsl_eigen_symmv_free (workspace);
+#endif
+
+	// Sum the eigenvalues, then average them.
+	const size_t n_eval=eval->size;
+	double eval_mean=0.0;
+	for (size_t k=0; k<n_eval; ++k) {
+		eval_mean+=gsl_vector_get(eval, k);
+	}
+	eval_mean/=(double)n_eval;
+
+	return eval_mean;
+}
+
+
+// Single-precision eigen-decomposition of the symmetric matrix G into
+// eigenvectors U and eigenvalues eval.
+// With WITH_LAPACK the float LAPACK wrapper is used.  Otherwise the inputs are
+// promoted to double, decomposed with gsl_eigen_symmv (GSL has no float
+// version), and G, U and eval are written back in float precision.
+// Eigenvalues are stored as computed: they are deliberately NOT clamped to
+// zero.  Returns the arithmetic mean of the eigenvalues.
+// NOTE(review): the GSL branch needs gsl/gsl_eigen.h, which is not among the
+// includes at the top of lapack.cpp -- confirm before building without LAPACK.
+double EigenDecomp (gsl_matrix_float *G, gsl_matrix_float *U, gsl_vector_float *eval, const size_t flag_largematrix)
+{
+#ifdef WITH_LAPACK
+	lapack_float_eigen_symmv (G, eval, U, flag_largematrix);
+#else
+	//gsl doesn't provide float precision eigen decomposition; plus, float precision eigen decomposition in lapack may not work on OS 10.4
+	//first change to double precision
+	gsl_matrix *G_double=gsl_matrix_alloc (G->size1, G->size2);
+	gsl_matrix *U_double=gsl_matrix_alloc (U->size1, U->size2);
+	gsl_vector *eval_double=gsl_vector_alloc (eval->size);
+	for (size_t i=0; i<G->size1; i++) {
+		for (size_t j=0; j<G->size2; j++) {
+			gsl_matrix_set(G_double, i, j, gsl_matrix_float_get(G, i, j));
+		}
+	}
+	gsl_eigen_symmv_workspace *w_space=gsl_eigen_symmv_alloc (G->size1);
+	gsl_eigen_symmv (G_double, eval_double, U_double, w_space);
+	gsl_eigen_symmv_free (w_space);
+
+	//change back to float precision
+	//(the matrix written here is G; the original referred to an undeclared K)
+	for (size_t i=0; i<G->size1; i++) {
+		for (size_t j=0; j<G->size2; j++) {
+			gsl_matrix_float_set(G, i, j, gsl_matrix_get(G_double, i, j));
+		}
+	}
+	for (size_t i=0; i<U->size1; i++) {
+		for (size_t j=0; j<U->size2; j++) {
+			gsl_matrix_float_set(U, i, j, gsl_matrix_get(U_double, i, j));
+		}
+	}
+	for (size_t i=0; i<eval->size; i++) {
+		gsl_vector_float_set(eval, i, gsl_vector_get(eval_double, i));
+	}
+
+	//delete double precision matrices
+	gsl_matrix_free (G_double);
+	gsl_matrix_free (U_double);
+	gsl_vector_free (eval_double);
+#endif
+
+	//mean of the eigenvalues
+	double d=0.0;
+	for (size_t i=0; i<eval->size; ++i) {
+		d+=gsl_vector_float_get(eval, i);
+	}
+	d/=(double)eval->size;
+
+	return d;
+}
+
+
+// Cholesky-factorises Omega in place, solves Omega * OiXty = Xty, and returns
+// twice the sum of the logs of the factor's diagonal entries.
+// Uses the LAPACK wrappers when WITH_LAPACK is defined, GSL otherwise; in the
+// GSL path a non-positive-definite matrix is only reported, not treated as
+// fatal.
+double CholeskySolve(gsl_matrix *Omega, gsl_vector *Xty, gsl_vector *OiXty)
+{
+	// Step 1: factorise Omega in place.
+#ifdef WITH_LAPACK
+	lapack_cholesky_decomp(Omega);
+#else
+	if (gsl_linalg_cholesky_decomp(Omega)==GSL_EDOM) {
+		cout << "## non-positive definite matrix" << endl;
+	}
+#endif
+
+	// Step 2: accumulate log of the diagonal of the factor.
+	double diag_log_sum=0.0;
+	for (size_t k=0; k<Omega->size1; ++k) {
+		diag_log_sum+=log(gsl_matrix_get (Omega, k, k));
+	}
+	diag_log_sum*=2.0;
+
+	// Step 3: solve using the factor (Omega is only read from here on).
+#ifdef WITH_LAPACK
+	lapack_cholesky_solve(Omega, Xty, OiXty);
+#else
+	gsl_vector_memcpy (OiXty, Xty);
+	gsl_blas_dtrsv(CblasLower, CblasNoTrans, CblasNonUnit, Omega, OiXty);
+	gsl_blas_dtrsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, OiXty);
+#endif
+
+	return diag_log_sum;
+}
+
+
+// Single-precision variant: Cholesky-factorises Omega in place, solves
+// Omega * OiXty = Xty, and returns twice the sum of the logs of the factor's
+// diagonal.  With WITH_LAPACK the float LAPACK wrappers are used; otherwise the
+// factorisation is done on a double copy (the log terms are taken from the
+// double values), the factor is written back to Omega as float, and the two
+// triangular solves run in float.
+double CholeskySolve(gsl_matrix_float *Omega, gsl_vector_float *Xty, gsl_vector_float *OiXty)
+{
+	double log_det=0.0;
+
+#ifdef WITH_LAPACK
+	lapack_float_cholesky_decomp(Omega);
+	for (size_t k=0; k<Omega->size1; ++k) {
+		log_det+=log(gsl_matrix_float_get (Omega, k, k));
+	}
+	log_det*=2.0;
+	lapack_float_cholesky_solve(Omega, Xty, OiXty);
+#else
+	const size_t n_row=Omega->size1;
+	const size_t n_col=Omega->size2;
+	gsl_matrix *work=gsl_matrix_alloc (n_row, n_col);
+	double entry;
+
+	// Promote Omega to double.
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			entry=(double)gsl_matrix_float_get (Omega, r, c);
+			gsl_matrix_set (work, r, c, entry);
+		}
+	}
+
+	if (gsl_linalg_cholesky_decomp(work)==GSL_EDOM) {
+		cout << "## non-positive definite matrix" << endl;
+	}
+
+	// Write the factor back as float, summing log(diagonal) on the way.
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			entry=gsl_matrix_get (work, r, c);
+			if (c==r) {log_det+=log(entry);}
+			gsl_matrix_float_set (Omega, r, c, (float)entry);
+		}
+	}
+	log_det*=2.0;
+
+	gsl_vector_float_memcpy (OiXty, Xty);
+	gsl_blas_strsv(CblasLower, CblasNoTrans, CblasNonUnit, Omega, OiXty);
+	gsl_blas_strsv(CblasUpper, CblasNoTrans, CblasNonUnit, Omega, OiXty);
+
+	gsl_matrix_free (work);
+#endif
+
+	return log_det;
+}
+
+
+// LU decomposition (double precision): forwards directly to
+// gsl_linalg_LU_decomp, which factorises LU in place and fills p and signum.
+void LUDecomp (gsl_matrix *LU, gsl_permutation *p, int *signum)
+{
+	gsl_linalg_LU_decomp (LU, p, signum);
+}
+
+// LU decomposition (single precision): the matrix is promoted to a double
+// scratch copy, factorised with gsl_linalg_LU_decomp, and the factors are
+// written back into LU as float.  p and signum are filled by the GSL call.
+void LUDecomp (gsl_matrix_float *LU, gsl_permutation *p, int *signum)
+{
+	const size_t n_row=LU->size1;
+	const size_t n_col=LU->size2;
+	gsl_matrix *work=gsl_matrix_alloc (n_row, n_col);
+
+	// float -> double
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			gsl_matrix_set (work, r, c, gsl_matrix_float_get(LU, r, c));
+		}
+	}
+
+	gsl_linalg_LU_decomp (work, p, signum);
+
+	// double -> float
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			gsl_matrix_float_set (LU, r, c, gsl_matrix_get(work, r, c));
+		}
+	}
+
+	gsl_matrix_free (work);
+}
+
+
+// Matrix inverse from an existing LU factorisation (double precision):
+// forwards directly to gsl_linalg_LU_invert.
+void LUInvert (const gsl_matrix *LU, const gsl_permutation *p, gsl_matrix *inverse)
+{
+	gsl_linalg_LU_invert (LU, p, inverse);
+}
+
+// Matrix inverse from an existing LU factorisation (single precision): LU is
+// promoted to a double scratch copy, inverted with gsl_linalg_LU_invert, and
+// the result is written into inverse as float.  LU itself is not modified.
+void LUInvert (const gsl_matrix_float *LU, const gsl_permutation *p, gsl_matrix_float *inverse)
+{
+	gsl_matrix *lu_work=gsl_matrix_alloc (LU->size1, LU->size2);
+	gsl_matrix *inv_work=gsl_matrix_alloc (inverse->size1, inverse->size2);
+
+	// float -> double
+	for (size_t r=0; r<LU->size1; ++r) {
+		for (size_t c=0; c<LU->size2; ++c) {
+			gsl_matrix_set (lu_work, r, c, gsl_matrix_float_get(LU, r, c));
+		}
+	}
+
+	gsl_linalg_LU_invert (lu_work, p, inv_work);
+
+	// double -> float
+	for (size_t r=0; r<inverse->size1; ++r) {
+		for (size_t c=0; c<inverse->size2; ++c) {
+			gsl_matrix_float_set (inverse, r, c, gsl_matrix_get(inv_work, r, c));
+		}
+	}
+
+	gsl_matrix_free (lu_work);
+	gsl_matrix_free (inv_work);
+}
+
+// Returns the value of gsl_linalg_LU_lndet for an LU-factorised matrix
+// (double precision).
+double LULndet (gsl_matrix *LU)
+{
+	return gsl_linalg_LU_lndet (LU);
+}
+
+// Single-precision variant: LU is promoted to a double scratch copy and the
+// value of gsl_linalg_LU_lndet on that copy is returned.  LU itself is left
+// unchanged (nothing is copied back).
+double LULndet (gsl_matrix_float *LU)
+{
+	const size_t n_row=LU->size1;
+	const size_t n_col=LU->size2;
+	gsl_matrix *work=gsl_matrix_alloc (n_row, n_col);
+
+	// float -> double
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			gsl_matrix_set (work, r, c, gsl_matrix_float_get(LU, r, c));
+		}
+	}
+
+	const double ln_det=gsl_linalg_LU_lndet (work);
+
+	gsl_matrix_free (work);
+	return ln_det;
+}
+
+
+// Solves the linear system for an existing LU factorisation (double
+// precision): forwards directly to gsl_linalg_LU_solve.
+void LUSolve (const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x)
+{
+	gsl_linalg_LU_solve (LU, p, b, x);
+}
+
+// Single-precision variant: LU, b and the current contents of x are promoted
+// to double scratch objects, gsl_linalg_LU_solve is run on them, and the
+// solution is written back into x as float.  LU and b are not modified.
+void LUSolve (const gsl_matrix_float *LU, const gsl_permutation *p, const gsl_vector_float *b, gsl_vector_float *x)
+{
+	gsl_matrix *lu_work=gsl_matrix_alloc (LU->size1, LU->size2);
+	gsl_vector *rhs_work=gsl_vector_alloc (b->size);
+	gsl_vector *sol_work=gsl_vector_alloc (x->size);
+
+	// float -> double for the matrix and both vectors
+	for (size_t r=0; r<LU->size1; ++r) {
+		for (size_t c=0; c<LU->size2; ++c) {
+			gsl_matrix_set (lu_work, r, c, gsl_matrix_float_get(LU, r, c));
+		}
+	}
+	for (size_t k=0; k<b->size; ++k) {
+		gsl_vector_set (rhs_work, k, gsl_vector_float_get(b, k));
+	}
+	for (size_t k=0; k<x->size; ++k) {
+		gsl_vector_set (sol_work, k, gsl_vector_float_get(x, k));
+	}
+
+	gsl_linalg_LU_solve (lu_work, p, rhs_work, sol_work);
+
+	// double -> float for the solution
+	for (size_t k=0; k<x->size; ++k) {
+		gsl_vector_float_set (x, k, gsl_vector_get(sol_work, k));
+	}
+
+	gsl_matrix_free (lu_work);
+	gsl_vector_free (rhs_work);
+	gsl_vector_free (sol_work);
+}
+
+
diff --git a/src/lapack.h b/src/lapack.h
new file mode 100644
index 0000000..cb7b156
--- /dev/null
+++ b/src/lapack.h
@@ -0,0 +1,53 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __LAPACK_H__
+#define __LAPACK_H__
+
+// Declarations below use size_t and the GSL vector/matrix/permutation types,
+// so pull them in here instead of depending on the includer's include order.
+#include <cstddef>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_permutation.h"
+
+using namespace std;
+
+
+// Thin wrappers around LAPACK/BLAS routines operating on GSL containers.
+void lapack_float_cholesky_decomp (gsl_matrix_float *A);
+void lapack_cholesky_decomp (gsl_matrix *A);
+void lapack_float_cholesky_solve (gsl_matrix_float *A, const gsl_vector_float *b, gsl_vector_float *x);
+void lapack_cholesky_solve (gsl_matrix *A, const gsl_vector *b, gsl_vector *x);
+void lapack_sgemm (char *TransA, char *TransB, float alpha, const gsl_matrix_float *A, const gsl_matrix_float *B, float beta, gsl_matrix_float *C);
+void lapack_dgemm (char *TransA, char *TransB, double alpha, const gsl_matrix *A, const gsl_matrix *B, double beta, gsl_matrix *C);
+void lapack_float_eigen_symmv (gsl_matrix_float *A, gsl_vector_float *eval, gsl_matrix_float *evec, const size_t flag_largematrix);
+void lapack_eigen_symmv (gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec, const size_t flag_largematrix);
+
+// Eigen-decomposition; returns the mean of the eigenvalues.
+double EigenDecomp (gsl_matrix *G, gsl_matrix *U, gsl_vector *eval, const size_t flag_largematrix);
+double EigenDecomp (gsl_matrix_float *G, gsl_matrix_float *U, gsl_vector_float *eval, const size_t flag_largematrix);
+
+// Cholesky factorise-and-solve; returns 2*sum(log(diag of the factor)).
+double CholeskySolve(gsl_matrix *Omega, gsl_vector *Xty, gsl_vector *OiXty);
+double CholeskySolve(gsl_matrix_float *Omega, gsl_vector_float *Xty, gsl_vector_float *OiXty);
+
+// LU helpers; the float overloads go through a double-precision copy.
+void LUDecomp (gsl_matrix *LU, gsl_permutation *p, int *signum);
+void LUDecomp (gsl_matrix_float *LU, gsl_permutation *p, int *signum);
+void LUInvert (const gsl_matrix *LU, const gsl_permutation *p, gsl_matrix *inverse);
+void LUInvert (const gsl_matrix_float *LU, const gsl_permutation *p, gsl_matrix_float *inverse);
+double LULndet (gsl_matrix *LU);
+double LULndet (gsl_matrix_float *LU);
+void LUSolve (const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x);
+void LUSolve (const gsl_matrix_float *LU, const gsl_permutation *p, const gsl_vector_float *b, gsl_vector_float *x);
+#endif
+
+
+
diff --git a/src/lm.cpp b/src/lm.cpp
new file mode 100644
index 0000000..7577d0a
--- /dev/null
+++ b/src/lm.cpp
@@ -0,0 +1,572 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <bitset>
+#include <cstring>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_roots.h"
+#include "gsl/gsl_min.h"
+#include "gsl/gsl_integration.h"
+
+#include "gzstream.h"
+#include "lapack.h"
+
+#ifdef FORCE_FLOAT
+#include "lm_float.h"
+#else
+#include "lm.h"
+#endif
+
+
+using namespace std;
+
+
+
+
+
+// Initialises this LM object from the shared parameter object: copies the
+// mode, file names, sample/SNP/gene counts and indicator vectors from cPar,
+// and resets the local timer (time_opt) and tested-gene counter (ng_test).
+void LM::CopyFromParam (PARAM &cPar) 
+{
+	// analysis mode and progress-display pace
+	a_mode = cPar.a_mode;
+	d_pace = cPar.d_pace;
+
+	// file names and output location
+	file_bfile = cPar.file_bfile;
+	file_geno  = cPar.file_geno;
+	file_gene  = cPar.file_gene;
+	file_out   = cPar.file_out;
+	path_out   = cPar.path_out;
+
+	// counts taken over from cPar
+	ni_total = cPar.ni_total;
+	ni_test  = cPar.ni_test;
+	ns_total = cPar.ns_total;
+	ns_test  = cPar.ns_test;
+	n_cvt    = cPar.n_cvt;
+	ng_total = cPar.ng_total;
+
+	// inclusion indicators and per-SNP information
+	indicator_idv = cPar.indicator_idv;
+	indicator_snp = cPar.indicator_snp;
+	snpInfo       = cPar.snpInfo;
+
+	// values that start fresh for every LM run
+	time_opt = 0.0;
+	ng_test  = 0;
+}
+
+
+// Reports results back to the shared parameter object: the accumulated
+// time_opt and the ng_test counter are copied into cPar.
+void LM::CopyToParam (PARAM &cPar) 
+{
+	cPar.ng_test  = ng_test;
+	cPar.time_opt = time_opt;
+}
+
+
+
+// Writes the association results in sumStat to <path_out>/<file_out>.assoc.txt
+// as a tab-separated table.
+//   - If file_gene is non-empty, each row is a gene: its id (taken from
+//     snpInfo[t].rs_number) followed by the statistics.
+//   - Otherwise each row is a SNP with indicator_snp[i]!=0: chr, rs, ps,
+//     n_miss, allele1, allele0, af, followed by the statistics.
+// The statistic columns depend on a_mode: 51 -> beta/se/p_wald, 52 -> p_lrt,
+// 53 -> beta/se/p_score, 54 -> beta/se and all three p-values; any other mode
+// writes no statistic columns.
+// If the file cannot be opened a message is printed and nothing is written.
+void LM::WriteFiles () 
+{
+	string file_str;
+	file_str=path_out+"/"+file_out;
+	file_str+=".assoc.txt";
+
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+
+	if (!file_gene.empty()) {
+		outfile<<"geneID"<<"\t";
+
+		if (a_mode==51) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl;
+		} else if (a_mode==52) {
+			outfile<<"p_lrt"<<endl;
+		} else if (a_mode==53) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl;
+		} else if (a_mode==54) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
+		} else {}
+
+		for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) {
+			outfile<<snpInfo[t].rs_number<<"\t";
+
+			if (a_mode==51) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl;
+			} else if (a_mode==52) {
+				// The separator after the gene id has already been written
+				// above; a second tab here would shift p_lrt out of its column.
+				outfile<<scientific<<setprecision(6)<<sumStat[t].p_lrt<<endl;
+			} else if (a_mode==53) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_score<<endl;
+			} else if (a_mode==54) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl;
+			} else {}
+		}
+	}  else {
+		outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t";
+
+		if (a_mode==51) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl;
+		} else if (a_mode==52) {
+			outfile<<"p_lrt"<<endl;
+		} else if (a_mode==53) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl;
+		} else if (a_mode==54) {
+			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
+		} else {}
+
+		// t indexes sumStat, which only has entries for SNPs that were kept;
+		// i walks over every SNP in snpInfo.
+		size_t t=0;
+		for (size_t i=0; i<snpInfo.size(); ++i) {
+			if (indicator_snp[i]==0) {continue;}
+
+			outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t";
+
+			if (a_mode==51) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl;
+			} else if (a_mode==52) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].p_lrt<<endl;
+			} else if (a_mode==53) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_score<<endl;
+			} else if (a_mode==54) {
+				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl;
+			} else {}
+			t++;
+		}
+	}
+
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+
+
+
+// Computes two quadratic forms from precomputed cross-products:
+//   xPwx = x'x - (WtWi*Wtx)'Wtx
+//   xPwy = x'y - (WtWi*Wtx)'Wty
+// WtWi, Wty, Wtx, y and x are only read; the results are returned through the
+// two reference arguments.
+void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *Wtx, const gsl_vector *y, const gsl_vector *x,  double &xPwy, double &xPwx)
+{
+	// proj = WtWi * Wtx, shared by both correction terms
+	gsl_vector *proj=gsl_vector_alloc (Wty->size);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, proj);
+
+	double corr_xx, corr_xy;
+	gsl_blas_ddot (proj, Wtx, &corr_xx);
+	gsl_blas_ddot (proj, Wty, &corr_xy);
+
+	gsl_blas_ddot (x, x, &xPwx);
+	xPwx-=corr_xx;
+
+	gsl_blas_ddot (x, y, &xPwy);
+	xPwy-=corr_xy;
+
+	gsl_vector_free (proj);
+}
+
+
+// Computes the quadratic form yPwy = y'y - (WtWi*Wty)'Wty from precomputed
+// cross-products.  The inputs are only read; the result is returned through
+// the reference argument.
+void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, double &yPwy)
+{
+	// proj = WtWi * Wty
+	gsl_vector *proj=gsl_vector_alloc (Wty->size);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, proj);
+
+	double correction;
+	gsl_blas_ddot (proj, Wty, &correction);
+
+	gsl_blas_ddot (y, y, &yPwy);
+	yPwy-=correction;
+
+	gsl_vector_free (proj);
+}
+
+
+
+//calculate p values and beta/se in a linear model
+void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, const double xPwx, const double df, const size_t n_size, double &beta, double &se, double &p_wald, double &p_lrt, double &p_score)
+{
+	double yPxy=yPwy-xPwy*xPwy/xPwx;
+	double se_wald, se_score;
+	
+	beta=xPwy/xPwx;
+	se_wald=sqrt(yPxy/(df*xPwx) );
+	se_score=sqrt(yPwy/((double)n_size*xPwx) );
+	
+	p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df);
+	p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df);
+	p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1);
+	
+	if (test_mode==3) {se=se_score;} else {se=se_wald;}
+	
+	return;
+}
+
+
+
+
+// Runs the linear-model tests for every gene row of file_gene against the
+// fixed predictor x, with covariates W.
+// The first line of the file is skipped as a header.  Each of the following
+// ng_total lines is tokenised on space, comma and tab: the first token is the
+// gene id, then one value per entry of indicator_idv; values whose indicator
+// is 0 are skipped, the rest fill the phenotype vector y.  For each row the
+// statistics from LmCalcP are appended to sumStat and the computation time is
+// added to time_opt (in minutes).
+// If the file cannot be opened a message is printed and nothing is done.  If a
+// row has no gene id or is missing a value that is needed, an error is printed
+// and processing stops at that row; rows already analysed stay in sumStat.
+void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) 
+{
+	ifstream infile (file_gene.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading gene expression file:"<<file_gene<<endl; return;}
+
+	clock_t time_start=clock();
+
+	string line;
+	char *ch_ptr;
+
+	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	int c_phen;
+	double d;
+
+	//calculate some basic quantities
+	double yPwy, xPwy, xPwx;
+	double df=(double)W->size1-(double)W->size2-1.0;
+
+	gsl_vector *y=gsl_vector_alloc (W->size1);
+
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *Wty=gsl_vector_alloc (W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+
+	//WtWi = (W'W)^-1 via LU
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+
+	//x is the same for every gene, so xPwx is computed once
+	gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx);
+	CalcvPv(WtWi, Wtx, x, xPwx);
+
+	//header
+	getline(infile, line);
+
+	for (size_t t=0; t<ng_total; t++) {
+		getline(infile, line);
+		if (t%d_pace==0 || t==ng_total-1) {ProgressBar ("Performing Analysis ", t, ng_total-1);}
+
+		//first token is the gene id; strtok returns NULL on an empty line
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		bool bad_line=(ch_ptr==NULL);
+
+		c_phen=0;
+		for (size_t i=0; !bad_line && i<indicator_idv.size(); ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==0) {continue;}
+
+			//a value that is needed is missing: do not hand NULL to atof
+			if (ch_ptr==NULL) {bad_line=true; break;}
+
+			d=atof(ch_ptr);
+			gsl_vector_set(y, c_phen, d);
+
+			c_phen++;
+		}
+
+		if (bad_line) {
+			cout<<endl<<"error! too few fields in gene expression file:"<<file_gene<<endl;
+			break;
+		}
+
+		//calculate statistics
+		time_start=clock();
+
+		//arguments are passed with the roles of x and y swapped, so the two
+		//outputs are x'P y and y'P y for the current gene
+		gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
+		CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy);
+		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
+
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+		//store summary data
+		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+	}
+	cout<<endl;
+
+	gsl_vector_free(y);
+
+	gsl_matrix_free(WtW);
+	gsl_matrix_free(WtWi);
+	gsl_vector_free(Wty);
+	gsl_vector_free(Wtx);
+	gsl_permutation_free(pmt);
+
+	infile.close();
+	infile.clear();
+
+	return;
+}
+//LM::AnalyzeBimbam: reads file_geno (through igzstream) one line per entry of indicator_snp and tests each SNP whose indicator is non-zero as predictor x of phenotype y with covariates W.
+//For every tested SNP it calls LmCalcP with mode a_mode-50 and appends one SUMSTAT {beta, se, 0, 0, p_wald, p_lrt, p_score} to sumStat; time spent on the statistics is added to time_opt (in minutes).
+//Genotypes written as "NA" are replaced by the mean of the observed genotypes; when that mean exceeds 1 every genotype g is recoded as 2-g.
+//NOTE(review): strtok results are never checked for NULL (a short line would crash in strcmp/atof) and ni_test==n_miss would divide by zero -- confirm that the input is validated upstream.
+void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y)
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+	
+	clock_t time_start=clock();
+	
+	string line;
+	char *ch_ptr;
+	
+	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	int n_miss, c_phen;
+	double geno, x_mean;
+	
+	//calculate some basic quantities
+	double yPwy, xPwy, xPwx;
+	double df=(double)W->size1-(double)W->size2-1.0;
+
+	gsl_vector *x=gsl_vector_alloc (W->size1);	//genotypes of the current SNP
+	gsl_vector *x_miss=gsl_vector_alloc (W->size1);	//1.0 where the genotype was observed, 0.0 where it was "NA"
+
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);		
+	gsl_vector *Wty=gsl_vector_alloc (W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+	//WtWi = inverse of W^T W, obtained through an LU decomposition
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+	//Wty and yPwy do not depend on the SNP, so they are computed once
+	gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
+	CalcvPv(WtWi, Wty, y, yPwy);
+	
+	//start reading genotypes and analyze	
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		//if (t>1) {break;}
+		getline(infile, line);	//the line is consumed even for skipped SNPs so the file stays in step with t
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		if (indicator_snp[t]==0) {continue;}
+		//skip the first three fields of the line (presumably SNP id and the two alleles -- TODO confirm)
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		
+		x_mean=0.0; c_phen=0; n_miss=0;
+		gsl_vector_set_zero(x_miss);
+		for (size_t i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, " , \t");	//one field per individual, read even when the individual is skipped
+			if (indicator_idv[i]==0) {continue;}
+			
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
+			else {
+				geno=atof(ch_ptr); 				
+				
+				gsl_vector_set(x, c_phen, geno); 
+				gsl_vector_set(x_miss, c_phen, 1.0); 
+				x_mean+=geno;
+			}
+			c_phen++;
+		}	
+		//mean over the observed genotypes only
+		x_mean/=(double)(ni_test-n_miss);
+		//impute missing genotypes with the mean, then recode as 2-g when the mean exceeds 1
+		for (size_t i=0; i<ni_test; ++i) {
+			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
+			geno=gsl_vector_get(x, i);
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}		
+		
+		//calculate statistics		
+		time_start=clock();		
+
+		gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);		
+		CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
+		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	//clock ticks converted to minutes
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+	}	
+	cout<<endl;
+
+	gsl_vector_free(x);
+	gsl_vector_free(x_miss);
+
+	gsl_matrix_free(WtW);
+	gsl_matrix_free(WtWi);
+	gsl_vector_free(Wty);
+	gsl_vector_free(Wtx);
+	gsl_permutation_free(pmt);
+	
+	infile.close();
+	infile.clear();
+	
+	return;
+}
+
+
+
+
+//LM::AnalyzePlink: reads the binary genotype file file_bfile+".bed" and tests each SNP with indicator_snp[t]!=0 as predictor x of phenotype y with covariates W.
+//For every tested SNP it calls LmCalcP with mode a_mode-50 and appends one SUMSTAT {beta, se, 0, 0, p_wald, p_lrt, p_score} to sumStat; time spent on the statistics is added to time_opt (in minutes).
+//Missing genotypes are replaced by the mean of the observed ones; when that mean exceeds 1 every genotype g is recoded as 2-g. NOTE(review): read failures are not checked and ni_test==n_miss would divide by zero -- confirm upstream validation.
+void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) 
+{
+	string file_bed=file_bfile+".bed";
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+	
+	clock_t time_start=clock();
+	
+	char ch[1];
+	bitset<8> b;	
+	
+	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	int n_bit, n_miss, ci_total, ci_test;
+	double geno, x_mean;
+		
+	//calculate some basic quantities
+	double yPwy, xPwy, xPwx;
+	double df=(double)W->size1-(double)W->size2-1.0;
+
+	gsl_vector *x=gsl_vector_alloc (W->size1);
+
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);	
+	gsl_vector *Wty=gsl_vector_alloc (W->size2);
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+	//WtWi = inverse of W^T W, obtained through an LU decomposition
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+	//Wty and yPwy do not depend on the SNP, so they are computed once
+	gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
+	CalcvPv(WtWi, Wty, y, yPwy);
+		
+	//calculate n_bit, the number of bytes stored per SNP (four genotypes per byte, rounded up)
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1; }
+	
+	//read past the first three magic-number bytes (their values are not checked)
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+	
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+		
+		//read genotypes
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; 	//ci_total counts all individuals seen, ci_test only those kept
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}	//the last byte may hold fewer than four individuals
+				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
+				//bit pair (b[2j], b[2j+1]): 00 -> 2, 01 -> 1, 11 -> 0, 10 -> missing (stored as -9)
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
+					else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
+					else {gsl_vector_set(x, ci_test, -9); n_miss++; }
+				}
+				
+				ci_total++;
+				ci_test++;
+			}
+		}
+		//mean over the observed genotypes only
+		x_mean/=(double)(ni_test-n_miss);
+		//impute missing genotypes with the mean, then recode as 2-g when the mean exceeds 1
+		for (size_t i=0; i<ni_test; ++i) {			
+			geno=gsl_vector_get(x,i);
+			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}
+		
+		//calculate statistics		
+		time_start=clock();	
+		
+		gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx);
+		CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);		
+		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);    
+
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	//clock ticks converted to minutes
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+	}	
+	cout<<endl;
+	
+	gsl_vector_free(x);
+
+	gsl_matrix_free(WtW);
+	gsl_matrix_free(WtWi);	
+	gsl_vector_free(Wty);
+	gsl_vector_free(Wtx);
+	gsl_permutation_free(pmt);
+	
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+
+//Precondition: y and every column of X have already been centered.
+//For each column j of X, appends the pair (j, log_lr) to pos_loglr, where
+//log_lr = 0.5*n*(log(y'y) - log(y'y - (x'y)^2/(x'x))) and n is the length of y.
+//Existing entries of pos_loglr are left in place.
+void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr) 
+{
+	//y'y is shared by all columns
+	double ss_y;
+	gsl_blas_ddot(y, y, &ss_y);
+
+	const size_t n_col=X->size2;
+	for (size_t j=0; j<n_col; ++j) {
+		double ss_x, s_xy;
+		gsl_vector_const_view col_j=gsl_matrix_const_column (X, j);
+		gsl_blas_ddot(&col_j.vector, &col_j.vector, &ss_x);
+		gsl_blas_ddot(&col_j.vector, y, &s_xy);
+
+		const double lr_j=0.5*(double)y->size*(log(ss_y)-log(ss_y-s_xy*s_xy/ss_x));
+		pos_loglr.push_back(make_pair(j,lr_j) );
+	}
+}
diff --git a/src/lm.h b/src/lm.h
new file mode 100644
index 0000000..ceec060
--- /dev/null
+++ b/src/lm.h
@@ -0,0 +1,75 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LM_H__                
+#define __LM_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+class LM {
+	// Linear-model association analysis: holds the settings copied from PARAM, the input file names and the per-SNP/per-gene results.
+public:
+	// IO related parameters
+	int a_mode;				//analysis mode, 50+1/2/3/4 for Frequentist tests
+	size_t d_pace;		//display pace
+	
+	string file_bfile;	//prefix of the PLINK input; AnalyzePlink opens file_bfile+".bed"
+	string file_geno;	//genotype text file read by AnalyzeBimbam
+	string file_out;	//presumably the output file prefix -- TODO confirm in WriteFiles
+	string path_out;	//presumably the output directory -- TODO confirm in WriteFiles
+	
+	string file_gene;	//gene expression file read by AnalyzeGene
+	
+	// Summary statistics
+	size_t ni_total, ni_test;	//number of individuals
+	size_t ns_total, ns_test;	//number of snps
+	size_t ng_total, ng_test;	//number of genes
+	size_t n_cvt;	//presumably the number of covariates -- TODO confirm
+	double time_opt;		//time spent
+	
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	
+	// Not included in PARAM
+	vector<SUMSTAT> sumStat;		//Output SNPSummary Data
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void AnalyzeGene (const gsl_matrix *W, const gsl_vector *x);	//one test per gene record of file_gene, with x as the fixed predictor
+	void AnalyzePlink (const gsl_matrix *W, const gsl_vector *y);	//one test per kept SNP of the PLINK .bed file
+	void AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y);	//one test per kept SNP of file_geno
+	void WriteFiles ();
+};
+void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr);	//appends (column index, log likelihood ratio) for every column of X; X and y must already be centered
+#endif
diff --git a/src/lmm.cpp b/src/lmm.cpp
new file mode 100644
index 0000000..e0b4160
--- /dev/null
+++ b/src/lmm.cpp
@@ -0,0 +1,1771 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <bitset>
+#include <cstring>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_roots.h"
+#include "gsl/gsl_min.h"
+#include "gsl/gsl_integration.h"
+
+#include "io.h"
+#include "lapack.h"
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "lmm_float.h"
+#else
+#include "lmm.h"
+#endif
+
+
+using namespace std;
+
+
+
+
+
+//Initializes this object from cPar: copies the analysis settings, the file
+//names, l_min/l_max/n_region, the null-model values l_mle_null/logl_mle_H0,
+//the individual/SNP/gene counts and the indicator and SNP annotation vectors.
+//The timers time_UtX and time_opt and the counter ng_test start at zero.
+void LMM::CopyFromParam (PARAM &cPar) 
+{
+	//values owned by this object start from zero
+	this->time_UtX=0.0;
+	this->time_opt=0.0;
+	this->ng_test=0;
+
+	//analysis mode and display pace
+	this->a_mode=cPar.a_mode;
+	this->d_pace=cPar.d_pace;
+
+	//input and output locations
+	this->file_bfile=cPar.file_bfile;
+	this->file_geno=cPar.file_geno;
+	this->file_gene=cPar.file_gene;
+	this->file_out=cPar.file_out;
+	this->path_out=cPar.path_out;
+
+	//search settings and null-model values
+	this->l_min=cPar.l_min;
+	this->l_max=cPar.l_max;
+	this->n_region=cPar.n_region;
+	this->l_mle_null=cPar.l_mle_null;
+	this->logl_mle_H0=cPar.logl_mle_H0;
+
+	//counts
+	this->ni_total=cPar.ni_total;
+	this->ni_test=cPar.ni_test;
+	this->ns_total=cPar.ns_total;
+	this->ns_test=cPar.ns_test;
+	this->ng_total=cPar.ng_total;
+	this->n_cvt=cPar.n_cvt;
+
+	//per-individual and per-SNP records
+	this->indicator_idv=cPar.indicator_idv;
+	this->indicator_snp=cPar.indicator_snp;
+	this->snpInfo=cPar.snpInfo;
+}
+
+
+//Reports results back to cPar: the two timers and the gene counter.
+void LMM::CopyToParam (PARAM &cPar) 
+{
+	cPar.ng_test=this->ng_test;
+	cPar.time_opt=this->time_opt;
+	cPar.time_UtX=this->time_UtX;
+}
+
+
+
+//Writes the statistic columns of record t for the given analysis mode and ends
+//the line. For a mode outside 1-4 nothing is written, not even the line end.
+static void WriteAssocStats (ofstream &out, const int mode, const vector<SUMSTAT> &stats, const size_t t)
+{
+	switch (mode) {
+	case 1:
+		out<<scientific<<setprecision(6)<<stats[t].beta<<"\t"<<stats[t].se<<"\t"<<stats[t].lambda_remle<<"\t"<<stats[t].p_wald <<endl;
+		break;
+	case 2:
+		out<<scientific<<setprecision(6)<<stats[t].lambda_mle<<"\t"<<stats[t].p_lrt<<endl;
+		break;
+	case 3:
+		out<<scientific<<setprecision(6)<<stats[t].beta<<"\t"<<stats[t].se<<"\t"<<stats[t].p_score<<endl;
+		break;
+	case 4:
+		out<<scientific<<setprecision(6)<<stats[t].beta<<"\t"<<stats[t].se<<"\t"<<stats[t].lambda_remle<<"\t"<<stats[t].lambda_mle<<"\t"<<stats[t].p_wald <<"\t"<<stats[t].p_lrt<<"\t"<<stats[t].p_score<<endl;
+		break;
+	default:
+		break;
+	}
+}
+
+//Writes the association results to path_out/file_out.assoc.txt as a
+//tab-separated table. When file_gene is set, each row is a gene id followed by
+//the statistics; otherwise each row describes one SNP with indicator_snp!=0
+//(chr, rs, ps, n_miss, allele1, allele0, af) followed by the statistics.
+//The statistic columns depend on a_mode (1: Wald, 2: LRT, 3: score, 4: all).
+void LMM::WriteFiles () 
+{
+	const string path_assoc=path_out+"/"+file_out+".assoc.txt";
+
+	ofstream outfile (path_assoc.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<path_assoc.c_str()<<endl; return;}
+
+	const bool gene_mode=!file_gene.empty();
+
+	//identifier columns of the header
+	if (gene_mode) {
+		outfile<<"geneID"<<"\t";
+	} else {
+		outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t";
+	}
+
+	//statistic columns of the header
+	switch (a_mode) {
+	case 1:
+		outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"p_wald"<<endl;
+		break;
+	case 2:
+		outfile<<"l_mle"<<"\t"<<"p_lrt"<<endl;
+		break;
+	case 3:
+		outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl;
+		break;
+	case 4:
+		outfile<<"beta"<<"\t"<<"se"<<"\t"<<"l_remle"<<"\t"<<"l_mle"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
+		break;
+	default:
+		break;
+	}
+
+	if (gene_mode) {
+		//one row per stored result, labelled with snpInfo[t].rs_number
+		for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) {
+			outfile<<snpInfo[t].rs_number<<"\t";
+			WriteAssocStats (outfile, a_mode, sumStat, t);
+		}
+	} else {
+		//sumStat only holds the analyzed SNPs, so it is indexed separately from snpInfo
+		size_t i_stat=0;
+		for (size_t i=0; i<snpInfo.size(); ++i) {
+			if (indicator_snp[i]==0) {continue;}
+
+			outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t";
+			WriteAssocStats (outfile, a_mode, sumStat, i_stat);
+			i_stat++;
+		}
+	}
+
+	outfile.close();
+	outfile.clear();
+}
+
+
+
+
+
+
+
+
+
+
+
+//Maps an unordered pair (a, b), each in 1..n_cvt+2, to the position of that
+//pair in a row-wise packed upper triangle: an index between 0 and
+//[(n_cvt+2)^2+(n_cvt+2)]/2-1. The result is the same for (a, b) and (b, a).
+//An argument outside 1..n_cvt+2 prints an error message and yields 0.
+size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) {
+	const size_t n=n_cvt+2;
+	if (a==0 || b==0 || a>n || b>n) {cout<<"error in GetabIndex."<<endl; return 0;}
+
+	//order the pair so that lo<=hi
+	const size_t lo=(b>a)?a:b;
+	const size_t hi=(b>a)?b:a;
+
+	return (2*n-lo+2)*(lo-1)/2+hi-lo;
+}
+//CalcPab: fills Pab row by row. Row 0 holds, for every pair (a,b), the dot product of Hi_eval with column index_ab of Uab (or ab[index_ab] minus that dot product when e_mode!=0).
+//Row p>=1 is obtained from row p-1 by eliminating variable p: Pab[p][ab] = Pab[p-1][ab] - Pab[p-1][ap]*Pab[p-1][bp]/Pab[p-1][pp]. Only pairs with p<a<=b<=n_cvt+2 are written in row p.
+void CalcPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval, const gsl_matrix *Uab, const gsl_vector *ab, gsl_matrix *Pab)
+{
+	size_t index_ab, index_aw, index_bw, index_ww;
+	double p_ab;
+	double ps_ab, ps_aw, ps_bw, ps_ww;
+	
+	for (size_t p=0; p<=n_cvt+1; ++p) {	//p must run in increasing order: row p reads row p-1
+		for (size_t a=p+1; a<=n_cvt+2; ++a) {
+			for (size_t b=a; b<=n_cvt+2; ++b) {
+				index_ab=GetabIndex (a, b, n_cvt);
+				if (p==0) {			
+					gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab);
+					gsl_blas_ddot (Hi_eval, &Uab_col.vector, &p_ab);
+					if (e_mode!=0) {p_ab=gsl_vector_get (ab, index_ab)-p_ab;}
+					gsl_matrix_set (Pab, 0, index_ab, p_ab);
+				}
+				else {
+					index_aw=GetabIndex (a, p, n_cvt);
+					index_bw=GetabIndex (b, p, n_cvt);
+					index_ww=GetabIndex (p, p, n_cvt);
+					
+					ps_ab=gsl_matrix_get (Pab, p-1, index_ab);
+					ps_aw=gsl_matrix_get (Pab, p-1, index_aw);
+					ps_bw=gsl_matrix_get (Pab, p-1, index_bw);
+					ps_ww=gsl_matrix_get (Pab, p-1, index_ww);
+					
+					p_ab=ps_ab-ps_aw*ps_bw/ps_ww;	//NOTE(review): no guard against ps_ww==0
+					gsl_matrix_set (Pab, p, index_ab, p_ab);
+				}
+			}
+		}
+	}
+	return;
+}
+//CalcPPab: second-order companion of CalcPab; needs the finished Pab as input. Row 0 holds the dot product of HiHi_eval with column index_ab of Uab (adjusted by -ab[index_ab]+2*Pab[0][index_ab] when e_mode!=0).
+//Row p>=1 is built from row p-1 of both Pab and PPab, again eliminating variable p; only pairs with p<a<=b<=n_cvt+2 are written in row p.
+void CalcPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHi_eval, const gsl_matrix *Uab, const gsl_vector *ab, const gsl_matrix *Pab, gsl_matrix *PPab)
+{
+	size_t index_ab, index_aw, index_bw, index_ww;
+	double p2_ab;
+	double ps2_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww;
+	
+	for (size_t p=0; p<=n_cvt+1; ++p) {	//p must run in increasing order: row p reads row p-1
+		for (size_t a=p+1; a<=n_cvt+2; ++a) {
+			for (size_t b=a; b<=n_cvt+2; ++b) {
+				index_ab=GetabIndex (a, b, n_cvt);
+				if (p==0) {					
+					gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab);
+					gsl_blas_ddot (HiHi_eval, &Uab_col.vector, &p2_ab);
+					if (e_mode!=0) {p2_ab=p2_ab-gsl_vector_get (ab, index_ab)+2.0*gsl_matrix_get (Pab, 0, index_ab);}
+					gsl_matrix_set (PPab, 0, index_ab, p2_ab);
+				}
+				else {
+					index_aw=GetabIndex (a, p, n_cvt);
+					index_bw=GetabIndex (b, p, n_cvt);
+					index_ww=GetabIndex (p, p, n_cvt);
+					
+					ps2_ab=gsl_matrix_get (PPab, p-1, index_ab);
+					ps_aw=gsl_matrix_get (Pab, p-1, index_aw);
+					ps_bw=gsl_matrix_get (Pab, p-1, index_bw);
+					ps_ww=gsl_matrix_get (Pab, p-1, index_ww);
+					ps2_aw=gsl_matrix_get (PPab, p-1, index_aw);
+					ps2_bw=gsl_matrix_get (PPab, p-1, index_bw);
+					ps2_ww=gsl_matrix_get (PPab, p-1, index_ww);
+					
+					p2_ab=ps2_ab+ps_aw*ps_bw*ps2_ww/(ps_ww*ps_ww);
+					p2_ab-=(ps_aw*ps2_bw+ps_bw*ps2_aw)/ps_ww;
+					gsl_matrix_set (PPab, p, index_ab, p2_ab);
+					
+				}
+			}
+		}
+	}
+	return;
+}
+//CalcPPPab: third-order companion of CalcPab/CalcPPab; needs the finished Pab and PPab as input. Row 0 holds the dot product of HiHiHi_eval with column index_ab of Uab (adjusted with ab, PPab[0] and Pab[0] when e_mode!=0).
+//Row p>=1 is built from row p-1 of Pab, PPab and PPPab, eliminating variable p; only pairs with p<a<=b<=n_cvt+2 are written in row p.
+void CalcPPPab (const size_t n_cvt, const size_t e_mode, const gsl_vector *HiHiHi_eval, const gsl_matrix *Uab, const gsl_vector *ab, const gsl_matrix *Pab, const gsl_matrix *PPab, gsl_matrix *PPPab)
+{
+	size_t index_ab, index_aw, index_bw, index_ww;
+	double p3_ab;
+	double ps3_ab, ps_aw, ps_bw, ps_ww, ps2_aw, ps2_bw, ps2_ww, ps3_aw, ps3_bw, ps3_ww;
+	
+	for (size_t p=0; p<=n_cvt+1; ++p) {	//p must run in increasing order: row p reads row p-1
+		for (size_t a=p+1; a<=n_cvt+2; ++a) {
+			for (size_t b=a; b<=n_cvt+2; ++b) {
+				index_ab=GetabIndex (a, b, n_cvt);
+				if (p==0) {					
+					gsl_vector_const_view Uab_col=gsl_matrix_const_column (Uab, index_ab);
+					gsl_blas_ddot (HiHiHi_eval, &Uab_col.vector, &p3_ab);
+					if (e_mode!=0) {p3_ab=gsl_vector_get (ab, index_ab)-p3_ab+3.0*gsl_matrix_get (PPab, 0, index_ab)-3.0*gsl_matrix_get (Pab, 0, index_ab);}
+					gsl_matrix_set (PPPab, 0, index_ab, p3_ab);
+				}
+				else {
+					index_aw=GetabIndex (a, p, n_cvt);
+					index_bw=GetabIndex (b, p, n_cvt);
+					index_ww=GetabIndex (p, p, n_cvt);
+					
+					ps3_ab=gsl_matrix_get (PPPab, p-1, index_ab);
+					ps_aw=gsl_matrix_get (Pab, p-1, index_aw);
+					ps_bw=gsl_matrix_get (Pab, p-1, index_bw);
+					ps_ww=gsl_matrix_get (Pab, p-1, index_ww);
+					ps2_aw=gsl_matrix_get (PPab, p-1, index_aw);
+					ps2_bw=gsl_matrix_get (PPab, p-1, index_bw);
+					ps2_ww=gsl_matrix_get (PPab, p-1, index_ww);
+					ps3_aw=gsl_matrix_get (PPPab, p-1, index_aw);
+					ps3_bw=gsl_matrix_get (PPPab, p-1, index_bw);
+					ps3_ww=gsl_matrix_get (PPPab, p-1, index_ww);
+					
+					p3_ab=ps3_ab-ps_aw*ps_bw*ps2_ww*ps2_ww/(ps_ww*ps_ww*ps_ww);
+					p3_ab-=(ps_aw*ps3_bw+ps_bw*ps3_aw+ps2_aw*ps2_bw)/ps_ww;
+					p3_ab+=(ps_aw*ps2_bw*ps2_ww+ps_bw*ps2_aw*ps2_ww+ps_aw*ps_bw*ps3_ww)/(ps_ww*ps_ww);
+					
+					gsl_matrix_set (PPPab, p, index_ab, p3_ab);
+				}
+			}
+		}
+	}
+	return;
+}
+//LogL_f: log-likelihood at l. params must point to a FUNC_PARAM (fields read: n_cvt, ni_test, calc_null, e_mode, eval, Uab, ab).
+//Returns c - 0.5*sum_i log|l*eval_i+1| - 0.5*ni_test*log(P_yy), with c = 0.5*ni_test*(log(ni_test)-log(2*pi)-1) and P_yy taken from row nc_total of Pab.
+//nc_total is n_cvt when calc_null is set and n_cvt+1 otherwise. All temporaries are allocated and freed inside the call.
+double LogL_f (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;
+	size_t n_cvt=p->n_cvt;
+	size_t ni_test=p->ni_test;	
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//number of unordered pairs (a,b) with 1<=a<=b<=n_cvt+2
+	
+	size_t nc_total;
+	if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;}
+	
+	double f=0.0, logdet_h=0.0, d;
+	size_t index_yy;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size);
+	//Hi_eval = 1/(l*eval+1) when e_mode==0, l*eval/(l*eval+1) otherwise; v_temp ends as l*eval+1
+	gsl_vector_memcpy (v_temp, p->eval);
+	gsl_vector_scale (v_temp, l);
+	if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);}
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);	
+	//logdet_h = sum of log|l*eval_i+1|
+	for (size_t i=0; i<(p->eval)->size; ++i) {
+		d=gsl_vector_get (v_temp, i);
+		logdet_h+=log(fabs(d));
+	}	
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);	
+	
+	double c=0.5*(double)ni_test*(log((double)ni_test)-log(2*M_PI)-1.0);
+	
+	index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);	
+	double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	f=c-0.5*logdet_h-0.5*(double)ni_test*log(P_yy);
+	
+	gsl_matrix_free (Pab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (v_temp);
+	return f;
+}
+
+ 
+ 
+//LogL_dev1: first derivative with respect to l of the log-likelihood computed by LogL_f. params must point to a FUNC_PARAM (fields read: n_cvt, ni_test, calc_null, e_mode, eval, Uab, ab).
+//Returns -0.5*trace_HiK + 0.5*ni_test*yPKPy/P_yy, where trace_HiK = (ni_test-trace_Hi)/l and yPKPy = (P_yy-PP_yy)/l.
+//NOTE(review): both terms divide by l, so l==0 is not handled here -- confirm callers never pass it.
+double LogL_dev1 (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;	
+	size_t n_cvt=p->n_cvt;
+	size_t ni_test=p->ni_test;	
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//number of unordered pairs (a,b) with 1<=a<=b<=n_cvt+2
+	
+	size_t nc_total;
+	if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;}
+	
+	double dev1=0.0, trace_Hi=0.0;
+	size_t index_yy;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size);
+	//Hi_eval = 1/(l*eval+1) when e_mode==0, l*eval/(l*eval+1) otherwise
+	gsl_vector_memcpy (v_temp, p->eval);
+	gsl_vector_scale (v_temp, l);
+	if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);}
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);
+	//HiHi_eval = element-wise square of Hi_eval
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);	
+	//v_temp is reused as a vector of ones, so the dot product is the sum of Hi_eval
+	gsl_vector_set_all (v_temp, 1.0);
+	gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi);
+		
+	if (p->e_mode!=0) {trace_Hi=(double)ni_test-trace_Hi;}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);	
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);	
+	
+	double trace_HiK=((double)ni_test-trace_Hi)/l;	
+	
+	index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	
+	double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy);
+	double yPKPy=(P_yy-PP_yy)/l;	
+	dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy;
+			
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (PPab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (v_temp);	
+	
+	return dev1;
+}
+	
+	
+//LogL_dev2: second-derivative companion of LogL_f/LogL_dev1 with respect to l. params must point to a FUNC_PARAM (fields read: n_cvt, ni_test, calc_null, e_mode, eval, Uab, ab).
+//Returns 0.5*trace_HiKHiK - 0.5*ni_test*(2*yPKPKPy*P_yy - yPKPy^2)/P_yy^2; the traces and quadratic forms are divided by l or l*l, so l==0 is not handled here.
+double LogL_dev2 (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;	
+	size_t n_cvt=p->n_cvt;
+	size_t ni_test=p->ni_test;	
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//number of unordered pairs (a,b) with 1<=a<=b<=n_cvt+2
+	
+	size_t nc_total;
+	if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;}
+	
+	double dev2=0.0, trace_Hi=0.0, trace_HiHi=0.0;
+	size_t index_yy;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size);
+	//Hi_eval = 1/(l*eval+1) when e_mode==0, l*eval/(l*eval+1) otherwise
+	gsl_vector_memcpy (v_temp, p->eval);
+	gsl_vector_scale (v_temp, l);
+	if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);}
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);
+	//element-wise second and third powers of Hi_eval
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);	
+	gsl_vector_memcpy (HiHiHi_eval, HiHi_eval);
+	gsl_vector_mul (HiHiHi_eval, Hi_eval);
+	//v_temp is reused as a vector of ones, so the dot products are plain sums
+	gsl_vector_set_all (v_temp, 1.0);
+	gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi);
+	gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi);
+	
+	if (p->e_mode!=0) {		
+		trace_Hi=(double)ni_test-trace_Hi;
+		trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test;	//uses the already converted trace_Hi from the line above
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);	
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);	
+	CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab);	
+	
+	double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l);
+	
+	index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy);
+	double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy);		
+		
+	double yPKPy=(P_yy-PP_yy)/l;
+	double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l);
+		
+	dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy);
+		
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (PPPab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (HiHiHi_eval);
+	gsl_vector_free (v_temp);	
+	
+	return dev2;
+}
+	
+	
+	
+//LogL_dev12: computes in one pass the two values that LogL_dev1 and LogL_dev2 return, using the same expressions, and stores them in *dev1 and *dev2.
+//params must point to a FUNC_PARAM (fields read: n_cvt, ni_test, calc_null, e_mode, eval, Uab, ab). The quantities are divided by l or l*l, so l==0 is not handled here.
+void LogL_dev12 (double l, void *params, double *dev1, double *dev2)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;
+	size_t n_cvt=p->n_cvt;
+	size_t ni_test=p->ni_test;	
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//number of unordered pairs (a,b) with 1<=a<=b<=n_cvt+2
+	
+	size_t nc_total;
+	if (p->calc_null==true) {nc_total=n_cvt;} else {nc_total=n_cvt+1;}
+	
+	double trace_Hi=0.0, trace_HiHi=0.0;
+	size_t index_yy;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size);
+	//Hi_eval = 1/(l*eval+1) when e_mode==0, l*eval/(l*eval+1) otherwise
+	gsl_vector_memcpy (v_temp, p->eval);
+	gsl_vector_scale (v_temp, l);
+	if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);}
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);
+	//element-wise second and third powers of Hi_eval
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);	
+	gsl_vector_memcpy (HiHiHi_eval, HiHi_eval);
+	gsl_vector_mul (HiHiHi_eval, Hi_eval);
+	//v_temp is reused as a vector of ones, so the dot products are plain sums
+	gsl_vector_set_all (v_temp, 1.0);
+	gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi);
+	gsl_blas_ddot (HiHi_eval, v_temp, &trace_HiHi);
+	
+	if (p->e_mode!=0) {	
+		trace_Hi=(double)ni_test-trace_Hi;
+		trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test;	//uses the already converted trace_Hi from the line above
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);	
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);	
+	CalcPPPab (n_cvt, p->e_mode, HiHiHi_eval, p->Uab, p->ab, Pab, PPab, PPPab);	
+	
+	double trace_HiK=((double)ni_test-trace_Hi)/l;
+	double trace_HiKHiK=((double)ni_test+trace_HiHi-2*trace_Hi)/(l*l);
+	
+	index_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	
+	double P_yy=gsl_matrix_get (Pab, nc_total, index_yy);
+	double PP_yy=gsl_matrix_get (PPab, nc_total, index_yy);
+	double PPP_yy=gsl_matrix_get (PPPab, nc_total, index_yy);		
+		
+	double yPKPy=(P_yy-PP_yy)/l;	
+	double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l);
+		
+	*dev1=-0.5*trace_HiK+0.5*(double)ni_test*yPKPy/P_yy;
+	*dev2=0.5*trace_HiKHiK-0.5*(double)ni_test*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy);
+			
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (PPPab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (HiHiHi_eval);
+	gsl_vector_free (v_temp);	
+	
+	return;
+}
+
+//LogRL_f: restricted log-likelihood at l. params must point to a FUNC_PARAM (fields read: n_cvt, ni_test, calc_null, e_mode, eval, Uab, ab).
+//Returns c - 0.5*sum_i log|l*eval_i+1| - 0.5*logdet_hiw - 0.5*df*log(P_yy) with c = 0.5*df*(log(df)-log(2*pi)-1); df is ni_test-n_cvt when calc_null is set and ni_test-n_cvt-1 otherwise.
+double LogRL_f (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;	
+	size_t n_cvt=p->n_cvt;
+	size_t ni_test=p->ni_test;	
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//number of unordered pairs (a,b) with 1<=a<=b<=n_cvt+2
+	
+	double df;
+	size_t nc_total;
+	if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; }
+	else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;}
+	
+	double f=0.0, logdet_h=0.0, logdet_hiw=0.0, d;
+	size_t index_ww;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *Iab=gsl_matrix_alloc (n_cvt+2, n_index);	//same recursion as Pab but with a weight vector of ones
+	gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size);
+	//Hi_eval = 1/(l*eval+1) when e_mode==0, l*eval/(l*eval+1) otherwise; v_temp ends as l*eval+1
+	gsl_vector_memcpy (v_temp, p->eval);
+	gsl_vector_scale (v_temp, l);
+	if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);}
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);	
+	//logdet_h = sum of log|l*eval_i+1|
+	for (size_t i=0; i<(p->eval)->size; ++i) {
+		d=gsl_vector_get (v_temp, i);
+		logdet_h+=log(fabs(d));
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);	
+	gsl_vector_set_all (v_temp, 1.0);
+	CalcPab (n_cvt, p->e_mode, v_temp, p->Uab, p->ab, Iab);	
+	
+	//calculate log|WHiW|-log|WW| as a sum over the diagonal entries (i+1,i+1) of rows i<nc_total
+	logdet_hiw=0.0;
+	for (size_t i=0; i<nc_total; ++i) {
+		index_ww=GetabIndex (i+1, i+1, n_cvt);
+		d=gsl_matrix_get (Pab, i, index_ww);
+		logdet_hiw+=log(d);
+		d=gsl_matrix_get (Iab, i, index_ww);
+		logdet_hiw-=log(d);
+	}
+	index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);	
+	double P_yy=gsl_matrix_get (Pab, nc_total, index_ww);
+	
+	double c=0.5*df*(log(df)-log(2*M_PI)-1.0);		
+	f=c-0.5*logdet_h-0.5*logdet_hiw-0.5*df*log(P_yy);
+		
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (Iab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (v_temp);
+	return f;
+}
+
+//LogRL_dev1: first-derivative companion of LogRL_f with respect to l. params must point to a FUNC_PARAM (fields read: n_cvt, ni_test, calc_null, e_mode, eval, Uab, ab).
+//Returns -0.5*trace_PK + 0.5*df*yPKPy/P_yy with trace_PK = (df-trace_P)/l and yPKPy = (P_yy-PP_yy)/l; df is ni_test-n_cvt when calc_null is set and ni_test-n_cvt-1 otherwise. l==0 is not handled here.
+double LogRL_dev1 (double l, void *params)
+{
+	FUNC_PARAM *p=(FUNC_PARAM *) params;	
+	size_t n_cvt=p->n_cvt;
+	size_t ni_test=p->ni_test;	
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;	//number of unordered pairs (a,b) with 1<=a<=b<=n_cvt+2
+	
+	double df;
+	size_t nc_total;
+	if (p->calc_null==true) {nc_total=n_cvt; df=(double)ni_test-(double)n_cvt; }
+	else {nc_total=n_cvt+1; df=(double)ni_test-(double)n_cvt-1.0;}
+	
+	double dev1=0.0, trace_Hi=0.0;
+	size_t index_ww;
+	
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *HiHi_eval=gsl_vector_alloc((p->eval)->size);
+	gsl_vector *v_temp=gsl_vector_alloc((p->eval)->size);
+	//Hi_eval = 1/(l*eval+1) when e_mode==0, l*eval/(l*eval+1) otherwise
+	gsl_vector_memcpy (v_temp, p->eval);
+	gsl_vector_scale (v_temp, l);
+	if (p->e_mode==0) {gsl_vector_set_all (Hi_eval, 1.0);} else {gsl_vector_memcpy (Hi_eval, v_temp);}
+	gsl_vector_add_constant (v_temp, 1.0);
+	gsl_vector_div (Hi_eval, v_temp);
+	//HiHi_eval = element-wise square of Hi_eval
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);	
+	//v_temp is reused as a vector of ones, so the dot product is the sum of Hi_eval
+	gsl_vector_set_all (v_temp, 1.0);
+	gsl_blas_ddot (Hi_eval, v_temp, &trace_Hi);
+	
+	if (p->e_mode!=0) {	
+		trace_Hi=(double)ni_test-trace_Hi;
+	}
+	
+	CalcPab (n_cvt, p->e_mode, Hi_eval, p->Uab, p->ab, Pab);	
+	CalcPPab (n_cvt, p->e_mode, HiHi_eval, p->Uab, p->ab, Pab, PPab);	
+	
+	//calculate trace_P (trace_Hi corrected by the first nc_total diagonal terms) and from it trace_PK
+	double trace_P=trace_Hi;
+	double ps_ww, ps2_ww;
+	for (size_t i=0; i<nc_total; ++i) {
+		index_ww=GetabIndex (i+1, i+1, n_cvt);
+		ps_ww=gsl_matrix_get (Pab, i, index_ww);
+		ps2_ww=gsl_matrix_get (PPab, i, index_ww);
+		trace_P-=ps2_ww/ps_ww;
+	}
+	double trace_PK=(df-trace_P)/l;
+	
+	//calculate yPKPy
+	index_ww=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	double P_yy=gsl_matrix_get (Pab, nc_total, index_ww);
+	double PP_yy=gsl_matrix_get (PPab, nc_total, index_ww);		
+	double yPKPy=(P_yy-PP_yy)/l;	
+	
+	dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy;	
+			
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (PPab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (v_temp);	
+	
+	return dev1;
+}
+
+
+
+
+/*
+ Second derivative of the log-restricted likelihood with respect to lambda.
+
+ l      : value of lambda at which to evaluate
+ params : pointer to a FUNC_PARAM, passed as void* so that the function can
+          be handed to a GSL solver as a callback
+
+ Returns 0.5*trace_PKPK - 0.5*df*(2*yPKPKPy*P_yy - yPKPy^2)/P_yy^2, where
+ every term is read from the Pab/PPab/PPPab tables built below.
+ */
+double LogRL_dev2 (double l, void *params)
+{
+	const FUNC_PARAM *fp=static_cast<const FUNC_PARAM *>(params);
+	const size_t n_cvt=fp->n_cvt;
+	const size_t ni_test=fp->ni_test;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=(fp->eval)->size;
+
+	//the null model projects out the covariates only; otherwise slot
+	//n_cvt+1 is projected out as well and one more df is lost
+	size_t nc_total=n_cvt+1;
+	double df=(double)ni_test-(double)n_cvt-1.0;
+	if (fp->calc_null==true) {
+		nc_total=n_cvt;
+		df=(double)ni_test-(double)n_cvt;
+	}
+
+	gsl_vector *Hi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *HiHi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *HiHiHi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *scratch=gsl_vector_alloc (n_eval);
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index);
+
+	//Hi_eval is 1/(1+l*eval) when e_mode==0 and l*eval/(1+l*eval) otherwise
+	gsl_vector_memcpy (scratch, fp->eval);
+	gsl_vector_scale (scratch, l);
+	if (fp->e_mode!=0) {
+		gsl_vector_memcpy (Hi_eval, scratch);
+	} else {
+		gsl_vector_set_all (Hi_eval, 1.0);
+	}
+	gsl_vector_add_constant (scratch, 1.0);
+	gsl_vector_div (Hi_eval, scratch);
+
+	//element-wise second and third powers of Hi_eval
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);
+	gsl_vector_memcpy (HiHiHi_eval, HiHi_eval);
+	gsl_vector_mul (HiHiHi_eval, Hi_eval);
+
+	//sums of the entries, obtained as dot products with a vector of ones
+	double trace_Hi=0.0, trace_HiHi=0.0;
+	gsl_vector_set_all (scratch, 1.0);
+	gsl_blas_ddot (Hi_eval, scratch, &trace_Hi);
+	gsl_blas_ddot (HiHi_eval, scratch, &trace_HiHi);
+
+	if (fp->e_mode!=0) {
+		trace_Hi=(double)ni_test-trace_Hi;
+		trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test;
+	}
+
+	CalcPab (n_cvt, fp->e_mode, Hi_eval, fp->Uab, fp->ab, Pab);
+	CalcPPab (n_cvt, fp->e_mode, HiHi_eval, fp->Uab, fp->ab, Pab, PPab);
+	CalcPPPab (n_cvt, fp->e_mode, HiHiHi_eval, fp->Uab, fp->ab, Pab, PPab, PPPab);
+
+	//calculate tracePK and trace PKPK: correct the traces once per projected-out regressor
+	double trace_P=trace_Hi, trace_PP=trace_HiHi;
+	for (size_t w=1; w<=nc_total; ++w) {
+		const size_t col_ww=GetabIndex (w, w, n_cvt);
+		const double p1=gsl_matrix_get (Pab, w-1, col_ww);
+		const double p2=gsl_matrix_get (PPab, w-1, col_ww);
+		const double p3=gsl_matrix_get (PPPab, w-1, col_ww);
+		trace_P-=p2/p1;
+		trace_PP+=p2*p2/(p1*p1)-2.0*p3/p1;
+	}
+	const double trace_PKPK=(df+trace_PP-2.0*trace_P)/(l*l);
+
+	//calculate yPKPy, yPKPKPy
+	const size_t col_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, nc_total, col_yy);
+	const double PP_yy=gsl_matrix_get (PPab, nc_total, col_yy);
+	const double PPP_yy=gsl_matrix_get (PPPab, nc_total, col_yy);
+	const double yPKPy=(P_yy-PP_yy)/l;
+	const double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l);
+
+	const double dev2=0.5*trace_PKPK-0.5*df*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy);
+
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (PPPab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (HiHiHi_eval);
+	gsl_vector_free (scratch);
+
+	return dev2;
+}
+	
+
+
+
+/*
+ First and second derivatives of the log-restricted likelihood with respect
+ to lambda, computed together from one set of Pab/PPab/PPPab tables.
+
+ l      : value of lambda at which to evaluate
+ params : pointer to a FUNC_PARAM, passed as void* so that the function can
+          be handed to a GSL solver as a callback
+ dev1   : receives -0.5*trace_PK + 0.5*df*yPKPy/P_yy
+ dev2   : receives 0.5*trace_PKPK - 0.5*df*(2*yPKPKPy*P_yy - yPKPy^2)/P_yy^2
+ */
+void LogRL_dev12 (double l, void *params, double *dev1, double *dev2)
+{
+	const FUNC_PARAM *fp=static_cast<const FUNC_PARAM *>(params);
+	const size_t n_cvt=fp->n_cvt;
+	const size_t ni_test=fp->ni_test;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=(fp->eval)->size;
+
+	//the null model projects out the covariates only; otherwise slot
+	//n_cvt+1 is projected out as well and one more df is lost
+	size_t nc_total=n_cvt+1;
+	double df=(double)ni_test-(double)n_cvt-1.0;
+	if (fp->calc_null==true) {
+		nc_total=n_cvt;
+		df=(double)ni_test-(double)n_cvt;
+	}
+
+	gsl_vector *Hi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *HiHi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *HiHiHi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *scratch=gsl_vector_alloc (n_eval);
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_matrix *PPPab=gsl_matrix_alloc (n_cvt+2, n_index);
+
+	//Hi_eval is 1/(1+l*eval) when e_mode==0 and l*eval/(1+l*eval) otherwise
+	gsl_vector_memcpy (scratch, fp->eval);
+	gsl_vector_scale (scratch, l);
+	if (fp->e_mode!=0) {
+		gsl_vector_memcpy (Hi_eval, scratch);
+	} else {
+		gsl_vector_set_all (Hi_eval, 1.0);
+	}
+	gsl_vector_add_constant (scratch, 1.0);
+	gsl_vector_div (Hi_eval, scratch);
+
+	//element-wise second and third powers of Hi_eval
+	gsl_vector_memcpy (HiHi_eval, Hi_eval);
+	gsl_vector_mul (HiHi_eval, Hi_eval);
+	gsl_vector_memcpy (HiHiHi_eval, HiHi_eval);
+	gsl_vector_mul (HiHiHi_eval, Hi_eval);
+
+	//sums of the entries, obtained as dot products with a vector of ones
+	double trace_Hi=0.0, trace_HiHi=0.0;
+	gsl_vector_set_all (scratch, 1.0);
+	gsl_blas_ddot (Hi_eval, scratch, &trace_Hi);
+	gsl_blas_ddot (HiHi_eval, scratch, &trace_HiHi);
+
+	if (fp->e_mode!=0) {
+		trace_Hi=(double)ni_test-trace_Hi;
+		trace_HiHi=2*trace_Hi+trace_HiHi-(double)ni_test;
+	}
+
+	CalcPab (n_cvt, fp->e_mode, Hi_eval, fp->Uab, fp->ab, Pab);
+	CalcPPab (n_cvt, fp->e_mode, HiHi_eval, fp->Uab, fp->ab, Pab, PPab);
+	CalcPPPab (n_cvt, fp->e_mode, HiHiHi_eval, fp->Uab, fp->ab, Pab, PPab, PPPab);
+
+	//calculate tracePK and trace PKPK: correct the traces once per projected-out regressor
+	double trace_P=trace_Hi, trace_PP=trace_HiHi;
+	for (size_t w=1; w<=nc_total; ++w) {
+		const size_t col_ww=GetabIndex (w, w, n_cvt);
+		const double p1=gsl_matrix_get (Pab, w-1, col_ww);
+		const double p2=gsl_matrix_get (PPab, w-1, col_ww);
+		const double p3=gsl_matrix_get (PPPab, w-1, col_ww);
+		trace_P-=p2/p1;
+		trace_PP+=p2*p2/(p1*p1)-2.0*p3/p1;
+	}
+	const double trace_PK=(df-trace_P)/l;
+	const double trace_PKPK=(df+trace_PP-2.0*trace_P)/(l*l);
+
+	//calculate yPKPy, yPKPKPy
+	const size_t col_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, nc_total, col_yy);
+	const double PP_yy=gsl_matrix_get (PPab, nc_total, col_yy);
+	const double PPP_yy=gsl_matrix_get (PPPab, nc_total, col_yy);
+	const double yPKPy=(P_yy-PP_yy)/l;
+	const double yPKPKPy=(P_yy+PPP_yy-2.0*PP_yy)/(l*l);
+
+	*dev1=-0.5*trace_PK+0.5*df*yPKPy/P_yy;
+	*dev2=0.5*trace_PKPK-0.5*df*(2.0*yPKPKPy*P_yy-yPKPy*yPKPy)/(P_yy*P_yy);
+
+	gsl_matrix_free (Pab);
+	gsl_matrix_free (PPab);
+	gsl_matrix_free (PPPab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (HiHi_eval);
+	gsl_vector_free (HiHiHi_eval);
+	gsl_vector_free (scratch);
+}
+	
+
+
+
+
+
+
+
+/*
+ Wald test for the variable stored in slot n_cvt+1 of params.Uab, at a given
+ lambda.
+
+ l      : lambda at which the statistic is evaluated
+ params : eval / Uab / ab / e_mode are read from here
+ beta   : receives P_xy/P_xx
+ se     : receives sqrt(1/(tau*P_xx)), with tau=df/Px_yy
+ p_wald : receives the upper tail of F(1, df) at (P_yy-Px_yy)*tau
+ */
+void LMM::CalcRLWald (const double &l, const FUNC_PARAM &params, double &beta, double &se, double &p_wald)
+{
+	const size_t n_cvt=params.n_cvt;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=params.eval->size;
+
+	//ni_test is the LMM member here, not params.ni_test
+	const int df=(int)ni_test-(int)n_cvt-1;
+
+	gsl_vector *Hi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *scratch=gsl_vector_alloc (n_eval);
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+
+	//Hi_eval is 1/(1+l*eval) when e_mode==0 and l*eval/(1+l*eval) otherwise
+	gsl_vector_memcpy (scratch, params.eval);
+	gsl_vector_scale (scratch, l);
+	if (params.e_mode!=0) {
+		gsl_vector_memcpy (Hi_eval, scratch);
+	} else {
+		gsl_vector_set_all (Hi_eval, 1.0);
+	}
+	gsl_vector_add_constant (scratch, 1.0);
+	gsl_vector_div (Hi_eval, scratch);
+
+	CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab);
+
+	const size_t col_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const size_t col_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt);
+	const size_t col_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt);
+
+	//row n_cvt: covariates projected out; row n_cvt+1: slot n_cvt+1 projected out too
+	const double P_yy=gsl_matrix_get (Pab, n_cvt, col_yy);
+	const double P_xx=gsl_matrix_get (Pab, n_cvt, col_xx);
+	const double P_xy=gsl_matrix_get (Pab, n_cvt, col_xy);
+	const double Px_yy=gsl_matrix_get (Pab, n_cvt+1, col_yy);
+
+	const double tau=(double)df/Px_yy;
+	const double f_stat=(P_yy-Px_yy)*tau;
+
+	beta=P_xy/P_xx;
+	se=sqrt(1.0/(tau*P_xx));
+	//F(1, df) reference distribution (a chi-square(1) variant was left disabled by the author)
+	p_wald=gsl_cdf_fdist_Q (f_stat, 1.0, df);
+
+	gsl_matrix_free (Pab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (scratch);
+}
+
+
+/*
+ Score test for the variable stored in slot n_cvt+1 of params.Uab, at a given
+ lambda (the callers in this file pass a lambda estimated under the null).
+
+ l       : lambda at which the statistic is evaluated
+ params  : eval / Uab / ab / e_mode are read from here
+ beta    : receives P_xy/P_xx
+ se      : receives sqrt(1/(tau*P_xx)), with tau=df/Px_yy
+ p_score : receives the upper tail of F(1, df) at ni_test*P_xy^2/(P_yy*P_xx)
+ */
+void LMM::CalcRLScore (const double &l, const FUNC_PARAM &params, double &beta, double &se, double &p_score)
+{
+	const size_t n_cvt=params.n_cvt;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=params.eval->size;
+
+	//ni_test is the LMM member here, not params.ni_test
+	const int df=(int)ni_test-(int)n_cvt-1;
+
+	gsl_vector *Hi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *scratch=gsl_vector_alloc (n_eval);
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+
+	//Hi_eval is 1/(1+l*eval) when e_mode==0 and l*eval/(1+l*eval) otherwise
+	gsl_vector_memcpy (scratch, params.eval);
+	gsl_vector_scale (scratch, l);
+	if (params.e_mode!=0) {
+		gsl_vector_memcpy (Hi_eval, scratch);
+	} else {
+		gsl_vector_set_all (Hi_eval, 1.0);
+	}
+	gsl_vector_add_constant (scratch, 1.0);
+	gsl_vector_div (Hi_eval, scratch);
+
+	CalcPab (n_cvt, params.e_mode, Hi_eval, params.Uab, params.ab, Pab);
+
+	const size_t col_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const size_t col_xx=GetabIndex (n_cvt+1, n_cvt+1, n_cvt);
+	const size_t col_xy=GetabIndex (n_cvt+2, n_cvt+1, n_cvt);
+
+	//row n_cvt: covariates projected out; row n_cvt+1: slot n_cvt+1 projected out too
+	const double P_yy=gsl_matrix_get (Pab, n_cvt, col_yy);
+	const double P_xx=gsl_matrix_get (Pab, n_cvt, col_xx);
+	const double P_xy=gsl_matrix_get (Pab, n_cvt, col_xy);
+	const double Px_yy=gsl_matrix_get (Pab, n_cvt+1, col_yy);
+
+	const double tau=(double)df/Px_yy;
+	const double score_stat=(double)ni_test*P_xy*P_xy/(P_yy*P_xx);
+
+	beta=P_xy/P_xx;
+	se=sqrt(1.0/(tau*P_xx));
+	//F(1, df) reference distribution (a chi-square(1) variant was left disabled by the author)
+	p_score=gsl_cdf_fdist_Q (score_stat, 1.0, df);
+
+	gsl_matrix_free (Pab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (scratch);
+}
+
+
+
+
+
+
+
+
+/*
+ Fill the columns of Uab that do not involve slot n_cvt+1.
+
+ Slots 1..n_cvt are the columns of UtW and slot n_cvt+2 is Uty. For every
+ pair a>=b of those slots, column GetabIndex(a,b,n_cvt) of Uab is set to the
+ element-wise product of the two vectors. Columns involving slot n_cvt+1 are
+ left untouched.
+ */
+void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) 
+{
+	const size_t n_cvt=UtW->size2;
+	const size_t slot_x=n_cvt+1;	//skipped by this overload
+	const size_t slot_y=n_cvt+2;
+
+	gsl_vector *left=gsl_vector_alloc (Uty->size);
+
+	for (size_t a=1; a<=slot_y; ++a) {
+		if (a==slot_x) {continue;}
+
+		//left-hand factor for this a
+		if (a==slot_y) {
+			gsl_vector_memcpy (left, Uty);
+		} else {
+			gsl_vector_const_view col_a=gsl_matrix_const_column (UtW, a-1);
+			gsl_vector_memcpy (left, &col_a.vector);
+		}
+
+		//each b writes its own column, so the visiting order is irrelevant
+		for (size_t b=1; b<=a; ++b) {
+			if (b==slot_x) {continue;}
+
+			gsl_vector_view target=gsl_matrix_column (Uab, GetabIndex (a, b, n_cvt));
+
+			if (b==slot_y) {
+				gsl_vector_memcpy (&target.vector, Uty);
+			} else {
+				gsl_vector_const_view col_b=gsl_matrix_const_column (UtW, b-1);
+				gsl_vector_memcpy (&target.vector, &col_b.vector);
+			}
+
+			gsl_vector_mul (&target.vector, left);
+		}
+	}
+
+	gsl_vector_free (left);
+}
+
+
+/*
+ Fill the columns of Uab that involve slot n_cvt+1 (Utx).
+
+ For every slot b in 1..n_cvt+2, column GetabIndex(n_cvt+1,b,n_cvt) of Uab is
+ set to the element-wise product of Utx with the vector in slot b (a column
+ of UtW, Utx itself, or Uty). All other columns are left untouched.
+ */
+void CalcUab (const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_vector *Utx, gsl_matrix *Uab) 
+{
+	const size_t n_cvt=UtW->size2;
+	const size_t slot_x=n_cvt+1;
+	const size_t slot_y=n_cvt+2;
+
+	for (size_t b=1; b<=slot_y; ++b) {
+		gsl_vector_view target=gsl_matrix_column (Uab, GetabIndex (slot_x, b, n_cvt));
+
+		//load the vector of slot b, then multiply by Utx in place
+		if (b<=n_cvt) {
+			gsl_vector_const_view col_b=gsl_matrix_const_column (UtW, b-1);
+			gsl_vector_memcpy (&target.vector, &col_b.vector);
+		} else if (b==slot_x) {
+			gsl_vector_memcpy (&target.vector, Utx);
+		} else {
+			gsl_vector_memcpy (&target.vector, Uty);
+		}
+
+		gsl_vector_mul (&target.vector, Utx);
+	}
+}
+
+
+
+/*
+ Fill the entries of ab that do not involve slot n_cvt+1.
+
+ Slots 1..n_cvt are the columns of W and slot n_cvt+2 is y. For every pair
+ a>=b of those slots, ab[GetabIndex(a,b,n_cvt)] is set to the dot product of
+ the two vectors. Entries involving slot n_cvt+1 are left untouched.
+ */
+void Calcab (const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) 
+{
+	const size_t n_cvt=W->size2;
+	const size_t slot_x=n_cvt+1;	//skipped by this overload
+	const size_t slot_y=n_cvt+2;
+
+	gsl_vector *left=gsl_vector_alloc (y->size);
+	gsl_vector *right=gsl_vector_alloc (y->size);
+
+	for (size_t a=1; a<=slot_y; ++a) {
+		if (a==slot_x) {continue;}
+
+		if (a==slot_y) {
+			gsl_vector_memcpy (left, y);
+		} else {
+			gsl_vector_const_view col_a=gsl_matrix_const_column (W, a-1);
+			gsl_vector_memcpy (left, &col_a.vector);
+		}
+
+		//each b writes its own entry, so the visiting order is irrelevant
+		for (size_t b=1; b<=a; ++b) {
+			if (b==slot_x) {continue;}
+
+			if (b==slot_y) {
+				gsl_vector_memcpy (right, y);
+			} else {
+				gsl_vector_const_view col_b=gsl_matrix_const_column (W, b-1);
+				gsl_vector_memcpy (right, &col_b.vector);
+			}
+
+			double dot;
+			gsl_blas_ddot (left, right, &dot);
+			gsl_vector_set (ab, GetabIndex (a, b, n_cvt), dot);
+		}
+	}
+
+	gsl_vector_free (left);
+	gsl_vector_free (right);
+}
+
+
+/*
+ Fill the entries of ab that involve slot n_cvt+1 (x).
+
+ For every slot b in 1..n_cvt+2, ab[GetabIndex(n_cvt+1,b,n_cvt)] is set to
+ the dot product of x with the vector in slot b (a column of W, x itself,
+ or y). All other entries are left untouched.
+ */
+void Calcab (const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, gsl_vector *ab) 
+{
+	const size_t n_cvt=W->size2;
+	const size_t slot_x=n_cvt+1;
+	const size_t slot_y=n_cvt+2;
+
+	gsl_vector *right=gsl_vector_alloc (y->size);
+
+	for (size_t b=1; b<=slot_y; ++b) {
+		//load the vector of slot b into the work buffer
+		if (b<=n_cvt) {
+			gsl_vector_const_view col_b=gsl_matrix_const_column (W, b-1);
+			gsl_vector_memcpy (right, &col_b.vector);
+		} else if (b==slot_x) {
+			gsl_vector_memcpy (right, x);
+		} else {
+			gsl_vector_memcpy (right, y);
+		}
+
+		double dot;
+		gsl_blas_ddot (x, right, &dot);
+		gsl_vector_set (ab, GetabIndex (slot_x, b, n_cvt), dot);
+	}
+
+	gsl_vector_free (right);
+}
+
+
+
+
+
+/*
+ Association tests of one fixed predictor against every record of the gene
+ expression file file_gene.
+
+ The first line of the file is skipped as a header. Each of the following
+ ng_total lines holds an id followed by one value per entry of
+ indicator_idv; values of individuals whose indicator is 0 are skipped. For
+ each record the values are rotated by U, lambda is estimated under the null
+ when a_mode is 2/3/4, the tests selected by a_mode are run (1 Wald,
+ 2 likelihood ratio, 3 score, 4 all three) and one SUMSTAT is appended to
+ sumStat.
+
+ U    : matrix applied (transposed) to the values of each record
+ eval : vector handed to the likelihood functions as FUNC_PARAM::eval
+ UtW  : rotated covariates
+ Utx  : rotated predictor
+ W, x : unused
+
+ A record with too few fields is reported on cout and stops the analysis;
+ records processed before it stay in sumStat.
+ */
+void LMM::AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) 
+{
+	ifstream infile (file_gene.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading gene expression file:"<<file_gene<<endl; return;}
+	
+	clock_t time_start=clock();
+	
+	string line;
+	vector<char> line_buf;	//writable, NUL-terminated copy of line: strtok modifies its input
+	char *ch_ptr;
+	bool line_ok=true;
+	
+	double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	double logl_H1=0.0, logl_H0=0.0, l_H0=0.0;
+	int c_phen;
+	string rs; //gene id
+	double d;
+	
+	//Calculate basic quantities
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	
+	gsl_vector *y=gsl_vector_alloc (U->size1);
+	gsl_vector *Uty=gsl_vector_alloc (U->size2);
+	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+		
+	//header
+	getline(infile, line);
+	
+	for (size_t t=0; t<ng_total; t++) {
+		safeGetline(infile, line);
+		if (t%d_pace==0 || t==ng_total-1) {ProgressBar ("Performing Analysis ", t, ng_total-1);}
+		
+		//tokenise a private copy instead of writing through line.c_str()
+		line_buf.assign (line.begin(), line.end());
+		line_buf.push_back ('\0');
+		
+		ch_ptr=strtok (&line_buf[0], " , \t");
+		if (ch_ptr==NULL) {
+			//empty line, e.g. the file holds fewer than ng_total records
+			line_ok=false;
+		}
+		else {
+			rs=ch_ptr;
+			
+			c_phen=0; 
+			for (size_t i=0; i<indicator_idv.size(); ++i) {
+				ch_ptr=strtok (NULL, " , \t");
+				if (indicator_idv[i]==0) {continue;}
+				
+				//a missing field for an included individual cannot be parsed
+				if (ch_ptr==NULL) {line_ok=false; break;}
+				
+				d=atof(ch_ptr); 			
+				gsl_vector_set(y, c_phen, d);
+				
+				c_phen++;
+			}
+		}
+		
+		if (!line_ok) {
+			cout<<endl<<"error reading gene expression file:"<<file_gene<<": too few fields in record "<<t+1<<endl;
+			break;
+		}
+		
+		time_start=clock();
+		gsl_blas_dgemv (CblasTrans, 1.0, U, y, 0.0, Uty);		
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	
+		//calculate null
+		time_start=clock();
+		
+		gsl_matrix_set_zero (Uab);
+		
+		CalcUab (UtW, Uty, Uab);
+		//NOTE(review): the other null-model call sites in this file build
+		//their FUNC_PARAM with calc_null=true - confirm false is intended here
+		FUNC_PARAM param0={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		
+		if (a_mode==2 || a_mode==3 || a_mode==4) {
+			CalcLambda('L', param0, l_min, l_max, n_region, l_H0, logl_H0);
+		}
+		
+		//calculate alternative
+		CalcUab(UtW, Uty, Utx, Uab);
+		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		
+		//3 is before 1: both write beta and se, so the Wald values win in mode 4
+		if (a_mode==3 || a_mode==4) {
+			CalcRLScore (l_H0, param1, beta, se, p_score);
+		}
+		
+		if (a_mode==1 || a_mode==4) {
+			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		}
+		
+		if (a_mode==2 || a_mode==4) {
+			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), 1);	
+		}
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+    }
+	cout<<endl;
+	
+	gsl_vector_free (y);
+	gsl_vector_free (Uty);
+	gsl_matrix_free (Uab);
+	gsl_vector_free (ab);
+	
+	infile.close();
+	infile.clear();
+	
+	return;
+}
+
+
+
+
+
+/*
+ Association tests of every analysed SNP in the BIMBAM mean-genotype file
+ file_geno against one phenotype.
+
+ One line is read per entry of indicator_snp; lines whose indicator is 0 are
+ consumed but not analysed. On each analysed line the first three fields are
+ skipped and one genotype per individual follows ("NA" for missing). Missing
+ genotypes are replaced by the mean of the observed ones, and when that mean
+ exceeds 1 the genotypes are recoded as 2-geno and the sign of beta is
+ flipped back before it is stored. The tests selected by a_mode are run
+ (1 Wald, 2 likelihood ratio, 3 score, 4 all three) and one SUMSTAT per
+ analysed SNP is appended to sumStat.
+
+ U    : matrix applied (transposed) to each genotype vector
+ eval : vector handed to the likelihood functions as FUNC_PARAM::eval
+ UtW  : rotated covariates
+ Uty  : rotated phenotype
+ W, y : unused (only referenced by disabled code)
+
+ A line on which the genotype of an included individual is absent is
+ reported on cout and stops the analysis; SNPs processed before it stay in
+ sumStat.
+ */
+void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+
+	clock_t time_start=clock();
+	
+	string line;
+	vector<char> line_buf;	//writable, NUL-terminated copy of line: strtok modifies its input
+	char *ch_ptr;
+	bool line_ok=true;
+	
+	double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	double logl_H1=0.0;
+	int n_miss, c_phen;
+	double geno, x_mean;
+	
+	//Calculate basic quantities
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+
+	gsl_vector *x=gsl_vector_alloc (U->size1);
+	gsl_vector *x_miss=gsl_vector_alloc (U->size1);
+	gsl_vector *Utx=gsl_vector_alloc (U->size2);
+	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+	
+	//columns that do not involve the SNP are the same for every SNP
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+//	if (e_mode!=0) {
+//		gsl_vector_set_zero (ab);
+//		Calcab (W, y, ab);
+//	}	
+	
+	//start reading genotypes and analyze	
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+//		if (t>1) {break;}
+		safeGetline(infile, line);
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		//tokenise a private copy instead of writing through line.c_str()
+		line_buf.assign (line.begin(), line.end());
+		line_buf.push_back ('\0');
+		
+		//skip the three leading fields
+		ch_ptr=strtok (&line_buf[0], " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");		
+		
+		x_mean=0.0; c_phen=0; n_miss=0;
+		gsl_vector_set_zero(x_miss);
+		for (size_t i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==0) {continue;}
+			
+			//a missing field for an included individual cannot be parsed
+			if (ch_ptr==NULL) {line_ok=false; break;}
+			
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
+			else {
+				geno=atof(ch_ptr); 				
+				
+				gsl_vector_set(x, c_phen, geno); 
+				gsl_vector_set(x_miss, c_phen, 1.0); 
+				x_mean+=geno;
+			}
+			c_phen++;
+		}	
+		
+		if (!line_ok) {
+			cout<<endl<<"error reading genotype file:"<<file_geno<<": too few fields on line "<<t+1<<endl;
+			break;
+		}
+		
+		x_mean/=(double)(ni_test-n_miss);
+		
+		//impute missing genotypes with the mean; recode as 2-geno when the mean exceeds 1
+		for (size_t i=0; i<ni_test; ++i) {
+			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
+			geno=gsl_vector_get(x, i);
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}
+		
+		
+		//calculate statistics
+		time_start=clock();
+		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx);		
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		CalcUab(UtW, Uty, Utx, Uab);
+//		if (e_mode!=0) {
+//			Calcab (W, y, x, ab);
+//		}
+		
+		time_start=clock();
+		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		
+		//3 is before 1: both write beta and se, so the Wald values win in mode 4
+		if (a_mode==3 || a_mode==4) {
+			CalcRLScore (l_mle_null, param1, beta, se, p_score);
+		}
+		
+		if (a_mode==1 || a_mode==4) {
+			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);	
+			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		}
+		
+		if (a_mode==2 || a_mode==4) {
+			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);	
+		}			
+		
+		//undo the 2-geno recoding in the reported effect
+		if (x_mean>1) {beta*=-1;}
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+    }	
+	cout<<endl;
+	
+	gsl_vector_free (x);
+	gsl_vector_free (x_miss);
+	gsl_vector_free (Utx);
+	gsl_matrix_free (Uab);
+	gsl_vector_free (ab);
+	
+	infile.close();
+	infile.clear();
+	
+	return;
+}
+
+
+
+
+
+
+
+/*
+ Association tests of every analysed SNP in the PLINK binary genotype file
+ file_bfile+".bed" against one phenotype.
+
+ For each entry of snpInfo whose indicator_snp is non-zero, the genotypes are
+ decoded from the bed file (four individuals per byte), missing genotypes are
+ replaced by the mean of the observed ones, and when that mean exceeds 1 the
+ genotypes are recoded as 2-geno and the sign of beta is flipped back before
+ it is stored. The tests selected by a_mode are run (1 Wald, 2 likelihood
+ ratio, 3 score, 4 all three) and one SUMSTAT per analysed SNP is appended
+ to sumStat.
+
+ U    : matrix applied (transposed) to each genotype vector
+ eval : vector handed to the likelihood functions as FUNC_PARAM::eval
+ UtW  : rotated covariates
+ Uty  : rotated phenotype
+ W, y : unused (only referenced by disabled code)
+ */
+void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y) 
+{
+	string file_bed=file_bfile+".bed";
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+	
+	clock_t time_start=clock();
+	
+	char ch[1];
+	bitset<8> b;	
+	
+	double lambda_mle=0, lambda_remle=0, beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
+	double logl_H1=0.0;
+	int n_bit, n_miss, ci_total, ci_test;
+	double geno, x_mean;
+		
+	//Calculate basic quantities
+	size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+
+	gsl_vector *x=gsl_vector_alloc (U->size1);
+	gsl_vector *Utx=gsl_vector_alloc (U->size2);
+	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);	
+	gsl_vector *ab=gsl_vector_alloc (n_index);	
+	
+	//columns that do not involve the SNP are the same for every SNP
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+//	if (e_mode!=0) {
+//		gsl_vector_set_zero (ab);
+//		Calcab (W, y, ab);
+//	}
+		
+	//calculate n_bit, the number of bytes used by each snp (four individuals per byte, rounded up)
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1; }
+
+	//read past the first three magic numbers; their values are not checked
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+	
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+		
+		//read genotypes
+		//ci_total counts every individual in the file, ci_test only those kept for analysis
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; 
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				//the last byte may be only partly used
+				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}
+				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
+
+				//bit pair (b[2j], b[2j+1]): 00 -> 2, 01 -> 1, 11 -> 0, 10 -> missing (-9)
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
+					else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
+					else {gsl_vector_set(x, ci_test, -9); n_miss++; }
+				}
+
+				ci_total++;
+				ci_test++;
+			}
+		}
+		
+		x_mean/=(double)(ni_test-n_miss);
+				
+		//impute missing genotypes with the mean; recode as 2-geno when the mean exceeds 1
+		for (size_t i=0; i<ni_test; ++i) {			
+			geno=gsl_vector_get(x,i);
+			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}
+		
+		//calculate statistics
+		time_start=clock();
+		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx);
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		CalcUab(UtW, Uty, Utx, Uab);
+//		if (e_mode!=0) {
+//			Calcab (W, y, x, ab);
+//		}
+		
+		time_start=clock();
+		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		
+		//3 is before 1, for beta: both write beta and se, so the Wald values win in mode 4
+		if (a_mode==3 || a_mode==4) {
+			CalcRLScore (l_mle_null, param1, beta, se, p_score);
+		}
+		
+		if (a_mode==1 || a_mode==4) {
+			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);	
+			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		}
+		
+		if (a_mode==2 || a_mode==4) {
+			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);	
+		}		
+		
+		//undo the 2-geno recoding in the reported effect
+		if (x_mean>1) {beta*=-1;}		
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		sumStat.push_back(SNPs);
+    }	
+	cout<<endl;
+	
+	gsl_vector_free (x);
+	gsl_vector_free (Utx);
+	gsl_matrix_free (Uab);
+	gsl_vector_free (ab);
+	
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+
+
+
+/*
+ Log-likelihood ratio of every column of UtX against an intercept-only null.
+
+ The single covariate is a vector of ones rotated by U. Lambda is estimated
+ by maximum likelihood once under the null and once per column of UtX; for
+ column i the pair (i, logl_H1-logl_H0) is appended to pos_loglr.
+
+ U      : matrix applied (transposed) to the vector of ones
+ UtX    : rotated predictors, one per column
+ Uty    : rotated phenotype
+ K_eval : vector handed to the likelihood functions as FUNC_PARAM::eval
+ l_min, l_max, n_region : search range and grid size passed to CalcLambda
+ */
+void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector<pair<size_t, double> > &pos_loglr) 
+{
+	const size_t n_obs=Uty->size;
+
+	//with one covariate there are (1+2+1)*(1+2)/2 = 6 slot pairs
+	gsl_matrix *Uab=gsl_matrix_alloc (n_obs, 6);
+	gsl_vector *ab=gsl_vector_alloc (6);
+	gsl_vector_set_zero (ab);
+
+	//intercept: Utw = U^T 1
+	gsl_vector *ones=gsl_vector_alloc (n_obs);
+	gsl_matrix *Utw=gsl_matrix_alloc (n_obs, 1);
+	gsl_vector_set_all (ones, 1.0);
+	gsl_vector_view Utw_col=gsl_matrix_column (Utw, 0);
+	gsl_blas_dgemv (CblasTrans, 1.0, U, ones, 0.0, &Utw_col.vector);
+
+	//null model
+	double lambda0, logl_H0;
+	CalcUab (Utw, Uty, Uab);
+	FUNC_PARAM param0={true, Uty->size, 1, K_eval, Uab, ab, 0};
+	CalcLambda ('L', param0, l_min, l_max, n_region, lambda0, logl_H0);
+
+	//one alternative model per column of UtX
+	const size_t n_pred=UtX->size2;
+	for (size_t j=0; j<n_pred; ++j) {
+		double lambda1, logl_H1;
+
+		gsl_vector_const_view x_col=gsl_matrix_const_column (UtX, j);
+		CalcUab (Utw, Uty, &x_col.vector, Uab);
+		FUNC_PARAM param1={false, UtX->size1, 1, K_eval, Uab, ab, 0};
+
+		CalcLambda ('L', param1, l_min, l_max, n_region, lambda1, logl_H1);
+
+		const double log_lr=logl_H1-logl_H0;
+		pos_loglr.push_back (make_pair (j, log_lr));
+	}
+
+	gsl_vector_free (ones);
+	gsl_matrix_free (Utw);
+	gsl_matrix_free (Uab);
+	gsl_vector_free (ab);
+}
+
+
+
+
+/*
+ Find the lambda in [l_min, l_max] that maximises the log-likelihood or the
+ log-restricted likelihood described by params.
+
+ func_name : 'L'/'l' for the log-likelihood, 'R'/'r' for the log-restricted
+             likelihood; anything else prints a message and returns without
+             touching lambda or logf
+ params    : passed unchanged to the likelihood and derivative functions
+ l_min, l_max : search range
+ n_region  : number of log-spaced sub-intervals of the range that are
+             scanned for a sign change of the first derivative
+ lambda    : receives the maximiser
+ logf      : receives the function value at lambda
+
+ Each sub-interval on which the first derivative changes sign is bracketed
+ with Brent's method and refined with Newton's method; the best of the
+ resulting candidates and the two end points of the range is returned. If
+ no sub-interval shows a sign change, the better end point is returned.
+ */
+void CalcLambda (const char func_name, FUNC_PARAM &params, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf)
+{
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;}
+	
+	const bool use_reml=(func_name=='R' || func_name=='r');
+	
+	vector<pair<double, double> > lambda_lh;
+	
+	//evaluate first order derivatives in different intervals
+	double lambda_l=0.0, lambda_h=0.0, lambda_interval=log(l_max/l_min)/(double)n_region;
+	double dev1_l=0.0, dev1_h=0.0, logf_l, logf_h;
+	
+	//the upper end of one interval is the lower end of the next, so every
+	//grid point is evaluated once and carried over to the next iteration
+	if (n_region>0) {
+		lambda_h=l_min*exp(lambda_interval*0.0);
+		dev1_h=use_reml ? LogRL_dev1 (lambda_h, &params) : LogL_dev1 (lambda_h, &params);
+	}
+	
+	for (size_t i=0; i<n_region; ++i) {
+		lambda_l=lambda_h;
+		dev1_l=dev1_h;
+		
+		lambda_h=l_min*exp(lambda_interval*(i+1.0));
+		dev1_h=use_reml ? LogRL_dev1 (lambda_h, &params) : LogL_dev1 (lambda_h, &params);
+		
+		if (dev1_l*dev1_h<=0) {
+			lambda_lh.push_back(make_pair(lambda_l, lambda_h));
+		}
+	}
+	
+	//if derivatives do not change signs in any interval
+	if (lambda_lh.empty()) {
+		if (use_reml) {
+			logf_l=LogRL_f (l_min, &params);
+			logf_h=LogRL_f (l_max, &params);
+		}
+		else {
+			logf_l=LogL_f (l_min, &params);
+			logf_h=LogL_f (l_max, &params);
+		}
+		
+		if (logf_l>=logf_h) {lambda=l_min; logf=logf_l;} else {lambda=l_max; logf=logf_h;}
+	}
+	else {
+		//if derivatives change signs
+		int status;
+		int iter=0, max_iter=100;
+		double l, l_temp;	
+		
+		gsl_function F;
+		gsl_function_fdf FDF;
+		
+		F.params=&params;
+		FDF.params=&params;
+		
+		if (use_reml) {
+			F.function=&LogRL_dev1;
+			FDF.f=&LogRL_dev1;
+			FDF.df=&LogRL_dev2;
+			FDF.fdf=&LogRL_dev12;
+		}
+		else {
+			F.function=&LogL_dev1;
+			FDF.f=&LogL_dev1;
+			FDF.df=&LogL_dev2;
+			FDF.fdf=&LogL_dev12;
+		}
+		
+		const gsl_root_fsolver_type *T_f;
+		gsl_root_fsolver *s_f;
+		T_f=gsl_root_fsolver_brent;
+		s_f=gsl_root_fsolver_alloc (T_f);
+		
+		const gsl_root_fdfsolver_type *T_fdf;
+		gsl_root_fdfsolver *s_fdf;
+		T_fdf=gsl_root_fdfsolver_newton;
+		s_fdf=gsl_root_fdfsolver_alloc(T_fdf);	
+		
+		for (vector<double>::size_type i=0; i<lambda_lh.size(); ++i) {
+			lambda_l=lambda_lh[i].first; lambda_h=lambda_lh[i].second;
+			
+			gsl_root_fsolver_set (s_f, &F, lambda_l, lambda_h);
+			
+			//every interval gets the full iteration budget; without this
+			//reset the count left over from the previous Newton run would
+			//shorten the bracketing of the second and later intervals
+			iter=0;
+			
+			//coarse bracketing: stop once the interval is within 10% relative width
+			do {
+				iter++;
+				status=gsl_root_fsolver_iterate (s_f);
+				l=gsl_root_fsolver_root (s_f);
+				lambda_l=gsl_root_fsolver_x_lower (s_f);
+				lambda_h=gsl_root_fsolver_x_upper (s_f);
+				status=gsl_root_test_interval (lambda_l, lambda_h, 0, 1e-1);		
+			}
+			while (status==GSL_CONTINUE && iter<max_iter); 				
+			
+			iter=0;
+			
+			gsl_root_fdfsolver_set (s_fdf, &FDF, l);	
+			
+			//refinement: stop on a relative step below 1e-5, on the
+			//iteration limit, or as soon as an iterate leaves (l_min, l_max)
+			do {
+				iter++;
+				status=gsl_root_fdfsolver_iterate (s_fdf);
+				l_temp=l;
+				l=gsl_root_fdfsolver_root (s_fdf);
+				status=gsl_root_test_delta (l, l_temp, 0, 1e-5);		
+			}
+			while (status==GSL_CONTINUE && iter<max_iter && l>l_min && l<l_max); 
+			
+			//keep the iterate before the last step, which is the last one
+			//known to lie inside the search range
+			l=l_temp;
+			if (l<l_min) {l=l_min;}
+			if (l>l_max) {l=l_max;}
+			if (use_reml) {logf_l=LogRL_f (l, &params);} else {logf_l=LogL_f (l, &params);}			
+			
+			if (i==0) {logf=logf_l; lambda=l;}
+			else if (logf<logf_l) {logf=logf_l; lambda=l;}
+			else {}
+		}
+		gsl_root_fsolver_free (s_f);	
+		gsl_root_fdfsolver_free (s_fdf);		
+		
+		//the end points of the range may still beat every interior candidate
+		if (use_reml) {
+			logf_l=LogRL_f (l_min, &params);
+			logf_h=LogRL_f (l_max, &params);
+		}
+		else {
+			logf_l=LogL_f (l_min, &params);
+			logf_h=LogL_f (l_max, &params);
+		}
+		
+		if (logf_l>logf) {lambda=l_min; logf=logf_l;} 
+		if (logf_h>logf) {lambda=l_max; logf=logf_h;}
+	}
+	
+	return;
+}
+
+
+
+
+
+//calculate lambda in the null model
+//builds the null-model FUNC_PARAM from UtW and Uty and delegates to the
+//FUNC_PARAM overload; lambda and logl_H0 receive its results
+void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logl_H0)
+{
+	const bool known_name=(func_name=='R' || func_name=='L' || func_name=='r' || func_name=='l');
+	if (!known_name) {
+		cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl;
+		return;
+	}
+
+	const size_t ni_test=UtW->size1;
+	const size_t n_cvt=UtW->size2;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+
+	//ab is only allocated and passed through (e_mode is 0); it is never filled here
+	gsl_vector *ab=gsl_vector_alloc (n_index);
+	gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index);
+
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+
+	FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0};
+	CalcLambda (func_name, param0, l_min, l_max, n_region, lambda, logl_H0);
+
+	gsl_matrix_free (Uab);
+	gsl_vector_free (ab);
+}
+	
+	
+//obtain REMLE estimate for PVE using lambda_remle
+//pve    = trace_G*lambda/(trace_G*lambda+1)
+//pve_se = trace_G/(trace_G*lambda+1)^2 * se, with se=sqrt(-1/LogRL_dev2(lambda))
+//         evaluated for the null model built from UtW and Uty
+void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se)
+{
+	const size_t ni_test=UtW->size1;
+	const size_t n_cvt=UtW->size2;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+
+	//ab is only allocated and passed through (e_mode is 0); it is never filled here
+	gsl_vector *ab=gsl_vector_alloc (n_index);
+	gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index);
+
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+
+	FUNC_PARAM param0={true, ni_test, n_cvt, eval, Uab, ab, 0};
+
+	//standard error of lambda from the curvature of the restricted likelihood
+	const double curvature=LogRL_dev2 (lambda, &param0);
+	const double se=sqrt(-1.0/curvature);
+
+	const double denom=trace_G*lambda+1.0;
+	pve=trace_G*lambda/denom;
+	pve_se=trace_G/(denom*denom)*se;
+
+	gsl_matrix_free (Uab);
+	gsl_vector_free (ab);
+}
+
+//obtain REML estimate for Vg and Ve using lambda_remle
+//obtain beta and se(beta) for coefficients
+//ab is not used when e_mode==0
+//
+//beta    : receives the solution of (W' Hi W) beta = W' Hi y, where Hi is
+//          diagonal with entries 1/(1+lambda*eval)
+//se_beta : receives sqrt of the diagonal of ve*(W' Hi W)^-1
+//ve      : receives P_yy/(ni_test-n_cvt); vg receives ve*lambda
+void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, double &vg, double &ve, gsl_vector *beta, gsl_vector *se_beta)
+{
+	const size_t ni_test=UtW->size1;
+	const size_t n_cvt=UtW->size2;
+	const size_t n_index=(n_cvt+2+1)*(n_cvt+2)/2;
+	const size_t n_eval=eval->size;
+
+	gsl_matrix *Uab=gsl_matrix_alloc (ni_test, n_index);
+	gsl_vector *ab=gsl_vector_alloc (n_index);
+	gsl_matrix *Pab=gsl_matrix_alloc (n_cvt+2, n_index);
+	gsl_vector *Hi_eval=gsl_vector_alloc (n_eval);
+	gsl_vector *scratch=gsl_vector_alloc (n_eval);
+	gsl_matrix *HiW=gsl_matrix_alloc (n_eval, n_cvt);
+	gsl_matrix *WHiW=gsl_matrix_alloc (n_cvt, n_cvt);
+	gsl_vector *WHiy=gsl_vector_alloc (n_cvt);
+	gsl_matrix *Vbeta=gsl_matrix_alloc (n_cvt, n_cvt);
+	gsl_permutation *pmt=gsl_permutation_alloc (n_cvt);
+
+	gsl_matrix_set_zero (Uab);
+	CalcUab (UtW, Uty, Uab);
+
+	//Hi_eval = 1/(1+lambda*eval)
+	gsl_vector_memcpy (scratch, eval);
+	gsl_vector_scale (scratch, lambda);
+	gsl_vector_add_constant (scratch, 1.0);
+	gsl_vector_set_all (Hi_eval, 1.0);
+	gsl_vector_div (Hi_eval, scratch);
+
+	//calculate beta
+	//HiW: every column of UtW scaled element-wise by Hi_eval
+	gsl_matrix_memcpy (HiW, UtW);
+	for (size_t c=0; c<n_cvt; ++c) {
+		gsl_vector_view col=gsl_matrix_column (HiW, c);
+		gsl_vector_mul (&col.vector, Hi_eval);
+	}
+	gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, HiW, UtW, 0.0, WHiW);
+	gsl_blas_dgemv (CblasTrans, 1.0, HiW, Uty, 0.0, WHiy);
+
+	//one LU factorisation serves both the solve and the inverse
+	int sig;
+	LUDecomp (WHiW, pmt, &sig);
+	LUSolve (WHiW, pmt, WHiy, beta);
+	LUInvert (WHiW, pmt, Vbeta);
+
+	//calculate vg and ve
+	CalcPab (n_cvt, 0, Hi_eval, Uab, ab, Pab);
+
+	const size_t col_yy=GetabIndex (n_cvt+2, n_cvt+2, n_cvt);
+	const double P_yy=gsl_matrix_get (Pab, n_cvt, col_yy);
+
+	ve=P_yy/(double)(ni_test-n_cvt);
+	vg=ve*lambda;
+
+	//with ve, calculate se(beta)
+	gsl_matrix_scale (Vbeta, ve);
+
+	//obtain se_beta
+	const size_t n_coef=Vbeta->size1;
+	for (size_t k=0; k<n_coef; ++k) {
+		const double var_k=gsl_matrix_get (Vbeta, k, k);
+		gsl_vector_set (se_beta, k, sqrt(var_k));
+	}
+
+	gsl_matrix_free (Uab);
+	gsl_matrix_free (Pab);
+	gsl_vector_free (ab);
+	gsl_vector_free (Hi_eval);
+	gsl_vector_free (scratch);
+	gsl_matrix_free (HiW);
+	gsl_matrix_free (WHiW);
+	gsl_vector_free (WHiy);
+	gsl_matrix_free (Vbeta);
+	gsl_permutation_free (pmt);
+}
+
diff --git a/src/lmm.h b/src/lmm.h
new file mode 100644
index 0000000..45f9b72
--- /dev/null
+++ b/src/lmm.h
@@ -0,0 +1,111 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __LMM_H__                
+#define __LMM_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+
+//Bundle of inputs for the likelihood functions and their derivatives.
+//It is passed to them as void* so that they can serve as GSL callbacks.
+//Callers fill it with aggregate initialisation, so the order of the
+//members must not change.
+class FUNC_PARAM
+{
+
+public:
+	bool calc_null;			//true: only the n_cvt covariates are projected out; false: slot n_cvt+1 is projected out as well
+	size_t ni_test;			//number of individuals, used for the degrees of freedom
+	size_t n_cvt;			//number of covariates
+	const gsl_vector *eval;		//vector scaled by lambda inside the likelihood functions (presumably the eigenvalues of the relatedness matrix - confirm against callers)
+	const gsl_matrix *Uab;		//element-wise products of pairs of rotated vectors, one column per pair
+	const gsl_vector *ab;		//passed alongside Uab; the callers here leave it unfilled and use e_mode 0
+	size_t e_mode;			//0: weights 1/(1+l*eval); otherwise l*eval/(1+l*eval)
+};
+
+
+
+
+//Settings, inputs and results for the frequentist association tests, plus
+//the routines that run them on each supported input format.
+class LMM {
+
+public:
+	// IO related parameters
+	int a_mode;				//analysis mode, 1/2/3/4 for Frequentist tests: 1 Wald, 2 likelihood ratio, 3 score, 4 all three
+	size_t d_pace;		//display pace: the progress bar is refreshed every d_pace records
+	
+	string file_bfile;		//prefix of the PLINK files; ".bed" is appended to it
+	string file_geno;		//BIMBAM mean genotype file
+	string file_out;
+	string path_out;
+	
+	string file_gene;		//gene expression file
+	
+	// LMM related parameters
+	double l_min;			//lower bound of the search range for lambda
+	double l_max;			//upper bound of the search range for lambda
+	size_t n_region;		//number of sub-intervals of the range scanned for an optimum
+	double l_mle_null;		//lambda under the null model, used by the score test
+	double logl_mle_H0;		//log-likelihood under the null model, used by the likelihood ratio test
+	
+	// Summary statistics
+	size_t ni_total, ni_test;	//number of individuals
+	size_t ns_total, ns_test;	//number of snps
+	size_t ng_total, ng_test;	//number of genes
+	size_t n_cvt;
+	double time_UtX;		//time (in minutes) spent multiplying the transpose of U with the data vectors
+	double time_opt;		//time (in minutes) spent on optimization iterations
+	
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	
+	// Not included in PARAM
+	vector<SUMSTAT> sumStat;		//Output SNPSummary Data
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	//the three Analyze functions append one SUMSTAT per analysed record to sumStat
+	void AnalyzeGene (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x);
+	void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y);
+	void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y);
+	void WriteFiles ();
+	
+	//Wald and score tests at a given lambda; both write beta and se
+	void CalcRLWald (const double &lambda, const FUNC_PARAM &params, double &beta, double &se, double &p_wald);
+	void CalcRLScore (const double &l, const FUNC_PARAM &params, double &beta, double &se, double &p_score);	
+};
+
+//log-likelihood ratio of every column of UtX against an intercept-only null; appends (column, log LR) to pos_loglr
+void MatrixCalcLR (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const double l_min, const double l_max, const size_t n_region, vector<pair<size_t, double> > &pos_loglr);
+//find the lambda in [l_min, l_max] maximising the log-likelihood ('L') or log-restricted likelihood ('R') described by params
+void CalcLambda (const char func_name, FUNC_PARAM &params, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logf);
+//same search for the null model, which is built from UtW and Uty
+void CalcLambda (const char func_name, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double l_min, const double l_max, const size_t n_region, double &lambda, double &logl_H0);
+//PVE and its standard error at a given lambda
+void CalcPve (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, const double trace_G, double &pve, double &pve_se);
+//vg, ve, and the covariate coefficients with their standard errors at a given lambda
+void CalcLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const double lambda, double &vg, double &ve, gsl_vector *beta, gsl_vector *se_beta);
+
+#endif
+
+
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..e1fb336
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,86 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "param.h"
+
+#ifdef FORCE_FLOAT
+#include "gemma_float.h"
+#else
+#include "gemma.h"
+#endif
+
+using namespace std;
+
+
+
+/*
+ Program entry point.
+
+ With no arguments the banner is printed; "-h" (optionally followed by a
+ help-section number) prints help; "-l" prints the license. Otherwise the
+ command line is parsed into cPar, the output directory is created if it
+ cannot be opened, the parameters are checked, the analysis is run and the
+ log is written. Returns EXIT_FAILURE as soon as cPar.error is set after any
+ of those stages, EXIT_SUCCESS otherwise.
+ */
+int main(int argc, char * argv[])
+{ 	
+	GEMMA cGemma;	
+	PARAM cPar;
+
+	//no arguments: banner only
+	if (argc <= 1) {
+		cGemma.PrintHeader(); 
+		return EXIT_SUCCESS;
+	}
+	//"-h": general help
+	if (argc==2 && argv[1][0] == '-' && argv[1][1] == 'h') {
+		cGemma.PrintHelp(0);
+		return EXIT_SUCCESS;
+	}
+	//"-h <num>": help for one numbered section
+	if (argc==3 && argv[1][0] == '-' && argv[1][1] == 'h') {
+		string str;
+		str.assign(argv[2]);
+		cGemma.PrintHelp(atoi(str.c_str()));
+		return EXIT_SUCCESS;
+	}
+	//"-l": license text
+	if (argc==2 && argv[1][0] == '-' && argv[1][1] == 'l') {
+		cGemma.PrintLicense();
+		return EXIT_SUCCESS;
+	}	
+	
+	cGemma.Assign(argc, argv, cPar); 
+
+	//create the output directory when it cannot be opened (best effort: the mkdir result is not checked)
+	ifstream check_dir((cPar.path_out).c_str());
+	if (!check_dir) {
+	  mkdir((cPar.path_out).c_str(), S_IRWXU|S_IRGRP|S_IROTH);
+	}	
+		
+	if (cPar.error==true) {return EXIT_FAILURE;}
+	     
+	//silent mode: detach cout from any buffer so that every later write is dropped.
+	//Redirecting cout into a block-local stringstream would leave cout pointing at a
+	//destroyed buffer once that block ends; a null buffer has no lifetime to outlive.
+	if (cPar.mode_silence) {cout.rdbuf (static_cast<streambuf *>(0));}
+	
+	cPar.CheckParam();
+	
+	if (cPar.error==true) {return EXIT_FAILURE;}
+	
+	cGemma.BatchRun(cPar);
+	
+	if (cPar.error==true) {return EXIT_FAILURE;}
+	
+	cGemma.WriteLog(argc, argv, cPar);
+	
+    return EXIT_SUCCESS;                                                          
+}
+
+
+ 
diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp
new file mode 100644
index 0000000..09e58dc
--- /dev/null
+++ b/src/mathfunc.cpp
@@ -0,0 +1,310 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <iomanip>
+#include <bitset>
+#include <vector>
+#include <map>
+#include <set>
+#include <cstring>
+#include <cmath>
+#include <stdio.h>
+#include <stdlib.h> 
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_cdf.h"
+
+#ifdef FORCE_FLOAT
+#include "mathfunc_float.h"
+#else
+#include "mathfunc.h"
+#endif
+
+
+using namespace std;
+
+
+
+//population variance of the entries of v: mean of squares minus squared mean
+//(the divisor is v->size, not v->size-1)
+double VectorVar (const gsl_vector *v)
+{
+	const size_t n=v->size;
+	double sum=0.0, sum_sq=0.0;
+
+	for (size_t idx=0; idx<n; ++idx) {
+		const double val=gsl_vector_get (v, idx);
+		sum+=val;
+		sum_sq+=val*val;
+	}
+
+	sum/=(double)n;
+	sum_sq/=(double)n;
+
+	return sum_sq-sum*sum;
+}
+
+
+
+//double-center the square matrix G in place using the all-ones vector:
+//subtract (G1 1^T + 1 1^T G)/n, add (1^T G 1)/n^2 * 1 1^T.
+//Only the upper triangle is updated by the BLAS calls; it is then mirrored
+//into the lower triangle.
+void CenterMatrix (gsl_matrix *G)
+{
+	const size_t dim=G->size1;
+	const double n=(double)dim;
+	double total;
+
+	gsl_vector *ones=gsl_vector_alloc (dim);
+	gsl_vector *row_sum=gsl_vector_alloc (dim);
+	gsl_vector_set_all (ones, 1.0);
+
+	//row_sum = G * 1, taken before G is modified
+	gsl_blas_dgemv (CblasNoTrans, 1.0, G, ones, 0.0, row_sum);
+	gsl_blas_dsyr2 (CblasUpper, -1.0/n, row_sum, ones, G);
+	gsl_blas_ddot (ones, row_sum, &total);
+	gsl_blas_dsyr (CblasUpper, total/(n*n), ones, G);
+
+	//mirror upper triangle into lower triangle
+	for (size_t r=0; r<dim; ++r) {
+		for (size_t c=0; c<r; ++c) {
+			gsl_matrix_set (G, r, c, gsl_matrix_get (G, c, r));
+		}
+	}
+
+	gsl_vector_free(ones);
+	gsl_vector_free(row_sum);
+}
+
+
+//center the square matrix G in place with respect to the vector w:
+//subtract (Gw w^T + w w^T G)/(w^T w), add (w^T G w)/(w^T w)^2 * w w^T.
+//Only the upper triangle is updated by the BLAS calls; it is then mirrored
+//into the lower triangle. w itself is not modified.
+void CenterMatrix (gsl_matrix *G, gsl_vector *w)
+{
+	const size_t dim=G->size1;
+	double w_norm2, wGw;
+
+	gsl_vector *G_w=gsl_vector_alloc (dim);
+
+	gsl_blas_ddot (w, w, &w_norm2);
+	//G_w = G * w, taken before G is modified
+	gsl_blas_dgemv (CblasNoTrans, 1.0, G, w, 0.0, G_w);
+	gsl_blas_dsyr2 (CblasUpper, -1.0/w_norm2, G_w, w, G);
+	gsl_blas_ddot (w, G_w, &wGw);
+	gsl_blas_dsyr (CblasUpper, wGw/(w_norm2*w_norm2), w, G);
+
+	//mirror upper triangle into lower triangle
+	for (size_t r=0; r<dim; ++r) {
+		for (size_t c=0; c<r; ++c) {
+			gsl_matrix_set (G, r, c, gsl_matrix_get (G, c, r));
+		}
+	}
+
+	gsl_vector_free(G_w);
+}
+
+
+//scale the matrix G in place such that the mean diagonal = 1.
+//If the mean diagonal is exactly zero there is no finite scale factor, so G is
+//left unchanged instead of being multiplied by 1/0 (which would fill it with
+//inf/NaN).
+void ScaleMatrix (gsl_matrix *G)
+{		
+	double d=0.0;
+	
+	//mean of the diagonal entries
+	for (size_t i=0; i<G->size1; ++i) {
+		d+=gsl_matrix_get(G, i, i);
+	}
+	d/=(double)G->size1;
+	
+	if (d!=0.0) {
+		gsl_matrix_scale (G, 1.0/d);
+	}
+	
+	return;
+}
+
+
+//subtract the mean from every entry of y (in place) and return that mean
+double CenterVector (gsl_vector *y)
+{
+	const size_t len=y->size;
+	double mean=0.0;
+
+	for (size_t idx=0; idx<len; ++idx) {
+		mean+=gsl_vector_get (y, idx);
+	}
+	mean/=(double)len;
+
+	gsl_vector_add_constant (y, -1.0*mean);
+
+	return mean;
+}
+
+
+//in-place version: on entry UtX holds X, on exit it holds U^T X.
+//Each column is multiplied through a scratch vector because the BLAS call
+//cannot write its result over its own input.
+void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX) 
+{
+	const size_t n_col=UtX->size2;
+	gsl_vector *scratch=gsl_vector_alloc (UtX->size1);
+
+	for (size_t c=0; c<n_col; ++c) {
+		gsl_vector_view col=gsl_matrix_column (UtX, c);
+		gsl_blas_dgemv (CblasTrans, 1.0, U, &col.vector, 0.0, scratch);
+		gsl_vector_memcpy (&col.vector, scratch);
+	}
+
+	gsl_vector_free (scratch);
+}
+
+
+//out-of-place version: writes U^T X into UtX one column at a time; X is not modified
+void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX) 
+{
+	const size_t n_col=X->size2;
+
+	for (size_t c=0; c<n_col; ++c) {
+		gsl_vector_const_view src=gsl_matrix_const_column (X, c);
+		gsl_vector_view dst=gsl_matrix_column (UtX, c);
+		gsl_blas_dgemv (CblasTrans, 1.0, U, &src.vector, 0.0, &dst.vector);
+	}
+}
+
+//vector version: Utx = U^T x
+void CalcUtX (const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx) 
+{
+	const double alpha=1.0, beta=0.0;
+	gsl_blas_dgemv (CblasTrans, alpha, U, x, beta, Utx);
+}
+
+
+//Kronecker product H = K (x) V: block (i,j) of H is K(i,j)*V.
+//H must already be allocated with room for every block.
+void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H) 
+{
+	const size_t v_rows=V->size1, v_cols=V->size2;
+
+	for (size_t bi=0; bi<K->size1; bi++) {
+		for (size_t bj=0; bj<K->size2; bj++) {
+			const double k_ij=gsl_matrix_get (K, bi, bj);
+			gsl_matrix_view block=gsl_matrix_submatrix (H, bi*v_rows, bj*v_cols, v_rows, v_cols);
+			gsl_matrix_memcpy (&block.matrix, V);
+			gsl_matrix_scale (&block.matrix, k_ij);
+		}
+	}
+}
+
+//Kronecker product H = K (x) V for a symmetric K: only entries K(i,j) with
+//j>=i are read; each off-diagonal block (i,j) is copied to block (j,i).
+void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H) 
+{
+	const size_t v_rows=V->size1, v_cols=V->size2;
+
+	for (size_t bi=0; bi<K->size1; bi++) {
+		for (size_t bj=bi; bj<K->size2; bj++) {
+			gsl_matrix_view upper=gsl_matrix_submatrix (H, bi*v_rows, bj*v_cols, v_rows, v_cols);
+			gsl_matrix_memcpy (&upper.matrix, V);
+			gsl_matrix_scale (&upper.matrix, gsl_matrix_get (K, bi, bj));
+
+			if (bi==bj) {continue;}
+
+			//mirror the already scaled block across the diagonal
+			gsl_matrix_view lower=gsl_matrix_submatrix (H, bj*v_rows, bi*v_cols, v_rows, v_cols);
+			gsl_matrix_memcpy (&lower.matrix, &upper.matrix);
+		}
+	}
+}
+
+
+// this function calculates HWE p value with methods described in Wigginton et al., 2005 AJHG; 
+// it is based on the code in plink 1.07
+//
+// n_hom1, n_hom2: counts of the two homozygous genotypes (either order)
+// n_ab:           count of heterozygous genotypes
+// returns the exact-test p value, capped at 1; returns 1 when all counts are zero.
+//
+// All counts are held in 64-bit signed integers: the midpoint product
+// rare_copies*(2*genotypes-rare_copies) exceeds the 32-bit range once the
+// sample reaches tens of thousands of genotypes. The probability table is a
+// std::vector, so an allocation failure can no longer lead to a write through
+// a null pointer.
+double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab)
+{
+	if ( (n_hom1+n_hom2+n_ab)==0 ) {return 1;}
+	
+	//aa is the rare allele
+	const long long n_aa=(long long)(n_hom1 < n_hom2 ? n_hom1 : n_hom2);
+	const long long n_bb=(long long)(n_hom1 < n_hom2 ? n_hom2 : n_hom1);
+	const long long n_het=(long long)n_ab;
+	
+	const long long rare_copies = 2 * n_aa + n_het;
+	const long long genotypes   = n_het + n_bb + n_aa;
+	
+	//het_probs[h] = (unnormalised) probability of observing h heterozygotes
+	std::vector<double> het_probs ((size_t)(rare_copies + 1), 0.0);
+	
+	/* start at midpoint */
+	long long mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes);
+	
+	/* check to ensure that midpoint and rare alleles have same parity */
+	if ((rare_copies & 1) ^ (mid & 1)) {
+		mid++;
+	}
+	
+	long long curr_hets = mid;
+	long long curr_homr = (rare_copies - mid) / 2;
+	long long curr_homc = genotypes - curr_hets - curr_homr;
+	
+	het_probs[(size_t)mid] = 1.0;
+	double sum = het_probs[(size_t)mid];
+	
+	//walk down from the midpoint two heterozygotes at a time
+	for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) {
+		het_probs[(size_t)(curr_hets - 2)] = het_probs[(size_t)curr_hets] * curr_hets * (curr_hets - 1.0)
+		/ (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0));
+		sum += het_probs[(size_t)(curr_hets - 2)];
+		
+		/* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */
+		curr_homr++;
+		curr_homc++;
+	}
+	
+	//walk up from the midpoint two heterozygotes at a time
+	curr_homr = (rare_copies - mid) / 2;
+	curr_homc = genotypes - mid - curr_homr;
+	for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) {
+		het_probs[(size_t)(curr_hets + 2)] = het_probs[(size_t)curr_hets] * 4.0 * curr_homr * curr_homc
+		/((curr_hets + 2.0) * (curr_hets + 1.0));
+		sum += het_probs[(size_t)(curr_hets + 2)];
+		
+		/* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */
+		curr_homr--;
+		curr_homc--;
+	}
+	
+	//normalise so that the table sums to one
+	for (long long i = 0; i <= rare_copies; i++) {
+		het_probs[(size_t)i] /= sum;
+	}
+	
+	/*  p-value calculation for p_hwe: total probability of all outcomes
+	    that are no more likely than the observed heterozygote count */
+	double p_hwe = 0.0;
+	for (long long i = 0; i <= rare_copies; i++) {
+		if (het_probs[(size_t)i] > het_probs[n_ab]) {
+			continue;
+		}
+		p_hwe += het_probs[(size_t)i];
+	}
+	
+	p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe;
+	
+	return p_hwe;
+}
+
+
+
+
+
+
+	
+
diff --git a/src/mathfunc.h b/src/mathfunc.h
new file mode 100644
index 0000000..d0e1696
--- /dev/null
+++ b/src/mathfunc.h
@@ -0,0 +1,41 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __MATHFUNC_H__                
+#define __MATHFUNC_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+using namespace std;
+
+//variance of the entries of v, computed as mean of squares minus squared mean (divisor is v->size)
+double VectorVar (const gsl_vector *v);
+//center the square matrix G in place with the all-ones vector; the result is stored in both triangles
+void CenterMatrix (gsl_matrix *G);
+//center the square matrix G in place with respect to the vector w; the result is stored in both triangles
+void CenterMatrix (gsl_matrix *G, gsl_vector *w);
+//divide G in place by the mean of its diagonal entries
+void ScaleMatrix (gsl_matrix *G);
+//subtract the mean from every entry of y in place; returns the mean that was removed
+double CenterVector (gsl_vector *y);
+//in-place: UtX holds X on entry and U^T X on exit
+void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX);
+//UtX = U^T X, computed column by column
+void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX);
+//Utx = U^T x
+void CalcUtX (const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx);
+//exact HWE test p value from the two homozygote counts and the heterozygote count; returns 1 when all counts are zero
+double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab);
+//H = K (x) V: block (i,j) of H is K(i,j)*V; H must be preallocated
+void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H);
+//same as Kronecker but reads only K(i,j) with j>=i and mirrors each off-diagonal block
+void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H);
+
+
+#endif
diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp
new file mode 100644
index 0000000..4b910ee
--- /dev/null
+++ b/src/mvlmm.cpp
@@ -0,0 +1,3749 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <bitset>
+#include <cstring>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_roots.h"
+#include "gsl/gsl_min.h"
+#include "gsl/gsl_integration.h"
+
+#include "io.h"
+#include "lapack.h"
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "lmm_float.h"
+#include "mvlmm_float.h"
+#else
+#include "lmm.h"
+#include "mvlmm.h"
+#endif
+
+
+
+using namespace std;
+
+
+//in this file, X, Y are already transformed (i.e. UtX and UtY)
+
+
+//initialise this MVLMM object from the shared parameter object: every listed
+//field is copied from cPar, and the two timers are reset to zero
+void MVLMM::CopyFromParam (PARAM &cPar) 
+{
+	//analysis mode and progress-bar pace
+	a_mode=cPar.a_mode;
+	d_pace=cPar.d_pace;
+
+	//input and output file names
+	file_bfile=cPar.file_bfile;
+	file_geno=cPar.file_geno;
+	file_out=cPar.file_out;
+	path_out=cPar.path_out;
+
+	//optimisation settings
+	l_min=cPar.l_min;
+	l_max=cPar.l_max;
+	n_region=cPar.n_region;
+	p_nr=cPar.p_nr;
+	em_iter=cPar.em_iter;
+	nr_iter=cPar.nr_iter;
+	em_prec=cPar.em_prec;
+	nr_prec=cPar.nr_prec;
+	crt=cPar.crt;
+
+	//null-model variance components
+	Vg_remle_null=cPar.Vg_remle_null;
+	Ve_remle_null=cPar.Ve_remle_null;
+	Vg_mle_null=cPar.Vg_mle_null;
+	Ve_mle_null=cPar.Ve_mle_null;
+
+	//timers start from zero
+	time_UtX=0.0;
+	time_opt=0.0;
+
+	//sample, SNP, covariate and phenotype counts
+	ni_total=cPar.ni_total;
+	ns_total=cPar.ns_total;
+	ni_test=cPar.ni_test;
+	ns_test=cPar.ns_test;
+	n_cvt=cPar.n_cvt;
+	n_ph=cPar.n_ph;
+
+	//inclusion indicators and SNP annotation
+	indicator_idv=cPar.indicator_idv;
+	indicator_snp=cPar.indicator_snp;
+	snpInfo=cPar.snpInfo;
+}
+
+
+//write the results held by this MVLMM object back into the shared parameter object
+void MVLMM::CopyToParam (PARAM &cPar) 
+{
+	//accumulated timings
+	cPar.time_UtX=time_UtX;
+	cPar.time_opt=time_opt;
+
+	//null-model variance components
+	cPar.Vg_remle_null=Vg_remle_null;
+	cPar.Ve_remle_null=Ve_remle_null;
+	cPar.Vg_mle_null=Vg_mle_null;
+	cPar.Ve_mle_null=Ve_mle_null;
+
+	//VV* counterparts of the variance components
+	cPar.VVg_remle_null=VVg_remle_null;
+	cPar.VVe_remle_null=VVe_remle_null;
+	cPar.VVg_mle_null=VVg_mle_null;
+	cPar.VVe_mle_null=VVe_mle_null;
+
+	//null-model coefficients and their standard errors
+	cPar.beta_remle_null=beta_remle_null;
+	cPar.se_beta_remle_null=se_beta_remle_null;
+	cPar.beta_mle_null=beta_mle_null;
+	cPar.se_beta_mle_null=se_beta_mle_null;
+
+	//null-model log-likelihoods
+	cPar.logl_remle_H0=logl_remle_H0;
+	cPar.logl_mle_H0=logl_mle_H0;
+}
+
+
+/*
+ Write the association results to <path_out>/<file_out>.assoc.txt.
+
+ The header and every row consist of the SNP columns, one beta per phenotype,
+ the n_ph*(n_ph+1)/2 entries of v_Vbeta (labelled as the upper triangle
+ Vbeta_i_j, j>=i) and the p value column(s) selected by a_mode (1: Wald,
+ 2: LRT, 3: score, 4: all three). SNPs whose indicator_snp entry is 0 are
+ skipped; sumStat is indexed by the running count of SNPs that were kept.
+ For any other a_mode no p value and no line terminator is written.
+
+ Phenotype loops use their own index names so that they do not shadow the
+ SNP index, and rows end with "\n" rather than endl so that the stream is not
+ flushed after every SNP; the bytes written to the file are unchanged.
+ */
+void MVLMM::WriteFiles () 
+{
+	string file_str;
+	file_str=path_out+"/"+file_out;
+	file_str+=".assoc.txt";
+	
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+	
+	//number of entries in the upper triangle (diagonal included) of a n_ph x n_ph matrix
+	const size_t n_vbeta=n_ph*(n_ph+1)/2;
+	
+	//header line
+	outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t";
+	
+	for (size_t p=0; p<n_ph; p++) {
+		outfile<<"beta_"<<p+1<<"\t";
+	}
+	for (size_t p=0; p<n_ph; p++) {
+		for (size_t q=p; q<n_ph; q++) {
+			outfile<<"Vbeta_"<<p+1<<"_"<<q+1<<"\t";
+		}
+	}
+	
+	if (a_mode==1) {
+		outfile<<"p_wald"<<"\n";
+	} else if (a_mode==2) {
+		outfile<<"p_lrt"<<"\n";
+	} else if (a_mode==3) {
+		outfile<<"p_score"<<"\n";
+	} else if (a_mode==4) {
+		outfile<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<"\n";
+	} else {}
+	
+	//one row per analysed SNP; t indexes sumStat
+	size_t t=0;
+	for (size_t i=0; i<snpInfo.size(); ++i) {
+		if (indicator_snp[i]==0) {continue;}
+		
+		outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t";
+		
+		outfile<<scientific<<setprecision(6);
+		
+		for (size_t p=0; p<n_ph; p++) {
+			outfile<<sumStat[t].v_beta[p]<<"\t";
+		}
+		
+		for (size_t c=0; c<n_vbeta; c++) {
+			outfile<<sumStat[t].v_Vbeta[c]<<"\t";
+		}
+		
+		if (a_mode==1) {
+			outfile<<sumStat[t].p_wald <<"\n";
+		} else if (a_mode==2) {
+			outfile<<sumStat[t].p_lrt<<"\n";
+		} else if (a_mode==3) {
+			outfile<<sumStat[t].p_score<<"\n";
+		} else if (a_mode==4) {
+			outfile<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<"\n";
+		} else {}
+		
+		t++;
+	}
+	
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+
+
+//below are functions for EM algorithm
+
+
+
+
+	
+
+/*
+ Build the transformation matrices derived from V_g and V_e.
+
+ Steps, in order:
+  1. eigen-decompose a copy of V_e (EigenDecomp is defined elsewhere; it is
+     used here as filling U_l with eigenvectors and D_l with eigenvalues);
+  2. from the strictly positive eigenvalues build V_e_h=sum sqrt(d) u u^T and
+     V_e_hi=sum 1/sqrt(d) u u^T, and accumulate sum log(d);
+  3. form Lambda=V_e_hi V_g V_e_hi and eigen-decompose it, reusing U_l and D_l;
+  4. replace negative entries of D_l by 0;
+  5. output UltVeh=U_l^T V_e_h and UltVehi=U_l^T V_e_hi.
+
+ V_g, V_e : d x d inputs, not modified
+ D_l      : output, eigenvalues of Lambda (negative values set to 0)
+ UltVeh, UltVehi : d x d outputs
+ returns the sum of log(d) over the positive eigenvalues of V_e
+ */
+double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, gsl_matrix *UltVeh, gsl_matrix *UltVehi)
+{
+	size_t d_size=V_g->size1;
+	double d, logdet_Ve=0.0;	
+	
+	//eigen decomposition of V_e
+	gsl_matrix *Lambda=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_temp=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_h=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_hi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size);	
+	gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size);	
+	
+	//work on a copy so that the const input V_e is never passed to EigenDecomp
+	gsl_matrix_memcpy(V_e_temp, V_e);
+	EigenDecomp(V_e_temp, U_l, D_l, 0);
+		
+	//calculate V_e_h and V_e_hi
+	gsl_matrix_set_zero(V_e_h);
+	gsl_matrix_set_zero(V_e_hi);
+	for (size_t i=0; i<d_size; i++) {
+		d=gsl_vector_get (D_l, i);
+		//non-positive eigenvalues contribute neither to the square roots nor to the log-determinant
+		if (d<=0) {continue;}
+		logdet_Ve+=log(d);
+		
+		gsl_vector_view U_col=gsl_matrix_column(U_l, i);
+		d=sqrt(d);
+		gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_h);
+		d=1.0/d;
+		gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_hi);
+	}
+	
+	//copy the upper part to lower part
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<i; j++) {
+			gsl_matrix_set (V_e_h, i, j, gsl_matrix_get(V_e_h, j, i));
+			gsl_matrix_set (V_e_hi, i, j, gsl_matrix_get(V_e_hi, j, i));
+		}
+	}
+	
+	//calculate Lambda=V_ehi V_g V_ehi
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, V_g, V_e_hi, 0.0, VgVehi);
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, V_e_hi, VgVehi, 0.0, Lambda);
+	
+	//eigen decomposition of Lambda
+	//(U_l and D_l are overwritten: from here on they describe Lambda, not V_e)
+	EigenDecomp(Lambda, U_l, D_l, 0);
+	
+	//clamp negative eigenvalues of Lambda to zero
+	for (size_t i=0; i<d_size; i++) {
+		d=gsl_vector_get (D_l, i);
+		if (d<0) {gsl_vector_set (D_l, i, 0);}
+	}
+	
+	//calculate UltVeh and UltVehi
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U_l, V_e_h, 0.0, UltVeh);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U_l, V_e_hi, 0.0, UltVehi);
+	/*
+	cout<<"Vg: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<d_size; j++) {
+			cout<<gsl_matrix_get (V_g, i, j)<<" ";
+		}
+		cout<<endl;
+	}
+	cout<<"Ve: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<d_size; j++) {
+			cout<<gsl_matrix_get (V_e, i, j)<<" ";
+		}
+		cout<<endl;
+	}
+	
+	cout<<"Dl: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		cout<<gsl_vector_get (D_l, i)<<endl;
+	}
+	cout<<"UltVeh: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<d_size; j++) {
+			cout<<gsl_matrix_get (UltVeh, i, j)<<" ";
+		}
+		cout<<endl;
+	}
+	*/
+	
+	//free memory
+	gsl_matrix_free (Lambda);
+	gsl_matrix_free (V_e_temp);
+	gsl_matrix_free (V_e_h);
+	gsl_matrix_free (V_e_hi);
+	gsl_matrix_free (VgVehi);
+	gsl_matrix_free (U_l);
+	
+	return logdet_Ve;
+}
+	
+//Qi=(\sum_{k=1}^n x_kx_k^T\otimes(delta_k*Dl+I)^{-1} )^{-1}
+//eval: the n values delta_k; D_l: the d diagonal values Dl; X: c x n (row i, column k is read);
+//Qi: (d*c) x (d*c) output, its size fixes c as Qi->size1/D_l->size.
+//Returns the value of LULndet on the LU-decomposed Q (LUDecomp, LUInvert and
+//LULndet are defined elsewhere; presumably this is log|Q| - confirm).
+double CalcQi (const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, gsl_matrix *Qi)
+{
+	size_t n_size=eval->size, d_size=D_l->size, dc_size=Qi->size1;
+	size_t c_size=dc_size/d_size;
+	
+	double delta, dl, d1, d2, d, logdet_Q;
+		
+	gsl_matrix *Q=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix_set_zero (Q);
+	
+	//Q is made of c x c blocks, each d x d and diagonal; entry l of block (i,j) sits at (i*d+l, j*d+l)
+	for (size_t i=0; i<c_size; i++) {		
+		for (size_t j=0; j<c_size; j++) {			
+			for (size_t l=0; l<d_size; l++) {
+				dl=gsl_vector_get(D_l, l);
+				
+				if (j<i) {
+					//block (j,i) was filled in an earlier pass of the outer loop: reuse it by symmetry
+					d=gsl_matrix_get (Q, j*d_size+l, i*d_size+l);		
+				} else {
+					//sum over k of x_ik*x_jk/(dl*delta_k+1)
+					d=0.0;
+					for (size_t k=0; k<n_size; k++) {
+						d1=gsl_matrix_get(X, i, k);
+						d2=gsl_matrix_get(X, j, k);
+						delta=gsl_vector_get(eval, k);
+						d+=d1*d2/(dl*delta+1.0);				
+					}
+				}
+				
+				gsl_matrix_set (Q, i*d_size+l, j*d_size+l, d);
+			}
+		}
+	}
+	
+	//calculate LU decomposition of Q, and invert Q and calculate |Q|	
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (dc_size);
+	LUDecomp (Q, pmt, &sig);	
+	LUInvert (Q, pmt, Qi);
+	
+	//Q now holds its LU factors, which is what LULndet is given
+	logdet_Q=LULndet (Q);
+	
+	gsl_matrix_free (Q);
+	gsl_permutation_free (pmt);
+	
+	return logdet_Q;
+}
+
+//xHiy=\sum_{k=1}^n x_k\otimes ((delta_k*Dl+I)^{-1}Ul^TVe^{-1/2}y_k)
+//For phenotype index i and covariate index j, entry j*d+i of xHiy is
+//\sum_k X(j,k)*UltVehiY(i,k)/(delta_k*Dl_i+1).
+void CalcXHiY(const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, const gsl_matrix *UltVehiY, gsl_vector *xHiy)
+{
+	const size_t n_ind=eval->size;
+	const size_t n_cov=X->size1;
+	const size_t n_trait=D_l->size;
+
+	gsl_vector_set_zero (xHiy);
+
+	for (size_t tr=0; tr<n_trait; tr++) {
+		const double dl=gsl_vector_get(D_l, tr);
+
+		for (size_t cv=0; cv<n_cov; cv++) {
+			double acc=0.0;
+
+			for (size_t k=0; k<n_ind; k++) {
+				const double xv=gsl_matrix_get(X, cv, k);
+				const double yv=gsl_matrix_get(UltVehiY, tr, k);
+				const double ev=gsl_vector_get(eval, k);
+				acc+=xv*yv/(ev*dl+1.0);
+			}
+
+			gsl_vector_set(xHiy, cv*n_trait+tr, acc);
+		}
+	}
+}
+
+
+//OmegaU=D_l (delta D_l+I)^{-1}
+//OmegaE=delta D_l (delta D_l+I)^{-1}
+//Both outputs are d x n: row i belongs to the i-th entry of D_l, column k to
+//the k-th entry of eval.
+void CalcOmega (const gsl_vector *eval, const gsl_vector *D_l, gsl_matrix *OmegaU, gsl_matrix *OmegaE)
+{
+	const size_t n_ind=eval->size;
+	const size_t n_trait=D_l->size;
+
+	for (size_t col=0; col<n_ind; col++) {
+		const double ev=gsl_vector_get(eval, col);
+
+		for (size_t row=0; row<n_trait; row++) {
+			const double dl=gsl_vector_get(D_l, row);
+			const double omega_u=dl/(ev*dl+1.0);
+			const double omega_e=ev*omega_u;
+
+			gsl_matrix_set(OmegaU, row, col, omega_u);
+			gsl_matrix_set(OmegaE, row, col, omega_e);
+		}
+	}
+}
+
+
+//UltVehiU = (UltVehiY - UltVehiBX) multiplied element by element with OmegaE
+void UpdateU (const gsl_matrix *OmegaE, const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiBX, gsl_matrix *UltVehiU)
+{
+	//residual after removing the fixed-effect part
+	gsl_matrix_memcpy (UltVehiU, UltVehiY);
+	gsl_matrix_sub (UltVehiU, UltVehiBX);
+
+	//element-wise weighting
+	gsl_matrix_mul_elements (UltVehiU, OmegaE);
+}
+
+
+//UltVehiE = UltVehiY - UltVehiBX - UltVehiU
+void UpdateE (const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiBX, const gsl_matrix *UltVehiU, gsl_matrix *UltVehiE)
+{
+	gsl_matrix_memcpy (UltVehiE, UltVehiY);
+
+	//take off the fixed-effect part, then the random-effect part
+	gsl_matrix_sub (UltVehiE, UltVehiBX);
+	gsl_matrix_sub (UltVehiE, UltVehiU);
+}
+
+
+
+//UltVehiB = (UltVehiY - UltVehiU) X^T XXti.
+//UltVehiBX is used as scratch space: on return it holds UltVehiY - UltVehiU,
+//not the product UltVehiB*X.
+void UpdateL_B (const gsl_matrix *X, const gsl_matrix *XXti, const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiU, gsl_matrix *UltVehiBX, gsl_matrix *UltVehiB)
+{
+	const size_t n_cov=X->size1;
+	const size_t n_trait=UltVehiY->size1;
+
+	//scratch: Y minus the random-effect part
+	gsl_matrix_memcpy (UltVehiBX, UltVehiY);
+	gsl_matrix_sub (UltVehiBX, UltVehiU);
+
+	//project onto the covariates
+	gsl_matrix *resid_Xt=gsl_matrix_alloc (n_trait, n_cov);
+	gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, UltVehiBX, X, 0.0, resid_Xt);
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, resid_Xt, XXti, 0.0, UltVehiB);
+
+	gsl_matrix_free(resid_Xt);
+}
+
+//compute b=Qi*xHiy and unpack it into UltVehiB: column i of UltVehiB
+//receives entries i*d .. i*d+d-1 of b, where d is the row count of UltVehiB
+void UpdateRL_B (const gsl_vector *xHiy, const gsl_matrix *Qi, gsl_matrix *UltVehiB)
+{
+	const size_t n_trait=UltVehiB->size1;
+	const size_t n_cov=UltVehiB->size2;
+
+	gsl_vector *coef=gsl_vector_alloc (Qi->size1);
+
+	//coef = Qi * xHiy
+	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, coef);
+
+	//one column of UltVehiB per covariate
+	for (size_t cv=0; cv<n_cov; cv++) {
+		gsl_vector_const_view chunk=gsl_vector_const_subvector (coef, cv*n_trait, n_trait);
+		gsl_vector_view target=gsl_matrix_column (UltVehiB, cv);
+		gsl_vector_memcpy (&target.vector, &chunk.vector);
+	}
+
+	gsl_vector_free(coef);
+}
+
+
+
+//update the two d x d matrices:
+//  V_g = ( \sum_k u_k u_k^T/delta_k + Sigma_uu )/n, skipping terms with delta_k==0
+//  V_e = ( E E^T + Sigma_ee )/n
+//u_k is column k of U, delta_k is entry k of eval and n=eval->size.
+void UpdateV (const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *E, const gsl_matrix *Sigma_uu, const gsl_matrix *Sigma_ee, gsl_matrix *V_g, gsl_matrix *V_e)
+{
+	const size_t n_ind=eval->size;
+	const size_t n_trait=U->size1;
+	const double inv_n=1.0/(double)n_ind;
+
+	gsl_matrix_set_zero (V_g);
+	gsl_matrix_set_zero (V_e);
+
+	//upper triangle of U D^{-1} U^T, one rank-one update per individual
+	for (size_t k=0; k<n_ind; k++) {
+		const double ev=gsl_vector_get (eval, k);
+		if (ev!=0) {
+			gsl_vector_const_view u_k=gsl_matrix_const_column (U, k);
+			gsl_blas_dsyr (CblasUpper, 1.0/ev, &u_k.vector, V_g);
+		}
+	}
+
+	//upper triangle of E E^T
+	gsl_blas_dsyrk(CblasUpper, CblasNoTrans, 1.0, E, 0.0, V_e);
+
+	//fill the lower triangles from the upper ones
+	for (size_t r=0; r<n_trait; r++) {
+		for (size_t c=0; c<r; c++) {
+			gsl_matrix_set (V_g, r, c, gsl_matrix_get(V_g, c, r));
+			gsl_matrix_set (V_e, r, c, gsl_matrix_get(V_e, c, r));
+		}
+	}
+
+	//add the Sigma terms, then average over individuals
+	gsl_matrix_add (V_g, Sigma_uu);
+	gsl_matrix_add (V_e, Sigma_ee);
+
+	gsl_matrix_scale (V_g, inv_n);
+	gsl_matrix_scale (V_e, inv_n);
+}
+
+
+/*
+ Compute Sigma_uu and Sigma_ee (both d x d outputs).
+
+ func_name : 'R'/'r' adds the extra term built from Qi; 'L'/'l' does not.
+             Any other value prints a message and returns without touching
+             the outputs.
+ eval      : the n values delta_k
+ D_l       : the d values dl
+ X         : c x n (row j, column k is read)
+ OmegaU, OmegaE : d x n; their columns are summed into the diagonals
+ UltVeh    : d x d; both results are finally replaced by UltVeh^T * Sigma * UltVeh
+ Qi        : (d*c) x (d*c), only used for 'R'/'r'
+ */
+void CalcSigma (const char func_name, const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, const gsl_matrix *OmegaU, const gsl_matrix *OmegaE, const gsl_matrix *UltVeh, const gsl_matrix *Qi, gsl_matrix *Sigma_uu, gsl_matrix *Sigma_ee)
+{	
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;}
+
+	size_t n_size=eval->size, c_size=X->size1, d_size=D_l->size, dc_size=Qi->size1;
+	
+	gsl_matrix_set_zero(Sigma_uu);
+	gsl_matrix_set_zero(Sigma_ee);
+	
+	double delta, dl, x, d;	
+	
+	//calculate the first diagonal term
+	//(the views alias the diagonals, so adding to them writes into the matrices)
+	gsl_vector_view Suu_diag=gsl_matrix_diagonal (Sigma_uu);
+	gsl_vector_view See_diag=gsl_matrix_diagonal (Sigma_ee);
+	
+	for (size_t k=0; k<n_size; k++) {
+		gsl_vector_const_view OmegaU_col=gsl_matrix_const_column (OmegaU, k);
+		gsl_vector_const_view OmegaE_col=gsl_matrix_const_column (OmegaE, k);
+		
+		gsl_vector_add (&Suu_diag.vector, &OmegaU_col.vector);
+		gsl_vector_add (&See_diag.vector, &OmegaE_col.vector);
+	}	
+	
+	//calculate the second term for reml
+	if (func_name=='R' || func_name=='r') {		
+		gsl_matrix *M_u=gsl_matrix_alloc(dc_size, d_size);
+		gsl_matrix *M_e=gsl_matrix_alloc(dc_size, d_size);
+		gsl_matrix *QiM=gsl_matrix_alloc(dc_size, d_size);		
+		
+		//zeroed once: every pass of the k loop rewrites the same (j*d+i, i) positions, all others stay 0
+		gsl_matrix_set_zero(M_u);
+		gsl_matrix_set_zero(M_e);
+		
+		for (size_t k=0; k<n_size; k++) {
+			delta=gsl_vector_get(eval, k);
+			//if (delta==0) {continue;}
+			
+			//M_e(j*d+i, i)=x_jk/(delta*dl_i+1), M_u is the same entry times dl_i
+			for (size_t i=0; i<d_size; i++) {
+				dl=gsl_vector_get(D_l, i);
+				for (size_t j=0; j<c_size; j++) {				
+					x=gsl_matrix_get(X, j, k);
+					d=x/(delta*dl+1.0);
+					gsl_matrix_set(M_e, j*d_size+i, i, d);
+					gsl_matrix_set(M_u, j*d_size+i, i, d*dl);					
+				}
+			}			
+			//Sigma_uu += delta * M_u^T Qi M_u
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_u, 0.0, QiM);
+			gsl_blas_dgemm(CblasTrans, CblasNoTrans, delta, M_u, QiM, 1.0, Sigma_uu);
+		
+			//Sigma_ee += M_e^T Qi M_e
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_e, 0.0, QiM);
+			gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, M_e, QiM, 1.0, Sigma_ee);
+		}		
+		
+		gsl_matrix_free(M_u);
+		gsl_matrix_free(M_e);
+		gsl_matrix_free(QiM);	
+	}
+	
+	//multiply both sides by VehUl
+	//(M is scratch for Sigma*UltVeh; each Sigma is then overwritten by UltVeh^T*M)
+	gsl_matrix *M=gsl_matrix_alloc (d_size, d_size);
+	
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Sigma_uu, UltVeh, 0.0, M);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, M, 0.0, Sigma_uu);
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Sigma_ee, UltVeh, 0.0, M);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, M, 0.0, Sigma_ee);
+	
+	gsl_matrix_free(M);
+	return;
+}
+
+
+//data-dependent part of the log-likelihood, shared by the 'R' and 'L' cases:
+//returns -0.5*( \sum_k \sum_i [ y_ik^2/(delta_k*dl_i+1) + log(delta_k*dl_i+1) ] - xHiy^T Qi xHiy )
+//where y_ik is entry (i,k) of UltVehiY. Constants and the other
+//log-determinant terms are left to the caller.
+double MphCalcLogL (const gsl_vector *eval, const gsl_vector *xHiy, const gsl_vector *D_l, const gsl_matrix *UltVehiY, const gsl_matrix *Qi) 
+{
+	const size_t n_ind=eval->size;
+	const size_t n_trait=D_l->size;
+	double acc=0.0;
+
+	//quadratic form in y plus log-determinant, one individual at a time
+	for (size_t k=0; k<n_ind; k++) {
+		const double ev=gsl_vector_get(eval, k);
+
+		for (size_t tr=0; tr<n_trait; tr++) {
+			const double yv=gsl_matrix_get(UltVehiY, tr, k);
+			const double dl=gsl_vector_get(D_l, tr);
+			const double denom=ev*dl+1.0;
+
+			acc+=yv*yv/denom+log(denom);
+		}
+	}
+
+	//remove the part explained by the covariates: xHiy^T Qi xHiy
+	double quad;
+	gsl_vector *Qi_xHiy=gsl_vector_alloc(Qi->size1);
+
+	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, Qi_xHiy);
+	gsl_blas_ddot(xHiy, Qi_xHiy, &quad);
+	gsl_vector_free(Qi_xHiy);
+
+	acc-=quad;
+
+	return -0.5*acc;
+}
+
+
+
+
+
+//Y is a dxn matrix, X is a cxn matrix, B is a dxc matrix, V_g is a dxd matrix, V_e is a dxd matrix, eval is a size n vector
+//'R' for restricted likelihood and 'L' for likelihood
+//
+//EM iterations for V_g, V_e and B. V_g, V_e and B are read as starting values
+//and overwritten with the updated estimates; U_hat, E_hat, OmegaU, OmegaE and
+//the UltVehi* matrices are caller-provided work/output buffers filled on every
+//iteration. At most max_iter iterations are run; the loop stops earlier once
+//the log-likelihood changes by less than max_prec between two iterations.
+//Returns the last log-likelihood evaluated (0.0 for an invalid func_name or
+//when max_iter is 0).
+double MphEM (const char func_name, const size_t max_iter, const double max_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, gsl_matrix *U_hat, gsl_matrix *E_hat, gsl_matrix *OmegaU, gsl_matrix *OmegaE, gsl_matrix *UltVehiY, gsl_matrix *UltVehiBX, gsl_matrix *UltVehiU, gsl_matrix *UltVehiE, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B)
+{
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return 0.0;}
+	
+	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;
+	size_t dc_size=d_size*c_size;	
+		
+	gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size);
+	gsl_matrix *XXti=gsl_matrix_alloc (c_size, c_size);
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehiB=gsl_matrix_alloc (d_size, c_size);
+	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *Sigma_uu=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Sigma_ee=gsl_matrix_alloc (d_size, d_size);
+	gsl_vector *xHiy=gsl_vector_alloc (dc_size);
+	gsl_permutation * pmt=gsl_permutation_alloc (c_size);	
+		
+	double logl_const=0.0, logl_old=0.0, logl_new=0.0, logdet_Q, logdet_Ve;
+	int sig;
+	
+	//calculate |XXt| and (XXt)^{-1}
+	//(dsyrk fills the upper triangle only; the loop mirrors it before the LU step)
+	gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt);
+	for (size_t i=0; i<c_size; ++i) {
+		for (size_t j=0; j<i; ++j) {
+			gsl_matrix_set (XXt, i, j, gsl_matrix_get (XXt, j, i));
+		}
+	}
+	
+	//XXt holds its LU factors from here on
+	LUDecomp (XXt, pmt, &sig);
+	LUInvert (XXt, pmt, XXti);
+	
+	//calculate the constant for logl	
+	if (func_name=='R' || func_name=='r') {		
+		logl_const=-0.5*(double)(n_size-c_size)*(double)d_size*log(2.0*M_PI)+0.5*(double)d_size*LULndet (XXt);
+	} else {
+		logl_const=-0.5*(double)n_size*(double)d_size*log(2.0*M_PI);
+	}	
+	
+	//start EM
+	for (size_t t=0; t<max_iter; t++) {
+		//transformation matrices and D_l for the current V_g, V_e
+		logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);
+
+		logdet_Q=CalcQi (eval, D_l, X, Qi);
+
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
+		CalcXHiY(eval, D_l, X, UltVehiY, xHiy);
+
+		//calculate log likelihood/restricted likelihood value, and terminate if change is small	
+		logl_new=logl_const+MphCalcLogL (eval, xHiy, D_l, UltVehiY, Qi)-0.5*(double)n_size*logdet_Ve;
+		if (func_name=='R' || func_name=='r') {	
+			logl_new+=-0.5*(logdet_Q-(double)c_size*logdet_Ve);
+		}		
+		//the first iteration has no previous value to compare against
+		if (t!=0 && abs(logl_new-logl_old)<max_prec) {break;}
+		logl_old=logl_new;
+		
+		/*
+		cout<<"iteration = "<<t<<" log-likelihood = "<<logl_old<<"\t"<<logl_new<<endl;
+		
+		cout<<"Vg: "<<endl;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		cout<<"Ve: "<<endl;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		*/
+		
+		CalcOmega (eval, D_l, OmegaU, OmegaE);
+
+		//Update UltVehiB, UltVehiU
+		//NOTE(review): for 'L' with t>0 neither UltVehiB nor UltVehiBX is recomputed
+		//before UpdateU, so the values left by the previous iteration are used - confirm this is intended
+		if (func_name=='R' || func_name=='r') {	
+			UpdateRL_B(xHiy, Qi, UltVehiB);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
+		} else if (t==0) {
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, B, 0.0, UltVehiB);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
+		}
+		
+		UpdateU(OmegaE, UltVehiY, UltVehiBX, UltVehiU);
+		
+		if (func_name=='L' || func_name=='l') {	
+			//UltVehiBX is destroyed here
+			//(UpdateL_B uses it as scratch, so it is rebuilt from the new UltVehiB right after)
+			UpdateL_B(X, XXti, UltVehiY, UltVehiU, UltVehiBX, UltVehiB);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
+		}
+
+		UpdateE(UltVehiY, UltVehiBX, UltVehiU, UltVehiE);
+		
+		//calculate U_hat, E_hat and B
+		//(each is UltVeh^T times its transformed counterpart)
+		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiU, 0.0, U_hat);
+		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiE, 0.0, E_hat);
+		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiB, 0.0, B);
+
+		//calculate Sigma_uu and Sigma_ee
+		CalcSigma (func_name, eval, D_l, X, OmegaU, OmegaE, UltVeh, Qi, Sigma_uu, Sigma_ee);
+		
+		//update V_g and V_e
+		UpdateV (eval, U_hat, E_hat, Sigma_uu, Sigma_ee, V_g, V_e);		
+	}
+		
+	gsl_matrix_free(XXt);
+	gsl_matrix_free(XXti);
+	gsl_vector_free(D_l);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_matrix_free(UltVehiB);
+	gsl_matrix_free(Qi);
+	gsl_matrix_free(Sigma_uu);
+	gsl_matrix_free(Sigma_ee);
+	gsl_vector_free(xHiy);
+	gsl_permutation_free(pmt);
+	
+	return logl_new;
+}
+
+
+
+
+
+
+
+/*
+ calculate p-value, beta (d by 1 vector) and V(beta)
+
+ Sizes are taken from the arguments: n_size=eval->size, c_size=W->size1 and
+ d_size=V_g->size1. x_vec is read at k<n_size, W at (j, k) with j<c_size and
+ UltVehiY at (i, k) with i<d_size.
+ Outputs: UltVehiY is overwritten with UltVehi*Y; beta and Vbeta receive the
+ estimate and its covariance after being transformed back with UltVeh; the
+ return value is gsl_cdf_chisq_Q of the statistic d with d_size degrees of
+ freedom.
+ NOTE(review): the contents of D_l, UltVeh, UltVehi and Qi are whatever
+ EigenProc and CalcQi produce; both are defined outside this function, so
+ confirm their conventions there.
+ */
+double MphCalcP (const gsl_vector *eval, const gsl_vector *x_vec, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_vector *beta, gsl_matrix *Vbeta) 
+{
+	size_t n_size=eval->size, c_size=W->size1, d_size=V_g->size1;
+	size_t dc_size=d_size*c_size;
+	double delta, dl, d, d1, d2, dy, dx, dw, logdet_Ve, logdet_Q, p_value;
+	
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size);	
+	gsl_matrix *QiWHix=gsl_matrix_alloc(dc_size, d_size);
+	
+	gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size);	
+	gsl_vector *xPy=gsl_vector_alloc (d_size);
+	//gsl_vector *UltVehiy=gsl_vector_alloc (d_size);
+	gsl_vector *WHiy=gsl_vector_alloc (dc_size);
+	
+	gsl_matrix_set_zero (xPx);
+	gsl_matrix_set_zero (WHix);
+	gsl_vector_set_zero (xPy);
+	gsl_vector_set_zero (WHiy);
+	
+	//eigen decomposition and calculate log|Ve| (the returned value is stored but not used again in this function)
+	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);	
+	
+	//calculate Qi and log|Q| (the returned value is stored but not used again in this function)
+	logdet_Q=CalcQi (eval, D_l, W, Qi);	
+	
+	//calculate UltVehiY
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
+		
+	//calculate WHix, WHiy, xHiy (stored in xPy) and xHix (stored on the diagonal of xPx);
+	//every term of the sums over k is divided by (delta*dl+1.0)
+	for (size_t i=0; i<d_size; i++) {
+		dl=gsl_vector_get(D_l, i);
+		
+		d1=0.0; d2=0.0;
+		for (size_t k=0; k<n_size; k++) {
+			delta=gsl_vector_get(eval, k);
+			dx=gsl_vector_get(x_vec, k);
+			dy=gsl_matrix_get(UltVehiY, i, k);
+			
+			d1+=dx*dy/(delta*dl+1.0);
+			d2+=dx*dx/(delta*dl+1.0);
+		}
+		gsl_vector_set (xPy, i, d1);
+		gsl_matrix_set (xPx, i, i, d2);
+		
+		for (size_t j=0; j<c_size; j++) {	
+			d1=0.0; d2=0.0;
+			for (size_t k=0; k<n_size; k++) {
+				delta=gsl_vector_get(eval, k);
+				dx=gsl_vector_get(x_vec, k);
+				dw=gsl_matrix_get(W, j, k);
+				dy=gsl_matrix_get(UltVehiY, i, k);
+				
+				//if (delta==0) {continue;}			
+				d1+=dx*dw/(delta*dl+1.0);
+				d2+=dy*dw/(delta*dl+1.0);
+			}
+			gsl_matrix_set(WHix, j*d_size+i, i, d1); //only entry (j*d_size+i, i) is set; the rest of WHix stays zero
+			gsl_vector_set(WHiy, j*d_size+i, d2);
+		}
+	}
+	//remove the W part: xPx-=WHix^T*Qi*WHix and xPy-=(Qi*WHix)^T*WHiy
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, WHix, 0.0, QiWHix);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, -1.0, WHix, QiWHix, 1.0, xPx);
+	gsl_blas_dgemv(CblasTrans, -1.0, QiWHix, WHiy, 1.0, xPy);
+		
+	//calculate V(beta) and beta; D_l is reused here to receive the result of LUSolve
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (d_size);
+	LUDecomp (xPx, pmt, &sig);
+	LUSolve (xPx, pmt, xPy, D_l);
+	LUInvert (xPx, pmt, Vbeta);
+
+	//need to multiply UltVeh on both sides or one side: beta=UltVeh^T*D_l and Vbeta=UltVeh^T*Vbeta*UltVeh (xPx is reused as scratch)
+	gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, D_l, 0.0, beta);
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Vbeta, UltVeh, 0.0, xPx);
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, xPx, 0.0, Vbeta);	
+
+	//calculate test statistic (dot product of D_l and xPy) and p value
+	//NOTE(review): assumes LUSolve left xPy unchanged - confirm against its definition
+	gsl_blas_ddot(D_l, xPy, &d);
+	
+	p_value=gsl_cdf_chisq_Q (d, (double)d_size);
+	//d*=(double)(n_size-c_size-d_size)/((double)d_size*(double)(n_size-c_size-1));
+	//p_value=gsl_cdf_fdist_Q (d, (double)d_size, (double)(n_size-c_size-d_size));	
+	
+	gsl_vector_free(D_l);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_matrix_free(Qi);
+	gsl_matrix_free(WHix);	
+	gsl_matrix_free(QiWHix);
+	
+	gsl_matrix_free(xPx);	
+	gsl_vector_free(xPy);
+	gsl_vector_free(WHiy);
+	
+	gsl_permutation_free(pmt);
+	
+	return p_value;
+}
+
+
+
+/*
+ calculate B and its standard error (which is a matrix of the same dimension as B)
+
+ Sizes are taken from the arguments: n_size=eval->size, c_size=W->size1 and
+ d_size=V_g->size1. UltVehiY is overwritten with UltVehi*Y.
+ The coefficient vector beta (length d_size*c_size) is built block by block as
+ UltVeh^T times the matching block of Qi*WHiy, and Vbeta block (i, j) is
+ UltVeh^T*Qi(i, j)*UltVeh. B(i, j) is then beta[j*d_size+i] and se_B(i, j) is
+ the square root of the matching diagonal entry of Vbeta.
+ NOTE(review): the final copy loop runs over B->size1 and B->size2, so B and
+ se_B are presumably d_size by c_size - confirm against the caller.
+ */
+void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_matrix *B, gsl_matrix *se_B) 
+{
+	size_t n_size=eval->size, c_size=W->size1, d_size=V_g->size1;
+	size_t dc_size=d_size*c_size;
+	double delta, dl, d, dy, dw, logdet_Ve, logdet_Q;
+	
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *Qi_temp=gsl_matrix_alloc (dc_size, dc_size);
+	//gsl_vector *UltVehiy=gsl_vector_alloc (d_size);
+	gsl_vector *WHiy=gsl_vector_alloc (dc_size);
+	gsl_vector *QiWHiy=gsl_vector_alloc (dc_size);
+	gsl_vector *beta=gsl_vector_alloc (dc_size);
+	gsl_matrix *Vbeta=gsl_matrix_alloc (dc_size, dc_size);
+	
+	gsl_vector_set_zero (WHiy);
+	
+	//eigen decomposition and calculate log|Ve| (the returned value is stored but not used again in this function)
+	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);	
+	
+	//calculate Qi and log|Q| (the returned value is stored but not used again in this function)
+	logdet_Q=CalcQi (eval, D_l, W, Qi);	
+	
+	//calculate UltVehiY
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
+	
+	//calculate WHiy: entry j*d_size+i is the sum over k of UltVehiY(i,k)*W(j,k)/(delta*dl+1.0)
+	for (size_t i=0; i<d_size; i++) {
+		dl=gsl_vector_get(D_l, i);
+				
+		for (size_t j=0; j<c_size; j++) {	
+			d=0.0;
+			for (size_t k=0; k<n_size; k++) {
+				delta=gsl_vector_get(eval, k);
+				dw=gsl_matrix_get(W, j, k);
+				dy=gsl_matrix_get(UltVehiY, i, k);
+				
+				//if (delta==0) {continue;}			
+				d+=dy*dw/(delta*dl+1.0);
+			}
+			gsl_vector_set(WHiy, j*d_size+i, d);
+		}
+	}
+	
+	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, WHiy, 0.0, QiWHiy);
+	
+	//need to multiply I_c\otimes UltVeh on both sides or one side
+	for (size_t i=0; i<c_size; i++) {
+		gsl_vector_view QiWHiy_sub=gsl_vector_subvector(QiWHiy, i*d_size, d_size);
+		gsl_vector_view beta_sub=gsl_vector_subvector(beta, i*d_size, d_size);		
+		gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, &QiWHiy_sub.vector, 0.0, &beta_sub.vector);
+	
+		for (size_t j=0; j<c_size; j++) {
+			gsl_matrix_view Qi_sub=gsl_matrix_submatrix (Qi, i*d_size, j*d_size, d_size, d_size);
+			gsl_matrix_view Qitemp_sub=gsl_matrix_submatrix (Qi_temp, i*d_size, j*d_size, d_size, d_size);
+			gsl_matrix_view Vbeta_sub=gsl_matrix_submatrix (Vbeta, i*d_size, j*d_size, d_size, d_size);
+			
+			if (j<i) { //block (j, i) was filled in an earlier pass of the outer loop; copy its transpose
+				gsl_matrix_view Vbeta_sym=gsl_matrix_submatrix (Vbeta, j*d_size, i*d_size, d_size, d_size);
+				gsl_matrix_transpose_memcpy (&Vbeta_sub.matrix, &Vbeta_sym.matrix);
+			} else { //Vbeta block = UltVeh^T*Qi block*UltVeh, with the Qi_temp block as scratch
+				gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &Qi_sub.matrix, UltVeh, 0.0, &Qitemp_sub.matrix);
+				gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, &Qitemp_sub.matrix, 0.0, &Vbeta_sub.matrix);
+			}			
+		}
+	}
+	
+	//copy beta to B, and the square roots of the diagonal of Vbeta to se_B
+	for (size_t j=0; j<B->size2; j++) {
+		for (size_t i=0; i<B->size1; i++) {
+			gsl_matrix_set(B, i, j, gsl_vector_get(beta, j*d_size+i));
+			gsl_matrix_set(se_B, i, j, sqrt(gsl_matrix_get(Vbeta, j*d_size+i, j*d_size+i)));
+		}
+	}	
+	
+	//free matrices
+	gsl_vector_free(D_l);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_matrix_free(Qi);
+	gsl_matrix_free(Qi_temp);
+	gsl_vector_free(WHiy);
+	gsl_vector_free(QiWHiy);
+	gsl_vector_free(beta);
+	gsl_matrix_free(Vbeta);
+		
+	return;
+}
+
+
+
+//below are functions for Newton-Raphson's algorithm
+
+
+
+
+
+//calculate all Hi (block k is stored in Hi_all at columns k*d_size onward) and return logdet_H=\sum_{k=1}^{n}log|H_k|
+//and calculate Qi (transformed block by block with UltVeh) and return logdet_Q
+//NOTE(review): this comment used to list yPy as well, but no yPy is computed in this function
+void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *Hi_all, gsl_matrix *Qi, double &logdet_H, double &logdet_Q)
+{
+	gsl_matrix_set_zero (Hi_all);
+	gsl_matrix_set_zero (Qi);
+	logdet_H=0.0; logdet_Q=0.0;
+	
+	size_t n_size=eval->size, c_size=X->size1, d_size=V_g->size1;
+	double logdet_Ve=0.0, delta, dl, d;	
+	
+	gsl_matrix *mat_dd=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	
+	//calculate D_l, UltVeh and UltVehi
+	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);
+	
+	//calculate each Hi and log|H_k|; logdet_H starts at n_size*logdet_Ve and gains log(delta*dl+1.0) for every (k, i)
+	logdet_H=(double)n_size*logdet_Ve;
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		
+		gsl_matrix_memcpy (mat_dd, UltVehi);
+		for (size_t i=0; i<d_size; i++) {
+			dl=gsl_vector_get(D_l, i);
+			d=delta*dl+1.0;
+			
+			gsl_vector_view mat_row=gsl_matrix_row (mat_dd, i);
+			gsl_vector_scale (&mat_row.vector, 1.0/d);
+			
+			logdet_H+=log(d);
+		}
+		//mat_dd now holds UltVehi with row i divided by (delta*dl_i+1.0); block k becomes UltVehi^T*mat_dd
+		gsl_matrix_view Hi_k=gsl_matrix_submatrix(Hi_all, 0, k*d_size, d_size, d_size);
+		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVehi, mat_dd, 0.0, &Hi_k.matrix);
+	}	
+	
+	//calculate Qi, and multiply I\otimes UtVeh on both side
+	//and calculate logdet_Q, don't forget to subtract c_size*logdet_Ve
+	logdet_Q=CalcQi (eval, D_l, X, Qi)-(double)c_size*logdet_Ve;
+		
+	for (size_t i=0; i<c_size; i++) {
+		for (size_t j=0; j<c_size; j++) {
+			gsl_matrix_view Qi_sub=gsl_matrix_submatrix (Qi, i*d_size, j*d_size, d_size, d_size);
+			if (j<i) { //block (j, i) was already transformed in an earlier pass of the outer loop; copy its transpose
+				gsl_matrix_view Qi_sym=gsl_matrix_submatrix (Qi, j*d_size, i*d_size, d_size, d_size);
+				gsl_matrix_transpose_memcpy (&Qi_sub.matrix, &Qi_sym.matrix);
+			} else { //replace the block in place by UltVeh^T*block*UltVeh, with mat_dd as scratch
+				gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &Qi_sub.matrix, UltVeh, 0.0, mat_dd);
+				gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, mat_dd, 0.0, &Qi_sub.matrix);
+			}
+		}
+	}
+
+	//free memory
+	gsl_matrix_free(mat_dd);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_vector_free(D_l);
+	
+	return;
+}
+
+
+
+
+//for every column k of Y, multiply it by the k-th d by d block of Hi_all
+//(columns k*d onward) and store the product in column k of Hiy_all
+void Calc_Hiy_all (const gsl_matrix *Y, const gsl_matrix *Hi_all, gsl_matrix *Hiy_all)
+{
+	gsl_matrix_set_zero (Hiy_all);
+
+	const size_t n_cols=Y->size2;
+	const size_t blk=Y->size1;
+
+	size_t col=0;
+	while (col<n_cols) {
+		gsl_matrix_const_view block=gsl_matrix_const_submatrix(Hi_all, 0, col*blk, blk, blk);
+		gsl_vector_const_view in_col=gsl_matrix_const_column(Y, col);
+		gsl_vector_view out_col=gsl_matrix_column(Hiy_all, col);
+
+		gsl_blas_dgemv (CblasNoTrans, 1.0, &block.matrix, &in_col.vector, 0.0, &out_col.vector);
+		++col;
+	}
+}
+
+
+//fill xHi_all block by block: the block at rows i*d, columns k*d is the
+//k-th d by d block of Hi_all scaled by the scalar X(i, k)
+void Calc_xHi_all (const gsl_matrix *X, const gsl_matrix *Hi_all, gsl_matrix *xHi_all)
+{
+	gsl_matrix_set_zero (xHi_all);
+
+	const size_t n_cols=X->size2;
+	const size_t n_rows=X->size1;
+	const size_t blk=Hi_all->size1;
+
+	for (size_t col=0; col!=n_cols; ++col) {
+		const size_t col_off=col*blk;
+		gsl_matrix_const_view src=gsl_matrix_const_submatrix(Hi_all, 0, col_off, blk, blk);
+
+		for (size_t row=0; row!=n_rows; ++row) {
+			const double x_val=gsl_matrix_get (X, row, col);
+			gsl_matrix_view dst=gsl_matrix_submatrix(xHi_all, row*blk, col_off, blk, blk);
+
+			//copy the block first, then scale the copy in place
+			gsl_matrix_memcpy(&dst.matrix, &src.matrix);
+			gsl_matrix_scale(&dst.matrix, x_val);
+		}
+	}
+}
+
+
+//return the scalar yHiy: the sum, over the columns k of Y, of the dot
+//product between column k of Hiy_all and column k of Y
+double Calc_yHiy (const gsl_matrix *Y, const gsl_matrix *Hiy_all)
+{
+	const size_t n_cols=Y->size2;
+	double total=0.0;
+
+	for (size_t col=0; col!=n_cols; ++col) {
+		double part;
+		gsl_vector_const_view y_col=gsl_matrix_const_column(Y, col);
+		gsl_vector_const_view hiy_col=gsl_matrix_const_column(Hiy_all, col);
+
+		gsl_blas_ddot (&hiy_col.vector, &y_col.vector, &part);
+		total+=part;
+	}
+
+	return total;
+}
+
+
+//accumulate the vector xHiy: start from zero and, for every column k of Y,
+//add (dc by d block k of xHi) times (column k of Y)
+void Calc_xHiy (const gsl_matrix *Y, const gsl_matrix *xHi, gsl_vector *xHiy)
+{
+	gsl_vector_set_zero (xHiy);
+
+	const size_t n_cols=Y->size2;
+	const size_t blk=Y->size1;
+	const size_t n_rows=xHi->size1;
+
+	size_t col=0;
+	while (col<n_cols) {
+		gsl_matrix_const_view block=gsl_matrix_const_submatrix(xHi, 0, col*blk, n_rows, blk);
+		gsl_vector_const_view y_col=gsl_matrix_const_column(Y, col);
+
+		//beta=1.0 keeps the running sum already stored in xHiy
+		gsl_blas_dgemv (CblasNoTrans, 1.0, &block.matrix, &y_col.vector, 1.0, xHiy);
+		++col;
+	}
+}
+
+
+
+
+//map an unordered pair (i, j), 0<=i,j<d_size, to a single index that
+//counts the upper triangle (diagonal included) row by row;
+//an out-of-range argument prints a message and yields 0
+size_t GetIndex (const size_t i, const size_t j, const size_t d_size)
+{
+	if (!(i<d_size && j<d_size)) {
+		cout<<"error in GetIndex."<<endl;
+		return 0;
+	}
+
+	//lo is the smaller of the two indices, hi the larger
+	const size_t lo=(j<i) ? j : i;
+	const size_t hi=(j<i) ? i : j;
+
+	const size_t row_start=(2*d_size-lo+1)*lo/2;
+	return row_start+hi-lo;
+}
+
+
+
+//sum Hiy(i,k)*Hiy(j,k) over k into yHiDHiy_e, and the same products
+//weighted by eval[k] into yHiDHiy_g; every term is doubled when i!=j
+void Calc_yHiDHiy (const gsl_vector *eval, const gsl_matrix *Hiy, const size_t i, const size_t j, double &yHiDHiy_g, double &yHiDHiy_e)
+{
+	yHiDHiy_g=0.0;
+	yHiDHiy_e=0.0;
+
+	const size_t n_eval=eval->size;
+	//multiplying by 1.0 leaves a term unchanged, so one formula covers both cases
+	const double weight=(i==j) ? 1.0 : 2.0;
+
+	for (size_t idx=0; idx!=n_eval; ++idx) {
+		const double ev=gsl_vector_get (eval, idx);
+		const double a=gsl_matrix_get (Hiy, i, idx);
+		const double b=gsl_matrix_get (Hiy, j, idx);
+
+		yHiDHiy_g+=ev*a*b*weight;
+		yHiDHiy_e+=a*b*weight;
+	}
+}
+
+
+
+//accumulate, over k, Hiy(j,k) times column k*d_size+i of xHi into
+//xHiDHiy_e, and the same term weighted by eval[k] into xHiDHiy_g;
+//when i!=j the term with i and j swapped is added as well
+void Calc_xHiDHiy (const gsl_vector *eval, const gsl_matrix *xHi, const gsl_matrix *Hiy, const size_t i, const size_t j, gsl_vector *xHiDHiy_g, gsl_vector *xHiDHiy_e)
+{
+	gsl_vector_set_zero(xHiDHiy_g);
+	gsl_vector_set_zero(xHiDHiy_e);
+
+	const size_t n_eval=eval->size;
+	const size_t blk=Hiy->size1;
+	const bool off_diag=(i!=j);
+
+	for (size_t idx=0; idx!=n_eval; ++idx) {
+		const double ev=gsl_vector_get (eval, idx);
+		const size_t base=idx*blk;
+
+		gsl_vector_const_view col_first=gsl_matrix_const_column (xHi, base+i);
+		double coef=gsl_matrix_get (Hiy, j, idx);
+
+		gsl_blas_daxpy (coef*ev, &col_first.vector, xHiDHiy_g);
+		gsl_blas_daxpy (coef, &col_first.vector, xHiDHiy_e);
+
+		if (!off_diag) {continue;}
+
+		gsl_vector_const_view col_second=gsl_matrix_const_column (xHi, base+j);
+		coef=gsl_matrix_get (Hiy, i, idx);
+
+		gsl_blas_daxpy (coef*ev, &col_second.vector, xHiDHiy_g);
+		gsl_blas_daxpy (coef, &col_second.vector, xHiDHiy_e);
+	}
+}
+
+/*
+ For one index pair (i, j): accumulate over k the outer product of columns
+ k*d_size+i and k*d_size+j of xHi into xHiDHix_e, and the same outer product
+ multiplied by eval[k] into xHiDHix_g. When i!=j the transposed outer product
+ is added too. Both outputs are zeroed first.
+ d_size is derived as xHi->size2/n_size and the outputs are used as
+ dc_size by dc_size matrices, with dc_size=xHi->size1.
+ */
+void Calc_xHiDHix (const gsl_vector *eval, const gsl_matrix *xHi, const size_t i, const size_t j, gsl_matrix *xHiDHix_g, gsl_matrix *xHiDHix_e)
+{
+	gsl_matrix_set_zero(xHiDHix_g);
+	gsl_matrix_set_zero(xHiDHix_e);
+	
+	size_t n_size=eval->size, dc_size=xHi->size1;
+	size_t d_size=xHi->size2/n_size;
+	
+	double delta;
+	
+	gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *mat_dcdc_t=gsl_matrix_alloc (dc_size, dc_size);
+	
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		
+		gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i);
+		gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j);
+		
+		gsl_matrix_set_zero (mat_dcdc);
+		gsl_blas_dger (1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc);
+		//the transpose is taken before mat_dcdc is scaled by delta below
+		gsl_matrix_transpose_memcpy (mat_dcdc_t, mat_dcdc);
+		
+		gsl_matrix_add (xHiDHix_e, mat_dcdc);
+		
+		gsl_matrix_scale (mat_dcdc, delta);
+		gsl_matrix_add (xHiDHix_g, mat_dcdc);
+		
+		if (i!=j) {
+			gsl_matrix_add (xHiDHix_e, mat_dcdc_t);		
+			
+			gsl_matrix_scale (mat_dcdc_t, delta);
+			gsl_matrix_add (xHiDHix_g, mat_dcdc_t);
+		}
+	}
+	
+	gsl_matrix_free(mat_dcdc);
+	gsl_matrix_free(mat_dcdc_t);
+	
+	return;
+}
+
+
+/*
+ For two index pairs (i1, j1) and (i2, j2): accumulate over k products of the
+ form Hiy(a,k)*Hi(b, k*d_size+c)*Hiy(e,k). The _ee output takes the plain
+ products, _ge the products multiplied by eval[k] and _gg the products
+ multiplied by eval[k]*eval[k]. All three outputs are zeroed first.
+ Extra terms with the roles of i1/j1 swapped are added when i1!=j1, and extra
+ terms with the roles of i2/j2 swapped are added when i2!=j2.
+ Hi is read as d_size by d_size blocks laid side by side (block k starts at
+ column k*d_size), with d_size=Hiy->size1.
+ */
+void Calc_yHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *Hiy, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yHiDHiDHiy_gg, double &yHiDHiDHiy_ee, double &yHiDHiDHiy_ge)
+{
+	yHiDHiDHiy_gg=0.0;
+	yHiDHiDHiy_ee=0.0;
+	yHiDHiDHiy_ge=0.0;
+	
+	size_t n_size=eval->size, d_size=Hiy->size1;
+	
+	double delta, d_Hiy_i1, d_Hiy_j1, d_Hiy_i2, d_Hiy_j2, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+	
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		
+		d_Hiy_i1=gsl_matrix_get (Hiy, i1, k);
+		d_Hiy_j1=gsl_matrix_get (Hiy, j1, k);
+		d_Hiy_i2=gsl_matrix_get (Hiy, i2, k);
+		d_Hiy_j2=gsl_matrix_get (Hiy, j2, k);
+		//entries of block k of Hi for the four row/column combinations
+		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); 
+		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); 
+		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); 
+		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); 
+		
+		if (i1==j1) {			
+			yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2);
+			yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2);
+			yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2);
+			
+			if (i2!=j2) {				
+				yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2);
+				yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2);
+				yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2);
+			}
+		} else {
+			yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2);
+			yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2);
+			yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2);
+						
+			if (i2!=j2) {
+				yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2+d_Hiy_j1*d_Hi_i1j2*d_Hiy_i2);
+				yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2+d_Hiy_j1*d_Hi_i1j2*d_Hiy_i2);
+				yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2+d_Hiy_j1*d_Hi_i1j2*d_Hiy_i2);
+			}
+		}
+	}
+	
+	return;
+}
+
+/*
+ For two index pairs (i1, j1) and (i2, j2): accumulate over k columns of xHi
+ (column k*d_size+i1 or k*d_size+j1) scaled by an entry of block k of Hi times
+ an entry of Hiy. The _ee output takes the plain coefficient, _ge the
+ coefficient multiplied by eval[k] and _gg the coefficient multiplied by
+ eval[k]*eval[k]. All three output vectors are zeroed first.
+ Note that d_Hiy_i is Hiy(i2, k) and d_Hiy_j is Hiy(j2, k), i.e. both refer to
+ the second index pair.
+ */
+void Calc_xHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const size_t i1, const size_t j1, const size_t i2, const size_t j2, gsl_vector *xHiDHiDHiy_gg, gsl_vector *xHiDHiDHiy_ee, gsl_vector *xHiDHiDHiy_ge)
+{
+	gsl_vector_set_zero(xHiDHiDHiy_gg);
+	gsl_vector_set_zero(xHiDHiDHiy_ee);
+	gsl_vector_set_zero(xHiDHiDHiy_ge);
+	
+	size_t n_size=eval->size, d_size=Hiy->size1;
+	
+	double delta, d_Hiy_i, d_Hiy_j, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+	
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		
+		gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i1);
+		gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j1);
+		
+		d_Hiy_i=gsl_matrix_get (Hiy, i2, k);
+		d_Hiy_j=gsl_matrix_get (Hiy, j2, k);
+		//entries of block k of Hi for the four row/column combinations
+		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); 
+		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); 
+		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); 
+		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); 
+		
+		if (i1==j1) {
+			gsl_blas_daxpy (delta*delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_gg);
+			gsl_blas_daxpy (d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee);
+			gsl_blas_daxpy (delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ge);
+			
+			if (i2!=j2) {
+				gsl_blas_daxpy (delta*delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_gg);
+				gsl_blas_daxpy (d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ee);
+				gsl_blas_daxpy (delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ge);
+			}
+		} else {			
+			gsl_blas_daxpy (delta*delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_gg);
+			gsl_blas_daxpy (d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee);
+			gsl_blas_daxpy (delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ge);
+			
+			gsl_blas_daxpy (delta*delta*d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_gg);
+			gsl_blas_daxpy (d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_ee);
+			gsl_blas_daxpy (delta*d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_ge);
+			
+			if (i2!=j2) {
+				gsl_blas_daxpy (delta*delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_gg);
+				gsl_blas_daxpy (d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ee);
+				gsl_blas_daxpy (delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ge);
+				
+				gsl_blas_daxpy (delta*delta*d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_gg);
+				gsl_blas_daxpy (d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_ee);
+				gsl_blas_daxpy (delta*d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_ge);
+			}
+		}
+	}
+	
+	return;
+}
+
+/*
+ For two index pairs (i1, j1) and (i2, j2): accumulate over k outer products
+ of two columns of xHi (taken from block k) scaled by an entry of block k of
+ Hi. Each outer product is built in the scratch matrix mat_dcdc, added to the
+ _ee output, scaled in place by eval[k] and added to _ge, then scaled by
+ eval[k] again and added to _gg - so the order of the add/scale calls matters.
+ All three output matrices are zeroed first. Here d_size=Hi->size1 and the
+ outputs are used as dc_size by dc_size matrices, with dc_size=xHi->size1.
+ */
+void Calc_xHiDHiDHix (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const size_t i1, const size_t j1, const size_t i2, const size_t j2, gsl_matrix *xHiDHiDHix_gg, gsl_matrix *xHiDHiDHix_ee, gsl_matrix *xHiDHiDHix_ge)
+{
+	gsl_matrix_set_zero(xHiDHiDHix_gg);
+	gsl_matrix_set_zero(xHiDHiDHix_ee);
+	gsl_matrix_set_zero(xHiDHiDHix_ge);
+	
+	size_t n_size=eval->size, d_size=Hi->size1, dc_size=xHi->size1;
+	
+	double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+	
+	gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size);
+	
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		
+		gsl_vector_const_view xHi_col_i1=gsl_matrix_const_column (xHi, k*d_size+i1);
+		gsl_vector_const_view xHi_col_j1=gsl_matrix_const_column (xHi, k*d_size+j1);
+		gsl_vector_const_view xHi_col_i2=gsl_matrix_const_column (xHi, k*d_size+i2);
+		gsl_vector_const_view xHi_col_j2=gsl_matrix_const_column (xHi, k*d_size+j2);	
+		//entries of block k of Hi for the four row/column combinations
+		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); 
+		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); 
+		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); 
+		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2); 
+		
+		if (i1==j1) {
+			gsl_matrix_set_zero (mat_dcdc);
+			gsl_blas_dger (d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector, mat_dcdc);
+			
+			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+
+			if (i2!=j2) {
+				gsl_matrix_set_zero (mat_dcdc);
+				gsl_blas_dger (d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector, mat_dcdc);
+				
+				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+			}
+		} else {
+			gsl_matrix_set_zero (mat_dcdc);
+			gsl_blas_dger (d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector, mat_dcdc);
+			
+			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+			
+			gsl_matrix_set_zero (mat_dcdc);
+			gsl_blas_dger (d_Hi_i1i2, &xHi_col_j1.vector, &xHi_col_j2.vector, mat_dcdc);
+			
+			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+			gsl_matrix_scale(mat_dcdc, delta);
+			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+			
+			if (i2!=j2) {
+				gsl_matrix_set_zero (mat_dcdc);
+				gsl_blas_dger (d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector, mat_dcdc);
+				
+				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+				
+				gsl_matrix_set_zero (mat_dcdc);
+				gsl_blas_dger (d_Hi_i1j2, &xHi_col_j1.vector, &xHi_col_i2.vector, mat_dcdc);
+				
+				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);			
+				gsl_matrix_scale(mat_dcdc, delta);
+				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+			}
+		}
+	}
+	
+	gsl_matrix_free(mat_dcdc);
+	
+	return;
+}
+
+
+
+//sum entry (j, i) of every d by d block of Hi into tHiD_e, and the same
+//entries weighted by eval[k] into tHiD_g; every term is doubled when i!=j
+void Calc_traceHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i, const size_t j, double &tHiD_g, double &tHiD_e) 
+{
+	tHiD_g=0.0;
+	tHiD_e=0.0;
+
+	const size_t n_eval=eval->size;
+	const size_t blk=Hi->size1;
+	//multiplying by 1.0 leaves a term unchanged, so one formula covers both cases
+	const double weight=(i==j) ? 1.0 : 2.0;
+
+	for (size_t idx=0; idx!=n_eval; ++idx) {
+		const double ev=gsl_vector_get (eval, idx);
+		const double entry=gsl_matrix_get (Hi, j, idx*blk+i);
+
+		tHiD_g+=ev*entry*weight;
+		tHiD_e+=entry*weight;
+	}
+}
+
+/*
+ For two index pairs (i1, j1) and (i2, j2): accumulate over k products of two
+ entries of block k of Hi. The _ee output takes the plain products, _ge the
+ products multiplied by eval[k] and _gg the products multiplied by
+ eval[k]*eval[k]. All three outputs are zeroed first. Additional products are
+ added when i1!=j1 and/or i2!=j2.
+ Hi is read as d_size by d_size blocks laid side by side (block k starts at
+ column k*d_size), with d_size=Hi->size1.
+ */
+void Calc_traceHiDHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tHiDHiD_gg, double &tHiDHiD_ee, double &tHiDHiD_ge) 
+{
+	tHiDHiD_gg=0.0;
+	tHiDHiD_ee=0.0;
+	tHiDHiD_ge=0.0;
+	
+	size_t n_size=eval->size, d_size=Hi->size1;
+	double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+	
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		//entries of block k of Hi for the four row/column combinations
+		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2); 
+		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2); 
+		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2); 
+		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2);
+		
+		if (i1==j1) {
+			tHiDHiD_gg+=delta*delta*d_Hi_i1j2*d_Hi_j1i2;
+			tHiDHiD_ee+=d_Hi_i1j2*d_Hi_j1i2;
+			tHiDHiD_ge+=delta*d_Hi_i1j2*d_Hi_j1i2;
+			
+			if (i2!=j2) {
+				tHiDHiD_gg+=delta*delta*d_Hi_i1i2*d_Hi_j1j2;
+				tHiDHiD_ee+=d_Hi_i1i2*d_Hi_j1j2;
+				tHiDHiD_ge+=delta*d_Hi_i1i2*d_Hi_j1j2;
+			}
+		} else {
+			tHiDHiD_gg+=delta*delta*(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2);
+			tHiDHiD_ee+=(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2);
+			tHiDHiD_ge+=delta*(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2);
+			
+			if (i2!=j2) {
+				tHiDHiD_gg+=delta*delta*(d_Hi_i1i2*d_Hi_j1j2+d_Hi_j1i2*d_Hi_i1j2);
+				tHiDHiD_ee+=(d_Hi_i1i2*d_Hi_j1j2+d_Hi_j1i2*d_Hi_i1j2);
+				tHiDHiD_ge+=delta*(d_Hi_i1i2*d_Hi_j1j2+d_Hi_j1i2*d_Hi_i1j2);
+			}
+		}
+	}
+	
+	return;
+}
+
+
+//trace(PD)=trace((Hi-HixQixHi)D)=trace(HiD)-trace(HixQixHiD)
+//the first trace comes from Calc_traceHiD; the second is removed row by
+//row as the dot product of row r of Qi with column r of the (i, j) block
+//of xHiDHix_all_g / xHiDHix_all_e (block position given by GetIndex)
+void Calc_tracePD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHiDHix_all_g, const gsl_matrix *xHiDHix_all_e, const size_t i, const size_t j, double &tPD_g, double &tPD_e) 
+{
+	const size_t n_rows=Qi->size1;
+	const size_t blk=Hi->size1;
+	const size_t col_base=GetIndex(i, j, blk)*n_rows;
+
+	//first part: trace(HiD)
+	Calc_traceHiD (eval, Hi, i, j, tPD_g, tPD_e);
+
+	//second part: -trace(HixQixHiD)
+	for (size_t r=0; r!=n_rows; ++r) {
+		double dot;
+		gsl_vector_const_view q_row=gsl_matrix_const_row (Qi, r);
+		gsl_vector_const_view g_col=gsl_matrix_const_column (xHiDHix_all_g, col_base+r);
+		gsl_vector_const_view e_col=gsl_matrix_const_column (xHiDHix_all_e, col_base+r);
+
+		gsl_blas_ddot(&q_row.vector, &g_col.vector, &dot);
+		tPD_g-=dot;
+
+		gsl_blas_ddot(&q_row.vector, &e_col.vector, &dot);
+		tPD_e-=dot;
+	}
+}
+
+
+/*
+ Computes the three traces tPDPD_gg, tPDPD_ee and tPDPD_ge for the index pairs
+ (i1, j1) and (i2, j2), which GetIndex maps to v1 and v2. The first part comes
+ from Calc_traceHiDHiD. The second and third parts are subtracted together as
+ twice the dot product of row i of Qi with column (v1*v_size+v2)*dc_size+i of
+ the xHiDHiDHix_all_* matrices. The fourth part adds dot products between
+ row i of block v1 and column i of block v2 of the QixHiDHix_all_* matrices.
+ The xHi argument is not referenced in the body.
+ */
+//trace(PDPD)=trace((Hi-HixQixHi)D(Hi-HixQixHi)D)
+//=trace(HiDHiD)-trace(HixQixHiDHiD)-trace(HiDHixQixHiD)+trace(HixQixHiDHixQixHiD)
+void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tPDPD_gg, double &tPDPD_ee, double &tPDPD_ge) 
+{
+	size_t dc_size=Qi->size1, d_size=Hi->size1;
+	size_t v_size=d_size*(d_size+1)/2;
+	size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size);
+	
+	double d;
+	
+	//calculate the first part: trace(HiDHiD)
+	Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge);
+
+	//calculate the second and third parts: -trace(HixQixHiDHiD)-trace(HiDHixQixHiD)
+	//(both are taken from the same dot product, hence the factor 2.0 below)
+	for (size_t i=0; i<dc_size; i++) {
+		gsl_vector_const_view Qi_row=gsl_matrix_const_row (Qi, i);
+		gsl_vector_const_view xHiDHiDHix_gg_col=gsl_matrix_const_column (xHiDHiDHix_all_gg, (v1*v_size+v2)*dc_size+i);
+		gsl_vector_const_view xHiDHiDHix_ee_col=gsl_matrix_const_column (xHiDHiDHix_all_ee, (v1*v_size+v2)*dc_size+i);
+		gsl_vector_const_view xHiDHiDHix_ge_col=gsl_matrix_const_column (xHiDHiDHix_all_ge, (v1*v_size+v2)*dc_size+i);
+
+		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_gg_col.vector, &d);
+		tPDPD_gg-=d*2.0;
+		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ee_col.vector, &d);
+		tPDPD_ee-=d*2.0;
+		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ge_col.vector, &d);
+		tPDPD_ge-=d*2.0;
+		/*
+		gsl_vector_const_view xHiDHiDHix_gg_row=gsl_matrix_const_row (xHiDHiDHix_gg, i);
+		gsl_vector_const_view xHiDHiDHix_ee_row=gsl_matrix_const_row (xHiDHiDHix_ee, i);
+		gsl_vector_const_view xHiDHiDHix_ge_row=gsl_matrix_const_row (xHiDHiDHix_ge, i);
+		
+		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_gg_row.vector, &d);
+		tPDPD_gg-=d;
+		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ee_row.vector, &d);
+		tPDPD_ee-=d;
+		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ge_row.vector, &d);
+		tPDPD_ge-=d;
+		 */
+	}
+
+	//calculate the fourth part: trace(HixQixHiDHixQixHiD)
+	for (size_t i=0; i<dc_size; i++) {		
+		//gsl_vector_const_view QixHiDHix_g_row1=gsl_matrix_const_subrow (QixHiDHix_all_g, i, v1*dc_size, dc_size);
+		//gsl_vector_const_view QixHiDHix_e_row1=gsl_matrix_const_subrow (QixHiDHix_all_e, i, v1*dc_size, dc_size);
+
+		gsl_vector_const_view QixHiDHix_g_fullrow1=gsl_matrix_const_row (QixHiDHix_all_g, i);
+		gsl_vector_const_view QixHiDHix_e_fullrow1=gsl_matrix_const_row (QixHiDHix_all_e, i);
+		gsl_vector_const_view QixHiDHix_g_row1=gsl_vector_const_subvector (&QixHiDHix_g_fullrow1.vector, v1*dc_size, dc_size);
+		gsl_vector_const_view QixHiDHix_e_row1=gsl_vector_const_subvector (&QixHiDHix_e_fullrow1.vector, v1*dc_size, dc_size);
+
+		gsl_vector_const_view QixHiDHix_g_col2=gsl_matrix_const_column (QixHiDHix_all_g, v2*dc_size+i);
+		gsl_vector_const_view QixHiDHix_e_col2=gsl_matrix_const_column (QixHiDHix_all_e, v2*dc_size+i);
+
+		gsl_blas_ddot(&QixHiDHix_g_row1.vector, &QixHiDHix_g_col2.vector, &d);
+		tPDPD_gg+=d;
+		gsl_blas_ddot(&QixHiDHix_e_row1.vector, &QixHiDHix_e_col2.vector, &d);
+		tPDPD_ee+=d;
+		gsl_blas_ddot(&QixHiDHix_g_row1.vector, &QixHiDHix_e_col2.vector, &d); //NOTE(review): pairs the g row of block v1 with the e column of block v2 only - confirm this asymmetry is intended
+		tPDPD_ge+=d;
+	}		
+
+	return;
+}
+
+
+
+//calculate (xHiDHiy) for every pair i<=j: the result for a pair is written
+//into the column of xHiDHiy_all_g / xHiDHiy_all_e selected by GetIndex(i, j)
+void Calc_xHiDHiy_all (const gsl_vector *eval, const gsl_matrix *xHi, const gsl_matrix *Hiy, gsl_matrix *xHiDHiy_all_g, gsl_matrix *xHiDHiy_all_e)
+{
+	gsl_matrix_set_zero(xHiDHiy_all_g);
+	gsl_matrix_set_zero(xHiDHiy_all_e);
+
+	const size_t dim=Hiy->size1;
+
+	for (size_t row=0; row<dim; ++row) {
+		//starting at col=row visits exactly the pairs with col>=row
+		for (size_t col=row; col<dim; ++col) {
+			const size_t slot=GetIndex(row, col, dim);
+
+			gsl_vector_view out_g=gsl_matrix_column (xHiDHiy_all_g, slot);
+			gsl_vector_view out_e=gsl_matrix_column (xHiDHiy_all_e, slot);
+
+			Calc_xHiDHiy (eval, xHi, Hiy, row, col, &out_g.vector, &out_e.vector);
+		}
+	}
+}
+
+
+//calculate (xHiDHix) for every pair i<=j: the result for a pair is written
+//into the dc by dc block of xHiDHix_all_g / xHiDHix_all_e that starts at
+//column GetIndex(i, j)*dc
+void Calc_xHiDHix_all (const gsl_vector *eval, const gsl_matrix *xHi, gsl_matrix *xHiDHix_all_g, gsl_matrix *xHiDHix_all_e)
+{
+	gsl_matrix_set_zero(xHiDHix_all_g);
+	gsl_matrix_set_zero(xHiDHix_all_e);
+
+	const size_t dim=xHi->size2/eval->size;
+	const size_t blk=xHi->size1;
+
+	for (size_t row=0; row<dim; ++row) {
+		//starting at col=row visits exactly the pairs with col>=row
+		for (size_t col=row; col<dim; ++col) {
+			const size_t col_off=GetIndex(row, col, dim)*blk;
+
+			gsl_matrix_view out_g=gsl_matrix_submatrix (xHiDHix_all_g, 0, col_off, blk, blk);
+			gsl_matrix_view out_e=gsl_matrix_submatrix (xHiDHix_all_e, 0, col_off, blk, blk);
+
+			Calc_xHiDHix (eval, xHi, row, col, &out_g.matrix, &out_e.matrix);
+		}
+	}
+}
+
+
+
+//calculate (xHiDHiDHiy) for every combination of a pair i1<=j1 with a pair
+//i2<=j2: the result is written into column v1*v_size+v2 of the three
+//output matrices, where v1 and v2 are the GetIndex values of the two pairs
+void Calc_xHiDHiDHiy_all (const size_t v_size, const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, gsl_matrix *xHiDHiDHiy_all_gg, gsl_matrix *xHiDHiDHiy_all_ee, gsl_matrix *xHiDHiDHiy_all_ge)
+{
+	gsl_matrix_set_zero(xHiDHiDHiy_all_gg);
+	gsl_matrix_set_zero(xHiDHiDHiy_all_ee);
+	gsl_matrix_set_zero(xHiDHiDHiy_all_ge);
+
+	const size_t dim=Hiy->size1;
+
+	for (size_t a1=0; a1<dim; ++a1) {
+		for (size_t b1=a1; b1<dim; ++b1) {
+			const size_t first=GetIndex(a1, b1, dim);
+
+			for (size_t a2=0; a2<dim; ++a2) {
+				for (size_t b2=a2; b2<dim; ++b2) {
+					const size_t second=GetIndex(a2, b2, dim);
+					const size_t slot=first*v_size+second;
+
+					gsl_vector_view out_gg=gsl_matrix_column (xHiDHiDHiy_all_gg, slot);
+					gsl_vector_view out_ee=gsl_matrix_column (xHiDHiDHiy_all_ee, slot);
+					gsl_vector_view out_ge=gsl_matrix_column (xHiDHiDHiy_all_ge, slot);
+
+					Calc_xHiDHiDHiy (eval, Hi, xHi, Hiy, a1, b1, a2, b2, &out_gg.vector, &out_ee.vector, &out_ge.vector);
+				}
+			}
+		}
+	}
+}
+
+
+//calculate xHiDHiDHix for every combination of index pairs (i1<=j1) and (i2<=j2); each result is a dc_size x dc_size block starting at column (v1*v_size+v2)*dc_size of the three outputs (gg, ee, ge)
+void Calc_xHiDHiDHix_all (const size_t v_size, const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, gsl_matrix *xHiDHiDHix_all_gg, gsl_matrix *xHiDHiDHix_all_ee, gsl_matrix *xHiDHiDHix_all_ge)
+{
+	gsl_matrix_set_zero(xHiDHiDHix_all_gg);
+	gsl_matrix_set_zero(xHiDHiDHix_all_ee);
+	gsl_matrix_set_zero(xHiDHiDHix_all_ge);
+	//d_size = number of columns of xHi per element of eval; dc_size = number of rows of xHi
+	size_t d_size=xHi->size2/eval->size, dc_size=xHi->size1;
+	size_t v1, v2;	
+	
+	for (size_t i1=0; i1<d_size; i1++) {
+		for (size_t j1=0; j1<d_size; j1++) {
+			if (j1<i1) {continue;}
+			v1=GetIndex(i1, j1, d_size);
+			
+			for (size_t i2=0; i2<d_size; i2++) {
+				for (size_t j2=0; j2<d_size; j2++) {
+					if (j2<i2) {continue;}
+					v2=GetIndex(i2, j2, d_size);
+					//only v2>=v1 is computed; the (v2, v1) blocks are filled by copying further down
+					if (v2<v1) {continue;}
+					
+					gsl_matrix_view xHiDHiDHix_gg1=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					gsl_matrix_view xHiDHiDHix_ee1=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					gsl_matrix_view xHiDHiDHix_ge1=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					
+					Calc_xHiDHiDHix (eval, Hi, xHi, i1, j1, i2, j2, &xHiDHiDHix_gg1.matrix, &xHiDHiDHix_ee1.matrix, &xHiDHiDHix_ge1.matrix);
+					//copy the three blocks unchanged (no transpose) into position (v2, v1)
+					if (v2!=v1) {
+						gsl_matrix_view xHiDHiDHix_gg2=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size);
+						gsl_matrix_view xHiDHiDHix_ee2=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size);
+						gsl_matrix_view xHiDHiDHix_ge2=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size);
+					
+						gsl_matrix_memcpy (&xHiDHiDHix_gg2.matrix, &xHiDHiDHix_gg1.matrix);
+						gsl_matrix_memcpy (&xHiDHiDHix_ee2.matrix, &xHiDHiDHix_ee1.matrix);
+						gsl_matrix_memcpy (&xHiDHiDHix_ge2.matrix, &xHiDHiDHix_ge1.matrix);
+					}
+				}
+			}
+		}
+	}
+	
+	//NOTE: everything between the comment delimiters below is a disabled alternative implementation; it is not compiled
+	/*
+	size_t n_size=eval->size;
+	double delta, d_Hi_ij;
+	
+	gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *mat_dcdc_temp=gsl_matrix_alloc (dc_size, dc_size);
+	
+	for (size_t k=0; k<n_size; k++) {
+		delta=gsl_vector_get (eval, k);
+		
+		for (size_t i1=0; i1<d_size; i1++) {
+			for (size_t j2=0; j2<d_size; j2++) {				
+				gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i1);
+				gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j2);
+		
+				gsl_matrix_set_zero (mat_dcdc);
+				gsl_blas_dger (1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc);	
+				
+				for (size_t j1=0; j1<d_size; j1++) {
+					for (size_t i2=0; i2<d_size; i2++) {
+						d_Hi_ij=gsl_matrix_get (Hi, j1, k*d_size+i2); 
+						
+						v1=GetIndex(i1, j1, d_size);
+						v2=GetIndex(i2, j2, d_size);						
+						
+						gsl_matrix_view xHiDHiDHix_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+						gsl_matrix_view xHiDHiDHix_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+						gsl_matrix_view xHiDHiDHix_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+												
+						gsl_matrix_memcpy (mat_dcdc_temp, mat_dcdc);
+						
+						gsl_matrix_scale (mat_dcdc_temp, d_Hi_ij);
+						gsl_matrix_add(&xHiDHiDHix_ee.matrix, mat_dcdc_temp);
+						gsl_matrix_scale(mat_dcdc_temp, delta);
+						gsl_matrix_add(&xHiDHiDHix_ge.matrix, mat_dcdc_temp);
+						gsl_matrix_scale(mat_dcdc_temp, delta);
+						gsl_matrix_add(&xHiDHiDHix_gg.matrix, mat_dcdc_temp);
+					}
+				}
+			}
+		}
+	}
+	
+	for (size_t i1=0; i1<d_size; i1++) {
+		for (size_t j1=0; j1<d_size; j1++) {
+			v1=GetIndex(i1, j1, d_size);
+			
+			for (size_t i2=0; i2<d_size; i2++) {
+				for (size_t j2=0; j2<d_size; j2++) {
+					v2=GetIndex(i2, j2, d_size);
+					
+					if (i1!=j1 && i2!=j2) {continue;}
+					
+					gsl_matrix_view xHiDHiDHix_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					gsl_matrix_view xHiDHiDHix_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					gsl_matrix_view xHiDHiDHix_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+					
+					if ( (i1==j1 && i2!=j2) || (i1!=j1 && i2==j2) ) {
+						gsl_matrix_scale (&xHiDHiDHix_gg.matrix, 0.5);
+						gsl_matrix_scale (&xHiDHiDHix_ee.matrix, 0.5);
+						gsl_matrix_scale (&xHiDHiDHix_ge.matrix, 0.5);
+					} else {
+						gsl_matrix_scale (&xHiDHiDHix_gg.matrix, 0.25);
+						gsl_matrix_scale (&xHiDHiDHix_ee.matrix, 0.25);
+						gsl_matrix_scale (&xHiDHiDHix_ge.matrix, 0.25);
+					}
+				}
+			}
+		}
+	}
+	
+	gsl_matrix_free (mat_dcdc);
+	gsl_matrix_free (mat_dcdc_temp);	
+	*/
+	
+	return;
+}
+
+
+
+//calculate (xHiDHix)Qi(xHiy) for every pair of i, j: column i of each output is the i-th dc_size x dc_size block of the matching xHiDHix_all input multiplied by the vector QixHiy
+void Calc_xHiDHixQixHiy_all (const gsl_matrix *xHiDHix_all_g, const gsl_matrix *xHiDHix_all_e, const gsl_vector *QixHiy, gsl_matrix *xHiDHixQixHiy_all_g, gsl_matrix *xHiDHixQixHiy_all_e)
+{
+	size_t dc_size=xHiDHix_all_g->size1;
+	size_t v_size=xHiDHix_all_g->size2/dc_size;
+	//v_size = number of dc_size-wide column blocks in xHiDHix_all_g; the same count is used for xHiDHix_all_e
+	for (size_t i=0; i<v_size; i++) {		
+		gsl_matrix_const_view xHiDHix_g=gsl_matrix_const_submatrix (xHiDHix_all_g, 0, i*dc_size, dc_size, dc_size);
+		gsl_matrix_const_view xHiDHix_e=gsl_matrix_const_submatrix (xHiDHix_all_e, 0, i*dc_size, dc_size, dc_size);
+		
+		gsl_vector_view xHiDHixQixHiy_g=gsl_matrix_column (xHiDHixQixHiy_all_g, i);
+		gsl_vector_view xHiDHixQixHiy_e=gsl_matrix_column (xHiDHixQixHiy_all_e, i);
+		//beta=0.0, so the output columns are overwritten rather than accumulated into
+		gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHix_g.matrix, QixHiy, 0.0, &xHiDHixQixHiy_g.vector);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHix_e.matrix, QixHiy, 0.0, &xHiDHixQixHiy_e.vector);
+	}
+	
+	return;
+}
+
+//multiply every column of vec_all_g / vec_all_e by Qi and store the result in the same column of Qivec_all_g / Qivec_all_e; used to obtain Qi(xHiDHiy) and Qi(xHiDHix)Qi(xHiy) for each pair of i j (i<=j)
+void Calc_QiVec_all (const gsl_matrix *Qi, const gsl_matrix *vec_all_g, const gsl_matrix *vec_all_e, gsl_matrix *Qivec_all_g, gsl_matrix *Qivec_all_e)
+{
+	for (size_t i=0; i<vec_all_g->size2; i++) {
+		gsl_vector_const_view vec_g=gsl_matrix_const_column (vec_all_g, i);
+		gsl_vector_const_view vec_e=gsl_matrix_const_column (vec_all_e, i);
+		
+		gsl_vector_view Qivec_g=gsl_matrix_column (Qivec_all_g, i);
+		gsl_vector_view Qivec_e=gsl_matrix_column (Qivec_all_e, i);
+		//beta=0.0, so each output column is overwritten with Qi times the input column
+		gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_g.vector, 0.0, &Qivec_g.vector);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_e.vector, 0.0, &Qivec_e.vector);
+	}
+	
+	return;
+}
+
+
+//left-multiply every dc_size x dc_size block of mat_all_g / mat_all_e by Qi and store it in the same block of Qimat_all_g / Qimat_all_e; used to obtain Qi(xHiDHix) for each pair of i j (i<=j)
+void Calc_QiMat_all (const gsl_matrix *Qi, const gsl_matrix *mat_all_g, const gsl_matrix *mat_all_e, gsl_matrix *Qimat_all_g, gsl_matrix *Qimat_all_e)
+{
+	size_t dc_size=Qi->size1;
+	size_t v_size=mat_all_g->size2/mat_all_g->size1;
+	//dc_size comes from Qi while the block count comes from the shape of mat_all_g; NOTE(review): this assumes mat_all_g->size1 equals Qi->size1 - confirm against callers
+	for (size_t i=0; i<v_size; i++) {
+		gsl_matrix_const_view mat_g=gsl_matrix_const_submatrix (mat_all_g, 0, i*dc_size, dc_size, dc_size);
+		gsl_matrix_const_view mat_e=gsl_matrix_const_submatrix (mat_all_e, 0, i*dc_size, dc_size, dc_size);
+		
+		gsl_matrix_view Qimat_g=gsl_matrix_submatrix (Qimat_all_g, 0, i*dc_size, dc_size, dc_size);
+		gsl_matrix_view Qimat_e=gsl_matrix_submatrix (Qimat_all_e, 0, i*dc_size, dc_size, dc_size);
+		//beta=0.0, so each output block is overwritten with Qi times the input block
+		gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, Qi, &mat_g.matrix, 0.0, &Qimat_g.matrix);
+		gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, Qi, &mat_e.matrix, 0.0, &Qimat_e.matrix);
+	}
+	
+	return;
+}
+
+
+
+//calculate yPDPy for the pair (i, j); the two results are returned through yPDPy_g and yPDPy_e
+//yPDPy=y(Hi-HixQixHi)D(Hi-HixQixHi)y
+//=ytHiDHiy
+//-(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy)
+//+(yHix)Qi(xHiDHix)Qi(xHiy)
+void Calc_yPDPy (const gsl_vector *eval, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const size_t i, const size_t j, double &yPDPy_g, double &yPDPy_e)
+{	
+	size_t d_size=Hiy->size1;
+	size_t v=GetIndex(i, j, d_size);
+	//v is the column of the *_all matrices that holds the values for pair (i, j); d is scratch for dot products
+	double d;		
+	
+	//first part: ytHiDHiy (Calc_yHiDHiy, defined elsewhere, receives both outputs by reference; they are adjusted in place below)
+	Calc_yHiDHiy (eval, Hiy, i, j, yPDPy_g, yPDPy_e);
+	
+	//second and third parts: -(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy), computed as a single dot product multiplied by 2.0
+	gsl_vector_const_view xHiDHiy_g=gsl_matrix_const_column (xHiDHiy_all_g, v);
+	gsl_vector_const_view xHiDHiy_e=gsl_matrix_const_column (xHiDHiy_all_e, v);
+	
+	gsl_blas_ddot(QixHiy, &xHiDHiy_g.vector, &d);
+	yPDPy_g-=d*2.0;
+	gsl_blas_ddot(QixHiy, &xHiDHiy_e.vector, &d);
+	yPDPy_e-=d*2.0;	
+	
+	//fourth part: +(yHix)Qi(xHiDHix)Qi(xHiy)
+	gsl_vector_const_view xHiDHixQixHiy_g=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v);
+	gsl_vector_const_view xHiDHixQixHiy_e=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v);
+	
+	gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_g.vector, &d);
+	yPDPy_g+=d;
+	gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_e.vector, &d);
+	yPDPy_e+=d;
+
+	return;
+}
+
+//calculate yPDPDPy=y(Hi-HixQixHi)D(Hi-HixQixHi)D(Hi-HixQixHi)y for index pairs (i1, j1) and (i2, j2); results are returned through yPDPDPy_gg, yPDPDPy_ee and yPDPDPy_ge
+//yPDPDPy=yHiDHiDHiy
+//-(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy)
+//-(yHiDHix)Qi(xHiDHiy)
+//+(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy)
+//+(yHix)Qi(xHiDHiDHix)Qi(xHiy)
+//-(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy)
+void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *QixHiDHiy_all_g, const gsl_matrix *QixHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const gsl_matrix *QixHiDHixQixHiy_all_g, const gsl_matrix *QixHiDHixQixHiy_all_e, const gsl_matrix *xHiDHiDHiy_all_gg, const gsl_matrix *xHiDHiDHiy_all_ee, const gsl_matrix *xHiDHiDHiy_all_ge, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yPDPDPy_gg, double &yPDPDPy_ee, double &yPDPDPy_ge)
+{	
+	size_t d_size=Hi->size1, dc_size=xHi->size1;
+	size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size);
+	size_t v_size=d_size*(d_size+1)/2;	
+	//v_size = number of (i, j) pairs with i<=j; v1*v_size+v2 addresses the entry for the pair of pairs in the xHiDHiDHi*_all matrices
+	double d;
+	//scratch vector for the matrix-vector products of the seventh part; freed before returning
+	gsl_vector *xHiDHiDHixQixHiy=gsl_vector_alloc (dc_size);
+	
+	//first part: yHiDHiDHiy
+	Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge);	
+	
+	//second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy)	
+	gsl_vector_const_view xHiDHiDHiy_gg1=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v1*v_size+v2);
+	gsl_vector_const_view xHiDHiDHiy_ee1=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v1*v_size+v2);
+	gsl_vector_const_view xHiDHiDHiy_ge1=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v1*v_size+v2);
+	//same three columns with the roles of v1 and v2 swapped
+	gsl_vector_const_view xHiDHiDHiy_gg2=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v2*v_size+v1);
+	gsl_vector_const_view xHiDHiDHiy_ee2=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v2*v_size+v1);
+	gsl_vector_const_view xHiDHiDHiy_ge2=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v2*v_size+v1);
+	
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d); 
+	yPDPDPy_gg-=d;
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d); 
+	yPDPDPy_ee-=d;
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d); 
+	yPDPDPy_ge-=d;
+	
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d); 
+	yPDPDPy_gg-=d;
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d); 
+	yPDPDPy_ee-=d;
+	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d); 
+	yPDPDPy_ge-=d;
+	
+	//fourth part: -(yHiDHix)Qi(xHiDHiy)
+	gsl_vector_const_view xHiDHiy_g1=gsl_matrix_const_column (xHiDHiy_all_g, v1);
+	gsl_vector_const_view xHiDHiy_e1=gsl_matrix_const_column (xHiDHiy_all_e, v1);
+	gsl_vector_const_view QixHiDHiy_g2=gsl_matrix_const_column (QixHiDHiy_all_g, v2);
+	gsl_vector_const_view QixHiDHiy_e2=gsl_matrix_const_column (QixHiDHiy_all_e, v2);
+	//the ge term pairs the g column of v1 with the e column of v2
+	gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_g2.vector, &d);
+	yPDPDPy_gg-=d;
+	gsl_blas_ddot(&xHiDHiy_e1.vector, &QixHiDHiy_e2.vector, &d);
+	yPDPDPy_ee-=d;
+	gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_e2.vector, &d);
+	yPDPDPy_ge-=d;
+	
+	//fifth and sixth parts: +(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy)
+	gsl_vector_const_view QixHiDHiy_g1=gsl_matrix_const_column (QixHiDHiy_all_g, v1);
+	gsl_vector_const_view QixHiDHiy_e1=gsl_matrix_const_column (QixHiDHiy_all_e, v1);
+	
+	gsl_vector_const_view xHiDHixQixHiy_g1=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v1);
+	gsl_vector_const_view xHiDHixQixHiy_e1=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v1);
+	gsl_vector_const_view xHiDHixQixHiy_g2=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v2);
+	gsl_vector_const_view xHiDHixQixHiy_e2=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v2);
+	
+	gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_g2.vector, &d);
+	yPDPDPy_gg+=d;
+	gsl_blas_ddot(&xHiDHixQixHiy_g2.vector, &QixHiDHiy_g1.vector, &d);
+	yPDPDPy_gg+=d;
+	
+	gsl_blas_ddot(&xHiDHixQixHiy_e1.vector, &QixHiDHiy_e2.vector, &d);
+	yPDPDPy_ee+=d;
+	gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_e1.vector, &d);
+	yPDPDPy_ee+=d;
+	
+	gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_e2.vector, &d);
+	yPDPDPy_ge+=d;
+	gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_g1.vector, &d);
+	yPDPDPy_ge+=d;
+
+	//seventh part: +(yHix)Qi(xHiDHiDHix)Qi(xHiy)
+	gsl_matrix_const_view xHiDHiDHix_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+	gsl_matrix_const_view xHiDHiDHix_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+	gsl_matrix_const_view xHiDHiDHix_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+	//the scratch vector is overwritten (beta=0.0) for each of gg, ee and ge in turn
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_gg.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
+	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
+	yPDPDPy_gg+=d;
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ee.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
+	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
+	yPDPDPy_ee+=d;
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ge.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
+	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
+	yPDPDPy_ge+=d;
+		
+	//eighth part: -(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy)
+	gsl_vector_const_view QixHiDHixQixHiy_g1=gsl_matrix_const_column (QixHiDHixQixHiy_all_g, v1);
+	gsl_vector_const_view QixHiDHixQixHiy_e1=gsl_matrix_const_column (QixHiDHixQixHiy_all_e, v1);
+	
+	gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_g2.vector, &d);
+	yPDPDPy_gg-=d;
+	gsl_blas_ddot(&QixHiDHixQixHiy_e1.vector, &xHiDHixQixHiy_e2.vector, &d);
+	yPDPDPy_ee-=d;
+	gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_e2.vector, &d);
+	yPDPDPy_ge-=d;
+	
+	//free memory	
+	gsl_vector_free(xHiDHiDHixQixHiy);	
+	
+	return;
+}
+
+
+//calculate Edgeworth correction factors for small samples; the three factors are returned through crt_a, crt_b and crt_c
+//notation and method follows Thomas J. Rothenberg, Econometrica 1984; 52 (4)
+//M=xHiDHix
+void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t d_size, double &crt_a, double &crt_b, double &crt_c)
+{
+	crt_a=0.0; crt_b=0.0; crt_c=0.0;
+	//dc_size = rows of Qi; v_size = half the rows of Hessian_inv; c_size = dc_size/d_size
+	size_t dc_size=Qi->size1, v_size=Hessian_inv->size1/2;
+	size_t c_size=dc_size/d_size;
+	double h_gg, h_ge, h_ee, d, B=0.0, C=0.0, D=0.0;
+	double trCg1, trCe1, trCg2, trCe2, trB_gg, trB_ge, trB_ee, trCC_gg, trCC_ge, trCC_ee, trD_gg=0.0, trD_ge=0.0, trD_ee=0.0;
+	
+	gsl_matrix *QiMQi_g1=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQi_e1=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQi_g2=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQi_e2=gsl_matrix_alloc (dc_size, dc_size);
+	
+	gsl_matrix *QiMQisQisi_g1=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *QiMQisQisi_e1=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *QiMQisQisi_g2=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *QiMQisQisi_e2=gsl_matrix_alloc (d_size, d_size);
+	
+	gsl_matrix *QiMQiMQi_gg=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQiMQi_ge=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMQiMQi_ee=gsl_matrix_alloc (dc_size, dc_size);
+	
+	gsl_matrix *QiMMQi_gg=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMMQi_ge=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *QiMMQi_ee=gsl_matrix_alloc (dc_size, dc_size);
+	
+	gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size);	
+	
+	gsl_matrix *M_dd=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *M_dcdc=gsl_matrix_alloc (dc_size, dc_size);
+		
+	//invert Qi_sub to Qi_si
+	gsl_matrix *Qi_sub=gsl_matrix_alloc (d_size, d_size);
+	//Qi_s is the d_size x d_size block of Qi at row and column offset (c_size-1)*d_size; it is copied to Qi_sub before the LU calls
+	gsl_matrix_const_view Qi_s=gsl_matrix_const_submatrix (Qi, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+	
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (d_size);
+	
+	gsl_matrix_memcpy (Qi_sub, &Qi_s.matrix);
+	LUDecomp (Qi_sub, pmt, &sig);
+	LUInvert (Qi_sub, pmt, Qi_si);
+	
+	gsl_permutation_free(pmt);
+	gsl_matrix_free(Qi_sub);
+			
+	//calculate correction factors
+	for (size_t v1=0; v1<v_size; v1++) {
+		//calculate Qi(xHiDHix)Qi, and subpart of it
+		gsl_matrix_const_view QiM_g1=gsl_matrix_const_submatrix (QixHiDHix_all_g, 0, v1*dc_size, dc_size, dc_size);
+		gsl_matrix_const_view QiM_e1=gsl_matrix_const_submatrix (QixHiDHix_all_e, 0, v1*dc_size, dc_size, dc_size);
+				
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, Qi, 0.0, QiMQi_g1);
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, Qi, 0.0, QiMQi_e1);
+		
+		gsl_matrix_view QiMQi_g1_s=gsl_matrix_submatrix (QiMQi_g1, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+		gsl_matrix_view QiMQi_e1_s=gsl_matrix_submatrix (QiMQi_e1, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+		
+		/*
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<setprecision(6)<<gsl_matrix_get(&QiMQi_g1_s.matrix, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+*/
+		//calculate trCg1 and trCe1 (the diagonal is subtracted, so each holds the negated trace of the product with Qi_si)
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_g1_s.matrix, Qi_si, 0.0, QiMQisQisi_g1);
+		trCg1=0.0;
+		for (size_t k=0; k<d_size; k++) {
+			trCg1-=gsl_matrix_get (QiMQisQisi_g1, k, k);
+		}
+		
+		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_e1_s.matrix, Qi_si, 0.0, QiMQisQisi_e1);
+		trCe1=0.0;
+		for (size_t k=0; k<d_size; k++) {
+			trCe1-=gsl_matrix_get (QiMQisQisi_e1, k, k);
+		}
+		/*
+		cout<<v1<<endl;
+		cout<<"trCg1 = "<<trCg1<<", trCe1 = "<<trCe1<<endl;	
+		*/
+		for (size_t v2=0; v2<v_size; v2++) {
+			if (v2<v1) {continue;}
+			//pairs with v2<v1 are skipped; their contribution is added by the v1!=v2 branch near the end of this loop body
+			//calculate Qi(xHiDHix)Qi, and subpart of it
+			gsl_matrix_const_view QiM_g2=gsl_matrix_const_submatrix (QixHiDHix_all_g, 0, v2*dc_size, dc_size, dc_size);
+			gsl_matrix_const_view QiM_e2=gsl_matrix_const_submatrix (QixHiDHix_all_e, 0, v2*dc_size, dc_size, dc_size);
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g2.matrix, Qi, 0.0, QiMQi_g2);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e2.matrix, Qi, 0.0, QiMQi_e2);
+			
+			gsl_matrix_view QiMQi_g2_s=gsl_matrix_submatrix (QiMQi_g2, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMQi_e2_s=gsl_matrix_submatrix (QiMQi_e2, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			
+			//calculate trCg2 and trCe2
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_g2_s.matrix, Qi_si, 0.0, QiMQisQisi_g2);
+			trCg2=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCg2-=gsl_matrix_get (QiMQisQisi_g2, k, k);
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_e2_s.matrix, Qi_si, 0.0, QiMQisQisi_e2);
+			trCe2=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCe2-=gsl_matrix_get (QiMQisQisi_e2, k, k);
+			}
+			
+			//calculate trCC_gg, trCC_ge, trCC_ee
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_g1, QiMQisQisi_g2, 0.0, M_dd);
+			trCC_gg=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCC_gg+=gsl_matrix_get (M_dd, k, k);
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_g1, QiMQisQisi_e2, 0.0, M_dd);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_e1, QiMQisQisi_g2, 1.0, M_dd);
+			trCC_ge=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCC_ge+=gsl_matrix_get (M_dd, k, k);
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_e1, QiMQisQisi_e2, 0.0, M_dd);
+			trCC_ee=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				trCC_ee+=gsl_matrix_get (M_dd, k, k);
+			}
+						
+			//calculate Qi(xHiDHix)Qi(xHiDHix)Qi, and subpart of it			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, QiMQi_g2, 0.0, QiMQiMQi_gg);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, QiMQi_e2, 0.0, QiMQiMQi_ge);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, QiMQi_g2, 1.0, QiMQiMQi_ge);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, QiMQi_e2, 0.0, QiMQiMQi_ee);
+			
+			gsl_matrix_view QiMQiMQi_gg_s=gsl_matrix_submatrix (QiMQiMQi_gg, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMQiMQi_ge_s=gsl_matrix_submatrix (QiMQiMQi_ge, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMQiMQi_ee_s=gsl_matrix_submatrix (QiMQiMQi_ee, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+						
+			//and part of trB_gg, trB_ge, trB_ee
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_gg_s.matrix, Qi_si, 0.0, M_dd);
+			trB_gg=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				d=gsl_matrix_get (M_dd, k, k);
+				trB_gg-=d;
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_ge_s.matrix, Qi_si, 0.0, M_dd);
+			trB_ge=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				d=gsl_matrix_get (M_dd, k, k);
+				trB_ge-=d;
+			}
+			
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_ee_s.matrix, Qi_si, 0.0, M_dd);
+			trB_ee=0.0;
+			for (size_t k=0; k<d_size; k++) {
+				d=gsl_matrix_get (M_dd, k, k);
+				trB_ee-=d;
+			}
+			
+			//calculate Qi(xHiDHiDHix)Qi, and subpart of it	
+			gsl_matrix_const_view MM_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+			gsl_matrix_const_view MM_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+			gsl_matrix_const_view MM_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
+						
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_gg.matrix, 0.0, M_dcdc);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_gg);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_ge.matrix, 0.0, M_dcdc);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_ge);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_ee.matrix, 0.0, M_dcdc);
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_ee);
+			
+			gsl_matrix_view QiMMQi_gg_s=gsl_matrix_submatrix (QiMMQi_gg, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMMQi_ge_s=gsl_matrix_submatrix (QiMMQi_ge, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+			gsl_matrix_view QiMMQi_ee_s=gsl_matrix_submatrix (QiMMQi_ee, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
+												
+			//calculate the other part of trB_gg, trB_ge, trB_ee
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_gg_s.matrix, Qi_si, 0.0, M_dd);
+			for (size_t k=0; k<d_size; k++) {
+				trB_gg+=gsl_matrix_get (M_dd, k, k);
+			}
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_ge_s.matrix, Qi_si, 0.0, M_dd);
+			for (size_t k=0; k<d_size; k++) {
+				trB_ge+=2.0*gsl_matrix_get (M_dd, k, k);
+			}
+			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_ee_s.matrix, Qi_si, 0.0, M_dd);
+			for (size_t k=0; k<d_size; k++) {
+				trB_ee+=gsl_matrix_get (M_dd, k, k);
+			}
+			
+			
+			//calculate trD_gg, trD_ge, trD_ee
+			trD_gg=2.0*trB_gg;
+			trD_ge=2.0*trB_ge;
+			trD_ee=2.0*trB_ee;
+			
+			//calculate B, C and D; h_gg, h_ge and h_ee are the negated entries of Hessian_inv for (v1, v2) in the gg, ge and ee blocks
+			h_gg=-1.0*gsl_matrix_get (Hessian_inv, v1, v2);
+			h_ge=-1.0*gsl_matrix_get (Hessian_inv, v1, v2+v_size);
+			h_ee=-1.0*gsl_matrix_get (Hessian_inv, v1+v_size, v2+v_size);
+			
+			B+=h_gg*trB_gg+h_ge*trB_ge+h_ee*trB_ee;
+			C+=h_gg*(trCC_gg+0.5*trCg1*trCg2)+h_ge*(trCC_ge+0.5*trCg1*trCe2+0.5*trCe1*trCg2)+h_ee*(trCC_ee+0.5*trCe1*trCe2);
+			D+=h_gg*(trCC_gg+0.5*trD_gg)+h_ge*(trCC_ge+0.5*trD_ge)+h_ee*(trCC_ee+0.5*trD_ee);
+			//off-diagonal pairs are added a second time because only v2>=v1 is visited
+			if (v1!=v2) {
+				B+=h_gg*trB_gg+h_ge*trB_ge+h_ee*trB_ee;
+				C+=h_gg*(trCC_gg+0.5*trCg1*trCg2)+h_ge*(trCC_ge+0.5*trCg1*trCe2+0.5*trCe1*trCg2)+h_ee*(trCC_ee+0.5*trCe1*trCe2);
+				D+=h_gg*(trCC_gg+0.5*trD_gg)+h_ge*(trCC_ge+0.5*trD_ge)+h_ee*(trCC_ee+0.5*trD_ee);
+			}
+			
+			/*
+			cout<<v1<<"\t"<<v2<<endl;
+			cout<<h_gg<<"\t"<<h_ge<<"\t"<<h_ee<<endl;
+			cout<<trB_gg<<"\t"<<trB_ge<<"\t"<<trB_ee<<endl;
+			cout<<trCg1<<"\t"<<trCe1<<"\t"<<trCg2<<"\t"<<trCe2<<endl;
+			cout<<trCC_gg<<"\t"<<trCC_ge<<"\t"<<trCC_ee<<endl;
+			cout<<trD_gg<<"\t"<<trD_ge<<"\t"<<trD_ee<<endl;
+			*/
+		}
+	}
+	
+	//calculate a, b, c from B C D
+	crt_a=2.0*D-C;
+	crt_b=2.0*B;
+	crt_c=C;
+	/*
+	cout<<B<<"\t"<<C<<"\t"<<D<<endl;
+	cout<<setprecision(6)<<crt_a<<"\t"<<crt_b<<"\t"<<crt_c<<endl;
+	*/
+	//free matrix memory
+	gsl_matrix_free(QiMQi_g1);
+	gsl_matrix_free(QiMQi_e1);
+	gsl_matrix_free(QiMQi_g2);
+	gsl_matrix_free(QiMQi_e2);
+	
+	gsl_matrix_free(QiMQisQisi_g1);
+	gsl_matrix_free(QiMQisQisi_e1);
+	gsl_matrix_free(QiMQisQisi_g2);
+	gsl_matrix_free(QiMQisQisi_e2);
+	
+	gsl_matrix_free(QiMQiMQi_gg);
+	gsl_matrix_free(QiMQiMQi_ge);
+	gsl_matrix_free(QiMQiMQi_ee);
+	
+	gsl_matrix_free(QiMMQi_gg);
+	gsl_matrix_free(QiMMQi_ge);
+	gsl_matrix_free(QiMMQi_ee);
+	
+	gsl_matrix_free(Qi_si);
+	
+	gsl_matrix_free(M_dd);
+	gsl_matrix_free(M_dcdc);
+	
+	return;
+}
+
+
+
+
+
+//calculate first-order and second-order derivatives: fills gradient (length 2*v_size), writes the inverse of the assembled Hessian to Hessian_inv, and sets crt_a, crt_b, crt_c; func_name 'R'/'r' selects the restricted likelihood branch and 'L'/'l' the likelihood branch
+void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, gsl_vector *gradient, gsl_matrix *Hessian_inv, double &crt_a, double &crt_b, double &crt_c)
+{	
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;}
+	//any other func_name prints the message above and returns before anything is allocated or any output is written
+	size_t dc_size=Qi->size1, d_size=Hi->size1;
+	size_t c_size=dc_size/d_size;
+	size_t v_size=d_size*(d_size+1)/2;
+	size_t v1, v2;
+	double dev1_g, dev1_e, dev2_gg, dev2_ee, dev2_ge;
+	//v_size = number of (i, j) pairs with i<=j; Hessian is a 2*v_size x 2*v_size work matrix that is freed after inversion
+	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);
+		
+	gsl_matrix *xHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *xHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *xHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size);
+	gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size);		
+	gsl_matrix *xHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *xHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
+	
+	gsl_matrix *QixHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *QixHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *QixHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size);
+	gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size);	
+	gsl_matrix *QixHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
+	gsl_matrix *QixHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
+	
+	gsl_matrix *xHiDHiDHiy_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size);
+	gsl_matrix *xHiDHiDHiy_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size);
+	gsl_matrix *xHiDHiDHiy_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size);
+	gsl_matrix *xHiDHiDHix_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);
+	gsl_matrix *xHiDHiDHix_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);
+	gsl_matrix *xHiDHiDHix_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);
+	
+	//calculate xHiDHiy_all, xHiDHix_all and xHiDHixQixHiy_all
+	Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e);	
+	Calc_xHiDHix_all (eval, xHi, xHiDHix_all_g, xHiDHix_all_e);
+	Calc_xHiDHixQixHiy_all (xHiDHix_all_g, xHiDHix_all_e, QixHiy, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e);
+	
+	Calc_xHiDHiDHiy_all (v_size, eval, Hi, xHi, Hiy, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge);
+	Calc_xHiDHiDHix_all (v_size, eval, Hi, xHi, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge);
+	
+	//calculate QixHiDHiy_all, QixHiDHix_all and QixHiDHixQixHiy_all
+	Calc_QiVec_all (Qi, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e);
+	Calc_QiVec_all (Qi, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e);
+	Calc_QiMat_all (Qi, xHiDHix_all_g, xHiDHix_all_e, QixHiDHix_all_g, QixHiDHix_all_e);
+		
+	double tHiD_g, tHiD_e, tPD_g, tPD_e, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge, tPDPD_gg, tPDPD_ee, tPDPD_ge;
+	double yPDPy_g, yPDPy_e, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge;
+
+	//calculate gradient and Hessian; indices 0..v_size-1 hold the g entries and v_size..2*v_size-1 the e entries
+	for (size_t i1=0; i1<d_size; i1++) {
+		for (size_t j1=0; j1<d_size; j1++) {
+			if (j1<i1) {continue;}
+			v1=GetIndex (i1, j1, d_size);
+
+			Calc_yPDPy (eval, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, i1, j1, yPDPy_g, yPDPy_e);
+			
+			if (func_name=='R' || func_name=='r') {				
+				Calc_tracePD (eval, Qi, Hi, xHiDHix_all_g, xHiDHix_all_e, i1, j1, tPD_g, tPD_e);				
+				//cout<<i1<<" "<<j1<<" "<<yPDPy_g<<" "<<yPDPy_e<<" "<<tPD_g<<" "<<tPD_e<<endl;
+				
+				dev1_g=-0.5*tPD_g+0.5*yPDPy_g;
+				dev1_e=-0.5*tPD_e+0.5*yPDPy_e;
+			} else {
+				Calc_traceHiD (eval, Hi, i1, j1, tHiD_g, tHiD_e);
+								
+				dev1_g=-0.5*tHiD_g+0.5*yPDPy_g;
+				dev1_e=-0.5*tHiD_e+0.5*yPDPy_e;
+			}
+
+			gsl_vector_set (gradient, v1, dev1_g);
+			gsl_vector_set (gradient, v1+v_size, dev1_e);
+			
+			for (size_t i2=0; i2<d_size; i2++) {
+				for (size_t j2=0; j2<d_size; j2++) {
+					if (j2<i2) {continue;}
+					v2=GetIndex (i2, j2, d_size);
+					//only v2>=v1 is computed; the mirrored Hessian entries are written when the Hessian is set up below
+					if (v2<v1) {continue;}
+
+					Calc_yPDPDPy (eval, Hi, xHi, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge);
+
+					//cout<<i1<<" "<<j1<<" "<<i2<<" "<<j2<<" "<<yPDPDPy_gg<<" "<<yPDPDPy_ee<<" "<<yPDPDPy_ge<<endl;
+					//AI for reml
+					if (func_name=='R' || func_name=='r') {
+						Calc_tracePDPD (eval, Qi, Hi, xHi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge);
+						
+						dev2_gg=0.5*tPDPD_gg-yPDPDPy_gg; 
+						dev2_ee=0.5*tPDPD_ee-yPDPDPy_ee; 
+						dev2_ge=0.5*tPDPD_ge-yPDPDPy_ge; 		
+						/*
+						dev2_gg=-0.5*yPDPDPy_gg; 
+						dev2_ee=-0.5*yPDPDPy_ee; 
+						dev2_ge=-0.5*yPDPDPy_ge; 
+						*/
+					} else {
+						Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge);
+						
+						dev2_gg=0.5*tHiDHiD_gg-yPDPDPy_gg; 
+						dev2_ee=0.5*tHiDHiD_ee-yPDPDPy_ee; 
+						dev2_ge=0.5*tHiDHiD_ge-yPDPDPy_ge; 
+					}
+
+					//set up Hessian: gg goes to the top-left block, ee to the bottom-right block, and the ge value to both off-diagonal blocks
+					gsl_matrix_set (Hessian, v1, v2, dev2_gg);
+					gsl_matrix_set (Hessian, v1+v_size, v2+v_size, dev2_ee);
+					gsl_matrix_set (Hessian, v1, v2+v_size, dev2_ge);
+					gsl_matrix_set (Hessian, v2+v_size, v1, dev2_ge);
+					//the same three values are also written with v1 and v2 swapped when they differ
+					if (v1!=v2) {
+						gsl_matrix_set (Hessian, v2, v1, dev2_gg);
+						gsl_matrix_set (Hessian, v2+v_size, v1+v_size, dev2_ee);
+						gsl_matrix_set (Hessian, v2, v1+v_size, dev2_ge);
+						gsl_matrix_set (Hessian, v1+v_size, v2, dev2_ge);
+					}
+				}
+			}
+		}
+	}
+	
+	/*
+	cout<<"Hessian: "<<endl;
+	for (size_t i=0; i<2*v_size; i++) {
+		for (size_t j=0; j<2*v_size; j++) {
+			cout<<gsl_matrix_get(Hessian, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	*/
+	
+	
+	//Invert Hessian (LUDecomp and LUInvert are defined elsewhere; the result is written to the caller's Hessian_inv)
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (v_size*2);
+	
+	LUDecomp (Hessian, pmt, &sig);
+	LUInvert (Hessian, pmt, Hessian_inv);
+	/*
+	cout<<"Hessian Inverse: "<<endl;
+	for (size_t i=0; i<2*v_size; i++) {
+		for (size_t j=0; j<2*v_size; j++) {
+			cout<<gsl_matrix_get(Hessian_inv, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	*/
+	gsl_permutation_free(pmt);	
+	gsl_matrix_free(Hessian);
+	
+	//calculate Edgeworth correction factors
+	//after inverting Hessian; when c_size is 1 (or less) CalcCRT is skipped and the three factors are set to 0.0
+	if (c_size>1) {
+		CalcCRT (Hessian_inv, Qi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, d_size, crt_a, crt_b, crt_c);
+	} else {
+		crt_a=0.0; crt_b=0.0; crt_c=0.0; 
+	}	
+	
+	gsl_matrix_free(xHiDHiy_all_g);
+	gsl_matrix_free(xHiDHiy_all_e);
+	gsl_matrix_free(xHiDHix_all_g);
+	gsl_matrix_free(xHiDHix_all_e);		
+	gsl_matrix_free(xHiDHixQixHiy_all_g);
+	gsl_matrix_free(xHiDHixQixHiy_all_e);
+	
+	gsl_matrix_free(QixHiDHiy_all_g);
+	gsl_matrix_free(QixHiDHiy_all_e);
+	gsl_matrix_free(QixHiDHix_all_g);
+	gsl_matrix_free(QixHiDHix_all_e);	
+	gsl_matrix_free(QixHiDHixQixHiy_all_g);
+	gsl_matrix_free(QixHiDHixQixHiy_all_e);
+	
+	gsl_matrix_free(xHiDHiDHiy_all_gg);
+	gsl_matrix_free(xHiDHiDHiy_all_ee);
+	gsl_matrix_free(xHiDHiDHiy_all_ge);
+	gsl_matrix_free(xHiDHiDHix_all_gg);
+	gsl_matrix_free(xHiDHiDHix_all_ee);
+	gsl_matrix_free(xHiDHiDHix_all_ge);
+	
+	return;
+}
+
+
+/*
+ update Vg, Ve
+ 
+ The upper triangles of V_g and V_e are packed into one vector of length
+ 2*(gradient->size/2): V_g entries first, V_e entries shifted by half the
+ length, with positions given by GetIndex. The vector is then replaced by
+     packed - step_scale * Hessian_inv * gradient
+ and unpacked symmetrically back into V_g and V_e.
+ */
+void UpdateVgVe (const gsl_matrix *Hessian_inv, const gsl_vector *gradient, const double step_scale, gsl_matrix *V_g, gsl_matrix *V_e)
+{
+	const size_t half=gradient->size/2;
+	const size_t dim=V_g->size1;
+	
+	gsl_vector *packed=gsl_vector_alloc (half*2);
+	
+	//pack the upper triangles (col>=row) of Vg and Ve
+	for (size_t row=0; row<dim; ++row) {
+		for (size_t col=row; col<dim; ++col) {
+			const size_t pos=GetIndex(row, col, dim);
+			
+			gsl_vector_set (packed, pos, gsl_matrix_get (V_g, row, col));
+			gsl_vector_set (packed, pos+half, gsl_matrix_get (V_e, row, col));
+		}
+	}
+	
+	//take the scaled step in place
+	gsl_blas_dgemv (CblasNoTrans, -1.0*step_scale, Hessian_inv, gradient, 1.0, packed);
+	
+	//unpack, writing both (row, col) and (col, row) so the matrices stay symmetric
+	for (size_t row=0; row<dim; ++row) {
+		for (size_t col=row; col<dim; ++col) {
+			const size_t pos=GetIndex(row, col, dim);
+			
+			const double g_val=gsl_vector_get (packed, pos);
+			gsl_matrix_set (V_g, row, col, g_val);
+			gsl_matrix_set (V_g, col, row, g_val);
+			
+			const double e_val=gsl_vector_get (packed, pos+half);
+			gsl_matrix_set (V_e, row, col, e_val);
+			gsl_matrix_set (V_e, col, row, e_val);
+		}
+	}
+	
+	gsl_vector_free(packed);
+}
+
+
+
+
+
+/*
+ Newton-Raphson style optimisation of the variance components V_g and V_e.
+ 
+ func_name   : 'R'/'r' selects the log-restricted likelihood, 'L'/'l' the
+               log-likelihood; any other value prints a message and returns 0.0.
+ max_iter    : maximum number of outer iterations.
+ max_prec    : the loop stops once the increase of the objective between two
+               iterations is below this value.
+ eval, X, Y  : inputs; sizes are read as n_size=eval->size, c_size=X->size1,
+               d_size=Y->size1.
+ Hi_all, xHi_all, Hiy_all : caller-allocated work matrices handed to
+               CalcHiQi / Calc_xHi_all / Calc_Hiy_all and CalcDev.
+ V_g, V_e    : starting values on entry, updated in place.
+ Hessian_inv : handed to CalcDev in every iteration and multiplied by -1.0
+               before returning.
+ crt_a, crt_b, crt_c : handed to CalcDev by reference.
+ 
+ Returns the last objective value computed (logl_new).
+ */
+double MphNR (const char func_name, const size_t max_iter, const double max_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, gsl_matrix *Hi_all, gsl_matrix *xHi_all, gsl_matrix *Hiy_all, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *Hessian_inv, double &crt_a, double &crt_b, double &crt_c)
+{
+	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return 0.0;}
+	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;
+	size_t dc_size=d_size*c_size;
+	size_t v_size=d_size*(d_size+1)/2;	//number of distinct entries of a symmetric d_size x d_size matrix
+	
+	double logdet_H, logdet_Q, yPy, logl_const, logl_old=0.0, logl_new=0.0, step_scale;
+	int sig;
+	size_t step_iter, flag_pd;
+	
+	gsl_matrix *Vg_save=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Ve_save=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_temp=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *U_temp=gsl_matrix_alloc (d_size, d_size);
+	gsl_vector *D_temp=gsl_vector_alloc (d_size);
+	gsl_vector *xHiy=gsl_vector_alloc (dc_size);
+	gsl_vector *QixHiy=gsl_vector_alloc (dc_size);	
+	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
+	gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size);
+	
+	gsl_vector *gradient=gsl_vector_alloc (v_size*2);	
+	
+	//calculate |XXt| via an LU decomposition (the inverse is not computed; see the disabled LUInvert call below)
+	gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt);	//only the upper triangle is requested; the loop below mirrors it
+	for (size_t i=0; i<c_size; ++i) {
+		for (size_t j=0; j<i; ++j) {
+			gsl_matrix_set (XXt, i, j, gsl_matrix_get (XXt, j, i));
+		}
+	}
+
+	gsl_permutation * pmt=gsl_permutation_alloc (c_size);
+	LUDecomp (XXt, pmt, &sig);
+	gsl_permutation_free (pmt);
+//	LUInvert (XXt, pmt, XXti);	
+	
+	//calculate the constant for logl	
+	if (func_name=='R' || func_name=='r') {		
+		logl_const=-0.5*(double)(n_size-c_size)*(double)d_size*log(2.0*M_PI)+0.5*(double)d_size*LULndet (XXt);
+	} else {
+		logl_const=-0.5*(double)n_size*(double)d_size*log(2.0*M_PI);
+	}
+	//optimization iterations
+		
+	for (size_t t=0; t<max_iter; t++) {		
+		gsl_matrix_memcpy (Vg_save, V_g);	//keep the current estimates so that every step attempt starts from them
+		gsl_matrix_memcpy (Ve_save, V_e);
+
+		step_scale=1.0; step_iter=0;	//first attempt uses the full step
+		do {
+			gsl_matrix_memcpy (V_g, Vg_save);
+			gsl_matrix_memcpy (V_e, Ve_save);
+			
+			//apply the step built from the gradient and Hessian_inv of the previous iteration (none exists yet at t==0)
+			if (t!=0) {UpdateVgVe (Hessian_inv, gradient, step_scale, V_g, V_e);}
+			
+			//check if both Vg and Ve are positive definite (all eigenvalues >0)
+			flag_pd=1;
+			gsl_matrix_memcpy (V_temp, V_e);	//decompose a copy, not V_e itself
+			EigenDecomp(V_temp, U_temp, D_temp, 0);
+			for (size_t i=0; i<d_size; i++) {
+				if (gsl_vector_get (D_temp, i)<=0) {flag_pd=0;}
+			}
+			gsl_matrix_memcpy (V_temp, V_g);
+			EigenDecomp(V_temp, U_temp, D_temp, 0);
+			for (size_t i=0; i<d_size; i++) {
+				if (gsl_vector_get (D_temp, i)<=0) {flag_pd=0;}	
+			}
+
+			//if flag_pd==1 continue to calculate quantities and logl
+			if (flag_pd==1) {				
+				CalcHiQi (eval, X, V_g, V_e, Hi_all, Qi, logdet_H, logdet_Q);
+				Calc_Hiy_all (Y, Hi_all, Hiy_all);
+				Calc_xHi_all (X, Hi_all, xHi_all);
+				
+				//calculate QixHiy and yPy
+				Calc_xHiy (Y, xHi_all, xHiy);
+				gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, xHiy, 0.0, QixHiy);
+				
+				gsl_blas_ddot (QixHiy, xHiy, &yPy);
+				yPy=Calc_yHiy (Y, Hiy_all)-yPy;
+				
+				//calculate log likelihood/restricted likelihood value
+				if (func_name=='R' || func_name=='r') {	
+					logl_new=logl_const-0.5*logdet_H-0.5*logdet_Q-0.5*yPy;
+				} else {
+					logl_new=logl_const-0.5*logdet_H-0.5*yPy;
+				}				
+			}
+
+			step_scale/=2.0; 	//halve the step for the next attempt
+			step_iter++;
+									
+			//cout<<t<<"\t"<<step_iter<<"\t"<<logl_old<<"\t"<<logl_new<<"\t"<<flag_pd<<endl;
+		} while ( (flag_pd==0 || logl_new<logl_old || logl_new-logl_old>10 ) && step_iter<10 && t!=0);	//retry (at most 10 attempts, never at t==0) while not positive definite, the objective decreased, or it rose by more than 10
+
+		//stop conditions, only checked once a previous objective value exists (t!=0)
+		if (t!=0) {
+			if (logl_new<logl_old || flag_pd==0) {	//no acceptable step found: restore the saved estimates and stop
+				gsl_matrix_memcpy (V_g, Vg_save);
+				gsl_matrix_memcpy (V_e, Ve_save);
+				break;
+			}
+			
+			if (logl_new-logl_old<max_prec) {	//terminate if change is small
+				break;
+			}
+		}
+
+		logl_old=logl_new;
+		//NOTE(review): at t==0 with flag_pd==0 nothing above was recomputed, yet CalcDev still runs on Qi, Hi_all, xHi_all, Hiy_all, QixHiy as they are -- confirm callers always pass positive definite starting values
+		CalcDev (func_name, eval, Qi, Hi_all, xHi_all, Hiy_all, QixHiy, gradient, Hessian_inv, crt_a, crt_b, crt_c);
+		
+		
+		//output estimates in each iteration
+		/*
+		cout<<func_name<<" iteration = "<<t<<" log-likelihood = "<<logl_old<<"\t"<<logl_new<<endl;
+		
+		cout<<"Vg: "<<endl;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		cout<<"Ve: "<<endl;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=0; j<d_size; j++) {
+				cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		cout<<"Hessian: "<<endl;
+		for (size_t i=0; i<Hessian_inv->size1; i++) {
+			for (size_t j=0; j<Hessian_inv->size2; j++) {
+				cout<<gsl_matrix_get(Hessian_inv, i, j)<<"\t";
+			}
+			cout<<endl;
+		}
+		*/
+	}
+	
+	//multiply Hessian_inv with -1.0
+	//now Hessian_inv is the variance matrix
+	//NOTE(review): when max_iter is 0 the loop body never runs, so this scales whatever the caller passed in and 0.0 is returned
+	gsl_matrix_scale (Hessian_inv, -1.0);
+	
+	gsl_matrix_free(Vg_save);
+	gsl_matrix_free(Ve_save);
+	gsl_matrix_free(V_temp);
+	gsl_matrix_free(U_temp);
+	gsl_vector_free(D_temp);
+	gsl_vector_free(xHiy);
+	gsl_vector_free(QixHiy);	
+	
+	gsl_matrix_free(Qi);
+	gsl_matrix_free(XXt);
+	
+	gsl_vector_free(gradient);
+	
+	return logl_new;
+}
+
+
+
+
+
+/*
+ initialize Vg, Ve and B
+ 
+ V_g, V_e and B are zeroed first. The diagonals of V_g and V_e are then set from
+ one univariate fit per phenotype row of Y (CalcLambda with 'R', followed by
+ CalcLmmVgVeBeta). When d_size>4 the off-diagonal entries are filled from
+ two-phenotype fits (MphEM followed by MphNR, both with 'R') for every pair i<j;
+ otherwise they stay zero. Finally B is computed from V_g and V_e using
+ EigenProc, CalcQi and the weighted sums built below.
+ 
+ em_iter, em_prec, nr_iter, nr_prec : handed to MphEM / MphNR for the pairwise fits.
+ eval, X, Y : inputs; n_size=eval->size, c_size=X->size1, d_size=Y->size1.
+ l_min, l_max, n_region : handed to CalcLambda.
+ V_g, V_e, B : outputs, overwritten.
+ */
+void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter, const double nr_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B)
+{
+	gsl_matrix_set_zero (V_g);
+	gsl_matrix_set_zero (V_e);
+	gsl_matrix_set_zero (B);
+	
+	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;	
+	double a, b, c;	//receive the crt_a/crt_b/crt_c outputs of MphNR; not read afterwards
+	double lambda, logl, vg, ve;
+	
+	//Initial the diagonal elements of Vg and Ve using univariate LMM and REML estimates
+	gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size);	
+	gsl_vector *beta_temp=gsl_vector_alloc(c_size);
+	gsl_vector *se_beta_temp=gsl_vector_alloc(c_size);
+	
+	gsl_matrix_transpose_memcpy (Xt, X);	//the univariate routines below are given the n_size x c_size layout
+	
+	for (size_t i=0; i<d_size; i++) {
+		gsl_vector_const_view Y_row=gsl_matrix_const_row (Y, i);
+		CalcLambda ('R', eval, Xt, &Y_row.vector, l_min, l_max, n_region, lambda, logl);
+		CalcLmmVgVeBeta (eval, Xt, &Y_row.vector, lambda, vg, ve, beta_temp, se_beta_temp);
+		
+		gsl_matrix_set(V_g, i, i, vg);
+		gsl_matrix_set(V_e, i, i, ve);
+	}
+
+	gsl_matrix_free (Xt);
+	gsl_vector_free (beta_temp);
+	gsl_vector_free (se_beta_temp);
+	
+	//if number of phenotypes is above four, then obtain the off diagonal elements with two trait models
+	if (d_size>4) {
+		//first obtain good initial values
+		//large matrices for EM
+		gsl_matrix *U_hat=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *E_hat=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *OmegaU=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *OmegaE=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *UltVehiY=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *UltVehiBX=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *UltVehiU=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size);	
+		
+		//large matrices for NR
+		gsl_matrix *Hi_all=gsl_matrix_alloc (2, 2*n_size);		//each dxd block is H_k^{-1}
+		gsl_matrix *Hiy_all=gsl_matrix_alloc (2, n_size);				//each column is H_k^{-1}y_k
+		gsl_matrix *xHi_all=gsl_matrix_alloc (2*c_size, 2*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
+		gsl_matrix *Hessian=gsl_matrix_alloc (6, 6);	//2*v_size with v_size=2*(2+1)/2=3 for a two-phenotype model
+		
+		//2 by n matrix of Y
+		gsl_matrix *Y_sub=gsl_matrix_alloc (2, n_size);
+		gsl_matrix *Vg_sub=gsl_matrix_alloc (2, 2);
+		gsl_matrix *Ve_sub=gsl_matrix_alloc (2, 2);
+		gsl_matrix *B_sub=gsl_matrix_alloc (2, c_size);
+				
+		for (size_t i=0; i<d_size; i++) {
+			gsl_vector_view Y_sub1=gsl_matrix_row (Y_sub, 0);
+			gsl_vector_const_view Y_1=gsl_matrix_const_row (Y, i);
+			gsl_vector_memcpy (&Y_sub1.vector, &Y_1.vector);	//row 0 of Y_sub holds phenotype i for the whole inner loop
+			
+			for (size_t j=i+1; j<d_size; j++) {
+				gsl_vector_view Y_sub2=gsl_matrix_row (Y_sub, 1);
+				gsl_vector_const_view Y_2=gsl_matrix_const_row (Y, j);
+				gsl_vector_memcpy (&Y_sub2.vector, &Y_2.vector);
+				
+				gsl_matrix_set_zero (Vg_sub);	//start the pair from the univariate diagonals with zero off-diagonals
+				gsl_matrix_set_zero (Ve_sub);
+				gsl_matrix_set (Vg_sub, 0, 0, gsl_matrix_get (V_g, i, i));
+				gsl_matrix_set (Ve_sub, 0, 0, gsl_matrix_get (V_e, i, i));
+				gsl_matrix_set (Vg_sub, 1, 1, gsl_matrix_get (V_g, j, j));
+				gsl_matrix_set (Ve_sub, 1, 1, gsl_matrix_get (V_e, j, j));
+				
+				logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub);	
+				logl=MphNR ('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all, Hiy_all, Vg_sub, Ve_sub, Hessian, a, b, c);
+				
+				gsl_matrix_set(V_g, i, j, gsl_matrix_get (Vg_sub, 0, 1));	//only the off-diagonal of the pairwise fit is kept
+				gsl_matrix_set(V_g, j, i, gsl_matrix_get (Vg_sub, 0, 1));
+				
+				gsl_matrix_set(V_e, i, j, ve=gsl_matrix_get (Ve_sub, 0, 1));	//the "ve=" assignment also overwrites the local ve; ve is not read again in this function
+				gsl_matrix_set(V_e, j, i, ve=gsl_matrix_get (Ve_sub, 0, 1));
+			}
+		}
+		
+		//free matrices
+		gsl_matrix_free(U_hat);
+		gsl_matrix_free(E_hat);
+		gsl_matrix_free(OmegaU);
+		gsl_matrix_free(OmegaE);
+		gsl_matrix_free(UltVehiY);
+		gsl_matrix_free(UltVehiBX);
+		gsl_matrix_free(UltVehiU);
+		gsl_matrix_free(UltVehiE);	
+		
+		gsl_matrix_free(Hi_all);
+		gsl_matrix_free(Hiy_all);
+		gsl_matrix_free(xHi_all);
+		gsl_matrix_free(Hessian);
+		
+		gsl_matrix_free(Y_sub);
+		gsl_matrix_free(Vg_sub);
+		gsl_matrix_free(Ve_sub);
+		gsl_matrix_free(B_sub);
+		
+		/*
+		//second, maximize a increasingly large matrix
+		for (size_t i=1; i<d_size; i++) {		
+			//large matrices for EM
+			gsl_matrix *U_hat=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *E_hat=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *OmegaU=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *OmegaE=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *UltVehiY=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *UltVehiBX=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *UltVehiU=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *UltVehiE=gsl_matrix_alloc (i+1, n_size);	
+			
+			//large matrices for NR
+			gsl_matrix *Hi_all=gsl_matrix_alloc (i+1, (i+1)*n_size);		//each dxd block is H_k^{-1}
+			gsl_matrix *Hiy_all=gsl_matrix_alloc (i+1, n_size);				//each column is H_k^{-1}y_k
+			gsl_matrix *xHi_all=gsl_matrix_alloc ((i+1)*c_size, (i+1)*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
+			gsl_matrix *Hessian=gsl_matrix_alloc ((i+1)*(i+2), (i+1)*(i+2));
+			
+			//(i+1) by n matrix of Y
+			gsl_matrix *Y_sub=gsl_matrix_alloc (i+1, n_size);
+			gsl_matrix *Vg_sub=gsl_matrix_alloc (i+1, i+1);
+			gsl_matrix *Ve_sub=gsl_matrix_alloc (i+1, i+1);
+			gsl_matrix *B_sub=gsl_matrix_alloc (i+1, c_size);
+			
+			gsl_matrix_const_view Y_sub_view=gsl_matrix_const_submatrix (Y, 0, 0, i+1, n_size);
+			gsl_matrix_view Vg_sub_view=gsl_matrix_submatrix (V_g, 0, 0, i+1, i+1);
+			gsl_matrix_view Ve_sub_view=gsl_matrix_submatrix (V_e, 0, 0, i+1, i+1);
+			
+			gsl_matrix_memcpy (Y_sub, &Y_sub_view.matrix);
+			gsl_matrix_memcpy (Vg_sub, &Vg_sub_view.matrix);
+			gsl_matrix_memcpy (Ve_sub, &Ve_sub_view.matrix);
+			
+			logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub);	
+			logl=MphNR ('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all, Hiy_all, Vg_sub, Ve_sub, Hessian, crt_a, crt_b, crt_c);
+			
+			gsl_matrix_memcpy (&Vg_sub_view.matrix, Vg_sub);
+			gsl_matrix_memcpy (&Ve_sub_view.matrix, Ve_sub);
+						
+			//free matrices
+			gsl_matrix_free(U_hat);
+			gsl_matrix_free(E_hat);
+			gsl_matrix_free(OmegaU);
+			gsl_matrix_free(OmegaE);
+			gsl_matrix_free(UltVehiY);
+			gsl_matrix_free(UltVehiBX);
+			gsl_matrix_free(UltVehiU);
+			gsl_matrix_free(UltVehiE);	
+			
+			gsl_matrix_free(Hi_all);
+			gsl_matrix_free(Hiy_all);
+			gsl_matrix_free(xHi_all);
+			gsl_matrix_free(Hessian);
+			
+			gsl_matrix_free(Y_sub);
+			gsl_matrix_free(Vg_sub);
+			gsl_matrix_free(Ve_sub);
+			gsl_matrix_free(B_sub);
+		}
+		 */
+	}
+	
+	//calculate B hat using the GLS (generalized least squares) estimate; the original comment read "GSL", presumably a typo
+	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
+	
+	gsl_vector *D_l=gsl_vector_alloc (d_size);
+	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *Qi=gsl_matrix_alloc (d_size*c_size, d_size*c_size);
+	gsl_vector *XHiy=gsl_vector_alloc (d_size*c_size);
+	gsl_vector *beta=gsl_vector_alloc (d_size*c_size);
+	
+	gsl_vector_set_zero (XHiy);
+	
+	double logdet_Ve, logdet_Q, dl, d, delta, dx, dy;	//logdet_Ve and logdet_Q are assigned below but never read
+	
+	//eigen decomposition and calculate log|Ve|
+	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);	
+	
+	//calculate Qi and log|Q|
+	logdet_Q=CalcQi (eval, D_l, X, Qi);	
+	
+	//calculate UltVehiY
+	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
+
+	//calculate XHiy: entry (j*d_size+i) is sum_k UltVehiY(i,k)*X(j,k)/(eval_k*D_l_i+1)
+	for (size_t i=0; i<d_size; i++) {
+		dl=gsl_vector_get(D_l, i);
+		
+		for (size_t j=0; j<c_size; j++) {	
+			d=0.0;
+			for (size_t k=0; k<n_size; k++) {
+				delta=gsl_vector_get(eval, k);
+				dx=gsl_matrix_get(X, j, k);
+				dy=gsl_matrix_get(UltVehiY, i, k);
+				
+				//if (delta==0) {continue;}			
+				d+=dy*dx/(delta*dl+1.0);
+			}
+			gsl_vector_set(XHiy, j*d_size+i, d);
+		}
+	}
+
+	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, XHiy, 0.0, beta);
+
+	//multiply beta by UltVeh and save to B (one length-d_size slice of beta per column of B)
+	for (size_t i=0; i<c_size; i++) {
+		gsl_vector_view B_col=gsl_matrix_column (B, i);
+		gsl_vector_view beta_sub=gsl_vector_subvector (beta, i*d_size, d_size);		
+		gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, &beta_sub.vector, 0.0, &B_col.vector);
+	}
+
+	//free memory
+	gsl_matrix_free(UltVehiY);
+	
+	gsl_vector_free(D_l);
+	gsl_matrix_free(UltVeh);
+	gsl_matrix_free(UltVehi);
+	gsl_matrix_free(Qi);
+	gsl_vector_free(XHiy);
+	gsl_vector_free(beta);
+		
+	return;
+}
+
+
+
+/*
+ p value correction
+ mode=1 Wald; mode=2 LRT; mode=3 SCORE;
+ 
+ The p value is converted back to a chi-square statistic with d_size degrees
+ of freedom, the statistic is rescaled using the correction factors, and the
+ result is converted to a p value again.
+ 
+ mode    : 1 and 2 apply a correction; any other value returns the p value of
+           the unmodified statistic.
+ d_size  : degrees of freedom of the chi-square distribution.
+ p_value : uncorrected p value.
+ crt_a, crt_b, crt_c : correction factors (mode 1 uses all three, mode 2 only crt_a).
+ 
+ Returns the corrected p value.
+ */
+double PCRT (const size_t mode, const size_t d_size, const double p_value, const double crt_a, const double crt_b, const double crt_c)
+{
+	const double q=(double)d_size;
+	const double chisq=gsl_cdf_chisq_Qinv(p_value, q);
+	double chisq_crt=chisq;
+	
+	if (mode==1) {
+		//corrected statistic x is the root of a*x^2+b*x-chisq=0 taken with the +sqrt sign
+		const double a=crt_c/(2.0*q*(q+2.0));
+		const double b=1.0+(crt_a+crt_b)/(2.0*q);
+		if (a==0.0) {
+			//crt_c==0: the equation is linear; the quadratic formula below would evaluate 0/0 (NaN)
+			chisq_crt=chisq/b;
+		} else {
+			chisq_crt=(-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a);
+		}
+	} else if (mode==2) {
+		chisq_crt=chisq/(1.0+crt_a/(2.0*q) );
+	} else {
+		//score test correction is disabled; the statistic is left unchanged
+		/*
+		double a=-1.0*crt_c/(2.0*q*(q+2.0));
+		double b=1.0+(crt_a-crt_b)/(2.0*q);	
+		chisq_crt=(-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a);
+		*/
+		chisq_crt=chisq;
+	}
+	
+	const double p_crt=gsl_cdf_chisq_Q (chisq_crt, q);
+	
+	//cout<<crt_a<<"\t"<<crt_b<<"\t"<<crt_c<<endl;
+	//cout<<setprecision(10)<<p_value<<"\t"<<p_crt<<endl;
+	
+	return p_crt;
+}
+
+
+
+/*
+ Multivariate association analysis over the genotype file file_geno.
+ 
+ Steps:
+  1. Fit the null model (covariates only) with MphInitial, MphEM and MphNR,
+     first with 'R' and then with 'L'; store the estimates in the *_remle_null /
+     *_mle_null members and print them.
+  2. For every SNP t with indicator_snp[t]!=0: parse the genotypes, replace
+     missing values by the SNP mean, rotate the vector with U, and run the
+     tests selected by a_mode (1: 'R' fit and p_wald, 2: 'L' fit and p_lrt,
+     3: p_score, 4: all three). One MPHSUMSTAT record is appended to sumStat.
+ 
+ U    : handed to dgemv with CblasTrans to rotate each genotype vector.
+ eval : handed unchanged to the Mph* routines.
+ UtW  : n_size x c_size; its transpose fills the first c_size rows of X.
+ UtY  : n_size x d_size; its transpose is used as Y.
+ */
+void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+
+	clock_t time_start=clock();
+	time_UtX=0; time_opt=0;
+	
+	string line;
+	char *ch_ptr;
+	
+	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
+	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
+	double crt_a, crt_b, crt_c;
+	int n_miss, c_phen;
+	double geno, x_mean;
+	size_t c=0;
+	//	double s=0.0;
+	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;	
+
+	//dc_size counts the covariates plus one extra row for the SNP
+	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;
+		
+	//large matrices for EM
+	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);	
+	
+	//large matrices for NR
+	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
+	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
+	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
+	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);
+	
+	gsl_vector *x=gsl_vector_alloc (n_size);
+	gsl_vector *x_miss=gsl_vector_alloc (n_size);
+	
+	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);
+	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
+	gsl_vector *beta=gsl_vector_alloc (d_size);
+	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);
+	
+	//null estimates for initial values
+	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);
+	gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size);
+	
+	//views on the covariate-only parts, used for the null model
+	gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);	
+	gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
+	gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);
+	
+	gsl_matrix_transpose_memcpy (Y, UtY);
+
+	gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW);
+	
+	//last row of X / last column of B belong to the SNP; zero until a SNP is loaded
+	gsl_vector_view X_row=gsl_matrix_row(X, c_size);
+	gsl_vector_set_zero(&X_row.vector);
+	gsl_vector_view B_col=gsl_matrix_column(B, c_size);
+	gsl_vector_set_zero(&B_col.vector);		
+
+	//null model, 'R' fit
+	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix);
+	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);	
+	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
+	
+	c=0;
+	Vg_remle_null.clear();
+	Ve_remle_null.clear();
+	//the VV* vectors are appended to below, so they are reset together with their siblings
+	VVg_remle_null.clear();
+	VVe_remle_null.clear();
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
+			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
+			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
+			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
+			c++;
+		}
+	}
+	beta_remle_null.clear(); 
+	se_beta_remle_null.clear();
+	for (size_t i=0; i<se_B_null->size1; i++) {
+		for (size_t j=0; j<se_B_null->size2; j++) {
+			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
+			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
+		}
+	}
+	logl_remle_H0=logl_H0;
+	
+	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+	cout.precision(4);
+	
+	cout<<"REMLE estimate for Vg in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Vg): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"REMLE estimate for Ve in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Ve): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"REMLE likelihood = "<<logl_H0<<endl;
+	
+	
+	//null model, 'L' fit, started from the 'R' estimates
+	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
+	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
+	
+	c=0;
+	Vg_mle_null.clear();
+	Ve_mle_null.clear();
+	//the VV* vectors are appended to below, so they are reset together with their siblings
+	VVg_mle_null.clear();
+	VVe_mle_null.clear();
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
+			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
+			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
+			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
+			c++;
+		}
+	}
+	beta_mle_null.clear(); 
+	se_beta_mle_null.clear();
+	for (size_t i=0; i<se_B_null->size1; i++) {
+		for (size_t j=0; j<se_B_null->size2; j++) {
+			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
+			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
+		}
+	}
+	logl_mle_H0=logl_H0;
+	
+	cout<<"MLE estimate for Vg in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Vg): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"MLE estimate for Ve in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Ve): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"MLE likelihood = "<<logl_H0<<endl;
+
+	
+	//per-SNP output buffers, sized once and overwritten for every SNP
+	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
+	for (size_t i=0; i<d_size; i++) {
+		v_beta.push_back(0.0);
+	}
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			v_Vg.push_back(0.0);
+			v_Ve.push_back(0.0);
+			v_Vbeta.push_back(0.0);
+		}
+	}
+	
+	//the 'L' null estimates are the starting values for every SNP
+	gsl_matrix_memcpy (V_g_null, V_g);
+	gsl_matrix_memcpy (V_e_null, V_e);
+	gsl_matrix_memcpy (B_null, B);
+	
+	//start reading genotypes and analyze		
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		//if (t>=1) {break;}
+		//one line is consumed per SNP even when the SNP is skipped; the eof result is discarded
+		!safeGetline(infile, line).eof();
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		//skip the three leading fields of the line
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");		
+
+		x_mean=0.0; c_phen=0; n_miss=0;
+		gsl_vector_set_zero(x_miss);
+		for (size_t i=0; i<ni_total; ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==0) {continue;}
+			
+			//NOTE(review): ch_ptr is NULL here if the line has fewer fields than ni_total+3 -- confirm the file is validated upstream
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
+			else {
+				geno=atof(ch_ptr); 				
+				
+				gsl_vector_set(x, c_phen, geno); 
+				gsl_vector_set(x_miss, c_phen, 1.0); 
+				x_mean+=geno;
+			}
+			c_phen++;
+		}
+
+		//NOTE(review): divides by zero when every tested individual is missing -- presumably excluded via indicator_snp; confirm
+		x_mean/=(double)(ni_test-n_miss);
+		
+		//fill missing genotypes with the mean; when the mean exceeds 1 recode as 2-geno (beta is flipped back below)
+		for (size_t i=0; i<ni_test; ++i) {
+			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
+			geno=gsl_vector_get(x, i);
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}
+
+		//calculate statistics
+		time_start=clock();		
+		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//initial values
+		gsl_matrix_memcpy (V_g, V_g_null);
+		gsl_matrix_memcpy (V_e, V_e_null);
+		gsl_matrix_memcpy (B, B_null);
+		
+		time_start=clock();
+		
+		//3 is before 1
+		if (a_mode==3 || a_mode==4) {			
+			p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
+			if (p_score<p_nr && crt==1) {
+				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
+			}
+		}		
+
+		if (a_mode==2 || a_mode==4) {
+			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+			//calculate beta and Vbeta
+			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );	
+			
+			//refine with NR only for SNPs below the p_nr threshold
+			if (p_lrt<p_nr) {
+				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				//calculate beta and Vbeta
+				p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );	
+				
+				if (crt==1) {
+					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
+				}
+			}			
+		}			
+
+		if (a_mode==1 || a_mode==4) {
+			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+			
+			//refine with NR only for SNPs below the p_nr threshold
+			if (p_wald<p_nr) {
+				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+				
+				if (crt==1) {
+					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
+				}
+			}			
+		}		
+
+		//undo the 2-geno recoding in the reported effect
+		if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		for (size_t i=0; i<d_size; i++) {
+			v_beta[i]=gsl_vector_get (beta, i);			
+		}
+		
+		c=0;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=i; j<d_size; j++) {
+				v_Vg[c]=gsl_matrix_get (V_g, i, j);
+				v_Ve[c]=gsl_matrix_get (V_e, i, j);
+				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
+				c++;
+			}
+		}
+		
+		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
+		sumStat.push_back(SNPs);
+    }	
+	cout<<endl;
+	
+	
+	infile.close();
+	infile.clear();
+	
+	gsl_matrix_free(U_hat);
+	gsl_matrix_free(E_hat);
+	gsl_matrix_free(OmegaU);
+	gsl_matrix_free(OmegaE);
+	gsl_matrix_free(UltVehiY);
+	gsl_matrix_free(UltVehiBX);
+	gsl_matrix_free(UltVehiU);
+	gsl_matrix_free(UltVehiE);
+	
+	gsl_matrix_free(Hi_all);
+	gsl_matrix_free(Hiy_all);
+	gsl_matrix_free(xHi_all);
+	gsl_matrix_free(Hessian);
+	
+	gsl_vector_free(x);
+	gsl_vector_free(x_miss);
+	
+	gsl_matrix_free(Y);
+	gsl_matrix_free(X);	
+	gsl_matrix_free(V_g);
+	gsl_matrix_free(V_e);
+	gsl_matrix_free(B);
+	gsl_vector_free(beta);
+	gsl_matrix_free(Vbeta);
+	
+	gsl_matrix_free(V_g_null);
+	gsl_matrix_free(V_e_null);
+	gsl_matrix_free(B_null);	
+	gsl_matrix_free(se_B_null);
+	
+	return;
+}
+
+
+
+
+
+
+
+void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) 
+{
+	string file_bed=file_bfile+".bed";
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+	
+	clock_t time_start=clock();
+	time_UtX=0; time_opt=0;
+	
+	char ch[1];
+	bitset<8> b;
+	
+	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
+	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
+	double crt_a, crt_b, crt_c;
+	int n_bit, n_miss, ci_total, ci_test;
+	double geno, x_mean;
+	size_t c=0;
+	//	double s=0.0;
+	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;	
+	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;
+			
+	//large matrices for EM
+	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);	
+	
+	//large matrices for NR
+	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
+	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
+	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
+	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);
+	
+	gsl_vector *x=gsl_vector_alloc (n_size);
+	
+	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);		
+	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
+	gsl_vector *beta=gsl_vector_alloc (d_size);
+	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);
+		
+	//null estimates for initial values
+	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
+	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);	
+	gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size);
+	
+	gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);	
+	gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
+	gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);
+	
+	gsl_matrix_transpose_memcpy (Y, UtY);
+	gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW);
+	
+	gsl_vector_view X_row=gsl_matrix_row(X, c_size);
+	gsl_vector_set_zero(&X_row.vector);
+	gsl_vector_view B_col=gsl_matrix_column(B, c_size);
+	gsl_vector_set_zero(&B_col.vector);		
+	
+	//time_start=clock();			
+	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix);
+		
+	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
+	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
+	//cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+	
+	c=0;
+	Vg_remle_null.clear();
+	Ve_remle_null.clear();
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
+			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
+			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
+			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
+			c++;
+		}
+	}
+	beta_remle_null.clear(); 
+	se_beta_remle_null.clear();
+	for (size_t i=0; i<se_B_null->size1; i++) {
+		for (size_t j=0; j<se_B_null->size2; j++) {
+			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
+			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
+		}
+	}
+	logl_remle_H0=logl_H0;
+	
+	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+	cout.precision(4);
+	cout<<"REMLE estimate for Vg in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Vg): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"REMLE estimate for Ve in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Ve): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"REMLE likelihood = "<<logl_H0<<endl;
+	
+	//time_start=clock();	
+	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
+	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
+	//cout<<"time for MLE in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+		
+	c=0;
+	Vg_mle_null.clear();
+	Ve_mle_null.clear();
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
+			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
+			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
+			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
+			c++;
+		}
+	}
+	beta_mle_null.clear(); 
+	se_beta_mle_null.clear();
+	for (size_t i=0; i<se_B_null->size1; i++) {
+		for (size_t j=0; j<se_B_null->size2; j++) {
+			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
+			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
+		}
+	}
+	logl_mle_H0=logl_H0;
+	
+	cout<<"MLE estimate for Vg in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Vg): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"MLE estimate for Ve in the null model: "<<endl;
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+		}
+		cout<<endl;
+	}
+	cout<<"se(Ve): "<<endl;	
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=0; j<=i; j++) {
+			c=GetIndex(i, j, d_size);
+			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
+		}
+		cout<<endl;		
+	}
+	cout<<"MLE likelihood = "<<logl_H0<<endl;
+	
+	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
+	for (size_t i=0; i<d_size; i++) {
+		v_beta.push_back(0.0);
+	}
+	for (size_t i=0; i<d_size; i++) {
+		for (size_t j=i; j<d_size; j++) {
+			v_Vg.push_back(0.0);
+			v_Ve.push_back(0.0);
+			v_Vbeta.push_back(0.0);
+		}
+	}
+	
+	gsl_matrix_memcpy (V_g_null, V_g);
+	gsl_matrix_memcpy (V_e_null, V_e);
+	gsl_matrix_memcpy (B_null, B);	
+	
+	
+	//start reading genotypes and analyze	
+	
+	//calculate n_bit and c, the number of bit for each snp
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1; }
+	
+	//print the first three majic numbers
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+		
+		//if (t>=0) {break;}
+		//if (snpInfo[t].rs_number!="MAG18140902") {continue;}
+		//cout<<t<<endl;
+		
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+				
+		//read genotypes
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; 
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}
+				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
+				
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
+					else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
+					else {gsl_vector_set(x, ci_test, -9); n_miss++; }
+				}
+				
+				ci_total++;
+				ci_test++;
+			}
+		}
+		
+		x_mean/=(double)(ni_test-n_miss);
+		
+		for (size_t i=0; i<ni_test; ++i) {			
+			geno=gsl_vector_get(x,i);
+			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
+			if (x_mean>1) {
+				gsl_vector_set(x, i, 2-geno);
+			}
+		}		
+		
+		/*
+		if (t==0) {			
+			ofstream outfile ("./snp1.txt", ofstream::out);
+			if (!outfile) {cout<<"error writing file: "<<endl; return;}
+			for (size_t i=0; i<x->size; i++) {
+				outfile<<gsl_vector_get(x, i)<<endl;
+			}
+			outfile.clear();
+			outfile.close();			
+		}
+		*/
+	
+		//calculate statistics
+		time_start=clock();		
+		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
+		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//initial values
+		gsl_matrix_memcpy (V_g, V_g_null);
+		gsl_matrix_memcpy (V_e, V_e_null);
+		gsl_matrix_memcpy (B, B_null);
+		
+		time_start=clock();
+		
+		//3 is before 1
+		if (a_mode==3 || a_mode==4) {
+			p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
+			
+			if (p_score<p_nr && crt==1) {
+				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
+			}
+		}		
+		
+		if (a_mode==2 || a_mode==4) {
+			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+			//calculate beta and Vbeta
+			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );	
+			
+			if (p_lrt<p_nr) {
+				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+
+				//calculate beta and Vbeta
+				p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );	
+				if (crt==1) {
+					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
+				}				
+			}
+		}			
+		
+		if (a_mode==1 || a_mode==4) {
+			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+			
+			if (p_wald<p_nr) {
+				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+				p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+				
+				if (crt==1) {
+					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
+				}
+			}
+		}
+		
+		//cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl;
+		
+		if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
+		
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		
+		//store summary data
+		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		for (size_t i=0; i<d_size; i++) {
+			v_beta[i]=gsl_vector_get (beta, i);			
+		}
+
+		c=0;
+		for (size_t i=0; i<d_size; i++) {
+			for (size_t j=i; j<d_size; j++) {
+				v_Vg[c]=gsl_matrix_get (V_g, i, j);
+				v_Ve[c]=gsl_matrix_get (V_e, i, j);
+				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
+				c++;
+			}
+		}
+		
+		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
+		sumStat.push_back(SNPs);
+    }	
+	cout<<endl;	
+	
+	//cout<<"time_opt = "<<time_opt<<endl;
+	
+	infile.close();
+	infile.clear();
+	
+	gsl_matrix_free(U_hat);
+	gsl_matrix_free(E_hat);
+	gsl_matrix_free(OmegaU);
+	gsl_matrix_free(OmegaE);
+	gsl_matrix_free(UltVehiY);
+	gsl_matrix_free(UltVehiBX);
+	gsl_matrix_free(UltVehiU);
+	gsl_matrix_free(UltVehiE);
+	
+	gsl_matrix_free(Hi_all);
+	gsl_matrix_free(Hiy_all);
+	gsl_matrix_free(xHi_all);
+	gsl_matrix_free(Hessian);
+	
+	gsl_vector_free(x);
+	
+	gsl_matrix_free(Y);
+	gsl_matrix_free(X);	
+	gsl_matrix_free(V_g);
+	gsl_matrix_free(V_e);
+	gsl_matrix_free(B);
+	gsl_vector_free(beta);
+	gsl_matrix_free(Vbeta);
+	
+	gsl_matrix_free(V_g_null);
+	gsl_matrix_free(V_e_null);
+	gsl_matrix_free(B_null);
+	gsl_matrix_free(se_B_null);
+	
+	return;
+}
+
+
+
+
+//Fit the null multivariate LMM (covariates only) in REML mode ('R') and report its estimates.
+//eval is handed unchanged to the Mph* routines; UtW is n x c and UtY is n x d, and transposed copies of both are made below.
+//Outputs: V_g, V_e, B and se_B are filled by the Mph* routines; both B and se_B are d by c matrices.
+void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B)
+{
+	const size_t n_size=UtY->size1;
+	const size_t d_size=UtY->size2;
+	const size_t c_size=UtW->size2;
+	const size_t dc_size=d_size*c_size;
+	const size_t v_size=d_size*(d_size+1)/2;	//number of entries in the upper triangle of a d x d matrix
+
+	//needed only to satisfy the MphNR argument list; not read afterwards in this function
+	double crt_a, crt_b, crt_c;
+
+	//d x n workspaces for the EM step
+	gsl_matrix *em_U=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *em_E=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *em_OmegaU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *em_OmegaE=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *em_UltVehiY=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *em_UltVehiBX=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *em_UltVehiU=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *em_UltVehiE=gsl_matrix_alloc (d_size, n_size);
+
+	//workspaces for the NR step
+	gsl_matrix *nr_Hi=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
+	gsl_matrix *nr_Hiy=gsl_matrix_alloc (d_size, n_size);			//each column is H_k^{-1}y_k
+	gsl_matrix *nr_xHi=gsl_matrix_alloc (dc_size, d_size*n_size);	//each dcxdc block is x_k\otimes H_k^{-1}
+	gsl_matrix *nr_Hessian=gsl_matrix_alloc (v_size*2, v_size*2);
+
+	//the Mph* routines take the data with one column per row of UtY/UtW
+	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
+	gsl_matrix *W=gsl_matrix_alloc (c_size, n_size);
+	gsl_matrix_transpose_memcpy (Y, UtY);
+	gsl_matrix_transpose_memcpy (W, UtW);
+
+	//starting values, then EM, then NR, then B and se(B); the log-likelihoods returned by MphEM/MphNR are not needed here
+	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, W, Y, l_min, l_max, n_region, V_g, V_e, B);
+	MphEM ('R', em_iter, em_prec, eval, W, Y, em_U, em_E, em_OmegaU, em_OmegaE, em_UltVehiY, em_UltVehiBX, em_UltVehiU, em_UltVehiE, V_g, V_e, B);
+	MphNR ('R', nr_iter, nr_prec, eval, W, Y, nr_Hi, nr_xHi, nr_Hiy, V_g, V_e, nr_Hessian, crt_a, crt_b, crt_c);
+	MphCalcBeta (eval, W, Y, V_g, V_e, em_UltVehiY, B, se_B);
+
+	//release every matrix allocated above, in allocation order
+	gsl_matrix *owned[]={em_U, em_E, em_OmegaU, em_OmegaE, em_UltVehiY, em_UltVehiBX, em_UltVehiU, em_UltVehiE, nr_Hi, nr_Hiy, nr_xHi, nr_Hessian, Y, W};
+	for (size_t i=0; i<sizeof(owned)/sizeof(owned[0]); i++) {
+		gsl_matrix_free(owned[i]);
+	}
+}
+
diff --git a/src/mvlmm.h b/src/mvlmm.h
new file mode 100644
index 0000000..129879c
--- /dev/null
+++ b/src/mvlmm.h
@@ -0,0 +1,94 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __MVLMM_H__                
+#define __MVLMM_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+
+
+
+class MVLMM {
+	//Multivariate LMM association analysis: settings exchanged with PARAM, null-model estimates, and per-SNP results.
+public:
+	// IO related parameters
+	int a_mode;				//analysis mode, 1/2/3/4 for Frequentist tests (the analysis code runs 1: Wald, 2: likelihood ratio, 3: score, 4: all three)
+	size_t d_pace;		//display pace: the progress bar is refreshed every d_pace SNPs
+	
+	string file_bfile;		//presumably the PLINK binary file prefix, as in PARAM -- confirm in CopyFromParam
+	string file_geno;		//presumably the BIMBAM mean genotype file, as in PARAM -- confirm in CopyFromParam
+	string file_out;
+	string path_out;
+	
+	// MVLMM related parameters
+	double l_min;		//l_min, l_max and n_region are handed to MphInitial when the null model is initialized
+	double l_max;
+	size_t n_region;
+	double logl_remle_H0, logl_mle_H0;		//null-model log-likelihoods from the REML ('R') and ML ('L') fits
+	vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null;		//upper triangles (row by row, j>=i) of the null-model V_g and V_e
+	vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null;		//matching diagonal entries of the Hessian matrix filled by MphNR; their square roots are printed as se(Vg)/se(Ve)
+	vector<double> beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null;		//null-model B and se(B), stored row by row
+	double p_nr;		//p-value threshold: the per-SNP NR refinement is run only when the preliminary p-value is below p_nr
+	size_t em_iter, nr_iter;		//iteration arguments passed to MphEM and MphNR
+	double em_prec, nr_prec;		//precision arguments passed to MphEM and MphNR
+	size_t crt;		//when equal to 1, refined p-values are additionally passed through PCRT
+		
+	// Summary statistics
+	size_t ni_total, ni_test;	//number of individuals
+	size_t ns_total, ns_test;	//number of snps
+	size_t n_cvt;
+	size_t n_ph;
+	double time_UtX;		//time (in minutes) spent multiplying each SNP's genotype vector by U^T
+	double time_opt;		//time (in minutes) spent on optimization iterations
+	
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	
+	// Not included in PARAM
+	vector<MPHSUMSTAT> sumStat;		//Output SNPSummary Data: one entry is appended per analyzed SNP
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY);
+	void AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY);
+	void WriteFiles ();
+	
+};
+
+void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B);	//fits the null mvLMM in REML mode; V_g, V_e, B and se_B are outputs (B and se_B are d by c)
+
+#endif
+
+
diff --git a/src/param.cpp b/src/param.cpp
new file mode 100644
index 0000000..7a89ff8
--- /dev/null
+++ b/src/param.cpp
@@ -0,0 +1,849 @@
+/*
+    Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <sys/stat.h>
+#include <cmath>
+#include <algorithm>
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+
+
+
+//Default constructor: gives each option listed below its default value.
+PARAM::PARAM()
+	//run mode and output location
+	: mode_silence(false), a_mode(0), k_mode(1), d_pace(100000)
+	, file_out("result"), path_out("./output/")
+	//thresholds handed to the genotype readers
+	, miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999)
+	//lambda range and EM/NR controls
+	, l_min(1e-5), l_max(1e5), n_region(10), p_nr(0.001)
+	, em_prec(0.0001), nr_prec(0.0001), em_iter(10000), nr_iter(100), crt(0)
+	, pheno_mean(0)
+	//NOTE(review): the -1 defaults presumably mean "not set by the user" -- confirm against the code that reads them
+	, h_min(-1), h_max(-1), h_scale(-1)
+	, rho_min(0.0), rho_max(1.0), rho_scale(-1)
+	, logp_min(0.0), logp_max(0.0), logp_scale(-1)
+	, s_min(0), s_max(300)
+	, w_step(100000), s_step(1000000)
+	, r_pace(10), w_pace(1000)
+	, n_accept(0), n_mh(10), geo_mean(2000.0), randseed(-1)
+	, error(false)
+	, n_cvt(1), n_vc(1)
+	//all timers start at zero
+	, time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0)
+{
+}
+
+
+//Read every input file named by the file_* members and fill the matching data members.
+//Obtains ns_total, ng_total, ns_test, ni_test along the way.
+//A reader that fails sets error=true and reading continues; the function returns early only
+//(a) after the prediction branch (file_epm given) and (b) when the total read file is unusable.
+void PARAM::ReadFiles (void) 
+{
+	string file_str;
+	if (!file_mk.empty()) {				
+	  if (CountFileLines (file_mk, n_vc)==false) {error=true;}
+	}
+	
+	if (!file_snps.empty()) {
+		if (ReadFile_snps (file_snps, setSnps)==false) {error=true;}
+	} else {
+		setSnps.clear();
+	}
+	
+	//for prediction: only SNP/phenotype bookkeeping is read here, then the function returns
+	if (!file_epm.empty()) {
+		if (ReadFile_est (file_epm, est_column, mapRS2est)==false) {error=true;}
+		
+		if (!file_bfile.empty()) {
+			file_str=file_bfile+".bim";
+			if (ReadFile_bim (file_str, snpInfo)==false) {error=true;}		
+			
+			file_str=file_bfile+".fam";
+			if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;}			
+		}
+		
+		if (!file_geno.empty()) {			
+			if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}		
+			
+			if (CountFileLines (file_geno, ns_total)==false) {error=true;}	
+		}
+		
+		if (!file_ebv.empty() ) {
+			if (ReadFile_column (file_ebv, indicator_bv, vec_bv, 1)==false) {error=true;}
+		}
+		
+		if (!file_log.empty() ) {
+			if (ReadFile_log (file_log, pheno_mean)==false) {error=true;}
+		}
+		
+		//convert indicator_pheno to indicator_idv: an individual is kept (1) only if none of its phenotype indicators is 0
+		int k=1;
+		for (size_t i=0; i<indicator_pheno.size(); i++) {
+			k=1;
+			for (size_t j=0; j<indicator_pheno[i].size(); j++) {
+				if (indicator_pheno[i][j]==0) {k=0;}
+			}
+			indicator_idv.push_back(k);
+		}
+		
+		ns_test=0;
+		
+		return;
+	}
+	
+	//read covariates before the genotype files; n_cvt falls back to 1 when no covariates were read
+	if (!file_cvt.empty() ) {
+		if (ReadFile_cvt (file_cvt, indicator_cvt, cvt, n_cvt)==false) {error=true;}
+
+		if ((indicator_cvt).size()==0) {
+			n_cvt=1;
+		} 		
+	} else {
+		n_cvt=1;
+	}
+
+	//read genotype and phenotype file for plink format
+	if (!file_bfile.empty()) {
+		file_str=file_bfile+".bim";
+		if (ReadFile_bim (file_str, snpInfo)==false) {error=true;}		
+		
+		file_str=file_bfile+".fam";
+		if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;}
+		
+		//post-process covariates and phenotypes, obtain ni_test, save all useful covariates
+		ProcessCvtPhen();
+		
+		//obtain covariate matrix (temporary; freed right after the genotype file is read)
+		gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt);
+		CopyCvt (W);
+		
+		file_str=file_bfile+".bed";
+		if (ReadFile_bed (file_str, setSnps, W, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test)==false) {error=true;}
+		
+		gsl_matrix_free(W);
+		
+		ns_total=indicator_snp.size();
+	}
+	
+	//read genotype and phenotype file for bimbam format
+	if (!file_geno.empty()) {
+		//annotation file before genotype file
+		if (!file_anno.empty() ) {
+			if (ReadFile_anno (file_anno, mapRS2chr, mapRS2bp, mapRS2cM)==false) {error=true;}
+		}
+
+		//phenotype file before genotype file
+		if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}
+
+		//post-process covariates and phenotypes, obtain ni_test, save all useful covariates
+		ProcessCvtPhen();
+		
+		//obtain covariate matrix (temporary; freed right after the genotype file is read)
+		gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt);
+		CopyCvt (W);
+
+		if (ReadFile_geno (file_geno, setSnps, W, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test)==false) {error=true;}
+
+		gsl_matrix_free(W);
+		
+		ns_total=indicator_snp.size();
+	}
+	
+	if (!file_gene.empty()) {
+		if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}
+		
+		//convert indicator_pheno to indicator_idv: an individual is kept (1) only if none of its phenotype indicators is 0
+		int k=1;
+		for (size_t i=0; i<indicator_pheno.size(); i++) {
+			k=1;
+			for (size_t j=0; j<indicator_pheno[i].size(); j++) {
+				if (indicator_pheno[i][j]==0) {k=0;}
+			}
+			indicator_idv.push_back(k);
+		}
+		
+		if (ReadFile_gene (file_gene, vec_read, snpInfo, ng_total)==false) {error=true;}	
+	}
+	
+				
+	//read is after gene file
+	if (!file_read.empty() ) {
+		if (ReadFile_column (file_read, indicator_read, vec_read, 1)==false) {error=true;}
+		
+		//the loop below indexes indicator_read once per individual, so a row-count mismatch
+		//(including a failed read) must be rejected here to avoid reading out of bounds
+		if (indicator_read.size()!=indicator_idv.size()) {
+			error=true;
+			cout<<"error! number of rows in the total read file do not match the number of individuals. "<<endl;
+			return;
+		}
+		
+		//drop individuals without a usable total read and recount the analyzed individuals
+		ni_test=0; 
+		for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+			indicator_idv[i]*=indicator_read[i];
+			ni_test+=indicator_idv[i];
+		}
+		
+		if (ni_test==0) {
+			error=true;
+			cout<<"error! number of analyzed individuals equals 0. "<<endl;
+			return;
+		}
+	}
+	
+	//for ridge prediction, read phenotype only
+	if (file_geno.empty() && file_gene.empty() && !file_pheno.empty()) {
+		if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}	
+				
+		//post-process covariates and phenotypes, obtain ni_test, save all useful covariates
+		ProcessCvtPhen();
+	}
+
+	return;
+}
+
+
+
+
+
+
+//Validate the command-line settings before any data is read.
+//Every violation prints a message to cout and sets error=true; checking continues so that
+//all problems are reported in one pass. Also fills in the defaults for p_column and est_column
+//and sets n_ph to the number of phenotype columns.
+void PARAM::CheckParam (void) 
+{	
+	struct stat fileInfo;
+	string str;
+	
+	//check parameters
+	if (k_mode!=1 && k_mode!=2) {cout<<"error! unknown kinship/relatedness input mode: "<<k_mode<<endl; error=true;}
+	if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=21 && a_mode!=22 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61)   
+	{cout<<"error! unknown analysis mode: "<<a_mode<<". make sure -gk or -eigen or -lmm or -bslmm or -predict is sepcified correctly."<<endl; error=true;}
+	if (miss_level>1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<<miss_level<<endl; error=true;}
+	if (maf_level>0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<<maf_level<<endl; error=true;}
+	if (hwe_level>1) {cout<<"error! hwe level needs to be between 0 and 1. current value = "<<hwe_level<<endl; error=true;}
+	if (r2_level>1) {cout<<"error! r2 level needs to be between 0 and 1. current value = "<<r2_level<<endl; error=true;}
+	
+	//each (min, max) pair must be ordered
+	if (l_max<l_min) {cout<<"error! maximum lambda value must be larger than the minimal value. current values = "<<l_max<<" and "<<l_min<<endl; error=true;}	
+	if (h_max<h_min) {cout<<"error! maximum h value must be larger than the minimal value. current values = "<<h_max<<" and "<<h_min<<endl; error=true;}
+	if (s_max<s_min) {cout<<"error! maximum s value must be larger than the minimal value. current values = "<<s_max<<" and "<<s_min<<endl; error=true;}
+	if (rho_max<rho_min) {cout<<"error! maximum rho value must be larger than the minimal value. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;}
+	if (logp_max<logp_min) {cout<<"error! maximum logp value must be larger than the minimal value. current values = "<<logp_max/log(10)<<" and "<<logp_min/log(10)<<endl; error=true;}
+	
+	//upper bounds (the lambda ordering is already checked once above, so it is not repeated here)
+	if (h_max>1) {cout<<"error! h values must be bewtween 0 and 1. current values = "<<h_max<<" and "<<h_min<<endl; error=true;}
+	if (rho_max>1) {cout<<"error! rho values must be between 0 and 1. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;}
+	if (logp_max>0) {cout<<"error! maximum logp value must be smaller than 0. current values = "<<logp_max/log(10)<<" and "<<logp_min/log(10)<<endl; error=true;}
+		
+	if (h_scale>1.0) {cout<<"error! hscale value must be between 0 and 1. current value = "<<h_scale<<endl; error=true;}
+	if (rho_scale>1.0) {cout<<"error! rscale value must be between 0 and 1. current value = "<<rho_scale<<endl; error=true;}
+	if (logp_scale>1.0) {cout<<"error! pscale value must be between 0 and 1. current value = "<<logp_scale<<endl; error=true;}
+
+	if (rho_max==1 && rho_min==1 && a_mode==12) {cout<<"error! ridge regression does not support a rho parameter. current values = "<<rho_max<<" and "<<rho_min<<endl; error=true;}
+		
+	//check p_column, and (no need to) sort p_column into ascending order
+	//default is phenotype column 1; duplicates among user-supplied columns are an error
+	if (p_column.size()==0) {
+		p_column.push_back(1);
+	} else {
+		for (size_t i=0; i<p_column.size(); i++) {
+			for (size_t j=0; j<i; j++) {
+				if (p_column[i]==p_column[j]) {cout<<"error! identical phenotype columns: "<<p_column[i]<<endl; error=true;}
+			}
+		}
+	}
+	
+	//sort (p_column.begin(), p_column.end() );
+	n_ph=p_column.size();
+	
+		
+	
+	//only lmm option (and one prediction option) can deal with multiple phenotypes
+	//and no gene expression files
+	if (n_ph>1 && a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=43) {
+		cout<<"error! the current analysis mode "<<a_mode<<" can not deal with multiple phenotypes."<<endl; error=true;
+	}
+	if (n_ph>1 && !file_gene.empty() ) {
+		cout<<"error! multiple phenotype analysis option not allowed with gene expression files. "<<endl; error=true;
+	}
+	
+	if (p_nr>1) {
+		cout<<"error! pnr value must be between 0 and 1. current value = "<<p_nr<<endl; error=true;
+	}
+	
+	//check est_column; when -en was not given, four default columns are used (the second differs when an ebv file is supplied)
+	if (est_column.size()==0) {
+		if (file_ebv.empty()) {
+			est_column.push_back(2);
+			est_column.push_back(5);
+			est_column.push_back(6);
+			est_column.push_back(7);
+		} else {
+			est_column.push_back(2);
+			est_column.push_back(0);
+			est_column.push_back(6);
+			est_column.push_back(7);
+		}
+	}
+	
+	//est_column is never empty here (defaults were filled in above), so est_column[0] is safe to read
+	if (est_column.size()!=4) {cout<<"error! -en not followed by four numbers. current number = "<<est_column.size()<<endl; error=true;}	
+	if (est_column[0]==0) {cout<<"error! -en rs column can not be zero. current number = "<<est_column[0]<<endl; error=true;}
+	
+	//check if files are compatible with each other, and if files exist
+	if (!file_bfile.empty()) {
+		str=file_bfile+".bim";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .bim file: "<<str<<endl; error=true;}
+		str=file_bfile+".bed";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .bed file: "<<str<<endl; error=true;}
+		str=file_bfile+".fam";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .fam file: "<<str<<endl; error=true;}			
+	}
+	
+	if ((!file_geno.empty() || !file_gene.empty()) ) {
+		str=file_pheno;
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open phenotype file: "<<str<<endl; error=true;}
+	}	
+	
+	str=file_geno;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mean genotype file: "<<str<<endl; error=true;}
+	
+	str=file_gene;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open gene expression file: "<<str<<endl; error=true;}
+	
+	//exactly one genotype-like input is required, except for modes 43, 5 and 61
+	size_t flag=0;
+	if (!file_bfile.empty()) {flag++;}
+	if (!file_geno.empty()) {flag++;}
+	if (!file_gene.empty()) {flag++;}
+	
+	if (flag!=1 && a_mode!=43 && a_mode!=5 && a_mode!=61) {
+		cout<<"error! either plink binary files, or bimbam mean genotype files, or gene expression files are required."<<endl; error=true;
+	}
+	
+	if (file_pheno.empty() && (a_mode==43 || a_mode==5 || a_mode==61) ) {
+		cout<<"error! phenotype file is required."<<endl; error=true;
+	}
+	
+	if (!file_epm.empty() && file_bfile.empty() && file_geno.empty() ) {cout<<"error! estimated parameter file also requires genotype file."<<endl; error=true;}
+	if (!file_ebv.empty() && file_kin.empty()) {cout<<"error! estimated breeding value file also requires relatedness file."<<endl; error=true;}
+	
+	if (!file_log.empty() && pheno_mean!=0) {cout<<"error! either log file or mu value can be provide."<<endl; error=true;}
+	
+	//every optional file that was named must exist
+	str=file_snps;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open snps file: "<<str<<endl; error=true;}
+	
+	str=file_log;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open log file: "<<str<<endl; error=true;}
+	
+	str=file_anno;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open annotation file: "<<str<<endl; error=true;}
+
+	str=file_kin;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open relatedness matrix file: "<<str<<endl; error=true;}
+
+	str=file_mk;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open relatedness matrix file: "<<str<<endl; error=true;}
+	
+	str=file_cvt;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open covariates file: "<<str<<endl; error=true;}
+	
+	str=file_epm;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open estimated parameter file: "<<str<<endl; error=true;}
+	
+	str=file_ebv;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open estimated breeding value file: "<<str<<endl; error=true;}
+	
+	str=file_read;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open total read file: "<<str<<endl; error=true;}
+		
+	//check if files are compatible with analysis mode
+	if (k_mode==2 && !file_geno.empty() ) {cout<<"error! use \"-km 1\" when using bimbam mean genotype file. "<<endl; error=true;}
+	
+	if ((a_mode==1 || a_mode==2 || a_mode==3 || a_mode==4 || a_mode==5 || a_mode==31) && (file_kin.empty() && (file_ku.empty()||file_kd.empty())) )  {cout<<"error! missing relatedness file. "<<endl;  error=true;}
+
+	if (a_mode==61 && (file_kin.empty() && (file_ku.empty()||file_kd.empty()) && file_mk.empty() ) )  {cout<<"error! missing relatedness file. "<<endl;  error=true;}
+
+	if ((a_mode==43) && file_kin.empty())  {cout<<"error! missing relatedness file. -predict option requires -k option to provide a relatedness file."<<endl;  error=true;}
+	
+	if ((a_mode==11 || a_mode==12 || a_mode==13) && !file_cvt.empty() ) {cout<<"error! -bslmm option does not support covariates files."<<endl; error=true;}
+		
+	if (a_mode==41 || a_mode==42) {
+		if (!file_cvt.empty() ) {cout<<"error! -predict option does not support covariates files."<<endl; error=true;}	
+		if (file_epm.empty() ) {cout<<"error! -predict option requires estimated parameter files."<<endl; error=true;}		
+	}
+
+	return;
+}
+
+
+		
+
+
+//Cross-check the loaded phenotypes, covariates and read counts, derive the
+//sample counters (ni_total, ni_test, ni_cvt, np_obs, np_miss), print a summary
+//to stdout, and fill in mode-specific defaults (BSLMM proposal scales, s_max).
+//Problems are reported on stdout and flagged through `error`; most of them
+//return immediately.
+void PARAM::CheckData (void) {
+	//with no covariates file, or no covariate rows loaded, n_cvt is set to 1
+	if ((file_cvt).empty() || (indicator_cvt).size()==0) {
+		n_cvt=1;
+	}
+	if ( (indicator_cvt).size()!=0 && (indicator_cvt).size()!=(indicator_idv).size()) {
+		error=true;
+		cout<<"error! number of rows in the covariates file do not match the number of individuals. "<<endl;
+		return;
+	}
+	
+	if ( (indicator_read).size()!=0 && (indicator_read).size()!=(indicator_idv).size()) {
+		error=true;
+		cout<<"error! number of rows in the total read file do not match the number of individuals. "<<endl;
+		return;
+	}
+
+	//calculate ni_total, ni_test and ni_cvt (indicator_idv itself is not modified here;
+	//the masking by indicator_cvt/indicator_read lives in the disabled block below)
+	//and calculate np_obs and np_miss
+	ni_total=(indicator_idv).size();
+	
+	ni_test=0; 
+	for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+		if (indicator_idv[i]==0) {continue;}
+		ni_test++;
+	}
+	
+	ni_cvt=0;
+	for (size_t i=0; i<indicator_cvt.size(); i++) {
+		if (indicator_cvt[i]==0) {continue;}
+		ni_cvt++;
+	}
+
+	//count observed/missing phenotype entries, skipping individuals whose covariates are missing
+	np_obs=0; np_miss=0;
+	for (size_t i=0; i<indicator_pheno.size(); i++) {
+		if (indicator_cvt.size()!=0) {
+			if (indicator_cvt[i]==0) {continue;}
+		}
+		
+		for (size_t j=0; j<indicator_pheno[i].size(); j++) {					
+			if (indicator_pheno[i][j]==0) {
+				np_miss++;
+			} else {
+				np_obs++;
+			}
+		}
+	}
+
+	/*
+	if ((indicator_cvt).size()!=0) {
+		ni_test=0; 
+		for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+			indicator_idv[i]*=indicator_cvt[i];
+			ni_test+=indicator_idv[i];
+		}
+	}	
+	
+	if ((indicator_read).size()!=0) {
+		ni_test=0; 
+		for (vector<int>::size_type i=0; i<(indicator_idv).size(); ++i) {
+			indicator_idv[i]*=indicator_read[i];
+			ni_test+=indicator_idv[i];
+		}
+	}
+	*/
+	if (ni_test==0) {
+		error=true;
+		cout<<"error! number of analyzed individuals equals 0. "<<endl;
+		return;
+	}
+	
+	//mode 43: there must be something to predict, and the phenotype counts must add up
+	if (a_mode==43) {
+		if (ni_cvt==ni_test) {
+			error=true;
+			cout<<"error! no individual has missing phenotypes."<<endl; 
+			return;
+		}
+		if ((np_obs+np_miss)!=(ni_cvt*n_ph)) {
+			error=true;
+			//cout<<ni_cvt<<"\t"<<ni_test<<"\t"<<ni_total<<"\t"<<np_obs<<"\t"<<np_miss<<"\t"<<indicator_cvt.size()<<endl;
+			cout<<"error! number of phenotypes do not match the summation of missing and observed phenotypes."<<endl; 
+			return;
+		}
+	}
+
+	//output some information
+	cout<<"## number of total individuals = "<<ni_total<<endl;
+	if (a_mode==43) {
+		cout<<"## number of analyzed individuals = "<<ni_cvt<<endl;
+		cout<<"## number of individuals with full phenotypes = "<<ni_test<<endl;
+	} else {
+		cout<<"## number of analyzed individuals = "<<ni_test<<endl;
+	}
+	cout<<"## number of covariates = "<<n_cvt<<endl;
+	cout<<"## number of phenotypes = "<<n_ph<<endl;
+	if (a_mode==43) {
+		cout<<"## number of observed data = "<<np_obs<<endl;
+		cout<<"## number of missing data = "<<np_miss<<endl;
+	}
+	if (!file_gene.empty()) {		
+		cout<<"## number of total genes = "<<ng_total<<endl;
+	} else if (file_epm.empty() && a_mode!=43 && a_mode!=5) {
+		cout<<"## number of total SNPs = "<<ns_total<<endl;	
+		cout<<"## number of analyzed SNPs = "<<ns_test<<endl;
+	} else {}
+	
+	//set d_pace to 1000 for gene expression
+	//(only when d_pace still holds the value 100000)
+	if (!file_gene.empty() && d_pace==100000) {
+		d_pace=1000;
+	}
+	
+	//for case-control studies, count #cases and #controls
+	//(phenotype 0 counts as control, 1 as case; anything else raises flag_cc)
+	int flag_cc=0;
+	if (a_mode==13) {	
+		ni_case=0;
+		ni_control=0;
+		for (size_t i=0; i<indicator_idv.size(); i++) {
+			if (indicator_idv[i]==0) {continue;}
+		
+			if (pheno[i][0]==0) {ni_control++;}
+			else if (pheno[i][0]==1) {ni_case++;}
+			else {flag_cc=1;}
+		}
+		cout<<"## number of cases = "<<ni_case<<endl;	
+		cout<<"## number of controls = "<<ni_control<<endl;	
+	}	
+	
+	//a non-binary phenotype switches the analysis from mode 13 to mode 11
+	if (flag_cc==1) {cout<<"Unexpected non-binary phenotypes for case/control analysis. Use default (BSLMM) analysis instead."<<endl; a_mode=11;}
+	
+	//set parameters for BSLMM
+	//and check for predict
+	if (a_mode==11 || a_mode==12 || a_mode==13) {
+		if (a_mode==11) {n_mh=1;}	
+		//0 and -1 act as "not set" sentinels for the values below
+		if (logp_min==0) {logp_min=-1.0*log((double)ns_test);}
+	
+		if (h_scale==-1) {h_scale=min(1.0, 10.0/sqrt((double)ni_test) );}
+		if (rho_scale==-1) {rho_scale=min(1.0, 10.0/sqrt((double)ni_test) );}
+		if (logp_scale==-1) {logp_scale=min(1.0, 5.0/sqrt((double)ni_test) );}
+		
+		if (h_min==-1) {h_min=0.0;}
+		if (h_max==-1) {h_max=1.0;}
+		
+		if (s_max>ns_test) {s_max=ns_test; cout<<"s_max is re-set to the number of analyzed SNPs."<<endl;}
+		if (s_max<s_min) {cout<<"error! maximum s value must be larger than the minimal value. current values = "<<s_max<<" and "<<s_min<<endl; error=true;}
+	} else if (a_mode==41 || a_mode==42) {		
+		//the breeding value file must have the same rows, and the same missing pattern, as the phenotypes
+		if (indicator_bv.size()!=0) {
+			if (indicator_idv.size()!=indicator_bv.size()) {
+				cout<<"error! number of rows in the phenotype file does not match that in the estimated breeding value file: "<<indicator_idv.size()<<"\t"<<indicator_bv.size()<<endl;
+				error=true;
+			} else {
+				size_t flag_bv=0;
+				for (size_t i=0; i<(indicator_bv).size(); ++i) {
+					if (indicator_idv[i]!=indicator_bv[i]) {flag_bv++;}
+				}
+				if (flag_bv!=0) {
+					cout<<"error! individuals with missing value in the phenotype file does not match that in the estimated breeding value file: "<<flag_bv<<endl;
+					error=true;
+				}
+			}
+		}
+	}
+
+	//file_mk needs to contain more than one line
+	if (n_vc==1 && !file_mk.empty()) {cout<<"error! -mk file should contain more than one line."<<endl; error=true;}
+	
+	return;
+}
+
+
+void PARAM::PrintSummary () 
+{
+	if (n_ph==1) {
+		cout<<"pve estimate ="<<pve_null<<endl;
+		cout<<"se(pve) ="<<pve_se_null<<endl;
+	} else {
+		
+	}
+	return;
+}
+
+
+
+//Read genotypes through ReadFile_bed (PLINK "<bfile>.bed") when a bfile prefix
+//is set, otherwise through ReadFile_geno (file_geno). UtX, K and calc_K are
+//forwarded unchanged; a failed read raises the error flag.
+void PARAM::ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
+	if (file_bfile.empty()) {
+		if (!ReadFile_geno (file_geno, indicator_idv, indicator_snp, UtX, K, calc_K)) {error=true;}
+		return;
+	}
+
+	string bed_path=file_bfile+".bed";
+	if (!ReadFile_bed (bed_path, indicator_idv, indicator_snp, UtX, K, calc_K)) {error=true;}
+}
+		
+
+
+
+//Zero matrix_kin and fill it through PlinkKin ("<bfile>.bed") when a bfile
+//prefix is set, otherwise through BimbamKin (file_geno). The helper receives
+//a_mode-20 as its mode argument; a failure raises the error flag.
+void PARAM::CalcKin (gsl_matrix *matrix_kin)  {
+	gsl_matrix_set_zero (matrix_kin);
+
+	string geno_path;
+	if (file_bfile.empty()) {
+		geno_path=file_geno;
+		if (!BimbamKin (geno_path, indicator_snp, a_mode-20, d_pace, matrix_kin)) {error=true;}
+	} else {
+		geno_path=file_bfile+".bed";
+		if (!PlinkKin (geno_path, indicator_snp, a_mode-20, d_pace, matrix_kin)) {error=true;}
+	}
+}
+		
+
+
+
+
+//Write matrix_U to "<path_out>/<file_out>.<suffix>.txt", one matrix row per
+//line, every entry followed by a tab, using a stream precision of 10.
+//If the file cannot be opened an error is printed and nothing is written.
+void PARAM::WriteMatrix (const gsl_matrix *matrix_U, const string suffix) 
+{
+	string out_path=path_out+"/"+file_out+"."+suffix+".txt";
+
+	ofstream outfile (out_path.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<out_path.c_str()<<endl; return;}
+
+	outfile.precision(10);
+
+	const size_t n_row=matrix_U->size1;
+	const size_t n_col=matrix_U->size2;
+	for (size_t r=0; r<n_row; ++r) {
+		for (size_t c=0; c<n_col; ++c) {
+			outfile<<gsl_matrix_get (matrix_U, r, c)<<"\t";
+		}
+		outfile<<endl;
+	}
+
+	outfile.close();
+	outfile.clear();
+}
+
+
+//Write vector_D to "<path_out>/<file_out>.<suffix>.txt", one entry per line,
+//using a stream precision of 10. If the file cannot be opened an error is
+//printed and nothing is written.
+void PARAM::WriteVector (const gsl_vector *vector_D, const string suffix) 
+{
+	string out_path=path_out+"/"+file_out+"."+suffix+".txt";
+
+	ofstream outfile (out_path.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<out_path.c_str()<<endl; return;}
+
+	outfile.precision(10);
+
+	const size_t n_elem=vector_D->size;
+	for (size_t k=0; k<n_elem; ++k) {
+		outfile<<gsl_vector_get (vector_D, k)<<endl;
+	}
+
+	outfile.close();
+	outfile.clear();
+}
+
+
+//Make sure the covariates contain an intercept.
+//The covariates of the analyzed individuals (indicator_idv and indicator_cvt
+//both non-zero) are copied into a temporary matrix and each column is tested
+//for being constant:
+// - every column constant: indicator_cvt is cleared and n_cvt is set to 1;
+// - no column constant: a column of 1s is appended to cvt and n_cvt grows by 1;
+// - otherwise nothing changes.
+//Does nothing when indicator_cvt is empty. Relies on ni_test being up to date.
+void PARAM::CheckCvt () 
+{
+	if (indicator_cvt.size()==0) {return;}
+		
+	size_t ci_test=0;
+	
+	gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt);
+	
+	for (vector<int>::size_type i=0; i<indicator_idv.size(); ++i) {
+		if (indicator_idv[i]==0 || indicator_cvt[i]==0) {continue;}
+		for (size_t j=0; j<n_cvt; ++j) {
+			gsl_matrix_set (W, ci_test, j, (cvt)[i][j]);
+		}
+		ci_test++;
+	}
+
+	size_t flag_ipt=0;
+	double v_min, v_max;
+	set<size_t> set_remove;
+	
+	//check if any column is an intercept, i.e. constant across the analyzed individuals
+	for (size_t i=0; i<W->size2; i++) {
+		gsl_vector_view w_col=gsl_matrix_column (W, i);
+		gsl_vector_minmax (&w_col.vector, &v_min, &v_max);
+		if (v_min==v_max) {flag_ipt=1; set_remove.insert (i);}
+	}
+	
+	//add an intercept term if needed
+	if (n_cvt==set_remove.size()) {
+		indicator_cvt.clear();
+		n_cvt=1;
+	} else if (flag_ipt==0) {
+		cout<<"no intecept term is found in the cvt file. a column of 1s is added."<<endl;
+		//append the 1 to every row so that cvt stays rectangular: CopyCvtPhen with
+		//flag!=0 selects rows by indicator_cvt alone and reads n_cvt entries from each,
+		//including rows whose indicator_idv is 0
+		for (size_t i=0; i<cvt.size(); ++i) {
+			cvt[i].push_back(1.0);
+		}
+		
+		n_cvt++;
+	} else {}	
+	
+	gsl_matrix_free(W);
+	
+	return;
+}
+
+
+//post-process phenotypes and covariates:
+//rebuild indicator_idv from indicator_pheno and indicator_cvt, recount ni_test,
+//then either validate the covariates (CheckCvt) or create an intercept-only
+//covariate set when none was loaded
+void PARAM::ProcessCvtPhen ()
+{	
+	//an individual is kept only if none of its phenotypes is missing
+	indicator_idv.clear();
+	for (size_t i=0; i<indicator_pheno.size(); i++) {
+		int complete=1;
+		for (size_t j=0; j<indicator_pheno[i].size(); j++) {
+			if (indicator_pheno[i][j]==0) {complete=0; break;}
+		}
+		indicator_idv.push_back(complete);
+	}
+
+	const bool has_cvt=(indicator_cvt.size()!=0);
+
+	//individuals with missing covariates are dropped as well
+	if (has_cvt) {
+		for (size_t i=0; i<indicator_idv.size(); ++i) {
+			indicator_idv[i]*=indicator_cvt[i];
+		}
+	}
+
+	//count the individuals that remain
+	ni_test=0;
+	for (size_t i=0; i<indicator_idv.size(); ++i) {
+		if (indicator_idv[i]!=0) {ni_test++;}
+	}
+
+	if (ni_test==0) {
+		error=true;
+		cout<<"error! number of analyzed individuals equals 0. "<<endl;
+		return;
+	}
+
+	//the covariate check needs the ni_test computed above
+	if (has_cvt) {
+		CheckCvt();
+		return;
+	}
+
+	//no covariates were supplied: every individual gets a single covariate equal to 1
+	const vector<double> intercept_row (1, 1.0);
+	for (size_t i=0; i<indicator_idv.size(); ++i) {
+		indicator_cvt.push_back(1);
+		cvt.push_back(intercept_row);
+	}
+}
+
+
+
+
+//Copy the first n_cvt covariates of every individual whose indicator_idv and
+//indicator_cvt are both non-zero into consecutive rows of W.
+void PARAM::CopyCvt (gsl_matrix *W) 
+{
+	size_t row=0;
+
+	for (size_t i=0; i<indicator_idv.size(); ++i) {
+		if (indicator_idv[i]!=0 && indicator_cvt[i]!=0) {
+			for (size_t col=0; col<n_cvt; ++col) {
+				gsl_matrix_set (W, row, col, cvt[i][col]);
+			}
+			++row;
+		}
+	}
+}
+
+
+//Copy the first phenotype into y and the first n_cvt covariates into W, one
+//output row per selected individual.
+//flag==0 selects individuals by indicator_idv, any other value by indicator_cvt.
+void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag) 
+{
+	const vector<int> &keep=(flag==0) ? indicator_idv : indicator_cvt;
+	size_t row=0;
+
+	for (size_t i=0; i<indicator_idv.size(); ++i) {
+		if (keep[i]==0) {continue;}
+
+		gsl_vector_set (y, row, pheno[i][0]);
+		for (size_t col=0; col<n_cvt; ++col) {
+			gsl_matrix_set (W, row, col, cvt[i][col]);
+		}
+		++row;
+	}
+}
+
+//Copy all n_ph phenotypes into Y and the first n_cvt covariates into W, one
+//output row per selected individual.
+//flag==0 selects individuals by indicator_idv, any other value by indicator_cvt.
+void PARAM::CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag) 
+{
+	const vector<int> &keep=(flag==0) ? indicator_idv : indicator_cvt;
+	size_t row=0;
+
+	for (size_t i=0; i<indicator_idv.size(); ++i) {
+		if (keep[i]==0) {continue;}
+
+		for (size_t col=0; col<n_ph; ++col) {
+			gsl_matrix_set (Y, row, col, pheno[i][col]);
+		}
+		for (size_t col=0; col<n_cvt; ++col) {
+			gsl_matrix_set (W, row, col, cvt[i][col]);
+		}
+		++row;
+	}
+}
+
+
+
+
+
+//Store log(vec_read[i]) for every individual with a non-zero indicator_idv
+//into consecutive entries of log_N.
+void PARAM::CopyRead (gsl_vector *log_N) 
+{
+	size_t pos=0;
+
+	for (size_t i=0; i<indicator_idv.size(); ++i) {
+		if (indicator_idv[i]!=0) {
+			gsl_vector_set (log_N, pos, log(vec_read[i]) );
+			++pos;
+		}
+	}
+}
+		
+		
+
diff --git a/src/param.h b/src/param.h
new file mode 100644
index 0000000..fa18181
--- /dev/null
+++ b/src/param.h
@@ -0,0 +1,232 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __PARAM_H__                
+#define __PARAM_H__
+
+#include <vector>
+#include <map>
+#include <set>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+using namespace std;
+
+
+
+//Per-SNP record; PARAM and PRDT keep one entry per SNP in their snpInfo vector.
+class SNPINFO {
+public:
+	string chr;
+	string rs_number;		//SNP identifier; used as the lookup key into mapRS2est
+	double cM;
+	long int base_position;
+	string a_minor;
+	string a_major;
+	size_t n_miss;			//presumably the number of missing genotypes - TODO confirm
+	double missingness;
+	double maf;	
+};
+
+//results for lmm: estimates and test p values (presumably one record per tested marker - TODO confirm)
+class SUMSTAT {
+public:
+	double beta;				//REML estimator for beta
+	double se;				//SE for beta  
+	double lambda_remle;		//REML estimator for lambda
+	double lambda_mle;		//MLE estimator for lambda
+	double p_wald;			//p value from a Wald test
+	double p_lrt;				//p value from a likelihood ratio test
+	double p_score;			//p value from a score test
+};
+
+//results for mvlmm: the multi-phenotype counterpart of SUMSTAT, with vector-valued estimates
+class MPHSUMSTAT {
+public:
+	vector<double> v_beta;	//REML estimator for beta
+	double p_wald;			//p value from a Wald test
+	double p_lrt;				//p value from a likelihood ratio test
+	double p_score;			//p value from a score test
+	vector<double> v_Vg;	//estimator for Vg, right half
+	vector<double> v_Ve;	//estimator for Ve, right half
+	vector<double> v_Vbeta;	//estimator for Vbeta, right half
+};
+
+
+//hyper-parameters for bslmm
+//(PARAM stores one instance as cHyp_initial)
+class HYPBSLMM {
+public:
+	double h;
+	double pve;
+	double rho;
+	double pge;
+	double logp;
+	
+	size_t n_gamma;		//presumably the number of gammas bounded by s_min/s_max - TODO confirm
+};
+
+
+
+
+//Central parameter and data holder: input/output file names, QC thresholds,
+//LMM and BSLMM settings, summary counters, timing slots, and the phenotype,
+//covariate and indicator vectors loaded from the input files. The member
+//functions read and validate the inputs and copy the selected individuals'
+//data into GSL vectors/matrices.
+class PARAM {
+public:	
+	// IO related parameters
+	bool mode_silence;
+	int a_mode;				//analysis mode, 1/2/3/4 for Frequentist tests
+	int k_mode;				//kinship read mode: 1: n by n matrix, 2: id/id/k_value; 		
+	vector<size_t> p_column;			//which phenotype column needs analysis
+	size_t d_pace;		//display pace
+	
+	string file_bfile;
+	string file_geno;
+	string file_pheno;
+	string file_anno;		//optional
+	string file_cvt;		//optional
+	string file_kin;
+	string file_ku, file_kd;
+	string file_mk;
+	string file_out;
+	string path_out;
+	
+	string file_epm;		//estimated parameter file
+	string file_ebv;		//estimated breeding value file
+	string file_log;		//log file containing mean estimate
+	
+	string file_read;		//file containing total number of reads
+	string file_gene;		//gene expression file
+	
+	string file_snps;		//file containing analyzed snps or genes
+	
+	
+	
+	// QC related parameters	
+	double miss_level;
+	double maf_level;	
+	double hwe_level;
+	double r2_level;
+	
+	// LMM related parameters
+	double l_min;
+	double l_max;
+	size_t n_region;
+	double l_mle_null, l_remle_null;
+	double logl_mle_H0, logl_remle_H0;
+	double pve_null, pve_se_null;
+	double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null;
+	vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null;
+	vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null;
+	vector<double> beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null;
+	double p_nr;	
+	double em_prec, nr_prec;
+	size_t em_iter, nr_iter;
+	size_t crt;
+	double pheno_mean;		//phenotype mean from bslmm fitting or for prediction
+
+	//for fitting multiple variance components
+	//the first three are of size n_vc, and the next two are of size n_vc+1
+	vector<double> v_traceG;
+	vector<double> v_pve;
+	vector<double> v_se_pve;
+
+	vector<double> v_sigma2;
+	vector<double> v_se_sigma2;	
+	vector<double> v_beta;
+	vector<double> v_se_beta;	
+	
+	// BSLMM MCMC related parameters
+	double h_min, h_max, h_scale;			//priors for h
+	double rho_min, rho_max, rho_scale;		//priors for rho
+	double logp_min, logp_max, logp_scale;		//priors for log(pi)
+	size_t s_min, s_max;			//minimum and maximum number of gammas
+	size_t w_step;					//number of warm up/burn in iterations
+	size_t s_step;					//number of sampling iterations
+	size_t r_pace;					//record pace
+	size_t w_pace;					//write pace
+	size_t n_accept;				//number of acceptance
+	size_t n_mh;					//number of MH steps within each iteration
+	double geo_mean;				//mean of the geometric distribution
+	long int randseed;
+	double trace_G;
+
+	HYPBSLMM cHyp_initial;
+		
+	// Summary statistics
+	bool error;				//set to true by the member functions when a check or a read fails
+	size_t ni_total, ni_test, ni_cvt;	//number of individuals
+	size_t np_obs, np_miss;		//number of observed and missing phenotypes
+	size_t ns_total, ns_test;	//number of snps
+	size_t ng_total, ng_test;	//number of genes
+	size_t ni_control, ni_case;	//number of controls and number of cases
+	size_t n_cvt;			//number of covariates
+	size_t n_ph;			//number of phenotypes
+	size_t n_vc;			//number of variance components (including the diagonal matrix)
+	double time_total;		//record total time
+	double time_G;			//time spent on reading files the second time and calculate K
+	double time_eigen;		//time spent on eigen-decomposition
+	double time_UtX;		//time spent on calculating UX and Uy
+	double time_UtZ;		//time spent on calculating UtZ, for probit BSLMM
+	double time_opt;		//time spent on optimization iterations/or mcmc
+	double time_Omega;		//time spent on calculating Omega
+	double time_hyp;		//time spent on sampling hyper-parameters, in PMM
+	double time_Proposal;  //time spent on constructing the proposal distribution (i.e. the initial lmm or lm analysis)
+
+	// Data
+	vector<vector<double> > pheno;			//a vector record all phenotypes, NA replaced with -9
+	vector<vector<double> > cvt;			//a vector record all covariates, NA replaced with -9	
+	vector<vector<int> > indicator_pheno;			//a matrix record when a phenotype is missing for an individual; 0 missing, 1 available
+	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
+	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	vector<int> indicator_cvt;				//indicator for covariates, 0 missing, 1 available for analysis
+	
+	vector<int> indicator_bv;				//indicator for estimated breeding value file, 0 missing, 1 available for analysis
+	vector<int> indicator_read;				//indicator for read file, 0 missing, 1 available for analysis
+	vector<double> vec_read;				//total number of reads
+	vector<double> vec_bv;					//breeding values
+	vector<size_t> est_column;
+	
+	map<string, int> mapID2num;		//map small ID number to number, from 0 to n-1
+	map<string, string> mapRS2chr;		//map rs# to chromosome location
+	map<string, long int> mapRS2bp;		//map rs# to base position
+	map<string, double> mapRS2cM;		//map rs# to cM
+	map<string, double> mapRS2est;			//map rs# to parameters
+	
+	vector<SNPINFO> snpInfo;		//record SNP information
+	set<string> setSnps;			//a set of snps for analysis
+	
+	//constructor
+	PARAM();
+	
+	//functions
+	void ReadFiles ();		
+	void CheckParam (); 
+	void CheckData ();	
+	void PrintSummary ();
+	void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K);	
+	void CheckCvt ();
+	void CopyCvt (gsl_matrix *W);
+	void ProcessCvtPhen();
+	//flag==0 selects individuals by indicator_idv, otherwise by indicator_cvt
+	void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag);
+	void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag);
+	void CalcKin (gsl_matrix *matrix_kin);
+	//both writers create "<path_out>/<file_out>.<suffix>.txt"
+	void WriteMatrix (const gsl_matrix *matrix_U, const string suffix);
+	void WriteVector (const gsl_vector *vector_D, const string suffix);
+	void CopyRead (gsl_vector *log_N);
+};
+
+
+#endif
+
diff --git a/src/prdt.cpp b/src/prdt.cpp
new file mode 100644
index 0000000..2875119
--- /dev/null
+++ b/src/prdt.cpp
@@ -0,0 +1,544 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <iomanip>
+#include <bitset>
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <cmath>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+
+#include "io.h"
+#include "lapack.h"  //for functions EigenDecomp
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "io_float.h"
+#include "prdt_float.h"
+#include "mathfunc_float.h"
+#else
+#include "io.h"
+#include "prdt.h"
+#include "mathfunc.h"
+#endif
+
+using namespace std;
+
+
+
+
+//Take over the settings and data needed for prediction from cPar.
+//The two outputs of the prediction run (ns_test, time_eigen) start at zero.
+void PRDT::CopyFromParam (PARAM &cPar) 
+{
+	//run settings
+	a_mode=cPar.a_mode;
+	d_pace=cPar.d_pace;
+
+	//file names
+	file_bfile=cPar.file_bfile;
+	file_geno=cPar.file_geno;
+	file_out=cPar.file_out;
+	path_out=cPar.path_out;
+
+	//counters
+	n_ph=cPar.n_ph;
+	np_obs=cPar.np_obs;
+	np_miss=cPar.np_miss;
+	ns_total=cPar.ns_total;
+
+	//indicators and SNP data
+	indicator_pheno=cPar.indicator_pheno;
+	indicator_cvt=cPar.indicator_cvt;
+	indicator_idv=cPar.indicator_idv;
+	snpInfo=cPar.snpInfo;
+	mapRS2est=cPar.mapRS2est;
+
+	//results filled in later
+	ns_test=0;
+	time_eigen=0;
+}
+
+//Report the results of the prediction run back to cPar:
+//the eigen-decomposition time and the number of SNPs used.
+void PRDT::CopyToParam (PARAM &cPar) 
+{
+	cPar.time_eigen=time_eigen;
+	cPar.ns_test=ns_test;
+}               
+
+
+
+
+//Write "<path_out>/<file_out>.prdt.txt" with one line per individual:
+//"NA" when indicator_idv is 1, otherwise the next entry of y_prdt.
+//If the file cannot be opened an error is printed and nothing is written.
+void PRDT::WriteFiles (gsl_vector *y_prdt) 
+{
+	string out_path=path_out+"/"+file_out+"."+"prdt"+".txt";
+
+	ofstream outfile (out_path.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<out_path.c_str()<<endl; return;}
+
+	size_t next=0;
+	for (size_t i=0; i<indicator_idv.size(); i++) {
+		if (indicator_idv[i]!=1) {
+			outfile<<gsl_vector_get (y_prdt, next)<<endl;
+			next++;
+		} else {
+			outfile<<"NA"<<endl;
+		}
+	}
+
+	outfile.close();
+	outfile.clear();
+}
+
+
+//Write "<path_out>/<file_out>.prdt.txt" with one line per individual:
+//"NA" when indicator_cvt is 0, otherwise the next row of Y_full with every
+//entry followed by a tab.
+//If the file cannot be opened an error is printed and nothing is written.
+void PRDT::WriteFiles (gsl_matrix *Y_full) 
+{
+	string out_path=path_out+"/"+file_out+".prdt.txt";
+
+	ofstream outfile (out_path.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<out_path.c_str()<<endl; return;}
+
+	size_t row=0;
+	for (size_t i=0; i<indicator_cvt.size(); i++) {
+		if (indicator_cvt[i]==0) {
+			outfile<<"NA"<<endl;
+			continue;
+		}
+
+		for (size_t col=0; col<Y_full->size2; col++) {
+			outfile<<gsl_matrix_get (Y_full, row, col)<<"\t";
+		}
+		outfile<<endl;
+		row++;
+	}
+
+	outfile.close();
+	outfile.clear();
+}
+
+
+
+
+//Add the breeding-value part of the prediction to y_prdt.
+//G (size ni_total) is centered in place by CenterMatrix, with indicator_idv
+//passed as the weight vector, and split into
+//  Goo: rows and columns with indicator_idv==1,
+//  Gfo: rows with indicator_idv==0, columns with indicator_idv==1.
+//Goo is eigen-decomposed; eigenvalues below 1e-10 are set to zero and skipped
+//when inverting, so that y_prdt += Gfo * U * D^-1 * U^T * u_hat.
+//u_hat supplies ni_test (its length); y_prdt is used with ni_total-ni_test rows.
+//The elapsed time of the decomposition step is stored in time_eigen, in minutes.
+void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) 
+{
+	size_t ni_test=u_hat->size, ni_total=G->size1;
+	
+	gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test);
+	gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test);
+	gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test); 
+	gsl_vector *eval=gsl_vector_alloc (ni_test);
+	gsl_vector *Utu=gsl_vector_alloc (ni_test);
+	gsl_vector *w=gsl_vector_alloc (ni_total);
+	
+	//center matrix G based on indicator_idv
+	for (size_t i=0; i<ni_total; i++) {
+		gsl_vector_set(w, i, indicator_idv[i]);
+	}
+	CenterMatrix(G, w);
+		
+	//obtain Goo and Gfo
+	//o_i/o_j count the observed rows/columns seen so far, so i-o_i is the index
+	//of row i among the rows with indicator_idv==0
+	size_t o_i=0, o_j=0;
+	double d;
+	for (size_t i=0; i<indicator_idv.size(); i++) {
+		o_j=0;
+		for (size_t j=0; j<indicator_idv.size(); j++) {
+			d=gsl_matrix_get(G, i, j);
+			if (indicator_idv[i]==1 && indicator_idv[j]==1) {
+				gsl_matrix_set(Goo, o_i, o_j, d);
+			}
+			if (indicator_idv[i]==0 && indicator_idv[j]==1) {
+				gsl_matrix_set(Gfo, i-o_i, o_j, d);
+			}
+			if (indicator_idv[j]==1) {o_j++;}
+		}
+		if (indicator_idv[i]==1) {o_i++;}
+	}
+		
+	//matrix operations to get u_prdt
+	cout<<"Start Eigen-Decomposition..."<<endl;
+	clock_t time_start=clock();
+	EigenDecomp (Goo, U, eval, 0);
+	for (size_t i=0; i<eval->size; i++) {
+		if (gsl_vector_get(eval,i)<1e-10) {gsl_vector_set(eval, i, 0);}
+	}
+
+	time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+	
+	//Utu = D^-1 U^T u_hat, leaving entries with a zero eigenvalue untouched
+	gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu);
+	for (size_t i=0; i<eval->size; i++) {
+		d=gsl_vector_get(eval, i);
+		if (d!=0) {d=gsl_vector_get(Utu, i)/d; gsl_vector_set(Utu, i, d);}
+	}
+	//eval is reused as scratch space for U*Utu
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
+	
+	//free matrices
+	gsl_matrix_free(Goo);
+	gsl_matrix_free(Gfo);
+	gsl_matrix_free(U);
+	gsl_vector_free(eval);
+	gsl_vector_free(Utu);
+	gsl_vector_free(w);
+
+	return;	
+}
+
+
+
+//Add the SNP-effect part of the prediction to y_prdt, reading genotypes from
+//the BIMBAM genotype file file_geno (opened through igzstream).
+//ns_total lines are read; on each line the first token is the SNP id, the next
+//two tokens are skipped, and one genotype token per individual follows.
+//SNPs without an entry in mapRS2est are skipped. For the others, genotypes of
+//the individuals with indicator_idv==0 are centered on the mean genotype of the
+//individuals with indicator_idv==1 ("NA" entries are replaced by the mean of
+//the non-missing indicator_idv==0 genotypes), scaled by the SNP's effect size
+//and added to y_prdt. ns_test counts the SNPs used.
+//Reading stops with an error message on a truncated or malformed line.
+void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+	
+	string line;
+	vector<char> buf;		//writable copy of the current line for strtok
+	char *ch_ptr;
+	string rs;
+	
+	size_t n_miss, n_train_nomiss, c_phen;
+	double geno, x_mean, x_train_mean, effect_size;
+	bool flag_short;
+	
+	gsl_vector *x=gsl_vector_alloc (y_prdt->size);
+	gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size);
+	
+	ns_test=0;
+
+	//start reading genotypes and analyze	
+	for (size_t t=0; t<ns_total; ++t) {
+		safeGetline(infile, line);
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		
+		//strtok writes into its buffer, so tokenize a private copy instead of line.c_str()
+		buf.assign(line.begin(), line.end());
+		buf.push_back('\0');
+		
+		ch_ptr=strtok (&buf[0], " , \t");
+		//no token at all: the file ended early or the line is empty
+		if (ch_ptr==NULL) {cout<<endl<<"error! genotype file ends early or contains an empty line: "<<file_geno<<endl; break;}
+		rs=ch_ptr;
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");		
+		
+		if (mapRS2est.count(rs)==0) {continue;} else {effect_size=mapRS2est[rs];}
+		
+		x_mean=0.0; c_phen=0; n_miss=0; x_train_mean=0; n_train_nomiss=0;
+		gsl_vector_set_zero(x_miss);
+
+		flag_short=false;
+		for (size_t i=0; i<indicator_idv.size(); ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (ch_ptr==NULL) {flag_short=true; break;}
+			
+			if (indicator_idv[i]==1) {
+				if (strcmp(ch_ptr, "NA")!=0) {
+					geno=atof(ch_ptr); 			
+					x_train_mean+=geno;
+					n_train_nomiss++;
+				}
+			} else {
+				if (strcmp(ch_ptr, "NA")==0) {
+					gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;
+				} else {
+					geno=atof(ch_ptr); 	
+					
+					gsl_vector_set(x, c_phen, geno); 
+					gsl_vector_set(x_miss, c_phen, 1.0); 
+					x_mean+=geno;
+				}
+				c_phen++;
+			}
+		}
+		if (flag_short) {cout<<endl<<"error! snp "<<rs<<" has fewer genotypes than individuals in file: "<<file_geno<<endl; break;}
+
+		if (x->size==n_miss) {cout<<"snp "<<rs<<" has missing genotype for all individuals and will be ignored."<<endl; continue;}
+		//without any training genotype the training mean would be 0/0
+		if (n_train_nomiss==0) {cout<<"snp "<<rs<<" has missing genotype for all training individuals and will be ignored."<<endl; continue;}
+
+		x_mean/=(double)(x->size-n_miss);
+		x_train_mean/=(double)(n_train_nomiss);
+		
+		
+		//x_miss is 0 where the genotype was "NA": impute with the mean before centering
+		for (size_t i=0; i<x->size; ++i) {
+			geno=gsl_vector_get(x, i);
+			if (gsl_vector_get (x_miss, i)==0) {
+				gsl_vector_set(x, i, x_mean-x_train_mean);
+			} else {
+				gsl_vector_set(x, i, geno-x_train_mean);
+			}
+		}
+
+		gsl_vector_scale (x, effect_size);
+		gsl_vector_add (y_prdt, x);
+		
+		ns_test++;
+	}	
+	cout<<endl;
+	
+	gsl_vector_free (x);
+	gsl_vector_free (x_miss);
+	
+	infile.close();
+	infile.clear();
+	
+	return;
+}
+
+
+
+
+
+
+
+//Add the SNP-effect part of the prediction to y_prdt, reading genotypes from
+//the PLINK binary file "<file_bfile>.bed".
+//Each SNP occupies n_bit bytes (four individuals per byte) after three header
+//bytes. SNPs whose rs_number has no entry in mapRS2est are skipped. For the
+//others, genotypes of the individuals with indicator_idv==0 are centered on the
+//mean genotype of the individuals with indicator_idv==1 (missing genotypes are
+//replaced by the mean of the non-missing indicator_idv==0 genotypes), scaled
+//by the SNP's effect size and added to y_prdt. ns_test counts the SNPs used.
+void PRDT::AnalyzePlink (gsl_vector *y_prdt) 
+{
+	string file_bed=file_bfile+".bed";
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+	
+	char ch[1];
+	bitset<8> b;	
+	string rs;
+	
+	size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
+	double geno, x_mean, x_train_mean, effect_size;
+	
+	gsl_vector *x=gsl_vector_alloc (y_prdt->size);
+	
+	//calculate n_bit, the number of bytes for each snp (four individuals per byte, rounded up)
+	if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;}
+	else {n_bit=indicator_idv.size()/4+1; }
+	
+	//read past the first three magic numbers
+	for (size_t i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}	
+	
+	ns_test=0;
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		//if (indicator_snp[t]==0) {continue;}
+		
+		rs=snpInfo[t].rs_number;
+		
+		if (mapRS2est.count(rs)==0) {continue;} else {effect_size=mapRS2est[rs];}
+		
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+		
+		//read genotypes
+		//bit pairs as handled below: 00 -> 2, 01 -> 1, 11 -> 0, 10 -> missing
+		//(first digit is b[2*j], second is b[2*j+1])
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; x_train_mean=0; n_train_nomiss=0;
+		for (size_t i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				//the last byte may be only partly used
+				if ((i==(n_bit-1)) && ci_total==indicator_idv.size() ) {break;}
+				if (indicator_idv[ci_total]==1) {
+					if (b[2*j]==0) {
+						if (b[2*j+1]==0) {x_train_mean+=2.0; n_train_nomiss++;}
+						else {x_train_mean+=1.0; n_train_nomiss++;}
+					}
+					else {
+						if (b[2*j+1]==1) {n_train_nomiss++;}                                  
+						else {}
+					}
+				} else {
+					if (b[2*j]==0) {
+						if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
+						else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
+					}
+					else {
+						if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
+						else {gsl_vector_set(x, ci_test, -9); n_miss++; }
+					}
+					ci_test++;
+				}
+				ci_total++;
+				
+			}
+		}
+		
+		if (x->size==n_miss) {cout<<"snp "<<rs<<" has missing genotype for all individuals and will be ignored."<<endl; continue;}
+		//without any training genotype the training mean would be 0/0
+		if (n_train_nomiss==0) {cout<<"snp "<<rs<<" has missing genotype for all training individuals and will be ignored."<<endl; continue;}
+		
+		x_mean/=(double)(x->size-n_miss);
+		x_train_mean/=(double)(n_train_nomiss);
+		
+		//-9 marks a missing genotype: impute with the mean before centering
+		for (size_t i=0; i<x->size; ++i) {
+			geno=gsl_vector_get(x, i);
+			if (geno==-9) {
+				gsl_vector_set(x, i, x_mean-x_train_mean);
+			} else {
+				gsl_vector_set(x, i, geno-x_train_mean);
+			}
+		}
+		
+		gsl_vector_scale (x, effect_size);
+		gsl_vector_add (y_prdt, x);
+		
+		ns_test++;
+	}	
+	cout<<endl;
+	
+	gsl_vector_free (x);
+	
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+
+
+//predict missing phenotypes using ridge regression
+//Y_hat contains fixed effects
+//
+//Phenotype entries are enumerated individual by individual (skipping
+//individuals with indicator_cvt==0) and, within an individual, phenotype by
+//phenotype; H is indexed by that enumeration. The entries are split into
+//observed (indicator_pheno==1) and missing ones, giving H_oo and H_mo, and
+//  y_miss = y_hat_miss + H_mo * H_oo^-1 * (y_obs - y_hat_obs)
+//is written back into the missing entries of Y_full.
+//NOTE(review): Y_full and Y_hat are addressed with the individual index i, not
+//with a row counter that skips indicator_cvt==0 individuals - confirm against
+//the caller that their rows are laid out that way.
+void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, gsl_matrix *Y_full) 
+{	
+	gsl_vector *y_obs=gsl_vector_alloc (np_obs);
+	gsl_vector *y_miss=gsl_vector_alloc (np_miss);
+	gsl_matrix *H_oo=gsl_matrix_alloc (np_obs, np_obs);
+	gsl_matrix *H_mo=gsl_matrix_alloc (np_miss, np_obs);
+	gsl_vector *Hiy=gsl_vector_alloc (np_obs);
+	
+	size_t c_obs1=0, c_obs2=0, c_miss1=0, c_miss2=0;
+	
+	//obtain H_oo, H_mo
+	//c_obs+c_miss is the position of the current entry in the full enumeration
+	c_obs1=0; c_miss1=0; 
+	for (vector<int>::size_type i1=0; i1<indicator_pheno.size(); ++i1) {
+		if (indicator_cvt[i1]==0) {continue;}
+		for (vector<int>::size_type j1=0; j1<n_ph; ++j1) {
+			
+			c_obs2=0; c_miss2=0;
+			for (vector<int>::size_type i2=0; i2<indicator_pheno.size(); ++i2) {
+				if (indicator_cvt[i2]==0) {continue;}
+				for (vector<int>::size_type j2=0; j2<n_ph; j2++) {
+					
+					if (indicator_pheno[i2][j2]==1) {
+						if (indicator_pheno[i1][j1]==1) {
+							gsl_matrix_set (H_oo, c_obs1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) );
+						} else {
+							gsl_matrix_set (H_mo, c_miss1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) );
+						}
+						c_obs2++;
+					} else {
+						c_miss2++;
+					}
+				}				
+			}
+			
+			if (indicator_pheno[i1][j1]==1) {
+				c_obs1++;
+			} else {
+				c_miss1++;
+			}
+		}
+		
+	}	
+	
+	//do LU decomposition of H_oo
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (np_obs);
+	LUDecomp (H_oo, pmt, &sig);
+	
+//	if (mode_temp==0) {
+		//obtain y_obs=y_full-y_hat
+		//add the fixed effects part to y_miss: y_miss=y_hat
+		c_obs1=0; c_miss1=0;
+		for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+			if (indicator_cvt[i]==0) {continue;}
+			
+			for (vector<int>::size_type j=0; j<n_ph; ++j) {
+				if (indicator_pheno[i][j]==1) {
+					gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j)-gsl_matrix_get (Y_hat, i, j) );
+					c_obs1++;
+				} else {
+					gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, i, j) );
+					c_miss1++;
+				}
+			}
+		}	
+		
+		LUSolve (H_oo, pmt, y_obs, Hiy);
+		
+		gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
+		
+		//put back predicted y_miss to Y_full
+		c_miss1=0;
+		for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+			if (indicator_cvt[i]==0) {continue;}
+			
+			for (vector<int>::size_type j=0; j<n_ph; ++j) {
+				if (indicator_pheno[i][j]==0) {
+					gsl_matrix_set (Y_full, i, j, gsl_vector_get (y_miss, c_miss1) );
+					c_miss1++;
+				}
+			}
+		}
+/*
+	} else {
+		for (size_t k=0; k<mode_temp; k++) {
+			c_obs1=0; c_miss1=0;
+			for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+				if (indicator_cvt[i]==0) {continue;}
+				
+				for (vector<int>::size_type j=0; j<2; ++j) {
+					if (indicator_pheno[i][j]==1) {
+						gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j+k*2)-gsl_matrix_get (Y_hat, i, j) );
+						c_obs1++;
+					} else {
+						gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, i, j) );
+						c_miss1++;
+					}
+				}
+			}	
+			
+			LUSolve (H_oo, pmt, y_obs, Hiy);
+			
+			gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
+			
+			//put back predicted y_miss to Y_full
+			c_miss1=0;
+			for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+				if (indicator_cvt[i]==0) {continue;}
+				
+				for (vector<int>::size_type j=0; j<2; ++j) {
+					if (indicator_pheno[i][j]==0) {
+						gsl_matrix_set (Y_full, i, j+k*2, gsl_vector_get (y_miss, c_miss1) );
+						c_miss1++;
+					}
+				}
+			}
+		}
+	}
+*/
+	//free matrices
+	gsl_vector_free(y_obs);
+	gsl_vector_free(y_miss);
+	gsl_matrix_free(H_oo);
+	gsl_matrix_free(H_mo);
+	gsl_vector_free(Hiy);
+	//the permutation used by the LU routines has to be released as well
+	gsl_permutation_free(pmt);
+	
+	return;
+}
+
+
diff --git a/src/prdt.h b/src/prdt.h
new file mode 100644
index 0000000..8af2cee
--- /dev/null
+++ b/src/prdt.h
@@ -0,0 +1,81 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __PRDT_H__                
+#define __PRDT_H__
+
+
+#include <vector>
+#include <map>
+#include <string.h>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#else
+#include "param.h"
+#endif
+
+using namespace std;
+
+class PRDT {
+	// Holds the inputs, counters and routines used for phenotype prediction. All members are public.
+public:
+	// IO related parameters
+	size_t a_mode;	// NOTE(review): presumably the analysis mode copied from PARAM -- confirm in prdt.cpp
+	size_t d_pace;	// NOTE(review): presumably a progress-display interval -- confirm in prdt.cpp
+	// File names and output directory; how they are used is not visible in this header.
+	string file_bfile;
+	string file_geno;
+	string file_out;
+	string path_out;
+	// indicator_pheno[i][j]: 1 = phenotype j of individual i is observed, 0 = missing (as used by MvnormPrdt); rows with indicator_cvt[i]==0 are skipped there.
+	vector<vector<int> > indicator_pheno;
+	vector<int> indicator_cvt;
+	vector<int> indicator_idv;
+	vector<SNPINFO> snpInfo;
+	map<string, double> mapRS2est;
+	// n_ph is the number of phenotype columns looped over in MvnormPrdt. NOTE(review): the other counters are presumably observed/missing phenotype counts and SNP counts -- confirm.
+	size_t n_ph;
+	size_t np_obs, np_miss;
+	size_t ns_total;
+	size_t ns_test;
+	// NOTE(review): presumably an accumulated timing value -- confirm against its users.
+	double time_eigen;
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void WriteFiles (gsl_vector *y_prdt);
+	void WriteFiles (gsl_matrix *Y_full);
+	void AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt);
+	void AnalyzeBimbam (gsl_vector *y_prdt);
+	void AnalyzePlink (gsl_vector *y_prdt);
+	void MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, gsl_matrix *Y_full);	// writes predictions for the missing entries back into Y_full
+};
+
+
+#endif
+
+
+
+
+
+
+
diff --git a/src/vc.cpp b/src/vc.cpp
new file mode 100644
index 0000000..77cf746
--- /dev/null
+++ b/src/vc.cpp
@@ -0,0 +1,443 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <iomanip>
+#include <cmath>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <bitset>
+#include <cstring>
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+#include "gsl/gsl_cdf.h"
+#include "gsl/gsl_multiroots.h"
+#include "gsl/gsl_min.h"
+
+#include "io.h"
+#include "lapack.h"
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "lmm_float.h"
+#include "vc_float.h"
+#else
+#include "lmm.h"
+#include "vc.h"
+#endif
+
+
+
+using namespace std;
+
+
+//in this file, X, Y are already transformed (i.e. UtX and UtY)
+
+
+// Import the settings used by the variance-component code from the shared
+// PARAM object and reset both timing counters.
+void VC::CopyFromParam (PARAM &cPar)
+{
+	// Timers start from zero.
+	time_opt = 0.0;
+	time_UtX = 0.0;
+
+	// Only v_traceG and the output file name are taken from cPar;
+	// v_sigma2 is not imported.
+	v_traceG = cPar.v_traceG;
+	file_out = cPar.file_out;
+}
+
+
+// Export the values held by this VC object (estimates, standard errors and
+// timings) back into the shared PARAM object.
+void VC::CopyToParam (PARAM &cPar)
+{
+	// estimates and their standard errors
+	cPar.v_sigma2    = v_sigma2;
+	cPar.v_se_sigma2 = v_se_sigma2;
+	cPar.v_pve       = v_pve;
+	cPar.v_se_pve    = v_se_pve;
+	cPar.v_traceG    = v_traceG;
+	cPar.v_beta      = v_beta;
+	cPar.v_se_beta   = v_se_beta;
+	// timing counters
+	cPar.time_opt = time_opt;
+	cPar.time_UtX = time_UtX;
+}
+
+
+
+// Recomputes every cached quantity in *p for the variance components in
+// log_sigma2 (log scale; the last entry multiplies the identity matrix).
+// With H = sum_i sigma_i^2 K_i + sigma_last^2 I it fills
+//   p->P        = H^{-1} - H^{-1} W (W^T H^{-1} W)^{-1} W^T H^{-1}
+//   p->Py       = P y
+//   p->KPy_mat  : column i is K_i P y, the last column is P y
+//   p->PKPy_mat : column i is P times column i of KPy_mat
+// p->K holds the kernels side by side as n x n column blocks.
+void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p)
+{
+  const size_t n=(p->K)->size1;
+  const size_t n_cvt=(p->W)->size2;
+  const size_t n_vc=log_sigma2->size-1;
+
+  gsl_matrix *work=gsl_matrix_alloc (n, n);
+  gsl_matrix *HiW=gsl_matrix_alloc (n, n_cvt);
+  gsl_matrix *WtHiW=gsl_matrix_alloc (n_cvt, n_cvt);
+  gsl_matrix *WtHiWi=gsl_matrix_alloc (n_cvt, n_cvt);
+  gsl_matrix *WtHiWiWtHi=gsl_matrix_alloc (n_cvt, n);
+
+  //accumulate H in p->P, one scaled term at a time
+  gsl_matrix_set_zero (p->P);
+  for (size_t k=0; k<n_vc+1; k++) {
+    if (k<n_vc) {
+      gsl_matrix_const_view K_k=gsl_matrix_const_submatrix (p->K, 0, n*k, n, n);
+      gsl_matrix_memcpy (work, &K_k.matrix);
+    } else {
+      gsl_matrix_set_identity (work);
+    }
+    gsl_matrix_scale (work, exp(gsl_vector_get (log_sigma2, k)));
+    gsl_matrix_add (p->P, work);
+  }
+
+  //invert H: factor p->P in place, write the inverse to work, copy it back
+  int sign;
+  gsl_permutation *perm_n=gsl_permutation_alloc (n);
+  LUDecomp (p->P, perm_n, &sign);
+  LUInvert (p->P, perm_n, work);
+  gsl_permutation_free (perm_n);
+  gsl_matrix_memcpy (p->P, work);
+
+  //P=H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1}
+  gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW);
+  gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW);
+  gsl_permutation *perm_c=gsl_permutation_alloc (n_cvt);
+  LUDecomp (WtHiW, perm_c, &sign);
+  LUInvert (WtHiW, perm_c, WtHiWi);
+  gsl_permutation_free (perm_c);
+  gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi);
+  gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P);
+
+  //Py first, then K_i P y and P K_i P y column by column
+  gsl_blas_dgemv (CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py);
+
+  for (size_t k=0; k<n_vc+1; k++) {
+    gsl_vector_view KPy_k=gsl_matrix_column (p->KPy_mat, k);
+    gsl_vector_view PKPy_k=gsl_matrix_column (p->PKPy_mat, k);
+    if (k<n_vc) {
+      gsl_matrix_const_view K_k=gsl_matrix_const_submatrix (p->K, 0, n*k, n, n);
+      gsl_blas_dgemv (CblasNoTrans, 1.0, &K_k.matrix, p->Py, 0.0, &KPy_k.vector);
+    } else {
+      gsl_vector_memcpy (&KPy_k.vector, p->Py);
+    }
+    gsl_blas_dgemv (CblasNoTrans, 1.0, p->P, &KPy_k.vector, 0.0, &PKPy_k.vector);
+  }
+
+  gsl_matrix_free (WtHiWiWtHi);
+  gsl_matrix_free (WtHiWi);
+  gsl_matrix_free (WtHiW);
+  gsl_matrix_free (HiW);
+  gsl_matrix_free (work);
+}
+
+
+//below are functions for AI algorithm
+// First derivatives with respect to each log variance component:
+//   dev1(i) = sigma_i^2 * ( -0.5*trace(P K_i) + 0.5*y'P K_i P y ),
+// with K_i = I for the last component. Calls UpdateParam first, so the
+// buffers in params (a VC_PARAM) are overwritten. Always returns GSL_SUCCESS.
+int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1)
+{
+  VC_PARAM *vc=static_cast<VC_PARAM *>(params);
+  const size_t n=(vc->K)->size1;
+  const size_t n_vc=log_sigma2->size-1;
+
+  UpdateParam (log_sigma2, vc);
+
+  for (size_t k=0; k<n_vc+1; k++) {
+    //trace(P K_k); K_k is the identity for the last component
+    double trace=0;
+    for (size_t r=0; r<n; r++) {
+      if (k==n_vc) {
+        trace+=gsl_matrix_get (vc->P, r, r);
+      } else {
+        double dot;
+        gsl_vector_view P_r=gsl_matrix_row (vc->P, r);
+        gsl_vector_const_view K_c=gsl_matrix_const_column (vc->K, n*k+r);
+        gsl_blas_ddot (&P_r.vector, &K_c.vector, &dot);
+        trace+=dot;
+      }
+    }
+
+    //dot product of Py with column k of KPy_mat
+    double quad;
+    gsl_vector_view KPy_k=gsl_matrix_column (vc->KPy_mat, k);
+    gsl_blas_ddot (vc->Py, &KPy_k.vector, &quad);
+
+    const double sigma2_k=exp(gsl_vector_get (log_sigma2, k));
+    gsl_vector_set (dev1, k, (-0.5*trace+0.5*quad)*sigma2_k);
+  }
+
+  return GSL_SUCCESS;
+}
+
+
+
+// Matrix of second-derivative terms on the log scale, filled symmetrically:
+//   dev2(i,j) = -0.5 * sigma_i^2 * sigma_j^2 * (K_i P y)' (P K_j P y).
+// Calls UpdateParam first and stores a copy of the result in the Hessian
+// member of params (a VC_PARAM). Always returns GSL_SUCCESS.
+int LogRL_dev2 (const gsl_vector *log_sigma2, void *params, gsl_matrix *dev2)
+{
+  VC_PARAM *vc=static_cast<VC_PARAM *>(params);
+  const size_t n_vc=log_sigma2->size-1;
+
+  UpdateParam (log_sigma2, vc);
+
+  for (size_t r=0; r<n_vc+1; r++) {
+    const double sigma2_r=exp(gsl_vector_get (log_sigma2, r));
+    gsl_vector_view KPy_r=gsl_matrix_column (vc->KPy_mat, r);
+
+    //upper triangle, mirrored into the lower one
+    for (size_t c=r; c<n_vc+1; c++) {
+      const double sigma2_c=exp(gsl_vector_get (log_sigma2, c));
+      gsl_vector_view PKPy_c=gsl_matrix_column (vc->PKPy_mat, c);
+
+      double val;
+      gsl_blas_ddot (&KPy_r.vector, &PKPy_c.vector, &val);
+      val*=-0.5*sigma2_r*sigma2_c;
+
+      gsl_matrix_set (dev2, r, c, val);
+      if (c!=r) {gsl_matrix_set (dev2, c, r, val);}
+    }
+  }
+
+  gsl_matrix_memcpy (vc->Hessian, dev2);
+
+  return GSL_SUCCESS;
+}
+
+
+
+// Evaluates the first-derivative vector and the second-derivative matrix in a
+// single pass (one UpdateParam call instead of two):
+//   dev1(i)   = sigma_i^2 * ( -0.5*trace(P K_i) + 0.5*y'P K_i P y )
+//   dev2(i,j) = -0.5 * sigma_i^2 * sigma_j^2 * (K_i P y)' (P K_j P y)
+// with K_i = I for the last component. A copy of dev2 is stored in the
+// Hessian member of params (a VC_PARAM). Always returns GSL_SUCCESS.
+int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, gsl_matrix *dev2)
+{
+  VC_PARAM *vc=static_cast<VC_PARAM *>(params);
+  const size_t n=(vc->K)->size1;
+  const size_t n_vc=log_sigma2->size-1;
+
+  UpdateParam (log_sigma2, vc);
+
+  for (size_t r=0; r<n_vc+1; r++) {
+    //trace(P K_r); K_r is the identity for the last component
+    double trace=0;
+    for (size_t l=0; l<n; l++) {
+      if (r==n_vc) {
+        trace+=gsl_matrix_get (vc->P, l, l);
+      } else {
+        double dot;
+        gsl_vector_view P_l=gsl_matrix_row (vc->P, l);
+        gsl_vector_const_view K_l=gsl_matrix_const_column (vc->K, n*r+l);
+        gsl_blas_ddot (&P_l.vector, &K_l.vector, &dot);
+        trace+=dot;
+      }
+    }
+
+    //first derivative for component r
+    double quad;
+    gsl_vector_view KPy_r=gsl_matrix_column (vc->KPy_mat, r);
+    gsl_blas_ddot (vc->Py, &KPy_r.vector, &quad);
+
+    const double sigma2_r=exp(gsl_vector_get (log_sigma2, r));
+    gsl_vector_set (dev1, r, (-0.5*trace+0.5*quad)*sigma2_r);
+
+    //row r of the second-derivative matrix, mirrored across the diagonal
+    for (size_t c=r; c<n_vc+1; c++) {
+      double val;
+      gsl_vector_view PKPy_c=gsl_matrix_column (vc->PKPy_mat, c);
+      gsl_blas_ddot (&KPy_r.vector, &PKPy_c.vector, &val);
+      const double sigma2_c=exp(gsl_vector_get (log_sigma2, c));
+      val*=-0.5*sigma2_r*sigma2_c;
+
+      gsl_matrix_set (dev2, r, c, val);
+      if (c!=r) {gsl_matrix_set (dev2, c, r, val);}
+    }
+  }
+
+  gsl_matrix_memcpy (vc->Hessian, dev2);
+
+  return GSL_SUCCESS;
+}
+
+
+
+
+// Estimates the variance components by driving the first derivatives
+// (LogRL_dev1) to zero over log(sigma2) with GSL's hybridsj root finder.
+//   K    : n1 x (n1*n_vc) matrix holding the n_vc kernels side by side
+//   W, y : handed to UpdateParam through VC_PARAM
+// Fills v_sigma2/v_se_sigma2 (n_vc+1 entries, the last one for the identity
+// term) and v_pve/v_se_pve (n_vc entries); reads v_traceG; prints to cout.
+void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y)
+{
+  size_t n1=K->size1, n2=K->size2;
+  size_t n_vc=n2/n1;
+  gsl_vector *log_sigma2=gsl_vector_alloc (n_vc+1);
+  double d, s;
+
+  //set up params
+  gsl_matrix *P=gsl_matrix_alloc (n1, n1);
+  gsl_vector *Py=gsl_vector_alloc (n1);
+  gsl_matrix *KPy_mat=gsl_matrix_alloc (n1, n_vc+1);
+  gsl_matrix *PKPy_mat=gsl_matrix_alloc (n1, n_vc+1);
+  gsl_vector *dev1=gsl_vector_alloc (n_vc+1);
+  gsl_matrix *dev2=gsl_matrix_alloc (n_vc+1, n_vc+1);
+  gsl_matrix *Hessian=gsl_matrix_alloc (n_vc+1, n_vc+1);
+  VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian};
+
+  //initialize log_sigma2: the mean square of y is split evenly over the
+  //n_vc+1 components; the solver works on the log scale, so store log(d)
+  gsl_blas_ddot (y, y, &s);
+  s/=(double)n1;
+  for (size_t i=0; i<n_vc+1; i++) {
+    if (i==n_vc) {
+      d=s/((double)n_vc+1.0);
+    } else {
+      d=s/( ((double)n_vc+1.0)*v_traceG[i]);
+    }
+    gsl_vector_set (log_sigma2, i, log(d));
+  }
+
+  cout<<"iteration "<<0<<endl;
+  cout<<"sigma2 = ";
+  for (size_t i=0; i<n_vc+1; i++) {
+    cout<<exp(gsl_vector_get(log_sigma2, i))<<" ";
+  }
+  cout<<endl;
+
+  //set up fdf
+  gsl_multiroot_function_fdf FDF;
+  FDF.n=n_vc+1;
+  FDF.params=&params;
+  FDF.f=&LogRL_dev1;
+  FDF.df=&LogRL_dev2;
+  FDF.fdf=&LogRL_dev12;
+
+  //set up solver
+  int status, sig;
+  int iter=0, max_iter=100;
+
+  const gsl_multiroot_fdfsolver_type *T_fdf;
+  gsl_multiroot_fdfsolver *s_fdf;
+  T_fdf=gsl_multiroot_fdfsolver_hybridsj;
+  s_fdf=gsl_multiroot_fdfsolver_alloc (T_fdf, n_vc+1);
+  gsl_multiroot_fdfsolver_set (s_fdf, &FDF, log_sigma2);
+
+  do {
+    iter++;
+    status=gsl_multiroot_fdfsolver_iterate (s_fdf);
+    if (status) break;
+
+    cout<<"iteration "<<iter<<endl;
+    cout<<"sigma2 = ";
+    for (size_t i=0; i<n_vc+1; i++) {
+      cout<<exp(gsl_vector_get(s_fdf->x, i))<<" ";
+    }
+    cout<<endl;
+    cout<<"derivatives = ";
+    for (size_t i=0; i<n_vc+1; i++) {
+      cout<<gsl_vector_get(s_fdf->f, i)<<" ";
+    }
+    cout<<endl;
+
+    status=gsl_multiroot_test_residual (s_fdf->f, 1e-3);
+  }
+  while (status==GSL_CONTINUE && iter<max_iter);
+
+  //derivatives at the solution x (the argument is a parameter vector, not
+  //the residual vector f); dev2 is then inverted into Hessian
+  LogRL_dev12 (s_fdf->x, &params, dev1, dev2);
+  gsl_permutation * pmt=gsl_permutation_alloc (n_vc+1);
+  LUDecomp (dev2, pmt, &sig);
+  LUInvert (dev2, pmt, Hessian);
+  gsl_permutation_free(pmt);
+
+  //save data
+  v_sigma2.clear();
+  for (size_t i=0; i<n_vc+1; i++) {
+    d=exp(gsl_vector_get(s_fdf->x, i));
+    v_sigma2.push_back(d);
+  }
+
+  //se(sigma2_i)=sqrt(-sigma2_i^2*Hessian(i,i)), Hessian being the inverse of dev2
+  v_se_sigma2.clear();
+  for (size_t i=0; i<n_vc+1; i++) {
+    d=-1.0*v_sigma2[i]*v_sigma2[i]*gsl_matrix_get(Hessian, i, i);
+    v_se_sigma2.push_back(sqrt(d));
+  }
+
+  s=0;
+  for (size_t i=0; i<n_vc; i++) {
+    s+=v_traceG[i]*v_sigma2[i];
+  }
+  s+=v_sigma2[n_vc];
+
+  v_pve.clear();
+  for (size_t i=0; i<n_vc; i++) {
+    d=v_traceG[i]*v_sigma2[i]/s;
+    v_pve.push_back(d);
+  }
+
+  //NOTE(review): this multiplies the unsquared derivative of pve_i by var(sigma2_i); a first-order delta method would square it -- confirm intent
+  v_se_pve.clear();
+  for (size_t i=0; i<n_vc; i++) {
+    d=v_traceG[i]*(s-v_sigma2[i]*v_traceG[i])/(s*s)*v_se_sigma2[i]*v_se_sigma2[i];
+    v_se_pve.push_back(sqrt(d) );
+  }
+  gsl_multiroot_fdfsolver_free(s_fdf);
+  gsl_vector_free(log_sigma2);
+  gsl_matrix_free(P);
+  gsl_vector_free(Py);
+  gsl_matrix_free(KPy_mat);
+  gsl_matrix_free(PKPy_mat);
+  gsl_vector_free(dev1);
+  gsl_matrix_free(dev2);
+  gsl_matrix_free(Hessian);
+}
+
+
+	
+
+
+
diff --git a/src/vc.h b/src/vc.h
new file mode 100644
index 0000000..f34d72e
--- /dev/null
+++ b/src/vc.h
@@ -0,0 +1,82 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __VC_H__                
+#define __VC_H__
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "io_float.h"
+#else
+#include "param.h"
+#include "io.h"
+#endif
+
+using namespace std;
+
+
+
+class VC_PARAM
+{
+// Inputs and work buffers passed (as void *params) to the LogRL_dev* callbacks in vc.cpp; UpdateParam rewrites the buffers.
+public:	
+	const gsl_matrix *K;	// n x (n*n_vc): kernel matrices stored side by side as n x n column blocks
+	const gsl_matrix *W;	// n x n_cvt matrix W used to build P
+	const gsl_vector *y;	// vector multiplied by P to form Py
+	gsl_matrix *P;	// H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1}, rebuilt by UpdateParam
+	gsl_vector *Py;	// P*y
+	gsl_matrix *KPy_mat;	// column i: K_i*P*y; last column: P*y
+	gsl_matrix *PKPy_mat;	// column i: P times column i of KPy_mat
+	gsl_matrix *Hessian;	// copy of dev2 written by LogRL_dev2 and LogRL_dev12
+};
+
+
+
+
+class VC {
+// Variance-component estimation: holds the results and the fitting routines. All members are public.
+public:
+	// IO related parameters
+	string file_out;
+	string path_out;
+// Results filled by CalcVCreml: v_sigma2 and v_se_sigma2 get one entry per kernel plus a final one for the identity term; v_pve and v_se_pve get one entry per kernel.
+	vector<double> v_sigma2;
+	vector<double> v_se_sigma2;
+	vector<double> v_pve;
+	vector<double> v_se_pve;
+	vector<double> v_traceG;	// per-kernel factor read from PARAM in CopyFromParam; used by CalcVCreml for start values and PVE
+	vector<double> v_beta;	// copied to PARAM by CopyToParam; not set in the visible part of vc.cpp
+	vector<double> v_se_beta;	// copied to PARAM by CopyToParam; not set in the visible part of vc.cpp
+// Timers: reset to 0 by CopyFromParam and exported by CopyToParam.
+	double time_UtX;
+	double time_opt;
+	
+	// Main functions
+	void CopyFromParam (PARAM &cPar);
+	void CopyToParam (PARAM &cPar);
+	void CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y);	// NOTE(review): definition not visible in this chunk
+	void CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y);	// finds the root of LogRL_dev1 over log(sigma2); K holds the kernels side by side
+};
+
+#endif
+
+
author	xiangzhou	2014-09-22 11:06:02 -0400
committer	xiangzhou	2014-09-22 11:06:02 -0400
commit	7762722f264adc402ea3b0f21923b18f072253ba (patch)
tree	879ed22943d424b52bd04b4ee6fbdf51616dc9a9
parent	44faf98d2c6fe56c916cace02fe498fc1271bd9d (diff)
download	pangemma-7762722f264adc402ea3b0f21923b18f072253ba.tar.gz