/*
 Genome-wide Efficient Mixed Model Association (GEMMA)
 Copyright (C) 2011  Xiang Zhou

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */



#include <iostream>
#include <fstream>
#include <sstream>

#include <iomanip>
#include <cmath>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <bitset>
#include <cstring>

#include "gsl/gsl_vector.h"
#include "gsl/gsl_matrix.h"
#include "gsl/gsl_linalg.h"
#include "gsl/gsl_blas.h"

#include "gsl/gsl_cdf.h"
#include "gsl/gsl_roots.h"
#include "gsl/gsl_min.h"
#include "gsl/gsl_integration.h"

#include "io.h"
#include "lapack.h"
#include "eigenlib.h"
#include "gzstream.h"

#ifdef FORCE_FLOAT
#include "lmm_float.h"
#include "mvlmm_float.h"
#else
#include "lmm.h"
#include "mvlmm.h"
#endif



using namespace std;


//in this file, X, Y are already transformed (i.e. UtX and UtY)


void MVLMM::CopyFromParam (PARAM &cPar)
{
	a_mode=cPar.a_mode;
	d_pace=cPar.d_pace;

	file_bfile=cPar.file_bfile;
	file_geno=cPar.file_geno;
	file_oxford=cPar.file_oxford;
	file_out=cPar.file_out;
	path_out=cPar.path_out;

	l_min=cPar.l_min;
	l_max=cPar.l_max;
	n_region=cPar.n_region;
	p_nr=cPar.p_nr;
	em_iter=cPar.em_iter;
	nr_iter=cPar.nr_iter;
	em_prec=cPar.em_prec;
	nr_prec=cPar.nr_prec;
	crt=cPar.crt;

	Vg_remle_null=cPar.Vg_remle_null;
	Ve_remle_null=cPar.Ve_remle_null;
	Vg_mle_null=cPar.Vg_mle_null;
	Ve_mle_null=cPar.Ve_mle_null;

	time_UtX=0.0;
	time_opt=0.0;

	ni_total=cPar.ni_total;
	ns_total=cPar.ns_total;
	ni_test=cPar.ni_test;
	ns_test=cPar.ns_test;
	n_cvt=cPar.n_cvt;

	n_ph=cPar.n_ph;

	indicator_idv=cPar.indicator_idv;
	indicator_snp=cPar.indicator_snp;
	snpInfo=cPar.snpInfo;

	return;
}


void MVLMM::CopyToParam (PARAM &cPar)
{
	cPar.time_UtX=time_UtX;
	cPar.time_opt=time_opt;

	cPar.Vg_remle_null=Vg_remle_null;
	cPar.Ve_remle_null=Ve_remle_null;
	cPar.Vg_mle_null=Vg_mle_null;
	cPar.Ve_mle_null=Ve_mle_null;

	cPar.VVg_remle_null=VVg_remle_null;
	cPar.VVe_remle_null=VVe_remle_null;
	cPar.VVg_mle_null=VVg_mle_null;
	cPar.VVe_mle_null=VVe_mle_null;

	cPar.beta_remle_null=beta_remle_null;
	cPar.se_beta_remle_null=se_beta_remle_null;
	cPar.beta_mle_null=beta_mle_null;
	cPar.se_beta_mle_null=se_beta_mle_null;

	cPar.logl_remle_H0=logl_remle_H0;
	cPar.logl_mle_H0=logl_mle_H0;
	return;
}


void MVLMM::WriteFiles ()
{
	string file_str;
	file_str=path_out+"/"+file_out;
	file_str+=".assoc.txt";

	ofstream outfile (file_str.c_str(), ofstream::out);
	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}

	outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t";

	for (size_t i=0; i<n_ph; i++) {
		outfile<<"beta_"<<i+1<<"\t";
	}
	for (size_t i=0; i<n_ph; i++) {
		for (size_t j=i; j<n_ph; j++) {
			outfile<<"Vbeta_"<<i+1<<"_"<<j+1<<"\t";
		}
	}

	if (a_mode==1) {
		outfile<<"p_wald"<<endl;
	} else if (a_mode==2) {
		outfile<<"p_lrt"<<endl;
	} else if (a_mode==3) {
		outfile<<"p_score"<<endl;
	} else if (a_mode==4) {
		outfile<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
	} else {}


	size_t t=0, c=0;
	for (size_t i=0; i<snpInfo.size(); ++i) {
		if (indicator_snp[i]==0) {continue;}

		outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t";

		outfile<<scientific<<setprecision(6);

		for (size_t i=0; i<n_ph; i++) {
			outfile<<sumStat[t].v_beta[i]<<"\t";
		}

		c=0;
		for (size_t i=0; i<n_ph; i++) {
			for (size_t j=i; j<n_ph; j++) {
				outfile<<sumStat[t].v_Vbeta[c]<<"\t";
				c++;
			}
		}

		if (a_mode==1) {
			outfile<<sumStat[t].p_wald <<endl;
		} else if (a_mode==2) {
			outfile<<sumStat[t].p_lrt<<endl;
		} else if (a_mode==3) {
			outfile<<sumStat[t].p_score<<endl;
		} else if (a_mode==4) {
			outfile<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl;
		} else {}

		t++;
	}


	outfile.close();
	outfile.clear();
	return;
}




//below are functions for EM algorithm






double EigenProc (const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, gsl_matrix *UltVeh, gsl_matrix *UltVehi)
{
	size_t d_size=V_g->size1;
	double d, logdet_Ve=0.0;

	//eigen decomposition of V_e
	gsl_matrix *Lambda=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e_temp=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e_h=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e_hi=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *VgVehi=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *U_l=gsl_matrix_alloc (d_size, d_size);

	gsl_matrix_memcpy(V_e_temp, V_e);
	EigenDecomp(V_e_temp, U_l, D_l, 0);

	//calculate V_e_h and V_e_hi
	gsl_matrix_set_zero(V_e_h);
	gsl_matrix_set_zero(V_e_hi);
	for (size_t i=0; i<d_size; i++) {
		d=gsl_vector_get (D_l, i);
		if (d<=0) {continue;}
		logdet_Ve+=log(d);

		gsl_vector_view U_col=gsl_matrix_column(U_l, i);
		d=sqrt(d);
		gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_h);
		d=1.0/d;
		gsl_blas_dsyr (CblasUpper, d, &U_col.vector, V_e_hi);
	}

	//copy the upper part to lower part
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<i; j++) {
			gsl_matrix_set (V_e_h, i, j, gsl_matrix_get(V_e_h, j, i));
			gsl_matrix_set (V_e_hi, i, j, gsl_matrix_get(V_e_hi, j, i));
		}
	}

	//calculate Lambda=V_ehi V_g V_ehi
	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, V_g, V_e_hi, 0.0, VgVehi);
	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, V_e_hi, VgVehi, 0.0, Lambda);

	//eigen decomposition of Lambda
	EigenDecomp(Lambda, U_l, D_l, 0);

	for (size_t i=0; i<d_size; i++) {
		d=gsl_vector_get (D_l, i);
		if (d<0) {gsl_vector_set (D_l, i, 0);}
	}

	//calculate UltVeh and UltVehi
	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U_l, V_e_h, 0.0, UltVeh);
	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U_l, V_e_hi, 0.0, UltVehi);
	/*
	cout<<"Vg: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<d_size; j++) {
			cout<<gsl_matrix_get (V_g, i, j)<<" ";
		}
		cout<<endl;
	}
	cout<<"Ve: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<d_size; j++) {
			cout<<gsl_matrix_get (V_e, i, j)<<" ";
		}
		cout<<endl;
	}

	cout<<"Dl: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		cout<<gsl_vector_get (D_l, i)<<endl;
	}
	cout<<"UltVeh: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<d_size; j++) {
			cout<<gsl_matrix_get (UltVeh, i, j)<<" ";
		}
		cout<<endl;
	}
	*/

	//free memory
	gsl_matrix_free (Lambda);
	gsl_matrix_free (V_e_temp);
	gsl_matrix_free (V_e_h);
	gsl_matrix_free (V_e_hi);
	gsl_matrix_free (VgVehi);
	gsl_matrix_free (U_l);

	return logdet_Ve;
}

//Qi=(\sum_{k=1}^n x_kx_k^T\otimes(delta_k*Dl+I)^{-1} )^{-1}
double CalcQi (const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, gsl_matrix *Qi)
{
	size_t n_size=eval->size, d_size=D_l->size, dc_size=Qi->size1;
	size_t c_size=dc_size/d_size;

	double delta, dl, d1, d2, d, logdet_Q;

	gsl_matrix *Q=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix_set_zero (Q);

	for (size_t i=0; i<c_size; i++) {
		for (size_t j=0; j<c_size; j++) {
			for (size_t l=0; l<d_size; l++) {
				dl=gsl_vector_get(D_l, l);

				if (j<i) {
					d=gsl_matrix_get (Q, j*d_size+l, i*d_size+l);
				} else {
					d=0.0;
					for (size_t k=0; k<n_size; k++) {
						d1=gsl_matrix_get(X, i, k);
						d2=gsl_matrix_get(X, j, k);
						delta=gsl_vector_get(eval, k);
						d+=d1*d2/(dl*delta+1.0);
					}
				}

				gsl_matrix_set (Q, i*d_size+l, j*d_size+l, d);
			}
		}
	}

	//calculate LU decomposition of Q, and invert Q and calculate |Q|
	int sig;
	gsl_permutation * pmt=gsl_permutation_alloc (dc_size);
	LUDecomp (Q, pmt, &sig);
	LUInvert (Q, pmt, Qi);

	logdet_Q=LULndet (Q);

	gsl_matrix_free (Q);
	gsl_permutation_free (pmt);

	return logdet_Q;
}

//xHiy=\sum_{k=1}^n x_k\otimes ((delta_k*Dl+I)^{-1}Ul^TVe^{-1/2}y
void CalcXHiY(const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, const gsl_matrix *UltVehiY, gsl_vector *xHiy)
{
	size_t n_size=eval->size, c_size=X->size1, d_size=D_l->size;

	gsl_vector_set_zero (xHiy);

	double x, delta, dl, y, d;
	for (size_t i=0; i<d_size; i++) {
		dl=gsl_vector_get(D_l, i);
		for (size_t j=0; j<c_size; j++) {
			d=0.0;
			for (size_t k=0; k<n_size; k++) {
				x=gsl_matrix_get(X, j, k);
				y=gsl_matrix_get(UltVehiY, i, k);
				delta=gsl_vector_get(eval, k);
				d+=x*y/(delta*dl+1.0);
			}
			gsl_vector_set(xHiy, j*d_size+i, d);
		}
	}
	/*
	cout<<"xHiy: "<<endl;
	for (size_t i=0; i<(d_size*c_size); i++) {
		cout<<gsl_vector_get(xHiy, i)<<endl;
	}
	 */
	return;
}


//OmegaU=D_l/(delta Dl+I)^{-1}
//OmegaE=delta D_l/(delta Dl+I)^{-1}
void CalcOmega (const gsl_vector *eval, const gsl_vector *D_l, gsl_matrix *OmegaU, gsl_matrix *OmegaE)
{
	size_t n_size=eval->size, d_size=D_l->size;
	double delta, dl, d_u, d_e;

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get(eval, k);
		for (size_t i=0; i<d_size; i++) {
			dl=gsl_vector_get(D_l, i);

			d_u=dl/(delta*dl+1.0);
			d_e=delta*d_u;

			gsl_matrix_set(OmegaU, i, k, d_u);
			gsl_matrix_set(OmegaE, i, k, d_e);
		}
	}

	return;
}


void UpdateU (const gsl_matrix *OmegaE, const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiBX, gsl_matrix *UltVehiU)
{
	gsl_matrix_memcpy (UltVehiU, UltVehiY);
	gsl_matrix_sub (UltVehiU, UltVehiBX);

	gsl_matrix_mul_elements (UltVehiU, OmegaE);
	return;
}


void UpdateE (const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiBX, const gsl_matrix *UltVehiU, gsl_matrix *UltVehiE)
{
	gsl_matrix_memcpy (UltVehiE, UltVehiY);
	gsl_matrix_sub (UltVehiE, UltVehiBX);
	gsl_matrix_sub (UltVehiE, UltVehiU);

	return;
}



void UpdateL_B (const gsl_matrix *X, const gsl_matrix *XXti, const gsl_matrix *UltVehiY, const gsl_matrix *UltVehiU, gsl_matrix *UltVehiBX, gsl_matrix *UltVehiB)
{
	size_t c_size=X->size1, d_size=UltVehiY->size1;

	gsl_matrix *YUX=gsl_matrix_alloc (d_size, c_size);

	gsl_matrix_memcpy (UltVehiBX, UltVehiY);
	gsl_matrix_sub (UltVehiBX, UltVehiU);

	gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, UltVehiBX, X, 0.0, YUX);
	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, YUX, XXti, 0.0, UltVehiB);

	gsl_matrix_free(YUX);

	return;
}

void UpdateRL_B (const gsl_vector *xHiy, const gsl_matrix *Qi, gsl_matrix *UltVehiB)
{
	size_t d_size=UltVehiB->size1, c_size=UltVehiB->size2, dc_size=Qi->size1;

	gsl_vector *b=gsl_vector_alloc (dc_size);

	//calculate b=Qiv
	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, b);

	//copy b to UltVehiB
	for (size_t i=0; i<c_size; i++) {
		gsl_vector_view UltVehiB_col=gsl_matrix_column (UltVehiB, i);
		gsl_vector_const_view b_subcol=gsl_vector_const_subvector (b, i*d_size, d_size);
		gsl_vector_memcpy (&UltVehiB_col.vector, &b_subcol.vector);
	}

	gsl_vector_free(b);

	return;
}



void UpdateV (const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *E, const gsl_matrix *Sigma_uu, const gsl_matrix *Sigma_ee, gsl_matrix *V_g, gsl_matrix *V_e)
{
	size_t n_size=eval->size, d_size=U->size1;

	gsl_matrix_set_zero (V_g);
	gsl_matrix_set_zero (V_e);

	double delta;

	//calculate the first part: UD^{-1}U^T and EE^T
	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);
		if (delta==0) {continue;}

		gsl_vector_const_view U_col=gsl_matrix_const_column (U, k);
		gsl_blas_dsyr (CblasUpper, 1.0/delta, &U_col.vector, V_g);
	}

	gsl_blas_dsyrk(CblasUpper, CblasNoTrans, 1.0, E, 0.0, V_e);

	//copy the upper part to lower part
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<i; j++) {
			gsl_matrix_set (V_g, i, j, gsl_matrix_get(V_g, j, i));
			gsl_matrix_set (V_e, i, j, gsl_matrix_get(V_e, j, i));
		}
	}

	//add Sigma
	gsl_matrix_add (V_g, Sigma_uu);
	gsl_matrix_add (V_e, Sigma_ee);

	//scale by 1/n
	gsl_matrix_scale (V_g, 1.0/(double)n_size);
	gsl_matrix_scale (V_e, 1.0/(double)n_size);

	return;
}


void CalcSigma (const char func_name, const gsl_vector *eval, const gsl_vector *D_l, const gsl_matrix *X, const gsl_matrix *OmegaU, const gsl_matrix *OmegaE, const gsl_matrix *UltVeh, const gsl_matrix *Qi, gsl_matrix *Sigma_uu, gsl_matrix *Sigma_ee)
{
	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;}

	size_t n_size=eval->size, c_size=X->size1, d_size=D_l->size, dc_size=Qi->size1;

	gsl_matrix_set_zero(Sigma_uu);
	gsl_matrix_set_zero(Sigma_ee);

	double delta, dl, x, d;

	//calculate the first diagonal term
	gsl_vector_view Suu_diag=gsl_matrix_diagonal (Sigma_uu);
	gsl_vector_view See_diag=gsl_matrix_diagonal (Sigma_ee);

	for (size_t k=0; k<n_size; k++) {
		gsl_vector_const_view OmegaU_col=gsl_matrix_const_column (OmegaU, k);
		gsl_vector_const_view OmegaE_col=gsl_matrix_const_column (OmegaE, k);

		gsl_vector_add (&Suu_diag.vector, &OmegaU_col.vector);
		gsl_vector_add (&See_diag.vector, &OmegaE_col.vector);
	}

	//calculate the second term for reml
	if (func_name=='R' || func_name=='r') {
		gsl_matrix *M_u=gsl_matrix_alloc(dc_size, d_size);
		gsl_matrix *M_e=gsl_matrix_alloc(dc_size, d_size);
		gsl_matrix *QiM=gsl_matrix_alloc(dc_size, d_size);

		gsl_matrix_set_zero(M_u);
		gsl_matrix_set_zero(M_e);

		for (size_t k=0; k<n_size; k++) {
			delta=gsl_vector_get(eval, k);
			//if (delta==0) {continue;}

			for (size_t i=0; i<d_size; i++) {
				dl=gsl_vector_get(D_l, i);
				for (size_t j=0; j<c_size; j++) {
					x=gsl_matrix_get(X, j, k);
					d=x/(delta*dl+1.0);
					gsl_matrix_set(M_e, j*d_size+i, i, d);
					gsl_matrix_set(M_u, j*d_size+i, i, d*dl);
				}
			}
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_u, 0.0, QiM);
			gsl_blas_dgemm(CblasTrans, CblasNoTrans, delta, M_u, QiM, 1.0, Sigma_uu);

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_e, 0.0, QiM);
			gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, M_e, QiM, 1.0, Sigma_ee);
		}

		gsl_matrix_free(M_u);
		gsl_matrix_free(M_e);
		gsl_matrix_free(QiM);
	}

	//multiply both sides by VehUl
	gsl_matrix *M=gsl_matrix_alloc (d_size, d_size);

	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Sigma_uu, UltVeh, 0.0, M);
	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, M, 0.0, Sigma_uu);
	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Sigma_ee, UltVeh, 0.0, M);
	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, M, 0.0, Sigma_ee);

	gsl_matrix_free(M);
	return;
}


//'R' for restricted likelihood and 'L' for likelihood
//'R' update B and 'L' don't
//only calculate -0.5*\sum_{k=1}^n|H_k|-0.5yPxy
double MphCalcLogL (const gsl_vector *eval, const gsl_vector *xHiy, const gsl_vector *D_l, const gsl_matrix *UltVehiY, const gsl_matrix *Qi)
{
	size_t n_size=eval->size, d_size=D_l->size, dc_size=Qi->size1;
	double logl=0.0, delta, dl, y, d;

	//calculate yHiy+log|H_k|
	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get(eval, k);
		for (size_t i=0; i<d_size; i++) {
			y=gsl_matrix_get(UltVehiY, i, k);
			dl=gsl_vector_get(D_l, i);
			d=delta*dl+1.0;

			logl+=y*y/d+log(d);
		}
	}

	//calculate the rest of yPxy
	gsl_vector *Qiv=gsl_vector_alloc(dc_size);

	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, xHiy, 0.0, Qiv);
	gsl_blas_ddot(xHiy, Qiv, &d);

	logl-=d;

	gsl_vector_free(Qiv);

	return -0.5*logl;
}





//Y is a dxn matrix, X is a cxn matrix, B is a dxc matrix, V_g is a dxd matrix, V_e is a dxd matrix, eval is a size n vector
//'R' for restricted likelihood and 'L' for likelihood
double MphEM (const char func_name, const size_t max_iter, const double max_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, gsl_matrix *U_hat, gsl_matrix *E_hat, gsl_matrix *OmegaU, gsl_matrix *OmegaE, gsl_matrix *UltVehiY, gsl_matrix *UltVehiBX, gsl_matrix *UltVehiU, gsl_matrix *UltVehiE, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B)
{
	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return 0.0;}

	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;
	size_t dc_size=d_size*c_size;

	gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size);
	gsl_matrix *XXti=gsl_matrix_alloc (c_size, c_size);
	gsl_vector *D_l=gsl_vector_alloc (d_size);
	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *UltVehiB=gsl_matrix_alloc (d_size, c_size);
	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *Sigma_uu=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *Sigma_ee=gsl_matrix_alloc (d_size, d_size);
	gsl_vector *xHiy=gsl_vector_alloc (dc_size);
	gsl_permutation * pmt=gsl_permutation_alloc (c_size);

	double logl_const=0.0, logl_old=0.0, logl_new=0.0, logdet_Q, logdet_Ve;
	int sig;

	//calculate |XXt| and (XXt)^{-1}
	gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt);
	for (size_t i=0; i<c_size; ++i) {
		for (size_t j=0; j<i; ++j) {
			gsl_matrix_set (XXt, i, j, gsl_matrix_get (XXt, j, i));
		}
	}

	LUDecomp (XXt, pmt, &sig);
	LUInvert (XXt, pmt, XXti);

	//calculate the constant for logl
	if (func_name=='R' || func_name=='r') {
		logl_const=-0.5*(double)(n_size-c_size)*(double)d_size*log(2.0*M_PI)+0.5*(double)d_size*LULndet (XXt);
	} else {
		logl_const=-0.5*(double)n_size*(double)d_size*log(2.0*M_PI);
	}

	//start EM
	for (size_t t=0; t<max_iter; t++) {
		logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);

		logdet_Q=CalcQi (eval, D_l, X, Qi);

		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
		CalcXHiY(eval, D_l, X, UltVehiY, xHiy);

		//calculate log likelihood/restricted likelihood value, and terminate if change is small
		logl_new=logl_const+MphCalcLogL (eval, xHiy, D_l, UltVehiY, Qi)-0.5*(double)n_size*logdet_Ve;
		if (func_name=='R' || func_name=='r') {
			logl_new+=-0.5*(logdet_Q-(double)c_size*logdet_Ve);
		}
		if (t!=0 && abs(logl_new-logl_old)<max_prec) {break;}
		logl_old=logl_new;

		/*
		cout<<"iteration = "<<t<<" log-likelihood = "<<logl_old<<"\t"<<logl_new<<endl;

		cout<<"Vg: "<<endl;
		for (size_t i=0; i<d_size; i++) {
			for (size_t j=0; j<d_size; j++) {
				cout<<gsl_matrix_get(V_g, i, j)<<"\t";
			}
			cout<<endl;
		}
		cout<<"Ve: "<<endl;
		for (size_t i=0; i<d_size; i++) {
			for (size_t j=0; j<d_size; j++) {
				cout<<gsl_matrix_get(V_e, i, j)<<"\t";
			}
			cout<<endl;
		}
		*/

		CalcOmega (eval, D_l, OmegaU, OmegaE);

		//Update UltVehiB, UltVehiU
		if (func_name=='R' || func_name=='r') {
			UpdateRL_B(xHiy, Qi, UltVehiB);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
		} else if (t==0) {
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, B, 0.0, UltVehiB);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
		}

		UpdateU(OmegaE, UltVehiY, UltVehiBX, UltVehiU);

		if (func_name=='L' || func_name=='l') {
			//UltVehiBX is destroyed here
			UpdateL_B(X, XXti, UltVehiY, UltVehiU, UltVehiBX, UltVehiB);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehiB, X, 0.0, UltVehiBX);
		}

		UpdateE(UltVehiY, UltVehiBX, UltVehiU, UltVehiE);

		//calculate U_hat, E_hat and B
		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiU, 0.0, U_hat);
		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiE, 0.0, E_hat);
		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiB, 0.0, B);

		//calculate Sigma_uu and Sigma_ee
		CalcSigma (func_name, eval, D_l, X, OmegaU, OmegaE, UltVeh, Qi, Sigma_uu, Sigma_ee);

		//update V_g and V_e
		UpdateV (eval, U_hat, E_hat, Sigma_uu, Sigma_ee, V_g, V_e);
	}

	gsl_matrix_free(XXt);
	gsl_matrix_free(XXti);
	gsl_vector_free(D_l);
	gsl_matrix_free(UltVeh);
	gsl_matrix_free(UltVehi);
	gsl_matrix_free(UltVehiB);
	gsl_matrix_free(Qi);
	gsl_matrix_free(Sigma_uu);
	gsl_matrix_free(Sigma_ee);
	gsl_vector_free(xHiy);
	gsl_permutation_free(pmt);

	return logl_new;
}







//calculate p-value, beta (d by 1 vector) and V(beta)
double MphCalcP (const gsl_vector *eval, const gsl_vector *x_vec, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_vector *beta, gsl_matrix *Vbeta)
{
	size_t n_size=eval->size, c_size=W->size1, d_size=V_g->size1;
	size_t dc_size=d_size*c_size;
	double delta, dl, d, d1, d2, dy, dx, dw, logdet_Ve, logdet_Q, p_value;

	gsl_vector *D_l=gsl_vector_alloc (d_size);
	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *WHix=gsl_matrix_alloc (dc_size, d_size);
	gsl_matrix *QiWHix=gsl_matrix_alloc(dc_size, d_size);

	gsl_matrix *xPx=gsl_matrix_alloc (d_size, d_size);
	gsl_vector *xPy=gsl_vector_alloc (d_size);
	//gsl_vector *UltVehiy=gsl_vector_alloc (d_size);
	gsl_vector *WHiy=gsl_vector_alloc (dc_size);

	gsl_matrix_set_zero (xPx);
	gsl_matrix_set_zero (WHix);
	gsl_vector_set_zero (xPy);
	gsl_vector_set_zero (WHiy);

	//eigen decomposition and calculate log|Ve|
	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);

	//calculate Qi and log|Q|
	logdet_Q=CalcQi (eval, D_l, W, Qi);

	//calculate UltVehiY
	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);

	//calculate WHix, WHiy, xHiy, xHix
	for (size_t i=0; i<d_size; i++) {
		dl=gsl_vector_get(D_l, i);

		d1=0.0; d2=0.0;
		for (size_t k=0; k<n_size; k++) {
			delta=gsl_vector_get(eval, k);
			dx=gsl_vector_get(x_vec, k);
			dy=gsl_matrix_get(UltVehiY, i, k);

			d1+=dx*dy/(delta*dl+1.0);
			d2+=dx*dx/(delta*dl+1.0);
		}
		gsl_vector_set (xPy, i, d1);
		gsl_matrix_set (xPx, i, i, d2);

		for (size_t j=0; j<c_size; j++) {
			d1=0.0; d2=0.0;
			for (size_t k=0; k<n_size; k++) {
				delta=gsl_vector_get(eval, k);
				dx=gsl_vector_get(x_vec, k);
				dw=gsl_matrix_get(W, j, k);
				dy=gsl_matrix_get(UltVehiY, i, k);

				//if (delta==0) {continue;}
				d1+=dx*dw/(delta*dl+1.0);
				d2+=dy*dw/(delta*dl+1.0);
			}
			gsl_matrix_set(WHix, j*d_size+i, i, d1);
			gsl_vector_set(WHiy, j*d_size+i, d2);
		}
	}

	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, WHix, 0.0, QiWHix);
	gsl_blas_dgemm(CblasTrans, CblasNoTrans, -1.0, WHix, QiWHix, 1.0, xPx);
	gsl_blas_dgemv(CblasTrans, -1.0, QiWHix, WHiy, 1.0, xPy);

	//calculate V(beta) and beta
	int sig;
	gsl_permutation * pmt=gsl_permutation_alloc (d_size);
	LUDecomp (xPx, pmt, &sig);
	LUSolve (xPx, pmt, xPy, D_l);
	LUInvert (xPx, pmt, Vbeta);

	//need to multiply UltVehi on both sides or one side
	gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, D_l, 0.0, beta);
	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Vbeta, UltVeh, 0.0, xPx);
	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, xPx, 0.0, Vbeta);

	//calculate test statistic and p value
	gsl_blas_ddot(D_l, xPy, &d);

	p_value=gsl_cdf_chisq_Q (d, (double)d_size);
	//d*=(double)(n_size-c_size-d_size)/((double)d_size*(double)(n_size-c_size-1));
	//p_value=gsl_cdf_fdist_Q (d, (double)d_size, (double)(n_size-c_size-d_size));

	gsl_vector_free(D_l);
	gsl_matrix_free(UltVeh);
	gsl_matrix_free(UltVehi);
	gsl_matrix_free(Qi);
	gsl_matrix_free(WHix);
	gsl_matrix_free(QiWHix);

	gsl_matrix_free(xPx);
	gsl_vector_free(xPy);
	gsl_vector_free(WHiy);

	gsl_permutation_free(pmt);

	return p_value;
}



//calculate B and its standard error (which is a matrix of the same dimension as B)
void MphCalcBeta (const gsl_vector *eval, const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_matrix *B, gsl_matrix *se_B)
{
	size_t n_size=eval->size, c_size=W->size1, d_size=V_g->size1;
	size_t dc_size=d_size*c_size;
	double delta, dl, d, dy, dw, logdet_Ve, logdet_Q;

	gsl_vector *D_l=gsl_vector_alloc (d_size);
	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *Qi_temp=gsl_matrix_alloc (dc_size, dc_size);
	//gsl_vector *UltVehiy=gsl_vector_alloc (d_size);
	gsl_vector *WHiy=gsl_vector_alloc (dc_size);
	gsl_vector *QiWHiy=gsl_vector_alloc (dc_size);
	gsl_vector *beta=gsl_vector_alloc (dc_size);
	gsl_matrix *Vbeta=gsl_matrix_alloc (dc_size, dc_size);

	gsl_vector_set_zero (WHiy);

	//eigen decomposition and calculate log|Ve|
	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);

	//calculate Qi and log|Q|
	logdet_Q=CalcQi (eval, D_l, W, Qi);

	//calculate UltVehiY
	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);

	//calculate WHiy
	for (size_t i=0; i<d_size; i++) {
		dl=gsl_vector_get(D_l, i);

		for (size_t j=0; j<c_size; j++) {
			d=0.0;
			for (size_t k=0; k<n_size; k++) {
				delta=gsl_vector_get(eval, k);
				dw=gsl_matrix_get(W, j, k);
				dy=gsl_matrix_get(UltVehiY, i, k);

				//if (delta==0) {continue;}
				d+=dy*dw/(delta*dl+1.0);
			}
			gsl_vector_set(WHiy, j*d_size+i, d);
		}
	}

	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, WHiy, 0.0, QiWHiy);

	//need to multiply I_c\otimes UltVehi on both sides or one side
	for (size_t i=0; i<c_size; i++) {
		gsl_vector_view QiWHiy_sub=gsl_vector_subvector(QiWHiy, i*d_size, d_size);
		gsl_vector_view beta_sub=gsl_vector_subvector(beta, i*d_size, d_size);
		gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, &QiWHiy_sub.vector, 0.0, &beta_sub.vector);

		for (size_t j=0; j<c_size; j++) {
			gsl_matrix_view Qi_sub=gsl_matrix_submatrix (Qi, i*d_size, j*d_size, d_size, d_size);
			gsl_matrix_view Qitemp_sub=gsl_matrix_submatrix (Qi_temp, i*d_size, j*d_size, d_size, d_size);
			gsl_matrix_view Vbeta_sub=gsl_matrix_submatrix (Vbeta, i*d_size, j*d_size, d_size, d_size);

			if (j<i) {
				gsl_matrix_view Vbeta_sym=gsl_matrix_submatrix (Vbeta, j*d_size, i*d_size, d_size, d_size);
				gsl_matrix_transpose_memcpy (&Vbeta_sub.matrix, &Vbeta_sym.matrix);
			} else {
				gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &Qi_sub.matrix, UltVeh, 0.0, &Qitemp_sub.matrix);
				gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, &Qitemp_sub.matrix, 0.0, &Vbeta_sub.matrix);
			}
		}
	}

	//copy beta to B, and Vbeta to se_B
	for (size_t j=0; j<B->size2; j++) {
		for (size_t i=0; i<B->size1; i++) {
			gsl_matrix_set(B, i, j, gsl_vector_get(beta, j*d_size+i));
			gsl_matrix_set(se_B, i, j, sqrt(gsl_matrix_get(Vbeta, j*d_size+i, j*d_size+i)));
		}
	}

	//free matrices
	gsl_vector_free(D_l);
	gsl_matrix_free(UltVeh);
	gsl_matrix_free(UltVehi);
	gsl_matrix_free(Qi);
	gsl_matrix_free(Qi_temp);
	gsl_vector_free(WHiy);
	gsl_vector_free(QiWHiy);
	gsl_vector_free(beta);
	gsl_matrix_free(Vbeta);

	return;
}



//below are functions for Newton-Raphson's algorithm





//calculate all Hi and return logdet_H=\sum_{k=1}^{n}log|H_k|
//and calculate Qi and return logdet_Q
//and calculate yPy
void CalcHiQi (const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *Hi_all, gsl_matrix *Qi, double &logdet_H, double &logdet_Q)
{
	gsl_matrix_set_zero (Hi_all);
	gsl_matrix_set_zero (Qi);
	logdet_H=0.0; logdet_Q=0.0;

	size_t n_size=eval->size, c_size=X->size1, d_size=V_g->size1;
	double logdet_Ve=0.0, delta, dl, d;

	gsl_matrix *mat_dd=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
	gsl_vector *D_l=gsl_vector_alloc (d_size);

	//calculate D_l, UltVeh and UltVehi
	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);

	//calculate each Hi and log|H_k|
	logdet_H=(double)n_size*logdet_Ve;
	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);

		gsl_matrix_memcpy (mat_dd, UltVehi);
		for (size_t i=0; i<d_size; i++) {
			dl=gsl_vector_get(D_l, i);
			d=delta*dl+1.0;

			gsl_vector_view mat_row=gsl_matrix_row (mat_dd, i);
			gsl_vector_scale (&mat_row.vector, 1.0/d);

			logdet_H+=log(d);
		}

		gsl_matrix_view Hi_k=gsl_matrix_submatrix(Hi_all, 0, k*d_size, d_size, d_size);
		gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVehi, mat_dd, 0.0, &Hi_k.matrix);
	}

	//calculate Qi, and multiply I\otimes UtVeh on both side
	//and calculate logdet_Q, don't forget to substract c_size*logdet_Ve
	logdet_Q=CalcQi (eval, D_l, X, Qi)-(double)c_size*logdet_Ve;

	for (size_t i=0; i<c_size; i++) {
		for (size_t j=0; j<c_size; j++) {
			gsl_matrix_view Qi_sub=gsl_matrix_submatrix (Qi, i*d_size, j*d_size, d_size, d_size);
			if (j<i) {
				gsl_matrix_view Qi_sym=gsl_matrix_submatrix (Qi, j*d_size, i*d_size, d_size, d_size);
				gsl_matrix_transpose_memcpy (&Qi_sub.matrix, &Qi_sym.matrix);
			} else {
				gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &Qi_sub.matrix, UltVeh, 0.0, mat_dd);
				gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, mat_dd, 0.0, &Qi_sub.matrix);
			}
		}
	}

	//free memory
	gsl_matrix_free(mat_dd);
	gsl_matrix_free(UltVeh);
	gsl_matrix_free(UltVehi);
	gsl_vector_free(D_l);

	return;
}




//calculate all Hiy
void Calc_Hiy_all (const gsl_matrix *Y, const gsl_matrix *Hi_all, gsl_matrix *Hiy_all)
{
	gsl_matrix_set_zero (Hiy_all);

	size_t n_size=Y->size2, d_size=Y->size1;

	for (size_t k=0; k<n_size; k++) {
		gsl_matrix_const_view Hi_k=gsl_matrix_const_submatrix(Hi_all, 0, k*d_size, d_size, d_size);
		gsl_vector_const_view y_k=gsl_matrix_const_column(Y, k);
		gsl_vector_view Hiy_k=gsl_matrix_column(Hiy_all, k);

		gsl_blas_dgemv (CblasNoTrans, 1.0, &Hi_k.matrix, &y_k.vector, 0.0, &Hiy_k.vector);
	}

	return;
}


//calculate all xHi
void Calc_xHi_all (const gsl_matrix *X, const gsl_matrix *Hi_all, gsl_matrix *xHi_all)
{
	gsl_matrix_set_zero (xHi_all);

	size_t n_size=X->size2, c_size=X->size1, d_size=Hi_all->size1;

	double d;

	for (size_t k=0; k<n_size; k++) {
		gsl_matrix_const_view Hi_k=gsl_matrix_const_submatrix(Hi_all, 0, k*d_size, d_size, d_size);

		for (size_t i=0; i<c_size; i++) {
			d=gsl_matrix_get (X, i, k);
			gsl_matrix_view xHi_sub=gsl_matrix_submatrix(xHi_all, i*d_size, k*d_size, d_size, d_size);
			gsl_matrix_memcpy(&xHi_sub.matrix, &Hi_k.matrix);
			gsl_matrix_scale(&xHi_sub.matrix, d);
		}
	}

	return;
}


//calculate scalar yHiy
double Calc_yHiy (const gsl_matrix *Y, const gsl_matrix *Hiy_all)
{
	double yHiy=0.0, d;
	size_t n_size=Y->size2;

	for (size_t k=0; k<n_size; k++) {
		gsl_vector_const_view y_k=gsl_matrix_const_column(Y, k);
		gsl_vector_const_view Hiy_k=gsl_matrix_const_column(Hiy_all, k);

		gsl_blas_ddot (&Hiy_k.vector, &y_k.vector, &d);
		yHiy+=d;
	}

	return yHiy;
}


//calculate the vector xHiy
void Calc_xHiy (const gsl_matrix *Y, const gsl_matrix *xHi, gsl_vector *xHiy)
{
	gsl_vector_set_zero (xHiy);

	size_t n_size=Y->size2, d_size=Y->size1, dc_size=xHi->size1;

	for (size_t k=0; k<n_size; k++) {
		gsl_matrix_const_view xHi_k=gsl_matrix_const_submatrix(xHi, 0, k*d_size, dc_size, d_size);
		gsl_vector_const_view y_k=gsl_matrix_const_column(Y, k);

		gsl_blas_dgemv (CblasNoTrans, 1.0, &xHi_k.matrix, &y_k.vector, 1.0, xHiy);
	}

	return;
}




//0<=i,j<d_size
size_t GetIndex (const size_t i, const size_t j, const size_t d_size)
{
	if (i>=d_size || j>=d_size) {cout<<"error in GetIndex."<<endl; return 0;}

	size_t s, l;
	if (j<i) {s=j; l=i;} else {s=i; l=j;}

	return (2*d_size-s+1)*s/2+l-s;
}



void Calc_yHiDHiy (const gsl_vector *eval, const gsl_matrix *Hiy, const size_t i, const size_t j, double &yHiDHiy_g, double &yHiDHiy_e)
{
	yHiDHiy_g=0.0;
	yHiDHiy_e=0.0;

	size_t n_size=eval->size;

	double delta, d1, d2;

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);
		d1=gsl_matrix_get (Hiy, i, k);
		d2=gsl_matrix_get (Hiy, j, k);

		if (i==j) {
			yHiDHiy_g+=delta*d1*d2;
			yHiDHiy_e+=d1*d2;
		} else {
			yHiDHiy_g+=delta*d1*d2*2.0;
			yHiDHiy_e+=d1*d2*2.0;
		}
	}

	return;
}



void Calc_xHiDHiy (const gsl_vector *eval, const gsl_matrix *xHi, const gsl_matrix *Hiy, const size_t i, const size_t j, gsl_vector *xHiDHiy_g, gsl_vector *xHiDHiy_e)
{
	gsl_vector_set_zero(xHiDHiy_g);
	gsl_vector_set_zero(xHiDHiy_e);

	size_t n_size=eval->size, d_size=Hiy->size1;

	double delta, d;

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);

		gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i);
		d=gsl_matrix_get (Hiy, j, k);

		gsl_blas_daxpy (d*delta, &xHi_col_i.vector, xHiDHiy_g);
		gsl_blas_daxpy (d, &xHi_col_i.vector, xHiDHiy_e);

		if (i!=j) {
			gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j);
			d=gsl_matrix_get (Hiy, i, k);

			gsl_blas_daxpy (d*delta, &xHi_col_j.vector, xHiDHiy_g);
			gsl_blas_daxpy (d, &xHi_col_j.vector, xHiDHiy_e);
		}
	}

	return;
}


void Calc_xHiDHix (const gsl_vector *eval, const gsl_matrix *xHi, const size_t i, const size_t j, gsl_matrix *xHiDHix_g, gsl_matrix *xHiDHix_e)
{
	gsl_matrix_set_zero(xHiDHix_g);
	gsl_matrix_set_zero(xHiDHix_e);

	size_t n_size=eval->size, dc_size=xHi->size1;
	size_t d_size=xHi->size2/n_size;

	double delta;

	gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *mat_dcdc_t=gsl_matrix_alloc (dc_size, dc_size);

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);

		gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i);
		gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j);

		gsl_matrix_set_zero (mat_dcdc);
		gsl_blas_dger (1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc);

		gsl_matrix_transpose_memcpy (mat_dcdc_t, mat_dcdc);

		gsl_matrix_add (xHiDHix_e, mat_dcdc);

		gsl_matrix_scale (mat_dcdc, delta);
		gsl_matrix_add (xHiDHix_g, mat_dcdc);

		if (i!=j) {
			gsl_matrix_add (xHiDHix_e, mat_dcdc_t);

			gsl_matrix_scale (mat_dcdc_t, delta);
			gsl_matrix_add (xHiDHix_g, mat_dcdc_t);
		}
	}

	gsl_matrix_free(mat_dcdc);
	gsl_matrix_free(mat_dcdc_t);

	return;
}



void Calc_yHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *Hiy, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yHiDHiDHiy_gg, double &yHiDHiDHiy_ee, double &yHiDHiDHiy_ge)
{
	yHiDHiDHiy_gg=0.0;
	yHiDHiDHiy_ee=0.0;
	yHiDHiDHiy_ge=0.0;

	size_t n_size=eval->size, d_size=Hiy->size1;

	double delta, d_Hiy_i1, d_Hiy_j1, d_Hiy_i2, d_Hiy_j2, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);

		d_Hiy_i1=gsl_matrix_get (Hiy, i1, k);
		d_Hiy_j1=gsl_matrix_get (Hiy, j1, k);
		d_Hiy_i2=gsl_matrix_get (Hiy, i2, k);
		d_Hiy_j2=gsl_matrix_get (Hiy, j2, k);

		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2);
		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2);
		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2);
		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2);

		if (i1==j1) {
			yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2);
			yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2);
			yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2);

			if (i2!=j2) {
				yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2);
				yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2);
				yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2);
			}
		} else {
			yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2);
			yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2);
			yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1i2*d_Hiy_j2+d_Hiy_j1*d_Hi_i1i2*d_Hiy_j2);

			if (i2!=j2) {
				yHiDHiDHiy_gg+=delta*delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2+d_Hiy_j1*d_Hi_i1j2*d_Hiy_i2);
				yHiDHiDHiy_ee+=(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2+d_Hiy_j1*d_Hi_i1j2*d_Hiy_i2);
				yHiDHiDHiy_ge+=delta*(d_Hiy_i1*d_Hi_j1j2*d_Hiy_i2+d_Hiy_j1*d_Hi_i1j2*d_Hiy_i2);
			}
		}
	}

	return;
}


void Calc_xHiDHiDHiy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const size_t i1, const size_t j1, const size_t i2, const size_t j2, gsl_vector *xHiDHiDHiy_gg, gsl_vector *xHiDHiDHiy_ee, gsl_vector *xHiDHiDHiy_ge)
{
	gsl_vector_set_zero(xHiDHiDHiy_gg);
	gsl_vector_set_zero(xHiDHiDHiy_ee);
	gsl_vector_set_zero(xHiDHiDHiy_ge);

	size_t n_size=eval->size, d_size=Hiy->size1;

	double delta, d_Hiy_i, d_Hiy_j, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);

		gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i1);
		gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j1);

		d_Hiy_i=gsl_matrix_get (Hiy, i2, k);
		d_Hiy_j=gsl_matrix_get (Hiy, j2, k);

		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2);
		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2);
		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2);
		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2);

		if (i1==j1) {
			gsl_blas_daxpy (delta*delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_gg);
			gsl_blas_daxpy (d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee);
			gsl_blas_daxpy (delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ge);

			if (i2!=j2) {
				gsl_blas_daxpy (delta*delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_gg);
				gsl_blas_daxpy (d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ee);
				gsl_blas_daxpy (delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ge);
			}
		} else {
			gsl_blas_daxpy (delta*delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_gg);
			gsl_blas_daxpy (d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee);
			gsl_blas_daxpy (delta*d_Hi_j1i2*d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ge);

			gsl_blas_daxpy (delta*delta*d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_gg);
			gsl_blas_daxpy (d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_ee);
			gsl_blas_daxpy (delta*d_Hi_i1i2*d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_ge);

			if (i2!=j2) {
				gsl_blas_daxpy (delta*delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_gg);
				gsl_blas_daxpy (d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ee);
				gsl_blas_daxpy (delta*d_Hi_j1j2*d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ge);

				gsl_blas_daxpy (delta*delta*d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_gg);
				gsl_blas_daxpy (d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_ee);
				gsl_blas_daxpy (delta*d_Hi_i1j2*d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_ge);
			}
		}
	}

	return;
}


void Calc_xHiDHiDHix (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const size_t i1, const size_t j1, const size_t i2, const size_t j2, gsl_matrix *xHiDHiDHix_gg, gsl_matrix *xHiDHiDHix_ee, gsl_matrix *xHiDHiDHix_ge)
{
	gsl_matrix_set_zero(xHiDHiDHix_gg);
	gsl_matrix_set_zero(xHiDHiDHix_ee);
	gsl_matrix_set_zero(xHiDHiDHix_ge);

	size_t n_size=eval->size, d_size=Hi->size1, dc_size=xHi->size1;

	double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;

	gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size);

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);

		gsl_vector_const_view xHi_col_i1=gsl_matrix_const_column (xHi, k*d_size+i1);
		gsl_vector_const_view xHi_col_j1=gsl_matrix_const_column (xHi, k*d_size+j1);
		gsl_vector_const_view xHi_col_i2=gsl_matrix_const_column (xHi, k*d_size+i2);
		gsl_vector_const_view xHi_col_j2=gsl_matrix_const_column (xHi, k*d_size+j2);

		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2);
		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2);
		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2);
		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2);

		if (i1==j1) {
			gsl_matrix_set_zero (mat_dcdc);
			gsl_blas_dger (d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector, mat_dcdc);

			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
			gsl_matrix_scale(mat_dcdc, delta);
			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
			gsl_matrix_scale(mat_dcdc, delta);
			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);

			if (i2!=j2) {
				gsl_matrix_set_zero (mat_dcdc);
				gsl_blas_dger (d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector, mat_dcdc);

				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
				gsl_matrix_scale(mat_dcdc, delta);
				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
				gsl_matrix_scale(mat_dcdc, delta);
				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
			}
		} else {
			gsl_matrix_set_zero (mat_dcdc);
			gsl_blas_dger (d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector, mat_dcdc);

			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
			gsl_matrix_scale(mat_dcdc, delta);
			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
			gsl_matrix_scale(mat_dcdc, delta);
			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);

			gsl_matrix_set_zero (mat_dcdc);
			gsl_blas_dger (d_Hi_i1i2, &xHi_col_j1.vector, &xHi_col_j2.vector, mat_dcdc);

			gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
			gsl_matrix_scale(mat_dcdc, delta);
			gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
			gsl_matrix_scale(mat_dcdc, delta);
			gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);

			if (i2!=j2) {
				gsl_matrix_set_zero (mat_dcdc);
				gsl_blas_dger (d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector, mat_dcdc);

				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
				gsl_matrix_scale(mat_dcdc, delta);
				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
				gsl_matrix_scale(mat_dcdc, delta);
				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);

				gsl_matrix_set_zero (mat_dcdc);
				gsl_blas_dger (d_Hi_i1j2, &xHi_col_j1.vector, &xHi_col_i2.vector, mat_dcdc);

				gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
				gsl_matrix_scale(mat_dcdc, delta);
				gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
				gsl_matrix_scale(mat_dcdc, delta);
				gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
			}
		}
	}

	gsl_matrix_free(mat_dcdc);

	return;
}



void Calc_traceHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i, const size_t j, double &tHiD_g, double &tHiD_e)
{
	tHiD_g=0.0;
	tHiD_e=0.0;

	size_t n_size=eval->size, d_size=Hi->size1;
	double delta, d;

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);
		d=gsl_matrix_get (Hi, j, k*d_size+i);

		if (i==j) {
			tHiD_g+=delta*d;
			tHiD_e+=d;
		} else {
			tHiD_g+=delta*d*2.0;
			tHiD_e+=d*2.0;
		}
	}

	return;
}


void Calc_traceHiDHiD (const gsl_vector *eval, const gsl_matrix *Hi, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tHiDHiD_gg, double &tHiDHiD_ee, double &tHiDHiD_ge)
{
	tHiDHiD_gg=0.0;
	tHiDHiD_ee=0.0;
	tHiDHiD_ge=0.0;

	size_t n_size=eval->size, d_size=Hi->size1;
	double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);

		d_Hi_i1i2=gsl_matrix_get (Hi, i1, k*d_size+i2);
		d_Hi_i1j2=gsl_matrix_get (Hi, i1, k*d_size+j2);
		d_Hi_j1i2=gsl_matrix_get (Hi, j1, k*d_size+i2);
		d_Hi_j1j2=gsl_matrix_get (Hi, j1, k*d_size+j2);

		if (i1==j1) {
			tHiDHiD_gg+=delta*delta*d_Hi_i1j2*d_Hi_j1i2;
			tHiDHiD_ee+=d_Hi_i1j2*d_Hi_j1i2;
			tHiDHiD_ge+=delta*d_Hi_i1j2*d_Hi_j1i2;

			if (i2!=j2) {
				tHiDHiD_gg+=delta*delta*d_Hi_i1i2*d_Hi_j1j2;
				tHiDHiD_ee+=d_Hi_i1i2*d_Hi_j1j2;
				tHiDHiD_ge+=delta*d_Hi_i1i2*d_Hi_j1j2;
			}
		} else {
			tHiDHiD_gg+=delta*delta*(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2);
			tHiDHiD_ee+=(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2);
			tHiDHiD_ge+=delta*(d_Hi_i1j2*d_Hi_j1i2+d_Hi_j1j2*d_Hi_i1i2);

			if (i2!=j2) {
				tHiDHiD_gg+=delta*delta*(d_Hi_i1i2*d_Hi_j1j2+d_Hi_j1i2*d_Hi_i1j2);
				tHiDHiD_ee+=(d_Hi_i1i2*d_Hi_j1j2+d_Hi_j1i2*d_Hi_i1j2);
				tHiDHiD_ge+=delta*(d_Hi_i1i2*d_Hi_j1j2+d_Hi_j1i2*d_Hi_i1j2);
			}
		}
	}

	return;
}


//trace(PD)=trace((Hi-HixQixHi)D)=trace(HiD)-trace(HixQixHiD)
void Calc_tracePD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHiDHix_all_g, const gsl_matrix *xHiDHix_all_e, const size_t i, const size_t j, double &tPD_g, double &tPD_e)
{
	size_t dc_size=Qi->size1, d_size=Hi->size1;
	size_t v=GetIndex(i, j, d_size);

	double d;

	//calculate the first part: trace(HiD)
	Calc_traceHiD (eval, Hi, i, j, tPD_g, tPD_e);

	//calculate the second part: -trace(HixQixHiD)
	for (size_t k=0; k<dc_size; k++) {
		gsl_vector_const_view Qi_row=gsl_matrix_const_row (Qi, k);
		gsl_vector_const_view xHiDHix_g_col=gsl_matrix_const_column (xHiDHix_all_g, v*dc_size+k);
		gsl_vector_const_view xHiDHix_e_col=gsl_matrix_const_column (xHiDHix_all_e, v*dc_size+k);

		gsl_blas_ddot(&Qi_row.vector, &xHiDHix_g_col.vector, &d);
		tPD_g-=d;
		gsl_blas_ddot(&Qi_row.vector, &xHiDHix_e_col.vector, &d);
		tPD_e-=d;
	}

	return;
}



//trace(PDPD)=trace((Hi-HixQixHi)D(Hi-HixQixHi)D)
//=trace(HiDHiD)-trace(HixQixHiDHiD)-trace(HiDHixQixHiD)+trace(HixQixHiDHixQixHiD)
void Calc_tracePDPD (const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &tPDPD_gg, double &tPDPD_ee, double &tPDPD_ge)
{
	size_t dc_size=Qi->size1, d_size=Hi->size1;
	size_t v_size=d_size*(d_size+1)/2;
	size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size);

	double d;

	//calculate the first part: trace(HiDHiD)
	Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge);

	//calculate the second and third parts: -trace(HixQixHiDHiD)-trace(HiDHixQixHiD)
	for (size_t i=0; i<dc_size; i++) {
		gsl_vector_const_view Qi_row=gsl_matrix_const_row (Qi, i);
		gsl_vector_const_view xHiDHiDHix_gg_col=gsl_matrix_const_column (xHiDHiDHix_all_gg, (v1*v_size+v2)*dc_size+i);
		gsl_vector_const_view xHiDHiDHix_ee_col=gsl_matrix_const_column (xHiDHiDHix_all_ee, (v1*v_size+v2)*dc_size+i);
		gsl_vector_const_view xHiDHiDHix_ge_col=gsl_matrix_const_column (xHiDHiDHix_all_ge, (v1*v_size+v2)*dc_size+i);

		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_gg_col.vector, &d);
		tPDPD_gg-=d*2.0;
		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ee_col.vector, &d);
		tPDPD_ee-=d*2.0;
		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ge_col.vector, &d);
		tPDPD_ge-=d*2.0;
		/*
		gsl_vector_const_view xHiDHiDHix_gg_row=gsl_matrix_const_row (xHiDHiDHix_gg, i);
		gsl_vector_const_view xHiDHiDHix_ee_row=gsl_matrix_const_row (xHiDHiDHix_ee, i);
		gsl_vector_const_view xHiDHiDHix_ge_row=gsl_matrix_const_row (xHiDHiDHix_ge, i);

		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_gg_row.vector, &d);
		tPDPD_gg-=d;
		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ee_row.vector, &d);
		tPDPD_ee-=d;
		gsl_blas_ddot(&Qi_row.vector, &xHiDHiDHix_ge_row.vector, &d);
		tPDPD_ge-=d;
		 */
	}

	//calculate the fourth part: trace(HixQixHiDHixQixHiD)
	for (size_t i=0; i<dc_size; i++) {
		//gsl_vector_const_view QixHiDHix_g_row1=gsl_matrix_const_subrow (QixHiDHix_all_g, i, v1*dc_size, dc_size);
		//gsl_vector_const_view QixHiDHix_e_row1=gsl_matrix_const_subrow (QixHiDHix_all_e, i, v1*dc_size, dc_size);

		gsl_vector_const_view QixHiDHix_g_fullrow1=gsl_matrix_const_row (QixHiDHix_all_g, i);
		gsl_vector_const_view QixHiDHix_e_fullrow1=gsl_matrix_const_row (QixHiDHix_all_e, i);
		gsl_vector_const_view QixHiDHix_g_row1=gsl_vector_const_subvector (&QixHiDHix_g_fullrow1.vector, v1*dc_size, dc_size);
		gsl_vector_const_view QixHiDHix_e_row1=gsl_vector_const_subvector (&QixHiDHix_e_fullrow1.vector, v1*dc_size, dc_size);

		gsl_vector_const_view QixHiDHix_g_col2=gsl_matrix_const_column (QixHiDHix_all_g, v2*dc_size+i);
		gsl_vector_const_view QixHiDHix_e_col2=gsl_matrix_const_column (QixHiDHix_all_e, v2*dc_size+i);

		gsl_blas_ddot(&QixHiDHix_g_row1.vector, &QixHiDHix_g_col2.vector, &d);
		tPDPD_gg+=d;
		gsl_blas_ddot(&QixHiDHix_e_row1.vector, &QixHiDHix_e_col2.vector, &d);
		tPDPD_ee+=d;
		gsl_blas_ddot(&QixHiDHix_g_row1.vector, &QixHiDHix_e_col2.vector, &d);
		tPDPD_ge+=d;
	}

	return;
}



//calculate (xHiDHiy) for every pair of i j
void Calc_xHiDHiy_all (const gsl_vector *eval, const gsl_matrix *xHi, const gsl_matrix *Hiy, gsl_matrix *xHiDHiy_all_g, gsl_matrix *xHiDHiy_all_e)
{
	gsl_matrix_set_zero(xHiDHiy_all_g);
	gsl_matrix_set_zero(xHiDHiy_all_e);

	size_t d_size=Hiy->size1;
	size_t v;

	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<d_size; j++) {
			if (j<i) {continue;}
			v=GetIndex(i, j, d_size);

			gsl_vector_view xHiDHiy_g=gsl_matrix_column (xHiDHiy_all_g, v);
			gsl_vector_view xHiDHiy_e=gsl_matrix_column (xHiDHiy_all_e, v);

			Calc_xHiDHiy (eval, xHi, Hiy, i, j, &xHiDHiy_g.vector, &xHiDHiy_e.vector);
		}
	}
	return;
}


//calculate (xHiDHix) for every pair of i j
void Calc_xHiDHix_all (const gsl_vector *eval, const gsl_matrix *xHi, gsl_matrix *xHiDHix_all_g, gsl_matrix *xHiDHix_all_e)
{
	gsl_matrix_set_zero(xHiDHix_all_g);
	gsl_matrix_set_zero(xHiDHix_all_e);

	size_t d_size=xHi->size2/eval->size, dc_size=xHi->size1;
	size_t v;

	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<d_size; j++) {
			if (j<i) {continue;}
			v=GetIndex(i, j, d_size);

			gsl_matrix_view xHiDHix_g=gsl_matrix_submatrix (xHiDHix_all_g, 0, v*dc_size, dc_size, dc_size);
			gsl_matrix_view xHiDHix_e=gsl_matrix_submatrix (xHiDHix_all_e, 0, v*dc_size, dc_size, dc_size);

			Calc_xHiDHix (eval, xHi, i, j, &xHiDHix_g.matrix, &xHiDHix_e.matrix);
		}
	}
	return;
}



//calculate (xHiDHiy) for every pair of i j
void Calc_xHiDHiDHiy_all (const size_t v_size, const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, gsl_matrix *xHiDHiDHiy_all_gg, gsl_matrix *xHiDHiDHiy_all_ee, gsl_matrix *xHiDHiDHiy_all_ge)
{
	gsl_matrix_set_zero(xHiDHiDHiy_all_gg);
	gsl_matrix_set_zero(xHiDHiDHiy_all_ee);
	gsl_matrix_set_zero(xHiDHiDHiy_all_ge);

	size_t d_size=Hiy->size1;
	size_t v1, v2;

	for (size_t i1=0; i1<d_size; i1++) {
		for (size_t j1=0; j1<d_size; j1++) {
			if (j1<i1) {continue;}
			v1=GetIndex(i1, j1, d_size);

			for (size_t i2=0; i2<d_size; i2++) {
				for (size_t j2=0; j2<d_size; j2++) {
					if (j2<i2) {continue;}
					v2=GetIndex(i2, j2, d_size);

					gsl_vector_view xHiDHiDHiy_gg=gsl_matrix_column (xHiDHiDHiy_all_gg, v1*v_size+v2);
					gsl_vector_view xHiDHiDHiy_ee=gsl_matrix_column (xHiDHiDHiy_all_ee, v1*v_size+v2);
					gsl_vector_view xHiDHiDHiy_ge=gsl_matrix_column (xHiDHiDHiy_all_ge, v1*v_size+v2);

					Calc_xHiDHiDHiy (eval, Hi, xHi, Hiy, i1, j1, i2, j2, &xHiDHiDHiy_gg.vector, &xHiDHiDHiy_ee.vector, &xHiDHiDHiy_ge.vector);
				}
			}
		}
	}
	return;
}


//calculate (xHiDHix) for every pair of i j
void Calc_xHiDHiDHix_all (const size_t v_size, const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, gsl_matrix *xHiDHiDHix_all_gg, gsl_matrix *xHiDHiDHix_all_ee, gsl_matrix *xHiDHiDHix_all_ge)
{
	gsl_matrix_set_zero(xHiDHiDHix_all_gg);
	gsl_matrix_set_zero(xHiDHiDHix_all_ee);
	gsl_matrix_set_zero(xHiDHiDHix_all_ge);

	size_t d_size=xHi->size2/eval->size, dc_size=xHi->size1;
	size_t v1, v2;

	for (size_t i1=0; i1<d_size; i1++) {
		for (size_t j1=0; j1<d_size; j1++) {
			if (j1<i1) {continue;}
			v1=GetIndex(i1, j1, d_size);

			for (size_t i2=0; i2<d_size; i2++) {
				for (size_t j2=0; j2<d_size; j2++) {
					if (j2<i2) {continue;}
					v2=GetIndex(i2, j2, d_size);

					if (v2<v1) {continue;}

					gsl_matrix_view xHiDHiDHix_gg1=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
					gsl_matrix_view xHiDHiDHix_ee1=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
					gsl_matrix_view xHiDHiDHix_ge1=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);

					Calc_xHiDHiDHix (eval, Hi, xHi, i1, j1, i2, j2, &xHiDHiDHix_gg1.matrix, &xHiDHiDHix_ee1.matrix, &xHiDHiDHix_ge1.matrix);

					if (v2!=v1) {
						gsl_matrix_view xHiDHiDHix_gg2=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size);
						gsl_matrix_view xHiDHiDHix_ee2=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size);
						gsl_matrix_view xHiDHiDHix_ge2=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v2*v_size+v1)*dc_size, dc_size, dc_size);

						gsl_matrix_memcpy (&xHiDHiDHix_gg2.matrix, &xHiDHiDHix_gg1.matrix);
						gsl_matrix_memcpy (&xHiDHiDHix_ee2.matrix, &xHiDHiDHix_ee1.matrix);
						gsl_matrix_memcpy (&xHiDHiDHix_ge2.matrix, &xHiDHiDHix_ge1.matrix);
					}
				}
			}
		}
	}


	/*
	size_t n_size=eval->size;
	double delta, d_Hi_ij;

	gsl_matrix *mat_dcdc=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *mat_dcdc_temp=gsl_matrix_alloc (dc_size, dc_size);

	for (size_t k=0; k<n_size; k++) {
		delta=gsl_vector_get (eval, k);

		for (size_t i1=0; i1<d_size; i1++) {
			for (size_t j2=0; j2<d_size; j2++) {
				gsl_vector_const_view xHi_col_i=gsl_matrix_const_column (xHi, k*d_size+i1);
				gsl_vector_const_view xHi_col_j=gsl_matrix_const_column (xHi, k*d_size+j2);

				gsl_matrix_set_zero (mat_dcdc);
				gsl_blas_dger (1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc);

				for (size_t j1=0; j1<d_size; j1++) {
					for (size_t i2=0; i2<d_size; i2++) {
						d_Hi_ij=gsl_matrix_get (Hi, j1, k*d_size+i2);

						v1=GetIndex(i1, j1, d_size);
						v2=GetIndex(i2, j2, d_size);

						gsl_matrix_view xHiDHiDHix_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
						gsl_matrix_view xHiDHiDHix_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
						gsl_matrix_view xHiDHiDHix_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);

						gsl_matrix_memcpy (mat_dcdc_temp, mat_dcdc);

						gsl_matrix_scale (mat_dcdc_temp, d_Hi_ij);
						gsl_matrix_add(&xHiDHiDHix_ee.matrix, mat_dcdc_temp);
						gsl_matrix_scale(mat_dcdc_temp, delta);
						gsl_matrix_add(&xHiDHiDHix_ge.matrix, mat_dcdc_temp);
						gsl_matrix_scale(mat_dcdc_temp, delta);
						gsl_matrix_add(&xHiDHiDHix_gg.matrix, mat_dcdc_temp);
					}
				}
			}
		}
	}

	for (size_t i1=0; i1<d_size; i1++) {
		for (size_t j1=0; j1<d_size; j1++) {
			v1=GetIndex(i1, j1, d_size);

			for (size_t i2=0; i2<d_size; i2++) {
				for (size_t j2=0; j2<d_size; j2++) {
					v2=GetIndex(i2, j2, d_size);

					if (i1!=j1 && i2!=j2) {continue;}

					gsl_matrix_view xHiDHiDHix_gg=gsl_matrix_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
					gsl_matrix_view xHiDHiDHix_ee=gsl_matrix_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
					gsl_matrix_view xHiDHiDHix_ge=gsl_matrix_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);

					if ( (i1==j1 && i2!=j2) || (i1!=j1 && i2==j2) ) {
						gsl_matrix_scale (&xHiDHiDHix_gg.matrix, 0.5);
						gsl_matrix_scale (&xHiDHiDHix_ee.matrix, 0.5);
						gsl_matrix_scale (&xHiDHiDHix_ge.matrix, 0.5);
					} else {
						gsl_matrix_scale (&xHiDHiDHix_gg.matrix, 0.25);
						gsl_matrix_scale (&xHiDHiDHix_ee.matrix, 0.25);
						gsl_matrix_scale (&xHiDHiDHix_ge.matrix, 0.25);
					}
				}
			}
		}
	}

	gsl_matrix_free (mat_dcdc);
	gsl_matrix_free (mat_dcdc_temp);
	*/

	return;
}



//calculate (xHiDHix)Qi(xHiy) for every pair of i, j
void Calc_xHiDHixQixHiy_all (const gsl_matrix *xHiDHix_all_g, const gsl_matrix *xHiDHix_all_e, const gsl_vector *QixHiy, gsl_matrix *xHiDHixQixHiy_all_g, gsl_matrix *xHiDHixQixHiy_all_e)
{
	size_t dc_size=xHiDHix_all_g->size1;
	size_t v_size=xHiDHix_all_g->size2/dc_size;

	for (size_t i=0; i<v_size; i++) {
		gsl_matrix_const_view xHiDHix_g=gsl_matrix_const_submatrix (xHiDHix_all_g, 0, i*dc_size, dc_size, dc_size);
		gsl_matrix_const_view xHiDHix_e=gsl_matrix_const_submatrix (xHiDHix_all_e, 0, i*dc_size, dc_size, dc_size);

		gsl_vector_view xHiDHixQixHiy_g=gsl_matrix_column (xHiDHixQixHiy_all_g, i);
		gsl_vector_view xHiDHixQixHiy_e=gsl_matrix_column (xHiDHixQixHiy_all_e, i);

		gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHix_g.matrix, QixHiy, 0.0, &xHiDHixQixHiy_g.vector);
		gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHix_e.matrix, QixHiy, 0.0, &xHiDHixQixHiy_e.vector);
	}

	return;
}

//calculate Qi(xHiDHiy) and Qi(xHiDHix)Qi(xHiy) for each pair of i j (i<=j)
void Calc_QiVec_all (const gsl_matrix *Qi, const gsl_matrix *vec_all_g, const gsl_matrix *vec_all_e, gsl_matrix *Qivec_all_g, gsl_matrix *Qivec_all_e)
{
	for (size_t i=0; i<vec_all_g->size2; i++) {
		gsl_vector_const_view vec_g=gsl_matrix_const_column (vec_all_g, i);
		gsl_vector_const_view vec_e=gsl_matrix_const_column (vec_all_e, i);

		gsl_vector_view Qivec_g=gsl_matrix_column (Qivec_all_g, i);
		gsl_vector_view Qivec_e=gsl_matrix_column (Qivec_all_e, i);

		gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_g.vector, 0.0, &Qivec_g.vector);
		gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, &vec_e.vector, 0.0, &Qivec_e.vector);
	}

	return;
}


//calculate Qi(xHiDHix) for each pair of i j (i<=j)
void Calc_QiMat_all (const gsl_matrix *Qi, const gsl_matrix *mat_all_g, const gsl_matrix *mat_all_e, gsl_matrix *Qimat_all_g, gsl_matrix *Qimat_all_e)
{
	size_t dc_size=Qi->size1;
	size_t v_size=mat_all_g->size2/mat_all_g->size1;

	for (size_t i=0; i<v_size; i++) {
		gsl_matrix_const_view mat_g=gsl_matrix_const_submatrix (mat_all_g, 0, i*dc_size, dc_size, dc_size);
		gsl_matrix_const_view mat_e=gsl_matrix_const_submatrix (mat_all_e, 0, i*dc_size, dc_size, dc_size);

		gsl_matrix_view Qimat_g=gsl_matrix_submatrix (Qimat_all_g, 0, i*dc_size, dc_size, dc_size);
		gsl_matrix_view Qimat_e=gsl_matrix_submatrix (Qimat_all_e, 0, i*dc_size, dc_size, dc_size);

		gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, Qi, &mat_g.matrix, 0.0, &Qimat_g.matrix);
		gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, Qi, &mat_e.matrix, 0.0, &Qimat_e.matrix);
	}

	return;
}



//calculate yPDPy
//yPDPy=y(Hi-HixQixHi)D(Hi-HixQixHi)y
//=ytHiDHiy
//-(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy)
//+(yHix)Qi(xHiDHix)Qi(xtHiy)
void Calc_yPDPy (const gsl_vector *eval, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const size_t i, const size_t j, double &yPDPy_g, double &yPDPy_e)
{
	size_t d_size=Hiy->size1;
	size_t v=GetIndex(i, j, d_size);

	double d;

	//first part: ytHiDHiy
	Calc_yHiDHiy (eval, Hiy, i, j, yPDPy_g, yPDPy_e);

	//second and third parts: -(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy)
	gsl_vector_const_view xHiDHiy_g=gsl_matrix_const_column (xHiDHiy_all_g, v);
	gsl_vector_const_view xHiDHiy_e=gsl_matrix_const_column (xHiDHiy_all_e, v);

	gsl_blas_ddot(QixHiy, &xHiDHiy_g.vector, &d);
	yPDPy_g-=d*2.0;
	gsl_blas_ddot(QixHiy, &xHiDHiy_e.vector, &d);
	yPDPy_e-=d*2.0;

	//fourth part: +(yHix)Qi(xHiDHix)Qi(xHiy)
	gsl_vector_const_view xHiDHixQixHiy_g=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v);
	gsl_vector_const_view xHiDHixQixHiy_e=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v);

	gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_g.vector, &d);
	yPDPy_g+=d;
	gsl_blas_ddot(QixHiy, &xHiDHixQixHiy_e.vector, &d);
	yPDPy_e+=d;

	return;
}

//calculate yPDPDPy=y(Hi-HixQixHi)D(Hi-HixQixHi)D(Hi-HixQixHi)y
//yPDPDPy=yHiDHiDHiy
//-(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy)
//-(yHiDHix)Qi(xHiDHiy)
//+(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy)
//+(yHix)Qi(xHiDHiDHix)Qi(xHiy)
//-(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy)
void Calc_yPDPDPy (const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e, const gsl_matrix *QixHiDHiy_all_g, const gsl_matrix *QixHiDHiy_all_e, const gsl_matrix *xHiDHixQixHiy_all_g, const gsl_matrix *xHiDHixQixHiy_all_e, const gsl_matrix *QixHiDHixQixHiy_all_g, const gsl_matrix *QixHiDHixQixHiy_all_e, const gsl_matrix *xHiDHiDHiy_all_gg, const gsl_matrix *xHiDHiDHiy_all_ee, const gsl_matrix *xHiDHiDHiy_all_ge, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t i1, const size_t j1, const size_t i2, const size_t j2, double &yPDPDPy_gg, double &yPDPDPy_ee, double &yPDPDPy_ge)
{
	size_t d_size=Hi->size1, dc_size=xHi->size1;
	size_t v1=GetIndex(i1, j1, d_size), v2=GetIndex(i2, j2, d_size);
	size_t v_size=d_size*(d_size+1)/2;

	double d;

	gsl_vector *xHiDHiDHixQixHiy=gsl_vector_alloc (dc_size);

	//first part: yHiDHiDHiy
	Calc_yHiDHiDHiy (eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge);

	//second and third parts: -(yHix)Qi(xHiDHiDHiy)-(yHiDHiDHix)Qi(xHiy)
	gsl_vector_const_view xHiDHiDHiy_gg1=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v1*v_size+v2);
	gsl_vector_const_view xHiDHiDHiy_ee1=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v1*v_size+v2);
	gsl_vector_const_view xHiDHiDHiy_ge1=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v1*v_size+v2);

	gsl_vector_const_view xHiDHiDHiy_gg2=gsl_matrix_const_column (xHiDHiDHiy_all_gg, v2*v_size+v1);
	gsl_vector_const_view xHiDHiDHiy_ee2=gsl_matrix_const_column (xHiDHiDHiy_all_ee, v2*v_size+v1);
	gsl_vector_const_view xHiDHiDHiy_ge2=gsl_matrix_const_column (xHiDHiDHiy_all_ge, v2*v_size+v1);

	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg1.vector, &d);
	yPDPDPy_gg-=d;
	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee1.vector, &d);
	yPDPDPy_ee-=d;
	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge1.vector, &d);
	yPDPDPy_ge-=d;

	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_gg2.vector, &d);
	yPDPDPy_gg-=d;
	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ee2.vector, &d);
	yPDPDPy_ee-=d;
	gsl_blas_ddot(QixHiy, &xHiDHiDHiy_ge2.vector, &d);
	yPDPDPy_ge-=d;

	//fourth part: -(yHiDHix)Qi(xHiDHiy)
	gsl_vector_const_view xHiDHiy_g1=gsl_matrix_const_column (xHiDHiy_all_g, v1);
	gsl_vector_const_view xHiDHiy_e1=gsl_matrix_const_column (xHiDHiy_all_e, v1);
	gsl_vector_const_view QixHiDHiy_g2=gsl_matrix_const_column (QixHiDHiy_all_g, v2);
	gsl_vector_const_view QixHiDHiy_e2=gsl_matrix_const_column (QixHiDHiy_all_e, v2);

	gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_g2.vector, &d);
	yPDPDPy_gg-=d;
	gsl_blas_ddot(&xHiDHiy_e1.vector, &QixHiDHiy_e2.vector, &d);
	yPDPDPy_ee-=d;
	gsl_blas_ddot(&xHiDHiy_g1.vector, &QixHiDHiy_e2.vector, &d);
	yPDPDPy_ge-=d;

	//fifth and sixth parts: +(yHix)Qi(xHiDHix)Qi(xHiDHiy)+(yHiDHix)Qi(xHiDHix)Qi(xHiy)
	gsl_vector_const_view QixHiDHiy_g1=gsl_matrix_const_column (QixHiDHiy_all_g, v1);
	gsl_vector_const_view QixHiDHiy_e1=gsl_matrix_const_column (QixHiDHiy_all_e, v1);

	gsl_vector_const_view xHiDHixQixHiy_g1=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v1);
	gsl_vector_const_view xHiDHixQixHiy_e1=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v1);
	gsl_vector_const_view xHiDHixQixHiy_g2=gsl_matrix_const_column (xHiDHixQixHiy_all_g, v2);
	gsl_vector_const_view xHiDHixQixHiy_e2=gsl_matrix_const_column (xHiDHixQixHiy_all_e, v2);

	gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_g2.vector, &d);
	yPDPDPy_gg+=d;
	gsl_blas_ddot(&xHiDHixQixHiy_g2.vector, &QixHiDHiy_g1.vector, &d);
	yPDPDPy_gg+=d;

	gsl_blas_ddot(&xHiDHixQixHiy_e1.vector, &QixHiDHiy_e2.vector, &d);
	yPDPDPy_ee+=d;
	gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_e1.vector, &d);
	yPDPDPy_ee+=d;

	gsl_blas_ddot(&xHiDHixQixHiy_g1.vector, &QixHiDHiy_e2.vector, &d);
	yPDPDPy_ge+=d;
	gsl_blas_ddot(&xHiDHixQixHiy_e2.vector, &QixHiDHiy_g1.vector, &d);
	yPDPDPy_ge+=d;

	//seventh part: +(yHix)Qi(xHiDHiDHix)Qi(xHiy)
	gsl_matrix_const_view xHiDHiDHix_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
	gsl_matrix_const_view xHiDHiDHix_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
	gsl_matrix_const_view xHiDHiDHix_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);

	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_gg.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
	yPDPDPy_gg+=d;
	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ee.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
	yPDPDPy_ee+=d;
	gsl_blas_dgemv (CblasNoTrans, 1.0, &xHiDHiDHix_ge.matrix, QixHiy, 0.0, xHiDHiDHixQixHiy);
	gsl_blas_ddot(xHiDHiDHixQixHiy, QixHiy, &d);
	yPDPDPy_ge+=d;

	//eighth part: -(yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy)
	gsl_vector_const_view QixHiDHixQixHiy_g1=gsl_matrix_const_column (QixHiDHixQixHiy_all_g, v1);
	gsl_vector_const_view QixHiDHixQixHiy_e1=gsl_matrix_const_column (QixHiDHixQixHiy_all_e, v1);

	gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_g2.vector, &d);
	yPDPDPy_gg-=d;
	gsl_blas_ddot(&QixHiDHixQixHiy_e1.vector, &xHiDHixQixHiy_e2.vector, &d);
	yPDPDPy_ee-=d;
	gsl_blas_ddot(&QixHiDHixQixHiy_g1.vector, &xHiDHixQixHiy_e2.vector, &d);
	yPDPDPy_ge-=d;

	//free memory
	gsl_vector_free(xHiDHiDHixQixHiy);

	return;
}


//calculate Edgeworth correctation factors for small samples
//notation and method follows Thomas J. Rothenberg, Econometirca 1984; 52 (4)
//M=xHiDHix
void CalcCRT (const gsl_matrix *Hessian_inv, const gsl_matrix *Qi, const gsl_matrix *QixHiDHix_all_g, const gsl_matrix *QixHiDHix_all_e, const gsl_matrix *xHiDHiDHix_all_gg, const gsl_matrix *xHiDHiDHix_all_ee, const gsl_matrix *xHiDHiDHix_all_ge, const size_t d_size, double &crt_a, double &crt_b, double &crt_c)
{
	crt_a=0.0; crt_b=0.0; crt_c=0.0;

	size_t dc_size=Qi->size1, v_size=Hessian_inv->size1/2;
	size_t c_size=dc_size/d_size;
	double h_gg, h_ge, h_ee, d, B=0.0, C=0.0, D=0.0;
	double trCg1, trCe1, trCg2, trCe2, trB_gg, trB_ge, trB_ee, trCC_gg, trCC_ge, trCC_ee, trD_gg=0.0, trD_ge=0.0, trD_ee=0.0;

	gsl_matrix *QiMQi_g1=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *QiMQi_e1=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *QiMQi_g2=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *QiMQi_e2=gsl_matrix_alloc (dc_size, dc_size);

	gsl_matrix *QiMQisQisi_g1=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *QiMQisQisi_e1=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *QiMQisQisi_g2=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *QiMQisQisi_e2=gsl_matrix_alloc (d_size, d_size);

	gsl_matrix *QiMQiMQi_gg=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *QiMQiMQi_ge=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *QiMQiMQi_ee=gsl_matrix_alloc (dc_size, dc_size);

	gsl_matrix *QiMMQi_gg=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *QiMMQi_ge=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *QiMMQi_ee=gsl_matrix_alloc (dc_size, dc_size);

	gsl_matrix *Qi_si=gsl_matrix_alloc (d_size, d_size);

	gsl_matrix *M_dd=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *M_dcdc=gsl_matrix_alloc (dc_size, dc_size);

	//invert Qi_sub to Qi_si
	gsl_matrix *Qi_sub=gsl_matrix_alloc (d_size, d_size);

	gsl_matrix_const_view Qi_s=gsl_matrix_const_submatrix (Qi, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);

	int sig;
	gsl_permutation * pmt=gsl_permutation_alloc (d_size);

	gsl_matrix_memcpy (Qi_sub, &Qi_s.matrix);
	LUDecomp (Qi_sub, pmt, &sig);
	LUInvert (Qi_sub, pmt, Qi_si);

	gsl_permutation_free(pmt);
	gsl_matrix_free(Qi_sub);

	//calculate correctation factors
	for (size_t v1=0; v1<v_size; v1++) {
		//calculate Qi(xHiDHix)Qi, and subpart of it
		gsl_matrix_const_view QiM_g1=gsl_matrix_const_submatrix (QixHiDHix_all_g, 0, v1*dc_size, dc_size, dc_size);
		gsl_matrix_const_view QiM_e1=gsl_matrix_const_submatrix (QixHiDHix_all_e, 0, v1*dc_size, dc_size, dc_size);

		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, Qi, 0.0, QiMQi_g1);
		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, Qi, 0.0, QiMQi_e1);

		gsl_matrix_view QiMQi_g1_s=gsl_matrix_submatrix (QiMQi_g1, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
		gsl_matrix_view QiMQi_e1_s=gsl_matrix_submatrix (QiMQi_e1, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);

		/*
		for (size_t i=0; i<d_size; i++) {
			for (size_t j=0; j<d_size; j++) {
				cout<<setprecision(6)<<gsl_matrix_get(&QiMQi_g1_s.matrix, i, j)<<"\t";
			}
			cout<<endl;
		}
*/
		//calculate trCg1 and trCe1
		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_g1_s.matrix, Qi_si, 0.0, QiMQisQisi_g1);
		trCg1=0.0;
		for (size_t k=0; k<d_size; k++) {
			trCg1-=gsl_matrix_get (QiMQisQisi_g1, k, k);
		}

		gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_e1_s.matrix, Qi_si, 0.0, QiMQisQisi_e1);
		trCe1=0.0;
		for (size_t k=0; k<d_size; k++) {
			trCe1-=gsl_matrix_get (QiMQisQisi_e1, k, k);
		}
		/*
		cout<<v1<<endl;
		cout<<"trCg1 = "<<trCg1<<", trCe1 = "<<trCe1<<endl;
		*/
		for (size_t v2=0; v2<v_size; v2++) {
			if (v2<v1) {continue;}

			//calculate Qi(xHiDHix)Qi, and subpart of it
			gsl_matrix_const_view QiM_g2=gsl_matrix_const_submatrix (QixHiDHix_all_g, 0, v2*dc_size, dc_size, dc_size);
			gsl_matrix_const_view QiM_e2=gsl_matrix_const_submatrix (QixHiDHix_all_e, 0, v2*dc_size, dc_size, dc_size);

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g2.matrix, Qi, 0.0, QiMQi_g2);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e2.matrix, Qi, 0.0, QiMQi_e2);

			gsl_matrix_view QiMQi_g2_s=gsl_matrix_submatrix (QiMQi_g2, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
			gsl_matrix_view QiMQi_e2_s=gsl_matrix_submatrix (QiMQi_e2, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);

			//calculate trCg2 and trCe2
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_g2_s.matrix, Qi_si, 0.0, QiMQisQisi_g2);
			trCg2=0.0;
			for (size_t k=0; k<d_size; k++) {
				trCg2-=gsl_matrix_get (QiMQisQisi_g2, k, k);
			}

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQi_e2_s.matrix, Qi_si, 0.0, QiMQisQisi_e2);
			trCe2=0.0;
			for (size_t k=0; k<d_size; k++) {
				trCe2-=gsl_matrix_get (QiMQisQisi_e2, k, k);
			}

			//calculate trCC_gg, trCC_ge, trCC_ee
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_g1, QiMQisQisi_g2, 0.0, M_dd);
			trCC_gg=0.0;
			for (size_t k=0; k<d_size; k++) {
				trCC_gg+=gsl_matrix_get (M_dd, k, k);
			}

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_g1, QiMQisQisi_e2, 0.0, M_dd);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_e1, QiMQisQisi_g2, 1.0, M_dd);
			trCC_ge=0.0;
			for (size_t k=0; k<d_size; k++) {
				trCC_ge+=gsl_matrix_get (M_dd, k, k);
			}

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, QiMQisQisi_e1, QiMQisQisi_e2, 0.0, M_dd);
			trCC_ee=0.0;
			for (size_t k=0; k<d_size; k++) {
				trCC_ee+=gsl_matrix_get (M_dd, k, k);
			}

			//calculate Qi(xHiDHix)Qi(xHiDHix)Qi, and subpart of it
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, QiMQi_g2, 0.0, QiMQiMQi_gg);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_g1.matrix, QiMQi_e2, 0.0, QiMQiMQi_ge);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, QiMQi_g2, 1.0, QiMQiMQi_ge);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiM_e1.matrix, QiMQi_e2, 0.0, QiMQiMQi_ee);

			gsl_matrix_view QiMQiMQi_gg_s=gsl_matrix_submatrix (QiMQiMQi_gg, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
			gsl_matrix_view QiMQiMQi_ge_s=gsl_matrix_submatrix (QiMQiMQi_ge, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
			gsl_matrix_view QiMQiMQi_ee_s=gsl_matrix_submatrix (QiMQiMQi_ee, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);

			//and part of trB_gg, trB_ge, trB_ee
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_gg_s.matrix, Qi_si, 0.0, M_dd);
			trB_gg=0.0;
			for (size_t k=0; k<d_size; k++) {
				d=gsl_matrix_get (M_dd, k, k);
				trB_gg-=d;
			}

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_ge_s.matrix, Qi_si, 0.0, M_dd);
			trB_ge=0.0;
			for (size_t k=0; k<d_size; k++) {
				d=gsl_matrix_get (M_dd, k, k);
				trB_ge-=d;
			}

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMQiMQi_ee_s.matrix, Qi_si, 0.0, M_dd);
			trB_ee=0.0;
			for (size_t k=0; k<d_size; k++) {
				d=gsl_matrix_get (M_dd, k, k);
				trB_ee-=d;
			}

			//calculate Qi(xHiDHiDHix)Qi, and subpart of it
			gsl_matrix_const_view MM_gg=gsl_matrix_const_submatrix (xHiDHiDHix_all_gg, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
			gsl_matrix_const_view MM_ge=gsl_matrix_const_submatrix (xHiDHiDHix_all_ge, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);
			gsl_matrix_const_view MM_ee=gsl_matrix_const_submatrix (xHiDHiDHix_all_ee, 0, (v1*v_size+v2)*dc_size, dc_size, dc_size);

			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_gg.matrix, 0.0, M_dcdc);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_gg);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_ge.matrix, 0.0, M_dcdc);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_ge);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, &MM_ee.matrix, 0.0, M_dcdc);
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, M_dcdc, Qi, 0.0, QiMMQi_ee);

			gsl_matrix_view QiMMQi_gg_s=gsl_matrix_submatrix (QiMMQi_gg, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
			gsl_matrix_view QiMMQi_ge_s=gsl_matrix_submatrix (QiMMQi_ge, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);
			gsl_matrix_view QiMMQi_ee_s=gsl_matrix_submatrix (QiMMQi_ee, (c_size-1)*d_size, (c_size-1)*d_size, d_size, d_size);

			//calculate the other part of trB_gg, trB_ge, trB_ee
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_gg_s.matrix, Qi_si, 0.0, M_dd);
			for (size_t k=0; k<d_size; k++) {
				trB_gg+=gsl_matrix_get (M_dd, k, k);
			}
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_ge_s.matrix, Qi_si, 0.0, M_dd);
			for (size_t k=0; k<d_size; k++) {
				trB_ge+=2.0*gsl_matrix_get (M_dd, k, k);
			}
			gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &QiMMQi_ee_s.matrix, Qi_si, 0.0, M_dd);
			for (size_t k=0; k<d_size; k++) {
				trB_ee+=gsl_matrix_get (M_dd, k, k);
			}


			//calculate trD_gg, trD_ge, trD_ee
			trD_gg=2.0*trB_gg;
			trD_ge=2.0*trB_ge;
			trD_ee=2.0*trB_ee;

			//calculate B, C and D
			h_gg=-1.0*gsl_matrix_get (Hessian_inv, v1, v2);
			h_ge=-1.0*gsl_matrix_get (Hessian_inv, v1, v2+v_size);
			h_ee=-1.0*gsl_matrix_get (Hessian_inv, v1+v_size, v2+v_size);

			B+=h_gg*trB_gg+h_ge*trB_ge+h_ee*trB_ee;
			C+=h_gg*(trCC_gg+0.5*trCg1*trCg2)+h_ge*(trCC_ge+0.5*trCg1*trCe2+0.5*trCe1*trCg2)+h_ee*(trCC_ee+0.5*trCe1*trCe2);
			D+=h_gg*(trCC_gg+0.5*trD_gg)+h_ge*(trCC_ge+0.5*trD_ge)+h_ee*(trCC_ee+0.5*trD_ee);

			if (v1!=v2) {
				B+=h_gg*trB_gg+h_ge*trB_ge+h_ee*trB_ee;
				C+=h_gg*(trCC_gg+0.5*trCg1*trCg2)+h_ge*(trCC_ge+0.5*trCg1*trCe2+0.5*trCe1*trCg2)+h_ee*(trCC_ee+0.5*trCe1*trCe2);
				D+=h_gg*(trCC_gg+0.5*trD_gg)+h_ge*(trCC_ge+0.5*trD_ge)+h_ee*(trCC_ee+0.5*trD_ee);
			}

			/*
			cout<<v1<<"\t"<<v2<<endl;
			cout<<h_gg<<"\t"<<h_ge<<"\t"<<h_ee<<endl;
			cout<<trB_gg<<"\t"<<trB_ge<<"\t"<<trB_ee<<endl;
			cout<<trCg1<<"\t"<<trCe1<<"\t"<<trCg2<<"\t"<<trCe2<<endl;
			cout<<trCC_gg<<"\t"<<trCC_ge<<"\t"<<trCC_ee<<endl;
			cout<<trD_gg<<"\t"<<trD_ge<<"\t"<<trD_ee<<endl;
			*/
		}
	}

	//calculate a, b, c from B C D
	crt_a=2.0*D-C;
	crt_b=2.0*B;
	crt_c=C;
	/*
	cout<<B<<"\t"<<C<<"\t"<<D<<endl;
	cout<<setprecision(6)<<crt_a<<"\t"<<crt_b<<"\t"<<crt_c<<endl;
	*/
	//free matrix memory
	gsl_matrix_free(QiMQi_g1);
	gsl_matrix_free(QiMQi_e1);
	gsl_matrix_free(QiMQi_g2);
	gsl_matrix_free(QiMQi_e2);

	gsl_matrix_free(QiMQisQisi_g1);
	gsl_matrix_free(QiMQisQisi_e1);
	gsl_matrix_free(QiMQisQisi_g2);
	gsl_matrix_free(QiMQisQisi_e2);

	gsl_matrix_free(QiMQiMQi_gg);
	gsl_matrix_free(QiMQiMQi_ge);
	gsl_matrix_free(QiMQiMQi_ee);

	gsl_matrix_free(QiMMQi_gg);
	gsl_matrix_free(QiMMQi_ge);
	gsl_matrix_free(QiMMQi_ee);

	gsl_matrix_free(Qi_si);

	gsl_matrix_free(M_dd);
	gsl_matrix_free(M_dcdc);

	return;
}





//calculate first-order and second-order derivatives
void CalcDev (const char func_name, const gsl_vector *eval, const gsl_matrix *Qi, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_vector *QixHiy, gsl_vector *gradient, gsl_matrix *Hessian_inv, double &crt_a, double &crt_b, double &crt_c)
{
	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return;}

	size_t dc_size=Qi->size1, d_size=Hi->size1;
	size_t c_size=dc_size/d_size;
	size_t v_size=d_size*(d_size+1)/2;
	size_t v1, v2;
	double dev1_g, dev1_e, dev2_gg, dev2_ee, dev2_ge;

	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);

	gsl_matrix *xHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
	gsl_matrix *xHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
	gsl_matrix *xHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size);
	gsl_matrix *xHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size);
	gsl_matrix *xHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
	gsl_matrix *xHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size);

	gsl_matrix *QixHiDHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
	gsl_matrix *QixHiDHiy_all_e=gsl_matrix_alloc (dc_size, v_size);
	gsl_matrix *QixHiDHix_all_g=gsl_matrix_alloc (dc_size, v_size*dc_size);
	gsl_matrix *QixHiDHix_all_e=gsl_matrix_alloc (dc_size, v_size*dc_size);
	gsl_matrix *QixHiDHixQixHiy_all_g=gsl_matrix_alloc (dc_size, v_size);
	gsl_matrix *QixHiDHixQixHiy_all_e=gsl_matrix_alloc (dc_size, v_size);

	gsl_matrix *xHiDHiDHiy_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size);
	gsl_matrix *xHiDHiDHiy_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size);
	gsl_matrix *xHiDHiDHiy_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size);
	gsl_matrix *xHiDHiDHix_all_gg=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);
	gsl_matrix *xHiDHiDHix_all_ee=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);
	gsl_matrix *xHiDHiDHix_all_ge=gsl_matrix_alloc (dc_size, v_size*v_size*dc_size);

	//calculate xHiDHiy_all, xHiDHix_all and xHiDHixQixHiy_all
	Calc_xHiDHiy_all (eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e);
	Calc_xHiDHix_all (eval, xHi, xHiDHix_all_g, xHiDHix_all_e);
	Calc_xHiDHixQixHiy_all (xHiDHix_all_g, xHiDHix_all_e, QixHiy, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e);

	Calc_xHiDHiDHiy_all (v_size, eval, Hi, xHi, Hiy, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge);
	Calc_xHiDHiDHix_all (v_size, eval, Hi, xHi, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge);

	//calculate QixHiDHiy_all, QixHiDHix_all and QixHiDHixQixHiy_all
	Calc_QiVec_all (Qi, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e);
	Calc_QiVec_all (Qi, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e);
	Calc_QiMat_all (Qi, xHiDHix_all_g, xHiDHix_all_e, QixHiDHix_all_g, QixHiDHix_all_e);

	double tHiD_g, tHiD_e, tPD_g, tPD_e, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge, tPDPD_gg, tPDPD_ee, tPDPD_ge;
	double yPDPy_g, yPDPy_e, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge;

	//calculate gradient and Hessian for Vg
	for (size_t i1=0; i1<d_size; i1++) {
		for (size_t j1=0; j1<d_size; j1++) {
			if (j1<i1) {continue;}
			v1=GetIndex (i1, j1, d_size);

			Calc_yPDPy (eval, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, i1, j1, yPDPy_g, yPDPy_e);

			if (func_name=='R' || func_name=='r') {
				Calc_tracePD (eval, Qi, Hi, xHiDHix_all_g, xHiDHix_all_e, i1, j1, tPD_g, tPD_e);
				//cout<<i1<<" "<<j1<<" "<<yPDPy_g<<" "<<yPDPy_e<<" "<<tPD_g<<" "<<tPD_e<<endl;

				dev1_g=-0.5*tPD_g+0.5*yPDPy_g;
				dev1_e=-0.5*tPD_e+0.5*yPDPy_e;
			} else {
				Calc_traceHiD (eval, Hi, i1, j1, tHiD_g, tHiD_e);

				dev1_g=-0.5*tHiD_g+0.5*yPDPy_g;
				dev1_e=-0.5*tHiD_e+0.5*yPDPy_e;
			}

			gsl_vector_set (gradient, v1, dev1_g);
			gsl_vector_set (gradient, v1+v_size, dev1_e);

			for (size_t i2=0; i2<d_size; i2++) {
				for (size_t j2=0; j2<d_size; j2++) {
					if (j2<i2) {continue;}
					v2=GetIndex (i2, j2, d_size);

					if (v2<v1) {continue;}

					Calc_yPDPDPy (eval, Hi, xHi, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e, QixHiDHiy_all_g, QixHiDHiy_all_e, xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g, QixHiDHixQixHiy_all_e, xHiDHiDHiy_all_gg, xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee, yPDPDPy_ge);

					//cout<<i1<<" "<<j1<<" "<<i2<<" "<<j2<<" "<<yPDPDPy_gg<<" "<<yPDPDPy_ee<<" "<<yPDPDPy_ge<<endl;
					//AI for reml
					if (func_name=='R' || func_name=='r') {
						Calc_tracePDPD (eval, Qi, Hi, xHi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge);

						dev2_gg=0.5*tPDPD_gg-yPDPDPy_gg;
						dev2_ee=0.5*tPDPD_ee-yPDPDPy_ee;
						dev2_ge=0.5*tPDPD_ge-yPDPDPy_ge;
						/*
						dev2_gg=-0.5*yPDPDPy_gg;
						dev2_ee=-0.5*yPDPDPy_ee;
						dev2_ge=-0.5*yPDPDPy_ge;
						*/
					} else {
						Calc_traceHiDHiD (eval, Hi, i1, j1, i2, j2, tHiDHiD_gg, tHiDHiD_ee, tHiDHiD_ge);

						dev2_gg=0.5*tHiDHiD_gg-yPDPDPy_gg;
						dev2_ee=0.5*tHiDHiD_ee-yPDPDPy_ee;
						dev2_ge=0.5*tHiDHiD_ge-yPDPDPy_ge;
					}

					//set up Hessian
					gsl_matrix_set (Hessian, v1, v2, dev2_gg);
					gsl_matrix_set (Hessian, v1+v_size, v2+v_size, dev2_ee);
					gsl_matrix_set (Hessian, v1, v2+v_size, dev2_ge);
					gsl_matrix_set (Hessian, v2+v_size, v1, dev2_ge);

					if (v1!=v2) {
						gsl_matrix_set (Hessian, v2, v1, dev2_gg);
						gsl_matrix_set (Hessian, v2+v_size, v1+v_size, dev2_ee);
						gsl_matrix_set (Hessian, v2, v1+v_size, dev2_ge);
						gsl_matrix_set (Hessian, v1+v_size, v2, dev2_ge);
					}
				}
			}
		}
	}

	/*
	cout<<"Hessian: "<<endl;
	for (size_t i=0; i<2*v_size; i++) {
		for (size_t j=0; j<2*v_size; j++) {
			cout<<gsl_matrix_get(Hessian, i, j)<<"\t";
		}
		cout<<endl;
	}
	*/


	//Invert Hessian
	int sig;
	gsl_permutation * pmt=gsl_permutation_alloc (v_size*2);

	LUDecomp (Hessian, pmt, &sig);
	LUInvert (Hessian, pmt, Hessian_inv);
	/*
	cout<<"Hessian Inverse: "<<endl;
	for (size_t i=0; i<2*v_size; i++) {
		for (size_t j=0; j<2*v_size; j++) {
			cout<<gsl_matrix_get(Hessian_inv, i, j)<<"\t";
		}
		cout<<endl;
	}
	*/
	gsl_permutation_free(pmt);
	gsl_matrix_free(Hessian);

	//calculate Edgeworth correction factors
	//after inverting Hessian
	if (c_size>1) {
		CalcCRT (Hessian_inv, Qi, QixHiDHix_all_g, QixHiDHix_all_e, xHiDHiDHix_all_gg, xHiDHiDHix_all_ee, xHiDHiDHix_all_ge, d_size, crt_a, crt_b, crt_c);
	} else {
		crt_a=0.0; crt_b=0.0; crt_c=0.0;
	}

	gsl_matrix_free(xHiDHiy_all_g);
	gsl_matrix_free(xHiDHiy_all_e);
	gsl_matrix_free(xHiDHix_all_g);
	gsl_matrix_free(xHiDHix_all_e);
	gsl_matrix_free(xHiDHixQixHiy_all_g);
	gsl_matrix_free(xHiDHixQixHiy_all_e);

	gsl_matrix_free(QixHiDHiy_all_g);
	gsl_matrix_free(QixHiDHiy_all_e);
	gsl_matrix_free(QixHiDHix_all_g);
	gsl_matrix_free(QixHiDHix_all_e);
	gsl_matrix_free(QixHiDHixQixHiy_all_g);
	gsl_matrix_free(QixHiDHixQixHiy_all_e);

	gsl_matrix_free(xHiDHiDHiy_all_gg);
	gsl_matrix_free(xHiDHiDHiy_all_ee);
	gsl_matrix_free(xHiDHiDHiy_all_ge);
	gsl_matrix_free(xHiDHiDHix_all_gg);
	gsl_matrix_free(xHiDHiDHix_all_ee);
	gsl_matrix_free(xHiDHiDHix_all_ge);

	return;
}


//update Vg, Ve
void UpdateVgVe (const gsl_matrix *Hessian_inv, const gsl_vector *gradient, const double step_scale, gsl_matrix *V_g, gsl_matrix *V_e)
{
	size_t v_size=gradient->size/2, d_size=V_g->size1;
	size_t v;

	gsl_vector *vec_v=gsl_vector_alloc (v_size*2);

	double d;

	//vectorize Vg and Ve
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<d_size; j++) {
			if (j<i) {continue;}
			v=GetIndex(i, j, d_size);

			d=gsl_matrix_get (V_g, i, j);
			gsl_vector_set (vec_v, v, d);

			d=gsl_matrix_get (V_e, i, j);
			gsl_vector_set (vec_v, v+v_size, d);
		}
	}

	gsl_blas_dgemv (CblasNoTrans, -1.0*step_scale, Hessian_inv, gradient, 1.0, vec_v);

	//save Vg and Ve
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<d_size; j++) {
			if (j<i) {continue;}
			v=GetIndex(i, j, d_size);

			d=gsl_vector_get (vec_v, v);
			gsl_matrix_set (V_g, i, j, d);
			gsl_matrix_set (V_g, j, i, d);

			d=gsl_vector_get (vec_v, v+v_size);
			gsl_matrix_set (V_e, i, j, d);
			gsl_matrix_set (V_e, j, i, d);
		}
	}

	gsl_vector_free(vec_v);

	return;
}






double MphNR (const char func_name, const size_t max_iter, const double max_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, gsl_matrix *Hi_all, gsl_matrix *xHi_all, gsl_matrix *Hiy_all, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *Hessian_inv, double &crt_a, double &crt_b, double &crt_c)
{
	if (func_name!='R' && func_name!='L' && func_name!='r' && func_name!='l') {cout<<"func_name only takes 'R' or 'L': 'R' for log-restricted likelihood, 'L' for log-likelihood."<<endl; return 0.0;}
	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;
	size_t dc_size=d_size*c_size;
	size_t v_size=d_size*(d_size+1)/2;

	double logdet_H, logdet_Q, yPy, logl_const, logl_old=0.0, logl_new=0.0, step_scale;
	int sig;
	size_t step_iter, flag_pd;

	gsl_matrix *Vg_save=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *Ve_save=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_temp=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *U_temp=gsl_matrix_alloc (d_size, d_size);
	gsl_vector *D_temp=gsl_vector_alloc (d_size);
	gsl_vector *xHiy=gsl_vector_alloc (dc_size);
	gsl_vector *QixHiy=gsl_vector_alloc (dc_size);
	gsl_matrix *Qi=gsl_matrix_alloc (dc_size, dc_size);
	gsl_matrix *XXt=gsl_matrix_alloc (c_size, c_size);

	gsl_vector *gradient=gsl_vector_alloc (v_size*2);

	//calculate |XXt| and (XXt)^{-1}
	gsl_blas_dsyrk (CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt);
	for (size_t i=0; i<c_size; ++i) {
		for (size_t j=0; j<i; ++j) {
			gsl_matrix_set (XXt, i, j, gsl_matrix_get (XXt, j, i));
		}
	}

	gsl_permutation * pmt=gsl_permutation_alloc (c_size);
	LUDecomp (XXt, pmt, &sig);
	gsl_permutation_free (pmt);
//	LUInvert (XXt, pmt, XXti);

	//calculate the constant for logl
	if (func_name=='R' || func_name=='r') {
		logl_const=-0.5*(double)(n_size-c_size)*(double)d_size*log(2.0*M_PI)+0.5*(double)d_size*LULndet (XXt);
	} else {
		logl_const=-0.5*(double)n_size*(double)d_size*log(2.0*M_PI);
	}
	//optimization iterations

	for (size_t t=0; t<max_iter; t++) {
		gsl_matrix_memcpy (Vg_save, V_g);
		gsl_matrix_memcpy (Ve_save, V_e);

		step_scale=1.0; step_iter=0;
		do {
			gsl_matrix_memcpy (V_g, Vg_save);
			gsl_matrix_memcpy (V_e, Ve_save);

			//update Vg, Ve, and invert Hessian
			if (t!=0) {UpdateVgVe (Hessian_inv, gradient, step_scale, V_g, V_e);}

			//check if both Vg and Ve are positive definite
			flag_pd=1;
			gsl_matrix_memcpy (V_temp, V_e);
			EigenDecomp(V_temp, U_temp, D_temp, 0);
			for (size_t i=0; i<d_size; i++) {
				if (gsl_vector_get (D_temp, i)<=0) {flag_pd=0;}
			}
			gsl_matrix_memcpy (V_temp, V_g);
			EigenDecomp(V_temp, U_temp, D_temp, 0);
			for (size_t i=0; i<d_size; i++) {
				if (gsl_vector_get (D_temp, i)<=0) {flag_pd=0;}
			}

			//if flag_pd==1 continue to calculate quantities and logl
			if (flag_pd==1) {
				CalcHiQi (eval, X, V_g, V_e, Hi_all, Qi, logdet_H, logdet_Q);
				Calc_Hiy_all (Y, Hi_all, Hiy_all);
				Calc_xHi_all (X, Hi_all, xHi_all);

				//calculate QixHiy and yPy
				Calc_xHiy (Y, xHi_all, xHiy);
				gsl_blas_dgemv (CblasNoTrans, 1.0, Qi, xHiy, 0.0, QixHiy);

				gsl_blas_ddot (QixHiy, xHiy, &yPy);
				yPy=Calc_yHiy (Y, Hiy_all)-yPy;

				//calculate log likelihood/restricted likelihood value
				if (func_name=='R' || func_name=='r') {
					logl_new=logl_const-0.5*logdet_H-0.5*logdet_Q-0.5*yPy;
				} else {
					logl_new=logl_const-0.5*logdet_H-0.5*yPy;
				}
			}

			step_scale/=2.0;
			step_iter++;

			//cout<<t<<"\t"<<step_iter<<"\t"<<logl_old<<"\t"<<logl_new<<"\t"<<flag_pd<<endl;
		} while ( (flag_pd==0 || logl_new<logl_old || logl_new-logl_old>10 ) && step_iter<10 && t!=0);

		//terminate if change is small
		if (t!=0) {
			if (logl_new<logl_old || flag_pd==0) {
				gsl_matrix_memcpy (V_g, Vg_save);
				gsl_matrix_memcpy (V_e, Ve_save);
				break;
			}

			if (logl_new-logl_old<max_prec) {
				break;
			}
		}

		logl_old=logl_new;

		CalcDev (func_name, eval, Qi, Hi_all, xHi_all, Hiy_all, QixHiy, gradient, Hessian_inv, crt_a, crt_b, crt_c);


		//output estimates in each iteration
		/*
		cout<<func_name<<" iteration = "<<t<<" log-likelihood = "<<logl_old<<"\t"<<logl_new<<endl;

		cout<<"Vg: "<<endl;
		for (size_t i=0; i<d_size; i++) {
			for (size_t j=0; j<d_size; j++) {
				cout<<gsl_matrix_get(V_g, i, j)<<"\t";
			}
			cout<<endl;
		}
		cout<<"Ve: "<<endl;
		for (size_t i=0; i<d_size; i++) {
			for (size_t j=0; j<d_size; j++) {
				cout<<gsl_matrix_get(V_e, i, j)<<"\t";
			}
			cout<<endl;
		}
		cout<<"Hessian: "<<endl;
		for (size_t i=0; i<Hessian_inv->size1; i++) {
			for (size_t j=0; j<Hessian_inv->size2; j++) {
				cout<<gsl_matrix_get(Hessian_inv, i, j)<<"\t";
			}
			cout<<endl;
		}
		*/
	}

	//mutiply Hessian_inv with -1.0
	//now Hessian_inv is the variance matrix
	gsl_matrix_scale (Hessian_inv, -1.0);

	gsl_matrix_free(Vg_save);
	gsl_matrix_free(Ve_save);
	gsl_matrix_free(V_temp);
	gsl_matrix_free(U_temp);
	gsl_vector_free(D_temp);
	gsl_vector_free(xHiy);
	gsl_vector_free(QixHiy);

	gsl_matrix_free(Qi);
	gsl_matrix_free(XXt);

	gsl_vector_free(gradient);

	return logl_new;
}





//initialize Vg, Ve and B
void MphInitial(const size_t em_iter, const double em_prec, const size_t nr_iter, const double nr_prec, const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B)
{
	gsl_matrix_set_zero (V_g);
	gsl_matrix_set_zero (V_e);
	gsl_matrix_set_zero (B);

	size_t n_size=eval->size, c_size=X->size1, d_size=Y->size1;
	double a, b, c;
	double lambda, logl, vg, ve;

	//Initial the diagonal elements of Vg and Ve using univariate LMM and REML estimates
	gsl_matrix *Xt=gsl_matrix_alloc (n_size, c_size);
	gsl_vector *beta_temp=gsl_vector_alloc(c_size);
	gsl_vector *se_beta_temp=gsl_vector_alloc(c_size);

	gsl_matrix_transpose_memcpy (Xt, X);

	for (size_t i=0; i<d_size; i++) {
		gsl_vector_const_view Y_row=gsl_matrix_const_row (Y, i);
		CalcLambda ('R', eval, Xt, &Y_row.vector, l_min, l_max, n_region, lambda, logl);
		CalcLmmVgVeBeta (eval, Xt, &Y_row.vector, lambda, vg, ve, beta_temp, se_beta_temp);

		gsl_matrix_set(V_g, i, i, vg);
		gsl_matrix_set(V_e, i, i, ve);
	}

	gsl_matrix_free (Xt);
	gsl_vector_free (beta_temp);
	gsl_vector_free (se_beta_temp);

	//if number of phenotypes is above four, then obtain the off diagonal elements with two trait models
	if (d_size>4) {
		//first obtain good initial values
		//large matrices for EM
		gsl_matrix *U_hat=gsl_matrix_alloc (2, n_size);
		gsl_matrix *E_hat=gsl_matrix_alloc (2, n_size);
		gsl_matrix *OmegaU=gsl_matrix_alloc (2, n_size);
		gsl_matrix *OmegaE=gsl_matrix_alloc (2, n_size);
		gsl_matrix *UltVehiY=gsl_matrix_alloc (2, n_size);
		gsl_matrix *UltVehiBX=gsl_matrix_alloc (2, n_size);
		gsl_matrix *UltVehiU=gsl_matrix_alloc (2, n_size);
		gsl_matrix *UltVehiE=gsl_matrix_alloc (2, n_size);

		//large matrices for NR
		gsl_matrix *Hi_all=gsl_matrix_alloc (2, 2*n_size);		//each dxd block is H_k^{-1}
		gsl_matrix *Hiy_all=gsl_matrix_alloc (2, n_size);				//each column is H_k^{-1}y_k
		gsl_matrix *xHi_all=gsl_matrix_alloc (2*c_size, 2*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
		gsl_matrix *Hessian=gsl_matrix_alloc (6, 6);

		//2 by n matrix of Y
		gsl_matrix *Y_sub=gsl_matrix_alloc (2, n_size);
		gsl_matrix *Vg_sub=gsl_matrix_alloc (2, 2);
		gsl_matrix *Ve_sub=gsl_matrix_alloc (2, 2);
		gsl_matrix *B_sub=gsl_matrix_alloc (2, c_size);

		for (size_t i=0; i<d_size; i++) {
			gsl_vector_view Y_sub1=gsl_matrix_row (Y_sub, 0);
			gsl_vector_const_view Y_1=gsl_matrix_const_row (Y, i);
			gsl_vector_memcpy (&Y_sub1.vector, &Y_1.vector);

			for (size_t j=i+1; j<d_size; j++) {
				gsl_vector_view Y_sub2=gsl_matrix_row (Y_sub, 1);
				gsl_vector_const_view Y_2=gsl_matrix_const_row (Y, j);
				gsl_vector_memcpy (&Y_sub2.vector, &Y_2.vector);

				gsl_matrix_set_zero (Vg_sub);
				gsl_matrix_set_zero (Ve_sub);
				gsl_matrix_set (Vg_sub, 0, 0, gsl_matrix_get (V_g, i, i));
				gsl_matrix_set (Ve_sub, 0, 0, gsl_matrix_get (V_e, i, i));
				gsl_matrix_set (Vg_sub, 1, 1, gsl_matrix_get (V_g, j, j));
				gsl_matrix_set (Ve_sub, 1, 1, gsl_matrix_get (V_e, j, j));

				logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub);
				logl=MphNR ('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all, Hiy_all, Vg_sub, Ve_sub, Hessian, a, b, c);

				gsl_matrix_set(V_g, i, j, gsl_matrix_get (Vg_sub, 0, 1));
				gsl_matrix_set(V_g, j, i, gsl_matrix_get (Vg_sub, 0, 1));

				gsl_matrix_set(V_e, i, j, ve=gsl_matrix_get (Ve_sub, 0, 1));
				gsl_matrix_set(V_e, j, i, ve=gsl_matrix_get (Ve_sub, 0, 1));
			}
		}

		//free matrices
		gsl_matrix_free(U_hat);
		gsl_matrix_free(E_hat);
		gsl_matrix_free(OmegaU);
		gsl_matrix_free(OmegaE);
		gsl_matrix_free(UltVehiY);
		gsl_matrix_free(UltVehiBX);
		gsl_matrix_free(UltVehiU);
		gsl_matrix_free(UltVehiE);

		gsl_matrix_free(Hi_all);
		gsl_matrix_free(Hiy_all);
		gsl_matrix_free(xHi_all);
		gsl_matrix_free(Hessian);

		gsl_matrix_free(Y_sub);
		gsl_matrix_free(Vg_sub);
		gsl_matrix_free(Ve_sub);
		gsl_matrix_free(B_sub);

		/*
		//second, maximize a increasingly large matrix
		for (size_t i=1; i<d_size; i++) {
			//large matrices for EM
			gsl_matrix *U_hat=gsl_matrix_alloc (i+1, n_size);
			gsl_matrix *E_hat=gsl_matrix_alloc (i+1, n_size);
			gsl_matrix *OmegaU=gsl_matrix_alloc (i+1, n_size);
			gsl_matrix *OmegaE=gsl_matrix_alloc (i+1, n_size);
			gsl_matrix *UltVehiY=gsl_matrix_alloc (i+1, n_size);
			gsl_matrix *UltVehiBX=gsl_matrix_alloc (i+1, n_size);
			gsl_matrix *UltVehiU=gsl_matrix_alloc (i+1, n_size);
			gsl_matrix *UltVehiE=gsl_matrix_alloc (i+1, n_size);

			//large matrices for NR
			gsl_matrix *Hi_all=gsl_matrix_alloc (i+1, (i+1)*n_size);		//each dxd block is H_k^{-1}
			gsl_matrix *Hiy_all=gsl_matrix_alloc (i+1, n_size);				//each column is H_k^{-1}y_k
			gsl_matrix *xHi_all=gsl_matrix_alloc ((i+1)*c_size, (i+1)*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
			gsl_matrix *Hessian=gsl_matrix_alloc ((i+1)*(i+2), (i+1)*(i+2));

			//(i+1) by n matrix of Y
			gsl_matrix *Y_sub=gsl_matrix_alloc (i+1, n_size);
			gsl_matrix *Vg_sub=gsl_matrix_alloc (i+1, i+1);
			gsl_matrix *Ve_sub=gsl_matrix_alloc (i+1, i+1);
			gsl_matrix *B_sub=gsl_matrix_alloc (i+1, c_size);

			gsl_matrix_const_view Y_sub_view=gsl_matrix_const_submatrix (Y, 0, 0, i+1, n_size);
			gsl_matrix_view Vg_sub_view=gsl_matrix_submatrix (V_g, 0, 0, i+1, i+1);
			gsl_matrix_view Ve_sub_view=gsl_matrix_submatrix (V_e, 0, 0, i+1, i+1);

			gsl_matrix_memcpy (Y_sub, &Y_sub_view.matrix);
			gsl_matrix_memcpy (Vg_sub, &Vg_sub_view.matrix);
			gsl_matrix_memcpy (Ve_sub, &Ve_sub_view.matrix);

			logl=MphEM ('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, Vg_sub, Ve_sub, B_sub);
			logl=MphNR ('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all, Hiy_all, Vg_sub, Ve_sub, Hessian, crt_a, crt_b, crt_c);

			gsl_matrix_memcpy (&Vg_sub_view.matrix, Vg_sub);
			gsl_matrix_memcpy (&Ve_sub_view.matrix, Ve_sub);

			//free matrices
			gsl_matrix_free(U_hat);
			gsl_matrix_free(E_hat);
			gsl_matrix_free(OmegaU);
			gsl_matrix_free(OmegaE);
			gsl_matrix_free(UltVehiY);
			gsl_matrix_free(UltVehiBX);
			gsl_matrix_free(UltVehiU);
			gsl_matrix_free(UltVehiE);

			gsl_matrix_free(Hi_all);
			gsl_matrix_free(Hiy_all);
			gsl_matrix_free(xHi_all);
			gsl_matrix_free(Hessian);

			gsl_matrix_free(Y_sub);
			gsl_matrix_free(Vg_sub);
			gsl_matrix_free(Ve_sub);
			gsl_matrix_free(B_sub);
		}
		 */
	}

	//calculate B hat using GSL estimate
	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);

	gsl_vector *D_l=gsl_vector_alloc (d_size);
	gsl_matrix *UltVeh=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *UltVehi=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *Qi=gsl_matrix_alloc (d_size*c_size, d_size*c_size);
	gsl_vector *XHiy=gsl_vector_alloc (d_size*c_size);
	gsl_vector *beta=gsl_vector_alloc (d_size*c_size);

	gsl_vector_set_zero (XHiy);

	double logdet_Ve, logdet_Q, dl, d, delta, dx, dy;

	//eigen decomposition and calculate log|Ve|
	logdet_Ve=EigenProc (V_g, V_e, D_l, UltVeh, UltVehi);

	//calculate Qi and log|Q|
	logdet_Q=CalcQi (eval, D_l, X, Qi);

	//calculate UltVehiY
	gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);

	//calculate XHiy
	for (size_t i=0; i<d_size; i++) {
		dl=gsl_vector_get(D_l, i);

		for (size_t j=0; j<c_size; j++) {
			d=0.0;
			for (size_t k=0; k<n_size; k++) {
				delta=gsl_vector_get(eval, k);
				dx=gsl_matrix_get(X, j, k);
				dy=gsl_matrix_get(UltVehiY, i, k);

				//if (delta==0) {continue;}
				d+=dy*dx/(delta*dl+1.0);
			}
			gsl_vector_set(XHiy, j*d_size+i, d);
		}
	}

	gsl_blas_dgemv(CblasNoTrans, 1.0, Qi, XHiy, 0.0, beta);

	//multiply beta by UltVeh and save to B
	for (size_t i=0; i<c_size; i++) {
		gsl_vector_view B_col=gsl_matrix_column (B, i);
		gsl_vector_view beta_sub=gsl_vector_subvector (beta, i*d_size, d_size);
		gsl_blas_dgemv(CblasTrans, 1.0, UltVeh, &beta_sub.vector, 0.0, &B_col.vector);
	}

	//free memory
	gsl_matrix_free(UltVehiY);

	gsl_vector_free(D_l);
	gsl_matrix_free(UltVeh);
	gsl_matrix_free(UltVehi);
	gsl_matrix_free(Qi);
	gsl_vector_free(XHiy);
	gsl_vector_free(beta);

	return;
}



//p value correction
//mode=1 Wald; mode=2 LRT; mode=3 SCORE;
double PCRT (const size_t mode, const size_t d_size, const double p_value, const double crt_a, const double crt_b, const double crt_c)
{
	double p_crt=0.0, chisq_crt=0.0, q=(double)d_size;
	double chisq=gsl_cdf_chisq_Qinv(p_value, (double)d_size );

	if (mode==1) {
		double a=crt_c/(2.0*q*(q+2.0));
		double b=1.0+(crt_a+crt_b)/(2.0*q);
		chisq_crt=(-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a);
	} else if (mode==2) {
		chisq_crt=chisq/(1.0+crt_a/(2.0*q) );
	} else {
		/*
		double a=-1.0*crt_c/(2.0*q*(q+2.0));
		double b=1.0+(crt_a-crt_b)/(2.0*q);
		chisq_crt=(-1.0*b+sqrt(b*b+4.0*a*chisq))/(2.0*a);
		*/
		chisq_crt=chisq;
	}

	p_crt=gsl_cdf_chisq_Q (chisq_crt, (double)d_size );

	//cout<<crt_a<<"\t"<<crt_b<<"\t"<<crt_c<<endl;
	//cout<<setprecision(10)<<p_value<<"\t"<<p_crt<<endl;

	return p_crt;
}

// WJA added
#include <assert.h>
void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY)
{
	string file_bgen=file_oxford+".bgen";
	ifstream infile (file_bgen.c_str(), ios::binary);
	if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return;}

	clock_t time_start=clock();
	time_UtX=0; time_opt=0;

	string line;

	//create a large matrix
	size_t msize=10000;
	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
	gsl_matrix_set_zero(Xlarge);

	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
	double crt_a, crt_b, crt_c;
	int n_miss, c_phen;
	double geno, x_mean;
	size_t c=0;
	//	double s=0.0;
	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;

	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;

	//large matrices for EM
	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);

	//large matrices for NR
	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);

	gsl_vector *x=gsl_vector_alloc (n_size);
	gsl_vector *x_miss=gsl_vector_alloc (n_size);

	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);
	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
	gsl_vector *beta=gsl_vector_alloc (d_size);
	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);

	//null estimates for initial values
	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);
	gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size);

	gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);
	gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
	gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);

	gsl_matrix_transpose_memcpy (Y, UtY);

	gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW);

	gsl_vector_view X_row=gsl_matrix_row(X, c_size);
	gsl_vector_set_zero(&X_row.vector);
	gsl_vector_view B_col=gsl_matrix_column(B, c_size);
	gsl_vector_set_zero(&B_col.vector);

	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix);
	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);

	c=0;
	Vg_remle_null.clear();
	Ve_remle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_remle_null.clear();
	se_beta_remle_null.clear();
	for (size_t i=0; i<se_B_null->size1; i++) {
		for (size_t j=0; j<se_B_null->size2; j++) {
			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
		}
	}
	logl_remle_H0=logl_H0;

	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
	cout.precision(4);

	cout<<"REMLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE likelihood = "<<logl_H0<<endl;


	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);

	c=0;
	Vg_mle_null.clear();
	Ve_mle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_mle_null.clear();
	se_beta_mle_null.clear();
	for (size_t i=0; i<se_B_null->size1; i++) {
		for (size_t j=0; j<se_B_null->size2; j++) {
			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
		}
	}
	logl_mle_H0=logl_H0;

	cout<<"MLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE likelihood = "<<logl_H0<<endl;


	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
	for (size_t i=0; i<d_size; i++) {
		v_beta.push_back(0.0);
	}
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			v_Vg.push_back(0.0);
			v_Ve.push_back(0.0);
			v_Vbeta.push_back(0.0);
		}
	}

	gsl_matrix_memcpy (V_g_null, V_g);
	gsl_matrix_memcpy (V_e_null, V_e);
	gsl_matrix_memcpy (B_null, B);

	// read in header
	uint32_t bgen_snp_block_offset;
	uint32_t bgen_header_length;
	uint32_t bgen_nsamples;
	uint32_t bgen_nsnps;
	uint32_t bgen_flags;
	infile.read(reinterpret_cast<char*>(&bgen_snp_block_offset),4);
	infile.read(reinterpret_cast<char*>(&bgen_header_length),4);
	bgen_snp_block_offset-=4;
	infile.read(reinterpret_cast<char*>(&bgen_nsnps),4);
	bgen_snp_block_offset-=4;
	infile.read(reinterpret_cast<char*>(&bgen_nsamples),4);
	bgen_snp_block_offset-=4;
	infile.ignore(4+bgen_header_length-20);
	bgen_snp_block_offset-=4+bgen_header_length-20;
	infile.read(reinterpret_cast<char*>(&bgen_flags),4);
	bgen_snp_block_offset-=4;
	bool CompressedSNPBlocks=bgen_flags&0x1;
//	bool LongIds=bgen_flags&0x4;

	infile.ignore(bgen_snp_block_offset);

	double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss;

	uint32_t bgen_N;
	uint16_t bgen_LS;
	uint16_t bgen_LR;
	uint16_t bgen_LC;
	uint32_t bgen_SNP_pos;
	uint32_t bgen_LA;
	std::string bgen_A_allele;
	uint32_t bgen_LB;
	std::string bgen_B_allele;
	uint32_t bgen_P;
	size_t unzipped_data_size;
	string id;
	string rs;
	string chr;
	std::cout<<"Warning: WJA hard coded SNP missingness threshold of 10%"<<std::endl;



	//start reading genotypes and analyze
	size_t csnp=0, t_last=0;
	for (size_t t=0; t<indicator_snp.size(); ++t) {
	  if (indicator_snp[t]==0) {continue;}
	  t_last++;
	}
	for (size_t t=0; t<indicator_snp.size(); ++t) {


//		if (t>1) {break;}
		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
		if (indicator_snp[t]==0) {continue;}
		// read SNP header
		id.clear();
		rs.clear();
		chr.clear();
		bgen_A_allele.clear();
		bgen_B_allele.clear();

		infile.read(reinterpret_cast<char*>(&bgen_N),4);
		infile.read(reinterpret_cast<char*>(&bgen_LS),2);

		id.resize(bgen_LS);
		infile.read(&id[0], bgen_LS);

		infile.read(reinterpret_cast<char*>(&bgen_LR),2);
		rs.resize(bgen_LR);
		infile.read(&rs[0], bgen_LR);

		infile.read(reinterpret_cast<char*>(&bgen_LC),2);
		chr.resize(bgen_LC);
		infile.read(&chr[0], bgen_LC);

		infile.read(reinterpret_cast<char*>(&bgen_SNP_pos),4);

		infile.read(reinterpret_cast<char*>(&bgen_LA),4);
		bgen_A_allele.resize(bgen_LA);
		infile.read(&bgen_A_allele[0], bgen_LA);


		infile.read(reinterpret_cast<char*>(&bgen_LB),4);
		bgen_B_allele.resize(bgen_LB);
		infile.read(&bgen_B_allele[0], bgen_LB);




		uint16_t unzipped_data[3*bgen_N];

		if (indicator_snp[t]==0) {
			if(CompressedSNPBlocks)
				infile.read(reinterpret_cast<char*>(&bgen_P),4);
			else
				bgen_P=6*bgen_N;

			infile.ignore(static_cast<size_t>(bgen_P));

			continue;
		}


		if(CompressedSNPBlocks)
		{


			infile.read(reinterpret_cast<char*>(&bgen_P),4);
			uint8_t zipped_data[bgen_P];

			unzipped_data_size=6*bgen_N;

			infile.read(reinterpret_cast<char*>(zipped_data),bgen_P);

			int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P));
			assert(result == Z_OK);

		}
		else
		{

			bgen_P=6*bgen_N;
			infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P);
		}

		x_mean=0.0; c_phen=0; n_miss=0;
		gsl_vector_set_zero(x_miss);
		for (size_t i=0; i<bgen_N; ++i) {
			if (indicator_idv[i]==0) {continue;}


				bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0;
				bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0;
				bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0;
				// WJA
				bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB;
				if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
				else {

					bgen_geno_prob_AA/=bgen_geno_prob_non_miss;
					bgen_geno_prob_AB/=bgen_geno_prob_non_miss;
					bgen_geno_prob_BB/=bgen_geno_prob_non_miss;

					geno=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB;

					gsl_vector_set(x, c_phen, geno);
					gsl_vector_set(x_miss, c_phen, 1.0);
					x_mean+=geno;
			}
			c_phen++;
		}

		x_mean/=static_cast<double>(ni_test-n_miss);

		for (size_t i=0; i<ni_test; ++i) {
			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
			//geno=gsl_vector_get(x, i);
			//if (x_mean>1) {
			//gsl_vector_set(x, i, 2-geno);
			//}
		}

		/*
		//calculate statistics
		time_start=clock();
		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
		*/

		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize);
		gsl_vector_memcpy (&Xlarge_col.vector, x);
		csnp++;

		if (csnp%msize==0 || c==t_last ) {
		  size_t l=0;
		  if (csnp%msize==0) {l=msize;} else {l=csnp%msize;}

		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);

		  time_start=clock();
		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		  gsl_matrix_set_zero (Xlarge);

		  for (size_t i=0; i<l; i++) {
		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
		    gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector);

		    //initial values
		    gsl_matrix_memcpy (V_g, V_g_null);
		    gsl_matrix_memcpy (V_e, V_e_null);
		    gsl_matrix_memcpy (B, B_null);

		    time_start=clock();

		    //3 is before 1
		    if (a_mode==3 || a_mode==4) {
		      p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
		      if (p_score<p_nr && crt==1) {
			logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
			p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
		      }
		    }

		    if (a_mode==2 || a_mode==4) {
		      logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
		      //calculate beta and Vbeta
		      p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );

		      if (p_lrt<p_nr) {
			logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
			//calculate beta and Vbeta
			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );

			if (crt==1) {
			  p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
			}
		      }
		    }

		    if (a_mode==1 || a_mode==4) {
		      logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
		      p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

		      if (p_wald<p_nr) {
			logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

			if (crt==1) {
			  p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
			}
		      }
		    }

		    //if (x_mean>1) {gsl_vector_scale(beta, -1.0);}

		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		    //store summary data
		    //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
		    for (size_t i=0; i<d_size; i++) {
		      v_beta[i]=gsl_vector_get (beta, i);
		    }

		    c=0;
		    for (size_t i=0; i<d_size; i++) {
		      for (size_t j=i; j<d_size; j++) {
			v_Vg[c]=gsl_matrix_get (V_g, i, j);
			v_Ve[c]=gsl_matrix_get (V_e, i, j);
			v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
			c++;
		      }
		    }

		    MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
		    sumStat.push_back(SNPs);
		  }
		}
	}
	cout<<endl;


	infile.close();
	infile.clear();

	gsl_matrix_free(U_hat);
	gsl_matrix_free(E_hat);
	gsl_matrix_free(OmegaU);
	gsl_matrix_free(OmegaE);
	gsl_matrix_free(UltVehiY);
	gsl_matrix_free(UltVehiBX);
	gsl_matrix_free(UltVehiU);
	gsl_matrix_free(UltVehiE);

	gsl_matrix_free(Hi_all);
	gsl_matrix_free(Hiy_all);
	gsl_matrix_free(xHi_all);
	gsl_matrix_free(Hessian);

	gsl_vector_free(x);
	gsl_vector_free(x_miss);

	gsl_matrix_free(Y);
	gsl_matrix_free(X);
	gsl_matrix_free(V_g);
	gsl_matrix_free(V_e);
	gsl_matrix_free(B);
	gsl_vector_free(beta);
	gsl_matrix_free(Vbeta);

	gsl_matrix_free(V_g_null);
	gsl_matrix_free(V_e_null);
	gsl_matrix_free(B_null);
	gsl_matrix_free(se_B_null);

	gsl_matrix_free(Xlarge);
	gsl_matrix_free(UtXlarge);

	return;
}

void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY)
{
	igzstream infile (file_geno.c_str(), igzstream::in);
//	ifstream infile (file_geno.c_str(), ifstream::in);
	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}

	clock_t time_start=clock();
	time_UtX=0; time_opt=0;

	string line;
	char *ch_ptr;

	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
	double crt_a, crt_b, crt_c;
	int n_miss, c_phen;
	double geno, x_mean;
	size_t c=0;
	//	double s=0.0;
	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;

	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;

	//create a large matrix
	size_t msize=10000;
	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
	gsl_matrix_set_zero(Xlarge);

	//large matrices for EM
	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);

	//large matrices for NR
	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);

	gsl_vector *x=gsl_vector_alloc (n_size);
	gsl_vector *x_miss=gsl_vector_alloc (n_size);

	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);
	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
	gsl_vector *beta=gsl_vector_alloc (d_size);
	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);

	//null estimates for initial values
	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);
	gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size);

	gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);
	gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
	gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);

	gsl_matrix_transpose_memcpy (Y, UtY);

	gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW);

	gsl_vector_view X_row=gsl_matrix_row(X, c_size);
	gsl_vector_set_zero(&X_row.vector);
	gsl_vector_view B_col=gsl_matrix_column(B, c_size);
	gsl_vector_set_zero(&B_col.vector);

	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix);
	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);

	c=0;
	Vg_remle_null.clear();
	Ve_remle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_remle_null.clear();
	se_beta_remle_null.clear();
	for (size_t i=0; i<se_B_null->size1; i++) {
		for (size_t j=0; j<se_B_null->size2; j++) {
			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
		}
	}
	logl_remle_H0=logl_H0;

	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
	cout.precision(4);

	cout<<"REMLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE likelihood = "<<logl_H0<<endl;


	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);

	c=0;
	Vg_mle_null.clear();
	Ve_mle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_mle_null.clear();
	se_beta_mle_null.clear();
	for (size_t i=0; i<se_B_null->size1; i++) {
		for (size_t j=0; j<se_B_null->size2; j++) {
			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
		}
	}
	logl_mle_H0=logl_H0;

	cout<<"MLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE likelihood = "<<logl_H0<<endl;


	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
	for (size_t i=0; i<d_size; i++) {
		v_beta.push_back(0.0);
	}
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			v_Vg.push_back(0.0);
			v_Ve.push_back(0.0);
			v_Vbeta.push_back(0.0);
		}
	}

	gsl_matrix_memcpy (V_g_null, V_g);
	gsl_matrix_memcpy (V_e_null, V_e);
	gsl_matrix_memcpy (B_null, B);

	//start reading genotypes and analyze
	size_t csnp=0, t_last=0;
	for (size_t t=0; t<indicator_snp.size(); ++t) {
	  if (indicator_snp[t]==0) {continue;}
	  t_last++;
	}
	for (size_t t=0; t<indicator_snp.size(); ++t) {
		//if (t>=1) {break;}
		!safeGetline(infile, line).eof();
		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
		if (indicator_snp[t]==0) {continue;}

		ch_ptr=strtok ((char *)line.c_str(), " , \t");
		ch_ptr=strtok (NULL, " , \t");
		ch_ptr=strtok (NULL, " , \t");

		x_mean=0.0; c_phen=0; n_miss=0;
		gsl_vector_set_zero(x_miss);
		for (size_t i=0; i<ni_total; ++i) {
			ch_ptr=strtok (NULL, " , \t");
			if (indicator_idv[i]==0) {continue;}

			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
			else {
				geno=atof(ch_ptr);

				gsl_vector_set(x, c_phen, geno);
				gsl_vector_set(x_miss, c_phen, 1.0);
				x_mean+=geno;
			}
			c_phen++;
		}

		x_mean/=(double)(ni_test-n_miss);

		for (size_t i=0; i<ni_test; ++i) {
			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
			geno=gsl_vector_get(x, i);
			//if (x_mean>1) {
			//	gsl_vector_set(x, i, 2-geno);
			//}
		}

		/*
		//calculate statistics
		time_start=clock();
		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
		*/

		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize);
		gsl_vector_memcpy (&Xlarge_col.vector, x);
		csnp++;

		if (csnp%msize==0 || c==t_last ) {
		  size_t l=0;
		  if (csnp%msize==0) {l=msize;} else {l=csnp%msize;}

		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);

		  time_start=clock();
		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		  gsl_matrix_set_zero (Xlarge);

		  for (size_t i=0; i<l; i++) {
		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
		    gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector);

		    //initial values
		    gsl_matrix_memcpy (V_g, V_g_null);
		    gsl_matrix_memcpy (V_e, V_e_null);
		    gsl_matrix_memcpy (B, B_null);

		    time_start=clock();

		    //3 is before 1
		    if (a_mode==3 || a_mode==4) {
		      p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
		      if (p_score<p_nr && crt==1) {
			logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
			p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
		      }
		    }

		    if (a_mode==2 || a_mode==4) {
		      logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
		      //calculate beta and Vbeta
		      p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );

		      if (p_lrt<p_nr) {
			logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
			//calculate beta and Vbeta
			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );

			if (crt==1) {
			  p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
			}
		      }
		    }

		    if (a_mode==1 || a_mode==4) {
		      logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
		      p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

		      if (p_wald<p_nr) {
			logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

			if (crt==1) {
			  p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
			}
		      }
		    }

		    //if (x_mean>1) {gsl_vector_scale(beta, -1.0);}

		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		    //store summary data
		    //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
		    for (size_t i=0; i<d_size; i++) {
		      v_beta[i]=gsl_vector_get (beta, i);
		    }

		    c=0;
		    for (size_t i=0; i<d_size; i++) {
		      for (size_t j=i; j<d_size; j++) {
			v_Vg[c]=gsl_matrix_get (V_g, i, j);
			v_Ve[c]=gsl_matrix_get (V_e, i, j);
			v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
			c++;
		      }
		    }

		    MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
		    sumStat.push_back(SNPs);
		  }
		}
    }
	cout<<endl;


	infile.close();
	infile.clear();

	gsl_matrix_free(U_hat);
	gsl_matrix_free(E_hat);
	gsl_matrix_free(OmegaU);
	gsl_matrix_free(OmegaE);
	gsl_matrix_free(UltVehiY);
	gsl_matrix_free(UltVehiBX);
	gsl_matrix_free(UltVehiU);
	gsl_matrix_free(UltVehiE);

	gsl_matrix_free(Hi_all);
	gsl_matrix_free(Hiy_all);
	gsl_matrix_free(xHi_all);
	gsl_matrix_free(Hessian);

	gsl_vector_free(x);
	gsl_vector_free(x_miss);

	gsl_matrix_free(Y);
	gsl_matrix_free(X);
	gsl_matrix_free(V_g);
	gsl_matrix_free(V_e);
	gsl_matrix_free(B);
	gsl_vector_free(beta);
	gsl_matrix_free(Vbeta);

	gsl_matrix_free(V_g_null);
	gsl_matrix_free(V_e_null);
	gsl_matrix_free(B_null);
	gsl_matrix_free(se_B_null);

	gsl_matrix_free(Xlarge);
	gsl_matrix_free(UtXlarge);

	return;
}







void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY)
{
	string file_bed=file_bfile+".bed";
	ifstream infile (file_bed.c_str(), ios::binary);
	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}

	clock_t time_start=clock();
	time_UtX=0; time_opt=0;

	char ch[1];
	bitset<8> b;

	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
	double crt_a, crt_b, crt_c;
	int n_bit, n_miss, ci_total, ci_test;
	double geno, x_mean;
	size_t c=0;
	//	double s=0.0;
	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;
	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;

	//create a large matrix
	size_t msize=10000;
	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
	gsl_matrix_set_zero(Xlarge);

	//large matrices for EM
	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);

	//large matrices for NR
	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);

	gsl_vector *x=gsl_vector_alloc (n_size);

	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);
	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
	gsl_vector *beta=gsl_vector_alloc (d_size);
	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);

	//null estimates for initial values
	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);
	gsl_matrix *se_B_null=gsl_matrix_alloc (d_size, c_size);

	gsl_matrix_view X_sub=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);
	gsl_matrix_view B_sub=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
	gsl_matrix_view xHi_all_sub=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);

	gsl_matrix_transpose_memcpy (Y, UtY);
	gsl_matrix_transpose_memcpy (&X_sub.matrix, UtW);

	gsl_vector_view X_row=gsl_matrix_row(X, c_size);
	gsl_vector_set_zero(&X_row.vector);
	gsl_vector_view B_col=gsl_matrix_column(B, c_size);
	gsl_vector_set_zero(&B_col.vector);

	//time_start=clock();
	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub.matrix);

	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
	//cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;

	c=0;
	Vg_remle_null.clear();
	Ve_remle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_remle_null.clear();
	se_beta_remle_null.clear();
	for (size_t i=0; i<se_B_null->size1; i++) {
		for (size_t j=0; j<se_B_null->size2; j++) {
			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
		}
	}
	logl_remle_H0=logl_H0;

	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
	cout.precision(4);
	cout<<"REMLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE likelihood = "<<logl_H0<<endl;

	//time_start=clock();
	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub.matrix);
	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, se_B_null);
	//cout<<"time for MLE in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;

	c=0;
	Vg_mle_null.clear();
	Ve_mle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_mle_null.clear();
	se_beta_mle_null.clear();
	for (size_t i=0; i<se_B_null->size1; i++) {
		for (size_t j=0; j<se_B_null->size2; j++) {
			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j) );
		}
	}
	logl_mle_H0=logl_H0;

	cout<<"MLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE likelihood = "<<logl_H0<<endl;

	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
	for (size_t i=0; i<d_size; i++) {
		v_beta.push_back(0.0);
	}
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			v_Vg.push_back(0.0);
			v_Ve.push_back(0.0);
			v_Vbeta.push_back(0.0);
		}
	}

	gsl_matrix_memcpy (V_g_null, V_g);
	gsl_matrix_memcpy (V_e_null, V_e);
	gsl_matrix_memcpy (B_null, B);


	//start reading genotypes and analyze

	//calculate n_bit and c, the number of bit for each snp
	if (ni_total%4==0) {n_bit=ni_total/4;}
	else {n_bit=ni_total/4+1; }

	//print the first three majic numbers
	for (int i=0; i<3; ++i) {
		infile.read(ch,1);
		b=ch[0];
	}

	size_t csnp=0, t_last=0;
	for (size_t t=0; t<indicator_snp.size(); ++t) {
	  if (indicator_snp[t]==0) {continue;}
	  t_last++;
	}
	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
		if (indicator_snp[t]==0) {continue;}

		//if (t>=0) {break;}
		//if (snpInfo[t].rs_number!="MAG18140902") {continue;}
		//cout<<t<<endl;

		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers

		//read genotypes
		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0;
		for (int i=0; i<n_bit; ++i) {
			infile.read(ch,1);
			b=ch[0];
			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}
				if (indicator_idv[ci_total]==0) {ci_total++; continue;}

				if (b[2*j]==0) {
					if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
					else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
				}
				else {
					if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }
					else {gsl_vector_set(x, ci_test, -9); n_miss++; }
				}

				ci_total++;
				ci_test++;
			}
		}

		x_mean/=(double)(ni_test-n_miss);

		for (size_t i=0; i<ni_test; ++i) {
			geno=gsl_vector_get(x,i);
			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
			//if (x_mean>1) {
			//	gsl_vector_set(x, i, 2-geno);
			//}
		}

		/*
		if (t==0) {
			ofstream outfile ("./snp1.txt", ofstream::out);
			if (!outfile) {cout<<"error writing file: "<<endl; return;}
			for (size_t i=0; i<x->size; i++) {
				outfile<<gsl_vector_get(x, i)<<endl;
			}
			outfile.clear();
			outfile.close();
		}
		*/

		/*
		//calculate statistics
		time_start=clock();
		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
		*/

		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize);
		gsl_vector_memcpy (&Xlarge_col.vector, x);
		csnp++;

		if (csnp%msize==0 || c==t_last ) {
		  size_t l=0;
		  if (csnp%msize==0) {l=msize;} else {l=csnp%msize;}

		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);

		  time_start=clock();
		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		  gsl_matrix_set_zero (Xlarge);

		  for (size_t i=0; i<l; i++) {
		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
		    gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector);

		    //initial values
		    gsl_matrix_memcpy (V_g, V_g_null);
		    gsl_matrix_memcpy (V_e, V_e_null);
		    gsl_matrix_memcpy (B, B_null);

		    time_start=clock();

		    //3 is before 1
		    if (a_mode==3 || a_mode==4) {
		      p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);

		      if (p_score<p_nr && crt==1) {
			logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
			p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
		      }
		    }

		    if (a_mode==2 || a_mode==4) {
		      logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
		      //calculate beta and Vbeta
		      p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );

		      if (p_lrt<p_nr) {
			logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);

			//calculate beta and Vbeta
			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
			if (crt==1) {
			  p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
			}
		      }
		    }

		    if (a_mode==1 || a_mode==4) {
		      logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
		      p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

		      if (p_wald<p_nr) {
			logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

			if (crt==1) {
			  p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
			}
		      }
		    }

		    //cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl;

		    //if (x_mean>1) {gsl_vector_scale(beta, -1.0);}

		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		    //store summary data
		    //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
		    for (size_t i=0; i<d_size; i++) {
		      v_beta[i]=gsl_vector_get (beta, i);
		    }

		    c=0;
		    for (size_t i=0; i<d_size; i++) {
		      for (size_t j=i; j<d_size; j++) {
			v_Vg[c]=gsl_matrix_get (V_g, i, j);
			v_Ve[c]=gsl_matrix_get (V_e, i, j);
			v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
			c++;
		      }
		    }

		    MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
		    sumStat.push_back(SNPs);
		  }
		}
	}
	cout<<endl;

	//cout<<"time_opt = "<<time_opt<<endl;

	infile.close();
	infile.clear();

	gsl_matrix_free(U_hat);
	gsl_matrix_free(E_hat);
	gsl_matrix_free(OmegaU);
	gsl_matrix_free(OmegaE);
	gsl_matrix_free(UltVehiY);
	gsl_matrix_free(UltVehiBX);
	gsl_matrix_free(UltVehiU);
	gsl_matrix_free(UltVehiE);

	gsl_matrix_free(Hi_all);
	gsl_matrix_free(Hiy_all);
	gsl_matrix_free(xHi_all);
	gsl_matrix_free(Hessian);

	gsl_vector_free(x);

	gsl_matrix_free(Y);
	gsl_matrix_free(X);
	gsl_matrix_free(V_g);
	gsl_matrix_free(V_e);
	gsl_matrix_free(B);
	gsl_vector_free(beta);
	gsl_matrix_free(Vbeta);

	gsl_matrix_free(V_g_null);
	gsl_matrix_free(V_e_null);
	gsl_matrix_free(B_null);
	gsl_matrix_free(se_B_null);

	gsl_matrix_free(Xlarge);
	gsl_matrix_free(UtXlarge);

	return;
}




//calculate Vg, Ve, B, se(B) in the null mvLMM model
//both B and se_B are d by c matrices
void CalcMvLmmVgVeBeta (const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const size_t em_iter, const size_t nr_iter, const double em_prec, const double nr_prec, const double l_min, const double l_max, const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *B, gsl_matrix *se_B)
{
	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;
	size_t dc_size=d_size*c_size, v_size=d_size*(d_size+1)/2;

	double logl, crt_a, crt_b, crt_c;

	//large matrices for EM
	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);

	//large matrices for NR
	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);

	//transpose matrices
	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *W=gsl_matrix_alloc (c_size, n_size);
	gsl_matrix_transpose_memcpy (Y, UtY);
	gsl_matrix_transpose_memcpy (W, UtW);

	//initial, EM, NR, and calculate B
	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, W, Y, l_min, l_max, n_region, V_g, V_e, B);
	logl=MphEM ('R', em_iter, em_prec, eval, W, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
	logl=MphNR ('R', nr_iter, nr_prec, eval, W, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, W, Y, V_g, V_e, UltVehiY, B, se_B);

	//free matrices
	gsl_matrix_free(U_hat);
	gsl_matrix_free(E_hat);
	gsl_matrix_free(OmegaU);
	gsl_matrix_free(OmegaE);
	gsl_matrix_free(UltVehiY);
	gsl_matrix_free(UltVehiBX);
	gsl_matrix_free(UltVehiU);
	gsl_matrix_free(UltVehiE);

	gsl_matrix_free(Hi_all);
	gsl_matrix_free(Hiy_all);
	gsl_matrix_free(xHi_all);
	gsl_matrix_free(Hessian);

	gsl_matrix_free(Y);
	gsl_matrix_free(W);

	return;
}





void MVLMM::AnalyzeBimbamGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const gsl_vector *env)
{
	igzstream infile (file_geno.c_str(), igzstream::in);
//	ifstream infile (file_geno.c_str(), ifstream::in);
	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}

	clock_t time_start=clock();
	time_UtX=0; time_opt=0;

	string line;
	char *ch_ptr;

	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
	double crt_a, crt_b, crt_c;
	int n_miss, c_phen;
	double geno, x_mean;
	size_t c=0;
	//	double s=0.0;
	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2+2;
	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;

	//large matrices for EM
	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);

	//large matrices for NR
	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);	//each dxd block is H_k^{-1}
	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);		//each column is H_k^{-1}y_k
	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);  //each dcxdc block is x_k\otimes H_k^{-1}
	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);

	gsl_vector *x=gsl_vector_alloc (n_size);
	gsl_vector *x_miss=gsl_vector_alloc (n_size);

	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);
	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
	gsl_vector *beta=gsl_vector_alloc (d_size);
	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);

	//null estimates for initial values; including env but not including x
	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);
	gsl_matrix *se_B_null1=gsl_matrix_alloc (d_size, c_size-1);
	gsl_matrix *se_B_null2=gsl_matrix_alloc (d_size, c_size);

	gsl_matrix_view X_sub1=gsl_matrix_submatrix (X, 0, 0, c_size-1, n_size);
	gsl_matrix_view B_sub1=gsl_matrix_submatrix (B, 0, 0, d_size, c_size-1);
	gsl_matrix_view xHi_all_sub1=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*(c_size-1), d_size*n_size);

	gsl_matrix_view X_sub2=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);
	gsl_matrix_view B_sub2=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
	gsl_matrix_view xHi_all_sub2=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);

	gsl_matrix_transpose_memcpy (Y, UtY);

	gsl_matrix_view X_sub0=gsl_matrix_submatrix (X, 0, 0, c_size-2, n_size);
	gsl_matrix_transpose_memcpy (&X_sub0.matrix, UtW);
	gsl_vector_view X_row0=gsl_matrix_row(X, c_size-2);
	gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &X_row0.vector);

	gsl_vector_view X_row1=gsl_matrix_row(X, c_size-1);
	gsl_vector_set_zero(&X_row1.vector);
	gsl_vector_view X_row2=gsl_matrix_row(X, c_size);
	gsl_vector_set_zero(&X_row2.vector);

	gsl_vector_view B_col1=gsl_matrix_column(B, c_size-1);
	gsl_vector_set_zero(&B_col1.vector);
	gsl_vector_view B_col2=gsl_matrix_column(B, c_size);
	gsl_vector_set_zero(&B_col2.vector);

	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub1.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub1.matrix);
	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix);
	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1);

	c=0;
	Vg_remle_null.clear();
	Ve_remle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_remle_null.clear();
	se_beta_remle_null.clear();
	for (size_t i=0; i<se_B_null1->size1; i++) {
		for (size_t j=0; j<se_B_null1->size2; j++) {
			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null1, i, j) );
		}
	}
	logl_remle_H0=logl_H0;

	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
	cout.precision(4);

	cout<<"REMLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE likelihood = "<<logl_H0<<endl;


	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix);
	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1);

	c=0;
	Vg_mle_null.clear();
	Ve_mle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_mle_null.clear();
	se_beta_mle_null.clear();
	for (size_t i=0; i<se_B_null1->size1; i++) {
		for (size_t j=0; j<se_B_null1->size2; j++) {
			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null1, i, j) );
		}
	}
	logl_mle_H0=logl_H0;

	cout<<"MLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE likelihood = "<<logl_H0<<endl;


	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
	for (size_t i=0; i<d_size; i++) {
		v_beta.push_back(0.0);
	}
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			v_Vg.push_back(0.0);
			v_Ve.push_back(0.0);
			v_Vbeta.push_back(0.0);
		}
	}

	gsl_matrix_memcpy (V_g_null, V_g);
	gsl_matrix_memcpy (V_e_null, V_e);
	gsl_matrix_memcpy (B_null, B);

	//start reading genotypes and analyze
	for (size_t t=0; t<indicator_snp.size(); ++t) {
		//if (t>=1) {break;}
		!safeGetline(infile, line).eof();
		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
		if (indicator_snp[t]==0) {continue;}

		ch_ptr=strtok ((char *)line.c_str(), " , \t");
		ch_ptr=strtok (NULL, " , \t");
		ch_ptr=strtok (NULL, " , \t");

		x_mean=0.0; c_phen=0; n_miss=0;
		gsl_vector_set_zero(x_miss);
		for (size_t i=0; i<ni_total; ++i) {
			ch_ptr=strtok (NULL, " , \t");
			if (indicator_idv[i]==0) {continue;}

			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
			else {
				geno=atof(ch_ptr);

				gsl_vector_set(x, c_phen, geno);
				gsl_vector_set(x_miss, c_phen, 1.0);
				x_mean+=geno;
			}
			c_phen++;
		}

		x_mean/=(double)(ni_test-n_miss);

		for (size_t i=0; i<ni_test; ++i) {
			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
			geno=gsl_vector_get(x, i);
			if (x_mean>1) {
				gsl_vector_set(x, i, 2-geno);
			}
		}

		//calculate statistics
		time_start=clock();
		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row1.vector);
		gsl_vector_mul (x, env);
		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row2.vector);
		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		//initial values
		gsl_matrix_memcpy (V_g, V_g_null);
		gsl_matrix_memcpy (V_e, V_e_null);
		gsl_matrix_memcpy (B, B_null);

		if (a_mode==2 || a_mode==3 || a_mode==4) {
		  if (a_mode==3 || a_mode==4) {
		    logl_H0=MphEM ('R', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix);
		    logl_H0=MphNR ('R', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
		    MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2);
		  }

		  if (a_mode==2 || a_mode==4) {
		    logl_H0=MphEM ('L', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix);
		    logl_H0=MphNR ('L', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
		    MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2);
		  }
		}


		time_start=clock();

		//3 is before 1
		if (a_mode==3 || a_mode==4) {
			p_score=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
			if (p_score<p_nr && crt==1) {
				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
			}
		}

		if (a_mode==2 || a_mode==4) {
			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
			//calculate beta and Vbeta
			p_lrt=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );

			if (p_lrt<p_nr) {
				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
				//calculate beta and Vbeta
				p_lrt=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );

				if (crt==1) {
					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
				}
			}
		}

		if (a_mode==1 || a_mode==4) {
			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
			p_wald=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

			if (p_wald<p_nr) {
				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
				p_wald=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

				if (crt==1) {
					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
				}
			}
		}

		if (x_mean>1) {gsl_vector_scale(beta, -1.0);}

		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		//store summary data
		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
		for (size_t i=0; i<d_size; i++) {
			v_beta[i]=gsl_vector_get (beta, i);
		}

		c=0;
		for (size_t i=0; i<d_size; i++) {
			for (size_t j=i; j<d_size; j++) {
				v_Vg[c]=gsl_matrix_get (V_g, i, j);
				v_Ve[c]=gsl_matrix_get (V_e, i, j);
				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
				c++;
			}
		}

		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
		sumStat.push_back(SNPs);
    }
	cout<<endl;


	infile.close();
	infile.clear();

	gsl_matrix_free(U_hat);
	gsl_matrix_free(E_hat);
	gsl_matrix_free(OmegaU);
	gsl_matrix_free(OmegaE);
	gsl_matrix_free(UltVehiY);
	gsl_matrix_free(UltVehiBX);
	gsl_matrix_free(UltVehiU);
	gsl_matrix_free(UltVehiE);

	gsl_matrix_free(Hi_all);
	gsl_matrix_free(Hiy_all);
	gsl_matrix_free(xHi_all);
	gsl_matrix_free(Hessian);

	gsl_vector_free(x);
	gsl_vector_free(x_miss);

	gsl_matrix_free(Y);
	gsl_matrix_free(X);
	gsl_matrix_free(V_g);
	gsl_matrix_free(V_e);
	gsl_matrix_free(B);
	gsl_vector_free(beta);
	gsl_matrix_free(Vbeta);

	gsl_matrix_free(V_g_null);
	gsl_matrix_free(V_e_null);
	gsl_matrix_free(B_null);
	gsl_matrix_free(se_B_null1);
	gsl_matrix_free(se_B_null2);

	return;
}







void MVLMM::AnalyzePlinkGXE (const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY, const gsl_vector *env)
{
	string file_bed=file_bfile+".bed";
	ifstream infile (file_bed.c_str(), ios::binary);
	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}

	clock_t time_start=clock();
	time_UtX=0; time_opt=0;

	char ch[1];
	bitset<8> b;

	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
	double crt_a, crt_b, crt_c;
	int n_bit, n_miss, ci_total, ci_test;
	double geno, x_mean;
	size_t c=0;
	//	double s=0.0;
	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2+2;
	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;

	//large matrices for EM
	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *OmegaE=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiY=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiBX=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiU=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *UltVehiE=gsl_matrix_alloc (d_size, n_size);

	//large matrices for NR
	gsl_matrix *Hi_all=gsl_matrix_alloc (d_size, d_size*n_size);		//each dxd block is H_k^{-1}
	gsl_matrix *Hiy_all=gsl_matrix_alloc (d_size, n_size);				//each column is H_k^{-1}y_k
	gsl_matrix *xHi_all=gsl_matrix_alloc (dc_size, d_size*n_size);		//each dcxdc block is x_k\otimes H_k^{-1}
	gsl_matrix *Hessian=gsl_matrix_alloc (v_size*2, v_size*2);

	gsl_vector *x=gsl_vector_alloc (n_size);

	gsl_matrix *Y=gsl_matrix_alloc (d_size, n_size);
	gsl_matrix *X=gsl_matrix_alloc (c_size+1, n_size);
	gsl_matrix *V_g=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B=gsl_matrix_alloc (d_size, c_size+1);
	gsl_vector *beta=gsl_vector_alloc (d_size);
	gsl_matrix *Vbeta=gsl_matrix_alloc (d_size, d_size);

	//null estimates for initial values
	gsl_matrix *V_g_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *V_e_null=gsl_matrix_alloc (d_size, d_size);
	gsl_matrix *B_null=gsl_matrix_alloc (d_size, c_size+1);
	gsl_matrix *se_B_null1=gsl_matrix_alloc (d_size, c_size-1);
	gsl_matrix *se_B_null2=gsl_matrix_alloc (d_size, c_size);

	gsl_matrix_view X_sub1=gsl_matrix_submatrix (X, 0, 0, c_size-1, n_size);
	gsl_matrix_view B_sub1=gsl_matrix_submatrix (B, 0, 0, d_size, c_size-1);
	gsl_matrix_view xHi_all_sub1=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*(c_size-1), d_size*n_size);

	gsl_matrix_view X_sub2=gsl_matrix_submatrix (X, 0, 0, c_size, n_size);
	gsl_matrix_view B_sub2=gsl_matrix_submatrix (B, 0, 0, d_size, c_size);
	gsl_matrix_view xHi_all_sub2=gsl_matrix_submatrix (xHi_all, 0, 0, d_size*c_size, d_size*n_size);

	gsl_matrix_transpose_memcpy (Y, UtY);

	gsl_matrix_view X_sub0=gsl_matrix_submatrix (X, 0, 0, c_size-2, n_size);
	gsl_matrix_transpose_memcpy (&X_sub0.matrix, UtW);
	gsl_vector_view X_row0=gsl_matrix_row(X, c_size-2);
	gsl_blas_dgemv (CblasTrans, 1.0, U, env, 0.0, &X_row0.vector);

	gsl_vector_view X_row1=gsl_matrix_row(X, c_size-1);
	gsl_vector_set_zero(&X_row1.vector);
	gsl_vector_view X_row2=gsl_matrix_row(X, c_size);
	gsl_vector_set_zero(&X_row2.vector);

	gsl_vector_view B_col1=gsl_matrix_column(B, c_size-1);
	gsl_vector_set_zero(&B_col1.vector);
	gsl_vector_view B_col2=gsl_matrix_column(B, c_size);
	gsl_vector_set_zero(&B_col2.vector);

	//time_start=clock();
	MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub1.matrix, Y, l_min, l_max, n_region, V_g, V_e, &B_sub1.matrix);

	logl_H0=MphEM ('R', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix);
	logl_H0=MphNR ('R', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1);
	//cout<<"time for REML in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;

	c=0;
	Vg_remle_null.clear();
	Ve_remle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_remle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_remle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_remle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_remle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_remle_null.clear();
	se_beta_remle_null.clear();
	for (size_t i=0; i<se_B_null1->size1; i++) {
		for (size_t j=0; j<se_B_null1->size2; j++) {
			beta_remle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_remle_null.push_back(gsl_matrix_get(se_B_null1, i, j) );
		}
	}
	logl_remle_H0=logl_H0;

	cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
	cout.precision(4);
	cout<<"REMLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"REMLE likelihood = "<<logl_H0<<endl;

	//time_start=clock();
	logl_H0=MphEM ('L', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub1.matrix);
	logl_H0=MphNR ('L', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all, &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
	MphCalcBeta (eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix, se_B_null1);
	//cout<<"time for MLE in the null = "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;

	c=0;
	Vg_mle_null.clear();
	Ve_mle_null.clear();
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			Vg_mle_null.push_back(gsl_matrix_get (V_g, i, j) );
			Ve_mle_null.push_back(gsl_matrix_get (V_e, i, j) );
			VVg_mle_null.push_back(gsl_matrix_get (Hessian, c, c) );
			VVe_mle_null.push_back(gsl_matrix_get (Hessian, c+v_size, c+v_size) );
			c++;
		}
	}
	beta_mle_null.clear();
	se_beta_mle_null.clear();
	for (size_t i=0; i<se_B_null1->size1; i++) {
		for (size_t j=0; j<se_B_null1->size2; j++) {
			beta_mle_null.push_back(gsl_matrix_get(B, i, j) );
			se_beta_mle_null.push_back(gsl_matrix_get(se_B_null1, i, j) );
		}
	}
	logl_mle_H0=logl_H0;

	cout<<"MLE estimate for Vg in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_g, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Vg): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c, c))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE estimate for Ve in the null model: "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			cout<<gsl_matrix_get(V_e, i, j)<<"\t";
		}
		cout<<endl;
	}
	cout<<"se(Ve): "<<endl;
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=0; j<=i; j++) {
			c=GetIndex(i, j, d_size);
			cout<<sqrt(gsl_matrix_get(Hessian, c+v_size, c+v_size))<<"\t";
		}
		cout<<endl;
	}
	cout<<"MLE likelihood = "<<logl_H0<<endl;

	vector<double> v_beta, v_Vg, v_Ve, v_Vbeta;
	for (size_t i=0; i<d_size; i++) {
		v_beta.push_back(0.0);
	}
	for (size_t i=0; i<d_size; i++) {
		for (size_t j=i; j<d_size; j++) {
			v_Vg.push_back(0.0);
			v_Ve.push_back(0.0);
			v_Vbeta.push_back(0.0);
		}
	}

	gsl_matrix_memcpy (V_g_null, V_g);
	gsl_matrix_memcpy (V_e_null, V_e);
	gsl_matrix_memcpy (B_null, B);


	//start reading genotypes and analyze

	//calculate n_bit and c, the number of bit for each snp
	if (ni_total%4==0) {n_bit=ni_total/4;}
	else {n_bit=ni_total/4+1; }

	//print the first three majic numbers
	for (int i=0; i<3; ++i) {
		infile.read(ch,1);
		b=ch[0];
	}

	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
		if (indicator_snp[t]==0) {continue;}

		//if (t>=0) {break;}
		//if (snpInfo[t].rs_number!="MAG18140902") {continue;}
		//cout<<t<<endl;

		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers

		//read genotypes
		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0;
		for (int i=0; i<n_bit; ++i) {
			infile.read(ch,1);
			b=ch[0];
			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}
				if (indicator_idv[ci_total]==0) {ci_total++; continue;}

				if (b[2*j]==0) {
					if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
					else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
				}
				else {
					if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }
					else {gsl_vector_set(x, ci_test, -9); n_miss++; }
				}

				ci_total++;
				ci_test++;
			}
		}

		x_mean/=(double)(ni_test-n_miss);

		for (size_t i=0; i<ni_test; ++i) {
			geno=gsl_vector_get(x,i);
			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
			if (x_mean>1) {
				gsl_vector_set(x, i, 2-geno);
			}
		}

		/*
		if (t==0) {
			ofstream outfile ("./snp1.txt", ofstream::out);
			if (!outfile) {cout<<"error writing file: "<<endl; return;}
			for (size_t i=0; i<x->size; i++) {
				outfile<<gsl_vector_get(x, i)<<endl;
			}
			outfile.clear();
			outfile.close();
		}
		*/

		//calculate statistics
		time_start=clock();
		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row1.vector);
		gsl_vector_mul (x, env);
		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row2.vector);
		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		//initial values
		gsl_matrix_memcpy (V_g, V_g_null);
		gsl_matrix_memcpy (V_e, V_e_null);
		gsl_matrix_memcpy (B, B_null);

		if (a_mode==2 || a_mode==3 || a_mode==4) {
		  if (a_mode==3 || a_mode==4) {
		    logl_H0=MphEM ('R', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix);
		    logl_H0=MphNR ('R', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
		    MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2);
		  }

		  if (a_mode==2 || a_mode==4) {
		    logl_H0=MphEM ('L', em_iter/10, em_prec*10, eval, &X_sub2.matrix, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix);
		    logl_H0=MphNR ('L', nr_iter/10, nr_prec*10, eval, &X_sub2.matrix, Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
		    MphCalcBeta (eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix, se_B_null2);
		  }
		}

		time_start=clock();

		//3 is before 1
		if (a_mode==3 || a_mode==4) {
			p_score=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);

			if (p_score<p_nr && crt==1) {
				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
			}
		}

		if (a_mode==2 || a_mode==4) {
			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
			//calculate beta and Vbeta
			p_lrt=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );

			if (p_lrt<p_nr) {
				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);

				//calculate beta and Vbeta
				p_lrt=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
				if (crt==1) {
					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
				}
			}
		}

		if (a_mode==1 || a_mode==4) {
			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
			p_wald=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

			if (p_wald<p_nr) {
				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
				p_wald=MphCalcP (eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);

				if (crt==1) {
					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
				}
			}
		}

		//cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl;

		if (x_mean>1) {gsl_vector_scale(beta, -1.0);}

		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);

		//store summary data
		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
		for (size_t i=0; i<d_size; i++) {
			v_beta[i]=gsl_vector_get (beta, i);
		}

		c=0;
		for (size_t i=0; i<d_size; i++) {
			for (size_t j=i; j<d_size; j++) {
				v_Vg[c]=gsl_matrix_get (V_g, i, j);
				v_Ve[c]=gsl_matrix_get (V_e, i, j);
				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
				c++;
			}
		}

		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
		sumStat.push_back(SNPs);
    }
	cout<<endl;

	//cout<<"time_opt = "<<time_opt<<endl;

	infile.close();
	infile.clear();

	gsl_matrix_free(U_hat);
	gsl_matrix_free(E_hat);
	gsl_matrix_free(OmegaU);
	gsl_matrix_free(OmegaE);
	gsl_matrix_free(UltVehiY);
	gsl_matrix_free(UltVehiBX);
	gsl_matrix_free(UltVehiU);
	gsl_matrix_free(UltVehiE);

	gsl_matrix_free(Hi_all);
	gsl_matrix_free(Hiy_all);
	gsl_matrix_free(xHi_all);
	gsl_matrix_free(Hessian);

	gsl_vector_free(x);

	gsl_matrix_free(Y);
	gsl_matrix_free(X);
	gsl_matrix_free(V_g);
	gsl_matrix_free(V_e);
	gsl_matrix_free(B);
	gsl_vector_free(beta);
	gsl_matrix_free(Vbeta);

	gsl_matrix_free(V_g_null);
	gsl_matrix_free(V_e_null);
	gsl_matrix_free(B_null);
	gsl_matrix_free(se_B_null1);
	gsl_matrix_free(se_B_null2);

	return;
}