about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/gemma.cpp 1840
-rw-r--r-- src/io.cpp 1224
-rw-r--r-- src/io.h 24
-rw-r--r-- src/lm.cpp 24
-rw-r--r-- src/lmm.cpp 267
-rw-r--r-- src/mathfunc.cpp 18
-rw-r--r-- src/mvlmm.cpp 451
-rw-r--r-- src/param.cpp 878
-rw-r--r-- src/param.h 42
-rw-r--r-- src/vc.cpp 2240
-rw-r--r-- src/vc.h 41
11 files changed, 5786 insertions, 1263 deletions
diff --git a/src/gemma.cpp b/src/gemma.cpp
index b8693a8..3b9fe29 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -39,9 +39,11 @@
 #include "vc_float.h"
 #include "lm_float.h"  //for LM class
 #include "bslmm_float.h"  //for BSLMM class
+#include "ldr_float.h"  //for LDR class
 #include "lmm_float.h"  //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe
 #include "mvlmm_float.h"  //for MVLMM class
 #include "prdt_float.h"	//for PRDT class
+#include "varcov_float.h"  //for VARCOV class
 #include "mathfunc_float.h"	//for a few functions
 #else
 #include "io.h"
@@ -49,9 +51,11 @@
 #include "vc.h"
 #include "lm.h"
 #include "bslmm.h"
+#include "ldr.h"
 #include "lmm.h"
 #include "mvlmm.h"
 #include "prdt.h"
+#include "varcov.h"
 #include "mathfunc.h"
 #endif
 
@@ -60,26 +64,23 @@ using namespace std;
 
 
 
-GEMMA::GEMMA(void):	
-version("0.95alpha"), date("08/08/2014"), year("2011")
+GEMMA::GEMMA(void):
+version("0.95alpha"), date("07/11/2015"), year("2011")
 {}
 
 void GEMMA::PrintHeader (void)
 {
 	cout<<endl;
 	cout<<"*********************************************************"<<endl;
-	cout<<"  Genome-wide Efficient Mixed Model Association (GEMMA) "<<endl;
+	cout<<"  Genome-wide Efficient Mixed Model Association (GEMMA)  "<<endl;
 	cout<<"  Version "<<version<<", "<<date<<"                              "<<endl;
-	cout<<"  Visit                                                 "<<endl;
-	cout<<"     http://stephenslab.uchicago.edu/software.html      "<<endl;
-	cout<<"     http://home.uchicago.edu/~xz7/software.html        "<<endl;
-	cout<<"  For Possible Updates                                  "<<endl;
+	cout<<"  Visit http://www.xzlab.org/software.html For Updates   "<<endl;
 	cout<<"  (C) "<<year<<" Xiang Zhou                                   "<<endl;
-	cout<<"  GNU General Public License                            "<<endl;
-	cout<<"  For Help, Type ./gemma -h                             "<<endl;
+	cout<<"  GNU General Public License                             "<<endl;
+	cout<<"  For Help, Type ./gemma -h                              "<<endl;
 	cout<<"*********************************************************"<<endl;
 	cout<<endl;
-	
+
 	return;
 }
 
@@ -89,13 +90,13 @@ void GEMMA::PrintLicense (void)
 	cout<<endl;
 	cout<<"The Software Is Distributed Under GNU General Public License, But May Also Require The Following Notifications."<<endl;
 	cout<<endl;
-	
+
 	cout<<"Including Lapack Routines In The Software May Require The Following Notification:"<<endl;
 	cout<<"Copyright (c) 1992-2010 The University of Tennessee and The University of Tennessee Research Foundation.  All rights reserved."<<endl;
 	cout<<"Copyright (c) 2000-2010 The University of California Berkeley. All rights reserved."<<endl;
-	cout<<"Copyright (c) 2006-2010 The University of Colorado Denver.  All rights reserved."<<endl;	
+	cout<<"Copyright (c) 2006-2010 The University of Colorado Denver.  All rights reserved."<<endl;
 	cout<<endl;
-	
+
 	cout<<"$COPYRIGHT$"<<endl;
 	cout<<"Additional copyrights may follow"<<endl;
 	cout<<"$HEADER$"<<endl;
@@ -113,9 +114,9 @@ void GEMMA::PrintLicense (void)
 		<<"THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE "
 		<<"OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."<<endl;
 	cout<<endl;
-	
-	
-	
+
+
+
 	return;
 }
 
@@ -124,9 +125,9 @@ void GEMMA::PrintLicense (void)
 void GEMMA::PrintHelp(size_t option)
 {
 	if (option==0) {
-		cout<<endl; 
+		cout<<endl;
 		cout<<" GEMMA version "<<version<<", released on "<<date<<endl;
-		cout<<" implemented by Xiang Zhou"<<endl; 
+		cout<<" implemented by Xiang Zhou"<<endl;
 		cout<<endl;
 		cout<<" type ./gemma -h [num] for detailed helps"<<endl;
 		cout<<" options: " << endl;
@@ -135,72 +136,116 @@ void GEMMA::PrintHelp(size_t option)
 		cout<<" 3: SNP QC"<<endl;
 		cout<<" 4: calculate relatedness matrix"<<endl;
 		cout<<" 5: perform eigen decomposition"<<endl;
-		cout<<" 6: perform variance component estiamtion"<<endl;
+		cout<<" 6: perform variance component estimation"<<endl;
 		cout<<" 7: fit a linear model"<<endl;
 		cout<<" 8: fit a linear mixed model"<<endl;
 		cout<<" 9: fit a multivariate linear mixed model"<<endl;
 		cout<<" 10: fit a Bayesian sparse linear mixed model"<<endl;
 		cout<<" 11: obtain predicted values"<<endl;
-		cout<<" 12: note"<<endl;
+		cout<<" 12: calculate snp variance covariance"<<endl;
+		cout<<" 13: note"<<endl;
 		cout<<endl;
-	}	
-	
+	}
+
 	if (option==1) {
 		cout<<" QUICK GUIDE" << endl;
 		cout<<" to generate a relatedness matrix: "<<endl;
 		cout<<"         ./gemma -bfile [prefix] -gk [num] -o [prefix]"<<endl;
 		cout<<"         ./gemma -g [filename] -p [filename] -gk [num] -o [prefix]"<<endl;
+		cout<<" to generate the S matrix: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -gs -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -g [filename] -gs -o [prefix]"<<endl;
+		cout<<"         ./gemma -bfile [prefix] -cat [filename] -gs -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -g [filename] -cat [filename] -gs -o [prefix]"<<endl;
+		cout<<"         ./gemma -bfile [prefix] -sample [num] -gs -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -g [filename] -sample [num] -gs -o [prefix]"<<endl;
+		cout<<" to generate the q vector: "<<endl;
+		cout<<"         ./gemma -beta [filename] -gq -o [prefix]"<<endl;
+		cout<<"         ./gemma -beta [filename] -cat [filename] -gq -o [prefix]"<<endl;
+		cout<<" to generate the ldsc weigthts: "<<endl;
+		cout<<"         ./gemma -beta [filename] -gw -o [prefix]"<<endl;
+		cout<<"         ./gemma -beta [filename] -cat [filename] -gw -o [prefix]"<<endl;
 		cout<<" to perform eigen decomposition of the relatedness matrix: "<<endl;
 		cout<<"         ./gemma -bfile [prefix] -k [filename] -eigen -o [prefix]"<<endl;
 		cout<<"         ./gemma -g [filename] -p [filename] -k [filename] -eigen -o [prefix]"<<endl;
 		cout<<" to estimate variance components: "<<endl;
-		cout<<"         ./gemma -bfile [prefix] -k [filename] -vc -o [prefix]"<<endl;
-		cout<<"         ./gemma -p [filename] -k [filename] -vc -o [prefix]"<<endl;
-		cout<<"         ./gemma -bfile [prefix] -mk [filename] -vc -o [prefix]"<<endl;
-		cout<<"         ./gemma -p [filename] -mk [filename] -vc -o [prefix]"<<endl;
+		cout<<"         ./gemma -bfile [prefix] -k [filename] -vc [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -k [filename] -vc [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -bfile [prefix] -mk [filename] -vc [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -p [filename] -mk [filename] -vc [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -beta [filename] -cor [filename] -vc [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -beta [filename] -cor [filename] -cat [filename] -vc [num] -o [prefix]"<<endl;
+		cout<<"         options for the above two commands: -crt -windowbp [num]"<<endl;
+		cout<<"         ./gemma -mq [filename] -ms [filename] -mv [filename] -vc [num] -o [prefix]"<<endl;
+		cout<<"         or with summary statistics, replace bfile with mbfile, or g or mg; vc=1 for HE weights and vc=2 for LDSC weights"<<endl;
+		cout<<"         ./gemma -beta [filename] -bfile [filename] -cat [filename] -wsnp [filename] -wcat [filename] -vc [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -beta [filename] -bfile [filename] -cat [filename] -wsnp [filename] -wcat [filename] -ci [num] -o [prefix]"<<endl;
 		cout<<" to fit a linear mixed model: "<<endl;
 		cout<<"         ./gemma -bfile [prefix] -k [filename] -lmm [num] -o [prefix]"<<endl;
-		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -o [prefix]"<<endl;	
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -o [prefix]"<<endl;
+		cout<<" to fit a linear mixed model to test g by e effects: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -gxe [filename] -k [filename] -lmm [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -gxe [filename] -k [filename] -lmm [num] -o [prefix]"<<endl;
+		cout<<" to fit a univariate linear mixed model with different residual weights for different individuals: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -weight [filename] -k [filename] -lmm [num] -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -weight [filename] -k [filename] -lmm [num] -o [prefix]"<<endl;
 		cout<<" to fit a multivariate linear mixed model: "<<endl;
 		cout<<"         ./gemma -bfile [prefix] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl;
-		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl;	
+		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl;
 		cout<<" to fit a Bayesian sparse linear mixed model: "<<endl;
 		cout<<"         ./gemma -bfile [prefix] -bslmm [num] -o [prefix]"<<endl;
 		cout<<"         ./gemma -g [filename] -p [filename] -a [filename] -bslmm [num] -o [prefix]"<<endl;
 		cout<<" to obtain predicted values: "<<endl;
 		cout<<"         ./gemma -bfile [prefix] -epm [filename] -emu [filename] -ebv [filename] -k [filename] -predict [num] -o [prefix]"<<endl;
 		cout<<"         ./gemma -g [filename] -p [filename] -epm [filename] -emu [filename] -ebv [filename] -k [filename] -predict [num] -o [prefix]"<<endl;
+		cout<<" to calculate correlations between SNPs: "<<endl;
+		cout<<"         ./gemma -bfile [prefix] -calccor -o [prefix]"<<endl;
+		cout<<"         ./gemma -g [filename] -p [filename] -calccor -o [prefix]"<<endl;
 		cout<<endl;
 	}
-	
+
 	if (option==2) {
 		cout<<" FILE I/O RELATED OPTIONS" << endl;
-		cout<<" -bfile    [prefix]       "<<" specify input PLINK binary ped file prefix."<<endl;	
-		cout<<"          requires: *.fam, *.bim and *.bed files"<<endl;	
+		cout<<" -bfile    [prefix]       "<<" specify input PLINK binary ped file prefix."<<endl;
+		cout<<"          requires: *.fam, *.bim and *.bed files"<<endl;
 		cout<<"          missing value: -9"<<endl;
 		cout<<" -g        [filename]     "<<" specify input BIMBAM mean genotype file name"<<endl;
-		cout<<"          format: rs#1, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl;	
-		cout<<"                  rs#2, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl;	
-		cout<<"                  ..."<<endl;	
-		cout<<"          missing value: NA"<<endl;	
+		cout<<"          format: rs#1, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl;
+		cout<<"                  rs#2, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
 		cout<<" -p        [filename]     "<<" specify input BIMBAM phenotype file name"<<endl;
-		cout<<"          format: phenotype for individual 1"<<endl;	
-		cout<<"                  phenotype for individual 2"<<endl;	
+		cout<<"          format: phenotype for individual 1"<<endl;
+		cout<<"                  phenotype for individual 2"<<endl;
+		cout<<"                  ..."<<endl;
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -a        [filename]     "<<" specify input BIMBAM SNP annotation file name (optional)"<<endl;
+		cout<<"          format: rs#1, base_position, chr_number"<<endl;
+		cout<<"                  rs#2, base_position, chr_number"<<endl;
 		cout<<"                  ..."<<endl;
-		cout<<"          missing value: NA"<<endl;	
-		cout<<" -a        [filename]     "<<" specify input BIMBAM SNP annotation file name (optional)"<<endl;	
-		cout<<"          format: rs#1, base_position, chr_number"<<endl;	
-		cout<<"                  rs#2, base_position, chr_number"<<endl;	
+		// WJA added
+		cout<<" -oxford    [prefix]       "<<" specify input Oxford genotype bgen file prefix."<<endl;
+		cout<<"          requires: *.bgen, *.sample files"<<endl;
+
+		cout<<" -gxe      [filename]     "<<" specify input file that contains a column of environmental factor for g by e tests"<<endl;
+		cout<<"          format: variable for individual 1"<<endl;
+		cout<<"                  variable for individual 2"<<endl;
 		cout<<"                  ..."<<endl;
-		cout<<" -k        [filename]     "<<" specify input kinship/relatedness matrix file name"<<endl;	
-		cout<<" -mk       [filename]     "<<" specify input file which contains a list of kinship/relatedness matrices"<<endl;	
-		cout<<" -u        [filename]     "<<" specify input file containing the eigen vectors of the kinship/relatedness matrix"<<endl;	
-		cout<<" -d        [filename]     "<<" specify input file containing the eigen values of the kinship/relatedness matrix"<<endl;	
-		cout<<" -c        [filename]     "<<" specify input covariates file name (optional)"<<endl;	
-		cout<<"          format: covariate 1 for individual 1, ... , covariate c for individual 1"<<endl;	
-		cout<<"                  covariate 1 for individual 2, ... , covariate c for individual 2"<<endl;	
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -widv   [filename]     "<<" specify input file that contains a column of residual weights"<<endl;
+		cout<<"          format: variable for individual 1"<<endl;
+		cout<<"                  variable for individual 2"<<endl;
 		cout<<"                  ..."<<endl;
-		cout<<"          missing value: NA"<<endl;	
+		cout<<"          missing value: NA"<<endl;
+		cout<<" -k        [filename]     "<<" specify input kinship/relatedness matrix file name"<<endl;
+		cout<<" -mk       [filename]     "<<" specify input file which contains a list of kinship/relatedness matrices"<<endl;
+		cout<<" -u        [filename]     "<<" specify input file containing the eigen vectors of the kinship/relatedness matrix"<<endl;
+		cout<<" -d        [filename]     "<<" specify input file containing the eigen values of the kinship/relatedness matrix"<<endl;
+		cout<<" -c        [filename]     "<<" specify input covariates file name (optional)"<<endl;
+		cout<<" -cat      [filename]     "<<" specify input category file name (optional), which contains rs cat1 cat2 ..."<<endl;
+		cout<<" -beta     [filename]     "<<" specify input beta file name (optional), which contains rs beta se_beta n_total (or n_mis and n_obs) estimates from a lm model"<<endl;
+		cout<<" -cor      [filename]     "<<" specify input correlation file name (optional), which contains rs window_size correlations from snps"<<endl;
+		cout<<"          missing value: NA"<<endl;
 		cout<<"          note: the intercept (a column of 1s) may need to be included"<<endl;
 		cout<<" -epm      [filename]     "<<" specify input estimated parameter file name"<<endl;
 		cout<<" -en [n1] [n2] [n3] [n4]  "<<" specify values for the input estimated parameter file (with a header)"<<endl;
@@ -210,74 +255,81 @@ void GEMMA::PrintHelp(size_t option)
 		cout<<"                   n4: estimated gamma column number (0 to ignore)"<<endl;
 		cout<<"          default: 2 4 5 6 if -ebv is not specified; 2 0 5 6 if -ebv is specified"<<endl;
 		cout<<" -ebv      [filename]     "<<" specify input estimated random effect (breeding value) file name"<<endl;
-		cout<<"          format: value for individual 1"<<endl;	
-		cout<<"                  value for individual 2"<<endl;	
+		cout<<"          format: value for individual 1"<<endl;
+		cout<<"                  value for individual 2"<<endl;
 		cout<<"                  ..."<<endl;
-		cout<<"          missing value: NA"<<endl;	
+		cout<<"          missing value: NA"<<endl;
 		cout<<" -emu      [filename]     "<<" specify input log file name containing estimated mean"<<endl;
 		cout<<" -mu       [num]          "<<" specify input estimated mean value"<<endl;
 		cout<<" -gene     [filename]     "<<" specify input gene expression file name"<<endl;
-		cout<<"          format: header"<<endl;	
-		cout<<"                  gene1, count for individual 1, count for individual 2, ..."<<endl;	
-		cout<<"                  gene2, count for individual 1, count for individual 2, ..."<<endl;	
+		cout<<"          format: header"<<endl;
+		cout<<"                  gene1, count for individual 1, count for individual 2, ..."<<endl;
+		cout<<"                  gene2, count for individual 1, count for individual 2, ..."<<endl;
 		cout<<"                  ..."<<endl;
-		cout<<"          missing value: not allowed"<<endl;	
+		cout<<"          missing value: not allowed"<<endl;
 		cout<<" -r        [filename]     "<<" specify input total read count file name"<<endl;
-		cout<<"          format: total read count for individual 1"<<endl;	
-		cout<<"                  total read count for individual 2"<<endl;	
+		cout<<"          format: total read count for individual 1"<<endl;
+		cout<<"                  total read count for individual 2"<<endl;
 		cout<<"                  ..."<<endl;
-		cout<<"          missing value: NA"<<endl;	
+		cout<<"          missing value: NA"<<endl;
 		cout<<" -snps     [filename]     "<<" specify input snps file name to only analyze a certain set of snps"<<endl;
-		cout<<"          format: rs#1"<<endl;	
-		cout<<"                  rs#2"<<endl;	
+		cout<<"          format: rs#1"<<endl;
+		cout<<"                  rs#2"<<endl;
 		cout<<"                  ..."<<endl;
-		cout<<"          missing value: NA"<<endl;	
+		cout<<"          missing value: NA"<<endl;
 		cout<<" -silence                 "<<" silent terminal display"<<endl;
 		cout<<" -km       [num]          "<<" specify input kinship/relatedness file type (default 1)."<<endl;
 		cout<<"          options: 1: \"n by n matrix\" format"<<endl;
 		cout<<"                   2: \"id  id  value\" format"<<endl;
-		cout<<" -n        [num]          "<<" specify phenotype column in the phenotype/*.fam file (optional; default 1)"<<endl;	
+		cout<<" -n        [num]          "<<" specify phenotype column in the phenotype/*.fam file (optional; default 1)"<<endl;
 		cout<<" -pace     [num]          "<<" specify terminal display update pace (default 100000 SNPs or 100000 iterations)."<<endl;
-		cout<<" -outdir   [path]         "<<" specify output directory path (default \"./output/\")"<<endl; 
-		cout<<" -o        [prefix]       "<<" specify output file prefix (default \"result\")"<<endl;  
-		cout<<"          output: prefix.cXX.txt or prefix.sXX.txt from kinship/relatedness matrix estimation"<<endl;	
-		cout<<"          output: prefix.assoc.txt and prefix.log.txt form association tests"<<endl;	
+		cout<<" -outdir   [path]         "<<" specify output directory path (default \"./output/\")"<<endl;
+		cout<<" -o        [prefix]       "<<" specify output file prefix (default \"result\")"<<endl;
+		cout<<"          output: prefix.cXX.txt or prefix.sXX.txt from kinship/relatedness matrix estimation"<<endl;
+		cout<<"          output: prefix.assoc.txt and prefix.log.txt form association tests"<<endl;
 		cout<<endl;
 	}
-	
+
 	if (option==3) {
 		cout<<" SNP QC OPTIONS" << endl;
-		cout<<" -miss     [num]          "<<" specify missingness threshold (default 0.05)" << endl; 
-		cout<<" -maf      [num]          "<<" specify minor allele frequency threshold (default 0.01)" << endl; 
-		cout<<" -hwe      [num]          "<<" specify HWE test p value threshold (default 0; no test)" << endl; 
-		cout<<" -r2       [num]          "<<" specify r-squared threshold (default 0.9999)" << endl; 
-		cout<<" -notsnp                  "<<" minor allele frequency cutoff is not used" << endl; 
+		cout<<" -miss     [num]          "<<" specify missingness threshold (default 0.05)" << endl;
+		cout<<" -maf      [num]          "<<" specify minor allele frequency threshold (default 0.01)" << endl;
+		cout<<" -hwe      [num]          "<<" specify HWE test p value threshold (default 0; no test)" << endl;
+		cout<<" -r2       [num]          "<<" specify r-squared threshold (default 0.9999)" << endl;
+		cout<<" -notsnp                  "<<" minor allele frequency cutoff is not used" << endl;
 		cout<<endl;
 	}
-	
+
 	if (option==4) {
 		cout<<" RELATEDNESS MATRIX CALCULATION OPTIONS" << endl;
-		cout<<" -gk       [num]          "<<" specify which type of kinship/relatedness matrix to generate (default 1)" << endl; 
+		cout<<" -gk       [num]          "<<" specify which type of kinship/relatedness matrix to generate (default 1)" << endl;
 		cout<<"          options: 1: centered XX^T/p"<<endl;
 		cout<<"                   2: standardized XX^T/p"<<endl;
 		cout<<"          note: non-polymorphic SNPs are excluded "<<endl;
 		cout<<endl;
 	}
-	
+
 	if (option==5) {
 		cout<<" EIGEN-DECOMPOSITION OPTIONS" << endl;
-		cout<<" -eigen                   "<<" specify to perform eigen decomposition of the loaded relatedness matrix" << endl; 
+		cout<<" -eigen                   "<<" specify to perform eigen decomposition of the loaded relatedness matrix" << endl;
 		cout<<endl;
 	}
 
 	if (option==6) {
 		cout<<" VARIANCE COMPONENT ESTIMATION OPTIONS" << endl;
-		cout<<" -vc                      "<<" specify to perform variance component estimation for the loaded relatedness matrix/matrices" << endl; 
+		cout<<" -vc                      "<<" specify to perform variance component estimation for the loaded relatedness matrix/matrices" << endl;
+		cout<<"          options (with kinship file):   1: HE regression (default)"<<endl;
+		cout<<"                                         2: REML"<<endl;
+		cout<<"          options (with beta/cor files): 1: Centered genotypes (default)"<<endl;
+		cout<<"                                         2: Standardized genotypes"<<endl;
+		cout<<"                                         -crt -windowbp [num]"<<" specify the window size based on bp (default 1000000; 1Mb)"<<endl;
+		cout<<"                                         -crt -windowcm [num]"<<" specify the window size based on cm (default 0)"<<endl;
+		cout<<"                                         -crt -windowns [num]"<<" specify the window size based on number of snps (default 0)"<<endl;
 		cout<<endl;
 	}
-	
+
 	if (option==7) {
-		cout<<" LINEAR MODEL OPTIONS" << endl;		
+		cout<<" LINEAR MODEL OPTIONS" << endl;
 		cout<<" -lm       [num]         "<<" specify analysis options (default 1)."<<endl;
 		cout<<"          options: 1: Wald test"<<endl;
 		cout<<"                   2: Likelihood ratio test"<<endl;
@@ -285,21 +337,21 @@ void GEMMA::PrintHelp(size_t option)
 		cout<<"                   4: 1-3"<<endl;
 		cout<<endl;
 	}
-	
+
 	if (option==8) {
-		cout<<" LINEAR MIXED MODEL OPTIONS" << endl;		
+		cout<<" LINEAR MIXED MODEL OPTIONS" << endl;
 		cout<<" -lmm      [num]         "<<" specify analysis options (default 1)."<<endl;
-		cout<<"          options: 1: Wald test"<<endl;		
+		cout<<"          options: 1: Wald test"<<endl;
 		cout<<"                   2: Likelihood ratio test"<<endl;
 		cout<<"                   3: Score test"<<endl;
 		cout<<"                   4: 1-3"<<endl;
 		cout<<"                   5: Parameter estimation in the null model only"<<endl;
-		cout<<" -lmin     [num]          "<<" specify minimal value for lambda (default 1e-5)" << endl; 
-		cout<<" -lmax     [num]          "<<" specify maximum value for lambda (default 1e+5)" << endl; 
-		cout<<" -region   [num]          "<<" specify the number of regions used to evaluate lambda (default 10)" << endl; 
+		cout<<" -lmin     [num]          "<<" specify minimal value for lambda (default 1e-5)" << endl;
+		cout<<" -lmax     [num]          "<<" specify maximum value for lambda (default 1e+5)" << endl;
+		cout<<" -region   [num]          "<<" specify the number of regions used to evaluate lambda (default 10)" << endl;
 		cout<<endl;
 	}
-	
+
 	if (option==9) {
 		cout<<" MULTIVARIATE LINEAR MIXED MODEL OPTIONS" << endl;
 		cout<<" -pnr				     "<<" specify the pvalue threshold to use the Newton-Raphson's method (default 0.001)"<<endl;
@@ -310,51 +362,63 @@ void GEMMA::PrintHelp(size_t option)
 		cout<<" -crt				     "<<" specify to output corrected pvalues for these pvalues that are below the -pnr threshold"<<endl;
 		cout<<endl;
 	}
-	
+
 	if (option==10) {
 		cout<<" MULTI-LOCUS ANALYSIS OPTIONS" << endl;
 		cout<<" -bslmm	  [num]			 "<<" specify analysis options (default 1)."<<endl;
-		cout<<"          options: 1: BSLMM"<<endl;	
-		cout<<"                   2: standard ridge regression/GBLUP (no mcmc)"<<endl;	
-		cout<<"                   3: probit BSLMM (requires 0/1 phenotypes)"<<endl;			
-		
+		cout<<"          options: 1: BSLMM"<<endl;
+		cout<<"                   2: standard ridge regression/GBLUP (no mcmc)"<<endl;
+		cout<<"                   3: probit BSLMM (requires 0/1 phenotypes)"<<endl;
+
+		cout<<" -ldr	  [num]			 "<<" specify analysis options (default 1)."<<endl;
+		cout<<"          options: 1: LDR"<<endl;
+
 		cout<<"   MCMC OPTIONS" << endl;
-		cout<<"   Prior" << endl;	
-		cout<<" -hmin     [num]          "<<" specify minimum value for h (default 0)" << endl; 
-		cout<<" -hmax     [num]          "<<" specify maximum value for h (default 1)" << endl; 
-		cout<<" -rmin     [num]          "<<" specify minimum value for rho (default 0)" << endl; 
-		cout<<" -rmax     [num]          "<<" specify maximum value for rho (default 1)" << endl; 
-		cout<<" -pmin     [num]          "<<" specify minimum value for log10(pi) (default log10(1/p), where p is the number of analyzed SNPs )" << endl; 
-		cout<<" -pmax     [num]          "<<" specify maximum value for log10(pi) (default log10(1) )" << endl; 	
-		cout<<" -smin     [num]          "<<" specify minimum value for |gamma| (default 0)" << endl; 
-		cout<<" -smax     [num]          "<<" specify maximum value for |gamma| (default 300)" << endl; 
-		
+		cout<<"   Prior" << endl;
+		cout<<" -hmin     [num]          "<<" specify minimum value for h (default 0)" << endl;
+		cout<<" -hmax     [num]          "<<" specify maximum value for h (default 1)" << endl;
+		cout<<" -rmin     [num]          "<<" specify minimum value for rho (default 0)" << endl;
+		cout<<" -rmax     [num]          "<<" specify maximum value for rho (default 1)" << endl;
+		cout<<" -pmin     [num]          "<<" specify minimum value for log10(pi) (default log10(1/p), where p is the number of analyzed SNPs )" << endl;
+		cout<<" -pmax     [num]          "<<" specify maximum value for log10(pi) (default log10(1) )" << endl;
+		cout<<" -smin     [num]          "<<" specify minimum value for |gamma| (default 0)" << endl;
+		cout<<" -smax     [num]          "<<" specify maximum value for |gamma| (default 300)" << endl;
+
 		cout<<"   Proposal" << endl;
-		cout<<" -gmean    [num]          "<<" specify the mean for the geometric distribution (default: 2000)" << endl; 
-		cout<<" -hscale   [num]          "<<" specify the step size scale for the proposal distribution of h (value between 0 and 1, default min(10/sqrt(n),1) )" << endl; 
-		cout<<" -rscale   [num]          "<<" specify the step size scale for the proposal distribution of rho (value between 0 and 1, default min(10/sqrt(n),1) )" << endl; 
-		cout<<" -pscale   [num]          "<<" specify the step size scale for the proposal distribution of log10(pi) (value between 0 and 1, default min(5/sqrt(n),1) )" << endl; 
-		
+		cout<<" -gmean    [num]          "<<" specify the mean for the geometric distribution (default: 2000)" << endl;
+		cout<<" -hscale   [num]          "<<" specify the step size scale for the proposal distribution of h (value between 0 and 1, default min(10/sqrt(n),1) )" << endl;
+		cout<<" -rscale   [num]          "<<" specify the step size scale for the proposal distribution of rho (value between 0 and 1, default min(10/sqrt(n),1) )" << endl;
+		cout<<" -pscale   [num]          "<<" specify the step size scale for the proposal distribution of log10(pi) (value between 0 and 1, default min(5/sqrt(n),1) )" << endl;
+
 		cout<<"   Others" << endl;
-		cout<<" -w        [num]          "<<" specify burn-in steps (default 100,000)" << endl; 
-		cout<<" -s        [num]          "<<" specify sampling steps (default 1,000,000)" << endl; 
-		cout<<" -rpace    [num]          "<<" specify recording pace, record one state in every [num] steps (default 10)" << endl; 	
-		cout<<" -wpace    [num]          "<<" specify writing pace, write values down in every [num] recorded steps (default 1000)" << endl; 	
-		cout<<" -seed     [num]          "<<" specify random seed (a random seed is generated by default)" << endl; 	
-		cout<<" -mh       [num]          "<<" specify number of MH steps in each iteration (default 10)" << endl; 
-		cout<<"          requires: 0/1 phenotypes and -bslmm 3 option"<<endl;	
+		cout<<" -w        [num]          "<<" specify burn-in steps (default 100,000)" << endl;
+		cout<<" -s        [num]          "<<" specify sampling steps (default 1,000,000)" << endl;
+		cout<<" -rpace    [num]          "<<" specify recording pace, record one state in every [num] steps (default 10)" << endl;
+		cout<<" -wpace    [num]          "<<" specify writing pace, write values down in every [num] recorded steps (default 1000)" << endl;
+		cout<<" -seed     [num]          "<<" specify random seed (a random seed is generated by default)" << endl;
+		cout<<" -mh       [num]          "<<" specify number of MH steps in each iteration (default 10)" << endl;
+		cout<<"          requires: 0/1 phenotypes and -bslmm 3 option"<<endl;
 		cout<<endl;
 	}
-	
+
 	if (option==11) {
 		cout<<" PREDICTION OPTIONS" << endl;
 		cout<<" -predict  [num]			 "<<" specify prediction options (default 1)."<<endl;
-		cout<<"          options: 1: predict for individuals with missing phenotypes"<<endl;	
-		cout<<"                   2: predict for individuals with missing phenotypes, and convert the predicted values to probability scale. Use only for files fitted with -bslmm 3 option"<<endl;	
+		cout<<"          options: 1: predict for individuals with missing phenotypes"<<endl;
+		cout<<"                   2: predict for individuals with missing phenotypes, and convert the predicted values to probability scale. Use only for files fitted with -bslmm 3 option"<<endl;
 		cout<<endl;
 	}
-	
+
 	if (option==12) {
+		cout<<" CALC CORRELATION OPTIONS" << endl;
+		cout<<" -calccor       			 "<<endl;
+		cout<<" -windowbp       [num]            "<<" specify the window size based on bp (default 1000000; 1Mb)" << endl;
+		cout<<" -windowcm       [num]            "<<" specify the window size based on cm (default 0; not used)" << endl;
+		cout<<" -windowns       [num]            "<<" specify the window size based on number of snps (default 0; not used)" << endl;
+		cout<<endl;
+	}
+
+	if (option==13) {
 		cout<<" NOTE"<<endl;
 		cout<<" 1. Only individuals with non-missing phenotoypes and covariates will be analyzed."<<endl;
 		cout<<" 2. Missing genotoypes will be repalced with the mean genotype of that SNP."<<endl;
@@ -363,17 +427,29 @@ void GEMMA::PrintHelp(size_t option)
 		cout<<" 5. For bslmm analysis, in addition to 3, memory should be large enough to hold the whole genotype matrix."<<endl;
 		cout<<endl;
 	}
-	
+
 	return;
 }
 
-
+//options
+//gk: 21-22
+//gs: 25-26
+//gq: 27-28
+//eigen: 31-32
+//lmm: 1-5
+//bslmm: 11-13
+//predict: 41-43
+//lm: 51
+//vc: 61
+//ci: 66-67
+//calccor: 71
+//gw: 72
 
 void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 {
 	string str;
-	
-	for(int i = 1; i < argc; i++) {		
+
+	for(int i = 1; i < argc; i++) {
 		if (strcmp(argv[i], "-bfile")==0 || strcmp(argv[i], "--bfile")==0 || strcmp(argv[i], "-b")==0) {
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
 			++i;
@@ -381,6 +457,13 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			str.assign(argv[i]);
 			cPar.file_bfile=str;
 		}
+		else if (strcmp(argv[i], "-mbfile")==0 || strcmp(argv[i], "--mbfile")==0 || strcmp(argv[i], "-mb")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_mbfile=str;
+		}
 		else if (strcmp(argv[i], "-silence")==0) {
 			cPar.mode_silence=true;
 		}
@@ -391,6 +474,13 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			str.assign(argv[i]);
 			cPar.file_geno=str;
 		}
+		else if (strcmp(argv[i], "-mg")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_mgeno=str;
+		}
 		else if (strcmp(argv[i], "-p")==0) {
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
 			++i;
@@ -405,6 +495,42 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			str.assign(argv[i]);
 			cPar.file_anno=str;
 		}
+		// WJA added
+		else if (strcmp(argv[i], "-oxford")==0 || strcmp(argv[i], "--oxford")==0 || strcmp(argv[i], "-x")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_oxford=str;
+		}
+		else if (strcmp(argv[i], "-gxe")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_gxe=str;
+		}
+		else if (strcmp(argv[i], "-widv")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_weight=str;
+		}
+		else if (strcmp(argv[i], "-wsnp")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_wsnp=str;
+		}
+		else if (strcmp(argv[i], "-wcat")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_wcat=str;
+		}
 		else if (strcmp(argv[i], "-k")==0) {
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
 			++i;
@@ -440,6 +566,62 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			str.assign(argv[i]);
 			cPar.file_cvt=str;
 		}
+		else if (strcmp(argv[i], "-cat")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_cat=str;
+		}
+		else if (strcmp(argv[i], "-mcat")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_mcat=str;
+		}
+		else if (strcmp(argv[i], "-beta")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_beta=str;
+		}
+		else if (strcmp(argv[i], "-cor")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_cor=str;
+		}
+		else if (strcmp(argv[i], "-study")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_study=str;
+		}
+		else if (strcmp(argv[i], "-ref")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_ref=str;
+		}
+		else if (strcmp(argv[i], "-mstudy")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_mstudy=str;
+		}
+		else if (strcmp(argv[i], "-mref")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.file_mref=str;
+		}
 		else if (strcmp(argv[i], "-epm")==0) {
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
 			++i;
@@ -447,7 +629,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			str.assign(argv[i]);
 			cPar.file_epm=str;
 		}
-		else if (strcmp(argv[i], "-en")==0) {			
+		else if (strcmp(argv[i], "-en")==0) {
 			while (argv[i+1] != NULL && argv[i+1][0] != '-') {
 				++i;
 				str.clear();
@@ -503,7 +685,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			str.clear();
 			str.assign(argv[i]);
 			cPar.k_mode=atoi(str.c_str());
-		}		
+		}
 		else if (strcmp(argv[i], "-n")==0) {
 			(cPar.p_column).clear();
 			while (argv[i+1] != NULL && argv[i+1][0] != '-') {
@@ -533,7 +715,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			str.clear();
 			str.assign(argv[i]);
 			cPar.file_out=str;
-		}		
+		}
 		else if (strcmp(argv[i], "-miss")==0) {
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
 			++i;
@@ -566,31 +748,101 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			cPar.maf_level=-1;
 		}
 		else if (strcmp(argv[i], "-gk")==0) {
-			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=21; continue;}
 			++i;
 			str.clear();
 			str.assign(argv[i]);
 			cPar.a_mode=20+atoi(str.c_str());
-		}	
+		}
+		else if (strcmp(argv[i], "-gs")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=25; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=24+atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-gq")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=27; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=26+atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-gw")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=72; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=71+atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-sample")==0) {
+		  if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.ni_subsample=atoi(str.c_str());
+		}
 		else if (strcmp(argv[i], "-eigen")==0) {
-			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=31; continue;}
 			++i;
 			str.clear();
 			str.assign(argv[i]);
 			cPar.a_mode=30+atoi(str.c_str());
-		}	
+		}
+        else if (strcmp(argv[i], "-calccor")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=71; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=70+atoi(str.c_str());
+		}
 		else if (strcmp(argv[i], "-vc")==0) {
-			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=61; continue;}
 			++i;
 			str.clear();
 			str.assign(argv[i]);
 			cPar.a_mode=60+atoi(str.c_str());
-		}	
+		}
+		else if (strcmp(argv[i], "-ci")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=66; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=65+atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-pve")==0) {
+		  double s=0;
+		  while (argv[i+1] != NULL && (argv[i+1][0] != '-' || !isalpha(argv[i+1][1]) ) ) {
+			  ++i;
+			  str.clear();
+			  str.assign(argv[i]);
+			  cPar.v_pve.push_back(atof(str.c_str()));
+			  s+=atof(str.c_str());
+			}
+			if (s==1) {
+			  cout<<"summation of pve equals one."<<endl;
+			}
+		}
+		else if (strcmp(argv[i], "-blocks")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.n_block=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-noconstrain")==0) {
+			cPar.noconstrain=true;
+		}
 		else if (strcmp(argv[i], "-lm")==0) {
-			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=51; continue;}
 			++i;
 			str.clear();
@@ -598,7 +850,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			cPar.a_mode=50+atoi(str.c_str());
 		}
 		else if (strcmp(argv[i], "-fa")==0 || strcmp(argv[i], "-lmm")==0) {
-			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=1; continue;}
 			++i;
 			str.clear();
@@ -665,13 +917,21 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			cPar.crt=1;
 		}
 		else if (strcmp(argv[i], "-bslmm")==0) {
-			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=11; continue;}
 			++i;
 			str.clear();
 			str.assign(argv[i]);
 			cPar.a_mode=10+atoi(str.c_str());
 		}
+		else if (strcmp(argv[i], "-ldr")==0) {
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=14; continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.a_mode=13+atoi(str.c_str());
+		}
 		else if (strcmp(argv[i], "-hmin")==0) {
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
 			++i;
@@ -799,25 +1059,46 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar)
 			cPar.n_mh=atoi(str.c_str());
 		}
 		else if (strcmp(argv[i], "-predict")==0) {
-			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;}
+			if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;}
 			if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=41; continue;}
 			++i;
 			str.clear();
 			str.assign(argv[i]);
 			cPar.a_mode=40+atoi(str.c_str());
 		}
+		else if (strcmp(argv[i], "-windowcm")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.window_cm=atof(str.c_str());
+		}
+		else if (strcmp(argv[i], "-windowbp")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.window_bp=atoi(str.c_str());
+		}
+		else if (strcmp(argv[i], "-windowns")==0) {
+			if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;}
+			++i;
+			str.clear();
+			str.assign(argv[i]);
+			cPar.window_ns=atoi(str.c_str());
+		}
 		else {cout<<"error! unrecognized option: "<<argv[i]<<endl; cPar.error=true; continue;}
 	}
-	
+
 	//change prediction mode to 43, if the epm file is not provided
 	if (cPar.a_mode==41 && cPar.file_epm.empty()) {cPar.a_mode=43;}
-	
+
 	return;
 }
 
 
 
-void GEMMA::BatchRun (PARAM &cPar) 
+void GEMMA::BatchRun (PARAM &cPar)
 {
 	clock_t time_begin, time_start;
 	time_begin=clock();
@@ -828,25 +1109,26 @@ void GEMMA::BatchRun (PARAM &cPar)
 	if (cPar.error==true) {cout<<"error! fail to read files. "<<endl; return;}
 	cPar.CheckData();
 	if (cPar.error==true) {cout<<"error! fail to check data. "<<endl; return;}
-	//Prediction for bslmm	
+
+	//Prediction for bslmm
 	if (cPar.a_mode==41 || cPar.a_mode==42) {
 		gsl_vector *y_prdt;
-		
+
 		y_prdt=gsl_vector_alloc (cPar.ni_total-cPar.ni_test);
 
 		//set to zero
 		gsl_vector_set_zero (y_prdt);
-		
+
 		PRDT cPRDT;
 		cPRDT.CopyFromParam(cPar);
-		
+
 		//add breeding value if needed
 		if (!cPar.file_kin.empty() && !cPar.file_ebv.empty()) {
 			cout<<"Adding Breeding Values ... "<<endl;
-			
+
 			gsl_matrix *G=gsl_matrix_alloc (cPar.ni_total, cPar.ni_total);
 			gsl_vector *u_hat=gsl_vector_alloc (cPar.ni_test);
-			
+
 			//read kinship matrix and set u_hat
 			vector<int> indicator_all;
 			size_t c_bv=0;
@@ -854,13 +1136,13 @@ void GEMMA::BatchRun (PARAM &cPar)
 				indicator_all.push_back(1);
 				if (cPar.indicator_bv[i]==1) {gsl_vector_set(u_hat, c_bv, cPar.vec_bv[i]); c_bv++;}
 			}
-			
+
 			ReadFile_kin (cPar.file_kin, indicator_all, cPar.mapID2num, cPar.k_mode, cPar.error, G);
 			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
-			
-			//read u			
-			cPRDT.AddBV(G, u_hat, y_prdt);					
-			
+
+			//read u
+			cPRDT.AddBV(G, u_hat, y_prdt);
+
 			gsl_matrix_free(G);
 			gsl_vector_free(u_hat);
 		}
@@ -872,10 +1154,10 @@ void GEMMA::BatchRun (PARAM &cPar)
 		else {
 			cPRDT.AnalyzeBimbam (y_prdt);
 		}
-		
+
 		//add mu
 		gsl_vector_add_constant(y_prdt, cPar.pheno_mean);
-		
+
 		//convert y to probability if needed
 		if (cPar.a_mode==42) {
 			double d;
@@ -885,51 +1167,51 @@ void GEMMA::BatchRun (PARAM &cPar)
 				gsl_vector_set(y_prdt, i, d);
 			}
 		}
-			
-			
+
+
 		cPRDT.CopyToParam(cPar);
-		
+
 		cPRDT.WriteFiles(y_prdt);
-		
+
 		gsl_vector_free(y_prdt);
 	}
-	
-	
+
+
 	//Prediction with kinship matrix only; for one or more phenotypes
 	if (cPar.a_mode==43) {
-		//first, use individuals with full phenotypes to obtain estimates of Vg and Ve		
+		//first, use individuals with full phenotypes to obtain estimates of Vg and Ve
 		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
-		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);		
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
 		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1);
-		gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); 
+		gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1);
 		gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2);
 		gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2);
 		gsl_vector *eval=gsl_vector_alloc (Y->size1);
-		
+
 		gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph);
 		gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt);
 		//set covariates matrix W and phenotype matrix Y
-		//an intercept should be included in W, 
+		//an intercept should be included in W,
 		cPar.CopyCvtPhen (W, Y, 0);
 		cPar.CopyCvtPhen (W_full, Y_full, 1);
-				
-		gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph);		
-		gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1);		
+
+		gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph);
+		gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1);
 		gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2);
-				
+
 		//read relatedness matrix G, and matrix G_full
 		ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
 		if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
 		ReadFile_kin (cPar.file_kin, cPar.indicator_cvt, cPar.mapID2num, cPar.k_mode, cPar.error, G_full);
 		if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
-				
+
 		//center matrix G
 		CenterMatrix (G);
 		CenterMatrix (G_full);
-		
+
 		//eigen-decomposition and calculate trace_G
 		cout<<"Start Eigen-Decomposition..."<<endl;
-		time_start=clock();	
+		time_start=clock();
 		cPar.trace_G=EigenDecomp (G, U, eval, 0);
 		cPar.trace_G=0.0;
 		for (size_t i=0; i<eval->size; i++) {
@@ -937,8 +1219,8 @@ void GEMMA::BatchRun (PARAM &cPar)
 			cPar.trace_G+=gsl_vector_get (eval, i);
 		}
 		cPar.trace_G/=(double)eval->size;
-		cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
-		
+		cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
 		//calculate UtW and Uty
 		CalcUtX (U, W, UtW);
 		CalcUtX (U, Y, UtY);
@@ -948,7 +1230,7 @@ void GEMMA::BatchRun (PARAM &cPar)
 		if (cPar.n_ph==1) {
 			gsl_vector *beta=gsl_vector_alloc (W->size2);
 			gsl_vector *se_beta=gsl_vector_alloc (W->size2);
-			
+
 			double lambda, logl, vg, ve;
 			gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
 
@@ -959,29 +1241,29 @@ void GEMMA::BatchRun (PARAM &cPar)
 			cout<<"REMLE estimate for vg in the null model = "<<vg<<endl;
 			cout<<"REMLE estimate for ve in the null model = "<<ve<<endl;
 			cPar.vg_remle_null=vg; cPar.ve_remle_null=ve;
-			
+
 			//obtain Y_hat from fixed effects
-			gsl_vector_view Yhat_col=gsl_matrix_column (Y_hat, 0);			
+			gsl_vector_view Yhat_col=gsl_matrix_column (Y_hat, 0);
 			gsl_blas_dgemv (CblasNoTrans, 1.0, W_full, beta, 0.0, &Yhat_col.vector);
-			
+
 			//obtain H
 			gsl_matrix_set_identity (H_full);
 			gsl_matrix_scale (H_full, ve);
 			gsl_matrix_scale (G_full, vg);
 			gsl_matrix_add (H_full, G_full);
-			
-			//free matrices			
+
+			//free matrices
 			gsl_vector_free(beta);
 			gsl_vector_free(se_beta);
-		} else {			
+		} else {
 			gsl_matrix *Vg=gsl_matrix_alloc (cPar.n_ph, cPar.n_ph);
 			gsl_matrix *Ve=gsl_matrix_alloc (cPar.n_ph, cPar.n_ph);
 			gsl_matrix *B=gsl_matrix_alloc (cPar.n_ph, W->size2);
 			gsl_matrix *se_B=gsl_matrix_alloc (cPar.n_ph, W->size2);
-			
+
 			//obtain estimates
 			CalcMvLmmVgVeBeta (eval, UtW, UtY, cPar.em_iter, cPar.nr_iter, cPar.em_prec, cPar.nr_prec, cPar.l_min, cPar.l_max, cPar.n_region, Vg, Ve, B, se_B);
-			
+
 			cout<<"REMLE estimate for Vg in the null model: "<<endl;
 			for (size_t i=0; i<Vg->size1; i++) {
 				for (size_t j=0; j<=i; j++) {
@@ -1004,110 +1286,250 @@ void GEMMA::BatchRun (PARAM &cPar)
 					cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) );
 				}
 			}
-			
+
 			//obtain Y_hat from fixed effects
 			gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat);
-			
+
 			//obtain H
 			KroneckerSym(G_full, Vg, H_full);
 			for (size_t i=0; i<G_full->size1; i++) {
 				gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2);
 				gsl_matrix_add (&H_sub.matrix, Ve);
 			}
-			
-			//free matrices					
+
+			//free matrices
 			gsl_matrix_free (Vg);
 			gsl_matrix_free (Ve);
 			gsl_matrix_free (B);
 			gsl_matrix_free (se_B);
 		}
-					
+
 		PRDT cPRDT;
-		
+
 		cPRDT.CopyFromParam(cPar);
-		
+
 		cout<<"Predicting Missing Phentypes ... "<<endl;
-		time_start=clock();	
+		time_start=clock();
 		cPRDT.MvnormPrdt(Y_hat, H_full, Y_full);
-		cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+		cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 
 		cPRDT.WriteFiles(Y_full);
-		
+
 		gsl_matrix_free(Y);
-		gsl_matrix_free(W);		
+		gsl_matrix_free(W);
 		gsl_matrix_free(G);
-		gsl_matrix_free(U); 
+		gsl_matrix_free(U);
 		gsl_matrix_free(UtW);
 		gsl_matrix_free(UtY);
 		gsl_vector_free(eval);
-		
+
 		gsl_matrix_free(Y_full);
 		gsl_matrix_free(Y_hat);
 		gsl_matrix_free(W_full);
-		gsl_matrix_free(G_full);		
+		gsl_matrix_free(G_full);
 		gsl_matrix_free(H_full);
 	}
-	
-	
+
+
 	//Generate Kinship matrix
-	if (cPar.a_mode==21 || cPar.a_mode==22) {  
+	if (cPar.a_mode==21 || cPar.a_mode==22) {
 		cout<<"Calculating Relatedness Matrix ... "<<endl;
-		
+
 		gsl_matrix *G=gsl_matrix_alloc (cPar.ni_total, cPar.ni_total);
-		
+
 		time_start=clock();
 		cPar.CalcKin (G);
 		cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 		if (cPar.error==true) {cout<<"error! fail to calculate relatedness matrix. "<<endl; return;}
-		
+
 		if (cPar.a_mode==21) {
 			cPar.WriteMatrix (G, "cXX");
 		} else {
 			cPar.WriteMatrix (G, "sXX");
 		}
-		
+
 		gsl_matrix_free (G);
 	}
-	
-	
+
+	//Compute the LDSC weights (not implemented yet)
+	if (cPar.a_mode==72) {
+		cout<<"Calculating Weights ... "<<endl;
+
+		VARCOV cVarcov;
+		cVarcov.CopyFromParam(cPar);
+
+		if (!cPar.file_bfile.empty()) {
+		  cVarcov.AnalyzePlink ();
+		} else {
+		  cVarcov.AnalyzeBimbam ();
+		}
+
+		cVarcov.CopyToParam(cPar);
+	}
+
+
+	//Compute the S matrix (and its variance), that is used for variance component estimation using summary statistics
+	if (cPar.a_mode==25 || cPar.a_mode==26) {
+	  cout<<"Calculating the S Matrix ... "<<endl;
+
+	  gsl_matrix *S=gsl_matrix_alloc (cPar.n_vc*2, cPar.n_vc);
+	  gsl_vector *ns=gsl_vector_alloc (cPar.n_vc+1);
+	  gsl_matrix_set_zero(S);
+	  gsl_vector_set_zero(ns);
+
+	  gsl_matrix_view S_mat=gsl_matrix_submatrix(S, 0, 0, cPar.n_vc, cPar.n_vc);
+	  gsl_matrix_view Svar_mat=gsl_matrix_submatrix (S, cPar.n_vc, 0, cPar.n_vc, cPar.n_vc);
+	  gsl_vector_view ns_vec=gsl_vector_subvector(ns, 0, cPar.n_vc);
+
+	  gsl_matrix *K=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test);
+	  gsl_matrix *A=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test);
+	  gsl_matrix_set_zero (K);
+	  gsl_matrix_set_zero (A);
+
+	  gsl_vector *y=gsl_vector_alloc (cPar.ni_test);
+	  gsl_matrix *W=gsl_matrix_alloc (cPar.ni_test, cPar.n_cvt);
+
+	  cPar.CopyCvtPhen (W, y, 0);
+
+	  set<string> setSnps_beta;
+	  map <string, double> mapRS2wA, mapRS2wK;
+
+	  cPar.ObtainWeight(setSnps_beta, mapRS2wK);
+
+	  time_start=clock();
+	  cPar.CalcS (mapRS2wA, mapRS2wK, W, A, K, &S_mat.matrix, &Svar_mat.matrix, &ns_vec.vector);
+	  cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	  if (cPar.error==true) {cout<<"error! fail to calculate the S matrix. "<<endl; return;}
+
+	  gsl_vector_set (ns, cPar.n_vc, cPar.ni_test);
+
+	  cPar.WriteMatrix (S, "S");
+	  cPar.WriteVector (ns, "size");
+	  cPar.WriteVar ("snps");
+	  /*
+	  cout<<scientific;
+	  for (size_t i=0; i<cPar.n_vc; i++) {
+            for (size_t j=0; j<cPar.n_vc; j++) {
+	      cout<<gsl_matrix_get(S, i, j)<<" ";
+            }
+            cout<<endl;
+	  }
+
+	  for (size_t i=cPar.n_vc; i<cPar.n_vc*2; i++) {
+            for (size_t j=0; j<cPar.n_vc; j++) {
+	      cout<<gsl_matrix_get(S, i, j)<<" ";
+            }
+            cout<<endl;
+	  }
+	  */
+	  gsl_matrix_free (S);
+	  gsl_vector_free (ns);
+
+	  gsl_matrix_free (A);
+	  gsl_matrix_free (K);
+
+	  gsl_vector_free (y);
+	  gsl_matrix_free (W);	//release W; K is already freed above and must not be freed twice
+	}
+
+	//Compute the q vector, that is used for variance component estimation using summary statistics
+	if (cPar.a_mode==27 || cPar.a_mode==28) {
+	  gsl_matrix *Vq=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc);
+	  gsl_vector *q=gsl_vector_alloc (cPar.n_vc);
+	  gsl_vector *s=gsl_vector_alloc (cPar.n_vc+1);
+	  gsl_vector_set_zero (q);
+	  gsl_vector_set_zero (s);
+
+	  gsl_vector_view s_vec=gsl_vector_subvector(s, 0, cPar.n_vc);
+
+	  vector<size_t> vec_cat, vec_ni;
+	  vector<double> vec_weight, vec_z2;
+	  map<string, double> mapRS2weight;
+	  mapRS2weight.clear();
+
+	  time_start=clock();
+	  ReadFile_beta (cPar.file_beta, cPar.mapRS2cat, mapRS2weight, vec_cat, vec_ni, vec_weight, vec_z2, cPar.ni_total, cPar.ns_total, cPar.ns_test);
+	  cout<<"## number of total individuals = "<<cPar.ni_total<<endl;
+	  cout<<"## number of total SNPs = "<<cPar.ns_total<<endl;
+	  cout<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	  cout<<"## number of variance components = "<<cPar.n_vc<<endl;
+	  cout<<"Calculating the q vector ... "<<endl;
+	  Calcq (cPar.n_block, vec_cat, vec_ni, vec_weight, vec_z2, Vq, q, &s_vec.vector);
+	  cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+	  if (cPar.error==true) {cout<<"error! fail to calculate the q vector. "<<endl; return;}
+
+	  gsl_vector_set (s, cPar.n_vc, cPar.ni_total);
+
+	  cPar.WriteMatrix (Vq, "Vq");
+	  cPar.WriteVector (q, "q");
+	  cPar.WriteVector (s, "size");
+	  /*
+	  for (size_t i=0; i<cPar.n_vc; i++) {
+	    cout<<gsl_vector_get(q, i)<<endl;
+	  }
+	  */
+	  gsl_matrix_free (Vq);
+	  gsl_vector_free (q);
+	  gsl_vector_free (s);
+	}
+
+
+    //Calculate SNP covariance
+	if (cPar.a_mode==71) {
+	  VARCOV cVarcov;
+	  cVarcov.CopyFromParam(cPar);
+
+	  if (!cPar.file_bfile.empty()) {
+            cVarcov.AnalyzePlink ();
+	  } else {
+            cVarcov.AnalyzeBimbam ();
+	  }
+
+	  cVarcov.CopyToParam(cPar);
+	}
+
+
 	//LM
 	if (cPar.a_mode==51 || cPar.a_mode==52 || cPar.a_mode==53 || cPar.a_mode==54) {  //Fit LM
 		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
-		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);	
-		
-		//set covariates matrix W and phenotype matrix Y		
-		//an intercept should be included in W, 
+		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
+
+		//set covariates matrix W and phenotype matrix Y
+		//an intercept should be included in W,
 		cPar.CopyCvtPhen (W, Y, 0);
-		
+
 		//Fit LM or mvLM
-		if (cPar.n_ph==1) {			
+		if (cPar.n_ph==1) {
 			LM cLm;
 			cLm.CopyFromParam(cPar);
-			
+
 			gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
-			
-			if (!cPar.file_gene.empty()) {		
+
+			if (!cPar.file_gene.empty()) {
 				cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype
 			} else if (!cPar.file_bfile.empty()) {
 				cLm.AnalyzePlink (W, &Y_col.vector);
+			} else if (!cPar.file_oxford.empty()) {
+				cLm.Analyzebgen (W, &Y_col.vector);
 			} else {
 				cLm.AnalyzeBimbam (W, &Y_col.vector);
 			}
-			
+
 			cLm.WriteFiles();
 			cLm.CopyToParam(cPar);
 		}
 		/*
-		else {			 
+		else {
 			MVLM cMvlm;
-			cMvlm.CopyFromParam(cPar);			
-			
+			cMvlm.CopyFromParam(cPar);
+
 			if (!cPar.file_bfile.empty()) {
 				cMvlm.AnalyzePlink (W, Y);
 			} else {
 				cMvlm.AnalyzeBimbam (W, Y);
 			}
-			
+
 			cMvlm.WriteFiles();
 			cMvlm.CopyToParam(cPar);
 		}
@@ -1115,27 +1537,202 @@ void GEMMA::BatchRun (PARAM &cPar)
 		//release all matrices and vectors
 		gsl_matrix_free (Y);
 		gsl_matrix_free (W);
-	} 
+	}
 
 
 	//VC estimation with one or multiple kinship matrices
 	//REML approach only
 	//if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp
-	//for one phenotype only; 
-	if (cPar.a_mode==61) {
+	//for one phenotype only;
+	if (cPar.a_mode==61 || cPar.a_mode==62) {
+	  if (!cPar.file_beta.empty() ) {
+	    //need to obtain a common set of SNPs between beta file and the genotype file; these are saved in mapRS2wA and mapRS2wK
+	    //normalize the weight in mapRS2wK to have an average of one; each element of mapRS2wA is 1
+	    //update indicator_snps, so that the numbers are in accordance with mapRS2wK
+	    set<string> setSnps_beta;
+	    ReadFile_snps_header (cPar.file_beta, setSnps_beta);
+
+	    map <string, double> mapRS2wA, mapRS2wK;
+	    cPar.ObtainWeight(setSnps_beta, mapRS2wK);
+
+	    cPar.UpdateSNP (mapRS2wK);
+
+	    //setup matrices and vectors
+	    gsl_matrix *S=gsl_matrix_alloc (cPar.n_vc*2, cPar.n_vc);
+	    gsl_matrix *Vq=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc);
+	    gsl_vector *q=gsl_vector_alloc (cPar.n_vc);
+	    gsl_vector *s=gsl_vector_alloc (cPar.n_vc+1);
+
+	    gsl_matrix *K=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test);
+	    gsl_matrix *A=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test);
+
+	    gsl_vector *y=gsl_vector_alloc (cPar.ni_test);
+	    gsl_matrix *W=gsl_matrix_alloc (cPar.ni_test, cPar.n_cvt);
+
+	    gsl_matrix_set_zero (K);
+	    gsl_matrix_set_zero (A);
+
+	    gsl_matrix_set_zero(S);
+	    gsl_matrix_set_zero(Vq);
+	    gsl_vector_set_zero (q);
+	    gsl_vector_set_zero (s);
+
+	    cPar.CopyCvtPhen (W, y, 0);
+
+	    gsl_matrix_view S_mat=gsl_matrix_submatrix(S, 0, 0, cPar.n_vc, cPar.n_vc);
+	    gsl_matrix_view Svar_mat=gsl_matrix_submatrix (S, cPar.n_vc, 0, cPar.n_vc, cPar.n_vc);
+	    gsl_vector_view s_vec=gsl_vector_subvector(s, 0, cPar.n_vc);
+
+	    vector<size_t> vec_cat, vec_ni;
+	    vector<double> vec_weight, vec_z2;
+
+	    //read beta, based on the mapRS2wK
+	    ReadFile_beta (cPar.file_beta, cPar.mapRS2cat, mapRS2wK, vec_cat, vec_ni, vec_weight, vec_z2, cPar.ni_study, cPar.ns_study, cPar.ns_test);
+
+	    cout<<"Study Panel: "<<endl;
+	    cout<<"## number of total individuals = "<<cPar.ni_study<<endl;
+	    cout<<"## number of total SNPs = "<<cPar.ns_study<<endl;
+	    cout<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	    cout<<"## number of variance components = "<<cPar.n_vc<<endl;
+
+	    //compute q
+	    Calcq (cPar.n_block, vec_cat, vec_ni, vec_weight, vec_z2, Vq, q, &s_vec.vector);
+
+	    //compute S
+	    time_start=clock();
+	    cPar.CalcS (mapRS2wA, mapRS2wK, W, A, K, &S_mat.matrix, &Svar_mat.matrix, &s_vec.vector);
+	    cPar.time_G+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	    if (cPar.error==true) {cout<<"error! fail to calculate the S matrix. "<<endl; return;}
+
+	    //compute vc estimates
+	    CalcVCss(Vq, &S_mat.matrix, &Svar_mat.matrix, q, &s_vec.vector, cPar.ni_study, cPar.v_pve, cPar.v_se_pve, cPar.pve_total, cPar.se_pve_total, cPar.v_sigma2, cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich);
+
+	    //if LDSC weights, then compute the weights and run the above steps again
+	    if (cPar.a_mode==62) {
+	      //compute the weights and normalize the weights for A
+	      cPar.UpdateWeight (1, mapRS2wK, cPar.ni_study, &s_vec.vector, mapRS2wA);
+
+	      //read beta file again, and update weigths vector
+	      ReadFile_beta (cPar.file_beta, cPar.mapRS2cat, mapRS2wA, vec_cat, vec_ni, vec_weight, vec_z2, cPar.ni_study, cPar.ns_total, cPar.ns_test);
+
+	      //compute q
+	      Calcq (cPar.n_block, vec_cat, vec_ni, vec_weight, vec_z2, Vq, q, &s_vec.vector);
+
+	      //compute S
+	      time_start=clock();
+	      cPar.CalcS (mapRS2wA, mapRS2wK, W, A, K, &S_mat.matrix, &Svar_mat.matrix, &s_vec.vector);
+	      cPar.time_G+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+	      if (cPar.error==true) {cout<<"error! fail to calculate the S matrix. "<<endl; return;}
+
+	      //compute vc estimates
+	      CalcVCss(Vq, &S_mat.matrix, &Svar_mat.matrix, q, &s_vec.vector, cPar.ni_study, cPar.v_pve, cPar.v_se_pve, cPar.pve_total, cPar.se_pve_total, cPar.v_sigma2, cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich);
+	    }
+
+	    gsl_vector_set (s, cPar.n_vc, cPar.ni_test);
+
+	    cPar.WriteMatrix (S, "S");
+	    cPar.WriteMatrix (Vq, "Vq");
+	    cPar.WriteVector (q, "q");
+	    cPar.WriteVector (s, "size");
+
+	    gsl_matrix_free (S);
+	    gsl_matrix_free (Vq);
+	    gsl_vector_free (q);
+	    gsl_vector_free (s);
+
+	    gsl_matrix_free (A);
+	    gsl_matrix_free (K);
+	    gsl_vector_free (y);
+	    gsl_matrix_free (W);
+	  } else if (!cPar.file_study.empty() || !cPar.file_mstudy.empty()) {
+	    if (!cPar.file_study.empty()) {
+	      string sfile=cPar.file_study+".size.txt";
+	      CountFileLines (sfile, cPar.n_vc);
+	    } else {
+	      string file_name;
+	      igzstream infile (cPar.file_mstudy.c_str(), igzstream::in);
+	      if (!infile) {cout<<"error! fail to open mstudy file: "<<cPar.file_mstudy<<endl; return;}
+	      //only the first line of the mstudy file is read; it is used as the prefix of the .size.txt file
+	      safeGetline(infile, file_name);
+
+	      infile.clear();
+	      infile.close();
+
+	      string sfile=file_name+".size.txt";
+	      CountFileLines (sfile, cPar.n_vc);
+	    }
+
+	    cPar.n_vc=cPar.n_vc-1;
+
+	    gsl_matrix *S=gsl_matrix_alloc (2*cPar.n_vc, cPar.n_vc);
+	    gsl_matrix *Vq=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc);
+	    //gsl_matrix *V=gsl_matrix_alloc (cPar.n_vc+1, (cPar.n_vc*(cPar.n_vc+1))/2*(cPar.n_vc+1) );
+	    //gsl_matrix *Vslope=gsl_matrix_alloc (n_lines+1, (n_lines*(n_lines+1))/2*(n_lines+1) );
+	    gsl_vector *q=gsl_vector_alloc (cPar.n_vc);
+	    gsl_vector *s_study=gsl_vector_alloc (cPar.n_vc);
+	    gsl_vector *s_ref=gsl_vector_alloc (cPar.n_vc);
+	    gsl_vector *s=gsl_vector_alloc (cPar.n_vc+1);
+
+	    gsl_matrix_set_zero(S);
+	    gsl_matrix_view S_mat=gsl_matrix_submatrix(S, 0, 0, cPar.n_vc, cPar.n_vc);
+	    gsl_matrix_view Svar_mat=gsl_matrix_submatrix (S, cPar.n_vc, 0, cPar.n_vc, cPar.n_vc);
+
+	    gsl_matrix_set_zero(Vq);
+	    //gsl_matrix_set_zero(V);
+	    //gsl_matrix_set_zero(Vslope);
+	    gsl_vector_set_zero(q);
+	    gsl_vector_set_zero(s_study);
+	    gsl_vector_set_zero(s_ref);
+
+	    if (!cPar.file_study.empty()) {
+	      ReadFile_study(cPar.file_study, Vq, q, s_study, cPar.ni_study);
+	    } else {
+	      ReadFile_mstudy(cPar.file_mstudy, Vq, q, s_study, cPar.ni_study);
+	    }
+
+	    if (!cPar.file_ref.empty()) {
+	      ReadFile_ref(cPar.file_ref, &S_mat.matrix, &Svar_mat.matrix, s_ref, cPar.ni_ref);
+	    } else {
+	      ReadFile_mref(cPar.file_mref, &S_mat.matrix, &Svar_mat.matrix, s_ref, cPar.ni_ref);
+	    }
+
+	    cout<<"## number of variance components = "<<cPar.n_vc<<endl;
+	    cout<<"## number of individuals in the sample = "<<cPar.ni_study<<endl;
+	    cout<<"## number of individuals in the reference = "<<cPar.ni_ref<<endl;
+
+	    CalcVCss(Vq, &S_mat.matrix, &Svar_mat.matrix, q, s_study, cPar.ni_study, cPar.v_pve, cPar.v_se_pve, cPar.pve_total, cPar.se_pve_total, cPar.v_sigma2, cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich);
+
+	    gsl_vector_view s_sub=gsl_vector_subvector (s, 0, cPar.n_vc);
+	    gsl_vector_memcpy (&s_sub.vector, s_ref);
+	    gsl_vector_set (s, cPar.n_vc, cPar.ni_ref);
+
+	    cPar.WriteMatrix (S, "S");
+	    cPar.WriteMatrix (Vq, "Vq");
+	    cPar.WriteVector (q, "q");
+	    cPar.WriteVector (s, "size");
+
+	    gsl_matrix_free (S);
+	    gsl_matrix_free (Vq);
+	    //gsl_matrix_free (V);
+	    //gsl_matrix_free (Vslope);
+	    gsl_vector_free (q);
+	    gsl_vector_free (s_study);
+	    gsl_vector_free (s_ref);
+	    gsl_vector_free (s);
+	  } else {
 		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
 		gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt);
 		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc );
 
-		//set covariates matrix W and phenotype matrix Y		
-		//an intercept should be included in W, 
+		//set covariates matrix W and phenotype matrix Y
+		//an intercept should be included in W,
 		cPar.CopyCvtPhen (W, Y, 0);
 
 		//read kinship matrices
 		if (!(cPar.file_mk).empty()) {
 		  ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
 		  if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
-	
+
 		  //center matrix G, and obtain v_traceG
 		  double d=0;
 		  (cPar.v_traceG).clear();
@@ -1152,7 +1749,7 @@ void GEMMA::BatchRun (PARAM &cPar)
 		} else if (!(cPar.file_kin).empty()) {
 			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
 			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
-						
+
 			//center matrix G
 			CenterMatrix (G);
 
@@ -1167,8 +1764,8 @@ void GEMMA::BatchRun (PARAM &cPar)
 			/*
 			//eigen-decomposition and calculate trace_G
 			cout<<"Start Eigen-Decomposition..."<<endl;
-			time_start=clock();	
-	
+			time_start=clock();
+
 			if (cPar.a_mode==31) {
 				cPar.trace_G=EigenDecomp (G, U, eval, 1);
 			} else {
@@ -1182,14 +1779,14 @@ void GEMMA::BatchRun (PARAM &cPar)
 			}
 			cPar.trace_G/=(double)eval->size;
 
-			cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+			cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 		} else {
 			ReadFile_eigenU (cPar.file_ku, cPar.error, U);
 			if (cPar.error==true) {cout<<"error! fail to read the U file. "<<endl; return;}
-			
-			ReadFile_eigenD (cPar.file_kd, cPar.error, eval);			
+
+			ReadFile_eigenD (cPar.file_kd, cPar.error, eval);
 			if (cPar.error==true) {cout<<"error! fail to read the D file. "<<endl; return;}
-			
+
 			cPar.trace_G=0.0;
 			for (size_t i=0; i<eval->size; i++) {
 				if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);}
@@ -1202,7 +1799,7 @@ void GEMMA::BatchRun (PARAM &cPar)
 		if (cPar.n_ph==1) {
 		  //		  if (cPar.n_vc==1) {
 		    /*
-		    //calculate UtW and Uty	
+		    //calculate UtW and Uty
 		    CalcUtX (U, W, UtW);
 		    CalcUtX (U, Y, UtY);
 
@@ -1228,10 +1825,10 @@ void GEMMA::BatchRun (PARAM &cPar)
 		      cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) );
 		      cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) );
 		    }
-				
+
 		    CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
 		    cPar.PrintSummary();
-				
+
 		    //calculate and output residuals
 		    if (cPar.a_mode==5) {
 		      gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1);
@@ -1239,11 +1836,11 @@ void GEMMA::BatchRun (PARAM &cPar)
 		      gsl_vector *u_hat=gsl_vector_alloc (Y->size1);
 		      gsl_vector *e_hat=gsl_vector_alloc (Y->size1);
 		      gsl_vector *y_hat=gsl_vector_alloc (Y->size1);
-					
+
 		      //obtain Utu and Ute
 		      gsl_vector_memcpy (y_hat, &UtY_col.vector);
 		      gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat);
-		      
+
 		      double d, u, e;
 		      for (size_t i=0; i<eval->size; i++) {
 			d=gsl_vector_get (eval, i);
@@ -1252,37 +1849,210 @@ void GEMMA::BatchRun (PARAM &cPar)
 			gsl_vector_set (Utu_hat, i, u);
 			gsl_vector_set (Ute_hat, i, e);
 		      }
-					
+
 		      //obtain u and e
 		      gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat);
 		      gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat);
-		      
-		      //output residuals					
+
+		      //output residuals
 		      cPar.WriteVector(u_hat, "residU");
 		      cPar.WriteVector(e_hat, "residE");
-		      
+
 		      gsl_vector_free(u_hat);
 		      gsl_vector_free(e_hat);
 		      gsl_vector_free(y_hat);
-		    }	
-*/	
+		    }
+*/
 		  //		  } else {
 		    gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
 		    VC cVc;
-		    cVc.CopyFromParam(cPar); 
-		    cVc.CalcVCreml (G, W, &Y_col.vector);			
+		    cVc.CopyFromParam(cPar);
+		    if (cPar.a_mode==61) {
+		      cVc.CalcVChe (G, W, &Y_col.vector);
+		    } else {
+		      cVc.CalcVCreml (cPar.noconstrain, G, W, &Y_col.vector);
+		    }
 		    cVc.CopyToParam(cPar);
-
 		    //obtain pve from sigma2
 		    //obtain se_pve from se_sigma2
-		    
+
 		    //}
-		} 
+		}
+	  }
+
+	}
+
 
-		
+	//compute confidence intervals with additional summary statistics
+	//we do not check the sign of z-scores here, but they have to be matched with the genotypes
+	if (cPar.a_mode==66 || cPar.a_mode==67) {
+	  //read reference file first
+	  gsl_matrix *S=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc);
+	  gsl_matrix *Svar=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc);
+	  gsl_vector *s_ref=gsl_vector_alloc (cPar.n_vc);
+
+	  gsl_matrix_set_zero(S);
+	  gsl_matrix_set_zero(Svar);
+	  gsl_vector_set_zero(s_ref);
+
+	  if (!cPar.file_ref.empty()) {
+	    ReadFile_ref(cPar.file_ref, S, Svar, s_ref, cPar.ni_ref);
+	  } else {
+	    ReadFile_mref(cPar.file_mref, S, Svar, s_ref, cPar.ni_ref);
+	  }
+
+	  //need to obtain a common set of SNPs between beta file and the genotype file; these are saved in mapRS2wA and mapRS2wK
+	  //normalize the weight in mapRS2wK to have an average of one; each element of mapRS2wA is 1
+	  set<string> setSnps_beta;
+	  ReadFile_snps_header (cPar.file_beta, setSnps_beta);
+
+	  //obtain the weights for wA, which contains the SNP weights for SNPs used in the model
+	  map <string, double> mapRS2wK;
+	  cPar.ObtainWeight(setSnps_beta, mapRS2wK);
+
+	  //set up matrices and vector
+	  gsl_matrix *Xz=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc);
+	  gsl_matrix *XWz=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc);
+	  gsl_matrix *XtXWz=gsl_matrix_alloc (mapRS2wK.size(), cPar.n_vc*cPar.n_vc);
+	  gsl_vector *w=gsl_vector_alloc (mapRS2wK.size());
+	  gsl_vector *w1=gsl_vector_alloc (mapRS2wK.size());
+	  gsl_vector *z=gsl_vector_alloc (mapRS2wK.size());
+	  gsl_vector *s_vec=gsl_vector_alloc (cPar.n_vc);
+
+	  vector<size_t> vec_cat, vec_size;
+	  vector<double> vec_z;
+
+	  map <string, double> mapRS2z, mapRS2wA;
+	  map <string, string> mapRS2A1;
+	  string file_str;
+
+	  //update s_vec, the number of snps in each category
+	  for (size_t i=0; i<cPar.n_vc; i++) {
+	    vec_size.push_back(0);
+	  }
+
+	  for (map<string, double>::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) {
+	    vec_size[cPar.mapRS2cat[it->first]]++;
+	  }
+
+	  for (size_t i=0; i<cPar.n_vc; i++) {
+	    gsl_vector_set(s_vec, i, vec_size[i]);
+	  }
+
+	  //update mapRS2wA using v_pve and s_vec
+	  if (cPar.a_mode==66) {
+	    for (map<string, double>::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) {
+	      mapRS2wA[it->first]=1;
+	    }
+	  } else {
+	    cPar.UpdateWeight (0, mapRS2wK, cPar.ni_test, s_vec, mapRS2wA);
+	  }
+
+	  //read in z-scores based on allele 0, and save that into a vector
+	  ReadFile_beta (cPar.file_beta, mapRS2wA, mapRS2A1, mapRS2z);
+
+	  //update snp indicator, save weights to w, save z-scores to vec_z, save category label to vec_cat
+	  //sign of z is determined by matching alleles
+	  cPar.UpdateSNPnZ (mapRS2wA, mapRS2A1, mapRS2z, w, z, vec_cat);
+
+	  //compute an n by k matrix of X_iWz
+	  cout<<"Calculating Xz ... "<<endl;
+
+	  gsl_matrix_set_zero(Xz);
+	  gsl_vector_set_all (w1, 1);
+
+	  if (!cPar.file_bfile.empty() ) {
+	    file_str=cPar.file_bfile+".bed";
+	    PlinkXwz (file_str, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, vec_cat, w1, z, 0, Xz);
+	  } else if (!cPar.file_geno.empty()) {
+	    BimbamXwz (cPar.file_geno, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, vec_cat, w1, z, 0, Xz);
+	  } else if (!cPar.file_mbfile.empty() ){
+	    MFILEXwz (1, cPar.file_mbfile, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, vec_cat, w1, z, Xz);
+	  } else if (!cPar.file_mgeno.empty()) {
+	    MFILEXwz (0, cPar.file_mgeno, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, vec_cat, w1, z, Xz);
+	  }
+	  /*
+	  cout<<"Xz: "<<endl;
+	  for (size_t i=0; i<5; i++) {
+	    for (size_t j=0; j<cPar.n_vc; j++) {
+	      cout<<gsl_matrix_get (Xz, i, j)<<" ";
+	    }
+	    cout<<endl;
+	  }
+	  */
+	  if (cPar.a_mode==66) {
+	    gsl_matrix_memcpy (XWz, Xz);
+	  } else if (cPar.a_mode==67) {
+	    cout<<"Calculating XWz ... "<<endl;
+
+	    gsl_matrix_set_zero(XWz);
+
+	    if (!cPar.file_bfile.empty() ) {
+	      file_str=cPar.file_bfile+".bed";
+	      PlinkXwz (file_str, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, vec_cat, w, z, 0, XWz);
+	    } else if (!cPar.file_geno.empty()) {
+	      BimbamXwz (cPar.file_geno, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, vec_cat, w, z, 0, XWz);
+	    } else if (!cPar.file_mbfile.empty() ){
+	      MFILEXwz (1, cPar.file_mbfile, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, vec_cat, w, z, XWz);
+	    } else if (!cPar.file_mgeno.empty()) {
+	      MFILEXwz (0, cPar.file_mgeno, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, vec_cat, w, z, XWz);
+	    }
+	  }
+	  /*
+	  cout<<"XWz: "<<endl;
+	  for (size_t i=0; i<5; i++) {
+	    cout<<gsl_vector_get (w, i)<<endl;
+	    for (size_t j=0; j<cPar.n_vc; j++) {
+	      cout<<gsl_matrix_get (XWz, i, j)<<" ";
+	    }
+	    cout<<endl;
+	  }
+	  */
+	  //compute a p by k matrix of X_j^TWX_iWz
+	  cout<<"Calculating XtXWz ... "<<endl;
+	  gsl_matrix_set_zero(XtXWz);
+
+	  if (!cPar.file_bfile.empty() ) {
+	    file_str=cPar.file_bfile+".bed";
+	    PlinkXtXwz (file_str, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, XWz, 0, XtXWz);
+	  } else if (!cPar.file_geno.empty()) {
+	    BimbamXtXwz (cPar.file_geno, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, XWz, 0, XtXWz);
+	  } else if (!cPar.file_mbfile.empty() ){
+	    MFILEXtXwz (1, cPar.file_mbfile, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, XWz, XtXWz);
+	  } else if (!cPar.file_mgeno.empty()) {
+	    MFILEXtXwz (0, cPar.file_mgeno, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, XWz, XtXWz);
+	  }
+	  /*
+	  cout<<"XtXWz: "<<endl;
+	  for (size_t i=0; i<5; i++) {
+	    for (size_t j=0; j<cPar.n_vc; j++) {
+	      cout<<gsl_matrix_get (XtXWz, i, j)<<" ";
+	    }
+	    cout<<endl;
+	  }
+	  */
+	  //compute confidence intervals
+	  CalcCIss(Xz, XWz, XtXWz, S, Svar, w, z, s_vec, vec_cat, cPar.v_pve, cPar.v_se_pve, cPar.pve_total, cPar.se_pve_total, cPar.v_sigma2, cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich);
+
+	  //write files
+	  //cPar.WriteMatrix (XWz, "XWz");
+	  //cPar.WriteMatrix (XtXWz, "XtXWz");
+	  //cPar.WriteVector (w, "w");
+
+	  gsl_matrix_free(S);
+	  gsl_matrix_free(Svar);
+	  gsl_vector_free(s_ref);
+
+	  gsl_matrix_free(Xz);
+	  gsl_matrix_free(XWz);
+	  gsl_matrix_free(XtXWz);
+	  gsl_vector_free(w);
+	  gsl_vector_free(w1);
+	  gsl_vector_free(z);
+	  gsl_vector_free(s_vec);
 	}
-	
-	
+
+
 	//LMM or mvLMM or Eigen-Decomposition
 	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==31) {  //Fit LMM or mvLMM or eigen
 		gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph);
@@ -1290,33 +2060,62 @@ void GEMMA::BatchRun (PARAM &cPar)
 		gsl_matrix *B=gsl_matrix_alloc (Y->size2, W->size2);	//B is a d by c matrix
 		gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2);
 		gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1);
-		gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); 
+		gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1);
 		gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2);
 		gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2);
 		gsl_vector *eval=gsl_vector_alloc (Y->size1);
-				
-		//set covariates matrix W and phenotype matrix Y		
-		//an intercept should be included in W, 
+		gsl_vector *env=gsl_vector_alloc (Y->size1);
+		gsl_vector *weight=gsl_vector_alloc (Y->size1);
+
+		//set covariates matrix W and phenotype matrix Y
+		//an intercept should be included in W,
 		cPar.CopyCvtPhen (W, Y, 0);
-				
-		//read relatedness matrix G	
+		if (!cPar.file_gxe.empty()) {cPar.CopyGxe (env);}
+
+		//read relatedness matrix G
 		if (!(cPar.file_kin).empty()) {
 			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
 			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
-						
+
 			//center matrix G
 			CenterMatrix (G);
-			
+
+			//if residual weights are provided, then rescale G by the weights
+			if (!cPar.file_weight.empty()) {
+			  cPar.CopyWeight (weight);
+			  double d, wi, wj;
+			  for (size_t i=0; i<G->size1; i++) {
+			    wi=gsl_vector_get(weight, i);
+			    for (size_t j=i; j<G->size2; j++) {
+			      wj=gsl_vector_get(weight, j);
+			      d=gsl_matrix_get(G, i, j);
+			      if (wi<=0 || wj<=0) {d=0;} else {d/=sqrt(wi*wj);}
+			      gsl_matrix_set(G, i, j, d);
+			      if (j!=i) {gsl_matrix_set(G, j, i, d);}
+			    }
+			  }
+			}
+
 			//eigen-decomposition and calculate trace_G
 			cout<<"Start Eigen-Decomposition..."<<endl;
-			time_start=clock();	
-	
+			time_start=clock();
+
 			if (cPar.a_mode==31) {
 				cPar.trace_G=EigenDecomp (G, U, eval, 1);
 			} else {
 				cPar.trace_G=EigenDecomp (G, U, eval, 0);
 			}
 
+			if (!cPar.file_weight.empty()) {
+			  double wi;
+			  for (size_t i=0; i<U->size1; i++) {
+			    wi=gsl_vector_get(weight, i);
+			    if (wi<=0) {wi=0;} else {wi=sqrt(wi);}
+			    gsl_vector_view Urow=gsl_matrix_row (U, i);
+			    gsl_vector_scale (&Urow.vector, wi);
+			  }
+			}
+
 			cPar.trace_G=0.0;
 			for (size_t i=0; i<eval->size; i++) {
 				if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);}
@@ -1324,14 +2123,14 @@ void GEMMA::BatchRun (PARAM &cPar)
 			}
 			cPar.trace_G/=(double)eval->size;
 
-			cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+			cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 		} else {
 			ReadFile_eigenU (cPar.file_ku, cPar.error, U);
 			if (cPar.error==true) {cout<<"error! fail to read the U file. "<<endl; return;}
-			
-			ReadFile_eigenD (cPar.file_kd, cPar.error, eval);			
+
+			ReadFile_eigenD (cPar.file_kd, cPar.error, eval);
 			if (cPar.error==true) {cout<<"error! fail to read the D file. "<<endl; return;}
-			
+
 			cPar.trace_G=0.0;
 			for (size_t i=0; i<eval->size; i++) {
 				if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);}
@@ -1339,14 +2138,29 @@ void GEMMA::BatchRun (PARAM &cPar)
 			}
 			cPar.trace_G/=(double)eval->size;
 		}
-		
+
 		if (cPar.a_mode==31) {
 			cPar.WriteMatrix(U, "eigenU");
 			cPar.WriteVector(eval, "eigenD");
-		} else {
-			//calculate UtW and Uty	
+		} else if (!cPar.file_gene.empty() ) {
+			//calculate UtW and Uty
 			CalcUtX (U, W, UtW);
-			CalcUtX (U, Y, UtY);			
+			CalcUtX (U, Y, UtY);
+
+			LMM cLmm;
+			cLmm.CopyFromParam(cPar);
+
+			gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
+			gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
+
+			cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype
+
+			cLmm.WriteFiles();
+			cLmm.CopyToParam(cPar);
+		} else {
+		  //calculate UtW and Uty
+		  CalcUtX (U, W, UtW);
+		  CalcUtX (U, Y, UtY);
 
 			//calculate REMLE/MLE estimate and pve for univariate model
 			if (cPar.n_ph==1) {
@@ -1372,10 +2186,10 @@ void GEMMA::BatchRun (PARAM &cPar)
 					cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) );
 					cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) );
 				}
-				
+
 				CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
 				cPar.PrintSummary();
-				
+
 				//calculate and output residuals
 				if (cPar.a_mode==5) {
 					gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1);
@@ -1383,11 +2197,11 @@ void GEMMA::BatchRun (PARAM &cPar)
 					gsl_vector *u_hat=gsl_vector_alloc (Y->size1);
 					gsl_vector *e_hat=gsl_vector_alloc (Y->size1);
 					gsl_vector *y_hat=gsl_vector_alloc (Y->size1);
-					
+
 					//obtain Utu and Ute
 					gsl_vector_memcpy (y_hat, &UtY_col.vector);
 					gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat);
-					
+
 					double d, u, e;
 					for (size_t i=0; i<eval->size; i++) {
 						d=gsl_vector_get (eval, i);
@@ -1396,81 +2210,106 @@ void GEMMA::BatchRun (PARAM &cPar)
 						gsl_vector_set (Utu_hat, i, u);
 						gsl_vector_set (Ute_hat, i, e);
 					}
-					
+
 					//obtain u and e
 					gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat);
 					gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat);
-					
-					//output residuals					
+
+					//output residuals
 					cPar.WriteVector(u_hat, "residU");
 					cPar.WriteVector(e_hat, "residE");
-					
+
 					gsl_vector_free(u_hat);
 					gsl_vector_free(e_hat);
 					gsl_vector_free(y_hat);
-				}							
-			} 
-			
+				}
+			}
+
 			//Fit LMM or mvLMM
 			if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) {
-				if (cPar.n_ph==1) {			
+				if (cPar.n_ph==1) {
 					LMM cLmm;
 					cLmm.CopyFromParam(cPar);
-					
+
 					gsl_vector_view Y_col=gsl_matrix_column (Y, 0);
 					gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0);
-					
-					if (!cPar.file_gene.empty()) {		
-						cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype
-					} else if (!cPar.file_bfile.empty()) {
-						cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
-					} else {
-						cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
-					}	
-					
+
+					if (!cPar.file_bfile.empty()) {
+					  if (cPar.file_gxe.empty()) {
+					    cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
+					  } else {
+					    cLmm.AnalyzePlinkGXE (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, env);
+					  }
+					}
+					// WJA added
+				       	else if(!cPar.file_oxford.empty()) {
+					  cLmm.Analyzebgen (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
+					}
+					else {
+					  if (cPar.file_gxe.empty()) {
+					    cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector);
+					  } else {
+					    cLmm.AnalyzeBimbamGXE (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, env);
+					  }
+					}
+
 					cLmm.WriteFiles();
 					cLmm.CopyToParam(cPar);
-				} else {			 
+				} else {
 					MVLMM cMvlmm;
-					cMvlmm.CopyFromParam(cPar);			
-					
+					cMvlmm.CopyFromParam(cPar);
+
 					if (!cPar.file_bfile.empty()) {
-						cMvlmm.AnalyzePlink (U, eval, UtW, UtY);
-					} else {
-						cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY);
+					  if (cPar.file_gxe.empty()) {
+					    cMvlmm.AnalyzePlink (U, eval, UtW, UtY);
+					  } else {
+					    cMvlmm.AnalyzePlinkGXE (U, eval, UtW, UtY, env);
+					  }
+					}
+					else if(!cPar.file_oxford.empty())
+					{
+					    cMvlmm.Analyzebgen (U, eval, UtW, UtY);
+					}
+					else {
+					  if (cPar.file_gxe.empty()) {
+					    cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY);
+					  } else {
+					    cMvlmm.AnalyzeBimbamGXE (U, eval, UtW, UtY, env);
+					  }
 					}
-					
+
 					cMvlmm.WriteFiles();
 					cMvlmm.CopyToParam(cPar);
 				}
 			}
 		}
-		
-				
+
+
 		//release all matrices and vectors
 		gsl_matrix_free (Y);
 		gsl_matrix_free (W);
 		gsl_matrix_free(B);
 		gsl_matrix_free(se_B);
-		gsl_matrix_free (G);	
+		gsl_matrix_free (G);
 		gsl_matrix_free (U);
 		gsl_matrix_free (UtW);
 		gsl_matrix_free (UtY);
 		gsl_vector_free (eval);
-	} 
-	
-	
+		gsl_vector_free (env);
+	}
+
+
 	//BSLMM
 	if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
 		gsl_vector *y=gsl_vector_alloc (cPar.ni_test);
-		gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt);	
+		gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt);
 		gsl_matrix *G=gsl_matrix_alloc (y->size, y->size);
-		gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test);	
-		
-		//set covariates matrix W and phenotype vector y		
-		//an intercept should be included in W, 
+		gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test);
+
+		//set covariates matrix W and phenotype vector y
+		//an intercept should be included in W,
 		cPar.CopyCvtPhen (W, y, 0);
-		
+
 		//center y, even for case/control data
 		cPar.pheno_mean=CenterVector(y);
 
@@ -1482,32 +2321,32 @@ void GEMMA::BatchRun (PARAM &cPar)
 		  //perform BSLMM analysis
 		  BSLMM cBslmm;
 		  cBslmm.CopyFromParam(cPar);
-		  time_start=clock();	
+		  time_start=clock();
 		  cBslmm.MCMC(UtX, y);
 		  cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 		  cBslmm.CopyToParam(cPar);
 		  //else, if rho!=1
 		} else {
-		gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); 
+		gsl_matrix *U=gsl_matrix_alloc (y->size, y->size);
 		gsl_vector *eval=gsl_vector_alloc (y->size);
 		gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2);
 		gsl_vector *Uty=gsl_vector_alloc (y->size);
 
-		
-		//read relatedness matrix G		
-		if (!(cPar.file_kin).empty()) {		
+
+		//read relatedness matrix G
+		if (!(cPar.file_kin).empty()) {
 			cPar.ReadGenotypes (UtX, G, false);
-			
+
 			//read relatedness matrix G
 			ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G);
 			if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;}
-			
+
 			//center matrix G
 			CenterMatrix (G);
 		} else {
 			cPar.ReadGenotypes (UtX, G, true);
 		}
-		
+
 		//eigen-decomposition and calculate trace_G
 		cout<<"Start Eigen-Decomposition..."<<endl;
 		time_start=clock();
@@ -1518,39 +2357,39 @@ void GEMMA::BatchRun (PARAM &cPar)
 			cPar.trace_G+=gsl_vector_get (eval, i);
 		}
 		cPar.trace_G/=(double)eval->size;
-		cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);			
-		
-		//calculate UtW and Uty		
+		cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+		//calculate UtW and Uty
 		CalcUtX (U, W, UtW);
 		CalcUtX (U, y, Uty);
-		
+
 		//calculate REMLE/MLE estimate and pve
 		CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
 		CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
 		CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null);
-		
+
 		cPar.PrintSummary();
-				
+
 		//Creat and calcualte UtX, use a large memory
 		cout<<"Calculating UtX..."<<endl;
-		time_start=clock();							
+		time_start=clock();
 		CalcUtX (U, UtX);
 		cPar.time_UtX=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
-		
+
 		//perform BSLMM analysis
 		BSLMM cBslmm;
 		cBslmm.CopyFromParam(cPar);
-		time_start=clock();	
-		if (cPar.a_mode==12) {  //ridge regression				
+		time_start=clock();
+		if (cPar.a_mode==12) {  //ridge regression
 			cBslmm.RidgeR(U, UtX, Uty, eval, cPar.l_remle_null);
 		} else {	//Run MCMC
 			cBslmm.MCMC(U, UtX, Uty, eval, y);
 		}
 		cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 		cBslmm.CopyToParam(cPar);
-		
+
 		//release all matrices and vectors
-		gsl_matrix_free (G);	
+		gsl_matrix_free (G);
 		gsl_matrix_free (U);
 		gsl_matrix_free (UtW);
 		gsl_vector_free (eval);
@@ -1560,106 +2399,259 @@ void GEMMA::BatchRun (PARAM &cPar)
 		gsl_matrix_free (W);
 		gsl_vector_free (y);
 		gsl_matrix_free (UtX);
-	} 
-	
-	
-		
+	}
+
+
+
+	//LDR
+	if (cPar.a_mode==14) {
+		gsl_vector *y=gsl_vector_alloc (cPar.ni_test);
+		gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt);
+		gsl_matrix *G=gsl_matrix_alloc (1, 1);
+		vector<vector<unsigned char> > Xt;
+
+        	//set covariates matrix W and phenotype vector y
+		//an intercept is included in W
+		cPar.CopyCvtPhen (W, y, 0);
+
+		//read in genotype matrix X
+		cPar.ReadGenotypes (Xt, G, false);
+
+		LDR cLdr;
+		cLdr.CopyFromParam(cPar);
+		time_start=clock();
+
+		cLdr.VB(Xt, W, y);
+
+		cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		cLdr.CopyToParam(cPar);
+
+		gsl_vector_free (y);
+		gsl_matrix_free (W);
+		gsl_matrix_free (G);
+	}
+
 	cPar.time_total=(clock()-time_begin)/(double(CLOCKS_PER_SEC)*60.0);
-	
+
 	return;
 }
 
 
 
 
-void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) 
+void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 {
 	string file_str;
 	file_str=cPar.path_out+"/"+cPar.file_out;
 	file_str+=".log.txt";
-	
+
 	ofstream outfile (file_str.c_str(), ofstream::out);
 	if (!outfile) {cout<<"error writing log file: "<<file_str.c_str()<<endl; return;}
-	
+
 	outfile<<"##"<<endl;
 	outfile<<"## GEMMA Version = "<<version<<endl;
-	
+
 	outfile<<"##"<<endl;
 	outfile<<"## Command Line Input = ";
-	for(int i = 1; i < argc; i++) {	
+	for(int i = 0; i < argc; i++) {
 		outfile<<argv[i]<<" ";
 	}
 	outfile<<endl;
 
 	outfile<<"##"<<endl;
-	time_t  rawtime; 
+	time_t  rawtime;
 	time(&rawtime);
 	tm *ptm = localtime (&rawtime);
 
-	outfile<<"## Date = "<<asctime(ptm)<<endl;
+	outfile<<"## Date = "<<asctime(ptm);
 	  //ptm->tm_year<<":"<<ptm->tm_month<<":"<<ptm->tm_day":"<<ptm->tm_hour<<":"<<ptm->tm_min<<endl;
-	
+
 	outfile<<"##"<<endl;
 	outfile<<"## Summary Statistics:"<<endl;
-	outfile<<"## number of total individuals = "<<cPar.ni_total<<endl;	
-	if (cPar.a_mode==43) {
-		outfile<<"## number of analyzed individuals = "<<cPar.ni_cvt<<endl;
-		outfile<<"## number of individuals with full phenotypes = "<<cPar.ni_test<<endl;
-	} else {
-		outfile<<"## number of analyzed individuals = "<<cPar.ni_test<<endl;
-	}
-	outfile<<"## number of covariates = "<<cPar.n_cvt<<endl;
-	outfile<<"## number of phenotypes = "<<cPar.n_ph<<endl;
-	if (cPar.a_mode==43) {
-		outfile<<"## number of observed data = "<<cPar.np_obs<<endl;
-		outfile<<"## number of missing data = "<<cPar.np_miss<<endl;
-	}
-	if (cPar.a_mode==61) {
-		outfile<<"## number of variance components = "<<cPar.n_vc<<endl;
-	}
-		
-	if (!(cPar.file_gene).empty()) {
-		outfile<<"## number of total genes = "<<cPar.ng_total<<endl;
-		outfile<<"## number of analyzed genes = "<<cPar.ng_test<<endl;		
-	} else if (cPar.file_epm.empty()) {	
-		outfile<<"## number of total SNPs = "<<cPar.ns_total<<endl;	
-		outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	if (!cPar.file_cor.empty() || !cPar.file_study.empty() || !cPar.file_mstudy.empty() ) {
+	  outfile<<"## number of total individuals in the sample = "<<cPar.ni_study<<endl;
+	  outfile<<"## number of total individuals in the reference = "<<cPar.ni_ref<<endl;
+	  //outfile<<"## number of total SNPs in the sample = "<<cPar.ns_study<<endl;
+	  //outfile<<"## number of total SNPs in the reference panel = "<<cPar.ns_ref<<endl;
+	  //outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	  //outfile<<"## number of analyzed SNP pairs = "<<cPar.ns_pair<<endl;
+	  outfile<<"## number of variance components = "<<cPar.n_vc<<endl;
+
+	  outfile<<"## pve estimates = ";
+	    for (size_t i=0; i<cPar.v_pve.size(); i++) {
+	      outfile<<"  "<<cPar.v_pve[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## se(pve) = ";
+	    for (size_t i=0; i<cPar.v_se_pve.size(); i++) {
+	      outfile<<"  "<<cPar.v_se_pve[i];
+	    }
+	    outfile<<endl;
+
+	    if (cPar.n_vc>1) {
+	      outfile<<"## total pve = "<<cPar.pve_total<<endl;
+	      outfile<<"## se(total pve) = "<<cPar.se_pve_total<<endl;
+	    }
+
+	    outfile<<"## sigma2 per snp = ";
+	    for (size_t i=0; i<cPar.v_sigma2.size(); i++) {
+	      outfile<<"  "<<cPar.v_sigma2[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## se(sigma2 per snp) = ";
+	    for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) {
+	      outfile<<"  "<<cPar.v_se_sigma2[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## enrichment = ";
+	    for (size_t i=0; i<cPar.v_enrich.size(); i++) {
+	      outfile<<"  "<<cPar.v_enrich[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## se(enrichment) = ";
+	    for (size_t i=0; i<cPar.v_se_enrich.size(); i++) {
+	      outfile<<"  "<<cPar.v_se_enrich[i];
+	    }
+	    outfile<<endl;
+	} else if (!cPar.file_beta.empty() && (cPar.a_mode==61 || cPar.a_mode==62) ) {
+	  outfile<<"## number of total individuals in the sample = "<<cPar.ni_study<<endl;
+	  outfile<<"## number of total individuals in the reference = "<<cPar.ni_total<<endl;
+	  outfile<<"## number of total SNPs in the sample = "<<cPar.ns_study<<endl;
+	  outfile<<"## number of total SNPs in the reference panel = "<<cPar.ns_total<<endl;
+	  outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	  outfile<<"## number of variance components = "<<cPar.n_vc<<endl;
+	} else if (!cPar.file_beta.empty() && (cPar.a_mode==66 || cPar.a_mode==67) ) {
+	  outfile<<"## number of total individuals in the sample = "<<cPar.ni_total<<endl;
+	  outfile<<"## number of total individuals in the reference = "<<cPar.ni_ref<<endl;
+	  outfile<<"## number of total SNPs in the sample = "<<cPar.ns_total<<endl;
+	  outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	  outfile<<"## number of variance components = "<<cPar.n_vc<<endl;
+
+	  outfile<<"## pve estimates = ";
+	    for (size_t i=0; i<cPar.v_pve.size(); i++) {
+	      outfile<<"  "<<cPar.v_pve[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## se(pve) = ";
+	    for (size_t i=0; i<cPar.v_se_pve.size(); i++) {
+	      outfile<<"  "<<cPar.v_se_pve[i];
+	    }
+	    outfile<<endl;
+
+	    if (cPar.n_vc>1) {
+	      outfile<<"## total pve = "<<cPar.pve_total<<endl;
+	      outfile<<"## se(total pve) = "<<cPar.se_pve_total<<endl;
+	    }
+
+	    outfile<<"## sigma2 per snp = ";
+	    for (size_t i=0; i<cPar.v_sigma2.size(); i++) {
+	      outfile<<"  "<<cPar.v_sigma2[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## se(sigma2 per snp) = ";
+	    for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) {
+	      outfile<<"  "<<cPar.v_se_sigma2[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## enrichment = ";
+	    for (size_t i=0; i<cPar.v_enrich.size(); i++) {
+	      outfile<<"  "<<cPar.v_enrich[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## se(enrichment) = ";
+	    for (size_t i=0; i<cPar.v_se_enrich.size(); i++) {
+	      outfile<<"  "<<cPar.v_se_enrich[i];
+	    }
+	    outfile<<endl;
 	} else {
-		outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	  outfile<<"## number of total individuals = "<<cPar.ni_total<<endl;
+
+	  if (cPar.a_mode==43) {
+	    outfile<<"## number of analyzed individuals = "<<cPar.ni_cvt<<endl;
+	    outfile<<"## number of individuals with full phenotypes = "<<cPar.ni_test<<endl;
+	  } else if (cPar.a_mode!=27 && cPar.a_mode!=28) {
+	    outfile<<"## number of analyzed individuals = "<<cPar.ni_test<<endl;
+	  }
+
+	  outfile<<"## number of covariates = "<<cPar.n_cvt<<endl;
+	  outfile<<"## number of phenotypes = "<<cPar.n_ph<<endl;
+	  if (cPar.a_mode==43) {
+	    outfile<<"## number of observed data = "<<cPar.np_obs<<endl;
+	    outfile<<"## number of missing data = "<<cPar.np_miss<<endl;
+	  }
+	  if (cPar.a_mode==25 || cPar.a_mode==26 || cPar.a_mode==27 || cPar.a_mode==28 || cPar.a_mode==61 || cPar.a_mode==62 || cPar.a_mode==66 || cPar.a_mode==67) {
+	    outfile<<"## number of variance components = "<<cPar.n_vc<<endl;
+	  }
+
+	  if (!(cPar.file_gene).empty()) {
+	    outfile<<"## number of total genes = "<<cPar.ng_total<<endl;
+	    outfile<<"## number of analyzed genes = "<<cPar.ng_test<<endl;
+	  } else if (cPar.file_epm.empty()) {
+	    outfile<<"## number of total SNPs = "<<cPar.ns_total<<endl;
+	    outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	  } else {
+	    outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl;
+	  }
+
+	  if (cPar.a_mode==13) {
+	    outfile<<"## number of cases = "<<cPar.ni_case<<endl;
+	    outfile<<"## number of controls = "<<cPar.ni_control<<endl;
+	  }
 	}
-	
-	if (cPar.a_mode==13) {
-		outfile<<"## number of cases = "<<cPar.ni_case<<endl;
-		outfile<<"## number of controls = "<<cPar.ni_control<<endl;
-	}
-
-
-	if (cPar.a_mode==61) {
-	  //	        outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl;
-		if (cPar.n_ph==1) {
-		  outfile<<"## pve estimate in the null model = ";
-		  for (size_t i=0; i<cPar.v_pve.size(); i++) {
-		    outfile<<"  "<<cPar.v_pve[i];
-		  }
-		  outfile<<endl;
-
-		  outfile<<"## se(pve) in the null model = ";
-		  for (size_t i=0; i<cPar.v_se_pve.size(); i++) {
-		    outfile<<"  "<<cPar.v_se_pve[i];
-		  }
-		  outfile<<endl;
-
-		  outfile<<"## sigma2 estimate in the null model = ";
-		  for (size_t i=0; i<cPar.v_sigma2.size(); i++) {
-		    outfile<<"  "<<cPar.v_sigma2[i];
-		  }
-		  outfile<<endl;
 
-		  outfile<<"## se(sigma2) in the null model = ";
-		  for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) {
-		    outfile<<"  "<<cPar.v_se_sigma2[i];
-		  }
-		  outfile<<endl;
+	if ( (cPar.a_mode==61 || cPar.a_mode==62) && cPar.file_cor.empty() && cPar.file_study.empty() && cPar.file_mstudy.empty() ) {
+	    //	        outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl;
+	  if (cPar.n_ph==1) {
+	    outfile<<"## pve estimates = ";
+	    for (size_t i=0; i<cPar.v_pve.size(); i++) {
+	      outfile<<"  "<<cPar.v_pve[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## se(pve) = ";
+	    for (size_t i=0; i<cPar.v_se_pve.size(); i++) {
+	      outfile<<"  "<<cPar.v_se_pve[i];
+	    }
+	    outfile<<endl;
+
+	    if (cPar.n_vc>1) {
+	      outfile<<"## total pve = "<<cPar.pve_total<<endl;
+	      outfile<<"## se(total pve) = "<<cPar.se_pve_total<<endl;
+	    }
+
+	    outfile<<"## sigma2 estimates = ";
+	    for (size_t i=0; i<cPar.v_sigma2.size(); i++) {
+	      outfile<<"  "<<cPar.v_sigma2[i];
+	    }
+	    outfile<<endl;
+
+	    outfile<<"## se(sigma2) = ";
+	    for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) {
+	      outfile<<"  "<<cPar.v_se_sigma2[i];
+	    }
+	    outfile<<endl;
+
+	    if (!cPar.file_beta.empty() ) {
+	      outfile<<"## enrichment = ";
+	      for (size_t i=0; i<cPar.v_enrich.size(); i++) {
+		outfile<<"  "<<cPar.v_enrich[i];
+	      }
+	      outfile<<endl;
+
+	      outfile<<"## se(enrichment) = ";
+	      for (size_t i=0; i<cPar.v_se_enrich.size(); i++) {
+		outfile<<"  "<<cPar.v_se_enrich[i];
+	      }
+	      outfile<<endl;
+	    }
 		  /*
 			outfile<<"## beta estimate in the null model = ";
 			for (size_t i=0; i<cPar.beta_remle_null.size(); i++) {
@@ -1672,19 +2664,19 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 			}
 			outfile<<endl;
 		  */
-		}
+	  }
 	}
-	
+
 	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
 		outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl;
 		outfile<<"## MLE log-likelihood in the null model = "<<cPar.logl_mle_H0<<endl;
 		if (cPar.n_ph==1) {
 			//outfile<<"## lambda REMLE estimate in the null (linear mixed) model = "<<cPar.l_remle_null<<endl;
-			//outfile<<"## lambda MLE estimate in the null (linear mixed) model = "<<cPar.l_mle_null<<endl;	
+			//outfile<<"## lambda MLE estimate in the null (linear mixed) model = "<<cPar.l_mle_null<<endl;
 			outfile<<"## pve estimate in the null model = "<<cPar.pve_null<<endl;
-			outfile<<"## se(pve) in the null model = "<<cPar.pve_se_null<<endl;	
+			outfile<<"## se(pve) in the null model = "<<cPar.pve_se_null<<endl;
 			outfile<<"## vg estimate in the null model = "<<cPar.vg_remle_null<<endl;
-			outfile<<"## ve estimate in the null model = "<<cPar.ve_remle_null<<endl;	
+			outfile<<"## ve estimate in the null model = "<<cPar.ve_remle_null<<endl;
 			outfile<<"## beta estimate in the null model = ";
 			for (size_t i=0; i<cPar.beta_remle_null.size(); i++) {
 				outfile<<"  "<<cPar.beta_remle_null[i];
@@ -1695,10 +2687,10 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				outfile<<"  "<<cPar.se_beta_remle_null[i];
 			}
 			outfile<<endl;
-			
+
 		} else {
 			size_t c;
-			outfile<<"## REMLE estimate for Vg in the null model: "<<endl;			
+			outfile<<"## REMLE estimate for Vg in the null model: "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<=i; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1706,7 +2698,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				}
 				outfile<<endl;
 			}
-			outfile<<"## se(Vg): "<<endl;	
+			outfile<<"## se(Vg): "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<=i; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1714,7 +2706,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				}
 				outfile<<endl;
 			}
-			outfile<<"## REMLE estimate for Ve in the null model: "<<endl;	
+			outfile<<"## REMLE estimate for Ve in the null model: "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<=i; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1722,7 +2714,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				}
 				outfile<<endl;
 			}
-			outfile<<"## se(Ve): "<<endl;	
+			outfile<<"## se(Ve): "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<=i; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1730,7 +2722,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				}
 				outfile<<endl;
 			}
-			
+
 			outfile<<"## MLE estimate for Vg in the null model: "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<cPar.n_ph; j++) {
@@ -1739,7 +2731,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				}
 				outfile<<endl;
 			}
-			outfile<<"## se(Vg): "<<endl;	
+			outfile<<"## se(Vg): "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<=i; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1747,7 +2739,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				}
 				outfile<<endl;
 			}
-			outfile<<"## MLE estimate for Ve in the null model: "<<endl;	
+			outfile<<"## MLE estimate for Ve in the null model: "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<cPar.n_ph; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1755,7 +2747,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				}
 				outfile<<endl;
 			}
-			outfile<<"## se(Ve): "<<endl;	
+			outfile<<"## se(Ve): "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<=i; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1781,15 +2773,15 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 			}
 		}
 	}
-	
+
 	/*
 	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
 		if (cPar.n_ph==1) {
 			outfile<<"## REMLE vg estimate in the null model = "<<cPar.vg_remle_null<<endl;
-			outfile<<"## REMLE ve estimate in the null model = "<<cPar.ve_remle_null<<endl;	
+			outfile<<"## REMLE ve estimate in the null model = "<<cPar.ve_remle_null<<endl;
 		} else {
 			size_t c;
-			outfile<<"## REMLE estimate for Vg in the null model: "<<endl;			
+			outfile<<"## REMLE estimate for Vg in the null model: "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<=i; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1797,7 +2789,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 				}
 				outfile<<endl;
 			}
-			outfile<<"## REMLE estimate for Ve in the null model: "<<endl;	
+			outfile<<"## REMLE estimate for Ve in the null model: "<<endl;
 			for (size_t i=0; i<cPar.n_ph; i++) {
 				for (size_t j=0; j<=i; j++) {
 					c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j);
@@ -1808,15 +2800,15 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 		}
 	}
 	 */
-	
-	
+
+
 	if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
-		outfile<<"## estimated mean = "<<cPar.pheno_mean<<endl;
+	  outfile<<"## estimated mean = "<<cPar.pheno_mean<<endl;
 	}
-	
-	if (cPar.a_mode==11 || cPar.a_mode==13) {	
+
+	if (cPar.a_mode==11 || cPar.a_mode==13) {
 		outfile<<"##"<<endl;
-		outfile<<"## MCMC related:"<<endl;	
+		outfile<<"## MCMC related:"<<endl;
 		outfile<<"## initial value of h = "<<cPar.cHyp_initial.h<<endl;
 		outfile<<"## initial value of rho = "<<cPar.cHyp_initial.rho<<endl;
 		outfile<<"## initial value of pi = "<<exp(cPar.cHyp_initial.logp)<<endl;
@@ -1824,7 +2816,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 		outfile<<"## random seed = "<<cPar.randseed<<endl;
 		outfile<<"## acceptance ratio = "<<(double)cPar.n_accept/(double)((cPar.w_step+cPar.s_step)*cPar.n_mh)<<endl;
 	}
-	
+
 	outfile<<"##"<<endl;
 	outfile<<"## Computation Time:"<<endl;
 	outfile<<"## total computation time = "<<cPar.time_total<<" min "<<endl;
@@ -1837,7 +2829,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 	}
 	if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) {
 		outfile<<"##      time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl;
-		outfile<<"##      time on calculating UtX = "<<cPar.time_UtX<<" min "<<endl;		
+		outfile<<"##      time on calculating UtX = "<<cPar.time_UtX<<" min "<<endl;
 	}
 	if ((cPar.a_mode>=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) {
 		outfile<<"##      time on optimization = "<<cPar.time_opt<<" min "<<endl;
@@ -1855,7 +2847,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar)
 		outfile<<"##      time on predicting phenotypes = "<<cPar.time_opt<<" min "<<endl;
 	}
 	outfile<<"##"<<endl;
-	
+
 	outfile.close();
 	outfile.clear();
 	return;
diff --git a/src/io.cpp b/src/io.cpp
index 03b8e3f..64eb8e3 100644
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -90,6 +90,21 @@ void ProgressBar (string str, double p, double total, double ratio)
 	return;
 }
 
+
+bool isBlankLine(char const* line)
+{   // true when line is empty or contains nothing but whitespace (as judged by isspace)
+    for ( char const* cp = line; *cp; ++cp )
+    {   // isspace() requires a value representable as unsigned char (or EOF); a plain char can be negative
+        if ( !isspace(static_cast<unsigned char>(*cp)) ) return false;
+    }
+    return true;
+}
+// std::string overload: forwards the buffer to the C-string version
+bool isBlankLine(std::string const& line)
+{
+   return isBlankLine(line.c_str());
+}
+
 // in case files are ended with "\r" or "\r\n"
 std::istream& safeGetline(std::istream& is, std::string& t)
 {
@@ -129,7 +144,10 @@ bool ReadFile_snps (const string &file_snps, set<string> &setSnps)
 {
 	setSnps.clear();
 
-	ifstream infile (file_snps.c_str(), ifstream::in);
+	//ifstream infile (file_snps.c_str(), ifstream::in);
+	//if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;}
+
+	igzstream infile (file_snps.c_str(), igzstream::in);
 	if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;}
 
 	string line;
@@ -147,6 +165,54 @@ bool ReadFile_snps (const string &file_snps, set<string> &setSnps)
 }
 
 
+bool ReadFile_snps_header (const string &file_snps, set<string> &setSnps)
+{
+	//Read a SNP table that starts with a header row (file opened through igzstream)
+	//and store one id per data line in setSnps: the rs column when the header has
+	//one, otherwise "chr:pos". Returns false only when the file cannot be opened.
+	setSnps.clear();
+
+	igzstream infile (file_snps.c_str(), igzstream::in);
+	if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;}
+
+	string line, rs, chr, pos;
+	char *ch_ptr;
+
+	//read header
+	HEADER header;
+	safeGetline(infile, line);
+	ReadHeader (line, header);
+
+	if (header.rs_col==0 && (header.chr_col==0 || header.pos_col==0) ) {
+	  cout<<"missing rs id in the hearder"<<endl;
+	}
+
+	while (!safeGetline(infile, line).eof()) {
+	  if (isBlankLine(line)) {continue;}
+	  ch_ptr=strtok ((char *)line.c_str(), " , \t");
+	  rs.clear(); chr.clear(); pos.clear();	//never carry ids over from the previous line
+	  for (size_t i=0; i<header.coln && ch_ptr!=NULL; i++) {	//strtok yields NULL once a short line runs out of tokens
+	    if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;}
+	    if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;}
+	    if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr;}
+
+	    ch_ptr=strtok (NULL, " , \t");
+	  }
+
+	  if (header.rs_col==0) {
+	    rs=chr+":"+pos;
+	  }
+	  if (rs.empty()) {continue;}	//rs column missing on this line: nothing to record
+	  setSnps.insert(rs);
+	}
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
 //Read log file
 bool ReadFile_log (const string &file_log, double &pheno_mean)
 {
@@ -353,7 +419,7 @@ bool ReadFile_cvt (const string &file_cvt, vector<int> &indicator_cvt, vector<ve
 //Read .bim file
 bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo)
 {
-	snpInfo.clear();
+  snpInfo.clear();
 
 	ifstream infile (file_bim.c_str(), ifstream::in);
 	if (!infile) {cout<<"error opening .bim file: "<<file_bim<<endl; return false;}
@@ -662,7 +728,7 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl
 
 	//start reading snps and doing association test
 	for (size_t t=0; t<ns_total; ++t) {
-		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+	  infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
 
 		if (setSnps.size()!=0 && setSnps.count(snpInfo[t].rs_number)==0) {
 			snpInfo[t].n_miss=-9;
@@ -710,11 +776,10 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl
 
 		if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;}
 
-		if (hwe_level!=1 && maf_level!=-1) {
+		if (hwe_level!=0 && maf_level!=-1) {
 			if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;}
 		}
 
-
 		//filter SNP if it is correlated with W
 		//unless W has only one column, of 1s
 		for (size_t i=0; i<genotype->size; ++i) {
@@ -1054,6 +1119,11 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k
 	gsl_vector *geno=gsl_vector_alloc (ni_total);
 	gsl_vector *geno_miss=gsl_vector_alloc (ni_total);
 
+	//create a large matrix
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (ni_total, msize);
+	gsl_matrix_set_zero(Xlarge);
+
 	size_t ns_test=0;
 	for (size_t t=0; t<indicator_snp.size(); ++t) {
 		!safeGetline(infile, line).eof();
@@ -1090,6 +1160,7 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k
 
 		gsl_vector_add_constant (geno, -1.0*geno_mean);
 
+		/*
 		if (geno_var!=0) {
 		  if (k_mode==1) {
 		    gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);
@@ -1101,8 +1172,23 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k
 		    cout<<"Unknown kinship mode."<<endl;
 		  }
 		}
+		*/
+
+		if (k_mode==2 && geno_var!=0) {gsl_vector_scale (geno, 1.0/sqrt(geno_var));}
+		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_test%msize);
+		gsl_vector_memcpy (&Xlarge_col.vector, geno);
+
 		ns_test++;
-    }
+
+		if (ns_test%msize==0) {
+		  eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+		  gsl_matrix_set_zero(Xlarge);
+		}
+	}
+
+	if (ns_test%msize!=0) {
+	  eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+	}
 	cout<<endl;
 
 	gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test);
@@ -1116,6 +1202,7 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k
 
 	gsl_vector_free (geno);
 	gsl_vector_free (geno_miss);
+	gsl_matrix_free (Xlarge);
 
 	infile.close();
 	infile.clear();
@@ -1146,11 +1233,16 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m
 	size_t ns_test=0;
 	int n_bit;
 
+	//create a large matrix
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (ni_total, msize);
+	gsl_matrix_set_zero(Xlarge);
+
 	//calculate n_bit and c, the number of bit for each snp
 	if (ni_total%4==0) {n_bit=ni_total/4;}
 	else {n_bit=ni_total/4+1; }
 
-	//print the first three majic numbers
+	//print the first three magic numbers
 	for (int i=0; i<3; ++i) {
 		infile.read(ch,1);
 		b=ch[0];
@@ -1196,14 +1288,30 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m
 
 		gsl_vector_add_constant (geno, -1.0*geno_mean);
 
+		/*
 		if (geno_var!=0) {
 			if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);}
 			else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);}
 			else {cout<<"Unknown kinship mode."<<endl;}
 		}
+		*/
+
+		if (k_mode==2 && geno_var!=0) {gsl_vector_scale (geno, 1.0/sqrt(geno_var));}
+		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_test%msize);
+		gsl_vector_memcpy (&Xlarge_col.vector, geno);
 
 		ns_test++;
-    }
+
+		if (ns_test%msize==0) {
+		  eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+		  gsl_matrix_set_zero(Xlarge);
+		}
+	}
+
+	if (ns_test%msize!=0) {
+	  eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+	}
+
 	cout<<endl;
 
 	gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test);
@@ -1216,6 +1324,7 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m
 	}
 
 	gsl_vector_free (geno);
+	gsl_matrix_free (Xlarge);
 
 	infile.close();
 	infile.clear();
@@ -2053,7 +2162,7 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs
 		uint16_t unzipped_data[3*bgen_N];
 
 		if (setSnps.size()!=0 && setSnps.count(rs)==0) {
-			SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, -9};
+		  SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, (long int) -9};
 			snpInfo.push_back(sInfo);
 			indicator_snp.push_back(0);
 			if(CompressedSNPBlocks)
@@ -2394,18 +2503,18 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k
 //read header to determine which column contains which item
 bool ReadHeader (const string &line, HEADER &header)
 {
-  string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID"};
-  set<string> rs_set(rs_ptr, rs_ptr+10);
+  string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID","MarkerName"};
+  set<string> rs_set(rs_ptr, rs_ptr+11);
   string chr_ptr[]={"chr","CHR"};
   set<string> chr_set(chr_ptr, chr_ptr+2);
   string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"};
   set<string> pos_set(pos_ptr, pos_ptr+8);
   string cm_ptr[]={"cm","CM"};
   set<string> cm_set(cm_ptr, cm_ptr+2);
-  string a1_ptr[]={"a1","A1","allele1","ALLELE1"};
-  set<string> a1_set(a1_ptr, a1_ptr+4);
-  string a0_ptr[]={"a0","A0","allele0","ALLELE0"};
-  set<string> a0_set(a0_ptr, a0_ptr+4);
+  string a1_ptr[]={"a1","A1","allele1","ALLELE1","Allele1","INC_ALLELE"};
+  set<string> a1_set(a1_ptr, a1_ptr+5);
+  string a0_ptr[]={"a0","A0","allele0","ALLELE0","Allele0","a2","A2","allele2","ALLELE2","Allele2","DEC_ALLELE"};
+  set<string> a0_set(a0_ptr, a0_ptr+10);
 
   string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"};
   set<string> z_set(z_ptr, z_ptr+6);
@@ -2424,9 +2533,13 @@ bool ReadHeader (const string &line, HEADER &header)
   set<string> nmis_set(nmis_ptr, nmis_ptr+6);
   string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"};
   set<string> nobs_set(nobs_ptr, nobs_ptr+4);
+  string ncase_ptr[]={"ncase","NCASE","n_case","N_CASE"};
+  set<string> ncase_set(ncase_ptr, ncase_ptr+4);
+  string ncontrol_ptr[]={"ncontrol","NCONTROL","n_control","N_CONTROL"};
+  set<string> ncontrol_set(ncontrol_ptr, ncontrol_ptr+4);
 
-  string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"};
-  set<string> af_set(af_ptr, af_ptr+10);
+  string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY","Freq.Allele1.HapMapCEU","FreqAllele1HapMapCEU", "Freq1.Hapmap"};
+  set<string> af_set(af_ptr, af_ptr+13);
   string var_ptr[]={"var","VAR"};
   set<string> var_set(var_ptr, var_ptr+2);
 
@@ -2435,7 +2548,7 @@ bool ReadHeader (const string &line, HEADER &header)
   string cor_ptr[]={"cor","COR","r","R"};
   set<string> cor_set(cor_ptr, cor_ptr+4);
 
-  header.rs_col=0; header.chr_col=0; header.pos_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0;
+  header.rs_col=0; header.chr_col=0; header.pos_col=0;  header.cm_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.ncase_col=0; header.ncontrol_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0;
 
   char *ch_ptr;
   string type;
@@ -2472,6 +2585,10 @@ bool ReadHeader (const string &line, HEADER &header)
       if (header.nmis_col==0) {header.nmis_col=header.coln+1;} else {cout<<"error! more than two n_mis columns in the file."<<endl; n_error++;}
     } else if (nobs_set.count(type)!=0) {
       if (header.nobs_col==0) {header.nobs_col=header.coln+1;} else {cout<<"error! more than two n_obs columns in the file."<<endl; n_error++;}
+    } else if (ncase_set.count(type)!=0) {
+      if (header.ncase_col==0) {header.ncase_col=header.coln+1;} else {cout<<"error! more than two n_case columns in the file."<<endl; n_error++;}
+    } else if (ncontrol_set.count(type)!=0) {
+      if (header.ncontrol_col==0) {header.ncontrol_col=header.coln+1;} else {cout<<"error! more than two n_control columns in the file."<<endl; n_error++;}
     } else if (ws_set.count(type)!=0) {
       if (header.ws_col==0) {header.ws_col=header.coln+1;} else {cout<<"error! more than two window_size columns in the file."<<endl; n_error++;}
     } else if (af_set.count(type)!=0) {
@@ -2576,8 +2693,31 @@ bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_
 
 
 
+bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, size_t &n_vc)
+{
+  //Read a list file holding one category file name per line, load each with ReadFile_cat and merge the results (keys already present are kept).
+  //n_vc becomes the largest n_vc value returned for any listed file; returns false if the list or any listed file cannot be read.
+  mapRS2cat.clear();
+  igzstream infile (file_mcat.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open mcategory file: "<<file_mcat<<endl; return false;}
+  string file_name;
+  map<string, size_t> mapRS2cat_tmp;
+  size_t n_vc_tmp=0, t=0;
+  while (!safeGetline(infile, file_name).eof()) {
+    if (isBlankLine(file_name)) {continue;}	//an empty line is not a file name
+    mapRS2cat_tmp.clear();
+    if (!ReadFile_cat (file_name, mapRS2cat_tmp, n_vc_tmp)) {infile.close(); return false;}	//do not merge a file that failed to load
+    mapRS2cat.insert(mapRS2cat_tmp.begin(), mapRS2cat_tmp.end());
+    if (t==0) {n_vc=n_vc_tmp;} else {n_vc=max(n_vc, n_vc_tmp);}
+    t++;
+  }
+  infile.close(); infile.clear();
+  return true;
+}
+
+
 //read bimbam mean genotype file and calculate kinship matrix; this time, the kinship matrix is not centered, and can contain multiple K matrix
-bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin)
+bool BimbamKin (const string &file_geno, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns)
 {
 	igzstream infile (file_geno.c_str(), igzstream::in);
 	//ifstream infile (file_geno.c_str(), ifstream::in);
@@ -2593,6 +2733,17 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int>
 	gsl_vector *geno=gsl_vector_alloc (ni_test);
 	gsl_vector *geno_miss=gsl_vector_alloc (ni_test);
 
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+
 	size_t n_vc=matrix_kin->size2/ni_test, i_vc;
 	string rs;
 	vector<size_t> ns_vec;
@@ -2600,6 +2751,11 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int>
 	  ns_vec.push_back(0);
 	}
 
+	//create a large matrix
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (ni_test, msize*n_vc);
+	gsl_matrix_set_zero(Xlarge);
+
 	size_t ns_test=0;
 	for (size_t t=0; t<indicator_snp.size(); ++t) {
 		!safeGetline(infile, line).eof();
@@ -2640,49 +2796,85 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int>
 			if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);}
 		}
 
-		//this line is new; removed
-		//gsl_vector_add_constant (geno, -1.0*geno_mean);
+		gsl_vector_add_constant (geno, -1.0*geno_mean);
 
-		if (geno_var!=0) {
-		  mapRS2var[rs]=geno_var;
+		gsl_blas_dgemv (CblasTrans, 1.0, W, geno, 0.0, Wtx);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
+		gsl_blas_dgemv (CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno);
+		gsl_blas_ddot (geno, geno, &geno_var);
+		geno_var/=(double)ni_test;
 
-		  if (k_mode==1) {
-		    if (n_vc==1 || mapRS2cat.size()==0 ) {
-		      gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);
-                ns_vec[0]++;
-		    } else if (mapRS2cat.count(rs)!=0) {
+		if (geno_var!=0 && (mapRS2weight.size()==0 || mapRS2weight.count(rs)!=0) ) {
+		  if (mapRS2weight.size()==0) {
+		    d=1.0/geno_var;
+		  } else {
+		    d=mapRS2weight.at(rs)/geno_var;
+		  }
+
+		  /*
+		  if (n_vc==1 || mapRS2cat.size()==0 ) {
+		    gsl_blas_dsyr (CblasUpper, d, geno, matrix_kin);
+		    ns_vec[0]++;
+		  } else if (mapRS2cat.count(rs)!=0) {
 		      i_vc=mapRS2cat.at(rs);
 		      ns_vec[i_vc]++;
 		      gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test);
-		      gsl_blas_dsyr (CblasUpper, 1.0, geno, &kin_sub.matrix);
+		      gsl_blas_dsyr (CblasUpper, d, geno, &kin_sub.matrix);
+		      //eigenlib_dsyr (1.0, geno, matrix_kin);
+		  }
+		  */
+
+		  gsl_vector_scale (geno, sqrt(d));
+		  if (n_vc==1 || mapRS2cat.size()==0 ) {
+		    gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_vec[0]%msize);
+		    gsl_vector_memcpy (&Xlarge_col.vector, geno);
+		    ns_vec[0]++;
+
+		    if (ns_vec[0]%msize==0) {
+		      eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+		      gsl_matrix_set_zero(Xlarge);
 		    }
+		  } else if (mapRS2cat.count(rs)!=0) {
+		    i_vc=mapRS2cat.at(rs);
 
-		    //eigenlib_dsyr (1.0, geno, matrix_kin);
-		  } else if (k_mode==2) {
-		    if (n_vc==1 || mapRS2cat.size()==0 ) {
-		      gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);
-                ns_vec[0]++;
-		    } else if (mapRS2cat.count(rs)!=0) {
-		      i_vc=mapRS2cat.at(rs);
-		      ns_vec[i_vc]++;
+		    gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, msize*i_vc+ns_vec[i_vc]%msize);
+		    gsl_vector_memcpy (&Xlarge_col.vector, geno);
+
+		    ns_vec[i_vc]++;
+
+		    if (ns_vec[i_vc]%msize==0) {
+		      gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize);
 		      gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test);
-		      gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, &kin_sub.matrix);
+		      eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix);
+
+		      gsl_matrix_set_zero(&X_sub.matrix);
 		    }
-		  } else {
-		    cout<<"Unknown kinship mode."<<endl;
 		  }
+
 		}
 		ns_test++;
-    }
+
+	}
+
+	for (size_t i_vc=0; i_vc<n_vc; i_vc++) {
+	  if (ns_vec[i_vc]%msize!=0) {
+	    gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize);
+	    gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test);
+	    eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix);
+	  }
+	}
+
 	cout<<endl;
 
 	for (size_t t=0; t<n_vc; t++) {
-	  if (ns_vec[t]!=0) {gsl_matrix_scale (matrix_kin, 1.0/(double)ns_vec[t]);}
+	  gsl_vector_set(vector_ns, t, ns_vec[t]);
 
 	  for (size_t i=0; i<ni_test; ++i) {
-	    for (size_t j=0; j<i; ++j) {
+	    for (size_t j=0; j<=i; ++j) {
 	      d=gsl_matrix_get (matrix_kin, j, i+ni_test*t);
+	      d/=(double)ns_vec[t];
 	      gsl_matrix_set (matrix_kin, i, j+ni_test*t, d);
+	      gsl_matrix_set (matrix_kin, j, i+ni_test*t, d);
 	    }
 	  }
 	}
@@ -2690,6 +2882,14 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int>
 	gsl_vector_free (geno);
 	gsl_vector_free (geno_miss);
 
+	gsl_vector_free (Wtx);
+	gsl_matrix_free (WtW);
+	gsl_matrix_free (WtWi);
+	gsl_vector_free (WtWiWtx);
+	gsl_permutation_free (pmt);
+
+	gsl_matrix_free (Xlarge);
+
 	infile.close();
 	infile.clear();
 
@@ -2702,7 +2902,7 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int>
 
 
 
-bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin)
+bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns)
 {
 	ifstream infile (file_bed.c_str(), ios::binary);
 	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;}
@@ -2717,6 +2917,17 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &
 	size_t ni_total=indicator_idv.size();
 	gsl_vector *geno=gsl_vector_alloc (ni_test);
 
+	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
+	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);
+	gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2);
+	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
+
+	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
+	int sig;
+	LUDecomp (WtW, pmt, &sig);
+	LUInvert (WtW, pmt, WtWi);
+
 	size_t ns_test=0;
 	int n_bit;
 
@@ -2727,6 +2938,11 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &
 	  ns_vec.push_back(0);
 	}
 
+	//create a large matrix
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (ni_test, msize*n_vc);
+	gsl_matrix_set_zero(Xlarge);
+
 	//calculate n_bit and c, the number of bit for each snp
 	if (ni_total%4==0) {n_bit=ni_total/4;}
 	else {n_bit=ni_total/4+1; }
@@ -2780,65 +2996,97 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &
 			if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);}
 		}
 
-		//this line is new; removed
-		//gsl_vector_add_constant (geno, -1.0*geno_mean);
+		gsl_vector_add_constant (geno, -1.0*geno_mean);
+
+		gsl_blas_dgemv (CblasTrans, 1.0, W, geno, 0.0, Wtx);
+		gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
+		gsl_blas_dgemv (CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno);
+		gsl_blas_ddot (geno, geno, &geno_var);
+		geno_var/=(double)ni_test;
+
+		if (geno_var!=0 && (mapRS2weight.size()==0 || mapRS2weight.count(rs)!=0) ) {
+		  if (mapRS2weight.size()==0) {
+		    d=1.0/geno_var;
+		  } else {
+		    d=mapRS2weight.at(rs)/geno_var;
+		  }
+
+		  /*
+		  if (n_vc==1 || mapRS2cat.size()==0 ) {
+		    gsl_blas_dsyr (CblasUpper, d, geno, matrix_kin);
+		    ns_vec[0]++;
+		  } else if (mapRS2cat.count(rs)!=0) {
+		    i_vc=mapRS2cat.at(rs);
+		    ns_vec[i_vc]++;
+		    gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test);
+		    gsl_blas_dsyr (CblasUpper, d, geno, &kin_sub.matrix);
+		  }
+		  */
+
+		  gsl_vector_scale (geno, sqrt(d));
+		  if (n_vc==1 || mapRS2cat.size()==0 ) {
+		    gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_vec[0]%msize);
+		    gsl_vector_memcpy (&Xlarge_col.vector, geno);
+		    ns_vec[0]++;
+
+		    if (ns_vec[0]%msize==0) {
+		      eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+		      gsl_matrix_set_zero(Xlarge);
+		    }
+		  } else if (mapRS2cat.count(rs)!=0) {
+		    i_vc=mapRS2cat.at(rs);
+
+		    gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, msize*i_vc+ns_vec[i_vc]%msize);
+		    gsl_vector_memcpy (&Xlarge_col.vector, geno);
+
+		    ns_vec[i_vc]++;
+
+		    if (ns_vec[i_vc]%msize==0) {
+		      gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize);
+		      gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test);
+		      eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix);
+
+		      gsl_matrix_set_zero(&X_sub.matrix);
+		    }
+		  }
 
-		if (geno_var!=0) {
-		  mapRS2var[rs]=geno_var;
-			if (k_mode==1) {
-			  if (n_vc==1 || mapRS2cat.size()==0 ) {
-			    gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);
-			    ns_vec[0]++;
-			  } else if (mapRS2cat.count(rs)!=0) {
-			    i_vc=mapRS2cat.at(rs);
-			    ns_vec[i_vc]++;
-			    gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test);
-			    gsl_blas_dsyr (CblasUpper, 1.0, geno, &kin_sub.matrix);
-			  }
-			} else if (k_mode==2) {
-			  if (n_vc==1 || mapRS2cat.size()==0 ) {
-			    gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);
-			    ns_vec[0]++;
-			  } else if (mapRS2cat.count(rs)!=0) {
-			    i_vc=mapRS2cat.at(rs);
-			    ns_vec[i_vc]++;
-			    gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test);
-			    gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, &kin_sub.matrix);
-			  }
-			} else {
-			  cout<<"Unknown kinship mode."<<endl;
-			}
-		}
 
+		}
 		ns_test++;
-    }
+	}
+
+	for (size_t i_vc=0; i_vc<n_vc; i_vc++) {
+	  if (ns_vec[i_vc]%msize!=0) {
+	    gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize);
+	    gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test);
+	    eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix);
+	  }
+	}
+
 	cout<<endl;
 
 	for (size_t t=0; t<n_vc; t++) {
-	  if (ns_vec[t]!=0) {gsl_matrix_scale (matrix_kin, 1.0/(double)ns_vec[t]);}
+	  gsl_vector_set(vector_ns, t, ns_vec[t]);
 
 	  for (size_t i=0; i<ni_test; ++i) {
-	    for (size_t j=0; j<i; ++j) {
+	    for (size_t j=0; j<=i; ++j) {
 	      d=gsl_matrix_get (matrix_kin, j, i+ni_test*t);
+	      d/=(double)ns_vec[t];
 	      gsl_matrix_set (matrix_kin, i, j+ni_test*t, d);
-	      //cout<<d<<" ";
+	      gsl_matrix_set (matrix_kin, j, i+ni_test*t, d);
 	    }
-	    //cout<<endl;
-	  }
-	}
-
-	d=0;
-	for (size_t i=0; i<ni_test; ++i) {
-	  for (size_t j=0; j<ni_test; ++j) {
-	    d+=gsl_matrix_get (matrix_kin, i, j)*gsl_matrix_get (matrix_kin, i, j);
 	  }
 	}
-	d/=(double)ni_test*(double)ni_test;
-	//cout<<"trace = "<<scientific<<d-1/(double)ni_test<<endl;
 
+	gsl_vector_free (geno);
 
+	gsl_vector_free (Wtx);
+	gsl_matrix_free (WtW);
+	gsl_matrix_free (WtWi);
+	gsl_vector_free (WtWiWtx);
+	gsl_permutation_free (pmt);
 
-	gsl_vector_free (geno);
+	gsl_matrix_free (Xlarge);
 
 	infile.close();
 	infile.clear();
@@ -2848,34 +3096,176 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &
 
 
 
-//read var file, store mapRS2var
-bool ReadFile_var (const string &file_var, map<string, double> &mapRS2var)
+//Build the kinship matrix (one ni_test x ni_test block per variance component, laid side by side in matrix_kin) from every genotype file listed in file_mfile.
+//Each line of file_mfile names one data set: with mfile_mode==1 ".bed" is appended and PlinkKin is called, otherwise BimbamKin is called on the name as given.
+//The l'th listed file uses mindicator_snp[l] and msnpInfo[l]. The per-file kinships are pooled as an average weighted by the per-file counts returned in ns_tmp; the summed counts are returned in vector_ns.
+//Returns false if the list file cannot be opened, if it lists more files than mindicator_snp/msnpInfo hold, or if a per-file computation fails.
+bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int display_pace, const vector<int> &indicator_idv, const vector<vector<int> > &mindicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<vector<SNPINFO> > &msnpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns)
 {
-  mapRS2var.clear();
+  size_t n_vc=vector_ns->size, ni_test=matrix_kin->size1;
+  gsl_matrix_set_zero(matrix_kin);
+  gsl_vector_set_zero(vector_ns);
+
+  igzstream infile (file_mfile.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false;}
 
-  igzstream infile (file_var.c_str(), igzstream::in);
-  if (!infile) {cout<<"error! fail to open var file: "<<file_var<<endl; return false;}
+  string file_name;
+
+  gsl_matrix *kin_tmp=gsl_matrix_alloc (matrix_kin->size1, matrix_kin->size2);
+  gsl_vector *ns_tmp=gsl_vector_alloc (vector_ns->size);
+
+  size_t l=0;
+  double d, ns_t;
+  bool flag_ok=true;
+  while (!safeGetline(infile, file_name).eof()) {
+    //every listed file needs its own entry in mindicator_snp and msnpInfo
+    if (l>=mindicator_snp.size() || l>=msnpInfo.size()) {
+      cout<<"error! mfile lists more files than were loaded: "<<file_mfile<<endl;
+      flag_ok=false;
+      break;
+    }
+
+    gsl_matrix_set_zero(kin_tmp);
+    gsl_vector_set_zero(ns_tmp);
+
+    if (mfile_mode==1) {
+      file_name+=".bed";
+      flag_ok=PlinkKin (file_name, display_pace, indicator_idv, mindicator_snp[l], mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp);
+    } else {
+      flag_ok=BimbamKin (file_name, display_pace, indicator_idv, mindicator_snp[l], mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp);
+    }
+    //do not pool the result of a file that could not be processed
+    if (!flag_ok) {break;}
+
+    //add ns
+    gsl_vector_add(vector_ns, ns_tmp);
+
+    //add kin; each per-file block is multiplied by its count so that the final division yields a weighted average
+    for (size_t t=0; t<n_vc; t++) {
+      //a component with a zero count in this file contributes nothing
+      ns_t=gsl_vector_get(ns_tmp, t);
+      if (ns_t==0) {continue;}
+
+      for (size_t i=0; i<ni_test; ++i) {
+	for (size_t j=0; j<=i; ++j) {
+	  d=gsl_matrix_get (matrix_kin, j, i+ni_test*t)+gsl_matrix_get (kin_tmp, j, i+ni_test*t)*ns_t;
+
+	  gsl_matrix_set (matrix_kin, i, j+ni_test*t, d);
+	  gsl_matrix_set (matrix_kin, j, i+ni_test*t, d);
+	}
+      }
+    }
+    l++;
+  }
+
+  //renormalize kin
+  if (flag_ok) {
+    for (size_t t=0; t<n_vc; t++) {
+      //leave an empty component at zero instead of dividing by zero
+      ns_t=gsl_vector_get(vector_ns, t);
+      if (ns_t==0) {continue;}
+
+      for (size_t i=0; i<ni_test; ++i) {
+	for (size_t j=0; j<=i; ++j) {
+	  d=gsl_matrix_get (matrix_kin, j, i+ni_test*t)/ns_t;
+
+	  gsl_matrix_set (matrix_kin, i, j+ni_test*t, d);
+	  gsl_matrix_set (matrix_kin, j, i+ni_test*t, d);
+	}
+      }
+    }
+  }
+  cout<<endl;
+
+  infile.close();
+  infile.clear();
+
+  gsl_matrix_free(kin_tmp);
+  gsl_vector_free(ns_tmp);
+
+  return flag_ok;
+}
+
+
+
+
+//read snp weight file, store mapRS2weight
+//each non-blank line holds a snp id followed by its weight, separated by space, comma or tab
+//returns false if the file cannot be opened or a line has a snp id but no weight
+bool ReadFile_wsnp (const string &file_wsnp, map<string, double> &mapRS2weight)
+{
+  mapRS2weight.clear();
+
+  igzstream infile (file_wsnp.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open snp weight file: "<<file_wsnp<<endl; return false;}
 
   char *ch_ptr;
   string line, rs;
-  double var;
+  double weight;
 
   while (!safeGetline(infile, line).eof()) {
     ch_ptr=strtok ((char *)line.c_str(), " , \t");
+    //strtok returns NULL when the line holds no token (e.g. a blank line); skip it
+    if (ch_ptr==NULL) {continue;}
     rs=ch_ptr;
     ch_ptr=strtok (NULL, " , \t");
-    var=atof(ch_ptr);
-    mapRS2var[rs]=var;
+    //a snp id without a weight cannot be parsed; report it instead of passing NULL to atof
+    if (ch_ptr==NULL) {cout<<"error! missing weight for snp "<<rs<<" in snp weight file: "<<file_wsnp<<endl; return false;}
+    weight=atof(ch_ptr);
+    mapRS2weight[rs]=weight;
   }
 
   return true;
 }
 
+//read a snp weight file that starts with a header line; store, for every snp, a vector of n_vc weights in mapRS2wvector
+//columns that the header identifies as rs/chr/pos/cm/a1/a0 are annotation; every other column is read as a weight, and each row must supply exactly n_vc of them
+//when the header has no rs column the snp is keyed by "chr:pos"
+//returns false if the file cannot be opened or a row has the wrong number of columns
+bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vector<double> > &mapRS2wvector)
+{
+  mapRS2wvector.clear();
+
+  igzstream infile (file_wcat.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open snp weight file: "<<file_wcat<<endl; return false;}
+
+  char *ch_ptr;
+  vector<double> weight(n_vc, 0.0);
+
+  string line, rs, chr, a1, a0, pos, cm;
+
+  //read header
+  HEADER header;
+  safeGetline(infile, line);
+  ReadHeader (line, header);
+
+  while (!safeGetline(infile, line).eof()) {
+    if (isBlankLine(line)) {continue;}
+    ch_ptr=strtok ((char *)line.c_str(), " , \t");
+
+    //t counts the weight columns consumed on this row
+    size_t t=0;
+    for (size_t i=0; i<header.coln; i++) {
+      //the row ran out of fields before the header did
+      if (ch_ptr==NULL) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;}
+
+      if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;}
+      else if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr; }
+      else if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr; }
+      else if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr; }
+      else if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr; }
+      else if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr; }
+      else {
+	//check before storing, so that an extra column can never write past the end of weight
+	if (t>=n_vc) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;}
+	weight[t]=atof(ch_ptr); t++;
+      }
+
+      ch_ptr=strtok (NULL, " , \t");
+    }
+
+    if (t!=n_vc) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;}
+
+    if (header.rs_col==0) {
+      rs=chr+":"+pos;
+    }
+
+    mapRS2wvector[rs]=weight;
+  }
+
+  return true;
+}
+
+
+
+
+
+
+
 
-//read beta file, use the mapRS2var to select snps (and to provide var if maf/var is not provided in the beta file), calculate q
-void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test)
+//read the beta file, save snp z scores into z2_score, and save category into indicator_snp based on mapRS2var and set; indicator_snp records the category number (from 1 to n_vc); provide var if maf/var is not provided in the beta file
+//notice that indicator_snp contains ns_test snps, instead of ns_total snps
+//read the beta file for the second time, compute q, and Vq based on block jackknife
+//use the mapRS2var to select snps, calculate q
+//do a block-wise jackknife, and compute Vq
+void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2wA, vector<size_t> &vec_cat, vector<size_t> &vec_ni, vector<double> &vec_weight, vector<double> &vec_z2, size_t &ni_total, size_t &ns_total, size_t &ns_test)
 {
-  gsl_vector_set_zero(q);
+  vec_cat.clear(); vec_ni.clear(); vec_weight.clear(); vec_z2.clear();
   ni_total=0; ns_total=0; ns_test=0;
 
   igzstream infile (file_beta.c_str(), igzstream::in);
@@ -2887,13 +3277,7 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string,
 
   string rs, chr, a1, a0, pos, cm;
   double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, zsquare=0, af=0, var_x=0;
-  size_t n_total=0, n_mis=0, n_obs=0;
-
-  vector<double> vec_q, vec_s;
-  for (size_t i=0; i<q->size; i++) {
-    vec_q.push_back(0.0);
-    vec_s.push_back(0.0);
-  }
+  size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0;
 
   //read header
   HEADER header;
@@ -2901,7 +3285,7 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string,
   ReadHeader (line, header);
 
   if (header.n_col==0 ) {
-    if (header.nobs_col==0 && header.nmis_col==0) {
+    if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) {
       cout<<"error! missing sample size in the beta file."<<endl;
     } else {
       cout<<"total sample size will be replaced by obs/mis sample size."<<endl;
@@ -2911,16 +3295,17 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string,
   if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0) && header.chisq_col==0 && header.p_col==0) {
     cout<<"error! missing z scores in the beta file."<<endl;
   }
-
-  if (header.af_col==0 && header.var_col==0 && mapRS2var.size()==0) {
+  /*
+  if (header.af_col==0 && header.var_col==0) {
     cout<<"error! missing allele frequency in the beta file."<<endl;
   }
-
+  */
   while (!safeGetline(infile, line).eof()) {
+    if (isBlankLine(line)) {continue;}
     ch_ptr=strtok ((char *)line.c_str(), " , \t");
 
     z=0; beta=0; se_beta=0; chisq=0; pvalue=0;
-    n_total=0; n_mis=0; n_obs=0; af=0; var_x=0;
+    n_total=0; n_mis=0; n_obs=0; n_case=0; n_control=0; af=0; var_x=0;
     for (size_t i=0; i<header.coln; i++) {
       if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;}
       if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;}
@@ -2938,6 +3323,8 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string,
       if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);}
       if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);}
       if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);}
+      if (header.ncase_col!=0 && header.ncase_col==i+1) {n_case=atoi(ch_ptr);}
+      if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) {n_control=atoi(ch_ptr);}
 
       if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);}
       if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);}
@@ -2950,7 +3337,11 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string,
     }
 
     if (header.n_col==0) {
-      n_total=n_mis+n_obs;
+      if (header.nmis_col!=0 && header.nobs_col!=0) {
+	n_total=n_mis+n_obs;
+      } else {
+	n_total=n_case+n_control;
+      }
     }
 
     //both z values and beta/se_beta have directions, while chisq/pvalue do not
@@ -2965,29 +3356,25 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string,
       zsquare=gsl_cdf_chisq_Qinv (pvalue, 1);
     } else {zsquare=0;}
 
+    //obtain var_x
+    if (header.var_col==0 && header.af_col!=0) {
+      var_x=2.0*af*(1.0-af);
+    }
+
     //if the snp is also present in cor file, then do calculations
-    if (mapRS2var.count(rs)!=0 && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) {
-      //obtain var_x
-      if (k_mode==1) {
-	if (header.var_col==0) {
-	  if (header.af_col!=0) {
-	    var_x=2.0*af*(1.0-af);
-	  } else {
-	    var_x=mapRS2var.at(rs);
-	  }
-	}
+    if ( (mapRS2wA.size()==0 || mapRS2wA.count(rs)!=0) && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) && zsquare!=0) {
+      if (mapRS2cat.size()!=0) {
+	vec_cat.push_back(mapRS2cat.at(rs));
       } else {
-	var_x=1.0;
+	vec_cat.push_back(0);
       }
-
-      //compute q
-      if (mapRS2cat.size()!=0) {
-	vec_q[mapRS2cat.at(rs) ]+=(zsquare-1.0)*var_x/(double)n_total;
-	vec_s[mapRS2cat.at(rs) ]+=var_x;
+      vec_ni.push_back(n_total);
+      if (mapRS2wA.size()==0) {
+	vec_weight.push_back(1);
       } else {
-	vec_q[0]+=(zsquare-1.0)*var_x/(double)n_total;
-	vec_s[0]+=var_x;
+	vec_weight.push_back(mapRS2wA.at(rs));
       }
+      vec_z2.push_back(zsquare);
 
       ni_total=max(ni_total, n_total);
       ns_test++;
@@ -2996,14 +3383,6 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string,
     ns_total++;
   }
 
-  //save q
-  for (size_t i=0; i<q->size; i++) {
-    if (vec_s[i]!=0) {
-      gsl_vector_set(q, i, vec_q[i]/vec_s[i]);
-    }
-    gsl_vector_set(s, i, vec_s[i]);
-  }
-
   infile.clear();
   infile.close();
 
@@ -3013,34 +3392,108 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string,
 
 
 
-//read S file: S and Svar
-void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar)
+
+
+void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA, map<string, string> &mapRS2A1, map<string, double> &mapRS2z)
 {
-  igzstream infile (file_s.c_str(), igzstream::in);
-  if (!infile) {cout<<"error! fail to open s file: "<<file_s<<endl; return;}
+  mapRS2A1.clear(); mapRS2z.clear();
+
+  igzstream infile (file_beta.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open beta file: "<<file_beta<<endl; return;}
 
   string line;
   char *ch_ptr;
-  double d;
+  string type;
 
-  for (size_t i=0; i<S->size1; i++) {
-    !safeGetline(infile, line).eof();
-    ch_ptr=strtok ((char *)line.c_str(), " , \t");
-    for (size_t j=0; j<S->size2; j++) {
-      d=gsl_matrix_get(S, i, j)+atof(ch_ptr);
-      gsl_matrix_set(S, i, j, d);
-      ch_ptr=strtok (NULL, " , \t");
+  string rs, chr, a1, a0, pos, cm;
+  double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, af=0, var_x=0;
+  size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0;
+  size_t ni_total=0, ns_total=0, ns_test=0;
+
+  //read header
+  HEADER header;
+  !safeGetline(infile, line).eof();
+  ReadHeader (line, header);
+
+  if (header.n_col==0 ) {
+    if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) {
+      cout<<"error! missing sample size in the beta file."<<endl;
+    } else {
+      cout<<"total sample size will be replaced by obs/mis sample size."<<endl;
     }
   }
 
-  for (size_t i=0; i<Svar->size1; i++) {
-    !safeGetline(infile, line).eof();
+  if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0)) {
+    cout<<"error! missing z scores in the beta file."<<endl;
+  }
+  /*
+  if (header.af_col==0 && header.var_col==0) {
+    cout<<"error! missing allele frequency in the beta file."<<endl;
+  }
+  */
+  while (!safeGetline(infile, line).eof()) {
+    if (isBlankLine(line)) {continue;}
     ch_ptr=strtok ((char *)line.c_str(), " , \t");
-    for (size_t j=0; j<Svar->size2; j++) {
-      d=gsl_matrix_get(Svar, i, j)+atof(ch_ptr);
-      gsl_matrix_set(Svar, i, j, d);
+
+    z=0; beta=0; se_beta=0; chisq=0; pvalue=0;
+    n_total=0; n_mis=0; n_obs=0; n_case=0; n_control=0; af=0; var_x=0;
+    for (size_t i=0; i<header.coln; i++) {
+      if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;}
+      if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;}
+      if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr;}
+      if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr;}
+      if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;}
+      if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;}
+
+      if (header.z_col!=0 && header.z_col==i+1) {z=atof(ch_ptr);}
+      if (header.beta_col!=0 && header.beta_col==i+1) {beta=atof(ch_ptr);}
+      if (header.sebeta_col!=0 && header.sebeta_col==i+1) {se_beta=atof(ch_ptr);}
+      if (header.chisq_col!=0 && header.chisq_col==i+1) {chisq=atof(ch_ptr);}
+      if (header.p_col!=0 && header.p_col==i+1) {pvalue=atof(ch_ptr);}
+
+      if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);}
+      if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);}
+      if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);}
+      if (header.ncase_col!=0 && header.ncase_col==i+1) {n_case=atoi(ch_ptr);}
+      if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) {n_control=atoi(ch_ptr);}
+
+      if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);}
+      if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);}
+
       ch_ptr=strtok (NULL, " , \t");
     }
+
+    if (header.rs_col==0) {
+      rs=chr+":"+pos;
+    }
+
+    if (header.n_col==0) {
+      if (header.nmis_col!=0 && header.nobs_col!=0) {
+	n_total=n_mis+n_obs;
+      } else {
+	n_total=n_case+n_control;
+      }
+    }
+
+    //both z values and beta/se_beta have directions, while chisq/pvalue do not
+    if (header.z_col!=0) {
+      z=z;
+    } else if (header.beta_col!=0 && header.sebeta_col!=0) {
+      z=beta/se_beta;
+    } else {
+      z=0;
+    }
+
+    //if the snp is also present in cor file, then do calculations
+    if ( (mapRS2wA.size()==0 || mapRS2wA.count(rs)!=0) ) {
+      mapRS2z[rs]=z;
+      mapRS2A1[rs]=a1;
+
+      ni_total=max(ni_total, n_total);
+      ns_test++;
+    }
+
+    ns_total++;
   }
 
   infile.clear();
@@ -3052,22 +3505,135 @@ void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar)
 
 
 
-void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar)
+//compute q, s and Vq from per-snp summary statistics
+//vec_cat, vec_ni, vec_weight and vec_z2 are parallel vectors (one entry per snp): category index, sample size, weight and squared z score
+//on return, for each category i: s[i] is the summed weight, and q[i] is sum((z2-1)*w/n_total) divided by s[i] (left at zero when s[i] is zero)
+//Vq is filled from a leave-one-block-out procedure that uses n_block blocks per category
+//NOTE(review): no guard against n_block==0 or n_total==0 is present here; presumably the caller guarantees both are positive
+void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<size_t> &vec_ni, const vector<double> &vec_weight, const vector<double> &vec_z2, gsl_matrix *Vq, gsl_vector *q, gsl_vector *s)
 {
-  gsl_matrix_set_zero(S);
-  gsl_matrix_set_zero(Svar);
+  gsl_matrix_set_zero (Vq);
+  gsl_vector_set_zero (q);
+  gsl_vector_set_zero (s);
 
-  string file_name;
+  size_t cat, n_total;
+  double w, zsquare;
 
-  igzstream infile (file_ms.c_str(), igzstream::in);
-  if (!infile) {cout<<"error! fail to open ms file: "<<file_ms<<endl; return;}
+  //per-category accumulators, one slot per element of q
+  vector<double> vec_q, vec_s, n_snps;
+  for (size_t i=0; i<q->size; i++) {
+    vec_q.push_back(0.0);
+    vec_s.push_back(0.0);
+    n_snps.push_back(0.0);
+  }
 
-  while (!safeGetline(infile, file_name).eof()) {
-    ReadFile_s(file_name, S, Svar);
+  //per-block, per-category accumulators (n_block rows of q->size zeros)
+  vector<vector<double> > mat_q, mat_s;
+  for (size_t i=0; i<n_block; i++) {
+    mat_q.push_back(vec_q);
+    mat_s.push_back(vec_s);
   }
 
-  infile.clear();
-  infile.close();
+  //compute q and s
+  for (size_t i=0; i<vec_cat.size(); i++) {
+    //extract quantities
+    cat=vec_cat[i];
+    n_total=vec_ni[i];
+    w=vec_weight[i];
+    zsquare=vec_z2[i];
+
+    //compute q and s
+    vec_q[cat]+=(zsquare-1.0)*w/(double)n_total;
+    vec_s[cat]+=w;
+    n_snps[cat]++;
+  }
+
+  //update q; vec_q is used again for computing Vq below
+  for (size_t i=0; i<q->size; i++) {
+    if (vec_s[i]!=0) {
+      gsl_vector_set(q, i, vec_q[i]/vec_s[i]);
+    }
+    gsl_vector_set(s, i, vec_s[i]);
+  }
+
+  //compute Vq; divide SNPs in each category into evenly distributed blocks
+  //one pass per category l: the block boundaries are defined by the snps of category l only
+  size_t t=0, b=0, n_snp=0;
+  double d, m, n;
+  for (size_t l=0; l<q->size; l++) {
+    //n_snp is the number of category-l snps per block; a category with fewer than n_block snps is skipped
+    n_snp=floor(n_snps[l]/n_block); t=0; b=0;
+    if (n_snp==0) {continue;}
+
+    //initiate everything to zero
+    for (size_t i=0; i<n_block; i++) {
+      for (size_t j=0; j<q->size; j++) {
+	mat_q[i][j]=0;
+	mat_s[i][j]=0;
+      }
+    }
+
+    //record values
+    for (size_t i=0; i<vec_cat.size(); i++) {
+      //extract quantities
+      cat=vec_cat[i];
+      n_total=vec_ni[i];
+      w=vec_weight[i];
+      zsquare=vec_z2[i];
+
+      //save quantities for computing Vq (which is not divided by n_total)
+      //snps of every category are added to the current block b
+      //NOTE(review): vec_q above is scaled by 1/n_total while mat_q is not, yet the two are subtracted below; confirm this is intended
+      mat_q[b][cat]+=(zsquare-1.0)*w;
+      mat_s[b][cat]+=w;
+
+      //only snps of category l advance the block: blocks 0..n_block-2 take n_snp of them each, the last block takes the remainder
+      if (cat==l) {
+	if (b<n_block-1) {
+	  if (t<n_snp-1) {t++;}  else {b++; t=0;}
+	} else {
+	  t++;
+	}
+      }
+    }
+
+    //center mat_q
+    //mat_q[k][i] is first replaced by the estimate obtained with block k removed, then the mean over the usable blocks is subtracted
+    for (size_t i=0; i<q->size; i++) {
+      m=0; n=0;
+      for (size_t k=0; k<n_block; k++) {
+	if (mat_s[k][i]!=0 && vec_s[i]!=mat_s[k][i]) {
+	  d=(vec_q[i]-mat_q[k][i])/(vec_s[i]-mat_s[k][i]);
+	  mat_q[k][i]=d;
+	  m+=d;
+	  n++;
+	}
+      }
+      if (n!=0) {m/=n;}
+
+      for (size_t k=0; k<n_block; k++) {
+	if (mat_q[k][i]!=0) {
+	  mat_q[k][i]-=m;
+	}
+      }
+    }
+
+    //compute Vq for l'th row and l'th column only
+    //d is the sum of cross products over the n blocks where both entries are nonzero, scaled by (n-1)/n, and added to what Vq already holds
+    for (size_t i=0; i<q->size; i++) {
+      d=0; n=0;
+      for (size_t k=0; k<n_block; k++) {
+	if (mat_q[k][l]!=0 && mat_q[k][i]!=0) {
+	  d+=mat_q[k][l]*mat_q[k][i];
+	  n++;
+	}
+      }
+      if (n!=0) {
+	d/=n;
+	d*=n-1;
+      }
+      d+=gsl_matrix_get(Vq, i, l);
+      gsl_matrix_set(Vq, i, l, d);
+      if (i!=l) {gsl_matrix_set(Vq, l, i, d);}
+    }
+
+  }
+
+  //divide the off diagonal elements of Vq by 2
+  //(an off-diagonal entry is accumulated in the pass for each of its two categories, a diagonal entry only in one)
+  for (size_t i=0; i<q->size; i++) {
+    for (size_t j=i; j<q->size; j++) {
+      if (i==j) {continue;}
+      d=gsl_matrix_get(Vq, i, j);
+      gsl_matrix_set(Vq, i, j, d/2);
+      gsl_matrix_set(Vq, j, i, d/2);
+    }
+  }
 
   return;
 }
@@ -3075,24 +3641,19 @@ void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar)
 
 
 
-//read V file: V (i.e. Q)
-void ReadFile_v (const string &file_v, gsl_matrix *V)
+//read vector file
+void ReadFile_vector (const string &file_vec, gsl_vector *vec)
 {
-  igzstream infile (file_v.c_str(), igzstream::in);
-  if (!infile) {cout<<"error! fail to open v file: "<<file_v<<endl; return;}
+  igzstream infile (file_vec.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open vector file: "<<file_vec<<endl; return;}
 
   string line;
   char *ch_ptr;
-  double d;
 
-  for (size_t i=0; i<V->size1; i++) {
+  for (size_t i=0; i<vec->size; i++) {
     !safeGetline(infile, line).eof();
     ch_ptr=strtok ((char *)line.c_str(), " , \t");
-    for (size_t j=0; j<V->size2; j++) {
-      d=gsl_matrix_get(V, i, j)+atof(ch_ptr);
-      gsl_matrix_set(V, i, j, d);
-      ch_ptr=strtok (NULL, " , \t");
-    }
+    gsl_vector_set(vec, i, atof(ch_ptr));
   }
 
   infile.clear();
@@ -3102,17 +3663,21 @@ void ReadFile_v (const string &file_v, gsl_matrix *V)
 }
 
 
-void ReadFile_mv (const string &file_mv, gsl_matrix *V)
+void ReadFile_matrix (const string &file_mat, gsl_matrix *mat)
 {
-  gsl_matrix_set_zero(V);
-
-  string file_name;
+  igzstream infile (file_mat.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open matrix file: "<<file_mat<<endl; return;}
 
-  igzstream infile (file_mv.c_str(), igzstream::in);
-  if (!infile) {cout<<"error! fail to open ms file: "<<file_mv<<endl; return;}
+  string line;
+  char *ch_ptr;
 
-  while (!safeGetline(infile, file_name).eof()) {
-    ReadFile_v(file_name, V);
+  for (size_t i=0; i<mat->size1; i++) {
+    !safeGetline(infile, line).eof();
+    ch_ptr=strtok ((char *)line.c_str(), " , \t");
+    for (size_t j=0; j<mat->size2; j++) {
+      gsl_matrix_set(mat, i, j, atof(ch_ptr));
+      ch_ptr=strtok (NULL, " , \t");
+    }
   }
 
   infile.clear();
@@ -3121,35 +3686,32 @@ void ReadFile_mv (const string &file_mv, gsl_matrix *V)
   return;
 }
 
-
-//read q file: q, s and ni_test
-void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df)
+void ReadFile_matrix (const string &file_mat, gsl_matrix *mat1, gsl_matrix *mat2)
 {
-  igzstream infile (file_s.c_str(), igzstream::in);
-  if (!infile) {cout<<"error! fail to open s file: "<<file_s<<endl; return;}
+  igzstream infile (file_mat.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open matrix file: "<<file_mat<<endl; return;}
 
   string line;
   char *ch_ptr;
-  double d;
 
-  for (size_t i=0; i<q_vec->size; i++) {
+  for (size_t i=0; i<mat1->size1; i++) {
     !safeGetline(infile, line).eof();
     ch_ptr=strtok ((char *)line.c_str(), " , \t");
-    d=gsl_vector_get(q_vec, i)+atof(ch_ptr);
-    gsl_vector_set(q_vec, i, d);
+    for (size_t j=0; j<mat1->size2; j++) {
+      gsl_matrix_set(mat1, i, j, atof(ch_ptr));
+      ch_ptr=strtok (NULL, " , \t");
+    }
   }
 
-  for (size_t i=0; i<s_vec->size; i++) {
+  for (size_t i=0; i<mat2->size1; i++) {
     !safeGetline(infile, line).eof();
     ch_ptr=strtok ((char *)line.c_str(), " , \t");
-    d=gsl_vector_get(s_vec, i)+atof(ch_ptr);
-    gsl_vector_set(s_vec, i, d);
+    for (size_t j=0; j<mat2->size2; j++) {
+      gsl_matrix_set(mat2, i, j, atof(ch_ptr));
+      ch_ptr=strtok (NULL, " , \t");
+    }
   }
 
-  !safeGetline(infile, line).eof();
-  ch_ptr=strtok ((char *)line.c_str(), " , \t");
-  df=atof(ch_ptr);
-
   infile.clear();
   infile.close();
 
@@ -3158,22 +3720,274 @@ void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, dou
 
 
 
-void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df)
+//load the summary files of one study
+//<file_study>.Vq.txt is read into Vq_mat and <file_study>.q.txt into q_vec;
+//<file_study>.size.txt holds s_vec->size+1 values: the leading ones are copied to s_vec, the last one is stored in ni
+void ReadFile_study (const string &file_study, gsl_matrix *Vq_mat, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni)
 {
+  const size_t n_size=s_vec->size;
+
+  //scratch vector with room for the extra trailing entry of the size file
+  gsl_vector *size_buf=gsl_vector_alloc (n_size+1);
+
+  ReadFile_matrix(file_study+".Vq.txt", Vq_mat);
+  ReadFile_vector(file_study+".size.txt", size_buf);
+  ReadFile_vector(file_study+".q.txt", q_vec);
+
+  for (size_t k=0; k<n_size; k++) {
+    gsl_vector_set (s_vec, k, gsl_vector_get (size_buf, k));
+  }
+  ni=gsl_vector_get (size_buf, n_size);
+
+  gsl_vector_free(size_buf);
+}
+
+
+//load the summary files of one reference panel
+//<file_ref>.size.txt holds s_vec->size+1 values: the leading ones are copied to s_vec, the last one is stored in ni;
+//<file_ref>.S.txt is read into S_mat followed by Svar_mat (reading of <file_ref>.V.txt is currently disabled)
+void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni)
+{
+  const size_t n_size=s_vec->size;
+
+  //scratch vector with room for the extra trailing entry of the size file
+  gsl_vector *size_buf=gsl_vector_alloc (n_size+1);
+
+  ReadFile_vector(file_ref+".size.txt", size_buf);
+  ReadFile_matrix(file_ref+".S.txt", S_mat, Svar_mat);
+
+  for (size_t k=0; k<n_size; k++) {
+    gsl_vector_set (s_vec, k, gsl_vector_get (size_buf, k));
+  }
+  ni=gsl_vector_get (size_buf, n_size);
+
+  gsl_vector_free(size_buf);
+}
+
+
+//read mstudy file: a list of study prefixes, one per line
+//for every prefix, <prefix>.Vq.txt, <prefix>.size.txt and <prefix>.q.txt are read and pooled:
+//  s_vec[i]    = sum over studies of the per-study size s_i
+//  q_vec[i]    = sum of q_i*s_i divided by the pooled s_vec[i]
+//  Vq_mat(i,j) = sum of Vq(i,j)*s_i*s_j divided by s_vec[i]*s_vec[j]
+//  ni          = the largest trailing entry found in the size files
+//entries whose size is zero are skipped and therefore stay at zero
+void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni)
+{
+  gsl_matrix_set_zero(Vq_mat);
   gsl_vector_set_zero(q_vec);
   gsl_vector_set_zero(s_vec);
+  ni=0;
+
+  //open the list before allocating scratch space, so that the early return cannot leak it
+  igzstream infile (file_mstudy.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open mstudy file: "<<file_mstudy<<endl; return;}
+
+  gsl_matrix *Vq_sub=gsl_matrix_alloc(Vq_mat->size1, Vq_mat->size2);
+  gsl_vector *q_sub=gsl_vector_alloc(q_vec->size);
+  //the size file carries one extra trailing entry (read into ni below)
+  gsl_vector *s=gsl_vector_alloc (s_vec->size+1);
 
   string file_name;
+  double d1, d2, d;
+
+  while (!safeGetline(infile, file_name).eof()) {
+    string Vqfile=file_name+".Vq.txt";
+    string sfile=file_name+".size.txt";
+    string qfile=file_name+".q.txt";
+
+    ReadFile_matrix(Vqfile, Vq_sub);
+    ReadFile_vector(sfile, s);
+    ReadFile_vector(qfile, q_sub);
+
+    ni=max(ni, (size_t)gsl_vector_get (s, s_vec->size));
+
+    //accumulate size-weighted sums; they are normalised after the loop
+    for (size_t i=0; i<s_vec->size; i++) {
+      d1=gsl_vector_get (s, i);
+      if (d1==0) {continue;}
+
+      d=gsl_vector_get(q_vec, i)+gsl_vector_get(q_sub, i)*d1;
+      gsl_vector_set(q_vec, i, d);
+
+      d=gsl_vector_get(s_vec, i)+d1;
+      gsl_vector_set(s_vec, i, d);
+
+      for (size_t j=i; j<s_vec->size; j++) {
+	d2=gsl_vector_get (s, j);
+	if (d2==0) {continue;}
+
+	d=gsl_matrix_get(Vq_mat, i, j)+gsl_matrix_get(Vq_sub, i, j)*d1*d2;
+	gsl_matrix_set(Vq_mat, i, j, d);
+	if (i!=j) {gsl_matrix_set(Vq_mat, j, i, d);}
+      }
+    }
+  }
 
-  igzstream infile (file_mq.c_str(), igzstream::in);
-  if (!infile) {cout<<"error! fail to open mq file: "<<file_mq<<endl; return;}
+  //normalise the weighted sums by the pooled sizes
+  for (size_t i=0; i<s_vec->size; i++) {
+    d1=gsl_vector_get (s_vec, i);
+    if (d1==0) {continue;}
+
+    d=gsl_vector_get (q_vec, i);
+    gsl_vector_set (q_vec, i, d/d1);
+
+    for (size_t j=i; j<s_vec->size; j++) {
+      d2=gsl_vector_get (s_vec, j);
+      if (d2==0) {continue;}
+
+      d=gsl_matrix_get (Vq_mat, i, j)/(d1*d2);
+      gsl_matrix_set (Vq_mat, i, j, d);
+      if (i!=j) {gsl_matrix_set(Vq_mat, j, i, d);}
+    }
+  }
+
+  gsl_matrix_free(Vq_sub);
+  gsl_vector_free(q_sub);
+  gsl_vector_free(s);
+
+  return;
+}
+
+
+//copied from lmm.cpp
+//maps an unordered pair (a, b), each in 1..n_cvt+2, to a position in a packed triangular layout:
+//pairs are ordered by their smaller member first, then by the larger one, starting at index 0
+//prints an error and returns 0 when a or b is outside 1..n_cvt+2
+size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) {
+	const size_t n=n_cvt+2;
+	if (a==0 || b==0 || a>n || b>n) {cout<<"error in GetabIndex."<<endl; return 0;}
+
+	//order the pair so that lo<=hi
+	const size_t lo=(b>a)?a:b;
+	const size_t hi=(b>a)?b:a;
+
+	return (2*n-lo+2)*(lo-1)/2+hi-lo;
+}
+
+//read mref file: a list of reference prefixes, one per line
+//for every prefix, <prefix>.size.txt and <prefix>.S.txt (S followed by Svar) are read and pooled:
+//  s_vec[i]      = sum over files of the per-file size s_i
+//  S_mat(i,j)    = sum of S(i,j)*s_i*s_j divided by s_vec[i]*s_vec[j]
+//  Svar_mat(i,j) = sum of Svar(i,j)*(s_i*s_j)^2 divided by (s_vec[i]*s_vec[j])^2
+//  ni            = the largest trailing entry found in the size files
+//the final normalisation visits j>=i only and mirrors each value to (j,i)
+void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni)
+{
+  gsl_matrix_set_zero(S_mat);
+  gsl_matrix_set_zero(Svar_mat);
+  //  gsl_matrix_set_zero(V_mat);
+  gsl_vector_set_zero(s_vec);
+  ni=0;
+
+  //open the list before allocating scratch space, so that the early return cannot leak it
+  igzstream infile (file_mref.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open mref file: "<<file_mref<<endl; return;}
+
+  //size_t n_vc=S_mat->size1;
+  gsl_matrix *S_sub=gsl_matrix_alloc (S_mat->size1, S_mat->size2);
+  gsl_matrix *Svar_sub=gsl_matrix_alloc (Svar_mat->size1, Svar_mat->size2);
+  //gsl_matrix *V_sub=gsl_matrix_alloc (V_mat->size1, V_mat->size2);
+  //the size file carries one extra trailing entry (read into ni below)
+  gsl_vector *s=gsl_vector_alloc (s_vec->size+1);
+
+  string file_name;
+  double d1, d2, d;
+  //size_t t_ij;
 
   while (!safeGetline(infile, file_name).eof()) {
-    ReadFile_q(file_name, q_vec, s_vec, df);
+    string sfile=file_name+".size.txt";
+    string Sfile=file_name+".S.txt";
+    //string Vfile=file_name+".V.txt";
+
+    ReadFile_vector(sfile, s);
+    ReadFile_matrix(Sfile, S_sub, Svar_sub);
+    //ReadFile_matrix(Vfile, V_sub);
+
+    //update s_vec and ni
+    for (size_t i=0; i<s_vec->size; i++) {
+      d=gsl_vector_get (s, i)+gsl_vector_get (s_vec, i);
+      gsl_vector_set (s_vec, i, d);
+    }
+    ni=max(ni, (size_t)gsl_vector_get (s, s_vec->size));
+
+    //update S and Svar from each file
+    //scale by the per-file sizes so that the sums can be normalised by the pooled sizes afterwards
+    for (size_t i=0; i<S_mat->size1; i++) {
+      d1=gsl_vector_get(s, i);
+      for (size_t j=0; j<S_mat->size2; j++) {
+	d2=gsl_vector_get(s, j);
+
+	d=gsl_matrix_get(S_sub, i, j)*d1*d2;
+	gsl_matrix_set(S_sub, i, j, d);
+	d=gsl_matrix_get(Svar_sub, i, j)*d1*d2*d1*d2;
+	gsl_matrix_set(Svar_sub, i, j, d);
+      }
+    }
+
+    gsl_matrix_add (S_mat, S_sub);
+    gsl_matrix_add (Svar_mat, Svar_sub);
+    /*
+    //update V from each file
+    for (size_t i=0; i<n_vc; i++) {
+      d1=gsl_vector_get(s, i);
+      for (size_t j=i; j<n_vc; j++) {
+	d2=gsl_vector_get(s, j);
+	t_ij=GetabIndex (i+1, j+1, n_vc-2);
+	for (size_t l=0; l<n_vc+1; l++) {
+	  if (l==n_vc) {d3=1;} else {d3=gsl_vector_get(s, l);}
+	  for (size_t m=0; m<n_vc+1; m++) {
+	    if (m==n_vc) {d4=1;} else {d4=gsl_vector_get(s, m);}
+
+	    d=gsl_matrix_get (V_sub, l, t_ij*(n_vc+1)+m)*d1*d2*d3*d4;
+	    gsl_matrix_set (V_sub, l, t_ij*(n_vc+1)+m, d);
+	  }
+	}
+      }
+    }
+
+    gsl_matrix_add (V_mat, V_sub);
+    */
   }
 
-  infile.clear();
-  infile.close();
+  //final: update S and Svar
+  for (size_t i=0; i<S_mat->size1; i++) {
+    d1=gsl_vector_get(s_vec, i);
+    if (d1==0) {continue;}
+    for (size_t j=i; j<S_mat->size2; j++) {
+      d2=gsl_vector_get(s_vec, j);
+      if (d2==0) {continue;}
+
+      d=gsl_matrix_get(S_mat, i, j)/(d1*d2);
+      gsl_matrix_set(S_mat, i, j, d);
+      if (i!=j) {gsl_matrix_set(S_mat, j, i, d);}
+
+      d=gsl_matrix_get(Svar_mat, i, j)/(d1*d2*d1*d2);
+      gsl_matrix_set(Svar_mat, i, j, d);
+      if (i!=j) {gsl_matrix_set(Svar_mat, j, i, d);}
+    }
+  }
+  /*
+  //final: update V
+  for (size_t i=0; i<n_vc; i++) {
+    d1=gsl_vector_get(s_vec, i);
+    if (d1==0) {continue;}
+    for (size_t j=i; j<n_vc; j++) {
+      d2=gsl_vector_get(s_vec, j);
+      if (d2==0) {continue;}
+      t_ij=GetabIndex (i+1, j+1, n_vc-2);
+	for (size_t l=0; l<n_vc+1; l++) {
+	  if (l==n_vc) {d3=1;} else {d3=gsl_vector_get(s_vec, l);}
+	  if (d3==0) {continue;}
+	  for (size_t m=0; m<n_vc+1; m++) {
+	    if (m==n_vc) {d4=1;} else {d4=gsl_vector_get(s_vec, m);}
+	    if (d4==0) {continue;}
+
+	    d=gsl_matrix_get (V_mat, l, t_ij*(n_vc+1)+m)/(d1*d2*d3*d4);
+	    gsl_matrix_set (V_mat, l, t_ij*(n_vc+1)+m, d);
+	  }
+	}
+      }
+    }
+  */
+  //free matrices
+  gsl_matrix_free(S_sub);
+  gsl_matrix_free(Svar_sub);
+  //gsl_matrix_free(V_sub);
+  gsl_vector_free(s);
 
   return;
 }
+
+
+
+
+
+
diff --git a/src/io.h b/src/io.h
index 6787176..14dfcc9 100644
--- a/src/io.h
+++ b/src/io.h
@@ -44,6 +44,7 @@ void ProgressBar (string str, double p, double total, double ratio);
 std::istream& safeGetline(std::istream& is, std::string& t);
 
 bool ReadFile_snps (const string &file_snps, set<string> &setSnps);
+bool ReadFile_snps_header (const string &file_snps, set<string> &setSnps);
 bool ReadFile_log (const string &file_log, double &pheno_mean);
 
 bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo);
@@ -80,20 +81,23 @@ bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SN
 
 bool ReadHeader (const string &line, HEADER &header);
 bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_t &n_vc);
+bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, size_t &n_vc);
 
-bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin);
-bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin);
+bool BimbamKin (const string &file_geno, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns);
+bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns);
+bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int display_pace, const vector<int> &indicator_idv, const vector<vector<int> > &mindicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<vector<SNPINFO> > &msnpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns);
 
-bool ReadFile_var (const string &file_var, map<string, double> &mapRS2var);
-void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test);
+bool ReadFile_wsnp (const string &file_wsnp, map<string, double> &mapRS2double);
+bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vector<double> > &mapRS2vector);
 
+void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2wA, vector<size_t> &vec_cat, vector<size_t> &vec_ni, vector<double> &vec_weight, vector<double> &vec_z2, size_t &ni_total, size_t &ns_total, size_t &ns_test);
+void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA, map<string, string> &mapRS2A1, map<string, double> &mapRS2z);
+void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<size_t> &vec_ni, const vector<double> &vec_weight, const vector<double> &vec_z2, gsl_matrix *Vq, gsl_vector *q, gsl_vector *s);
 
-void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar);
-void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar);
-void ReadFile_v (const string &file_v, gsl_matrix *V);
-void ReadFile_mv (const string &file_mq, gsl_matrix *V);
-void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df);
-void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df);
+void ReadFile_study (const string &file_study, gsl_matrix *Vq, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni);
+void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni);
+void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni);
+void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni);
 
 // WJA added
 bool bgenKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin);
diff --git a/src/lm.cpp b/src/lm.cpp
index b4bc010..f8cb974 100644
--- a/src/lm.cpp
+++ b/src/lm.cpp
@@ -41,6 +41,7 @@
 #include "gsl/gsl_min.h"
 #include "gsl/gsl_integration.h"
 
+#include "eigenlib.h"
 #include "gzstream.h"
 #include "lapack.h"
 
@@ -519,9 +520,9 @@ void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y)
 		for (size_t i=0; i<ni_test; ++i) {
 			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
 			geno=gsl_vector_get(x, i);
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
 
@@ -626,9 +627,9 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y)
 		for (size_t i=0; i<ni_test; ++i) {
 			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
 			geno=gsl_vector_get(x, i);
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
 		//calculate statistics
@@ -712,7 +713,6 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y)
 		b=ch[0];
 	}
 
-
 	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
 		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
 		if (indicator_snp[t]==0) {continue;}
@@ -747,9 +747,9 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y)
 		for (size_t i=0; i<ni_test; ++i) {
 			geno=gsl_vector_get(x,i);
 			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
 		//calculate statistics
@@ -759,11 +759,11 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y)
 		CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
 		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
 
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
-
 		//store summary data
 		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
 		sumStat.push_back(SNPs);
+
+		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 	}
 	cout<<endl;
 
diff --git a/src/lmm.cpp b/src/lmm.cpp
index 7bcf89a..af6ff8a 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -42,6 +42,7 @@
 #include "gsl/gsl_integration.h"
 
 #include "io.h"
+#include "eigenlib.h"
 #include "lapack.h"
 #include "gzstream.h"
 
@@ -1228,6 +1229,12 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_
 	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);
 	gsl_vector *ab=gsl_vector_alloc (n_index);
 
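+	//create large buffers: Xlarge collects up to msize genotype columns, UtXlarge receives the transformed columns for a whole batch at once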
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix_set_zero(Xlarge);
+
 	gsl_matrix_set_zero (Uab);
 	CalcUab (UtW, Uty, Uab);
 //	if (e_mode!=0) {
@@ -1236,6 +1243,7 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_
 //	}
 
 	//start reading genotypes and analyze
+	size_t c=0;
 	for (size_t t=0; t<indicator_snp.size(); ++t) {
 //		if (t>1) {break;}
 		!safeGetline(infile, line).eof();
@@ -1268,48 +1276,72 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_
 		for (size_t i=0; i<ni_test; ++i) {
 			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
 			geno=gsl_vector_get(x, i);
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//	gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
-
+		/* per-SNP dgemv, kept for reference; superseded by the batched eigenlib_dgemm below
 		//calculate statistics
 		time_start=clock();
 		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx);
 		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		*/
 
-		CalcUab(UtW, Uty, Utx, Uab);
-//		if (e_mode!=0) {
-//			Calcab (W, y, x, ab);
-//		}
+		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize);
+		gsl_vector_memcpy (&Xlarge_col.vector, x);
+		c++;
 
-		time_start=clock();
-		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		if (c%msize==0 || t==indicator_snp.size()-1 ) {
+		  size_t l=0;
+		  if (c%msize==0) {l=msize;} else {l=c%msize;}
 
-		//3 is before 1
-		if (a_mode==3 || a_mode==4) {
-			CalcRLScore (l_mle_null, param1, beta, se, p_score);
-		}
+		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
+		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);
 
-		if (a_mode==1 || a_mode==4) {
-			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
-			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
-		}
+		  time_start=clock();
+		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
+		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 
-		if (a_mode==2 || a_mode==4) {
-			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
-			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);
-		}
+		  gsl_matrix_set_zero (Xlarge);
 
-		if (x_mean>1) {beta*=-1;}
+		  for (size_t i=0; i<l; i++) {
+		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
+		    gsl_vector_memcpy (Utx, &UtXlarge_col.vector);
 
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		    CalcUab(UtW, Uty, Utx, Uab);
+		    //		if (e_mode!=0) {
+		    //			Calcab (W, y, x, ab);
+		    //		}
 
-		//store summary data
-		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
-		sumStat.push_back(SNPs);
-    }
+		    time_start=clock();
+		    FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+
+		    //3 is before 1
+		    if (a_mode==3 || a_mode==4) {
+		      CalcRLScore (l_mle_null, param1, beta, se, p_score);
+		    }
+
+		    if (a_mode==1 || a_mode==4) {
+		      CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+		      CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		    }
+
+		    if (a_mode==2 || a_mode==4) {
+		      CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);
+		    }
+
+		    //if (x_mean>1) {beta*=-1;}
+
+		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+		    //store summary data
+		    SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		    sumStat.push_back(SNPs);
+		  }
+		}
+	}
 	cout<<endl;
 
 	gsl_vector_free (x);
@@ -1318,6 +1350,9 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_
 	gsl_matrix_free (Uab);
 	gsl_vector_free (ab);
 
+	gsl_matrix_free (Xlarge);
+	gsl_matrix_free (UtXlarge);
+
 	infile.close();
 	infile.clear();
 
@@ -1354,6 +1389,12 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m
 	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);
 	gsl_vector *ab=gsl_vector_alloc (n_index);
 
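+	//create large buffers: Xlarge collects up to msize genotype columns, UtXlarge receives the transformed columns for a whole batch at once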
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix_set_zero(Xlarge);
+
 	gsl_matrix_set_zero (Uab);
 	CalcUab (UtW, Uty, Uab);
 //	if (e_mode!=0) {
@@ -1371,7 +1412,7 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m
 		b=ch[0];
 	}
 
-
+	size_t c=0;
 	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
 		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
 		if (indicator_snp[t]==0) {continue;}
@@ -1406,46 +1447,71 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m
 		for (size_t i=0; i<ni_test; ++i) {
 			geno=gsl_vector_get(x,i);
 			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
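+		/* per-SNP dgemv, kept for reference; superseded by the batched eigenlib_dgemm below. NOTE(review): the batch is flushed only when c%msize==0 or t==indicator_snp.size()-1, so if the last SNP is skipped by the indicator_snp[t]==0 check the trailing partial batch looks like it is never analyzed -- confirm.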
 		//calculate statistics
 		time_start=clock();
 		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx);
 		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		*/
 
-		CalcUab(UtW, Uty, Utx, Uab);
-//		if (e_mode!=0) {
-//			Calcab (W, y, x, ab);
-//		}
+		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize);
+		gsl_vector_memcpy (&Xlarge_col.vector, x);
+		c++;
 
-		time_start=clock();
-		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		if (c%msize==0 || t==indicator_snp.size()-1 ) {
+		  size_t l=0;
+		  if (c%msize==0) {l=msize;} else {l=c%msize;}
 
-		//3 is before 1, for beta
-		if (a_mode==3 || a_mode==4) {
-			CalcRLScore (l_mle_null, param1, beta, se, p_score);
-		}
+		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
+		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);
 
-		if (a_mode==1 || a_mode==4) {
-			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
-			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
-		}
+		  time_start=clock();
+		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
+		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 
-		if (a_mode==2 || a_mode==4) {
-			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
-			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);
-		}
+		  gsl_matrix_set_zero (Xlarge);
 
-		if (x_mean>1) {beta*=-1;}
+		  for (size_t i=0; i<l; i++) {
+		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
+		    gsl_vector_memcpy (Utx, &UtXlarge_col.vector);
 
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		    CalcUab(UtW, Uty, Utx, Uab);
+		    //		if (e_mode!=0) {
+		    //			Calcab (W, y, x, ab);
+		    //		}
 
-		//store summary data
-		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
-		sumStat.push_back(SNPs);
+		    time_start=clock();
+		    FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+
+		    //3 is before 1, for beta
+		    if (a_mode==3 || a_mode==4) {
+		      CalcRLScore (l_mle_null, param1, beta, se, p_score);
+		    }
+
+		    if (a_mode==1 || a_mode==4) {
+		      CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+		      CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		    }
+
+		    if (a_mode==2 || a_mode==4) {
+		      CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);
+		    }
+
+		    //if (x_mean>1) {beta*=-1;}
+
+		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+		    //store summary data
+		    SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		    sumStat.push_back(SNPs);
+		  }
+		}
     }
 	cout<<endl;
 
@@ -1454,6 +1520,9 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m
 	gsl_matrix_free (Uab);
 	gsl_vector_free (ab);
 
+	gsl_matrix_free(Xlarge);
+	gsl_matrix_free(UtXlarge);
+
 	infile.close();
 	infile.clear();
 
@@ -1487,6 +1556,12 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma
 	gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index);
 	gsl_vector *ab=gsl_vector_alloc (n_index);
 
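+	//create large buffers: Xlarge collects up to msize genotype columns, UtXlarge receives the transformed columns for a whole batch at once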
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix_set_zero(Xlarge);
+
 	gsl_matrix_set_zero (Uab);
 	CalcUab (UtW, Uty, Uab);
 //	if (e_mode!=0) {
@@ -1537,6 +1612,7 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma
 
 
 	//start reading genotypes and analyze
+	size_t c=0;
 	for (size_t t=0; t<indicator_snp.size(); ++t)
 	{
 
@@ -1645,47 +1721,71 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma
 		for (size_t i=0; i<ni_test; ++i) {
 			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
 			geno=gsl_vector_get(x, i);
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
-
+		/* per-SNP dgemv, kept for reference; superseded by the batched eigenlib_dgemm below
 		//calculate statistics
 		time_start=clock();
 		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx);
 		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		*/
 
-		CalcUab(UtW, Uty, Utx, Uab);
-//		if (e_mode!=0) {
-//			Calcab (W, y, x, ab);
-//		}
+		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize);
+		gsl_vector_memcpy (&Xlarge_col.vector, x);
+		c++;
 
-		time_start=clock();
-		FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+		if (c%msize==0 || t==indicator_snp.size()-1 ) {
+		  size_t l=0;
+		  if (c%msize==0) {l=msize;} else {l=c%msize;}
 
-		//3 is before 1
-		if (a_mode==3 || a_mode==4) {
-			CalcRLScore (l_mle_null, param1, beta, se, p_score);
-		}
+		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
+		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);
 
-		if (a_mode==1 || a_mode==4) {
-			CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
-			CalcRLWald (lambda_remle, param1, beta, se, p_wald);
-		}
+		  time_start=clock();
+		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
+		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 
-		if (a_mode==2 || a_mode==4) {
-			CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
-			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);
-		}
+		  gsl_matrix_set_zero (Xlarge);
 
-		if (x_mean>1) {beta*=-1;}
+		  for (size_t i=0; i<l; i++) {
+		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
+		    gsl_vector_memcpy (Utx, &UtXlarge_col.vector);
 
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		    CalcUab(UtW, Uty, Utx, Uab);
+		    //		if (e_mode!=0) {
+		    //			Calcab (W, y, x, ab);
+		    //		}
 
-		//store summary data
-		SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
-		sumStat.push_back(SNPs);
+		    time_start=clock();
+		    FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0};
+
+		    //3 is before 1
+		    if (a_mode==3 || a_mode==4) {
+		      CalcRLScore (l_mle_null, param1, beta, se, p_score);
+		    }
+
+		    if (a_mode==1 || a_mode==4) {
+		      CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+		      CalcRLWald (lambda_remle, param1, beta, se, p_wald);
+		    }
+
+		    if (a_mode==2 || a_mode==4) {
+		      CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1);
+		    }
+
+		    //if (x_mean>1) {beta*=-1;}
+
+		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+		    //store summary data
+		    SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		    sumStat.push_back(SNPs);
+		  }
+		}
     }
 	cout<<endl;
 
@@ -1695,6 +1795,9 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma
 	gsl_matrix_free (Uab);
 	gsl_vector_free (ab);
 
+	gsl_matrix_free(Xlarge);
+	gsl_matrix_free(UtXlarge);
+
 	infile.close();
 	infile.clear();
 
diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp
index e9560ad..915245b 100644
--- a/src/mathfunc.cpp
+++ b/src/mathfunc.cpp
@@ -40,6 +40,7 @@
 #include "Eigen/Dense"
 
 #include "lapack.h"
+#include "eigenlib.h"
 
 #ifdef FORCE_FLOAT
 #include "mathfunc_float.h"
@@ -247,6 +248,7 @@ void StandardizeVector (gsl_vector *y)
 //calculate UtX
 void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX)
 {
+  /* original column-by-column dgemv version, kept for reference:
 	gsl_vector *Utx_vec=gsl_vector_alloc (UtX->size1);
 	for (size_t i=0; i<UtX->size2; ++i) {
 		gsl_vector_view UtX_col=gsl_matrix_column (UtX, i);
@@ -254,17 +256,28 @@ void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX)
 		gsl_vector_memcpy (&UtX_col.vector, Utx_vec);
 	}
 	gsl_vector_free (Utx_vec);
+  */
+
+	gsl_matrix *X=gsl_matrix_alloc (UtX->size1, UtX->size2);	//scratch matrix with the same shape as UtX
+	gsl_matrix_memcpy (X, UtX);	//UtX holds the input on entry; copy it because UtX is also the output
+	eigenlib_dgemm ("T", "N", 1.0, U, X, 0.0, UtX);	//one call in place of the loop above; presumably UtX = U^T * X (eigenlib_dgemm is defined elsewhere)
+	gsl_matrix_free (X);	//release the scratch copy
+
 	return;
 }
 
 
 void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX)
 {
+  /* original column-by-column dgemv version, kept for reference:
 	for (size_t i=0; i<X->size2; ++i) {
 		gsl_vector_const_view X_col=gsl_matrix_const_column (X, i);
 		gsl_vector_view UtX_col=gsl_matrix_column (UtX, i);
 		gsl_blas_dgemv (CblasTrans, 1.0, U, &X_col.vector, 0.0, &UtX_col.vector);
 	}
+  */
+	eigenlib_dgemm ("T", "N", 1.0, U, X, 0.0, UtX);	//one call in place of the loop above; presumably UtX = U^T * X (eigenlib_dgemm is defined elsewhere)
+
 	return;
 }
 
@@ -329,7 +342,8 @@ double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab)
 		het_probs[i] = 0.0;
 
 	/* start at midpoint */
-		int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes);
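+	//XZ modified: cast the operands to (long int) so the product is evaluated in long int arithmetic (presumably to avoid int overflow with large sample sizes)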
+	int mid = ((long int)rare_copies * (2 * (long int)genotypes - (long int)rare_copies)) / (2 * (long int)genotypes);
 
 	/* check to ensure that midpoint and rare alleles have same parity */
 		if ((rare_copies & 1) ^ (mid & 1))
@@ -390,7 +404,7 @@ double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab)
 			p_hwe += het_probs[i];
 		}
 
-	p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe;
+		p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe;
 
 	free(het_probs);
 
diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp
index 5826a1f..7655b50 100644
--- a/src/mvlmm.cpp
+++ b/src/mvlmm.cpp
@@ -42,6 +42,7 @@
 
 #include "io.h"
 #include "lapack.h"
+#include "eigenlib.h"
 #include "gzstream.h"
 
 #ifdef FORCE_FLOAT
@@ -2935,12 +2936,17 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_
 	ifstream infile (file_bgen.c_str(), ios::binary);
 	if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return;}
 
-
 	clock_t time_start=clock();
 	time_UtX=0; time_opt=0;
 
 	string line;
 
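+	//create large buffers: Xlarge collects up to msize genotype columns, UtXlarge receives the transformed columns for a whole batch at once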
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix_set_zero(Xlarge);
+
 	//	double lambda_mle=0, lambda_remle=0, beta=0, se=0, ;
 	double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0;
 	double crt_a, crt_b, crt_c;
@@ -3179,6 +3185,7 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_
 
 
 	//start reading genotypes and analyze
+	size_t csnp=0;
 	for (size_t t=0; t<indicator_snp.size(); ++t) {
 
 
@@ -3287,87 +3294,112 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_
 		for (size_t i=0; i<ni_test; ++i) {
 			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
 			geno=gsl_vector_get(x, i);
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
+		/* per-SNP dgemv, kept for reference; superseded by the batched eigenlib_dgemm below
 		//calculate statistics
 		time_start=clock();
 		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
 		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		*/
 
-		//initial values
-		gsl_matrix_memcpy (V_g, V_g_null);
-		gsl_matrix_memcpy (V_e, V_e_null);
-		gsl_matrix_memcpy (B, B_null);
+		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize);
+		gsl_vector_memcpy (&Xlarge_col.vector, x);
+		csnp++;
 
-		time_start=clock();
+		if (csnp%msize==0 || t==indicator_snp.size()-1 ) {
+		  size_t l=0;
+		  if (csnp%msize==0) {l=msize;} else {l=csnp%msize;}
 
-		//3 is before 1
-		if (a_mode==3 || a_mode==4) {
-			p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
-			if (p_score<p_nr && crt==1) {
-				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
-			}
-		}
+		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
+		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);
 
-		if (a_mode==2 || a_mode==4) {
-			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+		  time_start=clock();
+		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
+		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+		  gsl_matrix_set_zero (Xlarge);
+
+		  for (size_t i=0; i<l; i++) {
+		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
+		    gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector);
+
+		    //initial values
+		    gsl_matrix_memcpy (V_g, V_g_null);
+		    gsl_matrix_memcpy (V_e, V_e_null);
+		    gsl_matrix_memcpy (B, B_null);
+
+		    time_start=clock();
+
+		    //3 is before 1
+		    if (a_mode==3 || a_mode==4) {
+		      p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
+		      if (p_score<p_nr && crt==1) {
+			logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+			p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
+		      }
+		    }
+
+		    if (a_mode==2 || a_mode==4) {
+		      logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+		      //calculate beta and Vbeta
+		      p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
+
+		      if (p_lrt<p_nr) {
+			logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
 			//calculate beta and Vbeta
 			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
 
-			if (p_lrt<p_nr) {
-				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-				//calculate beta and Vbeta
-				p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
-				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
-
-				if (crt==1) {
-					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
-				}
+			if (crt==1) {
+			  p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
 			}
-		}
+		      }
+		    }
 
-		if (a_mode==1 || a_mode==4) {
-			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
-			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		    if (a_mode==1 || a_mode==4) {
+		      logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+		      p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 
-			if (p_wald<p_nr) {
-				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-				p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		      if (p_wald<p_nr) {
+			logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 
-				if (crt==1) {
-					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
-				}
+			if (crt==1) {
+			  p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
 			}
-		}
+		      }
+		    }
 
-		if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
+		    //if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
 
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 
-		//store summary data
-		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
-		for (size_t i=0; i<d_size; i++) {
-			v_beta[i]=gsl_vector_get (beta, i);
-		}
+		    //store summary data
+		    //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		    for (size_t i=0; i<d_size; i++) {
+		      v_beta[i]=gsl_vector_get (beta, i);
+		    }
 
-		c=0;
-		for (size_t i=0; i<d_size; i++) {
-			for (size_t j=i; j<d_size; j++) {
-				v_Vg[c]=gsl_matrix_get (V_g, i, j);
-				v_Ve[c]=gsl_matrix_get (V_e, i, j);
-				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
-				c++;
-			}
-		}
+		    c=0;
+		    for (size_t i=0; i<d_size; i++) {
+		      for (size_t j=i; j<d_size; j++) {
+			v_Vg[c]=gsl_matrix_get (V_g, i, j);
+			v_Ve[c]=gsl_matrix_get (V_e, i, j);
+			v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
+			c++;
+		      }
+		    }
 
-		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
-		sumStat.push_back(SNPs);
-    }
+		    MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
+		    sumStat.push_back(SNPs);
+		  }
+		}
+	}
 	cout<<endl;
 
 
@@ -3404,6 +3436,9 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_
 	gsl_matrix_free(B_null);
 	gsl_matrix_free(se_B_null);
 
+	gsl_matrix_free(Xlarge);
+	gsl_matrix_free(UtXlarge);
+
 	return;
 }
 
@@ -3430,6 +3465,12 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs
 
 	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;
 
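+	//create large buffers: Xlarge collects up to msize genotype columns, UtXlarge receives the transformed columns for a whole batch at once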
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix_set_zero(Xlarge);
+
 	//large matrices for EM
 	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
 	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
@@ -3615,6 +3656,7 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs
 	gsl_matrix_memcpy (B_null, B);
 
 	//start reading genotypes and analyze
+	size_t csnp=0;
 	for (size_t t=0; t<indicator_snp.size(); ++t) {
 		//if (t>=1) {break;}
 		!safeGetline(infile, line).eof();
@@ -3647,86 +3689,111 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs
 		for (size_t i=0; i<ni_test; ++i) {
 			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
 			geno=gsl_vector_get(x, i);
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//	gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
+		/*
 		//calculate statistics
 		time_start=clock();
 		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
 		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		*/
 
-		//initial values
-		gsl_matrix_memcpy (V_g, V_g_null);
-		gsl_matrix_memcpy (V_e, V_e_null);
-		gsl_matrix_memcpy (B, B_null);
+		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize);
+		gsl_vector_memcpy (&Xlarge_col.vector, x);
+		csnp++;
 
-		time_start=clock();
+		if (csnp%msize==0 || t==indicator_snp.size()-1 ) {
+		  size_t l=0;
+		  if (csnp%msize==0) {l=msize;} else {l=csnp%msize;}
 
-		//3 is before 1
-		if (a_mode==3 || a_mode==4) {
-			p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
-			if (p_score<p_nr && crt==1) {
-				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
-			}
-		}
+		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
+		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);
 
-		if (a_mode==2 || a_mode==4) {
-			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+		  time_start=clock();
+		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
+		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+		  gsl_matrix_set_zero (Xlarge);
+
+		  for (size_t i=0; i<l; i++) {
+		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
+		    gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector);
+
+		    //initial values
+		    gsl_matrix_memcpy (V_g, V_g_null);
+		    gsl_matrix_memcpy (V_e, V_e_null);
+		    gsl_matrix_memcpy (B, B_null);
+
+		    time_start=clock();
+
+		    //3 is before 1
+		    if (a_mode==3 || a_mode==4) {
+		      p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
+		      if (p_score<p_nr && crt==1) {
+			logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+			p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
+		      }
+		    }
+
+		    if (a_mode==2 || a_mode==4) {
+		      logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+		      //calculate beta and Vbeta
+		      p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
+
+		      if (p_lrt<p_nr) {
+			logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
 			//calculate beta and Vbeta
 			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
 
-			if (p_lrt<p_nr) {
-				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-				//calculate beta and Vbeta
-				p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
-				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
-
-				if (crt==1) {
-					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
-				}
+			if (crt==1) {
+			  p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
 			}
-		}
+		      }
+		    }
 
-		if (a_mode==1 || a_mode==4) {
-			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
-			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		    if (a_mode==1 || a_mode==4) {
+		      logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+		      p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 
-			if (p_wald<p_nr) {
-				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-				p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		      if (p_wald<p_nr) {
+			logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 
-				if (crt==1) {
-					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
-				}
+			if (crt==1) {
+			  p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
 			}
-		}
+		      }
+		    }
 
-		if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
+		    //if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
 
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 
-		//store summary data
-		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
-		for (size_t i=0; i<d_size; i++) {
-			v_beta[i]=gsl_vector_get (beta, i);
-		}
+		    //store summary data
+		    //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		    for (size_t i=0; i<d_size; i++) {
+		      v_beta[i]=gsl_vector_get (beta, i);
+		    }
 
-		c=0;
-		for (size_t i=0; i<d_size; i++) {
-			for (size_t j=i; j<d_size; j++) {
-				v_Vg[c]=gsl_matrix_get (V_g, i, j);
-				v_Ve[c]=gsl_matrix_get (V_e, i, j);
-				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
-				c++;
-			}
-		}
+		    c=0;
+		    for (size_t i=0; i<d_size; i++) {
+		      for (size_t j=i; j<d_size; j++) {
+			v_Vg[c]=gsl_matrix_get (V_g, i, j);
+			v_Ve[c]=gsl_matrix_get (V_e, i, j);
+			v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
+			c++;
+		      }
+		    }
 
-		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
-		sumStat.push_back(SNPs);
+		    MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
+		    sumStat.push_back(SNPs);
+		  }
+		}
     }
 	cout<<endl;
 
@@ -3764,6 +3831,9 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs
 	gsl_matrix_free(B_null);
 	gsl_matrix_free(se_B_null);
 
+	gsl_matrix_free(Xlarge);
+	gsl_matrix_free(UtXlarge);
+
 	return;
 }
 
@@ -3795,6 +3865,12 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl
 	size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2;
 	size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2;
 
+	//create a large matrix
+	size_t msize=10000;
+	gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize);
+	gsl_matrix_set_zero(Xlarge);
+
 	//large matrices for EM
 	gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size);
 	gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size);
@@ -3992,6 +4068,7 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl
 		b=ch[0];
 	}
 
+	size_t csnp=0;
 	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
 		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
 		if (indicator_snp[t]==0) {continue;}
@@ -4030,9 +4107,9 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl
 		for (size_t i=0; i<ni_test; ++i) {
 			geno=gsl_vector_get(x,i);
 			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
+			//if (x_mean>1) {
+			//	gsl_vector_set(x, i, 2-geno);
+			//}
 		}
 
 		/*
@@ -4047,85 +4124,110 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl
 		}
 		*/
 
+		/*
 		//calculate statistics
 		time_start=clock();
 		gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector);
 		time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		*/
 
-		//initial values
-		gsl_matrix_memcpy (V_g, V_g_null);
-		gsl_matrix_memcpy (V_e, V_e_null);
-		gsl_matrix_memcpy (B, B_null);
+		gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize);
+		gsl_vector_memcpy (&Xlarge_col.vector, x);
+		csnp++;
 
-		time_start=clock();
+		if (csnp%msize==0 || t==indicator_snp.size()-1 ) {
+		  size_t l=0;
+		  if (csnp%msize==0) {l=msize;} else {l=csnp%msize;}
 
-		//3 is before 1
-		if (a_mode==3 || a_mode==4) {
-			p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
+		  gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
+		  gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);
 
-			if (p_score<p_nr && crt==1) {
-				logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-				p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
-			}
-		}
+		  time_start=clock();
+		  eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix);
+		  time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+
+		  gsl_matrix_set_zero (Xlarge);
+
+		  for (size_t i=0; i<l; i++) {
+		    gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i);
+		    gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector);
+
+		    //initial values
+		    gsl_matrix_memcpy (V_g, V_g_null);
+		    gsl_matrix_memcpy (V_e, V_e_null);
+		    gsl_matrix_memcpy (B, B_null);
+
+		    time_start=clock();
+
+		    //3 is before 1
+		    if (a_mode==3 || a_mode==4) {
+		      p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta);
+
+		      if (p_score<p_nr && crt==1) {
+			logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+			p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c);
+		      }
+		    }
+
+		    if (a_mode==2 || a_mode==4) {
+		      logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+		      //calculate beta and Vbeta
+		      p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		      p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
+
+		      if (p_lrt<p_nr) {
+			logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
 
-		if (a_mode==2 || a_mode==4) {
-			logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
 			//calculate beta and Vbeta
 			p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 			p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
-
-			if (p_lrt<p_nr) {
-				logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-
-				//calculate beta and Vbeta
-				p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
-				p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size );
-				if (crt==1) {
-					p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
-				}
+			if (crt==1) {
+			  p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c);
 			}
-		}
+		      }
+		    }
 
-		if (a_mode==1 || a_mode==4) {
-			logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
-			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		    if (a_mode==1 || a_mode==4) {
+		      logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
+		      p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 
-			if (p_wald<p_nr) {
-				logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-				p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
+		      if (p_wald<p_nr) {
+			logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
+			p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta);
 
-				if (crt==1) {
-					p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
-				}
+			if (crt==1) {
+			  p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c);
 			}
-		}
+		      }
+		    }
 
-		//cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl;
+		    //cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl;
 
-		if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
+		    //if (x_mean>1) {gsl_vector_scale(beta, -1.0);}
 
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
+		    time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
 
-		//store summary data
-		//SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
-		for (size_t i=0; i<d_size; i++) {
-			v_beta[i]=gsl_vector_get (beta, i);
-		}
+		    //store summary data
+		    //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score};
+		    for (size_t i=0; i<d_size; i++) {
+		      v_beta[i]=gsl_vector_get (beta, i);
+		    }
 
-		c=0;
-		for (size_t i=0; i<d_size; i++) {
-			for (size_t j=i; j<d_size; j++) {
-				v_Vg[c]=gsl_matrix_get (V_g, i, j);
-				v_Ve[c]=gsl_matrix_get (V_e, i, j);
-				v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
-				c++;
-			}
-		}
+		    c=0;
+		    for (size_t i=0; i<d_size; i++) {
+		      for (size_t j=i; j<d_size; j++) {
+			v_Vg[c]=gsl_matrix_get (V_g, i, j);
+			v_Ve[c]=gsl_matrix_get (V_e, i, j);
+			v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j);
+			c++;
+		      }
+		    }
 
-		MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
-		sumStat.push_back(SNPs);
-    }
+		    MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta};
+		    sumStat.push_back(SNPs);
+		  }
+		}
+	}
 	cout<<endl;
 
 	//cout<<"time_opt = "<<time_opt<<endl;
@@ -4162,6 +4264,9 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl
 	gsl_matrix_free(B_null);
 	gsl_matrix_free(se_B_null);
 
+	gsl_matrix_free(Xlarge);
+	gsl_matrix_free(UtXlarge);
+
 	return;
 }
 
diff --git a/src/param.cpp b/src/param.cpp
index 33b7b48..0a63a16 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -64,7 +64,7 @@ n_accept(0),
 n_mh(10),
 geo_mean(2000.0),
 randseed(-1),
-window_cm(0), window_bp(0), window_ns(0),
+window_cm(0), window_bp(0), window_ns(0), n_block(200),
 error(false),
 ni_subsample(0), n_cvt(1), n_vc(1),
 time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0)
@@ -77,19 +77,27 @@ void PARAM::ReadFiles (void)
 {
 	string file_str;
 
-
-	if (!file_cat.empty()) {
+	//read cat file
+	if (!file_mcat.empty()) {
+	  if (ReadFile_mcat (file_mcat, mapRS2cat, n_vc)==false) {error=true;}
+	} else if (!file_cat.empty()) {
 	  if (ReadFile_cat (file_cat, mapRS2cat, n_vc)==false) {error=true;}
 	}
 
-	if (!file_var.empty()) {
-	  if (ReadFile_var (file_var, mapRS2var)==false) {error=true;}
+	//read snp weight files
+	if (!file_wcat.empty()) {
+	  if (ReadFile_wsnp (file_wcat, n_vc, mapRS2wcat)==false) {error=true;}
+	}
+	if (!file_wsnp.empty()) {
+	  if (ReadFile_wsnp (file_wsnp, mapRS2wsnp)==false) {error=true;}
 	}
 
+	//count number of kinship files
 	if (!file_mk.empty()) {
 	  if (CountFileLines (file_mk, n_vc)==false) {error=true;}
 	}
 
+	//read snp set
 	if (!file_snps.empty()) {
 		if (ReadFile_snps (file_snps, setSnps)==false) {error=true;}
 	} else {
@@ -184,10 +192,17 @@ void PARAM::ReadFiles (void)
 	//read genotype and phenotype file for plink format
 	if (!file_bfile.empty()) {
 		file_str=file_bfile+".bim";
+		snpInfo.clear();
 		if (ReadFile_bim (file_str, snpInfo)==false) {error=true;}
 
-		file_str=file_bfile+".fam";
-		if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;}
+		//if both fam file and pheno files are used, use phenotypes inside the pheno file
+		if (!file_pheno.empty()) {
+		  //phenotype file before genotype file
+		  if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}
+		} else {
+		  file_str=file_bfile+".fam";
+		  if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;}
+		}
 
 		//post-process covariates and phenotypes, obtain ni_test, save all useful covariates
 		ProcessCvtPhen();
@@ -228,6 +243,97 @@ void PARAM::ReadFiles (void)
 		ns_total=indicator_snp.size();
 	}
 
+
+	//read genotype file for multiple plink files
+	if (!file_mbfile.empty()) {
+	  igzstream infile (file_mbfile.c_str(), igzstream::in);
+	  if (!infile) {cout<<"error! fail to open mbfile file: "<<file_mbfile<<endl; return;}
+
+	  string file_name;
+
+	  size_t t=0, ns_test_tmp=0;
+
+	  gsl_matrix *W;
+	  while (!safeGetline(infile, file_name).eof()) {
+		file_str=file_name+".bim";
+
+		if (ReadFile_bim (file_str, snpInfo)==false) {error=true;}
+
+		if (t==0) {
+		  //if both fam file and pheno files are used, use phenotypes inside the pheno file
+		  if (!file_pheno.empty()) {
+		    //phenotype file before genotype file
+		    if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}
+		  } else {
+		    file_str=file_name+".fam";
+		    if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;}
+		  }
+
+		  //post-process covariates and phenotypes, obtain ni_test, save all useful covariates
+		  ProcessCvtPhen();
+
+		  //obtain covariate matrix
+		  W=gsl_matrix_alloc (ni_test, n_cvt);
+		  CopyCvt (W);
+		}
+
+		file_str=file_name+".bed";
+		if (ReadFile_bed (file_str, setSnps, W, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test_tmp)==false) {error=true;}
+		mindicator_snp.push_back(indicator_snp);
+		msnpInfo.push_back(snpInfo);
+		ns_test+=ns_test_tmp;
+		ns_total+=indicator_snp.size();
+
+		t++;
+	  }
+
+	  gsl_matrix_free(W);
+
+	  infile.close();
+	  infile.clear();
+	}
+
+
+
+	//read genotype and phenotype file for multiple bimbam files
+	if (!file_mgeno.empty()) {
+	  //annotation file before genotype file
+	  if (!file_anno.empty() ) {
+	    if (ReadFile_anno (file_anno, mapRS2chr, mapRS2bp, mapRS2cM)==false) {error=true;}
+	  }
+
+	  //phenotype file before genotype file
+	  if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}
+
+	  //post-process covariates and phenotypes, obtain ni_test, save all useful covariates
+	  ProcessCvtPhen();
+
+	  //obtain covariate matrix
+	  gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt);
+	  CopyCvt (W);
+
+	  igzstream infile (file_mgeno.c_str(), igzstream::in);
+	  if (!infile) {cout<<"error! fail to open mgeno file: "<<file_mgeno<<endl; return;}
+
+	  string file_name;
+	  size_t ns_test_tmp;
+	  while (!safeGetline(infile, file_name).eof()) {
+	    if (ReadFile_geno (file_name, setSnps, W, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp)==false) {error=true;}
+
+	    mindicator_snp.push_back(indicator_snp);
+	    msnpInfo.push_back(snpInfo);
+	    ns_test+=ns_test_tmp;
+	    ns_total+=indicator_snp.size();
+	  }
+
+	  gsl_matrix_free(W);
+
+	  infile.close();
+	  infile.clear();
+	}
+
+
+
 	if (!file_gene.empty()) {
 		if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;}
 
@@ -292,7 +398,7 @@ void PARAM::CheckParam (void)
 
 	//check parameters
 	if (k_mode!=1 && k_mode!=2) {cout<<"error! unknown kinship/relatedness input mode: "<<k_mode<<endl; error=true;}
-	if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=14 && a_mode!=21 && a_mode!=22 && a_mode!=25 && a_mode!=26 && a_mode!=27 && a_mode!=28 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61 && a_mode!=62 && a_mode!=71)
+	if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=14 && a_mode!=21 && a_mode!=22 && a_mode!=25 && a_mode!=26 && a_mode!=27 && a_mode!=28 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61 && a_mode!=62 && a_mode!=66 && a_mode!=67 && a_mode!=71)
 	{cout<<"error! unknown analysis mode: "<<a_mode<<". make sure -gk or -eigen or -lmm or -bslmm -predict or -calccov is sepcified correctly."<<endl; error=true;}
 	if (miss_level>1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<<miss_level<<endl; error=true;}
 	if (maf_level>0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<<maf_level<<endl; error=true;}
@@ -400,8 +506,8 @@ void PARAM::CheckParam (void)
 	str=file_cat;
 	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open category file: "<<str<<endl; error=true;}
 
-	str=file_var;
-	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open category file: "<<str<<endl; error=true;}
+	str=file_mcat;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mcategory file: "<<str<<endl; error=true;}
 
 	str=file_beta;
 	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open beta file: "<<str<<endl; error=true;}
@@ -409,23 +515,33 @@ void PARAM::CheckParam (void)
 	str=file_cor;
 	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open correlation file: "<<str<<endl; error=true;}
 
-	str=file_q;
-	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open q file: "<<str<<endl; error=true;}
+	if (!file_study.empty()) {
+	  str=file_study+".Vq.txt";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .Vq.txt file: "<<str<<endl; error=true;}
+		str=file_study+".q.txt";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .q.txt file: "<<str<<endl; error=true;}
+		str=file_study+".size.txt";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .size.txt file: "<<str<<endl; error=true;}
+	}
 
-	str=file_s;
-	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open s file: "<<str<<endl; error=true;}
+	if (!file_ref.empty()) {
+		str=file_ref+".S.txt";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .S.txt file: "<<str<<endl; error=true;}
+		str=file_ref+".size.txt";
+		if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .size.txt file: "<<str<<endl; error=true;}
+	}
 
-	str=file_v;
-	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open v file: "<<str<<endl; error=true;}
+	str=file_mstudy;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mstudy file: "<<str<<endl; error=true;}
 
-	str=file_mq;
-	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mq file: "<<str<<endl; error=true;}
+	str=file_mref;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mref file: "<<str<<endl; error=true;}
 
-	str=file_ms;
-	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open ms file: "<<str<<endl; error=true;}
+	str=file_mgeno;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mgeno file: "<<str<<endl; error=true;}
 
-	str=file_mv;
-	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mv file: "<<str<<endl; error=true;}
+	str=file_mbfile;
+	if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mbfile file: "<<str<<endl; error=true;}
 
 	size_t flag=0;
 	if (!file_bfile.empty()) {flag++;}
@@ -434,7 +550,7 @@ void PARAM::CheckParam (void)
 	// WJA added
 	if (!file_oxford.empty()) {flag++;}
 
-	if (flag!=1 && a_mode!=27 && a_mode!=28 && a_mode!=43 && a_mode!=5 && a_mode!=61 && a_mode!=62) {
+	if (flag!=1 && a_mode!=27 && a_mode!=28 && a_mode!=43 && a_mode!=5 && a_mode!=61 && a_mode!=62 && a_mode!=66 && a_mode!=67) {
 		cout<<"error! either plink binary files, or bimbam mean genotype files, or gene expression files are required."<<endl; error=true;
 	}
 
@@ -443,21 +559,30 @@ void PARAM::CheckParam (void)
 	}
 
 	if (a_mode==61 || a_mode==62) {
-	  if (!file_pheno.empty()) {
+	  if (!file_beta.empty()) {
+	    if ( file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() && file_geno.empty() && file_mref.empty() && file_ref.empty() ) {
+	      cout<<"error! missing genotype file or ref/mref file."<<endl; error=true;
+	    }
+	  } else if (!file_pheno.empty()) {
 	    if (file_kin.empty() && (file_ku.empty()||file_kd.empty()) && file_mk.empty() ) {
 	      cout<<"error! missing relatedness file. "<<endl;  error=true;
 	    }
+	    /*
 	  } else if (!file_cor.empty()) {
 	    if (file_beta.empty() ) {
 	      cout<<"error! missing cor file."<<endl; error=true;
 	    }
-	  } else {
-	    if ( (file_mq.empty() || file_ms.empty() || file_mv.empty() ) && (file_q.empty() || file_s.empty() || file_v.empty() )  ) {
-	      cout<<"error! either phenotype/kinship files or ms/mq/mv s/q/v files are required."<<endl; error=true;
-	    }
+	    */
+	  } else if ( (file_mstudy.empty() && file_study.empty()) || (file_mref.empty() && file_ref.empty() )  ) {
+	      cout<<"error! either beta file, or phenotype files or study/ref mstudy/mref files are required."<<endl; error=true;
 	  }
 	}
 
+	if (a_mode==66 || a_mode==67) {
+	  if (file_beta.empty() || ( file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() && file_geno.empty()) ) {
+	    cout<<"error! missing beta file or genotype file."<<endl; error=true;
+	  }
+	}
 
 
 	if (!file_epm.empty() && file_bfile.empty() && file_geno.empty() ) {cout<<"error! estimated parameter file also requires genotype file."<<endl; error=true;}
@@ -525,13 +650,16 @@ void PARAM::CheckParam (void)
 
 void PARAM::CheckData (void) {
   if(file_oxford.empty())	// WJA NOTE: I added this condition so that covariates can be added through sample, probably not exactly what is wanted
-
 	{
  	if ((file_cvt).empty() || (indicator_cvt).size()==0) {
  		n_cvt=1;
  	}
 	}
 
+  if ( (a_mode==66 || a_mode==67) && (v_pve.size()!=n_vc))  {
+	    cout<<"error! the number of pve estimates does not equal to the number of categories in the cat file:"<<v_pve.size()<<" "<<n_vc<<endl; error=true;
+	}
+
 	if ( (indicator_cvt).size()!=0 && (indicator_cvt).size()!=(indicator_idv).size()) {
 		error=true;
 		cout<<"error! number of rows in the covariates file do not match the number of individuals. "<<endl;
@@ -610,7 +738,7 @@ void PARAM::CheckData (void) {
 		}
 	}
 	*/
-	if (ni_test==0 && file_cor.empty() && file_mq.empty() && file_q.empty() && file_beta.empty() ) {
+	if (ni_test==0 && file_cor.empty() && file_mstudy.empty() && file_study.empty() && file_beta.empty() ) {
 		error=true;
 		cout<<"error! number of analyzed individuals equals 0. "<<endl;
 		return;
@@ -631,7 +759,7 @@ void PARAM::CheckData (void) {
 	}
 
 	//output some information
-	if (file_cor.empty() && file_mq.empty() && file_q.empty() ) {
+	if (file_cor.empty() && file_mstudy.empty() && file_study.empty() && a_mode!=27 && a_mode!=28) {
 	  cout<<"## number of total individuals = "<<ni_total<<endl;
 	  if (a_mode==43) {
 	    cout<<"## number of analyzed individuals = "<<ni_cvt<<endl;
@@ -709,6 +837,9 @@ void PARAM::CheckData (void) {
 		}
 	}
 
+	if (a_mode==62 && !file_beta.empty() && mapRS2wcat.size()==0) {cout<<"vc analysis with beta files requires -wcat file."<<endl; error=true;}
+	if (a_mode==67 && mapRS2wcat.size()==0) {cout<<"ci analysis with beta files requires -wcat file."<<endl; error=true;}
+
 	//file_mk needs to contain more than one line
 	if (n_vc==1 && !file_mk.empty()) {cout<<"error! -mk file should contain more than one line."<<endl; error=true;}
 
@@ -783,46 +914,52 @@ void PARAM::CalcKin (gsl_matrix *matrix_kin)  {
 
 
 
-//from an existing n by nd G matrix, compute the d by d S matrix
-void compKtoS (const gsl_matrix *G, gsl_matrix *S) {
-  size_t n_vc=S->size1, ni_test=G->size1;
-  double di, dj, tr_KiKj, sum_Ki, sum_Kj, s_Ki, s_Kj, s_KiKj, si, sj, d;
+//from existing n by nd matrices A and K (each read as d side-by-side n by n blocks), compute the d by d S matrix, where S(i,j) pairs block i of A with block j of K (S is not necessarily symmetric); n_cvt only enters through the 1/(n-n_cvt) term
+void compAKtoS (const gsl_matrix *A, const gsl_matrix *K, const size_t n_cvt, gsl_matrix *S) {
+  size_t n_vc=S->size1, ni_test=A->size1; //n_vc: number of blocks (rows of S); ni_test: number of rows of A
+  double di, dj, tr_AK, sum_A, sum_K, s_A, s_K, sum_AK, tr_A, tr_K, d;
 
   for (size_t i=0; i<n_vc; i++) {
-    for (size_t j=i; j<n_vc; j++) {
-      tr_KiKj=0; sum_Ki=0; sum_Kj=0; s_KiKj=0; si=0; sj=0;
+    for (size_t j=0; j<n_vc; j++) { //every (i,j) pair is computed on its own; nothing is mirrored into S(j,i)
+      tr_AK=0; sum_A=0; sum_K=0; sum_AK=0; tr_A=0; tr_K=0; //reset the accumulators for this (i,j) pair
       for (size_t l=0; l<ni_test; l++) {
-	s_Ki=0; s_Kj=0;
+	s_A=0; s_K=0; //sums over row l of the two blocks
 	for (size_t k=0; k<ni_test; k++) {
-	  di=gsl_matrix_get(G, l, k+ni_test*i);
-	  dj=gsl_matrix_get(G, l, k+ni_test*j);
-	  s_Ki+=di; s_Kj+=dj;
+	  di=gsl_matrix_get(A, l, k+ni_test*i); //entry (l,k) of block i of A
+	  dj=gsl_matrix_get(K, l, k+ni_test*j); //entry (l,k) of block j of K
+	  s_A+=di; s_K+=dj;
 
-	  tr_KiKj+=di*dj; sum_Ki+=di; sum_Kj+=dj;
-	  if (l==k) {si+=di; sj+=dj;}
+	  tr_AK+=di*dj; sum_A+=di; sum_K+=dj; //tr_AK: sum of elementwise products; sum_A, sum_K: sums of all entries
+	  if (l==k) {tr_A+=di; tr_K+=dj;} //diagonal entries only, i.e. the traces of the two blocks
 	}
-	s_KiKj+=s_Ki*s_Kj;
+	sum_AK+=s_A*s_K; //product of the two row sums, accumulated over rows
       }
 
-      sum_Ki/=(double)ni_test;
-      sum_Kj/=(double)ni_test;
-      s_KiKj/=(double)ni_test;
-      si-=sum_Ki;
-      sj-=sum_Kj;
-      d=tr_KiKj-2*s_KiKj+sum_Ki*sum_Kj;
-      d=d/(si*sj)-1/(double)(ni_test-1);
+      sum_A/=(double)ni_test;
+      sum_K/=(double)ni_test;
+      sum_AK/=(double)ni_test;
+      tr_A-=sum_A; //trace minus (sum of all entries)/ni_test
+      tr_K-=sum_K;
+      d=tr_AK-2*sum_AK+sum_A*sum_K;
+
+      if (tr_A==0 || tr_K==0) { //the division below would be by zero, store 0 instead
+	d=0;
+      } else {
+	d=d/(tr_A*tr_K)-1/(double)(ni_test-n_cvt); //NOTE(review): ni_test and n_cvt are size_t, so ni_test<n_cvt wraps around and ni_test==n_cvt gives 1/0 -- confirm callers guarantee ni_test>n_cvt
+      }
 
       gsl_matrix_set (S, i, j, d);
-      if (i!=j) {gsl_matrix_set (S, j, i, d);}
     }
   }
+
+  //eigenlib_invert(Si);
   //cout<<tr_KiKj<<" "<<s_KiKj<<" "<<sum_Ki<<" "<<sum_Kj<<" "<<si<<" "<<sj<<" "<<d*1000000<<endl;
   return;
 }
 
 
 
-//copied from lmm.cpp; is used in the following function compKtoQ
+//copied from lmm.cpp; is used in the following function compKtoV
 //map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1
 size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) {
 	if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<<endl; return 0;}
@@ -836,20 +973,19 @@ size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) {
 	return index;
 }
 
-//from an existing n by nd (centered) G matrix, compute the d+1 by d*(d+1) Q matrix
-//where inside i'th d+1 by d+1 matrix, each element is tr(KiKjKiKl)-r*tr(KjKiKl)-r*tr(KlKiKj)+r^2*tr(KjKl), where r=n/(n-1)
-void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) {
+//from an existing n by nd (centered) G matrix, compute the d+1 by d*(d+1)/2*(d+1) V matrix
+//where inside the d+1 by d+1 block of each pair (i,j) with i<=j, element (l,m) is tr(KiKlKjKm)-r*tr(KmKiKl)-r*tr(KlKjKm)+r^2*tr(KlKm), where r=n/(n-1)
+void compKtoV (const gsl_matrix *G, gsl_matrix *V) {
   size_t n_vc=G->size2/G->size1, ni_test=G->size1;
 
-  gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, n_vc*(n_vc+1)/2*ni_test);
-  gsl_vector *trKiKjKi=gsl_vector_alloc ( n_vc*n_vc );
+  gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, (n_vc*(n_vc+1))/2*ni_test);
   gsl_vector *trKiKj=gsl_vector_alloc( n_vc*(n_vc+1)/2 );
   gsl_vector *trKi=gsl_vector_alloc(n_vc);
 
   double d, tr, r=(double)ni_test/(double)(ni_test-1);
-  size_t t, t_ij, t_il, t_jl, t_ii;
+  size_t t, t_il, t_jm, t_lm, t_im, t_jl, t_ij;
 
-  //compute KiKj for all pairs of i and j (including the identity matrix)
+  //compute KiKj for all pairs of i and j (not including the identity matrix)
   t=0;
   for (size_t i=0; i<n_vc; i++) {
     gsl_matrix_const_view Ki=gsl_matrix_const_submatrix(G, 0, i*ni_test, ni_test, ni_test);
@@ -889,99 +1025,108 @@ void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) {
     gsl_vector_set (trKi, i, tr);
   }
 
-  //compute trKiKjKi (it is not symmetric w.r.t. i and j)
+  //compute V
   for (size_t i=0; i<n_vc; i++) {
-    for (size_t j=0; j<n_vc; j++) {
-      tr=0;
-      t=GetabIndex (i+1, j+1, n_vc-2);
-      for (size_t k=0; k<ni_test; k++) {
-	gsl_vector_const_view KiKj_row=gsl_matrix_const_subrow (KiKj, k, t*ni_test, ni_test);
-	gsl_vector_const_view KiKj_col=gsl_matrix_const_column (KiKj, t*ni_test+k);
-
-	gsl_vector_const_view Ki_col=gsl_matrix_const_column (G, i*ni_test+k);
-
-	if (i<=j) {
-	  gsl_blas_ddot (&KiKj_row.vector, &Ki_col.vector, &d);
-	  tr+=d;
-	} else {
-	  gsl_blas_ddot (&KiKj_col.vector, &Ki_col.vector, &d);
-	  tr+=d;
-	}
-      }
-      gsl_vector_set (trKiKjKi, i*n_vc+j, tr);
-    }
-  }
+    for (size_t j=i; j<n_vc; j++) {
+      t_ij=GetabIndex (i+1, j+1, n_vc-2);
+      for (size_t l=0; l<n_vc+1; l++) {
+	for (size_t m=0; m<n_vc+1; m++) {
+	  if (l!=n_vc && m!=n_vc) {
+	    t_il=GetabIndex (i+1, l+1, n_vc-2);
+	    t_jm=GetabIndex (j+1, m+1, n_vc-2);
+	    t_lm=GetabIndex (l+1, m+1, n_vc-2);
+	    //cout<<ni_test<<" "<<r<<t_ij<<" "<<t_il<<" "<<t_jl<<" "<<endl;
+	    tr=0;
+	    for (size_t k=0; k<ni_test; k++) {
+	      gsl_vector_const_view KiKl_row=gsl_matrix_const_subrow (KiKj, k, t_il*ni_test, ni_test);
+	      gsl_vector_const_view KiKl_col=gsl_matrix_const_column (KiKj, t_il*ni_test+k);
+	      gsl_vector_const_view KjKm_row=gsl_matrix_const_subrow (KiKj, k, t_jm*ni_test, ni_test);
+	      gsl_vector_const_view KjKm_col=gsl_matrix_const_column (KiKj, t_jm*ni_test+k);
+
+	      gsl_vector_const_view Kl_row=gsl_matrix_const_subrow (G, k, l*ni_test, ni_test);
+	      gsl_vector_const_view Km_row=gsl_matrix_const_subrow (G, k, m*ni_test, ni_test);
+
+	      if (i<=l && j<=m) {
+		gsl_blas_ddot (&KiKl_row.vector, &KjKm_col.vector, &d);
+		tr+=d;
+		gsl_blas_ddot (&Km_row.vector, &KiKl_col.vector, &d);
+		tr-=r*d;
+		gsl_blas_ddot (&Kl_row.vector, &KjKm_col.vector, &d);
+		tr-=r*d;
+	      } else if (i<=l && j>m) {
+		gsl_blas_ddot (&KiKl_row.vector, &KjKm_row.vector, &d);
+		tr+=d;
+		gsl_blas_ddot (&Km_row.vector, &KiKl_col.vector, &d);
+		tr-=r*d;
+		gsl_blas_ddot (&Kl_row.vector, &KjKm_row.vector, &d);
+		tr-=r*d;
+	      } else if (i>l && j<=m) {
+		gsl_blas_ddot (&KiKl_col.vector, &KjKm_col.vector, &d);
+		tr+=d;
+		gsl_blas_ddot (&Km_row.vector, &KiKl_row.vector, &d);
+		tr-=r*d;
+		gsl_blas_ddot (&Kl_row.vector, &KjKm_col.vector, &d);
+		tr-=r*d;
+	      } else {
+		gsl_blas_ddot (&KiKl_col.vector, &KjKm_row.vector, &d);
+		tr+=d;
+		gsl_blas_ddot (&Km_row.vector, &KiKl_row.vector, &d);
+		tr-=r*d;
+		gsl_blas_ddot (&Kl_row.vector, &KjKm_row.vector, &d);
+		tr-=r*d;
+	      }
+	    }
 
-  //compute Q
-  for (size_t i=0; i<n_vc; i++) {
-    for (size_t j=0; j<n_vc+1; j++) {
-      for (size_t l=j; l<n_vc+1; l++) {
-	if (j!=n_vc && l!=n_vc) {
-	  t_ij=GetabIndex (i+1, j+1, n_vc-2);
-	  t_il=GetabIndex (i+1, l+1, n_vc-2);
-	  t_jl=GetabIndex (j+1, l+1, n_vc-2);
-
-	  //cout<<ni_test<<" "<<r<<t_ij<<" "<<t_il<<" "<<t_jl<<" "<<endl;
-	  tr=0;
-	  for (size_t k=0; k<ni_test; k++) {
-	    gsl_vector_const_view KiKj_row=gsl_matrix_const_subrow (KiKj, k, t_ij*ni_test, ni_test);
-	    gsl_vector_const_view KiKj_col=gsl_matrix_const_column (KiKj, t_ij*ni_test+k);
-	    gsl_vector_const_view KiKl_row=gsl_matrix_const_subrow (KiKj, k, t_il*ni_test, ni_test);
-	    gsl_vector_const_view KiKl_col=gsl_matrix_const_column (KiKj, t_il*ni_test+k);
-
-	    gsl_vector_const_view Kj_row=gsl_matrix_const_subrow (G, k, j*ni_test, ni_test);
-	    gsl_vector_const_view Kl_row=gsl_matrix_const_subrow (G, k, l*ni_test, ni_test);
-
-	    if (i<=j && i<=l) {
-	      gsl_blas_ddot (&KiKj_row.vector, &KiKl_col.vector, &d);
-	      tr+=d;
-	      gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d);
-	      tr-=r*d;
-	      gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d);
-	      tr-=r*d;
-	    } else if (i<=j && i>l) {
-	      gsl_blas_ddot (&KiKj_row.vector, &KiKl_row.vector, &d);
-	      tr+=d;
-	      gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d);
-	      tr-=r*d;
-	      gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d);
-	      tr-=r*d;
-	    } else if (i>j && i<=l) {
-	      gsl_blas_ddot (&KiKj_col.vector, &KiKl_col.vector, &d);
-	      tr+=d;
-	      gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d);
-	      tr-=r*d;
-	      gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d);
-	      tr-=r*d;
-	    } else {
-	      gsl_blas_ddot (&KiKj_col.vector, &KiKl_row.vector, &d);
-	      tr+=d;
-	      gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d);
-	      tr-=r*d;
-	      gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d);
-	      tr-=r*d;
+	    tr+=r*r*gsl_vector_get (trKiKj, t_lm);
+	  } else if (l!=n_vc && m==n_vc) {
+	    t_il=GetabIndex (i+1, l+1, n_vc-2);
+	    t_jl=GetabIndex (j+1, l+1, n_vc-2);
+	    tr=0;
+	    for (size_t k=0; k<ni_test; k++) {
+	      gsl_vector_const_view KiKl_row=gsl_matrix_const_subrow (KiKj, k, t_il*ni_test, ni_test);
+	      gsl_vector_const_view KiKl_col=gsl_matrix_const_column (KiKj, t_il*ni_test+k);
+	      gsl_vector_const_view Kj_row=gsl_matrix_const_subrow (G, k, j*ni_test, ni_test);
+
+	      if (i<=l) {
+		gsl_blas_ddot (&KiKl_row.vector, &Kj_row.vector, &d);
+		tr+=d;
+	      } else {
+		gsl_blas_ddot (&KiKl_col.vector, &Kj_row.vector, &d);
+		tr+=d;
+	      }
 	    }
+	    tr+=-r*gsl_vector_get (trKiKj, t_il)-r*gsl_vector_get (trKiKj, t_jl)+r*r*gsl_vector_get (trKi, l);
+	  } else if (l==n_vc && m!=n_vc) {
+	    t_jm=GetabIndex (j+1, m+1, n_vc-2);
+	    t_im=GetabIndex (i+1, m+1, n_vc-2);
+	    tr=0;
+	    for (size_t k=0; k<ni_test; k++) {
+	      gsl_vector_const_view KjKm_row=gsl_matrix_const_subrow (KiKj, k, t_jm*ni_test, ni_test);
+	      gsl_vector_const_view KjKm_col=gsl_matrix_const_column (KiKj, t_jm*ni_test+k);
+	      gsl_vector_const_view Ki_row=gsl_matrix_const_subrow (G, k, i*ni_test, ni_test);
+
+	      if (j<=m) {
+		gsl_blas_ddot (&KjKm_row.vector, &Ki_row.vector, &d);
+		tr+=d;
+	      } else {
+		gsl_blas_ddot (&KjKm_col.vector, &Ki_row.vector, &d);
+		tr+=d;
+	      }
+	    }
+	    tr+=-r*gsl_vector_get (trKiKj, t_im)-r*gsl_vector_get (trKiKj, t_jm)+r*r*gsl_vector_get (trKi, m);
+	  } else {
+	    tr=gsl_vector_get (trKiKj, t_ij)-r*gsl_vector_get (trKi, i)-r*gsl_vector_get (trKi, j)+r*r*(double)(ni_test-1);
 	  }
 
-	  tr+=r*r*gsl_vector_get (trKiKj, t_jl);
-	} else if (j!=n_vc && l==n_vc) {
-	  t_ij=GetabIndex (i+1, j+1, n_vc-2);
-	  tr=gsl_vector_get (trKiKjKi, i*n_vc+j)-2*r*gsl_vector_get (trKiKj, t_ij)+r*r*gsl_vector_get (trKi, j);
-	} else if (j==n_vc && l==n_vc) {
-	  t_ii=GetabIndex (i+1, i+1, n_vc-2);
-	  tr=gsl_vector_get (trKiKj, t_ii)-2*r*gsl_vector_get (trKi, i)+r*r*(double)(ni_test-1);
+	  gsl_matrix_set (V, l, t_ij*(n_vc+1)+m, tr);
 	}
-
-	gsl_matrix_set (Q, j, i*(n_vc+1)+l, tr);
-	if (l!=j) {gsl_matrix_set (Q, l, i*(n_vc+1)+j, tr);}
       }
     }
   }
 
-  gsl_matrix_scale (Q, 1.0/pow((double)ni_test, 2) );
+  gsl_matrix_scale (V, 1.0/pow((double)ni_test, 2) );
 
   gsl_matrix_free(KiKj);
-  gsl_vector_free(trKiKjKi);
   gsl_vector_free(trKiKj);
   gsl_vector_free(trKi);
 
@@ -991,190 +1136,210 @@ void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) {
 
 
 //perform Jacknife sampling for variance of S
-void JacknifeGtoS (const gsl_matrix *G, gsl_matrix *S, gsl_matrix *Svar) {
-  size_t n_vc=Svar->size1, ni_test=G->size1;
-  vector<vector<vector<double> > > tr_KiKj, s_KiKj;
-  vector<vector<double> > sum_Ki, s_Ki, si;
+void JackknifeAKtoS (const gsl_matrix *W, const gsl_matrix *A, const gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar) {
+  size_t n_vc=Svar->size1, ni_test=A->size1, n_cvt=W->size2;
+
+  vector<vector<vector<double> > > trAK, sumAK;
+  vector<vector<double> > sumA, sumK, trA, trK, sA, sK;
   vector<double> vec_tmp;
   double di, dj, d, m, v;
 
+  //gsl_matrix *Stmp=gsl_matrix_alloc (n_vc, ni_test*n_vc);
+  //gsl_matrix *Stmp_sub=gsl_matrix_alloc (n_vc, n_vc);
+
   //initialize and set all elements to zero
   for (size_t i=0; i<ni_test; i++) {
     vec_tmp.push_back(0);
   }
 
   for (size_t i=0; i<n_vc; i++) {
-    sum_Ki.push_back(vec_tmp);
-    s_Ki.push_back(vec_tmp);
-    si.push_back(vec_tmp);
+    sumA.push_back(vec_tmp);
+    sumK.push_back(vec_tmp);
+    trA.push_back(vec_tmp);
+    trK.push_back(vec_tmp);
+    sA.push_back(vec_tmp);
+    sK.push_back(vec_tmp);
   }
 
   for (size_t i=0; i<n_vc; i++) {
-    tr_KiKj.push_back(sum_Ki);
-    s_KiKj.push_back(sum_Ki);
+    trAK.push_back(sumK);
+    sumAK.push_back(sumK);
   }
 
-  //run jacknife
+  //run jackknife
   for (size_t i=0; i<n_vc; i++) {
     for (size_t l=0; l<ni_test; l++) {
       for (size_t k=0; k<ni_test; k++) {
-	di=gsl_matrix_get(G, l, k+ni_test*i);
+	di=gsl_matrix_get(A, l, k+ni_test*i);
+	dj=gsl_matrix_get(K, l, k+ni_test*i);
 
 	for (size_t t=0; t<ni_test; t++) {
 	  if (t==l || t==k) {continue;}
-	  sum_Ki[i][t]+=di;
-	  if (l==k) {si[i][t]+=di;}
+	  sumA[i][t]+=di;
+	  sumK[i][t]+=dj;
+	  if (l==k) {trA[i][t]+=di; trK[i][t]+=dj;}
 	}
-	s_Ki[i][l]+=di;
+	sA[i][l]+=di;
+	sK[i][l]+=dj;
       }
     }
 
     for (size_t t=0; t<ni_test; t++) {
-      sum_Ki[i][t]/=(double)(ni_test-1);
+      sumA[i][t]/=(double)(ni_test-1);
+      sumK[i][t]/=(double)(ni_test-1);
     }
   }
 
   for (size_t i=0; i<n_vc; i++) {
-    for (size_t j=i; j<n_vc; j++) {
+    for (size_t j=0; j<n_vc; j++) {
       for (size_t l=0; l<ni_test; l++) {
 	for (size_t k=0; k<ni_test; k++) {
-	  di=gsl_matrix_get(G, l, k+ni_test*i);
-	  dj=gsl_matrix_get(G, l, k+ni_test*j);
+	  di=gsl_matrix_get(A, l, k+ni_test*i);
+	  dj=gsl_matrix_get(K, l, k+ni_test*j);
 	  d=di*dj;
 
 	  for (size_t t=0; t<ni_test; t++) {
 	    if (t==l || t==k) {continue;}
-	    tr_KiKj[i][j][t]+=d;
+	    trAK[i][j][t]+=d;
           }
 	}
 
 	for (size_t t=0; t<ni_test; t++) {
 	  if (t==l) {continue;}
-	  di=gsl_matrix_get(G, l, t+ni_test*i);
-	  dj=gsl_matrix_get(G, l, t+ni_test*j);
+	  di=gsl_matrix_get(A, l, t+ni_test*i);
+	  dj=gsl_matrix_get(K, l, t+ni_test*j);
 
-	  s_KiKj[i][j][t]+=(s_Ki[i][l]-di)*(s_Ki[j][l]-dj);
+	  sumAK[i][j][t]+=(sA[i][l]-di)*(sK[j][l]-dj);
 	}
       }
 
       for (size_t t=0; t<ni_test; t++) {
-	s_KiKj[i][j][t]/=(double)(ni_test-1);
+	sumAK[i][j][t]/=(double)(ni_test-1);
       }
 
       m=0; v=0;
       for (size_t t=0; t<ni_test; t++) {
-	d=tr_KiKj[i][j][t]-2*s_KiKj[i][j][t]+sum_Ki[i][t]*sum_Ki[j][t];
-	d/=(si[i][t]-sum_Ki[i][t])*(si[j][t]-sum_Ki[j][t]);
-	d-=1/(double)(ni_test-2);
-
+	d=trAK[i][j][t]-2*sumAK[i][j][t]+sumA[i][t]*sumK[j][t];
+	if ( (trA[i][t]-sumA[i][t])==0 || (trK[j][t]-sumK[j][t])==0) {
+	  d=0;
+	} else {
+	  d/=(trA[i][t]-sumA[i][t])*(trK[j][t]-sumK[j][t]);
+	  d-=1/(double)(ni_test-n_cvt-1);
+	}
+	//gsl_matrix_set (Stmp, i, t*n_vc+j, d);
+	//gsl_matrix_set (Stmp, j, t*n_vc+i, d);
 	m+=d; v+=d*d;
       }
       m/=(double)ni_test;
       v/=(double)ni_test;
       v-=m*m;
       v*=(double)(ni_test-1);
+      gsl_matrix_set (Svar, i, j, v);
+      if (n_cvt==1) {
+	d=gsl_matrix_get (S, i, j);
+      	d=(double)ni_test*d-(double)(ni_test-1)*m;
+	gsl_matrix_set (S, i, j, d);
+      }
+    }
+  }
+
+  /*
+  for (size_t t=0; t<ni_test; t++) {
+    gsl_matrix_view Stmp_view=gsl_matrix_submatrix(Stmp, 0, t*n_vc, n_vc, n_vc);
+    gsl_matrix_memcpy (Stmp_sub, &Stmp_view.matrix);
+    eigenlib_invert(Stmp_sub);
+    gsl_matrix_memcpy (&Stmp_view.matrix, Stmp_sub);
+  }
+
+  for (size_t i=0; i<n_vc; i++) {
+    for (size_t j=i; j<n_vc; j++) {
+      m=0; v=0;
+      for (size_t t=0; t<ni_test; t++) {
+	d=gsl_matrix_get (Stmp, i, t*n_vc+j);
+	m+=d;
+	v+=d*d;
+      }
+      m/=(double)ni_test;
+      v/=(double)ni_test;
+      v-=m*m;
+      v*=(double)(ni_test-1);
 
       gsl_matrix_set (Svar, i, j, v);
-      d=gsl_matrix_get (S, i, j);
+      d=gsl_matrix_get (Si, i, j);
       d=(double)ni_test*d-(double)(ni_test-1)*m;
-      gsl_matrix_set (S, i, j, d);
-      if (i!=j) {gsl_matrix_set (Svar, j, i, v); gsl_matrix_set (S, j, i, d);}
+      gsl_matrix_set (Si, i, j, d);
+      if (i!=j) {gsl_matrix_set (Svar, j, i, v); gsl_matrix_set (Si, j, i, d);}
     }
   }
 
+  gsl_matrix_free (Stmp);
+  */
   return;
 }
 
 
 
 //compute the d by d S matrix with its d by d variance matrix of Svar, and the d+1 by d(d+1) matrix of Q for V(q)
-void PARAM::CalcS (gsl_matrix *S, gsl_matrix *Svar, gsl_matrix *Q)  {
+void PARAM::CalcS (const map<string, double> &mapRS2wA, const map<string, double> &mapRS2wK, const gsl_matrix *W, gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, gsl_vector *ns)  {
   string file_str;
 
   gsl_matrix_set_zero (S);
   gsl_matrix_set_zero (Svar);
-  gsl_matrix_set_zero (Q);
+  gsl_vector_set_zero (ns);
 
   //compute the kinship matrix G for multiple categories; these matrices are not centered, for convienence of Jacknife sampling
-  gsl_matrix *G=gsl_matrix_alloc (ni_test, n_vc*ni_test);
-  gsl_matrix_set_zero (G);
-
   if (!file_bfile.empty() ) {
     file_str=file_bfile+".bed";
-    if (PlinkKin (file_str, indicator_idv, indicator_snp, a_mode-24, d_pace, mapRS2cat, mapRS2var, snpInfo, G)==false) {error=true;}
-  } else {
+    if (mapRS2wA.size()==0) {
+      if (PlinkKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns)==false) {error=true;}
+    } else {
+      if (PlinkKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns)==false) {error=true;}
+    }
+  } else if (!file_geno.empty()) {
     file_str=file_geno;
-    if (BimbamKin (file_str, indicator_idv, indicator_snp, a_mode-24, d_pace, mapRS2cat, mapRS2var, snpInfo, G)==false) {error=true;}
+    if (mapRS2wA.size()==0) {
+      if (BimbamKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns)==false) {error=true;}
+    } else {
+      if (BimbamKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns)==false) {error=true;}
+    }
+  } else if (!file_mbfile.empty() ){
+    if (mapRS2wA.size()==0) {
+      if (MFILEKin (1, file_mbfile, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns)==false) {error=true;}
+    } else {
+      if (MFILEKin (1, file_mbfile, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns)==false) {error=true;}
+    }
+  } else if (!file_mgeno.empty()) {
+    if (mapRS2wA.size()==0) {
+      if (MFILEKin (0, file_mgeno, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns)==false) {error=true;}
+    } else {
+      if (MFILEKin (0, file_mgeno, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns)==false) {error=true;}
+    }
   }
 
-  //center and scale every kinship matrix inside G
-  double d;
-  for (size_t i=0; i<n_vc; i++) {
-    gsl_matrix_view K=gsl_matrix_submatrix(G, 0, i*ni_test, ni_test, ni_test);
-    CenterMatrix(&K.matrix);
-    d=ScaleMatrix(&K.matrix);
+  if (mapRS2wA.size()==0) {
+    gsl_matrix_memcpy (A, K);
   }
 
-  //based on G, compute S
-  compKtoS (G, S);
-
-  //based on G, compute a matrix Q that can be used to calculate the variance of q
-  compKtoQ (G, Q);
-
-  /*
-  //set up random environment
-  gsl_rng_env_setup();
-  gsl_rng *gsl_r;
-  const gsl_rng_type * gslType;
-  gslType = gsl_rng_default;
-  if (randseed<0) {
-    time_t rawtime;
-    time (&rawtime);
-    tm * ptm = gmtime (&rawtime);
-
-    randseed = (unsigned) (ptm->tm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec);
-  }
-  gsl_r = gsl_rng_alloc(gslType);
-  gsl_rng_set(gsl_r, randseed);
-
-  //bootstrap: in each iteration, sample individuals and compute S_pmt
-  size_t n_pmt=100;
-  vector<size_t> idv_order, idv_remove;
-  for (size_t i=0; i<ni_test; i++) {
-    idv_order.push_back(i);
-  }
-  for (size_t i=0; i<n_pmt; i++) {
-    idv_remove.push_back(0);
-  }
-  gsl_ran_choose (gsl_r, static_cast<void*>(&idv_remove[0]), n_pmt, static_cast<void*>(&idv_order[0]), ni_test, sizeof(size_t));
+  //center and scale every kinship matrix inside G
+  for (size_t i=0; i<n_vc; i++) {
+    gsl_matrix_view Ksub=gsl_matrix_submatrix(K, 0, i*ni_test, ni_test, ni_test);
+    CenterMatrix(&Ksub.matrix);
+    ScaleMatrix(&Ksub.matrix);
 
-  gsl_matrix *S_pmt=gsl_matrix_alloc(n_vc, n_vc*n_pmt);
-  for (size_t i=0; i<n_pmt; i++) {
-    gsl_matrix_view S_sub=gsl_matrix_submatrix (S_pmt, 0, n_vc*i, n_vc, n_vc);
-    compKtoS (G, idv_remove[i], &S_sub.matrix);
+    gsl_matrix_view Asub=gsl_matrix_submatrix(A, 0, i*ni_test, ni_test, ni_test);
+    CenterMatrix(&Asub.matrix);
+    ScaleMatrix(&Asub.matrix);
   }
 
-  //based on S_pmt, compute Svar
-  double m, v, d;
-  for (size_t i=0; i<n_vc; i++) {
-    for (size_t j=i; j<n_vc; j++) {
-      m=0; v=0;
-      for (size_t t=0; t<n_pmt; t++) {
-	d=gsl_matrix_get(S_pmt, i, j);
-	m+=d; v+=d*d;
-      }
-      m/=(double)n_pmt; v/=(double)n_pmt;
-      v=v-m*m;
-      gsl_matrix_set(Svar, i, j, v);
-      if (i!=j) {gsl_matrix_set(Svar, j, i, v);}
-    }
-  }
-  */
+  //based on G, compute S
+  compAKtoS (A, K, W->size2, S);
 
   //compute Svar and update S with Jacknife
-  JacknifeGtoS (G, S, Svar);
+  JackknifeAKtoS (W, A, K, S, Svar);
+
+  //based on G, compute a matrix Q that can be used to calculate the variance of q
+  //compKtoV (G, V);
 
-  gsl_matrix_free(G);
   return;
 }
 
@@ -1223,11 +1388,20 @@ void PARAM::WriteVar (const string suffix)
 
 	outfile.precision(10);
 
-	for (size_t i=0; i<indicator_snp.size(); i++) {
-	  if (indicator_snp[i]==0) {continue;}
-	  rs=snpInfo[i].rs_number;
-	  if (mapRS2var.count(rs)!=0) {
-	    outfile<<rs<<"\t"<<mapRS2var.at(rs)<<endl;
+	if (mindicator_snp.size()!=0) {
+	  for (size_t t=0; t<mindicator_snp.size(); t++) {
+	    indicator_snp=mindicator_snp[t];
+	    for (size_t i=0; i<indicator_snp.size(); i++) {
+	      if (indicator_snp[i]==0) {continue;}
+	      rs=snpInfo[i].rs_number;
+	      outfile<<rs<<endl;
+	    }
+	  }
+	} else {
+	  for (size_t i=0; i<indicator_snp.size(); i++) {
+	    if (indicator_snp[i]==0) {continue;}
+	    rs=snpInfo[i].rs_number;
+	    outfile<<rs<<endl;
 	  }
 	}
 
@@ -1564,3 +1738,219 @@ void PARAM::CopyRead (gsl_vector *log_N)
 
 
 
+//Build mapRS2wK, the per-SNP weights used for the kinship matrix K.
+//A SNP is kept only if its SNP indicator is non-zero and, for each of
+//setSnps_beta, mapRS2wsnp, mapRS2wcat and mapRS2cat that is non-empty, the
+//SNP is present in it. When mapRS2wsnp is empty every kept SNP gets weight 1;
+//otherwise the weight is mapRS2wsnp[rs] divided by the mean weight of the
+//kept SNPs that belong to the same category.
+//  setSnps_beta: SNPs to restrict to; an empty set means no restriction
+//  mapRS2wK:     output; cleared first, then filled with rs -> weight
+//NOTE: when msnpInfo is non-empty, the members snpInfo and indicator_snp are
+//overwritten with the entries of each file in turn, so on return they hold
+//those of the last file.
+//NOTE(review): assumes every category index stored in mapRS2cat is < n_vc,
+//and n_vc>=1 when mapRS2cat is empty -- confirm against the callers.
+void PARAM::ObtainWeight (const set<string> &setSnps_beta, map<string, double> &mapRS2wK)
+{
+  mapRS2wK.clear();
+
+  const bool has_wsnp=(mapRS2wsnp.size()!=0);
+  const bool has_cat=(mapRS2cat.size()!=0);
+
+  //per-category sum of the SNP weights and number of SNPs behind each sum
+  vector<double> wsum(n_vc, 0.0), wcount(n_vc, 0.0);
+
+  //a single pass over snpInfo, or one pass per entry of msnpInfo
+  const size_t n_set=(msnpInfo.size()==0 ? 1 : msnpInfo.size());
+
+  string rs;
+  for (size_t t=0; t<n_set; t++) {
+    if (msnpInfo.size()!=0) {
+      snpInfo=msnpInfo[t];
+      indicator_snp=mindicator_snp[t];
+    }
+
+    for (size_t i=0; i<snpInfo.size(); i++) {
+      if (indicator_snp[i]==0) {continue;}
+
+      rs=snpInfo[i].rs_number;
+
+      //each non-empty filter must contain the SNP
+      if (setSnps_beta.size()!=0 && setSnps_beta.count(rs)==0) {continue;}
+      if (has_wsnp && mapRS2wsnp.count(rs)==0) {continue;}
+      if (mapRS2wcat.size()!=0 && mapRS2wcat.count(rs)==0) {continue;}
+      if (has_cat && mapRS2cat.count(rs)==0) {continue;}
+
+      if (!has_wsnp) {
+        mapRS2wK[rs]=1;
+        continue;
+      }
+
+      const double w=mapRS2wsnp[rs];
+      const size_t cat=(has_cat ? mapRS2cat[rs] : 0);
+
+      mapRS2wK[rs]=w;
+      wsum[cat]+=w;
+      //count in the SNP's own category: the mean below is taken per category
+      wcount[cat]++;
+    }
+  }
+
+  if (has_wsnp) {
+    //turn the per-category sums into means; a category without SNPs is never
+    //used as a divisor below, so it is left untouched instead of becoming 0/0
+    for (size_t i=0; i<n_vc; i++) {
+      if (wcount[i]>0) {wsum[i]/=wcount[i];}
+    }
+
+    //rescale so that the weights of each category average to one
+    for (map<string, double>::iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) {
+      const size_t cat=(has_cat ? mapRS2cat[it->first] : 0);
+      it->second/=wsum[cat];
+    }
+  }
+  return;
+}
+
+
+//Recompute the per-SNP weights mapRS2wA from the current PVE estimates v_pve.
+//For every SNP in mapRS2wK the raw weight is 1/d^2, where
+//  d = 1 + sum_c (ni_test/ns[c]) * mapRS2wcat[rs][c] * pve_c
+//and pve_c is v_pve[c]. With pve_flag==1, pve_c is replaced by 1 when
+//v_pve[c]>=1 and by 0 when v_pve[c]<=0; with any other pve_flag it is used
+//unchanged. Every entry of mapRS2wA is then divided by the mean raw weight of
+//its category (category 0 when mapRS2cat is empty).
+//NOTE: mapRS2wA is not cleared, so entries already present on entry are
+//rescaled as well; mapRS2wcat and mapRS2cat are read with operator[], which
+//inserts a default entry for an rs they do not contain.
+//NOTE(review): presumably mapRS2wcat[rs] has at least n_vc elements -- confirm.
+void PARAM::UpdateWeight (const size_t pve_flag, const map<string, double> &mapRS2wK, const size_t ni_test, const gsl_vector *ns, map<string, double> &mapRS2wA)
+{
+  const bool truncate_pve=(pve_flag==1);
+
+  //per-category sum of the raw weights and number of SNPs behind each sum
+  vector<double> cat_total(n_vc, 0.0), cat_n(n_vc, 0.0);
+
+  map<string, double>::const_iterator snp;
+  for (snp=mapRS2wK.begin(); snp!=mapRS2wK.end(); ++snp) {
+    const string &rs=snp->first;
+
+    double denom=1;
+    for (size_t c=0; c<n_vc; c++) {
+      if (truncate_pve && v_pve[c]>=1) {
+        //pve truncated to 1
+        denom+=(double)ni_test/gsl_vector_get(ns, c)*mapRS2wcat[rs][c];
+      } else if (!(truncate_pve && v_pve[c]<=0)) {
+        denom+=(double)ni_test/gsl_vector_get(ns, c)*mapRS2wcat[rs][c]*v_pve[c];
+      }
+      //otherwise pve is truncated to 0 and the component adds nothing
+    }
+
+    const double raw=1/(denom*denom);
+    mapRS2wA[rs]=raw;
+
+    size_t cat=0;
+    if (mapRS2cat.size()!=0) {cat=mapRS2cat[rs];}
+    cat_total[cat]+=raw;
+    cat_n[cat]++;
+  }
+
+  //per-category mean of the raw weights
+  for (size_t c=0; c<n_vc; c++) {
+    cat_total[c]/=cat_n[c];
+  }
+
+  //rescale every weight by the mean of its category
+  map<string, double>::iterator out;
+  for (out=mapRS2wA.begin(); out!=mapRS2wA.end(); ++out) {
+    const size_t cat=(mapRS2cat.size()==0 ? 0 : mapRS2cat[out->first]);
+    out->second/=cat_total[cat];
+  }
+  return;
+}
+
+// Restrict the analysed SNPs to those that have a weight in mapRS2wA and
+// collect, in SNP order, their z-score, weight and category.
+// A SNP whose indicator is non-zero but which is absent from mapRS2wA gets its
+// indicator set to 0 (indicator_snp, or mindicator_snp[t] when msnpInfo is
+// non-empty). For each remaining SNP the next free position of z receives
+// mapRS2z[rs], negated when the SNP's minor allele differs from mapRS2A1[rs];
+// the same position of w receives mapRS2wA[rs], and mapRS2cat[rs] is appended
+// to vec_cat. w and z are zeroed and vec_cat is cleared first.
+// The lookups use at(), so an rs missing from mapRS2A1, mapRS2z or mapRS2cat
+// raises std::out_of_range.
+// NOTE: when msnpInfo is non-empty, the member snpInfo is overwritten with
+// each file's entries in turn.
+// NOTE(review): assumes w and z are long enough for all kept SNPs -- confirm.
+void PARAM::UpdateSNPnZ (const map<string, double> &mapRS2wA, const map<string, string> &mapRS2A1, const map<string, double> &mapRS2z, gsl_vector *w, gsl_vector *z, vector<size_t> &vec_cat)
+{
+  gsl_vector_set_zero (w);
+  gsl_vector_set_zero (z);
+  vec_cat.clear();
+
+  const bool multi=(msnpInfo.size()!=0);
+  const size_t n_set=(multi ? msnpInfo.size() : 1);
+
+  //next free position in w and z
+  size_t pos=0;
+
+  for (size_t t=0; t<n_set; t++) {
+    if (multi) {snpInfo=msnpInfo[t];}
+    vector<int> &keep=(multi ? mindicator_snp[t] : indicator_snp);
+
+    for (size_t i=0; i<snpInfo.size(); i++) {
+      if (keep[i]==0) {continue;}
+
+      const string rs=snpInfo[i].rs_number;
+      const string a1=snpInfo[i].a_minor;
+
+      //no weight available: drop the SNP from the analysis
+      if (mapRS2wA.count(rs)==0) {
+        keep[i]=0;
+        continue;
+      }
+
+      //flip the sign of the z-score when the reference allele differs
+      const bool same_allele=(a1==mapRS2A1.at(rs));
+      const double zscore=mapRS2z.at(rs);
+      gsl_vector_set (z, pos, (same_allele ? zscore : -1*zscore) );
+
+      vec_cat.push_back(mapRS2cat.at(rs) );
+      gsl_vector_set (w, pos, mapRS2wA.at(rs) );
+
+      pos++;
+    }
+  }
+
+  return;
+}
+
+
+
+// Restrict the analysed SNPs to those that have a weight in mapRS2wA: every
+// SNP whose indicator is non-zero but whose rs is absent from mapRS2wA gets
+// its indicator set to 0 (indicator_snp, or mindicator_snp[t] when msnpInfo
+// is non-empty). Nothing else is modified, except that with a non-empty
+// msnpInfo the member snpInfo is overwritten with each file's entries in turn.
+void PARAM::UpdateSNP (const map<string, double> &mapRS2wA)
+{
+  const bool multi=(msnpInfo.size()!=0);
+  const size_t n_set=(multi ? msnpInfo.size() : 1);
+
+  for (size_t t=0; t<n_set; t++) {
+    if (multi) {snpInfo=msnpInfo[t];}
+    vector<int> &keep=(multi ? mindicator_snp[t] : indicator_snp);
+
+    //single-file input is bounded by snpInfo, multi-file input by the indicator vector
+    const size_t n_snp=(multi ? keep.size() : snpInfo.size());
+
+    for (size_t i=0; i<n_snp; i++) {
+      if (keep[i]!=0 && mapRS2wA.count(snpInfo[i].rs_number)==0) {
+        keep[i]=0;
+      }
+    }
+  }
+
+  return;
+}
diff --git a/src/param.h b/src/param.h
index 3c3b42e..4b4ad29 100644
--- a/src/param.h
+++ b/src/param.h
@@ -102,6 +102,8 @@ public:
     size_t n_col;
     size_t nmis_col;
     size_t nobs_col;
+    size_t ncase_col;
+    size_t ncontrol_col;
     size_t af_col;
     size_t var_col;
     size_t ws_col;
@@ -120,23 +122,21 @@ public:
 	vector<size_t> p_column;			//which phenotype column needs analysis
 	size_t d_pace;		//display pace
 
-	string file_bfile;
-	string file_geno;
+	string file_bfile, file_mbfile;
+	string file_geno, file_mgeno;
 	string file_pheno;
 	string file_anno;		//optional
 	string file_gxe;		//optional
 	string file_cvt;		//optional
-	string file_cat;
+	string file_cat, file_mcat;
 	string file_var;
 	string file_beta;
 	string file_cor;
-	string file_kin;
+	string file_kin, file_mk;
 	string file_ku, file_kd;
-	string file_mk;
-	string file_q, file_mq;
-	string file_s, file_ms;
-	string file_v, file_mv;
-	string file_weight;
+	string file_study, file_mstudy;
+	string file_ref, file_mref;
+	string file_weight, file_wsnp, file_wcat;
 	string file_out;
 	string path_out;
 
@@ -165,7 +165,7 @@ public:
 	size_t n_region;
 	double l_mle_null, l_remle_null;
 	double logl_mle_H0, logl_remle_H0;
-	double pve_null, pve_se_null;
+	double pve_null, pve_se_null, pve_total, se_pve_total;
 	double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null;
 	vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null;
 	vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null;
@@ -185,6 +185,8 @@ public:
 
 	vector<double> v_sigma2;
 	vector<double> v_se_sigma2;
+	vector<double> v_enrich;
+	vector<double> v_se_enrich;
 	vector<double> v_beta;
 	vector<double> v_se_beta;
 
@@ -210,15 +212,18 @@ public:
 	size_t window_bp;
 	size_t window_ns;
 
+	//vc related parameters
+	size_t n_block;
+
 	// Summary statistics
 	bool error;
-	size_t ni_total, ni_test, ni_cvt;	//number of individuals
+	size_t ni_total, ni_test, ni_cvt, ni_study, ni_ref;	//number of individuals
 	size_t np_obs, np_miss;		//number of observed and missing phenotypes
-	size_t ns_total, ns_test;	//number of snps
+	size_t ns_total, ns_test, ns_study, ns_ref;	//number of snps
 	size_t ng_total, ng_test;	//number of genes
 	size_t ni_control, ni_case;	//number of controls and number of cases
 	size_t ni_subsample;            //number of subsampled individuals
-	size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel
+	//size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel
 	size_t n_cvt;			//number of covariates
 	size_t n_ph;			//number of phenotypes
 	size_t n_vc;			//number of variance components (including the diagonal matrix)
@@ -240,6 +245,7 @@ public:
 	vector<vector<int> > indicator_pheno;			//a matrix record when a phenotype is missing for an individual; 0 missing, 1 available
 	vector<int> indicator_idv;				//indicator for individuals (phenotypes), 0 missing, 1 available for analysis
 	vector<int> indicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
+	vector< vector<int> >  mindicator_snp;				//sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis
 	vector<int> indicator_cvt;				//indicator for covariates, 0 missing, 1 available for analysis
 	vector<int> indicator_gxe;				//indicator for gxe, 0 missing, 1 available for analysis
 	vector<int> indicator_weight;                           //indicator for weight, 0 missing, 1 available for analysis
@@ -256,9 +262,11 @@ public:
 	map<string, double> mapRS2cM;		//map rs# to cM
 	map<string, double> mapRS2est;			//map rs# to parameters
 	map<string, size_t> mapRS2cat;          //map rs# to category number
-	map<string, double> mapRS2var;          //map rs# to category number
+	map<string, double> mapRS2wsnp;          //map rs# to snp weights
+	map<string, vector<double> > mapRS2wcat;          //map rs# to snp cat weights
 
 	vector<SNPINFO> snpInfo;		//record SNP information
+	vector< vector<SNPINFO> > msnpInfo;		//record SNP information
 	set<string> setSnps;			//a set of snps for analysis
 
 	//constructor
@@ -279,12 +287,16 @@ public:
 	void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag);
 	void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag);
 	void CalcKin (gsl_matrix *matrix_kin);
-	void CalcS (gsl_matrix *S, gsl_matrix *Svar, gsl_matrix *Q);
+	void CalcS (const map<string, double> &mapRS2wA, const map<string, double> &mapRS2wK, const gsl_matrix *W, gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, gsl_vector *ns);
 	void WriteVector (const gsl_vector *q, const gsl_vector *s, const size_t n_total, const string suffix);
 	void WriteVar (const string suffix);
 	void WriteMatrix (const gsl_matrix *matrix_U, const string suffix);
 	void WriteVector (const gsl_vector *vector_D, const string suffix);
 	void CopyRead (gsl_vector *log_N);
+	void ObtainWeight (const set<string> &setSnps_beta, map<string, double> &mapRS2wK);
+	void UpdateWeight (const size_t pve_flag, const map<string, double> &mapRS2wK, const size_t ni_test, const gsl_vector *ns, map<string, double> &mapRS2wA);
+	void UpdateSNPnZ (const map<string, double> &mapRS2wA, const map<string, string> &mapRS2A1, const map<string, double> &mapRS2z, gsl_vector *w, gsl_vector *z, vector<size_t> &vec_cat);
+	void UpdateSNP (const map<string, double> &mapRS2wA);
 };
 
 
diff --git a/src/vc.cpp b/src/vc.cpp
index 77cf746..94bf931 100644
--- a/src/vc.cpp
+++ b/src/vc.cpp
@@ -1,17 +1,17 @@
 /*
  Genome-wide Efficient Mixed Model Association (GEMMA)
  Copyright (C) 2011  Xiang Zhou
- 
+
  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
- 
+
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
- 
+
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
@@ -26,8 +26,12 @@
 #include <cmath>
 #include <iostream>
 #include <stdio.h>
-#include <stdlib.h> 
+#include <stdlib.h>
 #include <bitset>
+#include <vector>
+#include <set>
+#include <map>
+#include <string>
 #include <cstring>
 
 #include "gsl/gsl_vector.h"
@@ -39,9 +43,14 @@
 #include "gsl/gsl_multiroots.h"
 #include "gsl/gsl_min.h"
 
+#include "Eigen/Dense"
+
+#include "param.h"
 #include "io.h"
 #include "lapack.h"
+#include "eigenlib.h"
 #include "gzstream.h"
+#include "mathfunc.h"
 
 #ifdef FORCE_FLOAT
 #include "lmm_float.h"
@@ -54,95 +63,194 @@
 
 
 using namespace std;
-
+using namespace Eigen;
 
 //in this file, X, Y are already transformed (i.e. UtX and UtY)
 
 
-void VC::CopyFromParam (PARAM &cPar) 
-{	
-	file_out=cPar.file_out;
-	
-	//	v_sigma2=cPar.v_sigma2;
-	
-	time_UtX=0.0;
-	time_opt=0.0;
+//copy the analysis settings from cPar into this VC object: analysis mode, input file names (cat/beta/cor), the SNP set, output name and path, v_traceG, the sample/SNP counts, the crt and window settings, and n_vc; the timers time_UtX and time_opt are reset to zero
+void VC::CopyFromParam (PARAM &cPar)
+{
+  a_mode=cPar.a_mode;
 
-	v_traceG=cPar.v_traceG;
-	
-	return;
+  file_cat=cPar.file_cat;
+  file_beta=cPar.file_beta;
+  file_cor=cPar.file_cor;
+
+  setSnps=cPar.setSnps;
+
+  file_out=cPar.file_out;
+  path_out=cPar.path_out;
+
+  //v_sigma2=cPar.v_sigma2;
+
+  time_UtX=0.0;
+  time_opt=0.0;
+
+  v_traceG=cPar.v_traceG;
+
+  ni_total=cPar.ni_total;
+  ns_total=cPar.ns_total;
+  ns_test=cPar.ns_test;
+
+  crt=cPar.crt;
+  window_cm=cPar.window_cm;
+  window_bp=cPar.window_bp;
+  window_ns=cPar.window_ns;
+
+  n_vc=cPar.n_vc;
+
+  return;
 }
 
 
-void VC::CopyToParam (PARAM &cPar) 
+//copy the results held by this VC object back into cPar: the two timers, the pve and sigma2 estimates with their standard errors, the total pve and its standard error, v_traceG, the beta estimates with their standard errors, the sample/SNP counts and n_vc
+void VC::CopyToParam (PARAM &cPar)
 {
 	cPar.time_UtX=time_UtX;
-	cPar.time_opt=time_opt;	
-		
-	cPar.v_sigma2=v_sigma2;
-	cPar.v_se_sigma2=v_se_sigma2;
+	cPar.time_opt=time_opt;
+
 	cPar.v_pve=v_pve;
 	cPar.v_se_pve=v_se_pve;
+	cPar.v_sigma2=v_sigma2;
+	cPar.v_se_sigma2=v_se_sigma2;
+	cPar.pve_total=pve_total;
+	cPar.se_pve_total=se_pve_total;
 	cPar.v_traceG=v_traceG;
-	
+
 	cPar.v_beta=v_beta;
 	cPar.v_se_beta=v_se_beta;
-	
+
+	cPar.ni_total=ni_total;
+	cPar.ns_total=ns_total;
+	cPar.ns_test=ns_test;
+
+	cPar.n_vc=n_vc;
+
+	return;
+}
+
+
+
+//write the summary statistics to two text files under path_out:
+//  <file_out>.qvec.txt : the entries of s_vec, q_vec and qvar_vec, in that order, one value per line
+//  <file_out>.smat.txt : the rows of S_mat followed by the rows of Svar_mat, each value followed by a tab
+//if a file cannot be opened, an error message is printed and the function returns without writing that file
+void VC::WriteFile_qs (const gsl_vector *s_vec, const gsl_vector *q_vec, const gsl_vector *qvar_vec, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat)
+{
+	const string prefix=path_out+"/"+file_out;
+
+	//vector file
+	string file_str=prefix+".qvec.txt";
+	ofstream outfile_q (file_str.c_str(), ofstream::out);
+	if (!outfile_q) {
+	  cout<<"error writing file: "<<file_str.c_str()<<endl;
+	  return;
+	}
+
+	const gsl_vector *vec_list[]={s_vec, q_vec, qvar_vec};
+	for (size_t v=0; v<3; v++) {
+	  const gsl_vector *cur=vec_list[v];
+	  for (size_t k=0; k<cur->size; k++) {
+	    outfile_q<<gsl_vector_get(cur, k)<<endl;
+	  }
+	}
+
+	outfile_q.clear();
+	outfile_q.close();
+
+	//matrix file
+	file_str=prefix+".smat.txt";
+	ofstream outfile_s (file_str.c_str(), ofstream::out);
+	if (!outfile_s) {
+	  cout<<"error writing file: "<<file_str.c_str()<<endl;
+	  return;
+	}
+
+	const gsl_matrix *mat_list[]={S_mat, Svar_mat};
+	for (size_t m=0; m<2; m++) {
+	  const gsl_matrix *cur=mat_list[m];
+	  for (size_t r=0; r<cur->size1; r++) {
+	    for (size_t c=0; c<cur->size2; c++) {
+	      outfile_s<<gsl_matrix_get(cur, r, c)<<"\t";
+	    }
+	    outfile_s<<endl;
+	  }
+	}
+
+	outfile_s.clear();
+	outfile_s.close();
+
 	return;
 }
 
 
 
+
+
+
+
+
 void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p)
 {
   size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1, n_cvt=(p->W)->size2;
- 
+
   gsl_matrix *K_temp=gsl_matrix_alloc(n1, n1);
   gsl_matrix *HiW=gsl_matrix_alloc(n1, n_cvt);
   gsl_matrix *WtHiW=gsl_matrix_alloc(n_cvt, n_cvt);
   gsl_matrix *WtHiWi=gsl_matrix_alloc(n_cvt, n_cvt);
   gsl_matrix *WtHiWiWtHi=gsl_matrix_alloc(n_cvt, n1);
 
-  double sigma2;  
+  double sigma2;
   //calculate H=\sum_i^{k+1} \sigma_i^2 K_i
   gsl_matrix_set_zero (p->P);
   for (size_t i=0; i<n_vc+1; i++) {
     if (i==n_vc) {
-      gsl_matrix_set_identity (K_temp);      
+      gsl_matrix_set_identity (K_temp);
     } else {
       gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1);
       gsl_matrix_memcpy (K_temp, &K_sub.matrix);
     }
 
-    sigma2=exp(gsl_vector_get (log_sigma2, i) );
+    //when unconstrained, update on sigma2 instead of log_sigma2
+    if (p->noconstrain) {
+      sigma2=gsl_vector_get (log_sigma2, i);
+    } else {
+      sigma2=exp(gsl_vector_get (log_sigma2, i) );
+    }
     gsl_matrix_scale(K_temp, sigma2);
     gsl_matrix_add (p->P, K_temp);
   }
 
   //calculate H^{-1}
+  /*
   int sig;
   gsl_permutation * pmt1=gsl_permutation_alloc (n1);
-  LUDecomp (p->P, pmt1, &sig);	
+  LUDecomp (p->P, pmt1, &sig);
   LUInvert (p->P, pmt1, K_temp);
   gsl_permutation_free(pmt1);
 
   gsl_matrix_memcpy (p->P, K_temp);
+  */
+  eigenlib_invert(p->P);
 
   //calculate P=H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1}
-  gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW);
-  gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW);
+  //gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW);
+  //gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW);
+
+  eigenlib_dgemm ("N", "N", 1.0, p->P, p->W, 0.0, HiW);
+  eigenlib_dgemm ("T", "N", 1.0, p->W, HiW, 0.0, WtHiW);
 
-  gsl_permutation * pmt2=gsl_permutation_alloc (n_cvt);
-  LUDecomp (WtHiW, pmt2, &sig);	
-  LUInvert (WtHiW, pmt2, WtHiWi);
-  gsl_permutation_free(pmt2);
+  //gsl_permutation * pmt2=gsl_permutation_alloc (n_cvt);
+  //LUDecomp (WtHiW, pmt2, &sig);
+  //LUInvert (WtHiW, pmt2, WtHiWi);
+  //gsl_permutation_free(pmt2);
+  eigenlib_invert(WtHiW);
+  gsl_matrix_memcpy(WtHiWi, WtHiW);
+
+  //gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi);
+  //gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P);
+  eigenlib_dgemm ("N", "T", 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi);
+  eigenlib_dgemm ("N", "N", -1.0, HiW, WtHiWiWtHi, 1.0, p->P);
 
-  gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi);  
-  gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P);
-  
   //calculate Py, KPy, PKPy
-  gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py);    
+  gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py);
+  //eigenlib_dgemv("N", 1.0, p->P, p->y, 0.0, p->Py);
 
+  double d;
   for (size_t i=0; i<n_vc+1; i++) {
     gsl_vector_view KPy=gsl_matrix_column (p->KPy_mat, i);
     gsl_vector_view PKPy=gsl_matrix_column (p->PKPy_mat, i);
@@ -150,11 +258,22 @@ void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p)
     if (i==n_vc) {
       gsl_vector_memcpy (&KPy.vector, p->Py);
     } else {
-      gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1);      
+      gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1);
+      //seems to be important to use gsl dgemv here instead of eigenlib_dgemv; otherwise nan values seem to appear in KPy_mat/PKPy_mat (see the nan check at the end of this loop)
       gsl_blas_dgemv(CblasNoTrans, 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector);
+      //eigenlib_dgemv("N", 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector);
     }
-    
+
     gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector);
+    //eigenlib_dgemv("N", 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector);
+
+    //when phenotypes are not normalized well, some values in the following matrices may be nan; change them to 0; this seems to happen only when eigenlib_dgemv was used above
+    for (size_t j=0; j<p->KPy_mat->size1; j++) {
+      d=gsl_matrix_get (p->KPy_mat, j, i);
+      if (std::isnan(d)) {gsl_matrix_set (p->KPy_mat, j, i, 0); cout<<"nan appears in "<<i<<" "<<j<<endl;}
+      d=gsl_matrix_get (p->PKPy_mat, j, i);
+      if (std::isnan(d)) {gsl_matrix_set (p->PKPy_mat, j, i, 0); cout<<"nan appears in "<<i<<" "<<j<<endl;}
+    }
   }
 
   gsl_matrix_free (K_temp);
@@ -173,7 +292,7 @@ int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1)
   VC_PARAM *p=(VC_PARAM *) params;
 
   size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1;
-  
+
   double tr, d;
 
   //update parameters
@@ -199,8 +318,12 @@ int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1)
     gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i);
     gsl_blas_ddot(p->Py, &KPy_i.vector, &d);
 
-    d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i));
-    
+    if (p->noconstrain) {
+      d=(-0.5*tr+0.5*d);
+    } else {
+      d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i));
+    }
+
     gsl_vector_set(dev1, i, d);
   }
 
@@ -214,32 +337,47 @@ int LogRL_dev2 (const gsl_vector *log_sigma2, void *params, gsl_matrix *dev2)
   VC_PARAM *p=(VC_PARAM *) params;
 
   size_t n_vc=log_sigma2->size-1;
-  
+
   double d, sigma2_i, sigma2_j;
 
   //update parameters
   UpdateParam (log_sigma2, p);
-  
+
   //calculate dev2=0.5(yPKPKPy)
   for (size_t i=0; i<n_vc+1; i++) {
     gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i);
-    sigma2_i=exp(gsl_vector_get(log_sigma2, i));
+    if (p->noconstrain) {
+      sigma2_i=gsl_vector_get(log_sigma2, i);
+    } else {
+      sigma2_i=exp(gsl_vector_get(log_sigma2, i));
+    }
 
     for (size_t j=i; j<n_vc+1; j++) {
       gsl_vector_view PKPy_j=gsl_matrix_column (p->PKPy_mat, j);
 
       gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d);
-      sigma2_j=exp(gsl_vector_get(log_sigma2, j));
-
-      d*=-0.5*sigma2_i*sigma2_j;
+      if (p->noconstrain) {
+	sigma2_j=gsl_vector_get(log_sigma2, j);
+	d*=-0.5;
+      } else {
+	sigma2_j=exp(gsl_vector_get(log_sigma2, j));
+	d*=-0.5*sigma2_i*sigma2_j;
+      }
 
       gsl_matrix_set(dev2, i, j, d);
       if (j!=i) {gsl_matrix_set(dev2, j, i, d);}
-    }   
+    }
   }
 
   gsl_matrix_memcpy (p->Hessian, dev2);
-
+  /*
+  for (size_t i=0; i<dev2->size1; i++) {
+    for (size_t j=0; j<dev2->size2; j++) {
+      cout<<gsl_matrix_get (dev2, i, j)<<" ";
+    }
+    cout<<endl;
+  }
+  */
   return GSL_SUCCESS;
 }
 
@@ -250,14 +388,14 @@ int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, g
   VC_PARAM *p=(VC_PARAM *) params;
 
   size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1;
-  
+
   double tr, d, sigma2_i, sigma2_j;
 
   //update parameters
   UpdateParam (log_sigma2, p);
 
-  //calculate dev1=-0.5*trace(PK_i)+0.5*yPKPy
-  //calculate dev2=0.5(yPKPKPy)
+  //calculate dev1=(-0.5*trace(PK_i)+0.5*yPK_iPy)*sigma2_i
+  //calculate dev2=0.5(yPK_iPK_jPy)*sigma2_i*sigma2_j
   for (size_t i=0; i<n_vc+1; i++) {
     if (i==n_vc) {
       tr=0;
@@ -277,21 +415,31 @@ int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, g
     gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i);
     gsl_blas_ddot(p->Py, &KPy_i.vector, &d);
 
-    sigma2_i=exp(gsl_vector_get(log_sigma2, i));
-    d=(-0.5*tr+0.5*d)*sigma2_i;
- 
+    if (p->noconstrain) {
+      sigma2_i=gsl_vector_get(log_sigma2, i);
+      d=(-0.5*tr+0.5*d);
+    } else {
+      sigma2_i=exp(gsl_vector_get(log_sigma2, i));
+      d=(-0.5*tr+0.5*d)*sigma2_i;
+    }
+
     gsl_vector_set(dev1, i, d);
-      
+
     for (size_t j=i; j<n_vc+1; j++) {
       gsl_vector_view PKPy_j=gsl_matrix_column (p->PKPy_mat, j);
       gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d);
 
-      sigma2_j=exp(gsl_vector_get(log_sigma2, j));
-      d*=-0.5*sigma2_i*sigma2_j;
+      if (p->noconstrain) {
+	sigma2_j=gsl_vector_get(log_sigma2, j);
+	d*=-0.5;
+      } else {
+	sigma2_j=exp(gsl_vector_get(log_sigma2, j));
+	d*=-0.5*sigma2_i*sigma2_j;
+      }
 
       gsl_matrix_set(dev2, i, j, d);
       if (j!=i) {gsl_matrix_set(dev2, j, i, d);}
-    }   
+    }
 
   }
 
@@ -303,13 +451,1195 @@ int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, g
 
 
 
-void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y)
+
+//helper for ReadHeader: remember the 1-based position of an item the first time it is seen; if the item was already seen, print an error and increase n_error
+static void SetHeaderCol (size_t &col, const size_t index, const char *name, size_t &n_error)
+{
+  if (col==0) {
+    col=index;
+  } else {
+    cout<<"error! more than two "<<name<<" columns in the file."<<endl;
+    n_error++;
+  }
+}
+
+//read header to determine which column contains which item
+//line: the header line; column names are separated by spaces, commas or tabs
+//header: output; every *_col field is set to the 1-based position of the matching column (0 if the column is absent), and coln is set to the number of columns
+//return true if no problem is found; return false if an item appears in more than one column, if the cor column is not the last column, or if the rs column is missing and chr/pos are not both present to replace it
+bool ReadHeader (const string &line, HEADER &header)
+{
+  string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID"};
+  set<string> rs_set(rs_ptr, rs_ptr+10);
+  string chr_ptr[]={"chr","CHR"};
+  set<string> chr_set(chr_ptr, chr_ptr+2);
+  string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"};
+  set<string> pos_set(pos_ptr, pos_ptr+8);
+  string cm_ptr[]={"cm","CM"};
+  set<string> cm_set(cm_ptr, cm_ptr+2);
+  string a1_ptr[]={"a1","A1","allele1","ALLELE1"};
+  set<string> a1_set(a1_ptr, a1_ptr+4);
+  string a0_ptr[]={"a0","A0","allele0","ALLELE0"};
+  set<string> a0_set(a0_ptr, a0_ptr+4);
+
+  string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"};
+  set<string> z_set(z_ptr, z_ptr+6);
+  string beta_ptr[]={"beta","BETA","b","B"};
+  set<string> beta_set(beta_ptr, beta_ptr+4);
+  string sebeta_ptr[]={"se_beta","SE_BETA","se","SE"};
+  set<string> sebeta_set(sebeta_ptr, sebeta_ptr+4);
+  string chisq_ptr[]={"chisq","CHISQ","chisquare","CHISQUARE"};
+  set<string> chisq_set(chisq_ptr, chisq_ptr+4);
+  string p_ptr[]={"p","P","pvalue","PVALUE","p-value","P-VALUE"};
+  set<string> p_set(p_ptr, p_ptr+6);
+
+  string n_ptr[]={"n","N","ntotal","NTOTAL","n_total","N_TOTAL"};
+  set<string> n_set(n_ptr, n_ptr+6);
+  string nmis_ptr[]={"nmis","NMIS","n_mis","N_MIS","n_miss","N_MISS"};
+  set<string> nmis_set(nmis_ptr, nmis_ptr+6);
+  string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"};
+  set<string> nobs_set(nobs_ptr, nobs_ptr+4);
+
+  string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"};
+  set<string> af_set(af_ptr, af_ptr+10);
+  string var_ptr[]={"var","VAR"};
+  set<string> var_set(var_ptr, var_ptr+2);
+
+  string ws_ptr[]={"window_size","WINDOW_SIZE","ws","WS"};
+  set<string> ws_set(ws_ptr, ws_ptr+4);
+  string cor_ptr[]={"cor","COR","r","R"};
+  set<string> cor_set(cor_ptr, cor_ptr+4);
+
+  //reset every column index (including cm_col); 0 means that the column is absent
+  header.rs_col=0; header.chr_col=0; header.pos_col=0; header.cm_col=0; header.a1_col=0; header.a0_col=0;
+  header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0;
+  header.n_col=0; header.nmis_col=0; header.nobs_col=0;
+  header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0;
+  header.coln=0;
+
+  char *ch_ptr;
+  string type;
+  size_t n_error=0;
+
+  //strtok writes into the buffer it scans, so tokenize a private copy instead of the caller's const string
+  vector<char> buf(line.begin(), line.end());
+  buf.push_back('\0');
+
+  ch_ptr=strtok (&buf[0], " , \t");
+  while (ch_ptr!=NULL) {
+    type=ch_ptr;
+    const size_t index=header.coln+1;
+
+    if (rs_set.count(type)!=0) {
+      SetHeaderCol (header.rs_col, index, "rs", n_error);
+    } else if (chr_set.count(type)!=0) {
+      SetHeaderCol (header.chr_col, index, "chr", n_error);
+    } else if (pos_set.count(type)!=0) {
+      SetHeaderCol (header.pos_col, index, "pos", n_error);
+    } else if (cm_set.count(type)!=0) {
+      SetHeaderCol (header.cm_col, index, "cm", n_error);
+    } else if (a1_set.count(type)!=0) {
+      SetHeaderCol (header.a1_col, index, "allele1", n_error);
+    } else if (a0_set.count(type)!=0) {
+      SetHeaderCol (header.a0_col, index, "allele0", n_error);
+    } else if (z_set.count(type)!=0) {
+      SetHeaderCol (header.z_col, index, "z", n_error);
+    } else if (beta_set.count(type)!=0) {
+      SetHeaderCol (header.beta_col, index, "beta", n_error);
+    } else if (sebeta_set.count(type)!=0) {
+      SetHeaderCol (header.sebeta_col, index, "se_beta", n_error);
+    } else if (chisq_set.count(type)!=0) {
+      SetHeaderCol (header.chisq_col, index, "chisq", n_error);
+    } else if (p_set.count(type)!=0) {
+      SetHeaderCol (header.p_col, index, "p", n_error);
+    } else if (n_set.count(type)!=0) {
+      SetHeaderCol (header.n_col, index, "n_total", n_error);
+    } else if (nmis_set.count(type)!=0) {
+      SetHeaderCol (header.nmis_col, index, "n_mis", n_error);
+    } else if (nobs_set.count(type)!=0) {
+      SetHeaderCol (header.nobs_col, index, "n_obs", n_error);
+    } else if (ws_set.count(type)!=0) {
+      SetHeaderCol (header.ws_col, index, "window_size", n_error);
+    } else if (af_set.count(type)!=0) {
+      SetHeaderCol (header.af_col, index, "af", n_error);
+    } else if (var_set.count(type)!=0) {
+      //the var names were defined above but never matched before, so var_col always stayed 0
+      SetHeaderCol (header.var_col, index, "var", n_error);
+    } else if (cor_set.count(type)!=0) {
+      SetHeaderCol (header.cor_col, index, "cor", n_error);
+    } else {}
+
+    ch_ptr=strtok (NULL, " , \t");
+    header.coln++;
+  }
+
+  if (header.cor_col!=0 && header.cor_col!=header.coln) {cout<<"error! the cor column should be the last column."<<endl; n_error++;}
+
+  if (header.rs_col==0) {
+    if (header.chr_col!=0 && header.pos_col!=0) {
+      cout<<"missing an rs column. rs id will be replaced by chr:pos"<<endl;
+    } else {
+      cout<<"error! missing an rs column."<<endl; n_error++;
+    }
+  }
+
+  return n_error==0;
+}
+
+
+
+
+
+
+//read cor file the first time, record mapRS2in, mapRS2var (in case var is not provided in the z file), store vec_rs, vec_n, vec_cm and vec_bp
+//file_cor: name of the cor file; the first line is a header understood by ReadHeader
+//setSnps: if not empty, only snps in this set are recorded in mapRS2in/mapRS2var
+//vec_rs, vec_n, vec_cm, vec_bp: output; one entry per data row (snp id, total sample size, cm position, bp position), in file order; all four are emptied first
+//mapRS2in: output; set to 1 for every recorded snp
+//mapRS2var: output; genotype variance of every recorded snp, taken from the var column, or computed as 2*af*(1-af) from the af column
+//the last column (the cor values) is not parsed here
+void ReadFile_cor (const string &file_cor, const set<string> &setSnps, vector<string> &vec_rs, vector<size_t> &vec_n, vector<double> &vec_cm, vector<double> &vec_bp, map<string, size_t> &mapRS2in, map<string, double> &mapRS2var)
+{
+  vec_rs.clear();
+  vec_n.clear();
+  //vec_cm and vec_bp must stay parallel to vec_rs, so they are emptied as well
+  vec_cm.clear();
+  vec_bp.clear();
+  mapRS2in.clear();
+  mapRS2var.clear();
+
+  igzstream infile (file_cor.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open cov file: "<<file_cor<<endl; return;}
+
+  string line;
+  char *ch_ptr;
+
+  string rs, chr, pos;
+  double af=0, var_x=0, d_pos=0, d_cm=0;
+  size_t n_total=0, n_mis=0, n_obs=0;
+
+  HEADER header;
+
+  //header; problems are reported by ReadHeader itself
+  safeGetline(infile, line);
+  ReadHeader (line, header);
+
+  if (header.n_col==0 ) {
+    if (header.nobs_col==0 && header.nmis_col==0) {
+      cout<<"error! missing sample size in the cor file."<<endl;
+    } else {
+      cout<<"total sample size will be replaced by obs/mis sample size."<<endl;
+    }
+  }
+
+  while (!safeGetline(infile, line).eof()) {
+    //do not read cor values this time; upto col_n-1
+    ch_ptr=strtok ((char *)line.c_str(), " , \t");
+
+    rs.clear(); chr.clear(); pos.clear();
+    n_total=0; n_mis=0; n_obs=0; af=0; var_x=0; d_cm=0; d_pos=0;
+
+    //i+1<coln instead of i<coln-1: coln is unsigned, so coln-1 would wrap around for an empty header
+    bool flag_short=false;
+    for (size_t i=0; i+1<header.coln; i++) {
+      //a row with too few fields would otherwise hand a NULL pointer to string/atof/atoi
+      if (ch_ptr==NULL) {flag_short=true; break;}
+
+      if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;}
+      if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;}
+      if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr; d_pos=atof(ch_ptr);}
+      if (header.cm_col!=0 && header.cm_col==i+1) {d_cm=atof(ch_ptr);}
+
+      if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);}
+      if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);}
+      if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);}
+
+      if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);}
+      if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);}
+
+      ch_ptr=strtok (NULL, " , \t");
+    }
+
+    if (header.rs_col==0) {
+      rs=chr+":"+pos;
+    }
+
+    if (header.n_col==0) {
+      n_total=n_mis+n_obs;
+    }
+
+    //record rs, n, cm and bp for every row, including incomplete ones, so that entry k always belongs to data row k
+    vec_rs.push_back(rs);
+    vec_n.push_back(n_total);
+    vec_cm.push_back(d_cm);
+    vec_bp.push_back(d_pos);
+
+    if (flag_short) {
+      cout<<"error! too few columns in data row "<<vec_rs.size()<<" of cor file."<<endl;
+      continue;
+    }
+
+    //record mapRS2in and mapRS2var
+    if (setSnps.size()==0 || setSnps.count(rs)!=0) {
+      if (mapRS2in.count(rs)==0) {
+        mapRS2in[rs]=1;
+
+        if (header.var_col!=0) {
+          mapRS2var[rs]=var_x;
+        } else if (header.af_col!=0) {
+          var_x=2.0*af*(1.0-af);
+          mapRS2var[rs]=var_x;
+        } else {}
+      } else {
+        cout<<"error! more than one snp has the same id "<<rs<<" in cor file?"<<endl;
+      }
+    }
+  }
+
+  infile.close();
+  infile.clear();
+
+  return;
+}
+
+
+
+
+
+
+//read beta file, store mapRS2var if var is provided here, calculate q and var_y
+//flag_priorscale: if true, the variance of every snp is set to 1
+//file_beta: name of the beta file; the first line is a header understood by ReadHeader
+//mapRS2cat: if not empty, maps a snp to the index of its category; snps absent from the map are ignored
+//mapRS2in: only snps present in this map are used; the entry of every used snp is increased by one
+//mapRS2var: input when neither a var nor an af column is present; updated with the variance used for every snp
+//mapRS2nsamp: output; sample size of every used snp
+//q_vec, qvar_vec, s_vec: output, one entry per category: sum of (z^2-1)*var/n, 2*sum of var^2/n^2, and sum of var
+//ni_total: output; largest sample size among the used snps
+//ns_total: output; number of data rows read
+//a snp is skipped (and counted in a warning printed at the end) when its row has too few fields, its sample size is zero, its z^2 is not finite, or its category index does not fit into q_vec
+void ReadFile_beta (const bool flag_priorscale, const string &file_beta, const map<string, size_t> &mapRS2cat, map<string, size_t> &mapRS2in, map<string, double> &mapRS2var, map<string, size_t> &mapRS2nsamp, gsl_vector *q_vec, gsl_vector *qvar_vec, gsl_vector *s_vec, size_t &ni_total, size_t &ns_total)
+{
+  mapRS2nsamp.clear();
+
+  igzstream infile (file_beta.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open beta file: "<<file_beta<<endl; return;}
+
+  string line;
+  char *ch_ptr;
+
+  string rs, chr, pos;
+  double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, zsquare=0, af=0, var_x=0;
+  size_t n_total=0, n_mis=0, n_obs=0;
+  size_t ns_skip=0;
+  ns_total=0; ni_total=0;
+
+  //one accumulator per category
+  vector<double> vec_q(q_vec->size, 0.0), vec_qvar(q_vec->size, 0.0), vec_s(q_vec->size, 0.0);
+
+  //read header; problems are reported by ReadHeader itself
+  HEADER header;
+  safeGetline(infile, line);
+  ReadHeader (line, header);
+
+  if (header.n_col==0 ) {
+    if (header.nobs_col==0 && header.nmis_col==0) {
+      cout<<"error! missing sample size in the beta file."<<endl;
+    } else {
+      cout<<"total sample size will be replaced by obs/mis sample size."<<endl;
+    }
+  }
+
+  if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0) && header.chisq_col==0 && header.p_col==0) {
+    cout<<"error! missing z scores in the beta file."<<endl;
+  }
+
+  if (header.af_col==0 && header.var_col==0 && mapRS2var.size()==0) {
+    cout<<"error! missing allele frequency in the beta file."<<endl;
+  }
+
+  while (!safeGetline(infile, line).eof()) {
+    ch_ptr=strtok ((char *)line.c_str(), " , \t");
+
+    rs.clear(); chr.clear(); pos.clear();
+    z=0; beta=0; se_beta=0; chisq=0; pvalue=0;
+    n_total=0; n_mis=0; n_obs=0; af=0; var_x=0;
+
+    bool flag_short=false;
+    for (size_t i=0; i<header.coln; i++) {
+      //a row with too few fields would otherwise hand a NULL pointer to string/atof/atoi
+      if (ch_ptr==NULL) {flag_short=true; break;}
+
+      if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;}
+      if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;}
+      if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr;}
+
+      if (header.z_col!=0 && header.z_col==i+1) {z=atof(ch_ptr);}
+      if (header.beta_col!=0 && header.beta_col==i+1) {beta=atof(ch_ptr);}
+      if (header.sebeta_col!=0 && header.sebeta_col==i+1) {se_beta=atof(ch_ptr);}
+      if (header.chisq_col!=0 && header.chisq_col==i+1) {chisq=atof(ch_ptr);}
+      if (header.p_col!=0 && header.p_col==i+1) {pvalue=atof(ch_ptr);}
+
+      if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);}
+      if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);}
+      if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);}
+
+      if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);}
+      if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);}
+
+      ch_ptr=strtok (NULL, " , \t");
+    }
+
+    if (flag_short) {
+      ns_skip++;
+      ns_total++;
+      continue;
+    }
+
+    if (header.rs_col==0) {
+      rs=chr+":"+pos;
+    }
+
+    if (header.n_col==0) {
+      n_total=n_mis+n_obs;
+    }
+
+    //both z values and beta/se_beta have directions, while chisq/pvalue do not
+    if (header.z_col!=0) {
+      zsquare=z*z;
+    } else if (header.beta_col!=0 && header.sebeta_col!=0) {
+      z=beta/se_beta;
+      zsquare=z*z;
+    } else if (header.chisq_col!=0) {
+      zsquare=chisq;
+    } else if (header.p_col!=0) {
+      zsquare=gsl_cdf_chisq_Qinv (pvalue, 1);
+    } else {zsquare=0;}
+
+    //if the snp is also present in cor file, then do calculations
+    if ((header.var_col!=0 || header.af_col!=0 || mapRS2var.count(rs)!=0) && mapRS2in.count(rs)!=0 && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) {
+      if (mapRS2in.at(rs)>1) {
+        cout<<"error! more than one snp has the same id "<<rs<<" in beta file?"<<endl;
+        break;
+      }
+
+      //without categories everything is accumulated in entry 0
+      size_t cat=0;
+      if (mapRS2cat.size()!=0) {cat=mapRS2cat.at(rs);}
+
+      if (n_total==0 || !std::isfinite(zsquare) || cat>=vec_q.size()) {
+        //a zero sample size or a non-finite z^2 (e.g. se_beta=0) would turn the whole sum into inf/nan, and a category index beyond q_vec would write out of bounds; leave the snp unused
+        ns_skip++;
+      } else {
+        if (header.var_col==0) {
+          if (header.af_col!=0) {
+            var_x=2.0*af*(1.0-af);
+          } else {
+            var_x=mapRS2var.at(rs);
+          }
+        }
+
+        if (flag_priorscale) {var_x=1;}
+
+        mapRS2in[rs]++;
+        mapRS2var[rs]=var_x;
+        mapRS2nsamp[rs]=n_total;
+
+        const double n=(double)n_total;
+        vec_q[cat]+=(zsquare-1.0)*var_x/n;
+        vec_s[cat]+=var_x;
+        vec_qvar[cat]+=var_x*var_x/(n*n);
+
+        ni_total=max(ni_total, n_total);
+      }
+    }
+
+    ns_total++;
+  }
+
+  if (ns_skip!=0) {
+    cout<<"warning! "<<ns_skip<<" snps in the beta file were skipped (too few columns, zero sample size, non-finite test statistic or invalid category)."<<endl;
+  }
+
+  for (size_t i=0; i<q_vec->size; i++) {
+    gsl_vector_set(q_vec, i, vec_q[i]);
+    gsl_vector_set(qvar_vec, i, 2.0*vec_qvar[i]);
+    gsl_vector_set(s_vec, i, vec_s[i]);
+  }
+
+
+  infile.clear();
+  infile.close();
+
+  return;
+}
+
+
+
+
+
+//read covariance file the second time
+//look for rs, n_mis+n_obs, var, window_size, cov
+//if window_cm/bp/ns is provided, then use these max values to calibrate estimates
+void ReadFile_cor (const string &file_cor, const vector<string> &vec_rs, const vector<size_t> &vec_n, const vector<double> &vec_cm, const vector<double> &vec_bp, const map<string, size_t> &mapRS2cat, const map<string, size_t> &mapRS2in, const map<string, double> &mapRS2var, const map<string, size_t> &mapRS2nsamp, const size_t crt, const double &window_cm, const double &window_bp, const double &window_ns, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *qvar_vec, size_t &ni_total, size_t &ns_total, size_t &ns_test, size_t &ns_pair)
+{
+  igzstream infile (file_cor.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open cov file: "<<file_cor<<endl; return;}
+
+  string line;
+  char *ch_ptr;
+
+  string rs1, rs2;
+  double d1, d2, d3, cor, var1, var2;
+  size_t n_nb, nsamp1, nsamp2, n12, bin_size=10, bin;
+
+  vector<vector<double> > mat_S, mat_Svar, mat_tmp;
+  vector<double> vec_qvar, vec_tmp;
+  vector<vector<vector<double> > > mat3d_Sbin;
+
+  for (size_t i=0; i<S_mat->size1; i++) {
+    vec_qvar.push_back(0.0);
+  }
+
+  for (size_t i=0; i<S_mat->size1; i++) {
+    mat_S.push_back(vec_qvar);
+    mat_Svar.push_back(vec_qvar);
+  }
+
+  for (size_t k=0; k<bin_size; k++) {
+    vec_tmp.push_back(0.0);
+  }
+  for (size_t i=0; i<S_mat->size1; i++) {
+    mat_tmp.push_back(vec_tmp);
+  }
+  for (size_t i=0; i<S_mat->size1; i++) {
+    mat3d_Sbin.push_back(mat_tmp);
+  }
+
+  string rs, chr, a1, a0, type, pos, cm;
+  size_t n_total=0, n_mis=0, n_obs=0;
+  double d_pos1, d_pos2, d_pos, d_cm1, d_cm2, d_cm;
+  ns_test=0; ns_total=0; ns_pair=0; ni_total=0;
+
+  //header
+  HEADER header;
+
+  !safeGetline(infile, line).eof();
+  ReadHeader (line, header);
+
+  while (!safeGetline(infile, line).eof()) {
+    //do not read cor values this time; upto col_n-1
+    d_pos1=0; d_cm1=0;
+    ch_ptr=strtok ((char *)line.c_str(), " , \t");
+    for (size_t i=0; i<header.coln-1; i++) {
+      if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;}
+      if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;}
+      if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr; d_pos1=atof(ch_ptr);}
+      if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr; d_cm1=atof(ch_ptr); }
+      if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;}
+      if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;}
+
+      if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);}
+      if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);}
+      if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);}
+
+      ch_ptr=strtok (NULL, " , \t");
+    }
+
+    if (header.rs_col==0) {
+      rs=chr+":"+pos;
+    }
+
+    if (header.n_col==0) {
+      n_total=n_mis+n_obs;
+    }
+
+    rs1=rs;
+
+    if ( (mapRS2cat.size()==0 || mapRS2cat.count(rs1)!=0) && mapRS2in.count(rs1)!=0 && mapRS2in.at(rs1)==2) {
+      var1=mapRS2var.at(rs1);
+      nsamp1=mapRS2nsamp.at(rs1);
+      d2=var1*var1;
+
+      if (mapRS2cat.size()!=0) {
+	mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ]+=(1-1.0/(double)vec_n[ns_total])*d2;
+	mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ]+=d2*d2/((double)vec_n[ns_total]*(double)vec_n[ns_total]);
+	if (crt==1) {
+	  mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ][0]+=(1-1.0/(double)vec_n[ns_total])*d2;
+	}
+      } else {
+	//mat_S[0][0]+=(1-1.0/(double)vec_n[ns_total])*d2;
+	mat_S[0][0]+=(1-1.0/(double)vec_n[ns_total])*d2;
+	mat_Svar[0][0]+=d2*d2/((double)vec_n[ns_total]*(double)vec_n[ns_total]);
+	if (crt==1) {
+	  mat3d_Sbin[0][0][0]+=(1-1.0/(double)vec_n[ns_total])*d2;
+	}
+      }
+
+      n_nb=0;
+      while(ch_ptr!=NULL) {
+	type=ch_ptr;
+	if (type.compare("NA")!=0 && type.compare("na")!=0 && type.compare("nan")!=0 && type.compare("-nan")!=0) {
+	  cor=atof(ch_ptr);
+	  rs2=vec_rs[ns_total+n_nb+1];
+	  d_pos2=vec_bp[ns_total+n_nb+1];
+	  d_cm2=vec_cm[ns_total+n_nb+1];
+	  d_pos=abs(d_pos2-d_pos1);
+	  d_cm=abs(d_cm2-d_cm1);
+
+	  if ( (mapRS2cat.size()==0 || mapRS2cat.count(rs2)!=0) && mapRS2in.count(rs2)!=0 && mapRS2in.at(rs2)==2) {
+	    var2=mapRS2var.at(rs2);
+	    nsamp2=mapRS2nsamp.at(rs2);
+	    d1=cor*cor-1.0/(double)min(vec_n[ns_total], vec_n[ns_total+n_nb+1]);
+	    d2=var1*var2;
+	    d3=cor*cor/((double)nsamp1*(double)nsamp2);
+	    n12=min(vec_n[ns_total], vec_n[ns_total+n_nb+1]);
+
+	    //compute bin
+	    if (crt==1) {
+	      if (window_cm!=0 && d_cm1!=0 && d_cm2!=0) {
+		bin=min( (int)floor(d_cm/window_cm*bin_size), (int)bin_size);
+	      } else if (window_bp!=0 && d_pos1!=0 && d_pos2!=0) {
+		bin=min( (int)floor(d_pos/window_bp*bin_size), (int)bin_size);
+	      } else if (window_ns!=0) {
+		bin=min( (int)floor(((double)n_nb+1)/window_ns*bin_size), (int)bin_size);
+	      }
+	    }
+
+	    //if (mat_S[0][0]!=mat_S[0][0] && flag_nan==0) {
+	    //if (rs1.compare("rs10915560")==0 || rs1.compare("rs241273")==0) {cout<<rs1<<" "<<rs2<<" "<<ns_total<<" "<<n_nb<<" "<<vec_n[ns_total]<<" "<<vec_n[ns_total+n_nb+1]<<" "<<nsamp1<<" "<<nsamp2<<" "<<var1<<" "<<var2<<" "<<cor<<" "<<d1<<" "<<d2<<" "<<d3<<" "<<mat_S[0][0]<<endl; flag_nan++;}
+	    if (mapRS2cat.size()!=0) {
+	      if (mapRS2cat.at(rs1)==mapRS2cat.at(rs2)) {
+		vec_qvar[mapRS2cat.at(rs1)]+=2*d3*d2;
+		mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=2*d1*d2;
+		mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=2*d2*d2/((double)n12*(double)n12);
+		if (crt==1) {
+		  mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ][bin]+=2*d1*d2;
+		}
+	      } else {
+		mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=d1*d2;
+		mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=d2*d2/((double)n12*(double)n12);
+		if (crt==1) {
+		  mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ][bin]+=d1*d2;
+		}
+	      }
+	    } else {
+	      vec_qvar[0]+=2*d3*d2;
+	      mat_S[0][0]+=2*d1*d2;
+	      mat_Svar[0][0]+=2*d2*d2/((double)n12*(double)n12);
+
+	      if (crt==1) {
+		mat3d_Sbin[0][0][bin]+=2*d1*d2;
+	      }
+	    }
+	    ns_pair++;
+	  }
+	}
+
+	ch_ptr=strtok (NULL, " , \t");
+	n_nb++;
+      }
+      ni_total=max(ni_total, n_total);
+      ns_test++;
+    }
+
+    ns_total++;
+  }
+
+  //use S_bin to fit a rational function y=1/(a+bx)^2, where x=seq(0.5,bin_size-0.5,by=1)
+  //and then compute a correlation factor as a percentage
+  double a, b, x, y, n, var_y, var_x, mean_y, mean_x, cov_xy, crt_factor;
+  if (crt==1) {
+    for (size_t i=0; i<S_mat->size1; i++) {
+      for (size_t j=i; j<S_mat->size2; j++) {
+
+	//correct mat_S
+	n=0; var_y=0; var_x=0; mean_y=0; mean_x=0; cov_xy=0;
+	for (size_t k=0; k<bin_size; k++) {
+	  if (j==i) {
+	    y=mat3d_Sbin[i][j][k];
+	  } else {
+	    y=mat3d_Sbin[i][j][k]+mat3d_Sbin[j][i][k];
+	  }
+	  x=k+0.5;
+	  cout<<y<<", ";
+	  if (y>0) {
+	    y=1/sqrt(y);
+	    mean_x+=x; mean_y+=y; var_x+=x*x; var_y+=y*y; cov_xy+=x*y;
+	    n++;
+	  }
+	}
+	cout<<endl;
+
+	if (n>=5) {
+	  mean_x/=n; mean_y/=n; var_x/=n; var_y/=n; cov_xy/=n;
+	  var_x-=mean_x*mean_x; var_y-=mean_y*mean_y; cov_xy-=mean_x*mean_y;
+	  b=cov_xy/var_x;
+	  a=mean_y-b*mean_x;
+	  crt_factor=a/(b*(bin_size+0.5))+1;
+	  if (i==j) {
+	    mat_S[i][j]*=crt_factor;
+	  } else {
+	    mat_S[i][j]*=crt_factor; mat_S[j][i]*=crt_factor;
+	  }
+	  cout<<crt_factor<<endl;
+	  //correct qvar
+	  if (i==j) {
+	    vec_qvar[i]*=crt_factor; //=vec_qvar[i]*crt_factor+(ns_test*ns_test-ns_pair*crt_factor)/pow(ni_total, 3.0);
+	  }
+	}
+      }
+    }
+  }
+
+  //save to gsl_vector and gsl_matrix: qvar_vec, S_mat, Svar_mat
+  for (size_t i=0; i<S_mat->size1; i++) {
+    d1=gsl_vector_get(qvar_vec, i)+2*vec_qvar[i];
+    gsl_vector_set(qvar_vec, i, d1);
+    for (size_t j=0; j<S_mat->size2; j++) {
+      if (i==j) {
+	gsl_matrix_set(S_mat, i, j, mat_S[i][i]);
+	gsl_matrix_set(Svar_mat, i, j, 2.0*mat_Svar[i][i]*ns_test*ns_test/(2.0*ns_pair) );
+      } else {
+	gsl_matrix_set(S_mat, i, j, mat_S[i][j]+mat_S[j][i]);
+	gsl_matrix_set(Svar_mat, i, j, 2.0*(mat_Svar[i][j]+mat_Svar[j][i])*ns_test*ns_test/(2.0*ns_pair) );
+      }
+    }
+  }
+
+
+
+  infile.clear();
+  infile.close();
+
+  return;
+}
+
+
+
+
+
+//Copied from lmm.cpp; used by the summary-statistics variance component code below.
+//Maps an unordered 1-based pair (a, b), each in 1..(n_cvt+2), to its position
+//in a row-wise packed upper triangle, i.e. an index between 0 and
+//[(n_cvt+2)^2+(n_cvt+2)]/2-1. (a, b) and (b, a) give the same index.
+//On out-of-range input an error is printed to stdout and 0 is returned.
+size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) {
+	const size_t dim=n_cvt+2;
+
+	const bool in_range=(a>=1 && b>=1 && a<=dim && b<=dim);
+	if (!in_range) {
+		cout<<"error in GetabIndex."<<endl;
+		return 0;
+	}
+
+	//order the pair so that lo<=hi
+	const size_t lo=(a<b) ? a : b;
+	const size_t hi=(a<b) ? b : a;
+
+	//(2*dim-lo+2)*(lo-1)/2 entries precede row lo; hi-lo is the offset inside it
+	return (2*dim-lo+2)*(lo-1)/2+hi-lo;
+}
+
+
+//Use the new method to calculate variance components with summary statistics.
+//Given the S matrix, a variance term for S, and the q vector, this routine
+//solves pve=S^{-1}q and derives sigma2 per snp, enrichment, and the standard
+//errors of all three.
+//
+//  Vq        : n_vc x n_vc matrix; Vq/df^2 is used as the variance of q
+//  S_mat     : n_vc x n_vc S matrix (not modified; a copy is inverted)
+//  Svar_mat  : n_vc x n_vc matrix; entry (i,j) is multiplied by pve_i*pve_j
+//              and added to the variance of q
+//  q_vec     : length n_vc vector q
+//  s_vec     : length n_vc vector; pve is divided by it to get sigma2 per snp
+//              (presumably the number of SNPs per category -- confirm with caller)
+//  df        : scale factor; Vq is divided by df*df
+//  v_pve, v_se_pve, v_sigma2, v_se_sigma2, v_enrich, v_se_enrich :
+//              outputs; each is cleared and refilled with n_vc entries
+//  pve_total, se_pve_total : outputs; sum of pve and its standard error
+//
+//All estimates are also printed to stdout.
+void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich) {
+  size_t n_vc=S_mat->size1;
+
+  gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *tmp_mat1=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *VarEnrich_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc);
+
+  gsl_vector *pve=gsl_vector_alloc (n_vc);
+  gsl_vector *sigma2persnp=gsl_vector_alloc (n_vc);
+  gsl_vector *enrich=gsl_vector_alloc (n_vc);
+  gsl_vector *se_pve=gsl_vector_alloc (n_vc);
+  gsl_vector *se_sigma2persnp=gsl_vector_alloc (n_vc);
+  gsl_vector *se_enrich=gsl_vector_alloc (n_vc);
+
+  double d;
+
+  //calculate S^{-1}; the decomposition works on a copy so S_mat stays intact
+  gsl_matrix_memcpy (tmp_mat, S_mat);
+  int sig;
+  gsl_permutation * pmt=gsl_permutation_alloc (n_vc);
+  LUDecomp (tmp_mat, pmt, &sig);
+  LUInvert (tmp_mat, pmt, Si_mat);
+  //the permutation is only needed for the inversion; release it right away
+  gsl_permutation_free(pmt);
+
+  //calculate sigma2snp and pve: pve=S^{-1}q, sigma2persnp=pve/s_vec
+  gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve);
+  gsl_vector_memcpy(sigma2persnp, pve);
+  gsl_vector_div(sigma2persnp, s_vec);
+
+  //get qvar_mat = Vq/df^2
+  gsl_matrix_memcpy (qvar_mat, Vq);
+  gsl_matrix_scale (qvar_mat, 1.0/(df*df));
+
+  //calculate variance for these estimates:
+  //Var(i,j) = Svar(i,j)*pve_i*pve_j + qvar(i,j), filled symmetrically
+  for (size_t i=0; i<n_vc; i++) {
+    for (size_t j=i; j<n_vc; j++) {
+      d=gsl_matrix_get(Svar_mat, i, j);
+      d*=gsl_vector_get(pve, i)*gsl_vector_get(pve, j);
+
+      d+=gsl_matrix_get(qvar_mat, i, j);
+      gsl_matrix_set(Var_mat, i, j, d);
+      if (i!=j) {gsl_matrix_set(Var_mat, j, i, d);}
+    }
+  }
+
+  //sandwich: Var_mat <- S^{-1} Var_mat S^{-1}
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, Var_mat, 0.0, tmp_mat);
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Si_mat, 0.0, Var_mat);
+
+  for (size_t i=0; i<n_vc; i++) {
+    d=sqrt(gsl_matrix_get(Var_mat, i, i));
+    gsl_vector_set(se_pve, i, d);
+    d/=gsl_vector_get(s_vec, i);
+    gsl_vector_set(se_sigma2persnp, i, d);
+  }
+
+  //compute pve_total, se_pve_total (sqrt of the sum of all entries of Var_mat)
+  pve_total=0; se_pve_total=0;
+  for (size_t i=0; i<n_vc; i++) {
+    pve_total+=gsl_vector_get(pve, i);
+
+    for (size_t j=0; j<n_vc; j++) {
+      se_pve_total+=gsl_matrix_get(Var_mat, i, j);
+    }
+  }
+  se_pve_total=sqrt(se_pve_total);
+
+  //compute enrichment and its variance
+  //enrich_i = sigma2persnp_i * sum(s_vec)/sum(pve)
+  double s_pve=0, s_snp=0;
+  for (size_t i=0; i<n_vc; i++) {
+    s_pve+=gsl_vector_get(pve, i);
+    s_snp+=gsl_vector_get(s_vec, i);
+  }
+  gsl_vector_memcpy (enrich, sigma2persnp);
+  gsl_vector_scale (enrich, s_snp/s_pve);
+
+  gsl_matrix_set_identity(tmp_mat);
+
+  //tmp_mat holds the derivative of enrich_i with respect to pve_j
+  double d1;
+  for (size_t i=0; i<n_vc; i++) {
+    d=gsl_vector_get(pve, i)/s_pve;
+    d1=gsl_vector_get(s_vec, i);
+    for (size_t j=0; j<n_vc; j++) {
+      if (i==j) {
+        gsl_matrix_set(tmp_mat, i, j, (1-d)/d1*s_snp/s_pve);
+      } else {
+        gsl_matrix_set(tmp_mat, i, j, -1*d/d1*s_snp/s_pve);
+      }
+    }
+  }
+  //VarEnrich = J Var J^T
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Var_mat, 0.0, tmp_mat1);
+  gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, tmp_mat1, tmp_mat, 0.0, VarEnrich_mat);
+
+  for (size_t i=0; i<n_vc; i++) {
+    d=sqrt(gsl_matrix_get(VarEnrich_mat, i, i));
+    gsl_vector_set(se_enrich, i, d);
+  }
+
+  cout<<"pve = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<gsl_vector_get(pve, i)<<" ";
+  }
+  cout<<endl;
+
+  cout<<"se(pve) = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<gsl_vector_get(se_pve, i)<<" ";
+  }
+  cout<<endl;
+
+  cout<<"sigma2 per snp = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<gsl_vector_get(sigma2persnp, i)<<" ";
+  }
+  cout<<endl;
+
+  cout<<"se(sigma2 per snp) = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<gsl_vector_get(se_sigma2persnp, i)<<" ";
+  }
+  cout<<endl;
+
+  cout<<"enrichment = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<gsl_vector_get(enrich, i)<<" ";
+  }
+  cout<<endl;
+
+  cout<<"se(enrichment) = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<gsl_vector_get(se_enrich, i)<<" ";
+  }
+  cout<<endl;
+
+  //save data
+  v_pve.clear(); v_se_pve.clear();
+  v_sigma2.clear(); v_se_sigma2.clear();
+  v_enrich.clear(); v_se_enrich.clear();
+  for (size_t i=0; i<n_vc; i++) {
+    d=gsl_vector_get(pve, i);
+    v_pve.push_back(d);
+    d=gsl_vector_get(se_pve, i);
+    v_se_pve.push_back(d);
+
+    d=gsl_vector_get(sigma2persnp, i);
+    v_sigma2.push_back(d);
+    d=gsl_vector_get(se_sigma2persnp, i);
+    v_se_sigma2.push_back(d);
+
+    d=gsl_vector_get(enrich, i);
+    v_enrich.push_back(d);
+    d=gsl_vector_get(se_enrich, i);
+    v_se_enrich.push_back(d);
+  }
+
+  //delete matrices
+  gsl_matrix_free(Si_mat);
+  gsl_matrix_free(Var_mat);
+  gsl_matrix_free(VarEnrich_mat);
+  gsl_matrix_free(tmp_mat);
+  gsl_matrix_free(tmp_mat1);
+  gsl_matrix_free(qvar_mat);
+
+  gsl_vector_free(pve);
+  gsl_vector_free(sigma2persnp);
+  gsl_vector_free(enrich);
+  gsl_vector_free(se_pve);
+  gsl_vector_free(se_sigma2persnp);
+  gsl_vector_free(se_enrich);
+
+  return;
+}
+
+
+//Moment-based variance component estimate: builds q and S from the kinship blocks, solves pve=S^{-1}q, and stores sigma2/pve with standard errors.
+//K holds n_vc blocks of n1 x n1 side by side (K is n1 x n1*n_vc); W is used for centering (presumably the covariate matrix -- confirm); y is the phenotype vector.
+//Fills v_sigma2/v_se_sigma2 (n_vc+1 entries, the last derived from 1-sum(pve)), v_pve/v_se_pve (n_vc entries), pve_total and se_pve_total; also prints them to stdout.
+//Ks are not scaled on input; each block is centered and scaled here on a private copy (K_scale).
+void VC::CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y)
+{
+  size_t n1=K->size1, n2=K->size2;
+  size_t n_vc=n2/n1; //number of n1 x n1 blocks stored in K
+
+  double r=(double)n1/(double)(n1 - W->size2); //n/(n-c), c = number of columns of W
+  double var_y, var_y_new;
+  double d, tr, s, v;
+  vector<double> traceG_new; //value returned by ScaleMatrix for each block
+
+  //new matrices/vectors
+  gsl_matrix *K_scale=gsl_matrix_alloc (n1, n2);
+  gsl_vector *y_scale=gsl_vector_alloc (n1);
+  gsl_matrix *Kry=gsl_matrix_alloc (n1, n_vc);
+  gsl_matrix *yKrKKry=gsl_matrix_alloc (n_vc, n_vc*(n_vc+1) ); //n_vc+1 blocks of n_vc x n_vc
+  gsl_vector *KKry=gsl_vector_alloc (n1);
+
+  //old matrices/vectors
+  gsl_vector *pve=gsl_vector_alloc (n_vc);
+  gsl_vector *se_pve=gsl_vector_alloc (n_vc); //allocated and freed, but not otherwise used in this function
+  gsl_vector *q_vec=gsl_vector_alloc (n_vc);
+  gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *S_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc);
+
+  //center and scale K by W; works block by block on the copy K_scale
+  for (size_t i=0; i<n_vc; i++) {
+    gsl_matrix_view Kscale_sub = gsl_matrix_submatrix (K_scale, 0, n1*i, n1, n1);
+    gsl_matrix_const_view K_sub = gsl_matrix_const_submatrix (K, 0, n1*i, n1, n1);
+    gsl_matrix_memcpy (&Kscale_sub.matrix, &K_sub.matrix);
+
+    CenterMatrix (&Kscale_sub.matrix, W);
+    d=ScaleMatrix (&Kscale_sub.matrix); //presumably the scaling factor (mean diagonal) -- confirm in mathfunc
+    traceG_new.push_back(d);
+  }
+
+  //center y by W, and standardize it to have variance 1 (t(y)%*%y/n=1)
+  gsl_vector_memcpy (y_scale, y);
+  CenterVector (y_scale, W);
+
+  var_y=VectorVar (y); //variance of the raw phenotype
+  var_y_new=VectorVar (y_scale); //variance after centering by W, before standardizing
+
+  StandardizeVector (y_scale);
+
+  //compute Kry, which is used for confidence interval; also compute q_vec (*n^2)
+  for (size_t i=0; i<n_vc; i++) {
+    gsl_matrix_const_view Kscale_sub = gsl_matrix_const_submatrix (K_scale, 0, n1*i, n1, n1);
+    gsl_vector_view Kry_col=gsl_matrix_column (Kry, i);
+
+    gsl_vector_memcpy (&Kry_col.vector, y_scale);
+    gsl_blas_dgemv(CblasNoTrans, 1.0, &Kscale_sub.matrix, y_scale, -1.0*r, &Kry_col.vector); //Kry_i = K_i*y - r*y
+
+    gsl_blas_ddot (&Kry_col.vector, y_scale, &d); //q_i = y'(K_i*y - r*y)
+    gsl_vector_set(q_vec, i, d);
+  }
+
+  //compute yKrKKry, which is used later for confidence interval
+  for (size_t i=0; i<n_vc; i++) {
+    gsl_vector_const_view Kry_coli=gsl_matrix_const_column (Kry, i);
+    for (size_t j=i; j<n_vc; j++) {
+      gsl_vector_const_view Kry_colj=gsl_matrix_const_column (Kry, j);
+      for (size_t l=0; l<n_vc; l++) {
+	gsl_matrix_const_view Kscale_sub = gsl_matrix_const_submatrix (K_scale, 0, n1*l, n1, n1);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, &Kscale_sub.matrix, &Kry_coli.vector, 0.0, KKry);
+	gsl_blas_ddot (&Kry_colj.vector, KKry, &d); //Kry_j' K_l Kry_i
+	gsl_matrix_set(yKrKKry, i, l*n_vc+j, d); //block l, entry (i,j)
+	if (i!=j) {gsl_matrix_set(yKrKKry, j, l*n_vc+i, d);}
+      }
+      gsl_blas_ddot (&Kry_coli.vector, &Kry_colj.vector, &d); //last block: Kry_i' Kry_j
+      gsl_matrix_set(yKrKKry, i, n_vc*n_vc+j, d);
+      if (i!=j) {gsl_matrix_set(yKrKKry, j, n_vc*n_vc+i, d);}
+    }
+  }
+
+  //compute Sij (*n^2): sum of column dot products of blocks i and j, minus r*n1
+  for (size_t i=0; i<n_vc; i++) {
+    for (size_t j=i; j<n_vc; j++) {
+      tr=0;
+      for (size_t l=0; l<n1; l++) {
+	gsl_vector_const_view Ki_col=gsl_matrix_const_column (K_scale, i*n1+l);
+	gsl_vector_const_view Kj_col=gsl_matrix_const_column (K_scale, j*n1+l);
+	gsl_blas_ddot (&Ki_col.vector, &Kj_col.vector, &d);
+	tr+=d;
+      }
+
+      tr=tr-r*(double)n1;
+      gsl_matrix_set (S_mat, i, j, tr);
+      if (i!=j) {gsl_matrix_set (S_mat, j, i, tr);}
+    }
+  }
+
+  /*
+  cout<<"q_vec = "<<endl;
+  for (size_t i=0; i<q_vec->size; i++) {
+    cout<<gsl_vector_get(q_vec, i)<<" ";
+  }
+  cout<<endl;
+
+  cout<<"S_mat = "<<endl;
+  for (size_t i=0; i<S_mat->size1; i++) {
+    for (size_t j=0; j<S_mat->size2; j++) {
+      cout<<gsl_matrix_get(S_mat, i, j)<<" ";
+    }
+    cout<<endl;
+  }
+  */
+
+  //compute S^{-1}q; NOTE(review): LUDecomp presumably factorizes S_mat in place, so S_mat should not be read as S after this point -- confirm
+  int sig;
+  gsl_permutation * pmt=gsl_permutation_alloc (n_vc);
+  LUDecomp (S_mat, pmt, &sig);
+  LUInvert (S_mat, pmt, Si_mat);
+
+  //compute pve (on the transformed scale)
+  gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve);
+
+  //compute q_var (*n^4): 2*( sum_i pve_i*block_i + (1-sum_i pve_i)*last block ) of yKrKKry
+  gsl_matrix_set_zero (qvar_mat);
+  s=1;
+  for (size_t i=0; i<n_vc; i++) {
+    d=gsl_vector_get(pve, i);
+    gsl_matrix_view yKrKKry_sub=gsl_matrix_submatrix(yKrKKry, 0, i*n_vc, n_vc, n_vc);
+    gsl_matrix_memcpy (tmp_mat, &yKrKKry_sub.matrix);
+    gsl_matrix_scale(tmp_mat, d);
+    gsl_matrix_add (qvar_mat, tmp_mat);
+    s-=d; //s ends as 1-sum(pve)
+  }
+  gsl_matrix_view yKrKKry_sub=gsl_matrix_submatrix(yKrKKry, 0, n_vc*n_vc, n_vc, n_vc);
+  gsl_matrix_memcpy (tmp_mat, &yKrKKry_sub.matrix);
+  gsl_matrix_scale(tmp_mat, s);
+  gsl_matrix_add (qvar_mat, tmp_mat);
+
+  gsl_matrix_scale(qvar_mat, 2.0);
+
+  //compute S^{-1}var_qS^{-1}
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, qvar_mat, 0.0, tmp_mat);
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Si_mat, 0.0, Var_mat);
+
+  //transform pve back to the original scale and save data
+  v_pve.clear(); v_se_pve.clear();
+  v_sigma2.clear(); v_se_sigma2.clear();
+
+  s=1.0, v=0, pve_total=0, se_pve_total=0;
+  for (size_t i=0; i<n_vc; i++) {
+    d=gsl_vector_get (pve, i);
+    //cout<<var_y<<" "<<var_y_new<<" "<<v_traceG[i]<<" "<<traceG_new[i]<<endl;
+    v_sigma2.push_back(d*var_y_new/traceG_new[i]);
+    v_pve.push_back(d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y)); //rescale with v_traceG (set elsewhere in VC) and the raw variance of y
+    s-=d;
+    pve_total+=d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y);
+
+    d=sqrt(gsl_matrix_get (Var_mat, i, i));
+    v_se_sigma2.push_back(d*var_y_new/traceG_new[i]);
+    v_se_pve.push_back(d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y));
+
+    //d*=sqrt(var_y/v_traceG[i]-v_sigma2[i]);
+    //v_se_pve.push_back(d/var_y);
+
+    for (size_t j=0; j<n_vc; j++) {
+      v+=gsl_matrix_get(Var_mat, i, j); //v accumulates the sum of all entries of Var_mat
+      se_pve_total+=gsl_matrix_get(Var_mat, i, j)*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y)*(var_y_new/traceG_new[j])*(v_traceG[j]/var_y);
+    }
+  }
+  v_sigma2.push_back(s*r*var_y_new); //extra (n_vc+1)-th entry from the remainder 1-sum(pve)
+  v_se_sigma2.push_back(sqrt(v)*r*var_y_new);
+  se_pve_total=sqrt(se_pve_total);
+
+  cout<<"sigma2 = ";
+  for (size_t i=0; i<n_vc+1; i++) {
+    cout<<v_sigma2[i]<<" ";
+  }
+  cout<<endl;
+
+  cout<<"se(sigma2) = ";
+  for (size_t i=0; i<n_vc+1; i++) {
+    cout<<v_se_sigma2[i]<<" ";
+  }
+  cout<<endl;
+
+  cout<<"pve = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<v_pve[i]<<" ";
+  }
+  cout<<endl;
+
+  cout<<"se(pve) = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<v_se_pve[i]<<" ";
+  }
+  cout<<endl;
+
+  if (n_vc>1) {
+    cout<<"total pve = "<<pve_total<<endl;
+    cout<<"se(total pve) = "<<se_pve_total<<endl;
+  }
+
+  gsl_permutation_free(pmt);
+  gsl_matrix_free(K_scale);
+  gsl_vector_free(y_scale);
+  gsl_matrix_free(Kry);
+  gsl_matrix_free(yKrKKry);
+  gsl_vector_free(KKry);
+
+  //old matrices/vectors
+  gsl_vector_free(pve);
+  gsl_vector_free(se_pve);
+  gsl_vector_free(q_vec);
+  gsl_matrix_free(qvar_mat);
+  gsl_matrix_free(tmp_mat);
+  gsl_matrix_free(S_mat);
+  gsl_matrix_free(Si_mat);
+  gsl_matrix_free(Var_mat);
+
+  return;
+}
+
+
+
+
+//reml for log(sigma2) based on the AI algorithm
+void VC::CalcVCreml (bool noconstrain, const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y)
 {
   size_t n1=K->size1, n2=K->size2;
   size_t n_vc=n2/n1;
   gsl_vector *log_sigma2=gsl_vector_alloc (n_vc+1);
   double d, s;
 
+  /*
+  //compare eigenlib vs lapack
+  //dgemm
+  gsl_matrix *K2=gsl_matrix_alloc(K->size1, K->size1);
+
+  clock_t time_start=clock();
+  gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, K, K, 0.0, K2);
+  cout<<"standard time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    for (size_t j=0; j<2; j++) {
+      cout<<gsl_matrix_get(K2, i, j)<<" ";
+    }
+    cout<<endl;
+  }
+
+  time_start=clock();
+  lapack_dgemm ((char *)"N", (char *)"T", 1.0, K, K, 0.0, K2);
+  cout<<"lapack time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    for (size_t j=0; j<2; j++) {
+      cout<<gsl_matrix_get(K2, i, j)<<" ";
+    }
+    cout<<endl;
+  }
+
+  time_start=clock();
+  eigenlib_dgemm((char *)"N", (char *)"T", 1.0, K, K, 0.0, K2);
+  cout<<"eigenlib time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    for (size_t j=0; j<2; j++) {
+      cout<<gsl_matrix_get(K2, i, j)<<" ";
+    }
+    cout<<endl;
+  }
+
+  //dgemv
+  gsl_vector_const_view W_col=gsl_matrix_const_column (K, 0);
+  gsl_vector *v=gsl_vector_alloc (K->size1);
+  time_start=clock();
+  for (size_t i=0; i<1000; i++) {
+    gsl_blas_dgemv(CblasNoTrans, 1.0, K2, &W_col.vector, 0.0, v);
+  }
+  cout<<"standard time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    cout<<gsl_vector_get(v, i)<<endl;
+  }
+
+  time_start=clock();
+  for (size_t i=0; i<1000; i++) {
+    eigenlib_dgemv((char *)"N", 1.0, K2, &W_col.vector, 0.0, v);
+  }
+  cout<<"eigenlib time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    cout<<gsl_vector_get(v, i)<<endl;
+  }
+
+  //eigen
+  gsl_matrix *K2copy=gsl_matrix_alloc(K->size1, K->size1);
+  gsl_matrix *K3=gsl_matrix_alloc(K->size1, K->size1);
+
+  gsl_matrix_memcpy(K2copy, K2);
+  time_start=clock();
+  EigenDecomp(K2copy, K3, v, 0);
+  cout<<"standard time 0: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    cout<<gsl_vector_get(v, i)<<endl;
+  }
+
+  gsl_matrix_memcpy(K2copy, K2);
+  time_start=clock();
+  EigenDecomp(K2copy, K3, v, 1);
+  cout<<"standard time 1: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    cout<<gsl_vector_get(v, i)<<endl;
+  }
+
+  gsl_matrix_memcpy(K2copy, K2);
+  time_start=clock();
+  eigenlib_eigensymm(K2copy, K3, v);
+  cout<<"eigenlib time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    cout<<gsl_vector_get(v, i)<<endl;
+  }
+
+
+
+  //invert
+  gsl_matrix_memcpy(K2copy, K2);
+  time_start=clock();
+  int sigcopy;
+  gsl_permutation * pmt1=gsl_permutation_alloc (K2->size1);
+  LUDecomp (K2copy, pmt1, &sigcopy);
+  LUInvert (K2copy, pmt1, K3);
+  gsl_permutation_free(pmt1);
+  cout<<"standard time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    for (size_t j=0; j<2; j++) {
+      cout<<gsl_matrix_get(K3, i, j)<<" ";
+    }
+    cout<<endl;
+  }
+
+  gsl_matrix_memcpy(K2copy, K2);
+  time_start=clock();
+  eigenlib_invert(K2copy);
+  cout<<"eigen time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl;
+  for (size_t i=0; i<2; i++) {
+    for (size_t j=0; j<2; j++) {
+      cout<<gsl_matrix_get(K2copy, i, j)<<" ";
+    }
+    cout<<endl;
+  }
+  */
+
   //set up params
   gsl_matrix *P=gsl_matrix_alloc (n1, n1);
   gsl_vector *Py=gsl_vector_alloc (n1);
@@ -318,18 +1648,26 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector
   gsl_vector *dev1=gsl_vector_alloc (n_vc+1);
   gsl_matrix *dev2=gsl_matrix_alloc (n_vc+1, n_vc+1);
   gsl_matrix *Hessian=gsl_matrix_alloc (n_vc+1, n_vc+1);
-  VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian};
+  VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian, noconstrain};
 
   //initialize sigma2/log_sigma2
+  CalcVChe (K, W, y);
+
   gsl_blas_ddot (y, y, &s);
   s/=(double)n1;
   for (size_t i=0; i<n_vc+1; i++) {
+    if (noconstrain) {
+      d=v_sigma2[i];
+    } else {
+      if (v_sigma2[i]<=0) {d=log(0.1);} else {d=log(v_sigma2[i]);}
+    }
+    /*
     if (i==n_vc) {
       d=s/((double)n_vc+1.0);
     } else {
       d=s/( ((double)n_vc+1.0)*v_traceG[i]);
     }
-
+    */
     gsl_vector_set (log_sigma2, i, d);
   }
   //  gsl_vector_set (log_sigma2, 0, 0.38);
@@ -338,7 +1676,11 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector
   cout<<"iteration "<<0<<endl;
   cout<<"sigma2 = ";
   for (size_t i=0; i<n_vc+1; i++) {
-    cout<<exp(gsl_vector_get(log_sigma2, i))<<" ";
+    if (noconstrain) {
+      cout<<gsl_vector_get(log_sigma2, i)<<" ";
+    } else {
+      cout<<exp(gsl_vector_get(log_sigma2, i))<<" ";
+    }
   }
   cout<<endl;
 
@@ -349,15 +1691,15 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector
   FDF.f=&LogRL_dev1;
   FDF.df=&LogRL_dev2;
   FDF.fdf=&LogRL_dev12;
-  
-  //set up solver 	
+
+  //set up solver
   int status;
   int iter=0, max_iter=100;
 
   const gsl_multiroot_fdfsolver_type *T_fdf;
   gsl_multiroot_fdfsolver *s_fdf;
   T_fdf=gsl_multiroot_fdfsolver_hybridsj;
-  s_fdf=gsl_multiroot_fdfsolver_alloc (T_fdf, n_vc+1);	
+  s_fdf=gsl_multiroot_fdfsolver_alloc (T_fdf, n_vc+1);
 
   gsl_multiroot_fdfsolver_set (s_fdf, &FDF, log_sigma2);
 
@@ -370,37 +1712,55 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector
     cout<<"iteration "<<iter<<endl;
     cout<<"sigma2 = ";
     for (size_t i=0; i<n_vc+1; i++) {
-      cout<<exp(gsl_vector_get(s_fdf->x, i))<<" ";
+      if (noconstrain) {
+	cout<<gsl_vector_get(s_fdf->x, i)<<" ";
+      } else {
+	cout<<exp(gsl_vector_get(s_fdf->x, i))<<" ";
+      }
     }
     cout<<endl;
+    /*
     cout<<"derivatives = ";
     for (size_t i=0; i<n_vc+1; i++) {
       cout<<gsl_vector_get(s_fdf->f, i)<<" ";
     }
     cout<<endl;
-
-    status=gsl_multiroot_test_residual (s_fdf->f, 1e-3);		
+    */
+    status=gsl_multiroot_test_residual (s_fdf->f, 1e-3);
   }
-  while (status==GSL_CONTINUE && iter<max_iter); 
-
-  //obtain Hessian inverse
-  int sig=LogRL_dev12 (s_fdf->f, &params, dev1, dev2);
+  while (status==GSL_CONTINUE && iter<max_iter);
+
+  //obtain Hessian and Hessian inverse
+  int sig=LogRL_dev12 (s_fdf->x, &params, dev1, dev2);
+  /*
+  for (size_t i=0; i<dev2->size1; i++) {
+    for (size_t j=0; j<dev2->size2; j++) {
+      cout<<gsl_matrix_get (dev2, i, j)<<" ";
+    }
+    cout<<endl;
+  }
+  */
 
   gsl_permutation * pmt=gsl_permutation_alloc (n_vc+1);
-  LUDecomp (dev2, pmt, &sig);	
+  LUDecomp (dev2, pmt, &sig);
   LUInvert (dev2, pmt, Hessian);
   gsl_permutation_free(pmt);
 
-  //save data
-  v_sigma2.clear(); 
+  //save sigma2 and se_sigma2
+  v_sigma2.clear(); v_se_sigma2.clear();
   for (size_t i=0; i<n_vc+1; i++) {
-    d=exp(gsl_vector_get(s_fdf->x, i));
+    if (noconstrain) {
+      d=gsl_vector_get(s_fdf->x, i);
+    } else {
+      d=exp(gsl_vector_get(s_fdf->x, i));
+    }
     v_sigma2.push_back(d);
-  }
 
-  v_se_sigma2.clear();
-  for (size_t i=0; i<n_vc+1; i++) {
-    d=-1.0*v_sigma2[i]*v_sigma2[i]*gsl_matrix_get(Hessian, i, i);
+    if (noconstrain) {
+      d=-1.0*gsl_matrix_get(Hessian, i, i);
+    } else {
+      d=-1.0*d*d*gsl_matrix_get(Hessian, i, i);
+    }
     v_se_sigma2.push_back(sqrt(d));
   }
 
@@ -409,20 +1769,80 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector
     s+=v_traceG[i]*v_sigma2[i];
   }
   s+=v_sigma2[n_vc];
-  
-  v_pve.clear();
+
+  //compute pve
+  v_pve.clear(); pve_total=0;
   for (size_t i=0; i<n_vc; i++) {
     d=v_traceG[i]*v_sigma2[i]/s;
     v_pve.push_back(d);
+    pve_total+=d;
   }
 
-  v_se_pve.clear();
-  for (size_t i=0; i<n_vc; i++) {
-    d=v_traceG[i]*(s-v_sigma2[i]*v_traceG[i])/(s*s)*v_se_sigma2[i]*v_se_sigma2[i];
-    v_se_pve.push_back(sqrt(d) );
+  //compute se_pve; k=n_vc+1: total
+  double d1, d2;
+  v_se_pve.clear(); se_pve_total=0;
+  for (size_t k=0; k<n_vc+1; k++) {
+    d=0;
+    for (size_t i=0; i<n_vc+1; i++) {
+      if (noconstrain) {
+	d1=gsl_vector_get(s_fdf->x, i);
+	d1=1;
+      } else {
+	d1=exp(gsl_vector_get(s_fdf->x, i));
+      }
+
+      if (k<n_vc) {
+	if (i==k) {
+	  d1*=v_traceG[k]*(s-v_sigma2[k]*v_traceG[k])/(s*s);
+	} else if (i==n_vc) {
+	  d1*=-1*v_traceG[k]*v_sigma2[k]/(s*s);
+	} else {
+	  d1*=-1*v_traceG[i]*v_traceG[k]*v_sigma2[k]/(s*s);
+	}
+      } else {
+	if (i==k) {
+	  d1*=-1*(s-v_sigma2[n_vc])/(s*s);
+	} else {
+	  d1*=v_traceG[i]*v_sigma2[n_vc]/(s*s);
+	}
+      }
+
+      for (size_t j=0; j<n_vc+1; j++) {
+	if (noconstrain) {
+	  d2=gsl_vector_get(s_fdf->x, j);
+	  d2=1;
+	} else {
+	  d2=exp(gsl_vector_get(s_fdf->x, j));
+	}
+
+	if (k<n_vc) {
+	  if (j==k) {
+	    d2*=v_traceG[k]*(s-v_sigma2[k]*v_traceG[k])/(s*s);
+	  } else if (j==n_vc) {
+	    d2*=-1*v_traceG[k]*v_sigma2[k]/(s*s);
+	  } else {
+	    d2*=-1*v_traceG[j]*v_traceG[k]*v_sigma2[k]/(s*s);
+	  }
+	} else {
+	  if (j==k) {
+	    d2*=-1*(s-v_sigma2[n_vc])/(s*s);
+	  } else {
+	    d2*=v_traceG[j]*v_sigma2[n_vc]/(s*s);
+	  }
+	}
+
+	d+=-1.0*d1*d2*gsl_matrix_get(Hessian, i, j);
+      }
+    }
+
+    if (k<n_vc) {
+      v_se_pve.push_back(sqrt(d) );
+    } else {
+      se_pve_total=sqrt(d);
+    }
   }
-  
-  gsl_multiroot_fdfsolver_free(s_fdf);	
+
+  gsl_multiroot_fdfsolver_free(s_fdf);
 
   gsl_vector_free(log_sigma2);
   gsl_matrix_free(P);
@@ -437,7 +1857,643 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector
 }
 
 
-	
 
 
 
+
+//read bimbam mean genotype file and compute XWz
+//
+//For every SNP flagged in indicator_snp, the genotypes of the individuals
+//flagged in indicator_idv are read, missing values ("NA") are replaced by the
+//SNP mean, the vector is centered, and (w*z)[k]/sd(genotype) times that
+//vector is added to column vec_cat[k] of XWz. k starts at ns_test and grows
+//by one per analyzed SNP (ns_test is a by-value parameter, so the caller's
+//counter is not updated).
+//
+//Returns false if the file cannot be opened or a line holds fewer genotype
+//fields than needed; true otherwise.
+bool BimbamXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz)
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	//ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;}
+
+	string line;
+	char *ch_ptr;
+
+	size_t n_miss;
+	double d, geno_mean, geno_var;
+
+	size_t ni_test=XWz->size1;
+	gsl_vector *geno=gsl_vector_alloc (ni_test);
+	gsl_vector *geno_miss=gsl_vector_alloc (ni_test);
+	//wz = w*z element-wise, one entry per analyzed SNP
+	gsl_vector *wz=gsl_vector_alloc (w->size);
+	gsl_vector_memcpy (wz, z);
+	gsl_vector_mul(wz, w);
+
+	bool ok=true;
+
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		//a line is consumed for every SNP, including skipped ones
+		safeGetline(infile, line);
+		if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+
+		//skip the three leading fields that precede the genotypes
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+
+		geno_mean=0.0; n_miss=0; geno_var=0.0;
+		//geno_miss[j]==0 marks a missing genotype for kept individual j
+		gsl_vector_set_all(geno_miss, 0);
+
+		size_t j=0;
+		for (size_t i=0; i<indicator_idv.size(); ++i) {
+			//every individual owns one column in the file, so its token is
+			//consumed even when the individual is excluded; this mirrors
+			//PlinkXwz, which also steps over excluded individuals' slots
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==0) {continue;}
+
+			//strcmp on a NULL token would crash on a truncated line
+			if (ch_ptr==NULL) {
+				cout<<"error! too few genotypes on line "<<t+1<<" of file: "<<file_geno<<endl;
+				ok=false;
+				break;
+			}
+
+			//geno and geno_miss hold kept individuals only: index with j, not i
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_miss, j, 0); n_miss++;}
+			else {
+				d=atof(ch_ptr);
+				gsl_vector_set (geno, j, d);
+				gsl_vector_set (geno_miss, j, 1);
+				geno_mean+=d;
+				geno_var+=d*d;
+			}
+			j++;
+		}
+		if (!ok) {break;}
+
+		//mean over observed values; variance as if missing values equal the mean
+		geno_mean/=(double)(ni_test-n_miss);
+		geno_var+=geno_mean*geno_mean*(double)n_miss;
+		geno_var/=(double)ni_test;
+		geno_var-=geno_mean*geno_mean;
+//		geno_var=geno_mean*(1-geno_mean*0.5);
+
+		//impute missing genotypes with the mean, then center
+		for (size_t i=0; i<ni_test; ++i) {
+			if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);}
+		}
+
+		gsl_vector_add_constant (geno, -1.0*geno_mean);
+
+		//accumulate the standardized genotype, weighted by wz, into its category column
+		gsl_vector_view XWz_col=gsl_matrix_column(XWz, vec_cat[ns_test]);
+		d=gsl_vector_get (wz, ns_test);
+		gsl_blas_daxpy (d/sqrt(geno_var), geno, &XWz_col.vector);
+
+		ns_test++;
+	}
+
+	cout<<endl;
+
+	gsl_vector_free (geno);
+	gsl_vector_free (geno_miss);
+	gsl_vector_free (wz);
+
+	infile.close();
+	infile.clear();
+
+	return ok;
+}
+
+
+
+
+
+
+//read plink bed file and compute XWz
+//
+//For every SNP flagged in indicator_snp, the genotypes of the individuals
+//flagged in indicator_idv are decoded from the bed file, missing calls are
+//replaced by the SNP mean, the vector is centered, and
+//(w*z)[k]/sd(genotype) times that vector is added to column vec_cat[k] of
+//XWz. k starts at ns_test and grows by one per analyzed SNP.
+//Returns false only when the bed file cannot be opened.
+bool PlinkXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz)
+{
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {
+		cout<<"error reading bed file:"<<file_bed<<endl;
+		return false;
+	}
+
+	const size_t ni_test=XWz->size1;
+	const size_t ni_total=indicator_idv.size();
+	const size_t n_snp=indicator_snp.size();
+
+	//number of bytes per snp: four genotypes are packed into one byte
+	const int n_bit=(ni_total%4==0) ? ni_total/4 : ni_total/4+1;
+
+	gsl_vector *geno=gsl_vector_alloc (ni_test);
+	gsl_vector *wz=gsl_vector_alloc (w->size);
+	gsl_vector_memcpy (wz, z);
+	gsl_vector_mul(wz, w);
+
+	char ch[1];
+	bitset<8> b;
+
+	//step over the three leading magic bytes
+	for (int k=0; k<3; ++k) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+
+	for (size_t t=0; t<n_snp; ++t) {
+		if (t%display_pace==0 || t==(n_snp-1)) {ProgressBar ("Reading SNPs  ", t, n_snp-1);}
+		if (indicator_snp[t]==0) {continue;}
+
+		//jump to the record of snp t (3 = number of magic bytes)
+		infile.seekg(t*n_bit+3);
+
+		//decode genotypes, accumulating sum and sum of squares of observed calls
+		double geno_sum=0.0, geno_sumsq=0.0;
+		size_t n_miss=0, ci_total=0, ci_test=0;
+		for (int byte_idx=0; byte_idx<n_bit; ++byte_idx) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {
+				//the last byte may be only partly filled
+				if ((byte_idx==(n_bit-1)) && ci_total==ni_total) {break;}
+				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
+
+				const bool bit_lo=b[2*j];
+				const bool bit_hi=b[2*j+1];
+
+				if (!bit_lo && !bit_hi) {
+					//minor allele homozygous
+					gsl_vector_set(geno, ci_test, 2.0);
+					geno_sum+=2.0;
+					geno_sumsq+=4.0;
+				} else if (!bit_lo && bit_hi) {
+					gsl_vector_set(geno, ci_test, 1.0);
+					geno_sum+=1.0;
+					geno_sumsq+=1.0;
+				} else if (bit_lo && bit_hi) {
+					//major allele homozygous
+					gsl_vector_set(geno, ci_test, 0.0);
+				} else {
+					//missing call, flagged with -9 until imputed below
+					gsl_vector_set(geno, ci_test, -9.0);
+					n_miss++;
+				}
+
+				ci_test++;
+				ci_total++;
+			}
+		}
+
+		//mean over observed calls; variance as if missing calls equal the mean
+		const double snp_mean=geno_sum/(double)(ni_test-n_miss);
+		double snp_var=geno_sumsq+snp_mean*snp_mean*(double)n_miss;
+		snp_var/=(double)ni_test;
+		snp_var-=snp_mean*snp_mean;
+
+		//impute missing calls with the mean, then center
+		for (size_t i=0; i<ni_test; ++i) {
+			const double g=gsl_vector_get(geno,i);
+			if (g==-9.0) {gsl_vector_set(geno, i, snp_mean);}
+		}
+		gsl_vector_add_constant (geno, -1.0*snp_mean);
+
+		//accumulate the standardized genotype, weighted by wz, into its category column
+		gsl_vector_view XWz_col=gsl_matrix_column(XWz, vec_cat[ns_test]);
+		const double wz_k=gsl_vector_get (wz, ns_test);
+		gsl_blas_daxpy (wz_k/sqrt(snp_var), geno, &XWz_col.vector);
+
+		ns_test++;
+	}
+	cout<<endl;
+
+	gsl_vector_free (geno);
+	gsl_vector_free (wz);
+
+	infile.close();
+	infile.clear();
+
+	return true;
+}
+
+
+
+//read multiple genotype files and compute XWz
+//
+//file_mfile lists one genotype file (prefix) per line. With mfile_mode==1
+//each entry gets ".bed" appended and is read by PlinkXwz; otherwise it is
+//read by BimbamXwz. The l-th listed file uses mindicator_snp[l]. XWz is
+//zeroed first and then accumulated over all files.
+//
+//Returns false if the list cannot be opened, if it names more files than
+//there are entries in mindicator_snp, or if any per-file reader fails.
+bool MFILEXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, gsl_matrix *XWz)
+{
+  gsl_matrix_set_zero(XWz);
+
+  igzstream infile (file_mfile.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false;}
+
+  string file_name;
+  size_t l=0, ns_test=0;
+  bool ok=true;
+
+  while (ok && !safeGetline(infile, file_name).eof()) {
+    //each listed file needs its own SNP indicator vector
+    if (l>=mindicator_snp.size()) {
+      cout<<"error! more genotype files listed in "<<file_mfile<<" than snp indicator vectors."<<endl;
+      ok=false;
+      break;
+    }
+
+    if (mfile_mode==1) {
+      file_name+=".bed";
+      ok=PlinkXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], vec_cat, w, z, ns_test, XWz);
+    } else {
+      ok=BimbamXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], vec_cat, w, z, ns_test, XWz);
+    }
+
+    //the readers take ns_test by value, so the running offset into w, z and
+    //vec_cat is advanced here: one step per analyzed snp of this file
+    for (size_t t=0; t<mindicator_snp[l].size(); ++t) {
+      if (mindicator_snp[l][t]!=0) {ns_test++;}
+    }
+
+    l++;
+  }
+
+
+  infile.close();
+  infile.clear();
+
+  return ok;
+}
+
+
+
+
+
+
+//read bimbam mean genotype file and compute X_i^TX_jWz
+/*
+ * Read a BIMBAM mean genotype file and fill rows of XtXWz.
+ *
+ * For every SNP t with indicator_snp[t]!=0 the genotypes of the analysed
+ * individuals (indicator_idv[i]!=0) are read, missing values ("NA") are
+ * replaced by the SNP mean, the vector is centered, and for each column c of
+ * XWz the entry XtXWz(row, c) is set to (geno . XWz[,c])/sqrt(geno_var).
+ * Rows are written starting at ns_test; ns_test is taken by value, so the
+ * caller's counter is not advanced.
+ *
+ * Returns false if the file cannot be opened or a SNP line has fewer
+ * genotype fields than indicator_idv has entries; true otherwise.
+ */
+bool BimbamXtXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz)
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+	//ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;}
+
+	string line;
+	char *ch_ptr;
+
+	size_t n_miss;
+	double d, geno_mean, geno_var;
+
+	size_t ni_test=XWz->size1;
+	gsl_vector *geno=gsl_vector_alloc (ni_test);
+	gsl_vector *geno_miss=gsl_vector_alloc (ni_test);
+
+	bool ok=true;
+
+	for (size_t t=0; t<indicator_snp.size(); ++t) {
+		//one line per SNP; the line is consumed even when the SNP is skipped
+		safeGetline(infile, line);
+		if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+
+		//skip the three leading columns that precede the genotypes
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");
+
+		geno_mean=0.0; n_miss=0; geno_var=0.0;
+		gsl_vector_set_all(geno_miss, 0);
+
+		//i runs over all individuals in the file, j over the analysed ones
+		size_t j=0;
+		for (size_t i=0; i<indicator_idv.size(); ++i) {
+			//always consume the column of individual i, so that excluded
+			//individuals do not shift the genotypes of the following ones
+			ch_ptr=strtok (NULL, " , \t");
+			if (ch_ptr==NULL) {
+				cout<<"error! too few genotype fields at line "<<t+1<<" of file: "<<file_geno<<endl;
+				ok=false;
+				break;
+			}
+			if (indicator_idv[i]==0) {continue;}
+
+			//geno and geno_miss have length ni_test, hence index j (not i)
+			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_miss, j, 0); n_miss++;}
+			else {
+				d=atof(ch_ptr);
+				gsl_vector_set (geno, j, d);
+				gsl_vector_set (geno_miss, j, 1);
+				geno_mean+=d;
+				geno_var+=d*d;
+			}
+			j++;
+		}
+		if (!ok) {break;}
+
+		//mean over observed values; variance after mean imputation:
+		//(sum d^2 + n_miss*mean^2)/ni_test - mean^2
+		geno_mean/=(double)(ni_test-n_miss);
+		geno_var+=geno_mean*geno_mean*(double)n_miss;
+		geno_var/=(double)ni_test;
+		geno_var-=geno_mean*geno_mean;
+//		geno_var=geno_mean*(1-geno_mean*0.5);
+
+		//impute missing genotypes with the mean
+		for (size_t i=0; i<ni_test; ++i) {
+			if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);}
+		}
+
+		//center
+		gsl_vector_add_constant (geno, -1.0*geno_mean);
+
+		//row ns_test of XtXWz: standardized genotype times each column of XWz
+		for (size_t i=0; i<XWz->size2; i++) {
+		  gsl_vector_const_view XWz_col=gsl_matrix_const_column(XWz, i);
+		  gsl_blas_ddot (geno, &XWz_col.vector, &d);
+		  gsl_matrix_set (XtXWz, ns_test, i, d/sqrt(geno_var));
+		}
+
+		ns_test++;
+	}
+
+	cout<<endl;
+
+	gsl_vector_free (geno);
+	gsl_vector_free (geno_miss);
+
+	infile.close();
+	infile.clear();
+
+	return ok;
+}
+
+
+
+
+
+
+//read plink bed file and compute X_i^TX_jWz
+/*
+ * Read a PLINK bed file and fill rows of XtXWz.
+ *
+ * For every SNP t with indicator_snp[t]!=0 the genotypes of the analysed
+ * individuals (indicator_idv[i]!=0) are decoded, missing genotypes are
+ * replaced by the SNP mean, the vector is centered, and for each column c of
+ * XWz the entry XtXWz(row, c) is set to (geno . XWz[,c])/sqrt(geno_var).
+ * Rows are written starting at ns_test; ns_test is taken by value, so the
+ * caller's counter is not advanced.
+ *
+ * Returns false if the file cannot be opened or ends before all required
+ * bytes could be read; true otherwise.
+ */
+bool PlinkXtXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz)
+{
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;}
+
+	char ch[1];
+	bitset<8> b;
+
+	size_t n_miss, ci_total, ci_test;
+	double d, geno_mean, geno_var;
+
+	size_t ni_test=XWz->size1;
+	size_t ni_total=indicator_idv.size();
+	gsl_vector *geno=gsl_vector_alloc (ni_test);
+
+	int n_bit;
+	bool ok=true;
+
+	//n_bit is the number of bytes per SNP: four individuals per byte, rounded up
+	if (ni_total%4==0) {n_bit=ni_total/4;}
+	else {n_bit=ni_total/4+1; }
+
+	//read past the first three magic numbers
+	for (int i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}
+	if (!infile) {
+		cout<<"error! bed file is too short: "<<file_bed<<endl;
+		ok=false;
+	}
+
+	for (size_t t=0; ok && t<indicator_snp.size(); ++t) {
+		if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+		if (indicator_snp[t]==0) {continue;}
+
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+
+		//read genotypes
+		geno_mean=0.0;	n_miss=0; ci_total=0; geno_var=0.0; ci_test=0;
+		for (int i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			//a failed read would otherwise silently reuse the previous byte
+			if (!infile) {ok=false; break;}
+			b=ch[0];
+			//bit pair (b[2j], b[2j+1]): 00 -> 2.0, 01 -> 1.0, 11 -> 0.0, 10 -> missing
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				//the last byte may be only partly used
+				if ((i==(n_bit-1)) && ci_total==ni_total) {break;}
+				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
+
+				if (b[2*j]==0) {
+					if (b[2*j+1]==0) {gsl_vector_set(geno, ci_test, 2.0); geno_mean+=2.0; geno_var+=4.0; }
+					else {gsl_vector_set(geno, ci_test, 1.0); geno_mean+=1.0; geno_var+=1.0;}
+				}
+				else {
+					if (b[2*j+1]==1) {gsl_vector_set(geno, ci_test, 0.0); }
+					else {gsl_vector_set(geno, ci_test, -9.0); n_miss++; }
+				}
+
+				ci_test++;
+				ci_total++;
+			}
+		}
+		if (!ok) {
+			cout<<"error! unexpected end of bed file: "<<file_bed<<endl;
+			break;
+		}
+
+		//mean over observed values; variance after mean imputation:
+		//(sum g^2 + n_miss*mean^2)/ni_test - mean^2
+		geno_mean/=(double)(ni_test-n_miss);
+		geno_var+=geno_mean*geno_mean*(double)n_miss;
+		geno_var/=(double)ni_test;
+		geno_var-=geno_mean*geno_mean;
+//		geno_var=geno_mean*(1-geno_mean*0.5);
+
+		//impute missing genotypes (coded -9.0 above) with the mean
+		for (size_t i=0; i<ni_test; ++i) {
+			d=gsl_vector_get(geno,i);
+			if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);}
+		}
+
+		//center
+		gsl_vector_add_constant (geno, -1.0*geno_mean);
+
+		//row ns_test of XtXWz: standardized genotype times each column of XWz
+		for (size_t i=0; i<XWz->size2; i++) {
+		  gsl_vector_const_view XWz_col=gsl_matrix_const_column(XWz, i);
+		  gsl_blas_ddot (geno, &XWz_col.vector, &d);
+		  gsl_matrix_set (XtXWz, ns_test, i, d/sqrt(geno_var));
+		}
+
+		ns_test++;
+    }
+	cout<<endl;
+
+	gsl_vector_free (geno);
+
+	infile.close();
+	infile.clear();
+
+	return ok;
+}
+
+
+
+//read multiple genotype files and compute X_i^TX_jWz
+/*
+ * Read the genotype files listed in file_mfile (one name per line) and fill
+ * XtXWz from all of them.
+ *
+ * mfile_mode==1 : each listed name is a PLINK prefix; ".bed" is appended and
+ *                 PlinkXtXwz is called.
+ * otherwise     : each listed name is passed to BimbamXtXwz as is.
+ *
+ * mindicator_snp[l] is the SNP indicator vector for the l-th listed file.
+ * XtXWz is zeroed on entry. ns_test is the first row of XtXWz written by the
+ * next file: the per-file readers receive it by value, so it is advanced
+ * here by the number of non-zero indicator entries of each processed file.
+ *
+ * Returns false if the list cannot be opened, if it names more files than
+ * there are indicator vectors, or if a per-file reader fails.
+ */
+bool MFILEXtXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const gsl_matrix *XWz, gsl_matrix *XtXWz)
+{
+  gsl_matrix_set_zero(XtXWz);
+
+  igzstream infile (file_mfile.c_str(), igzstream::in);
+  if (!infile) {cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false;}
+
+  string file_name;
+  size_t l=0, ns_test=0;
+  bool ok=true;
+
+  while (!safeGetline(infile, file_name).eof()) {
+    //every listed file needs its own SNP indicator vector
+    if (l>=mindicator_snp.size()) {
+      cout<<"error! more genotype files listed in "<<file_mfile<<" than SNP indicator vectors."<<endl;
+      ok=false;
+      break;
+    }
+
+    if (mfile_mode==1) {
+      file_name+=".bed";
+      ok=PlinkXtXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], XWz, ns_test, XtXWz);
+    } else {
+      ok=BimbamXtXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], XWz, ns_test, XtXWz);
+    }
+    if (!ok) {break;}
+
+    //the readers take ns_test by value; advance the row offset by the number
+    //of SNPs they wrote so the next file does not overwrite these rows
+    for (size_t t=0; t<mindicator_snp[l].size(); ++t) {
+      if (mindicator_snp[l][t]!=0) {ns_test++;}
+    }
+
+    l++;
+  }
+
+  infile.close();
+  infile.clear();
+
+  return ok;
+}
+
+
+//compute confidence intervals from summary statistics
+/*
+ * Compute standard errors for the variance component estimates.
+ *
+ * Inputs (read only): Xz, XWz, XtXWz, S_mat, Svar_mat, w, z, s_vec, vec_cat
+ * (category index of each SNP, used to index the n_vc-length vectors) and
+ * v_pve (one entry per variance component).
+ *
+ * Outputs:
+ *   v_se_pve, v_se_sigma2, v_se_enrich : standard errors, one per component
+ *   v_sigma2, v_enrich                 : v_pve[i]/s_vec[i] and that value
+ *                                        times sum(s_vec)/sum(v_pve)
+ *   pve_total, se_pve_total            : sum of v_pve and the square root of
+ *                                        the sum of all entries of the
+ *                                        estimate covariance matrix
+ * All output vectors are cleared before being filled. The results are also
+ * printed to cout.
+ */
+void CalcCIss(const gsl_matrix *Xz, const gsl_matrix *XWz, const gsl_matrix *XtXWz, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *w, const gsl_vector *z, const gsl_vector *s_vec, const vector<size_t> &vec_cat, const vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich) {
+  size_t n_vc=XWz->size2, ns_test=w->size, ni_test=XWz->size1;
+
+  //set up matrices
+  gsl_vector *w_pve=gsl_vector_alloc (ns_test);
+  gsl_vector *wz=gsl_vector_alloc (ns_test);
+  gsl_vector *zwz=gsl_vector_alloc (n_vc);
+  gsl_vector *zz=gsl_vector_alloc (n_vc);
+  gsl_vector *Xz_pve=gsl_vector_alloc (ni_test);
+  gsl_vector *WXtXWz=gsl_vector_alloc (ns_test);
+
+  gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *tmp_mat1=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *VarEnrich_mat=gsl_matrix_alloc (n_vc, n_vc);
+  gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc);
+
+  double d, s0, s1, s, s_pve, s_snp;
+
+  //compute wz (element-wise w*z) and the per-category sums zwz and zz
+  gsl_vector_memcpy (wz, z);
+  gsl_vector_mul (wz, w);
+
+  gsl_vector_set_zero (zwz);
+  gsl_vector_set_zero (zz);
+  for (size_t i=0; i<w->size; i++) {
+    d=gsl_vector_get (wz, i)*gsl_vector_get (z, i);
+    d+=gsl_vector_get (zwz, vec_cat[i]);
+    gsl_vector_set (zwz, vec_cat[i], d);
+
+    d=gsl_vector_get (z, i)*gsl_vector_get (z, i);
+    d+=gsl_vector_get (zz, vec_cat[i]);
+    gsl_vector_set (zz, vec_cat[i], d);
+  }
+
+  //compute s_pve (sum of pve), s_snp (sum of s_vec) and
+  //Xz_pve = sum_i pve_i/s_i * Xz[,i]
+  gsl_vector_set_zero (Xz_pve); s_pve=0; s_snp=0;
+  for (size_t i=0; i<n_vc; i++) {
+    s_pve+=v_pve[i];
+    s_snp+=gsl_vector_get(s_vec, i);
+
+    gsl_vector_const_view Xz_col=gsl_matrix_const_column (Xz, i);
+    gsl_blas_daxpy (v_pve[i]/gsl_vector_get(s_vec, i), &Xz_col.vector, Xz_pve);
+  }
+
+  //set up wpve vector: per-SNP pve/s of the SNP's category
+  for (size_t i=0; i<w->size; i++) {
+    d=v_pve[vec_cat[i]]/gsl_vector_get(s_vec, vec_cat[i]);
+    gsl_vector_set (w_pve, i, d);
+  }
+
+  //compute Vq (in qvar_mat)
+  s0=1-s_pve;
+  for (size_t i=0; i<n_vc; i++) {
+    s0+=gsl_vector_get (zz, i)*v_pve[i]/gsl_vector_get(s_vec, i);
+  }
+
+  for (size_t i=0; i<n_vc; i++) {
+    //terms that depend on row i only
+    s1=s0;
+    s1-=gsl_vector_get (zwz, i)*(1-s_pve)/gsl_vector_get(s_vec, i);
+
+    gsl_vector_const_view XWz_col1=gsl_matrix_const_column (XWz, i);
+    gsl_vector_const_view XtXWz_col1=gsl_matrix_const_column (XtXWz, i);
+
+    gsl_vector_memcpy (WXtXWz, &XtXWz_col1.vector);
+    gsl_vector_mul (WXtXWz, w_pve);
+
+    gsl_blas_ddot (Xz_pve, &XWz_col1.vector, &d);
+    s1-=d/gsl_vector_get(s_vec, i);
+
+    for (size_t j=0; j<n_vc; j++) {
+      s=s1;
+
+      s-=gsl_vector_get (zwz, j)*(1-s_pve)/gsl_vector_get(s_vec, j);
+
+      gsl_vector_const_view XWz_col2=gsl_matrix_const_column (XWz, j);
+      gsl_vector_const_view XtXWz_col2=gsl_matrix_const_column (XtXWz, j);
+
+      gsl_blas_ddot (WXtXWz, &XtXWz_col2.vector, &d);
+      s+=d/(gsl_vector_get(s_vec, i)*gsl_vector_get(s_vec, j));
+
+      gsl_blas_ddot (&XWz_col1.vector, &XWz_col2.vector, &d);
+      s+=d/(gsl_vector_get(s_vec, i)*gsl_vector_get(s_vec, j))*(1-s_pve);
+
+      gsl_blas_ddot (Xz_pve, &XWz_col2.vector, &d);
+      s-=d/gsl_vector_get(s_vec, j);
+
+      gsl_matrix_set (qvar_mat, i, j, s);
+    }
+
+  }
+
+  d=(double)(ni_test-1);
+  gsl_matrix_scale (qvar_mat, 2.0/(d*d*d));
+
+  //cout<<scientific<<gsl_matrix_get(qvar_mat, 0, 0)<<endl;
+
+  //calculate S^{-1}
+  gsl_matrix_memcpy (tmp_mat, S_mat);
+  int sig;
+  gsl_permutation * pmt=gsl_permutation_alloc (n_vc);
+  LUDecomp (tmp_mat, pmt, &sig);
+  LUInvert (tmp_mat, pmt, Si_mat);
+
+  //calculate variance for the estimates:
+  //Var_mat = Svar .* (pve pve^T) + Vq, filled symmetrically
+  for (size_t i=0; i<n_vc; i++) {
+    for (size_t j=i; j<n_vc; j++) {
+      d=gsl_matrix_get(Svar_mat, i, j);
+      d*=v_pve[i]*v_pve[j];
+      //cout<<d<<" ";
+
+      d+=gsl_matrix_get(qvar_mat, i, j);
+      gsl_matrix_set(Var_mat, i, j, d);
+      if (i!=j) {gsl_matrix_set(Var_mat, j, i, d);}
+    }
+    //cout<<endl;
+  }
+
+  //Var_mat <- S^{-1} Var_mat S^{-1}
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, Var_mat, 0.0, tmp_mat);
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Si_mat, 0.0, Var_mat);
+
+  //compute sigma2 per snp, enrich
+  v_sigma2.clear(); v_enrich.clear();
+  for (size_t i=0; i<n_vc; i++) {
+    v_sigma2.push_back(v_pve[i]/gsl_vector_get(s_vec, i) );
+    v_enrich.push_back(v_pve[i]/gsl_vector_get(s_vec, i)*s_snp/s_pve);
+  }
+
+  //compute se_pve, se_sigma2
+  //clear first: the print loops below index these vectors from 0
+  v_se_pve.clear(); v_se_sigma2.clear();
+  for (size_t i=0; i<n_vc; i++) {
+    d=sqrt(gsl_matrix_get(Var_mat, i, i));
+    v_se_pve.push_back(d);
+    v_se_sigma2.push_back(d/gsl_vector_get(s_vec, i));
+  }
+
+  //compute pve_total, se_pve_total
+  pve_total=0;
+  for (size_t i=0; i<n_vc; i++) {
+    pve_total+=v_pve[i];
+  }
+
+  se_pve_total=0;
+  for (size_t i=0; i<n_vc; i++) {
+    for (size_t j=0; j<n_vc; j++) {
+      se_pve_total+=gsl_matrix_get(Var_mat, i, j);
+    }
+  }
+  se_pve_total=sqrt(se_pve_total);
+
+  //compute se_enrich: VarEnrich = J Var_mat J^T, with J stored in tmp_mat
+  gsl_matrix_set_identity(tmp_mat);
+
+  double d1;
+  for (size_t i=0; i<n_vc; i++) {
+    d=v_pve[i]/s_pve;
+    d1=gsl_vector_get(s_vec, i);
+    for (size_t j=0; j<n_vc; j++) {
+      if (i==j) {
+        gsl_matrix_set(tmp_mat, i, j, (1-d)/d1*s_snp/s_pve);
+      } else {
+        gsl_matrix_set(tmp_mat, i, j, -1*d/d1*s_snp/s_pve);
+      }
+    }
+  }
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Var_mat, 0.0, tmp_mat1);
+  gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, tmp_mat1, tmp_mat, 0.0, VarEnrich_mat);
+
+  v_se_enrich.clear();
+  for (size_t i=0; i<n_vc; i++) {
+    d=sqrt(gsl_matrix_get(VarEnrich_mat, i, i));
+    v_se_enrich.push_back(d);
+  }
+
+  cout<<"pve = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<v_pve[i]<<" ";
+  }
+  cout<<endl;
+
+  cout<<"se(pve) = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<v_se_pve[i]<<" ";
+  }
+  cout<<endl;
+
+  cout<<"sigma2 per snp = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<v_sigma2[i]<<" ";
+  }
+  cout<<endl;
+
+  cout<<"se(sigma2 per snp) = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<v_se_sigma2[i]<<" ";
+  }
+  cout<<endl;
+
+  cout<<"enrichment = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<v_enrich[i]<<" ";
+  }
+  cout<<endl;
+
+  cout<<"se(enrichment) = ";
+  for (size_t i=0; i<n_vc; i++) {
+    cout<<v_se_enrich[i]<<" ";
+  }
+  cout<<endl;
+
+  //delete matrices
+  gsl_matrix_free(Si_mat);
+  gsl_matrix_free(Var_mat);
+  gsl_matrix_free(VarEnrich_mat);
+  gsl_matrix_free(tmp_mat);
+  gsl_matrix_free(tmp_mat1);
+  gsl_matrix_free(qvar_mat);
+
+  gsl_vector_free(w_pve);
+  gsl_vector_free(wz);
+  gsl_vector_free(zwz);
+  gsl_vector_free(zz);
+  gsl_vector_free(WXtXWz);
+  gsl_vector_free(Xz_pve);
+
+  //the permutation used for the LU decomposition of S
+  gsl_permutation_free(pmt);
+
+  return;
+}
diff --git a/src/vc.h b/src/vc.h
index f34d72e..d4a9779 100644
--- a/src/vc.h
+++ b/src/vc.h
@@ -16,7 +16,7 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-#ifndef __VC_H__                
+#ifndef __VC_H__
 #define __VC_H__
 
 #include "gsl/gsl_vector.h"
@@ -38,7 +38,7 @@ using namespace std;
 class VC_PARAM
 {
 
-public:	
+public:
 	const gsl_matrix *K;
 	const gsl_matrix *W;
 	const gsl_vector *y;
@@ -47,18 +47,34 @@ public:
 	gsl_matrix *KPy_mat;
 	gsl_matrix *PKPy_mat;
 	gsl_matrix *Hessian;
+	bool noconstrain;
 };
 
 
 
 
+
 class VC {
 
 public:
 	// IO related parameters
+  size_t a_mode;
+	string file_cat;
+	string file_beta;
+	string file_cor;
+	string file_mq;
+	string file_ms;
+
 	string file_out;
 	string path_out;
 
+	set<string> setSnps;
+
+	size_t ni_total_ref, ns_total_ref, ns_pair;
+	size_t ni_total, ns_total, ns_test;
+	size_t n_vc;
+
+	double pve_total, se_pve_total;
 	vector<double> v_sigma2;
 	vector<double> v_se_sigma2;
 	vector<double> v_pve;
@@ -67,16 +83,33 @@ public:
 	vector<double> v_beta;
 	vector<double> v_se_beta;
 
+	size_t crt;
+	double window_cm, window_bp, window_ns;
+
 	double time_UtX;
 	double time_opt;
-	
+
 	// Main functions
 	void CopyFromParam (PARAM &cPar);
 	void CopyToParam (PARAM &cPar);
+	void WriteFile_qs (const gsl_vector *s_vec, const gsl_vector *q_vec, const gsl_vector *qvar_vec, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat);
 	void CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y);
-	void CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y);
+	void CalcVCreml (const bool noconstrain, const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y);
 };
 
+//NOTE(review): presumably the summary-statistics counterpart of the VC estimators; the definition is not next to these prototypes -- confirm there.
+void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich);
+
+
+//Genotype readers that accumulate into XWz (BIMBAM file, PLINK bed file, or a list of such files).
+//ns_test is passed by value to the single-file readers; MFILEXwz zeroes XWz before reading.
+bool BimbamXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz);
+bool PlinkXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz);
+bool MFILEXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, gsl_matrix *XWz);
+
+//Genotype readers that fill XtXWz: one row per analysed SNP, one column per column of XWz.
+//ns_test is the first row written and is passed by value; MFILEXtXwz zeroes XtXWz before reading.
+bool BimbamXtXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz);
+bool PlinkXtXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz);
+bool MFILEXtXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const gsl_matrix *XWz, gsl_matrix *XtXWz);
+
+//Computes standard errors for pve, per-SNP sigma2 and enrichment from the matrices above, and prints them to cout.
+void CalcCIss(const gsl_matrix *Xz, const gsl_matrix *XWz, const gsl_matrix *XtXWz, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *w, const gsl_vector *z, const gsl_vector *s_vec, const vector<size_t> &vec_cat, const vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich);
+
+
 #endif