diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/gemma.cpp | 1840 | ||||
-rw-r--r-- | src/io.cpp | 1224 | ||||
-rw-r--r-- | src/io.h | 24 | ||||
-rw-r--r-- | src/lm.cpp | 24 | ||||
-rw-r--r-- | src/lmm.cpp | 267 | ||||
-rw-r--r-- | src/mathfunc.cpp | 18 | ||||
-rw-r--r-- | src/mvlmm.cpp | 451 | ||||
-rw-r--r-- | src/param.cpp | 878 | ||||
-rw-r--r-- | src/param.h | 42 | ||||
-rw-r--r-- | src/vc.cpp | 2240 | ||||
-rw-r--r-- | src/vc.h | 41 |
11 files changed, 5786 insertions, 1263 deletions
diff --git a/src/gemma.cpp b/src/gemma.cpp index b8693a8..3b9fe29 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -39,9 +39,11 @@ #include "vc_float.h" #include "lm_float.h" //for LM class #include "bslmm_float.h" //for BSLMM class +#include "ldr_float.h" //for LDR class #include "lmm_float.h" //for LMM class, and functions CalcLambda, CalcPve, CalcVgVe #include "mvlmm_float.h" //for MVLMM class #include "prdt_float.h" //for PRDT class +#include "varcov_float.h" //for MVLMM class #include "mathfunc_float.h" //for a few functions #else #include "io.h" @@ -49,9 +51,11 @@ #include "vc.h" #include "lm.h" #include "bslmm.h" +#include "ldr.h" #include "lmm.h" #include "mvlmm.h" #include "prdt.h" +#include "varcov.h" #include "mathfunc.h" #endif @@ -60,26 +64,23 @@ using namespace std; -GEMMA::GEMMA(void): -version("0.95alpha"), date("08/08/2014"), year("2011") +GEMMA::GEMMA(void): +version("0.95alpha"), date("07/11/2015"), year("2011") {} void GEMMA::PrintHeader (void) { cout<<endl; cout<<"*********************************************************"<<endl; - cout<<" Genome-wide Efficient Mixed Model Association (GEMMA) "<<endl; + cout<<" Genome-wide Efficient Mixed Model Association (GEMMA) "<<endl; cout<<" Version "<<version<<", "<<date<<" "<<endl; - cout<<" Visit "<<endl; - cout<<" http://stephenslab.uchicago.edu/software.html "<<endl; - cout<<" http://home.uchicago.edu/~xz7/software.html "<<endl; - cout<<" For Possible Updates "<<endl; + cout<<" Visit http://www.xzlab.org/software.html For Updates "<<endl; cout<<" (C) "<<year<<" Xiang Zhou "<<endl; - cout<<" GNU General Public License "<<endl; - cout<<" For Help, Type ./gemma -h "<<endl; + cout<<" GNU General Public License "<<endl; + cout<<" For Help, Type ./gemma -h "<<endl; cout<<"*********************************************************"<<endl; cout<<endl; - + return; } @@ -89,13 +90,13 @@ void GEMMA::PrintLicense (void) cout<<endl; cout<<"The Software Is Distributed Under GNU General Public License, But May Also Require The Following Notifications."<<endl; cout<<endl; - + cout<<"Including Lapack Routines In The Software May Require The Following Notification:"<<endl; cout<<"Copyright (c) 1992-2010 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved."<<endl; cout<<"Copyright (c) 2000-2010 The University of California Berkeley. All rights reserved."<<endl; - cout<<"Copyright (c) 2006-2010 The University of Colorado Denver. All rights reserved."<<endl; + cout<<"Copyright (c) 2006-2010 The University of Colorado Denver. All rights reserved."<<endl; cout<<endl; - + cout<<"$COPYRIGHT$"<<endl; cout<<"Additional copyrights may follow"<<endl; cout<<"$HEADER$"<<endl; @@ -113,9 +114,9 @@ void GEMMA::PrintLicense (void) <<"THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE " <<"OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."<<endl; cout<<endl; - - - + + + return; } @@ -124,9 +125,9 @@ void GEMMA::PrintLicense (void) void GEMMA::PrintHelp(size_t option) { if (option==0) { - cout<<endl; + cout<<endl; cout<<" GEMMA version "<<version<<", released on "<<date<<endl; - cout<<" implemented by Xiang Zhou"<<endl; + cout<<" implemented by Xiang Zhou"<<endl; cout<<endl; cout<<" type ./gemma -h [num] for detailed helps"<<endl; cout<<" options: " << endl; @@ -135,72 +136,116 @@ void GEMMA::PrintHelp(size_t option) cout<<" 3: SNP QC"<<endl; cout<<" 4: calculate relatedness matrix"<<endl; cout<<" 5: perform eigen decomposition"<<endl; - cout<<" 6: perform variance component estiamtion"<<endl; + cout<<" 6: perform variance component estimation"<<endl; cout<<" 7: fit a linear model"<<endl; cout<<" 8: fit a linear mixed model"<<endl; cout<<" 9: fit a multivariate linear mixed model"<<endl; cout<<" 10: fit a Bayesian sparse linear mixed model"<<endl; cout<<" 11: obtain predicted values"<<endl; - cout<<" 12: note"<<endl; + cout<<" 12: calculate snp variance covariance"<<endl; + cout<<" 13: note"<<endl; cout<<endl; - } - + } + if (option==1) { cout<<" QUICK GUIDE" << endl; cout<<" to generate a relatedness matrix: "<<endl; cout<<" ./gemma -bfile [prefix] -gk [num] -o [prefix]"<<endl; cout<<" ./gemma -g [filename] -p [filename] -gk [num] -o [prefix]"<<endl; + cout<<" to generate the S matrix: "<<endl; + cout<<" ./gemma -bfile [prefix] -gs -o [prefix]"<<endl; + cout<<" ./gemma -p [filename] -g [filename] -gs -o [prefix]"<<endl; + cout<<" ./gemma -bfile [prefix] -cat [filename] -gs -o [prefix]"<<endl; + cout<<" ./gemma -p [filename] -g [filename] -cat [filename] -gs -o [prefix]"<<endl; + cout<<" ./gemma -bfile [prefix] -sample [num] -gs -o [prefix]"<<endl; + cout<<" ./gemma -p [filename] -g [filename] -sample [num] -gs -o [prefix]"<<endl; + cout<<" to generate the q vector: "<<endl; + cout<<" ./gemma -beta [filename] -gq -o [prefix]"<<endl; + cout<<" ./gemma -beta [filename] -cat [filename] -gq -o [prefix]"<<endl; + cout<<" to generate the ldsc weigthts: "<<endl; + cout<<" ./gemma -beta [filename] -gw -o [prefix]"<<endl; + cout<<" ./gemma -beta [filename] -cat [filename] -gw -o [prefix]"<<endl; cout<<" to perform eigen decomposition of the relatedness matrix: "<<endl; cout<<" ./gemma -bfile [prefix] -k [filename] -eigen -o [prefix]"<<endl; cout<<" ./gemma -g [filename] -p [filename] -k [filename] -eigen -o [prefix]"<<endl; cout<<" to estimate variance components: "<<endl; - cout<<" ./gemma -bfile [prefix] -k [filename] -vc -o [prefix]"<<endl; - cout<<" ./gemma -p [filename] -k [filename] -vc -o [prefix]"<<endl; - cout<<" ./gemma -bfile [prefix] -mk [filename] -vc -o [prefix]"<<endl; - cout<<" ./gemma -p [filename] -mk [filename] -vc -o [prefix]"<<endl; + cout<<" ./gemma -bfile [prefix] -k [filename] -vc [num] -o [prefix]"<<endl; + cout<<" ./gemma -p [filename] -k [filename] -vc [num] -o [prefix]"<<endl; + cout<<" ./gemma -bfile [prefix] -mk [filename] -vc [num] -o [prefix]"<<endl; + cout<<" ./gemma -p [filename] -mk [filename] -vc [num] -o [prefix]"<<endl; + cout<<" ./gemma -beta [filename] -cor [filename] -vc [num] -o [prefix]"<<endl; + cout<<" ./gemma -beta [filename] -cor [filename] -cat [filename] -vc [num] -o [prefix]"<<endl; + cout<<" options for the above two commands: -crt -windowbp [num]"<<endl; + cout<<" ./gemma -mq [filename] -ms [filename] -mv [filename] -vc [num] -o [prefix]"<<endl; + cout<<" or with summary statistics, replace bfile with mbfile, or g or mg; vc=1 for HE weights and vc=2 for LDSC weights"<<endl; + cout<<" ./gemma -beta [filename] -bfile [filename] -cat [filename] -wsnp [filename] -wcat [filename] -vc [num] -o [prefix]"<<endl; + cout<<" ./gemma -beta [filename] -bfile [filename] -cat [filename] -wsnp [filename] -wcat [filename] -ci [num] -o [prefix]"<<endl; cout<<" to fit a linear mixed model: "<<endl; cout<<" ./gemma -bfile [prefix] -k [filename] -lmm [num] -o [prefix]"<<endl; - cout<<" ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -o [prefix]"<<endl; + cout<<" ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -o [prefix]"<<endl; + cout<<" to fit a linear mixed model to test g by e effects: "<<endl; + cout<<" ./gemma -bfile [prefix] -gxe [filename] -k [filename] -lmm [num] -o [prefix]"<<endl; + cout<<" ./gemma -g [filename] -p [filename] -a [filename] -gxe [filename] -k [filename] -lmm [num] -o [prefix]"<<endl; + cout<<" to fit a univariate linear mixed model with different residual weights for different individuals: "<<endl; + cout<<" ./gemma -bfile [prefix] -weight [filename] -k [filename] -lmm [num] -o [prefix]"<<endl; + cout<<" ./gemma -g [filename] -p [filename] -a [filename] -weight [filename] -k [filename] -lmm [num] -o [prefix]"<<endl; cout<<" to fit a multivariate linear mixed model: "<<endl; cout<<" ./gemma -bfile [prefix] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl; - cout<<" ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl; + cout<<" ./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -n [num1] [num2] -o [prefix]"<<endl; cout<<" to fit a Bayesian sparse linear mixed model: "<<endl; cout<<" ./gemma -bfile [prefix] -bslmm [num] -o [prefix]"<<endl; cout<<" ./gemma -g [filename] -p [filename] -a [filename] -bslmm [num] -o [prefix]"<<endl; cout<<" to obtain predicted values: "<<endl; cout<<" ./gemma -bfile [prefix] -epm [filename] -emu [filename] -ebv [filename] -k [filename] -predict [num] -o [prefix]"<<endl; cout<<" ./gemma -g [filename] -p [filename] -epm [filename] -emu [filename] -ebv [filename] -k [filename] -predict [num] -o [prefix]"<<endl; + cout<<" to calculate correlations between SNPs: "<<endl; + cout<<" ./gemma -bfile [prefix] -calccor -o [prefix]"<<endl; + cout<<" ./gemma -g [filename] -p [filename] -calccor -o [prefix]"<<endl; cout<<endl; } - + if (option==2) { cout<<" FILE I/O RELATED OPTIONS" << endl; - cout<<" -bfile [prefix] "<<" specify input PLINK binary ped file prefix."<<endl; - cout<<" requires: *.fam, *.bim and *.bed files"<<endl; + cout<<" -bfile [prefix] "<<" specify input PLINK binary ped file prefix."<<endl; + cout<<" requires: *.fam, *.bim and *.bed files"<<endl; cout<<" missing value: -9"<<endl; cout<<" -g [filename] "<<" specify input BIMBAM mean genotype file name"<<endl; - cout<<" format: rs#1, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl; - cout<<" rs#2, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl; - cout<<" ..."<<endl; - cout<<" missing value: NA"<<endl; + cout<<" format: rs#1, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl; + cout<<" rs#2, allele0, allele1, genotype for individual 1, genotype for individual 2, ..."<<endl; + cout<<" ..."<<endl; + cout<<" missing value: NA"<<endl; cout<<" -p [filename] "<<" specify input BIMBAM phenotype file name"<<endl; - cout<<" format: phenotype for individual 1"<<endl; - cout<<" phenotype for individual 2"<<endl; + cout<<" format: phenotype for individual 1"<<endl; + cout<<" phenotype for individual 2"<<endl; + cout<<" ..."<<endl; + cout<<" missing value: NA"<<endl; + cout<<" -a [filename] "<<" specify input BIMBAM SNP annotation file name (optional)"<<endl; + cout<<" format: rs#1, base_position, chr_number"<<endl; + cout<<" rs#2, base_position, chr_number"<<endl; cout<<" ..."<<endl; - cout<<" missing value: NA"<<endl; - cout<<" -a [filename] "<<" specify input BIMBAM SNP annotation file name (optional)"<<endl; - cout<<" format: rs#1, base_position, chr_number"<<endl; - cout<<" rs#2, base_position, chr_number"<<endl; + // WJA added + cout<<" -oxford [prefix] "<<" specify input Oxford genotype bgen file prefix."<<endl; + cout<<" requires: *.bgen, *.sample files"<<endl; + + cout<<" -gxe [filename] "<<" specify input file that contains a column of environmental factor for g by e tests"<<endl; + cout<<" format: variable for individual 1"<<endl; + cout<<" variable for individual 2"<<endl; cout<<" ..."<<endl; - cout<<" -k [filename] "<<" specify input kinship/relatedness matrix file name"<<endl; - cout<<" -mk [filename] "<<" specify input file which contains a list of kinship/relatedness matrices"<<endl; - cout<<" -u [filename] "<<" specify input file containing the eigen vectors of the kinship/relatedness matrix"<<endl; - cout<<" -d [filename] "<<" specify input file containing the eigen values of the kinship/relatedness matrix"<<endl; - cout<<" -c [filename] "<<" specify input covariates file name (optional)"<<endl; - cout<<" format: covariate 1 for individual 1, ... , covariate c for individual 1"<<endl; - cout<<" covariate 1 for individual 2, ... , covariate c for individual 2"<<endl; + cout<<" missing value: NA"<<endl; + cout<<" -widv [filename] "<<" specify input file that contains a column of residual weights"<<endl; + cout<<" format: variable for individual 1"<<endl; + cout<<" variable for individual 2"<<endl; cout<<" ..."<<endl; - cout<<" missing value: NA"<<endl; + cout<<" missing value: NA"<<endl; + cout<<" -k [filename] "<<" specify input kinship/relatedness matrix file name"<<endl; + cout<<" -mk [filename] "<<" specify input file which contains a list of kinship/relatedness matrices"<<endl; + cout<<" -u [filename] "<<" specify input file containing the eigen vectors of the kinship/relatedness matrix"<<endl; + cout<<" -d [filename] "<<" specify input file containing the eigen values of the kinship/relatedness matrix"<<endl; + cout<<" -c [filename] "<<" specify input covariates file name (optional)"<<endl; + cout<<" -cat [filename] "<<" specify input category file name (optional), which contains rs cat1 cat2 ..."<<endl; + cout<<" -beta [filename] "<<" specify input beta file name (optional), which contains rs beta se_beta n_total (or n_mis and n_obs) estimates from a lm model"<<endl; + cout<<" -cor [filename] "<<" specify input correlation file name (optional), which contains rs window_size correlations from snps"<<endl; + cout<<" missing value: NA"<<endl; cout<<" note: the intercept (a column of 1s) may need to be included"<<endl; cout<<" -epm [filename] "<<" specify input estimated parameter file name"<<endl; cout<<" -en [n1] [n2] [n3] [n4] "<<" specify values for the input estimated parameter file (with a header)"<<endl; @@ -210,74 +255,81 @@ void GEMMA::PrintHelp(size_t option) cout<<" n4: estimated gamma column number (0 to ignore)"<<endl; cout<<" default: 2 4 5 6 if -ebv is not specified; 2 0 5 6 if -ebv is specified"<<endl; cout<<" -ebv [filename] "<<" specify input estimated random effect (breeding value) file name"<<endl; - cout<<" format: value for individual 1"<<endl; - cout<<" value for individual 2"<<endl; + cout<<" format: value for individual 1"<<endl; + cout<<" value for individual 2"<<endl; cout<<" ..."<<endl; - cout<<" missing value: NA"<<endl; + cout<<" missing value: NA"<<endl; cout<<" -emu [filename] "<<" specify input log file name containing estimated mean"<<endl; cout<<" -mu [num] "<<" specify input estimated mean value"<<endl; cout<<" -gene [filename] "<<" specify input gene expression file name"<<endl; - cout<<" format: header"<<endl; - cout<<" gene1, count for individual 1, count for individual 2, ..."<<endl; - cout<<" gene2, count for individual 1, count for individual 2, ..."<<endl; + cout<<" format: header"<<endl; + cout<<" gene1, count for individual 1, count for individual 2, ..."<<endl; + cout<<" gene2, count for individual 1, count for individual 2, ..."<<endl; cout<<" ..."<<endl; - cout<<" missing value: not allowed"<<endl; + cout<<" missing value: not allowed"<<endl; cout<<" -r [filename] "<<" specify input total read count file name"<<endl; - cout<<" format: total read count for individual 1"<<endl; - cout<<" total read count for individual 2"<<endl; + cout<<" format: total read count for individual 1"<<endl; + cout<<" total read count for individual 2"<<endl; cout<<" ..."<<endl; - cout<<" missing value: NA"<<endl; + cout<<" missing value: NA"<<endl; cout<<" -snps [filename] "<<" specify input snps file name to only analyze a certain set of snps"<<endl; - cout<<" format: rs#1"<<endl; - cout<<" rs#2"<<endl; + cout<<" format: rs#1"<<endl; + cout<<" rs#2"<<endl; cout<<" ..."<<endl; - cout<<" missing value: NA"<<endl; + cout<<" missing value: NA"<<endl; cout<<" -silence "<<" silent terminal display"<<endl; cout<<" -km [num] "<<" specify input kinship/relatedness file type (default 1)."<<endl; cout<<" options: 1: \"n by n matrix\" format"<<endl; cout<<" 2: \"id id value\" format"<<endl; - cout<<" -n [num] "<<" specify phenotype column in the phenotype/*.fam file (optional; default 1)"<<endl; + cout<<" -n [num] "<<" specify phenotype column in the phenotype/*.fam file (optional; default 1)"<<endl; cout<<" -pace [num] "<<" specify terminal display update pace (default 100000 SNPs or 100000 iterations)."<<endl; - cout<<" -outdir [path] "<<" specify output directory path (default \"./output/\")"<<endl; - cout<<" -o [prefix] "<<" specify output file prefix (default \"result\")"<<endl; - cout<<" output: prefix.cXX.txt or prefix.sXX.txt from kinship/relatedness matrix estimation"<<endl; - cout<<" output: prefix.assoc.txt and prefix.log.txt form association tests"<<endl; + cout<<" -outdir [path] "<<" specify output directory path (default \"./output/\")"<<endl; + cout<<" -o [prefix] "<<" specify output file prefix (default \"result\")"<<endl; + cout<<" output: prefix.cXX.txt or prefix.sXX.txt from kinship/relatedness matrix estimation"<<endl; + cout<<" output: prefix.assoc.txt and prefix.log.txt form association tests"<<endl; cout<<endl; } - + if (option==3) { cout<<" SNP QC OPTIONS" << endl; - cout<<" -miss [num] "<<" specify missingness threshold (default 0.05)" << endl; - cout<<" -maf [num] "<<" specify minor allele frequency threshold (default 0.01)" << endl; - cout<<" -hwe [num] "<<" specify HWE test p value threshold (default 0; no test)" << endl; - cout<<" -r2 [num] "<<" specify r-squared threshold (default 0.9999)" << endl; - cout<<" -notsnp "<<" minor allele frequency cutoff is not used" << endl; + cout<<" -miss [num] "<<" specify missingness threshold (default 0.05)" << endl; + cout<<" -maf [num] "<<" specify minor allele frequency threshold (default 0.01)" << endl; + cout<<" -hwe [num] "<<" specify HWE test p value threshold (default 0; no test)" << endl; + cout<<" -r2 [num] "<<" specify r-squared threshold (default 0.9999)" << endl; + cout<<" -notsnp "<<" minor allele frequency cutoff is not used" << endl; cout<<endl; } - + if (option==4) { cout<<" RELATEDNESS MATRIX CALCULATION OPTIONS" << endl; - cout<<" -gk [num] "<<" specify which type of kinship/relatedness matrix to generate (default 1)" << endl; + cout<<" -gk [num] "<<" specify which type of kinship/relatedness matrix to generate (default 1)" << endl; cout<<" options: 1: centered XX^T/p"<<endl; cout<<" 2: standardized XX^T/p"<<endl; cout<<" note: non-polymorphic SNPs are excluded "<<endl; cout<<endl; } - + if (option==5) { cout<<" EIGEN-DECOMPOSITION OPTIONS" << endl; - cout<<" -eigen "<<" specify to perform eigen decomposition of the loaded relatedness matrix" << endl; + cout<<" -eigen "<<" specify to perform eigen decomposition of the loaded relatedness matrix" << endl; cout<<endl; } if (option==6) { cout<<" VARIANCE COMPONENT ESTIMATION OPTIONS" << endl; - cout<<" -vc "<<" specify to perform variance component estimation for the loaded relatedness matrix/matrices" << endl; + cout<<" -vc "<<" specify to perform variance component estimation for the loaded relatedness matrix/matrices" << endl; + cout<<" options (with kinship file): 1: HE regression (default)"<<endl; + cout<<" 2: REML"<<endl; + cout<<" options (with beta/cor files): 1: Centered genotypes (default)"<<endl; + cout<<" 2: Standardized genotypes"<<endl; + cout<<" -crt -windowbp [num]"<<" specify the window size based on bp (default 1000000; 1Mb)"<<endl; + cout<<" -crt -windowcm [num]"<<" specify the window size based on cm (default 0)"<<endl; + cout<<" -crt -windowns [num]"<<" specify the window size based on number of snps (default 0)"<<endl; cout<<endl; } - + if (option==7) { - cout<<" LINEAR MODEL OPTIONS" << endl; + cout<<" LINEAR MODEL OPTIONS" << endl; cout<<" -lm [num] "<<" specify analysis options (default 1)."<<endl; cout<<" options: 1: Wald test"<<endl; cout<<" 2: Likelihood ratio test"<<endl; @@ -285,21 +337,21 @@ void GEMMA::PrintHelp(size_t option) cout<<" 4: 1-3"<<endl; cout<<endl; } - + if (option==8) { - cout<<" LINEAR MIXED MODEL OPTIONS" << endl; + cout<<" LINEAR MIXED MODEL OPTIONS" << endl; cout<<" -lmm [num] "<<" specify analysis options (default 1)."<<endl; - cout<<" options: 1: Wald test"<<endl; + cout<<" options: 1: Wald test"<<endl; cout<<" 2: Likelihood ratio test"<<endl; cout<<" 3: Score test"<<endl; cout<<" 4: 1-3"<<endl; cout<<" 5: Parameter estimation in the null model only"<<endl; - cout<<" -lmin [num] "<<" specify minimal value for lambda (default 1e-5)" << endl; - cout<<" -lmax [num] "<<" specify maximum value for lambda (default 1e+5)" << endl; - cout<<" -region [num] "<<" specify the number of regions used to evaluate lambda (default 10)" << endl; + cout<<" -lmin [num] "<<" specify minimal value for lambda (default 1e-5)" << endl; + cout<<" -lmax [num] "<<" specify maximum value for lambda (default 1e+5)" << endl; + cout<<" -region [num] "<<" specify the number of regions used to evaluate lambda (default 10)" << endl; cout<<endl; } - + if (option==9) { cout<<" MULTIVARIATE LINEAR MIXED MODEL OPTIONS" << endl; cout<<" -pnr "<<" specify the pvalue threshold to use the Newton-Raphson's method (default 0.001)"<<endl; @@ -310,51 +362,63 @@ void GEMMA::PrintHelp(size_t option) cout<<" -crt "<<" specify to output corrected pvalues for these pvalues that are below the -pnr threshold"<<endl; cout<<endl; } - + if (option==10) { cout<<" MULTI-LOCUS ANALYSIS OPTIONS" << endl; cout<<" -bslmm [num] "<<" specify analysis options (default 1)."<<endl; - cout<<" options: 1: BSLMM"<<endl; - cout<<" 2: standard ridge regression/GBLUP (no mcmc)"<<endl; - cout<<" 3: probit BSLMM (requires 0/1 phenotypes)"<<endl; - + cout<<" options: 1: BSLMM"<<endl; + cout<<" 2: standard ridge regression/GBLUP (no mcmc)"<<endl; + cout<<" 3: probit BSLMM (requires 0/1 phenotypes)"<<endl; + + cout<<" -ldr [num] "<<" specify analysis options (default 1)."<<endl; + cout<<" options: 1: LDR"<<endl; + cout<<" MCMC OPTIONS" << endl; - cout<<" Prior" << endl; - cout<<" -hmin [num] "<<" specify minimum value for h (default 0)" << endl; - cout<<" -hmax [num] "<<" specify maximum value for h (default 1)" << endl; - cout<<" -rmin [num] "<<" specify minimum value for rho (default 0)" << endl; - cout<<" -rmax [num] "<<" specify maximum value for rho (default 1)" << endl; - cout<<" -pmin [num] "<<" specify minimum value for log10(pi) (default log10(1/p), where p is the number of analyzed SNPs )" << endl; - cout<<" -pmax [num] "<<" specify maximum value for log10(pi) (default log10(1) )" << endl; - cout<<" -smin [num] "<<" specify minimum value for |gamma| (default 0)" << endl; - cout<<" -smax [num] "<<" specify maximum value for |gamma| (default 300)" << endl; - + cout<<" Prior" << endl; + cout<<" -hmin [num] "<<" specify minimum value for h (default 0)" << endl; + cout<<" -hmax [num] "<<" specify maximum value for h (default 1)" << endl; + cout<<" -rmin [num] "<<" specify minimum value for rho (default 0)" << endl; + cout<<" -rmax [num] "<<" specify maximum value for rho (default 1)" << endl; + cout<<" -pmin [num] "<<" specify minimum value for log10(pi) (default log10(1/p), where p is the number of analyzed SNPs )" << endl; + cout<<" -pmax [num] "<<" specify maximum value for log10(pi) (default log10(1) )" << endl; + cout<<" -smin [num] "<<" specify minimum value for |gamma| (default 0)" << endl; + cout<<" -smax [num] "<<" specify maximum value for |gamma| (default 300)" << endl; + cout<<" Proposal" << endl; - cout<<" -gmean [num] "<<" specify the mean for the geometric distribution (default: 2000)" << endl; - cout<<" -hscale [num] "<<" specify the step size scale for the proposal distribution of h (value between 0 and 1, default min(10/sqrt(n),1) )" << endl; - cout<<" -rscale [num] "<<" specify the step size scale for the proposal distribution of rho (value between 0 and 1, default min(10/sqrt(n),1) )" << endl; - cout<<" -pscale [num] "<<" specify the step size scale for the proposal distribution of log10(pi) (value between 0 and 1, default min(5/sqrt(n),1) )" << endl; - + cout<<" -gmean [num] "<<" specify the mean for the geometric distribution (default: 2000)" << endl; + cout<<" -hscale [num] "<<" specify the step size scale for the proposal distribution of h (value between 0 and 1, default min(10/sqrt(n),1) )" << endl; + cout<<" -rscale [num] "<<" specify the step size scale for the proposal distribution of rho (value between 0 and 1, default min(10/sqrt(n),1) )" << endl; + cout<<" -pscale [num] "<<" specify the step size scale for the proposal distribution of log10(pi) (value between 0 and 1, default min(5/sqrt(n),1) )" << endl; + cout<<" Others" << endl; - cout<<" -w [num] "<<" specify burn-in steps (default 100,000)" << endl; - cout<<" -s [num] "<<" specify sampling steps (default 1,000,000)" << endl; - cout<<" -rpace [num] "<<" specify recording pace, record one state in every [num] steps (default 10)" << endl; - cout<<" -wpace [num] "<<" specify writing pace, write values down in every [num] recorded steps (default 1000)" << endl; - cout<<" -seed [num] "<<" specify random seed (a random seed is generated by default)" << endl; - cout<<" -mh [num] "<<" specify number of MH steps in each iteration (default 10)" << endl; - cout<<" requires: 0/1 phenotypes and -bslmm 3 option"<<endl; + cout<<" -w [num] "<<" specify burn-in steps (default 100,000)" << endl; + cout<<" -s [num] "<<" specify sampling steps (default 1,000,000)" << endl; + cout<<" -rpace [num] "<<" specify recording pace, record one state in every [num] steps (default 10)" << endl; + cout<<" -wpace [num] "<<" specify writing pace, write values down in every [num] recorded steps (default 1000)" << endl; + cout<<" -seed [num] "<<" specify random seed (a random seed is generated by default)" << endl; + cout<<" -mh [num] "<<" specify number of MH steps in each iteration (default 10)" << endl; + cout<<" requires: 0/1 phenotypes and -bslmm 3 option"<<endl; cout<<endl; } - + if (option==11) { cout<<" PREDICTION OPTIONS" << endl; cout<<" -predict [num] "<<" specify prediction options (default 1)."<<endl; - cout<<" options: 1: predict for individuals with missing phenotypes"<<endl; - cout<<" 2: predict for individuals with missing phenotypes, and convert the predicted values to probability scale. Use only for files fitted with -bslmm 3 option"<<endl; + cout<<" options: 1: predict for individuals with missing phenotypes"<<endl; + cout<<" 2: predict for individuals with missing phenotypes, and convert the predicted values to probability scale. Use only for files fitted with -bslmm 3 option"<<endl; cout<<endl; } - + if (option==12) { + cout<<" CALC CORRELATION OPTIONS" << endl; + cout<<" -calccor "<<endl; + cout<<" -windowbp [num] "<<" specify the window size based on bp (default 1000000; 1Mb)" << endl; + cout<<" -windowcm [num] "<<" specify the window size based on cm (default 0; not used)" << endl; + cout<<" -windowns [num] "<<" specify the window size based on number of snps (default 0; not used)" << endl; + cout<<endl; + } + + if (option==13) { cout<<" NOTE"<<endl; cout<<" 1. Only individuals with non-missing phenotoypes and covariates will be analyzed."<<endl; cout<<" 2. Missing genotoypes will be repalced with the mean genotype of that SNP."<<endl; @@ -363,17 +427,29 @@ void GEMMA::PrintHelp(size_t option) cout<<" 5. For bslmm analysis, in addition to 3, memory should be large enough to hold the whole genotype matrix."<<endl; cout<<endl; } - + return; } - +//options +//gk: 21-22 +//gs: 25-26 +//gq: 27-28 +//eigen: 31-32 +//lmm: 1-5 +//bslmm: 11-13 +//predict: 41-43 +//lm: 51 +//vc: 61 +//ci: 66-67 +//calccor: 71 +//gw: 72 void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) { string str; - - for(int i = 1; i < argc; i++) { + + for(int i = 1; i < argc; i++) { if (strcmp(argv[i], "-bfile")==0 || strcmp(argv[i], "--bfile")==0 || strcmp(argv[i], "-b")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -381,6 +457,13 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.file_bfile=str; } + else if (strcmp(argv[i], "-mbfile")==0 || strcmp(argv[i], "--mbfile")==0 || strcmp(argv[i], "-mb")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_mbfile=str; + } else if (strcmp(argv[i], "-silence")==0) { cPar.mode_silence=true; } @@ -391,6 +474,13 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.file_geno=str; } + else if (strcmp(argv[i], "-mg")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_mgeno=str; + } else if (strcmp(argv[i], "-p")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -405,6 +495,42 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.file_anno=str; } + // WJA added + else if (strcmp(argv[i], "-oxford")==0 || strcmp(argv[i], "--oxford")==0 || strcmp(argv[i], "-x")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_oxford=str; + } + else if (strcmp(argv[i], "-gxe")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_gxe=str; + } + else if (strcmp(argv[i], "-widv")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_weight=str; + } + else if (strcmp(argv[i], "-wsnp")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_wsnp=str; + } + else if (strcmp(argv[i], "-wcat")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_wcat=str; + } else if (strcmp(argv[i], "-k")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -440,6 +566,62 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.file_cvt=str; } + else if (strcmp(argv[i], "-cat")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_cat=str; + } + else if (strcmp(argv[i], "-mcat")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_mcat=str; + } + else if (strcmp(argv[i], "-beta")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_beta=str; + } + else if (strcmp(argv[i], "-cor")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_cor=str; + } + else if (strcmp(argv[i], "-study")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_study=str; + } + else if (strcmp(argv[i], "-ref")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_ref=str; + } + else if (strcmp(argv[i], "-mstudy")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_mstudy=str; + } + else if (strcmp(argv[i], "-mref")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.file_mref=str; + } else if (strcmp(argv[i], "-epm")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -447,7 +629,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.assign(argv[i]); cPar.file_epm=str; } - else if (strcmp(argv[i], "-en")==0) { + else if (strcmp(argv[i], "-en")==0) { while (argv[i+1] != NULL && argv[i+1][0] != '-') { ++i; str.clear(); @@ -503,7 +685,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.clear(); str.assign(argv[i]); cPar.k_mode=atoi(str.c_str()); - } + } else if (strcmp(argv[i], "-n")==0) { (cPar.p_column).clear(); while (argv[i+1] != NULL && argv[i+1][0] != '-') { @@ -533,7 +715,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) str.clear(); str.assign(argv[i]); cPar.file_out=str; - } + } else if (strcmp(argv[i], "-miss")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -566,31 +748,101 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) cPar.maf_level=-1; } else if (strcmp(argv[i], "-gk")==0) { - if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;} + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=21; continue;} ++i; str.clear(); str.assign(argv[i]); cPar.a_mode=20+atoi(str.c_str()); - } + } + else if (strcmp(argv[i], "-gs")==0) { + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} + if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=25; continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.a_mode=24+atoi(str.c_str()); + } + else if (strcmp(argv[i], "-gq")==0) { + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} + if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=27; continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.a_mode=26+atoi(str.c_str()); + } + else if (strcmp(argv[i], "-gw")==0) { + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} + if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=72; continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.a_mode=71+atoi(str.c_str()); + } + else if (strcmp(argv[i], "-sample")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.ni_subsample=atoi(str.c_str()); + } else if (strcmp(argv[i], "-eigen")==0) { - if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;} + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=31; continue;} ++i; str.clear(); str.assign(argv[i]); cPar.a_mode=30+atoi(str.c_str()); - } + } + else if (strcmp(argv[i], "-calccor")==0) { + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} + if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=71; continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.a_mode=70+atoi(str.c_str()); + } else if (strcmp(argv[i], "-vc")==0) { - if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;} + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=61; continue;} ++i; str.clear(); str.assign(argv[i]); cPar.a_mode=60+atoi(str.c_str()); - } + } + else if (strcmp(argv[i], "-ci")==0) { + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} + if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=66; continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.a_mode=65+atoi(str.c_str()); + } + else if (strcmp(argv[i], "-pve")==0) { + double s=0; + while (argv[i+1] != NULL && (argv[i+1][0] != '-' || !isalpha(argv[i+1][1]) ) ) { + ++i; + str.clear(); + str.assign(argv[i]); + cPar.v_pve.push_back(atof(str.c_str())); + s+=atof(str.c_str()); + } + if (s==1) { + cout<<"summation of pve equals one."<<endl; + } + } + else if (strcmp(argv[i], "-blocks")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.n_block=atoi(str.c_str()); + } + else if (strcmp(argv[i], "-noconstrain")==0) { + cPar.noconstrain=true; + } else if (strcmp(argv[i], "-lm")==0) { - if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;} + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=51; continue;} ++i; str.clear(); @@ -598,7 +850,7 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) cPar.a_mode=50+atoi(str.c_str()); } else if (strcmp(argv[i], "-fa")==0 || strcmp(argv[i], "-lmm")==0) { - if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;} + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=1; continue;} ++i; str.clear(); @@ -665,13 +917,21 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) cPar.crt=1; } else if (strcmp(argv[i], "-bslmm")==0) { - if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;} + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=11; continue;} ++i; str.clear(); str.assign(argv[i]); cPar.a_mode=10+atoi(str.c_str()); } + else if (strcmp(argv[i], "-ldr")==0) { + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} + if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=14; continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.a_mode=13+atoi(str.c_str()); + } else if (strcmp(argv[i], "-hmin")==0) { if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} ++i; @@ -799,25 +1059,46 @@ void GEMMA::Assign(int argc, char ** argv, PARAM &cPar) cPar.n_mh=atoi(str.c_str()); } else if (strcmp(argv[i], "-predict")==0) { - if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -eigen -vc -lm -lmm -bslmm -predict options is allowed."<<endl; break;} + if (cPar.a_mode!=0) {cPar.error=true; cout<<"error! only one of -gk -gs -eigen -vc -lm -lmm -bslmm -predict -calccor options is allowed."<<endl; break;} if(argv[i+1] == NULL || argv[i+1][0] == '-') {cPar.a_mode=41; continue;} ++i; str.clear(); str.assign(argv[i]); cPar.a_mode=40+atoi(str.c_str()); } + else if (strcmp(argv[i], "-windowcm")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.window_cm=atof(str.c_str()); + } + else if (strcmp(argv[i], "-windowbp")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.window_bp=atoi(str.c_str()); + } + else if (strcmp(argv[i], "-windowns")==0) { + if(argv[i+1] == NULL || argv[i+1][0] == '-') {continue;} + ++i; + str.clear(); + str.assign(argv[i]); + cPar.window_ns=atoi(str.c_str()); + } else {cout<<"error! unrecognized option: "<<argv[i]<<endl; cPar.error=true; continue;} } - + //change prediction mode to 43, if the epm file is not provided if (cPar.a_mode==41 && cPar.file_epm.empty()) {cPar.a_mode=43;} - + return; } -void GEMMA::BatchRun (PARAM &cPar) +void GEMMA::BatchRun (PARAM &cPar) { clock_t time_begin, time_start; time_begin=clock(); @@ -828,25 +1109,26 @@ void GEMMA::BatchRun (PARAM &cPar) if (cPar.error==true) {cout<<"error! fail to read files. "<<endl; return;} cPar.CheckData(); if (cPar.error==true) {cout<<"error! fail to check data. "<<endl; return;} - //Prediction for bslmm + + //Prediction for bslmm if (cPar.a_mode==41 || cPar.a_mode==42) { gsl_vector *y_prdt; - + y_prdt=gsl_vector_alloc (cPar.ni_total-cPar.ni_test); //set to zero gsl_vector_set_zero (y_prdt); - + PRDT cPRDT; cPRDT.CopyFromParam(cPar); - + //add breeding value if needed if (!cPar.file_kin.empty() && !cPar.file_ebv.empty()) { cout<<"Adding Breeding Values ... "<<endl; - + gsl_matrix *G=gsl_matrix_alloc (cPar.ni_total, cPar.ni_total); gsl_vector *u_hat=gsl_vector_alloc (cPar.ni_test); - + //read kinship matrix and set u_hat vector<int> indicator_all; size_t c_bv=0; @@ -854,13 +1136,13 @@ void GEMMA::BatchRun (PARAM &cPar) indicator_all.push_back(1); if (cPar.indicator_bv[i]==1) {gsl_vector_set(u_hat, c_bv, cPar.vec_bv[i]); c_bv++;} } - + ReadFile_kin (cPar.file_kin, indicator_all, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;} - - //read u - cPRDT.AddBV(G, u_hat, y_prdt); - + + //read u + cPRDT.AddBV(G, u_hat, y_prdt); + gsl_matrix_free(G); gsl_vector_free(u_hat); } @@ -872,10 +1154,10 @@ void GEMMA::BatchRun (PARAM &cPar) else { cPRDT.AnalyzeBimbam (y_prdt); } - + //add mu gsl_vector_add_constant(y_prdt, cPar.pheno_mean); - + //convert y to probability if needed if (cPar.a_mode==42) { double d; @@ -885,51 +1167,51 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector_set(y_prdt, i, d); } } - - + + cPRDT.CopyToParam(cPar); - + cPRDT.WriteFiles(y_prdt); - + gsl_vector_free(y_prdt); } - - + + //Prediction with kinship matrix only; for one or more phenotypes if (cPar.a_mode==43) { - //first, use individuals with full phenotypes to obtain estimates of Vg and Ve + //first, use individuals with full phenotypes to obtain estimates of Vg and Ve gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); - gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); - gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); gsl_vector *eval=gsl_vector_alloc (Y->size1); - + gsl_matrix *Y_full=gsl_matrix_alloc (cPar.ni_cvt, cPar.n_ph); gsl_matrix *W_full=gsl_matrix_alloc (Y_full->size1, cPar.n_cvt); //set covariates matrix W and phenotype matrix Y - //an intercept should be included in W, + //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); cPar.CopyCvtPhen (W_full, Y_full, 1); - - gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph); - gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1); + + gsl_matrix *Y_hat=gsl_matrix_alloc (Y_full->size1, cPar.n_ph); + gsl_matrix *G_full=gsl_matrix_alloc (Y_full->size1, Y_full->size1); gsl_matrix *H_full=gsl_matrix_alloc (Y_full->size1*Y_hat->size2, Y_full->size1*Y_hat->size2); - + //read relatedness matrix G, and matrix G_full ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;} ReadFile_kin (cPar.file_kin, cPar.indicator_cvt, cPar.mapID2num, cPar.k_mode, cPar.error, G_full); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;} - + //center matrix G CenterMatrix (G); CenterMatrix (G_full); - + //eigen-decomposition and calculate trace_G cout<<"Start Eigen-Decomposition..."<<endl; - time_start=clock(); + time_start=clock(); cPar.trace_G=EigenDecomp (G, U, eval, 0); cPar.trace_G=0.0; for (size_t i=0; i<eval->size; i++) { @@ -937,8 +1219,8 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.trace_G+=gsl_vector_get (eval, i); } cPar.trace_G/=(double)eval->size; - cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, Y, UtY); @@ -948,7 +1230,7 @@ void GEMMA::BatchRun (PARAM &cPar) if (cPar.n_ph==1) { gsl_vector *beta=gsl_vector_alloc (W->size2); gsl_vector *se_beta=gsl_vector_alloc (W->size2); - + double lambda, logl, vg, ve; gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); @@ -959,29 +1241,29 @@ void GEMMA::BatchRun (PARAM &cPar) cout<<"REMLE estimate for vg in the null model = "<<vg<<endl; cout<<"REMLE estimate for ve in the null model = "<<ve<<endl; cPar.vg_remle_null=vg; cPar.ve_remle_null=ve; - + //obtain Y_hat from fixed effects - gsl_vector_view Yhat_col=gsl_matrix_column (Y_hat, 0); + gsl_vector_view Yhat_col=gsl_matrix_column (Y_hat, 0); gsl_blas_dgemv (CblasNoTrans, 1.0, W_full, beta, 0.0, &Yhat_col.vector); - + //obtain H gsl_matrix_set_identity (H_full); gsl_matrix_scale (H_full, ve); gsl_matrix_scale (G_full, vg); gsl_matrix_add (H_full, G_full); - - //free matrices + + //free matrices gsl_vector_free(beta); gsl_vector_free(se_beta); - } else { + } else { gsl_matrix *Vg=gsl_matrix_alloc (cPar.n_ph, cPar.n_ph); gsl_matrix *Ve=gsl_matrix_alloc (cPar.n_ph, cPar.n_ph); gsl_matrix *B=gsl_matrix_alloc (cPar.n_ph, W->size2); gsl_matrix *se_B=gsl_matrix_alloc (cPar.n_ph, W->size2); - + //obtain estimates CalcMvLmmVgVeBeta (eval, UtW, UtY, cPar.em_iter, cPar.nr_iter, cPar.em_prec, cPar.nr_prec, cPar.l_min, cPar.l_max, cPar.n_region, Vg, Ve, B, se_B); - + cout<<"REMLE estimate for Vg in the null model: "<<endl; for (size_t i=0; i<Vg->size1; i++) { for (size_t j=0; j<=i; j++) { @@ -1004,110 +1286,250 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.Ve_remle_null.push_back(gsl_matrix_get (Ve, i, j) ); } } - + //obtain Y_hat from fixed effects gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, W_full, B, 0.0, Y_hat); - + //obtain H KroneckerSym(G_full, Vg, H_full); for (size_t i=0; i<G_full->size1; i++) { gsl_matrix_view H_sub=gsl_matrix_submatrix (H_full, i*Ve->size1, i*Ve->size2, Ve->size1, Ve->size2); gsl_matrix_add (&H_sub.matrix, Ve); } - - //free matrices + + //free matrices gsl_matrix_free (Vg); gsl_matrix_free (Ve); gsl_matrix_free (B); gsl_matrix_free (se_B); } - + PRDT cPRDT; - + cPRDT.CopyFromParam(cPar); - + cout<<"Predicting Missing Phentypes ... "<<endl; - time_start=clock(); + time_start=clock(); cPRDT.MvnormPrdt(Y_hat, H_full, Y_full); - cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); cPRDT.WriteFiles(Y_full); - + gsl_matrix_free(Y); - gsl_matrix_free(W); + gsl_matrix_free(W); gsl_matrix_free(G); - gsl_matrix_free(U); + gsl_matrix_free(U); gsl_matrix_free(UtW); gsl_matrix_free(UtY); gsl_vector_free(eval); - + gsl_matrix_free(Y_full); gsl_matrix_free(Y_hat); gsl_matrix_free(W_full); - gsl_matrix_free(G_full); + gsl_matrix_free(G_full); gsl_matrix_free(H_full); } - - + + //Generate Kinship matrix - if (cPar.a_mode==21 || cPar.a_mode==22) { + if (cPar.a_mode==21 || cPar.a_mode==22) { cout<<"Calculating Relatedness Matrix ... "<<endl; - + gsl_matrix *G=gsl_matrix_alloc (cPar.ni_total, cPar.ni_total); - + time_start=clock(); cPar.CalcKin (G); cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); if (cPar.error==true) {cout<<"error! fail to calculate relatedness matrix. "<<endl; return;} - + if (cPar.a_mode==21) { cPar.WriteMatrix (G, "cXX"); } else { cPar.WriteMatrix (G, "sXX"); } - + gsl_matrix_free (G); } - - + + //Compute the LDSC weights (not implemented yet) + if (cPar.a_mode==72) { + cout<<"Calculating Weights ... "<<endl; + + VARCOV cVarcov; + cVarcov.CopyFromParam(cPar); + + if (!cPar.file_bfile.empty()) { + cVarcov.AnalyzePlink (); + } else { + cVarcov.AnalyzeBimbam (); + } + + cVarcov.CopyToParam(cPar); + } + + + //Compute the S matrix (and its variance), that is used for variance component estimation using summary statistics + if (cPar.a_mode==25 || cPar.a_mode==26) { + cout<<"Calculating the S Matrix ... "<<endl; + + gsl_matrix *S=gsl_matrix_alloc (cPar.n_vc*2, cPar.n_vc); + gsl_vector *ns=gsl_vector_alloc (cPar.n_vc+1); + gsl_matrix_set_zero(S); + gsl_vector_set_zero(ns); + + gsl_matrix_view S_mat=gsl_matrix_submatrix(S, 0, 0, cPar.n_vc, cPar.n_vc); + gsl_matrix_view Svar_mat=gsl_matrix_submatrix (S, cPar.n_vc, 0, cPar.n_vc, cPar.n_vc); + gsl_vector_view ns_vec=gsl_vector_subvector(ns, 0, cPar.n_vc); + + gsl_matrix *K=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test); + gsl_matrix *A=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test); + gsl_matrix_set_zero (K); + gsl_matrix_set_zero (A); + + gsl_vector *y=gsl_vector_alloc (cPar.ni_test); + gsl_matrix *W=gsl_matrix_alloc (cPar.ni_test, cPar.n_cvt); + + cPar.CopyCvtPhen (W, y, 0); + + set<string> setSnps_beta; + map <string, double> mapRS2wA, mapRS2wK; + + cPar.ObtainWeight(setSnps_beta, mapRS2wK); + + time_start=clock(); + cPar.CalcS (mapRS2wA, mapRS2wK, W, A, K, &S_mat.matrix, &Svar_mat.matrix, &ns_vec.vector); + cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + if (cPar.error==true) {cout<<"error! fail to calculate the S matrix. "<<endl; return;} + + gsl_vector_set (ns, cPar.n_vc, cPar.ni_test); + + cPar.WriteMatrix (S, "S"); + cPar.WriteVector (ns, "size"); + cPar.WriteVar ("snps"); + /* + cout<<scientific; + for (size_t i=0; i<cPar.n_vc; i++) { + for (size_t j=0; j<cPar.n_vc; j++) { + cout<<gsl_matrix_get(S, i, j)<<" "; + } + cout<<endl; + } + + for (size_t i=cPar.n_vc; i<cPar.n_vc*2; i++) { + for (size_t j=0; j<cPar.n_vc; j++) { + cout<<gsl_matrix_get(S, i, j)<<" "; + } + cout<<endl; + } + */ + gsl_matrix_free (S); + gsl_vector_free (ns); + + gsl_matrix_free (A); + gsl_matrix_free (K); + + gsl_vector_free (y); + gsl_matrix_free (K); + } + + //Compute the q vector, that is used for variance component estimation using summary statistics + if (cPar.a_mode==27 || cPar.a_mode==28) { + gsl_matrix *Vq=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc); + gsl_vector *q=gsl_vector_alloc (cPar.n_vc); + gsl_vector *s=gsl_vector_alloc (cPar.n_vc+1); + gsl_vector_set_zero (q); + gsl_vector_set_zero (s); + + gsl_vector_view s_vec=gsl_vector_subvector(s, 0, cPar.n_vc); + + vector<size_t> vec_cat, vec_ni; + vector<double> vec_weight, vec_z2; + map<string, double> mapRS2weight; + mapRS2weight.clear(); + + time_start=clock(); + ReadFile_beta (cPar.file_beta, cPar.mapRS2cat, mapRS2weight, vec_cat, vec_ni, vec_weight, vec_z2, cPar.ni_total, cPar.ns_total, cPar.ns_test); + cout<<"## number of total individuals = "<<cPar.ni_total<<endl; + cout<<"## number of total SNPs = "<<cPar.ns_total<<endl; + cout<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + cout<<"## number of variance components = "<<cPar.n_vc<<endl; + cout<<"Calculating the q vector ... "<<endl; + Calcq (cPar.n_block, vec_cat, vec_ni, vec_weight, vec_z2, Vq, q, &s_vec.vector); + cPar.time_G=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + if (cPar.error==true) {cout<<"error! fail to calculate the q vector. "<<endl; return;} + + gsl_vector_set (s, cPar.n_vc, cPar.ni_total); + + cPar.WriteMatrix (Vq, "Vq"); + cPar.WriteVector (q, "q"); + cPar.WriteVector (s, "size"); + /* + for (size_t i=0; i<cPar.n_vc; i++) { + cout<<gsl_vector_get(q, i)<<endl; + } + */ + gsl_matrix_free (Vq); + gsl_vector_free (q); + gsl_vector_free (s); + } + + + //Calculate SNP covariance + if (cPar.a_mode==71) { + VARCOV cVarcov; + cVarcov.CopyFromParam(cPar); + + if (!cPar.file_bfile.empty()) { + cVarcov.AnalyzePlink (); + } else { + cVarcov.AnalyzeBimbam (); + } + + cVarcov.CopyToParam(cPar); + } + + //LM if (cPar.a_mode==51 || cPar.a_mode==52 || cPar.a_mode==53 || cPar.a_mode==54) { //Fit LM gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); - gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); - - //set covariates matrix W and phenotype matrix Y - //an intercept should be included in W, + gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); - + //Fit LM or mvLM - if (cPar.n_ph==1) { + if (cPar.n_ph==1) { LM cLm; cLm.CopyFromParam(cPar); - + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); - - if (!cPar.file_gene.empty()) { + + if (!cPar.file_gene.empty()) { cLm.AnalyzeGene (W, &Y_col.vector); //y is the predictor, not the phenotype } else if (!cPar.file_bfile.empty()) { cLm.AnalyzePlink (W, &Y_col.vector); + } else if (!cPar.file_oxford.empty()) { + cLm.Analyzebgen (W, &Y_col.vector); } else { cLm.AnalyzeBimbam (W, &Y_col.vector); } - + cLm.WriteFiles(); cLm.CopyToParam(cPar); } /* - else { + else { MVLM cMvlm; - cMvlm.CopyFromParam(cPar); - + cMvlm.CopyFromParam(cPar); + if (!cPar.file_bfile.empty()) { cMvlm.AnalyzePlink (W, Y); } else { cMvlm.AnalyzeBimbam (W, Y); } - + cMvlm.WriteFiles(); cMvlm.CopyToParam(cPar); } @@ -1115,27 +1537,202 @@ void GEMMA::BatchRun (PARAM &cPar) //release all matrices and vectors gsl_matrix_free (Y); gsl_matrix_free (W); - } + } //VC estimation with one or multiple kinship matrices //REML approach only //if file_kin or file_ku/kd is provided, then a_mode is changed to 5 already, in param.cpp - //for one phenotype only; - if (cPar.a_mode==61) { + //for one phenotype only; + if (cPar.a_mode==61 || cPar.a_mode==62) { + if (!cPar.file_beta.empty() ) { + //need to obtain a common set of SNPs between beta file and the genotype file; these are saved in mapRS2wA and mapRS2wK + //normalize the weight in mapRS2wK to have an average of one; each element of mapRS2wA is 1 + //update indicator_snps, so that the numbers are in accordance with mapRS2wK + set<string> setSnps_beta; + ReadFile_snps_header (cPar.file_beta, setSnps_beta); + + map <string, double> mapRS2wA, mapRS2wK; + cPar.ObtainWeight(setSnps_beta, mapRS2wK); + + cPar.UpdateSNP (mapRS2wK); + + //setup matrices and vectors + gsl_matrix *S=gsl_matrix_alloc (cPar.n_vc*2, cPar.n_vc); + gsl_matrix *Vq=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc); + gsl_vector *q=gsl_vector_alloc (cPar.n_vc); + gsl_vector *s=gsl_vector_alloc (cPar.n_vc+1); + + gsl_matrix *K=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test); + gsl_matrix *A=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc*cPar.ni_test); + + gsl_vector *y=gsl_vector_alloc (cPar.ni_test); + gsl_matrix *W=gsl_matrix_alloc (cPar.ni_test, cPar.n_cvt); + + gsl_matrix_set_zero (K); + gsl_matrix_set_zero (A); + + gsl_matrix_set_zero(S); + gsl_matrix_set_zero(Vq); + gsl_vector_set_zero (q); + gsl_vector_set_zero (s); + + cPar.CopyCvtPhen (W, y, 0); + + gsl_matrix_view S_mat=gsl_matrix_submatrix(S, 0, 0, cPar.n_vc, cPar.n_vc); + gsl_matrix_view Svar_mat=gsl_matrix_submatrix (S, cPar.n_vc, 0, cPar.n_vc, cPar.n_vc); + gsl_vector_view s_vec=gsl_vector_subvector(s, 0, cPar.n_vc); + + vector<size_t> vec_cat, vec_ni; + vector<double> vec_weight, vec_z2; + + //read beta, based on the mapRS2wK + ReadFile_beta (cPar.file_beta, cPar.mapRS2cat, mapRS2wK, vec_cat, vec_ni, vec_weight, vec_z2, cPar.ni_study, cPar.ns_study, cPar.ns_test); + + cout<<"Study Panel: "<<endl; + cout<<"## number of total individuals = "<<cPar.ni_study<<endl; + cout<<"## number of total SNPs = "<<cPar.ns_study<<endl; + cout<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + cout<<"## number of variance components = "<<cPar.n_vc<<endl; + + //compute q + Calcq (cPar.n_block, vec_cat, vec_ni, vec_weight, vec_z2, Vq, q, &s_vec.vector); + + //compute S + time_start=clock(); + cPar.CalcS (mapRS2wA, mapRS2wK, W, A, K, &S_mat.matrix, &Svar_mat.matrix, &s_vec.vector); + cPar.time_G+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + if (cPar.error==true) {cout<<"error! fail to calculate the S matrix. "<<endl; return;} + + //compute vc estimates + CalcVCss(Vq, &S_mat.matrix, &Svar_mat.matrix, q, &s_vec.vector, cPar.ni_study, cPar.v_pve, cPar.v_se_pve, cPar.pve_total, cPar.se_pve_total, cPar.v_sigma2, cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich); + + //if LDSC weights, then compute the weights and run the above steps again + if (cPar.a_mode==62) { + //compute the weights and normalize the weights for A + cPar.UpdateWeight (1, mapRS2wK, cPar.ni_study, &s_vec.vector, mapRS2wA); + + //read beta file again, and update weigths vector + ReadFile_beta (cPar.file_beta, cPar.mapRS2cat, mapRS2wA, vec_cat, vec_ni, vec_weight, vec_z2, cPar.ni_study, cPar.ns_total, cPar.ns_test); + + //compute q + Calcq (cPar.n_block, vec_cat, vec_ni, vec_weight, vec_z2, Vq, q, &s_vec.vector); + + //compute S + time_start=clock(); + cPar.CalcS (mapRS2wA, mapRS2wK, W, A, K, &S_mat.matrix, &Svar_mat.matrix, &s_vec.vector); + cPar.time_G+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + if (cPar.error==true) {cout<<"error! fail to calculate the S matrix. "<<endl; return;} + + //compute vc estimates + CalcVCss(Vq, &S_mat.matrix, &Svar_mat.matrix, q, &s_vec.vector, cPar.ni_study, cPar.v_pve, cPar.v_se_pve, cPar.pve_total, cPar.se_pve_total, cPar.v_sigma2, cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich); + } + + gsl_vector_set (s, cPar.n_vc, cPar.ni_test); + + cPar.WriteMatrix (S, "S"); + cPar.WriteMatrix (Vq, "Vq"); + cPar.WriteVector (q, "q"); + cPar.WriteVector (s, "size"); + + gsl_matrix_free (S); + gsl_matrix_free (Vq); + gsl_vector_free (q); + gsl_vector_free (s); + + gsl_matrix_free (A); + gsl_matrix_free (K); + gsl_vector_free (y); + gsl_matrix_free (W); + } else if (!cPar.file_study.empty() || !cPar.file_mstudy.empty()) { + if (!cPar.file_study.empty()) { + string sfile=cPar.file_study+".size.txt"; + CountFileLines (sfile, cPar.n_vc); + } else { + string file_name; + igzstream infile (cPar.file_mstudy.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mstudy file: "<<cPar.file_study<<endl; return;} + + safeGetline(infile, file_name); + + infile.clear(); + infile.close(); + + string sfile=file_name+".size.txt"; + CountFileLines (sfile, cPar.n_vc); + } + + cPar.n_vc=cPar.n_vc-1; + + gsl_matrix *S=gsl_matrix_alloc (2*cPar.n_vc, cPar.n_vc); + gsl_matrix *Vq=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc); + //gsl_matrix *V=gsl_matrix_alloc (cPar.n_vc+1, (cPar.n_vc*(cPar.n_vc+1))/2*(cPar.n_vc+1) ); + //gsl_matrix *Vslope=gsl_matrix_alloc (n_lines+1, (n_lines*(n_lines+1))/2*(n_lines+1) ); + gsl_vector *q=gsl_vector_alloc (cPar.n_vc); + gsl_vector *s_study=gsl_vector_alloc (cPar.n_vc); + gsl_vector *s_ref=gsl_vector_alloc (cPar.n_vc); + gsl_vector *s=gsl_vector_alloc (cPar.n_vc+1); + + gsl_matrix_set_zero(S); + gsl_matrix_view S_mat=gsl_matrix_submatrix(S, 0, 0, cPar.n_vc, cPar.n_vc); + gsl_matrix_view Svar_mat=gsl_matrix_submatrix (S, cPar.n_vc, 0, cPar.n_vc, cPar.n_vc); + + gsl_matrix_set_zero(Vq); + //gsl_matrix_set_zero(V); + //gsl_matrix_set_zero(Vslope); + gsl_vector_set_zero(q); + gsl_vector_set_zero(s_study); + gsl_vector_set_zero(s_ref); + + if (!cPar.file_study.empty()) { + ReadFile_study(cPar.file_study, Vq, q, s_study, cPar.ni_study); + } else { + ReadFile_mstudy(cPar.file_mstudy, Vq, q, s_study, cPar.ni_study); + } + + if (!cPar.file_ref.empty()) { + ReadFile_ref(cPar.file_ref, &S_mat.matrix, &Svar_mat.matrix, s_ref, cPar.ni_ref); + } else { + ReadFile_mref(cPar.file_mref, &S_mat.matrix, &Svar_mat.matrix, s_ref, cPar.ni_ref); + } + + cout<<"## number of variance components = "<<cPar.n_vc<<endl; + cout<<"## number of individuals in the sample = "<<cPar.ni_study<<endl; + cout<<"## number of individuals in the reference = "<<cPar.ni_ref<<endl; + + CalcVCss(Vq, &S_mat.matrix, &Svar_mat.matrix, q, s_study, cPar.ni_study, cPar.v_pve, cPar.v_se_pve, cPar.pve_total, cPar.se_pve_total, cPar.v_sigma2, cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich); + + gsl_vector_view s_sub=gsl_vector_subvector (s, 0, cPar.n_vc); + gsl_vector_memcpy (&s_sub.vector, s_ref); + gsl_vector_set (s, cPar.n_vc, cPar.ni_ref); + + cPar.WriteMatrix (S, "S"); + cPar.WriteMatrix (Vq, "Vq"); + cPar.WriteVector (q, "q"); + cPar.WriteVector (s, "size"); + + gsl_matrix_free (S); + gsl_matrix_free (Vq); + //gsl_matrix_free (V); + //gsl_matrix_free (Vslope); + gsl_vector_free (q); + gsl_vector_free (s_study); + gsl_vector_free (s_ref); + gsl_vector_free (s); + } else { gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); gsl_matrix *W=gsl_matrix_alloc (Y->size1, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1*cPar.n_vc ); - //set covariates matrix W and phenotype matrix Y - //an intercept should be included in W, + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); //read kinship matrices if (!(cPar.file_mk).empty()) { ReadFile_mk (cPar.file_mk, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;} - + //center matrix G, and obtain v_traceG double d=0; (cPar.v_traceG).clear(); @@ -1152,7 +1749,7 @@ void GEMMA::BatchRun (PARAM &cPar) } else if (!(cPar.file_kin).empty()) { ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;} - + //center matrix G CenterMatrix (G); @@ -1167,8 +1764,8 @@ void GEMMA::BatchRun (PARAM &cPar) /* //eigen-decomposition and calculate trace_G cout<<"Start Eigen-Decomposition..."<<endl; - time_start=clock(); - + time_start=clock(); + if (cPar.a_mode==31) { cPar.trace_G=EigenDecomp (G, U, eval, 1); } else { @@ -1182,14 +1779,14 @@ void GEMMA::BatchRun (PARAM &cPar) } cPar.trace_G/=(double)eval->size; - cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } else { ReadFile_eigenU (cPar.file_ku, cPar.error, U); if (cPar.error==true) {cout<<"error! fail to read the U file. "<<endl; return;} - - ReadFile_eigenD (cPar.file_kd, cPar.error, eval); + + ReadFile_eigenD (cPar.file_kd, cPar.error, eval); if (cPar.error==true) {cout<<"error! fail to read the D file. "<<endl; return;} - + cPar.trace_G=0.0; for (size_t i=0; i<eval->size; i++) { if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} @@ -1202,7 +1799,7 @@ void GEMMA::BatchRun (PARAM &cPar) if (cPar.n_ph==1) { // if (cPar.n_vc==1) { /* - //calculate UtW and Uty + //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, Y, UtY); @@ -1228,10 +1825,10 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); } - + CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); cPar.PrintSummary(); - + //calculate and output residuals if (cPar.a_mode==5) { gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); @@ -1239,11 +1836,11 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector *u_hat=gsl_vector_alloc (Y->size1); gsl_vector *e_hat=gsl_vector_alloc (Y->size1); gsl_vector *y_hat=gsl_vector_alloc (Y->size1); - + //obtain Utu and Ute gsl_vector_memcpy (y_hat, &UtY_col.vector); gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); - + double d, u, e; for (size_t i=0; i<eval->size; i++) { d=gsl_vector_get (eval, i); @@ -1252,37 +1849,210 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector_set (Utu_hat, i, u); gsl_vector_set (Ute_hat, i, e); } - + //obtain u and e gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); - - //output residuals + + //output residuals cPar.WriteVector(u_hat, "residU"); cPar.WriteVector(e_hat, "residE"); - + gsl_vector_free(u_hat); gsl_vector_free(e_hat); gsl_vector_free(y_hat); - } -*/ + } +*/ // } else { gsl_vector_view Y_col=gsl_matrix_column (Y, 0); VC cVc; - cVc.CopyFromParam(cPar); - cVc.CalcVCreml (G, W, &Y_col.vector); + cVc.CopyFromParam(cPar); + if (cPar.a_mode==61) { + cVc.CalcVChe (G, W, &Y_col.vector); + } else { + cVc.CalcVCreml (cPar.noconstrain, G, W, &Y_col.vector); + } cVc.CopyToParam(cPar); - //obtain pve from sigma2 //obtain se_pve from se_sigma2 - + //} - } + } + } + + } + - + //compute confidence intervals with additional summary statistics + //we do not check the sign of z-scores here, but they have to be matched with the genotypes + if (cPar.a_mode==66 || cPar.a_mode==67) { + //read reference file first + gsl_matrix *S=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc); + gsl_matrix *Svar=gsl_matrix_alloc (cPar.n_vc, cPar.n_vc); + gsl_vector *s_ref=gsl_vector_alloc (cPar.n_vc); + + gsl_matrix_set_zero(S); + gsl_matrix_set_zero(Svar); + gsl_vector_set_zero(s_ref); + + if (!cPar.file_ref.empty()) { + ReadFile_ref(cPar.file_ref, S, Svar, s_ref, cPar.ni_ref); + } else { + ReadFile_mref(cPar.file_mref, S, Svar, s_ref, cPar.ni_ref); + } + + //need to obtain a common set of SNPs between beta file and the genotype file; these are saved in mapRS2wA and mapRS2wK + //normalize the weight in mapRS2wK to have an average of one; each element of mapRS2wA is 1 + set<string> setSnps_beta; + ReadFile_snps_header (cPar.file_beta, setSnps_beta); + + //obtain the weights for wA, which contains the SNP weights for SNPs used in the model + map <string, double> mapRS2wK; + cPar.ObtainWeight(setSnps_beta, mapRS2wK); + + //set up matrices and vector + gsl_matrix *Xz=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc); + gsl_matrix *XWz=gsl_matrix_alloc (cPar.ni_test, cPar.n_vc); + gsl_matrix *XtXWz=gsl_matrix_alloc (mapRS2wK.size(), cPar.n_vc*cPar.n_vc); + gsl_vector *w=gsl_vector_alloc (mapRS2wK.size()); + gsl_vector *w1=gsl_vector_alloc (mapRS2wK.size()); + gsl_vector *z=gsl_vector_alloc (mapRS2wK.size()); + gsl_vector *s_vec=gsl_vector_alloc (cPar.n_vc); + + vector<size_t> vec_cat, vec_size; + vector<double> vec_z; + + map <string, double> mapRS2z, mapRS2wA; + map <string, string> mapRS2A1; + string file_str; + + //update s_vec, the number of snps in each category + for (size_t i=0; i<cPar.n_vc; i++) { + vec_size.push_back(0); + } + + for (map<string, double>::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) { + vec_size[cPar.mapRS2cat[it->first]]++; + } + + for (size_t i=0; i<cPar.n_vc; i++) { + gsl_vector_set(s_vec, i, vec_size[i]); + } + + //update mapRS2wA using v_pve and s_vec + if (cPar.a_mode==66) { + for (map<string, double>::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) { + mapRS2wA[it->first]=1; + } + } else { + cPar.UpdateWeight (0, mapRS2wK, cPar.ni_test, s_vec, mapRS2wA); + } + + //read in z-scores based on allele 0, and save that into a vector + ReadFile_beta (cPar.file_beta, mapRS2wA, mapRS2A1, mapRS2z); + + //update snp indicator, save weights to w, save z-scores to vec_z, save category label to vec_cat + //sign of z is determined by matching alleles + cPar.UpdateSNPnZ (mapRS2wA, mapRS2A1, mapRS2z, w, z, vec_cat); + + //compute an n by k matrix of X_iWz + cout<<"Calculating Xz ... "<<endl; + + gsl_matrix_set_zero(Xz); + gsl_vector_set_all (w1, 1); + + if (!cPar.file_bfile.empty() ) { + file_str=cPar.file_bfile+".bed"; + PlinkXwz (file_str, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, vec_cat, w1, z, 0, Xz); + } else if (!cPar.file_geno.empty()) { + BimbamXwz (cPar.file_geno, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, vec_cat, w1, z, 0, Xz); + } else if (!cPar.file_mbfile.empty() ){ + MFILEXwz (1, cPar.file_mbfile, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, vec_cat, w1, z, Xz); + } else if (!cPar.file_mgeno.empty()) { + MFILEXwz (0, cPar.file_mgeno, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, vec_cat, w1, z, Xz); + } + /* + cout<<"Xz: "<<endl; + for (size_t i=0; i<5; i++) { + for (size_t j=0; j<cPar.n_vc; j++) { + cout<<gsl_matrix_get (Xz, i, j)<<" "; + } + cout<<endl; + } + */ + if (cPar.a_mode==66) { + gsl_matrix_memcpy (XWz, Xz); + } else if (cPar.a_mode==67) { + cout<<"Calculating XWz ... "<<endl; + + gsl_matrix_set_zero(XWz); + + if (!cPar.file_bfile.empty() ) { + file_str=cPar.file_bfile+".bed"; + PlinkXwz (file_str, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, vec_cat, w, z, 0, XWz); + } else if (!cPar.file_geno.empty()) { + BimbamXwz (cPar.file_geno, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, vec_cat, w, z, 0, XWz); + } else if (!cPar.file_mbfile.empty() ){ + MFILEXwz (1, cPar.file_mbfile, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, vec_cat, w, z, XWz); + } else if (!cPar.file_mgeno.empty()) { + MFILEXwz (0, cPar.file_mgeno, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, vec_cat, w, z, XWz); + } + } + /* + cout<<"XWz: "<<endl; + for (size_t i=0; i<5; i++) { + cout<<gsl_vector_get (w, i)<<endl; + for (size_t j=0; j<cPar.n_vc; j++) { + cout<<gsl_matrix_get (XWz, i, j)<<" "; + } + cout<<endl; + } + */ + //compute an p by k matrix of X_j^TWX_iWz + cout<<"Calculating XtXWz ... "<<endl; + gsl_matrix_set_zero(XtXWz); + + if (!cPar.file_bfile.empty() ) { + file_str=cPar.file_bfile+".bed"; + PlinkXtXwz (file_str, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, XWz, 0, XtXWz); + } else if (!cPar.file_geno.empty()) { + BimbamXtXwz (cPar.file_geno, cPar.d_pace, cPar.indicator_idv, cPar.indicator_snp, XWz, 0, XtXWz); + } else if (!cPar.file_mbfile.empty() ){ + MFILEXtXwz (1, cPar.file_mbfile, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, XWz, XtXWz); + } else if (!cPar.file_mgeno.empty()) { + MFILEXtXwz (0, cPar.file_mgeno, cPar.d_pace, cPar.indicator_idv, cPar.mindicator_snp, XWz, XtXWz); + } + /* + cout<<"XtXWz: "<<endl; + for (size_t i=0; i<5; i++) { + for (size_t j=0; j<cPar.n_vc; j++) { + cout<<gsl_matrix_get (XtXWz, i, j)<<" "; + } + cout<<endl; + } + */ + //compute confidence intervals + CalcCIss(Xz, XWz, XtXWz, S, Svar, w, z, s_vec, vec_cat, cPar.v_pve, cPar.v_se_pve, cPar.pve_total, cPar.se_pve_total, cPar.v_sigma2, cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich); + + //write files + //cPar.WriteMatrix (XWz, "XWz"); + //cPar.WriteMatrix (XtXWz, "XtXWz"); + //cPar.WriteVector (w, "w"); + + gsl_matrix_free(S); + gsl_matrix_free(Svar); + gsl_vector_free(s_ref); + + gsl_matrix_free(Xz); + gsl_matrix_free(XWz); + gsl_matrix_free(XtXWz); + gsl_vector_free(w); + gsl_vector_free(w1); + gsl_vector_free(z); + gsl_vector_free(s_vec); } - - + + //LMM or mvLMM or Eigen-Decomposition if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==31) { //Fit LMM or mvLMM or eigen gsl_matrix *Y=gsl_matrix_alloc (cPar.ni_test, cPar.n_ph); @@ -1290,33 +2060,62 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_matrix *B=gsl_matrix_alloc (Y->size2, W->size2); //B is a d by c matrix gsl_matrix *se_B=gsl_matrix_alloc (Y->size2, W->size2); gsl_matrix *G=gsl_matrix_alloc (Y->size1, Y->size1); - gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); + gsl_matrix *U=gsl_matrix_alloc (Y->size1, Y->size1); gsl_matrix *UtW=gsl_matrix_alloc (Y->size1, W->size2); gsl_matrix *UtY=gsl_matrix_alloc (Y->size1, Y->size2); gsl_vector *eval=gsl_vector_alloc (Y->size1); - - //set covariates matrix W and phenotype matrix Y - //an intercept should be included in W, + gsl_vector *env=gsl_vector_alloc (Y->size1); + gsl_vector *weight=gsl_vector_alloc (Y->size1); + + //set covariates matrix W and phenotype matrix Y + //an intercept should be included in W, cPar.CopyCvtPhen (W, Y, 0); - - //read relatedness matrix G + if (!cPar.file_gxe.empty()) {cPar.CopyGxe (env);} + + //read relatedness matrix G if (!(cPar.file_kin).empty()) { ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;} - + //center matrix G CenterMatrix (G); - + + //is residual weights are provided, then + if (!cPar.file_weight.empty()) { + cPar.CopyWeight (weight); + double d, wi, wj; + for (size_t i=0; i<G->size1; i++) { + wi=gsl_vector_get(weight, i); + for (size_t j=i; j<G->size2; j++) { + wj=gsl_vector_get(weight, j); + d=gsl_matrix_get(G, i, j); + if (wi<=0 || wj<=0) {d=0;} else {d/=sqrt(wi*wj);} + gsl_matrix_set(G, i, j, d); + if (j!=i) {gsl_matrix_set(G, j, i, d);} + } + } + } + //eigen-decomposition and calculate trace_G cout<<"Start Eigen-Decomposition..."<<endl; - time_start=clock(); - + time_start=clock(); + if (cPar.a_mode==31) { cPar.trace_G=EigenDecomp (G, U, eval, 1); } else { cPar.trace_G=EigenDecomp (G, U, eval, 0); } + if (!cPar.file_weight.empty()) { + double wi; + for (size_t i=0; i<U->size1; i++) { + wi=gsl_vector_get(weight, i); + if (wi<=0) {wi=0;} else {wi=sqrt(wi);} + gsl_vector_view Urow=gsl_matrix_row (U, i); + gsl_vector_scale (&Urow.vector, wi); + } + } + cPar.trace_G=0.0; for (size_t i=0; i<eval->size; i++) { if (gsl_vector_get (eval, i)<1e-10) {gsl_vector_set (eval, i, 0);} @@ -1324,14 +2123,14 @@ void GEMMA::BatchRun (PARAM &cPar) } cPar.trace_G/=(double)eval->size; - cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } else { ReadFile_eigenU (cPar.file_ku, cPar.error, U); if (cPar.error==true) {cout<<"error! fail to read the U file. "<<endl; return;} - - ReadFile_eigenD (cPar.file_kd, cPar.error, eval); + + ReadFile_eigenD (cPar.file_kd, cPar.error, eval); if (cPar.error==true) {cout<<"error! fail to read the D file. "<<endl; return;} - + cPar.trace_G=0.0; for (size_t i=0; i<eval->size; i++) { if (gsl_vector_get(eval, i)<1e-10) {gsl_vector_set(eval, i, 0);} @@ -1339,14 +2138,29 @@ void GEMMA::BatchRun (PARAM &cPar) } cPar.trace_G/=(double)eval->size; } - + if (cPar.a_mode==31) { cPar.WriteMatrix(U, "eigenU"); cPar.WriteVector(eval, "eigenD"); - } else { - //calculate UtW and Uty + } else if (!cPar.file_gene.empty() ) { + //calculate UtW and Uty CalcUtX (U, W, UtW); - CalcUtX (U, Y, UtY); + CalcUtX (U, Y, UtY); + + LMM cLmm; + cLmm.CopyFromParam(cPar); + + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); + gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); + + cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype + + cLmm.WriteFiles(); + cLmm.CopyToParam(cPar); + } else { + //calculate UtW and Uty + CalcUtX (U, W, UtW); + CalcUtX (U, Y, UtY); //calculate REMLE/MLE estimate and pve for univariate model if (cPar.n_ph==1) { @@ -1372,10 +2186,10 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i) ); cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i) ); } - + CalcPve (eval, UtW, &UtY_col.vector, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); cPar.PrintSummary(); - + //calculate and output residuals if (cPar.a_mode==5) { gsl_vector *Utu_hat=gsl_vector_alloc (Y->size1); @@ -1383,11 +2197,11 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector *u_hat=gsl_vector_alloc (Y->size1); gsl_vector *e_hat=gsl_vector_alloc (Y->size1); gsl_vector *y_hat=gsl_vector_alloc (Y->size1); - + //obtain Utu and Ute gsl_vector_memcpy (y_hat, &UtY_col.vector); gsl_blas_dgemv (CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); - + double d, u, e; for (size_t i=0; i<eval->size; i++) { d=gsl_vector_get (eval, i); @@ -1396,81 +2210,106 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_vector_set (Utu_hat, i, u); gsl_vector_set (Ute_hat, i, e); } - + //obtain u and e gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu_hat, 0.0, u_hat); gsl_blas_dgemv (CblasNoTrans, 1.0, U, Ute_hat, 0.0, e_hat); - - //output residuals + + //output residuals cPar.WriteVector(u_hat, "residU"); cPar.WriteVector(e_hat, "residE"); - + gsl_vector_free(u_hat); gsl_vector_free(e_hat); gsl_vector_free(y_hat); - } - } - + } + } + //Fit LMM or mvLMM if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4) { - if (cPar.n_ph==1) { + if (cPar.n_ph==1) { LMM cLmm; cLmm.CopyFromParam(cPar); - + gsl_vector_view Y_col=gsl_matrix_column (Y, 0); gsl_vector_view UtY_col=gsl_matrix_column (UtY, 0); - - if (!cPar.file_gene.empty()) { - cLmm.AnalyzeGene (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); //y is the predictor, not the phenotype - } else if (!cPar.file_bfile.empty()) { - cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); - } else { - cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); - } - + + if (!cPar.file_bfile.empty()) { + if (cPar.file_gxe.empty()) { + cLmm.AnalyzePlink (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } else { + cLmm.AnalyzePlinkGXE (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, env); + } + } + // WJA added + else if(!cPar.file_oxford.empty()) { + cLmm.Analyzebgen (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } + else { + if (cPar.file_gxe.empty()) { + cLmm.AnalyzeBimbam (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); + } else { + cLmm.AnalyzeBimbamGXE (U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, env); + } + } + cLmm.WriteFiles(); cLmm.CopyToParam(cPar); - } else { + } else { MVLMM cMvlmm; - cMvlmm.CopyFromParam(cPar); - + cMvlmm.CopyFromParam(cPar); + if (!cPar.file_bfile.empty()) { - cMvlmm.AnalyzePlink (U, eval, UtW, UtY); - } else { - cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY); + if (cPar.file_gxe.empty()) { + cMvlmm.AnalyzePlink (U, eval, UtW, UtY); + } else { + cMvlmm.AnalyzePlinkGXE (U, eval, UtW, UtY, env); + } + } + else if(!cPar.file_oxford.empty()) + { + cMvlmm.Analyzebgen (U, eval, UtW, UtY); + } + else { + if (cPar.file_gxe.empty()) { + cMvlmm.AnalyzeBimbam (U, eval, UtW, UtY); + } else { + cMvlmm.AnalyzeBimbamGXE (U, eval, UtW, UtY, env); + } } - + cMvlmm.WriteFiles(); cMvlmm.CopyToParam(cPar); } } } - - + + //release all matrices and vectors gsl_matrix_free (Y); gsl_matrix_free (W); gsl_matrix_free(B); gsl_matrix_free(se_B); - gsl_matrix_free (G); + gsl_matrix_free (G); gsl_matrix_free (U); gsl_matrix_free (UtW); gsl_matrix_free (UtY); gsl_vector_free (eval); - } - - + gsl_vector_free (env); + } + + //BSLMM if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { gsl_vector *y=gsl_vector_alloc (cPar.ni_test); - gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); + gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); gsl_matrix *G=gsl_matrix_alloc (y->size, y->size); - gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test); - - //set covariates matrix W and phenotype vector y - //an intercept should be included in W, + gsl_matrix *UtX=gsl_matrix_alloc (y->size, cPar.ns_test); + + //set covariates matrix W and phenotype vector y + //an intercept should be included in W, cPar.CopyCvtPhen (W, y, 0); - + //center y, even for case/control data cPar.pheno_mean=CenterVector(y); @@ -1482,32 +2321,32 @@ void GEMMA::BatchRun (PARAM &cPar) //perform BSLMM analysis BSLMM cBslmm; cBslmm.CopyFromParam(cPar); - time_start=clock(); + time_start=clock(); cBslmm.MCMC(UtX, y); cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); cBslmm.CopyToParam(cPar); //else, if rho!=1 } else { - gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); + gsl_matrix *U=gsl_matrix_alloc (y->size, y->size); gsl_vector *eval=gsl_vector_alloc (y->size); gsl_matrix *UtW=gsl_matrix_alloc (y->size, W->size2); gsl_vector *Uty=gsl_vector_alloc (y->size); - - //read relatedness matrix G - if (!(cPar.file_kin).empty()) { + + //read relatedness matrix G + if (!(cPar.file_kin).empty()) { cPar.ReadGenotypes (UtX, G, false); - + //read relatedness matrix G ReadFile_kin (cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, cPar.k_mode, cPar.error, G); if (cPar.error==true) {cout<<"error! fail to read kinship/relatedness file. "<<endl; return;} - + //center matrix G CenterMatrix (G); } else { cPar.ReadGenotypes (UtX, G, true); } - + //eigen-decomposition and calculate trace_G cout<<"Start Eigen-Decomposition..."<<endl; time_start=clock(); @@ -1518,39 +2357,39 @@ void GEMMA::BatchRun (PARAM &cPar) cPar.trace_G+=gsl_vector_get (eval, i); } cPar.trace_G/=(double)eval->size; - cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - - //calculate UtW and Uty + cPar.time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //calculate UtW and Uty CalcUtX (U, W, UtW); CalcUtX (U, y, Uty); - + //calculate REMLE/MLE estimate and pve CalcLambda ('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); CalcLambda ('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0); CalcPve (eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null, cPar.pve_se_null); - + cPar.PrintSummary(); - + //Creat and calcualte UtX, use a large memory cout<<"Calculating UtX..."<<endl; - time_start=clock(); + time_start=clock(); CalcUtX (U, UtX); cPar.time_UtX=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + //perform BSLMM analysis BSLMM cBslmm; cBslmm.CopyFromParam(cPar); - time_start=clock(); - if (cPar.a_mode==12) { //ridge regression + time_start=clock(); + if (cPar.a_mode==12) { //ridge regression cBslmm.RidgeR(U, UtX, Uty, eval, cPar.l_remle_null); } else { //Run MCMC cBslmm.MCMC(U, UtX, Uty, eval, y); } cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); cBslmm.CopyToParam(cPar); - + //release all matrices and vectors - gsl_matrix_free (G); + gsl_matrix_free (G); gsl_matrix_free (U); gsl_matrix_free (UtW); gsl_vector_free (eval); @@ -1560,106 +2399,259 @@ void GEMMA::BatchRun (PARAM &cPar) gsl_matrix_free (W); gsl_vector_free (y); gsl_matrix_free (UtX); - } - - - + } + + + + //LDR + if (cPar.a_mode==14) { + gsl_vector *y=gsl_vector_alloc (cPar.ni_test); + gsl_matrix *W=gsl_matrix_alloc (y->size, cPar.n_cvt); + gsl_matrix *G=gsl_matrix_alloc (1, 1); + vector<vector<unsigned char> > Xt; + + //set covariates matrix W and phenotype vector y + //an intercept is included in W + cPar.CopyCvtPhen (W, y, 0); + + //read in genotype matrix X + cPar.ReadGenotypes (Xt, G, false); + + LDR cLdr; + cLdr.CopyFromParam(cPar); + time_start=clock(); + + cLdr.VB(Xt, W, y); + + cPar.time_opt=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + cLdr.CopyToParam(cPar); + + gsl_vector_free (y); + gsl_matrix_free (W); + gsl_matrix_free (G); + } + cPar.time_total=(clock()-time_begin)/(double(CLOCKS_PER_SEC)*60.0); - + return; } -void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) +void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) { string file_str; file_str=cPar.path_out+"/"+cPar.file_out; file_str+=".log.txt"; - + ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) {cout<<"error writing log file: "<<file_str.c_str()<<endl; return;} - + outfile<<"##"<<endl; outfile<<"## GEMMA Version = "<<version<<endl; - + outfile<<"##"<<endl; outfile<<"## Command Line Input = "; - for(int i = 1; i < argc; i++) { + for(int i = 0; i < argc; i++) { outfile<<argv[i]<<" "; } outfile<<endl; outfile<<"##"<<endl; - time_t rawtime; + time_t rawtime; time(&rawtime); tm *ptm = localtime (&rawtime); - outfile<<"## Date = "<<asctime(ptm)<<endl; + outfile<<"## Date = "<<asctime(ptm); //ptm->tm_year<<":"<<ptm->tm_month<<":"<<ptm->tm_day":"<<ptm->tm_hour<<":"<<ptm->tm_min<<endl; - + outfile<<"##"<<endl; outfile<<"## Summary Statistics:"<<endl; - outfile<<"## number of total individuals = "<<cPar.ni_total<<endl; - if (cPar.a_mode==43) { - outfile<<"## number of analyzed individuals = "<<cPar.ni_cvt<<endl; - outfile<<"## number of individuals with full phenotypes = "<<cPar.ni_test<<endl; - } else { - outfile<<"## number of analyzed individuals = "<<cPar.ni_test<<endl; - } - outfile<<"## number of covariates = "<<cPar.n_cvt<<endl; - outfile<<"## number of phenotypes = "<<cPar.n_ph<<endl; - if (cPar.a_mode==43) { - outfile<<"## number of observed data = "<<cPar.np_obs<<endl; - outfile<<"## number of missing data = "<<cPar.np_miss<<endl; - } - if (cPar.a_mode==61) { - outfile<<"## number of variance components = "<<cPar.n_vc<<endl; - } - - if (!(cPar.file_gene).empty()) { - outfile<<"## number of total genes = "<<cPar.ng_total<<endl; - outfile<<"## number of analyzed genes = "<<cPar.ng_test<<endl; - } else if (cPar.file_epm.empty()) { - outfile<<"## number of total SNPs = "<<cPar.ns_total<<endl; - outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + if (!cPar.file_cor.empty() || !cPar.file_study.empty() || !cPar.file_mstudy.empty() ) { + outfile<<"## number of total individuals in the sample = "<<cPar.ni_study<<endl; + outfile<<"## number of total individuals in the reference = "<<cPar.ni_ref<<endl; + //outfile<<"## number of total SNPs in the sample = "<<cPar.ns_study<<endl; + //outfile<<"## number of total SNPs in the reference panel = "<<cPar.ns_ref<<endl; + //outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + //outfile<<"## number of analyzed SNP pairs = "<<cPar.ns_pair<<endl; + outfile<<"## number of variance components = "<<cPar.n_vc<<endl; + + outfile<<"## pve estimates = "; + for (size_t i=0; i<cPar.v_pve.size(); i++) { + outfile<<" "<<cPar.v_pve[i]; + } + outfile<<endl; + + outfile<<"## se(pve) = "; + for (size_t i=0; i<cPar.v_se_pve.size(); i++) { + outfile<<" "<<cPar.v_se_pve[i]; + } + outfile<<endl; + + if (cPar.n_vc>1) { + outfile<<"## total pve = "<<cPar.pve_total<<endl; + outfile<<"## se(total pve) = "<<cPar.se_pve_total<<endl; + } + + outfile<<"## sigma2 per snp = "; + for (size_t i=0; i<cPar.v_sigma2.size(); i++) { + outfile<<" "<<cPar.v_sigma2[i]; + } + outfile<<endl; + + outfile<<"## se(sigma2 per snp) = "; + for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) { + outfile<<" "<<cPar.v_se_sigma2[i]; + } + outfile<<endl; + + outfile<<"## enrichment = "; + for (size_t i=0; i<cPar.v_enrich.size(); i++) { + outfile<<" "<<cPar.v_enrich[i]; + } + outfile<<endl; + + outfile<<"## se(enrichment) = "; + for (size_t i=0; i<cPar.v_se_enrich.size(); i++) { + outfile<<" "<<cPar.v_se_enrich[i]; + } + outfile<<endl; + } else if (!cPar.file_beta.empty() && (cPar.a_mode==61 || cPar.a_mode==62) ) { + outfile<<"## number of total individuals in the sample = "<<cPar.ni_study<<endl; + outfile<<"## number of total individuals in the reference = "<<cPar.ni_total<<endl; + outfile<<"## number of total SNPs in the sample = "<<cPar.ns_study<<endl; + outfile<<"## number of total SNPs in the reference panel = "<<cPar.ns_total<<endl; + outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + outfile<<"## number of variance components = "<<cPar.n_vc<<endl; + } else if (!cPar.file_beta.empty() && (cPar.a_mode==66 || cPar.a_mode==67) ) { + outfile<<"## number of total individuals in the sample = "<<cPar.ni_total<<endl; + outfile<<"## number of total individuals in the reference = "<<cPar.ni_ref<<endl; + outfile<<"## number of total SNPs in the sample = "<<cPar.ns_total<<endl; + outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + outfile<<"## number of variance components = "<<cPar.n_vc<<endl; + + outfile<<"## pve estimates = "; + for (size_t i=0; i<cPar.v_pve.size(); i++) { + outfile<<" "<<cPar.v_pve[i]; + } + outfile<<endl; + + outfile<<"## se(pve) = "; + for (size_t i=0; i<cPar.v_se_pve.size(); i++) { + outfile<<" "<<cPar.v_se_pve[i]; + } + outfile<<endl; + + if (cPar.n_vc>1) { + outfile<<"## total pve = "<<cPar.pve_total<<endl; + outfile<<"## se(total pve) = "<<cPar.se_pve_total<<endl; + } + + outfile<<"## sigma2 per snp = "; + for (size_t i=0; i<cPar.v_sigma2.size(); i++) { + outfile<<" "<<cPar.v_sigma2[i]; + } + outfile<<endl; + + outfile<<"## se(sigma2 per snp) = "; + for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) { + outfile<<" "<<cPar.v_se_sigma2[i]; + } + outfile<<endl; + + outfile<<"## enrichment = "; + for (size_t i=0; i<cPar.v_enrich.size(); i++) { + outfile<<" "<<cPar.v_enrich[i]; + } + outfile<<endl; + + outfile<<"## se(enrichment) = "; + for (size_t i=0; i<cPar.v_se_enrich.size(); i++) { + outfile<<" "<<cPar.v_se_enrich[i]; + } + outfile<<endl; } else { - outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + outfile<<"## number of total individuals = "<<cPar.ni_total<<endl; + + if (cPar.a_mode==43) { + outfile<<"## number of analyzed individuals = "<<cPar.ni_cvt<<endl; + outfile<<"## number of individuals with full phenotypes = "<<cPar.ni_test<<endl; + } else if (cPar.a_mode!=27 && cPar.a_mode!=28) { + outfile<<"## number of analyzed individuals = "<<cPar.ni_test<<endl; + } + + outfile<<"## number of covariates = "<<cPar.n_cvt<<endl; + outfile<<"## number of phenotypes = "<<cPar.n_ph<<endl; + if (cPar.a_mode==43) { + outfile<<"## number of observed data = "<<cPar.np_obs<<endl; + outfile<<"## number of missing data = "<<cPar.np_miss<<endl; + } + if (cPar.a_mode==25 || cPar.a_mode==26 || cPar.a_mode==27 || cPar.a_mode==28 || cPar.a_mode==61 || cPar.a_mode==62 || cPar.a_mode==66 || cPar.a_mode==67) { + outfile<<"## number of variance components = "<<cPar.n_vc<<endl; + } + + if (!(cPar.file_gene).empty()) { + outfile<<"## number of total genes = "<<cPar.ng_total<<endl; + outfile<<"## number of analyzed genes = "<<cPar.ng_test<<endl; + } else if (cPar.file_epm.empty()) { + outfile<<"## number of total SNPs = "<<cPar.ns_total<<endl; + outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + } else { + outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; + } + + if (cPar.a_mode==13) { + outfile<<"## number of cases = "<<cPar.ni_case<<endl; + outfile<<"## number of controls = "<<cPar.ni_control<<endl; + } } - - if (cPar.a_mode==13) { - outfile<<"## number of cases = "<<cPar.ni_case<<endl; - outfile<<"## number of controls = "<<cPar.ni_control<<endl; - } - - - if (cPar.a_mode==61) { - // outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl; - if (cPar.n_ph==1) { - outfile<<"## pve estimate in the null model = "; - for (size_t i=0; i<cPar.v_pve.size(); i++) { - outfile<<" "<<cPar.v_pve[i]; - } - outfile<<endl; - - outfile<<"## se(pve) in the null model = "; - for (size_t i=0; i<cPar.v_se_pve.size(); i++) { - outfile<<" "<<cPar.v_se_pve[i]; - } - outfile<<endl; - - outfile<<"## sigma2 estimate in the null model = "; - for (size_t i=0; i<cPar.v_sigma2.size(); i++) { - outfile<<" "<<cPar.v_sigma2[i]; - } - outfile<<endl; - outfile<<"## se(sigma2) in the null model = "; - for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) { - outfile<<" "<<cPar.v_se_sigma2[i]; - } - outfile<<endl; + if ( (cPar.a_mode==61 || cPar.a_mode==62) && cPar.file_cor.empty() && cPar.file_study.empty() && cPar.file_mstudy.empty() ) { + // outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl; + if (cPar.n_ph==1) { + outfile<<"## pve estimates = "; + for (size_t i=0; i<cPar.v_pve.size(); i++) { + outfile<<" "<<cPar.v_pve[i]; + } + outfile<<endl; + + outfile<<"## se(pve) = "; + for (size_t i=0; i<cPar.v_se_pve.size(); i++) { + outfile<<" "<<cPar.v_se_pve[i]; + } + outfile<<endl; + + if (cPar.n_vc>1) { + outfile<<"## total pve = "<<cPar.pve_total<<endl; + outfile<<"## se(total pve) = "<<cPar.se_pve_total<<endl; + } + + outfile<<"## sigma2 estimates = "; + for (size_t i=0; i<cPar.v_sigma2.size(); i++) { + outfile<<" "<<cPar.v_sigma2[i]; + } + outfile<<endl; + + outfile<<"## se(sigma2) = "; + for (size_t i=0; i<cPar.v_se_sigma2.size(); i++) { + outfile<<" "<<cPar.v_se_sigma2[i]; + } + outfile<<endl; + + if (!cPar.file_beta.empty() ) { + outfile<<"## enrichment = "; + for (size_t i=0; i<cPar.v_enrich.size(); i++) { + outfile<<" "<<cPar.v_enrich[i]; + } + outfile<<endl; + + outfile<<"## se(enrichment) = "; + for (size_t i=0; i<cPar.v_se_enrich.size(); i++) { + outfile<<" "<<cPar.v_se_enrich[i]; + } + outfile<<endl; + } /* outfile<<"## beta estimate in the null model = "; for (size_t i=0; i<cPar.beta_remle_null.size(); i++) { @@ -1672,19 +2664,19 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; */ - } + } } - + if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { outfile<<"## REMLE log-likelihood in the null model = "<<cPar.logl_remle_H0<<endl; outfile<<"## MLE log-likelihood in the null model = "<<cPar.logl_mle_H0<<endl; if (cPar.n_ph==1) { //outfile<<"## lambda REMLE estimate in the null (linear mixed) model = "<<cPar.l_remle_null<<endl; - //outfile<<"## lambda MLE estimate in the null (linear mixed) model = "<<cPar.l_mle_null<<endl; + //outfile<<"## lambda MLE estimate in the null (linear mixed) model = "<<cPar.l_mle_null<<endl; outfile<<"## pve estimate in the null model = "<<cPar.pve_null<<endl; - outfile<<"## se(pve) in the null model = "<<cPar.pve_se_null<<endl; + outfile<<"## se(pve) in the null model = "<<cPar.pve_se_null<<endl; outfile<<"## vg estimate in the null model = "<<cPar.vg_remle_null<<endl; - outfile<<"## ve estimate in the null model = "<<cPar.ve_remle_null<<endl; + outfile<<"## ve estimate in the null model = "<<cPar.ve_remle_null<<endl; outfile<<"## beta estimate in the null model = "; for (size_t i=0; i<cPar.beta_remle_null.size(); i++) { outfile<<" "<<cPar.beta_remle_null[i]; @@ -1695,10 +2687,10 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) outfile<<" "<<cPar.se_beta_remle_null[i]; } outfile<<endl; - + } else { size_t c; - outfile<<"## REMLE estimate for Vg in the null model: "<<endl; + outfile<<"## REMLE estimate for Vg in the null model: "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<=i; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1706,7 +2698,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; } - outfile<<"## se(Vg): "<<endl; + outfile<<"## se(Vg): "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<=i; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1714,7 +2706,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; } - outfile<<"## REMLE estimate for Ve in the null model: "<<endl; + outfile<<"## REMLE estimate for Ve in the null model: "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<=i; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1722,7 +2714,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; } - outfile<<"## se(Ve): "<<endl; + outfile<<"## se(Ve): "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<=i; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1730,7 +2722,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; } - + outfile<<"## MLE estimate for Vg in the null model: "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<cPar.n_ph; j++) { @@ -1739,7 +2731,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; } - outfile<<"## se(Vg): "<<endl; + outfile<<"## se(Vg): "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<=i; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1747,7 +2739,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; } - outfile<<"## MLE estimate for Ve in the null model: "<<endl; + outfile<<"## MLE estimate for Ve in the null model: "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<cPar.n_ph; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1755,7 +2747,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; } - outfile<<"## se(Ve): "<<endl; + outfile<<"## se(Ve): "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<=i; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1781,15 +2773,15 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } } } - + /* if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { if (cPar.n_ph==1) { outfile<<"## REMLE vg estimate in the null model = "<<cPar.vg_remle_null<<endl; - outfile<<"## REMLE ve estimate in the null model = "<<cPar.ve_remle_null<<endl; + outfile<<"## REMLE ve estimate in the null model = "<<cPar.ve_remle_null<<endl; } else { size_t c; - outfile<<"## REMLE estimate for Vg in the null model: "<<endl; + outfile<<"## REMLE estimate for Vg in the null model: "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<=i; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1797,7 +2789,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } outfile<<endl; } - outfile<<"## REMLE estimate for Ve in the null model: "<<endl; + outfile<<"## REMLE estimate for Ve in the null model: "<<endl; for (size_t i=0; i<cPar.n_ph; i++) { for (size_t j=0; j<=i; j++) { c=(2*cPar.n_ph-min(i,j)+1)*min(i,j)/2+max(i,j)-min(i,j); @@ -1808,15 +2800,15 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } } */ - - + + if (cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { - outfile<<"## estimated mean = "<<cPar.pheno_mean<<endl; + outfile<<"## estimated mean = "<<cPar.pheno_mean<<endl; } - - if (cPar.a_mode==11 || cPar.a_mode==13) { + + if (cPar.a_mode==11 || cPar.a_mode==13) { outfile<<"##"<<endl; - outfile<<"## MCMC related:"<<endl; + outfile<<"## MCMC related:"<<endl; outfile<<"## initial value of h = "<<cPar.cHyp_initial.h<<endl; outfile<<"## initial value of rho = "<<cPar.cHyp_initial.rho<<endl; outfile<<"## initial value of pi = "<<exp(cPar.cHyp_initial.logp)<<endl; @@ -1824,7 +2816,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) outfile<<"## random seed = "<<cPar.randseed<<endl; outfile<<"## acceptance ratio = "<<(double)cPar.n_accept/(double)((cPar.w_step+cPar.s_step)*cPar.n_mh)<<endl; } - + outfile<<"##"<<endl; outfile<<"## Computation Time:"<<endl; outfile<<"## total computation time = "<<cPar.time_total<<" min "<<endl; @@ -1837,7 +2829,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) } if (cPar.a_mode==1 || cPar.a_mode==2 || cPar.a_mode==3 || cPar.a_mode==4 || cPar.a_mode==5 || cPar.a_mode==11 || cPar.a_mode==12 || cPar.a_mode==13) { outfile<<"## time on eigen-decomposition = "<<cPar.time_eigen<<" min "<<endl; - outfile<<"## time on calculating UtX = "<<cPar.time_UtX<<" min "<<endl; + outfile<<"## time on calculating UtX = "<<cPar.time_UtX<<" min "<<endl; } if ((cPar.a_mode>=1 && cPar.a_mode<=4) || (cPar.a_mode>=51 && cPar.a_mode<=54) ) { outfile<<"## time on optimization = "<<cPar.time_opt<<" min "<<endl; @@ -1855,7 +2847,7 @@ void GEMMA::WriteLog (int argc, char ** argv, PARAM &cPar) outfile<<"## time on predicting phenotypes = "<<cPar.time_opt<<" min "<<endl; } outfile<<"##"<<endl; - + outfile.close(); outfile.clear(); return; @@ -90,6 +90,21 @@ void ProgressBar (string str, double p, double total, double ratio) return; } + +bool isBlankLine(char const* line) +{ + for ( char const* cp = line; *cp; ++cp ) + { + if ( !isspace(*cp) ) return false; + } + return true; +} + +bool isBlankLine(std::string const& line) +{ + return isBlankLine(line.c_str()); +} + // in case files are ended with "\r" or "\r\n" std::istream& safeGetline(std::istream& is, std::string& t) { @@ -129,7 +144,10 @@ bool ReadFile_snps (const string &file_snps, set<string> &setSnps) { setSnps.clear(); - ifstream infile (file_snps.c_str(), ifstream::in); + //ifstream infile (file_snps.c_str(), ifstream::in); + //if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;} + + igzstream infile (file_snps.c_str(), igzstream::in); if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;} string line; @@ -147,6 +165,54 @@ bool ReadFile_snps (const string &file_snps, set<string> &setSnps) } +bool ReadFile_snps_header (const string &file_snps, set<string> &setSnps) +{ + setSnps.clear(); + + //ifstream infile (file_snps.c_str(), ifstream::in); + //if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;} + + igzstream infile (file_snps.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open snps file: "<<file_snps<<endl; return false;} + + string line, rs, chr, pos; + char *ch_ptr; + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + if (header.rs_col==0 && (header.chr_col==0 || header.pos_col==0) ) { + cout<<"missing rs id in the hearder"<<endl; + } + + while (!safeGetline(infile, line).eof()) { + if (isBlankLine(line)) {continue;} + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + for (size_t i=0; i<header.coln; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} + if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} + if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr;} + + ch_ptr=strtok (NULL, " , \t"); + } + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + setSnps.insert(rs); + } + + infile.close(); + infile.clear(); + + return true; +} + + //Read log file bool ReadFile_log (const string &file_log, double &pheno_mean) { @@ -353,7 +419,7 @@ bool ReadFile_cvt (const string &file_cvt, vector<int> &indicator_cvt, vector<ve //Read .bim file bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo) { - snpInfo.clear(); + snpInfo.clear(); ifstream infile (file_bim.c_str(), ifstream::in); if (!infile) {cout<<"error opening .bim file: "<<file_bim<<endl; return false;} @@ -662,7 +728,7 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl //start reading snps and doing association test for (size_t t=0; t<ns_total; ++t) { - infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers if (setSnps.size()!=0 && setSnps.count(snpInfo[t].rs_number)==0) { snpInfo[t].n_miss=-9; @@ -710,11 +776,10 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;} - if (hwe_level!=1 && maf_level!=-1) { + if (hwe_level!=0 && maf_level!=-1) { if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;} } - //filter SNP if it is correlated with W //unless W has only one column, of 1s for (size_t i=0; i<genotype->size; ++i) { @@ -1054,6 +1119,11 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k gsl_vector *geno=gsl_vector_alloc (ni_total); gsl_vector *geno_miss=gsl_vector_alloc (ni_total); + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (ni_total, msize); + gsl_matrix_set_zero(Xlarge); + size_t ns_test=0; for (size_t t=0; t<indicator_snp.size(); ++t) { !safeGetline(infile, line).eof(); @@ -1090,6 +1160,7 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k gsl_vector_add_constant (geno, -1.0*geno_mean); + /* if (geno_var!=0) { if (k_mode==1) { gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); @@ -1101,8 +1172,23 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k cout<<"Unknown kinship mode."<<endl; } } + */ + + if (k_mode==2 && geno_var!=0) {gsl_vector_scale (geno, 1.0/sqrt(geno_var));} + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_test%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + ns_test++; - } + + if (ns_test%msize==0) { + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + } + } + + if (ns_test%msize!=0) { + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + } cout<<endl; gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test); @@ -1116,6 +1202,7 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k gsl_vector_free (geno); gsl_vector_free (geno_miss); + gsl_matrix_free (Xlarge); infile.close(); infile.clear(); @@ -1146,11 +1233,16 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m size_t ns_test=0; int n_bit; + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (ni_total, msize); + gsl_matrix_set_zero(Xlarge); + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } - //print the first three majic numbers + //print the first three magic numbers for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; @@ -1196,14 +1288,30 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m gsl_vector_add_constant (geno, -1.0*geno_mean); + /* if (geno_var!=0) { if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);} else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);} else {cout<<"Unknown kinship mode."<<endl;} } + */ + + if (k_mode==2 && geno_var!=0) {gsl_vector_scale (geno, 1.0/sqrt(geno_var));} + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_test%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); ns_test++; - } + + if (ns_test%msize==0) { + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + } + } + + if (ns_test%msize!=0) { + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + } + cout<<endl; gsl_matrix_scale (matrix_kin, 1.0/(double)ns_test); @@ -1216,6 +1324,7 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m } gsl_vector_free (geno); + gsl_matrix_free (Xlarge); infile.close(); infile.clear(); @@ -2053,7 +2162,7 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs uint16_t unzipped_data[3*bgen_N]; if (setSnps.size()!=0 && setSnps.count(rs)==0) { - SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, -9}; + SNPINFO sInfo={"-9", rs, -9, -9, minor, major, -9, -9, (long int) -9}; snpInfo.push_back(sInfo); indicator_snp.push_back(0); if(CompressedSNPBlocks) @@ -2394,18 +2503,18 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k //read header to determine which column contains which item bool ReadHeader (const string &line, HEADER &header) { - string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID"}; - set<string> rs_set(rs_ptr, rs_ptr+10); + string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID","MarkerName"}; + set<string> rs_set(rs_ptr, rs_ptr+11); string chr_ptr[]={"chr","CHR"}; set<string> chr_set(chr_ptr, chr_ptr+2); string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"}; set<string> pos_set(pos_ptr, pos_ptr+8); string cm_ptr[]={"cm","CM"}; set<string> cm_set(cm_ptr, cm_ptr+2); - string a1_ptr[]={"a1","A1","allele1","ALLELE1"}; - set<string> a1_set(a1_ptr, a1_ptr+4); - string a0_ptr[]={"a0","A0","allele0","ALLELE0"}; - set<string> a0_set(a0_ptr, a0_ptr+4); + string a1_ptr[]={"a1","A1","allele1","ALLELE1","Allele1","INC_ALLELE"}; + set<string> a1_set(a1_ptr, a1_ptr+5); + string a0_ptr[]={"a0","A0","allele0","ALLELE0","Allele0","a2","A2","allele2","ALLELE2","Allele2","DEC_ALLELE"}; + set<string> a0_set(a0_ptr, a0_ptr+10); string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"}; set<string> z_set(z_ptr, z_ptr+6); @@ -2424,9 +2533,13 @@ bool ReadHeader (const string &line, HEADER &header) set<string> nmis_set(nmis_ptr, nmis_ptr+6); string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"}; set<string> nobs_set(nobs_ptr, nobs_ptr+4); + string ncase_ptr[]={"ncase","NCASE","n_case","N_CASE"}; + set<string> ncase_set(ncase_ptr, ncase_ptr+4); + string ncontrol_ptr[]={"ncontrol","NCONTROL","n_control","N_CONTROL"}; + set<string> ncontrol_set(ncontrol_ptr, ncontrol_ptr+4); - string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"}; - set<string> af_set(af_ptr, af_ptr+10); + string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY","Freq.Allele1.HapMapCEU","FreqAllele1HapMapCEU", "Freq1.Hapmap"}; + set<string> af_set(af_ptr, af_ptr+13); string var_ptr[]={"var","VAR"}; set<string> var_set(var_ptr, var_ptr+2); @@ -2435,7 +2548,7 @@ bool ReadHeader (const string &line, HEADER &header) string cor_ptr[]={"cor","COR","r","R"}; set<string> cor_set(cor_ptr, cor_ptr+4); - header.rs_col=0; header.chr_col=0; header.pos_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; + header.rs_col=0; header.chr_col=0; header.pos_col=0; header.cm_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.ncase_col=0; header.ncontrol_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; char *ch_ptr; string type; @@ -2472,6 +2585,10 @@ bool ReadHeader (const string &line, HEADER &header) if (header.nmis_col==0) {header.nmis_col=header.coln+1;} else {cout<<"error! more than two n_mis columns in the file."<<endl; n_error++;} } else if (nobs_set.count(type)!=0) { if (header.nobs_col==0) {header.nobs_col=header.coln+1;} else {cout<<"error! more than two n_obs columns in the file."<<endl; n_error++;} + } else if (ncase_set.count(type)!=0) { + if (header.ncase_col==0) {header.ncase_col=header.coln+1;} else {cout<<"error! more than two n_case columns in the file."<<endl; n_error++;} + } else if (ncontrol_set.count(type)!=0) { + if (header.ncontrol_col==0) {header.ncontrol_col=header.coln+1;} else {cout<<"error! more than two n_control columns in the file."<<endl; n_error++;} } else if (ws_set.count(type)!=0) { if (header.ws_col==0) {header.ws_col=header.coln+1;} else {cout<<"error! more than two window_size columns in the file."<<endl; n_error++;} } else if (af_set.count(type)!=0) { @@ -2576,8 +2693,31 @@ bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_ +bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, size_t &n_vc) +{ + mapRS2cat.clear(); + + igzstream infile (file_mcat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mcategory file: "<<file_mcat<<endl; return false;} + + string file_name; + map<string, size_t> mapRS2cat_tmp; + size_t n_vc_tmp, t=0; + + while (!safeGetline(infile, file_name).eof()) { + mapRS2cat_tmp.clear(); + ReadFile_cat (file_name, mapRS2cat_tmp, n_vc_tmp); + mapRS2cat.insert(mapRS2cat_tmp.begin(), mapRS2cat_tmp.end()); + if (t==0) {n_vc=n_vc_tmp;} else {n_vc=max(n_vc, n_vc_tmp);} + t++; + } + + return true; +} + + //read bimbam mean genotype file and calculate kinship matrix; this time, the kinship matrix is not centered, and can contain multiple K matrix -bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin) +bool BimbamKin (const string &file_geno, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) { igzstream infile (file_geno.c_str(), igzstream::in); //ifstream infile (file_geno.c_str(), ifstream::in); @@ -2593,6 +2733,17 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> gsl_vector *geno=gsl_vector_alloc (ni_test); gsl_vector *geno_miss=gsl_vector_alloc (ni_test); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + size_t n_vc=matrix_kin->size2/ni_test, i_vc; string rs; vector<size_t> ns_vec; @@ -2600,6 +2751,11 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> ns_vec.push_back(0); } + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (ni_test, msize*n_vc); + gsl_matrix_set_zero(Xlarge); + size_t ns_test=0; for (size_t t=0; t<indicator_snp.size(); ++t) { !safeGetline(infile, line).eof(); @@ -2640,49 +2796,85 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} } - //this line is new; removed - //gsl_vector_add_constant (geno, -1.0*geno_mean); + gsl_vector_add_constant (geno, -1.0*geno_mean); - if (geno_var!=0) { - mapRS2var[rs]=geno_var; + gsl_blas_dgemv (CblasTrans, 1.0, W, geno, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_dgemv (CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno); + gsl_blas_ddot (geno, geno, &geno_var); + geno_var/=(double)ni_test; - if (k_mode==1) { - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { + if (geno_var!=0 && (mapRS2weight.size()==0 || mapRS2weight.count(rs)!=0) ) { + if (mapRS2weight.size()==0) { + d=1.0/geno_var; + } else { + d=mapRS2weight.at(rs)/geno_var; + } + + /* + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_blas_dsyr (CblasUpper, d, geno, matrix_kin); + ns_vec[0]++; + } else if (mapRS2cat.count(rs)!=0) { i_vc=mapRS2cat.at(rs); ns_vec[i_vc]++; gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, 1.0, geno, &kin_sub.matrix); + gsl_blas_dsyr (CblasUpper, d, geno, &kin_sub.matrix); + //eigenlib_dsyr (1.0, geno, matrix_kin); + } + */ + + gsl_vector_scale (geno, sqrt(d)); + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_vec[0]%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + ns_vec[0]++; + + if (ns_vec[0]%msize==0) { + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); } + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); - //eigenlib_dsyr (1.0, geno, matrix_kin); - } else if (k_mode==2) { - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { - i_vc=mapRS2cat.at(rs); - ns_vec[i_vc]++; + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, msize*i_vc+ns_vec[i_vc]%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + + ns_vec[i_vc]++; + + if (ns_vec[i_vc]%msize==0) { + gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, &kin_sub.matrix); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + + gsl_matrix_set_zero(&X_sub.matrix); } - } else { - cout<<"Unknown kinship mode."<<endl; } + } ns_test++; - } + + } + + for (size_t i_vc=0; i_vc<n_vc; i_vc++) { + if (ns_vec[i_vc]%msize!=0) { + gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + } + } + cout<<endl; for (size_t t=0; t<n_vc; t++) { - if (ns_vec[t]!=0) {gsl_matrix_scale (matrix_kin, 1.0/(double)ns_vec[t]);} + gsl_vector_set(vector_ns, t, ns_vec[t]); for (size_t i=0; i<ni_test; ++i) { - for (size_t j=0; j<i; ++j) { + for (size_t j=0; j<=i; ++j) { d=gsl_matrix_get (matrix_kin, j, i+ni_test*t); + d/=(double)ns_vec[t]; gsl_matrix_set (matrix_kin, i, j+ni_test*t, d); + gsl_matrix_set (matrix_kin, j, i+ni_test*t, d); } } } @@ -2690,6 +2882,14 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> gsl_vector_free (geno); gsl_vector_free (geno_miss); + gsl_vector_free (Wtx); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); + + gsl_matrix_free (Xlarge); + infile.close(); infile.clear(); @@ -2702,7 +2902,7 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> -bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin) +bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) { ifstream infile (file_bed.c_str(), ios::binary); if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} @@ -2717,6 +2917,17 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> & size_t ni_total=indicator_idv.size(); gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + size_t ns_test=0; int n_bit; @@ -2727,6 +2938,11 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> & ns_vec.push_back(0); } + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (ni_test, msize*n_vc); + gsl_matrix_set_zero(Xlarge); + //calculate n_bit and c, the number of bit for each snp if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } @@ -2780,65 +2996,97 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> & if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);} } - //this line is new; removed - //gsl_vector_add_constant (geno, -1.0*geno_mean); + gsl_vector_add_constant (geno, -1.0*geno_mean); + + gsl_blas_dgemv (CblasTrans, 1.0, W, geno, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_dgemv (CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno); + gsl_blas_ddot (geno, geno, &geno_var); + geno_var/=(double)ni_test; + + if (geno_var!=0 && (mapRS2weight.size()==0 || mapRS2weight.count(rs)!=0) ) { + if (mapRS2weight.size()==0) { + d=1.0/geno_var; + } else { + d=mapRS2weight.at(rs)/geno_var; + } + + /* + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_blas_dsyr (CblasUpper, d, geno, matrix_kin); + ns_vec[0]++; + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); + ns_vec[i_vc]++; + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + gsl_blas_dsyr (CblasUpper, d, geno, &kin_sub.matrix); + } + */ + + gsl_vector_scale (geno, sqrt(d)); + if (n_vc==1 || mapRS2cat.size()==0 ) { + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_vec[0]%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + ns_vec[0]++; + + if (ns_vec[0]%msize==0) { + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + } + } else if (mapRS2cat.count(rs)!=0) { + i_vc=mapRS2cat.at(rs); + + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, msize*i_vc+ns_vec[i_vc]%msize); + gsl_vector_memcpy (&Xlarge_col.vector, geno); + + ns_vec[i_vc]++; + + if (ns_vec[i_vc]%msize==0) { + gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + + gsl_matrix_set_zero(&X_sub.matrix); + } + } - if (geno_var!=0) { - mapRS2var[rs]=geno_var; - if (k_mode==1) { - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { - i_vc=mapRS2cat.at(rs); - ns_vec[i_vc]++; - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, 1.0, geno, &kin_sub.matrix); - } - } else if (k_mode==2) { - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { - i_vc=mapRS2cat.at(rs); - ns_vec[i_vc]++; - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, &kin_sub.matrix); - } - } else { - cout<<"Unknown kinship mode."<<endl; - } - } + } ns_test++; - } + } + + for (size_t i_vc=0; i_vc<n_vc; i_vc++) { + if (ns_vec[i_vc]%msize!=0) { + gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); + gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + } + } + cout<<endl; for (size_t t=0; t<n_vc; t++) { - if (ns_vec[t]!=0) {gsl_matrix_scale (matrix_kin, 1.0/(double)ns_vec[t]);} + gsl_vector_set(vector_ns, t, ns_vec[t]); for (size_t i=0; i<ni_test; ++i) { - for (size_t j=0; j<i; ++j) { + for (size_t j=0; j<=i; ++j) { d=gsl_matrix_get (matrix_kin, j, i+ni_test*t); + d/=(double)ns_vec[t]; gsl_matrix_set (matrix_kin, i, j+ni_test*t, d); - //cout<<d<<" "; + gsl_matrix_set (matrix_kin, j, i+ni_test*t, d); } - //cout<<endl; - } - } - - d=0; - for (size_t i=0; i<ni_test; ++i) { - for (size_t j=0; j<ni_test; ++j) { - d+=gsl_matrix_get (matrix_kin, i, j)*gsl_matrix_get (matrix_kin, i, j); } } - d/=(double)ni_test*(double)ni_test; - //cout<<"trace = "<<scientific<<d-1/(double)ni_test<<endl; + gsl_vector_free (geno); + gsl_vector_free (Wtx); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); - gsl_vector_free (geno); + gsl_matrix_free (Xlarge); infile.close(); infile.clear(); @@ -2848,34 +3096,176 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> & -//read var file, store mapRS2var -bool ReadFile_var (const string &file_var, map<string, double> &mapRS2var) +bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int display_pace, const vector<int> &indicator_idv, const vector<vector<int> > &mindicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<vector<SNPINFO> > &msnpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) { - mapRS2var.clear(); + size_t n_vc=vector_ns->size, ni_test=matrix_kin->size1; + gsl_matrix_set_zero(matrix_kin); + gsl_vector_set_zero(vector_ns); + + igzstream infile (file_mfile.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false;} - igzstream infile (file_var.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open var file: "<<file_var<<endl; return false;} + string file_name; + + gsl_matrix *kin_tmp=gsl_matrix_alloc (matrix_kin->size1, matrix_kin->size2); + gsl_vector *ns_tmp=gsl_vector_alloc (vector_ns->size); + + size_t l=0; + double d; + while (!safeGetline(infile, file_name).eof()) { + gsl_matrix_set_zero(kin_tmp); + gsl_vector_set_zero(ns_tmp); + + if (mfile_mode==1) { + file_name+=".bed"; + PlinkKin (file_name, display_pace, indicator_idv, mindicator_snp[l], mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); + } else { + BimbamKin (file_name, display_pace, indicator_idv, mindicator_snp[l], mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); + } + + //add ns + gsl_vector_add(vector_ns, ns_tmp); + + //add kin + for (size_t t=0; t<n_vc; t++) { + for (size_t i=0; i<ni_test; ++i) { + for (size_t j=0; j<=i; ++j) { + d=gsl_matrix_get (matrix_kin, j, i+ni_test*t)+gsl_matrix_get (kin_tmp, j, i+ni_test*t)*gsl_vector_get(ns_tmp, t); + + gsl_matrix_set (matrix_kin, i, j+ni_test*t, d); + gsl_matrix_set (matrix_kin, j, i+ni_test*t, d); + } + } + } + l++; + } + + //renormalize kin + for (size_t t=0; t<n_vc; t++) { + for (size_t i=0; i<ni_test; ++i) { + for (size_t j=0; j<=i; ++j) { + d=gsl_matrix_get (matrix_kin, j, i+ni_test*t)/gsl_vector_get(vector_ns, t); + + gsl_matrix_set (matrix_kin, i, j+ni_test*t, d); + gsl_matrix_set (matrix_kin, j, i+ni_test*t, d); + + } + } + } + cout<<endl; + + infile.close(); + infile.clear(); + + gsl_matrix_free(kin_tmp); + gsl_vector_free(ns_tmp); + + return true; +} + + + + +//read var file, store mapRS2wsnp +bool ReadFile_wsnp (const string &file_wsnp, map<string, double> &mapRS2weight) +{ + mapRS2weight.clear(); + + igzstream infile (file_wsnp.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open snp weight file: "<<file_wsnp<<endl; return false;} char *ch_ptr; string line, rs; - double var; + double weight; while (!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); rs=ch_ptr; ch_ptr=strtok (NULL, " , \t"); - var=atof(ch_ptr); - mapRS2var[rs]=var; + weight=atof(ch_ptr); + mapRS2weight[rs]=weight; } return true; } +bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vector<double> > &mapRS2wvector) +{ + mapRS2wvector.clear(); + + igzstream infile (file_wcat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open snp weight file: "<<file_wcat<<endl; return false;} + + char *ch_ptr; + vector<double> weight; + for (size_t i=0; i<n_vc; i++) { + weight.push_back(0.0); + } + + string line, rs, chr, a1, a0, pos, cm; + //double af=0, var_x=0; + //size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0; + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + while (!safeGetline(infile, line).eof()) { + if (isBlankLine(line)) {continue;} + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + //n_total=0; n_mis=0; n_obs=0; n_case=0; n_control=0; n_case=0; af=0; var_x=0; + size_t t=0; + for (size_t i=0; i<header.coln; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} + else if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr; } + else if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr; } + else if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr; } + else if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr; } + else if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr; } + //else if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr); } + //else if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr); } + //else if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr); } + //else if (header.ncase_col!=0 && header.ncase_col==i+1) {n_case=atoi(ch_ptr); } + //else if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) {n_control=atoi(ch_ptr); } + //else if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr); } + //else if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr); } + else { + weight[t]=atof(ch_ptr); t++; + if (t>n_vc) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;} + } + + ch_ptr=strtok (NULL, " , \t"); + } + + if (t!=n_vc) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;} + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + mapRS2wvector[rs]=weight; + } + + return true; +} + + + + + + + -//read beta file, use the mapRS2var to select snps (and to provide var if maf/var is not provided in the beta file), calculate q -void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test) +//read the beta file, save snp z scores in to z2_score, and save category into indicator_snp based on mapRS2var and set, and indicator_snp record the category number (from 1 to n_vc), and provide var if maf/var is not provided in the beta file +//notice that indicator_snp contains ns_test snps, instead of ns_total snps +//read the beta file for the second time, compute q, and Vq based on block jacknife +//use the mapRS2var to select snps (and to ), calculate q +//do a block-wise jacknife, and compute Vq +void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2wA, vector<size_t> &vec_cat, vector<size_t> &vec_ni, vector<double> &vec_weight, vector<double> &vec_z2, size_t &ni_total, size_t &ns_total, size_t &ns_test) { - gsl_vector_set_zero(q); + vec_cat.clear(); vec_ni.clear(); vec_weight.clear(); vec_z2.clear(); ni_total=0; ns_total=0; ns_test=0; igzstream infile (file_beta.c_str(), igzstream::in); @@ -2887,13 +3277,7 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, string rs, chr, a1, a0, pos, cm; double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, zsquare=0, af=0, var_x=0; - size_t n_total=0, n_mis=0, n_obs=0; - - vector<double> vec_q, vec_s; - for (size_t i=0; i<q->size; i++) { - vec_q.push_back(0.0); - vec_s.push_back(0.0); - } + size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0; //read header HEADER header; @@ -2901,7 +3285,7 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, ReadHeader (line, header); if (header.n_col==0 ) { - if (header.nobs_col==0 && header.nmis_col==0) { + if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) { cout<<"error! missing sample size in the beta file."<<endl; } else { cout<<"total sample size will be replaced by obs/mis sample size."<<endl; @@ -2911,16 +3295,17 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0) && header.chisq_col==0 && header.p_col==0) { cout<<"error! missing z scores in the beta file."<<endl; } - - if (header.af_col==0 && header.var_col==0 && mapRS2var.size()==0) { + /* + if (header.af_col==0 && header.var_col==0) { cout<<"error! missing allele frequency in the beta file."<<endl; } - + */ while (!safeGetline(infile, line).eof()) { + if (isBlankLine(line)) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); z=0; beta=0; se_beta=0; chisq=0; pvalue=0; - n_total=0; n_mis=0; n_obs=0; af=0; var_x=0; + n_total=0; n_mis=0; n_obs=0; n_case=0; n_control=0; af=0; var_x=0; for (size_t i=0; i<header.coln; i++) { if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} @@ -2938,6 +3323,8 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} + if (header.ncase_col!=0 && header.ncase_col==i+1) {n_case=atoi(ch_ptr);} + if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) {n_control=atoi(ch_ptr);} if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} @@ -2950,7 +3337,11 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, } if (header.n_col==0) { - n_total=n_mis+n_obs; + if (header.nmis_col!=0 && header.nobs_col!=0) { + n_total=n_mis+n_obs; + } else { + n_total=n_case+n_control; + } } //both z values and beta/se_beta have directions, while chisq/pvalue do not @@ -2965,29 +3356,25 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, zsquare=gsl_cdf_chisq_Qinv (pvalue, 1); } else {zsquare=0;} + //obtain var_x + if (header.var_col==0 && header.af_col!=0) { + var_x=2.0*af*(1.0-af); + } + //if the snp is also present in cor file, then do calculations - if (mapRS2var.count(rs)!=0 && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) { - //obtain var_x - if (k_mode==1) { - if (header.var_col==0) { - if (header.af_col!=0) { - var_x=2.0*af*(1.0-af); - } else { - var_x=mapRS2var.at(rs); - } - } + if ( (mapRS2wA.size()==0 || mapRS2wA.count(rs)!=0) && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) && zsquare!=0) { + if (mapRS2cat.size()!=0) { + vec_cat.push_back(mapRS2cat.at(rs)); } else { - var_x=1.0; + vec_cat.push_back(0); } - - //compute q - if (mapRS2cat.size()!=0) { - vec_q[mapRS2cat.at(rs) ]+=(zsquare-1.0)*var_x/(double)n_total; - vec_s[mapRS2cat.at(rs) ]+=var_x; + vec_ni.push_back(n_total); + if (mapRS2wA.size()==0) { + vec_weight.push_back(1); } else { - vec_q[0]+=(zsquare-1.0)*var_x/(double)n_total; - vec_s[0]+=var_x; + vec_weight.push_back(mapRS2wA.at(rs)); } + vec_z2.push_back(zsquare); ni_total=max(ni_total, n_total); ns_test++; @@ -2996,14 +3383,6 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, ns_total++; } - //save q - for (size_t i=0; i<q->size; i++) { - if (vec_s[i]!=0) { - gsl_vector_set(q, i, vec_q[i]/vec_s[i]); - } - gsl_vector_set(s, i, vec_s[i]); - } - infile.clear(); infile.close(); @@ -3013,34 +3392,108 @@ void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, -//read S file: S and Svar -void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar) + + +void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA, map<string, string> &mapRS2A1, map<string, double> &mapRS2z) { - igzstream infile (file_s.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open s file: "<<file_s<<endl; return;} + mapRS2A1.clear(); mapRS2z.clear(); + + igzstream infile (file_beta.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open beta file: "<<file_beta<<endl; return;} string line; char *ch_ptr; - double d; + string type; - for (size_t i=0; i<S->size1; i++) { - !safeGetline(infile, line).eof(); - ch_ptr=strtok ((char *)line.c_str(), " , \t"); - for (size_t j=0; j<S->size2; j++) { - d=gsl_matrix_get(S, i, j)+atof(ch_ptr); - gsl_matrix_set(S, i, j, d); - ch_ptr=strtok (NULL, " , \t"); + string rs, chr, a1, a0, pos, cm; + double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, af=0, var_x=0; + size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0; + size_t ni_total=0, ns_total=0, ns_test=0; + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + if (header.n_col==0 ) { + if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) { + cout<<"error! missing sample size in the beta file."<<endl; + } else { + cout<<"total sample size will be replaced by obs/mis sample size."<<endl; } } - for (size_t i=0; i<Svar->size1; i++) { - !safeGetline(infile, line).eof(); + if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0)) { + cout<<"error! missing z scores in the beta file."<<endl; + } + /* + if (header.af_col==0 && header.var_col==0) { + cout<<"error! missing allele frequency in the beta file."<<endl; + } + */ + while (!safeGetline(infile, line).eof()) { + if (isBlankLine(line)) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); - for (size_t j=0; j<Svar->size2; j++) { - d=gsl_matrix_get(Svar, i, j)+atof(ch_ptr); - gsl_matrix_set(Svar, i, j, d); + + z=0; beta=0; se_beta=0; chisq=0; pvalue=0; + n_total=0; n_mis=0; n_obs=0; n_case=0; n_control=0; af=0; var_x=0; + for (size_t i=0; i<header.coln; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} + if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} + if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr;} + if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr;} + if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;} + if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;} + + if (header.z_col!=0 && header.z_col==i+1) {z=atof(ch_ptr);} + if (header.beta_col!=0 && header.beta_col==i+1) {beta=atof(ch_ptr);} + if (header.sebeta_col!=0 && header.sebeta_col==i+1) {se_beta=atof(ch_ptr);} + if (header.chisq_col!=0 && header.chisq_col==i+1) {chisq=atof(ch_ptr);} + if (header.p_col!=0 && header.p_col==i+1) {pvalue=atof(ch_ptr);} + + if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} + if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} + if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} + if (header.ncase_col!=0 && header.ncase_col==i+1) {n_case=atoi(ch_ptr);} + if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) {n_control=atoi(ch_ptr);} + + if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} + if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} + ch_ptr=strtok (NULL, " , \t"); } + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + if (header.n_col==0) { + if (header.nmis_col!=0 && header.nobs_col!=0) { + n_total=n_mis+n_obs; + } else { + n_total=n_case+n_control; + } + } + + //both z values and beta/se_beta have directions, while chisq/pvalue do not + if (header.z_col!=0) { + z=z; + } else if (header.beta_col!=0 && header.sebeta_col!=0) { + z=beta/se_beta; + } else { + z=0; + } + + //if the snp is also present in cor file, then do calculations + if ( (mapRS2wA.size()==0 || mapRS2wA.count(rs)!=0) ) { + mapRS2z[rs]=z; + mapRS2A1[rs]=a1; + + ni_total=max(ni_total, n_total); + ns_test++; + } + + ns_total++; } infile.clear(); @@ -3052,22 +3505,135 @@ void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar) -void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar) +void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<size_t> &vec_ni, const vector<double> &vec_weight, const vector<double> &vec_z2, gsl_matrix *Vq, gsl_vector *q, gsl_vector *s) { - gsl_matrix_set_zero(S); - gsl_matrix_set_zero(Svar); + gsl_matrix_set_zero (Vq); + gsl_vector_set_zero (q); + gsl_vector_set_zero (s); - string file_name; + size_t cat, n_total; + double w, zsquare; - igzstream infile (file_ms.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open ms file: "<<file_ms<<endl; return;} + vector<double> vec_q, vec_s, n_snps; + for (size_t i=0; i<q->size; i++) { + vec_q.push_back(0.0); + vec_s.push_back(0.0); + n_snps.push_back(0.0); + } - while (!safeGetline(infile, file_name).eof()) { - ReadFile_s(file_name, S, Svar); + vector<vector<double> > mat_q, mat_s; + for (size_t i=0; i<n_block; i++) { + mat_q.push_back(vec_q); + mat_s.push_back(vec_s); } - infile.clear(); - infile.close(); + //compute q and s + for (size_t i=0; i<vec_cat.size(); i++) { + //extract quantities + cat=vec_cat[i]; + n_total=vec_ni[i]; + w=vec_weight[i]; + zsquare=vec_z2[i]; + + //compute q and s + vec_q[cat]+=(zsquare-1.0)*w/(double)n_total; + vec_s[cat]+=w; + n_snps[cat]++; + } + + //update q; vec_q is used again for computing Vq below + for (size_t i=0; i<q->size; i++) { + if (vec_s[i]!=0) { + gsl_vector_set(q, i, vec_q[i]/vec_s[i]); + } + gsl_vector_set(s, i, vec_s[i]); + } + + //compute Vq; divide SNPs in each category into evenly distributed blocks + size_t t=0, b=0, n_snp=0; + double d, m, n; + for (size_t l=0; l<q->size; l++) { + n_snp=floor(n_snps[l]/n_block); t=0; b=0; + if (n_snp==0) {continue;} + + //initiate everything to zero + for (size_t i=0; i<n_block; i++) { + for (size_t j=0; j<q->size; j++) { + mat_q[i][j]=0; + mat_s[i][j]=0; + } + } + + //record values + for (size_t i=0; i<vec_cat.size(); i++) { + //extract quantities + cat=vec_cat[i]; + n_total=vec_ni[i]; + w=vec_weight[i]; + zsquare=vec_z2[i]; + + //save quantities for computing Vq (which is not divided by n_total) + mat_q[b][cat]+=(zsquare-1.0)*w; + mat_s[b][cat]+=w; + + if (cat==l) { + if (b<n_block-1) { + if (t<n_snp-1) {t++;} else {b++; t=0;} + } else { + t++; + } + } + } + + //center mat_q + for (size_t i=0; i<q->size; i++) { + m=0; n=0; + for (size_t k=0; k<n_block; k++) { + if (mat_s[k][i]!=0 && vec_s[i]!=mat_s[k][i]) { + d=(vec_q[i]-mat_q[k][i])/(vec_s[i]-mat_s[k][i]); + mat_q[k][i]=d; + m+=d; + n++; + } + } + if (n!=0) {m/=n;} + + for (size_t k=0; k<n_block; k++) { + if (mat_q[k][i]!=0) { + mat_q[k][i]-=m; + } + } + } + + //compute Vq for l'th row and l'th column only + for (size_t i=0; i<q->size; i++) { + d=0; n=0; + for (size_t k=0; k<n_block; k++) { + if (mat_q[k][l]!=0 && mat_q[k][i]!=0) { + d+=mat_q[k][l]*mat_q[k][i]; + n++; + } + } + if (n!=0) { + d/=n; + d*=n-1; + } + d+=gsl_matrix_get(Vq, i, l); + gsl_matrix_set(Vq, i, l, d); + if (i!=l) {gsl_matrix_set(Vq, l, i, d);} + } + + } + + //divide the off diagonal elements of Vq by 2 + for (size_t i=0; i<q->size; i++) { + for (size_t j=i; j<q->size; j++) { + if (i==j) {continue;} + d=gsl_matrix_get(Vq, i, j); + gsl_matrix_set(Vq, i, j, d/2); + gsl_matrix_set(Vq, j, i, d/2); + } + } return; } @@ -3075,24 +3641,19 @@ void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar) -//read V file: V (i.e. Q) -void ReadFile_v (const string &file_v, gsl_matrix *V) +//read vector file +void ReadFile_vector (const string &file_vec, gsl_vector *vec) { - igzstream infile (file_v.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open v file: "<<file_v<<endl; return;} + igzstream infile (file_vec.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open vector file: "<<file_vec<<endl; return;} string line; char *ch_ptr; - double d; - for (size_t i=0; i<V->size1; i++) { + for (size_t i=0; i<vec->size; i++) { !safeGetline(infile, line).eof(); ch_ptr=strtok ((char *)line.c_str(), " , \t"); - for (size_t j=0; j<V->size2; j++) { - d=gsl_matrix_get(V, i, j)+atof(ch_ptr); - gsl_matrix_set(V, i, j, d); - ch_ptr=strtok (NULL, " , \t"); - } + gsl_vector_set(vec, i, atof(ch_ptr)); } infile.clear(); @@ -3102,17 +3663,21 @@ void ReadFile_v (const string &file_v, gsl_matrix *V) } -void ReadFile_mv (const string &file_mv, gsl_matrix *V) +void ReadFile_matrix (const string &file_mat, gsl_matrix *mat) { - gsl_matrix_set_zero(V); - - string file_name; + igzstream infile (file_mat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open matrix file: "<<file_mat<<endl; return;} - igzstream infile (file_mv.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open ms file: "<<file_mv<<endl; return;} + string line; + char *ch_ptr; - while (!safeGetline(infile, file_name).eof()) { - ReadFile_v(file_name, V); + for (size_t i=0; i<mat->size1; i++) { + !safeGetline(infile, line).eof(); + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t j=0; j<mat->size2; j++) { + gsl_matrix_set(mat, i, j, atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } } infile.clear(); @@ -3121,35 +3686,32 @@ void ReadFile_mv (const string &file_mv, gsl_matrix *V) return; } - -//read q file: q, s and ni_test -void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df) +void ReadFile_matrix (const string &file_mat, gsl_matrix *mat1, gsl_matrix *mat2) { - igzstream infile (file_s.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open s file: "<<file_s<<endl; return;} + igzstream infile (file_mat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open matrix file: "<<file_mat<<endl; return;} string line; char *ch_ptr; - double d; - for (size_t i=0; i<q_vec->size; i++) { + for (size_t i=0; i<mat1->size1; i++) { !safeGetline(infile, line).eof(); ch_ptr=strtok ((char *)line.c_str(), " , \t"); - d=gsl_vector_get(q_vec, i)+atof(ch_ptr); - gsl_vector_set(q_vec, i, d); + for (size_t j=0; j<mat1->size2; j++) { + gsl_matrix_set(mat1, i, j, atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } } - for (size_t i=0; i<s_vec->size; i++) { + for (size_t i=0; i<mat2->size1; i++) { !safeGetline(infile, line).eof(); ch_ptr=strtok ((char *)line.c_str(), " , \t"); - d=gsl_vector_get(s_vec, i)+atof(ch_ptr); - gsl_vector_set(s_vec, i, d); + for (size_t j=0; j<mat2->size2; j++) { + gsl_matrix_set(mat2, i, j, atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } } - !safeGetline(infile, line).eof(); - ch_ptr=strtok ((char *)line.c_str(), " , \t"); - df=atof(ch_ptr); - infile.clear(); infile.close(); @@ -3158,22 +3720,274 @@ void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, dou -void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df) +//read study file +void ReadFile_study (const string &file_study, gsl_matrix *Vq_mat, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) { + string Vqfile=file_study+".Vq.txt"; + string sfile=file_study+".size.txt"; + string qfile=file_study+".q.txt"; + + gsl_vector *s=gsl_vector_alloc (s_vec->size+1); + + ReadFile_matrix(Vqfile, Vq_mat); + ReadFile_vector(sfile, s); + ReadFile_vector(qfile, q_vec); + + double d; + for (size_t i=0; i<s_vec->size; i++) { + d=gsl_vector_get (s, i); + gsl_vector_set (s_vec, i, d); + } + ni=gsl_vector_get (s, s_vec->size); + + gsl_vector_free(s); + + return; +} + + +//read reference file +void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) +{ + string sfile=file_ref+".size.txt"; + string Sfile=file_ref+".S.txt"; + //string Vfile=file_ref+".V.txt"; + + gsl_vector *s=gsl_vector_alloc (s_vec->size+1); + + ReadFile_vector(sfile, s); + ReadFile_matrix(Sfile, S_mat, Svar_mat); + //ReadFile_matrix(Vfile, V_mat); + + double d; + for (size_t i=0; i<s_vec->size; i++) { + d=gsl_vector_get (s, i); + gsl_vector_set (s_vec, i, d); + } + ni=gsl_vector_get (s, s_vec->size); + + gsl_vector_free(s); + + return; +} + + +//read mstudy file +void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) +{ + gsl_matrix_set_zero(Vq_mat); gsl_vector_set_zero(q_vec); gsl_vector_set_zero(s_vec); + ni=0; + + gsl_matrix *Vq_sub=gsl_matrix_alloc(Vq_mat->size1, Vq_mat->size2); + gsl_vector *q_sub=gsl_vector_alloc(q_vec->size); + gsl_vector *s=gsl_vector_alloc (s_vec->size+1); + + igzstream infile (file_mstudy.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mstudy file: "<<file_mstudy<<endl; return;} string file_name; + double d1, d2, d; + + while (!safeGetline(infile, file_name).eof()) { + string Vqfile=file_name+".Vq.txt"; + string sfile=file_name+".size.txt"; + string qfile=file_name+".q.txt"; + + ReadFile_matrix(Vqfile, Vq_sub); + ReadFile_vector(sfile, s); + ReadFile_vector(qfile, q_sub); + + ni=max(ni, (size_t)gsl_vector_get (s, s_vec->size)); + + for (size_t i=0; i<s_vec->size; i++) { + d1=gsl_vector_get (s, i); + if (d1==0) {continue;} + + d=gsl_vector_get(q_vec, i)+gsl_vector_get(q_sub, i)*d1; + gsl_vector_set(q_vec, i, d); + + d=gsl_vector_get(s_vec, i)+d1; + gsl_vector_set(s_vec, i, d); + + for (size_t j=i; j<s_vec->size; j++) { + d2=gsl_vector_get (s, j); + if (d2==0) {continue;} + + d=gsl_matrix_get(Vq_mat, i, j)+gsl_matrix_get(Vq_sub, i, j)*d1*d2; + gsl_matrix_set(Vq_mat, i, j, d); + if (i!=j) {gsl_matrix_set(Vq_mat, j, i, d);} + } + } + } - igzstream infile (file_mq.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open mq file: "<<file_mq<<endl; return;} + for (size_t i=0; i<s_vec->size; i++) { + d1=gsl_vector_get (s_vec, i); + if (d1==0) {continue;} + + d=gsl_vector_get (q_vec, i); + gsl_vector_set (q_vec, i, d/d1); + + for (size_t j=i; j<s_vec->size; j++) { + d2=gsl_vector_get (s_vec, j); + if (d2==0) {continue;} + + d=gsl_matrix_get (Vq_mat, i, j)/(d1*d2); + gsl_matrix_set (Vq_mat, i, j, d); + if (i!=j) {gsl_matrix_set(Vq_mat, j, i, d);} + } + } + + gsl_matrix_free(Vq_sub); + gsl_vector_free(q_sub); + gsl_vector_free(s); + + return; +} + + +//copied from lmm.cpp; is used in the following function compKtoV +//map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 +size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { + if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<<endl; return 0;} + size_t index; + size_t l, h; + if (b>a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + +//read reference file +void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) +{ + gsl_matrix_set_zero(S_mat); + gsl_matrix_set_zero(Svar_mat); + // gsl_matrix_set_zero(V_mat); + gsl_vector_set_zero(s_vec); + ni=0; + + //size_t n_vc=S_mat->size1; + gsl_matrix *S_sub=gsl_matrix_alloc (S_mat->size1, S_mat->size2); + gsl_matrix *Svar_sub=gsl_matrix_alloc (Svar_mat->size1, Svar_mat->size2); + //gsl_matrix *V_sub=gsl_matrix_alloc (V_mat->size1, V_mat->size2); + gsl_vector *s=gsl_vector_alloc (s_vec->size+1); + + igzstream infile (file_mref.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mref file: "<<file_mref<<endl; return;} + + string file_name; + double d1, d2, d; + //size_t t_ij; while (!safeGetline(infile, file_name).eof()) { - ReadFile_q(file_name, q_vec, s_vec, df); + string sfile=file_name+".size.txt"; + string Sfile=file_name+".S.txt"; + //string Vfile=file_name+".V.txt"; + + ReadFile_vector(sfile, s); + ReadFile_matrix(Sfile, S_sub, Svar_sub); + //ReadFile_matrix(Vfile, V_sub); + + //update s_vec and ni + for (size_t i=0; i<s_vec->size; i++) { + d=gsl_vector_get (s, i)+gsl_vector_get (s_vec, i); + gsl_vector_set (s_vec, i, d); + } + ni=max(ni, (size_t)gsl_vector_get (s, s_vec->size)); + + //update S and Svar from each file + for (size_t i=0; i<S_mat->size1; i++) { + d1=gsl_vector_get(s, i); + for (size_t j=0; j<S_mat->size2; j++) { + d2=gsl_vector_get(s, j); + + d=gsl_matrix_get(S_sub, i, j)*d1*d2; + gsl_matrix_set(S_sub, i, j, d); + d=gsl_matrix_get(Svar_sub, i, j)*d1*d2*d1*d2; + gsl_matrix_set(Svar_sub, i, j, d); + } + } + + gsl_matrix_add (S_mat, S_sub); + gsl_matrix_add (Svar_mat, Svar_sub); + /* + //update V from each file + for (size_t i=0; i<n_vc; i++) { + d1=gsl_vector_get(s, i); + for (size_t j=i; j<n_vc; j++) { + d2=gsl_vector_get(s, j); + t_ij=GetabIndex (i+1, j+1, n_vc-2); + for (size_t l=0; l<n_vc+1; l++) { + if (l==n_vc) {d3=1;} else {d3=gsl_vector_get(s, l);} + for (size_t m=0; m<n_vc+1; m++) { + if (m==n_vc) {d4=1;} else {d4=gsl_vector_get(s, m);} + + d=gsl_matrix_get (V_sub, l, t_ij*(n_vc+1)+m)*d1*d2*d3*d4; + gsl_matrix_set (V_sub, l, t_ij*(n_vc+1)+m, d); + } + } + } + } + + gsl_matrix_add (V_mat, V_sub); + */ } - infile.clear(); - infile.close(); + //final: update S and Svar + for (size_t i=0; i<S_mat->size1; i++) { + d1=gsl_vector_get(s_vec, i); + if (d1==0) {continue;} + for (size_t j=i; j<S_mat->size2; j++) { + d2=gsl_vector_get(s_vec, j); + if (d2==0) {continue;} + + d=gsl_matrix_get(S_mat, i, j)/(d1*d2); + gsl_matrix_set(S_mat, i, j, d); + if (i!=j) {gsl_matrix_set(S_mat, j, i, d);} + + d=gsl_matrix_get(Svar_mat, i, j)/(d1*d2*d1*d2); + gsl_matrix_set(Svar_mat, i, j, d); + if (i!=j) {gsl_matrix_set(Svar_mat, j, i, d);} + } + } + /* + //final: update V + for (size_t i=0; i<n_vc; i++) { + d1=gsl_vector_get(s_vec, i); + if (d1==0) {continue;} + for (size_t j=i; j<n_vc; j++) { + d2=gsl_vector_get(s_vec, j); + if (d2==0) {continue;} + t_ij=GetabIndex (i+1, j+1, n_vc-2); + for (size_t l=0; l<n_vc+1; l++) { + if (l==n_vc) {d3=1;} else {d3=gsl_vector_get(s_vec, l);} + if (d3==0) {continue;} + for (size_t m=0; m<n_vc+1; m++) { + if (m==n_vc) {d4=1;} else {d4=gsl_vector_get(s_vec, m);} + if (d4==0) {continue;} + + d=gsl_matrix_get (V_mat, l, t_ij*(n_vc+1)+m)/(d1*d2*d3*d4); + gsl_matrix_set (V_mat, l, t_ij*(n_vc+1)+m, d); + } + } + } + } + */ + //free matrices + gsl_matrix_free(S_sub); + gsl_matrix_free(Svar_sub); + //gsl_matrix_free(V_sub); + gsl_vector_free(s); return; } + + + + + + @@ -44,6 +44,7 @@ void ProgressBar (string str, double p, double total, double ratio); std::istream& safeGetline(std::istream& is, std::string& t); bool ReadFile_snps (const string &file_snps, set<string> &setSnps); +bool ReadFile_snps_header (const string &file_snps, set<string> &setSnps); bool ReadFile_log (const string &file_log, double &pheno_mean); bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo); @@ -80,20 +81,23 @@ bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SN bool ReadHeader (const string &line, HEADER &header); bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_t &n_vc); +bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, size_t &n_vc); -bool BimbamKin (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin); -bool PlinkKin (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, const int k_mode, const int display_pace, const map<string, size_t> &mapRS2cat, map<string, double> &mapRS2var, vector<SNPINFO> &snpInfo, gsl_matrix *matrix_kin); +bool BimbamKin (const string &file_geno, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); +bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); +bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int display_pace, const vector<int> &indicator_idv, const vector<vector<int> > &mindicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<vector<SNPINFO> > &msnpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); -bool ReadFile_var (const string &file_var, map<string, double> &mapRS2var); -void ReadFile_beta (const string &file_beta, const int k_mode, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2var, gsl_vector *q, gsl_vector *s, size_t &ni_total, size_t &ns_total, size_t &ns_test); +bool ReadFile_wsnp (const string &file_wsnp, map<string, double> &mapRS2double); +bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vector<double> > &mapRS2vector); +void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2wA, vector<size_t> &vec_cat, vector<size_t> &vec_ni, vector<double> &vec_weight, vector<double> &vec_z2, size_t &ni_total, size_t &ns_total, size_t &ns_test); +void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA, map<string, string> &mapRS2A1, map<string, double> &mapRS2z); +void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<size_t> &vec_ni, const vector<double> &vec_weight, const vector<double> &vec_z2, gsl_matrix *Vq, gsl_vector *q, gsl_vector *s); -void ReadFile_s (const string &file_s, gsl_matrix *S, gsl_matrix *Svar); -void ReadFile_ms (const string &file_ms, gsl_matrix *S, gsl_matrix *Svar); -void ReadFile_v (const string &file_v, gsl_matrix *V); -void ReadFile_mv (const string &file_mq, gsl_matrix *V); -void ReadFile_q (const string &file_s, gsl_vector *q_vec, gsl_vector *s_vec, double &df); -void ReadFile_mq (const string &file_mq, gsl_vector *q_vec, gsl_vector *s_vec, double &df); +void ReadFile_study (const string &file_study, gsl_matrix *Vq, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni); +void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); +void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni); +void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); // WJA added bool bgenKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); @@ -41,6 +41,7 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_integration.h" +#include "eigenlib.h" #include "gzstream.h" #include "lapack.h" @@ -519,9 +520,9 @@ void LM::Analyzebgen (const gsl_matrix *W, const gsl_vector *y) for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } @@ -626,9 +627,9 @@ void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y) for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } //calculate statistics @@ -712,7 +713,6 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) b=ch[0]; } - for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} if (indicator_snp[t]==0) {continue;} @@ -747,9 +747,9 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) for (size_t i=0; i<ni_test; ++i) { geno=gsl_vector_get(x,i); if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } //calculate statistics @@ -759,11 +759,11 @@ void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score); - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //store summary data SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score}; sumStat.push_back(SNPs); + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); } cout<<endl; diff --git a/src/lmm.cpp b/src/lmm.cpp index 7bcf89a..af6ff8a 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -42,6 +42,7 @@ #include "gsl/gsl_integration.h" #include "io.h" +#include "eigenlib.h" #include "lapack.h" #include "gzstream.h" @@ -1228,6 +1229,12 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); gsl_vector *ab=gsl_vector_alloc (n_index); + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { @@ -1236,6 +1243,7 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ // } //start reading genotypes and analyze + size_t c=0; for (size_t t=0; t<indicator_snp.size(); ++t) { // if (t>1) {break;} !safeGetline(infile, line).eof(); @@ -1268,48 +1276,72 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + // gsl_vector_set(x, i, 2-geno); + //} } - + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - CalcUab(UtW, Uty, Utx, Uab); -// if (e_mode!=0) { -// Calcab (W, y, x, ab); -// } + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + c++; - time_start=clock(); - FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + if (c%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (c%msize==0) {l=msize;} else {l=c%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - CalcRLScore (l_mle_null, param1, beta, se, p_score); - } + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); - CalcRLWald (lambda_remle, param1, beta, se, p_wald); - } + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - if (a_mode==2 || a_mode==4) { - CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } + gsl_matrix_set_zero (Xlarge); - if (x_mean>1) {beta*=-1;} + for (size_t i=0; i<l; i++) { + gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i); + gsl_vector_memcpy (Utx, &UtXlarge_col.vector); - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + CalcUab(UtW, Uty, Utx, Uab); + // if (e_mode!=0) { + // Calcab (W, y, x, ab); + // } - //store summary data - SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - sumStat.push_back(SNPs); - } + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + //if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + } + } cout<<endl; gsl_vector_free (x); @@ -1318,6 +1350,9 @@ void LMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gsl_ gsl_matrix_free (Uab); gsl_vector_free (ab); + gsl_matrix_free (Xlarge); + gsl_matrix_free (UtXlarge); + infile.close(); infile.clear(); @@ -1354,6 +1389,12 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); gsl_vector *ab=gsl_vector_alloc (n_index); + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { @@ -1371,7 +1412,7 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m b=ch[0]; } - + size_t c=0; for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} if (indicator_snp[t]==0) {continue;} @@ -1406,46 +1447,71 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m for (size_t i=0; i<ni_test; ++i) { geno=gsl_vector_get(x,i); if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - CalcUab(UtW, Uty, Utx, Uab); -// if (e_mode!=0) { -// Calcab (W, y, x, ab); -// } + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + c++; - time_start=clock(); - FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + if (c%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (c%msize==0) {l=msize;} else {l=c%msize;} - //3 is before 1, for beta - if (a_mode==3 || a_mode==4) { - CalcRLScore (l_mle_null, param1, beta, se, p_score); - } + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); - CalcRLWald (lambda_remle, param1, beta, se, p_wald); - } + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - if (a_mode==2 || a_mode==4) { - CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } + gsl_matrix_set_zero (Xlarge); - if (x_mean>1) {beta*=-1;} + for (size_t i=0; i<l; i++) { + gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i); + gsl_vector_memcpy (Utx, &UtXlarge_col.vector); - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + CalcUab(UtW, Uty, Utx, Uab); + // if (e_mode!=0) { + // Calcab (W, y, x, ab); + // } - //store summary data - SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - sumStat.push_back(SNPs); + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + + //3 is before 1, for beta + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + //if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + } } cout<<endl; @@ -1454,6 +1520,9 @@ void LMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl_m gsl_matrix_free (Uab); gsl_vector_free (ab); + gsl_matrix_free(Xlarge); + gsl_matrix_free(UtXlarge); + infile.close(); infile.clear(); @@ -1487,6 +1556,12 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma gsl_matrix *Uab=gsl_matrix_alloc (U->size2, n_index); gsl_vector *ab=gsl_vector_alloc (n_index); + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero (Uab); CalcUab (UtW, Uty, Uab); // if (e_mode!=0) { @@ -1537,6 +1612,7 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma //start reading genotypes and analyze + size_t c=0; for (size_t t=0; t<indicator_snp.size(); ++t) { @@ -1645,47 +1721,71 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } - + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, Utx); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - CalcUab(UtW, Uty, Utx, Uab); -// if (e_mode!=0) { -// Calcab (W, y, x, ab); -// } + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, c%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + c++; - time_start=clock(); - FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + if (c%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (c%msize==0) {l=msize;} else {l=c%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - CalcRLScore (l_mle_null, param1, beta, se, p_score); - } + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==1 || a_mode==4) { - CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); - CalcRLWald (lambda_remle, param1, beta, se, p_wald); - } + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - if (a_mode==2 || a_mode==4) { - CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); - } + gsl_matrix_set_zero (Xlarge); - if (x_mean>1) {beta*=-1;} + for (size_t i=0; i<l; i++) { + gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i); + gsl_vector_memcpy (Utx, &UtXlarge_col.vector); - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + CalcUab(UtW, Uty, Utx, Uab); + // if (e_mode!=0) { + // Calcab (W, y, x, ab); + // } - //store summary data - SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - sumStat.push_back(SNPs); + time_start=clock(); + FUNC_PARAM param1={false, ni_test, n_cvt, eval, Uab, ab, 0}; + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + CalcRLScore (l_mle_null, param1, beta, se, p_score); + } + + if (a_mode==1 || a_mode==4) { + CalcLambda ('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald (lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode==2 || a_mode==4) { + CalcLambda ('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_mle_H0), 1); + } + + //if (x_mean>1) {beta*=-1;} + + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //store summary data + SUMSTAT SNPs={beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); + } + } } cout<<endl; @@ -1695,6 +1795,9 @@ void LMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ma gsl_matrix_free (Uab); gsl_vector_free (ab); + gsl_matrix_free(Xlarge); + gsl_matrix_free(UtXlarge); + infile.close(); infile.clear(); diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp index e9560ad..915245b 100644 --- a/src/mathfunc.cpp +++ b/src/mathfunc.cpp @@ -40,6 +40,7 @@ #include "Eigen/Dense" #include "lapack.h" +#include "eigenlib.h" #ifdef FORCE_FLOAT #include "mathfunc_float.h" @@ -247,6 +248,7 @@ void StandardizeVector (gsl_vector *y) //calculate UtX void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX) { + /* gsl_vector *Utx_vec=gsl_vector_alloc (UtX->size1); for (size_t i=0; i<UtX->size2; ++i) { gsl_vector_view UtX_col=gsl_matrix_column (UtX, i); @@ -254,17 +256,28 @@ void CalcUtX (const gsl_matrix *U, gsl_matrix *UtX) gsl_vector_memcpy (&UtX_col.vector, Utx_vec); } gsl_vector_free (Utx_vec); + */ + + gsl_matrix *X=gsl_matrix_alloc (UtX->size1, UtX->size2); + gsl_matrix_memcpy (X, UtX); + eigenlib_dgemm ("T", "N", 1.0, U, X, 0.0, UtX); + gsl_matrix_free (X); + return; } void CalcUtX (const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX) { + /* for (size_t i=0; i<X->size2; ++i) { gsl_vector_const_view X_col=gsl_matrix_const_column (X, i); gsl_vector_view UtX_col=gsl_matrix_column (UtX, i); gsl_blas_dgemv (CblasTrans, 1.0, U, &X_col.vector, 0.0, &UtX_col.vector); } + */ + eigenlib_dgemm ("T", "N", 1.0, U, X, 0.0, UtX); + return; } @@ -329,7 +342,8 @@ double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab) het_probs[i] = 0.0; /* start at midpoint */ - int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); + //XZ modified to add (long int) + int mid = ((long int)rare_copies * (2 * (long int)genotypes - (long int)rare_copies)) / (2 * (long int)genotypes); /* check to ensure that midpoint and rare alleles have same parity */ if ((rare_copies & 1) ^ (mid & 1)) @@ -390,7 +404,7 @@ double CalcHWE (const size_t n_hom1, const size_t n_hom2, const size_t n_ab) p_hwe += het_probs[i]; } - p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; free(het_probs); diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index 5826a1f..7655b50 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -42,6 +42,7 @@ #include "io.h" #include "lapack.h" +#include "eigenlib.h" #include "gzstream.h" #ifdef FORCE_FLOAT @@ -2935,12 +2936,17 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ ifstream infile (file_bgen.c_str(), ios::binary); if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return;} - clock_t time_start=clock(); time_UtX=0; time_opt=0; string line; + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + // double lambda_mle=0, lambda_remle=0, beta=0, se=0, ; double logl_H0=0.0, logl_H1=0.0, p_wald=0, p_lrt=0, p_score=0; double crt_a, crt_b, crt_c; @@ -3179,6 +3185,7 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ //start reading genotypes and analyze + size_t csnp=0; for (size_t t=0; t<indicator_snp.size(); ++t) { @@ -3287,87 +3294,112 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + //gsl_vector_set(x, i, 2-geno); + //} } + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - //initial values - gsl_matrix_memcpy (V_g, V_g_null); - gsl_matrix_memcpy (V_e, V_e_null); - gsl_matrix_memcpy (B, B_null); + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + csnp++; - time_start=clock(); + if (csnp%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (csnp%msize==0) {l=msize;} else {l=csnp%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); - if (p_score<p_nr && crt==1) { - logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); - } - } + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==2 || a_mode==4) { - logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Xlarge); + + for (size_t i=0; i<l; i++) { + gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i); + gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + if (p_score<p_nr && crt==1) { + logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); + } + } + + if (a_mode==2 || a_mode==4) { + logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + + if (p_lrt<p_nr) { + logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); //calculate beta and Vbeta p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - if (p_lrt<p_nr) { - logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - //calculate beta and Vbeta - p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - - if (crt==1) { - p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); - } + if (crt==1) { + p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); } - } + } + } - if (a_mode==1 || a_mode==4) { - logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); - p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + if (a_mode==1 || a_mode==4) { + logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - if (p_wald<p_nr) { - logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + if (p_wald<p_nr) { + logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - if (crt==1) { - p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); - } + if (crt==1) { + p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); } - } + } + } - if (x_mean>1) {gsl_vector_scale(beta, -1.0);} + //if (x_mean>1) {gsl_vector_scale(beta, -1.0);} - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //store summary data - //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - for (size_t i=0; i<d_size; i++) { - v_beta[i]=gsl_vector_get (beta, i); - } + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i<d_size; i++) { + v_beta[i]=gsl_vector_get (beta, i); + } - c=0; - for (size_t i=0; i<d_size; i++) { - for (size_t j=i; j<d_size; j++) { - v_Vg[c]=gsl_matrix_get (V_g, i, j); - v_Ve[c]=gsl_matrix_get (V_e, i, j); - v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); - c++; - } - } + c=0; + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg[c]=gsl_matrix_get (V_g, i, j); + v_Ve[c]=gsl_matrix_get (V_e, i, j); + v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); + c++; + } + } - MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; - sumStat.push_back(SNPs); - } + MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; + sumStat.push_back(SNPs); + } + } + } cout<<endl; @@ -3404,6 +3436,9 @@ void MVLMM::Analyzebgen (const gsl_matrix *U, const gsl_vector *eval, const gsl_ gsl_matrix_free(B_null); gsl_matrix_free(se_B_null); + gsl_matrix_free(Xlarge); + gsl_matrix_free(UtXlarge); + return; } @@ -3430,6 +3465,12 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -3615,6 +3656,7 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs gsl_matrix_memcpy (B_null, B); //start reading genotypes and analyze + size_t csnp=0; for (size_t t=0; t<indicator_snp.size(); ++t) { //if (t>=1) {break;} !safeGetline(infile, line).eof(); @@ -3647,86 +3689,111 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs for (size_t i=0; i<ni_test; ++i) { if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);} geno=gsl_vector_get(x, i); - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + // gsl_vector_set(x, i, 2-geno); + //} } + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - //initial values - gsl_matrix_memcpy (V_g, V_g_null); - gsl_matrix_memcpy (V_e, V_e_null); - gsl_matrix_memcpy (B, B_null); + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + csnp++; - time_start=clock(); + if (csnp%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (csnp%msize==0) {l=msize;} else {l=csnp%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); - if (p_score<p_nr && crt==1) { - logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); - } - } + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (a_mode==2 || a_mode==4) { - logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Xlarge); + + for (size_t i=0; i<l; i++) { + gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i); + gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + if (p_score<p_nr && crt==1) { + logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); + } + } + + if (a_mode==2 || a_mode==4) { + logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + + if (p_lrt<p_nr) { + logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); //calculate beta and Vbeta p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - if (p_lrt<p_nr) { - logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - //calculate beta and Vbeta - p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - - if (crt==1) { - p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); - } + if (crt==1) { + p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); } - } + } + } - if (a_mode==1 || a_mode==4) { - logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); - p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + if (a_mode==1 || a_mode==4) { + logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - if (p_wald<p_nr) { - logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + if (p_wald<p_nr) { + logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - if (crt==1) { - p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); - } + if (crt==1) { + p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); } - } + } + } - if (x_mean>1) {gsl_vector_scale(beta, -1.0);} + //if (x_mean>1) {gsl_vector_scale(beta, -1.0);} - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //store summary data - //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - for (size_t i=0; i<d_size; i++) { - v_beta[i]=gsl_vector_get (beta, i); - } + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i<d_size; i++) { + v_beta[i]=gsl_vector_get (beta, i); + } - c=0; - for (size_t i=0; i<d_size; i++) { - for (size_t j=i; j<d_size; j++) { - v_Vg[c]=gsl_matrix_get (V_g, i, j); - v_Ve[c]=gsl_matrix_get (V_e, i, j); - v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); - c++; - } - } + c=0; + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg[c]=gsl_matrix_get (V_g, i, j); + v_Ve[c]=gsl_matrix_get (V_e, i, j); + v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); + c++; + } + } - MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; - sumStat.push_back(SNPs); + MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; + sumStat.push_back(SNPs); + } + } } cout<<endl; @@ -3764,6 +3831,9 @@ void MVLMM::AnalyzeBimbam (const gsl_matrix *U, const gsl_vector *eval, const gs gsl_matrix_free(B_null); gsl_matrix_free(se_B_null); + gsl_matrix_free(Xlarge); + gsl_matrix_free(UtXlarge); + return; } @@ -3795,6 +3865,12 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl size_t n_size=UtY->size1, d_size=UtY->size2, c_size=UtW->size2; size_t dc_size=d_size*(c_size+1), v_size=d_size*(d_size+1)/2; + //create a large matrix + size_t msize=10000; + gsl_matrix *Xlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix *UtXlarge=gsl_matrix_alloc (U->size1, msize); + gsl_matrix_set_zero(Xlarge); + //large matrices for EM gsl_matrix *U_hat=gsl_matrix_alloc (d_size, n_size); gsl_matrix *E_hat=gsl_matrix_alloc (d_size, n_size); @@ -3992,6 +4068,7 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl b=ch[0]; } + size_t csnp=0; for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);} if (indicator_snp[t]==0) {continue;} @@ -4030,9 +4107,9 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl for (size_t i=0; i<ni_test; ++i) { geno=gsl_vector_get(x,i); if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;} - if (x_mean>1) { - gsl_vector_set(x, i, 2-geno); - } + //if (x_mean>1) { + // gsl_vector_set(x, i, 2-geno); + //} } /* @@ -4047,85 +4124,110 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl } */ + /* //calculate statistics time_start=clock(); gsl_blas_dgemv (CblasTrans, 1.0, U, x, 0.0, &X_row.vector); time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + */ - //initial values - gsl_matrix_memcpy (V_g, V_g_null); - gsl_matrix_memcpy (V_e, V_e_null); - gsl_matrix_memcpy (B, B_null); + gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, csnp%msize); + gsl_vector_memcpy (&Xlarge_col.vector, x); + csnp++; - time_start=clock(); + if (csnp%msize==0 || t==indicator_snp.size()-1 ) { + size_t l=0; + if (csnp%msize==0) {l=msize;} else {l=csnp%msize;} - //3 is before 1 - if (a_mode==3 || a_mode==4) { - p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + gsl_matrix_view Xlarge_sub=gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); + gsl_matrix_view UtXlarge_sub=gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - if (p_score<p_nr && crt==1) { - logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); - } - } + time_start=clock(); + eigenlib_dgemm ("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); + time_UtX+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + gsl_matrix_set_zero (Xlarge); + + for (size_t i=0; i<l; i++) { + gsl_vector_view UtXlarge_col=gsl_matrix_column (UtXlarge, i); + gsl_vector_memcpy (&X_row.vector, &UtXlarge_col.vector); + + //initial values + gsl_matrix_memcpy (V_g, V_g_null); + gsl_matrix_memcpy (V_e, V_e_null); + gsl_matrix_memcpy (B, B_null); + + time_start=clock(); + + //3 is before 1 + if (a_mode==3 || a_mode==4) { + p_score=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, V_e_null, UltVehiY, beta, Vbeta); + + if (p_score<p_nr && crt==1) { + logl_H1=MphNR ('R', 1, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_score=PCRT (3, d_size, p_score, crt_a, crt_b, crt_c); + } + } + + if (a_mode==2 || a_mode==4) { + logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + //calculate beta and Vbeta + p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); + + if (p_lrt<p_nr) { + logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - if (a_mode==2 || a_mode==4) { - logl_H1=MphEM ('L', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); //calculate beta and Vbeta p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - - if (p_lrt<p_nr) { - logl_H1=MphNR ('L', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - - //calculate beta and Vbeta - p_lrt=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - p_lrt=gsl_cdf_chisq_Q (2.0*(logl_H1-logl_H0), (double)d_size ); - if (crt==1) { - p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); - } + if (crt==1) { + p_lrt=PCRT (2, d_size, p_lrt, crt_a, crt_b, crt_c); } - } + } + } - if (a_mode==1 || a_mode==4) { - logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); - p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + if (a_mode==1 || a_mode==4) { + logl_H1=MphEM ('R', em_iter/10, em_prec*10, eval, X, Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - if (p_wald<p_nr) { - logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); + if (p_wald<p_nr) { + logl_H1=MphNR ('R', nr_iter/10, nr_prec*10, eval, X, Y, Hi_all, xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); + p_wald=MphCalcP (eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, UltVehiY, beta, Vbeta); - if (crt==1) { - p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); - } + if (crt==1) { + p_wald=PCRT (1, d_size, p_wald, crt_a, crt_b, crt_c); } - } + } + } - //cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl; + //cout<<setprecision(10)<<p_wald<<"\t"<<p_lrt<<"\t"<<p_score<<endl; - if (x_mean>1) {gsl_vector_scale(beta, -1.0);} + //if (x_mean>1) {gsl_vector_scale(beta, -1.0);} - time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - //store summary data - //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; - for (size_t i=0; i<d_size; i++) { - v_beta[i]=gsl_vector_get (beta, i); - } + //store summary data + //SUMSTAT SNPs={snpInfo[t].get_chr(), snpInfo[t].get_rs(), snpInfo[t].get_pos(), n_miss, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score}; + for (size_t i=0; i<d_size; i++) { + v_beta[i]=gsl_vector_get (beta, i); + } - c=0; - for (size_t i=0; i<d_size; i++) { - for (size_t j=i; j<d_size; j++) { - v_Vg[c]=gsl_matrix_get (V_g, i, j); - v_Ve[c]=gsl_matrix_get (V_e, i, j); - v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); - c++; - } - } + c=0; + for (size_t i=0; i<d_size; i++) { + for (size_t j=i; j<d_size; j++) { + v_Vg[c]=gsl_matrix_get (V_g, i, j); + v_Ve[c]=gsl_matrix_get (V_e, i, j); + v_Vbeta[c]=gsl_matrix_get (Vbeta, i, j); + c++; + } + } - MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; - sumStat.push_back(SNPs); - } + MPHSUMSTAT SNPs={v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; + sumStat.push_back(SNPs); + } + } + } cout<<endl; //cout<<"time_opt = "<<time_opt<<endl; @@ -4162,6 +4264,9 @@ void MVLMM::AnalyzePlink (const gsl_matrix *U, const gsl_vector *eval, const gsl gsl_matrix_free(B_null); gsl_matrix_free(se_B_null); + gsl_matrix_free(Xlarge); + gsl_matrix_free(UtXlarge); + return; } diff --git a/src/param.cpp b/src/param.cpp index 33b7b48..0a63a16 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -64,7 +64,7 @@ n_accept(0), n_mh(10), geo_mean(2000.0), randseed(-1), -window_cm(0), window_bp(0), window_ns(0), +window_cm(0), window_bp(0), window_ns(0), n_block(200), error(false), ni_subsample(0), n_cvt(1), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) @@ -77,19 +77,27 @@ void PARAM::ReadFiles (void) { string file_str; - - if (!file_cat.empty()) { + //read cat file + if (!file_mcat.empty()) { + if (ReadFile_mcat (file_mcat, mapRS2cat, n_vc)==false) {error=true;} + } else if (!file_cat.empty()) { if (ReadFile_cat (file_cat, mapRS2cat, n_vc)==false) {error=true;} } - if (!file_var.empty()) { - if (ReadFile_var (file_var, mapRS2var)==false) {error=true;} + //read snp weight files + if (!file_wcat.empty()) { + if (ReadFile_wsnp (file_wcat, n_vc, mapRS2wcat)==false) {error=true;} + } + if (!file_wsnp.empty()) { + if (ReadFile_wsnp (file_wsnp, mapRS2wsnp)==false) {error=true;} } + //count number of kinship files if (!file_mk.empty()) { if (CountFileLines (file_mk, n_vc)==false) {error=true;} } + //read snp set if (!file_snps.empty()) { if (ReadFile_snps (file_snps, setSnps)==false) {error=true;} } else { @@ -184,10 +192,17 @@ void PARAM::ReadFiles (void) //read genotype and phenotype file for plink format if (!file_bfile.empty()) { file_str=file_bfile+".bim"; + snpInfo.clear(); if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} - file_str=file_bfile+".fam"; - if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} + //if both fam file and pheno files are used, use phenotypes inside the pheno file + if (!file_pheno.empty()) { + //phenotype file before genotype file + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + } else { + file_str=file_bfile+".fam"; + if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} + } //post-process covariates and phenotypes, obtain ni_test, save all useful covariates ProcessCvtPhen(); @@ -228,6 +243,97 @@ void PARAM::ReadFiles (void) ns_total=indicator_snp.size(); } + + //read genotype file for multiple plink files + if (!file_mbfile.empty()) { + igzstream infile (file_mbfile.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mbfile file: "<<file_mbfile<<endl; return;} + + string file_name; + + size_t t=0, ns_test_tmp=0; + + gsl_matrix *W; + while (!safeGetline(infile, file_name).eof()) { + file_str=file_name+".bim"; + + if (ReadFile_bim (file_str, snpInfo)==false) {error=true;} + + if (t==0) { + //if both fam file and pheno files are used, use phenotypes inside the pheno file + if (!file_pheno.empty()) { + //phenotype file before genotype file + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + } else { + file_str=file_name+".fam"; + if (ReadFile_fam (file_str, indicator_pheno, pheno, mapID2num, p_column)==false) {error=true;} + } + + //post-process covariates and phenotypes, obtain ni_test, save all useful covariates + ProcessCvtPhen(); + + //obtain covariate matrix + W=gsl_matrix_alloc (ni_test, n_cvt); + CopyCvt (W); + } + + file_str=file_name+".bed"; + if (ReadFile_bed (file_str, setSnps, W, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test_tmp)==false) {error=true;} + mindicator_snp.push_back(indicator_snp); + msnpInfo.push_back(snpInfo); + ns_test+=ns_test_tmp; + ns_total+=indicator_snp.size(); + + t++; + } + + gsl_matrix_free(W); + + infile.close(); + infile.clear(); + } + + + + //read genotype and phenotype file for multiple bimbam files + if (!file_mgeno.empty()) { + //annotation file before genotype file + if (!file_anno.empty() ) { + if (ReadFile_anno (file_anno, mapRS2chr, mapRS2bp, mapRS2cM)==false) {error=true;} + } + + //phenotype file before genotype file + if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} + + //post-process covariates and phenotypes, obtain ni_test, save all useful covariates + ProcessCvtPhen(); + + //obtain covariate matrix + gsl_matrix *W=gsl_matrix_alloc (ni_test, n_cvt); + CopyCvt (W); + + igzstream infile (file_mgeno.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mgeno file: "<<file_mgeno<<endl; return;} + + string file_name; + size_t ns_test_tmp; + while (!safeGetline(infile, file_name).eof()) { + if (ReadFile_geno (file_name, setSnps, W, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp)==false) {error=true;} + + mindicator_snp.push_back(indicator_snp); + msnpInfo.push_back(snpInfo); + ns_test+=ns_test_tmp; + ns_total+=indicator_snp.size(); + } + + gsl_matrix_free(W); + + infile.close(); + infile.clear(); + } + + + if (!file_gene.empty()) { if (ReadFile_pheno (file_pheno, indicator_pheno, pheno, p_column)==false) {error=true;} @@ -292,7 +398,7 @@ void PARAM::CheckParam (void) //check parameters if (k_mode!=1 && k_mode!=2) {cout<<"error! unknown kinship/relatedness input mode: "<<k_mode<<endl; error=true;} - if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=14 && a_mode!=21 && a_mode!=22 && a_mode!=25 && a_mode!=26 && a_mode!=27 && a_mode!=28 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61 && a_mode!=62 && a_mode!=71) + if (a_mode!=1 && a_mode!=2 && a_mode!=3 && a_mode!=4 && a_mode!=5 && a_mode!=11 && a_mode!=12 && a_mode!=13 && a_mode!=14 && a_mode!=21 && a_mode!=22 && a_mode!=25 && a_mode!=26 && a_mode!=27 && a_mode!=28 && a_mode!=31 && a_mode!=41 && a_mode!=42 && a_mode!=43 && a_mode!=51 && a_mode!=52 && a_mode!=53 && a_mode!=54 && a_mode!=61 && a_mode!=62 && a_mode!=66 && a_mode!=67 && a_mode!=71) {cout<<"error! unknown analysis mode: "<<a_mode<<". make sure -gk or -eigen or -lmm or -bslmm -predict or -calccov is sepcified correctly."<<endl; error=true;} if (miss_level>1) {cout<<"error! missing level needs to be between 0 and 1. current value = "<<miss_level<<endl; error=true;} if (maf_level>0.5) {cout<<"error! maf level needs to be between 0 and 0.5. current value = "<<maf_level<<endl; error=true;} @@ -400,8 +506,8 @@ void PARAM::CheckParam (void) str=file_cat; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open category file: "<<str<<endl; error=true;} - str=file_var; - if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open category file: "<<str<<endl; error=true;} + str=file_mcat; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mcategory file: "<<str<<endl; error=true;} str=file_beta; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open beta file: "<<str<<endl; error=true;} @@ -409,23 +515,33 @@ void PARAM::CheckParam (void) str=file_cor; if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open correlation file: "<<str<<endl; error=true;} - str=file_q; - if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open q file: "<<str<<endl; error=true;} + if (!file_study.empty()) { + str=file_study+".Vq.txt"; + if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .Vq.txt file: "<<str<<endl; error=true;} + str=file_study+".q.txt"; + if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .q.txt file: "<<str<<endl; error=true;} + str=file_study+".size.txt"; + if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .size.txt file: "<<str<<endl; error=true;} + } - str=file_s; - if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open s file: "<<str<<endl; error=true;} + if (!file_ref.empty()) { + str=file_ref+".S.txt"; + if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .S.txt file: "<<str<<endl; error=true;} + str=file_ref+".size.txt"; + if (stat(str.c_str(),&fileInfo)==-1) {cout<<"error! fail to open .size.txt file: "<<str<<endl; error=true;} + } - str=file_v; - if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open v file: "<<str<<endl; error=true;} + str=file_mstudy; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mstudy file: "<<str<<endl; error=true;} - str=file_mq; - if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mq file: "<<str<<endl; error=true;} + str=file_mref; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mref file: "<<str<<endl; error=true;} - str=file_ms; - if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open ms file: "<<str<<endl; error=true;} + str=file_mgeno; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mgeno file: "<<str<<endl; error=true;} - str=file_mv; - if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mv file: "<<str<<endl; error=true;} + str=file_mbfile; + if (!str.empty() && stat(str.c_str(),&fileInfo)==-1 ) {cout<<"error! fail to open mbfile file: "<<str<<endl; error=true;} size_t flag=0; if (!file_bfile.empty()) {flag++;} @@ -434,7 +550,7 @@ void PARAM::CheckParam (void) // WJA added if (!file_oxford.empty()) {flag++;} - if (flag!=1 && a_mode!=27 && a_mode!=28 && a_mode!=43 && a_mode!=5 && a_mode!=61 && a_mode!=62) { + if (flag!=1 && a_mode!=27 && a_mode!=28 && a_mode!=43 && a_mode!=5 && a_mode!=61 && a_mode!=62 && a_mode!=66 && a_mode!=67) { cout<<"error! either plink binary files, or bimbam mean genotype files, or gene expression files are required."<<endl; error=true; } @@ -443,21 +559,30 @@ void PARAM::CheckParam (void) } if (a_mode==61 || a_mode==62) { - if (!file_pheno.empty()) { + if (!file_beta.empty()) { + if ( file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() && file_geno.empty() && file_mref.empty() && file_ref.empty() ) { + cout<<"error! missing genotype file or ref/mref file."<<endl; error=true; + } + } else if (!file_pheno.empty()) { if (file_kin.empty() && (file_ku.empty()||file_kd.empty()) && file_mk.empty() ) { cout<<"error! missing relatedness file. "<<endl; error=true; } + /* } else if (!file_cor.empty()) { if (file_beta.empty() ) { cout<<"error! missing cor file."<<endl; error=true; } - } else { - if ( (file_mq.empty() || file_ms.empty() || file_mv.empty() ) && (file_q.empty() || file_s.empty() || file_v.empty() ) ) { - cout<<"error! either phenotype/kinship files or ms/mq/mv s/q/v files are required."<<endl; error=true; - } + */ + } else if ( (file_mstudy.empty() && file_study.empty()) || (file_mref.empty() && file_ref.empty() ) ) { + cout<<"error! either beta file, or phenotype files or study/ref mstudy/mref files are required."<<endl; error=true; } } + if (a_mode==66 || a_mode==67) { + if (file_beta.empty() || ( file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() && file_geno.empty()) ) { + cout<<"error! missing beta file or genotype file."<<endl; error=true; + } + } if (!file_epm.empty() && file_bfile.empty() && file_geno.empty() ) {cout<<"error! estimated parameter file also requires genotype file."<<endl; error=true;} @@ -525,13 +650,16 @@ void PARAM::CheckParam (void) void PARAM::CheckData (void) { if(file_oxford.empty()) // WJA NOTE: I added this condition so that covariates can be added through sample, probably not exactly what is wanted - { if ((file_cvt).empty() || (indicator_cvt).size()==0) { n_cvt=1; } } + if ( (a_mode==66 || a_mode==67) && (v_pve.size()!=n_vc)) { + cout<<"error! the number of pve estimates does not equal to the number of categories in the cat file:"<<v_pve.size()<<" "<<n_vc<<endl; error=true; + } + if ( (indicator_cvt).size()!=0 && (indicator_cvt).size()!=(indicator_idv).size()) { error=true; cout<<"error! number of rows in the covariates file do not match the number of individuals. "<<endl; @@ -610,7 +738,7 @@ void PARAM::CheckData (void) { } } */ - if (ni_test==0 && file_cor.empty() && file_mq.empty() && file_q.empty() && file_beta.empty() ) { + if (ni_test==0 && file_cor.empty() && file_mstudy.empty() && file_study.empty() && file_beta.empty() ) { error=true; cout<<"error! number of analyzed individuals equals 0. "<<endl; return; @@ -631,7 +759,7 @@ void PARAM::CheckData (void) { } //output some information - if (file_cor.empty() && file_mq.empty() && file_q.empty() ) { + if (file_cor.empty() && file_mstudy.empty() && file_study.empty() && a_mode!=27 && a_mode!=28) { cout<<"## number of total individuals = "<<ni_total<<endl; if (a_mode==43) { cout<<"## number of analyzed individuals = "<<ni_cvt<<endl; @@ -709,6 +837,9 @@ void PARAM::CheckData (void) { } } + if (a_mode==62 && !file_beta.empty() && mapRS2wcat.size()==0) {cout<<"vc analysis with beta files requires -wcat file."<<endl; error=true;} + if (a_mode==67 && mapRS2wcat.size()==0) {cout<<"ci analysis with beta files requires -wcat file."<<endl; error=true;} + //file_mk needs to contain more than one line if (n_vc==1 && !file_mk.empty()) {cout<<"error! -mk file should contain more than one line."<<endl; error=true;} @@ -783,46 +914,52 @@ void PARAM::CalcKin (gsl_matrix *matrix_kin) { -//from an existing n by nd G matrix, compute the d by d S matrix -void compKtoS (const gsl_matrix *G, gsl_matrix *S) { - size_t n_vc=S->size1, ni_test=G->size1; - double di, dj, tr_KiKj, sum_Ki, sum_Kj, s_Ki, s_Kj, s_KiKj, si, sj, d; +//from an existing n by nd A and K matrices, compute the d by d S matrix (which is not necessary symmetric) +void compAKtoS (const gsl_matrix *A, const gsl_matrix *K, const size_t n_cvt, gsl_matrix *S) { + size_t n_vc=S->size1, ni_test=A->size1; + double di, dj, tr_AK, sum_A, sum_K, s_A, s_K, sum_AK, tr_A, tr_K, d; for (size_t i=0; i<n_vc; i++) { - for (size_t j=i; j<n_vc; j++) { - tr_KiKj=0; sum_Ki=0; sum_Kj=0; s_KiKj=0; si=0; sj=0; + for (size_t j=0; j<n_vc; j++) { + tr_AK=0; sum_A=0; sum_K=0; sum_AK=0; tr_A=0; tr_K=0; for (size_t l=0; l<ni_test; l++) { - s_Ki=0; s_Kj=0; + s_A=0; s_K=0; for (size_t k=0; k<ni_test; k++) { - di=gsl_matrix_get(G, l, k+ni_test*i); - dj=gsl_matrix_get(G, l, k+ni_test*j); - s_Ki+=di; s_Kj+=dj; + di=gsl_matrix_get(A, l, k+ni_test*i); + dj=gsl_matrix_get(K, l, k+ni_test*j); + s_A+=di; s_K+=dj; - tr_KiKj+=di*dj; sum_Ki+=di; sum_Kj+=dj; - if (l==k) {si+=di; sj+=dj;} + tr_AK+=di*dj; sum_A+=di; sum_K+=dj; + if (l==k) {tr_A+=di; tr_K+=dj;} } - s_KiKj+=s_Ki*s_Kj; + sum_AK+=s_A*s_K; } - sum_Ki/=(double)ni_test; - sum_Kj/=(double)ni_test; - s_KiKj/=(double)ni_test; - si-=sum_Ki; - sj-=sum_Kj; - d=tr_KiKj-2*s_KiKj+sum_Ki*sum_Kj; - d=d/(si*sj)-1/(double)(ni_test-1); + sum_A/=(double)ni_test; + sum_K/=(double)ni_test; + sum_AK/=(double)ni_test; + tr_A-=sum_A; + tr_K-=sum_K; + d=tr_AK-2*sum_AK+sum_A*sum_K; + + if (tr_A==0 || tr_K==0) { + d=0; + } else { + d=d/(tr_A*tr_K)-1/(double)(ni_test-n_cvt); + } gsl_matrix_set (S, i, j, d); - if (i!=j) {gsl_matrix_set (S, j, i, d);} } } + + //eigenlib_invert(Si); //cout<<tr_KiKj<<" "<<s_KiKj<<" "<<sum_Ki<<" "<<sum_Kj<<" "<<si<<" "<<sj<<" "<<d*1000000<<endl; return; } -//copied from lmm.cpp; is used in the following function compKtoQ +//copied from lmm.cpp; is used in the following function compKtoV //map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<<endl; return 0;} @@ -836,20 +973,19 @@ size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { return index; } -//from an existing n by nd (centered) G matrix, compute the d+1 by d*(d+1) Q matrix -//where inside i'th d+1 by d+1 matrix, each element is tr(KiKjKiKl)-r*tr(KjKiKl)-r*tr(KlKiKj)+r^2*tr(KjKl), where r=n/(n-1) -void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) { +//from an existing n by nd (centered) G matrix, compute the d+1 by d*(d-1)/2*(d+1) Q matrix +//where inside i'th d+1 by d+1 matrix, each element is tr(KiKlKjKm)-r*tr(KmKiKl)-r*tr(KlKjKm)+r^2*tr(KlKm), where r=n/(n-1) +void compKtoV (const gsl_matrix *G, gsl_matrix *V) { size_t n_vc=G->size2/G->size1, ni_test=G->size1; - gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, n_vc*(n_vc+1)/2*ni_test); - gsl_vector *trKiKjKi=gsl_vector_alloc ( n_vc*n_vc ); + gsl_matrix *KiKj=gsl_matrix_alloc(ni_test, (n_vc*(n_vc+1))/2*ni_test); gsl_vector *trKiKj=gsl_vector_alloc( n_vc*(n_vc+1)/2 ); gsl_vector *trKi=gsl_vector_alloc(n_vc); double d, tr, r=(double)ni_test/(double)(ni_test-1); - size_t t, t_ij, t_il, t_jl, t_ii; + size_t t, t_il, t_jm, t_lm, t_im, t_jl, t_ij; - //compute KiKj for all pairs of i and j (including the identity matrix) + //compute KiKj for all pairs of i and j (not including the identity matrix) t=0; for (size_t i=0; i<n_vc; i++) { gsl_matrix_const_view Ki=gsl_matrix_const_submatrix(G, 0, i*ni_test, ni_test, ni_test); @@ -889,99 +1025,108 @@ void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) { gsl_vector_set (trKi, i, tr); } - //compute trKiKjKi (it is not symmetric w.r.t. i and j) + //compute V for (size_t i=0; i<n_vc; i++) { - for (size_t j=0; j<n_vc; j++) { - tr=0; - t=GetabIndex (i+1, j+1, n_vc-2); - for (size_t k=0; k<ni_test; k++) { - gsl_vector_const_view KiKj_row=gsl_matrix_const_subrow (KiKj, k, t*ni_test, ni_test); - gsl_vector_const_view KiKj_col=gsl_matrix_const_column (KiKj, t*ni_test+k); - - gsl_vector_const_view Ki_col=gsl_matrix_const_column (G, i*ni_test+k); - - if (i<=j) { - gsl_blas_ddot (&KiKj_row.vector, &Ki_col.vector, &d); - tr+=d; - } else { - gsl_blas_ddot (&KiKj_col.vector, &Ki_col.vector, &d); - tr+=d; - } - } - gsl_vector_set (trKiKjKi, i*n_vc+j, tr); - } - } + for (size_t j=i; j<n_vc; j++) { + t_ij=GetabIndex (i+1, j+1, n_vc-2); + for (size_t l=0; l<n_vc+1; l++) { + for (size_t m=0; m<n_vc+1; m++) { + if (l!=n_vc && m!=n_vc) { + t_il=GetabIndex (i+1, l+1, n_vc-2); + t_jm=GetabIndex (j+1, m+1, n_vc-2); + t_lm=GetabIndex (l+1, m+1, n_vc-2); + //cout<<ni_test<<" "<<r<<t_ij<<" "<<t_il<<" "<<t_jl<<" "<<endl; + tr=0; + for (size_t k=0; k<ni_test; k++) { + gsl_vector_const_view KiKl_row=gsl_matrix_const_subrow (KiKj, k, t_il*ni_test, ni_test); + gsl_vector_const_view KiKl_col=gsl_matrix_const_column (KiKj, t_il*ni_test+k); + gsl_vector_const_view KjKm_row=gsl_matrix_const_subrow (KiKj, k, t_jm*ni_test, ni_test); + gsl_vector_const_view KjKm_col=gsl_matrix_const_column (KiKj, t_jm*ni_test+k); + + gsl_vector_const_view Kl_row=gsl_matrix_const_subrow (G, k, l*ni_test, ni_test); + gsl_vector_const_view Km_row=gsl_matrix_const_subrow (G, k, m*ni_test, ni_test); + + if (i<=l && j<=m) { + gsl_blas_ddot (&KiKl_row.vector, &KjKm_col.vector, &d); + tr+=d; + gsl_blas_ddot (&Km_row.vector, &KiKl_col.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KjKm_col.vector, &d); + tr-=r*d; + } else if (i<=l && j>m) { + gsl_blas_ddot (&KiKl_row.vector, &KjKm_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Km_row.vector, &KiKl_col.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KjKm_row.vector, &d); + tr-=r*d; + } else if (i>l && j<=m) { + gsl_blas_ddot (&KiKl_col.vector, &KjKm_col.vector, &d); + tr+=d; + gsl_blas_ddot (&Km_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KjKm_col.vector, &d); + tr-=r*d; + } else { + gsl_blas_ddot (&KiKl_col.vector, &KjKm_row.vector, &d); + tr+=d; + gsl_blas_ddot (&Km_row.vector, &KiKl_row.vector, &d); + tr-=r*d; + gsl_blas_ddot (&Kl_row.vector, &KjKm_row.vector, &d); + tr-=r*d; + } + } - //compute Q - for (size_t i=0; i<n_vc; i++) { - for (size_t j=0; j<n_vc+1; j++) { - for (size_t l=j; l<n_vc+1; l++) { - if (j!=n_vc && l!=n_vc) { - t_ij=GetabIndex (i+1, j+1, n_vc-2); - t_il=GetabIndex (i+1, l+1, n_vc-2); - t_jl=GetabIndex (j+1, l+1, n_vc-2); - - //cout<<ni_test<<" "<<r<<t_ij<<" "<<t_il<<" "<<t_jl<<" "<<endl; - tr=0; - for (size_t k=0; k<ni_test; k++) { - gsl_vector_const_view KiKj_row=gsl_matrix_const_subrow (KiKj, k, t_ij*ni_test, ni_test); - gsl_vector_const_view KiKj_col=gsl_matrix_const_column (KiKj, t_ij*ni_test+k); - gsl_vector_const_view KiKl_row=gsl_matrix_const_subrow (KiKj, k, t_il*ni_test, ni_test); - gsl_vector_const_view KiKl_col=gsl_matrix_const_column (KiKj, t_il*ni_test+k); - - gsl_vector_const_view Kj_row=gsl_matrix_const_subrow (G, k, j*ni_test, ni_test); - gsl_vector_const_view Kl_row=gsl_matrix_const_subrow (G, k, l*ni_test, ni_test); - - if (i<=j && i<=l) { - gsl_blas_ddot (&KiKj_row.vector, &KiKl_col.vector, &d); - tr+=d; - gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d); - tr-=r*d; - gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d); - tr-=r*d; - } else if (i<=j && i>l) { - gsl_blas_ddot (&KiKj_row.vector, &KiKl_row.vector, &d); - tr+=d; - gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); - tr-=r*d; - gsl_blas_ddot (&Kl_row.vector, &KiKj_col.vector, &d); - tr-=r*d; - } else if (i>j && i<=l) { - gsl_blas_ddot (&KiKj_col.vector, &KiKl_col.vector, &d); - tr+=d; - gsl_blas_ddot (&Kj_row.vector, &KiKl_col.vector, &d); - tr-=r*d; - gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); - tr-=r*d; - } else { - gsl_blas_ddot (&KiKj_col.vector, &KiKl_row.vector, &d); - tr+=d; - gsl_blas_ddot (&Kj_row.vector, &KiKl_row.vector, &d); - tr-=r*d; - gsl_blas_ddot (&Kl_row.vector, &KiKj_row.vector, &d); - tr-=r*d; + tr+=r*r*gsl_vector_get (trKiKj, t_lm); + } else if (l!=n_vc && m==n_vc) { + t_il=GetabIndex (i+1, l+1, n_vc-2); + t_jl=GetabIndex (j+1, l+1, n_vc-2); + tr=0; + for (size_t k=0; k<ni_test; k++) { + gsl_vector_const_view KiKl_row=gsl_matrix_const_subrow (KiKj, k, t_il*ni_test, ni_test); + gsl_vector_const_view KiKl_col=gsl_matrix_const_column (KiKj, t_il*ni_test+k); + gsl_vector_const_view Kj_row=gsl_matrix_const_subrow (G, k, j*ni_test, ni_test); + + if (i<=l) { + gsl_blas_ddot (&KiKl_row.vector, &Kj_row.vector, &d); + tr+=d; + } else { + gsl_blas_ddot (&KiKl_col.vector, &Kj_row.vector, &d); + tr+=d; + } } + tr+=-r*gsl_vector_get (trKiKj, t_il)-r*gsl_vector_get (trKiKj, t_jl)+r*r*gsl_vector_get (trKi, l); + } else if (l==n_vc && m!=n_vc) { + t_jm=GetabIndex (j+1, m+1, n_vc-2); + t_im=GetabIndex (i+1, m+1, n_vc-2); + tr=0; + for (size_t k=0; k<ni_test; k++) { + gsl_vector_const_view KjKm_row=gsl_matrix_const_subrow (KiKj, k, t_jm*ni_test, ni_test); + gsl_vector_const_view KjKm_col=gsl_matrix_const_column (KiKj, t_jm*ni_test+k); + gsl_vector_const_view Ki_row=gsl_matrix_const_subrow (G, k, i*ni_test, ni_test); + + if (j<=m) { + gsl_blas_ddot (&KjKm_row.vector, &Ki_row.vector, &d); + tr+=d; + } else { + gsl_blas_ddot (&KjKm_col.vector, &Ki_row.vector, &d); + tr+=d; + } + } + tr+=-r*gsl_vector_get (trKiKj, t_im)-r*gsl_vector_get (trKiKj, t_jm)+r*r*gsl_vector_get (trKi, m); + } else { + tr=gsl_vector_get (trKiKj, t_ij)-r*gsl_vector_get (trKi, i)-r*gsl_vector_get (trKi, j)+r*r*(double)(ni_test-1); } - tr+=r*r*gsl_vector_get (trKiKj, t_jl); - } else if (j!=n_vc && l==n_vc) { - t_ij=GetabIndex (i+1, j+1, n_vc-2); - tr=gsl_vector_get (trKiKjKi, i*n_vc+j)-2*r*gsl_vector_get (trKiKj, t_ij)+r*r*gsl_vector_get (trKi, j); - } else if (j==n_vc && l==n_vc) { - t_ii=GetabIndex (i+1, i+1, n_vc-2); - tr=gsl_vector_get (trKiKj, t_ii)-2*r*gsl_vector_get (trKi, i)+r*r*(double)(ni_test-1); + gsl_matrix_set (V, l, t_ij*(n_vc+1)+m, tr); } - - gsl_matrix_set (Q, j, i*(n_vc+1)+l, tr); - if (l!=j) {gsl_matrix_set (Q, l, i*(n_vc+1)+j, tr);} } } } - gsl_matrix_scale (Q, 1.0/pow((double)ni_test, 2) ); + gsl_matrix_scale (V, 1.0/pow((double)ni_test, 2) ); gsl_matrix_free(KiKj); - gsl_vector_free(trKiKjKi); gsl_vector_free(trKiKj); gsl_vector_free(trKi); @@ -991,190 +1136,210 @@ void compKtoQ (const gsl_matrix *G, gsl_matrix *Q) { //perform Jacknife sampling for variance of S -void JacknifeGtoS (const gsl_matrix *G, gsl_matrix *S, gsl_matrix *Svar) { - size_t n_vc=Svar->size1, ni_test=G->size1; - vector<vector<vector<double> > > tr_KiKj, s_KiKj; - vector<vector<double> > sum_Ki, s_Ki, si; +void JackknifeAKtoS (const gsl_matrix *W, const gsl_matrix *A, const gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar) { + size_t n_vc=Svar->size1, ni_test=A->size1, n_cvt=W->size2; + + vector<vector<vector<double> > > trAK, sumAK; + vector<vector<double> > sumA, sumK, trA, trK, sA, sK; vector<double> vec_tmp; double di, dj, d, m, v; + //gsl_matrix *Stmp=gsl_matrix_alloc (n_vc, ni_test*n_vc); + //gsl_matrix *Stmp_sub=gsl_matrix_alloc (n_vc, n_vc); + //initialize and set all elements to zero for (size_t i=0; i<ni_test; i++) { vec_tmp.push_back(0); } for (size_t i=0; i<n_vc; i++) { - sum_Ki.push_back(vec_tmp); - s_Ki.push_back(vec_tmp); - si.push_back(vec_tmp); + sumA.push_back(vec_tmp); + sumK.push_back(vec_tmp); + trA.push_back(vec_tmp); + trK.push_back(vec_tmp); + sA.push_back(vec_tmp); + sK.push_back(vec_tmp); } for (size_t i=0; i<n_vc; i++) { - tr_KiKj.push_back(sum_Ki); - s_KiKj.push_back(sum_Ki); + trAK.push_back(sumK); + sumAK.push_back(sumK); } - //run jacknife + //run jackknife for (size_t i=0; i<n_vc; i++) { for (size_t l=0; l<ni_test; l++) { for (size_t k=0; k<ni_test; k++) { - di=gsl_matrix_get(G, l, k+ni_test*i); + di=gsl_matrix_get(A, l, k+ni_test*i); + dj=gsl_matrix_get(K, l, k+ni_test*i); for (size_t t=0; t<ni_test; t++) { if (t==l || t==k) {continue;} - sum_Ki[i][t]+=di; - if (l==k) {si[i][t]+=di;} + sumA[i][t]+=di; + sumK[i][t]+=dj; + if (l==k) {trA[i][t]+=di; trK[i][t]+=dj;} } - s_Ki[i][l]+=di; + sA[i][l]+=di; + sK[i][l]+=dj; } } for (size_t t=0; t<ni_test; t++) { - sum_Ki[i][t]/=(double)(ni_test-1); + sumA[i][t]/=(double)(ni_test-1); + sumK[i][t]/=(double)(ni_test-1); } } for (size_t i=0; i<n_vc; i++) { - for (size_t j=i; j<n_vc; j++) { + for (size_t j=0; j<n_vc; j++) { for (size_t l=0; l<ni_test; l++) { for (size_t k=0; k<ni_test; k++) { - di=gsl_matrix_get(G, l, k+ni_test*i); - dj=gsl_matrix_get(G, l, k+ni_test*j); + di=gsl_matrix_get(A, l, k+ni_test*i); + dj=gsl_matrix_get(K, l, k+ni_test*j); d=di*dj; for (size_t t=0; t<ni_test; t++) { if (t==l || t==k) {continue;} - tr_KiKj[i][j][t]+=d; + trAK[i][j][t]+=d; } } for (size_t t=0; t<ni_test; t++) { if (t==l) {continue;} - di=gsl_matrix_get(G, l, t+ni_test*i); - dj=gsl_matrix_get(G, l, t+ni_test*j); + di=gsl_matrix_get(A, l, t+ni_test*i); + dj=gsl_matrix_get(K, l, t+ni_test*j); - s_KiKj[i][j][t]+=(s_Ki[i][l]-di)*(s_Ki[j][l]-dj); + sumAK[i][j][t]+=(sA[i][l]-di)*(sK[j][l]-dj); } } for (size_t t=0; t<ni_test; t++) { - s_KiKj[i][j][t]/=(double)(ni_test-1); + sumAK[i][j][t]/=(double)(ni_test-1); } m=0; v=0; for (size_t t=0; t<ni_test; t++) { - d=tr_KiKj[i][j][t]-2*s_KiKj[i][j][t]+sum_Ki[i][t]*sum_Ki[j][t]; - d/=(si[i][t]-sum_Ki[i][t])*(si[j][t]-sum_Ki[j][t]); - d-=1/(double)(ni_test-2); - + d=trAK[i][j][t]-2*sumAK[i][j][t]+sumA[i][t]*sumK[j][t]; + if ( (trA[i][t]-sumA[i][t])==0 || (trK[j][t]-sumK[j][t])==0) { + d=0; + } else { + d/=(trA[i][t]-sumA[i][t])*(trK[j][t]-sumK[j][t]); + d-=1/(double)(ni_test-n_cvt-1); + } + //gsl_matrix_set (Stmp, i, t*n_vc+j, d); + //gsl_matrix_set (Stmp, j, t*n_vc+i, d); m+=d; v+=d*d; } m/=(double)ni_test; v/=(double)ni_test; v-=m*m; v*=(double)(ni_test-1); + gsl_matrix_set (Svar, i, j, v); + if (n_cvt==1) { + d=gsl_matrix_get (S, i, j); + d=(double)ni_test*d-(double)(ni_test-1)*m; + gsl_matrix_set (S, i, j, d); + } + } + } + + /* + for (size_t t=0; t<ni_test; t++) { + gsl_matrix_view Stmp_view=gsl_matrix_submatrix(Stmp, 0, t*n_vc, n_vc, n_vc); + gsl_matrix_memcpy (Stmp_sub, &Stmp_view.matrix); + eigenlib_invert(Stmp_sub); + gsl_matrix_memcpy (&Stmp_view.matrix, Stmp_sub); + } + + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + m=0; v=0; + for (size_t t=0; t<ni_test; t++) { + d=gsl_matrix_get (Stmp, i, t*n_vc+j); + m+=d; + v+=d*d; + } + m/=(double)ni_test; + v/=(double)ni_test; + v-=m*m; + v*=(double)(ni_test-1); gsl_matrix_set (Svar, i, j, v); - d=gsl_matrix_get (S, i, j); + d=gsl_matrix_get (Si, i, j); d=(double)ni_test*d-(double)(ni_test-1)*m; - gsl_matrix_set (S, i, j, d); - if (i!=j) {gsl_matrix_set (Svar, j, i, v); gsl_matrix_set (S, j, i, d);} + gsl_matrix_set (Si, i, j, d); + if (i!=j) {gsl_matrix_set (Svar, j, i, v); gsl_matrix_set (Si, j, i, d);} } } + gsl_matrix_free (Stmp); + */ return; } //compute the d by d S matrix with its d by d variance matrix of Svar, and the d+1 by d(d+1) matrix of Q for V(q) -void PARAM::CalcS (gsl_matrix *S, gsl_matrix *Svar, gsl_matrix *Q) { +void PARAM::CalcS (const map<string, double> &mapRS2wA, const map<string, double> &mapRS2wK, const gsl_matrix *W, gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, gsl_vector *ns) { string file_str; gsl_matrix_set_zero (S); gsl_matrix_set_zero (Svar); - gsl_matrix_set_zero (Q); + gsl_vector_set_zero (ns); //compute the kinship matrix G for multiple categories; these matrices are not centered, for convienence of Jacknife sampling - gsl_matrix *G=gsl_matrix_alloc (ni_test, n_vc*ni_test); - gsl_matrix_set_zero (G); - if (!file_bfile.empty() ) { file_str=file_bfile+".bed"; - if (PlinkKin (file_str, indicator_idv, indicator_snp, a_mode-24, d_pace, mapRS2cat, mapRS2var, snpInfo, G)==false) {error=true;} - } else { + if (mapRS2wA.size()==0) { + if (PlinkKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns)==false) {error=true;} + } else { + if (PlinkKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns)==false) {error=true;} + } + } else if (!file_geno.empty()) { file_str=file_geno; - if (BimbamKin (file_str, indicator_idv, indicator_snp, a_mode-24, d_pace, mapRS2cat, mapRS2var, snpInfo, G)==false) {error=true;} + if (mapRS2wA.size()==0) { + if (BimbamKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns)==false) {error=true;} + } else { + if (BimbamKin (file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns)==false) {error=true;} + } + } else if (!file_mbfile.empty() ){ + if (mapRS2wA.size()==0) { + if (MFILEKin (1, file_mbfile, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns)==false) {error=true;} + } else { + if (MFILEKin (1, file_mbfile, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns)==false) {error=true;} + } + } else if (!file_mgeno.empty()) { + if (mapRS2wA.size()==0) { + if (MFILEKin (0, file_mgeno, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns)==false) {error=true;} + } else { + if (MFILEKin (0, file_mgeno, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns)==false) {error=true;} + } } - //center and scale every kinship matrix inside G - double d; - for (size_t i=0; i<n_vc; i++) { - gsl_matrix_view K=gsl_matrix_submatrix(G, 0, i*ni_test, ni_test, ni_test); - CenterMatrix(&K.matrix); - d=ScaleMatrix(&K.matrix); + if (mapRS2wA.size()==0) { + gsl_matrix_memcpy (A, K); } - //based on G, compute S - compKtoS (G, S); - - //based on G, compute a matrix Q that can be used to calculate the variance of q - compKtoQ (G, Q); - - /* - //set up random environment - gsl_rng_env_setup(); - gsl_rng *gsl_r; - const gsl_rng_type * gslType; - gslType = gsl_rng_default; - if (randseed<0) { - time_t rawtime; - time (&rawtime); - tm * ptm = gmtime (&rawtime); - - randseed = (unsigned) (ptm->tm_hour%24*3600+ptm->tm_min*60+ptm->tm_sec); - } - gsl_r = gsl_rng_alloc(gslType); - gsl_rng_set(gsl_r, randseed); - - //bootstrap: in each iteration, sample individuals and compute S_pmt - size_t n_pmt=100; - vector<size_t> idv_order, idv_remove; - for (size_t i=0; i<ni_test; i++) { - idv_order.push_back(i); - } - for (size_t i=0; i<n_pmt; i++) { - idv_remove.push_back(0); - } - gsl_ran_choose (gsl_r, static_cast<void*>(&idv_remove[0]), n_pmt, static_cast<void*>(&idv_order[0]), ni_test, sizeof(size_t)); + //center and scale every kinship matrix inside G + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_view Ksub=gsl_matrix_submatrix(K, 0, i*ni_test, ni_test, ni_test); + CenterMatrix(&Ksub.matrix); + ScaleMatrix(&Ksub.matrix); - gsl_matrix *S_pmt=gsl_matrix_alloc(n_vc, n_vc*n_pmt); - for (size_t i=0; i<n_pmt; i++) { - gsl_matrix_view S_sub=gsl_matrix_submatrix (S_pmt, 0, n_vc*i, n_vc, n_vc); - compKtoS (G, idv_remove[i], &S_sub.matrix); + gsl_matrix_view Asub=gsl_matrix_submatrix(A, 0, i*ni_test, ni_test, ni_test); + CenterMatrix(&Asub.matrix); + ScaleMatrix(&Asub.matrix); } - //based on S_pmt, compute Svar - double m, v, d; - for (size_t i=0; i<n_vc; i++) { - for (size_t j=i; j<n_vc; j++) { - m=0; v=0; - for (size_t t=0; t<n_pmt; t++) { - d=gsl_matrix_get(S_pmt, i, j); - m+=d; v+=d*d; - } - m/=(double)n_pmt; v/=(double)n_pmt; - v=v-m*m; - gsl_matrix_set(Svar, i, j, v); - if (i!=j) {gsl_matrix_set(Svar, j, i, v);} - } - } - */ + //based on G, compute S + compAKtoS (A, K, W->size2, S); //compute Svar and update S with Jacknife - JacknifeGtoS (G, S, Svar); + JackknifeAKtoS (W, A, K, S, Svar); + + //based on G, compute a matrix Q that can be used to calculate the variance of q + //compKtoV (G, V); - gsl_matrix_free(G); return; } @@ -1223,11 +1388,20 @@ void PARAM::WriteVar (const string suffix) outfile.precision(10); - for (size_t i=0; i<indicator_snp.size(); i++) { - if (indicator_snp[i]==0) {continue;} - rs=snpInfo[i].rs_number; - if (mapRS2var.count(rs)!=0) { - outfile<<rs<<"\t"<<mapRS2var.at(rs)<<endl; + if (mindicator_snp.size()!=0) { + for (size_t t=0; t<mindicator_snp.size(); t++) { + indicator_snp=mindicator_snp[t]; + for (size_t i=0; i<indicator_snp.size(); i++) { + if (indicator_snp[i]==0) {continue;} + rs=snpInfo[i].rs_number; + outfile<<rs<<endl; + } + } + } else { + for (size_t i=0; i<indicator_snp.size(); i++) { + if (indicator_snp[i]==0) {continue;} + rs=snpInfo[i].rs_number; + outfile<<rs<<endl; } } @@ -1564,3 +1738,219 @@ void PARAM::CopyRead (gsl_vector *log_N) +void PARAM::ObtainWeight (const set<string> &setSnps_beta, map<string, double> &mapRS2wK) +{ + mapRS2wK.clear(); + + vector<double> wsum, wcount; + + for (size_t i=0; i<n_vc; i++) { + wsum.push_back(0.0); + wcount.push_back(0.0); + } + + string rs; + if (msnpInfo.size()==0) { + for (size_t i=0; i<snpInfo.size(); i++) { + if (indicator_snp[i]==0) {continue;} + + rs=snpInfo[i].rs_number; + if ( (setSnps_beta.size()==0 || setSnps_beta.count(rs)!=0) && (mapRS2wsnp.size()==0 || mapRS2wsnp.count(rs)!=0) && (mapRS2wcat.size()==0 || mapRS2wcat.count(rs)!=0) && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) { + if (mapRS2wsnp.size()!=0) { + mapRS2wK[rs]=mapRS2wsnp[rs]; + if (mapRS2cat.size()==0) { + wsum[0]+=mapRS2wsnp[rs]; + } else { + wsum[mapRS2cat[rs]]+=mapRS2wsnp[rs]; + } + wcount[0]++; + } else { + mapRS2wK[rs]=1; + } + } + + } + } else { + for (size_t t=0; t<msnpInfo.size(); t++) { + snpInfo=msnpInfo[t]; + indicator_snp=mindicator_snp[t]; + + for (size_t i=0; i<snpInfo.size(); i++) { + if (indicator_snp[i]==0) {continue;} + + rs=snpInfo[i].rs_number; + if ( (setSnps_beta.size()==0 || setSnps_beta.count(rs)!=0) && (mapRS2wsnp.size()==0 || mapRS2wsnp.count(rs)!=0) && (mapRS2wcat.size()==0 || mapRS2wcat.count(rs)!=0) && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) { + if (mapRS2wsnp.size()!=0) { + mapRS2wK[rs]=mapRS2wsnp[rs]; + if (mapRS2cat.size()==0) { + wsum[0]+=mapRS2wsnp[rs]; + } else { + wsum[mapRS2cat[rs]]+=mapRS2wsnp[rs]; + } + wcount[0]++; + } else { + mapRS2wK[rs]=1; + } + } + } + } + } + + if (mapRS2wsnp.size()!=0) { + for (size_t i=0; i<n_vc; i++) { + wsum[i]/=wcount[i]; + } + + for (map<string, double>::iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) { + if (mapRS2cat.size()==0) { + it->second/=wsum[0]; + } else { + it->second/=wsum[mapRS2cat[it->first]]; + } + } + } + return; +} + + +//pve_flag=0 then do not change pve; pve_flag==1, then change pve to 0 if pve < 0 and pve to 1 if pve > 1 +void PARAM::UpdateWeight (const size_t pve_flag, const map<string, double> &mapRS2wK, const size_t ni_test, const gsl_vector *ns, map<string, double> &mapRS2wA) +{ + double d; + vector<double> wsum, wcount; + + for (size_t i=0; i<n_vc; i++) { + wsum.push_back(0.0); + wcount.push_back(0.0); + } + + for (map<string, double>::const_iterator it=mapRS2wK.begin(); it!=mapRS2wK.end(); ++it) { + d=1; + for (size_t i=0; i<n_vc; i++) { + if (v_pve[i]>=1 && pve_flag==1) { + d+=(double)ni_test/gsl_vector_get(ns, i)*mapRS2wcat[it->first][i]; + } else if (v_pve[i]<=0 && pve_flag==1) { + d+=0; + } else { + d+=(double)ni_test/gsl_vector_get(ns, i)*mapRS2wcat[it->first][i]*v_pve[i]; + } + } + mapRS2wA[it->first]=1/(d*d); + + if (mapRS2cat.size()==0) { + wsum[0]+=mapRS2wA[it->first]; + wcount[0]++; + } else { + wsum[mapRS2cat[it->first]]+=mapRS2wA[it->first]; + wcount[mapRS2cat[it->first]]++; + } + } + + for (size_t i=0; i<n_vc; i++) { + wsum[i]/=wcount[i]; + } + + for (map<string, double>::iterator it=mapRS2wA.begin(); it!=mapRS2wA.end(); ++it) { + if (mapRS2cat.size()==0) { + it->second/=wsum[0]; + } else { + it->second/=wsum[mapRS2cat[it->first]]; + } + } + return; +} + +// this function updates indicator_snp, and save z-scores and other values into vectors +void PARAM::UpdateSNPnZ (const map<string, double> &mapRS2wA, const map<string, string> &mapRS2A1, const map<string, double> &mapRS2z, gsl_vector *w, gsl_vector *z, vector<size_t> &vec_cat) +{ + gsl_vector_set_zero (w); + gsl_vector_set_zero (z); + vec_cat.clear(); + + string rs, a1; + size_t c=0; + if (msnpInfo.size()==0) { + for (size_t i=0; i<snpInfo.size(); i++) { + if (indicator_snp[i]==0) {continue;} + + rs=snpInfo[i].rs_number; + a1=snpInfo[i].a_minor; + + if (mapRS2wA.count(rs)!=0) { + if (a1==mapRS2A1.at(rs)) { + gsl_vector_set (z, c, mapRS2z.at(rs) ); + } else { + gsl_vector_set (z, c, -1*mapRS2z.at(rs) ); + } + vec_cat.push_back(mapRS2cat.at(rs) ); + gsl_vector_set (w, c, mapRS2wA.at(rs) ); + + c++; + } else { + indicator_snp[i]=0; + } + } + } else { + for (size_t t=0; t<msnpInfo.size(); t++) { + snpInfo=msnpInfo[t]; + + for (size_t i=0; i<snpInfo.size(); i++) { + if (mindicator_snp[t][i]==0) {continue;} + + rs=snpInfo[i].rs_number; + a1=snpInfo[i].a_minor; + + if (mapRS2wA.count(rs)!=0) { + if (a1==mapRS2A1.at(rs)) { + gsl_vector_set (z, c, mapRS2z.at(rs) ); + } else { + gsl_vector_set (z, c, -1*mapRS2z.at(rs) ); + } + vec_cat.push_back(mapRS2cat.at(rs) ); + gsl_vector_set (w, c, mapRS2wA.at(rs) ); + + c++; + } else { + mindicator_snp[t][i]=0; + } + } + } + } + + return; +} + + + +// this function updates indicator_snp, and save z-scores and other values into vectors +void PARAM::UpdateSNP (const map<string, double> &mapRS2wA) +{ + string rs; + if (msnpInfo.size()==0) { + for (size_t i=0; i<snpInfo.size(); i++) { + if (indicator_snp[i]==0) {continue;} + + rs=snpInfo[i].rs_number; + + if (mapRS2wA.count(rs)==0) { + indicator_snp[i]=0; + } + } + } else { + for (size_t t=0; t<msnpInfo.size(); t++) { + snpInfo=msnpInfo[t]; + + for (size_t i=0; i<mindicator_snp[t].size(); i++) { + if (mindicator_snp[t][i]==0) {continue;} + + rs=snpInfo[i].rs_number; + + if (mapRS2wA.count(rs)==0) { + mindicator_snp[t][i]=0; + } + } + } + } + + return; +} diff --git a/src/param.h b/src/param.h index 3c3b42e..4b4ad29 100644 --- a/src/param.h +++ b/src/param.h @@ -102,6 +102,8 @@ public: size_t n_col; size_t nmis_col; size_t nobs_col; + size_t ncase_col; + size_t ncontrol_col; size_t af_col; size_t var_col; size_t ws_col; @@ -120,23 +122,21 @@ public: vector<size_t> p_column; //which phenotype column needs analysis size_t d_pace; //display pace - string file_bfile; - string file_geno; + string file_bfile, file_mbfile; + string file_geno, file_mgeno; string file_pheno; string file_anno; //optional string file_gxe; //optional string file_cvt; //optional - string file_cat; + string file_cat, file_mcat; string file_var; string file_beta; string file_cor; - string file_kin; + string file_kin, file_mk; string file_ku, file_kd; - string file_mk; - string file_q, file_mq; - string file_s, file_ms; - string file_v, file_mv; - string file_weight; + string file_study, file_mstudy; + string file_ref, file_mref; + string file_weight, file_wsnp, file_wcat; string file_out; string path_out; @@ -165,7 +165,7 @@ public: size_t n_region; double l_mle_null, l_remle_null; double logl_mle_H0, logl_remle_H0; - double pve_null, pve_se_null; + double pve_null, pve_se_null, pve_total, se_pve_total; double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null; vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; @@ -185,6 +185,8 @@ public: vector<double> v_sigma2; vector<double> v_se_sigma2; + vector<double> v_enrich; + vector<double> v_se_enrich; vector<double> v_beta; vector<double> v_se_beta; @@ -210,15 +212,18 @@ public: size_t window_bp; size_t window_ns; + //vc related parameters + size_t n_block; + // Summary statistics bool error; - size_t ni_total, ni_test, ni_cvt; //number of individuals + size_t ni_total, ni_test, ni_cvt, ni_study, ni_ref; //number of individuals size_t np_obs, np_miss; //number of observed and missing phenotypes - size_t ns_total, ns_test; //number of snps + size_t ns_total, ns_test, ns_study, ns_ref; //number of snps size_t ng_total, ng_test; //number of genes size_t ni_control, ni_case; //number of controls and number of cases size_t ni_subsample; //number of subsampled individuals - size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel + //size_t ni_total_ref, ns_total_ref, ns_pair;//max number of individuals, number of snps and number of snp pairs in the reference panel size_t n_cvt; //number of covariates size_t n_ph; //number of phenotypes size_t n_vc; //number of variance components (including the diagonal matrix) @@ -240,6 +245,7 @@ public: vector<vector<int> > indicator_pheno; //a matrix record when a phenotype is missing for an individual; 0 missing, 1 available vector<int> indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis vector<int> indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + vector< vector<int> > mindicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis vector<int> indicator_cvt; //indicator for covariates, 0 missing, 1 available for analysis vector<int> indicator_gxe; //indicator for gxe, 0 missing, 1 available for analysis vector<int> indicator_weight; //indicator for weight, 0 missing, 1 available for analysis @@ -256,9 +262,11 @@ public: map<string, double> mapRS2cM; //map rs# to cM map<string, double> mapRS2est; //map rs# to parameters map<string, size_t> mapRS2cat; //map rs# to category number - map<string, double> mapRS2var; //map rs# to category number + map<string, double> mapRS2wsnp; //map rs# to snp weights + map<string, vector<double> > mapRS2wcat; //map rs# to snp cat weights vector<SNPINFO> snpInfo; //record SNP information + vector< vector<SNPINFO> > msnpInfo; //record SNP information set<string> setSnps; //a set of snps for analysis //constructor @@ -279,12 +287,16 @@ public: void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag); void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag); void CalcKin (gsl_matrix *matrix_kin); - void CalcS (gsl_matrix *S, gsl_matrix *Svar, gsl_matrix *Q); + void CalcS (const map<string, double> &mapRS2wA, const map<string, double> &mapRS2wK, const gsl_matrix *W, gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, gsl_vector *ns); void WriteVector (const gsl_vector *q, const gsl_vector *s, const size_t n_total, const string suffix); void WriteVar (const string suffix); void WriteMatrix (const gsl_matrix *matrix_U, const string suffix); void WriteVector (const gsl_vector *vector_D, const string suffix); void CopyRead (gsl_vector *log_N); + void ObtainWeight (const set<string> &setSnps_beta, map<string, double> &mapRS2wK); + void UpdateWeight (const size_t pve_flag, const map<string, double> &mapRS2wK, const size_t ni_test, const gsl_vector *ns, map<string, double> &mapRS2wA); + void UpdateSNPnZ (const map<string, double> &mapRS2wA, const map<string, string> &mapRS2A1, const map<string, double> &mapRS2z, gsl_vector *w, gsl_vector *z, vector<size_t> &vec_cat); + void UpdateSNP (const map<string, double> &mapRS2wA); }; @@ -1,17 +1,17 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011 Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ @@ -26,8 +26,12 @@ #include <cmath> #include <iostream> #include <stdio.h> -#include <stdlib.h> +#include <stdlib.h> #include <bitset> +#include <vector> +#include <set> +#include <map> +#include <string> #include <cstring> #include "gsl/gsl_vector.h" @@ -39,9 +43,14 @@ #include "gsl/gsl_multiroots.h" #include "gsl/gsl_min.h" +#include "Eigen/Dense" + +#include "param.h" #include "io.h" #include "lapack.h" +#include "eigenlib.h" #include "gzstream.h" +#include "mathfunc.h" #ifdef FORCE_FLOAT #include "lmm_float.h" @@ -54,95 +63,194 @@ using namespace std; - +using namespace Eigen; //in this file, X, Y are already transformed (i.e. UtX and UtY) -void VC::CopyFromParam (PARAM &cPar) -{ - file_out=cPar.file_out; - - // v_sigma2=cPar.v_sigma2; - - time_UtX=0.0; - time_opt=0.0; +void VC::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; - v_traceG=cPar.v_traceG; - - return; + file_cat=cPar.file_cat; + file_beta=cPar.file_beta; + file_cor=cPar.file_cor; + + setSnps=cPar.setSnps; + + file_out=cPar.file_out; + path_out=cPar.path_out; + + //v_sigma2=cPar.v_sigma2; + + time_UtX=0.0; + time_opt=0.0; + + v_traceG=cPar.v_traceG; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ns_test=cPar.ns_test; + + crt=cPar.crt; + window_cm=cPar.window_cm; + window_bp=cPar.window_bp; + window_ns=cPar.window_ns; + + n_vc=cPar.n_vc; + + return; } -void VC::CopyToParam (PARAM &cPar) +void VC::CopyToParam (PARAM &cPar) { cPar.time_UtX=time_UtX; - cPar.time_opt=time_opt; - - cPar.v_sigma2=v_sigma2; - cPar.v_se_sigma2=v_se_sigma2; + cPar.time_opt=time_opt; + cPar.v_pve=v_pve; cPar.v_se_pve=v_se_pve; + cPar.v_sigma2=v_sigma2; + cPar.v_se_sigma2=v_se_sigma2; + cPar.pve_total=pve_total; + cPar.se_pve_total=se_pve_total; cPar.v_traceG=v_traceG; - + cPar.v_beta=v_beta; cPar.v_se_beta=v_se_beta; - + + cPar.ni_total=ni_total; + cPar.ns_total=ns_total; + cPar.ns_test=ns_test; + + cPar.n_vc=n_vc; + + return; +} + + + +void VC::WriteFile_qs (const gsl_vector *s_vec, const gsl_vector *q_vec, const gsl_vector *qvar_vec, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".qvec.txt"; + + ofstream outfile_q (file_str.c_str(), ofstream::out); + if (!outfile_q) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} + + for (size_t i=0; i<s_vec->size; i++) { + outfile_q<<gsl_vector_get(s_vec, i)<<endl; + } + for (size_t i=0; i<q_vec->size; i++) { + outfile_q<<gsl_vector_get(q_vec, i)<<endl; + } + for (size_t i=0; i<qvar_vec->size; i++) { + outfile_q<<gsl_vector_get(qvar_vec, i)<<endl; + } + + outfile_q.clear(); + outfile_q.close(); + + file_str=path_out+"/"+file_out; + file_str+=".smat.txt"; + + ofstream outfile_s (file_str.c_str(), ofstream::out); + if (!outfile_s) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;} + + for (size_t i=0; i<S_mat->size1; i++) { + for (size_t j=0; j<S_mat->size2; j++) { + outfile_s<<gsl_matrix_get(S_mat, i, j)<<"\t"; + } + outfile_s<<endl; + } + for (size_t i=0; i<Svar_mat->size1; i++) { + for (size_t j=0; j<Svar_mat->size2; j++) { + outfile_s<<gsl_matrix_get(Svar_mat, i, j)<<"\t"; + } + outfile_s<<endl; + } + + outfile_s.clear(); + outfile_s.close(); + return; } + + + + + void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p) { size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1, n_cvt=(p->W)->size2; - + gsl_matrix *K_temp=gsl_matrix_alloc(n1, n1); gsl_matrix *HiW=gsl_matrix_alloc(n1, n_cvt); gsl_matrix *WtHiW=gsl_matrix_alloc(n_cvt, n_cvt); gsl_matrix *WtHiWi=gsl_matrix_alloc(n_cvt, n_cvt); gsl_matrix *WtHiWiWtHi=gsl_matrix_alloc(n_cvt, n1); - double sigma2; + double sigma2; //calculate H=\sum_i^{k+1} \sigma_i^2 K_i gsl_matrix_set_zero (p->P); for (size_t i=0; i<n_vc+1; i++) { if (i==n_vc) { - gsl_matrix_set_identity (K_temp); + gsl_matrix_set_identity (K_temp); } else { gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); gsl_matrix_memcpy (K_temp, &K_sub.matrix); } - sigma2=exp(gsl_vector_get (log_sigma2, i) ); + //when unconstrained, update on sigma2 instead of log_sigma2 + if (p->noconstrain) { + sigma2=gsl_vector_get (log_sigma2, i); + } else { + sigma2=exp(gsl_vector_get (log_sigma2, i) ); + } gsl_matrix_scale(K_temp, sigma2); gsl_matrix_add (p->P, K_temp); } //calculate H^{-1} + /* int sig; gsl_permutation * pmt1=gsl_permutation_alloc (n1); - LUDecomp (p->P, pmt1, &sig); + LUDecomp (p->P, pmt1, &sig); LUInvert (p->P, pmt1, K_temp); gsl_permutation_free(pmt1); gsl_matrix_memcpy (p->P, K_temp); + */ + eigenlib_invert(p->P); //calculate P=H^{-1}-H^{-1}W(W^TH^{-1}W)^{-1}W^TH^{-1} - gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW); - gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW); + //gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, p->P, p->W, 0.0, HiW); + //gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, p->W, HiW, 0.0, WtHiW); + + eigenlib_dgemm ("N", "N", 1.0, p->P, p->W, 0.0, HiW); + eigenlib_dgemm ("T", "N", 1.0, p->W, HiW, 0.0, WtHiW); - gsl_permutation * pmt2=gsl_permutation_alloc (n_cvt); - LUDecomp (WtHiW, pmt2, &sig); - LUInvert (WtHiW, pmt2, WtHiWi); - gsl_permutation_free(pmt2); + //gsl_permutation * pmt2=gsl_permutation_alloc (n_cvt); + //LUDecomp (WtHiW, pmt2, &sig); + //LUInvert (WtHiW, pmt2, WtHiWi); + //gsl_permutation_free(pmt2); + eigenlib_invert(WtHiW); + gsl_matrix_memcpy(WtHiWi, WtHiW); + + //gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); + //gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P); + eigenlib_dgemm ("N", "T", 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); + eigenlib_dgemm ("N", "N", -1.0, HiW, WtHiWiWtHi, 1.0, p->P); - gsl_blas_dgemm (CblasNoTrans, CblasTrans, 1.0, WtHiWi, HiW, 0.0, WtHiWiWtHi); - gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, -1.0, HiW, WtHiWiWtHi, 1.0, p->P); - //calculate Py, KPy, PKPy - gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py); + gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, p->y, 0.0, p->Py); + //eigenlib_dgemv("N", 1.0, p->P, p->y, 0.0, p->Py); + double d; for (size_t i=0; i<n_vc+1; i++) { gsl_vector_view KPy=gsl_matrix_column (p->KPy_mat, i); gsl_vector_view PKPy=gsl_matrix_column (p->PKPy_mat, i); @@ -150,11 +258,22 @@ void UpdateParam (const gsl_vector *log_sigma2, VC_PARAM *p) if (i==n_vc) { gsl_vector_memcpy (&KPy.vector, p->Py); } else { - gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); + gsl_matrix_const_view K_sub=gsl_matrix_const_submatrix (p->K, 0, n1*i, n1, n1); + //seems to be important to use gsl dgemv here instead of eigenlib_dgemv; otherwise gsl_blas_dgemv(CblasNoTrans, 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector); + //eigenlib_dgemv("N", 1.0, &K_sub.matrix, p->Py, 0.0, &KPy.vector); } - + gsl_blas_dgemv(CblasNoTrans, 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector); + //eigenlib_dgemv("N", 1.0, p->P, &KPy.vector, 0.0, &PKPy.vector); + + //when phenotypes are not normalized well, then some values in the following matrix maybe nan; change that to 0; this seems to only happen when eigenlib_dgemv was used above + for (size_t j=0; j<p->KPy_mat->size1; j++) { + d=gsl_matrix_get (p->KPy_mat, j, i); + if (std::isnan(d)) {gsl_matrix_set (p->KPy_mat, j, i, 0); cout<<"nan appears in "<<i<<" "<<j<<endl;} + d=gsl_matrix_get (p->PKPy_mat, j, i); + if (std::isnan(d)) {gsl_matrix_set (p->PKPy_mat, j, i, 0); cout<<"nan appears in "<<i<<" "<<j<<endl;} + } } gsl_matrix_free (K_temp); @@ -173,7 +292,7 @@ int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1) VC_PARAM *p=(VC_PARAM *) params; size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1; - + double tr, d; //update parameters @@ -199,8 +318,12 @@ int LogRL_dev1 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1) gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); gsl_blas_ddot(p->Py, &KPy_i.vector, &d); - d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i)); - + if (p->noconstrain) { + d=(-0.5*tr+0.5*d); + } else { + d=(-0.5*tr+0.5*d)*exp(gsl_vector_get(log_sigma2, i)); + } + gsl_vector_set(dev1, i, d); } @@ -214,32 +337,47 @@ int LogRL_dev2 (const gsl_vector *log_sigma2, void *params, gsl_matrix *dev2) VC_PARAM *p=(VC_PARAM *) params; size_t n_vc=log_sigma2->size-1; - + double d, sigma2_i, sigma2_j; //update parameters UpdateParam (log_sigma2, p); - + //calculate dev2=0.5(yPKPKPy) for (size_t i=0; i<n_vc+1; i++) { gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); - sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + if (p->noconstrain) { + sigma2_i=gsl_vector_get(log_sigma2, i); + } else { + sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + } for (size_t j=i; j<n_vc+1; j++) { gsl_vector_view PKPy_j=gsl_matrix_column (p->PKPy_mat, j); gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); - sigma2_j=exp(gsl_vector_get(log_sigma2, j)); - - d*=-0.5*sigma2_i*sigma2_j; + if (p->noconstrain) { + sigma2_j=gsl_vector_get(log_sigma2, j); + d*=-0.5; + } else { + sigma2_j=exp(gsl_vector_get(log_sigma2, j)); + d*=-0.5*sigma2_i*sigma2_j; + } gsl_matrix_set(dev2, i, j, d); if (j!=i) {gsl_matrix_set(dev2, j, i, d);} - } + } } gsl_matrix_memcpy (p->Hessian, dev2); - + /* + for (size_t i=0; i<dev2->size1; i++) { + for (size_t j=0; j<dev2->size2; j++) { + cout<<gsl_matrix_get (dev2, i, j)<<" "; + } + cout<<endl; + } + */ return GSL_SUCCESS; } @@ -250,14 +388,14 @@ int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, g VC_PARAM *p=(VC_PARAM *) params; size_t n1=(p->K)->size1, n_vc=log_sigma2->size-1; - + double tr, d, sigma2_i, sigma2_j; //update parameters UpdateParam (log_sigma2, p); - //calculate dev1=-0.5*trace(PK_i)+0.5*yPKPy - //calculate dev2=0.5(yPKPKPy) + //calculate dev1=(-0.5*trace(PK_i)+0.5*yPK_iPy)*sigma2_i + //calculate dev2=0.5(yPK_iPK_jPy)*sigma2_i*sigma2_j for (size_t i=0; i<n_vc+1; i++) { if (i==n_vc) { tr=0; @@ -277,21 +415,31 @@ int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, g gsl_vector_view KPy_i=gsl_matrix_column (p->KPy_mat, i); gsl_blas_ddot(p->Py, &KPy_i.vector, &d); - sigma2_i=exp(gsl_vector_get(log_sigma2, i)); - d=(-0.5*tr+0.5*d)*sigma2_i; - + if (p->noconstrain) { + sigma2_i=gsl_vector_get(log_sigma2, i); + d=(-0.5*tr+0.5*d); + } else { + sigma2_i=exp(gsl_vector_get(log_sigma2, i)); + d=(-0.5*tr+0.5*d)*sigma2_i; + } + gsl_vector_set(dev1, i, d); - + for (size_t j=i; j<n_vc+1; j++) { gsl_vector_view PKPy_j=gsl_matrix_column (p->PKPy_mat, j); gsl_blas_ddot(&KPy_i.vector, &PKPy_j.vector, &d); - sigma2_j=exp(gsl_vector_get(log_sigma2, j)); - d*=-0.5*sigma2_i*sigma2_j; + if (p->noconstrain) { + sigma2_j=gsl_vector_get(log_sigma2, j); + d*=-0.5; + } else { + sigma2_j=exp(gsl_vector_get(log_sigma2, j)); + d*=-0.5*sigma2_i*sigma2_j; + } gsl_matrix_set(dev2, i, j, d); if (j!=i) {gsl_matrix_set(dev2, j, i, d);} - } + } } @@ -303,13 +451,1195 @@ int LogRL_dev12 (const gsl_vector *log_sigma2, void *params, gsl_vector *dev1, g -void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) + +//read header to determine which column contains which item +bool ReadHeader (const string &line, HEADER &header) +{ + string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID"}; + set<string> rs_set(rs_ptr, rs_ptr+10); + string chr_ptr[]={"chr","CHR"}; + set<string> chr_set(chr_ptr, chr_ptr+2); + string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"}; + set<string> pos_set(pos_ptr, pos_ptr+8); + string cm_ptr[]={"cm","CM"}; + set<string> cm_set(cm_ptr, cm_ptr+2); + string a1_ptr[]={"a1","A1","allele1","ALLELE1"}; + set<string> a1_set(a1_ptr, a1_ptr+4); + string a0_ptr[]={"a0","A0","allele0","ALLELE0"}; + set<string> a0_set(a0_ptr, a0_ptr+4); + + string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"}; + set<string> z_set(z_ptr, z_ptr+6); + string beta_ptr[]={"beta","BETA","b","B"}; + set<string> beta_set(beta_ptr, beta_ptr+4); + string sebeta_ptr[]={"se_beta","SE_BETA","se","SE"}; + set<string> sebeta_set(sebeta_ptr, sebeta_ptr+4); + string chisq_ptr[]={"chisq","CHISQ","chisquare","CHISQUARE"}; + set<string> chisq_set(chisq_ptr, chisq_ptr+4); + string p_ptr[]={"p","P","pvalue","PVALUE","p-value","P-VALUE"}; + set<string> p_set(p_ptr, p_ptr+6); + + string n_ptr[]={"n","N","ntotal","NTOTAL","n_total","N_TOTAL"}; + set<string> n_set(n_ptr, n_ptr+6); + string nmis_ptr[]={"nmis","NMIS","n_mis","N_MIS","n_miss","N_MISS"}; + set<string> nmis_set(nmis_ptr, nmis_ptr+6); + string nobs_ptr[]={"nobs","NOBS","n_obs","N_OBS"}; + set<string> nobs_set(nobs_ptr, nobs_ptr+4); + + string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY"}; + set<string> af_set(af_ptr, af_ptr+10); + string var_ptr[]={"var","VAR"}; + set<string> var_set(var_ptr, var_ptr+2); + + string ws_ptr[]={"window_size","WINDOW_SIZE","ws","WS"}; + set<string> ws_set(ws_ptr, ws_ptr+4); + string cor_ptr[]={"cor","COR","r","R"}; + set<string> cor_set(cor_ptr, cor_ptr+4); + + header.rs_col=0; header.chr_col=0; header.pos_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; + + char *ch_ptr; + string type; + size_t n_error=0; + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + while (ch_ptr!=NULL) { + type=ch_ptr; + if (rs_set.count(type)!=0) { + if (header.rs_col==0) {header.rs_col=header.coln+1;} else {cout<<"error! more than two rs columns in the file."<<endl; n_error++;} + } else if (chr_set.count(type)!=0) { + if (header.chr_col==0) {header.chr_col=header.coln+1;} else {cout<<"error! more than two chr columns in the file."<<endl; n_error++;} + } else if (pos_set.count(type)!=0) { + if (header.pos_col==0) {header.pos_col=header.coln+1;} else {cout<<"error! more than two pos columns in the file."<<endl; n_error++;} + } else if (cm_set.count(type)!=0) { + if (header.cm_col==0) {header.cm_col=header.coln+1;} else {cout<<"error! more than two cm columns in the file."<<endl; n_error++;} + } else if (a1_set.count(type)!=0) { + if (header.a1_col==0) {header.a1_col=header.coln+1;} else {cout<<"error! more than two allele1 columns in the file."<<endl; n_error++;} + } else if (a0_set.count(type)!=0) { + if (header.a0_col==0) {header.a0_col=header.coln+1;} else {cout<<"error! more than two allele0 columns in the file."<<endl; n_error++;} + } else if (z_set.count(type)!=0) { + if (header.z_col==0) {header.z_col=header.coln+1;} else {cout<<"error! more than two z columns in the file."<<endl; n_error++;} + } else if (beta_set.count(type)!=0) { + if (header.beta_col==0) {header.beta_col=header.coln+1;} else {cout<<"error! more than two beta columns in the file."<<endl; n_error++;} + } else if (sebeta_set.count(type)!=0) { + if (header.sebeta_col==0) {header.sebeta_col=header.coln+1;} else {cout<<"error! more than two se_beta columns in the file."<<endl; n_error++;} + } else if (chisq_set.count(type)!=0) { + if (header.chisq_col==0) {header.chisq_col=header.coln+1;} else {cout<<"error! more than two z columns in the file."<<endl; n_error++;} + } else if (p_set.count(type)!=0) { + if (header.p_col==0) {header.p_col=header.coln+1;} else {cout<<"error! more than two p columns in the file."<<endl; n_error++;} + } else if (n_set.count(type)!=0) { + if (header.n_col==0) {header.n_col=header.coln+1;} else {cout<<"error! more than two n_total columns in the file."<<endl; n_error++;} + } else if (nmis_set.count(type)!=0) { + if (header.nmis_col==0) {header.nmis_col=header.coln+1;} else {cout<<"error! more than two n_mis columns in the file."<<endl; n_error++;} + } else if (nobs_set.count(type)!=0) { + if (header.nobs_col==0) {header.nobs_col=header.coln+1;} else {cout<<"error! more than two n_obs columns in the file."<<endl; n_error++;} + } else if (ws_set.count(type)!=0) { + if (header.ws_col==0) {header.ws_col=header.coln+1;} else {cout<<"error! more than two window_size columns in the file."<<endl; n_error++;} + } else if (af_set.count(type)!=0) { + if (header.af_col==0) {header.af_col=header.coln+1;} else {cout<<"error! more than two af columns in the file."<<endl; n_error++;} + } else if (cor_set.count(type)!=0) { + if (header.cor_col==0) {header.cor_col=header.coln+1;} else {cout<<"error! more than two cor columns in the file."<<endl; n_error++;} + } else {} + + ch_ptr=strtok (NULL, " , \t"); + header.coln++; + } + + if (header.cor_col!=0 && header.cor_col!=header.coln) {cout<<"error! the cor column should be the last column."<<endl; n_error++;} + + if (header.rs_col==0) { + if (header.chr_col!=0 && header.pos_col!=0) { + cout<<"missing an rs column. rs id will be replaced by chr:pos"<<endl; + } else { + cout<<"error! missing an rs column."<<endl; n_error++; + } + } + + if (n_error==0) {return true;} else {return false;} +} + + + + + + +//read cov file the first time, record mapRS2in, mapRS2var (in case var is not provided in the z file), store vec_n and vec_rs +void ReadFile_cor (const string &file_cor, const set<string> &setSnps, vector<string> &vec_rs, vector<size_t> &vec_n, vector<double> &vec_cm, vector<double> &vec_bp, map<string, size_t> &mapRS2in, map<string, double> &mapRS2var) +{ + vec_rs.clear(); + vec_n.clear(); + mapRS2in.clear(); + mapRS2var.clear(); + + igzstream infile (file_cor.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open cov file: "<<file_cor<<endl; return;} + + string line; + char *ch_ptr; + + string rs, chr, a1, a0, pos, cm; + double af=0, var_x=0, d_pos, d_cm; + size_t n_total=0, n_mis=0, n_obs=0, ni_total=0; + size_t ns_test=0, ns_total=0; + + HEADER header; + + //header + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + if (header.n_col==0 ) { + if (header.nobs_col==0 && header.nmis_col==0) { + cout<<"error! missing sample size in the cor file."<<endl; + } else { + cout<<"total sample size will be replaced by obs/mis sample size."<<endl; + } + } + + while (!safeGetline(infile, line).eof()) { + //do not read cor values this time; upto col_n-1 + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + n_total=0; n_mis=0; n_obs=0; af=0; var_x=0; d_cm=0; d_pos=0; + for (size_t i=0; i<header.coln-1; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} + if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} + if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr; d_pos=atof(ch_ptr);} + if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr; d_cm=atof(ch_ptr);} + if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;} + if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;} + + if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} + if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} + if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} + + if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} + if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} + + ch_ptr=strtok (NULL, " , \t"); + } + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + if (header.n_col==0) { + n_total=n_mis+n_obs; + } + + //record rs, n + vec_rs.push_back(rs); + vec_n.push_back(n_total); + if (d_cm>0) {vec_cm.push_back(d_cm);} else {vec_cm.push_back(d_cm);} + if (d_pos>0) {vec_bp.push_back(d_pos);} else {vec_bp.push_back(d_pos);} + + //record mapRS2in and mapRS2var + if (setSnps.size()==0 || setSnps.count(rs)!=0) { + if (mapRS2in.count(rs)==0) { + mapRS2in[rs]=1; + + if (header.var_col!=0) { + mapRS2var[rs]=var_x; + } else if (header.af_col!=0) { + var_x=2.0*af*(1.0-af); + mapRS2var[rs]=var_x; + } else {} + + ns_test++; + + } else { + cout<<"error! more than one snp has the same id "<<rs<<" in cor file?"<<endl; + } + } + + //record max pos, + + ni_total=max(ni_total, n_total); + ns_total++; + } + + // cout<<"## number of analyzed individuals in the reference = "<<ni_total<<endl; + // cout<<"## number of analyzed SNPs in the reference = "<<ns_total<<endl; + + infile.close(); + infile.clear(); + + return; +} + + + + + + +//read beta file, store mapRS2var if var is provided here, calculate q and var_y +void ReadFile_beta (const bool flag_priorscale, const string &file_beta, const map<string, size_t> &mapRS2cat, map<string, size_t> &mapRS2in, map<string, double> &mapRS2var, map<string, size_t> &mapRS2nsamp, gsl_vector *q_vec, gsl_vector *qvar_vec, gsl_vector *s_vec, size_t &ni_total, size_t &ns_total) +{ + mapRS2nsamp.clear(); + + igzstream infile (file_beta.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open beta file: "<<file_beta<<endl; return;} + + string line; + char *ch_ptr; + string type; + + string rs, chr, a1, a0, pos, cm; + double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, zsquare=0, af=0, var_x=0; + size_t n_total=0, n_mis=0, n_obs=0; + size_t ns_test=0; + ns_total=0; ni_total=0; + + vector<double> vec_q, vec_qvar, vec_s; + for (size_t i=0; i<q_vec->size; i++) { + vec_q.push_back(0.0); + vec_qvar.push_back(0.0); + vec_s.push_back(0.0); + } + + //read header + HEADER header; + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + if (header.n_col==0 ) { + if (header.nobs_col==0 && header.nmis_col==0) { + cout<<"error! missing sample size in the beta file."<<endl; + } else { + cout<<"total sample size will be replaced by obs/mis sample size."<<endl; + } + } + + if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0) && header.chisq_col==0 && header.p_col==0) { + cout<<"error! missing z scores in the beta file."<<endl; + } + + if (header.af_col==0 && header.var_col==0 && mapRS2var.size()==0) { + cout<<"error! missing allele frequency in the beta file."<<endl; + } + + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + z=0; beta=0; se_beta=0; chisq=0; pvalue=0; + n_total=0; n_mis=0; n_obs=0; af=0; var_x=0; + for (size_t i=0; i<header.coln; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} + if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} + if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr;} + if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr;} + if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;} + if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;} + + if (header.z_col!=0 && header.z_col==i+1) {z=atof(ch_ptr);} + if (header.beta_col!=0 && header.beta_col==i+1) {beta=atof(ch_ptr);} + if (header.sebeta_col!=0 && header.sebeta_col==i+1) {se_beta=atof(ch_ptr);} + if (header.chisq_col!=0 && header.chisq_col==i+1) {chisq=atof(ch_ptr);} + if (header.p_col!=0 && header.p_col==i+1) {pvalue=atof(ch_ptr);} + + if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} + if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} + if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} + + if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} + if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} + + ch_ptr=strtok (NULL, " , \t"); + } + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + if (header.n_col==0) { + n_total=n_mis+n_obs; + } + + //both z values and beta/se_beta have directions, while chisq/pvalue do not + if (header.z_col!=0) { + zsquare=z*z; + } else if (header.beta_col!=0 && header.sebeta_col!=0) { + z=beta/se_beta; + zsquare=z*z; + } else if (header.chisq_col!=0) { + zsquare=chisq; + } else if (header.p_col!=0) { + zsquare=gsl_cdf_chisq_Qinv (pvalue, 1); + } else {zsquare=0;} + + //if the snp is also present in cor file, then do calculations + if ((header.var_col!=0 || header.af_col!=0 || mapRS2var.count(rs)!=0) && mapRS2in.count(rs)!=0 && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) ) { + if (mapRS2in.at(rs)>1) { + cout<<"error! more than one snp has the same id "<<rs<<" in beta file?"<<endl; + break; + } + + if (header.var_col==0) { + if (header.af_col!=0) { + var_x=2.0*af*(1.0-af); + } else { + var_x=mapRS2var.at(rs); + } + } + + if (flag_priorscale) {var_x=1;} + + mapRS2in[rs]++; + mapRS2var[rs]=var_x; + mapRS2nsamp[rs]=n_total; + + if (mapRS2cat.size()!=0) { + vec_q[mapRS2cat.at(rs) ]+=(zsquare-1.0)*var_x/(double)n_total; + vec_s[mapRS2cat.at(rs) ]+=var_x; + vec_qvar[mapRS2cat.at(rs) ]+=var_x*var_x/((double)n_total*(double)n_total); + } else { + vec_q[0]+=(zsquare-1.0)*var_x/(double)n_total; + vec_s[0]+=var_x; + vec_qvar[0]+=var_x*var_x/((double)n_total*(double)n_total); + } + + ni_total=max(ni_total, n_total); + ns_test++; + } + + ns_total++; + } + + for (size_t i=0; i<q_vec->size; i++) { + gsl_vector_set(q_vec, i, vec_q[i]); + gsl_vector_set(qvar_vec, i, 2.0*vec_qvar[i]); + gsl_vector_set(s_vec, i, vec_s[i]); + } + + + infile.clear(); + infile.close(); + + return; +} + + + + + +//read covariance file the second time +//look for rs, n_mis+n_obs, var, window_size, cov +//if window_cm/bp/ns is provided, then use these max values to calibrate estimates +void ReadFile_cor (const string &file_cor, const vector<string> &vec_rs, const vector<size_t> &vec_n, const vector<double> &vec_cm, const vector<double> &vec_bp, const map<string, size_t> &mapRS2cat, const map<string, size_t> &mapRS2in, const map<string, double> &mapRS2var, const map<string, size_t> &mapRS2nsamp, const size_t crt, const double &window_cm, const double &window_bp, const double &window_ns, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *qvar_vec, size_t &ni_total, size_t &ns_total, size_t &ns_test, size_t &ns_pair) +{ + igzstream infile (file_cor.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open cov file: "<<file_cor<<endl; return;} + + string line; + char *ch_ptr; + + string rs1, rs2; + double d1, d2, d3, cor, var1, var2; + size_t n_nb, nsamp1, nsamp2, n12, bin_size=10, bin; + + vector<vector<double> > mat_S, mat_Svar, mat_tmp; + vector<double> vec_qvar, vec_tmp; + vector<vector<vector<double> > > mat3d_Sbin; + + for (size_t i=0; i<S_mat->size1; i++) { + vec_qvar.push_back(0.0); + } + + for (size_t i=0; i<S_mat->size1; i++) { + mat_S.push_back(vec_qvar); + mat_Svar.push_back(vec_qvar); + } + + for (size_t k=0; k<bin_size; k++) { + vec_tmp.push_back(0.0); + } + for (size_t i=0; i<S_mat->size1; i++) { + mat_tmp.push_back(vec_tmp); + } + for (size_t i=0; i<S_mat->size1; i++) { + mat3d_Sbin.push_back(mat_tmp); + } + + string rs, chr, a1, a0, type, pos, cm; + size_t n_total=0, n_mis=0, n_obs=0; + double d_pos1, d_pos2, d_pos, d_cm1, d_cm2, d_cm; + ns_test=0; ns_total=0; ns_pair=0; ni_total=0; + + //header + HEADER header; + + !safeGetline(infile, line).eof(); + ReadHeader (line, header); + + while (!safeGetline(infile, line).eof()) { + //do not read cor values this time; upto col_n-1 + d_pos1=0; d_cm1=0; + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + for (size_t i=0; i<header.coln-1; i++) { + if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} + if (header.chr_col!=0 && header.chr_col==i+1) {chr=ch_ptr;} + if (header.pos_col!=0 && header.pos_col==i+1) {pos=ch_ptr; d_pos1=atof(ch_ptr);} + if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr; d_cm1=atof(ch_ptr); } + if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr;} + if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr;} + + if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr);} + if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} + if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} + + ch_ptr=strtok (NULL, " , \t"); + } + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + if (header.n_col==0) { + n_total=n_mis+n_obs; + } + + rs1=rs; + + if ( (mapRS2cat.size()==0 || mapRS2cat.count(rs1)!=0) && mapRS2in.count(rs1)!=0 && mapRS2in.at(rs1)==2) { + var1=mapRS2var.at(rs1); + nsamp1=mapRS2nsamp.at(rs1); + d2=var1*var1; + + if (mapRS2cat.size()!=0) { + mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ]+=(1-1.0/(double)vec_n[ns_total])*d2; + mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ]+=d2*d2/((double)vec_n[ns_total]*(double)vec_n[ns_total]); + if (crt==1) { + mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs1) ][0]+=(1-1.0/(double)vec_n[ns_total])*d2; + } + } else { + //mat_S[0][0]+=(1-1.0/(double)vec_n[ns_total])*d2; + mat_S[0][0]+=(1-1.0/(double)vec_n[ns_total])*d2; + mat_Svar[0][0]+=d2*d2/((double)vec_n[ns_total]*(double)vec_n[ns_total]); + if (crt==1) { + mat3d_Sbin[0][0][0]+=(1-1.0/(double)vec_n[ns_total])*d2; + } + } + + n_nb=0; + while(ch_ptr!=NULL) { + type=ch_ptr; + if (type.compare("NA")!=0 && type.compare("na")!=0 && type.compare("nan")!=0 && type.compare("-nan")!=0) { + cor=atof(ch_ptr); + rs2=vec_rs[ns_total+n_nb+1]; + d_pos2=vec_bp[ns_total+n_nb+1]; + d_cm2=vec_cm[ns_total+n_nb+1]; + d_pos=abs(d_pos2-d_pos1); + d_cm=abs(d_cm2-d_cm1); + + if ( (mapRS2cat.size()==0 || mapRS2cat.count(rs2)!=0) && mapRS2in.count(rs2)!=0 && mapRS2in.at(rs2)==2) { + var2=mapRS2var.at(rs2); + nsamp2=mapRS2nsamp.at(rs2); + d1=cor*cor-1.0/(double)min(vec_n[ns_total], vec_n[ns_total+n_nb+1]); + d2=var1*var2; + d3=cor*cor/((double)nsamp1*(double)nsamp2); + n12=min(vec_n[ns_total], vec_n[ns_total+n_nb+1]); + + //compute bin + if (crt==1) { + if (window_cm!=0 && d_cm1!=0 && d_cm2!=0) { + bin=min( (int)floor(d_cm/window_cm*bin_size), (int)bin_size); + } else if (window_bp!=0 && d_pos1!=0 && d_pos2!=0) { + bin=min( (int)floor(d_pos/window_bp*bin_size), (int)bin_size); + } else if (window_ns!=0) { + bin=min( (int)floor(((double)n_nb+1)/window_ns*bin_size), (int)bin_size); + } + } + + //if (mat_S[0][0]!=mat_S[0][0] && flag_nan==0) { + //if (rs1.compare("rs10915560")==0 || rs1.compare("rs241273")==0) {cout<<rs1<<" "<<rs2<<" "<<ns_total<<" "<<n_nb<<" "<<vec_n[ns_total]<<" "<<vec_n[ns_total+n_nb+1]<<" "<<nsamp1<<" "<<nsamp2<<" "<<var1<<" "<<var2<<" "<<cor<<" "<<d1<<" "<<d2<<" "<<d3<<" "<<mat_S[0][0]<<endl; flag_nan++;} + if (mapRS2cat.size()!=0) { + if (mapRS2cat.at(rs1)==mapRS2cat.at(rs2)) { + vec_qvar[mapRS2cat.at(rs1)]+=2*d3*d2; + mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=2*d1*d2; + mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=2*d2*d2/((double)n12*(double)n12); + if (crt==1) { + mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ][bin]+=2*d1*d2; + } + } else { + mat_S[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=d1*d2; + mat_Svar[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ]+=d2*d2/((double)n12*(double)n12); + if (crt==1) { + mat3d_Sbin[mapRS2cat.at(rs1) ][mapRS2cat.at(rs2) ][bin]+=d1*d2; + } + } + } else { + vec_qvar[0]+=2*d3*d2; + mat_S[0][0]+=2*d1*d2; + mat_Svar[0][0]+=2*d2*d2/((double)n12*(double)n12); + + if (crt==1) { + mat3d_Sbin[0][0][bin]+=2*d1*d2; + } + } + ns_pair++; + } + } + + ch_ptr=strtok (NULL, " , \t"); + n_nb++; + } + ni_total=max(ni_total, n_total); + ns_test++; + } + + ns_total++; + } + + //use S_bin to fit a rational function y=1/(a+bx)^2, where x=seq(0.5,bin_size-0.5,by=1) + //and then compute a correlation factor as a percentage + double a, b, x, y, n, var_y, var_x, mean_y, mean_x, cov_xy, crt_factor; + if (crt==1) { + for (size_t i=0; i<S_mat->size1; i++) { + for (size_t j=i; j<S_mat->size2; j++) { + + //correct mat_S + n=0; var_y=0; var_x=0; mean_y=0; mean_x=0; cov_xy=0; + for (size_t k=0; k<bin_size; k++) { + if (j==i) { + y=mat3d_Sbin[i][j][k]; + } else { + y=mat3d_Sbin[i][j][k]+mat3d_Sbin[j][i][k]; + } + x=k+0.5; + cout<<y<<", "; + if (y>0) { + y=1/sqrt(y); + mean_x+=x; mean_y+=y; var_x+=x*x; var_y+=y*y; cov_xy+=x*y; + n++; + } + } + cout<<endl; + + if (n>=5) { + mean_x/=n; mean_y/=n; var_x/=n; var_y/=n; cov_xy/=n; + var_x-=mean_x*mean_x; var_y-=mean_y*mean_y; cov_xy-=mean_x*mean_y; + b=cov_xy/var_x; + a=mean_y-b*mean_x; + crt_factor=a/(b*(bin_size+0.5))+1; + if (i==j) { + mat_S[i][j]*=crt_factor; + } else { + mat_S[i][j]*=crt_factor; mat_S[j][i]*=crt_factor; + } + cout<<crt_factor<<endl; + //correct qvar + if (i==j) { + vec_qvar[i]*=crt_factor; //=vec_qvar[i]*crt_factor+(ns_test*ns_test-ns_pair*crt_factor)/pow(ni_total, 3.0); + } + } + } + } + } + + //save to gsl_vector and gsl_matrix: qvar_vec, S_mat, Svar_mat + for (size_t i=0; i<S_mat->size1; i++) { + d1=gsl_vector_get(qvar_vec, i)+2*vec_qvar[i]; + gsl_vector_set(qvar_vec, i, d1); + for (size_t j=0; j<S_mat->size2; j++) { + if (i==j) { + gsl_matrix_set(S_mat, i, j, mat_S[i][i]); + gsl_matrix_set(Svar_mat, i, j, 2.0*mat_Svar[i][i]*ns_test*ns_test/(2.0*ns_pair) ); + } else { + gsl_matrix_set(S_mat, i, j, mat_S[i][j]+mat_S[j][i]); + gsl_matrix_set(Svar_mat, i, j, 2.0*(mat_Svar[i][j]+mat_Svar[j][i])*ns_test*ns_test/(2.0*ns_pair) ); + } + } + } + + + + infile.clear(); + infile.close(); + + return; +} + + + + + +//copied from lmm.cpp; is used in the following function VCss +//map a number 1-(n_cvt+2) to an index between 0 and [(n_c+2)^2+(n_c+2)]/2-1 +size_t GetabIndex (const size_t a, const size_t b, const size_t n_cvt) { + if (a>n_cvt+2 || b>n_cvt+2 || a<=0 || b<=0) {cout<<"error in GetabIndex."<<endl; return 0;} + size_t index; + size_t l, h; + if (b>a) {l=a; h=b;} else {l=b; h=a;} + + size_t n=n_cvt+2; + index=(2*n-l+2)*(l-1)/2+h-l; + + return index; +} + + +//use the new method to calculate variance components with summary statistics +//first, use a function CalcS to compute S matrix (where the diagonal elements are part of V(q) ), and then use bootstrap to compute the variance for S, use a set of genotypes, phenotypes, and individual ids, and snp category label +void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich) { + size_t n_vc=S_mat->size1; + + gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat1=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *VarEnrich_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); + + gsl_vector *pve=gsl_vector_alloc (n_vc); + gsl_vector *pve_plus=gsl_vector_alloc (n_vc+1); + gsl_vector *tmp=gsl_vector_alloc (n_vc+1); + gsl_vector *sigma2persnp=gsl_vector_alloc (n_vc); + gsl_vector *enrich=gsl_vector_alloc (n_vc); + gsl_vector *se_pve=gsl_vector_alloc (n_vc); + gsl_vector *se_sigma2persnp=gsl_vector_alloc (n_vc); + gsl_vector *se_enrich=gsl_vector_alloc (n_vc); + + double d; + + //calculate S^{-1}q + gsl_matrix_memcpy (tmp_mat, S_mat); + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (n_vc); + LUDecomp (tmp_mat, pmt, &sig); + LUInvert (tmp_mat, pmt, Si_mat); + + //calculate sigma2snp and pve + gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve); + gsl_vector_memcpy(sigma2persnp, pve); + gsl_vector_div(sigma2persnp, s_vec); + + //get qvar_mat + /* + if (n_block==0 || n_block==1) { + double s=1.0; + for (size_t i=0; i<n_vc; i++) { + d=gsl_vector_get(pve, i); + gsl_vector_set(pve_plus, i, d); + s-=d; + } + gsl_vector_set(pve_plus, n_vc, s); + + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + size_t t_ij=GetabIndex (i+1, j+1, n_vc-2); + gsl_matrix_const_view Vsub=gsl_matrix_const_submatrix(V, 0, t_ij*(n_vc+1), n_vc+1, n_vc+1); + gsl_blas_dgemv (CblasNoTrans, 1.0, &Vsub.matrix, pve_plus, 0.0, tmp); + gsl_blas_ddot (pve_plus, tmp, &d); + + d*=2/(df*df); + + gsl_matrix_set (qvar_mat, i, j, d); + if (i!=j) {gsl_matrix_set (qvar_mat, j, i, d);} + //cout<<t_ij<<"/"<<d<<" "; + } + //cout<<endl; + } + } else { + */ + gsl_matrix_memcpy (qvar_mat, Vq); + gsl_matrix_scale (qvar_mat, 1.0/(df*df)); + //} + + //gsl_matrix_memcpy (qvar_mat, S_mat); + //gsl_matrix_scale (qvar_mat, 2/(df*df)); + + //calculate variance for these estimates + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + d=gsl_matrix_get(Svar_mat, i, j); + d*=gsl_vector_get(pve, i)*gsl_vector_get(pve, j); + //cout<<d<<" "; + + d+=gsl_matrix_get(qvar_mat, i, j); + gsl_matrix_set(Var_mat, i, j, d); + if (i!=j) {gsl_matrix_set(Var_mat, j, i, d);} + } + //cout<<endl; + } + + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, Var_mat, 0.0, tmp_mat); + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Si_mat, 0.0, Var_mat); + + for (size_t i=0; i<n_vc; i++) { + d=sqrt(gsl_matrix_get(Var_mat, i, i)); + gsl_vector_set(se_pve, i, d); + d/=gsl_vector_get(s_vec, i); + gsl_vector_set(se_sigma2persnp, i, d); + } + + //compute pve_total, se_pve_total + pve_total=0; se_pve_total=0; + for (size_t i=0; i<n_vc; i++) { + pve_total+=gsl_vector_get(pve, i); + + for (size_t j=0; j<n_vc; j++) { + se_pve_total+=gsl_matrix_get(Var_mat, i, j); + } + } + se_pve_total=sqrt(se_pve_total); + + //compute enrichment and its variance + double s_pve=0, s_snp=0; + for (size_t i=0; i<n_vc; i++) { + s_pve+=gsl_vector_get(pve, i); + s_snp+=gsl_vector_get(s_vec, i); + } + gsl_vector_memcpy (enrich, sigma2persnp); + gsl_vector_scale (enrich, s_snp/s_pve); + + gsl_matrix_set_identity(tmp_mat); + + double d1; + for (size_t i=0; i<n_vc; i++) { + d=gsl_vector_get(pve, i)/s_pve; + d1=gsl_vector_get(s_vec, i); + for (size_t j=0; j<n_vc; j++) { + if (i==j) { + gsl_matrix_set(tmp_mat, i, j, (1-d)/d1*s_snp/s_pve); + } else { + gsl_matrix_set(tmp_mat, i, j, -1*d/d1*s_snp/s_pve); + } + } + } + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Var_mat, 0.0, tmp_mat1); + gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, tmp_mat1, tmp_mat, 0.0, VarEnrich_mat); + + for (size_t i=0; i<n_vc; i++) { + d=sqrt(gsl_matrix_get(VarEnrich_mat, i, i)); + gsl_vector_set(se_enrich, i, d); + } + + cout<<"pve = "; + for (size_t i=0; i<n_vc; i++) { + cout<<gsl_vector_get(pve, i)<<" "; + } + cout<<endl; + + cout<<"se(pve) = "; + for (size_t i=0; i<n_vc; i++) { + cout<<gsl_vector_get(se_pve, i)<<" "; + } + cout<<endl; + + cout<<"sigma2 per snp = "; + for (size_t i=0; i<n_vc; i++) { + cout<<gsl_vector_get(sigma2persnp, i)<<" "; + } + cout<<endl; + + cout<<"se(sigma2 per snp) = "; + for (size_t i=0; i<n_vc; i++) { + cout<<gsl_vector_get(se_sigma2persnp, i)<<" "; + } + cout<<endl; + + cout<<"enrichment = "; + for (size_t i=0; i<n_vc; i++) { + cout<<gsl_vector_get(enrich, i)<<" "; + } + cout<<endl; + + cout<<"se(enrichment) = "; + for (size_t i=0; i<n_vc; i++) { + cout<<gsl_vector_get(se_enrich, i)<<" "; + } + cout<<endl; + + //save data + v_pve.clear(); v_se_pve.clear(); + v_sigma2.clear(); v_se_sigma2.clear(); + v_enrich.clear(); v_se_enrich.clear(); + for (size_t i=0; i<n_vc; i++) { + d=gsl_vector_get(pve, i); + v_pve.push_back(d); + d=gsl_vector_get(se_pve, i); + v_se_pve.push_back(d); + + d=gsl_vector_get(sigma2persnp, i); + v_sigma2.push_back(d); + d=gsl_vector_get(se_sigma2persnp, i); + v_se_sigma2.push_back(d); + + d=gsl_vector_get(enrich, i); + v_enrich.push_back(d); + d=gsl_vector_get(se_enrich, i); + v_se_enrich.push_back(d); + } + + //delete matrices + gsl_matrix_free(Si_mat); + gsl_matrix_free(Var_mat); + gsl_matrix_free(VarEnrich_mat); + gsl_matrix_free(tmp_mat); + gsl_matrix_free(tmp_mat1); + gsl_matrix_free(qvar_mat); + + gsl_vector_free(pve); + gsl_vector_free(pve_plus); + gsl_vector_free(tmp); + gsl_vector_free(sigma2persnp); + gsl_vector_free(enrich); + gsl_vector_free(se_pve); + gsl_vector_free(se_sigma2persnp); + gsl_vector_free(se_enrich); + + return; +} + + + + + +//Ks are not scaled; +void VC::CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) +{ + size_t n1=K->size1, n2=K->size2; + size_t n_vc=n2/n1; + + double r=(double)n1/(double)(n1 - W->size2); + double var_y, var_y_new; + double d, tr, s, v; + vector<double> traceG_new; + + //new matrices/vectors + gsl_matrix *K_scale=gsl_matrix_alloc (n1, n2); + gsl_vector *y_scale=gsl_vector_alloc (n1); + gsl_matrix *Kry=gsl_matrix_alloc (n1, n_vc); + gsl_matrix *yKrKKry=gsl_matrix_alloc (n_vc, n_vc*(n_vc+1) ); + gsl_vector *KKry=gsl_vector_alloc (n1); + + //old matrices/vectors + gsl_vector *pve=gsl_vector_alloc (n_vc); + gsl_vector *se_pve=gsl_vector_alloc (n_vc); + gsl_vector *q_vec=gsl_vector_alloc (n_vc); + gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *S_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); + + //center and scale K by W + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_view Kscale_sub = gsl_matrix_submatrix (K_scale, 0, n1*i, n1, n1); + gsl_matrix_const_view K_sub = gsl_matrix_const_submatrix (K, 0, n1*i, n1, n1); + gsl_matrix_memcpy (&Kscale_sub.matrix, &K_sub.matrix); + + CenterMatrix (&Kscale_sub.matrix, W); + d=ScaleMatrix (&Kscale_sub.matrix); + traceG_new.push_back(d); + } + + //center y by W, and standardize it to have variance 1 (t(y)%*%y/n=1) + gsl_vector_memcpy (y_scale, y); + CenterVector (y_scale, W); + + var_y=VectorVar (y); + var_y_new=VectorVar (y_scale); + + StandardizeVector (y_scale); + + //compute Kry, which is used for confidence interval; also compute q_vec (*n^2) + for (size_t i=0; i<n_vc; i++) { + gsl_matrix_const_view Kscale_sub = gsl_matrix_const_submatrix (K_scale, 0, n1*i, n1, n1); + gsl_vector_view Kry_col=gsl_matrix_column (Kry, i); + + gsl_vector_memcpy (&Kry_col.vector, y_scale); + gsl_blas_dgemv(CblasNoTrans, 1.0, &Kscale_sub.matrix, y_scale, -1.0*r, &Kry_col.vector); + + gsl_blas_ddot (&Kry_col.vector, y_scale, &d); + gsl_vector_set(q_vec, i, d); + } + + //compuate yKrKKry, which is used later for confidence interval + for (size_t i=0; i<n_vc; i++) { + gsl_vector_const_view Kry_coli=gsl_matrix_const_column (Kry, i); + for (size_t j=i; j<n_vc; j++) { + gsl_vector_const_view Kry_colj=gsl_matrix_const_column (Kry, j); + for (size_t l=0; l<n_vc; l++) { + gsl_matrix_const_view Kscale_sub = gsl_matrix_const_submatrix (K_scale, 0, n1*l, n1, n1); + gsl_blas_dgemv (CblasNoTrans, 1.0, &Kscale_sub.matrix, &Kry_coli.vector, 0.0, KKry); + gsl_blas_ddot (&Kry_colj.vector, KKry, &d); + gsl_matrix_set(yKrKKry, i, l*n_vc+j, d); + if (i!=j) {gsl_matrix_set(yKrKKry, j, l*n_vc+i, d);} + } + gsl_blas_ddot (&Kry_coli.vector, &Kry_colj.vector, &d); + gsl_matrix_set(yKrKKry, i, n_vc*n_vc+j, d); + if (i!=j) {gsl_matrix_set(yKrKKry, j, n_vc*n_vc+i, d);} + } + } + + //compute Sij (*n^2) + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + tr=0; + for (size_t l=0; l<n1; l++) { + gsl_vector_const_view Ki_col=gsl_matrix_const_column (K_scale, i*n1+l); + gsl_vector_const_view Kj_col=gsl_matrix_const_column (K_scale, j*n1+l); + gsl_blas_ddot (&Ki_col.vector, &Kj_col.vector, &d); + tr+=d; + } + + tr=tr-r*(double)n1; + gsl_matrix_set (S_mat, i, j, tr); + if (i!=j) {gsl_matrix_set (S_mat, j, i, tr);} + } + } + + /* + cout<<"q_vec = "<<endl; + for (size_t i=0; i<q_vec->size; i++) { + cout<<gsl_vector_get(q_vec, i)<<" "; + } + cout<<endl; + + cout<<"S_mat = "<<endl; + for (size_t i=0; i<S_mat->size1; i++) { + for (size_t j=0; j<S_mat->size2; j++) { + cout<<gsl_matrix_get(S_mat, i, j)<<" "; + } + cout<<endl; + } + */ + + //compute S^{-1}q + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (n_vc); + LUDecomp (S_mat, pmt, &sig); + LUInvert (S_mat, pmt, Si_mat); + + //compute pve (on the transformed scale) + gsl_blas_dgemv (CblasNoTrans, 1.0, Si_mat, q_vec, 0.0, pve); + + //compute q_var (*n^4) + gsl_matrix_set_zero (qvar_mat); + s=1; + for (size_t i=0; i<n_vc; i++) { + d=gsl_vector_get(pve, i); + gsl_matrix_view yKrKKry_sub=gsl_matrix_submatrix(yKrKKry, 0, i*n_vc, n_vc, n_vc); + gsl_matrix_memcpy (tmp_mat, &yKrKKry_sub.matrix); + gsl_matrix_scale(tmp_mat, d); + gsl_matrix_add (qvar_mat, tmp_mat); + s-=d; + } + gsl_matrix_view yKrKKry_sub=gsl_matrix_submatrix(yKrKKry, 0, n_vc*n_vc, n_vc, n_vc); + gsl_matrix_memcpy (tmp_mat, &yKrKKry_sub.matrix); + gsl_matrix_scale(tmp_mat, s); + gsl_matrix_add (qvar_mat, tmp_mat); + + gsl_matrix_scale(qvar_mat, 2.0); + + //compute S^{-1}var_qS^{-1} + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, qvar_mat, 0.0, tmp_mat); + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Si_mat, 0.0, Var_mat); + + //transform pve back to the original scale and save data + v_pve.clear(); v_se_pve.clear(); + v_sigma2.clear(); v_se_sigma2.clear(); + + s=1.0, v=0, pve_total=0, se_pve_total=0; + for (size_t i=0; i<n_vc; i++) { + d=gsl_vector_get (pve, i); + //cout<<var_y<<" "<<var_y_new<<" "<<v_traceG[i]<<" "<<traceG_new[i]<<endl; + v_sigma2.push_back(d*var_y_new/traceG_new[i]); + v_pve.push_back(d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y)); + s-=d; + pve_total+=d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y); + + d=sqrt(gsl_matrix_get (Var_mat, i, i)); + v_se_sigma2.push_back(d*var_y_new/traceG_new[i]); + v_se_pve.push_back(d*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y)); + + //d*=sqrt(var_y/v_traceG[i]-v_sigma2[i]); + //v_se_pve.push_back(d/var_y); + + for (size_t j=0; j<n_vc; j++) { + v+=gsl_matrix_get(Var_mat, i, j); + se_pve_total+=gsl_matrix_get(Var_mat, i, j)*(var_y_new/traceG_new[i])*(v_traceG[i]/var_y)*(var_y_new/traceG_new[j])*(v_traceG[j]/var_y); + } + } + v_sigma2.push_back(s*r*var_y_new); + v_se_sigma2.push_back(sqrt(v)*r*var_y_new); + se_pve_total=sqrt(se_pve_total); + + cout<<"sigma2 = "; + for (size_t i=0; i<n_vc+1; i++) { + cout<<v_sigma2[i]<<" "; + } + cout<<endl; + + cout<<"se(sigma2) = "; + for (size_t i=0; i<n_vc+1; i++) { + cout<<v_se_sigma2[i]<<" "; + } + cout<<endl; + + cout<<"pve = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_pve[i]<<" "; + } + cout<<endl; + + cout<<"se(pve) = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_se_pve[i]<<" "; + } + cout<<endl; + + if (n_vc>1) { + cout<<"total pve = "<<pve_total<<endl; + cout<<"se(total pve) = "<<se_pve_total<<endl; + } + + gsl_permutation_free(pmt); + gsl_matrix_free(K_scale); + gsl_vector_free(y_scale); + gsl_matrix_free(Kry); + gsl_matrix_free(yKrKKry); + gsl_vector_free(KKry); + + //old matrices/vectors + gsl_vector_free(pve); + gsl_vector_free(se_pve); + gsl_vector_free(q_vec); + gsl_matrix_free(qvar_mat); + gsl_matrix_free(tmp_mat); + gsl_matrix_free(S_mat); + gsl_matrix_free(Si_mat); + gsl_matrix_free(Var_mat); + + return; +} + + + + +//reml for log(sigma2) based on the AI algorithm +void VC::CalcVCreml (bool noconstrain, const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y) { size_t n1=K->size1, n2=K->size2; size_t n_vc=n2/n1; gsl_vector *log_sigma2=gsl_vector_alloc (n_vc+1); double d, s; + /* + //compare eigenlib vs lapack + //dgemm + gsl_matrix *K2=gsl_matrix_alloc(K->size1, K->size1); + + clock_t time_start=clock(); + gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, K, K, 0.0, K2); + cout<<"standard time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + for (size_t j=0; j<2; j++) { + cout<<gsl_matrix_get(K2, i, j)<<" "; + } + cout<<endl; + } + + time_start=clock(); + lapack_dgemm ((char *)"N", (char *)"T", 1.0, K, K, 0.0, K2); + cout<<"lapack time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + for (size_t j=0; j<2; j++) { + cout<<gsl_matrix_get(K2, i, j)<<" "; + } + cout<<endl; + } + + time_start=clock(); + eigenlib_dgemm((char *)"N", (char *)"T", 1.0, K, K, 0.0, K2); + cout<<"eigenlib time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + for (size_t j=0; j<2; j++) { + cout<<gsl_matrix_get(K2, i, j)<<" "; + } + cout<<endl; + } + + //dgemv + gsl_vector_const_view W_col=gsl_matrix_const_column (K, 0); + gsl_vector *v=gsl_vector_alloc (K->size1); + time_start=clock(); + for (size_t i=0; i<1000; i++) { + gsl_blas_dgemv(CblasNoTrans, 1.0, K2, &W_col.vector, 0.0, v); + } + cout<<"standard time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + cout<<gsl_vector_get(v, i)<<endl; + } + + time_start=clock(); + for (size_t i=0; i<1000; i++) { + eigenlib_dgemv((char *)"N", 1.0, K2, &W_col.vector, 0.0, v); + } + cout<<"eigenlib time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + cout<<gsl_vector_get(v, i)<<endl; + } + + //eigen + gsl_matrix *K2copy=gsl_matrix_alloc(K->size1, K->size1); + gsl_matrix *K3=gsl_matrix_alloc(K->size1, K->size1); + + gsl_matrix_memcpy(K2copy, K2); + time_start=clock(); + EigenDecomp(K2copy, K3, v, 0); + cout<<"standard time 0: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + cout<<gsl_vector_get(v, i)<<endl; + } + + gsl_matrix_memcpy(K2copy, K2); + time_start=clock(); + EigenDecomp(K2copy, K3, v, 1); + cout<<"standard time 1: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + cout<<gsl_vector_get(v, i)<<endl; + } + + gsl_matrix_memcpy(K2copy, K2); + time_start=clock(); + eigenlib_eigensymm(K2copy, K3, v); + cout<<"eigenlib time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + cout<<gsl_vector_get(v, i)<<endl; + } + + + + //invert + gsl_matrix_memcpy(K2copy, K2); + time_start=clock(); + int sigcopy; + gsl_permutation * pmt1=gsl_permutation_alloc (K2->size1); + LUDecomp (K2copy, pmt1, &sigcopy); + LUInvert (K2copy, pmt1, K3); + gsl_permutation_free(pmt1); + cout<<"standard time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + for (size_t j=0; j<2; j++) { + cout<<gsl_matrix_get(K3, i, j)<<" "; + } + cout<<endl; + } + + gsl_matrix_memcpy(K2copy, K2); + time_start=clock(); + eigenlib_invert(K2copy); + cout<<"eigen time: "<<(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0)<<endl; + for (size_t i=0; i<2; i++) { + for (size_t j=0; j<2; j++) { + cout<<gsl_matrix_get(K2copy, i, j)<<" "; + } + cout<<endl; + } + */ + //set up params gsl_matrix *P=gsl_matrix_alloc (n1, n1); gsl_vector *Py=gsl_vector_alloc (n1); @@ -318,18 +1648,26 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector gsl_vector *dev1=gsl_vector_alloc (n_vc+1); gsl_matrix *dev2=gsl_matrix_alloc (n_vc+1, n_vc+1); gsl_matrix *Hessian=gsl_matrix_alloc (n_vc+1, n_vc+1); - VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian}; + VC_PARAM params={K, W, y, P, Py, KPy_mat, PKPy_mat, Hessian, noconstrain}; //initialize sigma2/log_sigma2 + CalcVChe (K, W, y); + gsl_blas_ddot (y, y, &s); s/=(double)n1; for (size_t i=0; i<n_vc+1; i++) { + if (noconstrain) { + d=v_sigma2[i]; + } else { + if (v_sigma2[i]<=0) {d=log(0.1);} else {d=log(v_sigma2[i]);} + } + /* if (i==n_vc) { d=s/((double)n_vc+1.0); } else { d=s/( ((double)n_vc+1.0)*v_traceG[i]); } - + */ gsl_vector_set (log_sigma2, i, d); } // gsl_vector_set (log_sigma2, 0, 0.38); @@ -338,7 +1676,11 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector cout<<"iteration "<<0<<endl; cout<<"sigma2 = "; for (size_t i=0; i<n_vc+1; i++) { - cout<<exp(gsl_vector_get(log_sigma2, i))<<" "; + if (noconstrain) { + cout<<gsl_vector_get(log_sigma2, i)<<" "; + } else { + cout<<exp(gsl_vector_get(log_sigma2, i))<<" "; + } } cout<<endl; @@ -349,15 +1691,15 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector FDF.f=&LogRL_dev1; FDF.df=&LogRL_dev2; FDF.fdf=&LogRL_dev12; - - //set up solver + + //set up solver int status; int iter=0, max_iter=100; const gsl_multiroot_fdfsolver_type *T_fdf; gsl_multiroot_fdfsolver *s_fdf; T_fdf=gsl_multiroot_fdfsolver_hybridsj; - s_fdf=gsl_multiroot_fdfsolver_alloc (T_fdf, n_vc+1); + s_fdf=gsl_multiroot_fdfsolver_alloc (T_fdf, n_vc+1); gsl_multiroot_fdfsolver_set (s_fdf, &FDF, log_sigma2); @@ -370,37 +1712,55 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector cout<<"iteration "<<iter<<endl; cout<<"sigma2 = "; for (size_t i=0; i<n_vc+1; i++) { - cout<<exp(gsl_vector_get(s_fdf->x, i))<<" "; + if (noconstrain) { + cout<<gsl_vector_get(s_fdf->x, i)<<" "; + } else { + cout<<exp(gsl_vector_get(s_fdf->x, i))<<" "; + } } cout<<endl; + /* cout<<"derivatives = "; for (size_t i=0; i<n_vc+1; i++) { cout<<gsl_vector_get(s_fdf->f, i)<<" "; } cout<<endl; - - status=gsl_multiroot_test_residual (s_fdf->f, 1e-3); + */ + status=gsl_multiroot_test_residual (s_fdf->f, 1e-3); } - while (status==GSL_CONTINUE && iter<max_iter); - - //obtain Hessian inverse - int sig=LogRL_dev12 (s_fdf->f, ¶ms, dev1, dev2); + while (status==GSL_CONTINUE && iter<max_iter); + + //obtain Hessian and Hessian inverse + int sig=LogRL_dev12 (s_fdf->x, ¶ms, dev1, dev2); + /* + for (size_t i=0; i<dev2->size1; i++) { + for (size_t j=0; j<dev2->size2; j++) { + cout<<gsl_matrix_get (dev2, i, j)<<" "; + } + cout<<endl; + } + */ gsl_permutation * pmt=gsl_permutation_alloc (n_vc+1); - LUDecomp (dev2, pmt, &sig); + LUDecomp (dev2, pmt, &sig); LUInvert (dev2, pmt, Hessian); gsl_permutation_free(pmt); - //save data - v_sigma2.clear(); + //save sigma2 and se_sigma2 + v_sigma2.clear(); v_se_sigma2.clear(); for (size_t i=0; i<n_vc+1; i++) { - d=exp(gsl_vector_get(s_fdf->x, i)); + if (noconstrain) { + d=gsl_vector_get(s_fdf->x, i); + } else { + d=exp(gsl_vector_get(s_fdf->x, i)); + } v_sigma2.push_back(d); - } - v_se_sigma2.clear(); - for (size_t i=0; i<n_vc+1; i++) { - d=-1.0*v_sigma2[i]*v_sigma2[i]*gsl_matrix_get(Hessian, i, i); + if (noconstrain) { + d=-1.0*gsl_matrix_get(Hessian, i, i); + } else { + d=-1.0*d*d*gsl_matrix_get(Hessian, i, i); + } v_se_sigma2.push_back(sqrt(d)); } @@ -409,20 +1769,80 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector s+=v_traceG[i]*v_sigma2[i]; } s+=v_sigma2[n_vc]; - - v_pve.clear(); + + //compute pve + v_pve.clear(); pve_total=0; for (size_t i=0; i<n_vc; i++) { d=v_traceG[i]*v_sigma2[i]/s; v_pve.push_back(d); + pve_total+=d; } - v_se_pve.clear(); - for (size_t i=0; i<n_vc; i++) { - d=v_traceG[i]*(s-v_sigma2[i]*v_traceG[i])/(s*s)*v_se_sigma2[i]*v_se_sigma2[i]; - v_se_pve.push_back(sqrt(d) ); + //compute se_pve; k=n_vc+1: total + double d1, d2; + v_se_pve.clear(); se_pve_total=0; + for (size_t k=0; k<n_vc+1; k++) { + d=0; + for (size_t i=0; i<n_vc+1; i++) { + if (noconstrain) { + d1=gsl_vector_get(s_fdf->x, i); + d1=1; + } else { + d1=exp(gsl_vector_get(s_fdf->x, i)); + } + + if (k<n_vc) { + if (i==k) { + d1*=v_traceG[k]*(s-v_sigma2[k]*v_traceG[k])/(s*s); + } else if (i==n_vc) { + d1*=-1*v_traceG[k]*v_sigma2[k]/(s*s); + } else { + d1*=-1*v_traceG[i]*v_traceG[k]*v_sigma2[k]/(s*s); + } + } else { + if (i==k) { + d1*=-1*(s-v_sigma2[n_vc])/(s*s); + } else { + d1*=v_traceG[i]*v_sigma2[n_vc]/(s*s); + } + } + + for (size_t j=0; j<n_vc+1; j++) { + if (noconstrain) { + d2=gsl_vector_get(s_fdf->x, j); + d2=1; + } else { + d2=exp(gsl_vector_get(s_fdf->x, j)); + } + + if (k<n_vc) { + if (j==k) { + d2*=v_traceG[k]*(s-v_sigma2[k]*v_traceG[k])/(s*s); + } else if (j==n_vc) { + d2*=-1*v_traceG[k]*v_sigma2[k]/(s*s); + } else { + d2*=-1*v_traceG[j]*v_traceG[k]*v_sigma2[k]/(s*s); + } + } else { + if (j==k) { + d2*=-1*(s-v_sigma2[n_vc])/(s*s); + } else { + d2*=v_traceG[j]*v_sigma2[n_vc]/(s*s); + } + } + + d+=-1.0*d1*d2*gsl_matrix_get(Hessian, i, j); + } + } + + if (k<n_vc) { + v_se_pve.push_back(sqrt(d) ); + } else { + se_pve_total=sqrt(d); + } } - - gsl_multiroot_fdfsolver_free(s_fdf); + + gsl_multiroot_fdfsolver_free(s_fdf); gsl_vector_free(log_sigma2); gsl_matrix_free(P); @@ -437,7 +1857,643 @@ void VC::CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector } - + +//read bimbam mean genotype file and compute XWz +bool BimbamXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz) +{ + igzstream infile (file_geno.c_str(), igzstream::in); + //ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + + string line; + char *ch_ptr; + + size_t n_miss; + double d, geno_mean, geno_var; + + size_t ni_test=XWz->size1; + gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *geno_miss=gsl_vector_alloc (ni_test); + gsl_vector *wz=gsl_vector_alloc (w->size); + gsl_vector_memcpy (wz, z); + gsl_vector_mul(wz, w); + + for (size_t t=0; t<indicator_snp.size(); ++t) { + !safeGetline(infile, line).eof(); + if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + geno_mean=0.0; n_miss=0; geno_var=0.0; + gsl_vector_set_all(geno_miss, 0); + + size_t j=0; + for (size_t i=0; i<indicator_idv.size(); ++i) { + if (indicator_idv[i]==0) {continue;} + ch_ptr=strtok (NULL, " , \t"); + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_miss, i, 0); n_miss++;} + else { + d=atof(ch_ptr); + gsl_vector_set (geno, j, d); + gsl_vector_set (geno_miss, j, 1); + geno_mean+=d; + geno_var+=d*d; + } + j++; + } + + geno_mean/=(double)(ni_test-n_miss); + geno_var+=geno_mean*geno_mean*(double)n_miss; + geno_var/=(double)ni_test; + geno_var-=geno_mean*geno_mean; +// geno_var=geno_mean*(1-geno_mean*0.5); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} + } + + gsl_vector_add_constant (geno, -1.0*geno_mean); + + gsl_vector_view XWz_col=gsl_matrix_column(XWz, vec_cat[ns_test]); + d=gsl_vector_get (wz, ns_test); + gsl_blas_daxpy (d/sqrt(geno_var), geno, &XWz_col.vector); + + ns_test++; + } + + cout<<endl; + + gsl_vector_free (geno); + gsl_vector_free (geno_miss); + gsl_vector_free (wz); + + infile.close(); + infile.clear(); + + return true; +} + + + + + + +//read plink bed file and compute XWz +bool PlinkXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + + char ch[1]; + bitset<8> b; + + size_t n_miss, ci_total, ci_test; + double d, geno_mean, geno_var; + + size_t ni_test=XWz->size1; + size_t ni_total=indicator_idv.size(); + gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *wz=gsl_vector_alloc (w->size); + gsl_vector_memcpy (wz, z); + gsl_vector_mul(wz, w); + + int n_bit; + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (size_t t=0; t<indicator_snp.size(); ++t) { + if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (indicator_snp[t]==0) {continue;} + + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + + //read genotypes + geno_mean=0.0; n_miss=0; ci_total=0; geno_var=0.0; ci_test=0; + for (int i=0; i<n_bit; ++i) { + infile.read(ch,1); + b=ch[0]; + for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + if ((i==(n_bit-1)) && ci_total==ni_total) {break;} + if (indicator_idv[ci_total]==0) {ci_total++; continue;} + + if (b[2*j]==0) { + if (b[2*j+1]==0) {gsl_vector_set(geno, ci_test, 2.0); geno_mean+=2.0; geno_var+=4.0; } + else {gsl_vector_set(geno, ci_test, 1.0); geno_mean+=1.0; geno_var+=1.0;} + } + else { + if (b[2*j+1]==1) {gsl_vector_set(geno, ci_test, 0.0); } + else {gsl_vector_set(geno, ci_test, -9.0); n_miss++; } + } + + ci_test++; + ci_total++; + } + } + + + geno_mean/=(double)(ni_test-n_miss); + geno_var+=geno_mean*geno_mean*(double)n_miss; + geno_var/=(double)ni_test; + geno_var-=geno_mean*geno_mean; +// geno_var=geno_mean*(1-geno_mean*0.5); + + for (size_t i=0; i<ni_test; ++i) { + d=gsl_vector_get(geno,i); + if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);} + } + + gsl_vector_add_constant (geno, -1.0*geno_mean); + + gsl_vector_view XWz_col=gsl_matrix_column(XWz, vec_cat[ns_test]); + d=gsl_vector_get (wz, ns_test); + gsl_blas_daxpy (d/sqrt(geno_var), geno, &XWz_col.vector); + + ns_test++; + } + cout<<endl; + + gsl_vector_free (geno); + gsl_vector_free (wz); + + infile.close(); + infile.clear(); + + return true; +} + + + +//read multiple genotype files and compute XWz +bool MFILEXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, gsl_matrix *XWz) +{ + gsl_matrix_set_zero(XWz); + + igzstream infile (file_mfile.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false;} + + string file_name; + size_t l=0, ns_test=0; + + while (!safeGetline(infile, file_name).eof()) { + if (mfile_mode==1) { + file_name+=".bed"; + PlinkXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], vec_cat, w, z, ns_test, XWz); + } else { + BimbamXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], vec_cat, w, z, ns_test, XWz); + } + + l++; + } + + + infile.close(); + infile.clear(); + + return true; +} + + + + + + +//read bimbam mean genotype file and compute X_i^TX_jWz +bool BimbamXtXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz) +{ + igzstream infile (file_geno.c_str(), igzstream::in); + //ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + + string line; + char *ch_ptr; + + size_t n_miss; + double d, geno_mean, geno_var; + + size_t ni_test=XWz->size1; + gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *geno_miss=gsl_vector_alloc (ni_test); + + for (size_t t=0; t<indicator_snp.size(); ++t) { + !safeGetline(infile, line).eof(); + if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (indicator_snp[t]==0) {continue;} + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + ch_ptr=strtok (NULL, " , \t"); + ch_ptr=strtok (NULL, " , \t"); + + geno_mean=0.0; n_miss=0; geno_var=0.0; + gsl_vector_set_all(geno_miss, 0); + + size_t j=0; + for (size_t i=0; i<indicator_idv.size(); ++i) { + if (indicator_idv[i]==0) {continue;} + ch_ptr=strtok (NULL, " , \t"); + if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_miss, i, 0); n_miss++;} + else { + d=atof(ch_ptr); + gsl_vector_set (geno, j, d); + gsl_vector_set (geno_miss, j, 1); + geno_mean+=d; + geno_var+=d*d; + } + j++; + } + + geno_mean/=(double)(ni_test-n_miss); + geno_var+=geno_mean*geno_mean*(double)n_miss; + geno_var/=(double)ni_test; + geno_var-=geno_mean*geno_mean; +// geno_var=geno_mean*(1-geno_mean*0.5); + + for (size_t i=0; i<ni_test; ++i) { + if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} + } + + gsl_vector_add_constant (geno, -1.0*geno_mean); + + for (size_t i=0; i<XWz->size2; i++) { + gsl_vector_const_view XWz_col=gsl_matrix_const_column(XWz, i); + gsl_blas_ddot (geno, &XWz_col.vector, &d); + gsl_matrix_set (XtXWz, ns_test, i, d/sqrt(geno_var)); + } + + ns_test++; + } + + cout<<endl; + + gsl_vector_free (geno); + gsl_vector_free (geno_miss); + + infile.close(); + infile.clear(); + + return true; +} + + + + + + +//read plink bed file and compute XWz +bool PlinkXtXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + + char ch[1]; + bitset<8> b; + + size_t n_miss, ci_total, ci_test; + double d, geno_mean, geno_var; + + size_t ni_test=XWz->size1; + size_t ni_total=indicator_idv.size(); + gsl_vector *geno=gsl_vector_alloc (ni_test); + + int n_bit; + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (size_t t=0; t<indicator_snp.size(); ++t) { + if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (indicator_snp[t]==0) {continue;} + + infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + + //read genotypes + geno_mean=0.0; n_miss=0; ci_total=0; geno_var=0.0; ci_test=0; + for (int i=0; i<n_bit; ++i) { + infile.read(ch,1); + b=ch[0]; + for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + if ((i==(n_bit-1)) && ci_total==ni_total) {break;} + if (indicator_idv[ci_total]==0) {ci_total++; continue;} + + if (b[2*j]==0) { + if (b[2*j+1]==0) {gsl_vector_set(geno, ci_test, 2.0); geno_mean+=2.0; geno_var+=4.0; } + else {gsl_vector_set(geno, ci_test, 1.0); geno_mean+=1.0; geno_var+=1.0;} + } + else { + if (b[2*j+1]==1) {gsl_vector_set(geno, ci_test, 0.0); } + else {gsl_vector_set(geno, ci_test, -9.0); n_miss++; } + } + + ci_test++; + ci_total++; + } + } + + + geno_mean/=(double)(ni_test-n_miss); + geno_var+=geno_mean*geno_mean*(double)n_miss; + geno_var/=(double)ni_test; + geno_var-=geno_mean*geno_mean; +// geno_var=geno_mean*(1-geno_mean*0.5); + + for (size_t i=0; i<ni_test; ++i) { + d=gsl_vector_get(geno,i); + if (d==-9.0) {gsl_vector_set(geno, i, geno_mean);} + } + + gsl_vector_add_constant (geno, -1.0*geno_mean); + + for (size_t i=0; i<XWz->size2; i++) { + gsl_vector_const_view XWz_col=gsl_matrix_const_column(XWz, i); + gsl_blas_ddot (geno, &XWz_col.vector, &d); + gsl_matrix_set (XtXWz, ns_test, i, d/sqrt(geno_var)); + } + + ns_test++; + } + cout<<endl; + + gsl_vector_free (geno); + + infile.close(); + infile.clear(); + + return true; +} + + + +//read multiple genotype files and compute XWz +bool MFILEXtXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const gsl_matrix *XWz, gsl_matrix *XtXWz) +{ + gsl_matrix_set_zero(XtXWz); + + igzstream infile (file_mfile.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false;} + + string file_name; + size_t l=0, ns_test=0; + + while (!safeGetline(infile, file_name).eof()) { + if (mfile_mode==1) { + file_name+=".bed"; + PlinkXtXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], XWz, ns_test, XtXWz); + } else { + BimbamXtXwz (file_name, display_pace, indicator_idv, mindicator_snp[l], XWz, ns_test, XtXWz); + } + + l++; + } + + infile.close(); + infile.clear(); + + return true; +} + + +//compute confidence intervals from summary statistics +void CalcCIss(const gsl_matrix *Xz, const gsl_matrix *XWz, const gsl_matrix *XtXWz, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *w, const gsl_vector *z, const gsl_vector *s_vec, const vector<size_t> &vec_cat, const vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich) { + size_t n_vc=XWz->size2, ns_test=w->size, ni_test=XWz->size1; + + //set up matrices + gsl_vector *w_pve=gsl_vector_alloc (ns_test); + gsl_vector *wz=gsl_vector_alloc (ns_test); + gsl_vector *zwz=gsl_vector_alloc (n_vc); + gsl_vector *zz=gsl_vector_alloc (n_vc); + gsl_vector *Xz_pve=gsl_vector_alloc (ni_test); + gsl_vector *WXtXWz=gsl_vector_alloc (ns_test); + + gsl_matrix *Si_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *Var_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *tmp_mat1=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *VarEnrich_mat=gsl_matrix_alloc (n_vc, n_vc); + gsl_matrix *qvar_mat=gsl_matrix_alloc (n_vc, n_vc); + + double d, s0, s1, s, s_pve, s_snp; + + //compute wz and zwz + gsl_vector_memcpy (wz, z); + gsl_vector_mul (wz, w); + + gsl_vector_set_zero (zwz); + gsl_vector_set_zero (zz); + for (size_t i=0; i<w->size; i++) { + d=gsl_vector_get (wz, i)*gsl_vector_get (z, i); + d+=gsl_vector_get (zwz, vec_cat[i]); + gsl_vector_set (zwz, vec_cat[i], d); + + d=gsl_vector_get (z, i)*gsl_vector_get (z, i); + d+=gsl_vector_get (zz, vec_cat[i]); + gsl_vector_set (zz, vec_cat[i], d); + } + + //compute wz, ve and Xz_pve + gsl_vector_set_zero (Xz_pve); s_pve=0; s_snp=0; + for (size_t i=0; i<n_vc; i++) { + s_pve+=v_pve[i]; + s_snp+=gsl_vector_get(s_vec, i); + + gsl_vector_const_view Xz_col=gsl_matrix_const_column (Xz, i); + gsl_blas_daxpy (v_pve[i]/gsl_vector_get(s_vec, i), &Xz_col.vector, Xz_pve); + } + + //set up wpve vector + for (size_t i=0; i<w->size; i++) { + d=v_pve[vec_cat[i]]/gsl_vector_get(s_vec, vec_cat[i]); + gsl_vector_set (w_pve, i, d); + } + + //compute Vq (in qvar_mat) + s0=1-s_pve; + for (size_t i=0; i<n_vc; i++) { + s0+=gsl_vector_get (zz, i)*v_pve[i]/gsl_vector_get(s_vec, i); + } + + for (size_t i=0; i<n_vc; i++) { + s1=s0; + s1-=gsl_vector_get (zwz, i)*(1-s_pve)/gsl_vector_get(s_vec, i); + + gsl_vector_const_view XWz_col1=gsl_matrix_const_column (XWz, i); + gsl_vector_const_view XtXWz_col1=gsl_matrix_const_column (XtXWz, i); + + gsl_vector_memcpy (WXtXWz, &XtXWz_col1.vector); + gsl_vector_mul (WXtXWz, w_pve); + + gsl_blas_ddot (Xz_pve, &XWz_col1.vector, &d); + s1-=d/gsl_vector_get(s_vec, i); + + for (size_t j=0; j<n_vc; j++) { + s=s1; + + s-=gsl_vector_get (zwz, j)*(1-s_pve)/gsl_vector_get(s_vec, j); + + gsl_vector_const_view XWz_col2=gsl_matrix_const_column (XWz, j); + gsl_vector_const_view XtXWz_col2=gsl_matrix_const_column (XtXWz, j); + + gsl_blas_ddot (WXtXWz, &XtXWz_col2.vector, &d); + s+=d/(gsl_vector_get(s_vec, i)*gsl_vector_get(s_vec, j)); + + gsl_blas_ddot (&XWz_col1.vector, &XWz_col2.vector, &d); + s+=d/(gsl_vector_get(s_vec, i)*gsl_vector_get(s_vec, j))*(1-s_pve); + + gsl_blas_ddot (Xz_pve, &XWz_col2.vector, &d); + s-=d/gsl_vector_get(s_vec, j); + + gsl_matrix_set (qvar_mat, i, j, s); + } + + } + + d=(double)(ni_test-1); + gsl_matrix_scale (qvar_mat, 2.0/(d*d*d)); + + //cout<<scientific<<gsl_matrix_get(qvar_mat, 0, 0)<<endl; + + //calculate S^{-1} + gsl_matrix_memcpy (tmp_mat, S_mat); + int sig; + gsl_permutation * pmt=gsl_permutation_alloc (n_vc); + LUDecomp (tmp_mat, pmt, &sig); + LUInvert (tmp_mat, pmt, Si_mat); + + //calculate variance for the estimates + for (size_t i=0; i<n_vc; i++) { + for (size_t j=i; j<n_vc; j++) { + d=gsl_matrix_get(Svar_mat, i, j); + d*=v_pve[i]*v_pve[j]; + //cout<<d<<" "; + + d+=gsl_matrix_get(qvar_mat, i, j); + gsl_matrix_set(Var_mat, i, j, d); + if (i!=j) {gsl_matrix_set(Var_mat, j, i, d);} + } + //cout<<endl; + } + + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Si_mat, Var_mat, 0.0, tmp_mat); + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Si_mat, 0.0, Var_mat); + + //compute sigma2 per snp, enrich + v_sigma2.clear(); v_enrich.clear(); + for (size_t i=0; i<n_vc; i++) { + v_sigma2.push_back(v_pve[i]/gsl_vector_get(s_vec, i) ); + v_enrich.push_back(v_pve[i]/gsl_vector_get(s_vec, i)*s_snp/s_pve); + } + + //compute se_pve, se_sigma2 + for (size_t i=0; i<n_vc; i++) { + d=sqrt(gsl_matrix_get(Var_mat, i, i)); + v_se_pve.push_back(d); + v_se_sigma2.push_back(d/gsl_vector_get(s_vec, i)); + } + + //compute pve_total, se_pve_total + pve_total=0; + for (size_t i=0; i<n_vc; i++) { + pve_total+=v_pve[i]; + } + + se_pve_total=0; + for (size_t i=0; i<n_vc; i++) { + for (size_t j=0; j<n_vc; j++) { + se_pve_total+=gsl_matrix_get(Var_mat, i, j); + } + } + se_pve_total=sqrt(se_pve_total); + + //compute se_enrich + gsl_matrix_set_identity(tmp_mat); + + double d1; + for (size_t i=0; i<n_vc; i++) { + d=v_pve[i]/s_pve; + d1=gsl_vector_get(s_vec, i); + for (size_t j=0; j<n_vc; j++) { + if (i==j) { + gsl_matrix_set(tmp_mat, i, j, (1-d)/d1*s_snp/s_pve); + } else { + gsl_matrix_set(tmp_mat, i, j, -1*d/d1*s_snp/s_pve); + } + } + } + gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, tmp_mat, Var_mat, 0.0, tmp_mat1); + gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, tmp_mat1, tmp_mat, 0.0, VarEnrich_mat); + + for (size_t i=0; i<n_vc; i++) { + d=sqrt(gsl_matrix_get(VarEnrich_mat, i, i)); + v_se_enrich.push_back(d); + } + + cout<<"pve = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_pve[i]<<" "; + } + cout<<endl; + + cout<<"se(pve) = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_se_pve[i]<<" "; + } + cout<<endl; + + cout<<"sigma2 per snp = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_sigma2[i]<<" "; + } + cout<<endl; + + cout<<"se(sigma2 per snp) = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_se_sigma2[i]<<" "; + } + cout<<endl; + + cout<<"enrichment = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_enrich[i]<<" "; + } + cout<<endl; + + cout<<"se(enrichment) = "; + for (size_t i=0; i<n_vc; i++) { + cout<<v_se_enrich[i]<<" "; + } + cout<<endl; + + //delete matrices + gsl_matrix_free(Si_mat); + gsl_matrix_free(Var_mat); + gsl_matrix_free(VarEnrich_mat); + gsl_matrix_free(tmp_mat); + gsl_matrix_free(tmp_mat1); + gsl_matrix_free(qvar_mat); + + gsl_vector_free(w_pve); + gsl_vector_free(wz); + gsl_vector_free(zwz); + gsl_vector_free(WXtXWz); + gsl_vector_free(Xz_pve); + + return; +} @@ -16,7 +16,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef __VC_H__ +#ifndef __VC_H__ #define __VC_H__ #include "gsl/gsl_vector.h" @@ -38,7 +38,7 @@ using namespace std; class VC_PARAM { -public: +public: const gsl_matrix *K; const gsl_matrix *W; const gsl_vector *y; @@ -47,18 +47,34 @@ public: gsl_matrix *KPy_mat; gsl_matrix *PKPy_mat; gsl_matrix *Hessian; + bool noconstrain; }; + class VC { public: // IO related parameters + size_t a_mode; + string file_cat; + string file_beta; + string file_cor; + string file_mq; + string file_ms; + string file_out; string path_out; + set<string> setSnps; + + size_t ni_total_ref, ns_total_ref, ns_pair; + size_t ni_total, ns_total, ns_test; + size_t n_vc; + + double pve_total, se_pve_total; vector<double> v_sigma2; vector<double> v_se_sigma2; vector<double> v_pve; @@ -67,16 +83,33 @@ public: vector<double> v_beta; vector<double> v_se_beta; + size_t crt; + double window_cm, window_bp, window_ns; + double time_UtX; double time_opt; - + // Main functions void CopyFromParam (PARAM &cPar); void CopyToParam (PARAM &cPar); + void WriteFile_qs (const gsl_vector *s_vec, const gsl_vector *q_vec, const gsl_vector *qvar_vec, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat); void CalcVChe (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); - void CalcVCreml (const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); + void CalcVCreml (const bool noconstrain, const gsl_matrix *K, const gsl_matrix *W, const gsl_vector *y); }; +void CalcVCss(const gsl_matrix *Vq, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *q_vec, const gsl_vector *s_vec, const double df, vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich); + + +bool BimbamXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz); +bool PlinkXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, size_t ns_test, gsl_matrix *XWz); +bool MFILEXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const vector<size_t> &vec_cat, const gsl_vector *w, const gsl_vector *z, gsl_matrix *XWz); + +bool BimbamXtXwz (const string &file_geno, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz); +bool PlinkXtXwz (const string &file_bed, const int display_pace, vector<int> &indicator_idv, vector<int> &indicator_snp, const gsl_matrix *XWz, size_t ns_test, gsl_matrix *XtXWz); +bool MFILEXtXwz (const size_t mfile_mode, const string &file_mfile, const int display_pace, vector<int> &indicator_idv, vector<vector<int> > &mindicator_snp, const gsl_matrix *XWz, gsl_matrix *XtXWz); + +void CalcCIss(const gsl_matrix *Xz, const gsl_matrix *XWz, const gsl_matrix *XtXWz, const gsl_matrix *S_mat, const gsl_matrix *Svar_mat, const gsl_vector *w, const gsl_vector *z, const gsl_vector *s_vec, const vector<size_t> &vec_cat, const vector<double> &v_pve, vector<double> &v_se_pve, double &pve_total, double &se_pve_total, vector<double> &v_sigma2, vector<double> &v_se_sigma2, vector<double> &v_enrich, vector<double> &v_se_enrich); + #endif |