diff options
Diffstat (limited to 'src/param.h')
-rw-r--r-- | src/param.h | 232 |
1 files changed, 232 insertions, 0 deletions
diff --git a/src/param.h b/src/param.h new file mode 100644 index 0000000..fa18181 --- /dev/null +++ b/src/param.h @@ -0,0 +1,232 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef __PARAM_H__ +#define __PARAM_H__ + +#include <vector> +#include <map> +#include <set> +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + +using namespace std; + + + +class SNPINFO { +public: + string chr; + string rs_number; + double cM; + long int base_position; + string a_minor; + string a_major; + size_t n_miss; + double missingness; + double maf; +}; + +//results for lmm +class SUMSTAT { +public: + double beta; //REML estimator for beta + double se; //SE for beta + double lambda_remle; //REML estimator for lambda + double lambda_mle; //MLE estimator for lambda + double p_wald; //p value from a Wald test + double p_lrt; //p value from a likelihood ratio test + double p_score; //p value from a score test +}; + +//results for mvlmm +class MPHSUMSTAT { +public: + vector<double> v_beta; //REML estimator for beta + double p_wald; //p value from a Wald test + double p_lrt; //p value from a likelihood ratio test + double p_score; //p value from a score test + vector<double> v_Vg; //estimator for Vg, right half + vector<double> v_Ve; //estimator for Ve, right half + vector<double> v_Vbeta; //estimator for Vbeta, right half +}; + + +//hyper-parameters for bslmm +class HYPBSLMM { +public: + double h; + double pve; + double rho; + double pge; + double logp; + + size_t n_gamma; +}; + + + + +class PARAM { +public: + // IO related parameters + bool mode_silence; + int a_mode; //analysis mode, 1/2/3/4 for Frequentist tests + int k_mode; //kinship read mode: 1: n by n matrix, 2: id/id/k_value; + vector<size_t> p_column; //which phenotype column needs analysis + size_t d_pace; //display pace + + string file_bfile; + string file_geno; + string file_pheno; + string file_anno; //optional + string file_cvt; //optional + string file_kin; + string file_ku, file_kd; + string file_mk; + string file_out; + string path_out; + + string file_epm; //estimated parameter file + string file_ebv; //estimated breeding value file + string file_log; //log file containing mean estimate + + string file_read; //file containing total number of reads + string file_gene; //gene expression file + + string file_snps; //file containing analyzed snps or genes + + + + // QC related parameters + double miss_level; + double maf_level; + double hwe_level; + double r2_level; + + // LMM related parameters + double l_min; + double l_max; + size_t n_region; + double l_mle_null, l_remle_null; + double logl_mle_H0, logl_remle_H0; + double pve_null, pve_se_null; + double vg_remle_null, ve_remle_null, vg_mle_null, ve_mle_null; + vector<double> Vg_remle_null, Ve_remle_null, Vg_mle_null, Ve_mle_null; + vector<double> VVg_remle_null, VVe_remle_null, VVg_mle_null, VVe_mle_null; + vector<double> beta_remle_null, se_beta_remle_null, beta_mle_null, se_beta_mle_null; + double p_nr; + double em_prec, nr_prec; + size_t em_iter, nr_iter; + size_t crt; + double pheno_mean; //phenotype mean from bslmm fitting or for prediction + + //for fitting multiple variance components + //the first three are of size n_vc, and the next two are of size n_vc+1 + vector<double> v_traceG; + vector<double> v_pve; + vector<double> v_se_pve; + + vector<double> v_sigma2; + vector<double> v_se_sigma2; + vector<double> v_beta; + vector<double> v_se_beta; + + // BSLMM MCMC related parameters + double h_min, h_max, h_scale; //priors for h + double rho_min, rho_max, rho_scale; //priors for rho + double logp_min, logp_max, logp_scale; //priors for log(pi) + size_t s_min, s_max; //minimum and maximum number of gammas + size_t w_step; //number of warm up/burn in iterations + size_t s_step; //number of sampling iterations + size_t r_pace; //record pace + size_t w_pace; //write pace + size_t n_accept; //number of acceptance + size_t n_mh; //number of MH steps within each iteration + double geo_mean; //mean of the geometric distribution + long int randseed; + double trace_G; + + HYPBSLMM cHyp_initial; + + // Summary statistics + bool error; + size_t ni_total, ni_test, ni_cvt; //number of individuals + size_t np_obs, np_miss; //number of observed and missing phenotypes + size_t ns_total, ns_test; //number of snps + size_t ng_total, ng_test; //number of genes + size_t ni_control, ni_case; //number of controls and number of cases + size_t n_cvt; //number of covariates + size_t n_ph; //number of phenotypes + size_t n_vc; //number of variance components (including the diagonal matrix) + double time_total; //record total time + double time_G; //time spent on reading files the second time and calculate K + double time_eigen; //time spent on eigen-decomposition + double time_UtX; //time spent on calculating UX and Uy + double time_UtZ; //time spent on calculating UtZ, for probit BSLMM + double time_opt; //time spent on optimization iterations/or mcmc + double time_Omega; //time spent on calculating Omega + double time_hyp; //time spent on sampling hyper-parameters, in PMM + double time_Proposal; //time spend on constructing the proposal distribution (i.e. the initial lmm or lm analysis) + + // Data + vector<vector<double> > pheno; //a vector record all phenotypes, NA replaced with -9 + vector<vector<double> > cvt; //a vector record all covariates, NA replaced with -9 + vector<vector<int> > indicator_pheno; //a matrix record when a phenotype is missing for an individual; 0 missing, 1 available + vector<int> indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector<int> indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + vector<int> indicator_cvt; //indicator for covariates, 0 missing, 1 available for analysis + + vector<int> indicator_bv; //indicator for estimated breeding value file, 0 missing, 1 available for analysis + vector<int> indicator_read; //indicator for read file, 0 missing, 1 available for analysis + vector<double> vec_read; //total number of reads + vector<double> vec_bv; //breeding values + vector<size_t> est_column; + + map<string, int> mapID2num; //map small ID number to number, from 0 to n-1 + map<string, string> mapRS2chr; //map rs# to chromosome location + map<string, long int> mapRS2bp; //map rs# to base position + map<string, double> mapRS2cM; //map rs# to cM + map<string, double> mapRS2est; //map rs# to parameters + + vector<SNPINFO> snpInfo; //record SNP information + set<string> setSnps; //a set of snps for analysis + + //constructor + PARAM(); + + //functions + void ReadFiles (); + void CheckParam (); + void CheckData (); + void PrintSummary (); + void ReadGenotypes (gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); + void CheckCvt (); + void CopyCvt (gsl_matrix *W); + void ProcessCvtPhen(); + void CopyCvtPhen (gsl_matrix *W, gsl_vector *y, size_t flag); + void CopyCvtPhen (gsl_matrix *W, gsl_matrix *Y, size_t flag); + void CalcKin (gsl_matrix *matrix_kin); + void WriteMatrix (const gsl_matrix *matrix_U, const string suffix); + void WriteVector (const gsl_vector *vector_D, const string suffix); + void CopyRead (gsl_vector *log_N); +}; + + +#endif + |