diff options
Diffstat (limited to 'src/param.cpp')
-rw-r--r-- | src/param.cpp | 212 |
1 files changed, 55 insertions, 157 deletions
diff --git a/src/param.cpp b/src/param.cpp index 3b319e9..bf6c195 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,12 +18,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <iostream> +#include <iomanip> +#include <string> #include <algorithm> #include <cmath> #include <cstring> #include <fstream> -#include <iostream> -#include <string> #include <sys/stat.h> #include "gsl/gsl_blas.h" @@ -66,7 +69,7 @@ void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps, // (indicator_idv[x] == 1). This should match indicator_cvt etc. If // this gives problems with certain sets we can simply trim to size. -void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) { +void trim_individuals(vector<int> &idvs, size_t ni_max) { if (ni_max) { size_t count = 0; for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) { @@ -76,7 +79,7 @@ void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) { break; } if (count != idvs.size()) { - if (debug) + if (is_debug_mode()) cout << "**** TEST MODE: trim individuals from " << idvs.size() << " to " << count << endl; idvs.resize(count); @@ -87,7 +90,7 @@ void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) { // ---- PARAM class implementation PARAM::PARAM(void) - : mode_silence(false), a_mode(0), k_mode(1), d_pace(100000), + : a_mode(0), k_mode(1), d_pace(DEFAULT_PACE), file_out("result"), path_out("./output/"), miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5), n_region(10), p_nr(0.001), em_prec(0.0001), nr_prec(0.0001), @@ -97,7 +100,7 @@ PARAM::PARAM(void) rho_ngrid(10), s_min(0), s_max(300), w_step(100000), s_step(1000000), r_pace(10), w_pace(1000), n_accept(0), n_mh(10), geo_mean(2000.0), randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200), - error(false), ni_subsample(0), n_cvt(1), n_vc(1), n_cat(0), + error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} @@ -221,7 +224,7 @@ void PARAM::ReadFiles(void) { } else { n_cvt = 1; } - trim_individuals(indicator_cvt, ni_max, mode_debug); + trim_individuals(indicator_cvt, ni_max); if (!file_gxe.empty()) { if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) { @@ -234,38 +237,7 @@ void PARAM::ReadFiles(void) { } } - trim_individuals(indicator_idv, ni_max, mode_debug); - - // WJA added. - // Read genotype and phenotype file for bgen format. - if (!file_oxford.empty()) { - file_str = file_oxford + ".sample"; - if (ReadFile_sample(file_str, indicator_pheno, pheno, p_column, - indicator_cvt, cvt, n_cvt) == false) { - error = true; - } - if ((indicator_cvt).size() == 0) { - n_cvt = 1; - } - - // Post-process covariates and phenotypes, obtain - // ni_test, save all useful covariates. - ProcessCvtPhen(); - - // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); - - file_str = file_oxford + ".bgen"; - if (ReadFile_bgen(file_str, setSnps, W, indicator_idv, indicator_snp, - snpInfo, maf_level, miss_level, hwe_level, r2_level, - ns_test) == false) { - error = true; - } - gsl_matrix_free(W); - - ns_total = indicator_snp.size(); - } + trim_individuals(indicator_idv, ni_max); // Read genotype and phenotype file for PLINK format. if (!file_bfile.empty()) { @@ -297,16 +269,16 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt); + CopyCvt(W1); file_str = file_bfile + ".bed"; - if (ReadFile_bed(file_str, setSnps, W, indicator_idv, indicator_snp, + if (ReadFile_bed(file_str, setSnps, W1, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test) == false) { error = true; } - gsl_matrix_free(W); + gsl_matrix_free(W1); ns_total = indicator_snp.size(); } @@ -330,17 +302,17 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt); + CopyCvt(W2); - trim_individuals(indicator_idv, ni_max, mode_debug); - trim_individuals(indicator_cvt, ni_max, mode_debug); - if (ReadFile_geno(file_geno, setSnps, W, indicator_idv, indicator_snp, + trim_individuals(indicator_idv, ni_max); + trim_individuals(indicator_cvt, ni_max); + if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, - mapRS2bp, mapRS2cM, snpInfo, ns_test, mode_debug) == false) { + mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { error = true; } - gsl_matrix_free(W); + gsl_matrix_free(W2); ns_total = indicator_snp.size(); } @@ -356,7 +328,7 @@ void PARAM::ReadFiles(void) { string file_name; size_t t = 0, ns_test_tmp = 0; - gsl_matrix *W; + gsl_matrix *W3 = NULL; while (!safeGetline(infile, file_name).eof()) { file_str = file_name + ".bim"; @@ -388,12 +360,12 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. - W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + W3 = gsl_matrix_safe_alloc(ni_test, n_cvt); + CopyCvt(W3); } file_str = file_name + ".bed"; - if (ReadFile_bed(file_str, setSnps, W, indicator_idv, indicator_snp, + if (ReadFile_bed(file_str, setSnps, W3, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test_tmp) == false) { error = true; @@ -406,7 +378,7 @@ void PARAM::ReadFiles(void) { t++; } - gsl_matrix_free(W); + if (W3) gsl_matrix_free(W3); infile.close(); infile.clear(); @@ -432,8 +404,8 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt); + CopyCvt(W4); igzstream infile(file_mgeno.c_str(), igzstream::in); if (!infile) { @@ -445,9 +417,9 @@ void PARAM::ReadFiles(void) { string file_name; size_t ns_test_tmp; while (!safeGetline(infile, file_name).eof()) { - if (ReadFile_geno(file_name, setSnps, W, indicator_idv, indicator_snp, + if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, - mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp, mode_debug) == false) { + mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) { error = true; } @@ -457,7 +429,7 @@ void PARAM::ReadFiles(void) { ns_total += indicator_snp.size(); } - gsl_matrix_free(W); + gsl_matrix_free(W4); infile.close(); infile.clear(); @@ -485,8 +457,8 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt); + // CopyCvt(W5); if (ReadFile_gene(file_gene, vec_read, snpInfo, ng_total) == false) { error = true; @@ -741,19 +713,6 @@ void PARAM::CheckParam(void) { } } - if (!file_oxford.empty()) { - str = file_oxford + ".bgen"; - if (stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open .bgen file: " << str << endl; - error = true; - } - str = file_oxford + ".sample"; - if (stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open .sample file: " << str << endl; - error = true; - } - } - if ((!file_geno.empty() || !file_gene.empty())) { str = file_pheno; if (stat(str.c_str(), &fileInfo) == -1) { @@ -864,11 +823,6 @@ void PARAM::CheckParam(void) { flag++; } - // WJA added. - if (!file_oxford.empty()) { - flag++; - } - if (flag != 1 && a_mode != 15 && a_mode != 27 && a_mode != 28 && a_mode != 43 && a_mode != 5 && a_mode != 61 && a_mode != 62 && a_mode != 63 && a_mode != 66 && a_mode != 67) { @@ -942,14 +896,12 @@ void PARAM::CheckParam(void) { enforce_fexists(file_snps, "open file"); enforce_fexists(file_ksnps, "open file"); enforce_fexists(file_gwasnps, "open file"); - enforce_fexists(file_log, "open file"); enforce_fexists(file_anno, "open file"); if (!loco.empty()) { enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 21 || a_mode == 22, "LOCO only works with LMM and K"); - enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); - enforce_msg(file_oxford.empty(), "LOCO does not work with Oxford (yet)"); + // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)"); enforce_msg(!file_anno.empty(), "LOCO requires annotation file (-a switch)"); @@ -957,54 +909,15 @@ void PARAM::CheckParam(void) { enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch"); } - str = file_kin; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open relatedness matrix file: " << str << endl; - error = true; - } - - str = file_mk; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open relatedness matrix file: " << str << endl; - error = true; - } - - str = file_cvt; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open covariates file: " << str << endl; - error = true; - } - - str = file_gxe; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open environmental covariate file: " << str << endl; - error = true; - } - - str = file_weight; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open the residual weight file: " << str << endl; - error = true; - } - - str = file_epm; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open estimated parameter file: " << str << endl; - error = true; - } - - str = file_ebv; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open estimated breeding value file: " << str - << endl; - error = true; - } - - str = file_read; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open total read file: " << str << endl; - error = true; - } + enforce_fexists(file_kin, "open file"); + enforce_fexists(file_mk, "open file"); + enforce_fexists(file_cvt, "open file"); + enforce_fexists(file_gxe, "open file"); + enforce_fexists(file_log, "open file"); + enforce_fexists(file_weight, "open file"); + enforce_fexists(file_epm, "open file"); + enforce_fexists(file_ebv, "open file"); + enforce_fexists(file_read, "open file"); // Check if files are compatible with analysis mode. if (k_mode == 2 && !file_geno.empty()) { @@ -1056,14 +969,6 @@ void PARAM::CheckParam(void) { void PARAM::CheckData(void) { - // WJA NOTE: I added this condition so that covariates can be added - // through sample, probably not exactly what is wanted. - if (file_oxford.empty()) { - if ((file_cvt).empty() || (indicator_cvt).size() == 0) { - n_cvt = 1; - } - } - if ((a_mode == 66 || a_mode == 67) && (v_pve.size() != n_vc)) { cout << "error! the number of pve estimates does not equal to " << "the number of categories in the cat file:" << v_pve.size() << " " @@ -1194,21 +1099,21 @@ void PARAM::CheckData(void) { cout << "## number of total genes = " << ng_total << endl; } else if (file_epm.empty() && a_mode != 43 && a_mode != 5) { if (!loco.empty()) - cout << "## leave one chromosome out (LOCO) = " << loco << endl; - cout << "## number of total SNPs = " << ns_total << endl; + cout << "## leave one chromosome out (LOCO) = " << setw(8) << loco << endl; + cout << "## number of total SNPs/var = " << setw(8) << ns_total << endl; if (setSnps.size()) - cout << "## number of considered SNPS = " << setSnps.size() << endl; + cout << "## number of considered SNPS = " << setw(8) << setSnps.size() << endl; if (setKSnps.size()) - cout << "## number of SNPS for K = " << setKSnps.size() << endl; + cout << "## number of SNPS for K = " << setw(8) << setKSnps.size() << endl; if (setGWASnps.size()) - cout << "## number of SNPS for GWAS = " << setGWASnps.size() << endl; - cout << "## number of analyzed SNPs = " << ns_test << endl; + cout << "## number of SNPS for GWAS = " << setw(8) << setGWASnps.size() << endl; + cout << "## number of analyzed SNPs = " << setw(8) << ns_test << endl; } else { } } // Set d_pace to 1000 for gene expression. - if (!file_gene.empty() && d_pace == 100000) { + if (!file_gene.empty() && d_pace == DEFAULT_PACE) { d_pace = 1000; } @@ -1340,7 +1245,7 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { } } else { if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, - calc_K, mode_debug) == false) { + calc_K) == false) { error = true; } } @@ -1360,7 +1265,7 @@ void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K, } } else { if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, - ni_test, ns_test, mode_debug) == false) { + ni_test, ns_test) == false) { error = true; } } @@ -1375,18 +1280,11 @@ void PARAM::CalcKin(gsl_matrix *matrix_kin) { if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; - enforce_msg(loco.empty(), "FIXME: LOCO nyi"); + // enforce_msg(loco.empty(), "FIXME: LOCO nyi"); if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == false) { error = true; } - } else if (!file_oxford.empty()) { - file_str = file_oxford + ".bgen"; - enforce_msg(loco.empty(), "FIXME: LOCO nyi"); - if (bgenKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == - false) { - error = true; - } } else { file_str = file_geno; if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace, |