aboutsummaryrefslogtreecommitdiff
path: root/src/param.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/param.cpp')
-rw-r--r--src/param.cpp212
1 files changed, 55 insertions, 157 deletions
diff --git a/src/param.cpp b/src/param.cpp
index 3b319e9..bf6c195 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -1,6 +1,8 @@
/*
Genome-wide Efficient Mixed Model Association (GEMMA)
- Copyright (C) 2011-2017, Xiang Zhou
+ Copyright © 2011-2017, Xiang Zhou
+ Copyright © 2017, Peter Carbonetto
+ Copyright © 2017, Pjotr Prins
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,12 +18,13 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <iostream>
+#include <iomanip>
+#include <string>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <fstream>
-#include <iostream>
-#include <string>
#include <sys/stat.h>
#include "gsl/gsl_blas.h"
@@ -66,7 +69,7 @@ void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps,
// (indicator_idv[x] == 1). This should match indicator_cvt etc. If
// this gives problems with certain sets we can simply trim to size.
-void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) {
+void trim_individuals(vector<int> &idvs, size_t ni_max) {
if (ni_max) {
size_t count = 0;
for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) {
@@ -76,7 +79,7 @@ void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) {
break;
}
if (count != idvs.size()) {
- if (debug)
+ if (is_debug_mode())
cout << "**** TEST MODE: trim individuals from " << idvs.size()
<< " to " << count << endl;
idvs.resize(count);
@@ -87,7 +90,7 @@ void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) {
// ---- PARAM class implementation
PARAM::PARAM(void)
- : mode_silence(false), a_mode(0), k_mode(1), d_pace(100000),
+ : a_mode(0), k_mode(1), d_pace(DEFAULT_PACE),
file_out("result"), path_out("./output/"), miss_level(0.05),
maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5),
n_region(10), p_nr(0.001), em_prec(0.0001), nr_prec(0.0001),
@@ -97,7 +100,7 @@ PARAM::PARAM(void)
rho_ngrid(10), s_min(0), s_max(300), w_step(100000), s_step(1000000),
r_pace(10), w_pace(1000), n_accept(0), n_mh(10), geo_mean(2000.0),
randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200),
- error(false), ni_subsample(0), n_cvt(1), n_vc(1), n_cat(0),
+ error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1),
time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0),
time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {}
@@ -221,7 +224,7 @@ void PARAM::ReadFiles(void) {
} else {
n_cvt = 1;
}
- trim_individuals(indicator_cvt, ni_max, mode_debug);
+ trim_individuals(indicator_cvt, ni_max);
if (!file_gxe.empty()) {
if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) {
@@ -234,38 +237,7 @@ void PARAM::ReadFiles(void) {
}
}
- trim_individuals(indicator_idv, ni_max, mode_debug);
-
- // WJA added.
- // Read genotype and phenotype file for bgen format.
- if (!file_oxford.empty()) {
- file_str = file_oxford + ".sample";
- if (ReadFile_sample(file_str, indicator_pheno, pheno, p_column,
- indicator_cvt, cvt, n_cvt) == false) {
- error = true;
- }
- if ((indicator_cvt).size() == 0) {
- n_cvt = 1;
- }
-
- // Post-process covariates and phenotypes, obtain
- // ni_test, save all useful covariates.
- ProcessCvtPhen();
-
- // Obtain covariate matrix.
- gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
- CopyCvt(W);
-
- file_str = file_oxford + ".bgen";
- if (ReadFile_bgen(file_str, setSnps, W, indicator_idv, indicator_snp,
- snpInfo, maf_level, miss_level, hwe_level, r2_level,
- ns_test) == false) {
- error = true;
- }
- gsl_matrix_free(W);
-
- ns_total = indicator_snp.size();
- }
+ trim_individuals(indicator_idv, ni_max);
// Read genotype and phenotype file for PLINK format.
if (!file_bfile.empty()) {
@@ -297,16 +269,16 @@ void PARAM::ReadFiles(void) {
ProcessCvtPhen();
// Obtain covariate matrix.
- gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
- CopyCvt(W);
+ auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt);
+ CopyCvt(W1);
file_str = file_bfile + ".bed";
- if (ReadFile_bed(file_str, setSnps, W, indicator_idv, indicator_snp,
+ if (ReadFile_bed(file_str, setSnps, W1, indicator_idv, indicator_snp,
snpInfo, maf_level, miss_level, hwe_level, r2_level,
ns_test) == false) {
error = true;
}
- gsl_matrix_free(W);
+ gsl_matrix_free(W1);
ns_total = indicator_snp.size();
}
@@ -330,17 +302,17 @@ void PARAM::ReadFiles(void) {
ProcessCvtPhen();
// Obtain covariate matrix.
- gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
- CopyCvt(W);
+ auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt);
+ CopyCvt(W2);
- trim_individuals(indicator_idv, ni_max, mode_debug);
- trim_individuals(indicator_cvt, ni_max, mode_debug);
- if (ReadFile_geno(file_geno, setSnps, W, indicator_idv, indicator_snp,
+ trim_individuals(indicator_idv, ni_max);
+ trim_individuals(indicator_cvt, ni_max);
+ if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp,
maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
- mapRS2bp, mapRS2cM, snpInfo, ns_test, mode_debug) == false) {
+ mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) {
error = true;
}
- gsl_matrix_free(W);
+ gsl_matrix_free(W2);
ns_total = indicator_snp.size();
}
@@ -356,7 +328,7 @@ void PARAM::ReadFiles(void) {
string file_name;
size_t t = 0, ns_test_tmp = 0;
- gsl_matrix *W;
+ gsl_matrix *W3 = NULL;
while (!safeGetline(infile, file_name).eof()) {
file_str = file_name + ".bim";
@@ -388,12 +360,12 @@ void PARAM::ReadFiles(void) {
ProcessCvtPhen();
// Obtain covariate matrix.
- W = gsl_matrix_alloc(ni_test, n_cvt);
- CopyCvt(W);
+ W3 = gsl_matrix_safe_alloc(ni_test, n_cvt);
+ CopyCvt(W3);
}
file_str = file_name + ".bed";
- if (ReadFile_bed(file_str, setSnps, W, indicator_idv, indicator_snp,
+ if (ReadFile_bed(file_str, setSnps, W3, indicator_idv, indicator_snp,
snpInfo, maf_level, miss_level, hwe_level, r2_level,
ns_test_tmp) == false) {
error = true;
@@ -406,7 +378,7 @@ void PARAM::ReadFiles(void) {
t++;
}
- gsl_matrix_free(W);
+ if (W3) gsl_matrix_free(W3);
infile.close();
infile.clear();
@@ -432,8 +404,8 @@ void PARAM::ReadFiles(void) {
ProcessCvtPhen();
// Obtain covariate matrix.
- gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
- CopyCvt(W);
+ gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt);
+ CopyCvt(W4);
igzstream infile(file_mgeno.c_str(), igzstream::in);
if (!infile) {
@@ -445,9 +417,9 @@ void PARAM::ReadFiles(void) {
string file_name;
size_t ns_test_tmp;
while (!safeGetline(infile, file_name).eof()) {
- if (ReadFile_geno(file_name, setSnps, W, indicator_idv, indicator_snp,
+ if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp,
maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
- mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp, mode_debug) == false) {
+ mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) {
error = true;
}
@@ -457,7 +429,7 @@ void PARAM::ReadFiles(void) {
ns_total += indicator_snp.size();
}
- gsl_matrix_free(W);
+ gsl_matrix_free(W4);
infile.close();
infile.clear();
@@ -485,8 +457,8 @@ void PARAM::ReadFiles(void) {
ProcessCvtPhen();
// Obtain covariate matrix.
- gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
- CopyCvt(W);
+ // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt);
+ // CopyCvt(W5);
if (ReadFile_gene(file_gene, vec_read, snpInfo, ng_total) == false) {
error = true;
@@ -741,19 +713,6 @@ void PARAM::CheckParam(void) {
}
}
- if (!file_oxford.empty()) {
- str = file_oxford + ".bgen";
- if (stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open .bgen file: " << str << endl;
- error = true;
- }
- str = file_oxford + ".sample";
- if (stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open .sample file: " << str << endl;
- error = true;
- }
- }
-
if ((!file_geno.empty() || !file_gene.empty())) {
str = file_pheno;
if (stat(str.c_str(), &fileInfo) == -1) {
@@ -864,11 +823,6 @@ void PARAM::CheckParam(void) {
flag++;
}
- // WJA added.
- if (!file_oxford.empty()) {
- flag++;
- }
-
if (flag != 1 && a_mode != 15 && a_mode != 27 && a_mode != 28 &&
a_mode != 43 && a_mode != 5 && a_mode != 61 && a_mode != 62 &&
a_mode != 63 && a_mode != 66 && a_mode != 67) {
@@ -942,14 +896,12 @@ void PARAM::CheckParam(void) {
enforce_fexists(file_snps, "open file");
enforce_fexists(file_ksnps, "open file");
enforce_fexists(file_gwasnps, "open file");
- enforce_fexists(file_log, "open file");
enforce_fexists(file_anno, "open file");
if (!loco.empty()) {
enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 21 || a_mode == 22,
"LOCO only works with LMM and K");
- enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)");
- enforce_msg(file_oxford.empty(), "LOCO does not work with Oxford (yet)");
+ // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)");
enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)");
enforce_msg(!file_anno.empty(),
"LOCO requires annotation file (-a switch)");
@@ -957,54 +909,15 @@ void PARAM::CheckParam(void) {
enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch");
}
- str = file_kin;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open relatedness matrix file: " << str << endl;
- error = true;
- }
-
- str = file_mk;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open relatedness matrix file: " << str << endl;
- error = true;
- }
-
- str = file_cvt;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open covariates file: " << str << endl;
- error = true;
- }
-
- str = file_gxe;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open environmental covariate file: " << str << endl;
- error = true;
- }
-
- str = file_weight;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open the residual weight file: " << str << endl;
- error = true;
- }
-
- str = file_epm;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open estimated parameter file: " << str << endl;
- error = true;
- }
-
- str = file_ebv;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open estimated breeding value file: " << str
- << endl;
- error = true;
- }
-
- str = file_read;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open total read file: " << str << endl;
- error = true;
- }
+ enforce_fexists(file_kin, "open file");
+ enforce_fexists(file_mk, "open file");
+ enforce_fexists(file_cvt, "open file");
+ enforce_fexists(file_gxe, "open file");
+ enforce_fexists(file_log, "open file");
+ enforce_fexists(file_weight, "open file");
+ enforce_fexists(file_epm, "open file");
+ enforce_fexists(file_ebv, "open file");
+ enforce_fexists(file_read, "open file");
// Check if files are compatible with analysis mode.
if (k_mode == 2 && !file_geno.empty()) {
@@ -1056,14 +969,6 @@ void PARAM::CheckParam(void) {
void PARAM::CheckData(void) {
- // WJA NOTE: I added this condition so that covariates can be added
- // through sample, probably not exactly what is wanted.
- if (file_oxford.empty()) {
- if ((file_cvt).empty() || (indicator_cvt).size() == 0) {
- n_cvt = 1;
- }
- }
-
if ((a_mode == 66 || a_mode == 67) && (v_pve.size() != n_vc)) {
cout << "error! the number of pve estimates does not equal to "
<< "the number of categories in the cat file:" << v_pve.size() << " "
@@ -1194,21 +1099,21 @@ void PARAM::CheckData(void) {
cout << "## number of total genes = " << ng_total << endl;
} else if (file_epm.empty() && a_mode != 43 && a_mode != 5) {
if (!loco.empty())
- cout << "## leave one chromosome out (LOCO) = " << loco << endl;
- cout << "## number of total SNPs = " << ns_total << endl;
+ cout << "## leave one chromosome out (LOCO) = " << setw(8) << loco << endl;
+ cout << "## number of total SNPs/var = " << setw(8) << ns_total << endl;
if (setSnps.size())
- cout << "## number of considered SNPS = " << setSnps.size() << endl;
+ cout << "## number of considered SNPS = " << setw(8) << setSnps.size() << endl;
if (setKSnps.size())
- cout << "## number of SNPS for K = " << setKSnps.size() << endl;
+ cout << "## number of SNPS for K = " << setw(8) << setKSnps.size() << endl;
if (setGWASnps.size())
- cout << "## number of SNPS for GWAS = " << setGWASnps.size() << endl;
- cout << "## number of analyzed SNPs = " << ns_test << endl;
+ cout << "## number of SNPS for GWAS = " << setw(8) << setGWASnps.size() << endl;
+ cout << "## number of analyzed SNPs = " << setw(8) << ns_test << endl;
} else {
}
}
// Set d_pace to 1000 for gene expression.
- if (!file_gene.empty() && d_pace == 100000) {
+ if (!file_gene.empty() && d_pace == DEFAULT_PACE) {
d_pace = 1000;
}
@@ -1340,7 +1245,7 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
}
} else {
if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K,
- calc_K, mode_debug) == false) {
+ calc_K) == false) {
error = true;
}
}
@@ -1360,7 +1265,7 @@ void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K,
}
} else {
if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K,
- ni_test, ns_test, mode_debug) == false) {
+ ni_test, ns_test) == false) {
error = true;
}
}
@@ -1375,18 +1280,11 @@ void PARAM::CalcKin(gsl_matrix *matrix_kin) {
if (!file_bfile.empty()) {
file_str = file_bfile + ".bed";
- enforce_msg(loco.empty(), "FIXME: LOCO nyi");
+ // enforce_msg(loco.empty(), "FIXME: LOCO nyi");
if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
false) {
error = true;
}
- } else if (!file_oxford.empty()) {
- file_str = file_oxford + ".bgen";
- enforce_msg(loco.empty(), "FIXME: LOCO nyi");
- if (bgenKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
- false) {
- error = true;
- }
} else {
file_str = file_geno;
if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,