aboutsummaryrefslogtreecommitdiff
path: root/src/param.cpp
diff options
context:
space:
mode:
authorPeter Carbonetto2017-08-07 13:23:24 -0500
committerGitHub2017-08-07 13:23:24 -0500
commit7360d14216400b8f12fbfda03ac2f4827b102711 (patch)
tree63a4031267b10f587b695adb487aca5213889b20 /src/param.cpp
parentd8db988550d4cd0303f0b82a75499c2c94d97d45 (diff)
parent449d882a3b33ef81ef4f0127c3932b01fa796dbb (diff)
downloadpangemma-7360d14216400b8f12fbfda03ac2f4827b102711.tar.gz
Merge pull request #63 from genenetwork/loco-formatted
LOCO is implemented in GEMMA for the BIMBAM format.
Diffstat (limited to 'src/param.cpp')
-rw-r--r--src/param.cpp146
1 files changed, 115 insertions, 31 deletions
diff --git a/src/param.cpp b/src/param.cpp
index 2572bbb..6ab4339 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -38,6 +38,54 @@
using namespace std;
+// ---- Helper functions which do not use the PARAM scope
+
+// Calculate the SNP sets as used in LOCO from mapchr which was filled
+// from the annotation file. Returns ksnps (used for K) and gwasnps (used for
+// GWA). Both are complimentary with each other and subsets of setSnps.
+
+void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps,
+ const map<string, string> mapchr, const string loco) {
+ enforce_msg(ksnps.size() == 0, "make sure knsps is not initialized twice");
+ enforce_msg(gwasnps.size() == 0,
+ "make sure gwasnps is not initialized twice");
+ for (auto &kv : mapchr) {
+ auto snp = kv.first;
+ auto chr = kv.second;
+ if (chr != loco) {
+ ksnps.insert(snp);
+ } else {
+ gwasnps.insert(snp);
+ }
+ }
+}
+
+// Trim #individuals to size which is used to write tests that run faster
+//
+// Note it actually trims the number of functional individuals
+// (indicator_idv[x] == 1). This should match indicator_cvt etc. If
+// this gives problems with certain sets we can simply trim to size.
+
+void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) {
+ if (ni_max) {
+ size_t count = 0;
+ for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) {
+ if (*ind)
+ count++;
+ if (count >= ni_max)
+ break;
+ }
+ if (count != idvs.size()) {
+ if (debug)
+ cout << "**** TEST MODE: trim individuals from " << idvs.size()
+ << " to " << count << endl;
+ idvs.resize(count);
+ }
+ }
+}
+
+// ---- PARAM class implementation
+
PARAM::PARAM(void)
: mode_silence(false), a_mode(0), k_mode(1), d_pace(100000),
file_out("result"), path_out("./output/"), miss_level(0.05),
@@ -96,6 +144,15 @@ void PARAM::ReadFiles(void) {
setSnps.clear();
}
+ // Read KSNP set.
+ if (!file_ksnps.empty()) {
+ if (ReadFile_snps(file_ksnps, setKSnps) == false) {
+ error = true;
+ }
+ } else {
+ setKSnps.clear();
+ }
+
// For prediction.
if (!file_epm.empty()) {
if (ReadFile_est(file_epm, est_column, mapRS2est) == false) {
@@ -164,6 +221,7 @@ void PARAM::ReadFiles(void) {
} else {
n_cvt = 1;
}
+ trim_individuals(indicator_cvt, ni_max, mode_debug);
if (!file_gxe.empty()) {
if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) {
@@ -176,6 +234,8 @@ void PARAM::ReadFiles(void) {
}
}
+ trim_individuals(indicator_idv, ni_max, mode_debug);
+
// WJA added.
// Read genotype and phenotype file for bgen format.
if (!file_oxford.empty()) {
@@ -273,12 +333,15 @@ void PARAM::ReadFiles(void) {
gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
CopyCvt(W);
+ trim_individuals(indicator_idv, ni_max, mode_debug);
+ trim_individuals(indicator_cvt, ni_max, mode_debug);
if (ReadFile_geno(file_geno, setSnps, W, indicator_idv, indicator_snp,
maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) {
error = true;
}
gsl_matrix_free(W);
+
ns_total = indicator_snp.size();
}
@@ -457,6 +520,11 @@ void PARAM::ReadFiles(void) {
// ni_test, save all useful covariates.
ProcessCvtPhen();
}
+
+ // Compute setKSnps when -loco is passed in
+ if (!loco.empty()) {
+ LOCO_set_Snps(setKSnps, setGWASnps, mapRS2chr, loco);
+ }
return;
}
@@ -869,22 +937,22 @@ void PARAM::CheckParam(void) {
error = true;
}
- str = file_snps;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open snps file: " << str << endl;
- error = true;
- }
-
- str = file_log;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open log file: " << str << endl;
- error = true;
- }
+ enforce_fexists(file_snps, "open file");
+ enforce_fexists(file_ksnps, "open file");
+ enforce_fexists(file_gwasnps, "open file");
+ enforce_fexists(file_log, "open file");
+ enforce_fexists(file_anno, "open file");
- str = file_anno;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open annotation file: " << str << endl;
- error = true;
+ if (!loco.empty()) {
+ enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 21 || a_mode == 22,
+ "LOCO only works with LMM and K");
+ enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)");
+ enforce_msg(file_oxford.empty(), "LOCO does not work with Oxford (yet)");
+ enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)");
+ enforce_msg(!file_anno.empty(),
+ "LOCO requires annotation file (-a switch)");
+ enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch");
+ enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch");
}
str = file_kin;
@@ -1005,7 +1073,7 @@ void PARAM::CheckData(void) {
(indicator_cvt).size() != (indicator_idv).size()) {
error = true;
cout << "error! number of rows in the covariates file do not "
- << "match the number of individuals. " << endl;
+ << "match the number of individuals. " << indicator_cvt.size() << endl;
return;
}
if ((indicator_gxe).size() != 0 &&
@@ -1123,7 +1191,15 @@ void PARAM::CheckData(void) {
if (!file_gene.empty()) {
cout << "## number of total genes = " << ng_total << endl;
} else if (file_epm.empty() && a_mode != 43 && a_mode != 5) {
- cout << "## number of total SNPs = " << ns_total << endl;
+ if (!loco.empty())
+ cout << "## leave one chromosome out (LOCO) = " << loco << endl;
+ cout << "## number of total SNPs = " << ns_total << endl;
+ if (setSnps.size())
+ cout << "## number of considered SNPS = " << setSnps.size() << endl;
+ if (setKSnps.size())
+ cout << "## number of SNPS for K = " << setKSnps.size() << endl;
+ if (setGWASnps.size())
+ cout << "## number of SNPS for GWAS = " << setGWASnps.size() << endl;
cout << "## number of analyzed SNPs = " << ns_test << endl;
} else {
}
@@ -1297,20 +1373,22 @@ void PARAM::CalcKin(gsl_matrix *matrix_kin) {
if (!file_bfile.empty()) {
file_str = file_bfile + ".bed";
+ enforce_msg(loco.empty(), "FIXME: LOCO nyi");
if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
false) {
error = true;
}
} else if (!file_oxford.empty()) {
file_str = file_oxford + ".bgen";
+ enforce_msg(loco.empty(), "FIXME: LOCO nyi");
if (bgenKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
false) {
error = true;
}
} else {
file_str = file_geno;
- if (BimbamKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
- false) {
+ if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,
+ matrix_kin, ni_max == 0) == false) {
error = true;
}
}
@@ -1720,37 +1798,43 @@ void PARAM::CalcS(const map<string, double> &mapRS2wA,
} else if (!file_geno.empty()) {
file_str = file_geno;
if (mapRS2wA.size() == 0) {
- if (BimbamKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK,
- mapRS2cat, snpInfo, W, K, ns) == false) {
+ if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv,
+ indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K,
+ ns) == false) {
error = true;
}
} else {
- if (BimbamKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA,
- mapRS2cat, snpInfo, W, A, ns) == false) {
+ if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv,
+ indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A,
+ ns) == false) {
error = true;
}
}
} else if (!file_mbfile.empty()) {
if (mapRS2wA.size() == 0) {
- if (MFILEKin(1, file_mbfile, d_pace, indicator_idv, mindicator_snp,
- mapRS2wK, mapRS2cat, msnpInfo, W, K, ns) == false) {
+ if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv,
+ mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K,
+ ns) == false) {
error = true;
}
} else {
- if (MFILEKin(1, file_mbfile, d_pace, indicator_idv, mindicator_snp,
- mapRS2wA, mapRS2cat, msnpInfo, W, A, ns) == false) {
+ if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv,
+ mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A,
+ ns) == false) {
error = true;
}
}
} else if (!file_mgeno.empty()) {
if (mapRS2wA.size() == 0) {
- if (MFILEKin(0, file_mgeno, d_pace, indicator_idv, mindicator_snp,
- mapRS2wK, mapRS2cat, msnpInfo, W, K, ns) == false) {
+ if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv,
+ mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K,
+ ns) == false) {
error = true;
}
} else {
- if (MFILEKin(0, file_mgeno, d_pace, indicator_idv, mindicator_snp,
- mapRS2wA, mapRS2cat, msnpInfo, W, A, ns) == false) {
+ if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv,
+ mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A,
+ ns) == false) {
error = true;
}
}