aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorPjotr Prins2017-08-03 10:26:52 +0000
committerPjotr Prins2017-08-03 10:26:52 +0000
commit449d882a3b33ef81ef4f0127c3932b01fa796dbb (patch)
tree63a4031267b10f587b695adb487aca5213889b20 /src
parentd8db988550d4cd0303f0b82a75499c2c94d97d45 (diff)
downloadpangemma-449d882a3b33ef81ef4f0127c3932b01fa796dbb.tar.gz
LOCO is implemented in GEMMA for the BIMBAM format. Pass in the -loco
1 switch for LOCO of chromosome 1. What are the use cases? 1. User runs vanilla GEMMA: all SNPs are considered input for GWA and K 2. User passes in -snps: all these SNPs are considered for GWA and K 3. User passes in -snps and -ksnps: All these SNPs are used for GWA, Ksnps are used for K 4. User passes in -loco: SNPs are split by chromosome (GWA incl., K excl.) 5. User passes in -snps, -gwasnps and -ksnps could mean that also GWA is subset explicitely (nyi) In all cases indicator_snp is honored and we get the most flexible way for studying SNP combinations that can be passed in in different ways. Overall added: - various comments in source code - tests in test framework inlc. fast-check - NDEBUG compilation support in the Makefile - -debug switch for GEMMA debug output - debug.h which includes enforce functions which work like assert. Unlike assert, enforce also works in release compilation - -nind switch limit the number of individuals used (trim_individuals for testing) - enforcing tests of input files - e.g. are number of individuals correct - checks for memory allocation - we should add more of those - more checks for gsl results - we should add more of those - replaced strtoken with regex as a first case. They should all be replaced. strtoken is not thread safe, for one. - introduced C++ iterators - introduced C++ closure in BimBam LMM for cached processing - more localized initialization of variables - makes for demonstratably more correct code - -ksnps adds snps into setKSnps - -gwasnps adds snps into setGWASnps - both sets are computed by -loco - attempted to make the code easier to read
Diffstat (limited to 'src')
-rw-r--r--src/debug.h46
-rw-r--r--src/gemma.cpp49
-rw-r--r--src/io.cpp117
-rw-r--r--src/io.h29
-rw-r--r--src/lmm.cpp203
-rw-r--r--src/lmm.h6
-rw-r--r--src/mvlmm.cpp6
-rw-r--r--src/param.cpp146
-rw-r--r--src/param.h30
-rw-r--r--src/vc.h2
10 files changed, 431 insertions, 203 deletions
diff --git a/src/debug.h b/src/debug.h
new file mode 100644
index 0000000..84637e1
--- /dev/null
+++ b/src/debug.h
@@ -0,0 +1,46 @@
+#ifndef __DEBUG_H__
+#define __DEBUG_H__
+
+// enforce works like assert but also when NDEBUG is set (i.e., it
+// always works). enforce_msg prints message instead of expr
+
+#if defined NDEBUG
+#define debug_msg(msg)
+#else
+#define debug_msg(msg) cout << "**** DEBUG: " << msg << endl;
+#endif
+
+/* This prints an "Assertion failed" message and aborts. */
+extern "C" void __assert_fail(const char *__assertion, const char *__file,
+ unsigned int __line,
+ const char *__function) __THROW
+ __attribute__((__noreturn__));
+
+#define __ASSERT_FUNCTION __PRETTY_FUNCTION__
+
+#define enforce(expr) \
+ ((expr) \
+ ? __ASSERT_VOID_CAST(0) \
+ : __assert_fail(__STRING(expr), __FILE__, __LINE__, __ASSERT_FUNCTION))
+
+#define enforce_msg(expr, msg) \
+ ((expr) ? __ASSERT_VOID_CAST(0) \
+ : __assert_fail(msg, __FILE__, __LINE__, __ASSERT_FUNCTION))
+
+#define enforce_str(expr, msg) \
+ ((expr) \
+ ? __ASSERT_VOID_CAST(0) \
+ : __assert_fail((msg).c_str(), __FILE__, __LINE__, __ASSERT_FUNCTION))
+
+// Helpers to create a unique varname per MACRO
+#define COMBINE1(X, Y) X##Y
+#define COMBINE(X, Y) COMBINE1(X, Y)
+
+#define enforce_gsl(expr) \
+ auto COMBINE(res, __LINE__) = (expr); \
+ (COMBINE(res, __LINE__) == 0 \
+ ? __ASSERT_VOID_CAST(0) \
+ : __assert_fail(gsl_strerror(COMBINE(res, __LINE__)), __FILE__, \
+ __LINE__, __ASSERT_FUNCTION))
+
+#endif
diff --git a/src/gemma.cpp b/src/gemma.cpp
index c72475b..0fb8c54 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -151,6 +151,7 @@ void GEMMA::PrintHelp(size_t option) {
cout << " 11: obtain predicted values" << endl;
cout << " 12: calculate snp variance covariance" << endl;
cout << " 13: note" << endl;
+ cout << " 14: debug options" << endl;
cout << endl;
}
@@ -446,7 +447,16 @@ void GEMMA::PrintHelp(size_t option) {
}
if (option == 4) {
- cout << " RELATEDNESS MATRIX CALCULATION OPTIONS" << endl;
+ cout << " RELATEDNESS MATRIX (K) CALCULATION OPTIONS" << endl;
+ cout << " -ksnps [filename] "
+ << " specify input snps file name to compute K" << endl;
+ cout << " -loco [chr] "
+ << " leave one chromosome out (LOCO) by name (requires -a annotation "
+ "file)"
+ << endl;
+ cout << " -a [filename] "
+ << " specify input BIMBAM SNP annotation file name (LOCO only)"
+ << endl;
cout << " -gk [num] "
<< " specify which type of kinship/relatedness matrix to generate "
"(default 1)"
@@ -682,6 +692,14 @@ void GEMMA::PrintHelp(size_t option) {
cout << endl;
}
+ if (option == 14) {
+ cout << " DEBUG OPTIONS" << endl;
+ cout << " -silence silent terminal display" << endl;
+ cout << " -debug debug output" << endl;
+ cout << " -nind [num] read up to num individuals" << endl;
+ cout << endl;
+ }
+
return;
}
@@ -1008,7 +1026,7 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) {
str.clear();
str.assign(argv[i]);
cPar.k_mode = atoi(str.c_str());
- } else if (strcmp(argv[i], "-n") == 0) {
+ } else if (strcmp(argv[i], "-n") == 0) { // set pheno column (range)
(cPar.p_column).clear();
while (argv[i + 1] != NULL && argv[i + 1][0] != '-') {
++i;
@@ -1076,6 +1094,12 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) {
cPar.r2_level = atof(str.c_str());
} else if (strcmp(argv[i], "-notsnp") == 0) {
cPar.maf_level = -1;
+ } else if (strcmp(argv[i], "-loco") == 0) {
+ assert(argv[i + 1]);
+ ++i;
+ str.clear();
+ str.assign(argv[i]);
+ cPar.loco = str;
} else if (strcmp(argv[i], "-gk") == 0) {
if (cPar.a_mode != 0) {
cPar.error = true;
@@ -1315,6 +1339,14 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) {
str.clear();
str.assign(argv[i]);
cPar.nr_iter = atoi(str.c_str());
+ } else if (strcmp(argv[i], "-nind") == 0) {
+ if (argv[i + 1] == NULL || argv[i + 1][0] == '-') {
+ continue;
+ }
+ ++i;
+ str.clear();
+ str.assign(argv[i]);
+ cPar.ni_max = atoi(str.c_str()); // for testing purposes
} else if (strcmp(argv[i], "-emp") == 0) {
if (argv[i + 1] == NULL || argv[i + 1][0] == '-') {
continue;
@@ -1533,6 +1565,8 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) {
str.clear();
str.assign(argv[i]);
cPar.window_ns = atoi(str.c_str());
+ } else if (strcmp(argv[i], "-debug") == 0) {
+ cPar.mode_debug = true;
} else {
cout << "error! unrecognized option: " << argv[i] << endl;
cPar.error = true;
@@ -1808,14 +1842,16 @@ void GEMMA::BatchRun(PARAM &cPar) {
gsl_matrix_free(H_full);
}
- // Generate Kinship matrix
+ // Generate Kinship matrix (optionally using LOCO)
if (cPar.a_mode == 21 || cPar.a_mode == 22) {
cout << "Calculating Relatedness Matrix ... " << endl;
gsl_matrix *G = gsl_matrix_alloc(cPar.ni_total, cPar.ni_total);
+ enforce_msg(G, "allocate G"); // just to be sure
time_start = clock();
cPar.CalcKin(G);
+
cPar.time_G = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
if (cPar.error == true) {
cout << "error! fail to calculate relatedness matrix. " << endl;
@@ -2613,6 +2649,7 @@ return;}
cPar.a_mode == 4 || cPar.a_mode == 5 ||
cPar.a_mode == 31) { // Fit LMM or mvLMM or eigen
gsl_matrix *Y = gsl_matrix_alloc(cPar.ni_test, cPar.n_ph);
+ enforce_msg(Y, "allocate Y"); // just to be sure
gsl_matrix *W = gsl_matrix_alloc(Y->size1, cPar.n_cvt);
gsl_matrix *B = gsl_matrix_alloc(Y->size2, W->size2); // B is a d by c
// matrix
@@ -2749,7 +2786,7 @@ return;}
CalcUtX(U, Y, UtY);
// calculate REMLE/MLE estimate and pve for univariate model
- if (cPar.n_ph == 1) {
+ if (cPar.n_ph == 1) { // one phenotype
gsl_vector_view beta = gsl_matrix_row(B, 0);
gsl_vector_view se_beta = gsl_matrix_row(se_B, 0);
gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0);
@@ -2819,7 +2856,7 @@ return;}
}
}
- // Fit LMM or mvLMM
+ // Fit LMM or mvLMM (w. LOCO)
if (cPar.a_mode == 1 || cPar.a_mode == 2 || cPar.a_mode == 3 ||
cPar.a_mode == 4) {
if (cPar.n_ph == 1) {
@@ -2844,7 +2881,7 @@ return;}
} else {
if (cPar.file_gxe.empty()) {
cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W,
- &Y_col.vector);
+ &Y_col.vector, cPar.setGWASnps);
} else {
cLmm.AnalyzeBimbamGXE(U, eval, UtW, &UtY_col.vector, W,
&Y_col.vector, env);
diff --git a/src/io.cpp b/src/io.cpp
index 44251aa..79e753a 100644
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -25,6 +25,7 @@
#include <iomanip>
#include <iostream>
#include <map>
+#include <regex>
#include <set>
#include <sstream>
#include <stdio.h>
@@ -132,7 +133,7 @@ std::istream &safeGetline(std::istream &is, std::string &t) {
}
// Read SNP file.
-bool ReadFile_snps(const string &file_snps, set<string> &setSnps) {
+bool ReadFile_snps(const string file_snps, set<string> &setSnps) {
setSnps.clear();
igzstream infile(file_snps.c_str(), igzstream::in);
@@ -654,6 +655,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
major = ch_ptr;
if (setSnps.size() != 0 && setSnps.count(rs) == 0) {
+ // if SNP in geno but not in -snps we add an missing value
SNPINFO sInfo = {"-9", rs, -9, -9, minor, major,
0, -9, -9, 0, 0, file_pos};
snpInfo.push_back(sInfo);
@@ -684,9 +686,8 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
gsl_vector_set_zero(genotype_miss);
for (int i = 0; i < ni_total; ++i) {
ch_ptr = strtok(NULL, " , \t");
- if (indicator_idv[i] == 0) {
+ if (indicator_idv[i] == 0)
continue;
- }
if (strcmp(ch_ptr, "NA") == 0) {
gsl_vector_set(genotype_miss, c_idv, 1);
@@ -1334,55 +1335,78 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
}
// Read bimbam mean genotype file and calculate kinship matrix.
-bool BimbamKin(const string &file_geno, vector<int> &indicator_snp,
- const int k_mode, const int display_pace,
- gsl_matrix *matrix_kin) {
+bool BimbamKin(const string file_geno, const set<string> ksnps,
+ vector<int> &indicator_snp, const int k_mode,
+ const int display_pace, gsl_matrix *matrix_kin,
+ const bool test_nind) {
igzstream infile(file_geno.c_str(), igzstream::in);
- if (!infile) {
- cout << "error reading genotype file:" << file_geno << endl;
- return false;
- }
-
- string line;
- char *ch_ptr;
+ enforce_msg(infile, "error reading genotype file");
size_t n_miss;
double d, geno_mean, geno_var;
+ // setKSnp and/or LOCO support
+ bool process_ksnps = ksnps.size();
+
size_t ni_total = matrix_kin->size1;
gsl_vector *geno = gsl_vector_alloc(ni_total);
gsl_vector *geno_miss = gsl_vector_alloc(ni_total);
- // Create a large matrix.
- size_t msize = 10000;
+ // Xlarge contains inds x markers
+ const size_t msize = K_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(ni_total, msize);
+ enforce_msg(Xlarge, "allocate Xlarge");
+
gsl_matrix_set_zero(Xlarge);
+ // For every SNP read the genotype per individual
size_t ns_test = 0;
for (size_t t = 0; t < indicator_snp.size(); ++t) {
+ string line;
!safeGetline(infile, line).eof();
if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) {
ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1);
}
- if (indicator_snp[t] == 0) {
+ if (indicator_snp[t] == 0)
continue;
- }
- ch_ptr = strtok((char *)line.c_str(), " , \t");
- ch_ptr = strtok(NULL, " , \t");
- ch_ptr = strtok(NULL, " , \t");
+ std::regex_token_iterator<std::string::iterator> rend;
+ regex split_on("[,[:blank:]]+");
+ regex_token_iterator<string::iterator> tokens(line.begin(), line.end(),
+ split_on, -1);
+ if (test_nind) {
+ // ascertain the number of genotype fields match
+ uint token_num = 0;
+ for (auto x = tokens; x != rend; x++)
+ token_num++;
+ enforce_str(token_num == ni_total + 3, line + " count fields");
+ }
+
+ auto snp = *tokens; // first field
+ // check whether SNP is included in ksnps (used by LOCO)
+ if (process_ksnps && ksnps.count(snp) == 0)
+ continue;
+
+ tokens++; // skip nucleotide fields
+ tokens++; // skip nucleotide fields
+ // calc SNP stats
geno_mean = 0.0;
n_miss = 0;
geno_var = 0.0;
gsl_vector_set_all(geno_miss, 0);
for (size_t i = 0; i < ni_total; ++i) {
- ch_ptr = strtok(NULL, " , \t");
- if (strcmp(ch_ptr, "NA") == 0) {
+ tokens++;
+ enforce_str(tokens != rend, line + " number of fields");
+ string field = *tokens;
+ if (field == "NA") {
gsl_vector_set(geno_miss, i, 0);
n_miss++;
} else {
- d = atof(ch_ptr);
+ d = stod(field);
+ // make sure genotype field contains a number
+ if (field != "0" && field != "0.0")
+ enforce_str(d != 0.0f, field);
gsl_vector_set(geno, i, d);
gsl_vector_set(geno_miss, i, 1);
geno_mean += d;
@@ -1406,23 +1430,28 @@ bool BimbamKin(const string &file_geno, vector<int> &indicator_snp,
if (k_mode == 2 && geno_var != 0) {
gsl_vector_scale(geno, 1.0 / sqrt(geno_var));
}
+ // set the SNP column ns_test
gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize);
- gsl_vector_memcpy(&Xlarge_col.vector, geno);
+ enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno));
ns_test++;
+ // compute kinship matrix and return in matrix_kin a SNP at a time
if (ns_test % msize == 0) {
eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
gsl_matrix_set_zero(Xlarge);
}
}
-
if (ns_test % msize != 0) {
eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
}
cout << endl;
- gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test);
+ // scale the kinship matrix
+ enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test));
+
+ // and transpose
+ // FIXME: the following is very slow
for (size_t i = 0; i < ni_total; ++i) {
for (size_t j = 0; j < i; ++j) {
@@ -1430,6 +1459,8 @@ bool BimbamKin(const string &file_geno, vector<int> &indicator_snp,
gsl_matrix_set(matrix_kin, i, j, d);
}
}
+ // GSL is faster - and there are even faster methods
+ // enforce_gsl(gsl_matrix_transpose(matrix_kin));
gsl_vector_free(geno);
gsl_vector_free(geno_miss);
@@ -1463,7 +1494,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp,
int n_bit;
// Create a large matrix.
- size_t msize = 10000;
+ const size_t msize = K_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(ni_total, msize);
gsl_matrix_set_zero(Xlarge);
@@ -1583,7 +1614,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp,
// Read bimbam mean genotype file, the second time, recode "mean"
// genotype and calculate K.
-bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv,
+bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv,
vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K,
const bool calc_K) {
igzstream infile(file_geno.c_str(), igzstream::in);
@@ -3326,13 +3357,14 @@ bool ReadFile_mcat(const string &file_mcat, map<string, size_t> &mapRS2cat,
// Read bimbam mean genotype file and calculate kinship matrix; this
// time, the kinship matrix is not centered, and can contain multiple
// K matrix.
-bool BimbamKin(const string &file_geno, const int display_pace,
- const vector<int> &indicator_idv,
- const vector<int> &indicator_snp,
- const map<string, double> &mapRS2weight,
- const map<string, size_t> &mapRS2cat,
- const vector<SNPINFO> &snpInfo, const gsl_matrix *W,
- gsl_matrix *matrix_kin, gsl_vector *vector_ns) {
+bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps,
+ const int display_pace,
+ const vector<int> &indicator_idv,
+ const vector<int> &indicator_snp,
+ const map<string, double> &mapRS2weight,
+ const map<string, size_t> &mapRS2cat,
+ const vector<SNPINFO> &snpInfo, const gsl_matrix *W,
+ gsl_matrix *matrix_kin, gsl_vector *vector_ns) {
igzstream infile(file_geno.c_str(), igzstream::in);
if (!infile) {
cout << "error reading genotype file:" << file_geno << endl;
@@ -3368,7 +3400,7 @@ bool BimbamKin(const string &file_geno, const int display_pace,
}
// Create a large matrix.
- size_t msize = 10000;
+ const size_t msize = K_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(ni_test, msize * n_vc);
gsl_matrix_set_zero(Xlarge);
@@ -3378,9 +3410,8 @@ bool BimbamKin(const string &file_geno, const int display_pace,
if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) {
ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1);
}
- if (indicator_snp[t] == 0) {
+ if (indicator_snp[t] == 0)
continue;
- }
ch_ptr = strtok((char *)line.c_str(), " , \t");
ch_ptr = strtok(NULL, " , \t");
@@ -3562,7 +3593,7 @@ bool PlinkKin(const string &file_bed, const int display_pace,
}
// Create a large matrix.
- size_t msize = 10000;
+ const size_t msize = K_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(ni_test, msize * n_vc);
gsl_matrix_set_zero(Xlarge);
@@ -3742,7 +3773,8 @@ bool PlinkKin(const string &file_bed, const int display_pace,
}
bool MFILEKin(const size_t mfile_mode, const string &file_mfile,
- const int display_pace, const vector<int> &indicator_idv,
+ const set<string> setKSnps, const int display_pace,
+ const vector<int> &indicator_idv,
const vector<vector<int>> &mindicator_snp,
const map<string, double> &mapRS2weight,
const map<string, size_t> &mapRS2cat,
@@ -3774,8 +3806,9 @@ bool MFILEKin(const size_t mfile_mode, const string &file_mfile,
PlinkKin(file_name, display_pace, indicator_idv, mindicator_snp[l],
mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp);
} else {
- BimbamKin(file_name, display_pace, indicator_idv, mindicator_snp[l],
- mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp);
+ BimbamKinUncentered(file_name, setKSnps, display_pace, indicator_idv,
+ mindicator_snp[l], mapRS2weight, mapRS2cat,
+ msnpInfo[l], W, kin_tmp, ns_tmp);
}
// Add ns.
diff --git a/src/io.h b/src/io.h
index 3e1145a..27f145f 100644
--- a/src/io.h
+++ b/src/io.h
@@ -34,7 +34,7 @@ void ProgressBar(string str, double p, double total);
void ProgressBar(string str, double p, double total, double ratio);
std::istream &safeGetline(std::istream &is, std::string &t);
-bool ReadFile_snps(const string &file_snps, set<string> &setSnps);
+bool ReadFile_snps(const string file_snps, set<string> &setSnps);
bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps);
bool ReadFile_log(const string &file_log, double &pheno_mean);
@@ -83,13 +83,14 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv,
void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U);
void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval);
-bool BimbamKin(const string &file_geno, vector<int> &indicator_snp,
- const int k_mode, const int display_pace,
- gsl_matrix *matrix_kin);
+bool BimbamKin(const string file_geno, const set<string> ksnps,
+ vector<int> &indicator_snp, const int k_mode,
+ const int display_pace, gsl_matrix *matrix_kin,
+ const bool test_nind);
bool PlinkKin(const string &file_bed, vector<int> &indicator_snp,
const int k_mode, const int display_pace, gsl_matrix *matrix_kin);
-bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv,
+bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv,
vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K,
const bool calc_K);
bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv,
@@ -124,13 +125,14 @@ bool ReadFile_catc(const string &file_cat,
bool ReadFile_mcatc(const string &file_mcat,
map<string, vector<double>> &mapRS2catc, size_t &n_cat);
-bool BimbamKin(const string &file_geno, const int display_pace,
- const vector<int> &indicator_idv,
- const vector<int> &indicator_snp,
- const map<string, double> &mapRS2weight,
- const map<string, size_t> &mapRS2cat,
- const vector<SNPINFO> &snpInfo, const gsl_matrix *W,
- gsl_matrix *matrix_kin, gsl_vector *vector_ns);
+bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps,
+ const int display_pace,
+ const vector<int> &indicator_idv,
+ const vector<int> &indicator_snp,
+ const map<string, double> &mapRS2weight,
+ const map<string, size_t> &mapRS2cat,
+ const vector<SNPINFO> &snpInfo, const gsl_matrix *W,
+ gsl_matrix *matrix_kin, gsl_vector *vector_ns);
bool PlinkKin(const string &file_bed, const int display_pace,
const vector<int> &indicator_idv,
const vector<int> &indicator_snp,
@@ -139,7 +141,8 @@ bool PlinkKin(const string &file_bed, const int display_pace,
const vector<SNPINFO> &snpInfo, const gsl_matrix *W,
gsl_matrix *matrix_kin, gsl_vector *vector_ns);
bool MFILEKin(const size_t mfile_mode, const string &file_mfile,
- const int display_pace, const vector<int> &indicator_idv,
+ const set<string> setKSnps, const int display_pace,
+ const vector<int> &indicator_idv,
const vector<vector<int>> &mindicator_snp,
const map<string, double> &mapRS2weight,
const map<string, size_t> &mapRS2cat,
diff --git a/src/lmm.cpp b/src/lmm.cpp
index 3f51073..eb76265 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -80,6 +80,7 @@ void LMM::CopyFromParam(PARAM &cPar) {
indicator_idv = cPar.indicator_idv;
indicator_snp = cPar.indicator_snp;
snpInfo = cPar.snpInfo;
+ setGWASnps = cPar.setGWASnps;
return;
}
@@ -165,6 +166,7 @@ void LMM::WriteFiles() {
}
}
} else {
+ bool process_gwasnps = setGWASnps.size();
outfile << "chr"
<< "\t"
<< "rs"
@@ -217,9 +219,13 @@ void LMM::WriteFiles() {
size_t t = 0;
for (size_t i = 0; i < snpInfo.size(); ++i) {
- if (indicator_snp[i] == 0) {
+
+ if (indicator_snp[i] == 0)
continue;
- }
+ auto snp = snpInfo[i].rs_number;
+ if (process_gwasnps && setGWASnps.count(snp) == 0)
+ continue;
+ // cout << t << endl;
outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
<< snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t"
@@ -1311,78 +1317,126 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval,
void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
const gsl_matrix *UtW, const gsl_vector *Uty,
- const gsl_matrix *W, const gsl_vector *y) {
- igzstream infile(file_geno.c_str(), igzstream::in);
- if (!infile) {
- cout << "error reading genotype file:" << file_geno << endl;
- return;
- }
-
+ const gsl_matrix *W, const gsl_vector *y,
+ const set<string> gwasnps) {
clock_t time_start = clock();
- string line;
- char *ch_ptr;
-
- double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0;
- double p_lrt = 0, p_score = 0;
- double logl_H1 = 0.0;
- int n_miss, c_phen;
- double geno, x_mean;
+ // LOCO support
+ bool process_gwasnps = gwasnps.size();
+ if (process_gwasnps)
+ debug_msg("AnalyzeBimbam w. LOCO");
// Calculate basic quantities.
size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2;
- gsl_vector *x = gsl_vector_alloc(U->size1);
- gsl_vector *x_miss = gsl_vector_alloc(U->size1);
+ const size_t inds = U->size1;
+ gsl_vector *x = gsl_vector_alloc(inds); // #inds
+ gsl_vector *x_miss = gsl_vector_alloc(inds);
gsl_vector *Utx = gsl_vector_alloc(U->size2);
gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index);
gsl_vector *ab = gsl_vector_alloc(n_index);
- // Create a large matrix.
- size_t msize = 10000;
- gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
- gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
- gsl_matrix_set_zero(Xlarge);
+ // Create a large matrix with LMM_BATCH_SIZE columns for batched processing
+ // const size_t msize=(process_gwasnps ? 1 : LMM_BATCH_SIZE);
+ const size_t msize = LMM_BATCH_SIZE;
+ gsl_matrix *Xlarge = gsl_matrix_alloc(inds, msize);
+ gsl_matrix *UtXlarge = gsl_matrix_alloc(inds, msize);
+ enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure
+ gsl_matrix_set_zero(Xlarge);
gsl_matrix_set_zero(Uab);
CalcUab(UtW, Uty, Uab);
// start reading genotypes and analyze
- size_t c = 0, t_last = 0;
- for (size_t t = 0; t < indicator_snp.size(); ++t) {
- if (indicator_snp[t] == 0) {
- continue;
+ size_t c = 0;
+
+ igzstream infile(file_geno.c_str(), igzstream::in);
+ enforce_msg(infile, "error reading genotype file");
+
+ auto batch_compute = [&](size_t l) { // using a C++ closure
+ // Compute SNPs in batch, note the computations are independent per SNP
+ gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l);
+ gsl_matrix_view UtXlarge_sub =
+ gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l);
+
+ time_start = clock();
+ eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0,
+ &UtXlarge_sub.matrix);
+ time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
+
+ gsl_matrix_set_zero(Xlarge);
+ for (size_t i = 0; i < l; i++) {
+ // for every batch...
+ gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
+ gsl_vector_memcpy(Utx, &UtXlarge_col.vector);
+
+ CalcUab(UtW, Uty, Utx, Uab);
+
+ time_start = clock();
+ FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0};
+
+ double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0;
+ double p_lrt = 0, p_score = 0;
+ double logl_H1 = 0.0;
+
+ // 3 is before 1.
+ if (a_mode == 3 || a_mode == 4) {
+ CalcRLScore(l_mle_null, param1, beta, se, p_score);
+ }
+
+ if (a_mode == 1 || a_mode == 4) {
+ // for univariate a_mode is 1
+ CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+ CalcRLWald(lambda_remle, param1, beta, se, p_wald);
+ }
+
+ if (a_mode == 2 || a_mode == 4) {
+ CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+ p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
+ }
+
+ time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
+
+ // Store summary data.
+ SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle,
+ p_wald, p_lrt, p_score};
+ sumStat.push_back(SNPs);
}
- t_last++;
- }
+ };
+
for (size_t t = 0; t < indicator_snp.size(); ++t) {
- !safeGetline(infile, line).eof();
+ // for every SNP
+ string line;
+ safeGetline(infile, line);
if (t % d_pace == 0 || t == (ns_total - 1)) {
ProgressBar("Reading SNPs ", t, ns_total - 1);
}
- if (indicator_snp[t] == 0) {
+ if (indicator_snp[t] == 0)
continue;
- }
- ch_ptr = strtok((char *)line.c_str(), " , \t");
+ char *ch_ptr = strtok((char *)line.c_str(), " , \t");
+ auto snp = string(ch_ptr);
+ // check whether SNP is included in gwasnps (used by LOCO)
+ if (process_gwasnps && gwasnps.count(snp) == 0)
+ continue;
ch_ptr = strtok(NULL, " , \t");
ch_ptr = strtok(NULL, " , \t");
- x_mean = 0.0;
- c_phen = 0;
- n_miss = 0;
+ double x_mean = 0.0;
+ int c_phen = 0;
+ int n_miss = 0;
gsl_vector_set_zero(x_miss);
for (size_t i = 0; i < ni_total; ++i) {
+ // get the genotypes per individual and compute stats per SNP
ch_ptr = strtok(NULL, " , \t");
- if (indicator_idv[i] == 0) {
+ if (indicator_idv[i] == 0)
continue;
- }
if (strcmp(ch_ptr, "NA") == 0) {
gsl_vector_set(x_miss, c_phen, 0.0);
n_miss++;
} else {
- geno = atof(ch_ptr);
+ double geno = atof(ch_ptr);
gsl_vector_set(x, c_phen, geno);
gsl_vector_set(x_miss, c_phen, 1.0);
@@ -1398,66 +1452,16 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
gsl_vector_set(x, i, x_mean);
}
}
-
+ // copy genotype values for SNP into Xlarge cache
gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize);
gsl_vector_memcpy(&Xlarge_col.vector, x);
- c++;
-
- if (c % msize == 0 || c == t_last) {
- size_t l = 0;
- if (c % msize == 0) {
- l = msize;
- } else {
- l = c % msize;
- }
-
- gsl_matrix_view Xlarge_sub =
- gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l);
- gsl_matrix_view UtXlarge_sub =
- gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l);
-
- time_start = clock();
- eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0,
- &UtXlarge_sub.matrix);
- time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
-
- gsl_matrix_set_zero(Xlarge);
+ c++; // count SNPs going in
- for (size_t i = 0; i < l; i++) {
- gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
- gsl_vector_memcpy(Utx, &UtXlarge_col.vector);
-
- CalcUab(UtW, Uty, Utx, Uab);
-
- time_start = clock();
- FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0};
-
- // 3 is before 1.
- if (a_mode == 3 || a_mode == 4) {
- CalcRLScore(l_mle_null, param1, beta, se, p_score);
- }
-
- if (a_mode == 1 || a_mode == 4) {
- CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle,
- logl_H1);
- CalcRLWald(lambda_remle, param1, beta, se, p_wald);
- }
-
- if (a_mode == 2 || a_mode == 4) {
- CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
- p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
- }
-
- time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
-
- // Store summary data.
- SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle,
- p_wald, p_lrt, p_score};
-
- sumStat.push_back(SNPs);
- }
- }
+ if (c == msize)
+ batch_compute(msize);
}
+ batch_compute(c % msize);
+ // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl;
cout << endl;
gsl_vector_free(x);
@@ -1505,7 +1509,7 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
gsl_vector *ab = gsl_vector_alloc(n_index);
// Create a large matrix.
- size_t msize = 10000;
+ size_t msize = LMM_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix_set_zero(Xlarge);
@@ -1528,9 +1532,8 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
size_t c = 0, t_last = 0;
for (size_t t = 0; t < snpInfo.size(); ++t) {
- if (indicator_snp[t] == 0) {
+ if (indicator_snp[t] == 0)
continue;
- }
t_last++;
}
for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) {
@@ -1697,7 +1700,7 @@ void LMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval,
gsl_vector *ab = gsl_vector_alloc(n_index);
// Create a large matrix.
- size_t msize = 10000;
+ size_t msize = LMM_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix_set_zero(Xlarge);
diff --git a/src/lmm.h b/src/lmm.h
index c393daf..4d57ab1 100644
--- a/src/lmm.h
+++ b/src/lmm.h
@@ -26,6 +26,8 @@
using namespace std;
+#define LMM_BATCH_SIZE 10000 // used for batch processing
+
class FUNC_PARAM {
public:
@@ -78,6 +80,7 @@ public:
vector<int> indicator_snp;
vector<SNPINFO> snpInfo; // Record SNP information.
+ set<string> setGWASnps; // Record SNP information.
// Not included in PARAM.
vector<SUMSTAT> sumStat; // Output SNPSummary Data.
@@ -97,7 +100,8 @@ public:
const gsl_matrix *W, const gsl_vector *y);
void AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
const gsl_matrix *UtW, const gsl_vector *Uty,
- const gsl_matrix *W, const gsl_vector *y);
+ const gsl_matrix *W, const gsl_vector *y,
+ const set<string> gwasnps);
void AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval,
const gsl_matrix *UtW, const gsl_vector *Uty,
const gsl_matrix *W, const gsl_vector *y,
diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp
index f1ab3fc..eb591ca 100644
--- a/src/mvlmm.cpp
+++ b/src/mvlmm.cpp
@@ -2974,7 +2974,7 @@ void MVLMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval,
string line;
// Create a large matrix.
- size_t msize = 10000;
+ size_t msize = LMM_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix_set_zero(Xlarge);
@@ -3531,7 +3531,7 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
size_t dc_size = d_size * (c_size + 1), v_size = d_size * (d_size + 1) / 2;
// Create a large matrix.
- size_t msize = 10000;
+ size_t msize = LMM_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix_set_zero(Xlarge);
@@ -3968,7 +3968,7 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
size_t dc_size = d_size * (c_size + 1), v_size = d_size * (d_size + 1) / 2;
// Create a large matrix.
- size_t msize = 10000;
+ size_t msize = LMM_BATCH_SIZE;
gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize);
gsl_matrix_set_zero(Xlarge);
diff --git a/src/param.cpp b/src/param.cpp
index 2572bbb..6ab4339 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -38,6 +38,54 @@
using namespace std;
+// ---- Helper functions which do not use the PARAM scope
+
+// Calculate the SNP sets as used in LOCO from mapchr which was filled
+// from the annotation file. Returns ksnps (used for K) and gwasnps (used for
+// GWA). Both are complimentary with each other and subsets of setSnps.
+
+void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps,
+ const map<string, string> mapchr, const string loco) {
+ enforce_msg(ksnps.size() == 0, "make sure knsps is not initialized twice");
+ enforce_msg(gwasnps.size() == 0,
+ "make sure gwasnps is not initialized twice");
+ for (auto &kv : mapchr) {
+ auto snp = kv.first;
+ auto chr = kv.second;
+ if (chr != loco) {
+ ksnps.insert(snp);
+ } else {
+ gwasnps.insert(snp);
+ }
+ }
+}
+
+// Trim #individuals to size which is used to write tests that run faster
+//
+// Note it actually trims the number of functional individuals
+// (indicator_idv[x] == 1). This should match indicator_cvt etc. If
+// this gives problems with certain sets we can simply trim to size.
+
+void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) {
+ if (ni_max) {
+ size_t count = 0;
+ for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) {
+ if (*ind)
+ count++;
+ if (count >= ni_max)
+ break;
+ }
+ if (count != idvs.size()) {
+ if (debug)
+ cout << "**** TEST MODE: trim individuals from " << idvs.size()
+ << " to " << count << endl;
+ idvs.resize(count);
+ }
+ }
+}
+
+// ---- PARAM class implementation
+
PARAM::PARAM(void)
: mode_silence(false), a_mode(0), k_mode(1), d_pace(100000),
file_out("result"), path_out("./output/"), miss_level(0.05),
@@ -96,6 +144,15 @@ void PARAM::ReadFiles(void) {
setSnps.clear();
}
+ // Read KSNP set.
+ if (!file_ksnps.empty()) {
+ if (ReadFile_snps(file_ksnps, setKSnps) == false) {
+ error = true;
+ }
+ } else {
+ setKSnps.clear();
+ }
+
// For prediction.
if (!file_epm.empty()) {
if (ReadFile_est(file_epm, est_column, mapRS2est) == false) {
@@ -164,6 +221,7 @@ void PARAM::ReadFiles(void) {
} else {
n_cvt = 1;
}
+ trim_individuals(indicator_cvt, ni_max, mode_debug);
if (!file_gxe.empty()) {
if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) {
@@ -176,6 +234,8 @@ void PARAM::ReadFiles(void) {
}
}
+ trim_individuals(indicator_idv, ni_max, mode_debug);
+
// WJA added.
// Read genotype and phenotype file for bgen format.
if (!file_oxford.empty()) {
@@ -273,12 +333,15 @@ void PARAM::ReadFiles(void) {
gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt);
CopyCvt(W);
+ trim_individuals(indicator_idv, ni_max, mode_debug);
+ trim_individuals(indicator_cvt, ni_max, mode_debug);
if (ReadFile_geno(file_geno, setSnps, W, indicator_idv, indicator_snp,
maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) {
error = true;
}
gsl_matrix_free(W);
+
ns_total = indicator_snp.size();
}
@@ -457,6 +520,11 @@ void PARAM::ReadFiles(void) {
// ni_test, save all useful covariates.
ProcessCvtPhen();
}
+
+ // Compute setKSnps when -loco is passed in
+ if (!loco.empty()) {
+ LOCO_set_Snps(setKSnps, setGWASnps, mapRS2chr, loco);
+ }
return;
}
@@ -869,22 +937,22 @@ void PARAM::CheckParam(void) {
error = true;
}
- str = file_snps;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open snps file: " << str << endl;
- error = true;
- }
-
- str = file_log;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open log file: " << str << endl;
- error = true;
- }
+ enforce_fexists(file_snps, "open file");
+ enforce_fexists(file_ksnps, "open file");
+ enforce_fexists(file_gwasnps, "open file");
+ enforce_fexists(file_log, "open file");
+ enforce_fexists(file_anno, "open file");
- str = file_anno;
- if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) {
- cout << "error! fail to open annotation file: " << str << endl;
- error = true;
+ if (!loco.empty()) {
+ enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 21 || a_mode == 22,
+ "LOCO only works with LMM and K");
+ enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)");
+ enforce_msg(file_oxford.empty(), "LOCO does not work with Oxford (yet)");
+ enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)");
+ enforce_msg(!file_anno.empty(),
+ "LOCO requires annotation file (-a switch)");
+ enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch");
+ enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch");
}
str = file_kin;
@@ -1005,7 +1073,7 @@ void PARAM::CheckData(void) {
(indicator_cvt).size() != (indicator_idv).size()) {
error = true;
cout << "error! number of rows in the covariates file do not "
- << "match the number of individuals. " << endl;
+ << "match the number of individuals. " << indicator_cvt.size() << endl;
return;
}
if ((indicator_gxe).size() != 0 &&
@@ -1123,7 +1191,15 @@ void PARAM::CheckData(void) {
if (!file_gene.empty()) {
cout << "## number of total genes = " << ng_total << endl;
} else if (file_epm.empty() && a_mode != 43 && a_mode != 5) {
- cout << "## number of total SNPs = " << ns_total << endl;
+ if (!loco.empty())
+ cout << "## leave one chromosome out (LOCO) = " << loco << endl;
+ cout << "## number of total SNPs = " << ns_total << endl;
+ if (setSnps.size())
+ cout << "## number of considered SNPS = " << setSnps.size() << endl;
+ if (setKSnps.size())
+ cout << "## number of SNPS for K = " << setKSnps.size() << endl;
+ if (setGWASnps.size())
+ cout << "## number of SNPS for GWAS = " << setGWASnps.size() << endl;
cout << "## number of analyzed SNPs = " << ns_test << endl;
} else {
}
@@ -1297,20 +1373,22 @@ void PARAM::CalcKin(gsl_matrix *matrix_kin) {
if (!file_bfile.empty()) {
file_str = file_bfile + ".bed";
+ enforce_msg(loco.empty(), "FIXME: LOCO nyi");
if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
false) {
error = true;
}
} else if (!file_oxford.empty()) {
file_str = file_oxford + ".bgen";
+ enforce_msg(loco.empty(), "FIXME: LOCO nyi");
if (bgenKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
false) {
error = true;
}
} else {
file_str = file_geno;
- if (BimbamKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
- false) {
+ if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,
+ matrix_kin, ni_max == 0) == false) {
error = true;
}
}
@@ -1720,37 +1798,43 @@ void PARAM::CalcS(const map<string, double> &mapRS2wA,
} else if (!file_geno.empty()) {
file_str = file_geno;
if (mapRS2wA.size() == 0) {
- if (BimbamKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK,
- mapRS2cat, snpInfo, W, K, ns) == false) {
+ if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv,
+ indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K,
+ ns) == false) {
error = true;
}
} else {
- if (BimbamKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA,
- mapRS2cat, snpInfo, W, A, ns) == false) {
+ if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv,
+ indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A,
+ ns) == false) {
error = true;
}
}
} else if (!file_mbfile.empty()) {
if (mapRS2wA.size() == 0) {
- if (MFILEKin(1, file_mbfile, d_pace, indicator_idv, mindicator_snp,
- mapRS2wK, mapRS2cat, msnpInfo, W, K, ns) == false) {
+ if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv,
+ mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K,
+ ns) == false) {
error = true;
}
} else {
- if (MFILEKin(1, file_mbfile, d_pace, indicator_idv, mindicator_snp,
- mapRS2wA, mapRS2cat, msnpInfo, W, A, ns) == false) {
+ if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv,
+ mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A,
+ ns) == false) {
error = true;
}
}
} else if (!file_mgeno.empty()) {
if (mapRS2wA.size() == 0) {
- if (MFILEKin(0, file_mgeno, d_pace, indicator_idv, mindicator_snp,
- mapRS2wK, mapRS2cat, msnpInfo, W, K, ns) == false) {
+ if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv,
+ mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K,
+ ns) == false) {
error = true;
}
} else {
- if (MFILEKin(0, file_mgeno, d_pace, indicator_idv, mindicator_snp,
- mapRS2wA, mapRS2cat, msnpInfo, W, A, ns) == false) {
+ if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv,
+ mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A,
+ ns) == false) {
error = true;
}
}
diff --git a/src/param.h b/src/param.h
index 33e2431..45d8c0f 100644
--- a/src/param.h
+++ b/src/param.h
@@ -19,12 +19,15 @@
#ifndef __PARAM_H__
#define __PARAM_H__
+#include "debug.h"
#include "gsl/gsl_matrix.h"
#include "gsl/gsl_vector.h"
#include <map>
#include <set>
#include <vector>
+#define K_BATCH_SIZE 10000 // #snps used for batched K
+
using namespace std;
class SNPINFO {
@@ -110,6 +113,7 @@ class PARAM {
public:
// IO-related parameters.
bool mode_silence;
+ bool mode_debug = false;
int a_mode; // Analysis mode, 1/2/3/4 for Frequentist tests
int k_mode; // Kinship read mode: 1: n by n matrix, 2: id/id/k_value;
vector<size_t> p_column; // Which phenotype column needs analysis.
@@ -135,12 +139,14 @@ public:
string file_bf, file_hyp;
string path_out;
- string file_epm; // Estimated parameter file.
- string file_ebv; // Estimated breeding value file.
- string file_log; // Log file containing mean estimate.
- string file_read; // File containing total number of reads.
- string file_gene; // Gene expression file.
- string file_snps; // File containing analyzed SNPs or genes.
+ string file_epm; // Estimated parameter file.
+ string file_ebv; // Estimated breeding value file.
+ string file_log; // Log file containing mean estimate.
+ string file_read; // File containing total number of reads.
+ string file_gene; // Gene expression file.
+ string file_snps; // File containing analyzed SNPs or genes.
+ string file_ksnps; // File SNPs for computing K
+ string file_gwasnps; // File SNPs for computing GWAS
// WJA added.
string file_oxford;
@@ -152,6 +158,7 @@ public:
double r2_level;
// LMM-related parameters.
+ string loco;
double l_min;
double l_max;
size_t n_region;
@@ -215,6 +222,7 @@ public:
// Number of individuals.
size_t ni_total, ni_test, ni_cvt, ni_study, ni_ref;
+ size_t ni_max = 0; // -nind switch for testing purposes
// Number of observed and missing phenotypes.
size_t np_obs, np_miss;
@@ -305,7 +313,9 @@ public:
vector<SNPINFO> snpInfo; // Record SNP information.
vector<vector<SNPINFO>> msnpInfo; // Record SNP information.
- set<string> setSnps; // Set of snps for analysis.
+ set<string> setSnps; // Set of snps for analysis (-snps).
+ set<string> setKSnps; // Set of snps for K (-ksnps and LOCO)
+ set<string> setGWASnps; // Set of snps for GWA (-gwasnps and LOCO)
// Constructor.
PARAM();
@@ -351,4 +361,10 @@ public:
size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt);
+// Helpers for checking parameters
+#define enforce_fexists(fn, msg) \
+ if (!fn.empty()) \
+ enforce_msg(stat(fn.c_str(), &fileInfo) == 0, \
+ ((std::string(__STRING(fn)) + ": " + msg).c_str()));
+
#endif
diff --git a/src/vc.h b/src/vc.h
index c6f66b4..49397bb 100644
--- a/src/vc.h
+++ b/src/vc.h
@@ -40,6 +40,8 @@ public:
bool noconstrain;
};
+// Methods for variant component (VC) estimation
+
class VC {
public: