diff options
author | Pjotr Prins | 2017-08-03 10:26:52 +0000 |
---|---|---|
committer | Pjotr Prins | 2017-08-03 10:26:52 +0000 |
commit | 449d882a3b33ef81ef4f0127c3932b01fa796dbb (patch) | |
tree | 63a4031267b10f587b695adb487aca5213889b20 /src | |
parent | d8db988550d4cd0303f0b82a75499c2c94d97d45 (diff) | |
download | pangemma-449d882a3b33ef81ef4f0127c3932b01fa796dbb.tar.gz |
LOCO is implemented in GEMMA for the BIMBAM format. Pass in the -loco
1 switch for LOCO of chromosome 1.
What are the use cases?
1. User runs vanilla GEMMA: all SNPs are considered input for GWA and K
2. User passes in -snps: all these SNPs are considered for GWA and K
3. User passes in -snps and -ksnps: All these SNPs are used for GWA,
Ksnps are used for K
4. User passes in -loco: SNPs are split by chromosome (GWA incl., K
excl.)
5. User passes in -snps, -gwasnps and -ksnps could mean that also GWA
is subset explicitely (nyi)
In all cases indicator_snp is honored and we get the most flexible way for
studying SNP combinations that can be passed in in different ways.
Overall added:
- various comments in source code
- tests in test framework inlc. fast-check
- NDEBUG compilation support in the Makefile
- -debug switch for GEMMA debug output
- debug.h which includes enforce functions which work like
assert. Unlike assert, enforce also works in release compilation
- -nind switch limit the number of individuals used
(trim_individuals for testing)
- enforcing tests of input files - e.g. are number of individuals correct
- checks for memory allocation - we should add more of those
- more checks for gsl results - we should add more of those
- replaced strtoken with regex as a first case. They should all be
replaced. strtoken is not thread safe, for one.
- introduced C++ iterators
- introduced C++ closure in BimBam LMM for cached processing
- more localized initialization of variables - makes for demonstratably
more correct code
- -ksnps adds snps into setKSnps
- -gwasnps adds snps into setGWASnps
- both sets are computed by -loco
- attempted to make the code easier to read
Diffstat (limited to 'src')
-rw-r--r-- | src/debug.h | 46 | ||||
-rw-r--r-- | src/gemma.cpp | 49 | ||||
-rw-r--r-- | src/io.cpp | 117 | ||||
-rw-r--r-- | src/io.h | 29 | ||||
-rw-r--r-- | src/lmm.cpp | 203 | ||||
-rw-r--r-- | src/lmm.h | 6 | ||||
-rw-r--r-- | src/mvlmm.cpp | 6 | ||||
-rw-r--r-- | src/param.cpp | 146 | ||||
-rw-r--r-- | src/param.h | 30 | ||||
-rw-r--r-- | src/vc.h | 2 |
10 files changed, 431 insertions, 203 deletions
diff --git a/src/debug.h b/src/debug.h new file mode 100644 index 0000000..84637e1 --- /dev/null +++ b/src/debug.h @@ -0,0 +1,46 @@ +#ifndef __DEBUG_H__ +#define __DEBUG_H__ + +// enforce works like assert but also when NDEBUG is set (i.e., it +// always works). enforce_msg prints message instead of expr + +#if defined NDEBUG +#define debug_msg(msg) +#else +#define debug_msg(msg) cout << "**** DEBUG: " << msg << endl; +#endif + +/* This prints an "Assertion failed" message and aborts. */ +extern "C" void __assert_fail(const char *__assertion, const char *__file, + unsigned int __line, + const char *__function) __THROW + __attribute__((__noreturn__)); + +#define __ASSERT_FUNCTION __PRETTY_FUNCTION__ + +#define enforce(expr) \ + ((expr) \ + ? __ASSERT_VOID_CAST(0) \ + : __assert_fail(__STRING(expr), __FILE__, __LINE__, __ASSERT_FUNCTION)) + +#define enforce_msg(expr, msg) \ + ((expr) ? __ASSERT_VOID_CAST(0) \ + : __assert_fail(msg, __FILE__, __LINE__, __ASSERT_FUNCTION)) + +#define enforce_str(expr, msg) \ + ((expr) \ + ? __ASSERT_VOID_CAST(0) \ + : __assert_fail((msg).c_str(), __FILE__, __LINE__, __ASSERT_FUNCTION)) + +// Helpers to create a unique varname per MACRO +#define COMBINE1(X, Y) X##Y +#define COMBINE(X, Y) COMBINE1(X, Y) + +#define enforce_gsl(expr) \ + auto COMBINE(res, __LINE__) = (expr); \ + (COMBINE(res, __LINE__) == 0 \ + ? __ASSERT_VOID_CAST(0) \ + : __assert_fail(gsl_strerror(COMBINE(res, __LINE__)), __FILE__, \ + __LINE__, __ASSERT_FUNCTION)) + +#endif diff --git a/src/gemma.cpp b/src/gemma.cpp index c72475b..0fb8c54 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -151,6 +151,7 @@ void GEMMA::PrintHelp(size_t option) { cout << " 11: obtain predicted values" << endl; cout << " 12: calculate snp variance covariance" << endl; cout << " 13: note" << endl; + cout << " 14: debug options" << endl; cout << endl; } @@ -446,7 +447,16 @@ void GEMMA::PrintHelp(size_t option) { } if (option == 4) { - cout << " RELATEDNESS MATRIX CALCULATION OPTIONS" << endl; + cout << " RELATEDNESS MATRIX (K) CALCULATION OPTIONS" << endl; + cout << " -ksnps [filename] " + << " specify input snps file name to compute K" << endl; + cout << " -loco [chr] " + << " leave one chromosome out (LOCO) by name (requires -a annotation " + "file)" + << endl; + cout << " -a [filename] " + << " specify input BIMBAM SNP annotation file name (LOCO only)" + << endl; cout << " -gk [num] " << " specify which type of kinship/relatedness matrix to generate " "(default 1)" @@ -682,6 +692,14 @@ void GEMMA::PrintHelp(size_t option) { cout << endl; } + if (option == 14) { + cout << " DEBUG OPTIONS" << endl; + cout << " -silence silent terminal display" << endl; + cout << " -debug debug output" << endl; + cout << " -nind [num] read up to num individuals" << endl; + cout << endl; + } + return; } @@ -1008,7 +1026,7 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { str.clear(); str.assign(argv[i]); cPar.k_mode = atoi(str.c_str()); - } else if (strcmp(argv[i], "-n") == 0) { + } else if (strcmp(argv[i], "-n") == 0) { // set pheno column (range) (cPar.p_column).clear(); while (argv[i + 1] != NULL && argv[i + 1][0] != '-') { ++i; @@ -1076,6 +1094,12 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { cPar.r2_level = atof(str.c_str()); } else if (strcmp(argv[i], "-notsnp") == 0) { cPar.maf_level = -1; + } else if (strcmp(argv[i], "-loco") == 0) { + assert(argv[i + 1]); + ++i; + str.clear(); + str.assign(argv[i]); + cPar.loco = str; } else if (strcmp(argv[i], "-gk") == 0) { if (cPar.a_mode != 0) { cPar.error = true; @@ -1315,6 +1339,14 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { str.clear(); str.assign(argv[i]); cPar.nr_iter = atoi(str.c_str()); + } else if (strcmp(argv[i], "-nind") == 0) { + if (argv[i + 1] == NULL || argv[i + 1][0] == '-') { + continue; + } + ++i; + str.clear(); + str.assign(argv[i]); + cPar.ni_max = atoi(str.c_str()); // for testing purposes } else if (strcmp(argv[i], "-emp") == 0) { if (argv[i + 1] == NULL || argv[i + 1][0] == '-') { continue; @@ -1533,6 +1565,8 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { str.clear(); str.assign(argv[i]); cPar.window_ns = atoi(str.c_str()); + } else if (strcmp(argv[i], "-debug") == 0) { + cPar.mode_debug = true; } else { cout << "error! unrecognized option: " << argv[i] << endl; cPar.error = true; @@ -1808,14 +1842,16 @@ void GEMMA::BatchRun(PARAM &cPar) { gsl_matrix_free(H_full); } - // Generate Kinship matrix + // Generate Kinship matrix (optionally using LOCO) if (cPar.a_mode == 21 || cPar.a_mode == 22) { cout << "Calculating Relatedness Matrix ... " << endl; gsl_matrix *G = gsl_matrix_alloc(cPar.ni_total, cPar.ni_total); + enforce_msg(G, "allocate G"); // just to be sure time_start = clock(); cPar.CalcKin(G); + cPar.time_G = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); if (cPar.error == true) { cout << "error! fail to calculate relatedness matrix. " << endl; @@ -2613,6 +2649,7 @@ return;} cPar.a_mode == 4 || cPar.a_mode == 5 || cPar.a_mode == 31) { // Fit LMM or mvLMM or eigen gsl_matrix *Y = gsl_matrix_alloc(cPar.ni_test, cPar.n_ph); + enforce_msg(Y, "allocate Y"); // just to be sure gsl_matrix *W = gsl_matrix_alloc(Y->size1, cPar.n_cvt); gsl_matrix *B = gsl_matrix_alloc(Y->size2, W->size2); // B is a d by c // matrix @@ -2749,7 +2786,7 @@ return;} CalcUtX(U, Y, UtY); // calculate REMLE/MLE estimate and pve for univariate model - if (cPar.n_ph == 1) { + if (cPar.n_ph == 1) { // one phenotype gsl_vector_view beta = gsl_matrix_row(B, 0); gsl_vector_view se_beta = gsl_matrix_row(se_B, 0); gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); @@ -2819,7 +2856,7 @@ return;} } } - // Fit LMM or mvLMM + // Fit LMM or mvLMM (w. LOCO) if (cPar.a_mode == 1 || cPar.a_mode == 2 || cPar.a_mode == 3 || cPar.a_mode == 4) { if (cPar.n_ph == 1) { @@ -2844,7 +2881,7 @@ return;} } else { if (cPar.file_gxe.empty()) { cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector); + &Y_col.vector, cPar.setGWASnps); } else { cLmm.AnalyzeBimbamGXE(U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, env); @@ -25,6 +25,7 @@ #include <iomanip> #include <iostream> #include <map> +#include <regex> #include <set> #include <sstream> #include <stdio.h> @@ -132,7 +133,7 @@ std::istream &safeGetline(std::istream &is, std::string &t) { } // Read SNP file. -bool ReadFile_snps(const string &file_snps, set<string> &setSnps) { +bool ReadFile_snps(const string file_snps, set<string> &setSnps) { setSnps.clear(); igzstream infile(file_snps.c_str(), igzstream::in); @@ -654,6 +655,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, major = ch_ptr; if (setSnps.size() != 0 && setSnps.count(rs) == 0) { + // if SNP in geno but not in -snps we add an missing value SNPINFO sInfo = {"-9", rs, -9, -9, minor, major, 0, -9, -9, 0, 0, file_pos}; snpInfo.push_back(sInfo); @@ -684,9 +686,8 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, gsl_vector_set_zero(genotype_miss); for (int i = 0; i < ni_total; ++i) { ch_ptr = strtok(NULL, " , \t"); - if (indicator_idv[i] == 0) { + if (indicator_idv[i] == 0) continue; - } if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(genotype_miss, c_idv, 1); @@ -1334,55 +1335,78 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { } // Read bimbam mean genotype file and calculate kinship matrix. -bool BimbamKin(const string &file_geno, vector<int> &indicator_snp, - const int k_mode, const int display_pace, - gsl_matrix *matrix_kin) { +bool BimbamKin(const string file_geno, const set<string> ksnps, + vector<int> &indicator_snp, const int k_mode, + const int display_pace, gsl_matrix *matrix_kin, + const bool test_nind) { igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return false; - } - - string line; - char *ch_ptr; + enforce_msg(infile, "error reading genotype file"); size_t n_miss; double d, geno_mean, geno_var; + // setKSnp and/or LOCO support + bool process_ksnps = ksnps.size(); + size_t ni_total = matrix_kin->size1; gsl_vector *geno = gsl_vector_alloc(ni_total); gsl_vector *geno_miss = gsl_vector_alloc(ni_total); - // Create a large matrix. - size_t msize = 10000; + // Xlarge contains inds x markers + const size_t msize = K_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(ni_total, msize); + enforce_msg(Xlarge, "allocate Xlarge"); + gsl_matrix_set_zero(Xlarge); + // For every SNP read the genotype per individual size_t ns_test = 0; for (size_t t = 0; t < indicator_snp.size(); ++t) { + string line; !safeGetline(infile, line).eof(); if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); } - if (indicator_snp[t] == 0) { + if (indicator_snp[t] == 0) continue; - } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + std::regex_token_iterator<std::string::iterator> rend; + regex split_on("[,[:blank:]]+"); + regex_token_iterator<string::iterator> tokens(line.begin(), line.end(), + split_on, -1); + if (test_nind) { + // ascertain the number of genotype fields match + uint token_num = 0; + for (auto x = tokens; x != rend; x++) + token_num++; + enforce_str(token_num == ni_total + 3, line + " count fields"); + } + + auto snp = *tokens; // first field + // check whether SNP is included in ksnps (used by LOCO) + if (process_ksnps && ksnps.count(snp) == 0) + continue; + + tokens++; // skip nucleotide fields + tokens++; // skip nucleotide fields + // calc SNP stats geno_mean = 0.0; n_miss = 0; geno_var = 0.0; gsl_vector_set_all(geno_miss, 0); for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); - if (strcmp(ch_ptr, "NA") == 0) { + tokens++; + enforce_str(tokens != rend, line + " number of fields"); + string field = *tokens; + if (field == "NA") { gsl_vector_set(geno_miss, i, 0); n_miss++; } else { - d = atof(ch_ptr); + d = stod(field); + // make sure genotype field contains a number + if (field != "0" && field != "0.0") + enforce_str(d != 0.0f, field); gsl_vector_set(geno, i, d); gsl_vector_set(geno_miss, i, 1); geno_mean += d; @@ -1406,23 +1430,28 @@ bool BimbamKin(const string &file_geno, vector<int> &indicator_snp, if (k_mode == 2 && geno_var != 0) { gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); } + // set the SNP column ns_test gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); - gsl_vector_memcpy(&Xlarge_col.vector, geno); + enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); ns_test++; + // compute kinship matrix and return in matrix_kin a SNP at a time if (ns_test % msize == 0) { eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } - if (ns_test % msize != 0) { eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); } cout << endl; - gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test); + // scale the kinship matrix + enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test)); + + // and transpose + // FIXME: the following is very slow for (size_t i = 0; i < ni_total; ++i) { for (size_t j = 0; j < i; ++j) { @@ -1430,6 +1459,8 @@ bool BimbamKin(const string &file_geno, vector<int> &indicator_snp, gsl_matrix_set(matrix_kin, i, j, d); } } + // GSL is faster - and there are even faster methods + // enforce_gsl(gsl_matrix_transpose(matrix_kin)); gsl_vector_free(geno); gsl_vector_free(geno_miss); @@ -1463,7 +1494,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, int n_bit; // Create a large matrix. - size_t msize = 10000; + const size_t msize = K_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(ni_total, msize); gsl_matrix_set_zero(Xlarge); @@ -1583,7 +1614,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, // Read bimbam mean genotype file, the second time, recode "mean" // genotype and calculate K. -bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, +bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { igzstream infile(file_geno.c_str(), igzstream::in); @@ -3326,13 +3357,14 @@ bool ReadFile_mcat(const string &file_mcat, map<string, size_t> &mapRS2cat, // Read bimbam mean genotype file and calculate kinship matrix; this // time, the kinship matrix is not centered, and can contain multiple // K matrix. -bool BimbamKin(const string &file_geno, const int display_pace, - const vector<int> &indicator_idv, - const vector<int> &indicator_snp, - const map<string, double> &mapRS2weight, - const map<string, size_t> &mapRS2cat, - const vector<SNPINFO> &snpInfo, const gsl_matrix *W, - gsl_matrix *matrix_kin, gsl_vector *vector_ns) { +bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, + const int display_pace, + const vector<int> &indicator_idv, + const vector<int> &indicator_snp, + const map<string, double> &mapRS2weight, + const map<string, size_t> &mapRS2cat, + const vector<SNPINFO> &snpInfo, const gsl_matrix *W, + gsl_matrix *matrix_kin, gsl_vector *vector_ns) { igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { cout << "error reading genotype file:" << file_geno << endl; @@ -3368,7 +3400,7 @@ bool BimbamKin(const string &file_geno, const int display_pace, } // Create a large matrix. - size_t msize = 10000; + const size_t msize = K_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(ni_test, msize * n_vc); gsl_matrix_set_zero(Xlarge); @@ -3378,9 +3410,8 @@ bool BimbamKin(const string &file_geno, const int display_pace, if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); } - if (indicator_snp[t] == 0) { + if (indicator_snp[t] == 0) continue; - } ch_ptr = strtok((char *)line.c_str(), " , \t"); ch_ptr = strtok(NULL, " , \t"); @@ -3562,7 +3593,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, } // Create a large matrix. - size_t msize = 10000; + const size_t msize = K_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(ni_test, msize * n_vc); gsl_matrix_set_zero(Xlarge); @@ -3742,7 +3773,8 @@ bool PlinkKin(const string &file_bed, const int display_pace, } bool MFILEKin(const size_t mfile_mode, const string &file_mfile, - const int display_pace, const vector<int> &indicator_idv, + const set<string> setKSnps, const int display_pace, + const vector<int> &indicator_idv, const vector<vector<int>> &mindicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, @@ -3774,8 +3806,9 @@ bool MFILEKin(const size_t mfile_mode, const string &file_mfile, PlinkKin(file_name, display_pace, indicator_idv, mindicator_snp[l], mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); } else { - BimbamKin(file_name, display_pace, indicator_idv, mindicator_snp[l], - mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); + BimbamKinUncentered(file_name, setKSnps, display_pace, indicator_idv, + mindicator_snp[l], mapRS2weight, mapRS2cat, + msnpInfo[l], W, kin_tmp, ns_tmp); } // Add ns. @@ -34,7 +34,7 @@ void ProgressBar(string str, double p, double total); void ProgressBar(string str, double p, double total, double ratio); std::istream &safeGetline(std::istream &is, std::string &t); -bool ReadFile_snps(const string &file_snps, set<string> &setSnps); +bool ReadFile_snps(const string file_snps, set<string> &setSnps); bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps); bool ReadFile_log(const string &file_log, double &pheno_mean); @@ -83,13 +83,14 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv, void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); -bool BimbamKin(const string &file_geno, vector<int> &indicator_snp, - const int k_mode, const int display_pace, - gsl_matrix *matrix_kin); +bool BimbamKin(const string file_geno, const set<string> ksnps, + vector<int> &indicator_snp, const int k_mode, + const int display_pace, gsl_matrix *matrix_kin, + const bool test_nind); bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin); -bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, +bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, @@ -124,13 +125,14 @@ bool ReadFile_catc(const string &file_cat, bool ReadFile_mcatc(const string &file_mcat, map<string, vector<double>> &mapRS2catc, size_t &n_cat); -bool BimbamKin(const string &file_geno, const int display_pace, - const vector<int> &indicator_idv, - const vector<int> &indicator_snp, - const map<string, double> &mapRS2weight, - const map<string, size_t> &mapRS2cat, - const vector<SNPINFO> &snpInfo, const gsl_matrix *W, - gsl_matrix *matrix_kin, gsl_vector *vector_ns); +bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, + const int display_pace, + const vector<int> &indicator_idv, + const vector<int> &indicator_snp, + const map<string, double> &mapRS2weight, + const map<string, size_t> &mapRS2cat, + const vector<SNPINFO> &snpInfo, const gsl_matrix *W, + gsl_matrix *matrix_kin, gsl_vector *vector_ns); bool PlinkKin(const string &file_bed, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, @@ -139,7 +141,8 @@ bool PlinkKin(const string &file_bed, const int display_pace, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns); bool MFILEKin(const size_t mfile_mode, const string &file_mfile, - const int display_pace, const vector<int> &indicator_idv, + const set<string> setKSnps, const int display_pace, + const vector<int> &indicator_idv, const vector<vector<int>> &mindicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, diff --git a/src/lmm.cpp b/src/lmm.cpp index 3f51073..eb76265 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -80,6 +80,7 @@ void LMM::CopyFromParam(PARAM &cPar) { indicator_idv = cPar.indicator_idv; indicator_snp = cPar.indicator_snp; snpInfo = cPar.snpInfo; + setGWASnps = cPar.setGWASnps; return; } @@ -165,6 +166,7 @@ void LMM::WriteFiles() { } } } else { + bool process_gwasnps = setGWASnps.size(); outfile << "chr" << "\t" << "rs" @@ -217,9 +219,13 @@ void LMM::WriteFiles() { size_t t = 0; for (size_t i = 0; i < snpInfo.size(); ++i) { - if (indicator_snp[i] == 0) { + + if (indicator_snp[i] == 0) continue; - } + auto snp = snpInfo[i].rs_number; + if (process_gwasnps && setGWASnps.count(snp) == 0) + continue; + // cout << t << endl; outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" @@ -1311,78 +1317,126 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval, void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y) { - igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return; - } - + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps) { clock_t time_start = clock(); - string line; - char *ch_ptr; - - double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0; - double p_lrt = 0, p_score = 0; - double logl_H1 = 0.0; - int n_miss, c_phen; - double geno, x_mean; + // LOCO support + bool process_gwasnps = gwasnps.size(); + if (process_gwasnps) + debug_msg("AnalyzeBimbam w. LOCO"); // Calculate basic quantities. size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; - gsl_vector *x = gsl_vector_alloc(U->size1); - gsl_vector *x_miss = gsl_vector_alloc(U->size1); + const size_t inds = U->size1; + gsl_vector *x = gsl_vector_alloc(inds); // #inds + gsl_vector *x_miss = gsl_vector_alloc(inds); gsl_vector *Utx = gsl_vector_alloc(U->size2); gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index); gsl_vector *ab = gsl_vector_alloc(n_index); - // Create a large matrix. - size_t msize = 10000; - gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix_set_zero(Xlarge); + // Create a large matrix with LMM_BATCH_SIZE columns for batched processing + // const size_t msize=(process_gwasnps ? 1 : LMM_BATCH_SIZE); + const size_t msize = LMM_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_alloc(inds, msize); + gsl_matrix *UtXlarge = gsl_matrix_alloc(inds, msize); + enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure + gsl_matrix_set_zero(Xlarge); gsl_matrix_set_zero(Uab); CalcUab(UtW, Uty, Uab); // start reading genotypes and analyze - size_t c = 0, t_last = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (indicator_snp[t] == 0) { - continue; + size_t c = 0; + + igzstream infile(file_geno.c_str(), igzstream::in); + enforce_msg(infile, "error reading genotype file"); + + auto batch_compute = [&](size_t l) { // using a C++ closure + // Compute SNPs in batch, note the computations are independent per SNP + gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); + gsl_matrix_view UtXlarge_sub = + gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); + + time_start = clock(); + eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, + &UtXlarge_sub.matrix); + time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + gsl_matrix_set_zero(Xlarge); + for (size_t i = 0; i < l; i++) { + // for every batch... + gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); + gsl_vector_memcpy(Utx, &UtXlarge_col.vector); + + CalcUab(UtW, Uty, Utx, Uab); + + time_start = clock(); + FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; + + double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0; + double p_lrt = 0, p_score = 0; + double logl_H1 = 0.0; + + // 3 is before 1. + if (a_mode == 3 || a_mode == 4) { + CalcRLScore(l_mle_null, param1, beta, se, p_score); + } + + if (a_mode == 1 || a_mode == 4) { + // for univariate a_mode is 1 + CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald(lambda_remle, param1, beta, se, p_wald); + } + + if (a_mode == 2 || a_mode == 4) { + CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); + } + + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + // Store summary data. + SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle, + p_wald, p_lrt, p_score}; + sumStat.push_back(SNPs); } - t_last++; - } + }; + for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + // for every SNP + string line; + safeGetline(infile, line); if (t % d_pace == 0 || t == (ns_total - 1)) { ProgressBar("Reading SNPs ", t, ns_total - 1); } - if (indicator_snp[t] == 0) { + if (indicator_snp[t] == 0) continue; - } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + char *ch_ptr = strtok((char *)line.c_str(), " , \t"); + auto snp = string(ch_ptr); + // check whether SNP is included in gwasnps (used by LOCO) + if (process_gwasnps && gwasnps.count(snp) == 0) + continue; ch_ptr = strtok(NULL, " , \t"); ch_ptr = strtok(NULL, " , \t"); - x_mean = 0.0; - c_phen = 0; - n_miss = 0; + double x_mean = 0.0; + int c_phen = 0; + int n_miss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { + // get the genotypes per individual and compute stats per SNP ch_ptr = strtok(NULL, " , \t"); - if (indicator_idv[i] == 0) { + if (indicator_idv[i] == 0) continue; - } if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(x_miss, c_phen, 0.0); n_miss++; } else { - geno = atof(ch_ptr); + double geno = atof(ch_ptr); gsl_vector_set(x, c_phen, geno); gsl_vector_set(x_miss, c_phen, 1.0); @@ -1398,66 +1452,16 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, gsl_vector_set(x, i, x_mean); } } - + // copy genotype values for SNP into Xlarge cache gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize); gsl_vector_memcpy(&Xlarge_col.vector, x); - c++; - - if (c % msize == 0 || c == t_last) { - size_t l = 0; - if (c % msize == 0) { - l = msize; - } else { - l = c % msize; - } - - gsl_matrix_view Xlarge_sub = - gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); - gsl_matrix_view UtXlarge_sub = - gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - - time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, - &UtXlarge_sub.matrix); - time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - gsl_matrix_set_zero(Xlarge); + c++; // count SNPs going in - for (size_t i = 0; i < l; i++) { - gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); - gsl_vector_memcpy(Utx, &UtXlarge_col.vector); - - CalcUab(UtW, Uty, Utx, Uab); - - time_start = clock(); - FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; - - // 3 is before 1. - if (a_mode == 3 || a_mode == 4) { - CalcRLScore(l_mle_null, param1, beta, se, p_score); - } - - if (a_mode == 1 || a_mode == 4) { - CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, - logl_H1); - CalcRLWald(lambda_remle, param1, beta, se, p_wald); - } - - if (a_mode == 2 || a_mode == 4) { - CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); - } - - time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - // Store summary data. - SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle, - p_wald, p_lrt, p_score}; - - sumStat.push_back(SNPs); - } - } + if (c == msize) + batch_compute(msize); } + batch_compute(c % msize); + // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl; cout << endl; gsl_vector_free(x); @@ -1505,7 +1509,7 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, gsl_vector *ab = gsl_vector_alloc(n_index); // Create a large matrix. - size_t msize = 10000; + size_t msize = LMM_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix_set_zero(Xlarge); @@ -1528,9 +1532,8 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, size_t c = 0, t_last = 0; for (size_t t = 0; t < snpInfo.size(); ++t) { - if (indicator_snp[t] == 0) { + if (indicator_snp[t] == 0) continue; - } t_last++; } for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) { @@ -1697,7 +1700,7 @@ void LMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval, gsl_vector *ab = gsl_vector_alloc(n_index); // Create a large matrix. - size_t msize = 10000; + size_t msize = LMM_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix_set_zero(Xlarge); @@ -26,6 +26,8 @@ using namespace std; +#define LMM_BATCH_SIZE 10000 // used for batch processing + class FUNC_PARAM { public: @@ -78,6 +80,7 @@ public: vector<int> indicator_snp; vector<SNPINFO> snpInfo; // Record SNP information. + set<string> setGWASnps; // Record SNP information. // Not included in PARAM. vector<SUMSTAT> sumStat; // Output SNPSummary Data. @@ -97,7 +100,8 @@ public: const gsl_matrix *W, const gsl_vector *y); void AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y); + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps); void AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index f1ab3fc..eb591ca 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -2974,7 +2974,7 @@ void MVLMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval, string line; // Create a large matrix. - size_t msize = 10000; + size_t msize = LMM_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix_set_zero(Xlarge); @@ -3531,7 +3531,7 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, size_t dc_size = d_size * (c_size + 1), v_size = d_size * (d_size + 1) / 2; // Create a large matrix. - size_t msize = 10000; + size_t msize = LMM_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix_set_zero(Xlarge); @@ -3968,7 +3968,7 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, size_t dc_size = d_size * (c_size + 1), v_size = d_size * (d_size + 1) / 2; // Create a large matrix. - size_t msize = 10000; + size_t msize = LMM_BATCH_SIZE; gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); gsl_matrix_set_zero(Xlarge); diff --git a/src/param.cpp b/src/param.cpp index 2572bbb..6ab4339 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -38,6 +38,54 @@ using namespace std; +// ---- Helper functions which do not use the PARAM scope + +// Calculate the SNP sets as used in LOCO from mapchr which was filled +// from the annotation file. Returns ksnps (used for K) and gwasnps (used for +// GWA). Both are complimentary with each other and subsets of setSnps. + +void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps, + const map<string, string> mapchr, const string loco) { + enforce_msg(ksnps.size() == 0, "make sure knsps is not initialized twice"); + enforce_msg(gwasnps.size() == 0, + "make sure gwasnps is not initialized twice"); + for (auto &kv : mapchr) { + auto snp = kv.first; + auto chr = kv.second; + if (chr != loco) { + ksnps.insert(snp); + } else { + gwasnps.insert(snp); + } + } +} + +// Trim #individuals to size which is used to write tests that run faster +// +// Note it actually trims the number of functional individuals +// (indicator_idv[x] == 1). This should match indicator_cvt etc. If +// this gives problems with certain sets we can simply trim to size. + +void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) { + if (ni_max) { + size_t count = 0; + for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) { + if (*ind) + count++; + if (count >= ni_max) + break; + } + if (count != idvs.size()) { + if (debug) + cout << "**** TEST MODE: trim individuals from " << idvs.size() + << " to " << count << endl; + idvs.resize(count); + } + } +} + +// ---- PARAM class implementation + PARAM::PARAM(void) : mode_silence(false), a_mode(0), k_mode(1), d_pace(100000), file_out("result"), path_out("./output/"), miss_level(0.05), @@ -96,6 +144,15 @@ void PARAM::ReadFiles(void) { setSnps.clear(); } + // Read KSNP set. + if (!file_ksnps.empty()) { + if (ReadFile_snps(file_ksnps, setKSnps) == false) { + error = true; + } + } else { + setKSnps.clear(); + } + // For prediction. if (!file_epm.empty()) { if (ReadFile_est(file_epm, est_column, mapRS2est) == false) { @@ -164,6 +221,7 @@ void PARAM::ReadFiles(void) { } else { n_cvt = 1; } + trim_individuals(indicator_cvt, ni_max, mode_debug); if (!file_gxe.empty()) { if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) { @@ -176,6 +234,8 @@ void PARAM::ReadFiles(void) { } } + trim_individuals(indicator_idv, ni_max, mode_debug); + // WJA added. // Read genotype and phenotype file for bgen format. if (!file_oxford.empty()) { @@ -273,12 +333,15 @@ void PARAM::ReadFiles(void) { gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); CopyCvt(W); + trim_individuals(indicator_idv, ni_max, mode_debug); + trim_individuals(indicator_cvt, ni_max, mode_debug); if (ReadFile_geno(file_geno, setSnps, W, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { error = true; } gsl_matrix_free(W); + ns_total = indicator_snp.size(); } @@ -457,6 +520,11 @@ void PARAM::ReadFiles(void) { // ni_test, save all useful covariates. ProcessCvtPhen(); } + + // Compute setKSnps when -loco is passed in + if (!loco.empty()) { + LOCO_set_Snps(setKSnps, setGWASnps, mapRS2chr, loco); + } return; } @@ -869,22 +937,22 @@ void PARAM::CheckParam(void) { error = true; } - str = file_snps; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open snps file: " << str << endl; - error = true; - } - - str = file_log; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open log file: " << str << endl; - error = true; - } + enforce_fexists(file_snps, "open file"); + enforce_fexists(file_ksnps, "open file"); + enforce_fexists(file_gwasnps, "open file"); + enforce_fexists(file_log, "open file"); + enforce_fexists(file_anno, "open file"); - str = file_anno; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open annotation file: " << str << endl; - error = true; + if (!loco.empty()) { + enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 21 || a_mode == 22, + "LOCO only works with LMM and K"); + enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); + enforce_msg(file_oxford.empty(), "LOCO does not work with Oxford (yet)"); + enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)"); + enforce_msg(!file_anno.empty(), + "LOCO requires annotation file (-a switch)"); + enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch"); + enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch"); } str = file_kin; @@ -1005,7 +1073,7 @@ void PARAM::CheckData(void) { (indicator_cvt).size() != (indicator_idv).size()) { error = true; cout << "error! number of rows in the covariates file do not " - << "match the number of individuals. " << endl; + << "match the number of individuals. " << indicator_cvt.size() << endl; return; } if ((indicator_gxe).size() != 0 && @@ -1123,7 +1191,15 @@ void PARAM::CheckData(void) { if (!file_gene.empty()) { cout << "## number of total genes = " << ng_total << endl; } else if (file_epm.empty() && a_mode != 43 && a_mode != 5) { - cout << "## number of total SNPs = " << ns_total << endl; + if (!loco.empty()) + cout << "## leave one chromosome out (LOCO) = " << loco << endl; + cout << "## number of total SNPs = " << ns_total << endl; + if (setSnps.size()) + cout << "## number of considered SNPS = " << setSnps.size() << endl; + if (setKSnps.size()) + cout << "## number of SNPS for K = " << setKSnps.size() << endl; + if (setGWASnps.size()) + cout << "## number of SNPS for GWAS = " << setGWASnps.size() << endl; cout << "## number of analyzed SNPs = " << ns_test << endl; } else { } @@ -1297,20 +1373,22 @@ void PARAM::CalcKin(gsl_matrix *matrix_kin) { if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; + enforce_msg(loco.empty(), "FIXME: LOCO nyi"); if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == false) { error = true; } } else if (!file_oxford.empty()) { file_str = file_oxford + ".bgen"; + enforce_msg(loco.empty(), "FIXME: LOCO nyi"); if (bgenKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == false) { error = true; } } else { file_str = file_geno; - if (BimbamKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == - false) { + if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace, + matrix_kin, ni_max == 0) == false) { error = true; } } @@ -1720,37 +1798,43 @@ void PARAM::CalcS(const map<string, double> &mapRS2wA, } else if (!file_geno.empty()) { file_str = file_geno; if (mapRS2wA.size() == 0) { - if (BimbamKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, - mapRS2cat, snpInfo, W, K, ns) == false) { + if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv, + indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, + ns) == false) { error = true; } } else { - if (BimbamKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, - mapRS2cat, snpInfo, W, A, ns) == false) { + if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv, + indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, + ns) == false) { error = true; } } } else if (!file_mbfile.empty()) { if (mapRS2wA.size() == 0) { - if (MFILEKin(1, file_mbfile, d_pace, indicator_idv, mindicator_snp, - mapRS2wK, mapRS2cat, msnpInfo, W, K, ns) == false) { + if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv, + mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, + ns) == false) { error = true; } } else { - if (MFILEKin(1, file_mbfile, d_pace, indicator_idv, mindicator_snp, - mapRS2wA, mapRS2cat, msnpInfo, W, A, ns) == false) { + if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv, + mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, + ns) == false) { error = true; } } } else if (!file_mgeno.empty()) { if (mapRS2wA.size() == 0) { - if (MFILEKin(0, file_mgeno, d_pace, indicator_idv, mindicator_snp, - mapRS2wK, mapRS2cat, msnpInfo, W, K, ns) == false) { + if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv, + mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, + ns) == false) { error = true; } } else { - if (MFILEKin(0, file_mgeno, d_pace, indicator_idv, mindicator_snp, - mapRS2wA, mapRS2cat, msnpInfo, W, A, ns) == false) { + if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv, + mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, + ns) == false) { error = true; } } diff --git a/src/param.h b/src/param.h index 33e2431..45d8c0f 100644 --- a/src/param.h +++ b/src/param.h @@ -19,12 +19,15 @@ #ifndef __PARAM_H__ #define __PARAM_H__ +#include "debug.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" #include <map> #include <set> #include <vector> +#define K_BATCH_SIZE 10000 // #snps used for batched K + using namespace std; class SNPINFO { @@ -110,6 +113,7 @@ class PARAM { public: // IO-related parameters. bool mode_silence; + bool mode_debug = false; int a_mode; // Analysis mode, 1/2/3/4 for Frequentist tests int k_mode; // Kinship read mode: 1: n by n matrix, 2: id/id/k_value; vector<size_t> p_column; // Which phenotype column needs analysis. @@ -135,12 +139,14 @@ public: string file_bf, file_hyp; string path_out; - string file_epm; // Estimated parameter file. - string file_ebv; // Estimated breeding value file. - string file_log; // Log file containing mean estimate. - string file_read; // File containing total number of reads. - string file_gene; // Gene expression file. - string file_snps; // File containing analyzed SNPs or genes. + string file_epm; // Estimated parameter file. + string file_ebv; // Estimated breeding value file. + string file_log; // Log file containing mean estimate. + string file_read; // File containing total number of reads. + string file_gene; // Gene expression file. + string file_snps; // File containing analyzed SNPs or genes. + string file_ksnps; // File SNPs for computing K + string file_gwasnps; // File SNPs for computing GWAS // WJA added. string file_oxford; @@ -152,6 +158,7 @@ public: double r2_level; // LMM-related parameters. + string loco; double l_min; double l_max; size_t n_region; @@ -215,6 +222,7 @@ public: // Number of individuals. size_t ni_total, ni_test, ni_cvt, ni_study, ni_ref; + size_t ni_max = 0; // -nind switch for testing purposes // Number of observed and missing phenotypes. size_t np_obs, np_miss; @@ -305,7 +313,9 @@ public: vector<SNPINFO> snpInfo; // Record SNP information. vector<vector<SNPINFO>> msnpInfo; // Record SNP information. - set<string> setSnps; // Set of snps for analysis. + set<string> setSnps; // Set of snps for analysis (-snps). + set<string> setKSnps; // Set of snps for K (-ksnps and LOCO) + set<string> setGWASnps; // Set of snps for GWA (-gwasnps and LOCO) // Constructor. PARAM(); @@ -351,4 +361,10 @@ public: size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt); +// Helpers for checking parameters +#define enforce_fexists(fn, msg) \ + if (!fn.empty()) \ + enforce_msg(stat(fn.c_str(), &fileInfo) == 0, \ + ((std::string(__STRING(fn)) + ": " + msg).c_str())); + #endif @@ -40,6 +40,8 @@ public: bool noconstrain; }; +// Methods for variant component (VC) estimation + class VC { public: |