diff options
author | Pjotr Prins | 2017-10-06 10:44:59 +0000 |
---|---|---|
committer | Pjotr Prins | 2017-10-06 10:45:07 +0000 |
commit | df1e049875adba1d3bb5762310bf31f0057fbf8d (patch) | |
tree | 5bad06a67225131c1ae3719901ed5caedc6836b0 /src/lmm.cpp | |
parent | 6e95011f7d23246baef78cb2f6fdbe12f5c0793e (diff) | |
download | pangemma-df1e049875adba1d3bb5762310bf31f0057fbf8d.tar.gz |
LMM: lifter genotype parsing for BIMBAM. All tests pass.
Diffstat (limited to 'src/lmm.cpp')
-rw-r--r-- | src/lmm.cpp | 47 |
1 files changed, 30 insertions, 17 deletions
diff --git a/src/lmm.cpp b/src/lmm.cpp index 47ede97..8c0a303 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -1289,7 +1289,7 @@ in. */ -void LMM::Analyze(std::function< string(size_t) >& fetch_line, +void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, @@ -1383,17 +1383,13 @@ void LMM::Analyze(std::function< string(size_t) >& fetch_line, if (indicator_snp[t] == 0) continue; - string line = fetch_line(t); + auto tup = fetch_snp(t); + auto snp = get<0>(tup); + auto gs = get<1>(tup); - char *ch_ptr = strtok((char *)line.c_str(), " , \t"); - enforce_msg(ch_ptr, "Parsing BIMBAM genofile"); // just to be sure - - auto snp = string(ch_ptr); // check whether SNP is included in gwasnps (used by LOCO) if (process_gwasnps && gwasnps.count(snp) == 0) continue; - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); double x_mean = 0.0; int c_phen = 0; @@ -1401,16 +1397,14 @@ void LMM::Analyze(std::function< string(size_t) >& fetch_line, gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { // get the genotypes per individual and compute stats per SNP - ch_ptr = strtok(NULL, " , \t"); - if (indicator_idv[i] == 0) + if (indicator_idv[i] == 0) // skip individual column continue; - if (strcmp(ch_ptr, "NA") == 0) { + double geno = gs[i]; + if (std::isnan(geno)) { gsl_vector_set(x_miss, c_phen, 0.0); n_miss++; } else { - double geno = atof(ch_ptr); - gsl_vector_set(x, c_phen, geno); gsl_vector_set(x_miss, c_phen, 1.0); x_mean += geno; @@ -1448,7 +1442,6 @@ void LMM::Analyze(std::function< string(size_t) >& fetch_line, } - void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, @@ -1459,16 +1452,36 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, enforce_msg(infile, "error reading genotype file"); size_t prev_line = 0; - std::function<string(size_t)> fetch_line = [&](size_t num) { + // fetch_snp is a callback function for every SNP row + std::function<SnpNameValues(size_t)> fetch_snp = [&](size_t num) { string line; while (prev_line <= num) { + // also read SNPs that were skipped safeGetline(infile, line); prev_line++; } - return line; + char *ch_ptr = strtok((char *)line.c_str(), " , \t"); + enforce_msg(ch_ptr, "Parsing BIMBAM genofile"); // ch_ptr should not be NULL + + auto snp = string(ch_ptr); + ch_ptr = strtok(NULL, " , \t"); // skip column + ch_ptr = strtok(NULL, " , \t"); // skip column + + std::vector <double> gs; + gs.resize(ni_total); + + for (size_t i = 0; i < ni_total; ++i) { + ch_ptr = strtok(NULL, " , \t"); + enforce_msg(ch_ptr,line.c_str()); + double geno = atof(ch_ptr); + if (strcmp(ch_ptr, "NA") == 0) + geno = nan(""); + gs[i] = geno; + } + return std::make_tuple(snp,gs); }; - LMM::Analyze(fetch_line,U,eval,UtW,Uty,W,y,gwasnps); + LMM::Analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps); infile.close(); infile.clear(); |