diff options
Diffstat (limited to 'src/io.cpp')
-rw-r--r-- | src/io.cpp | 1088 |
1 files changed, 154 insertions, 934 deletions
@@ -41,6 +41,7 @@ #include "debug.h" #include "eigenlib.h" +#include "fastblas.h" #include "gzstream.h" #include "io.h" #include "lapack.h" @@ -49,43 +50,17 @@ using namespace std; // Print progress bar. -void ProgressBar(string str, double p, double total) { - double progress = (100.0 * p / total); - int barsize = (int)(progress / 2.0); - char bar[51]; - - cout << str; - for (int i = 0; i < 50; i++) { - if (i < barsize) { - bar[i] = '='; - } else { - bar[i] = ' '; - } - cout << bar[i]; - } - cout << setprecision(2) << fixed << progress << "%\r" << flush; - - return; -} - -// Print progress bar with acceptance ratio. void ProgressBar(string str, double p, double total, double ratio) { - double progress = (100.0 * p / total); - int barsize = (int)(progress / 2.0); - char bar[51]; - - cout << str; - for (int i = 0; i < 50; i++) { - if (i < barsize) { - bar[i] = '='; - } else { - bar[i] = ' '; - } - cout << bar[i]; - } - cout << setprecision(2) << fixed << progress << "% " << ratio << "\r" - << flush; - return; + assert(p<=total); + const double progress = (100.0 * p / total); + const uint barsize = (int)(progress / 2.0); // characters + cout << str << " "; + cout << std::string(barsize,'='); + cout << std::string(50-barsize,' '); + cout << setprecision(0) << fixed << " " << progress << "%"; + if (ratio != -1.0) + cout << setprecision(2) << " " << ratio; + cout << "\r" << flush; } bool isBlankLine(char const *line) { @@ -177,7 +152,7 @@ bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps) { // Read header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); if (header.rs_col == 0 && (header.chr_col == 0 || header.pos_col == 0)) { @@ -233,7 +208,7 @@ bool ReadFile_log(const string &file_log, double &pheno_mean) { size_t flag = 0; while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); ch_ptr = strtok(NULL, " , \t"); if (ch_ptr != NULL && strcmp(ch_ptr, "estimated") == 0) { @@ -241,7 +216,7 @@ bool ReadFile_log(const string &file_log, double &pheno_mean) { if (ch_ptr != NULL && strcmp(ch_ptr, "mean") == 0) { ch_ptr = strtok(NULL, " , \t"); if (ch_ptr != NULL && strcmp(ch_ptr, "=") == 0) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); pheno_mean = atof(ch_ptr); flag = 1; } @@ -339,7 +314,7 @@ bool ReadFile_column(const string &file_pheno, vector<int> &indicator_idv, string id; double p; while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (int i = 0; i < (p_column - 1); ++i) { ch_ptr = strtok(NULL, " , \t"); } @@ -511,17 +486,17 @@ bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) { string minor; while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); chr = ch_ptr; - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); rs = ch_ptr; - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); cM = atof(ch_ptr); - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); b_pos = atol(ch_ptr); - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); minor = ch_ptr; - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); major = ch_ptr; SNPINFO sInfo = {chr, rs, cM, b_pos, minor, major, 0, -9, -9, 0, 0, 0}; @@ -567,12 +542,12 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno, } while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " \t"); - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); + ch_ptr = strtok_safe(NULL, " \t"); id = ch_ptr; - ch_ptr = strtok(NULL, " \t"); - ch_ptr = strtok(NULL, " \t"); - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); ch_ptr = strtok(NULL, " \t"); size_t i = 0; @@ -620,7 +595,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, - size_t &ns_test, bool debug) { + size_t &ns_test) { debug_msg("entered"); indicator_snp.clear(); snpInfo.clear(); @@ -631,12 +606,12 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, return false; } - gsl_vector *genotype = gsl_vector_alloc(W->size1); - gsl_vector *genotype_miss = gsl_vector_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); + gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -674,11 +649,11 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, file_pos = 0; auto count_warnings = 0; while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); minor = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); major = ch_ptr; if (setSnps.size() != 0 && setSnps.count(rs) == 0) { @@ -693,7 +668,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, } if (mapRS2bp.count(rs) == 0) { - if (debug && count_warnings++ < 10) { + if (is_debug_mode() && count_warnings++ < 10) { std::string msg = "Can't figure out position for "; msg += rs; debug_msg(msg); @@ -719,7 +694,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, c_idv = 0; gsl_vector_set_zero(genotype_miss); for (int i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) continue; @@ -842,12 +817,12 @@ bool ReadFile_bed(const string &file_bed, const set<string> &setSnps, return false; } - gsl_vector *genotype = gsl_vector_alloc(W->size1); - gsl_vector *genotype_miss = gsl_vector_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); + gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -1029,13 +1004,13 @@ bool Bimbam_ReadOneSNP(const size_t inc, const vector<int> &indicator_idv, bool flag = false; for (size_t i = 0; i < inc; i++) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); } if (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); geno_mean = 0.0; double d; @@ -1043,7 +1018,7 @@ bool Bimbam_ReadOneSNP(const size_t inc, const vector<int> &indicator_idv, vector<size_t> geno_miss; for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } @@ -1159,9 +1134,7 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, size_t i_test = 0, i_total = 0, j_test = 0, j_total = 0; while (getline(infile, line)) { if (i_total == ni_total) { - cout << "error! number of rows in the kinship " - << "file is larger than the number of phentypes." << endl; - error = true; + fail_msg("number of rows in the kinship file is larger than the number of phentypes"); } if (indicator_idv[i_total] == 0) { @@ -1174,10 +1147,7 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, ch_ptr = strtok((char *)line.c_str(), " , \t"); while (ch_ptr != NULL) { if (j_total == ni_total) { - cout << "error! number of columns in the " - << "kinship file is larger than the number" - << " of phenotypes for row = " << i_total << endl; - error = true; + fail_msg(string("number of columns in the kinship file is larger than the number of individuals for row = ")+to_string(i_total)); } d = atof(ch_ptr); @@ -1190,18 +1160,14 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, ch_ptr = strtok(NULL, " , \t"); } if (j_total != ni_total) { - cout << "error! number of columns in the kinship " - << "file do not match the number of phentypes for " - << "row = " << i_total << endl; - error = true; + string msg = "number of columns in the kinship file does not match the number of individuals for row = " + to_string( i_total ); + fail_msg(msg); } i_total++; i_test++; } if (i_total != ni_total) { - cout << "error! number of rows in the kinship file do " - << "not match the number of phenotypes." << endl; - error = true; + fail_msg("number of rows in the kinship file does not match the number of individuals."); } } else { map<size_t, size_t> mapID2ID; @@ -1218,11 +1184,11 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, size_t n_id1, n_id2; while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); id1 = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); id2 = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); d = atof(ch_ptr); if (mapID2num.count(id1) == 0 || mapID2num.count(id2) == 0) { continue; @@ -1237,9 +1203,10 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, Cov_d = gsl_matrix_get(G, n_id1, n_id2); if (Cov_d != 0 && Cov_d != d) { - cout << "error! redundant and unequal terms in the " + cerr << "error! redundant and unequal terms in the " << "kinship file, for id1 = " << id1 << " and id2 = " << id2 << endl; + fail_msg(""); } else { gsl_matrix_set(G, n_id1, n_id2, d); gsl_matrix_set(G, n_id2, n_id1, d); @@ -1278,7 +1245,6 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv, infile.close(); infile.clear(); - return; } void ReadFile_eigenU(const string &file_ku, bool &error, gsl_matrix *U) { @@ -1354,7 +1320,7 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { error = true; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); d = atof(ch_ptr); ch_ptr = strtok(NULL, " , \t"); @@ -1391,12 +1357,12 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, bool process_ksnps = ksnps.size(); size_t ni_total = matrix_kin->size1; - gsl_vector *geno = gsl_vector_alloc(ni_total); - gsl_vector *geno_miss = gsl_vector_alloc(ni_total); + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); // Xlarge contains inds x markers const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(ni_total, msize); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); enforce_msg(Xlarge, "allocate Xlarge"); gsl_matrix_set_zero(Xlarge); @@ -1405,9 +1371,9 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, size_t ns_test = 0; for (size_t t = 0; t < indicator_snp.size(); ++t) { string line; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) continue; @@ -1480,12 +1446,12 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, // compute kinship matrix and return in matrix_kin a SNP at a time if (ns_test % msize == 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } if (ns_test % msize != 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); } cout << endl; @@ -1531,14 +1497,14 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, double d, geno_mean, geno_var; size_t ni_total = matrix_kin->size1; - gsl_vector *geno = gsl_vector_alloc(ni_total); + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); size_t ns_test = 0; int n_bit; // Create a large matrix. const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(ni_total, msize); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); gsl_matrix_set_zero(Xlarge); // Calculate n_bit and c, the number of bit for each snp. @@ -1556,7 +1522,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, for (size_t t = 0; t < indicator_snp.size(); ++t) { if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) { continue; @@ -1626,13 +1592,13 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, ns_test++; if (ns_test % msize == 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } if (ns_test % msize != 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); } cout << endl; @@ -1659,7 +1625,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, // genotype and calculate K. bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, - const bool calc_K, bool debug) { + const bool calc_K) { debug_msg("entered"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { @@ -1674,8 +1640,8 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, gsl_matrix_set_zero(K); } - gsl_vector *genotype = gsl_vector_alloc(UtX->size1); - gsl_vector *genotype_miss = gsl_vector_alloc(UtX->size1); + gsl_vector *genotype = gsl_vector_safe_alloc(UtX->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(UtX->size1); double geno, geno_mean; size_t n_miss; @@ -1687,21 +1653,21 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, int c_idv = 0, c_snp = 0; for (int i = 0; i < ns_total; ++i) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (indicator_snp[i] == 0) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); c_idv = 0; geno_mean = 0; n_miss = 0; gsl_vector_set_zero(genotype_miss); for (int j = 0; j < ni_total; ++j) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[j] == 0) { continue; } @@ -1764,7 +1730,7 @@ bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, - const size_t ns_test, bool debug) { + const size_t ns_test) { debug_msg("entered"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { @@ -1785,8 +1751,8 @@ bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, gsl_matrix_set_zero(K); } - gsl_vector *genotype = gsl_vector_alloc(ni_test); - gsl_vector *genotype_miss = gsl_vector_alloc(ni_test); + gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(ni_test); double geno, geno_mean; size_t n_miss; @@ -1796,21 +1762,21 @@ bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, size_t c_idv = 0, c_snp = 0; for (size_t i = 0; i < ns_total; ++i) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (indicator_snp[i] == 0) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); c_idv = 0; geno_mean = 0; n_miss = 0; gsl_vector_set_zero(genotype_miss); for (uint j = 0; j < ni_total; ++j) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[j] == 0) { continue; } @@ -1904,7 +1870,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, gsl_matrix_set_zero(K); } - gsl_vector *genotype = gsl_vector_alloc(UtX->size1); + gsl_vector *genotype = gsl_vector_safe_alloc(UtX->size1); double geno, geno_mean; size_t n_miss; @@ -2040,7 +2006,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, gsl_matrix_set_zero(K); } - gsl_vector *genotype = gsl_vector_alloc(ni_test); + gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); double geno, geno_mean; size_t n_miss; @@ -2160,7 +2126,7 @@ bool ReadFile_est(const string &file_est, const vector<size_t> &est_column, size_t n = *max_element(est_column.begin(), est_column.end()); while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); alpha = 0.0; beta = 0.0; @@ -2179,7 +2145,7 @@ bool ReadFile_est(const string &file_est, const vector<size_t> &est_column, gamma = atof(ch_ptr); } if (i < n) { - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); } } @@ -2237,7 +2203,7 @@ bool ReadFile_gene(const string &file_gene, vector<double> &vec_read, getline(infile, line); while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; ch_ptr = strtok(NULL, " , \t"); @@ -2274,759 +2240,6 @@ bool ReadFile_gene(const string &file_gene, vector<double> &vec_read, return true; } -// WJA Added -// Read Oxford sample file. -bool ReadFile_sample(const string &file_sample, - vector<vector<int>> &indicator_pheno, - vector<vector<double>> &pheno, - const vector<size_t> &p_column, vector<int> &indicator_cvt, - vector<vector<double>> &cvt, size_t &n_cvt) { - debug_msg("entered"); - indicator_pheno.clear(); - pheno.clear(); - indicator_cvt.clear(); - - igzstream infile(file_sample.c_str(), igzstream::in); - - if (!infile) { - cout << "error! fail to open sample file: " << file_sample << endl; - return false; - } - - string line; - char *ch_ptr; - - string id; - double p, d; - - vector<double> pheno_row; - vector<int> ind_pheno_row; - int flag_na = 0; - - size_t num_cols = 0; - size_t num_p_in_file = 0; - size_t num_cvt_in_file = 0; - - map<size_t, size_t> mapP2c; - for (size_t i = 0; i < p_column.size(); i++) { - mapP2c[p_column[i]] = i; - pheno_row.push_back(-9); - ind_pheno_row.push_back(0); - } - - // Read header line1. - if (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " \t"); - if (strcmp(ch_ptr, "ID_1") != 0) { - return false; - } - ch_ptr = strtok(NULL, " \t"); - if (strcmp(ch_ptr, "ID_2") != 0) { - return false; - } - ch_ptr = strtok(NULL, " \t"); - if (strcmp(ch_ptr, "missing") != 0) { - return false; - } - while (ch_ptr != NULL) { - num_cols++; - ch_ptr = strtok(NULL, " \t"); - } - num_cols--; - } - - vector<map<uint32_t, size_t>> cvt_factor_levels; - - char col_type[num_cols]; - - // Read header line2. - if (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " \t"); - if (strcmp(ch_ptr, "0") != 0) { - return false; - } - ch_ptr = strtok(NULL, " \t"); - if (strcmp(ch_ptr, "0") != 0) { - return false; - } - ch_ptr = strtok(NULL, " \t"); - if (strcmp(ch_ptr, "0") != 0) { - return false; - } - size_t it = 0; - ch_ptr = strtok(NULL, " \t"); - if (ch_ptr != NULL) - while (ch_ptr != NULL) { - col_type[it++] = ch_ptr[0]; - if (ch_ptr[0] == 'D') { - cvt_factor_levels.push_back(map<uint32_t, size_t>()); - num_cvt_in_file++; - } - if (ch_ptr[0] == 'C') { - num_cvt_in_file++; - } - if ((ch_ptr[0] == 'P') || (ch_ptr[0] == 'B')) { - num_p_in_file++; - } - ch_ptr = strtok(NULL, " \t"); - } - } - - while (!safeGetline(infile, line).eof()) { - - ch_ptr = strtok((char *)line.c_str(), " \t"); - - for (int it = 0; it < 3; it++) { - ch_ptr = strtok(NULL, " \t"); - } - - size_t i = 0; - size_t p_i = 0; - size_t fac_cvt_i = 0; - - while (i < num_cols) { - - if ((col_type[i] == 'P') || (col_type[i] == 'B')) { - if (mapP2c.count(p_i + 1) != 0) { - if (strcmp(ch_ptr, "NA") == 0) { - ind_pheno_row[mapP2c[p_i + 1]] = 0; - pheno_row[mapP2c[p_i + 1]] = -9; - } else { - p = atof(ch_ptr); - ind_pheno_row[mapP2c[p_i + 1]] = 1; - pheno_row[mapP2c[p_i + 1]] = p; - } - } - p_i++; - } - if (col_type[i] == 'D') { - - // NOTE THIS DOES NOT CHECK TO BE SURE LEVEL - // IS INTEGRAL i.e for atoi error. - if (strcmp(ch_ptr, "NA") != 0) { - uint32_t level = atoi(ch_ptr); - if (cvt_factor_levels[fac_cvt_i].count(level) == 0) { - cvt_factor_levels[fac_cvt_i][level] = - cvt_factor_levels[fac_cvt_i].size(); - } - } - fac_cvt_i++; - } - - ch_ptr = strtok(NULL, " \t"); - i++; - } - - indicator_pheno.push_back(ind_pheno_row); - pheno.push_back(pheno_row); - } - - // Close and reopen the file. - infile.close(); - infile.clear(); - - if (num_cvt_in_file > 0) { - igzstream infile2(file_sample.c_str(), igzstream::in); - - if (!infile2) { - cout << "error! fail to open sample file: " << file_sample << endl; - return false; - } - - // Skip header. - safeGetline(infile2, line); - safeGetline(infile2, line); - - // Pull in the covariates now we now the number of - // factor levels. - while (!safeGetline(infile2, line).eof()) { - - vector<double> v_d; - flag_na = 0; - ch_ptr = strtok((char *)line.c_str(), " \t"); - - for (int it = 0; it < 3; it++) { - ch_ptr = strtok(NULL, " \t"); - } - - size_t i = 0; - size_t fac_cvt_i = 0; - size_t num_fac_levels; - while (i < num_cols) { - - if (col_type[i] == 'C') { - if (strcmp(ch_ptr, "NA") == 0) { - flag_na = 1; - d = -9; - } else { - d = atof(ch_ptr); - } - - v_d.push_back(d); - } - - if (col_type[i] == 'D') { - - // NOTE THIS DOES NOT CHECK TO BE SURE - // LEVEL IS INTEGRAL i.e for atoi error. - num_fac_levels = cvt_factor_levels[fac_cvt_i].size(); - if (num_fac_levels > 1) { - if (strcmp(ch_ptr, "NA") == 0) { - flag_na = 1; - for (size_t it = 0; it < num_fac_levels - 1; it++) { - v_d.push_back(-9); - } - } else { - uint32_t level = atoi(ch_ptr); - for (size_t it = 0; it < num_fac_levels - 1; it++) { - cvt_factor_levels[fac_cvt_i][level] == it + 1 - ? v_d.push_back(1.0) - : v_d.push_back(0.0); - } - } - } - fac_cvt_i++; - } - - ch_ptr = strtok(NULL, " \t"); - i++; - } - - if (flag_na == 0) { - indicator_cvt.push_back(1); - } else { - indicator_cvt.push_back(0); - } - cvt.push_back(v_d); - } - - if (indicator_cvt.empty()) { - n_cvt = 0; - } else { - flag_na = 0; - for (vector<int>::size_type i = 0; i < indicator_cvt.size(); ++i) { - if (indicator_cvt[i] == 0) { - continue; - } - - if (flag_na == 0) { - flag_na = 1; - n_cvt = cvt[i].size(); - } - if (flag_na != 0 && n_cvt != cvt[i].size()) { - cout << "error! number of covariates in row " << i - << " do not match other rows." << endl; - return false; - } - } - } - - infile2.close(); - infile2.clear(); - } - return true; -} - -// WJA Added. -// Read bgen file, the first time. -bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, - const gsl_matrix *W, vector<int> &indicator_idv, - vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, - const double &maf_level, const double &miss_level, - const double &hwe_level, const double &r2_level, - size_t &ns_test) { - - debug_msg("entered"); - indicator_snp.clear(); - - ifstream infile(file_bgen.c_str(), ios::binary); - if (!infile) { - cout << "error reading bgen file:" << file_bgen << endl; - return false; - } - - gsl_vector *genotype = gsl_vector_alloc(W->size1); - gsl_vector *genotype_miss = gsl_vector_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); - gsl_permutation *pmt = gsl_permutation_alloc(W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp(WtW, pmt, &sig); - LUInvert(WtW, pmt, WtWi); - - // Read in header. - uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); - infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); - bgen_snp_block_offset -= 4; - infile.ignore(4 + bgen_header_length - 20); - bgen_snp_block_offset -= 4 + bgen_header_length - 20; - infile.read(reinterpret_cast<char *>(&bgen_flags), 4); - bgen_snp_block_offset -= 4; - bool CompressedSNPBlocks = bgen_flags & 0x1; - bool LongIds = bgen_flags & 0x4; - - if (!LongIds) { - return false; - } - - infile.ignore(bgen_snp_block_offset); - - ns_test = 0; - - size_t ns_total = static_cast<size_t>(bgen_nsnps); - - snpInfo.clear(); - string rs; - long int b_pos; - string chr; - string major; - string minor; - string id; - - double v_x, v_w; - int c_idv = 0; - - double maf, geno, geno_old; - size_t n_miss; - size_t n_0, n_1, n_2; - int flag_poly; - - double bgen_geno_prob_AA, bgen_geno_prob_AB; - double bgen_geno_prob_BB, bgen_geno_prob_non_miss; - - // Total number of samples in phenotype file. - size_t ni_total = indicator_idv.size(); - - // Number of samples to use in test. - size_t ni_test = 0; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - - for (size_t i = 0; i < ni_total; ++i) { - ni_test += indicator_idv[i]; - } - - for (size_t t = 0; t < ns_total; ++t) { - - id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char *>(&bgen_N), 4); - infile.read(reinterpret_cast<char *>(&bgen_LS), 2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char *>(&bgen_LR), 2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char *>(&bgen_LC), 2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); - - infile.read(reinterpret_cast<char *>(&bgen_LA), 4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char *>(&bgen_LB), 4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - // Should we switch according to MAF? - minor = bgen_B_allele; - major = bgen_A_allele; - b_pos = static_cast<long int>(bgen_SNP_pos); - - uint16_t unzipped_data[3 * bgen_N]; - - if (setSnps.size() != 0 && setSnps.count(rs) == 0) { - SNPINFO sInfo = { - "-9", rs, -9, -9, minor, major, static_cast<size_t>(-9), - -9, (long int)-9}; - - snpInfo.push_back(sInfo); - indicator_snp.push_back(0); - if (CompressedSNPBlocks) - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - else - bgen_P = 6 * bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if (CompressedSNPBlocks) { - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size = 6 * bgen_N; - - infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); - int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), - reinterpret_cast<uLongf *>(&unzipped_data_size), - reinterpret_cast<Bytef *>(zipped_data), - static_cast<uLong>(bgen_P)); - assert(result == Z_OK); - - } else { - bgen_P = 6 * bgen_N; - infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); - } - - maf = 0; - n_miss = 0; - flag_poly = 0; - geno_old = -9; - n_0 = 0; - n_1 = 0; - n_2 = 0; - c_idv = 0; - gsl_vector_set_zero(genotype_miss); - for (size_t i = 0; i < bgen_N; ++i) { - - // CHECK this set correctly! - if (indicator_idv[i] == 0) { - continue; - } - - bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; - bgen_geno_prob_AB = - static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; - bgen_geno_prob_BB = - static_cast<double>(unzipped_data[i * 3 + 2]) / 32768.0; - bgen_geno_prob_non_miss = - bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; - - // CHECK 0.1 OK. - if (bgen_geno_prob_non_miss < 0.9) { - gsl_vector_set(genotype_miss, c_idv, 1); - n_miss++; - c_idv++; - continue; - } - - bgen_geno_prob_AA /= bgen_geno_prob_non_miss; - bgen_geno_prob_AB /= bgen_geno_prob_non_miss; - bgen_geno_prob_BB /= bgen_geno_prob_non_miss; - - geno = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; - if (geno >= 0 && geno <= 0.5) { - n_0++; - } - if (geno > 0.5 && geno < 1.5) { - n_1++; - } - if (geno >= 1.5 && geno <= 2.0) { - n_2++; - } - - gsl_vector_set(genotype, c_idv, geno); - - // CHECK WHAT THIS DOES. - if (flag_poly == 0) { - geno_old = geno; - flag_poly = 2; - } - if (flag_poly == 2 && geno != geno_old) { - flag_poly = 1; - } - - maf += geno; - - c_idv++; - } - - maf /= 2.0 * static_cast<double>(ni_test - n_miss); - - SNPINFO sInfo = {chr, rs, -9, b_pos, - minor, major, n_miss, (double)n_miss / (double)ni_test, - maf}; - snpInfo.push_back(sInfo); - - if ((double)n_miss / (double)ni_test > miss_level) { - indicator_snp.push_back(0); - continue; - } - - if ((maf < maf_level || maf > (1.0 - maf_level)) && maf_level != -1) { - indicator_snp.push_back(0); - continue; - } - - if (flag_poly != 1) { - indicator_snp.push_back(0); - continue; - } - - if (hwe_level != 0 && maf_level != -1) { - if (CalcHWE(n_0, n_2, n_1) < hwe_level) { - indicator_snp.push_back(0); - continue; - } - } - - // Filter SNP if it is correlated with W - // unless W has only one column, of 1s. - for (size_t i = 0; i < genotype->size; ++i) { - if (gsl_vector_get(genotype_miss, i) == 1) { - geno = maf * 2.0; - gsl_vector_set(genotype, i, geno); - } - } - - gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); - gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - gsl_blas_ddot(genotype, genotype, &v_x); - gsl_blas_ddot(Wtx, WtWiWtx, &v_w); - - if (W->size2 != 1 && v_w / v_x >= r2_level) { - indicator_snp.push_back(0); - continue; - } - - indicator_snp.push_back(1); - ns_test++; - } - - return true; -} - -// Read oxford genotype file and calculate kinship matrix. -bool bgenKin(const string &file_oxford, vector<int> &indicator_snp, - const int k_mode, const int display_pace, gsl_matrix *matrix_kin) { - debug_msg("entered"); - string file_bgen = file_oxford; - ifstream infile(file_bgen.c_str(), ios::binary); - if (!infile) { - cout << "error reading bgen file:" << file_bgen << endl; - return false; - } - - // Read in header. - uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); - infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); - bgen_snp_block_offset -= 4; - infile.ignore(4 + bgen_header_length - 20); - bgen_snp_block_offset -= 4 + bgen_header_length - 20; - infile.read(reinterpret_cast<char *>(&bgen_flags), 4); - bgen_snp_block_offset -= 4; - bool CompressedSNPBlocks = bgen_flags & 0x1; - - infile.ignore(bgen_snp_block_offset); - - double bgen_geno_prob_AA, bgen_geno_prob_AB; - double bgen_geno_prob_BB, bgen_geno_prob_non_miss; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - string id; - string rs; - string chr; - double genotype; - - size_t n_miss; - double d, geno_mean, geno_var; - - size_t ni_total = matrix_kin->size1; - gsl_vector *geno = gsl_vector_alloc(ni_total); - gsl_vector *geno_miss = gsl_vector_alloc(ni_total); - - size_t ns_test = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - - if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading bgen SNPs ", t, indicator_snp.size() - 1); - } - - id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char *>(&bgen_N), 4); - infile.read(reinterpret_cast<char *>(&bgen_LS), 2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char *>(&bgen_LR), 2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char *>(&bgen_LC), 2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); - - infile.read(reinterpret_cast<char *>(&bgen_LA), 4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char *>(&bgen_LB), 4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - uint16_t unzipped_data[3 * bgen_N]; - - if (indicator_snp[t] == 0) { - if (CompressedSNPBlocks) - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - else - bgen_P = 6 * bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if (CompressedSNPBlocks) { - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size = 6 * bgen_N; - - infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); - - int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), - reinterpret_cast<uLongf *>(&unzipped_data_size), - reinterpret_cast<Bytef *>(zipped_data), - static_cast<uLong>(bgen_P)); - assert(result == Z_OK); - - } else { - - bgen_P = 6 * bgen_N; - infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); - } - - geno_mean = 0.0; - n_miss = 0; - geno_var = 0.0; - gsl_vector_set_all(geno_miss, 0); - - for (size_t i = 0; i < bgen_N; ++i) { - - bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; - bgen_geno_prob_AB = - static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; - bgen_geno_prob_BB = - static_cast<double>(unzipped_data[i * 3 + 2]) / 32768.0; - // WJA - bgen_geno_prob_non_miss = - bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; - if (bgen_geno_prob_non_miss < 0.9) { - gsl_vector_set(geno_miss, i, 0.0); - n_miss++; - } else { - - bgen_geno_prob_AA /= bgen_geno_prob_non_miss; - bgen_geno_prob_AB /= bgen_geno_prob_non_miss; - bgen_geno_prob_BB /= bgen_geno_prob_non_miss; - - genotype = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; - - gsl_vector_set(geno, i, genotype); - gsl_vector_set(geno_miss, i, 1.0); - geno_mean += genotype; - geno_var += genotype * genotype; - } - } - - geno_mean /= (double)(ni_total - n_miss); - geno_var += geno_mean * geno_mean * (double)n_miss; - geno_var /= (double)ni_total; - geno_var -= geno_mean * geno_mean; - - for (size_t i = 0; i < ni_total; ++i) { - if (gsl_vector_get(geno_miss, i) == 0) { - gsl_vector_set(geno, i, geno_mean); - } - } - - gsl_vector_add_constant(geno, -1.0 * geno_mean); - - if (geno_var != 0) { - if (k_mode == 1) { - gsl_blas_dsyr(CblasUpper, 1.0, geno, matrix_kin); - } else if (k_mode == 2) { - gsl_blas_dsyr(CblasUpper, 1.0 / geno_var, geno, matrix_kin); - } else { - cout << "Unknown kinship mode." << endl; - } - } - - ns_test++; - } - cout << endl; - - gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test); - - for (size_t i = 0; i < ni_total; ++i) { - for (size_t j = 0; j < i; ++j) { - d = gsl_matrix_get(matrix_kin, j, i); - gsl_matrix_set(matrix_kin, i, j, d); - } - } - - gsl_vector_free(geno); - gsl_vector_free(geno_miss); - - infile.close(); - infile.clear(); - - return true; -} - // Read header to determine which column contains which item. bool ReadHeader_io(const string &line, HEADER &header) { debug_msg("entered"); @@ -3314,7 +2527,7 @@ bool ReadFile_cat(const string &file_cat, map<string, size_t> &mapRS2cat, // Read header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); // Use the header to count the number of categories. @@ -3340,10 +2553,11 @@ bool ReadFile_cat(const string &file_cat, map<string, size_t> &mapRS2cat, // Read the following lines to record mapRS2cat. while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); i_cat = 0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } else if (header.chr_col != 0 && header.chr_col == i + 1) { @@ -3436,13 +2650,13 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, double d, geno_mean, geno_var; size_t ni_test = matrix_kin->size1; - gsl_vector *geno = gsl_vector_alloc(ni_test); - gsl_vector *geno_miss = gsl_vector_alloc(ni_test); + gsl_vector *geno = gsl_vector_safe_alloc(ni_test); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_test); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -3459,21 +2673,21 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, // Create a large matrix. const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(ni_test, msize * n_vc); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_test, msize * n_vc); gsl_matrix_set_zero(Xlarge); size_t ns_test = 0; for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) continue; - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); rs = snpInfo[t].rs_number; // This line is new. @@ -3487,7 +2701,7 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, if (indicator_idv[i] == 0) { continue; } - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(geno_miss, i, 0); n_miss++; @@ -3536,7 +2750,7 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, ns_vec[0]++; if (ns_vec[0] % msize == 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } else if (mapRS2cat.count(rs) != 0) { @@ -3553,7 +2767,7 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); gsl_matrix_view kin_sub = gsl_matrix_submatrix( matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - eigenlib_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + fast_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); gsl_matrix_set_zero(&X_sub.matrix); @@ -3569,7 +2783,7 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); gsl_matrix_view kin_sub = gsl_matrix_submatrix(matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - eigenlib_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + fast_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); } } @@ -3628,12 +2842,12 @@ bool PlinkKin(const string &file_bed, const int display_pace, size_t ni_test = matrix_kin->size1; size_t ni_total = indicator_idv.size(); - gsl_vector *geno = gsl_vector_alloc(ni_test); + gsl_vector *geno = gsl_vector_safe_alloc(ni_test); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -3653,7 +2867,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, // Create a large matrix. const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(ni_test, msize * n_vc); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_test, msize * n_vc); gsl_matrix_set_zero(Xlarge); // Calculate n_bit and c, the number of bit for each SNP. @@ -3671,7 +2885,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, for (size_t t = 0; t < indicator_snp.size(); ++t) { if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) { continue; @@ -3762,7 +2976,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, ns_vec[0]++; if (ns_vec[0] % msize == 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } else if (mapRS2cat.count(rs) != 0) { @@ -3779,7 +2993,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); gsl_matrix_view kin_sub = gsl_matrix_submatrix( matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - eigenlib_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + fast_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); gsl_matrix_set_zero(&X_sub.matrix); @@ -3795,7 +3009,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); gsl_matrix_view kin_sub = gsl_matrix_submatrix(matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - eigenlib_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + fast_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); } } @@ -3852,8 +3066,8 @@ bool MFILEKin(const size_t mfile_mode, const string &file_mfile, string file_name; - gsl_matrix *kin_tmp = gsl_matrix_alloc(matrix_kin->size1, matrix_kin->size2); - gsl_vector *ns_tmp = gsl_vector_alloc(vector_ns->size); + gsl_matrix *kin_tmp = gsl_matrix_safe_alloc(matrix_kin->size1, matrix_kin->size2); + gsl_vector *ns_tmp = gsl_vector_safe_alloc(vector_ns->size); size_t l = 0; double d; @@ -3929,9 +3143,9 @@ bool ReadFile_wsnp(const string &file_wsnp, map<string, double> &mapRS2weight) { double weight; while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); weight = atof(ch_ptr); mapRS2weight[rs] = weight; } @@ -3960,17 +3174,18 @@ bool ReadFile_wsnp(const string &file_wcat, const size_t n_vc, // Read header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); while (!safeGetline(infile, line).eof()) { if (isBlankLine(line)) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); size_t t = 0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } else if (header.chr_col != 0 && header.chr_col == i + 1) { @@ -4052,7 +3267,7 @@ void ReadFile_beta(const string &file_beta, // Read header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); if (header.n_col == 0) { @@ -4074,7 +3289,7 @@ void ReadFile_beta(const string &file_beta, if (isBlankLine(line)) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); z = 0; beta = 0; @@ -4089,6 +3304,7 @@ void ReadFile_beta(const string &file_beta, af = 0; var_x = 0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } @@ -4234,7 +3450,7 @@ void ReadFile_beta(const string &file_beta, const map<string, double> &mapRS2wA, // Read header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); if (header.n_col == 0) { @@ -4255,7 +3471,7 @@ void ReadFile_beta(const string &file_beta, const map<string, double> &mapRS2wA, if (isBlankLine(line)) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); z = 0; beta = 0; @@ -4270,6 +3486,7 @@ void ReadFile_beta(const string &file_beta, const map<string, double> &mapRS2wA, af = 0; var_x = 0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } @@ -4540,8 +3757,8 @@ void ReadFile_vector(const string &file_vec, gsl_vector *vec) { char *ch_ptr; for (size_t i = 0; i < vec->size; i++) { - !safeGetline(infile, line).eof(); - ch_ptr = strtok((char *)line.c_str(), " , \t"); + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); gsl_vector_set(vec, i, atof(ch_ptr)); } @@ -4563,9 +3780,10 @@ void ReadFile_matrix(const string &file_mat, gsl_matrix *mat) { char *ch_ptr; for (size_t i = 0; i < mat->size1; i++) { - !safeGetline(infile, line).eof(); - ch_ptr = strtok((char *)line.c_str(), " , \t"); + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (size_t j = 0; j < mat->size2; j++) { + enforce(ch_ptr); gsl_matrix_set(mat, i, j, atof(ch_ptr)); ch_ptr = strtok(NULL, " , \t"); } @@ -4590,18 +3808,20 @@ void ReadFile_matrix(const string &file_mat, gsl_matrix *mat1, char *ch_ptr; for (size_t i = 0; i < mat1->size1; i++) { - !safeGetline(infile, line).eof(); - ch_ptr = strtok((char *)line.c_str(), " , \t"); + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (size_t j = 0; j < mat1->size2; j++) { + enforce(ch_ptr); gsl_matrix_set(mat1, i, j, atof(ch_ptr)); ch_ptr = strtok(NULL, " , \t"); } } for (size_t i = 0; i < mat2->size1; i++) { - !safeGetline(infile, line).eof(); - ch_ptr = strtok((char *)line.c_str(), " , \t"); + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (size_t j = 0; j < mat2->size2; j++) { + enforce(ch_ptr); gsl_matrix_set(mat2, i, j, atof(ch_ptr)); ch_ptr = strtok(NULL, " , \t"); } @@ -4621,7 +3841,7 @@ void ReadFile_study(const string &file_study, gsl_matrix *Vq_mat, string sfile = file_study + ".size.txt"; string qfile = file_study + ".q.txt"; - gsl_vector *s = gsl_vector_alloc(s_vec->size + 1); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); ReadFile_matrix(Vqfile, Vq_mat); ReadFile_vector(sfile, s); @@ -4646,7 +3866,7 @@ void ReadFile_ref(const string &file_ref, gsl_matrix *S_mat, string sfile = file_ref + ".size.txt"; string Sfile = file_ref + ".S.txt"; - gsl_vector *s = gsl_vector_alloc(s_vec->size + 1); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); ReadFile_vector(sfile, s); ReadFile_matrix(Sfile, S_mat, Svar_mat); @@ -4672,9 +3892,9 @@ void ReadFile_mstudy(const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector_set_zero(s_vec); ni = 0; - gsl_matrix *Vq_sub = gsl_matrix_alloc(Vq_mat->size1, Vq_mat->size2); - gsl_vector *q_sub = gsl_vector_alloc(q_vec->size); - gsl_vector *s = gsl_vector_alloc(s_vec->size + 1); + gsl_matrix *Vq_sub = gsl_matrix_safe_alloc(Vq_mat->size1, Vq_mat->size2); + gsl_vector *q_sub = gsl_vector_safe_alloc(q_vec->size); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); igzstream infile(file_mstudy.c_str(), igzstream::in); if (!infile) { @@ -4763,9 +3983,9 @@ void ReadFile_mref(const string &file_mref, gsl_matrix *S_mat, gsl_vector_set_zero(s_vec); ni = 0; - gsl_matrix *S_sub = gsl_matrix_alloc(S_mat->size1, S_mat->size2); - gsl_matrix *Svar_sub = gsl_matrix_alloc(Svar_mat->size1, Svar_mat->size2); - gsl_vector *s = gsl_vector_alloc(s_vec->size + 1); + gsl_matrix *S_sub = gsl_matrix_safe_alloc(S_mat->size1, S_mat->size2); + gsl_matrix *Svar_sub = gsl_matrix_safe_alloc(Svar_mat->size1, Svar_mat->size2); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); igzstream infile(file_mref.c_str(), igzstream::in); if (!infile) { |