diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/checkpoint.h | 2 | ||||
| -rw-r--r-- | src/debug.cpp | 4 | ||||
| -rw-r--r-- | src/debug.h | 11 | ||||
| -rw-r--r-- | src/gemma.cpp | 128 | ||||
| -rw-r--r-- | src/gemma.h | 1 | ||||
| -rw-r--r-- | src/gemma_io.cpp | 410 | ||||
| -rw-r--r-- | src/gemma_io.h | 14 | ||||
| -rw-r--r-- | src/lmm.cpp | 484 | ||||
| -rw-r--r-- | src/lmm.h | 49 | ||||
| -rw-r--r-- | src/mathfunc.cpp | 33 | ||||
| -rw-r--r-- | src/mathfunc.h | 5 | ||||
| -rw-r--r-- | src/param.cpp | 176 | ||||
| -rw-r--r-- | src/param.h | 12 | ||||
| -rw-r--r-- | src/vc.cpp | 2 |
14 files changed, 1002 insertions, 329 deletions
diff --git a/src/checkpoint.h b/src/checkpoint.h index ae46361..05fd836 100644 --- a/src/checkpoint.h +++ b/src/checkpoint.h @@ -1,7 +1,7 @@ /* Checkpoints for pangemma propagators - Copyright © 2015, Pjotr Prins + Copyright © 2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/debug.cpp b/src/debug.cpp index b26e173..6cefcc7 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -162,6 +162,8 @@ void disable_segfpe() { } */ +#ifndef NDEBUG + void write(const char *s, const char *msg) { if (!is_debug_data_mode() && !is_debug_dump_mode()) return; ofstream out(debug_dump_path + "debug-dump-" + msg + ".txt"); @@ -232,6 +234,8 @@ void write(const gsl_matrix *m, const char *msg) { cout << "}" << endl; } +#endif // NDEBUG + /* Helper function to make sure gsl allocations do their job because gsl_matrix_alloc does not initiatize values (behaviour that changed diff --git a/src/debug.h b/src/debug.h index 0489a81..a32bfd2 100644 --- a/src/debug.h +++ b/src/debug.h @@ -60,11 +60,22 @@ void disable_segfpe(); { auto x = m * n; \ enforce_msg(x / m == n, "multiply integer overflow"); } +#ifndef NDEBUG + void write(const double d, const char *msg = ""); void write(const char *s, const char *msg = ""); void write(const gsl_vector *v, const char *msg = ""); void write(const gsl_matrix *m, const char *msg = ""); +#else // NDEBUG + +inline void write(const double d, const char *msg = "") {}; +inline void write(const char *s, const char *msg = "") {}; +inline void write(const gsl_vector *v, const char *msg = "") {}; +inline void write(const gsl_matrix *m, const char *msg = "") {}; + +#endif // NDEBUG + gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols); int gsl_matrix_safe_memcpy (gsl_matrix *dest, const gsl_matrix *src); void gsl_matrix_safe_free (gsl_matrix *v); diff --git a/src/gemma.cpp b/src/gemma.cpp index b30c3c3..a39d67a 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -87,7 +87,7 @@ using namespace gemmaguile; void GEMMA::PrintHeader(void) { cout << - "Pangemma --- GEMMA 0.98.5 compatible executable " << version << " (" << date << ") with guile " << global_guile_version() << " by Xiang Zhou, Pjotr Prins and team (C) 2012-" << year << endl ; + "Pangemma " << version << " --- GEMMA 0.98.5 compatible executable (" << date << ") with" << OPENBLAS_VERSION << "and guile " << global_guile_version() << endl << "by Xiang Zhou, Pjotr Prins and team (C) 2012-" << year << endl ; return; } @@ -1911,32 +1911,52 @@ void GEMMA::BatchRun(PARAM &cPar) { } // Generate Kinship matrix (optionally using LOCO) - if (cPar.a_mode == M_KIN || cPar.a_mode == M_KIN2) { + // if (cPar.a_mode == M_KIN || cPar.a_mode == M_KIN2) { + if (is_kinship_compute(cPar.a_mode)) { cout << "Calculating Relatedness Matrix ... " << endl; - gsl_matrix *G = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total); - enforce_msg(G, "allocate G"); // just to be sure + if (cPar.is_mdb) { + time_start = clock(); + auto K = cPar.MdbCalcKin(); + cPar.time_G = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + if (cPar.a_mode == M_KIN) { + cPar.WriteMatrix(K, "cXX"); + } else { // M_KIN2 + cPar.WriteMatrix(K, "sXX"); + } + + gsl_matrix_safe_free(K); + return; + } + + enforce(cPar.ni_total > 0); + gsl_matrix *K = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total); + enforce_msg(K, "allocate G"); // just to be sure time_start = clock(); - cPar.CalcKin(G); + if (cPar.error == true) { + cout << "error! failed to prepare for calculating relatedness matrix. " << endl; + return; + } + cPar.CalcKin(K); cPar.time_G = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); if (cPar.error == true) { - cout << "error! fail to calculate relatedness matrix. " << endl; + cout << "error! failed to calculate relatedness matrix. " << endl; return; } // Now we have the Kinship matrix test it - validate_K(G); + validate_K(K); if (cPar.a_mode == M_KIN) { - cPar.WriteMatrix(G, "cXX"); + cPar.WriteMatrix(K, "cXX"); } else { // M_KIN2 - cPar.WriteMatrix(G, "sXX"); + cPar.WriteMatrix(K, "sXX"); } - gsl_matrix_safe_free(G); + gsl_matrix_safe_free(K); } // Compute the LDSC weights (not implemented yet) @@ -2572,19 +2592,23 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.a_mode == M_LMM4 || cPar.a_mode == M_LMM5 || cPar.a_mode == M_LMM9 || cPar.a_mode == M_EIGEN) { // Fit LMM or mvLMM or eigen write(cPar.a_mode, "Mode"); - gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph); // Y is ind x phenotypes + auto n_ph = cPar.n_ph; // # of trait values + auto ni_test = cPar.ni_test; // # of individuals to test + enforce(n_ph > 0); + enforce(ni_test > 0); + gsl_matrix *Y = gsl_matrix_safe_alloc(ni_test, n_ph); // Y is ind x phenotypes enforce_msg(Y, "allocate Y"); // just to be sure - gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt); // W is ind x covariates - gsl_matrix *B = gsl_matrix_safe_alloc(Y->size2, W->size2); // B is a d by c (effect size) - // matrix - gsl_matrix *se_B = gsl_matrix_safe_alloc(Y->size2, W->size2); // se - gsl_matrix *K = gsl_matrix_safe_alloc(Y->size1, Y->size1); // Kinship aka as G/GRM in the equation ind x ind - gsl_matrix *U = gsl_matrix_safe_alloc(Y->size1, Y->size1); - gsl_matrix *UtW = gsl_matrix_calloc(Y->size1, W->size2); // Transformed ind x covariates - gsl_matrix *UtY = gsl_matrix_calloc(Y->size1, Y->size2); // Transforemd ind x phenotypes - gsl_vector *eval = gsl_vector_calloc(Y->size1); // eigen values - gsl_vector *env = gsl_vector_safe_alloc(Y->size1); // E environment - gsl_vector *weight = gsl_vector_safe_alloc(Y->size1); // weights + gsl_matrix *W = gsl_matrix_safe_alloc(ni_test, cPar.n_cvt); // W is ind x covariates + gsl_matrix *B = gsl_matrix_safe_alloc(n_ph, W->size2); // B is a d by c (effect size) + gsl_matrix *se_B = gsl_matrix_safe_alloc(n_ph, W->size2); // se + enforce(ni_test > 0); + gsl_matrix *K = gsl_matrix_safe_alloc(ni_test, ni_test); // Kinship aka as G/GRM in the equation ind x ind + gsl_matrix *U = gsl_matrix_safe_alloc(ni_test, ni_test); + gsl_matrix *UtW = gsl_matrix_calloc(ni_test, W->size2); // Transformed ind x covariates + gsl_matrix *UtY = gsl_matrix_calloc(ni_test, n_ph); // Transformed ind x phenotypes + gsl_vector *eval = gsl_vector_calloc(ni_test); // eigen values + gsl_vector *env = gsl_vector_safe_alloc(ni_test); // E environment + gsl_vector *weight = gsl_vector_safe_alloc(ni_test); // weights debug_msg("Started on LMM"); assert_issue(is_issue(26), UtY->data[0] == 0.0); @@ -2818,30 +2842,36 @@ void GEMMA::BatchRun(PARAM &cPar) { gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); // and its transformed write(&Y_col.vector, "Y_col"); - if (!cPar.file_bfile.empty()) { - // PLINK analysis - if (cPar.file_gxe.empty()) { - cLmm.AnalyzePlink(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector, cPar.setGWASnps); - } - else { - cLmm.AnalyzePlinkGXE(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector, env); - } + if (cPar.is_mdb) { + cLmm.mdb_calc_gwa(U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, cPar.loco); + cLmm.CopyToParam(cPar); } else { - // BIMBAM analysis - - if (cPar.file_gxe.empty()) { - cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector, cPar.setGWASnps); - } else { - cLmm.AnalyzeBimbamGXE(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector, env); + if (!cPar.file_bfile.empty()) { + // PLINK analysis + if (cPar.file_gxe.empty()) { + cLmm.AnalyzePlink(U, eval, UtW, &UtY_col.vector, W, + &Y_col.vector, cPar.setGWASnps); + } + else { + cLmm.AnalyzePlinkGXE(U, eval, UtW, &UtY_col.vector, W, + &Y_col.vector, env); + } + } + else { + // BIMBAM analysis + + if (cPar.file_gxe.empty()) { + cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W, + &Y_col.vector, cPar.setGWASnps); + } else { + cLmm.AnalyzeBimbamGXE(U, eval, UtW, &UtY_col.vector, W, + &Y_col.vector, env); + } } + cLmm.WriteFiles(); // write output files + cLmm.CopyToParam(cPar); } - cLmm.WriteFiles(); // write output files - cLmm.CopyToParam(cPar); } else { debug_msg("fit mvLMM (multiple phenotypes)"); MVLMM cMvlmm; @@ -2901,7 +2931,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // run bvsr if rho==1 if (cPar.rho_min == 1 && cPar.rho_max == 1) { // read genotypes X (not UtX) - cPar.ReadGenotypes(UtX, G, false); + cPar.ReadBIMBAMGenotypes(UtX, G, false); // perform BSLMM analysis BSLMM cBslmm; @@ -2919,7 +2949,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // read relatedness matrix G if (!(cPar.file_kin).empty()) { - cPar.ReadGenotypes(UtX, G, false); + cPar.ReadBIMBAMGenotypes(UtX, G, false); // read relatedness matrix G ReadFile_kin(cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, @@ -2933,7 +2963,7 @@ void GEMMA::BatchRun(PARAM &cPar) { CenterMatrix(G); validate_K(G); } else { - cPar.ReadGenotypes(UtX, G, true); + cPar.ReadBIMBAMGenotypes(UtX, G, true); } // eigen-decomposition and calculate trace_G @@ -3011,7 +3041,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // run bvsr if rho==1 if (cPar.rho_min == 1 && cPar.rho_max == 1) { // read genotypes X (not UtX) - cPar.ReadGenotypes(UtX, G, false); + cPar.ReadBIMBAMGenotypes(UtX, G, false); // perform BSLMM analysis BSLMM cBslmm; @@ -3030,7 +3060,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // read relatedness matrix G if (!(cPar.file_kin).empty()) { - cPar.ReadGenotypes(UtX, G, false); + cPar.ReadBIMBAMGenotypes(UtX, G, false); // read relatedness matrix G ReadFile_kin(cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, @@ -3045,7 +3075,7 @@ void GEMMA::BatchRun(PARAM &cPar) { validate_K(G); } else { - cPar.ReadGenotypes(UtX, G, true); + cPar.ReadBIMBAMGenotypes(UtX, G, true); } // eigen-decomposition and calculate trace_G @@ -3171,7 +3201,7 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { } outfile << "##" << endl; - outfile << "## GEMMA Version = " << version << " (" << date << ")" << endl; + outfile << "## PANGEMMA Version = " << version << " (" << date << ")" << endl; // outfile << "## Build profile = " << GEMMA_PROFILE << endl ; #ifdef __GNUC__ outfile << "## GCC version = " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__ << endl; diff --git a/src/gemma.h b/src/gemma.h index b34a958..fa7c24d 100644 --- a/src/gemma.h +++ b/src/gemma.h @@ -64,6 +64,7 @@ public: void Assign(int argc, char **argv, PARAM &cPar); void BatchRun(PARAM &cPar); void WriteLog(int argc, char **argv, PARAM &cPar); + inline bool is_kinship_compute(uint a_mode) { return (a_mode == M_KIN || a_mode == M_KIN2); } }; #endif diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index 7d1f0ca..e630f64 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017-2020, Pjotr Prins + Copyright © 2017-2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -41,6 +41,7 @@ #include "gsl/gsl_linalg.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" +#include <lmdb++.h> #include "checkpoint.h" #include "debug.h" @@ -61,7 +62,7 @@ void ProgressBar(string str, double p, double total, double ratio) { const double progress = (100.0 * p / total); const uint barsize = (int)(progress / 2.0); // characters // cout << barsize << endl; - // cout << str << " "; + cout << str << " "; // cout << p << "/" << total << endl; assert(barsize < 101); // corrupted data somehow if (barsize > 0) { @@ -153,6 +154,7 @@ std::istream &safeGetline(std::istream &is, std::string &t) { // Read SNP file. A single column of SNP names. bool ReadFile_snps(const string file_snps, set<string> &setSnps) { debug_msg("enter ReadFile_snps"); + checkpoint("read-file-snps",file_snps); setSnps.clear(); igzstream infile(file_snps.c_str(), igzstream::in); @@ -182,6 +184,7 @@ bool ReadFile_snps(const string file_snps, set<string> &setSnps) { bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps) { debug_msg("entered"); setSnps.clear(); + checkpoint("read-file-header",file_snps); igzstream infile(file_snps.c_str(), igzstream::in); if (!infile) { @@ -239,6 +242,7 @@ bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps) { // Read log file. bool ReadFile_log(const string &file_log, double &pheno_mean) { debug_msg("ReadFile_log"); + checkpoint("read-file-log",file_log); ifstream infile(file_log.c_str(), ifstream::in); if (!infile) { cout << "error! fail to open log file: " << file_log << endl; @@ -282,6 +286,8 @@ bool ReadFile_anno(const string &file_anno, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM) { debug_msg("ReadFile_anno"); + checkpoint("read-file-anno",file_anno); + mapRS2chr.clear(); mapRS2bp.clear(); @@ -345,6 +351,7 @@ bool ReadFile_anno(const string &file_anno, map<string, string> &mapRS2chr, bool ReadFile_column(const string &file_pheno, vector<int> &indicator_idv, vector<double> &pheno, const int &p_column) { debug_msg("entered"); + checkpoint("read-file-column",file_pheno); indicator_idv.clear(); pheno.clear(); @@ -389,6 +396,7 @@ bool ReadFile_pheno(const string &file_pheno, vector<vector<double>> &pheno, const vector<size_t> &p_column) { debug_msg("entered"); + checkpoint("read-file-pheno",file_pheno); indicator_pheno.clear(); pheno.clear(); @@ -398,35 +406,36 @@ bool ReadFile_pheno(const string &file_pheno, return false; } - string line; - char *ch_ptr; - - string id; + // string id; double p; vector<double> pheno_row; - vector<int> ind_pheno_row; + + string line; + char *ch_ptr; + + vector<int> indicator_pheno_row; size_t p_max = *max_element(p_column.begin(), p_column.end()); map<size_t, size_t> mapP2c; for (size_t i = 0; i < p_column.size(); i++) { mapP2c[p_column[i]] = i; pheno_row.push_back(-9); - ind_pheno_row.push_back(0); + indicator_pheno_row.push_back(0); } while (!safeGetline(infile, line).eof()) { ch_ptr = strtok((char *)line.c_str(), " ,\t"); size_t i = 0; while (i < p_max) { - enforce_msg(ch_ptr,"Number of phenotypes in pheno file do not match phenotypes in geno file"); + enforce_msg(ch_ptr,"Number of phenotypes in pheno file do not match selected columns"); if (mapP2c.count(i + 1) != 0) { if (strcmp(ch_ptr, "NA") == 0) { - ind_pheno_row[mapP2c[i + 1]] = 0; + indicator_pheno_row[mapP2c[i + 1]] = 0; // skip this trait row pheno_row[mapP2c[i + 1]] = -9; } else { p = atof(ch_ptr); - ind_pheno_row[mapP2c[i + 1]] = 1; + indicator_pheno_row[mapP2c[i + 1]] = 1; // use this trait row pheno_row[mapP2c[i + 1]] = p; } } @@ -434,7 +443,7 @@ bool ReadFile_pheno(const string &file_pheno, ch_ptr = strtok(NULL, " ,\t"); } - indicator_pheno.push_back(ind_pheno_row); + indicator_pheno.push_back(indicator_pheno_row); pheno.push_back(pheno_row); } @@ -448,6 +457,7 @@ bool ReadFile_pheno(const string &file_pheno, bool ReadFile_cvt(const string &file_cvt, vector<int> &indicator_cvt, vector<vector<double>> &cvt, size_t &n_cvt) { debug_msg("entered"); + checkpoint("read-file-cvt",file_cvt); indicator_cvt.clear(); ifstream infile(file_cvt.c_str(), ifstream::in); @@ -515,6 +525,7 @@ bool ReadFile_cvt(const string &file_cvt, vector<int> &indicator_cvt, // Read .bim file. bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) { + checkpoint("read-file-bim",file_bim); debug_msg("entered"); snpInfo.clear(); @@ -563,6 +574,7 @@ bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) { bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno, vector<vector<double>> &pheno, map<string, int> &mapID2num, const vector<size_t> &p_column) { + checkpoint("read-file-fam",file_fam); debug_msg("entered"); indicator_pheno.clear(); pheno.clear(); @@ -639,13 +651,14 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno, return true; } + // Read bimbam mean genotype file, the first time, to obtain #SNPs for // analysis (ns_test) and total #SNP (ns_total). /* ## Modified (Mutated) Parameters: 1. **`indicator_idv`** (vector<int>&) - - Actually **not modified** - only read from to determine which individuals to include + - **Not modified** - only read from to determine which individuals to include 2. **`indicator_snp`** (vector<int>&) - **Modified**: Cleared at the start (`indicator_snp.clear()`) @@ -678,14 +691,15 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno, The function returns a `bool` indicating success/failure, but the real outputs are these three modified reference parameters. */ -bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, - const gsl_matrix *W, const vector<int> &indicator_idv, - vector<int> &indicator_snp, const double &maf_level, - const double &miss_level, const double &hwe_level, - const double &r2_level, map<string, string> &mapRS2chr, - map<string, long int> &mapRS2bp, - map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, - size_t &ns_test) { +bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, + const gsl_matrix *W, const vector<int> &indicator_idv, + vector<int> &indicator_snp, const double &maf_level, + const double &miss_level, const double &hwe_level, + const double &r2_level, map<string, string> &mapRS2chr, + map<string, long int> &mapRS2bp, + map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, + size_t &ns_test) { + checkpoint("read-file-geno",file_geno); debug_msg("entered"); indicator_snp.clear(); snpInfo.clear(); @@ -724,9 +738,8 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, double cM; size_t file_pos; - double maf, geno, geno_old; + double geno, geno_old; size_t n_miss; - size_t n_0, n_1, n_2; double min_g = std::numeric_limits<float>::max(); double max_g = std::numeric_limits<float>::min(); @@ -737,6 +750,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, for (int i = 0; i < ni_total; ++i) { ni_test += indicator_idv[i]; } + // assert(ni_test == ni_total); // we use indicator_idv? ns_test = 0; file_pos = 0; @@ -783,15 +797,14 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, } // Start on a new marker/SNP - maf = 0; n_miss = 0; flag_poly = 0; geno_old = -9; - n_0 = 0; - n_1 = 0; - n_2 = 0; c_idv = 0; gsl_vector_set_zero(genotype_miss); + double maf = 0.0; + size_t n_0=0,n_1=0,n_2=0; + auto infilen = file_geno.c_str(); for (int i = 0; i < ni_total; ++i) { // read genotypes ch_ptr = strtok_safe2(NULL, " ,\t",(string("To many individuals/strains/genometypes in pheno file for ")+infilen).c_str()); @@ -808,19 +821,12 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, } geno = atof(ch_ptr); - if (geno >= 0 && geno <= 0.5) { - n_0++; - } - if (geno > 0.5 && geno < 1.5) { - n_1++; - } - if (geno >= 1.5 && geno <= 2.0) { - n_2++; - } gsl_vector_set(genotype, c_idv, geno); - if (geno < min_g) min_g = geno; - if (geno > max_g) max_g = geno; + if (geno < min_g) + min_g = geno; + if (geno > max_g) + max_g = geno; // going through genotypes with 0.0 < geno < 2.0 if (flag_poly == 0) { // first init in marker @@ -831,10 +837,20 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, flag_poly = 1; // genotypes differ } + // compute ratios + if (hwe_level != 0 && maf_level != -1) { + if (geno >= 0 && geno <= 0.5) + n_0++; + if (geno > 0.5 && geno < 1.5) + n_1++; + if (geno >= 1.5 && geno <= 2.0) + n_2++; + } maf += geno; c_idv++; } + maf /= 2.0 * (double)(ni_test - n_miss); SNPINFO sInfo = {chr, rs, @@ -871,8 +887,6 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, continue; } } - - // -r2 flag for (size_t i = 0; i < genotype->size; ++i) { if (gsl_vector_get(genotype_miss, i) == 1) { @@ -880,7 +894,6 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, gsl_vector_set(genotype, i, geno); } } - gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot(genotype, genotype, &v_x); @@ -924,6 +937,7 @@ bool ReadFile_bed(const string &file_bed, const set<string> &setSnps, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) { + checkpoint("read-file-bed",file_bed); debug_msg("entered"); indicator_snp.clear(); size_t ns_total = snpInfo.size(); @@ -1232,6 +1246,7 @@ void Plink_ReadOneSNP(const int pos, const vector<int> &indicator_idv, void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) { + checkpoint("read-file-kin",file_kin); debug_msg("entered"); igzstream infile(file_kin.c_str(), igzstream::in); if (!infile) { @@ -1335,7 +1350,7 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, infile.close(); infile.clear(); - checkpoint("read-kinship-file",file_kin); + checkpoint("end-read-file-kin",file_kin); return; } @@ -1461,6 +1476,197 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { return; } + +// Read bimbam mean genotype file and calculate kinship matrix. +gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco) { + checkpoint("mdb-kin",file_geno); + bool is_loco = !loco.empty(); + + // Convert loco string to what we use in the chrpos index + uint8_t loco_chr = 0; + if (is_loco) { + if (loco == "X") { + loco_chr = CHR_X; + } else if (loco == "Y") { + loco_chr = CHR_Y; + } else if (loco == "M") { + loco_chr = CHR_M; + } else { + try { + loco_chr = static_cast<uint8_t>(stoi(loco)); + } catch (...) { + loco_chr = 0; + } + } + } + + /* Create and open the LMDB environment: */ + auto env = lmdb::env::create(); + + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */ + env.set_max_dbs(10); + env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + string format; + + size_t ni_total = 0, num_markers = 0; + { + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto info = lmdb::dbi::open(rtxn, "info"); + + string_view meta; + if (info.get(rtxn, "meta", meta)) { + cout << meta << endl; + } else { + cout << "meta not found!" << endl; + } + std::string_view view_int; + info.get(rtxn, "numsamples", view_int); + ni_total = lmdb::from_sv<size_t>(view_int); + string_view info_key,info_value; + info.get(rtxn,"format",info_value); + format = string(info_value); + + MDB_stat stat; + mdb_stat(rtxn, info, &stat); + assert(stat.ms_entries == 5); + } + + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); + + MDB_stat stat; + mdb_stat(rtxn, geno_mdb, &stat); + // cout << "Number of records: " << stat.ms_entries << endl; + num_markers = stat.ms_entries; + + gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(ni_total,ni_total); + gsl_matrix_set_zero(matrix_kin); + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); + + // Xlarge contains inds x markers + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE); + enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE"); + if (is_check_mode()) + gsl_matrix_set_zero(Xlarge); + + // [ ] check XLarge is zeroed properly + // [ ] handle missing data + + // For every marker read the genotype x inds + size_t ns_test = 0; + size_t display_pace = 1000; + + cout << "## number of total individuals = " << ni_total << endl; + cout << "## number of analyzed individuals = " << ni_total << endl; + cout << "## number of analyzed SNPs/var = " << num_markers << endl; + + auto cursor = lmdb::cursor::open(rtxn, geno_mdb); + + auto mdb_fetch = MDB_FIRST; + for (size_t t = 0; t < num_markers; ++t) { + string line; + if (t % display_pace == 0) { + ProgressBar("Reading SNPs", t, num_markers - 1); + } + string_view key, value; + cursor.get(key, value, mdb_fetch); + mdb_fetch = MDB_NEXT; + + const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data()); + auto chr = static_cast<uint8_t>(data[1]); + + if (is_loco && loco_chr == chr) + continue; + + // ---- Depending on the format we get different buffers - currently float and byte are supported: + vector<double> gs; + if (format == "Gb") { + size_t num_bytes = value.size() / sizeof(uint8_t); + assert(num_bytes == ni_total); + auto size = num_bytes; + const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + double g = static_cast<double>(gs_bbuf[i])/127.0; + gs.push_back(g); + } + } else { + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + auto size = num_floats; + const float* gs_fbuf = reinterpret_cast<const float*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + gs.push_back(static_cast<double>(gs_fbuf[i])); + } + } + + size_t n_miss = 0; + double geno_mean = 0.0; + double geno_var = 0.0; + gsl_vector_set_all(geno_miss, 0); + + for (size_t i = 0; i < ni_total; ++i) { + double d = gs[i]; + gsl_vector_set(geno, i, d); + gsl_vector_set(geno_miss, i, 1); + geno_mean += d; + geno_var += d * d; + } + + geno_mean /= (double)(ni_total - n_miss); + geno_var += geno_mean * geno_mean * (double)n_miss; + geno_var /= (double)ni_total; + geno_var -= geno_mean * geno_mean; + + // impute missing values by plugging in the mean + for (size_t i = 0; i < ni_total; ++i) { + if (gsl_vector_get(geno_miss, i) == 0) { + gsl_vector_set(geno, i, geno_mean); + } + } + + // subtract the mean (centering genotype values) + gsl_vector_add_constant(geno, -1.0 * geno_mean); + + // scale the genotypes + if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2 + // flag does this + gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); + } + + // set the SNP column ns_test to copy geno into the compute matrix Xlarge + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % K_BATCH_SIZE); + enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); + + ns_test++; + + // Every K_BATCH_SIZE rows batch compute kinship matrix and return + // by adding to matrix_kin + if (ns_test % K_BATCH_SIZE == 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + write(matrix_kin,"K updated"); + } + } + if (ns_test % K_BATCH_SIZE != 0) { // compute last batch + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + write(matrix_kin,"K updated"); + } + ProgressBar("Reading SNPs", num_markers, num_markers); + cout << endl; + + enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test)); + + gsl_vector_free(geno); + gsl_vector_free(geno_miss); + gsl_matrix_free(Xlarge); + + return matrix_kin; +} + // Read bimbam mean genotype file and calculate kinship matrix. bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, @@ -1470,7 +1676,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, auto infilen = file_geno.c_str(); igzstream infile(infilen, igzstream::in); enforce_msg(infilen, "error reading genotype file"); - checkpoint("read-geno-file",file_geno); + checkpoint("bimbam-kin",file_geno); size_t n_miss; double geno_mean, geno_var; @@ -1792,6 +1998,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { + checkpoint("read-file-geno",file_geno); debug_msg("entered"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { @@ -1893,121 +2100,12 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, return true; } -// Compact version of the above function, using uchar instead of -// gsl_matrix. -bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, - vector<int> &indicator_snp, - vector<vector<unsigned char>> &Xt, gsl_matrix *K, - const bool calc_K, const size_t ni_test, - const size_t ns_test) { - debug_msg("entered"); - igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return false; - } - - Xt.clear(); - vector<unsigned char> Xt_row; - for (size_t i = 0; i < ni_test; i++) { - Xt_row.push_back(0); - } - - string line; - char *ch_ptr; - - if (calc_K == true) { - gsl_matrix_set_zero(K); - } - - gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); - gsl_vector *genotype_miss = gsl_vector_safe_alloc(ni_test); - double geno, geno_mean; - size_t n_miss; - - size_t ni_total = indicator_idv.size(); - size_t ns_total = indicator_snp.size(); - - size_t c_idv = 0, c_snp = 0; - - auto infilen = file_geno.c_str(); - for (size_t i = 0; i < ns_total; ++i) { - safeGetline(infile, line).eof(); - if (indicator_snp[i] == 0) { - continue; - } - - ch_ptr = strtok_safe2((char *)line.c_str(), " ,\t",infilen); - ch_ptr = strtok_safe2(NULL, " ,\t",infilen); - ch_ptr = strtok_safe2(NULL, " ,\t",infilen); - - c_idv = 0; - geno_mean = 0; - n_miss = 0; - gsl_vector_set_zero(genotype_miss); - for (uint j = 0; j < ni_total; ++j) { - ch_ptr = strtok_safe2(NULL, " ,\t",infilen); - if (indicator_idv[j] == 0) { - continue; - } - - if (strcmp(ch_ptr, "NA") == 0) { - gsl_vector_set(genotype_miss, c_idv, 1); - n_miss++; - } else { - geno = atof(ch_ptr); - gsl_vector_set(genotype, c_idv, geno); - geno_mean += geno; - } - c_idv++; - } - - geno_mean /= (double)(ni_test - n_miss); - - for (size_t j = 0; j < genotype->size; ++j) { - if (gsl_vector_get(genotype_miss, j) == 1) { - geno = geno_mean; - } else { - geno = gsl_vector_get(genotype, j); - } - - Xt_row[j] = Double02ToUchar(geno); - gsl_vector_set(genotype, j, (geno - geno_mean)); - } - Xt.push_back(Xt_row); - - if (calc_K == true) { - gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); - } - - c_snp++; - } - - if (calc_K == true) { - gsl_matrix_scale(K, 1.0 / (double)ns_test); - - for (size_t i = 0; i < genotype->size; ++i) { - for (size_t j = 0; j < i; ++j) { - geno = gsl_matrix_get(K, j, i); - gsl_matrix_set(K, i, j, geno); - } - } - } - - gsl_vector_free(genotype); - gsl_vector_free(genotype_miss); - - infile.clear(); - infile.close(); - - return true; -} - // Read bimbam mean genotype file, the second time, recode "mean" // genotype and calculate K. bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { + checkpoint("read-file-bed",file_bed); debug_msg("entered"); ifstream infile(file_bed.c_str(), ios::binary); if (!infile) { @@ -2141,6 +2239,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test) { + checkpoint("read-file-bed",file_bed); debug_msg("entered"); ifstream infile(file_bed.c_str(), ios::binary); if (!infile) { @@ -2277,6 +2376,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, bool ReadFile_est(const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est) { + checkpoint("read-file-est",file_est); debug_msg("entered"); mapRS2est.clear(); @@ -2344,7 +2444,8 @@ bool ReadFile_est(const string &file_est, const vector<size_t> &est_column, } bool CountFileLines(const string &file_input, size_t &n_lines) { - debug_msg("entered"); + checkpoint("count-lines",file_input); + igzstream infile(file_input.c_str(), igzstream::in); if (!infile) { cout << "error! fail to open file: " << file_input << endl; @@ -2361,6 +2462,7 @@ bool CountFileLines(const string &file_input, size_t &n_lines) { // Read gene expression file. bool ReadFile_gene(const string &file_gene, vector<double> &vec_read, vector<SNPINFO> &snpInfo, size_t &ng_total) { + checkpoint("read-file-gene",file_gene); debug_msg("entered"); vec_read.clear(); ng_total = 0; diff --git a/src/gemma_io.h b/src/gemma_io.h index 5c14227..06cd0ee 100644 --- a/src/gemma_io.h +++ b/src/gemma_io.h @@ -26,12 +26,17 @@ #include <algorithm> #include <map> #include <vector> +#include <cstdint> #include "gzstream.h" #include "param.h" #define tab(col) ( col ? "\t" : "") +constexpr uint8_t CHR_X = 'X'; +constexpr uint8_t CHR_Y = 'Y'; +constexpr uint8_t CHR_M = 'M'; + using namespace std; void ProgressBar(string str, double p, double total, double ratio = -1.0); @@ -59,7 +64,7 @@ bool ReadFile_pheno(const string &file_pheno, bool ReadFile_column(const string &file_pheno, vector<int> &indicator_idv, vector<double> &pheno, const int &p_column); -bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, +bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, const gsl_matrix *W, const vector<int> &indicator_idv, vector<int> &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, @@ -87,6 +92,8 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv, void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); +gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco); + bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin, @@ -100,11 +107,6 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); -bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, - vector<int> &indicator_snp, - vector<vector<unsigned char>> &Xt, gsl_matrix *K, - const bool calc_K, const size_t ni_test, - const size_t ns_test); bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, diff --git a/src/lmm.cpp b/src/lmm.cpp index 87b9e1e..2b730f3 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017-2022 Pjotr Prins + Copyright © 2017-2025 Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,6 +40,9 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_roots.h" #include "gsl/gsl_vector.h" +#include <lmdb++.h> +// #include <lmdb.h> +#include <sys/mman.h> #include "gzstream.h" #include "gemma.h" @@ -55,6 +58,7 @@ using namespace std; void LMM::CopyFromParam(PARAM &cPar) { + checkpoint_nofile("lmm-copy-from-param"); a_mode = cPar.a_mode; d_pace = cPar.d_pace; @@ -91,10 +95,12 @@ void LMM::CopyFromParam(PARAM &cPar) { } void LMM::CopyToParam(PARAM &cPar) { + checkpoint_nofile("lmm-copy-to-param"); cPar.time_UtX = time_UtX; cPar.time_opt = time_opt; - - cPar.ng_test = ng_test; + cPar.ns_total = ns_total; + cPar.ns_test = ns_test; + cPar.ng_test = ng_test; // number of markers tested return; } @@ -105,10 +111,11 @@ void LMM::WriteFiles() { file_str = path_out + "/" + file_out; file_str += ".assoc.txt"; + checkpoint("lmm-write-files",file_str); ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { - cout << "error writing file: " << file_str.c_str() << endl; + cout << "error writing file: " << file_str << endl; return; } @@ -148,7 +155,7 @@ void LMM::WriteFiles() { outfile << scientific << setprecision(6); if (a_mode != M_LMM2) { - outfile << st.beta << "\t"; + outfile << scientific << st.beta << "\t"; outfile << st.se << "\t"; } @@ -202,21 +209,29 @@ void LMM::WriteFiles() { common_header(); - size_t t = 0; - for (size_t i = 0; i < snpInfo.size(); ++i) { - if (indicator_snp[i] == 0) - continue; - auto snp = snpInfo[i].rs_number; - if (process_gwasnps && setGWASnps.count(snp) == 0) - continue; - // cout << t << endl; - outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" - << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" - << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t" - << fixed << setprecision(3) << snpInfo[i].maf << "\t"; - - sumstats(sumStat[t]); - t++; + if (snpInfo.size()) { + size_t t = 0; + for (size_t i = 0; i < snpInfo.size(); ++i) { + if (indicator_snp[i] == 0) + continue; + auto snp = snpInfo[i].rs_number; + if (process_gwasnps && setGWASnps.count(snp) == 0) + continue; + // cout << t << endl; + outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" + << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" + << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t" + << fixed << setprecision(3) << snpInfo[i].maf << "\t"; + + sumstats(sumStat[t]); + t++; + } + } + else + { + for (auto &s : sumStat) { + sumstats(s); + } } } @@ -1662,12 +1677,9 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, const set<string> gwasnps) { clock_t time_start = clock(); - write(W, "W"); - write(y, "y"); + checkpoint_nofile("start-lmm-analyze"); // Subset/LOCO support bool process_gwasnps = gwasnps.size(); - if (process_gwasnps) - debug_msg("Analyze subset of SNPs (LOCO)"); // Calculate basic quantities. size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; @@ -1704,6 +1716,7 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, auto batch_compute = [&](size_t l) { // using a C++ closure // Compute SNPs in batch, note the computations are independent per SNP // debug_msg("enter batch_compute"); + checkpoint_nofile("\nstart-batch_compute"); gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); gsl_matrix_view UtXlarge_sub = gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); @@ -1763,6 +1776,7 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, sumStat.push_back(SNPs); } // debug_msg("exit batch_compute"); + checkpoint_nofile("end-batch_compute"); }; const auto num_snps = indicator_snp.size(); @@ -1844,9 +1858,10 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } batch_compute(c % msize); - ProgressBar("Reading SNPs", num_snps - 1, num_snps - 1); + ProgressBar("Reading SNPS", num_snps - 1, num_snps - 1); // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl; cout << endl; + checkpoint_nofile("end-lmm-analyze"); gsl_vector_safe_free(x); gsl_vector_safe_free(x_miss); @@ -1860,6 +1875,425 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } /* + This is the mirror function of LMM::Analyze and AnalyzeBimbam, but uses mdb input instead. + */ + + + +void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + size_t num_markers) { + vector<SUMSTAT2> sumstat2; + clock_t time_start = clock(); + + checkpoint_nofile("start-lmm-mdb-analyze"); + + // Calculate basic quantities. + size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; + + const size_t inds = U->size1; + enforce(inds == ni_test); + assert(inds > 0); + + gsl_vector *x = gsl_vector_safe_alloc(inds); // #inds + gsl_vector *x_miss = gsl_vector_safe_alloc(inds); + assert(ni_test == U->size2); + assert(ni_test > 0); + assert(ni_total > 0); + assert(n_index > 0); + gsl_vector *Utx = gsl_vector_safe_alloc(ni_test); + gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); + + const size_t msize = LMM_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(inds, msize); + gsl_matrix *UtXlarge = gsl_matrix_safe_alloc(inds, msize); + enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure + enforce(Xlarge->size1 == inds); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero(Uab); + CalcUab(UtW, Uty, Uab); + + // start reading genotypes and analyze + size_t c = 0; + + /* + batch_compute(l) takes l x SNPs that have been loaded into Xlarge, + transforms them all at once using the eigenvector matrix U, then + loops through each transformed SNP to compute association + statistics (beta, standard errors, p-values) and stores results in + sumStat. + */ + auto batch_compute = [&](size_t l, const Markers &markers) { // using a C++ closure + // Compute SNPs in batch, note the computations are independent per SNP + debug_msg("enter batch_compute"); + assert(l>0); + assert(inds>0); + gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); + gsl_matrix_view UtXlarge_sub = + gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); + + time_start = clock(); + // Transforms all l SNPs in the batch at once: UtXlarge = U^T × Xlarge + // This is much faster than doing l separate matrix-vector products + // U is the eigenvector matrix from the spectral decomposition of the kinship matrix + fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, + &UtXlarge_sub.matrix); + time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + gsl_matrix_set_zero(Xlarge); + for (size_t i = 0; i < l; i++) { + // for each snp batch item extract transformed genotype: + gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); + gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector); + + // Calculate design matrix components and compute sufficient statistics for the regression model + CalcUab(UtW, Uty, Utx, Uab); + + time_start = clock(); + FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; + + double lambda_mle = 0.0, lambda_remle = 0.0, beta = 0.0, se = 0.0, p_wald = 0.0; + double p_lrt = 0.0, p_score = 0.0; + double logl_H1 = 0.0; + + // Run statistical tests based on analysis mode + // 3 is before 1. + if (a_mode == M_LMM3 || a_mode == M_LMM4 || a_mode == M_LMM9 ) { + CalcRLScore(l_mle_null, param1, beta, se, p_score); + } + + // Computes Wald statistic for testing association + if (a_mode == M_LMM1 || a_mode == M_LMM4) { + // for univariate a_mode is 1 + // Estimates variance component (lambda) via REML + CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald(lambda_remle, param1, beta, se, p_wald); + } + + // Estimates variance component (lambda) via REML + // Likelihood Ratio Test (modes 2, 4, 9): + // Estimates variance component via MLE + // Compares log-likelihood under alternative vs null hypothesis + if (a_mode == M_LMM2 || a_mode == M_LMM4 || a_mode == M_LMM9) { + CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); + } + + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + auto markerinfo = markers[i]; + // Store summary data. + SUMSTAT2 st = {markerinfo, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score, logl_H1}; + sumstat2.push_back(st); + } + }; + + /* + const auto num_markers = indicator_snp.size(); + enforce_msg(num_markers > 0,"Zero SNPs to process - data corrupt?"); + if (num_markers < 50) { + cerr << num_markers << " SNPs" << endl; + warning_msg("very few SNPs processed"); + } + */ + const size_t progress_step = (num_markers/50>d_pace ? num_markers/50 : d_pace); + Markers markers; + + assert(num_markers > 0); + for (size_t t = 0; t < num_markers; ++t) { + // for (size_t t = 0; t < 2; ++t) { + if (t % progress_step == 0 || t == (num_markers - 1)) { + ProgressBar("Reading markers", t, num_markers - 1); + } + // if (indicator_snp[t] == 0) + // continue; + + auto tup = fetch_snp(t); // use the callback + auto state = get<0>(tup); + if (state == SKIP) + continue; + if (state == LAST) + break; // marker loop because of LOCO + + auto markerinfo = get<1>(tup); + auto gs = get<2>(tup); + + markers.push_back(markerinfo); + + // drop missing idv and plug mean values for missing geno + double x_total = 0.0; // sum genotype values to compute x_mean + uint vpos = 0; // position in target vector + uint n_miss = 0; // count NA genotypes + gsl_vector_set_zero(x_miss); + for (size_t i = 0; i < ni_total; ++i) { + // get the genotypes per individual and compute stats per SNP + if (indicator_idv[i] == 0) // skip individual + continue; + + double geno = gs[i]; + if (isnan(geno)) { + gsl_vector_set(x_miss, vpos, 1.0); + n_miss++; + } else { + gsl_vector_set(x, vpos, geno); + x_total += geno; + } + vpos++; + } + enforce(vpos == ni_test); + + const double x_mean = x_total/(double)(ni_test - n_miss); + + // plug x_mean back into missing values + for (size_t i = 0; i < ni_test; ++i) { + if (gsl_vector_get(x_miss, i) == 1.0) { + gsl_vector_set(x, i, x_mean); + } + } + + enforce(x->size == ni_test); + + // copy genotype values for SNP into Xlarge cache + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize); + gsl_vector_safe_memcpy(&Xlarge_col.vector, x); + c++; // count markers going in + + if (c % msize == 0) { + batch_compute(msize,markers); + markers.clear(); + markers.reserve(msize); + } + } + + if (c % msize) + batch_compute(c % msize,markers); + ProgressBar("Reading markers", num_markers - 1, num_markers - 1); + cout << endl; + cout << "Counted markers " << c << " sumStat " << sumstat2.size() << endl; + checkpoint_nofile("end-lmm-mdb-analyze"); + + string file_str; + debug_msg("LMM::WriteFiles"); + file_str = path_out + "/" + file_out; + file_str += ".assoc.txt"; + checkpoint("lmm-write-files",file_str); + + ofstream outfile(file_str.c_str(), ofstream::out); + if (!outfile) { + cout << "error writing file: " << file_str << endl; + return; + } + + auto sumstats = [&] (SUMSTAT2 st) { + outfile << scientific << setprecision(6); + auto m = st.markerinfo; + auto name = m.name; + auto chr = m.chr; + string chr_s; + if (chr == CHR_X) + chr_s = "X"; + else if (chr == CHR_Y) + chr_s = "Y"; + else if (chr == CHR_M) + chr_s = "M"; + else + chr_s = to_string(chr); + + outfile << chr_s << "\t"; + outfile << name << "\t"; + outfile << m.pos << "\t"; + outfile << m.n_miss << "\t-\t-\t"; // n_miss column + allele1 + allele0 + outfile << fixed << setprecision(3) << m.maf << "\t"; + outfile << scientific << setprecision(6); + if (a_mode != M_LMM2) { + outfile << st.beta << "\t"; + outfile << st.se << "\t"; + } + + if (a_mode != M_LMM3 && a_mode != M_LMM9) + outfile << st.logl_H1 << "\t"; + + switch(a_mode) { + case M_LMM1: + outfile << st.lambda_remle << "\t" + << st.p_wald << endl; + break; + case M_LMM2: + case M_LMM9: + outfile << st.lambda_mle << "\t" + << st.p_lrt << endl; + break; + case M_LMM3: + outfile << st.p_score << endl; + break; + case M_LMM4: + outfile << st.lambda_remle << "\t" + << st.lambda_mle << "\t" + << st.p_wald << "\t" + << st.p_lrt << "\t" + << st.p_score << endl; + break; + } + }; + + for (auto &s : sumstat2) { + sumstats(s); + } + + gsl_vector_safe_free(x); + gsl_vector_safe_free(x_miss); + gsl_vector_safe_free(Utx); + gsl_matrix_safe_free(Uab); + gsl_vector_free(ab); // unused + + gsl_matrix_safe_free(Xlarge); + gsl_matrix_safe_free(UtXlarge); + +} + + +void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, const string loco) { + checkpoint("mdb-calc-gwa",file_geno); + bool is_loco = !loco.empty(); + + // Convert loco string to what we use in the chrpos index + uint8_t loco_chr; + if (is_loco) { + if (loco == "X") { + loco_chr = CHR_X; + } else if (loco == "Y") { + loco_chr = CHR_Y; + } else if (loco == "M") { + loco_chr = CHR_M; + } else { + try { + loco_chr = static_cast<uint8_t>(stoi(loco)); + } catch (...) { + loco_chr = 0; + } + } + } + + auto env = lmdb::env::create(); + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); + env.set_max_dbs(10); + env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + // Get mmap info using lmdb++ wrapper + MDB_envinfo info; + mdb_env_info(env.handle(), &info); + // Linux kernel aggressive readahead hints + madvise(info.me_mapaddr, info.me_mapsize, MADV_SEQUENTIAL); + madvise(info.me_mapaddr, info.me_mapsize, MADV_WILLNEED); + + std::cout << "## LMDB opened with optimized readahead; map size = " << (info.me_mapsize / 1024 / 1024) << " MB" << std::endl; + + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); + + auto marker_mdb = lmdb::dbi::open(rtxn, "marker"); + auto info_mdb = lmdb::dbi::open(rtxn, "info"); + string_view info_key,info_value; + info_mdb.get(rtxn,"format",info_value); + auto format = string(info_value); + + MDB_stat stat; + mdb_stat(rtxn, geno_mdb, &stat); + auto num_markers = stat.ms_entries; + + auto mdb_fetch = MDB_FIRST; + + auto cursor = lmdb::cursor::open(rtxn, geno_mdb); + cout << "## number of total individuals = " << ni_total << endl; + cout << "## number of analyzed individuals = " << ni_test << endl; + cout << "## number of analyzed SNPs/var = " << num_markers << endl; + + std::function<SnpNameValues2(size_t)> fetch_snp = [&](size_t num) { + + string_view key,value; + + auto mdb_success = cursor.get(key, value, mdb_fetch); + mdb_fetch = MDB_NEXT; + + // uint8_t chr; + vector<double> gs; + MarkerInfo markerinfo; + + if (mdb_success) { + size_t size = 0; + // ---- Depending on the format we get different buffers - currently float and byte are supported: + if (format == "Gb") { + size_t num_bytes = value.size() / sizeof(uint8_t); + assert(num_bytes == ni_total); + size = num_bytes; + const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + double g = static_cast<double>(gs_bbuf[i])/127.0; + gs.push_back(g); + } + } else { + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + size = num_floats; + const float* gs_fbuf = reinterpret_cast<const float*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + gs.push_back(static_cast<double>(gs_fbuf[i])); + } + } + // Start unpacking key chr+pos + if (key.size() != 10) { + cerr << "key.size=" << key.size() << endl; + throw std::runtime_error("Invalid key size"); + } + // "S>L>L>" + const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data()); + auto chr = static_cast<uint8_t>(data[1]); + // Extract big-endian uint32 + // uint32_t rest = static_cast<uint32_t>(data[2]); + uint32_t pos = (data[2] << 24) | (data[3] << 16) | + (data[4] << 8) | data[5]; + + uint32_t num = (data[6] << 24) | (data[7] << 16) | + (data[8] << 8) | data[9]; + + // printf("%#02x %#02x\n", chr, loco_chr); + + if (is_loco && loco_chr != chr) { + if (chr > loco_chr) + return make_tuple(LAST, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + else + return make_tuple(SKIP, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + } + + string_view value2; + marker_mdb.get(rtxn,key,value2); + auto marker = string(value2); + // 1 rs13476251 174792257 + // cout << static_cast<int>(chr) << ":" << pos2 << " line " << rest2 << ":" << marker << endl ; + + // compute maf and n_miss (NAs) + size_t n_miss = 0; // count NAs: FIXME + double maf = compute_maf(ni_total, ni_test, n_miss, gs.data(), indicator_idv); + + markerinfo = MarkerInfo { .name=marker,.chr=chr,.pos=pos,.line_no=num,.n_miss=n_miss,.maf=maf }; + + // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl; + } + return make_tuple(COMPUTE, markerinfo, gs); + }; + LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,num_markers); + + ns_total = ns_test = num_markers; // track global number of snps in original gemma (goes to cPar) +} + + +/* Looking at the `LMM::AnalyzeBimbam` function, here are the parameters that get modified: diff --git a/src/lmm.h b/src/lmm.h index 736c679..295602a 100644 --- a/src/lmm.h +++ b/src/lmm.h @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017, Pjotr Prins + Copyright © 2017-2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,7 +30,7 @@ using namespace std; -#define LMM_BATCH_SIZE 20000 // used for batch processing +#define LMM_BATCH_SIZE 5000 // used for batch processing class FUNC_PARAM { @@ -44,7 +44,42 @@ public: size_t e_mode; }; -typedef std::tuple<string,std::vector<double> > SnpNameValues; + +// typedef tuple< string, uint16_t, uint32_t, uint32_t > MarkerInfo; // name, chr, pos, line +struct MarkerInfo { + string name; + uint8_t chr; + size_t pos, line_no, n_miss; + double maf; +} ; + +typedef vector<MarkerInfo> Markers; +typedef tuple< string,vector<double> > SnpNameValues; + +enum MarkerState { + FAIL, + COMPUTE, + SKIP, + LAST +}; + +typedef tuple< MarkerState,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed) +// Results for LMM. +class SUMSTAT2 { +public: + MarkerInfo markerinfo; + double beta; // REML estimator for beta. + double se; // SE for beta. + double lambda_remle; // REML estimator for lambda. + double lambda_mle; // MLE estimator for lambda. + double p_wald; // p value from a Wald test. + double p_lrt; // p value from a likelihood ratio test. + double p_score; // p value from a score test. + double logl_H1; // log likelihood under the alternative + // hypothesis as a measure of goodness of fit, + // see https://github.com/genetics-statistics/GEMMA/issues/81 +}; + class LMM { @@ -100,6 +135,14 @@ public: const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const set<string> gwasnps); + void mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, size_t num_markers); + void mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const string loco); void AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp index a351509..c82872e 100644 --- a/src/mathfunc.cpp +++ b/src/mathfunc.cpp @@ -645,3 +645,36 @@ double UcharToDouble02(const unsigned char c) { return (double)c * 0.01; } unsigned char Double02ToUchar(const double dosage) { return (int)(dosage * 100); } + + +std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs) { + size_t n_0 = 0; + size_t n_1 = 0; + size_t n_2 = 0; + for (size_t i = 0; i < ni_total; ++i) { // read genotypes + double geno = gsl_vector_get(gs, i); + if (geno >= 0 && geno <= 0.5) { + n_0++; + } + if (geno > 0.5 && geno < 1.5) { + n_1++; + } + if (geno >= 1.5 && geno <= 2.0) { + n_2++; + } + } + return {n_0,n_1,n_2}; +} + +double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const double *gs, const vector<int> &indicator) { + double maf = 0.0; + + for (size_t i = 0; i < ni_total; ++i) { // read genotypes + if (indicator[i]) { + double geno = gs[i]; + maf += geno; // 0..2 range + } + } + maf /= 2.0 * (double)(ni_test - n_miss); // Assumption is that geno value is between 0 and 2. FIXME + return maf; +} diff --git a/src/mathfunc.h b/src/mathfunc.h index 641d0a3..f7c9e6e 100644 --- a/src/mathfunc.h +++ b/src/mathfunc.h @@ -24,6 +24,7 @@ // #include "Eigen/Dense" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" +#include <tuple> #define CONDITIONED_MAXRATIO 2e+6 // based on http://mathworld.wolfram.com/ConditionNumber.html #define EIGEN_MINVALUE 1e-10 @@ -77,4 +78,8 @@ unsigned char Double02ToUchar(const double dosage); // void uchar_matrix_get_row(const vector<vector<unsigned char>> &X, // const size_t i_row, Eigen::VectorXd &x_row); +std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs); +double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const double *gs, const vector<int> &indicator); + + #endif diff --git a/src/param.cpp b/src/param.cpp index f96e9c3..96a9cd2 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017, Pjotr Prins + Copyright © 2017-2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,6 +40,7 @@ #include "mathfunc.h" #include "param.h" #include "fastblas.h" +#include "checkpoint.h" using namespace std; @@ -104,7 +105,9 @@ PARAM::PARAM(void) randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200), error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), - time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} + time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) { + is_mdb = false; + } PARAM::~PARAM() { if (gsl_r) @@ -113,6 +116,7 @@ PARAM::~PARAM() { // Read files: obtain ns_total, ng_total, ns_test, ni_test. void PARAM::ReadFiles(void) { + checkpoint_nofile("PARAM::ReadFiles"); string file_str; // Read cat file. @@ -145,22 +149,24 @@ void PARAM::ReadFiles(void) { } } - // Read SNP set. - if (!file_snps.empty()) { - if (ReadFile_snps(file_snps, setSnps) == false) { - error = true; + if (!is_mdb) { + // Read SNP set into setSnps (without filtering) + if (!file_snps.empty()) { + if (ReadFile_snps(file_snps, setSnps) == false) { + error = true; + } + } else { + setSnps.clear(); } - } else { - setSnps.clear(); - } - // Read KSNP set. - if (!file_ksnps.empty()) { - if (ReadFile_snps(file_ksnps, setKSnps) == false) { - error = true; + // Also read KSNP set. Reads all Snps later to be used by GRM. + if (!file_ksnps.empty()) { + if (ReadFile_snps(file_ksnps, setKSnps) == false) { + error = true; + } + } else { + setKSnps.clear(); } - } else { - setKSnps.clear(); } // For prediction. @@ -180,13 +186,15 @@ void PARAM::ReadFiles(void) { } } + if (!file_geno.empty()) { - if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == - false) { + // --- Read (multi-)column phenotypes into pheno and set the NA indicator + if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } - if (CountFileLines(file_geno, ns_total) == false) { + // --- compute ns_total by readin geno file + if (!is_mdb && is_check_mode() && CountFileLines(file_geno, ns_total) == false) { error = true; } } @@ -215,8 +223,6 @@ void PARAM::ReadFiles(void) { indicator_idv.push_back(k); } - ns_test = 0; - return; } @@ -273,7 +279,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -290,7 +296,7 @@ void PARAM::ReadFiles(void) { } // Read genotype and phenotype file for BIMBAM format. - if (!file_geno.empty()) { + if (!is_mdb && !file_geno.empty()) { // Annotation file before genotype file. if (!file_anno.empty()) { @@ -299,14 +305,14 @@ void PARAM::ReadFiles(void) { } } - // Phenotype file before genotype file. + // Phenotype file before genotype file. Already done this(?!) if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -314,9 +320,10 @@ void PARAM::ReadFiles(void) { trim_individuals(indicator_idv, ni_max); trim_individuals(indicator_cvt, ni_max); - if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, - maf_level, miss_level, hwe_level, r2_level, mapRS2chr, - mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { + // The following reads the geno file to get the SNPs - only for BIMBAM + if (ReadFile_bimbam_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, + maf_level, miss_level, hwe_level, r2_level, mapRS2chr, + mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { error = true; return; } @@ -324,6 +331,17 @@ void PARAM::ReadFiles(void) { ns_total = indicator_snp.size(); } + // lmdb-type genotype file: + if (is_mdb && !file_geno.empty()) { + if (!file_kin.empty()) { // GWA + // Phenotype file before genotype file. Already done this(?!) + if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { + error = true; + } + ProcessInclusionIndicators(); + ns_total = indicator_snp.size(); + } + } // Read genotype file for multiple PLINK files. if (!file_mbfile.empty()) { @@ -360,7 +378,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. W3 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -387,8 +405,8 @@ void PARAM::ReadFiles(void) { infile.clear(); } - // Read genotype and phenotype file for multiple BIMBAM files. - if (!file_mgeno.empty()) { + // Read genotype and phenotype file for multiple BIMBAM files. Note this goes untested. + if (!is_mdb && !file_mgeno.empty()) { // Annotation file before genotype file. if (!file_anno.empty()) { @@ -404,7 +422,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain ni_test, // save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -420,7 +438,7 @@ void PARAM::ReadFiles(void) { string file_name; size_t ns_test_tmp; while (!safeGetline(infile, file_name).eof()) { - if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, + if (ReadFile_bimbam_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) { error = true; @@ -457,7 +475,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt); @@ -480,7 +498,8 @@ void PARAM::ReadFiles(void) { ni_test += indicator_idv[i]; } - enforce_msg(ni_test,"number of analyzed individuals equals 0."); + if (!is_mdb) + enforce_msg(ni_test,"number of analyzed individuals equals 0."); } // For ridge prediction, read phenotype only. @@ -491,7 +510,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); } // Compute setKSnps when -loco is passed in @@ -920,13 +939,18 @@ void PARAM::CheckParam(void) { enforce_fexists(file_gwasnps, "open file"); enforce_fexists(file_anno, "open file"); + if (file_geno.find(".mdb") != string::npos) { + is_mdb = true; + } + if (!loco.empty()) { enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 9 || a_mode == 21 || a_mode == 22, "LOCO only works with LMM and K"); // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)"); - enforce_msg(!file_anno.empty(), - "LOCO requires annotation file (-a switch)"); + if (!is_mdb) + enforce_msg(!file_anno.empty(), "Without mdb LOCO requires annotation file (-a switch)"); + enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch"); enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch"); } @@ -1078,13 +1102,15 @@ void PARAM::CheckData(void) { } } - enforce_msg(ni_test,"number of analyzed individuals equals 0."); + if (!is_mdb) { + enforce_msg(ni_test,"number of analyzed individuals equals 0."); - if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() && - file_study.empty() && file_beta.empty() && file_bf.empty()) { - error = true; - cout << "error! number of analyzed individuals equals 0. " << endl; - return; + if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() && + file_study.empty() && file_beta.empty() && file_bf.empty()) { + error = true; + cout << "error! number of analyzed individuals equals 0. " << endl; + return; + } } if (a_mode == 43) { @@ -1103,7 +1129,7 @@ void PARAM::CheckData(void) { } // Output some information. - if (file_cor.empty() && file_mstudy.empty() && file_study.empty() && + if (!is_mdb && file_cor.empty() && file_mstudy.empty() && file_study.empty() && a_mode != 15 && a_mode != 27 && a_mode != 28) { cout << "## number of total individuals = " << ni_total << endl; if (a_mode == 43) { @@ -1251,14 +1277,14 @@ void PARAM::CheckData(void) { void PARAM::PrintSummary() { if (n_ph == 1) { - cout << "pve estimate =" << pve_null << endl; - cout << "se(pve) =" << pve_se_null << endl; + cout << "## pve estimate = " << pve_null << endl; + cout << "## se(pve) = " << pve_se_null << endl; } else { } return; } -void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { +void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { string file_str; if (!file_bfile.empty()) { @@ -1268,8 +1294,7 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { error = true; } } else { - if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, - calc_K) == false) { + if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, calc_K) == false) { error = true; } } @@ -1277,44 +1302,25 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { return; } -void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K, - const bool calc_K) { - string file_str; - - if (!file_bfile.empty()) { - file_str = file_bfile + ".bed"; - if (ReadFile_bed(file_str, indicator_idv, indicator_snp, Xt, K, calc_K, - ni_test, ns_test) == false) { - error = true; - } - } else { - if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, - ni_test, ns_test) == false) { - error = true; - } - } - - return; +gsl_matrix *PARAM::MdbCalcKin() { + return mdb_calc_kin(file_geno, a_mode - 20, loco); } void PARAM::CalcKin(gsl_matrix *matrix_kin) { + checkpoint_nofile("PARAM::CalcKin"); string file_str; - gsl_matrix_set_zero(matrix_kin); if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; - // enforce_msg(loco.empty(), "FIXME: LOCO nyi"); if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == false) { error = true; } } else { - file_str = file_geno; - if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace, - matrix_kin, ni_max == 0) == false) { - error = true; - } + // indicator_snp is an array of bool + error = !BimbamKin(file_geno, setKSnps, indicator_snp, a_mode - 20, d_pace, + matrix_kin, ni_max == 0); } return; @@ -1990,7 +1996,7 @@ void PARAM::CheckCvt() { } // Post-process phenotypes and covariates. -void PARAM::ProcessCvtPhen() { +void PARAM::ProcessInclusionIndicators() { // Convert indicator_pheno to indicator_idv. int k = 1; @@ -2028,11 +2034,8 @@ void PARAM::ProcessCvtPhen() { // Obtain ni_test. ni_test = 0; - for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) { - if (indicator_idv[i] == 0) { - continue; - } - ni_test++; + for(auto &i : indicator_idv) { + ni_test += indicator_idv[i]; } // If subsample number is set, perform a random sub-sampling @@ -2071,13 +2074,14 @@ void PARAM::ProcessCvtPhen() { } // Check ni_test. - if (a_mode != M_BSLMM5) - enforce_msg(ni_test,"number of analyzed individuals equals 0."); - if (ni_test == 0 && a_mode != M_BSLMM5) { - error = true; - cout << "error! number of analyzed individuals equals 0. " << endl; + if (!is_mdb) { + if (a_mode != M_BSLMM5) + enforce_msg(ni_test,"number of analyzed individuals equals 0."); + if (ni_test == 0 && a_mode != M_BSLMM5) { + error = true; + cout << "error! number of analyzed individuals equals 0. " << endl; + } } - // Check covariates to see if they are correlated with each // other, and to see if the intercept term is included. // After getting ni_test. diff --git a/src/param.h b/src/param.h index d3ce686..046f543 100644 --- a/src/param.h +++ b/src/param.h @@ -160,6 +160,8 @@ public: string file_ksnps; // File SNPs for computing K string file_gwasnps; // File SNPs for computing GWAS + bool is_mdb; + // QC-related parameters. double miss_level; double maf_level; @@ -336,17 +338,16 @@ public: void CheckParam(); void CheckData(); void PrintSummary(); - void ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); - void ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K, - const bool calc_K); + void ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); void CheckCvt(); void CopyCvt(gsl_matrix *W); void CopyA(size_t flag, gsl_matrix *A); void CopyGxe(gsl_vector *gxe); void CopyWeight(gsl_vector *w); - void ProcessCvtPhen(); + void ProcessInclusionIndicators(); void CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag); void CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag); + gsl_matrix *MdbCalcKin(); void CalcKin(gsl_matrix *matrix_kin); void CalcS(const map<string, double> &mapRS2wA, const map<string, double> &mapRS2wK, const gsl_matrix *W, @@ -368,6 +369,9 @@ public: const map<string, double> &mapRS2z, gsl_vector *w, gsl_vector *z, vector<size_t> &vec_cat); void UpdateSNP(const map<string, double> &mapRS2wA); + + inline bool is_loco() { return !loco.empty(); } + }; size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt); diff --git a/src/vc.cpp b/src/vc.cpp index 22aaea9..970c556 100644 --- a/src/vc.cpp +++ b/src/vc.cpp @@ -1021,7 +1021,7 @@ void ReadFile_cor(const string &file_cor, const vector<string> &vec_rs, string rs1, rs2; double d1, d2, d3, cor, var1, var2; - size_t n_nb, nsamp1, nsamp2, n12, bin_size = 10, bin; + size_t n_nb, nsamp1, nsamp2, n12, bin_size = 10, bin = 0; vector<vector<double>> mat_S, mat_Svar, mat_tmp; vector<double> vec_qvar, vec_tmp; |
