diff options
Diffstat (limited to 'src/param.cpp')
| -rw-r--r-- | src/param.cpp | 176 |
1 files changed, 90 insertions, 86 deletions
diff --git a/src/param.cpp b/src/param.cpp index f96e9c3..96a9cd2 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017, Pjotr Prins + Copyright © 2017-2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,6 +40,7 @@ #include "mathfunc.h" #include "param.h" #include "fastblas.h" +#include "checkpoint.h" using namespace std; @@ -104,7 +105,9 @@ PARAM::PARAM(void) randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200), error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), - time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} + time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) { + is_mdb = false; + } PARAM::~PARAM() { if (gsl_r) @@ -113,6 +116,7 @@ PARAM::~PARAM() { // Read files: obtain ns_total, ng_total, ns_test, ni_test. void PARAM::ReadFiles(void) { + checkpoint_nofile("PARAM::ReadFiles"); string file_str; // Read cat file. @@ -145,22 +149,24 @@ void PARAM::ReadFiles(void) { } } - // Read SNP set. - if (!file_snps.empty()) { - if (ReadFile_snps(file_snps, setSnps) == false) { - error = true; + if (!is_mdb) { + // Read SNP set into setSnps (without filtering) + if (!file_snps.empty()) { + if (ReadFile_snps(file_snps, setSnps) == false) { + error = true; + } + } else { + setSnps.clear(); } - } else { - setSnps.clear(); - } - // Read KSNP set. - if (!file_ksnps.empty()) { - if (ReadFile_snps(file_ksnps, setKSnps) == false) { - error = true; + // Also read KSNP set. Reads all Snps later to be used by GRM. + if (!file_ksnps.empty()) { + if (ReadFile_snps(file_ksnps, setKSnps) == false) { + error = true; + } + } else { + setKSnps.clear(); } - } else { - setKSnps.clear(); } // For prediction. @@ -180,13 +186,15 @@ void PARAM::ReadFiles(void) { } } + if (!file_geno.empty()) { - if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == - false) { + // --- Read (multi-)column phenotypes into pheno and set the NA indicator + if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } - if (CountFileLines(file_geno, ns_total) == false) { + // --- compute ns_total by readin geno file + if (!is_mdb && is_check_mode() && CountFileLines(file_geno, ns_total) == false) { error = true; } } @@ -215,8 +223,6 @@ void PARAM::ReadFiles(void) { indicator_idv.push_back(k); } - ns_test = 0; - return; } @@ -273,7 +279,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -290,7 +296,7 @@ void PARAM::ReadFiles(void) { } // Read genotype and phenotype file for BIMBAM format. - if (!file_geno.empty()) { + if (!is_mdb && !file_geno.empty()) { // Annotation file before genotype file. if (!file_anno.empty()) { @@ -299,14 +305,14 @@ void PARAM::ReadFiles(void) { } } - // Phenotype file before genotype file. + // Phenotype file before genotype file. Already done this(?!) if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -314,9 +320,10 @@ void PARAM::ReadFiles(void) { trim_individuals(indicator_idv, ni_max); trim_individuals(indicator_cvt, ni_max); - if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, - maf_level, miss_level, hwe_level, r2_level, mapRS2chr, - mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { + // The following reads the geno file to get the SNPs - only for BIMBAM + if (ReadFile_bimbam_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, + maf_level, miss_level, hwe_level, r2_level, mapRS2chr, + mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { error = true; return; } @@ -324,6 +331,17 @@ void PARAM::ReadFiles(void) { ns_total = indicator_snp.size(); } + // lmdb-type genotype file: + if (is_mdb && !file_geno.empty()) { + if (!file_kin.empty()) { // GWA + // Phenotype file before genotype file. Already done this(?!) + if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { + error = true; + } + ProcessInclusionIndicators(); + ns_total = indicator_snp.size(); + } + } // Read genotype file for multiple PLINK files. if (!file_mbfile.empty()) { @@ -360,7 +378,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. W3 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -387,8 +405,8 @@ void PARAM::ReadFiles(void) { infile.clear(); } - // Read genotype and phenotype file for multiple BIMBAM files. - if (!file_mgeno.empty()) { + // Read genotype and phenotype file for multiple BIMBAM files. Note this goes untested. + if (!is_mdb && !file_mgeno.empty()) { // Annotation file before genotype file. if (!file_anno.empty()) { @@ -404,7 +422,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain ni_test, // save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -420,7 +438,7 @@ void PARAM::ReadFiles(void) { string file_name; size_t ns_test_tmp; while (!safeGetline(infile, file_name).eof()) { - if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, + if (ReadFile_bimbam_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) { error = true; @@ -457,7 +475,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt); @@ -480,7 +498,8 @@ void PARAM::ReadFiles(void) { ni_test += indicator_idv[i]; } - enforce_msg(ni_test,"number of analyzed individuals equals 0."); + if (!is_mdb) + enforce_msg(ni_test,"number of analyzed individuals equals 0."); } // For ridge prediction, read phenotype only. @@ -491,7 +510,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); } // Compute setKSnps when -loco is passed in @@ -920,13 +939,18 @@ void PARAM::CheckParam(void) { enforce_fexists(file_gwasnps, "open file"); enforce_fexists(file_anno, "open file"); + if (file_geno.find(".mdb") != string::npos) { + is_mdb = true; + } + if (!loco.empty()) { enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 9 || a_mode == 21 || a_mode == 22, "LOCO only works with LMM and K"); // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)"); - enforce_msg(!file_anno.empty(), - "LOCO requires annotation file (-a switch)"); + if (!is_mdb) + enforce_msg(!file_anno.empty(), "Without mdb LOCO requires annotation file (-a switch)"); + enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch"); enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch"); } @@ -1078,13 +1102,15 @@ void PARAM::CheckData(void) { } } - enforce_msg(ni_test,"number of analyzed individuals equals 0."); + if (!is_mdb) { + enforce_msg(ni_test,"number of analyzed individuals equals 0."); - if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() && - file_study.empty() && file_beta.empty() && file_bf.empty()) { - error = true; - cout << "error! number of analyzed individuals equals 0. " << endl; - return; + if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() && + file_study.empty() && file_beta.empty() && file_bf.empty()) { + error = true; + cout << "error! number of analyzed individuals equals 0. " << endl; + return; + } } if (a_mode == 43) { @@ -1103,7 +1129,7 @@ void PARAM::CheckData(void) { } // Output some information. - if (file_cor.empty() && file_mstudy.empty() && file_study.empty() && + if (!is_mdb && file_cor.empty() && file_mstudy.empty() && file_study.empty() && a_mode != 15 && a_mode != 27 && a_mode != 28) { cout << "## number of total individuals = " << ni_total << endl; if (a_mode == 43) { @@ -1251,14 +1277,14 @@ void PARAM::CheckData(void) { void PARAM::PrintSummary() { if (n_ph == 1) { - cout << "pve estimate =" << pve_null << endl; - cout << "se(pve) =" << pve_se_null << endl; + cout << "## pve estimate = " << pve_null << endl; + cout << "## se(pve) = " << pve_se_null << endl; } else { } return; } -void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { +void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { string file_str; if (!file_bfile.empty()) { @@ -1268,8 +1294,7 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { error = true; } } else { - if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, - calc_K) == false) { + if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, calc_K) == false) { error = true; } } @@ -1277,44 +1302,25 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { return; } -void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K, - const bool calc_K) { - string file_str; - - if (!file_bfile.empty()) { - file_str = file_bfile + ".bed"; - if (ReadFile_bed(file_str, indicator_idv, indicator_snp, Xt, K, calc_K, - ni_test, ns_test) == false) { - error = true; - } - } else { - if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, - ni_test, ns_test) == false) { - error = true; - } - } - - return; +gsl_matrix *PARAM::MdbCalcKin() { + return mdb_calc_kin(file_geno, a_mode - 20, loco); } void PARAM::CalcKin(gsl_matrix *matrix_kin) { + checkpoint_nofile("PARAM::CalcKin"); string file_str; - gsl_matrix_set_zero(matrix_kin); if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; - // enforce_msg(loco.empty(), "FIXME: LOCO nyi"); if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == false) { error = true; } } else { - file_str = file_geno; - if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace, - matrix_kin, ni_max == 0) == false) { - error = true; - } + // indicator_snp is an array of bool + error = !BimbamKin(file_geno, setKSnps, indicator_snp, a_mode - 20, d_pace, + matrix_kin, ni_max == 0); } return; @@ -1990,7 +1996,7 @@ void PARAM::CheckCvt() { } // Post-process phenotypes and covariates. -void PARAM::ProcessCvtPhen() { +void PARAM::ProcessInclusionIndicators() { // Convert indicator_pheno to indicator_idv. int k = 1; @@ -2028,11 +2034,8 @@ void PARAM::ProcessCvtPhen() { // Obtain ni_test. ni_test = 0; - for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) { - if (indicator_idv[i] == 0) { - continue; - } - ni_test++; + for(auto &i : indicator_idv) { + ni_test += indicator_idv[i]; } // If subsample number is set, perform a random sub-sampling @@ -2071,13 +2074,14 @@ void PARAM::ProcessCvtPhen() { } // Check ni_test. - if (a_mode != M_BSLMM5) - enforce_msg(ni_test,"number of analyzed individuals equals 0."); - if (ni_test == 0 && a_mode != M_BSLMM5) { - error = true; - cout << "error! number of analyzed individuals equals 0. " << endl; + if (!is_mdb) { + if (a_mode != M_BSLMM5) + enforce_msg(ni_test,"number of analyzed individuals equals 0."); + if (ni_test == 0 && a_mode != M_BSLMM5) { + error = true; + cout << "error! number of analyzed individuals equals 0. " << endl; + } } - // Check covariates to see if they are correlated with each // other, and to see if the intercept term is included. // After getting ni_test. |
