/* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include "gsl/gsl_blas.h" #include "gsl/gsl_linalg.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_randist.h" #include "gsl/gsl_vector.h" #include "eigenlib.h" #include "gemma.h" #include "gemma_io.h" #include "mathfunc.h" #include "param.h" #include "fastblas.h" using namespace std; // ---- Helper functions which do not use the PARAM scope // Calculate the SNP sets as used in LOCO from mapchr which was filled // from the annotation file. Returns ksnps (used for K) and gwasnps (used for // GWA). Both are complimentary with each other and subsets of setSnps. void LOCO_set_Snps(set &ksnps, set &gwasnps, const map mapchr, const string loco) { enforce_msg(ksnps.size() == 0, "make sure knsps is not initialized twice"); enforce_msg(gwasnps.size() == 0, "make sure gwasnps is not initialized twice"); for (auto &kv : mapchr) { auto snp = kv.first; auto chr = kv.second; if (chr != loco) { ksnps.insert(snp); } else { gwasnps.insert(snp); } } } // Trim #individuals to size which is used to write tests that run faster // // Note it actually trims the number of functional individuals // (indicator_idv[x] == 1). This should match indicator_cvt etc. If // this gives problems with certain sets we can simply trim to size. void trim_individuals(vector &idvs, size_t ni_max) { if (ni_max) { size_t count = 0; for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) { if (*ind) count++; if (count >= ni_max) break; } if (count != idvs.size()) { if (is_debug_mode()) cout << "**** TEST MODE: trim individuals from " << idvs.size() << " to " << count << endl; idvs.resize(count); } } } // ---- PARAM class implementation PARAM::PARAM(void) : a_mode(0), k_mode(1), d_pace(DEFAULT_PACE), file_out("result"), path_out("./output/"), miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5), n_region(10), p_nr(0.001), em_prec(0.0001), nr_prec(0.0001), em_iter(10000), nr_iter(100), crt(0), pheno_mean(0), noconstrain(false), h_min(-1), h_max(-1), h_scale(-1), rho_min(0.0), rho_max(1.0), rho_scale(-1), logp_min(0.0), logp_max(0.0), logp_scale(-1), h_ngrid(10), rho_ngrid(10), s_min(0), s_max(300), w_step(100000), s_step(1000000), r_pace(10), w_pace(1000), n_accept(0), n_mh(10), geo_mean(2000.0), randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200), error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} PARAM::~PARAM() { if (gsl_r) gsl_rng_free(gsl_r); } // Read files: obtain ns_total, ng_total, ns_test, ni_test. void PARAM::ReadFiles(void) { string file_str; // Read cat file. if (!file_mcat.empty()) { if (ReadFile_mcat(file_mcat, mapRS2cat, n_vc) == false) { error = true; } } else if (!file_cat.empty()) { if (ReadFile_cat(file_cat, mapRS2cat, n_vc) == false) { error = true; } } // Read snp weight files. if (!file_wcat.empty()) { if (ReadFile_wsnp(file_wcat, n_vc, mapRS2wcat) == false) { error = true; } } if (!file_wsnp.empty()) { if (ReadFile_wsnp(file_wsnp, mapRS2wsnp) == false) { error = true; } } // Count number of kinship files. if (!file_mk.empty()) { if (CountFileLines(file_mk, n_vc) == false) { error = true; } } // Read SNP set. if (!file_snps.empty()) { if (ReadFile_snps(file_snps, setSnps) == false) { error = true; } } else { setSnps.clear(); } // Read KSNP set. if (!file_ksnps.empty()) { if (ReadFile_snps(file_ksnps, setKSnps) == false) { error = true; } } else { setKSnps.clear(); } // For prediction. if (!file_epm.empty()) { if (ReadFile_est(file_epm, est_column, mapRS2est) == false) { error = true; } if (!file_bfile.empty()) { file_str = file_bfile + ".bim"; if (ReadFile_bim(file_str, snpInfo) == false) { error = true; } file_str = file_bfile + ".fam"; if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num, p_column) == false) { error = true; } } if (!file_geno.empty()) { if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } if (CountFileLines(file_geno, ns_total) == false) { error = true; } } if (!file_ebv.empty()) { if (ReadFile_column(file_ebv, indicator_bv, vec_bv, 1) == false) { error = true; } } if (!file_log.empty()) { if (ReadFile_log(file_log, pheno_mean) == false) { error = true; } } // Convert indicator_pheno to indicator_idv. int k = 1; for (size_t i = 0; i < indicator_pheno.size(); i++) { k = 1; for (size_t j = 0; j < indicator_pheno[i].size(); j++) { if (indicator_pheno[i][j] == 0) { k = 0; } } indicator_idv.push_back(k); } ns_test = 0; return; } // Read covariates before the genotype files. if (!file_cvt.empty()) { if (ReadFile_cvt(file_cvt, indicator_cvt, cvt, n_cvt) == false) { error = true; } if ((indicator_cvt).size() == 0) { n_cvt = 1; } } else { n_cvt = 1; } trim_individuals(indicator_cvt, ni_max); if (!file_gxe.empty()) { if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) { error = true; } } if (!file_weight.empty()) { if (ReadFile_column(file_weight, indicator_weight, weight, 1) == false) { error = true; } } trim_individuals(indicator_idv, ni_max); // Read genotype and phenotype file for PLINK format. if (!file_bfile.empty()) { file_str = file_bfile + ".bim"; snpInfo.clear(); if (ReadFile_bim(file_str, snpInfo) == false) { error = true; } // If both fam file and pheno files are used, use // phenotypes inside the pheno file. if (!file_pheno.empty()) { // Phenotype file before genotype file. if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } } else { file_str = file_bfile + ".fam"; if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num, p_column) == false) { error = true; } } // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. ProcessCvtPhen(); // Obtain covariate matrix. auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt); CopyCvt(W1); file_str = file_bfile + ".bed"; if (ReadFile_bed(file_str, setSnps, W1, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test) == false) { error = true; } gsl_matrix_free(W1); ns_total = indicator_snp.size(); } // Read genotype and phenotype file for BIMBAM format. if (!file_geno.empty()) { // Annotation file before genotype file. if (!file_anno.empty()) { if (ReadFile_anno(file_anno, mapRS2chr, mapRS2bp, mapRS2cM) == false) { error = true; } } // Phenotype file before genotype file. if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. ProcessCvtPhen(); // Obtain covariate matrix. auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt); CopyCvt(W2); trim_individuals(indicator_idv, ni_max); trim_individuals(indicator_cvt, ni_max); if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { error = true; return; } gsl_matrix_free(W2); ns_total = indicator_snp.size(); } // Read genotype file for multiple PLINK files. if (!file_mbfile.empty()) { igzstream infile(file_mbfile.c_str(), igzstream::in); enforce_msg(infile,"fail to open mbfile file"); string file_name; size_t t = 0, ns_test_tmp = 0; gsl_matrix *W3 = NULL; while (!safeGetline(infile, file_name).eof()) { file_str = file_name + ".bim"; if (ReadFile_bim(file_str, snpInfo) == false) { error = true; } if (t == 0) { // If both fam file and pheno files are used, use // phenotypes inside the pheno file. if (!file_pheno.empty()) { // Phenotype file before genotype file. if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } } else { file_str = file_name + ".fam"; if (ReadFile_fam(file_str, indicator_pheno, pheno, mapID2num, p_column) == false) { error = true; } } // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. ProcessCvtPhen(); // Obtain covariate matrix. W3 = gsl_matrix_safe_alloc(ni_test, n_cvt); CopyCvt(W3); } file_str = file_name + ".bed"; if (ReadFile_bed(file_str, setSnps, W3, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test_tmp) == false) { error = true; } mindicator_snp.push_back(indicator_snp); msnpInfo.push_back(snpInfo); ns_test += ns_test_tmp; ns_total += indicator_snp.size(); t++; } if (W3) gsl_matrix_free(W3); infile.close(); infile.clear(); } // Read genotype and phenotype file for multiple BIMBAM files. if (!file_mgeno.empty()) { // Annotation file before genotype file. if (!file_anno.empty()) { if (ReadFile_anno(file_anno, mapRS2chr, mapRS2bp, mapRS2cM) == false) { error = true; } } // Phenotype file before genotype file. if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } // Post-process covariates and phenotypes, obtain ni_test, // save all useful covariates. ProcessCvtPhen(); // Obtain covariate matrix. gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt); CopyCvt(W4); igzstream infile(file_mgeno.c_str(), igzstream::in); if (!infile) { cout << "error! fail to open mgeno file: " << file_mgeno << endl; error = true; return; } string file_name; size_t ns_test_tmp; while (!safeGetline(infile, file_name).eof()) { if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) { error = true; } mindicator_snp.push_back(indicator_snp); msnpInfo.push_back(snpInfo); ns_test += ns_test_tmp; ns_total += indicator_snp.size(); } gsl_matrix_free(W4); infile.close(); infile.clear(); } if (!file_gene.empty()) { if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } // Convert indicator_pheno to indicator_idv. int k = 1; for (size_t i = 0; i < indicator_pheno.size(); i++) { k = 1; for (size_t j = 0; j < indicator_pheno[i].size(); j++) { if (indicator_pheno[i][j] == 0) { k = 0; } } indicator_idv.push_back(k); } // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. ProcessCvtPhen(); // Obtain covariate matrix. // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt); // CopyCvt(W5); if (ReadFile_gene(file_gene, vec_read, snpInfo, ng_total) == false) { error = true; } } // Read is after gene file. if (!file_read.empty()) { if (ReadFile_column(file_read, indicator_read, vec_read, 1) == false) { error = true; } ni_test = 0; for (vector::size_type i = 0; i < (indicator_idv).size(); ++i) { indicator_idv[i] *= indicator_read[i]; ni_test += indicator_idv[i]; } enforce_msg(ni_test,"number of analyzed individuals equals 0."); } // For ridge prediction, read phenotype only. if (file_geno.empty() && file_gene.empty() && !file_pheno.empty()) { if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. ProcessCvtPhen(); } // Compute setKSnps when -loco is passed in if (!loco.empty()) { LOCO_set_Snps(setKSnps, setGWASnps, mapRS2chr, loco); } return; } void PARAM::CheckParam(void) { struct stat fileInfo; string str; // Check parameters. if (k_mode != 1 && k_mode != 2) { cout << "error! unknown kinship/relatedness input mode: " << k_mode << endl; error = true; } if (a_mode != 1 && a_mode != 2 && a_mode != 3 && a_mode != 4 && a_mode != 5 && a_mode != M_LMM9 && a_mode != 11 && a_mode != 12 && a_mode != 13 && a_mode != 14 && a_mode != 15 && a_mode != 21 && a_mode != 22 && a_mode != 25 && a_mode != 26 && a_mode != 27 && a_mode != 28 && a_mode != 31 && a_mode != 41 && a_mode != 42 && a_mode != 43 && a_mode != 51 && a_mode != 52 && a_mode != 53 && a_mode != 54 && a_mode != 61 && a_mode != 62 && a_mode != 63 && a_mode != 66 && a_mode != 67 && a_mode != 71) { cout << "error! unknown analysis mode: " << a_mode << ". make sure -gk or -eigen or -lmm or -bslmm -predict or " << "-calccov is specified correctly." << endl; error = true; } if (miss_level > 1) { cout << "error! missing level needs to be between 0 and 1. " << "current value = " << miss_level << endl; error = true; } if (maf_level > 0.5) { cout << "error! maf level needs to be between 0 and 0.5. " << "current value = " << maf_level << endl; error = true; } if (hwe_level > 1) { cout << "error! hwe level needs to be between 0 and 1. " << "current value = " << hwe_level << endl; error = true; } if (r2_level > 1) { cout << "error! r2 level needs to be between 0 and 1. " << "current value = " << r2_level << endl; error = true; } if (l_max < l_min) { cout << "error! maximum lambda value must be larger than the " << "minimal value. current values = " << l_max << " and " << l_min << endl; error = true; } if (h_max < h_min) { cout << "error! maximum h value must be larger than the minimal " << "value. current values = " << h_max << " and " << h_min << endl; error = true; } if (s_max < s_min) { cout << "error! maximum s value must be larger than the minimal " << "value. current values = " << s_max << " and " << s_min << endl; error = true; } if (rho_max < rho_min) { cout << "error! maximum rho value must be larger than the" << "minimal value. current values = " << rho_max << " and " << rho_min << endl; error = true; } if (logp_max < logp_min) { cout << "error! maximum logp value must be larger than the " << "minimal value. current values = " << logp_max / log(10) << " and " << logp_min / log(10) << endl; error = true; } if (h_max > 1) { cout << "error! h values must be bewtween 0 and 1. current " << "values = " << h_max << " and " << h_min << endl; error = true; } if (rho_max > 1) { cout << "error! rho values must be between 0 and 1. current " << "values = " << rho_max << " and " << rho_min << endl; error = true; } if (logp_max > 0) { cout << "error! maximum logp value must be smaller than 0. " << "current values = " << logp_max / log(10) << " and " << logp_min / log(10) << endl; error = true; } if (l_max < l_min) { cout << "error! maximum lambda value must be larger than the " << "minimal value. current values = " << l_max << " and " << l_min << endl; error = true; } if (h_scale > 1.0) { cout << "error! hscale value must be between 0 and 1. " << "current value = " << h_scale << endl; error = true; } if (rho_scale > 1.0) { cout << "error! rscale value must be between 0 and 1. " << "current value = " << rho_scale << endl; error = true; } if (logp_scale > 1.0) { cout << "error! pscale value must be between 0 and 1. " << "current value = " << logp_scale << endl; error = true; } if (rho_max == 1 && rho_min == 1 && a_mode == 12) { cout << "error! ridge regression does not support a rho " << "parameter. current values = " << rho_max << " and " << rho_min << endl; error = true; } if (window_cm < 0) { cout << "error! windowcm values must be non-negative. " << "current values = " << window_cm << endl; error = true; } if (window_cm == 0 && window_bp == 0 && window_ns == 0) { window_bp = 1000000; } // Check p_column, and (no need to) sort p_column into // ascending order. if (p_column.size() == 0) { p_column.push_back(1); } else { for (size_t i = 0; i < p_column.size(); i++) { for (size_t j = 0; j < i; j++) { if (p_column[i] == p_column[j]) { cout << "error! identical phenotype " << "columns: " << p_column[i] << endl; error = true; } } } } n_ph = p_column.size(); // Only LMM option (and one prediction option) can deal with // multiple phenotypes and no gene expression files. if (n_ph > 1 && a_mode != 1 && a_mode != 2 && a_mode != 3 && a_mode != 4 && a_mode != 9 && a_mode != 43) { cout << "error! the current analysis mode " << a_mode << " can not deal with multiple phenotypes." << endl; error = true; } if (n_ph > 1 && !file_gene.empty()) { cout << "error! multiple phenotype analysis option not " << "allowed with gene expression files. " << endl; error = true; } if (p_nr > 1) { cout << "error! pnr value must be between 0 and 1. current value = " << p_nr << endl; error = true; } // check est_column if (est_column.size() == 0) { if (file_ebv.empty()) { est_column.push_back(2); est_column.push_back(5); est_column.push_back(6); est_column.push_back(7); } else { est_column.push_back(2); est_column.push_back(0); est_column.push_back(6); est_column.push_back(7); } } if (est_column.size() != 4) { cout << "error! -en not followed by four numbers. current number = " << est_column.size() << endl; error = true; } if (est_column[0] == 0) { cout << "error! -en rs column can not be zero. current number = " << est_column.size() << endl; error = true; } // Check if files are compatible with each other, and if files exist. if (!file_bfile.empty()) { str = file_bfile + ".bim"; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open .bim file: " << str << endl; error = true; } str = file_bfile + ".bed"; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open .bed file: " << str << endl; error = true; } str = file_bfile + ".fam"; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open .fam file: " << str << endl; error = true; } } if ((!file_geno.empty() || !file_gene.empty())) { str = file_pheno; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open phenotype file: " << str << endl; error = true; } } str = file_geno; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open mean genotype file: " << str << endl; error = true; } str = file_gene; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open gene expression file: " << str << endl; error = true; } str = file_cat; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open category file: " << str << endl; error = true; } str = file_mcat; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open mcategory file: " << str << endl; error = true; } str = file_beta; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open beta file: " << str << endl; error = true; } str = file_cor; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open correlation file: " << str << endl; error = true; } if (!file_study.empty()) { str = file_study + ".Vq.txt"; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open .Vq.txt file: " << str << endl; error = true; } str = file_study + ".q.txt"; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open .q.txt file: " << str << endl; error = true; } str = file_study + ".size.txt"; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open .size.txt file: " << str << endl; error = true; } } if (!file_ref.empty()) { str = file_ref + ".S.txt"; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open .S.txt file: " << str << endl; error = true; } str = file_ref + ".size.txt"; if (stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open .size.txt file: " << str << endl; error = true; } } str = file_mstudy; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open mstudy file: " << str << endl; error = true; } str = file_mref; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open mref file: " << str << endl; error = true; } str = file_mgeno; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open mgeno file: " << str << endl; error = true; } str = file_mbfile; if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { cout << "error! fail to open mbfile file: " << str << endl; error = true; } size_t flag = 0; if (!file_bfile.empty()) { flag++; } if (!file_geno.empty()) { flag++; } if (!file_gene.empty()) { flag++; } // Always set up random environment. gsl_rng_env_setup(); // sets gsl_rng_default_seed const gsl_rng_type *T = gsl_rng_default; // pick up environment GSL_RNG_SEED if (randseed >= 0) gsl_rng_default_seed = randseed; // CLI option used else if (gsl_rng_default_seed == 0) { // by default we will randomize the seed time_t rawtime; time(&rawtime); tm *ptm = gmtime(&rawtime); gsl_rng_default_seed = (unsigned)(ptm->tm_hour % 24 * 3600 + ptm->tm_min * 60 + ptm->tm_sec); } gsl_r = gsl_rng_alloc(T); if (is_debug_mode()) { printf ("GSL random generator type: %s; ", gsl_rng_name (gsl_r)); printf ("seed = %lu (option %li); ", gsl_rng_default_seed, randseed); printf ("first value = %lu\n", gsl_rng_get (gsl_r)); } if (flag != 1 && a_mode != 15 && a_mode != 27 && a_mode != 28 && a_mode != 43 && a_mode != 5 && a_mode != 61 && a_mode != 62 && a_mode != 63 && a_mode != 66 && a_mode != 67) { cout << "error! either plink binary files, or bimbam mean" << "genotype files, or gene expression files are required." << endl; error = true; } if (file_pheno.empty() && (a_mode == 43 || a_mode == 5)) { cout << "error! phenotype file is required." << endl; error = true; } if (a_mode == 61 || a_mode == 62) { if (!file_beta.empty()) { if (file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() && file_geno.empty() && file_mref.empty() && file_ref.empty()) { cout << "error! missing genotype file or ref/mref file." << endl; error = true; } } else if (!file_pheno.empty()) { if (file_kin.empty() && (file_ku.empty() || file_kd.empty()) && file_mk.empty()) { cout << "error! missing relatedness file. " << endl; error = true; } } else if ((file_mstudy.empty() && file_study.empty()) || (file_mref.empty() && file_ref.empty())) { cout << "error! either beta file, or phenotype files or " << "study/ref mstudy/mref files are required." << endl; error = true; } } if (a_mode == 63) { if (file_kin.empty() && (file_ku.empty() || file_kd.empty()) && file_mk.empty()) { cout << "error! missing relatedness file. " << endl; error = true; } if (file_pheno.empty()) { cout << "error! missing phenotype file." << endl; error = true; } } if (a_mode == 66 || a_mode == 67) { if (file_beta.empty() || (file_mbfile.empty() && file_bfile.empty() && file_mgeno.empty() && file_geno.empty())) { cout << "error! missing beta file or genotype file." << endl; error = true; } } if (!file_epm.empty() && file_bfile.empty() && file_geno.empty()) { cout << "error! estimated parameter file also requires genotype " << "file." << endl; error = true; } if (!file_ebv.empty() && file_kin.empty()) { cout << "error! estimated breeding value file also requires " << "relatedness file." << endl; error = true; } if (!file_log.empty() && pheno_mean != 0) { cout << "error! either log file or mu value can be provide." << endl; error = true; } enforce_fexists(file_snps, "open file"); enforce_fexists(file_ksnps, "open file"); enforce_fexists(file_gwasnps, "open file"); enforce_fexists(file_anno, "open file"); if (!loco.empty()) { enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 9 || a_mode == 21 || a_mode == 22, "LOCO only works with LMM and K"); // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)"); enforce_msg(!file_anno.empty(), "LOCO requires annotation file (-a switch)"); enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch"); enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch"); } enforce_fexists(file_kin, "open file"); enforce_fexists(file_mk, "open file"); enforce_fexists(file_cvt, "open file"); enforce_fexists(file_gxe, "open file"); enforce_fexists(file_log, "open file"); enforce_fexists(file_weight, "open file"); enforce_fexists(file_epm, "open file"); enforce_fexists(file_ebv, "open file"); enforce_fexists(file_read, "open file"); // Check if files are compatible with analysis mode. if (k_mode == 2 && !file_geno.empty()) { cout << "error! use \"-km 1\" when using bimbam mean genotype " << "file. " << endl; error = true; } if ((a_mode == 1 || a_mode == 2 || a_mode == 3 || a_mode == 4 || a_mode == 5 || a_mode == 9 || a_mode == 31) && (file_kin.empty() && (file_ku.empty() || file_kd.empty()))) { cout << "error! missing relatedness file. " << endl; error = true; } if ((a_mode == 43) && file_kin.empty()) { cout << "error! missing relatedness file. -predict option requires " << "-k option to provide a relatedness file." << endl; error = true; } if ((a_mode == 11 || a_mode == 12 || a_mode == 13 || a_mode == 14 || a_mode == 16) && !file_cvt.empty()) { cout << "error! -bslmm option does not support covariates files." << endl; error = true; } if (a_mode == 41 || a_mode == 42) { if (!file_cvt.empty()) { cout << "error! -predict option does not support " << "covariates files." << endl; error = true; } if (file_epm.empty()) { cout << "error! -predict option requires estimated " << "parameter files." << endl; error = true; } } if (file_beta.empty() && (a_mode == 27 || a_mode == 28)) { cout << "error! beta effects file is required." << endl; error = true; } return; } void PARAM::CheckData(void) { if ((a_mode == 66 || a_mode == 67) && (v_pve.size() != n_vc)) { cout << "error! the number of pve estimates does not equal to " << "the number of categories in the cat file:" << v_pve.size() << " " << n_vc << endl; error = true; } if ((indicator_cvt).size() != 0 && (indicator_cvt).size() != (indicator_idv).size()) { error = true; cout << "error! number of rows in the covariates file do not " << "match the number of individuals. " << indicator_cvt.size() << endl; return; } if ((indicator_gxe).size() != 0 && (indicator_gxe).size() != (indicator_idv).size()) { error = true; cout << "error! number of rows in the gxe file do not match the number " << "of individuals. " << endl; return; } if ((indicator_weight).size() != 0 && (indicator_weight).size() != (indicator_idv).size()) { error = true; cout << "error! number of rows in the weight file do not match " << "the number of individuals. " << endl; return; } if ((indicator_read).size() != 0 && (indicator_read).size() != (indicator_idv).size()) { error = true; cout << "error! number of rows in the total read file do not " << "match the number of individuals. " << endl; return; } // Calculate ni_total and ni_test, and set indicator_idv to 0 // whenever indicator_cvt=0, and calculate np_obs and np_miss. ni_total = (indicator_idv).size(); ni_test = 0; for (vector::size_type i = 0; i < (indicator_idv).size(); ++i) { if (indicator_idv[i] == 0) { continue; } ni_test++; } ni_cvt = 0; for (size_t i = 0; i < indicator_cvt.size(); i++) { if (indicator_cvt[i] == 0) { continue; } ni_cvt++; } np_obs = 0; np_miss = 0; for (size_t i = 0; i < indicator_pheno.size(); i++) { if (indicator_cvt.size() != 0) { if (indicator_cvt[i] == 0) { continue; } } if (indicator_gxe.size() != 0) { if (indicator_gxe[i] == 0) { continue; } } if (indicator_weight.size() != 0) { if (indicator_weight[i] == 0) { continue; } } for (size_t j = 0; j < indicator_pheno[i].size(); j++) { if (indicator_pheno[i][j] == 0) { np_miss++; } else { np_obs++; } } } enforce_msg(ni_test,"number of analyzed individuals equals 0."); if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() && file_study.empty() && file_beta.empty() && file_bf.empty()) { error = true; cout << "error! number of analyzed individuals equals 0. " << endl; return; } if (a_mode == 43) { if (ni_cvt == ni_test) { error = true; cout << "error! no individual has missing " << "phenotypes." << endl; return; } if ((np_obs + np_miss) != (ni_cvt * n_ph)) { error = true; cout << "error! number of phenotypes do not match the " << "summation of missing and observed phenotypes." << endl; return; } } // Output some information. if (file_cor.empty() && file_mstudy.empty() && file_study.empty() && a_mode != 15 && a_mode != 27 && a_mode != 28) { cout << "## number of total individuals = " << ni_total << endl; if (a_mode == 43) { cout << "## number of analyzed individuals = " << ni_cvt << endl; cout << "## number of individuals with full phenotypes = " << ni_test << endl; } else { cout << "## number of analyzed individuals = " << ni_test << endl; } cout << "## number of covariates = " << n_cvt << endl; cout << "## number of phenotypes = " << n_ph << endl; if (a_mode == 43) { cout << "## number of observed data = " << np_obs << endl; cout << "## number of missing data = " << np_miss << endl; } if (!file_gene.empty()) { cout << "## number of total genes = " << ng_total << endl; } else if (file_epm.empty() && a_mode != 43 && a_mode != 5) { if (!loco.empty()) cout << "## leave one chromosome out (LOCO) = " << setw(8) << loco << endl; cout << "## number of total SNPs/var = " << setw(8) << ns_total << endl; if (setSnps.size()) cout << "## number of considered SNPS = " << setw(8) << setSnps.size() << endl; if (setKSnps.size()) cout << "## number of SNPS for K = " << setw(8) << setKSnps.size() << endl; if (setGWASnps.size()) cout << "## number of SNPS for GWAS = " << setw(8) << setGWASnps.size() << endl; cout << "## number of analyzed SNPs = " << setw(8) << ns_test << endl; } else { } } // Set d_pace to 1000 for gene expression. if (!file_gene.empty() && d_pace == DEFAULT_PACE) { d_pace = 1000; } // For case-control studies, count # cases and # controls. int flag_cc = 0; if (a_mode == 13) { ni_case = 0; ni_control = 0; for (size_t i = 0; i < indicator_idv.size(); i++) { if (indicator_idv[i] == 0) { continue; } if (pheno[i][0] == 0) { ni_control++; } else if (pheno[i][0] == 1) { ni_case++; } else { flag_cc = 1; } } cout << "## number of cases = " << ni_case << endl; cout << "## number of controls = " << ni_control << endl; } if (flag_cc == 1) { cout << "Unexpected non-binary phenotypes for " << "case/control analysis. Use default (BSLMM) analysis instead." << endl; a_mode = 11; } // Set parameters for BSLMM and check for predict. if (a_mode == 11 || a_mode == 12 || a_mode == 13 || a_mode == 14) { if (a_mode == 11) { n_mh = 1; } if (logp_min == 0) { logp_min = -1.0 * log((double)ns_test); } if (h_scale == -1) { h_scale = min(1.0, 10.0 / sqrt((double)ni_test)); } if (rho_scale == -1) { rho_scale = min(1.0, 10.0 / sqrt((double)ni_test)); } if (logp_scale == -1) { logp_scale = min(1.0, 5.0 / sqrt((double)ni_test)); } if (h_min == -1) { h_min = 0.0; } if (h_max == -1) { h_max = 1.0; } if (s_max > ns_test) { s_max = ns_test; cout << "s_max is re-set to the number of analyzed SNPs." << endl; } if (s_max < s_min) { cout << "error! maximum s value must be larger than the " << "minimal value. current values = " << s_max << " and " << s_min << endl; error = true; } } else if (a_mode == 41 || a_mode == 42) { if (indicator_bv.size() != 0) { if (indicator_idv.size() != indicator_bv.size()) { cout << "error! number of rows in the " << "phenotype file does not match that in the " << "estimated breeding value file: " << indicator_idv.size() << "\t" << indicator_bv.size() << endl; error = true; } else { size_t flag_bv = 0; for (size_t i = 0; i < (indicator_bv).size(); ++i) { if (indicator_idv[i] != indicator_bv[i]) { flag_bv++; } } if (flag_bv != 0) { cout << "error! individuals with missing value in the " << "phenotype file does not match that in the " << "estimated breeding value file: " << flag_bv << endl; error = true; } } } } if (a_mode == 62 && !file_beta.empty() && mapRS2wcat.size() == 0) { cout << "vc analysis with beta files requires -wcat file." << endl; error = true; } if (a_mode == 67 && mapRS2wcat.size() == 0) { cout << "ci analysis with beta files requires -wcat file." << endl; error = true; } // File_mk needs to contain more than one line. if (n_vc == 1 && !file_mk.empty()) { cout << "error! -mk file should contain more than one line." << endl; error = true; } return; } void PARAM::PrintSummary() { if (n_ph == 1) { cout << "pve estimate =" << pve_null << endl; cout << "se(pve) =" << pve_se_null << endl; } else { } return; } void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { string file_str; if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; if (ReadFile_bed(file_str, indicator_idv, indicator_snp, UtX, K, calc_K) == false) { error = true; } } else { if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, calc_K) == false) { error = true; } } return; } void PARAM::ReadGenotypes(vector> &Xt, gsl_matrix *K, const bool calc_K) { string file_str; if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; if (ReadFile_bed(file_str, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test) == false) { error = true; } } else { if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, ni_test, ns_test) == false) { error = true; } } return; } void PARAM::CalcKin(gsl_matrix *matrix_kin) { string file_str; gsl_matrix_set_zero(matrix_kin); if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; // enforce_msg(loco.empty(), "FIXME: LOCO nyi"); if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == false) { error = true; } } else { file_str = file_geno; if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace, matrix_kin, ni_max == 0) == false) { error = true; } } return; } // From an existing n by nd A and K matrices, compute the d by d S // matrix (which is not necessary symmetric). void compAKtoS(const gsl_matrix *A, const gsl_matrix *K, const size_t n_cvt, gsl_matrix *S) { size_t n_vc = S->size1, ni_test = A->size1; double di, dj, tr_AK, sum_A, sum_K, s_A, s_K, sum_AK, tr_A, tr_K, d; for (size_t i = 0; i < n_vc; i++) { for (size_t j = 0; j < n_vc; j++) { tr_AK = 0; sum_A = 0; sum_K = 0; sum_AK = 0; tr_A = 0; tr_K = 0; for (size_t l = 0; l < ni_test; l++) { s_A = 0; s_K = 0; for (size_t k = 0; k < ni_test; k++) { di = gsl_matrix_get(A, l, k + ni_test * i); dj = gsl_matrix_get(K, l, k + ni_test * j); s_A += di; s_K += dj; tr_AK += di * dj; sum_A += di; sum_K += dj; if (l == k) { tr_A += di; tr_K += dj; } } sum_AK += s_A * s_K; } sum_A /= (double)ni_test; sum_K /= (double)ni_test; sum_AK /= (double)ni_test; tr_A -= sum_A; tr_K -= sum_K; d = tr_AK - 2 * sum_AK + sum_A * sum_K; if (tr_A == 0 || tr_K == 0) { d = 0; } else { assert((tr_A * tr_K) - 1 != 0); assert(ni_test - n_cvt != 0); d = d / (tr_A * tr_K) - 1 / (double)(ni_test - n_cvt); } gsl_matrix_set(S, i, j, d); } } return; } /* // Copied from lmm.cpp; is used in the following function compKtoV // map a number 1..(n_cvt+2) to an index between 0 and [(n_cvt+2)*2+(n_cvt+2)]/2-1 // or 1..cols to 0..(cols*2+cols)/2-1. For a 3x3 matrix the following index gets returned to CalcPab: CalcPab * 1,1:0 * 1,2:1 * 1,3:2 * 2,2:5 * 2,3:6 * 3,3:9 which is really the iteration moving forward along the diagonal and items to the right of it. */ size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt) { auto cols = n_cvt + 2; enforce_msg(a<=cols && b<=cols,"GetabIndex problem"); size_t a1 = a, b1 = b; if (b <= a) { a1 = b; b1 = a; } size_t index = (2 * cols - a1 + 2) * (a1 - 1) / 2 + b1 - a1; // cout << "* GetabIndx " << a1 << "," << b1 << "," << cols << ":" << index << endl; // FIXME: should add a contract for index range return index; // return ( b < a ? ((2 * cols - b + 2) * (b - 1) / 2 + a - b ): ((2 * cols - a + 2) * (a - 1) / 2 + b - a) ); } // From an existing n by nd (centered) G matrix, compute the d+1 by // d*(d-1)/2*(d+1) Q matrix where inside i'th d+1 by d+1 matrix, each // element is tr(KiKlKjKm)-r*tr(KmKiKl)-r*tr(KlKjKm)+r^2*tr(KlKm), // where r=n/(n-1) void compKtoV(const gsl_matrix *G, gsl_matrix *V) { size_t n_vc = G->size2 / G->size1, ni_test = G->size1; gsl_matrix *KiKj = gsl_matrix_alloc(ni_test, (n_vc * (n_vc + 1)) / 2 * ni_test); gsl_vector *trKiKj = gsl_vector_alloc(n_vc * (n_vc + 1) / 2); gsl_vector *trKi = gsl_vector_alloc(n_vc); assert(ni_test != 1); double d, tr, r = (double)ni_test / (double)(ni_test - 1); size_t t, t_il, t_jm, t_lm, t_im, t_jl, t_ij; // Compute KiKj for all pairs of i and j (not including the identity // matrix). t = 0; for (size_t i = 0; i < n_vc; i++) { gsl_matrix_const_view Ki = gsl_matrix_const_submatrix(G, 0, i * ni_test, ni_test, ni_test); for (size_t j = i; j < n_vc; j++) { gsl_matrix_const_view Kj = gsl_matrix_const_submatrix(G, 0, j * ni_test, ni_test, ni_test); gsl_matrix_view KiKj_sub = gsl_matrix_submatrix(KiKj, 0, t * ni_test, ni_test, ni_test); fast_dgemm("N", "N", 1.0, &Ki.matrix, &Kj.matrix, 0.0, &KiKj_sub.matrix); t++; } } // Compute trKi, trKiKj. t = 0; for (size_t i = 0; i < n_vc; i++) { for (size_t j = i; j < n_vc; j++) { tr = 0; for (size_t k = 0; k < ni_test; k++) { tr += gsl_matrix_get(KiKj, k, t * ni_test + k); } gsl_vector_set(trKiKj, t, tr); t++; } tr = 0; for (size_t k = 0; k < ni_test; k++) { tr += gsl_matrix_get(G, k, i * ni_test + k); } gsl_vector_set(trKi, i, tr); } // Compute V. for (size_t i = 0; i < n_vc; i++) { for (size_t j = i; j < n_vc; j++) { t_ij = GetabIndex(i + 1, j + 1, n_vc - 2); for (size_t l = 0; l < n_vc + 1; l++) { for (size_t m = 0; m < n_vc + 1; m++) { if (l != n_vc && m != n_vc) { t_il = GetabIndex(i + 1, l + 1, n_vc - 2); t_jm = GetabIndex(j + 1, m + 1, n_vc - 2); t_lm = GetabIndex(l + 1, m + 1, n_vc - 2); tr = 0; for (size_t k = 0; k < ni_test; k++) { gsl_vector_const_view KiKl_row = gsl_matrix_const_subrow(KiKj, k, t_il * ni_test, ni_test); gsl_vector_const_view KiKl_col = gsl_matrix_const_column(KiKj, t_il * ni_test + k); gsl_vector_const_view KjKm_row = gsl_matrix_const_subrow(KiKj, k, t_jm * ni_test, ni_test); gsl_vector_const_view KjKm_col = gsl_matrix_const_column(KiKj, t_jm * ni_test + k); gsl_vector_const_view Kl_row = gsl_matrix_const_subrow(G, k, l * ni_test, ni_test); gsl_vector_const_view Km_row = gsl_matrix_const_subrow(G, k, m * ni_test, ni_test); if (i <= l && j <= m) { gsl_blas_ddot(&KiKl_row.vector, &KjKm_col.vector, &d); tr += d; gsl_blas_ddot(&Km_row.vector, &KiKl_col.vector, &d); tr -= r * d; gsl_blas_ddot(&Kl_row.vector, &KjKm_col.vector, &d); tr -= r * d; } else if (i <= l && j > m) { gsl_blas_ddot(&KiKl_row.vector, &KjKm_row.vector, &d); tr += d; gsl_blas_ddot(&Km_row.vector, &KiKl_col.vector, &d); tr -= r * d; gsl_blas_ddot(&Kl_row.vector, &KjKm_row.vector, &d); tr -= r * d; } else if (i > l && j <= m) { gsl_blas_ddot(&KiKl_col.vector, &KjKm_col.vector, &d); tr += d; gsl_blas_ddot(&Km_row.vector, &KiKl_row.vector, &d); tr -= r * d; gsl_blas_ddot(&Kl_row.vector, &KjKm_col.vector, &d); tr -= r * d; } else { gsl_blas_ddot(&KiKl_col.vector, &KjKm_row.vector, &d); tr += d; gsl_blas_ddot(&Km_row.vector, &KiKl_row.vector, &d); tr -= r * d; gsl_blas_ddot(&Kl_row.vector, &KjKm_row.vector, &d); tr -= r * d; } } tr += r * r * gsl_vector_get(trKiKj, t_lm); } else if (l != n_vc && m == n_vc) { t_il = GetabIndex(i + 1, l + 1, n_vc - 2); t_jl = GetabIndex(j + 1, l + 1, n_vc - 2); tr = 0; for (size_t k = 0; k < ni_test; k++) { gsl_vector_const_view KiKl_row = gsl_matrix_const_subrow(KiKj, k, t_il * ni_test, ni_test); gsl_vector_const_view KiKl_col = gsl_matrix_const_column(KiKj, t_il * ni_test + k); gsl_vector_const_view Kj_row = gsl_matrix_const_subrow(G, k, j * ni_test, ni_test); if (i <= l) { gsl_blas_ddot(&KiKl_row.vector, &Kj_row.vector, &d); tr += d; } else { gsl_blas_ddot(&KiKl_col.vector, &Kj_row.vector, &d); tr += d; } } tr += -r * gsl_vector_get(trKiKj, t_il) - r * gsl_vector_get(trKiKj, t_jl) + r * r * gsl_vector_get(trKi, l); } else if (l == n_vc && m != n_vc) { t_jm = GetabIndex(j + 1, m + 1, n_vc - 2); t_im = GetabIndex(i + 1, m + 1, n_vc - 2); tr = 0; for (size_t k = 0; k < ni_test; k++) { gsl_vector_const_view KjKm_row = gsl_matrix_const_subrow(KiKj, k, t_jm * ni_test, ni_test); gsl_vector_const_view KjKm_col = gsl_matrix_const_column(KiKj, t_jm * ni_test + k); gsl_vector_const_view Ki_row = gsl_matrix_const_subrow(G, k, i * ni_test, ni_test); if (j <= m) { gsl_blas_ddot(&KjKm_row.vector, &Ki_row.vector, &d); tr += d; } else { gsl_blas_ddot(&KjKm_col.vector, &Ki_row.vector, &d); tr += d; } } tr += -r * gsl_vector_get(trKiKj, t_im) - r * gsl_vector_get(trKiKj, t_jm) + r * r * gsl_vector_get(trKi, m); } else { tr = gsl_vector_get(trKiKj, t_ij) - r * gsl_vector_get(trKi, i) - r * gsl_vector_get(trKi, j) + r * r * (double)(ni_test - 1); } gsl_matrix_set(V, l, t_ij * (n_vc + 1) + m, tr); } } } } assert(ni_test != 0); gsl_matrix_scale(V, 1.0 / pow((double)ni_test, 2)); gsl_matrix_free(KiKj); gsl_vector_free(trKiKj); gsl_vector_free(trKi); return; } // Perform Jacknife sampling for variance of S. void JackknifeAKtoS(const gsl_matrix *W, const gsl_matrix *A, const gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar) { size_t n_vc = Svar->size1, ni_test = A->size1, n_cvt = W->size2; vector>> trAK, sumAK; vector> sumA, sumK, trA, trK, sA, sK; vector vec_tmp; double di, dj, d, m, v; // Initialize and set all elements to zero. for (size_t i = 0; i < ni_test; i++) { vec_tmp.push_back(0); } for (size_t i = 0; i < n_vc; i++) { sumA.push_back(vec_tmp); sumK.push_back(vec_tmp); trA.push_back(vec_tmp); trK.push_back(vec_tmp); sA.push_back(vec_tmp); sK.push_back(vec_tmp); } for (size_t i = 0; i < n_vc; i++) { trAK.push_back(sumK); sumAK.push_back(sumK); } // Run jackknife. for (size_t i = 0; i < n_vc; i++) { for (size_t l = 0; l < ni_test; l++) { for (size_t k = 0; k < ni_test; k++) { di = gsl_matrix_get(A, l, k + ni_test * i); dj = gsl_matrix_get(K, l, k + ni_test * i); for (size_t t = 0; t < ni_test; t++) { if (t == l || t == k) { continue; } sumA[i][t] += di; sumK[i][t] += dj; if (l == k) { trA[i][t] += di; trK[i][t] += dj; } } sA[i][l] += di; sK[i][l] += dj; } } for (size_t t = 0; t < ni_test; t++) { assert(ni_test != 1); sumA[i][t] /= (double)(ni_test - 1); sumK[i][t] /= (double)(ni_test - 1); } } for (size_t i = 0; i < n_vc; i++) { for (size_t j = 0; j < n_vc; j++) { for (size_t l = 0; l < ni_test; l++) { for (size_t k = 0; k < ni_test; k++) { di = gsl_matrix_get(A, l, k + ni_test * i); dj = gsl_matrix_get(K, l, k + ni_test * j); d = di * dj; for (size_t t = 0; t < ni_test; t++) { if (t == l || t == k) { continue; } trAK[i][j][t] += d; } } for (size_t t = 0; t < ni_test; t++) { if (t == l) { continue; } di = gsl_matrix_get(A, l, t + ni_test * i); dj = gsl_matrix_get(K, l, t + ni_test * j); sumAK[i][j][t] += (sA[i][l] - di) * (sK[j][l] - dj); } } for (size_t t = 0; t < ni_test; t++) { assert(ni_test != 1); sumAK[i][j][t] /= (double)(ni_test - 1); } m = 0; v = 0; for (size_t t = 0; t < ni_test; t++) { d = trAK[i][j][t] - 2 * sumAK[i][j][t] + sumA[i][t] * sumK[j][t]; if ((trA[i][t] - sumA[i][t]) == 0 || (trK[j][t] - sumK[j][t]) == 0) { d = 0; } else { d /= (trA[i][t] - sumA[i][t]) * (trK[j][t] - sumK[j][t]); d -= 1 / (double)(ni_test - n_cvt - 1); } m += d; v += d * d; } m /= (double)ni_test; v /= (double)ni_test; v -= m * m; v *= (double)(ni_test - 1); gsl_matrix_set(Svar, i, j, v); if (n_cvt == 1) { d = gsl_matrix_get(S, i, j); d = (double)ni_test * d - (double)(ni_test - 1) * m; gsl_matrix_set(S, i, j, d); } } } return; } // Compute the d by d S matrix with its d by d variance matrix of // Svar, and the d+1 by d(d+1) matrix of Q for V(q). void PARAM::CalcS(const map &mapRS2wA, const map &mapRS2wK, const gsl_matrix *W, gsl_matrix *A, gsl_matrix *K, gsl_matrix *S, gsl_matrix *Svar, gsl_vector *ns) { string file_str; gsl_matrix_set_zero(S); gsl_matrix_set_zero(Svar); gsl_vector_set_zero(ns); // Compute the kinship matrix G for multiple categories; these // matrices are not centered, for convienence of Jacknife sampling. if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; if (mapRS2wA.size() == 0) { if (PlinkKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns) == false) { error = true; } } else { if (PlinkKin(file_str, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns) == false) { error = true; } } } else if (!file_geno.empty()) { file_str = file_geno; if (mapRS2wA.size() == 0) { if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv, indicator_snp, mapRS2wK, mapRS2cat, snpInfo, W, K, ns) == false) { error = true; } } else { if (BimbamKinUncentered(file_str, setKSnps, d_pace, indicator_idv, indicator_snp, mapRS2wA, mapRS2cat, snpInfo, W, A, ns) == false) { error = true; } } } else if (!file_mbfile.empty()) { if (mapRS2wA.size() == 0) { if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns) == false) { error = true; } } else { if (MFILEKin(1, file_mbfile, setKSnps, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns) == false) { error = true; } } } else if (!file_mgeno.empty()) { if (mapRS2wA.size() == 0) { if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv, mindicator_snp, mapRS2wK, mapRS2cat, msnpInfo, W, K, ns) == false) { error = true; } } else { if (MFILEKin(0, file_mgeno, setKSnps, d_pace, indicator_idv, mindicator_snp, mapRS2wA, mapRS2cat, msnpInfo, W, A, ns) == false) { error = true; } } } if (mapRS2wA.size() == 0) { gsl_matrix_memcpy(A, K); } // Center and scale every kinship matrix inside G. for (size_t i = 0; i < n_vc; i++) { gsl_matrix_view Ksub = gsl_matrix_submatrix(K, 0, i * ni_test, ni_test, ni_test); CenterMatrix(&Ksub.matrix); // Scale the matrix G such that the mean diagonal = 1. ScaleMatrix(&Ksub.matrix); gsl_matrix_view Asub = gsl_matrix_submatrix(A, 0, i * ni_test, ni_test, ni_test); CenterMatrix(&Asub.matrix); ScaleMatrix(&Asub.matrix); } // Cased on G, compute S. compAKtoS(A, K, W->size2, S); // Compute Svar and update S with Jacknife. JackknifeAKtoS(W, A, K, S, Svar); return; } void PARAM::WriteVector(const gsl_vector *q, const gsl_vector *s, const size_t n_total, const string suffix) { string file_str; file_str = path_out + "/" + file_out; file_str += "."; file_str += suffix; file_str += ".txt"; ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { cout << "error writing file: " << file_str.c_str() << endl; return; } outfile.precision(10); for (size_t i = 0; i < q->size; ++i) { outfile << gsl_vector_get(q, i) << endl; } for (size_t i = 0; i < s->size; ++i) { outfile << gsl_vector_get(s, i) << endl; } outfile << n_total << endl; outfile.close(); outfile.clear(); return; } void PARAM::WriteVar(const string suffix) { string file_str, rs; file_str = path_out + "/" + file_out; file_str += "."; file_str += suffix; file_str += ".txt.gz"; ogzstream outfile(file_str.c_str(), ogzstream::out); if (!outfile) { cout << "error writing file: " << file_str.c_str() << endl; return; } outfile.precision(10); if (mindicator_snp.size() != 0) { for (size_t t = 0; t < mindicator_snp.size(); t++) { indicator_snp = mindicator_snp[t]; for (size_t i = 0; i < indicator_snp.size(); i++) { if (indicator_snp[i] == 0) { continue; } rs = snpInfo[i].rs_number; outfile << rs << endl; } } } else { for (size_t i = 0; i < indicator_snp.size(); i++) { if (indicator_snp[i] == 0) { continue; } rs = snpInfo[i].rs_number; outfile << rs << endl; } } outfile.close(); outfile.clear(); return; } void PARAM::WriteMatrix(const gsl_matrix *matrix_U, const string suffix) { string file_str; file_str = path_out + "/" + file_out; file_str += "."; file_str += suffix; file_str += ".txt"; ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { cout << "error writing file: " << file_str.c_str() << endl; return; } outfile.precision(10); for (size_t i = 0; i < matrix_U->size1; ++i) { for (size_t j = 0; j < matrix_U->size2; ++j) { outfile << tab(j) << gsl_matrix_get(matrix_U, i, j); } outfile << endl; } outfile.close(); outfile.clear(); return; } void PARAM::WriteVector(const gsl_vector *vector_D, const string suffix) { string file_str; file_str = path_out + "/" + file_out; file_str += "."; file_str += suffix; file_str += ".txt"; ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { cout << "error writing file: " << file_str.c_str() << endl; return; } outfile.precision(10); for (size_t i = 0; i < vector_D->size; ++i) { outfile << gsl_vector_get(vector_D, i) << endl; } outfile.close(); outfile.clear(); return; } void PARAM::CheckCvt() { if (indicator_cvt.size() == 0) { return; } size_t ci_test = 0; gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); for (vector::size_type i = 0; i < indicator_idv.size(); ++i) { if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) { continue; } for (size_t j = 0; j < n_cvt; ++j) { gsl_matrix_set(W, ci_test, j, (cvt)[i][j]); } ci_test++; } size_t flag_ipt = 0; double v_min, v_max; set set_remove; // Check if any columns is an intercept. for (size_t i = 0; i < W->size2; i++) { gsl_vector_view w_col = gsl_matrix_column(W, i); gsl_vector_minmax(&w_col.vector, &v_min, &v_max); if (v_min == v_max) { flag_ipt = 1; set_remove.insert(i); } } // Add an intercept term if needed. if (n_cvt == set_remove.size()) { indicator_cvt.clear(); n_cvt = 1; } else if (flag_ipt == 0) { info_msg("no intercept term is found in the cvt file: a column of 1s is added"); for (vector::size_type i = 0; i < indicator_idv.size(); ++i) { if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) { continue; } cvt[i].push_back(1.0); } n_cvt++; } else { } gsl_matrix_free(W); return; } // Post-process phenotypes and covariates. void PARAM::ProcessCvtPhen() { // Convert indicator_pheno to indicator_idv. int k = 1; indicator_idv.clear(); for (size_t i = 0; i < indicator_pheno.size(); i++) { k = 1; for (size_t j = 0; j < indicator_pheno[i].size(); j++) { if (indicator_pheno[i][j] == 0) { k = 0; } } indicator_idv.push_back(k); } // Remove individuals with missing covariates. if ((indicator_cvt).size() != 0) { for (vector::size_type i = 0; i < (indicator_idv).size(); ++i) { indicator_idv[i] *= indicator_cvt[i]; } } // Remove individuals with missing gxe variables. if ((indicator_gxe).size() != 0) { for (vector::size_type i = 0; i < (indicator_idv).size(); ++i) { indicator_idv[i] *= indicator_gxe[i]; } } // Remove individuals with missing residual weights. if ((indicator_weight).size() != 0) { for (vector::size_type i = 0; i < (indicator_idv).size(); ++i) { indicator_idv[i] *= indicator_weight[i]; } } // Obtain ni_test. ni_test = 0; for (vector::size_type i = 0; i < (indicator_idv).size(); ++i) { if (indicator_idv[i] == 0) { continue; } ni_test++; } // If subsample number is set, perform a random sub-sampling // to determine the subsampled ids. if (ni_subsample != 0) { if (ni_test < ni_subsample) { cout << "error! number of subsamples is less than number of" << "analyzed individuals. " << endl; } else { // From ni_test, sub-sample ni_subsample. vector a, b; for (size_t i = 0; i < ni_subsample; i++) { a.push_back(0); } for (size_t i = 0; i < ni_test; i++) { b.push_back(i); } gsl_ran_choose(gsl_r, static_cast(&a[0]), ni_subsample, static_cast(&b[0]), ni_test, sizeof(size_t)); // Re-set indicator_idv and ni_test. int j = 0; for (vector::size_type i = 0; i < (indicator_idv).size(); ++i) { if (indicator_idv[i] == 0) { continue; } if (find(a.begin(), a.end(), j) == a.end()) { indicator_idv[i] = 0; } j++; } ni_test = ni_subsample; } } // Check ni_test. if (a_mode != M_BSLMM5) enforce_msg(ni_test,"number of analyzed individuals equals 0."); if (ni_test == 0 && a_mode != M_BSLMM5) { error = true; cout << "error! number of analyzed individuals equals 0. " << endl; } // Check covariates to see if they are correlated with each // other, and to see if the intercept term is included. // After getting ni_test. // Add or remove covariates. if (indicator_cvt.size() != 0) { CheckCvt(); } else { vector cvt_row; cvt_row.push_back(1); for (vector::size_type i = 0; i < (indicator_idv).size(); ++i) { indicator_cvt.push_back(1); cvt.push_back(cvt_row); } } return; } void PARAM::CopyCvt(gsl_matrix *W) { size_t ci_test = 0; for (vector::size_type i = 0; i < indicator_idv.size(); ++i) { if (indicator_idv[i] == 0 || indicator_cvt[i] == 0) { continue; } for (size_t j = 0; j < n_cvt; ++j) { gsl_matrix_set(W, ci_test, j, (cvt)[i][j]); } ci_test++; } return; } void PARAM::CopyGxe(gsl_vector *env) { size_t ci_test = 0; for (vector::size_type i = 0; i < indicator_idv.size(); ++i) { if (indicator_idv[i] == 0 || indicator_gxe[i] == 0) { continue; } gsl_vector_set(env, ci_test, gxe[i]); ci_test++; } return; } void PARAM::CopyWeight(gsl_vector *w) { size_t ci_test = 0; for (vector::size_type i = 0; i < indicator_idv.size(); ++i) { if (indicator_idv[i] == 0 || indicator_weight[i] == 0) { continue; } gsl_vector_set(w, ci_test, weight[i]); ci_test++; } return; } // If flag=0, then use indicator_idv to load W and Y; // else, use indicator_cvt to load them. void PARAM::CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag) { size_t ci_test = 0; for (vector::size_type i = 0; i < indicator_idv.size(); ++i) { if (flag == 0) { if (indicator_idv[i] == 0) { continue; } } else { if (indicator_cvt[i] == 0) { continue; } } gsl_vector_set(y, ci_test, (pheno)[i][0]); for (size_t j = 0; j < n_cvt; ++j) { gsl_matrix_set(W, ci_test, j, (cvt)[i][j]); } ci_test++; } return; } // If flag=0, then use indicator_idv to load W and Y; // else, use indicator_cvt to load them. void PARAM::CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag) { size_t ci_test = 0; for (vector::size_type i = 0; i < indicator_idv.size(); ++i) { if (flag == 0) { if (indicator_idv[i] == 0) { continue; } } else { if (indicator_cvt[i] == 0) { continue; } } for (size_t j = 0; j < n_ph; ++j) { gsl_matrix_set(Y, ci_test, j, (pheno)[i][j]); } for (size_t j = 0; j < n_cvt; ++j) { gsl_matrix_set(W, ci_test, j, (cvt)[i][j]); } ci_test++; } return; } void PARAM::CopyRead(gsl_vector *log_N) { size_t ci_test = 0; for (vector::size_type i = 0; i < indicator_idv.size(); ++i) { if (indicator_idv[i] == 0) { continue; } gsl_vector_set(log_N, ci_test, log(vec_read[i])); ci_test++; } return; } void PARAM::ObtainWeight(const set &setSnps_beta, map &mapRS2wK) { mapRS2wK.clear(); vector wsum, wcount; for (size_t i = 0; i < n_vc; i++) { wsum.push_back(0.0); wcount.push_back(0.0); } string rs; if (msnpInfo.size() == 0) { for (size_t i = 0; i < snpInfo.size(); i++) { if (indicator_snp[i] == 0) { continue; } rs = snpInfo[i].rs_number; if ((setSnps_beta.size() == 0 || setSnps_beta.count(rs) != 0) && (mapRS2wsnp.size() == 0 || mapRS2wsnp.count(rs) != 0) && (mapRS2wcat.size() == 0 || mapRS2wcat.count(rs) != 0) && (mapRS2cat.size() == 0 || mapRS2cat.count(rs) != 0)) { if (mapRS2wsnp.size() != 0) { mapRS2wK[rs] = mapRS2wsnp[rs]; if (mapRS2cat.size() == 0) { wsum[0] += mapRS2wsnp[rs]; } else { wsum[mapRS2cat[rs]] += mapRS2wsnp[rs]; } wcount[0]++; } else { mapRS2wK[rs] = 1; } } } } else { for (size_t t = 0; t < msnpInfo.size(); t++) { snpInfo = msnpInfo[t]; indicator_snp = mindicator_snp[t]; for (size_t i = 0; i < snpInfo.size(); i++) { if (indicator_snp[i] == 0) { continue; } rs = snpInfo[i].rs_number; if ((setSnps_beta.size() == 0 || setSnps_beta.count(rs) != 0) && (mapRS2wsnp.size() == 0 || mapRS2wsnp.count(rs) != 0) && (mapRS2wcat.size() == 0 || mapRS2wcat.count(rs) != 0) && (mapRS2cat.size() == 0 || mapRS2cat.count(rs) != 0)) { if (mapRS2wsnp.size() != 0) { mapRS2wK[rs] = mapRS2wsnp[rs]; if (mapRS2cat.size() == 0) { wsum[0] += mapRS2wsnp[rs]; } else { wsum[mapRS2cat[rs]] += mapRS2wsnp[rs]; } wcount[0]++; } else { mapRS2wK[rs] = 1; } } } } } if (mapRS2wsnp.size() != 0) { for (size_t i = 0; i < n_vc; i++) { wsum[i] /= wcount[i]; } for (map::iterator it = mapRS2wK.begin(); it != mapRS2wK.end(); ++it) { if (mapRS2cat.size() == 0) { it->second /= wsum[0]; } else { it->second /= wsum[mapRS2cat[it->first]]; } } } return; } // If pve_flag=0 then do not change pve; pve_flag==1, then change pve // to 0 if pve < 0 and pve to 1 if pve > 1. void PARAM::UpdateWeight(const size_t pve_flag, const map &mapRS2wK, const size_t ni_test, const gsl_vector *ns, map &mapRS2wA) { double d; vector wsum, wcount; for (size_t i = 0; i < n_vc; i++) { wsum.push_back(0.0); wcount.push_back(0.0); } for (map::const_iterator it = mapRS2wK.begin(); it != mapRS2wK.end(); ++it) { d = 1; for (size_t i = 0; i < n_vc; i++) { if (v_pve[i] >= 1 && pve_flag == 1) { d += (double)ni_test / gsl_vector_get(ns, i) * mapRS2wcat[it->first][i]; } else if (v_pve[i] <= 0 && pve_flag == 1) { d += 0; } else { d += (double)ni_test / gsl_vector_get(ns, i) * mapRS2wcat[it->first][i] * v_pve[i]; } } mapRS2wA[it->first] = 1 / (d * d); if (mapRS2cat.size() == 0) { wsum[0] += mapRS2wA[it->first]; wcount[0]++; } else { wsum[mapRS2cat[it->first]] += mapRS2wA[it->first]; wcount[mapRS2cat[it->first]]++; } } for (size_t i = 0; i < n_vc; i++) { wsum[i] /= wcount[i]; } for (map::iterator it = mapRS2wA.begin(); it != mapRS2wA.end(); ++it) { if (mapRS2cat.size() == 0) { it->second /= wsum[0]; } else { it->second /= wsum[mapRS2cat[it->first]]; } } return; } // This function updates indicator_snp, and save z-scores and other // values into vectors. void PARAM::UpdateSNPnZ(const map &mapRS2wA, const map &mapRS2A1, const map &mapRS2z, gsl_vector *w, gsl_vector *z, vector &vec_cat) { gsl_vector_set_zero(w); gsl_vector_set_zero(z); vec_cat.clear(); string rs, a1; size_t c = 0; if (msnpInfo.size() == 0) { for (size_t i = 0; i < snpInfo.size(); i++) { if (indicator_snp[i] == 0) { continue; } rs = snpInfo[i].rs_number; a1 = snpInfo[i].a_minor; if (mapRS2wA.count(rs) != 0) { if (a1 == mapRS2A1.at(rs)) { gsl_vector_set(z, c, mapRS2z.at(rs)); } else { gsl_vector_set(z, c, -1 * mapRS2z.at(rs)); } vec_cat.push_back(mapRS2cat.at(rs)); gsl_vector_set(w, c, mapRS2wA.at(rs)); c++; } else { indicator_snp[i] = 0; } } } else { for (size_t t = 0; t < msnpInfo.size(); t++) { snpInfo = msnpInfo[t]; for (size_t i = 0; i < snpInfo.size(); i++) { if (mindicator_snp[t][i] == 0) { continue; } rs = snpInfo[i].rs_number; a1 = snpInfo[i].a_minor; if (mapRS2wA.count(rs) != 0) { if (a1 == mapRS2A1.at(rs)) { gsl_vector_set(z, c, mapRS2z.at(rs)); } else { gsl_vector_set(z, c, -1 * mapRS2z.at(rs)); } vec_cat.push_back(mapRS2cat.at(rs)); gsl_vector_set(w, c, mapRS2wA.at(rs)); c++; } else { mindicator_snp[t][i] = 0; } } } } return; } // This function updates indicator_snp, and save z-scores and other // values into vectors. void PARAM::UpdateSNP(const map &mapRS2wA) { string rs; if (msnpInfo.size() == 0) { for (size_t i = 0; i < snpInfo.size(); i++) { if (indicator_snp[i] == 0) { continue; } rs = snpInfo[i].rs_number; if (mapRS2wA.count(rs) == 0) { indicator_snp[i] = 0; } } } else { for (size_t t = 0; t < msnpInfo.size(); t++) { snpInfo = msnpInfo[t]; for (size_t i = 0; i < mindicator_snp[t].size(); i++) { if (mindicator_snp[t][i] == 0) { continue; } rs = snpInfo[i].rs_number; if (mapRS2wA.count(rs) == 0) { mindicator_snp[t][i] = 0; } } } } return; }