/* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011-2017, Xiang Zhou This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "gsl/gsl_blas.h" #include "gsl/gsl_cdf.h" #include "gsl/gsl_linalg.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" #include "gzstream.h" #include "gemma_io.h" #include "lapack.h" #include "mathfunc.h" #include "param.h" #include "varcov.h" using namespace std; void VARCOV::CopyFromParam(PARAM &cPar) { d_pace = cPar.d_pace; file_bfile = cPar.file_bfile; file_geno = cPar.file_geno; file_out = cPar.file_out; path_out = cPar.path_out; time_opt = 0.0; window_cm = cPar.window_cm; window_bp = cPar.window_bp; window_ns = cPar.window_ns; indicator_idv = cPar.indicator_idv; indicator_snp = cPar.indicator_snp; snpInfo = cPar.snpInfo; return; } void VARCOV::CopyToParam(PARAM &cPar) { cPar.time_opt = time_opt; return; } void VARCOV::WriteCov(const int flag, const vector &snpInfo_sub, const vector> &Cov_mat) { string file_cov; file_cov = path_out + "/" + file_out; file_cov += ".cor.txt"; ofstream outfile; if (flag == 0) { outfile.open(file_cov.c_str(), ofstream::out); if (!outfile) { cout << "error writing file: " << file_cov << endl; return; } outfile << "chr" << "\t" << "rs" << "\t" << "ps" << "\t" << "n_mis" << "\t" << "n_obs" << "\t" << "allele1" << "\t" << "allele0" << "\t" << "af" << "\t" << "window_size" << "\t" << "var" << "\t" << "cor" << endl; } else { outfile.open(file_cov.c_str(), ofstream::app); if (!outfile) { cout << "error writing file: " << file_cov << endl; return; } for (size_t i = 0; i < Cov_mat.size(); i++) { outfile << snpInfo_sub[i].chr << "\t" << snpInfo_sub[i].rs_number << "\t" << snpInfo_sub[i].base_position << "\t" << snpInfo_sub[i].n_miss << "\t" << snpInfo_sub[i].n_idv << "\t" << snpInfo_sub[i].a_minor << "\t" << snpInfo_sub[i].a_major << "\t" << fixed << setprecision(3) << snpInfo_sub[i].maf << "\t" << Cov_mat[i].size() - 1 << "\t"; outfile << scientific << setprecision(6) << Cov_mat[i][0] << "\t"; if (Cov_mat[i].size() == 1) { outfile << "NA"; } else { for (size_t j = 1; j < Cov_mat[i].size(); j++) { if (j == (Cov_mat[i].size() - 1)) { outfile << Cov_mat[i][j]; } else { outfile << Cov_mat[i][j] << ","; } } } outfile << endl; } } outfile.close(); outfile.clear(); return; } bool CompareSNPinfo(const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) { int c_chr = snpInfo1.chr.compare(snpInfo2.chr); long int c_bp = snpInfo1.base_position - snpInfo2.base_position; if (c_chr < 0) { return true; } else if (c_chr > 0) { return false; } else { if (c_bp < 0) { return true; } else if (c_bp > 0) { return false; } else { return true; } } } // Do not sort SNPs (because gzip files do not support random access) // then calculate n_nb, the number of neighbours, for each SNP. void VARCOV::CalcNB(vector &snpInfo_sort) { size_t t2 = 0, n_nb = 0; for (size_t t = 0; t < indicator_snp.size(); ++t) { if (indicator_snp[t] == 0) { continue; } if (snpInfo_sort[t].chr == "-9" || (snpInfo_sort[t].cM == -9 && window_cm != 0) || (snpInfo_sort[t].base_position == -9 && window_bp != 0)) { snpInfo_sort[t].n_nb = 0; continue; } if (t == indicator_snp.size() - 1) { snpInfo_sort[t].n_nb = 0; continue; } t2 = t + 1; n_nb = 0; while (t2 < indicator_snp.size() && snpInfo_sort[t2].chr == snpInfo_sort[t].chr && indicator_snp[t2] == 0) { t2++; } while (t2 < indicator_snp.size() && snpInfo_sort[t2].chr == snpInfo_sort[t].chr && (snpInfo_sort[t2].cM - snpInfo_sort[t].cM < window_cm || window_cm == 0) && (snpInfo_sort[t2].base_position - snpInfo_sort[t].base_position < (long int) window_bp || window_bp == 0) && (n_nb < window_ns || window_ns == 0)) { t2++; n_nb++; while (t2 < indicator_snp.size() && snpInfo_sort[t2].chr == snpInfo_sort[t].chr && indicator_snp[t2] == 0) { t2++; } } snpInfo_sort[t].n_nb = n_nb; } return; } // Vector double is centered to have mean 0. void Calc_Cor(vector> &X_mat, vector &cov_vec) { cov_vec.clear(); double v1, v2, r; vector x_vec = X_mat[0]; lapack_ddot(x_vec, x_vec, v1); cov_vec.push_back(v1 / (double)x_vec.size()); for (size_t i = 1; i < X_mat.size(); i++) { lapack_ddot(X_mat[i], x_vec, r); lapack_ddot(X_mat[i], X_mat[i], v2); r /= sqrt(v1 * v2); cov_vec.push_back(r); } return; } // Read the genotype file again, calculate r2 between pairs of SNPs // within a window, output the file every 10K SNPs the output results // are sorted based on chr and bp output format similar to assoc.txt // files (remember n_miss is replaced by n_idv). // // r2 between the current SNP and every following SNPs within the // window_size (which can vary if cM was used) read bimbam mean // genotype file and calculate the covariance matrix for neighboring // SNPs output values at 10000-SNP-interval. void VARCOV::AnalyzeBimbam() { debug_msg("entering"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { cout << "error reading genotype file:" << file_geno << endl; return; } // Calculate the number of right-hand-side neighbours for each SNP. vector snpInfo_sub; CalcNB(snpInfo); size_t ni_test = 0; for (size_t i = 0; i < indicator_idv.size(); i++) { ni_test += indicator_idv[i]; } gsl_vector *geno = gsl_vector_alloc(ni_test); double geno_mean; vector x_vec, cov_vec; vector> X_mat, Cov_mat; for (size_t i = 0; i < ni_test; i++) { x_vec.push_back(0); } WriteCov(0, snpInfo_sub, Cov_mat); size_t t2 = 0, inc; int n_nb = 0; for (size_t t = 0; t < indicator_snp.size(); ++t) { if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) { ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) { continue; } if (X_mat.size() == 0) { n_nb = snpInfo[t].n_nb + 1; } else { n_nb = snpInfo[t].n_nb - n_nb + 1; } for (int i = 0; i < n_nb; i++) { if (X_mat.size() == 0) { t2 = t; } // Read a line of the snp is filtered out. inc = 0; while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) { t2++; inc++; } Bimbam_ReadOneSNP(inc, indicator_idv, infile, geno, geno_mean); gsl_vector_add_constant(geno, -1.0 * geno_mean); for (size_t j = 0; j < geno->size; j++) { x_vec[j] = gsl_vector_get(geno, j); } X_mat.push_back(x_vec); t2++; } n_nb = snpInfo[t].n_nb; Calc_Cor(X_mat, cov_vec); Cov_mat.push_back(cov_vec); snpInfo_sub.push_back(snpInfo[t]); X_mat.erase(X_mat.begin()); // Write out var/cov values. if (Cov_mat.size() == 10000) { WriteCov(1, snpInfo_sub, Cov_mat); Cov_mat.clear(); snpInfo_sub.clear(); } } if (Cov_mat.size() != 0) { WriteCov(1, snpInfo_sub, Cov_mat); Cov_mat.clear(); snpInfo_sub.clear(); } gsl_vector_free(geno); infile.close(); infile.clear(); return; } void VARCOV::AnalyzePlink() { debug_msg("entering"); string file_bed = file_bfile + ".bed"; ifstream infile(file_bed.c_str(), ios::binary); if (!infile) { cout << "error reading bed file:" << file_bed << endl; return; } // Calculate the number of right-hand-side neighbours for each SNP. vector snpInfo_sub; CalcNB(snpInfo); size_t ni_test = 0; for (size_t i = 0; i < indicator_idv.size(); i++) { ni_test += indicator_idv[i]; } gsl_vector *geno = gsl_vector_alloc(ni_test); double geno_mean; vector x_vec, cov_vec; vector> X_mat, Cov_mat; for (size_t i = 0; i < ni_test; i++) { x_vec.push_back(0); } WriteCov(0, snpInfo_sub, Cov_mat); size_t t2 = 0, inc; int n_nb = 0; for (size_t t = 0; t < indicator_snp.size(); ++t) { if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) { ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) { continue; } if (X_mat.size() == 0) { n_nb = snpInfo[t].n_nb + 1; } else { n_nb = snpInfo[t].n_nb - n_nb + 1; } for (int i = 0; i < n_nb; i++) { if (X_mat.size() == 0) { t2 = t; } // Read a line if the SNP is filtered out. inc = 0; while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) { t2++; inc++; } Plink_ReadOneSNP(t2, indicator_idv, infile, geno, geno_mean); gsl_vector_add_constant(geno, -1.0 * geno_mean); for (size_t j = 0; j < geno->size; j++) { x_vec[j] = gsl_vector_get(geno, j); } X_mat.push_back(x_vec); t2++; } n_nb = snpInfo[t].n_nb; Calc_Cor(X_mat, cov_vec); Cov_mat.push_back(cov_vec); snpInfo_sub.push_back(snpInfo[t]); X_mat.erase(X_mat.begin()); // Write out var/cov values. if (Cov_mat.size() == 10000) { WriteCov(1, snpInfo_sub, Cov_mat); Cov_mat.clear(); snpInfo_sub.clear(); } } if (Cov_mat.size() != 0) { WriteCov(1, snpInfo_sub, Cov_mat); Cov_mat.clear(); snpInfo_sub.clear(); } gsl_vector_free(geno); infile.close(); infile.clear(); return; }