diff options
Diffstat (limited to 'src/varcov.cpp')
-rw-r--r-- | src/varcov.cpp | 386 |
1 files changed, 216 insertions, 170 deletions
diff --git a/src/varcov.cpp b/src/varcov.cpp index 46b5bf8..0f87ba8 100644 --- a/src/varcov.cpp +++ b/src/varcov.cpp @@ -16,103 +16,126 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ -#include <iostream> +#include <bitset> +#include <cmath> +#include <cstring> #include <fstream> -#include <sstream> -#include <string> #include <iomanip> -#include <bitset> -#include <vector> +#include <iostream> #include <map> #include <set> -#include <cstring> -#include <cmath> +#include <sstream> #include <stdio.h> #include <stdlib.h> +#include <string> +#include <vector> -#include "gsl/gsl_vector.h" -#include "gsl/gsl_matrix.h" -#include "gsl/gsl_linalg.h" #include "gsl/gsl_blas.h" #include "gsl/gsl_cdf.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_vector.h" -#include "lapack.h" #include "gzstream.h" -#include "param.h" -#include "varcov.h" #include "io.h" +#include "lapack.h" #include "mathfunc.h" +#include "param.h" +#include "varcov.h" using namespace std; -void VARCOV::CopyFromParam (PARAM &cPar) { - d_pace=cPar.d_pace; +void VARCOV::CopyFromParam(PARAM &cPar) { + d_pace = cPar.d_pace; - file_bfile=cPar.file_bfile; - file_geno=cPar.file_geno; - file_out=cPar.file_out; - path_out=cPar.path_out; + file_bfile = cPar.file_bfile; + file_geno = cPar.file_geno; + file_out = cPar.file_out; + path_out = cPar.path_out; - time_opt=0.0; + time_opt = 0.0; - window_cm=cPar.window_cm; - window_bp=cPar.window_bp; - window_ns=cPar.window_ns; + window_cm = cPar.window_cm; + window_bp = cPar.window_bp; + window_ns = cPar.window_ns; - indicator_idv=cPar.indicator_idv; - indicator_snp=cPar.indicator_snp; - snpInfo=cPar.snpInfo; + indicator_idv = cPar.indicator_idv; + indicator_snp = cPar.indicator_snp; + snpInfo = cPar.snpInfo; - return; + return; } -void VARCOV::CopyToParam (PARAM &cPar) { - cPar.time_opt=time_opt; - return; +void VARCOV::CopyToParam(PARAM &cPar) { + cPar.time_opt = time_opt; + return; } -void VARCOV::WriteCov (const int flag, const vector<SNPINFO> &snpInfo_sub, - const vector<vector<double> > &Cov_mat) { +void VARCOV::WriteCov(const int flag, const vector<SNPINFO> &snpInfo_sub, + const vector<vector<double>> &Cov_mat) { string file_cov; - file_cov=path_out+"/"+file_out; - file_cov+=".cor.txt"; + file_cov = path_out + "/" + file_out; + file_cov += ".cor.txt"; ofstream outfile; - if (flag==0) { - outfile.open (file_cov.c_str(), ofstream::out); - if (!outfile) {cout<<"error writing file: "<<file_cov<<endl; return;} + if (flag == 0) { + outfile.open(file_cov.c_str(), ofstream::out); + if (!outfile) { + cout << "error writing file: " << file_cov << endl; + return; + } - outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_mis" - <<"\t"<<"n_obs"<<"\t"<<"allele1"<<"\t"<<"allele0" - <<"\t"<<"af"<<"\t"<<"window_size" - <<"\t"<<"var"<<"\t"<<"cor"<<endl; + outfile << "chr" + << "\t" + << "rs" + << "\t" + << "ps" + << "\t" + << "n_mis" + << "\t" + << "n_obs" + << "\t" + << "allele1" + << "\t" + << "allele0" + << "\t" + << "af" + << "\t" + << "window_size" + << "\t" + << "var" + << "\t" + << "cor" << endl; } else { - outfile.open (file_cov.c_str(), ofstream::app); - if (!outfile) {cout<<"error writing file: "<<file_cov<<endl; return;} - - for (size_t i=0; i<Cov_mat.size(); i++) { - outfile << snpInfo_sub[i].chr << "\t" << snpInfo_sub[i].rs_number << - "\t" << snpInfo_sub[i].base_position << "\t" << - snpInfo_sub[i].n_miss << "\t" << snpInfo_sub[i].n_idv << "\t" << - snpInfo_sub[i].a_minor << "\t" << snpInfo_sub[i].a_major << "\t" << - fixed << setprecision(3) << snpInfo_sub[i].maf << "\t" << - Cov_mat[i].size()-1 << "\t"; - outfile<<scientific<<setprecision(6)<<Cov_mat[i][0]<<"\t"; - - if (Cov_mat[i].size()==1) { - outfile<<"NA"; + outfile.open(file_cov.c_str(), ofstream::app); + if (!outfile) { + cout << "error writing file: " << file_cov << endl; + return; + } + + for (size_t i = 0; i < Cov_mat.size(); i++) { + outfile << snpInfo_sub[i].chr << "\t" << snpInfo_sub[i].rs_number << "\t" + << snpInfo_sub[i].base_position << "\t" << snpInfo_sub[i].n_miss + << "\t" << snpInfo_sub[i].n_idv << "\t" << snpInfo_sub[i].a_minor + << "\t" << snpInfo_sub[i].a_major << "\t" << fixed + << setprecision(3) << snpInfo_sub[i].maf << "\t" + << Cov_mat[i].size() - 1 << "\t"; + outfile << scientific << setprecision(6) << Cov_mat[i][0] << "\t"; + + if (Cov_mat[i].size() == 1) { + outfile << "NA"; } else { - for (size_t j=1; j<Cov_mat[i].size(); j++) { - if (j==(Cov_mat[i].size()-1)) { - outfile<<Cov_mat[i][j]; - } else { - outfile<<Cov_mat[i][j]<<","; - } - } + for (size_t j = 1; j < Cov_mat[i].size(); j++) { + if (j == (Cov_mat[i].size() - 1)) { + outfile << Cov_mat[i][j]; + } else { + outfile << Cov_mat[i][j] << ","; + } + } } - outfile<<endl; + outfile << endl; } } @@ -121,18 +144,18 @@ void VARCOV::WriteCov (const int flag, const vector<SNPINFO> &snpInfo_sub, return; } -bool CompareSNPinfo (const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) { - int c_chr=snpInfo1.chr.compare(snpInfo2.chr); - long int c_bp=snpInfo1.base_position-snpInfo2.base_position; +bool CompareSNPinfo(const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) { + int c_chr = snpInfo1.chr.compare(snpInfo2.chr); + long int c_bp = snpInfo1.base_position - snpInfo2.base_position; - if(c_chr<0) { + if (c_chr < 0) { return true; - } else if (c_chr>0) { + } else if (c_chr > 0) { return false; } else { - if (c_bp<0) { + if (c_bp < 0) { return true; - } else if (c_bp>0) { + } else if (c_bp > 0) { return false; } else { return true; @@ -140,64 +163,73 @@ bool CompareSNPinfo (const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) { } } - // Do not sort SNPs (because gzip files do not support random access) // then calculate n_nb, the number of neighbours, for each SNP. -void VARCOV::CalcNB (vector<SNPINFO> &snpInfo_sort) { - size_t t2=0, n_nb=0; - for (size_t t=0; t<indicator_snp.size(); ++t) { - if (indicator_snp[t]==0) {continue;} - - if (snpInfo_sort[t].chr=="-9" || - (snpInfo_sort[t].cM==-9 && window_cm!=0) || - (snpInfo_sort[t].base_position==-9 && window_bp!=0) ) { - snpInfo_sort[t].n_nb=0; continue; +void VARCOV::CalcNB(vector<SNPINFO> &snpInfo_sort) { + size_t t2 = 0, n_nb = 0; + for (size_t t = 0; t < indicator_snp.size(); ++t) { + if (indicator_snp[t] == 0) { + continue; + } + + if (snpInfo_sort[t].chr == "-9" || + (snpInfo_sort[t].cM == -9 && window_cm != 0) || + (snpInfo_sort[t].base_position == -9 && window_bp != 0)) { + snpInfo_sort[t].n_nb = 0; + continue; } - if (t==indicator_snp.size()-1) {snpInfo_sort[t].n_nb=0; continue;} + if (t == indicator_snp.size() - 1) { + snpInfo_sort[t].n_nb = 0; + continue; + } - t2=t+1; n_nb=0; + t2 = t + 1; + n_nb = 0; - while (t2<indicator_snp.size() && - snpInfo_sort[t2].chr == snpInfo_sort[t].chr && - indicator_snp[t2]==0) { + while (t2 < indicator_snp.size() && + snpInfo_sort[t2].chr == snpInfo_sort[t].chr && + indicator_snp[t2] == 0) { t2++; } - while (t2<indicator_snp.size() && - snpInfo_sort[t2].chr==snpInfo_sort[t].chr && - (snpInfo_sort[t2].cM-snpInfo_sort[t].cM<window_cm || - window_cm==0) && - (snpInfo_sort[t2].base_position-snpInfo_sort[t].base_position < - window_bp || window_bp==0) && (n_nb<window_ns|| window_ns==0)) { - t2++; n_nb++; - while (t2<indicator_snp.size() && - snpInfo_sort[t2].chr==snpInfo_sort[t].chr && - indicator_snp[t2]==0) { - t2++; + while (t2 < indicator_snp.size() && + snpInfo_sort[t2].chr == snpInfo_sort[t].chr && + (snpInfo_sort[t2].cM - snpInfo_sort[t].cM < window_cm || + window_cm == 0) && + (snpInfo_sort[t2].base_position - snpInfo_sort[t].base_position < + window_bp || + window_bp == 0) && + (n_nb < window_ns || window_ns == 0)) { + t2++; + n_nb++; + while (t2 < indicator_snp.size() && + snpInfo_sort[t2].chr == snpInfo_sort[t].chr && + indicator_snp[t2] == 0) { + t2++; } } - snpInfo_sort[t].n_nb=n_nb; + snpInfo_sort[t].n_nb = n_nb; } return; } // Vector double is centered to have mean 0. -void Calc_Cor(vector<vector<double> > &X_mat, vector<double> &cov_vec) { +void Calc_Cor(vector<vector<double>> &X_mat, vector<double> &cov_vec) { cov_vec.clear(); double v1, v2, r; - vector<double> x_vec=X_mat[0]; + vector<double> x_vec = X_mat[0]; lapack_ddot(x_vec, x_vec, v1); - cov_vec.push_back(v1/(double)x_vec.size() ); + cov_vec.push_back(v1 / (double)x_vec.size()); - for (size_t i=1; i<X_mat.size(); i++) { + for (size_t i = 1; i < X_mat.size(); i++) { lapack_ddot(X_mat[i], x_vec, r); lapack_ddot(X_mat[i], X_mat[i], v2); - r/=sqrt(v1*v2); + r /= sqrt(v1 * v2); cov_vec.push_back(r); } @@ -214,10 +246,10 @@ void Calc_Cor(vector<vector<double> > &X_mat, vector<double> &cov_vec) { // window_size (which can vary if cM was used) read bimbam mean // genotype file and calculate the covariance matrix for neighboring // SNPs output values at 10000-SNP-interval. -void VARCOV::AnalyzeBimbam () { - igzstream infile (file_geno.c_str(), igzstream::in); +void VARCOV::AnalyzeBimbam() { + igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { - cout<<"error reading genotype file:"<<file_geno<<endl; + cout << "error reading genotype file:" << file_geno << endl; return; } @@ -225,58 +257,64 @@ void VARCOV::AnalyzeBimbam () { vector<SNPINFO> snpInfo_sub; CalcNB(snpInfo); - size_t ni_test=0; - for (size_t i=0; i<indicator_idv.size(); i++) { - ni_test+=indicator_idv[i]; + size_t ni_test = 0; + for (size_t i = 0; i < indicator_idv.size(); i++) { + ni_test += indicator_idv[i]; } - gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *geno = gsl_vector_alloc(ni_test); double geno_mean; vector<double> x_vec, cov_vec; - vector<vector<double> > X_mat, Cov_mat; + vector<vector<double>> X_mat, Cov_mat; - for (size_t i=0; i<ni_test; i++) { + for (size_t i = 0; i < ni_test; i++) { x_vec.push_back(0); } - WriteCov (0, snpInfo_sub, Cov_mat); + WriteCov(0, snpInfo_sub, Cov_mat); - size_t t2=0, inc; - int n_nb=0; + size_t t2 = 0, inc; + int n_nb = 0; - for (size_t t=0; t<indicator_snp.size(); ++t) { - if (t%d_pace==0 || t==(indicator_snp.size()-1)) - {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} - if (indicator_snp[t]==0) {continue;} + for (size_t t = 0; t < indicator_snp.size(); ++t) { + if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) { + ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + } + if (indicator_snp[t] == 0) { + continue; + } - if (X_mat.size()==0) { - n_nb=snpInfo[t].n_nb+1; + if (X_mat.size() == 0) { + n_nb = snpInfo[t].n_nb + 1; } else { - n_nb=snpInfo[t].n_nb-n_nb+1; + n_nb = snpInfo[t].n_nb - n_nb + 1; } - for (int i=0; i<n_nb; i++) { - if (X_mat.size()==0) {t2=t;} + for (int i = 0; i < n_nb; i++) { + if (X_mat.size() == 0) { + t2 = t; + } // Read a line of the snp is filtered out. - inc=0; - while (t2<indicator_snp.size() && indicator_snp[t2]==0) { - t2++; inc++; + inc = 0; + while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) { + t2++; + inc++; } - Bimbam_ReadOneSNP (inc, indicator_idv, infile, geno, geno_mean); - gsl_vector_add_constant (geno, -1.0*geno_mean); + Bimbam_ReadOneSNP(inc, indicator_idv, infile, geno, geno_mean); + gsl_vector_add_constant(geno, -1.0 * geno_mean); - for (size_t j=0; j<geno->size; j++) { - x_vec[j]=gsl_vector_get(geno, j); + for (size_t j = 0; j < geno->size; j++) { + x_vec[j] = gsl_vector_get(geno, j); } X_mat.push_back(x_vec); t2++; } - n_nb=snpInfo[t].n_nb; + n_nb = snpInfo[t].n_nb; Calc_Cor(X_mat, cov_vec); Cov_mat.push_back(cov_vec); @@ -285,15 +323,15 @@ void VARCOV::AnalyzeBimbam () { X_mat.erase(X_mat.begin()); // Write out var/cov values. - if (Cov_mat.size()==10000) { - WriteCov (1, snpInfo_sub, Cov_mat); + if (Cov_mat.size() == 10000) { + WriteCov(1, snpInfo_sub, Cov_mat); Cov_mat.clear(); snpInfo_sub.clear(); } } - if (Cov_mat.size()!=0) { - WriteCov (1, snpInfo_sub, Cov_mat); + if (Cov_mat.size() != 0) { + WriteCov(1, snpInfo_sub, Cov_mat); Cov_mat.clear(); snpInfo_sub.clear(); } @@ -306,68 +344,76 @@ void VARCOV::AnalyzeBimbam () { return; } -void VARCOV::AnalyzePlink () { - string file_bed=file_bfile+".bed"; - ifstream infile (file_bed.c_str(), ios::binary); - if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;} +void VARCOV::AnalyzePlink() { + string file_bed = file_bfile + ".bed"; + ifstream infile(file_bed.c_str(), ios::binary); + if (!infile) { + cout << "error reading bed file:" << file_bed << endl; + return; + } // Calculate the number of right-hand-side neighbours for each SNP. vector<SNPINFO> snpInfo_sub; CalcNB(snpInfo); - size_t ni_test=0; - for (size_t i=0; i<indicator_idv.size(); i++) { - ni_test+=indicator_idv[i]; + size_t ni_test = 0; + for (size_t i = 0; i < indicator_idv.size(); i++) { + ni_test += indicator_idv[i]; } - gsl_vector *geno=gsl_vector_alloc (ni_test); + gsl_vector *geno = gsl_vector_alloc(ni_test); double geno_mean; vector<double> x_vec, cov_vec; - vector<vector<double> > X_mat, Cov_mat; + vector<vector<double>> X_mat, Cov_mat; - for (size_t i=0; i<ni_test; i++) { + for (size_t i = 0; i < ni_test; i++) { x_vec.push_back(0); } - WriteCov (0, snpInfo_sub, Cov_mat); + WriteCov(0, snpInfo_sub, Cov_mat); - size_t t2=0, inc; - int n_nb=0; + size_t t2 = 0, inc; + int n_nb = 0; - for (size_t t=0; t<indicator_snp.size(); ++t) { - if (t%d_pace==0 || t==(indicator_snp.size()-1)) - {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} - if (indicator_snp[t]==0) {continue;} + for (size_t t = 0; t < indicator_snp.size(); ++t) { + if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) { + ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + } + if (indicator_snp[t] == 0) { + continue; + } - if (X_mat.size()==0) { - n_nb=snpInfo[t].n_nb+1; + if (X_mat.size() == 0) { + n_nb = snpInfo[t].n_nb + 1; } else { - n_nb=snpInfo[t].n_nb-n_nb+1; + n_nb = snpInfo[t].n_nb - n_nb + 1; } - for (int i=0; i<n_nb; i++) { - if (X_mat.size()==0) {t2=t;} + for (int i = 0; i < n_nb; i++) { + if (X_mat.size() == 0) { + t2 = t; + } // Read a line if the SNP is filtered out. - inc=0; - while (t2<indicator_snp.size() && indicator_snp[t2]==0) { - t2++; - inc++; + inc = 0; + while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) { + t2++; + inc++; } - Plink_ReadOneSNP (t2, indicator_idv, infile, geno, geno_mean); - gsl_vector_add_constant (geno, -1.0*geno_mean); + Plink_ReadOneSNP(t2, indicator_idv, infile, geno, geno_mean); + gsl_vector_add_constant(geno, -1.0 * geno_mean); - for (size_t j=0; j<geno->size; j++) { - x_vec[j]=gsl_vector_get(geno, j); + for (size_t j = 0; j < geno->size; j++) { + x_vec[j] = gsl_vector_get(geno, j); } X_mat.push_back(x_vec); t2++; } - n_nb=snpInfo[t].n_nb; + n_nb = snpInfo[t].n_nb; Calc_Cor(X_mat, cov_vec); Cov_mat.push_back(cov_vec); @@ -376,15 +422,15 @@ void VARCOV::AnalyzePlink () { X_mat.erase(X_mat.begin()); // Write out var/cov values. - if (Cov_mat.size()==10000) { - WriteCov (1, snpInfo_sub, Cov_mat); + if (Cov_mat.size() == 10000) { + WriteCov(1, snpInfo_sub, Cov_mat); Cov_mat.clear(); snpInfo_sub.clear(); } } - if (Cov_mat.size()!=0) { - WriteCov (1, snpInfo_sub, Cov_mat); + if (Cov_mat.size() != 0) { + WriteCov(1, snpInfo_sub, Cov_mat); Cov_mat.clear(); snpInfo_sub.clear(); } |