about summary refs log tree commit diff
path: root/src/varcov.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/varcov.cpp')
-rw-r--r--src/varcov.cpp386
1 files changed, 216 insertions, 170 deletions
diff --git a/src/varcov.cpp b/src/varcov.cpp
index 46b5bf8..0f87ba8 100644
--- a/src/varcov.cpp
+++ b/src/varcov.cpp
@@ -16,103 +16,126 @@
     along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 
-#include <iostream>
+#include <bitset>
+#include <cmath>
+#include <cstring>
 #include <fstream>
-#include <sstream>
-#include <string>
 #include <iomanip>
-#include <bitset>
-#include <vector>
+#include <iostream>
 #include <map>
 #include <set>
-#include <cstring>
-#include <cmath>
+#include <sstream>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string>
+#include <vector>
 
-#include "gsl/gsl_vector.h"
-#include "gsl/gsl_matrix.h"
-#include "gsl/gsl_linalg.h"
 #include "gsl/gsl_blas.h"
 #include "gsl/gsl_cdf.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_vector.h"
 
-#include "lapack.h"
 #include "gzstream.h"
-#include "param.h"
-#include "varcov.h"
 #include "io.h"
+#include "lapack.h"
 #include "mathfunc.h"
+#include "param.h"
+#include "varcov.h"
 
 using namespace std;
 
-void VARCOV::CopyFromParam (PARAM &cPar) {
-	d_pace=cPar.d_pace;
+void VARCOV::CopyFromParam(PARAM &cPar) {
+  d_pace = cPar.d_pace;
 
-	file_bfile=cPar.file_bfile;
-	file_geno=cPar.file_geno;
-	file_out=cPar.file_out;
-	path_out=cPar.path_out;
+  file_bfile = cPar.file_bfile;
+  file_geno = cPar.file_geno;
+  file_out = cPar.file_out;
+  path_out = cPar.path_out;
 
-	time_opt=0.0;
+  time_opt = 0.0;
 
-	window_cm=cPar.window_cm;
-	window_bp=cPar.window_bp;
-	window_ns=cPar.window_ns;
+  window_cm = cPar.window_cm;
+  window_bp = cPar.window_bp;
+  window_ns = cPar.window_ns;
 
-	indicator_idv=cPar.indicator_idv;
-	indicator_snp=cPar.indicator_snp;
-	snpInfo=cPar.snpInfo;
+  indicator_idv = cPar.indicator_idv;
+  indicator_snp = cPar.indicator_snp;
+  snpInfo = cPar.snpInfo;
 
-	return;
+  return;
 }
 
-void VARCOV::CopyToParam (PARAM &cPar) {
-	cPar.time_opt=time_opt;
-	return;
+void VARCOV::CopyToParam(PARAM &cPar) {
+  cPar.time_opt = time_opt;
+  return;
 }
 
-void VARCOV::WriteCov (const int flag, const vector<SNPINFO> &snpInfo_sub,
-		       const vector<vector<double> > &Cov_mat) {
+void VARCOV::WriteCov(const int flag, const vector<SNPINFO> &snpInfo_sub,
+                      const vector<vector<double>> &Cov_mat) {
   string file_cov;
-  file_cov=path_out+"/"+file_out;
-  file_cov+=".cor.txt";
+  file_cov = path_out + "/" + file_out;
+  file_cov += ".cor.txt";
 
   ofstream outfile;
 
-  if (flag==0) {
-    outfile.open (file_cov.c_str(), ofstream::out);
-    if (!outfile) {cout<<"error writing file: "<<file_cov<<endl; return;}
+  if (flag == 0) {
+    outfile.open(file_cov.c_str(), ofstream::out);
+    if (!outfile) {
+      cout << "error writing file: " << file_cov << endl;
+      return;
+    }
 
-    outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_mis"
-	   <<"\t"<<"n_obs"<<"\t"<<"allele1"<<"\t"<<"allele0"
-	   <<"\t"<<"af"<<"\t"<<"window_size"
-	   <<"\t"<<"var"<<"\t"<<"cor"<<endl;
+    outfile << "chr"
+            << "\t"
+            << "rs"
+            << "\t"
+            << "ps"
+            << "\t"
+            << "n_mis"
+            << "\t"
+            << "n_obs"
+            << "\t"
+            << "allele1"
+            << "\t"
+            << "allele0"
+            << "\t"
+            << "af"
+            << "\t"
+            << "window_size"
+            << "\t"
+            << "var"
+            << "\t"
+            << "cor" << endl;
   } else {
-    outfile.open (file_cov.c_str(), ofstream::app);
-    if (!outfile) {cout<<"error writing file: "<<file_cov<<endl; return;}
-
-    for (size_t i=0; i<Cov_mat.size(); i++) {
-      outfile << snpInfo_sub[i].chr << "\t" << snpInfo_sub[i].rs_number <<
-	"\t" << snpInfo_sub[i].base_position << "\t" <<
-	snpInfo_sub[i].n_miss << "\t" << snpInfo_sub[i].n_idv << "\t" <<
-	snpInfo_sub[i].a_minor << "\t" << snpInfo_sub[i].a_major << "\t" <<
-	fixed << setprecision(3) << snpInfo_sub[i].maf << "\t" <<
-	Cov_mat[i].size()-1 << "\t";
-      outfile<<scientific<<setprecision(6)<<Cov_mat[i][0]<<"\t";
-
-      if (Cov_mat[i].size()==1) {
-	outfile<<"NA";
+    outfile.open(file_cov.c_str(), ofstream::app);
+    if (!outfile) {
+      cout << "error writing file: " << file_cov << endl;
+      return;
+    }
+
+    for (size_t i = 0; i < Cov_mat.size(); i++) {
+      outfile << snpInfo_sub[i].chr << "\t" << snpInfo_sub[i].rs_number << "\t"
+              << snpInfo_sub[i].base_position << "\t" << snpInfo_sub[i].n_miss
+              << "\t" << snpInfo_sub[i].n_idv << "\t" << snpInfo_sub[i].a_minor
+              << "\t" << snpInfo_sub[i].a_major << "\t" << fixed
+              << setprecision(3) << snpInfo_sub[i].maf << "\t"
+              << Cov_mat[i].size() - 1 << "\t";
+      outfile << scientific << setprecision(6) << Cov_mat[i][0] << "\t";
+
+      if (Cov_mat[i].size() == 1) {
+        outfile << "NA";
       } else {
-	for (size_t j=1; j<Cov_mat[i].size(); j++) {
-	  if (j==(Cov_mat[i].size()-1)) {
-	    outfile<<Cov_mat[i][j];
-	  } else {
-	    outfile<<Cov_mat[i][j]<<",";
-	  }
-	}
+        for (size_t j = 1; j < Cov_mat[i].size(); j++) {
+          if (j == (Cov_mat[i].size() - 1)) {
+            outfile << Cov_mat[i][j];
+          } else {
+            outfile << Cov_mat[i][j] << ",";
+          }
+        }
       }
 
-      outfile<<endl;
+      outfile << endl;
     }
   }
 
@@ -121,18 +144,18 @@ void VARCOV::WriteCov (const int flag, const vector<SNPINFO> &snpInfo_sub,
   return;
 }
 
-bool CompareSNPinfo (const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) {
-  int c_chr=snpInfo1.chr.compare(snpInfo2.chr);
-  long int c_bp=snpInfo1.base_position-snpInfo2.base_position;
+bool CompareSNPinfo(const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) {
+  int c_chr = snpInfo1.chr.compare(snpInfo2.chr);
+  long int c_bp = snpInfo1.base_position - snpInfo2.base_position;
 
-  if(c_chr<0) {
+  if (c_chr < 0) {
     return true;
-  } else if (c_chr>0) {
+  } else if (c_chr > 0) {
     return false;
   } else {
-    if (c_bp<0) {
+    if (c_bp < 0) {
       return true;
-    } else if (c_bp>0) {
+    } else if (c_bp > 0) {
       return false;
     } else {
       return true;
@@ -140,64 +163,73 @@ bool CompareSNPinfo (const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) {
   }
 }
 
-
 // Do not sort SNPs (because gzip files do not support random access)
 // then calculate n_nb, the number of neighbours, for each SNP.
-void VARCOV::CalcNB (vector<SNPINFO> &snpInfo_sort) {
-  size_t t2=0, n_nb=0;
-  for (size_t t=0; t<indicator_snp.size(); ++t) {
-    if (indicator_snp[t]==0) {continue;}
-
-    if (snpInfo_sort[t].chr=="-9" ||
-	(snpInfo_sort[t].cM==-9 && window_cm!=0) ||
-	(snpInfo_sort[t].base_position==-9 && window_bp!=0) ) {
-      snpInfo_sort[t].n_nb=0; continue;
+void VARCOV::CalcNB(vector<SNPINFO> &snpInfo_sort) {
+  size_t t2 = 0, n_nb = 0;
+  for (size_t t = 0; t < indicator_snp.size(); ++t) {
+    if (indicator_snp[t] == 0) {
+      continue;
+    }
+
+    if (snpInfo_sort[t].chr == "-9" ||
+        (snpInfo_sort[t].cM == -9 && window_cm != 0) ||
+        (snpInfo_sort[t].base_position == -9 && window_bp != 0)) {
+      snpInfo_sort[t].n_nb = 0;
+      continue;
     }
 
-    if (t==indicator_snp.size()-1) {snpInfo_sort[t].n_nb=0; continue;}
+    if (t == indicator_snp.size() - 1) {
+      snpInfo_sort[t].n_nb = 0;
+      continue;
+    }
 
-    t2=t+1; n_nb=0;
+    t2 = t + 1;
+    n_nb = 0;
 
-    while (t2<indicator_snp.size() &&
-	   snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
-	   indicator_snp[t2]==0) {
+    while (t2 < indicator_snp.size() &&
+           snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
+           indicator_snp[t2] == 0) {
       t2++;
     }
 
-    while (t2<indicator_snp.size() &&
-	   snpInfo_sort[t2].chr==snpInfo_sort[t].chr &&
-	   (snpInfo_sort[t2].cM-snpInfo_sort[t].cM<window_cm ||
-	    window_cm==0) &&
-	   (snpInfo_sort[t2].base_position-snpInfo_sort[t].base_position <
-	    window_bp || window_bp==0) && (n_nb<window_ns|| window_ns==0)) {
-      t2++; n_nb++;
-      while (t2<indicator_snp.size() &&
-	     snpInfo_sort[t2].chr==snpInfo_sort[t].chr &&
-	     indicator_snp[t2]==0) {
-	t2++;
+    while (t2 < indicator_snp.size() &&
+           snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
+           (snpInfo_sort[t2].cM - snpInfo_sort[t].cM < window_cm ||
+            window_cm == 0) &&
+           (snpInfo_sort[t2].base_position - snpInfo_sort[t].base_position <
+                window_bp ||
+            window_bp == 0) &&
+           (n_nb < window_ns || window_ns == 0)) {
+      t2++;
+      n_nb++;
+      while (t2 < indicator_snp.size() &&
+             snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
+             indicator_snp[t2] == 0) {
+        t2++;
       }
     }
 
-    snpInfo_sort[t].n_nb=n_nb;
+    snpInfo_sort[t].n_nb = n_nb;
   }
 
   return;
 }
 
 // Vector double is centered to have mean 0.
-void Calc_Cor(vector<vector<double> > &X_mat, vector<double> &cov_vec) {
+void Calc_Cor(vector<vector<double>> &X_mat, vector<double> &cov_vec) {
   cov_vec.clear();
 
   double v1, v2, r;
-  vector<double> x_vec=X_mat[0];
+  vector<double> x_vec = X_mat[0];
 
   lapack_ddot(x_vec, x_vec, v1);
-  cov_vec.push_back(v1/(double)x_vec.size() );
+  cov_vec.push_back(v1 / (double)x_vec.size());
 
-  for (size_t i=1; i<X_mat.size(); i++) {
+  for (size_t i = 1; i < X_mat.size(); i++) {
     lapack_ddot(X_mat[i], x_vec, r);
     lapack_ddot(X_mat[i], X_mat[i], v2);
-    r/=sqrt(v1*v2);
+    r /= sqrt(v1 * v2);
 
     cov_vec.push_back(r);
   }
@@ -214,10 +246,10 @@ void Calc_Cor(vector<vector<double> > &X_mat, vector<double> &cov_vec) {
 // window_size (which can vary if cM was used) read bimbam mean
 // genotype file and calculate the covariance matrix for neighboring
 // SNPs output values at 10000-SNP-interval.
-void VARCOV::AnalyzeBimbam () {
-  igzstream infile (file_geno.c_str(), igzstream::in);
+void VARCOV::AnalyzeBimbam() {
+  igzstream infile(file_geno.c_str(), igzstream::in);
   if (!infile) {
-    cout<<"error reading genotype file:"<<file_geno<<endl;
+    cout << "error reading genotype file:" << file_geno << endl;
     return;
   }
 
@@ -225,58 +257,64 @@ void VARCOV::AnalyzeBimbam () {
   vector<SNPINFO> snpInfo_sub;
   CalcNB(snpInfo);
 
-  size_t ni_test=0;
-  for (size_t i=0; i<indicator_idv.size(); i++) {
-    ni_test+=indicator_idv[i];
+  size_t ni_test = 0;
+  for (size_t i = 0; i < indicator_idv.size(); i++) {
+    ni_test += indicator_idv[i];
   }
 
-  gsl_vector *geno=gsl_vector_alloc (ni_test);
+  gsl_vector *geno = gsl_vector_alloc(ni_test);
   double geno_mean;
 
   vector<double> x_vec, cov_vec;
-  vector<vector<double> > X_mat, Cov_mat;
+  vector<vector<double>> X_mat, Cov_mat;
 
-  for (size_t i=0; i<ni_test; i++) {
+  for (size_t i = 0; i < ni_test; i++) {
     x_vec.push_back(0);
   }
 
-  WriteCov (0, snpInfo_sub, Cov_mat);
+  WriteCov(0, snpInfo_sub, Cov_mat);
 
-  size_t t2=0, inc;
-  int n_nb=0;
+  size_t t2 = 0, inc;
+  int n_nb = 0;
 
-  for (size_t t=0; t<indicator_snp.size(); ++t) {
-    if (t%d_pace==0 || t==(indicator_snp.size()-1))
-      {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
-    if (indicator_snp[t]==0) {continue;}
+  for (size_t t = 0; t < indicator_snp.size(); ++t) {
+    if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) {
+      ProgressBar("Reading SNPs  ", t, indicator_snp.size() - 1);
+    }
+    if (indicator_snp[t] == 0) {
+      continue;
+    }
 
-    if (X_mat.size()==0) {
-      n_nb=snpInfo[t].n_nb+1;
+    if (X_mat.size() == 0) {
+      n_nb = snpInfo[t].n_nb + 1;
     } else {
-      n_nb=snpInfo[t].n_nb-n_nb+1;
+      n_nb = snpInfo[t].n_nb - n_nb + 1;
     }
 
-    for (int i=0; i<n_nb; i++) {
-      if (X_mat.size()==0) {t2=t;}
+    for (int i = 0; i < n_nb; i++) {
+      if (X_mat.size() == 0) {
+        t2 = t;
+      }
 
       // Read a line of the snp is filtered out.
-      inc=0;
-      while (t2<indicator_snp.size() && indicator_snp[t2]==0) {
-	t2++; inc++;
+      inc = 0;
+      while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) {
+        t2++;
+        inc++;
       }
 
-      Bimbam_ReadOneSNP (inc, indicator_idv, infile, geno, geno_mean);
-      gsl_vector_add_constant (geno, -1.0*geno_mean);
+      Bimbam_ReadOneSNP(inc, indicator_idv, infile, geno, geno_mean);
+      gsl_vector_add_constant(geno, -1.0 * geno_mean);
 
-      for (size_t j=0; j<geno->size; j++) {
-	x_vec[j]=gsl_vector_get(geno, j);
+      for (size_t j = 0; j < geno->size; j++) {
+        x_vec[j] = gsl_vector_get(geno, j);
       }
       X_mat.push_back(x_vec);
 
       t2++;
     }
 
-    n_nb=snpInfo[t].n_nb;
+    n_nb = snpInfo[t].n_nb;
 
     Calc_Cor(X_mat, cov_vec);
     Cov_mat.push_back(cov_vec);
@@ -285,15 +323,15 @@ void VARCOV::AnalyzeBimbam () {
     X_mat.erase(X_mat.begin());
 
     // Write out var/cov values.
-    if (Cov_mat.size()==10000) {
-      WriteCov (1, snpInfo_sub, Cov_mat);
+    if (Cov_mat.size() == 10000) {
+      WriteCov(1, snpInfo_sub, Cov_mat);
       Cov_mat.clear();
       snpInfo_sub.clear();
     }
   }
 
-  if (Cov_mat.size()!=0) {
-    WriteCov (1, snpInfo_sub, Cov_mat);
+  if (Cov_mat.size() != 0) {
+    WriteCov(1, snpInfo_sub, Cov_mat);
     Cov_mat.clear();
     snpInfo_sub.clear();
   }
@@ -306,68 +344,76 @@ void VARCOV::AnalyzeBimbam () {
   return;
 }
 
-void VARCOV::AnalyzePlink () {
-  string file_bed=file_bfile+".bed";
-  ifstream infile (file_bed.c_str(), ios::binary);
-  if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+void VARCOV::AnalyzePlink() {
+  string file_bed = file_bfile + ".bed";
+  ifstream infile(file_bed.c_str(), ios::binary);
+  if (!infile) {
+    cout << "error reading bed file:" << file_bed << endl;
+    return;
+  }
 
   // Calculate the number of right-hand-side neighbours for each SNP.
   vector<SNPINFO> snpInfo_sub;
   CalcNB(snpInfo);
 
-  size_t ni_test=0;
-  for (size_t i=0; i<indicator_idv.size(); i++) {
-    ni_test+=indicator_idv[i];
+  size_t ni_test = 0;
+  for (size_t i = 0; i < indicator_idv.size(); i++) {
+    ni_test += indicator_idv[i];
   }
 
-  gsl_vector *geno=gsl_vector_alloc (ni_test);
+  gsl_vector *geno = gsl_vector_alloc(ni_test);
   double geno_mean;
 
   vector<double> x_vec, cov_vec;
-  vector<vector<double> > X_mat, Cov_mat;
+  vector<vector<double>> X_mat, Cov_mat;
 
-  for (size_t i=0; i<ni_test; i++) {
+  for (size_t i = 0; i < ni_test; i++) {
     x_vec.push_back(0);
   }
 
-  WriteCov (0, snpInfo_sub, Cov_mat);
+  WriteCov(0, snpInfo_sub, Cov_mat);
 
-  size_t t2=0, inc;
-  int n_nb=0;
+  size_t t2 = 0, inc;
+  int n_nb = 0;
 
-  for (size_t t=0; t<indicator_snp.size(); ++t) {
-    if (t%d_pace==0 || t==(indicator_snp.size()-1))
-      {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
-    if (indicator_snp[t]==0) {continue;}
+  for (size_t t = 0; t < indicator_snp.size(); ++t) {
+    if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) {
+      ProgressBar("Reading SNPs  ", t, indicator_snp.size() - 1);
+    }
+    if (indicator_snp[t] == 0) {
+      continue;
+    }
 
-    if (X_mat.size()==0) {
-      n_nb=snpInfo[t].n_nb+1;
+    if (X_mat.size() == 0) {
+      n_nb = snpInfo[t].n_nb + 1;
     } else {
-      n_nb=snpInfo[t].n_nb-n_nb+1;
+      n_nb = snpInfo[t].n_nb - n_nb + 1;
     }
 
-    for (int i=0; i<n_nb; i++) {
-      if (X_mat.size()==0) {t2=t;}
+    for (int i = 0; i < n_nb; i++) {
+      if (X_mat.size() == 0) {
+        t2 = t;
+      }
 
       // Read a line if the SNP is filtered out.
-      inc=0;
-      while (t2<indicator_snp.size() && indicator_snp[t2]==0) {
-	t2++;
-	inc++;
+      inc = 0;
+      while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) {
+        t2++;
+        inc++;
       }
 
-      Plink_ReadOneSNP (t2, indicator_idv, infile, geno, geno_mean);
-      gsl_vector_add_constant (geno, -1.0*geno_mean);
+      Plink_ReadOneSNP(t2, indicator_idv, infile, geno, geno_mean);
+      gsl_vector_add_constant(geno, -1.0 * geno_mean);
 
-      for (size_t j=0; j<geno->size; j++) {
-	x_vec[j]=gsl_vector_get(geno, j);
+      for (size_t j = 0; j < geno->size; j++) {
+        x_vec[j] = gsl_vector_get(geno, j);
       }
       X_mat.push_back(x_vec);
 
       t2++;
     }
 
-    n_nb=snpInfo[t].n_nb;
+    n_nb = snpInfo[t].n_nb;
 
     Calc_Cor(X_mat, cov_vec);
     Cov_mat.push_back(cov_vec);
@@ -376,15 +422,15 @@ void VARCOV::AnalyzePlink () {
     X_mat.erase(X_mat.begin());
 
     // Write out var/cov values.
-    if (Cov_mat.size()==10000) {
-      WriteCov (1, snpInfo_sub, Cov_mat);
+    if (Cov_mat.size() == 10000) {
+      WriteCov(1, snpInfo_sub, Cov_mat);
       Cov_mat.clear();
       snpInfo_sub.clear();
     }
   }
 
-  if (Cov_mat.size()!=0) {
-    WriteCov (1, snpInfo_sub, Cov_mat);
+  if (Cov_mat.size() != 0) {
+    WriteCov(1, snpInfo_sub, Cov_mat);
     Cov_mat.clear();
     snpInfo_sub.clear();
   }