about summary refs log tree commit diff
path: root/src/varcov.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/varcov.cpp')
-rw-r--r--src/varcov.cpp482
1 files changed, 482 insertions, 0 deletions
diff --git a/src/varcov.cpp b/src/varcov.cpp
new file mode 100644
index 0000000..fdc6f10
--- /dev/null
+++ b/src/varcov.cpp
@@ -0,0 +1,482 @@
+/*
+	Genome-wide Efficient Mixed Model Association (GEMMA)
+    Copyright (C) 2011  Xiang Zhou
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <iomanip>
+#include <bitset>
+#include <vector>
+#include <map>
+#include <set>
+#include <cstring>
+#include <cmath>
+#include <stdio.h>
+#include <stdlib.h> 
+
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_cdf.h"
+
+#include "lapack.h"
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "param_float.h"
+#include "varcov_float.h"
+#include "io_float.h"
+#include "mathfunc_float.h"
+#else
+#include "param.h"
+#include "varcov.h"
+#include "io.h"
+#include "mathfunc.h"
+#endif
+
+
+using namespace std;
+
+
+
+
+void VARCOV::CopyFromParam (PARAM &cPar)
+{
+	d_pace=cPar.d_pace;
+	
+	file_bfile=cPar.file_bfile;
+	file_geno=cPar.file_geno;
+	file_out=cPar.file_out;
+	path_out=cPar.path_out;
+	
+	time_opt=0.0;
+
+	window_cm=cPar.window_cm;
+	window_bp=cPar.window_bp;
+	window_ns=cPar.window_ns;
+	
+	indicator_idv=cPar.indicator_idv;
+	indicator_snp=cPar.indicator_snp;
+	snpInfo=cPar.snpInfo;
+	
+	return;
+}
+
+
+void VARCOV::CopyToParam (PARAM &cPar)
+{
+	cPar.time_opt=time_opt;
+	return;
+}
+
+
+
+//chr rs ps n_idv allele1 allele0 af var window_size r2 (r2_11,n_11,r2_12,n_12...r2_1m,n_1m)
+void VARCOV::WriteCov (const int flag, const vector<SNPINFO> &snpInfo_sub, const vector<vector<double> > &Cov_mat) 
+{
+  string file_cov;
+  file_cov=path_out+"/"+file_out;
+  file_cov+=".cor.txt";
+
+  ofstream outfile;
+
+  if (flag==0) {
+    outfile.open (file_cov.c_str(), ofstream::out);
+    if (!outfile) {cout<<"error writing file: "<<file_cov<<endl; return;}
+		
+    outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_mis"
+	   <<"\t"<<"n_obs"<<"\t"<<"allele1"<<"\t"<<"allele0"
+	   <<"\t"<<"af"<<"\t"<<"window_size"
+	   <<"\t"<<"var"<<"\t"<<"cor"<<endl;
+  } else {
+    outfile.open (file_cov.c_str(), ofstream::app);
+    if (!outfile) {cout<<"error writing file: "<<file_cov<<endl; return;}
+
+    for (size_t i=0; i<Cov_mat.size(); i++) {
+      outfile<<snpInfo_sub[i].chr<<"\t"<<snpInfo_sub[i].rs_number<<"\t"<<snpInfo_sub[i].base_position<<"\t"<<snpInfo_sub[i].n_miss<<"\t"<<snpInfo_sub[i].n_idv<<"\t"<<snpInfo_sub[i].a_minor<<"\t"<<snpInfo_sub[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo_sub[i].maf<<"\t"<<Cov_mat[i].size()-1<<"\t";
+      outfile<<scientific<<setprecision(6)<<Cov_mat[i][0]<<"\t";
+
+      if (Cov_mat[i].size()==1) {
+	outfile<<"NA";
+      } else {
+	for (size_t j=1; j<Cov_mat[i].size(); j++) {
+	  if (j==(Cov_mat[i].size()-1)) {
+	    outfile<<Cov_mat[i][j];
+	  } else {
+	    outfile<<Cov_mat[i][j]<<",";
+	  }
+	}
+      }
+      
+      outfile<<endl;
+    }
+  }
+	
+  outfile.close();
+  outfile.clear();
+  return;
+}
+
+
+
+
+// sort SNPs first based on chromosomes then based on bp (or cm, if bp is not available)
+//if chr1=chr2 and bp1=bp2, then return with the same order
+bool CompareSNPinfo (const SNPINFO &snpInfo1, const SNPINFO &snpInfo2)
+{
+  int c_chr=snpInfo1.chr.compare(snpInfo2.chr);
+  long int c_bp=snpInfo1.base_position-snpInfo2.base_position;
+
+  if(c_chr<0) {
+    return true;
+  } else if (c_chr>0) {
+    return false;
+  } else {
+    if (c_bp<0) {
+      return true;
+    } else if (c_bp>0) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+}
+
+
+// do not sort SNPs (because gzip files do not support random access)
+// then calculate n_nb, the number of neighbours, for each snp
+void VARCOV::CalcNB (vector<SNPINFO> &snpInfo_sort)
+{
+  //  stable_sort(snpInfo_sort.begin(), snpInfo_sort.end(), CompareSNPinfo);
+
+  size_t t2=0, n_nb=0;
+  for (size_t t=0; t<indicator_snp.size(); ++t) {
+    if (indicator_snp[t]==0) {continue;}
+    
+    if (snpInfo_sort[t].chr=="-9" || (snpInfo_sort[t].cM==-9 && window_cm!=0) || (snpInfo_sort[t].base_position==-9 && window_bp!=0) ) {
+      snpInfo_sort[t].n_nb=0; continue;
+    }
+
+    if (t==indicator_snp.size()-1) {snpInfo_sort[t].n_nb=0; continue;}
+
+    t2=t+1; n_nb=0;
+
+    while (t2<indicator_snp.size() && snpInfo_sort[t2].chr==snpInfo_sort[t].chr  && indicator_snp[t2]==0) {t2++;}
+
+    while (t2<indicator_snp.size() && snpInfo_sort[t2].chr==snpInfo_sort[t].chr && (snpInfo_sort[t2].cM-snpInfo_sort[t].cM<window_cm || window_cm==0) && (snpInfo_sort[t2].base_position-snpInfo_sort[t].base_position<window_bp || window_bp==0) && (n_nb<window_ns|| window_ns==0)  ) {
+      t2++; n_nb++;
+      while (t2<indicator_snp.size() && snpInfo_sort[t2].chr==snpInfo_sort[t].chr && indicator_snp[t2]==0) {t2++;}
+    }
+
+    snpInfo_sort[t].n_nb=n_nb;
+  }
+
+  return;
+}
+
+
+
+//vector double is centered to have mean 0
+void Calc_Cor(vector<vector<double> > &X_mat, vector<double> &cov_vec)
+{
+  cov_vec.clear();
+
+  double v1, v2, r;
+  vector<double> x_vec=X_mat[0];
+
+  lapack_ddot(x_vec, x_vec, v1);
+  cov_vec.push_back(v1/(double)x_vec.size() );
+
+  for (size_t i=1; i<X_mat.size(); i++) {
+    lapack_ddot(X_mat[i], x_vec, r);
+    lapack_ddot(X_mat[i], X_mat[i], v2);
+    r/=sqrt(v1*v2);
+
+    cov_vec.push_back(r);
+  }
+
+  return;
+} 
+
+/*
+//somehow this can produce nan for some snps; perhaps due to missing values?
+//vector int is not centered, and have 0/1/2 values only
+//missing value is -9; to calculate covariance, these values are replaced by 
+void Calc_Cor(const vector<vector<int> > &X_mat, vector<double> &cov_vec)
+{
+  cov_vec.clear();
+
+  int d1, d2, m1, m2, n1, n2, n12;
+  double m1d, m2d, m12d, v1d, v2d, v;
+
+  vector<int> x_vec=X_mat[0];
+  
+  //calculate m2
+  m2=0; n2=0;
+  for (size_t j=0; j<x_vec.size(); j++) {
+    d2=x_vec[j];
+    if (d2==-9) {continue;}
+    m2+=d2; n2++;
+  }
+  m2d=(double)m2/(double)n2;
+
+  for (size_t i=0; i<X_mat.size(); i++) {
+    //calculate m1
+    m1=0; n1=0;
+    for (size_t j=0; j<x_vec.size(); j++) {
+      d1=X_mat[i][j];
+      if (d1==-9) {continue;}
+      m1+=d1; n1++;
+    }
+    m1d=(double)m1/(double)n1;
+
+    //calculate v1
+    m1=0; m2=0; m12d=0; n12=0;
+    for (size_t j=0; j<x_vec.size(); j++) {
+      d1=X_mat[i][j];
+      if (d1==-9) {
+	n12++;
+      } else if (d1!=0) {
+	if (d1==1) {m12d+=1;} else if (d1==2) {m12d+=4;} else {m12d+=(double)(d1*d1);}
+      }
+    }
+    
+    v1d=((double)m12d+m1d*(double)m2+m2d*(double)m1+(double)n12*m1d*m2d)/(double)x_vec.size()-m1d*m2d;
+
+    //calculate covariance
+    if (i!=0) {
+      m1=0; m2=0; m12d=0; n12=0;
+      for (size_t j=0; j<x_vec.size(); j++) {
+	d1=X_mat[i][j]; d2=x_vec[j];
+	if (d1==-9 && d2==-9) {
+	  n12++;
+	} else if (d1==-9) {
+	  m2+=d2;
+	} else if (d2==-9) {
+	  m1+=d1;
+	} else if (d1!=0 && d2!=0) {
+	  if (d1==1) {m12d+=d2;} else if (d1==2) {m12d+=d2+d2;} else {m12d+=(double)(d1*d2);}
+	}
+      }
+    
+      v=((double)m12d+m1d*(double)m2+m2d*(double)m1+(double)n12*m1d*m2d)/(double)x_vec.size()-m1d*m2d;
+      v/=sqrt(v1d*v2d);
+    } else {
+      v2d=v1d;
+      v=v1d/(double)x_vec.size();
+    }
+    
+    cov_vec.push_back(v);
+  }
+  return;
+} 
+*/
+
+
+//read the genotype file again, calculate r2 between pairs of SNPs within a window, output the file every 10K SNPs
+//the output results are sorted based on chr and bp
+//output format similar to assoc.txt files (remember n_miss is replaced by n_idv)
+
+//r2 between the current SNP and every following SNPs within the window_size (which can vary if cM was used)
+//read bimbam mean genotype file and calculate the covariance matrix for neighboring SNPs
+//output values at 10000-SNP-interval
+void VARCOV::AnalyzeBimbam ()
+{
+  igzstream infile (file_geno.c_str(), igzstream::in);
+  if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+
+  //calculate the number of right-hand-side neighbours for each SNP
+  vector<SNPINFO> snpInfo_sub;
+  CalcNB(snpInfo);
+
+  size_t ni_test=0;
+  for (size_t i=0; i<indicator_idv.size(); i++) {
+    ni_test+=indicator_idv[i];
+  }
+  
+  gsl_vector *geno=gsl_vector_alloc (ni_test);
+  double geno_mean;
+
+  vector<double> x_vec, cov_vec;
+  vector<vector<double> > X_mat, Cov_mat;
+
+  for (size_t i=0; i<ni_test; i++) {
+    x_vec.push_back(0);
+  }
+
+  WriteCov (0, snpInfo_sub, Cov_mat);
+
+  size_t t2=0, inc;
+  int n_nb=0;
+
+  for (size_t t=0; t<indicator_snp.size(); ++t) {
+    if (t%d_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+    if (indicator_snp[t]==0) {continue;}
+    //    if (t>=2) {break;}    
+
+    if (X_mat.size()==0) {
+      n_nb=snpInfo[t].n_nb+1;
+    } else {
+      n_nb=snpInfo[t].n_nb-n_nb+1;       
+    }
+
+    for (int i=0; i<n_nb; i++) {
+      if (X_mat.size()==0) {t2=t;} 
+
+      //read a line of the snp is filtered out
+      inc=0;
+      while (t2<indicator_snp.size() && indicator_snp[t2]==0) {
+	t2++; inc++; 
+      }
+
+      Bimbam_ReadOneSNP (inc, indicator_idv, infile, geno, geno_mean);
+      gsl_vector_add_constant (geno, -1.0*geno_mean);
+            
+      for (size_t j=0; j<geno->size; j++) {
+	x_vec[j]=gsl_vector_get(geno, j);
+      }
+      X_mat.push_back(x_vec);
+
+      t2++;
+    }     
+    
+    n_nb=snpInfo[t].n_nb;
+
+    Calc_Cor(X_mat, cov_vec);
+    Cov_mat.push_back(cov_vec);
+    snpInfo_sub.push_back(snpInfo[t]);
+
+    X_mat.erase(X_mat.begin());
+
+    //write out var/cov values
+    if (Cov_mat.size()==10000) {
+      WriteCov (1, snpInfo_sub, Cov_mat);
+      Cov_mat.clear();
+      snpInfo_sub.clear();
+    }
+  }
+
+  if (Cov_mat.size()!=0) {
+    WriteCov (1, snpInfo_sub, Cov_mat);
+    Cov_mat.clear();
+    snpInfo_sub.clear();
+  }
+
+  gsl_vector_free(geno);
+
+  infile.close();
+  infile.clear();	
+	
+  return;
+}
+
+
+
+
+void VARCOV::AnalyzePlink ()
+{
+  string file_bed=file_bfile+".bed";
+  ifstream infile (file_bed.c_str(), ios::binary);
+  if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+
+  //calculate the number of right-hand-side neighbours for each SNP
+  vector<SNPINFO> snpInfo_sub;
+  CalcNB(snpInfo);
+  
+  size_t ni_test=0;
+  for (size_t i=0; i<indicator_idv.size(); i++) {
+    ni_test+=indicator_idv[i];
+  }
+
+  gsl_vector *geno=gsl_vector_alloc (ni_test);
+  double geno_mean;
+
+  vector<double> x_vec, cov_vec;
+  vector<vector<double> > X_mat, Cov_mat;
+
+  for (size_t i=0; i<ni_test; i++) {
+    x_vec.push_back(0);
+  }
+
+  WriteCov (0, snpInfo_sub, Cov_mat);
+
+  size_t t2=0, inc;
+  int n_nb=0;
+
+  for (size_t t=0; t<indicator_snp.size(); ++t) {
+    if (t%d_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs  ", t, indicator_snp.size()-1);}
+    if (indicator_snp[t]==0) {continue;}
+    //    if (t>=2) {break;}    
+
+    if (X_mat.size()==0) {
+      n_nb=snpInfo[t].n_nb+1;
+    } else {
+      n_nb=snpInfo[t].n_nb-n_nb+1;       
+    }
+
+    for (int i=0; i<n_nb; i++) {
+      if (X_mat.size()==0) {t2=t;} 
+
+      //read a line of the snp is filtered out
+      inc=0;
+      while (t2<indicator_snp.size() && indicator_snp[t2]==0) {
+	t2++; inc++; 
+      }
+
+      Plink_ReadOneSNP (t2, indicator_idv, infile, geno, geno_mean);
+      gsl_vector_add_constant (geno, -1.0*geno_mean);
+            
+      for (size_t j=0; j<geno->size; j++) {
+	x_vec[j]=gsl_vector_get(geno, j);
+      }
+      X_mat.push_back(x_vec);
+
+      t2++;
+    }     
+    
+    n_nb=snpInfo[t].n_nb;
+
+    Calc_Cor(X_mat, cov_vec);
+    Cov_mat.push_back(cov_vec);
+    snpInfo_sub.push_back(snpInfo[t]);
+
+    X_mat.erase(X_mat.begin());
+
+    //write out var/cov values
+    if (Cov_mat.size()==10000) {
+      WriteCov (1, snpInfo_sub, Cov_mat);
+      Cov_mat.clear();
+      snpInfo_sub.clear();
+    }
+  }
+
+  if (Cov_mat.size()!=0) {
+    WriteCov (1, snpInfo_sub, Cov_mat);
+    Cov_mat.clear();
+    snpInfo_sub.clear();
+  }
+
+  gsl_vector_free(geno);
+
+  infile.close();
+  infile.clear();	
+	
+  return;
+}