From 17deca2d54827a00df3ea4d98df700fc2b8ed777 Mon Sep 17 00:00:00 2001 From: xiangzhou Date: Sat, 20 Sep 2014 10:17:34 -0400 Subject: initial upload, version 0.95alpha --- io.cpp | 1396 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1396 insertions(+) create mode 100644 io.cpp (limited to 'io.cpp') diff --git a/io.cpp b/io.cpp new file mode 100644 index 0000000..c22f668 --- /dev/null +++ b/io.cpp @@ -0,0 +1,1396 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_cdf.h" + +#include "lapack.h" +#include "gzstream.h" +#include "mathfunc.h" + +#ifdef FORCE_FLOAT +#include "io_float.h" +#else +#include "io.h" +#endif + + +using namespace std; + + + +//Print process bar +void ProgressBar (string str, double p, double total) +{ + double progress = (100.0 * p / total); + int barsize = (int) (progress / 2.0); + char bar[51]; + + cout<sbumpc(); + switch (c) { + case '\n': + return is; + case '\r': + if(sb->sgetc() == '\n') + sb->sbumpc(); + return is; + case EOF: + // Also handle the case when the last line has no line ending + if(t.empty()) + is.setstate(std::ios::eofbit); + return is; + default: + t += (char)c; + } + } +} + +//Read snp file +bool ReadFile_snps (const string &file_snps, set &setSnps) +{ + setSnps.clear(); + + ifstream infile (file_snps.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open snps file: "< &mapRS2chr, map &mapRS2bp, map &mapRS2cM) +{ + mapRS2chr.clear(); + mapRS2bp.clear(); + + ifstream infile (file_anno.c_str(), ifstream::in); + if (!infile) {cout<<"error opening annotation file: "< &indicator_idv, vector &pheno, const int &p_column) +{ + indicator_idv.clear(); + pheno.clear(); + + igzstream infile (file_pheno.c_str(), igzstream::in); +// ifstream infile (file_pheno.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open phenotype file: "< > &indicator_pheno, vector > &pheno, const vector &p_column) +{ + indicator_pheno.clear(); + pheno.clear(); + + igzstream infile (file_pheno.c_str(), igzstream::in); +// ifstream infile (file_pheno.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open phenotype file: "< pheno_row; + vector ind_pheno_row; + + size_t p_max=*max_element(p_column.begin(), p_column.end() ); + map mapP2c; + for (size_t i=0; i &indicator_cvt, vector > &cvt, size_t &n_cvt) +{ + indicator_cvt.clear(); + + ifstream infile (file_cvt.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open covariates file: "< v_d; flag_na=0; + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + while (ch_ptr!=NULL) { + if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;} + else {d=atof(ch_ptr);} + + v_d.push_back(d); + ch_ptr=strtok (NULL, " , \t"); + } + if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} + cvt.push_back(v_d); + } + + if (indicator_cvt.empty()) {n_cvt=0;} + else { + flag_na=0; + for (vector::size_type i=0; i &snpInfo) +{ + snpInfo.clear(); + + ifstream infile (file_bim.c_str(), ifstream::in); + if (!infile) {cout<<"error opening .bim file: "< > &indicator_pheno, vector > &pheno, map &mapID2num, const vector &p_column) +{ + indicator_pheno.clear(); + pheno.clear(); + mapID2num.clear(); + + igzstream infile (file_fam.c_str(), igzstream::in); + //ifstream infile (file_fam.c_str(), ifstream::in); + if (!infile) {cout<<"error opening .fam file: "< pheno_row; + vector ind_pheno_row; + + size_t p_max=*max_element(p_column.begin(), p_column.end() ); + map mapP2c; + for (size_t i=0; i &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map &mapRS2chr, map &mapRS2bp, map &mapRS2cM, vector &snpInfo, size_t &ns_test) +{ + indicator_snp.clear(); + snpInfo.clear(); + + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + double v_x, v_w; + int c_idv=0; + + string line; + char *ch_ptr; + + string rs; + long int b_pos; + string chr; + string major; + string minor; + double cM; + + double maf, geno, geno_old; + size_t n_miss; + size_t n_0, n_1, n_2; + int flag_poly; + + int ni_total=indicator_idv.size(); + int ni_test=0; + for (int i=0; i=0 && geno<=0.5) {n_0++;} + if (geno>0.5 && geno<1.5) {n_1++;} + if (geno>=1.5 && geno<=2.0) {n_2++;} + + gsl_vector_set (genotype, c_idv, geno); + +// if (geno<0) {n_miss++; continue;} + + if (flag_poly==0) {geno_old=geno; flag_poly=2;} + if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} + + maf+=geno; + + c_idv++; + } + maf/=2.0*(double)(ni_test-n_miss); + + SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf}; + snpInfo.push_back(sInfo); + + if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} + + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + + if (flag_poly!=1) {indicator_snp.push_back(0); continue;} + + if (hwe_level!=0) { + if (CalcHWE(n_0, n_2, n_1)size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + } + + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot (genotype, genotype, &v_x); + gsl_blas_ddot (Wtx, WtWiWtx, &v_w); + + if (v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} + + indicator_snp.push_back(1); + ns_test++; + } + + gsl_vector_free (genotype); + gsl_vector_free (genotype_miss); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (Wtx); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); + + infile.close(); + infile.clear(); + + return true; +} + + + + + + +//Read bed file, the first time +bool ReadFile_bed (const string &file_bed, const set &setSnps, const gsl_matrix *W, vector &indicator_idv, vector &indicator_snp, vector &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) +{ + indicator_snp.clear(); + size_t ns_total=snpInfo.size(); + + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); + gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2); + gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2); + gsl_vector *Wtx=gsl_vector_alloc (W->size2); + gsl_vector *WtWiWtx=gsl_vector_alloc (W->size2); + gsl_permutation * pmt=gsl_permutation_alloc (W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp (WtW, pmt, &sig); + LUInvert (WtW, pmt, WtWi); + + double v_x, v_w, geno; + size_t c_idv=0; + + char ch[1]; + bitset<8> b; + + size_t ni_total=indicator_idv.size(); + size_t ni_test=0; + for (size_t i=0; i miss_level) {indicator_snp.push_back(0); continue;} + + if ( (maf (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + + if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;} + + if (hwe_level!=1) { + if (CalcHWE(n_0, n_2, n_1)size; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + } + + gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot (genotype, genotype, &v_x); + gsl_blas_ddot (Wtx, WtWiWtx, &v_w); + + if (v_w/v_x > r2_level) {indicator_snp.push_back(0); continue;} + + indicator_snp.push_back(1); + ns_test++; + } + + gsl_vector_free (genotype); + gsl_vector_free (genotype_miss); + gsl_matrix_free (WtW); + gsl_matrix_free (WtWi); + gsl_vector_free (Wtx); + gsl_vector_free (WtWiWtx); + gsl_permutation_free (pmt); + + infile.close(); + infile.clear(); + + return true; +} + + + +void ReadFile_kin (const string &file_kin, vector &indicator_idv, map &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) +{ + igzstream infile (file_kin.c_str(), igzstream::in); +// ifstream infile (file_kin.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open kinship file: "< mapID2ID; + size_t c=0; + for (size_t i=0; isize1, G->size1, G->size1); + ReadFile_kin (file_kin, indicator_idv, mapID2num, k_mode, error, &G_sub.matrix); + i++; + } + + infile.close(); + infile.clear(); + return; +} + + +void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) +{ + igzstream infile (file_ku.c_str(), igzstream::in); +// ifstream infile (file_ku.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open the U file: "<size1, n_col=U->size2, i_row=0, i_col=0; + + gsl_matrix_set_zero (U); + + string line; + char *ch_ptr; + double d; + + while (getline(infile, line)) { + if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<size, i_row=0; + + gsl_vector_set_zero (eval); + + string line; + char *ch_ptr; + double d; + + while (getline(infile, line)) { + if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<size1; + gsl_vector *geno=gsl_vector_alloc (ni_total); + gsl_vector *geno_miss=gsl_vector_alloc (ni_total); + + size_t ns_test=0; + for (size_t t=0; t &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + size_t n_miss, ci_total; + double d, geno_mean, geno_var; + + size_t ni_total=matrix_kin->size1; + gsl_vector *geno=gsl_vector_alloc (ni_total); + + size_t ns_test=0; + int n_bit; + + //calculate n_bit and c, the number of bit for each snp + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1; } + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + for (size_t t=0; t &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) +{ + igzstream infile (file_geno.c_str(), igzstream::in); +// ifstream infile (file_geno.c_str(), ifstream::in); + if (!infile) {cout<<"error reading genotype file:"<size1); + gsl_vector *genotype_miss=gsl_vector_alloc (UtX->size1); + double geno, geno_mean; + size_t n_miss; + + int ni_total=(int)indicator_idv.size(); + int ns_total=(int)indicator_snp.size(); + int ni_test=UtX->size1; + int ns_test=UtX->size2; + + int c_idv=0, c_snp=0; + + for (int i=0; isize; ++i) { + if (gsl_vector_get (genotype_miss, i)==1) {geno=0;} + else {geno=gsl_vector_get (genotype, i); geno-=geno_mean;} + + gsl_vector_set (genotype, i, geno); + gsl_matrix_set (UtX, i, c_snp, geno); + } + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; isize; ++i) { + for (size_t j=0; j &indicator_idv, vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) +{ + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< b; + + int ni_total=(int)indicator_idv.size(); + int ns_total=(int)indicator_snp.size(); + int ni_test=UtX->size1; + int ns_test=UtX->size2; + int n_bit; + + if (ni_total%4==0) {n_bit=ni_total/4;} + else {n_bit=ni_total/4+1;} + + //print the first three majic numbers + for (int i=0; i<3; ++i) { + infile.read(ch,1); + b=ch[0]; + } + + if (calc_K==true) {gsl_matrix_set_zero (K);} + + gsl_vector *genotype=gsl_vector_alloc (UtX->size1); + + double geno, geno_mean; + size_t n_miss; + int c_idv=0, c_snp=0, c=0; + + //start reading snps and doing association test + for (int t=0; tsize; ++i) { + geno=gsl_vector_get (genotype, i); + if (geno==-9) {geno=0;} + else {geno-=geno_mean;} + + gsl_vector_set (genotype, i, geno); + gsl_matrix_set (UtX, i, c_snp, geno); + } + + if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + + c_snp++; + } + + if (calc_K==true) { + gsl_matrix_scale (K, 1.0/(double)ns_test); + + for (size_t i=0; isize; ++i) { + for (size_t j=0; j &est_column, map &mapRS2est) +{ + mapRS2est.clear(); + + ifstream infile (file_est.c_str(), ifstream::in); + if (!infile) {cout<<"error opening estimated parameter file: "<(infile), istreambuf_iterator(), '\n'); + infile.seekg (0, ios::beg); + + return true; +} + + + +//Read gene expression file +bool ReadFile_gene (const string &file_gene, vector &vec_read, vector &snpInfo, size_t &ng_total) +{ + vec_read.clear(); + ng_total=0; + + ifstream infile (file_gene.c_str(), ifstream::in); + if (!infile) {cout<<"error! fail to open gene expression file: "<