diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/io.cpp | 2042 |
1 files changed, 1161 insertions, 881 deletions
@@ -27,8 +27,10 @@ #include <set> #include <cstring> #include <cmath> +#include <cstdint> #include <stdio.h> #include <stdlib.h> +#include <assert.h> #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" @@ -310,8 +312,10 @@ bool ReadFile_column (const string &file_pheno, vector<int> &indicator_idv, if (strcmp(ch_ptr, "NA")==0) { indicator_idv.push_back(0); pheno.push_back(-9); - } // Pheno is different from pimass2. + } else { + + // Pheno is different from pimass2. p=atof(ch_ptr); indicator_idv.push_back(1); pheno.push_back(p); @@ -487,15 +491,18 @@ bool ReadFile_bim (const string &file_bim, vector<SNPINFO> &snpInfo) { } // Read .fam file. -bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, map<string, int> &mapID2num, const vector<size_t> &p_column) -{ +bool ReadFile_fam (const string &file_fam, + vector<vector<int> > &indicator_pheno, + vector<vector<double> > &pheno, + map<string, int> &mapID2num, + const vector<size_t> &p_column) { indicator_pheno.clear(); pheno.clear(); mapID2num.clear(); igzstream infile (file_fam.c_str(), igzstream::in); - //ifstream infile (file_fam.c_str(), ifstream::in); - if (!infile) {cout<<"error opening .fam file: "<<file_fam<<endl; return false;} + if (!infile) { + cout<<"error opening .fam file: "<<file_fam<<endl; return false;} string line; char *ch_ptr; @@ -528,12 +535,19 @@ bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno while (i<p_max ) { if (mapP2c.count(i+1)!=0 ) { if (strcmp(ch_ptr, "NA")==0) { - ind_pheno_row[mapP2c[i+1]]=0; pheno_row[mapP2c[i+1]]=-9; + ind_pheno_row[mapP2c[i+1]]=0; + pheno_row[mapP2c[i+1]]=-9; } else { - p=atof(ch_ptr); + p=atof(ch_ptr); - if (p==-9) {ind_pheno_row[mapP2c[i+1]]=0; pheno_row[mapP2c[i+1]]=-9;} - else {ind_pheno_row[mapP2c[i+1]]=1; pheno_row[mapP2c[i+1]]=p;} + if (p==-9) { + ind_pheno_row[mapP2c[i+1]]=0; + pheno_row[mapP2c[i+1]]=-9; + } + else { + ind_pheno_row[mapP2c[i+1]]=1; + pheno_row[mapP2c[i+1]]=p; + } } } i++; @@ -551,20 +565,26 @@ bool ReadFile_fam (const string &file_fam, vector<vector<int> > &indicator_pheno return true; } - - - - - -//Read bimbam mean genotype file, the first time, to obtain #SNPs for analysis (ns_test) and total #SNP (ns_total) -bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, size_t &ns_test) -{ +// Read bimbam mean genotype file, the first time, to obtain #SNPs for +// analysis (ns_test) and total #SNP (ns_total). +bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, + const gsl_matrix *W, vector<int> &indicator_idv, + vector<int> &indicator_snp, const double &maf_level, + const double &miss_level, const double &hwe_level, + const double &r2_level, + map<string, string> &mapRS2chr, + map<string, long int> &mapRS2bp, + map<string, double> &mapRS2cM, + vector<SNPINFO> &snpInfo, + size_t &ns_test) { indicator_snp.clear(); snpInfo.clear(); igzstream infile (file_geno.c_str(), igzstream::in); -// ifstream infile (file_geno.c_str(), ifstream::in); - if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + if (!infile) { + cout<<"error reading genotype file:"<<file_geno<<endl; + return false; + } gsl_vector *genotype=gsl_vector_alloc (W->size1); gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); @@ -575,7 +595,6 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g gsl_permutation * pmt=gsl_permutation_alloc (W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - //eigenlib_dgemm("T", "N", 1.0, W, W, 0.0, WtW); int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); @@ -616,7 +635,8 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g major=ch_ptr; if (setSnps.size()!=0 && setSnps.count(rs)==0) { - SNPINFO sInfo={"-9", rs, -9, -9, minor, major, 0, -9, -9, 0, 0, file_pos}; + SNPINFO sInfo={"-9", rs, -9, -9, minor, major, 0, -9, -9, + 0, 0, file_pos}; snpInfo.push_back(sInfo); indicator_snp.push_back(0); @@ -634,7 +654,12 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[i]==0) {continue;} - if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++; c_idv++; continue;} + if (strcmp(ch_ptr, "NA")==0) { + gsl_vector_set (genotype_miss, c_idv, 1); + n_miss++; + c_idv++; + continue; + } geno=atof(ch_ptr); if (geno>=0 && geno<=0.5) {n_0++;} @@ -643,8 +668,6 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g gsl_vector_set (genotype, c_idv, geno); -// if (geno<0) {n_miss++; continue;} - if (flag_poly==0) {geno_old=geno; flag_poly=2;} if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} @@ -654,24 +677,38 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g } maf/=2.0*(double)(ni_test-n_miss); - SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf, ni_test-n_miss, 0, file_pos}; + SNPINFO sInfo={chr, rs, cM, b_pos, minor, major, n_miss, + (double)n_miss/(double)ni_test, maf, + ni_test-n_miss, 0, file_pos}; snpInfo.push_back(sInfo); file_pos++; - if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} + if ( (double)n_miss/(double)ni_test > miss_level) { + indicator_snp.push_back(0); + continue; + } - if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + if ((maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1) { + indicator_snp.push_back(0); + continue; + } if (flag_poly!=1) {indicator_snp.push_back(0); continue;} if (hwe_level!=0 && maf_level!=-1) { - if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;} + if (CalcHWE(n_0, n_2, n_1)<hwe_level) { + indicator_snp.push_back(0); + continue; + } } - //filter SNP if it is correlated with W - //unless W has only one column, of 1s + // Filter SNP if it is correlated with W unless W has + // only one column, of 1s. for (size_t i=0; i<genotype->size; ++i) { - if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + if (gsl_vector_get (genotype_miss, i)==1) { + geno=maf*2.0; + gsl_vector_set (genotype, i, geno); + } } gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); @@ -679,7 +716,10 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g gsl_blas_ddot (genotype, genotype, &v_x); gsl_blas_ddot (Wtx, WtWiWtx, &v_w); - if (W->size2!=1 && v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} + if (W->size2!=1 && v_w/v_x >= r2_level) { + indicator_snp.push_back(0); + continue; + } indicator_snp.push_back(1); ns_test++; @@ -699,19 +739,21 @@ bool ReadFile_geno (const string &file_geno, const set<string> &setSnps, const g return true; } - - - - - -//Read bed file, the first time -bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) -{ +// Read bed file, the first time. +bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, + const gsl_matrix *W, vector<int> &indicator_idv, + vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, + const double &maf_level, const double &miss_level, + const double &hwe_level, const double &r2_level, + size_t &ns_test) { indicator_snp.clear(); size_t ns_total=snpInfo.size(); ifstream infile (file_bed.c_str(), ios::binary); - if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + if (!infile) { + cout<<"error reading bed file:"<<file_bed<<endl; + return false; + } gsl_vector *genotype=gsl_vector_alloc (W->size1); gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); @@ -739,12 +781,12 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl } ns_test=0; - //calculate n_bit and c, the number of bit for each snp + // Calculate n_bit and c, the number of bit for each snp. size_t n_bit; if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1;} - //ignore the first three majic numbers + // Ignore the first three magic numbers. for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; @@ -754,11 +796,14 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl size_t n_miss; size_t n_0, n_1, n_2, c; - //start reading snps and doing association test + // Start reading snps and doing association test. for (size_t t=0; t<ns_total; ++t) { - infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - if (setSnps.size()!=0 && setSnps.count(snpInfo[t].rs_number)==0) { + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t*n_bit+3); + + if (setSnps.size()!=0 && + setSnps.count(snpInfo[t].rs_number) == 0) { snpInfo[t].n_miss=-9; snpInfo[t].missingness=-9; snpInfo[t].maf=-9; @@ -767,24 +812,41 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl continue; } - //read genotypes + // Read genotypes. c=0; maf=0.0; n_miss=0; n_0=0; n_1=0; n_2=0; c_idv=0; gsl_vector_set_zero (genotype_miss); for (size_t i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; - for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + + // Minor allele homozygous: 2.0; major: 0.0; + for (size_t j=0; j<4; ++j) { if ((i==(n_bit-1)) && c==ni_total) {break;} if (indicator_idv[c]==0) {c++; continue;} c++; if (b[2*j]==0) { - if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); maf+=2.0; n_2++;} - else {gsl_vector_set(genotype, c_idv, 1.0); maf+=1.0; n_1++;} + if (b[2*j+1]==0) { + gsl_vector_set(genotype, c_idv, 2.0); + maf+=2.0; + n_2++; + } + else { + gsl_vector_set(genotype, c_idv, 1.0); + maf+=1.0; + n_1++; + } } else { - if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); maf+=0.0; n_0++;} - else {gsl_vector_set(genotype_miss, c_idv, 1); n_miss++; } + if (b[2*j+1]==1) { + gsl_vector_set(genotype, c_idv, 0.0); + maf+=0.0; + n_0++; + } + else { + gsl_vector_set(genotype_miss, c_idv, 1); + n_miss++; + } } c_idv++; } @@ -798,20 +860,35 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl snpInfo[t].n_nb=0; snpInfo[t].file_position=t; - if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} + if ( (double)n_miss/(double)ni_test > miss_level) { + indicator_snp.push_back(0); + continue; + } - if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + if ((maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1) { + indicator_snp.push_back(0); + continue; + } - if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) {indicator_snp.push_back(0); continue;} + if ( (n_0+n_1)==0 || (n_1+n_2)==0 || (n_2+n_0)==0) { + indicator_snp.push_back(0); + continue; + } if (hwe_level!=0 && maf_level!=-1) { - if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;} + if (CalcHWE(n_0, n_2, n_1)<hwe_level) { + indicator_snp.push_back(0); + continue; + } } - //filter SNP if it is correlated with W - //unless W has only one column, of 1s + // Filter SNP if it is correlated with W unless W has + // only one column, of 1s. for (size_t i=0; i<genotype->size; ++i) { - if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + if (gsl_vector_get (genotype_miss, i)==1) { + geno=maf*2.0; + gsl_vector_set (genotype, i, geno); + } } gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); @@ -819,7 +896,10 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl gsl_blas_ddot (genotype, genotype, &v_x); gsl_blas_ddot (Wtx, WtWiWtx, &v_w); - if (W->size2!=1 && v_w/v_x > r2_level) {indicator_snp.push_back(0); continue;} + if (W->size2!=1 && v_w/v_x > r2_level) { + indicator_snp.push_back(0); + continue; + } indicator_snp.push_back(1); ns_test++; @@ -839,20 +919,14 @@ bool ReadFile_bed (const string &file_bed, const set<string> &setSnps, const gsl return true; } - - - - -//read the genotype for one SNP; remember to read empty lines -//geno stores original genotypes without centering -//missing values are replaced by mean -bool Bimbam_ReadOneSNP (const size_t inc, const vector<int> &indicator_idv, igzstream &infile, gsl_vector *geno, double &geno_mean) -{ +// Read the genotype for one SNP; remember to read empty lines. +// Geno stores original genotypes without centering. +// Missing values are replaced by mean. +bool Bimbam_ReadOneSNP (const size_t inc, const vector<int> &indicator_idv, + igzstream &infile, gsl_vector *geno, + double &geno_mean) { size_t ni_total=indicator_idv.size(); - // if (infile.eof()) {infile.clear();} - // infile.seekg(pos); - string line; char *ch_ptr; bool flag=false; @@ -896,16 +970,17 @@ bool Bimbam_ReadOneSNP (const size_t inc, const vector<int> &indicator_idv, igzs return flag; } - -//for plink, store SNPs as double too -void Plink_ReadOneSNP (const int pos, const vector<int> &indicator_idv, ifstream &infile, gsl_vector *geno, double &geno_mean) -{ +// For PLINK, store SNPs as double too. +void Plink_ReadOneSNP (const int pos, const vector<int> &indicator_idv, + ifstream &infile, gsl_vector *geno, double &geno_mean) { size_t ni_total=indicator_idv.size(), n_bit; if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1;} - infile.seekg(pos*n_bit+3); //n_bit, and 3 is the number of magic numbers - //read genotypes + // n_bit, and 3 is the number of magic numbers. + infile.seekg(pos*n_bit+3); + + // Read genotypes. char ch[1]; bitset<8> b; @@ -916,7 +991,9 @@ void Plink_ReadOneSNP (const int pos, const vector<int> &indicator_idv, ifstream for (size_t i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; - for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j=0; j<4; ++j) { if ((i==(n_bit-1)) && c==ni_total) {break;} if (indicator_idv[c]==0) {c++; continue;} c++; @@ -951,15 +1028,14 @@ void Plink_ReadOneSNP (const int pos, const vector<int> &indicator_idv, ifstream return; } - - - - -void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) -{ +void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, + map<string, int> &mapID2num, const size_t k_mode, + bool &error, gsl_matrix *G) { igzstream infile (file_kin.c_str(), igzstream::in); -// ifstream infile (file_kin.c_str(), ifstream::in); - if (!infile) {cout<<"error! fail to open kinship file: "<<file_kin<<endl; error=true; return;} + if (!infile) { + cout<<"error! fail to open kinship file: "<<file_kin<<endl; + error=true; return; + } size_t ni_total=indicator_idv.size(); @@ -972,25 +1048,47 @@ void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<strin if (k_mode==1) { size_t i_test=0, i_total=0, j_test=0, j_total=0; while (getline(infile, line)) { - if (i_total==ni_total) {cout<<"error! number of rows in the kinship file is larger than the number of phentypes."<<endl; error=true;} + if (i_total==ni_total) { + cout<<"error! number of rows in the kinship "<< + "file is larger than the number of phentypes."<< + endl; + error=true; + } if (indicator_idv[i_total]==0) {i_total++; continue;} j_total=0; j_test=0; ch_ptr=strtok ((char *)line.c_str(), " , \t"); while (ch_ptr!=NULL) { - if (j_total==ni_total) {cout<<"error! number of columns in the kinship file is larger than the number of phentypes for row = "<<i_total<<endl; error=true;} + if (j_total==ni_total) { + cout<<"error! number of columns in the "<< + "kinship file is larger than the number"<< + " of phentypes for row = "<<i_total<<endl; + error=true; + } d=atof(ch_ptr); - if (indicator_idv[j_total]==1) {gsl_matrix_set (G, i_test, j_test, d); j_test++;} + if (indicator_idv[j_total]==1) { + gsl_matrix_set (G, i_test, j_test, d); + j_test++; + } j_total++; ch_ptr=strtok (NULL, " , \t"); } - if (j_total!=ni_total) {cout<<"error! number of columns in the kinship file do not match the number of phentypes for row = "<<i_total<<endl; error=true;} + if (j_total!=ni_total) { + cout<<"error! number of columns in the kinship "<< + "file do not match the number of phentypes for "<< + "row = "<<i_total<<endl; + error=true; + } i_total++; i_test++; } - if (i_total!=ni_total) {cout<<"error! number of rows in the kinship file do not match the number of phentypes."<<endl; error=true;} + if (i_total!=ni_total) { + cout<<"error! number of rows in the kinship file do "<< + "not match the number of phentypes."<<endl; + error=true; + } } else { map<size_t, size_t> mapID2ID; @@ -1010,14 +1108,24 @@ void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<strin id2=ch_ptr; ch_ptr=strtok (NULL, " , \t"); d=atof(ch_ptr); - if (mapID2num.count(id1)==0 || mapID2num.count(id2)==0) {continue;} - if (indicator_idv[mapID2num[id1]]==0 || indicator_idv[mapID2num[id2]]==0) {continue;} + if (mapID2num.count(id1)==0 || + mapID2num.count(id2)==0) { + continue; + } + if (indicator_idv[mapID2num[id1]]==0 || + indicator_idv[mapID2num[id2]]==0) { + continue; + } n_id1=mapID2ID[mapID2num[id1]]; n_id2=mapID2ID[mapID2num[id2]]; Cov_d=gsl_matrix_get(G, n_id1, n_id2); - if (Cov_d!=0 && Cov_d!=d) {cout<<"error! redundant and unequal terms in the kinship file, for id1 = "<<id1<<" and id2 = "<<id2<<endl;} + if (Cov_d!=0 && Cov_d!=d) { + cout<<"error! redundant and unequal terms in the "<< + "kinship file, for id1 = "<<id1<<" and id2 = "<< + id2<<endl; + } else { gsl_matrix_set(G, n_id1, n_id2, d); gsl_matrix_set(G, n_id2, n_id1, d); @@ -1031,19 +1139,24 @@ void ReadFile_kin (const string &file_kin, vector<int> &indicator_idv, map<strin return; } - -void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) -{ +void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, + map<string, int> &mapID2num, const size_t k_mode, + bool &error, gsl_matrix *G) { igzstream infile (file_mk.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open file: "<<file_mk<<endl; error=true; return;} + if (!infile) {cout<<"error! fail to open file: "<<file_mk<<endl; + error=true; + return; + } string file_kin, line; size_t i=0; while (getline(infile, line)) { file_kin=line.c_str(); - gsl_matrix_view G_sub=gsl_matrix_submatrix(G, 0, i*G->size1, G->size1, G->size1); - ReadFile_kin (file_kin, indicator_idv, mapID2num, k_mode, error, &G_sub.matrix); + gsl_matrix_view G_sub=gsl_matrix_submatrix(G, 0, i*G->size1, + G->size1, G->size1); + ReadFile_kin (file_kin, indicator_idv, mapID2num, k_mode, + error, &G_sub.matrix); i++; } @@ -1052,12 +1165,13 @@ void ReadFile_mk (const string &file_mk, vector<int> &indicator_idv, map<string, return; } - -void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) -{ +void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) { igzstream infile (file_ku.c_str(), igzstream::in); -// ifstream infile (file_ku.c_str(), ifstream::in); - if (!infile) {cout<<"error! fail to open the U file: "<<file_ku<<endl; error=true; return;} + if (!infile) { + cout<<"error! fail to open the U file: "<<file_ku<<endl; + error=true; + return; + } size_t n_row=U->size1, n_col=U->size2, i_row=0, i_col=0; @@ -1068,12 +1182,21 @@ void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) double d; while (getline(infile, line)) { - if (i_row==n_row) {cout<<"error! number of rows in the U file is larger than expected."<<endl; error=true;} + if (i_row==n_row) { + cout<<"error! number of rows in the U file is larger "<< + "than expected."<<endl; + error=true; + } i_col=0; ch_ptr=strtok ((char *)line.c_str(), " , \t"); while (ch_ptr!=NULL) { - if (i_col==n_col) {cout<<"error! number of columns in the U file is larger than expected, for row = "<<i_row<<endl; error=true;} + if (i_col==n_col) { + cout<<"error! number of columns in the U file "<< + "is larger than expected, for row = "<< + i_row<<endl; + error=true; + } d=atof(ch_ptr); gsl_matrix_set (U, i_row, i_col, d); @@ -1091,14 +1214,13 @@ void ReadFile_eigenU (const string &file_ku, bool &error, gsl_matrix *U) return; } - - - -void ReadFile_eigenD (const string &file_kd, bool &error, gsl_vector *eval) -{ +void ReadFile_eigenD (const string &file_kd, bool &error, gsl_vector *eval) { igzstream infile (file_kd.c_str(), igzstream::in); -// ifstream infile (file_kd.c_str(), ifstream::in); - if (!infile) {cout<<"error! fail to open the D file: "<<file_kd<<endl; error=true; return;} + if (!infile) { + cout<<"error! fail to open the D file: "<<file_kd<<endl; + error=true; + return; + } size_t n_row=eval->size, i_row=0; @@ -1109,13 +1231,21 @@ void ReadFile_eigenD (const string &file_kd, bool &error, gsl_vector *eval) double d; while (getline(infile, line)) { - if (i_row==n_row) {cout<<"error! number of rows in the D file is larger than expected."<<endl; error=true;} + if (i_row==n_row) { + cout<<"error! number of rows in the D file is larger "<< + "than expected."<<endl; + error=true; + } ch_ptr=strtok ((char *)line.c_str(), " , \t"); d=atof(ch_ptr); ch_ptr=strtok (NULL, " , \t"); - if (ch_ptr!=NULL) {cout<<"error! number of columns in the D file is larger than expected, for row = "<<i_row<<endl; error=true;} + if (ch_ptr!=NULL) { + cout<<"error! number of columns in the D file is larger "<< + "than expected, for row = "<<i_row<<endl; + error=true; + } gsl_vector_set (eval, i_row, d); @@ -1128,14 +1258,15 @@ void ReadFile_eigenD (const string &file_kd, bool &error, gsl_vector *eval) return; } - - -//read bimbam mean genotype file and calculate kinship matrix -bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) -{ +// Read bimbam mean genotype file and calculate kinship matrix. +bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, + const int k_mode, const int display_pace, + gsl_matrix *matrix_kin) { igzstream infile (file_geno.c_str(), igzstream::in); - //ifstream infile (file_geno.c_str(), ifstream::in); - if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + if (!infile) { + cout<<"error reading genotype file:"<<file_geno<<endl; + return false; + } string line; char *ch_ptr; @@ -1147,7 +1278,7 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k gsl_vector *geno=gsl_vector_alloc (ni_total); gsl_vector *geno_miss=gsl_vector_alloc (ni_total); - //create a large matrix + // Create a large matrix. size_t msize=10000; gsl_matrix *Xlarge=gsl_matrix_alloc (ni_total, msize); gsl_matrix_set_zero(Xlarge); @@ -1155,7 +1286,9 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k size_t ns_test=0; for (size_t t=0; t<indicator_snp.size(); ++t) { !safeGetline(infile, line).eof(); - if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (t%display_pace==0 || t==(indicator_snp.size()-1)) { + ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1); + } if (indicator_snp[t]==0) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); @@ -1166,8 +1299,9 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k gsl_vector_set_all(geno_miss, 0); for (size_t i=0; i<ni_total; ++i) { ch_ptr=strtok (NULL, " , \t"); - if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_miss, i, 0); n_miss++;} - else { + if (strcmp(ch_ptr, "NA")==0) { + gsl_vector_set(geno_miss, i, 0); n_miss++; + } else { d=atof(ch_ptr); gsl_vector_set (geno, i, d); gsl_vector_set (geno_miss, i, 1); @@ -1180,36 +1314,27 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_total; geno_var-=geno_mean*geno_mean; -// geno_var=geno_mean*(1-geno_mean*0.5); for (size_t i=0; i<ni_total; ++i) { - if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} + if (gsl_vector_get (geno_miss, i)==0) { + gsl_vector_set(geno, i, geno_mean); + } } gsl_vector_add_constant (geno, -1.0*geno_mean); - /* - if (geno_var!=0) { - if (k_mode==1) { - gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin); - //eigenlib_dsyr (1.0, geno, matrix_kin); - } else if (k_mode==2) { - gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin); - //eigenlib_dsyr (1.0/geno_var, geno, matrix_kin); - } else { - cout<<"Unknown kinship mode."<<endl; - } + if (k_mode==2 && geno_var!=0) { + gsl_vector_scale (geno, 1.0/sqrt(geno_var)); } - */ - - if (k_mode==2 && geno_var!=0) {gsl_vector_scale (geno, 1.0/sqrt(geno_var));} - gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_test%msize); + gsl_vector_view Xlarge_col= + gsl_matrix_column (Xlarge, ns_test%msize); gsl_vector_memcpy (&Xlarge_col.vector, geno); ns_test++; if (ns_test%msize==0) { - eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, + matrix_kin); gsl_matrix_set_zero(Xlarge); } } @@ -1238,16 +1363,14 @@ bool BimbamKin (const string &file_geno, vector<int> &indicator_snp, const int k return true; } - - - - - - -bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) -{ +bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, + const int k_mode, const int display_pace, + gsl_matrix *matrix_kin) { ifstream infile (file_bed.c_str(), ios::binary); - if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + if (!infile) { + cout<<"error reading bed file:"<<file_bed<<endl; + return false; + } char ch[1]; bitset<8> b; @@ -1261,12 +1384,12 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m size_t ns_test=0; int n_bit; - //create a large matrix + // Create a large matrix. size_t msize=10000; gsl_matrix *Xlarge=gsl_matrix_alloc (ni_total, msize); gsl_matrix_set_zero(Xlarge); - //calculate n_bit and c, the number of bit for each snp + // Calculate n_bit and c, the number of bit for each snp. if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } @@ -1277,26 +1400,46 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m } for (size_t t=0; t<indicator_snp.size(); ++t) { - if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (t%display_pace==0 || t==(indicator_snp.size()-1)) { + ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1); + } if (indicator_snp[t]==0) {continue;} - infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t*n_bit+3); - //read genotypes + // Read genotypes. geno_mean=0.0; n_miss=0; ci_total=0; geno_var=0.0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; - for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; - if ((i==(n_bit-1)) && ci_total==ni_total) {break;} + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j=0; j<4; ++j) { + if ((i==(n_bit-1)) && ci_total==ni_total) { + break; + } if (b[2*j]==0) { - if (b[2*j+1]==0) {gsl_vector_set(geno, ci_total, 2.0); geno_mean+=2.0; geno_var+=4.0; } - else {gsl_vector_set(geno, ci_total, 1.0); geno_mean+=1.0; geno_var+=1.0;} + if (b[2*j+1]==0) { + gsl_vector_set(geno, ci_total, 2.0); + geno_mean+=2.0; + geno_var+=4.0; + } + else { + gsl_vector_set(geno, ci_total, 1.0); + geno_mean+=1.0; + geno_var+=1.0; + } } else { - if (b[2*j+1]==1) {gsl_vector_set(geno, ci_total, 0.0); } - else {gsl_vector_set(geno, ci_total, -9.0); n_miss++; } + if (b[2*j+1]==1) { + gsl_vector_set(geno,ci_total,0.0); + } + else { + gsl_vector_set(geno,ci_total,-9.0); + n_miss++; + } } ci_total++; @@ -1307,7 +1450,6 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_total; geno_var-=geno_mean*geno_mean; -// geno_var=geno_mean*(1-geno_mean*0.5); for (size_t i=0; i<ni_total; ++i) { d=gsl_vector_get(geno,i); @@ -1316,22 +1458,17 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m gsl_vector_add_constant (geno, -1.0*geno_mean); - /* - if (geno_var!=0) { - if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);} - else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);} - else {cout<<"Unknown kinship mode."<<endl;} + if (k_mode==2 && geno_var!=0) { + gsl_vector_scale (geno, 1.0/sqrt(geno_var)); } - */ - - if (k_mode==2 && geno_var!=0) {gsl_vector_scale (geno, 1.0/sqrt(geno_var));} - gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_test%msize); + gsl_vector_view Xlarge_col= + gsl_matrix_column (Xlarge, ns_test%msize); gsl_vector_memcpy (&Xlarge_col.vector, geno); ns_test++; if (ns_test%msize==0) { - eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + eigenlib_dgemm("N","T",1.0,Xlarge,Xlarge,1.0,matrix_kin); gsl_matrix_set_zero(Xlarge); } } @@ -1360,16 +1497,16 @@ bool PlinkKin (const string &file_bed, vector<int> &indicator_snp, const int k_m return true; } - - - - -//Read bimbam mean genotype file, the second time, recode "mean" genotype and calculate K -bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) -{ +// Read bimbam mean genotype file, the second time, recode "mean" +// genotype and calculate K. +bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, + vector<int> &indicator_snp, gsl_matrix *UtX, + gsl_matrix *K, const bool calc_K) { igzstream infile (file_geno.c_str(), igzstream::in); -// ifstream infile (file_geno.c_str(), ifstream::in); - if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + if (!infile) { + cout<<"error reading genotype file:"<<file_geno<<endl; + return false; + } string line; char *ch_ptr; @@ -1402,8 +1539,10 @@ bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector< ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[j]==0) {continue;} - if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++;} - else { + if (strcmp(ch_ptr, "NA")==0) { + gsl_vector_set (genotype_miss, c_idv, 1); + n_miss++; + } else { geno=atof(ch_ptr); gsl_vector_set (genotype, c_idv, geno); geno_mean+=geno; @@ -1414,14 +1553,21 @@ bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector< geno_mean/=(double)(ni_test-n_miss); for (size_t i=0; i<genotype->size; ++i) { - if (gsl_vector_get (genotype_miss, i)==1) {geno=0;} - else {geno=gsl_vector_get (genotype, i); geno-=geno_mean;} + if (gsl_vector_get (genotype_miss, i)==1) { + geno=0; + } + else { + geno=gsl_vector_get (genotype, i); + geno-=geno_mean; + } gsl_vector_set (genotype, i, geno); gsl_matrix_set (UtX, i, c_snp, geno); } - if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + if (calc_K==true) { + gsl_blas_dsyr (CblasUpper, 1.0, genotype, K); + } c_snp++; } @@ -1446,14 +1592,18 @@ bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector< return true; } - - -//compact version of the above function, using uchar instead of gsl_matrix -bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test) -{ +// Compact version of the above function, using uchar instead of +// gsl_matrix. +bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, + vector<int> &indicator_snp, + vector<vector<unsigned char> > &Xt, + gsl_matrix *K, const bool calc_K, const size_t ni_test, + const size_t ns_test) { igzstream infile (file_geno.c_str(), igzstream::in); - // ifstream infile (file_geno.c_str(), ifstream::in); - if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + if (!infile) { + cout<<"error reading genotype file:"<<file_geno<<endl; + return false; + } Xt.clear(); vector<unsigned char> Xt_row; @@ -1490,7 +1640,10 @@ bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector< ch_ptr=strtok (NULL, " , \t"); if (indicator_idv[j]==0) {continue;} - if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++;} else { + if (strcmp(ch_ptr, "NA")==0) { + gsl_vector_set (genotype_miss, c_idv, 1); + n_miss++; + } else { geno=atof(ch_ptr); gsl_vector_set (genotype, c_idv, geno); geno_mean+=geno; @@ -1512,7 +1665,9 @@ bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector< } Xt.push_back(Xt_row); - if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + if (calc_K==true) { + gsl_blas_dsyr (CblasUpper, 1.0, genotype, K); + } c_snp++; } @@ -1537,14 +1692,16 @@ bool ReadFile_geno (const string &file_geno, vector<int> &indicator_idv, vector< return true; } - - - -//Read bimbam mean genotype file, the second time, recode "mean" genotype and calculate K -bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) -{ +// Read bimbam mean genotype file, the second time, recode "mean" +// genotype and calculate K. +bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, + vector<int> &indicator_snp, gsl_matrix *UtX, + gsl_matrix *K, const bool calc_K) { ifstream infile (file_bed.c_str(), ios::binary); - if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + if (!infile) { + cout<<"error reading bed file:"<<file_bed<<endl; + return false; + } char ch[1]; bitset<8> b; @@ -1558,7 +1715,7 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1;} - //print the first three majic numbers + // Print the first three magic numbers. for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; @@ -1572,28 +1729,44 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in size_t n_miss; size_t c_idv=0, c_snp=0, c=0; - //start reading snps and doing association test + // Start reading snps and doing association test. for (size_t t=0; t<ns_total; ++t) { if (indicator_snp[t]==0) {continue;} - infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - //read genotypes + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t*n_bit+3); + + // Read genotypes. c_idv=0; geno_mean=0.0; n_miss=0; c=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; - for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j=0; j<4; ++j) { if ((i==(n_bit-1)) && c==ni_total) {break;} if (indicator_idv[c]==0) {c++; continue;} c++; if (b[2*j]==0) { - if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); geno_mean+=2.0;} - else {gsl_vector_set(genotype, c_idv, 1.0); geno_mean+=1.0;} + if (b[2*j+1]==0) { + gsl_vector_set(genotype, c_idv, 2.0); + geno_mean+=2.0; + } + else { + gsl_vector_set(genotype, c_idv, 1.0); + geno_mean+=1.0; + } } else { - if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); geno_mean+=0.0;} - else {gsl_vector_set(genotype, c_idv, -9.0); n_miss++;} + if (b[2*j+1]==1) { + gsl_vector_set(genotype, c_idv, 0.0); + geno_mean+=0.0; + } + else { + gsl_vector_set(genotype, c_idv, -9.0); + n_miss++; + } } c_idv++; } @@ -1610,7 +1783,9 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in gsl_matrix_set (UtX, i, c_snp, geno); } - if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + if (calc_K==true) { + gsl_blas_dsyr (CblasUpper, 1.0, genotype, K); + } c_snp++; } @@ -1633,14 +1808,17 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in return true; } - - - -//compact version of the above function, using uchar instead of gsl_matrix -bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char> > &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test) -{ +// Compact version of the above function, using uchar instead of gsl_matrix. +bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, + vector<int> &indicator_snp, + vector<vector<unsigned char> > &Xt, gsl_matrix *K, + const bool calc_K, const size_t ni_test, + const size_t ns_test) { ifstream infile (file_bed.c_str(), ios::binary); - if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + if (!infile) { + cout<<"error reading bed file:"<<file_bed<<endl; + return false; + } Xt.clear(); vector<unsigned char> Xt_row; @@ -1658,7 +1836,7 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1;} - //print the first three majic numbers + // Print the first three magic numbers. for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; @@ -1672,28 +1850,44 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in size_t n_miss; size_t c_idv=0, c_snp=0, c=0; - //start reading snps and doing association test + // Start reading SNPs and doing association test. for (size_t t=0; t<ns_total; ++t) { if (indicator_snp[t]==0) {continue;} - infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers - //read genotypes + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t*n_bit+3); + + // Read genotypes. c_idv=0; geno_mean=0.0; n_miss=0; c=0; for (int i=0; i<n_bit; ++i) { infile.read(ch,1); b=ch[0]; - for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j=0; j<4; ++j) { if ((i==(n_bit-1)) && c==ni_total) {break;} if (indicator_idv[c]==0) {c++; continue;} c++; if (b[2*j]==0) { - if (b[2*j+1]==0) {gsl_vector_set(genotype, c_idv, 2.0); geno_mean+=2.0;} - else {gsl_vector_set(genotype, c_idv, 1.0); geno_mean+=1.0;} + if (b[2*j+1]==0) { + gsl_vector_set(genotype, c_idv, 2.0); + geno_mean+=2.0; + } + else { + gsl_vector_set(genotype, c_idv, 1.0); + geno_mean+=1.0; + } } else { - if (b[2*j+1]==1) {gsl_vector_set(genotype, c_idv, 0.0); geno_mean+=0.0;} - else {gsl_vector_set(genotype, c_idv, -9.0); n_miss++;} + if (b[2*j+1]==1) { + gsl_vector_set(genotype, c_idv, 0.0); + geno_mean+=0.0; + } + else { + gsl_vector_set(genotype, c_idv, -9.0); + n_miss++; + } } c_idv++; } @@ -1713,7 +1907,9 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in } Xt.push_back(Xt_row); - if (calc_K==true) {gsl_blas_dsyr (CblasUpper, 1.0, genotype, K);} + if (calc_K==true) { + gsl_blas_dsyr (CblasUpper, 1.0, genotype, K); + } c_snp++; } @@ -1736,18 +1932,15 @@ bool ReadFile_bed (const string &file_bed, vector<int> &indicator_idv, vector<in return true; } - - - - - - -bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est) -{ +bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, + map<string, double> &mapRS2est) { mapRS2est.clear(); ifstream infile (file_est.c_str(), ifstream::in); - if (!infile) {cout<<"error opening estimated parameter file: "<<file_est<<endl; return false;} + if (!infile) { + cout<<"error opening estimated parameter file: "<<file_est<<endl; + return false; + } string line; char *ch_ptr; @@ -1755,7 +1948,7 @@ bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map string rs; double alpha, beta, gamma, d; - //header + // Header. getline(infile, line); size_t n=*max_element(est_column.begin(), est_column.end()); @@ -1778,7 +1971,9 @@ bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map mapRS2est[rs]=d; } else { - cout<<"the same SNP occurs more than once in estimated parameter file: "<<rs<<endl; return false; + cout << "the same SNP occurs more than once in estimated "<< + "parameter file: "<<rs<<endl; + return false; } } @@ -1787,13 +1982,12 @@ bool ReadFile_est (const string &file_est, const vector<size_t> &est_column, map return true; } - - -bool CountFileLines (const string &file_input, size_t &n_lines) -{ +bool CountFileLines (const string &file_input, size_t &n_lines) { igzstream infile (file_input.c_str(), igzstream::in); - //ifstream infile (file_input.c_str(), ifstream::in); - if (!infile) {cout<<"error! fail to open file: "<<file_input<<endl; return false;} + if (!infile) { + cout<<"error! fail to open file: "<<file_input<<endl; + return false; + } n_lines=count(istreambuf_iterator<char>(infile), istreambuf_iterator<char>(), '\n'); infile.seekg (0, ios::beg); @@ -1801,16 +1995,17 @@ bool CountFileLines (const string &file_input, size_t &n_lines) return true; } - - -//Read gene expression file -bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SNPINFO> &snpInfo, size_t &ng_total) -{ +// Read gene expression file. +bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, + vector<SNPINFO> &snpInfo, size_t &ng_total) { vec_read.clear(); ng_total=0; igzstream infile (file_gene.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open gene expression file: "<<file_gene<<endl; return false;} + if (!infile) { + cout<<"error! fail to open gene expression file: "<<file_gene<<endl; + return false; + } string line; char *ch_ptr; @@ -1818,7 +2013,7 @@ bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SN size_t n_idv=0, t=0; - //header + // Header. getline(infile, line); while (getline(infile, line)) { @@ -1841,9 +2036,13 @@ bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SN ch_ptr=strtok (NULL, " , \t"); } - if (t!=n_idv) {cout<<"error! number of columns doesn't match in row: "<<ng_total<<endl; return false;} + if (t!=n_idv) { + cout<<"error! number of columns doesn't match in row: "<< + ng_total<<endl; + return false; + } - SNPINFO sInfo={"-9", rs, -9, -9, "-9", "-9", 0, -9, -9, 0, 0, 0}; + SNPINFO sInfo={"-9",rs,-9,-9,"-9","-9",0,-9,-9,0,0,0}; snpInfo.push_back(sInfo); ng_total++; @@ -1855,28 +2054,28 @@ bool ReadFile_gene (const string &file_gene, vector<double> &vec_read, vector<SN return true; } - - - - - - // WJA Added -//Read Oxford sample file -bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_pheno, vector<vector<double> > &pheno, const vector<size_t> &p_column, vector<int> &indicator_cvt, vector<vector<double> > &cvt, size_t &n_cvt) -{ +// Read Oxford sample file. +bool ReadFile_sample (const string &file_sample, + vector<vector<int> > &indicator_pheno, + vector<vector<double> > &pheno, + const vector<size_t> &p_column, + vector<int> &indicator_cvt, + vector<vector<double> > &cvt, size_t &n_cvt) { indicator_pheno.clear(); pheno.clear(); indicator_cvt.clear(); igzstream infile (file_sample.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open sample file: "<<file_sample<<endl; return false;} + if (!infile) { + cout<<"error! fail to open sample file: "<<file_sample<<endl; + return false; + } string line; char *ch_ptr; - string id; double p,d; @@ -1888,8 +2087,6 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ size_t num_p_in_file=0; size_t num_cvt_in_file=0; -// size_t p_max=*max_element(p_column.begin(), p_column.end()); - map<size_t, size_t> mapP2c; for (size_t i=0; i<p_column.size(); i++) { mapP2c[p_column[i]]=i; @@ -1897,7 +2094,7 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ ind_pheno_row.push_back(0); } - // read header line1 + // Read header line1. if(!safeGetline(infile, line).eof()) { ch_ptr=strtok((char *)line.c_str(), " \t"); if(strcmp(ch_ptr, "ID_1")!=0) {return false;} @@ -1916,7 +2113,8 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ vector<map<uint32_t, size_t> > cvt_factor_levels; char col_type[num_cols]; - // read header line2 + + // Read header line2. if(!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " \t"); if(strcmp(ch_ptr, "0")!=0) {return false;} @@ -1927,13 +2125,17 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ size_t it=0; ch_ptr=strtok (NULL, " \t"); if(ch_ptr!=NULL) - while(ch_ptr!=NULL){ - col_type[it++]=ch_ptr[0]; - if(ch_ptr[0]=='D') {cvt_factor_levels.push_back(map<uint32_t, size_t>());num_cvt_in_file++;} - if(ch_ptr[0]=='C') {num_cvt_in_file++;} - if((ch_ptr[0]=='P')||(ch_ptr[0]=='B')) {num_p_in_file++;} - ch_ptr=strtok(NULL, " \t"); - } + while(ch_ptr!=NULL){ + col_type[it++]=ch_ptr[0]; + if(ch_ptr[0]=='D') { + cvt_factor_levels.push_back(map<uint32_t,size_t>()); + num_cvt_in_file++; + } + if(ch_ptr[0]=='C') {num_cvt_in_file++;} + if((ch_ptr[0]=='P')||(ch_ptr[0]=='B')) { + num_p_in_file++;} + ch_ptr=strtok(NULL, " \t"); + } } @@ -1943,7 +2145,6 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ for(int it=0;it<3;it++){ch_ptr=strtok(NULL, " \t");} - size_t i=0; size_t p_i=0; size_t fac_cvt_i=0; @@ -1952,42 +2153,62 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ if((col_type[i]=='P')||(col_type[i]=='B')) { - if (mapP2c.count(p_i+1)!=0) { - if (strcmp(ch_ptr, "NA")==0) {ind_pheno_row[mapP2c[p_i+1]]=0; pheno_row[mapP2c[p_i+1]]=-9;} - else {p=atof(ch_ptr); ind_pheno_row[mapP2c[p_i+1]]=1; pheno_row[mapP2c[p_i+1]]=p;} - } - p_i++; + if (mapP2c.count(p_i+1)!=0) { + if (strcmp(ch_ptr, "NA")==0) { + ind_pheno_row[mapP2c[p_i+1]]=0; + pheno_row[mapP2c[p_i+1]]=-9; + } + else { + p=atof(ch_ptr); + ind_pheno_row[mapP2c[p_i+1]]=1; + pheno_row[mapP2c[p_i+1]]=p; + } + } + p_i++; } if(col_type[i]=='D') { - // NOTE THIS DOES NOT CHECK TO BE SURE LEVEL IS INTEGRAL i.e for atoi error - if (strcmp(ch_ptr, "NA")!=0) {uint32_t level=atoi(ch_ptr); if(cvt_factor_levels[fac_cvt_i].count(level) == 0) {cvt_factor_levels[fac_cvt_i][level]=cvt_factor_levels[fac_cvt_i].size();}} - fac_cvt_i++; + + // NOTE THIS DOES NOT CHECK TO BE SURE LEVEL + // IS INTEGRAL i.e for atoi error. + if (strcmp(ch_ptr, "NA")!=0) { + uint32_t level=atoi(ch_ptr); + if (cvt_factor_levels[fac_cvt_i].count(level)==0) { + cvt_factor_levels[fac_cvt_i][level]= + cvt_factor_levels[fac_cvt_i].size(); + } + } + fac_cvt_i++; } ch_ptr=strtok (NULL, " \t"); i++; } - indicator_pheno.push_back(ind_pheno_row); pheno.push_back(pheno_row); } - // close and reopen the file + + // Close and reopen the file. infile.close(); infile.clear(); - if(num_cvt_in_file>0) - { + if(num_cvt_in_file>0) { igzstream infile2 (file_sample.c_str(), igzstream::in); - if (!infile2) {cout<<"error! fail to open sample file: "<<file_sample<<endl; return false;} - // skip header + if (!infile2) { + cout<<"error! fail to open sample file: "<< + file_sample<<endl; + return false; + } + + // Skip header. safeGetline(infile2, line); safeGetline(infile2, line); - // pull in the covariates now we now the number of factor levels + // Pull in the covariates now we now the number of + // factor levels. while (!safeGetline(infile2, line).eof()) { vector<double> v_d; flag_na=0; @@ -1995,38 +2216,51 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ for(int it=0;it<3;it++){ch_ptr=strtok(NULL, " \t");} - size_t i=0; size_t fac_cvt_i=0; size_t num_fac_levels; while (i<num_cols) { - - if(col_type[i]=='C') - { - if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;} - else {d=atof(ch_ptr);} - - v_d.push_back(d); + + if(col_type[i]=='C') { + if (strcmp(ch_ptr, "NA")==0) {flag_na=1; d=-9;} + else {d=atof(ch_ptr);} + + v_d.push_back(d); + } + + if(col_type[i]=='D') { + + // NOTE THIS DOES NOT CHECK TO BE SURE + // LEVEL IS INTEGRAL i.e for atoi error. + num_fac_levels=cvt_factor_levels[fac_cvt_i].size(); + if(num_fac_levels>1) { + if (strcmp(ch_ptr, "NA")==0) { + flag_na=1; + for(size_t it=0;it<num_fac_levels-1; it++) { + v_d.push_back(-9); } - - - if(col_type[i]=='D') - { - // NOTE THIS DOES NOT CHECK TO BE SURE LEVEL IS INTEGRAL i.e for atoi error - num_fac_levels=cvt_factor_levels[fac_cvt_i].size(); - if(num_fac_levels>1) - { - if (strcmp(ch_ptr, "NA")==0) {flag_na=1; for(size_t it=0;it<num_fac_levels-1; it++) {v_d.push_back(-9);}} - else {uint32_t level=atoi(ch_ptr); for(size_t it=0;it<num_fac_levels-1;it++) {cvt_factor_levels[fac_cvt_i][level]==it+1 ? v_d.push_back(1.0) : v_d.push_back(0.0); }} - } - fac_cvt_i++; + } + else { + uint32_t level=atoi(ch_ptr); + for(size_t it=0;it<num_fac_levels-1;it++) { + cvt_factor_levels[fac_cvt_i][level]==it+1 ? + v_d.push_back(1.0) : + v_d.push_back(0.0); } - - ch_ptr=strtok (NULL, " \t"); - i++; + } + } + fac_cvt_i++; + } + + ch_ptr=strtok (NULL, " \t"); + i++; } - if (flag_na==0) {indicator_cvt.push_back(1);} else {indicator_cvt.push_back(0);} + if (flag_na==0) { + indicator_cvt.push_back(1); + } else { + indicator_cvt.push_back(0); + } cvt.push_back(v_d); @@ -2035,11 +2269,20 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ if (indicator_cvt.empty()) {n_cvt=0;} else { flag_na=0; - for (vector<int>::size_type i=0; i<indicator_cvt.size(); ++i) { + for (vector<int>::size_type i=0; + i<indicator_cvt.size(); + ++i) { if (indicator_cvt[i]==0) {continue;} - if (flag_na==0) {flag_na=1; n_cvt=cvt[i].size();} - if (flag_na!=0 && n_cvt!=cvt[i].size()) {cout<<"error! number of covariates in row "<<i<<" do not match other rows."<<endl; return false;} + if (flag_na==0) { + flag_na=1; + n_cvt=cvt[i].size(); + } + if (flag_na!=0 && n_cvt!=cvt[i].size()) { + cout<<"error! number of covariates in row "<< + i<<" do not match other rows."<<endl; + return false; + } } } @@ -2049,19 +2292,22 @@ bool ReadFile_sample(const string &file_sample, vector<vector<int> > &indicator_ return true; } - - -// WJA Added -//Read bgen file, the first time -#include <cstdint> -#include <assert.h> -bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) -{ +// WJA Added. +// Read bgen file, the first time. +bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, + const gsl_matrix *W, vector<int> &indicator_idv, + vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, + const double &maf_level, const double &miss_level, + const double &hwe_level, const double &r2_level, + size_t &ns_test) { indicator_snp.clear(); ifstream infile (file_bgen.c_str(), ios::binary); - if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return false;} + if (!infile) { + cout<<"error reading bgen file:"<<file_bgen<<endl; + return false; + } gsl_vector *genotype=gsl_vector_alloc (W->size1); gsl_vector *genotype_miss=gsl_vector_alloc (W->size1); @@ -2075,8 +2321,8 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs int sig; LUDecomp (WtW, pmt, &sig); LUInvert (WtW, pmt, WtWi); - - // read in header + + // Read in header. uint32_t bgen_snp_block_offset; uint32_t bgen_header_length; uint32_t bgen_nsamples; @@ -2108,7 +2354,6 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs string rs; long int b_pos; string chr; -// double cM; string major; string minor; string id; @@ -2116,17 +2361,19 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs double v_x, v_w; int c_idv=0; - double maf, geno, geno_old; size_t n_miss; size_t n_0, n_1, n_2; int flag_poly; - double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + double bgen_geno_prob_AA, bgen_geno_prob_AB; + double bgen_geno_prob_BB, bgen_geno_prob_non_miss; + // Total number of samples in phenotype file. + size_t ni_total=indicator_idv.size(); - size_t ni_total=indicator_idv.size(); // total number of samples in phenotype file - size_t ni_test=0; // number of samples to use in test + // Number of samples to use in test. + size_t ni_test=0; uint32_t bgen_N; uint16_t bgen_LS; @@ -2141,13 +2388,9 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs size_t unzipped_data_size; for (size_t i=0; i<ni_total; ++i) { - - ni_test+=indicator_idv[i]; + ni_test+=indicator_idv[i]; } - - -// ns_total=1; for (size_t t=0; t<ns_total; ++t) { id.clear(); @@ -2181,8 +2424,7 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs bgen_B_allele.resize(bgen_LB); infile.read(&bgen_B_allele[0], bgen_LB); - - // should we switch according to MAF? + // Should we switch according to MAF? minor=bgen_B_allele; major=bgen_A_allele; b_pos=static_cast<long int>(bgen_SNP_pos); @@ -2196,16 +2438,15 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs snpInfo.push_back(sInfo); indicator_snp.push_back(0); if(CompressedSNPBlocks) - infile.read(reinterpret_cast<char*>(&bgen_P),4); + infile.read(reinterpret_cast<char*>(&bgen_P),4); else - bgen_P=6*bgen_N; + bgen_P=6*bgen_N; infile.ignore(static_cast<size_t>(bgen_P)); continue; } - if(CompressedSNPBlocks) { infile.read(reinterpret_cast<char*>(&bgen_P),4); @@ -2213,36 +2454,48 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs unzipped_data_size=6*bgen_N; - infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); - int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P)); + infile.read(reinterpret_cast<char*>(zipped_data), + bgen_P); + int result= + uncompress(reinterpret_cast<Bytef*>(unzipped_data), + reinterpret_cast<uLongf*>(&unzipped_data_size), + reinterpret_cast<Bytef*>(zipped_data), + static_cast<uLong> (bgen_P)); assert(result == Z_OK); } else { - bgen_P=6*bgen_N; - infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); + bgen_P=6*bgen_N; + infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); } - maf=0; n_miss=0; flag_poly=0; geno_old=-9; n_0=0; n_1=0; n_2=0; c_idv=0; gsl_vector_set_zero (genotype_miss); for (size_t i=0; i<bgen_N; ++i) { + // CHECK this set correctly! if (indicator_idv[i]==0) {continue;} - - bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0; - bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0; - bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0; - bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; - - //CHECK 0.1 OK - if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set (genotype_miss, c_idv, 1); n_miss++; c_idv++; continue;} - + bgen_geno_prob_AA= + static_cast<double>(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB= + static_cast<double>(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB= + static_cast<double>(unzipped_data[i*3+2])/32768.0; + bgen_geno_prob_non_miss= + bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; + + //CHECK 0.1 OK. + if (bgen_geno_prob_non_miss<0.9) { + gsl_vector_set (genotype_miss, c_idv, 1); + n_miss++; + c_idv++; + continue; + } bgen_geno_prob_AA/=bgen_geno_prob_non_miss; bgen_geno_prob_AB/=bgen_geno_prob_non_miss; @@ -2255,7 +2508,7 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs gsl_vector_set (genotype, c_idv, geno); - // CHECK WHAT THIS DOES + // CHECK WHAT THIS DOES. if (flag_poly==0) {geno_old=geno; flag_poly=2;} if (flag_poly==2 && geno!=geno_old) {flag_poly=1;} @@ -2266,23 +2519,39 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs maf/=2.0*static_cast<double>(ni_test-n_miss); - SNPINFO sInfo={chr, rs, -9, b_pos, minor, major, n_miss, (double)n_miss/(double)ni_test, maf}; + SNPINFO sInfo={chr, rs, -9, b_pos, minor, major, n_miss, + (double)n_miss/(double)ni_test, maf}; snpInfo.push_back(sInfo); - if ( (double)n_miss/(double)ni_test > miss_level) {indicator_snp.push_back(0); continue;} + if ( (double)n_miss/(double)ni_test > miss_level) { + indicator_snp.push_back(0); + continue; + } - if ( (maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1 ) {indicator_snp.push_back(0); continue;} + if ((maf<maf_level || maf> (1.0-maf_level)) && maf_level!=-1) { + indicator_snp.push_back(0); + continue; + } - if (flag_poly!=1) {indicator_snp.push_back(0); continue;} + if (flag_poly!=1) { + indicator_snp.push_back(0); + continue; + } if (hwe_level!=0 && maf_level!=-1) { - if (CalcHWE(n_0, n_2, n_1)<hwe_level) {indicator_snp.push_back(0); continue;} + if (CalcHWE(n_0, n_2, n_1)<hwe_level) { + indicator_snp.push_back(0); + continue; + } } - //filter SNP if it is correlated with W - //unless W has only one column, of 1s + // Filter SNP if it is correlated with W + // unless W has only one column, of 1s. for (size_t i=0; i<genotype->size; ++i) { - if (gsl_vector_get (genotype_miss, i)==1) {geno=maf*2.0; gsl_vector_set (genotype, i, geno);} + if (gsl_vector_get (genotype_miss, i)==1) { + geno=maf*2.0; + gsl_vector_set (genotype, i, geno); + } } gsl_blas_dgemv (CblasTrans, 1.0, W, genotype, 0.0, Wtx); @@ -2290,30 +2559,29 @@ bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, const gs gsl_blas_ddot (genotype, genotype, &v_x); gsl_blas_ddot (Wtx, WtWiWtx, &v_w); - if (W->size2!=1 && v_w/v_x >= r2_level) {indicator_snp.push_back(0); continue;} + if (W->size2!=1 && v_w/v_x >= r2_level) { + indicator_snp.push_back(0); continue;} indicator_snp.push_back(1); ns_test++; } - - - return true; - } - -//read oxford genotype file and calculate kinship matrix -bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin) -{ +// Read oxford genotype file and calculate kinship matrix. +bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, + const int k_mode, const int display_pace, + gsl_matrix *matrix_kin) { string file_bgen=file_oxford; ifstream infile (file_bgen.c_str(), ios::binary); - if (!infile) {cout<<"error reading bgen file:"<<file_bgen<<endl; return false;} - + if (!infile) { + cout<<"error reading bgen file:"<<file_bgen<<endl; + return false; + } - // read in header + // Read in header. uint32_t bgen_snp_block_offset; uint32_t bgen_header_length; uint32_t bgen_nsamples; @@ -2331,11 +2599,11 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k infile.read(reinterpret_cast<char*>(&bgen_flags),4); bgen_snp_block_offset-=4; bool CompressedSNPBlocks=bgen_flags&0x1; -// bool LongIds=bgen_flags&0x4; infile.ignore(bgen_snp_block_offset); - double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB, bgen_geno_prob_non_miss; + double bgen_geno_prob_AA, bgen_geno_prob_AB; + double bgen_geno_prob_BB, bgen_geno_prob_non_miss; uint32_t bgen_N; uint16_t bgen_LS; @@ -2353,7 +2621,6 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k string chr; double genotype; - size_t n_miss; double d, geno_mean, geno_var; @@ -2364,7 +2631,9 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k size_t ns_test=0; for (size_t t=0; t<indicator_snp.size(); ++t) { - if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (t%display_pace==0 || t==(indicator_snp.size()-1)) { + ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1); + } id.clear(); rs.clear(); @@ -2396,74 +2665,76 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k infile.read(reinterpret_cast<char*>(&bgen_LB),4); bgen_B_allele.resize(bgen_LB); infile.read(&bgen_B_allele[0], bgen_LB); - - - - + uint16_t unzipped_data[3*bgen_N]; if (indicator_snp[t]==0) { if(CompressedSNPBlocks) - infile.read(reinterpret_cast<char*>(&bgen_P),4); + infile.read(reinterpret_cast<char*>(&bgen_P),4); else - bgen_P=6*bgen_N; + bgen_P=6*bgen_N; infile.ignore(static_cast<size_t>(bgen_P)); continue; } - - if(CompressedSNPBlocks) { - - - infile.read(reinterpret_cast<char*>(&bgen_P),4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size=6*bgen_N; - - infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); - - int result=uncompress(reinterpret_cast<Bytef*>(unzipped_data), reinterpret_cast<uLongf*>(&unzipped_data_size), reinterpret_cast<Bytef*>(zipped_data), static_cast<uLong> (bgen_P)); + infile.read(reinterpret_cast<char*>(&bgen_P),4); + uint8_t zipped_data[bgen_P]; + + unzipped_data_size=6*bgen_N; + + infile.read(reinterpret_cast<char*>(zipped_data),bgen_P); + + int result= + uncompress(reinterpret_cast<Bytef*>(unzipped_data), + reinterpret_cast<uLongf*>(&unzipped_data_size), + reinterpret_cast<Bytef*>(zipped_data), + static_cast<uLong> (bgen_P)); assert(result == Z_OK); } else { - - bgen_P=6*bgen_N; - infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); + + bgen_P=6*bgen_N; + infile.read(reinterpret_cast<char*>(unzipped_data),bgen_P); } - - geno_mean=0.0; n_miss=0; geno_var=0.0; gsl_vector_set_all(geno_miss, 0); for (size_t i=0; i<bgen_N; ++i) { - - bgen_geno_prob_AA=static_cast<double>(unzipped_data[i*3])/32768.0; - bgen_geno_prob_AB=static_cast<double>(unzipped_data[i*3+1])/32768.0; - bgen_geno_prob_BB=static_cast<double>(unzipped_data[i*3+2])/32768.0; - // WJA - bgen_geno_prob_non_miss=bgen_geno_prob_AA+bgen_geno_prob_AB+bgen_geno_prob_BB; - if (bgen_geno_prob_non_miss<0.9) {gsl_vector_set(geno_miss, i, 0.0); n_miss++;} - else { - - bgen_geno_prob_AA/=bgen_geno_prob_non_miss; - bgen_geno_prob_AB/=bgen_geno_prob_non_miss; - bgen_geno_prob_BB/=bgen_geno_prob_non_miss; - - genotype=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; - - gsl_vector_set(geno, i, genotype); - gsl_vector_set(geno_miss, i, 1.0); - geno_mean+=genotype; - geno_var+=genotype*genotype; - } + + bgen_geno_prob_AA= + static_cast<double>(unzipped_data[i*3])/32768.0; + bgen_geno_prob_AB= + static_cast<double>(unzipped_data[i*3+1])/32768.0; + bgen_geno_prob_BB= + static_cast<double>(unzipped_data[i*3+2])/32768.0; + // WJA + bgen_geno_prob_non_miss=bgen_geno_prob_AA + + bgen_geno_prob_AB+bgen_geno_prob_BB; + if (bgen_geno_prob_non_miss<0.9) { + gsl_vector_set(geno_miss, i, 0.0); + n_miss++; + } + else { + + bgen_geno_prob_AA/=bgen_geno_prob_non_miss; + bgen_geno_prob_AB/=bgen_geno_prob_non_miss; + bgen_geno_prob_BB/=bgen_geno_prob_non_miss; + + genotype=2.0*bgen_geno_prob_BB+bgen_geno_prob_AB; + + gsl_vector_set(geno, i, genotype); + gsl_vector_set(geno_miss, i, 1.0); + geno_mean+=genotype; + geno_var+=genotype*genotype; + } } @@ -2472,18 +2743,24 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_total; geno_var-=geno_mean*geno_mean; -// geno_var=geno_mean*(1-geno_mean*0.5); for (size_t i=0; i<ni_total; ++i) { - if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} + if (gsl_vector_get (geno_miss, i)==0) { + gsl_vector_set(geno, i, geno_mean); + } } gsl_vector_add_constant (geno, -1.0*geno_mean); if (geno_var!=0) { - if (k_mode==1) {gsl_blas_dsyr (CblasUpper, 1.0, geno, matrix_kin);} - else if (k_mode==2) {gsl_blas_dsyr (CblasUpper, 1.0/geno_var, geno, matrix_kin);} - else {cout<<"Unknown kinship mode."<<endl;} + if (k_mode==1) { + gsl_blas_dsyr(CblasUpper,1.0,geno,matrix_kin); + } else if (k_mode==2) { + gsl_blas_dsyr(CblasUpper,1.0/geno_var,geno,matrix_kin); + } + else { + cout<<"Unknown kinship mode."<<endl; + } } ns_test++; @@ -2508,42 +2785,23 @@ bool bgenKin (const string &file_oxford, vector<int> &indicator_snp, const int k return true; } - - - - - - - - - - - - - - - - - - - - - - -//read header to determine which column contains which item +// Read header to determine which column contains which item. bool ReadHeader_io (const string &line, HEADER &header) { - string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID","rsid","RSID","MarkerName"}; + string rs_ptr[]={"rs","RS","snp","SNP","snps","SNPS","snpid","SNPID", + "rsid","RSID","MarkerName"}; set<string> rs_set(rs_ptr, rs_ptr+11); string chr_ptr[]={"chr","CHR"}; set<string> chr_set(chr_ptr, chr_ptr+2); - string pos_ptr[]={"ps","PS","pos","POS","base_position","BASE_POSITION", "bp", "BP"}; + string pos_ptr[]={"ps","PS","pos","POS","base_position", + "BASE_POSITION", "bp", "BP"}; set<string> pos_set(pos_ptr, pos_ptr+8); string cm_ptr[]={"cm","CM"}; set<string> cm_set(cm_ptr, cm_ptr+2); string a1_ptr[]={"a1","A1","allele1","ALLELE1","Allele1","INC_ALLELE"}; set<string> a1_set(a1_ptr, a1_ptr+5); - string a0_ptr[]={"a0","A0","allele0","ALLELE0","Allele0","a2","A2","allele2","ALLELE2","Allele2","DEC_ALLELE"}; + string a0_ptr[]={"a0","A0","allele0","ALLELE0","Allele0","a2","A2", + "allele2","ALLELE2","Allele2","DEC_ALLELE"}; set<string> a0_set(a0_ptr, a0_ptr+10); string z_ptr[]={"z","Z","z_score","Z_SCORE","zscore","ZSCORE"}; @@ -2568,7 +2826,10 @@ bool ReadHeader_io (const string &line, HEADER &header) string ncontrol_ptr[]={"ncontrol","NCONTROL","n_control","N_CONTROL"}; set<string> ncontrol_set(ncontrol_ptr, ncontrol_ptr+4); - string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq","ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY","Freq.Allele1.HapMapCEU","FreqAllele1HapMapCEU", "Freq1.Hapmap"}; + string af_ptr[]={"af","AF","maf","MAF","f","F","allele_freq", + "ALLELE_FREQ","allele_frequency","ALLELE_FREQUENCY", + "Freq.Allele1.HapMapCEU","FreqAllele1HapMapCEU", + "Freq1.Hapmap"}; set<string> af_set(af_ptr, af_ptr+13); string var_ptr[]={"var","VAR"}; set<string> var_set(var_ptr, var_ptr+2); @@ -2578,7 +2839,13 @@ bool ReadHeader_io (const string &line, HEADER &header) string cor_ptr[]={"cor","COR","r","R"}; set<string> cor_set(cor_ptr, cor_ptr+4); - header.rs_col=0; header.chr_col=0; header.pos_col=0; header.cm_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; header.p_col=0; header.n_col=0; header.nmis_col=0; header.nobs_col=0; header.ncase_col=0; header.ncontrol_col=0; header.af_col=0; header.var_col=0; header.ws_col=0; header.cor_col=0; header.coln=0; + header.rs_col=0; header.chr_col=0; header.pos_col=0; + header.cm_col=0; header.a1_col=0; header.a0_col=0; header.z_col=0; + header.beta_col=0; header.sebeta_col=0; header.chisq_col=0; + header.p_col=0; header.n_col=0; header.nmis_col=0; + header.nobs_col=0; header.ncase_col=0; header.ncontrol_col=0; + header.af_col=0; header.var_col=0; header.ws_col=0; + header.cor_col=0; header.coln=0; char *ch_ptr; string type; @@ -2588,50 +2855,147 @@ bool ReadHeader_io (const string &line, HEADER &header) while (ch_ptr!=NULL) { type=ch_ptr; if (rs_set.count(type)!=0) { - if (header.rs_col==0) {header.rs_col=header.coln+1;} else {cout<<"error! more than two rs columns in the file."<<endl; n_error++;} + if (header.rs_col==0) { + header.rs_col=header.coln+1; + } else { + cout<<"error! more than two rs columns in the file."<<endl; + n_error++; + } } else if (chr_set.count(type)!=0) { - if (header.chr_col==0) {header.chr_col=header.coln+1;} else {cout<<"error! more than two chr columns in the file."<<endl; n_error++;} + if (header.chr_col==0) { + header.chr_col=header.coln+1; + } else { + cout<<"error! more than two chr columns in the file."<<endl; + n_error++; + } } else if (pos_set.count(type)!=0) { - if (header.pos_col==0) {header.pos_col=header.coln+1;} else {cout<<"error! more than two pos columns in the file."<<endl; n_error++;} + if (header.pos_col==0) { + header.pos_col=header.coln+1; + } else { + cout<<"error! more than two pos columns in the file."<<endl; + n_error++; + } } else if (cm_set.count(type)!=0) { - if (header.cm_col==0) {header.cm_col=header.coln+1;} else {cout<<"error! more than two cm columns in the file."<<endl; n_error++;} + if (header.cm_col==0) { + header.cm_col=header.coln+1; + } else { + cout<<"error! more than two cm columns in the file."<<endl; + n_error++; + } } else if (a1_set.count(type)!=0) { - if (header.a1_col==0) {header.a1_col=header.coln+1;} else {cout<<"error! more than two allele1 columns in the file."<<endl; n_error++;} + if (header.a1_col==0) { + header.a1_col=header.coln+1; + } else { + cout<<"error! more than two allele1 columns in the file."<<endl; + n_error++; + } } else if (a0_set.count(type)!=0) { - if (header.a0_col==0) {header.a0_col=header.coln+1;} else {cout<<"error! more than two allele0 columns in the file."<<endl; n_error++;} + if (header.a0_col==0) { + header.a0_col=header.coln+1; + } else { + cout<<"error! more than two allele0 columns in the file."<<endl; + n_error++; + } } else if (z_set.count(type)!=0) { - if (header.z_col==0) {header.z_col=header.coln+1;} else {cout<<"error! more than two z columns in the file."<<endl; n_error++;} + if (header.z_col==0) { + header.z_col=header.coln+1; + } else { + cout<<"error! more than two z columns in the file."<<endl; + n_error++; + } } else if (beta_set.count(type)!=0) { - if (header.beta_col==0) {header.beta_col=header.coln+1;} else {cout<<"error! more than two beta columns in the file."<<endl; n_error++;} + if (header.beta_col==0) { + header.beta_col=header.coln+1; + } else { + cout<<"error! more than two beta columns in the file."<<endl; + n_error++; + } } else if (sebeta_set.count(type)!=0) { - if (header.sebeta_col==0) {header.sebeta_col=header.coln+1;} else {cout<<"error! more than two se_beta columns in the file."<<endl; n_error++;} + if (header.sebeta_col==0) { + header.sebeta_col=header.coln+1; + } else { + cout<<"error! more than two se_beta columns in the file."<<endl; + n_error++; + } } else if (chisq_set.count(type)!=0) { - if (header.chisq_col==0) {header.chisq_col=header.coln+1;} else {cout<<"error! more than two z columns in the file."<<endl; n_error++;} + if (header.chisq_col==0) { + header.chisq_col=header.coln+1; + } else { + cout<<"error! more than two z columns in the file."<<endl; + n_error++; + } } else if (p_set.count(type)!=0) { - if (header.p_col==0) {header.p_col=header.coln+1;} else {cout<<"error! more than two p columns in the file."<<endl; n_error++;} + if (header.p_col==0) { + header.p_col=header.coln+1; + } else { + cout<<"error! more than two p columns in the file."<<endl; + n_error++; + } } else if (n_set.count(type)!=0) { - if (header.n_col==0) {header.n_col=header.coln+1;} else {cout<<"error! more than two n_total columns in the file."<<endl; n_error++;} + if (header.n_col==0) { + header.n_col=header.coln+1; + } else { + cout<<"error! more than two n_total columns in the file."<<endl; + n_ + error++;} } else if (nmis_set.count(type)!=0) { - if (header.nmis_col==0) {header.nmis_col=header.coln+1;} else {cout<<"error! more than two n_mis columns in the file."<<endl; n_error++;} + if (header.nmis_col==0) {header.nmis_col=header.coln+1;} else { + cout<<"error! more than two n_mis columns in the file."<<endl; + n_error++; + } } else if (nobs_set.count(type)!=0) { - if (header.nobs_col==0) {header.nobs_col=header.coln+1;} else {cout<<"error! more than two n_obs columns in the file."<<endl; n_error++;} + if (header.nobs_col==0) { + header.nobs_col=header.coln+1; + } else { + cout<<"error! more than two n_obs columns in the file."<<endl; + n_error++; + } } else if (ncase_set.count(type)!=0) { - if (header.ncase_col==0) {header.ncase_col=header.coln+1;} else {cout<<"error! more than two n_case columns in the file."<<endl; n_error++;} + if (header.ncase_col==0) { + header.ncase_col=header.coln+1; + } else { + cout<<"error! more than two n_case columns in the file."<<endl; + n_error++; + } } else if (ncontrol_set.count(type)!=0) { - if (header.ncontrol_col==0) {header.ncontrol_col=header.coln+1;} else {cout<<"error! more than two n_control columns in the file."<<endl; n_error++;} + if (header.ncontrol_col==0) { + header.ncontrol_col=header.coln+1; + } else { + cout<<"error! more than two n_control columns in the file."<<endl; + n_error++; + } } else if (ws_set.count(type)!=0) { - if (header.ws_col==0) {header.ws_col=header.coln+1;} else {cout<<"error! more than two window_size columns in the file."<<endl; n_error++;} + if (header.ws_col==0) { + header.ws_col=header.coln+1; + } else { + cout<<"error! more than two window_size columns in the file."<<endl; + n_error++; + } } else if (af_set.count(type)!=0) { - if (header.af_col==0) {header.af_col=header.coln+1;} else {cout<<"error! more than two af columns in the file."<<endl; n_error++;} + if (header.af_col==0) { + header.af_col=header.coln+1; + } else { + cout<<"error! more than two af columns in the file."<<endl; + n_error++; + } } else if (cor_set.count(type)!=0) { - if (header.cor_col==0) {header.cor_col=header.coln+1;} else {cout<<"error! more than two cor columns in the file."<<endl; n_error++;} + if (header.cor_col==0) { + header.cor_col=header.coln+1; + } else { + cout<<"error! more than two cor columns in the file."<<endl; + n_error++; + } } else { string str = ch_ptr; string cat = str.substr(str.size()-2, 2); - // continuous + if(cat == "_c" || cat =="_C"){ + + // continuous header.catc_col.insert(header.coln+1); - } else { //discrete + } else { + + // discrete header.catd_col.insert(header.coln+1); } } @@ -2640,7 +3004,10 @@ bool ReadHeader_io (const string &line, HEADER &header) header.coln++; } - if (header.cor_col!=0 && header.cor_col!=header.coln) {cout<<"error! the cor column should be the last column."<<endl; n_error++;} + if (header.cor_col!=0 && header.cor_col!=header.coln) { + cout<<"error! the cor column should be the last column."<<endl; + n_error++; + } if (header.rs_col==0) { if (header.chr_col!=0 && header.pos_col!=0) { @@ -2650,34 +3017,38 @@ bool ReadHeader_io (const string &line, HEADER &header) } } - if (n_error==0) {return true;} else {return false;} + if (n_error==0) { + return true; + } else { + return false; + } } - - - -//read category file, record mapRS2in -//the category file does not contain a null category -//so if a snp has 0 for all categories, then it is not included in the analysis -bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_t &n_vc) -{ +// Read category file, record mapRS2 in the category file does not +// contain a null category so if a snp has 0 for all categories, then +// it is not included in the analysis. +bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, + size_t &n_vc) { mapRS2cat.clear(); igzstream infile (file_cat.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open category file: "<<file_cat<<endl; return false;} + if (!infile) { + cout<<"error! fail to open category file: "<<file_cat<<endl; + return false; + } string line; char *ch_ptr; string rs, chr, a1, a0, pos, cm; - size_t i_cat;// ns_vc=0; + size_t i_cat; - //read header + // Read header. HEADER header; !safeGetline(infile, line).eof(); ReadHeader_io (line, header); - //use the header to count the number of categories + // Use the header to count the number of categories. n_vc=header.coln; if (header.rs_col!=0) {n_vc--;} if (header.chr_col!=0) {n_vc--;} @@ -2686,7 +3057,7 @@ bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_ if (header.a1_col!=0) {n_vc--;} if (header.a0_col!=0) {n_vc--;} - //read the following lines to record mapRS2cat + // Read the following lines to record mapRS2cat. while (!safeGetline(infile, line).eof()) { ch_ptr=strtok ((char *)line.c_str(), " , \t"); @@ -2717,27 +3088,23 @@ bool ReadFile_cat (const string &file_cat, map<string, size_t> &mapRS2cat, size_ ch_ptr=strtok (NULL, " , \t"); } - - //if (mapRS2cat.count(rs)==0) {mapRS2cat[rs]=n_vc+1; ns_vc++;} } - //if (ns_vc>0) {n_vc++;} - infile.clear(); infile.close(); return true; } - - - -bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, size_t &n_vc) -{ +bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, + size_t &n_vc) { mapRS2cat.clear(); igzstream infile (file_mcat.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open mcategory file: "<<file_mcat<<endl; return false;} + if (!infile) { + cout<<"error! fail to open mcategory file: "<<file_mcat<<endl; + return false; + } string file_name; map<string, size_t> mapRS2cat_tmp; @@ -2754,125 +3121,22 @@ bool ReadFile_mcat (const string &file_mcat, map<string, size_t> &mapRS2cat, siz return true; } - - - - -/* -//read the continuous category file, record mapR2catc -bool ReadFile_catc (const string &file_cat, map<string, vector<double> > &mapRS2catc, size_t &n_cat) -{ - mapRS2catc.clear(); - - igzstream infile (file_cat.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open category file: "<<file_cat<<endl; return false;} - - string line; - char *ch_ptr; - - string rs, chr, a1, a0, pos, cm; - size_t i_cat;// ns_vc=0; - - //read header - HEADER header; - !safeGetline(infile, line).eof(); - ReadHeader_io (line, header); - - //use the header to count the number of categories - n_cat=header.coln; - if (header.rs_col!=0) {n_cat--;} - if (header.chr_col!=0) {n_cat--;} - if (header.pos_col!=0) {n_cat--;} - if (header.cm_col!=0) {n_cat--;} - if (header.a1_col!=0) {n_cat--;} - if (header.a0_col!=0) {n_cat--;} - - //set up continous category - vector<double> catc; - for (size_t i=0; i<n_cat; i++) { - catc.push_back(0); - } - - //read the following lines to record mapRS2cat - while (!safeGetline(infile, line).eof()) { - ch_ptr=strtok ((char *)line.c_str(), " , \t"); - - i_cat=0; - if (header.rs_col==0) { - rs=chr+":"+pos; - } - - for (size_t i=0; i<header.coln; i++) { - if (header.rs_col!=0 && header.rs_col==i+1) { - rs=ch_ptr; - } else if (header.chr_col!=0 && header.chr_col==i+1) { - chr=ch_ptr; - } else if (header.pos_col!=0 && header.pos_col==i+1) { - pos=ch_ptr; - } else if (header.cm_col!=0 && header.cm_col==i+1) { - cm=ch_ptr; - } else if (header.a1_col!=0 && header.a1_col==i+1) { - a1=ch_ptr; - } else if (header.a0_col!=0 && header.a0_col==i+1) { - a0=ch_ptr; - } else { - catc[i_cat]=atof(ch_ptr); - i_cat++; - } - - ch_ptr=strtok (NULL, " , \t"); - } - - if (mapRS2catc.count(rs)==0) {mapRS2catc[rs]=catc;} - - //if (mapRS2cat.count(rs)==0) {mapRS2cat[rs]=n_vc+1; ns_vc++;} - } - - //if (ns_vc>0) {n_vc++;} - - infile.clear(); - infile.close(); - - return true; -} - - - - -bool ReadFile_mcatc (const string &file_mcat, map<string, vector<double> > &mapRS2catc, size_t &n_cat) -{ - mapRS2catc.clear(); - - igzstream infile (file_mcat.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open mcategory file: "<<file_mcat<<endl; return false;} - - string file_name; - map<string, vector<double> > mapRS2catc_tmp; - size_t n_cat_tmp, t=0; - - while (!safeGetline(infile, file_name).eof()) { - mapRS2catc_tmp.clear(); - ReadFile_catc (file_name, mapRS2catc_tmp, n_cat_tmp); - mapRS2catc.insert(mapRS2catc_tmp.begin(), mapRS2catc_tmp.end()); - if (t==0) {n_cat=n_cat_tmp;} - if (n_cat!=n_cat_tmp) {cout<<"number of category differs in different mcatc files."<<endl;;} - - t++; - } - - return true; -} -*/ - - - - -//read bimbam mean genotype file and calculate kinship matrix; this time, the kinship matrix is not centered, and can contain multiple K matrix -bool BimbamKin (const string &file_geno, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) -{ +// Read bimbam mean genotype file and calculate kinship matrix; this +// time, the kinship matrix is not centered, and can contain multiple +// K matrix. +bool BimbamKin (const string &file_geno, const int display_pace, + const vector<int> &indicator_idv, + const vector<int> &indicator_snp, + const map<string, double> &mapRS2weight, + const map<string, size_t> &mapRS2cat, + const vector<SNPINFO> &snpInfo, + const gsl_matrix *W, gsl_matrix *matrix_kin, + gsl_vector *vector_ns) { igzstream infile (file_geno.c_str(), igzstream::in); - //ifstream infile (file_geno.c_str(), ifstream::in); - if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return false;} + if (!infile) { + cout<<"error reading genotype file:"<<file_geno<<endl; + return false; + } string line; char *ch_ptr; @@ -2902,7 +3166,7 @@ bool BimbamKin (const string &file_geno, const int display_pace, const vector<in ns_vec.push_back(0); } - //create a large matrix + // Create a large matrix. size_t msize=10000; gsl_matrix *Xlarge=gsl_matrix_alloc (ni_test, msize*n_vc); gsl_matrix_set_zero(Xlarge); @@ -2910,14 +3174,16 @@ bool BimbamKin (const string &file_geno, const int display_pace, const vector<in size_t ns_test=0; for (size_t t=0; t<indicator_snp.size(); ++t) { !safeGetline(infile, line).eof(); - if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (t%display_pace==0 || t==(indicator_snp.size()-1)) { + ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1); + } if (indicator_snp[t]==0) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); ch_ptr=strtok (NULL, " , \t"); ch_ptr=strtok (NULL, " , \t"); - rs=snpInfo[t].rs_number;//this line is new + rs=snpInfo[t].rs_number; // This line is new. geno_mean=0.0; n_miss=0; geno_var=0.0; gsl_vector_set_all(geno_miss, 0); @@ -2926,13 +3192,15 @@ bool BimbamKin (const string &file_geno, const int display_pace, const vector<in for (size_t i=0; i<indicator_idv.size(); ++i) { if (indicator_idv[i]==0) {continue;} ch_ptr=strtok (NULL, " , \t"); - if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(geno_miss, i, 0); n_miss++;} + if (strcmp(ch_ptr, "NA")==0) { + gsl_vector_set(geno_miss, i, 0); n_miss++; + } else { - d=atof(ch_ptr); - gsl_vector_set (geno, j, d); - gsl_vector_set (geno_miss, j, 1); - geno_mean+=d; - geno_var+=d*d; + d=atof(ch_ptr); + gsl_vector_set (geno, j, d); + gsl_vector_set (geno_miss, j, 1); + geno_mean+=d; + geno_var+=d*d; } j++; } @@ -2941,10 +3209,11 @@ bool BimbamKin (const string &file_geno, const int display_pace, const vector<in geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_test; geno_var-=geno_mean*geno_mean; -// geno_var=geno_mean*(1-geno_mean*0.5); for (size_t i=0; i<ni_test; ++i) { - if (gsl_vector_get (geno_miss, i)==0) {gsl_vector_set(geno, i, geno_mean);} + if (gsl_vector_get (geno_miss, i)==0) { + gsl_vector_set(geno, i, geno_mean); + } } gsl_vector_add_constant (geno, -1.0*geno_mean); @@ -2955,48 +3224,43 @@ bool BimbamKin (const string &file_geno, const int display_pace, const vector<in gsl_blas_ddot (geno, geno, &geno_var); geno_var/=(double)ni_test; - if (geno_var!=0 && (mapRS2weight.size()==0 || mapRS2weight.count(rs)!=0) ) { + if (geno_var!=0 && (mapRS2weight.size()==0 || + mapRS2weight.count(rs)!=0)) { if (mapRS2weight.size()==0) { d=1.0/geno_var; } else { d=mapRS2weight.at(rs)/geno_var; } - /* - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, d, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { - i_vc=mapRS2cat.at(rs); - ns_vec[i_vc]++; - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, d, geno, &kin_sub.matrix); - //eigenlib_dsyr (1.0, geno, matrix_kin); - } - */ - gsl_vector_scale (geno, sqrt(d)); if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_vec[0]%msize); + gsl_vector_view Xlarge_col= + gsl_matrix_column(Xlarge,ns_vec[0]%msize); gsl_vector_memcpy (&Xlarge_col.vector, geno); ns_vec[0]++; if (ns_vec[0]%msize==0) { - eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + eigenlib_dgemm("N","T",1.0,Xlarge,Xlarge,1.0,matrix_kin); gsl_matrix_set_zero(Xlarge); } } else if (mapRS2cat.count(rs)!=0) { i_vc=mapRS2cat.at(rs); - gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, msize*i_vc+ns_vec[i_vc]%msize); + gsl_vector_view Xlarge_col= + gsl_matrix_column(Xlarge,msize*i_vc+ns_vec[i_vc]%msize); gsl_vector_memcpy (&Xlarge_col.vector, geno); ns_vec[i_vc]++; if (ns_vec[i_vc]%msize==0) { - gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + gsl_matrix_view X_sub= + gsl_matrix_submatrix(Xlarge,0,msize*i_vc, + ni_test,msize); + gsl_matrix_view kin_sub= + gsl_matrix_submatrix(matrix_kin,0,ni_test*i_vc, + ni_test,ni_test); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, + &X_sub.matrix, 1.0, &kin_sub.matrix); gsl_matrix_set_zero(&X_sub.matrix); } @@ -3009,9 +3273,13 @@ bool BimbamKin (const string &file_geno, const int display_pace, const vector<in for (size_t i_vc=0; i_vc<n_vc; i_vc++) { if (ns_vec[i_vc]%msize!=0) { - gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + gsl_matrix_view X_sub= + gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); + gsl_matrix_view kin_sub= + gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, + ni_test); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, + 1.0, &kin_sub.matrix); } } @@ -3047,16 +3315,19 @@ bool BimbamKin (const string &file_geno, const int display_pace, const vector<in return true; } - - - - - - -bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> &indicator_idv, const vector<int> &indicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<SNPINFO> &snpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) -{ +bool PlinkKin (const string &file_bed, const int display_pace, + const vector<int> &indicator_idv, + const vector<int> &indicator_snp, + const map<string, double> &mapRS2weight, + const map<string, size_t> &mapRS2cat, + const vector<SNPINFO> &snpInfo, + const gsl_matrix *W, gsl_matrix *matrix_kin, + gsl_vector *vector_ns) { ifstream infile (file_bed.c_str(), ios::binary); - if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return false;} + if (!infile) { + cout<<"error reading bed file:"<<file_bed<<endl; + return false; + } char ch[1]; bitset<8> b; @@ -3089,58 +3360,68 @@ bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> ns_vec.push_back(0); } - //create a large matrix + // Create a large matrix. size_t msize=10000; gsl_matrix *Xlarge=gsl_matrix_alloc (ni_test, msize*n_vc); gsl_matrix_set_zero(Xlarge); - //calculate n_bit and c, the number of bit for each snp + // Calculate n_bit and c, the number of bit for each SNP. if (ni_total%4==0) {n_bit=ni_total/4;} else {n_bit=ni_total/4+1; } - //print the first three majic numbers + // Print the first three magic numbers. for (int i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; } for (size_t t=0; t<indicator_snp.size(); ++t) { - if (t%display_pace==0 || t==(indicator_snp.size()-1)) {ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1);} + if (t%display_pace==0 || t==(indicator_snp.size()-1)) { + ProgressBar ("Reading SNPs ", t, indicator_snp.size()-1); + } if (indicator_snp[t]==0) {continue;} - infile.seekg(t*n_bit+3); //n_bit, and 3 is the number of magic numbers + // n_bit, and 3 is the number of magic numbers + infile.seekg(t*n_bit+3); - rs=snpInfo[t].rs_number;//this line is new + rs=snpInfo[t].rs_number; // This line is new. - //read genotypes + // Read genotypes. geno_mean=0.0; n_miss=0; ci_total=0; geno_var=0.0; ci_test=0; for (int i=0; i<n_bit; ++i) { - infile.read(ch,1); - b=ch[0]; - for (size_t j=0; j<4; ++j) { //minor allele homozygous: 2.0; major: 0.0; - if ((i==(n_bit-1)) && ci_total==ni_total) {break;} - if (indicator_idv[ci_total]==0) {ci_total++; continue;} - - if (b[2*j]==0) { - if (b[2*j+1]==0) {gsl_vector_set(geno, ci_test, 2.0); geno_mean+=2.0; geno_var+=4.0; } - else {gsl_vector_set(geno, ci_test, 1.0); geno_mean+=1.0; geno_var+=1.0;} - } - else { - if (b[2*j+1]==1) {gsl_vector_set(geno, ci_test, 0.0); } - else {gsl_vector_set(geno, ci_test, -9.0); n_miss++; } - } - - ci_test++; - ci_total++; - } + infile.read(ch,1); + b=ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0; + for (size_t j=0; j<4; ++j) { + if ((i==(n_bit-1)) && ci_total==ni_total) {break;} + if (indicator_idv[ci_total]==0) {ci_total++; continue;} + + if (b[2*j]==0) { + if (b[2*j+1]==0) { + gsl_vector_set(geno, ci_test, 2.0); + geno_mean+=2.0; geno_var+=4.0; + } + else { + gsl_vector_set(geno, ci_test, 1.0); + geno_mean+=1.0; + geno_var+=1.0; + } + } + else { + if (b[2*j+1]==1) {gsl_vector_set(geno, ci_test, 0.0); } + else {gsl_vector_set(geno, ci_test, -9.0); n_miss++; } + } + + ci_test++; + ci_total++; + } } - geno_mean/=(double)(ni_test-n_miss); geno_var+=geno_mean*geno_mean*(double)n_miss; geno_var/=(double)ni_test; geno_var-=geno_mean*geno_mean; -// geno_var=geno_mean*(1-geno_mean*0.5); for (size_t i=0; i<ni_test; ++i) { d=gsl_vector_get(geno,i); @@ -3155,47 +3436,43 @@ bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> gsl_blas_ddot (geno, geno, &geno_var); geno_var/=(double)ni_test; - if (geno_var!=0 && (mapRS2weight.size()==0 || mapRS2weight.count(rs)!=0) ) { + if (geno_var!=0 && (mapRS2weight.size()==0 || + mapRS2weight.count(rs)!=0)) { if (mapRS2weight.size()==0) { d=1.0/geno_var; } else { d=mapRS2weight.at(rs)/geno_var; } - /* - if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_blas_dsyr (CblasUpper, d, geno, matrix_kin); - ns_vec[0]++; - } else if (mapRS2cat.count(rs)!=0) { - i_vc=mapRS2cat.at(rs); - ns_vec[i_vc]++; - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - gsl_blas_dsyr (CblasUpper, d, geno, &kin_sub.matrix); - } - */ - gsl_vector_scale (geno, sqrt(d)); if (n_vc==1 || mapRS2cat.size()==0 ) { - gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, ns_vec[0]%msize); + gsl_vector_view Xlarge_col= + gsl_matrix_column (Xlarge, ns_vec[0]%msize); gsl_vector_memcpy (&Xlarge_col.vector, geno); ns_vec[0]++; if (ns_vec[0]%msize==0) { - eigenlib_dgemm ("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + eigenlib_dgemm("N","T",1.0,Xlarge,Xlarge,1.0,matrix_kin); gsl_matrix_set_zero(Xlarge); } } else if (mapRS2cat.count(rs)!=0) { i_vc=mapRS2cat.at(rs); - gsl_vector_view Xlarge_col=gsl_matrix_column (Xlarge, msize*i_vc+ns_vec[i_vc]%msize); + gsl_vector_view Xlarge_col= + gsl_matrix_column(Xlarge,msize*i_vc+ns_vec[i_vc]%msize); gsl_vector_memcpy (&Xlarge_col.vector, geno); ns_vec[i_vc]++; if (ns_vec[i_vc]%msize==0) { - gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + gsl_matrix_view X_sub= + gsl_matrix_submatrix(Xlarge,0,msize*i_vc,ni_test, + msize); + gsl_matrix_view kin_sub= + gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, + ni_test, ni_test); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, + &X_sub.matrix, 1.0, &kin_sub.matrix); gsl_matrix_set_zero(&X_sub.matrix); } @@ -3208,9 +3485,13 @@ bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> for (size_t i_vc=0; i_vc<n_vc; i_vc++) { if (ns_vec[i_vc]%msize!=0) { - gsl_matrix_view X_sub=gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); - gsl_matrix_view kin_sub=gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, ni_test, ni_test); - eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); + gsl_matrix_view X_sub= + gsl_matrix_submatrix(Xlarge, 0, msize*i_vc, ni_test, msize); + gsl_matrix_view kin_sub= + gsl_matrix_submatrix(matrix_kin, 0, ni_test*i_vc, + ni_test, ni_test); + eigenlib_dgemm ("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, + 1.0, &kin_sub.matrix); } } @@ -3245,16 +3526,23 @@ bool PlinkKin (const string &file_bed, const int display_pace, const vector<int> return true; } - - -bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int display_pace, const vector<int> &indicator_idv, const vector<vector<int> > &mindicator_snp, const map<string, double> &mapRS2weight, const map<string, size_t> &mapRS2cat, const vector<vector<SNPINFO> > &msnpInfo, const gsl_matrix *W, gsl_matrix *matrix_kin, gsl_vector *vector_ns) -{ +bool MFILEKin (const size_t mfile_mode, const string &file_mfile, + const int display_pace, const vector<int> &indicator_idv, + const vector<vector<int> > &mindicator_snp, + const map<string, double> &mapRS2weight, + const map<string, size_t> &mapRS2cat, + const vector<vector<SNPINFO> > &msnpInfo, + const gsl_matrix *W, gsl_matrix *matrix_kin, + gsl_vector *vector_ns) { size_t n_vc=vector_ns->size, ni_test=matrix_kin->size1; gsl_matrix_set_zero(matrix_kin); gsl_vector_set_zero(vector_ns); igzstream infile (file_mfile.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open mfile file: "<<file_mfile<<endl; return false;} + if (!infile) { + cout<<"error! fail to open mfile file: "<<file_mfile<<endl; + return false; + } string file_name; @@ -3273,11 +3561,11 @@ bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int disp } else { BimbamKin (file_name, display_pace, indicator_idv, mindicator_snp[l], mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); } - - //add ns + + // Add ns. gsl_vector_add(vector_ns, ns_tmp); - //add kin + // Add kin. for (size_t t=0; t<n_vc; t++) { for (size_t i=0; i<ni_test; ++i) { for (size_t j=0; j<=i; ++j) { @@ -3291,11 +3579,12 @@ bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int disp l++; } - //renormalize kin + // Renormalize kin. for (size_t t=0; t<n_vc; t++) { for (size_t i=0; i<ni_test; ++i) { for (size_t j=0; j<=i; ++j) { - d=gsl_matrix_get (matrix_kin, j, i+ni_test*t)/gsl_vector_get(vector_ns, t); + d=gsl_matrix_get (matrix_kin, j, i+ni_test*t)/ + gsl_vector_get(vector_ns, t); gsl_matrix_set (matrix_kin, i, j+ni_test*t, d); gsl_matrix_set (matrix_kin, j, i+ni_test*t, d); @@ -3315,15 +3604,16 @@ bool MFILEKin (const size_t mfile_mode, const string &file_mfile, const int disp } - - -//read var file, store mapRS2wsnp -bool ReadFile_wsnp (const string &file_wsnp, map<string, double> &mapRS2weight) -{ +// Read var file, store mapRS2wsnp. +bool ReadFile_wsnp (const string &file_wsnp, + map<string, double> &mapRS2weight) { mapRS2weight.clear(); igzstream infile (file_wsnp.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open snp weight file: "<<file_wsnp<<endl; return false;} + if (!infile) { + cout<<"error! fail to open snp weight file: "<<file_wsnp<<endl; + return false; + } char *ch_ptr; string line, rs; @@ -3340,12 +3630,15 @@ bool ReadFile_wsnp (const string &file_wsnp, map<string, double> &mapRS2weight) return true; } -bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vector<double> > &mapRS2wvector) -{ +bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, + map<string, vector<double> > &mapRS2wvector) { mapRS2wvector.clear(); igzstream infile (file_wcat.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open snp weight file: "<<file_wcat<<endl; return false;} + if (!infile) { + cout<<"error! fail to open snp weight file: "<<file_wcat<<endl; + return false; + } char *ch_ptr; vector<double> weight; @@ -3354,10 +3647,8 @@ bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vect } string line, rs, chr, a1, a0, pos, cm; - //double af=0, var_x=0; - //size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0; - - //read header + + // Read header. HEADER header; !safeGetline(infile, line).eof(); ReadHeader_io (line, header); @@ -3366,7 +3657,6 @@ bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vect if (isBlankLine(line)) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); - //n_total=0; n_mis=0; n_obs=0; n_case=0; n_control=0; n_case=0; af=0; var_x=0; size_t t=0; for (size_t i=0; i<header.coln; i++) { if (header.rs_col!=0 && header.rs_col==i+1) {rs=ch_ptr;} @@ -3375,22 +3665,23 @@ bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vect else if (header.cm_col!=0 && header.cm_col==i+1) {cm=ch_ptr; } else if (header.a1_col!=0 && header.a1_col==i+1) {a1=ch_ptr; } else if (header.a0_col!=0 && header.a0_col==i+1) {a0=ch_ptr; } - //else if (header.n_col!=0 && header.n_col==i+1) {n_total=atoi(ch_ptr); } - //else if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr); } - //else if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr); } - //else if (header.ncase_col!=0 && header.ncase_col==i+1) {n_case=atoi(ch_ptr); } - //else if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) {n_control=atoi(ch_ptr); } - //else if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr); } - //else if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr); } else { weight[t]=atof(ch_ptr); t++; - if (t>n_vc) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;} + if (t>n_vc) { + cout<<"error! Number of columns in the wcat file does not "<< + "match that of cat file."; + return false; + } } ch_ptr=strtok (NULL, " , \t"); } - if (t!=n_vc) {cout<<"error! Number of columns in the wcat file does not match that of cat file."; return false;} + if (t!=n_vc) { + cout<<"error! Number of columns in the wcat file does not "<< + "match that of cat file."; + return false; + } if (header.rs_col==0) { rs=chr+":"+pos; @@ -3402,25 +3693,28 @@ bool ReadFile_wsnp (const string &file_wcat, const size_t n_vc, map<string, vect return true; } - - - - - - - -//read the beta file, save snp z scores in to z2_score, and save category into indicator_snp based on mapRS2var and set, and indicator_snp record the category number (from 1 to n_vc), and provide var if maf/var is not provided in the beta file -//notice that indicator_snp contains ns_test snps, instead of ns_total snps -//read the beta file for the second time, compute q, and Vq based on block jacknife -//use the mapRS2var to select snps (and to ), calculate q -//do a block-wise jacknife, and compute Vq -void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2cat, const map<string, double> &mapRS2wA, vector<size_t> &vec_cat, vector<size_t> &vec_ni, vector<double> &vec_weight, vector<double> &vec_z2, size_t &ni_total, size_t &ns_total, size_t &ns_test) -{ +// Read the beta file, save snp z scores in to z2_score, and save +// category into indicator_snp based on mapRS2var and set, and +// indicator_snp record the category number (from 1 to n_vc), and +// provide var if maf/var is not provided in the beta file notice that +// indicator_snp contains ns_test snps, instead of ns_total snps read +// the beta file for the second time, compute q, and Vq based on block +// jacknife use the mapRS2var to select snps (and to ), calculate q do +// a block-wise jacknife, and compute Vq +void ReadFile_beta (const string &file_beta, + const map<string, size_t> &mapRS2cat, + const map<string, double> &mapRS2wA, + vector<size_t> &vec_cat, vector<size_t> &vec_ni, + vector<double> &vec_weight, vector<double> &vec_z2, + size_t &ni_total, size_t &ns_total, size_t &ns_test) { vec_cat.clear(); vec_ni.clear(); vec_weight.clear(); vec_z2.clear(); ni_total=0; ns_total=0; ns_test=0; igzstream infile (file_beta.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open beta file: "<<file_beta<<endl; return;} + if (!infile) { + cout<<"error! fail to open beta file: "<<file_beta<<endl; + return; + } string line; char *ch_ptr; @@ -3430,27 +3724,25 @@ void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2ca double z=0, beta=0, se_beta=0, chisq=0, pvalue=0, zsquare=0, af=0, var_x=0; size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0; - //read header + // Read header. HEADER header; !safeGetline(infile, line).eof(); ReadHeader_io (line, header); if (header.n_col==0 ) { - if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) { + if ( (header.nobs_col==0 && header.nmis_col==0) && + (header.ncase_col==0 && header.ncontrol_col==0) ) { cout<<"error! missing sample size in the beta file."<<endl; } else { cout<<"total sample size will be replaced by obs/mis sample size."<<endl; } } - if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0) && header.chisq_col==0 && header.p_col==0) { + if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0) && + header.chisq_col==0 && header.p_col==0) { cout<<"error! missing z scores in the beta file."<<endl; } - /* - if (header.af_col==0 && header.var_col==0) { - cout<<"error! missing allele frequency in the beta file."<<endl; - } - */ + while (!safeGetline(infile, line).eof()) { if (isBlankLine(line)) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); @@ -3467,7 +3759,9 @@ void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2ca if (header.z_col!=0 && header.z_col==i+1) {z=atof(ch_ptr);} if (header.beta_col!=0 && header.beta_col==i+1) {beta=atof(ch_ptr);} - if (header.sebeta_col!=0 && header.sebeta_col==i+1) {se_beta=atof(ch_ptr);} + if (header.sebeta_col!=0 && header.sebeta_col==i+1) { + se_beta=atof(ch_ptr); + } if (header.chisq_col!=0 && header.chisq_col==i+1) {chisq=atof(ch_ptr);} if (header.p_col!=0 && header.p_col==i+1) {pvalue=atof(ch_ptr);} @@ -3475,8 +3769,9 @@ void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2ca if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} if (header.ncase_col!=0 && header.ncase_col==i+1) {n_case=atoi(ch_ptr);} - if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) {n_control=atoi(ch_ptr);} - + if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) { + n_control=atoi(ch_ptr); + } if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} @@ -3495,7 +3790,8 @@ void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2ca } } - //both z values and beta/se_beta have directions, while chisq/pvalue do not + // Both z values and beta/se_beta have directions, while + // chisq/pvalue do not. if (header.z_col!=0) { zsquare=z*z; } else if (header.beta_col!=0 && header.sebeta_col!=0) { @@ -3507,13 +3803,14 @@ void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2ca zsquare=gsl_cdf_chisq_Qinv (pvalue, 1); } else {zsquare=0;} - //obtain var_x + // Obtain var_x. if (header.var_col==0 && header.af_col!=0) { var_x=2.0*af*(1.0-af); } - //if the snp is also present in cor file, then do calculations - if ( (mapRS2wA.size()==0 || mapRS2wA.count(rs)!=0) && (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) && zsquare!=0) { + // If the SNP is also present in cor file, then do calculations. + if ( (mapRS2wA.size()==0 || mapRS2wA.count(rs)!=0) && + (mapRS2cat.size()==0 || mapRS2cat.count(rs)!=0) && zsquare!=0) { if (mapRS2cat.size()!=0) { vec_cat.push_back(mapRS2cat.at(rs)); } else { @@ -3540,17 +3837,17 @@ void ReadFile_beta (const string &file_beta, const map<string, size_t> &mapRS2ca return; } - - - - - -void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA, map<string, string> &mapRS2A1, map<string, double> &mapRS2z) -{ +void ReadFile_beta (const string &file_beta, + const map<string, double> &mapRS2wA, + map<string, string> &mapRS2A1, + map<string, double> &mapRS2z) { mapRS2A1.clear(); mapRS2z.clear(); igzstream infile (file_beta.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open beta file: "<<file_beta<<endl; return;} + if (!infile) { + cout<<"error! fail to open beta file: "<<file_beta<<endl; + return; + } string line; char *ch_ptr; @@ -3561,13 +3858,14 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA size_t n_total=0, n_mis=0, n_obs=0, n_case=0, n_control=0; size_t ni_total=0, ns_total=0, ns_test=0; - //read header + // Read header. HEADER header; !safeGetline(infile, line).eof(); ReadHeader_io (line, header); if (header.n_col==0 ) { - if ( (header.nobs_col==0 && header.nmis_col==0) && (header.ncase_col==0 && header.ncontrol_col==0) ) { + if ((header.nobs_col==0 && header.nmis_col==0) && + (header.ncase_col==0 && header.ncontrol_col==0)) { cout<<"error! missing sample size in the beta file."<<endl; } else { cout<<"total sample size will be replaced by obs/mis sample size."<<endl; @@ -3577,11 +3875,7 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA if (header.z_col==0 && (header.beta_col==0 || header.sebeta_col==0)) { cout<<"error! missing z scores in the beta file."<<endl; } - /* - if (header.af_col==0 && header.var_col==0) { - cout<<"error! missing allele frequency in the beta file."<<endl; - } - */ + while (!safeGetline(infile, line).eof()) { if (isBlankLine(line)) {continue;} ch_ptr=strtok ((char *)line.c_str(), " , \t"); @@ -3598,7 +3892,9 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA if (header.z_col!=0 && header.z_col==i+1) {z=atof(ch_ptr);} if (header.beta_col!=0 && header.beta_col==i+1) {beta=atof(ch_ptr);} - if (header.sebeta_col!=0 && header.sebeta_col==i+1) {se_beta=atof(ch_ptr);} + if (header.sebeta_col!=0 && header.sebeta_col==i+1) { + se_beta=atof(ch_ptr); + } if (header.chisq_col!=0 && header.chisq_col==i+1) {chisq=atof(ch_ptr);} if (header.p_col!=0 && header.p_col==i+1) {pvalue=atof(ch_ptr);} @@ -3606,7 +3902,9 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA if (header.nmis_col!=0 && header.nmis_col==i+1) {n_mis=atoi(ch_ptr);} if (header.nobs_col!=0 && header.nobs_col==i+1) {n_obs=atoi(ch_ptr);} if (header.ncase_col!=0 && header.ncase_col==i+1) {n_case=atoi(ch_ptr);} - if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) {n_control=atoi(ch_ptr);} + if (header.ncontrol_col!=0 && header.ncontrol_col==i+1) { + n_control=atoi(ch_ptr); + } if (header.af_col!=0 && header.af_col==i+1) {af=atof(ch_ptr);} if (header.var_col!=0 && header.var_col==i+1) {var_x=atof(ch_ptr);} @@ -3626,7 +3924,8 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA } } - //both z values and beta/se_beta have directions, while chisq/pvalue do not + // Both z values and beta/se_beta have directions, while + // chisq/pvalue do not. if (header.z_col!=0) { z=z; } else if (header.beta_col!=0 && header.sebeta_col!=0) { @@ -3635,7 +3934,7 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA z=0; } - //if the snp is also present in cor file, then do calculations + // If the snp is also present in cor file, then do calculations. if ( (mapRS2wA.size()==0 || mapRS2wA.count(rs)!=0) ) { mapRS2z[rs]=z; mapRS2A1[rs]=a1; @@ -3653,10 +3952,10 @@ void ReadFile_beta (const string &file_beta, const map<string, double> &mapRS2wA return; } - - -void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<size_t> &vec_ni, const vector<double> &vec_weight, const vector<double> &vec_z2, gsl_matrix *Vq, gsl_vector *q, gsl_vector *s) -{ +void Calcq (const size_t n_block, const vector<size_t> &vec_cat, + const vector<size_t> &vec_ni, const vector<double> &vec_weight, + const vector<double> &vec_z2, gsl_matrix *Vq, gsl_vector *q, + gsl_vector *s) { gsl_matrix_set_zero (Vq); gsl_vector_set_zero (q); gsl_vector_set_zero (s); @@ -3677,21 +3976,22 @@ void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<si mat_s.push_back(vec_s); } - //compute q and s + // Compute q and s. for (size_t i=0; i<vec_cat.size(); i++) { - //extract quantities + + // Extract quantities. cat=vec_cat[i]; n_total=vec_ni[i]; w=vec_weight[i]; zsquare=vec_z2[i]; - //compute q and s + // Compute q and s. vec_q[cat]+=(zsquare-1.0)*w/(double)n_total; vec_s[cat]+=w; n_snps[cat]++; } - //update q; vec_q is used again for computing Vq below + // Update q; vec_q is used again for computing Vq below. for (size_t i=0; i<q->size; i++) { if (vec_s[i]!=0) { gsl_vector_set(q, i, vec_q[i]/vec_s[i]); @@ -3699,14 +3999,15 @@ void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<si gsl_vector_set(s, i, vec_s[i]); } - //compute Vq; divide SNPs in each category into evenly distributed blocks + // Compute Vq; divide SNPs in each category into evenly distributed + // blocks. size_t t=0, b=0, n_snp=0; double d, m, n; for (size_t l=0; l<q->size; l++) { n_snp=floor(n_snps[l]/n_block); t=0; b=0; if (n_snp==0) {continue;} - //initiate everything to zero + // Initiate everything to zero. for (size_t i=0; i<n_block; i++) { for (size_t j=0; j<q->size; j++) { mat_q[i][j]=0; @@ -3714,15 +4015,17 @@ void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<si } } - //record values + // Record values. for (size_t i=0; i<vec_cat.size(); i++) { - //extract quantities + + // Extract quantities. cat=vec_cat[i]; n_total=vec_ni[i]; w=vec_weight[i]; zsquare=vec_z2[i]; - //save quantities for computing Vq (which is not divided by n_total) + // Save quantities for computing Vq (which is not divided by + // n_total). mat_q[b][cat]+=(zsquare-1.0)*w; mat_s[b][cat]+=w; @@ -3735,7 +4038,7 @@ void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<si } } - //center mat_q + // Center mat_q. for (size_t i=0; i<q->size; i++) { m=0; n=0; for (size_t k=0; k<n_block; k++) { @@ -3755,7 +4058,7 @@ void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<si } } - //compute Vq for l'th row and l'th column only + // Compute Vq for l'th row and l'th column only. for (size_t i=0; i<q->size; i++) { d=0; n=0; for (size_t k=0; k<n_block; k++) { @@ -3788,14 +4091,14 @@ void Calcq (const size_t n_block, const vector<size_t> &vec_cat, const vector<si return; } - - - -//read vector file +// Read vector file. void ReadFile_vector (const string &file_vec, gsl_vector *vec) { igzstream infile (file_vec.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open vector file: "<<file_vec<<endl; return;} + if (!infile) { + cout<<"error! fail to open vector file: "<<file_vec<<endl; + return; + } string line; char *ch_ptr; @@ -3812,11 +4115,12 @@ void ReadFile_vector (const string &file_vec, gsl_vector *vec) return; } - -void ReadFile_matrix (const string &file_mat, gsl_matrix *mat) -{ +void ReadFile_matrix (const string &file_mat, gsl_matrix *mat) { igzstream infile (file_mat.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open matrix file: "<<file_mat<<endl; return;} + if (!infile) { + cout<<"error! fail to open matrix file: "<<file_mat<<endl; + return; + } string line; char *ch_ptr; @@ -3836,10 +4140,13 @@ void ReadFile_matrix (const string &file_mat, gsl_matrix *mat) return; } -void ReadFile_matrix (const string &file_mat, gsl_matrix *mat1, gsl_matrix *mat2) -{ +void ReadFile_matrix (const string &file_mat, gsl_matrix *mat1, + gsl_matrix *mat2) { igzstream infile (file_mat.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open matrix file: "<<file_mat<<endl; return;} + if (!infile) { + cout<<"error! fail to open matrix file: "<<file_mat<<endl; + return; + } string line; char *ch_ptr; @@ -3868,11 +4175,9 @@ void ReadFile_matrix (const string &file_mat, gsl_matrix *mat1, gsl_matrix *mat2 return; } - - -//read study file -void ReadFile_study (const string &file_study, gsl_matrix *Vq_mat, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) -{ +// Read study file. +void ReadFile_study (const string &file_study, gsl_matrix *Vq_mat, + gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) { string Vqfile=file_study+".Vq.txt"; string sfile=file_study+".size.txt"; string qfile=file_study+".q.txt"; @@ -3895,19 +4200,16 @@ void ReadFile_study (const string &file_study, gsl_matrix *Vq_mat, gsl_vector *q return; } - -//read reference file -void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) -{ +// Read reference file. +void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, + gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) { string sfile=file_ref+".size.txt"; string Sfile=file_ref+".S.txt"; - //string Vfile=file_ref+".V.txt"; gsl_vector *s=gsl_vector_alloc (s_vec->size+1); ReadFile_vector(sfile, s); ReadFile_matrix(Sfile, S_mat, Svar_mat); - //ReadFile_matrix(Vfile, V_mat); double d; for (size_t i=0; i<s_vec->size; i++) { @@ -3921,10 +4223,9 @@ void ReadFile_ref (const string &file_ref, gsl_matrix *S_mat, gsl_matrix *Svar_m return; } - -//read mstudy file -void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) -{ +// Read mstudy file. +void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq_mat, + gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) { gsl_matrix_set_zero(Vq_mat); gsl_vector_set_zero(q_vec); gsl_vector_set_zero(s_vec); @@ -3935,7 +4236,10 @@ void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector gsl_vector *s=gsl_vector_alloc (s_vec->size+1); igzstream infile (file_mstudy.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open mstudy file: "<<file_mstudy<<endl; return;} + if (!infile) { + cout<<"error! fail to open mstudy file: "<<file_mstudy<<endl; + return; + } string file_name; double d1, d2, d; @@ -3996,45 +4300,42 @@ void ReadFile_mstudy (const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector return; } -//read reference file -void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) -{ +// Read reference file. +void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, + gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) { gsl_matrix_set_zero(S_mat); gsl_matrix_set_zero(Svar_mat); - // gsl_matrix_set_zero(V_mat); gsl_vector_set_zero(s_vec); ni=0; - //size_t n_vc=S_mat->size1; gsl_matrix *S_sub=gsl_matrix_alloc (S_mat->size1, S_mat->size2); gsl_matrix *Svar_sub=gsl_matrix_alloc (Svar_mat->size1, Svar_mat->size2); - //gsl_matrix *V_sub=gsl_matrix_alloc (V_mat->size1, V_mat->size2); gsl_vector *s=gsl_vector_alloc (s_vec->size+1); igzstream infile (file_mref.c_str(), igzstream::in); - if (!infile) {cout<<"error! fail to open mref file: "<<file_mref<<endl; return;} + if (!infile) { + cout<<"error! fail to open mref file: "<<file_mref<<endl; + return; + } string file_name; double d1, d2, d; - //size_t t_ij; while (!safeGetline(infile, file_name).eof()) { string sfile=file_name+".size.txt"; string Sfile=file_name+".S.txt"; - //string Vfile=file_name+".V.txt"; ReadFile_vector(sfile, s); ReadFile_matrix(Sfile, S_sub, Svar_sub); - //ReadFile_matrix(Vfile, V_sub); - //update s_vec and ni + // Update s_vec and ni. for (size_t i=0; i<s_vec->size; i++) { d=gsl_vector_get (s, i)+gsl_vector_get (s_vec, i); gsl_vector_set (s_vec, i, d); } ni=max(ni, (size_t)gsl_vector_get (s, s_vec->size)); - //update S and Svar from each file + // Update S and Svar from each file. for (size_t i=0; i<S_mat->size1; i++) { d1=gsl_vector_get(s, i); for (size_t j=0; j<S_mat->size2; j++) { @@ -4049,30 +4350,9 @@ void ReadFile_mref (const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar gsl_matrix_add (S_mat, S_sub); gsl_matrix_add (Svar_mat, Svar_sub); - /* - //update V from each file - for (size_t i=0; i<n_vc; i++) { - d1=gsl_vector_get(s, i); - for (size_t j=i; j<n_vc; j++) { - d2=gsl_vector_get(s, j); - t_ij=GetabIndex (i+1, j+1, n_vc-2); - for (size_t l=0; l<n_vc+1; l++) { - if (l==n_vc) {d3=1;} else {d3=gsl_vector_get(s, l);} - for (size_t m=0; m<n_vc+1; m++) { - if (m==n_vc) {d4=1;} else {d4=gsl_vector_get(s, m);} - - d=gsl_matrix_get (V_sub, l, t_ij*(n_vc+1)+m)*d1*d2*d3*d4; - gsl_matrix_set (V_sub, l, t_ij*(n_vc+1)+m, d); - } - } - } - } - - gsl_matrix_add (V_mat, V_sub); - */ } - //final: update S and Svar + // Final: update S and Svar. for (size_t i=0; i<S_mat->size1; i++) { d1=gsl_vector_get(s_vec, i); if (d1==0) {continue;} |