diff options
author | Peter Carbonetto | 2017-07-07 11:20:56 -0500 |
---|---|---|
committer | GitHub | 2017-07-07 11:20:56 -0500 |
commit | 86e96ede4ff0955bb2d03ac6c1bd7562a3984955 (patch) | |
tree | 33120540091e7d16b58f389a13949df397535912 /src/prdt.cpp | |
parent | b3747413e6c5c8cd447e979157880676da66a342 (diff) | |
parent | b9758364059d52e153a9f1b4fcae3bc3f3e68422 (diff) | |
download | pangemma-86e96ede4ff0955bb2d03ac6c1bd7562a3984955.tar.gz |
Merge pull request #51 from genenetwork/spacing
Spacing fixes.
Diffstat (limited to 'src/prdt.cpp')
-rw-r--r-- | src/prdt.cpp | 188 |
1 files changed, 94 insertions, 94 deletions
diff --git a/src/prdt.cpp b/src/prdt.cpp index db0fa14..b29d150 100644 --- a/src/prdt.cpp +++ b/src/prdt.cpp @@ -1,17 +1,17 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011-2017, Xiang Zhou - + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ @@ -24,7 +24,7 @@ #include <bitset> #include <vector> #include <stdio.h> -#include <stdlib.h> +#include <stdlib.h> #include <cmath> #include "gsl/gsl_vector.h" #include "gsl/gsl_matrix.h" @@ -43,36 +43,36 @@ using namespace std; void PRDT::CopyFromParam (PARAM &cPar) { a_mode=cPar.a_mode; d_pace=cPar.d_pace; - + file_bfile=cPar.file_bfile; file_geno=cPar.file_geno; file_out=cPar.file_out; path_out=cPar.path_out; - - indicator_pheno=cPar.indicator_pheno; + + indicator_pheno=cPar.indicator_pheno; indicator_cvt=cPar.indicator_cvt; indicator_idv=cPar.indicator_idv; - + snpInfo=cPar.snpInfo; mapRS2est=cPar.mapRS2est; - + time_eigen=0; - + n_ph=cPar.n_ph; np_obs=cPar.np_obs; np_miss=cPar.np_miss; ns_total=cPar.ns_total; - ns_test=0; - + ns_test=0; + return; } void PRDT::CopyToParam (PARAM &cPar) { cPar.ns_test=ns_test; cPar.time_eigen=time_eigen; - + return; -} +} void PRDT::WriteFiles (gsl_vector *y_prdt) { string file_str; @@ -80,13 +80,13 @@ void PRDT::WriteFiles (gsl_vector *y_prdt) { file_str+="."; file_str+="prdt"; file_str+=".txt"; - + ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) { cout<<"error writing file: "<<file_str.c_str()<<endl; return; } - + size_t ci_test=0; for (size_t i=0; i<indicator_idv.size(); i++) { if (indicator_idv[i]==1) { @@ -96,7 +96,7 @@ void PRDT::WriteFiles (gsl_vector *y_prdt) { ci_test++; } } - + outfile.close(); outfile.clear(); return; @@ -106,13 +106,13 @@ void PRDT::WriteFiles (gsl_matrix *Y_full) { string file_str; file_str=path_out+"/"+file_out; file_str+=".prdt.txt"; - + ofstream outfile (file_str.c_str(), ofstream::out); if (!outfile) { cout<<"error writing file: "<<file_str.c_str()<<endl; return; } - + size_t ci_test=0; for (size_t i=0; i<indicator_cvt.size(); i++) { if (indicator_cvt[i]==0) { @@ -126,7 +126,7 @@ void PRDT::WriteFiles (gsl_matrix *Y_full) { ci_test++; } } - + outfile.close(); outfile.clear(); return; @@ -134,21 +134,21 @@ void PRDT::WriteFiles (gsl_matrix *Y_full) { void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) { size_t ni_test=u_hat->size, ni_total=G->size1; - + gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test); gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test); - gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test); + gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test); gsl_vector *eval=gsl_vector_alloc (ni_test); gsl_vector *Utu=gsl_vector_alloc (ni_test); gsl_vector *w=gsl_vector_alloc (ni_total); gsl_permutation *pmt=gsl_permutation_alloc (ni_test); - + //center matrix G based on indicator_idv for (size_t i=0; i<ni_total; i++) { gsl_vector_set(w, i, indicator_idv[i]); } CenterMatrix(G, w); - + //obtain Koo and Kfo size_t o_i=0, o_j=0; double d; @@ -166,7 +166,7 @@ void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) { } if (indicator_idv[i]==1) {o_i++;} } - + //matrix operations to get u_prdt cout<<"Start Eigen-Decomposition..."<<endl; clock_t time_start=clock(); @@ -177,8 +177,8 @@ void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) { } } - time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - + time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu); for (size_t i=0; i<eval->size; i++) { d=gsl_vector_get(eval, i); @@ -189,7 +189,7 @@ void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) { } gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval); gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt); - + // Free matrices. gsl_matrix_free(Goo); gsl_matrix_free(Gfo); @@ -199,7 +199,7 @@ void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) { gsl_vector_free(w); gsl_permutation_free(pmt); - return; + return; } void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) { @@ -208,17 +208,17 @@ void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) { cout<<"error reading genotype file:"<<file_geno<<endl; return; } - + string line; char *ch_ptr; string rs; - + size_t n_miss, n_train_nomiss, c_phen; double geno, x_mean, x_train_mean, effect_size; - + gsl_vector *x=gsl_vector_alloc (y_prdt->size); gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size); - + ns_test=0; // Start reading genotypes and analyze. @@ -227,24 +227,24 @@ void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) { if (t%d_pace==0 || t==(ns_total-1)) { ProgressBar ("Reading SNPs ", t, ns_total-1); } - + ch_ptr=strtok ((char *)line.c_str(), " , \t"); rs=ch_ptr; ch_ptr=strtok (NULL, " , \t"); - ch_ptr=strtok (NULL, " , \t"); - + ch_ptr=strtok (NULL, " , \t"); + if (mapRS2est.count(rs)==0) { continue; } else { effect_size=mapRS2est[rs]; } - + x_mean=0.0; c_phen=0; n_miss=0; x_train_mean=0; n_train_nomiss=0; - + gsl_vector_set_zero(x_miss); for (size_t i=0; i<indicator_idv.size(); ++i) { @@ -260,10 +260,10 @@ void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) { gsl_vector_set(x_miss, c_phen, 0.0); n_miss++; } else { - geno=atof(ch_ptr); - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); + geno=atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); x_mean+=geno; } c_phen++; @@ -274,12 +274,12 @@ void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) { cout << "snp " << rs << " has missing genotype for all " << "individuals and will be ignored." << endl; continue;} - + x_mean/=(double)(x->size-n_miss); x_train_mean/=(double)(n_train_nomiss); - - + + for (size_t i=0; i<x->size; ++i) { geno=gsl_vector_get(x, i); if (gsl_vector_get (x_miss, i)==0) { @@ -291,17 +291,17 @@ void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) { gsl_vector_scale (x, effect_size); gsl_vector_add (y_prdt, x); - + ns_test++; - } + } cout<<endl; - + gsl_vector_free (x); gsl_vector_free (x_miss); - + infile.close(); infile.clear(); - + return; } @@ -312,35 +312,35 @@ void PRDT::AnalyzePlink (gsl_vector *y_prdt) { cout<<"error reading bed file:"<<file_bed<<endl; return; } - + char ch[1]; - bitset<8> b; + bitset<8> b; string rs; - + size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss; double geno, x_mean, x_train_mean, effect_size; - + gsl_vector *x=gsl_vector_alloc (y_prdt->size); - + // Calculate n_bit and c, the number of bit for each SNP. if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;} else {n_bit=indicator_idv.size()/4+1; } - + // Print the first 3 magic numbers. for (size_t i=0; i<3; ++i) { infile.read(ch,1); b=ch[0]; - } - + } + ns_test=0; - + for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) { if (t%d_pace==0 || t==snpInfo.size()-1) { ProgressBar ("Reading SNPs ", t, snpInfo.size()-1); } - + rs=snpInfo[t].rs_number; - + if (mapRS2est.count(rs)==0) { continue; } else { @@ -349,7 +349,7 @@ void PRDT::AnalyzePlink (gsl_vector *y_prdt) { // n_bit, and 3 is the number of magic numbers. infile.seekg(t*n_bit+3); - + // Read genotypes. x_mean=0.0; n_miss=0; @@ -359,7 +359,7 @@ void PRDT::AnalyzePlink (gsl_vector *y_prdt) { b=ch[0]; // Minor allele homozygous: 2.0; major: 0.0. - for (size_t j=0; j<4; ++j) { + for (size_t j=0; j<4; ++j) { if ((i==(n_bit-1)) && ci_total==indicator_idv.size()) { break; @@ -404,19 +404,19 @@ void PRDT::AnalyzePlink (gsl_vector *y_prdt) { ci_test++; } ci_total++; - + } } - + if (x->size==n_miss) { cout << "snp " << rs << " has missing genotype for all " << "individuals and will be ignored."<<endl; continue; } - + x_mean/=(double)(x->size-n_miss); x_train_mean/=(double)(n_train_nomiss); - + for (size_t i=0; i<x->size; ++i) { geno=gsl_vector_get(x, i); if (geno==-9) { @@ -425,47 +425,47 @@ void PRDT::AnalyzePlink (gsl_vector *y_prdt) { gsl_vector_set(x, i, geno-x_train_mean); } } - + gsl_vector_scale (x, effect_size); gsl_vector_add (y_prdt, x); - + ns_test++; - } + } cout<<endl; - + gsl_vector_free (x); - + infile.close(); - infile.clear(); - + infile.clear(); + return; } // Predict missing phenotypes using ridge regression. // Y_hat contains fixed effects void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, - gsl_matrix *Y_full) { + gsl_matrix *Y_full) { gsl_vector *y_obs=gsl_vector_alloc (np_obs); gsl_vector *y_miss=gsl_vector_alloc (np_miss); gsl_matrix *H_oo=gsl_matrix_alloc (np_obs, np_obs); gsl_matrix *H_mo=gsl_matrix_alloc (np_miss, np_obs); gsl_vector *Hiy=gsl_vector_alloc (np_obs); - + size_t c_obs1=0, c_obs2=0, c_miss1=0, c_miss2=0; - + // Obtain H_oo, H_mo. - c_obs1=0; c_miss1=0; + c_obs1=0; c_miss1=0; for (vector<int>::size_type i1=0; i1<indicator_pheno.size(); ++i1) { if (indicator_cvt[i1]==0) {continue;} for (vector<int>::size_type j1=0; j1<n_ph; ++j1) { - + c_obs2=0; c_miss2=0; for (vector<int>::size_type i2=0; i2<indicator_pheno.size(); ++i2) { if (indicator_cvt[i2]==0) {continue;} for (vector<int>::size_type j2=0; j2<n_ph; j2++) { - + if (indicator_pheno[i2][j2]==1) { if (indicator_pheno[i1][j1]==1) { gsl_matrix_set(H_oo,c_obs1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) ); @@ -476,30 +476,30 @@ void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, } else { c_miss2++; } - } + } } - + if (indicator_pheno[i1][j1]==1) { c_obs1++; } else { c_miss1++; } } - - } - + + } + // Do LU decomposition of H_oo. int sig; gsl_permutation * pmt=gsl_permutation_alloc (np_obs); LUDecomp (H_oo, pmt, &sig); - + // Obtain y_obs=y_full-y_hat. // Add the fixed effects part to y_miss: y_miss=y_hat. c_obs1=0; c_miss1=0; for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) { if (indicator_cvt[i]==0) {continue;} - + for (vector<int>::size_type j=0; j<n_ph; ++j) { if (indicator_pheno[i][j]==1) { gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j)-gsl_matrix_get (Y_hat, i, j) ); @@ -509,18 +509,18 @@ void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, c_miss1++; } } - } - + } + LUSolve (H_oo, pmt, y_obs, Hiy); - + gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss); - + // Put back predicted y_miss to Y_full. c_miss1=0; for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) { if (indicator_cvt[i]==0) {continue;} - + for (vector<int>::size_type j=0; j<n_ph; ++j) { if (indicator_pheno[i][j]==0) { gsl_matrix_set (Y_full, i, j, gsl_vector_get (y_miss, c_miss1) ); @@ -528,14 +528,14 @@ void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, } } } - + // Free matrices. gsl_vector_free(y_obs); gsl_vector_free(y_miss); gsl_matrix_free(H_oo); gsl_matrix_free(H_mo); gsl_vector_free(Hiy); - + return; } |