From 3935ba39d30666dd7d4a831155631847c77b70c4 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 2 Aug 2017 08:46:58 +0000 Subject: Massive patch using LLVM coding style. It was generated with: clang-format -style=LLVM -i *.cpp *.h Please set your editor to replace tabs with spaces and use indentation of 2 spaces. --- src/prdt.cpp | 988 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 499 insertions(+), 489 deletions(-) (limited to 'src/prdt.cpp') diff --git a/src/prdt.cpp b/src/prdt.cpp index b29d150..3e7c004 100644 --- a/src/prdt.cpp +++ b/src/prdt.cpp @@ -16,527 +16,537 @@ along with this program. If not, see . */ -#include -#include +#include "gsl/gsl_blas.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_vector.h" +#include +#include #include -#include #include -#include -#include +#include +#include #include #include -#include -#include "gsl/gsl_vector.h" -#include "gsl/gsl_matrix.h" -#include "gsl/gsl_linalg.h" -#include "gsl/gsl_blas.h" +#include +#include -#include "io.h" -#include "lapack.h" #include "gzstream.h" #include "io.h" -#include "prdt.h" +#include "io.h" +#include "lapack.h" #include "mathfunc.h" +#include "prdt.h" using namespace std; -void PRDT::CopyFromParam (PARAM &cPar) { - a_mode=cPar.a_mode; - d_pace=cPar.d_pace; +void PRDT::CopyFromParam(PARAM &cPar) { + a_mode = cPar.a_mode; + d_pace = cPar.d_pace; - file_bfile=cPar.file_bfile; - file_geno=cPar.file_geno; - file_out=cPar.file_out; - path_out=cPar.path_out; + file_bfile = cPar.file_bfile; + file_geno = cPar.file_geno; + file_out = cPar.file_out; + path_out = cPar.path_out; - indicator_pheno=cPar.indicator_pheno; - indicator_cvt=cPar.indicator_cvt; - indicator_idv=cPar.indicator_idv; + indicator_pheno = cPar.indicator_pheno; + indicator_cvt = cPar.indicator_cvt; + indicator_idv = cPar.indicator_idv; - snpInfo=cPar.snpInfo; - mapRS2est=cPar.mapRS2est; + snpInfo = cPar.snpInfo; + mapRS2est = cPar.mapRS2est; - time_eigen=0; + time_eigen = 0; - n_ph=cPar.n_ph; - np_obs=cPar.np_obs; - np_miss=cPar.np_miss; - ns_total=cPar.ns_total; - ns_test=0; + n_ph = cPar.n_ph; + np_obs = cPar.np_obs; + np_miss = cPar.np_miss; + ns_total = cPar.ns_total; + ns_test = 0; - return; + return; } -void PRDT::CopyToParam (PARAM &cPar) { - cPar.ns_test=ns_test; - cPar.time_eigen=time_eigen; +void PRDT::CopyToParam(PARAM &cPar) { + cPar.ns_test = ns_test; + cPar.time_eigen = time_eigen; - return; + return; } -void PRDT::WriteFiles (gsl_vector *y_prdt) { - string file_str; - file_str=path_out+"/"+file_out; - file_str+="."; - file_str+="prdt"; - file_str+=".txt"; - - ofstream outfile (file_str.c_str(), ofstream::out); - if (!outfile) { - cout<<"error writing file: "<size2; j++) { - outfile << gsl_matrix_get(Y_full,ci_test,j) << - "\t"; - } - outfile<size2; j++) { + outfile << gsl_matrix_get(Y_full, ci_test, j) << "\t"; + } + outfile << endl; + ci_test++; + } + } + + outfile.close(); + outfile.clear(); + return; } -void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) { - size_t ni_test=u_hat->size, ni_total=G->size1; - - gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test); - gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test); - gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test); - gsl_vector *eval=gsl_vector_alloc (ni_test); - gsl_vector *Utu=gsl_vector_alloc (ni_test); - gsl_vector *w=gsl_vector_alloc (ni_total); - gsl_permutation *pmt=gsl_permutation_alloc (ni_test); - - //center matrix G based on indicator_idv - for (size_t i=0; isize; i++) { - if (gsl_vector_get(eval,i)<1e-10) { - gsl_vector_set(eval, i, 0); - } - } - - time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); - - gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu); - for (size_t i=0; isize; i++) { - d=gsl_vector_get(eval, i); - if (d!=0) { - d=gsl_vector_get(Utu, i)/d; - gsl_vector_set(Utu, i, d); - } - } - gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval); - gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt); - - // Free matrices. - gsl_matrix_free(Goo); - gsl_matrix_free(Gfo); - gsl_matrix_free(U); - gsl_vector_free(eval); - gsl_vector_free(Utu); - gsl_vector_free(w); - gsl_permutation_free(pmt); - - return; +void PRDT::AddBV(gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) { + size_t ni_test = u_hat->size, ni_total = G->size1; + + gsl_matrix *Goo = gsl_matrix_alloc(ni_test, ni_test); + gsl_matrix *Gfo = gsl_matrix_alloc(ni_total - ni_test, ni_test); + gsl_matrix *U = gsl_matrix_alloc(ni_test, ni_test); + gsl_vector *eval = gsl_vector_alloc(ni_test); + gsl_vector *Utu = gsl_vector_alloc(ni_test); + gsl_vector *w = gsl_vector_alloc(ni_total); + gsl_permutation *pmt = gsl_permutation_alloc(ni_test); + + // center matrix G based on indicator_idv + for (size_t i = 0; i < ni_total; i++) { + gsl_vector_set(w, i, indicator_idv[i]); + } + CenterMatrix(G, w); + + // obtain Koo and Kfo + size_t o_i = 0, o_j = 0; + double d; + for (size_t i = 0; i < indicator_idv.size(); i++) { + o_j = 0; + for (size_t j = 0; j < indicator_idv.size(); j++) { + d = gsl_matrix_get(G, i, j); + if (indicator_idv[i] == 1 && indicator_idv[j] == 1) { + gsl_matrix_set(Goo, o_i, o_j, d); + } + if (indicator_idv[i] == 0 && indicator_idv[j] == 1) { + gsl_matrix_set(Gfo, i - o_i, o_j, d); + } + if (indicator_idv[j] == 1) { + o_j++; + } + } + if (indicator_idv[i] == 1) { + o_i++; + } + } + + // matrix operations to get u_prdt + cout << "Start Eigen-Decomposition..." << endl; + clock_t time_start = clock(); + EigenDecomp(Goo, U, eval, 0); + for (size_t i = 0; i < eval->size; i++) { + if (gsl_vector_get(eval, i) < 1e-10) { + gsl_vector_set(eval, i, 0); + } + } + + time_eigen = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + gsl_blas_dgemv(CblasTrans, 1.0, U, u_hat, 0.0, Utu); + for (size_t i = 0; i < eval->size; i++) { + d = gsl_vector_get(eval, i); + if (d != 0) { + d = gsl_vector_get(Utu, i) / d; + gsl_vector_set(Utu, i, d); + } + } + gsl_blas_dgemv(CblasNoTrans, 1.0, U, Utu, 0.0, eval); + gsl_blas_dgemv(CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt); + + // Free matrices. + gsl_matrix_free(Goo); + gsl_matrix_free(Gfo); + gsl_matrix_free(U); + gsl_vector_free(eval); + gsl_vector_free(Utu); + gsl_vector_free(w); + gsl_permutation_free(pmt); + + return; } -void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) { - igzstream infile (file_geno.c_str(), igzstream::in); - if (!infile) { - cout<<"error reading genotype file:"<size); - gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size); - - ns_test=0; - - // Start reading genotypes and analyze. - for (size_t t=0; tsize==n_miss) { - cout << "snp " << rs << " has missing genotype for all " << - "individuals and will be ignored." << endl; - continue;} - - - x_mean/=(double)(x->size-n_miss); - x_train_mean/=(double)(n_train_nomiss); - - - for (size_t i=0; isize; ++i) { - geno=gsl_vector_get(x, i); - if (gsl_vector_get (x_miss, i)==0) { - gsl_vector_set(x, i, x_mean-x_train_mean); - } else { - gsl_vector_set(x, i, geno-x_train_mean); - } - } - - gsl_vector_scale (x, effect_size); - gsl_vector_add (y_prdt, x); - - ns_test++; - } - cout<size); + gsl_vector *x_miss = gsl_vector_alloc(y_prdt->size); + + ns_test = 0; + + // Start reading genotypes and analyze. + for (size_t t = 0; t < ns_total; ++t) { + !safeGetline(infile, line).eof(); + if (t % d_pace == 0 || t == (ns_total - 1)) { + ProgressBar("Reading SNPs ", t, ns_total - 1); + } + + ch_ptr = strtok((char *)line.c_str(), " , \t"); + rs = ch_ptr; + ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok(NULL, " , \t"); + + if (mapRS2est.count(rs) == 0) { + continue; + } else { + effect_size = mapRS2est[rs]; + } + + x_mean = 0.0; + c_phen = 0; + n_miss = 0; + x_train_mean = 0; + n_train_nomiss = 0; + + gsl_vector_set_zero(x_miss); + + for (size_t i = 0; i < indicator_idv.size(); ++i) { + ch_ptr = strtok(NULL, " , \t"); + if (indicator_idv[i] == 1) { + if (strcmp(ch_ptr, "NA") != 0) { + geno = atof(ch_ptr); + x_train_mean += geno; + n_train_nomiss++; + } + } else { + if (strcmp(ch_ptr, "NA") == 0) { + gsl_vector_set(x_miss, c_phen, 0.0); + n_miss++; + } else { + geno = atof(ch_ptr); + + gsl_vector_set(x, c_phen, geno); + gsl_vector_set(x_miss, c_phen, 1.0); + x_mean += geno; + } + c_phen++; + } + } + + if (x->size == n_miss) { + cout << "snp " << rs << " has missing genotype for all " + << "individuals and will be ignored." << endl; + continue; + } + + x_mean /= (double)(x->size - n_miss); + x_train_mean /= (double)(n_train_nomiss); + + for (size_t i = 0; i < x->size; ++i) { + geno = gsl_vector_get(x, i); + if (gsl_vector_get(x_miss, i) == 0) { + gsl_vector_set(x, i, x_mean - x_train_mean); + } else { + gsl_vector_set(x, i, geno - x_train_mean); + } + } + + gsl_vector_scale(x, effect_size); + gsl_vector_add(y_prdt, x); + + ns_test++; + } + cout << endl; + + gsl_vector_free(x); + gsl_vector_free(x_miss); + + infile.close(); + infile.clear(); + + return; } -void PRDT::AnalyzePlink (gsl_vector *y_prdt) { - string file_bed=file_bfile+".bed"; - ifstream infile (file_bed.c_str(), ios::binary); - if (!infile) { - cout<<"error reading bed file:"< b; - string rs; - - size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss; - double geno, x_mean, x_train_mean, effect_size; - - gsl_vector *x=gsl_vector_alloc (y_prdt->size); - - // Calculate n_bit and c, the number of bit for each SNP. - if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;} - else {n_bit=indicator_idv.size()/4+1; } - - // Print the first 3 magic numbers. - for (size_t i=0; i<3; ++i) { - infile.read(ch,1); - b=ch[0]; - } - - ns_test=0; - - for (vector::size_type t=0; tsize==n_miss) { - cout << "snp " << rs << " has missing genotype for all " << - "individuals and will be ignored."<size-n_miss); - x_train_mean/=(double)(n_train_nomiss); - - for (size_t i=0; isize; ++i) { - geno=gsl_vector_get(x, i); - if (geno==-9) { - gsl_vector_set(x, i, x_mean-x_train_mean); - } else { - gsl_vector_set(x, i, geno-x_train_mean); - } - } - - gsl_vector_scale (x, effect_size); - gsl_vector_add (y_prdt, x); - - ns_test++; - } - cout< b; + string rs; + + size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss; + double geno, x_mean, x_train_mean, effect_size; + + gsl_vector *x = gsl_vector_alloc(y_prdt->size); + + // Calculate n_bit and c, the number of bit for each SNP. + if (indicator_idv.size() % 4 == 0) { + n_bit = indicator_idv.size() / 4; + } else { + n_bit = indicator_idv.size() / 4 + 1; + } + + // Print the first 3 magic numbers. + for (size_t i = 0; i < 3; ++i) { + infile.read(ch, 1); + b = ch[0]; + } + + ns_test = 0; + + for (vector::size_type t = 0; t < snpInfo.size(); ++t) { + if (t % d_pace == 0 || t == snpInfo.size() - 1) { + ProgressBar("Reading SNPs ", t, snpInfo.size() - 1); + } + + rs = snpInfo[t].rs_number; + + if (mapRS2est.count(rs) == 0) { + continue; + } else { + effect_size = mapRS2est[rs]; + } + + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t * n_bit + 3); + + // Read genotypes. + x_mean = 0.0; + n_miss = 0; + ci_total = 0; + ci_test = 0; + x_train_mean = 0; + n_train_nomiss = 0; + for (size_t i = 0; i < n_bit; ++i) { + infile.read(ch, 1); + b = ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j = 0; j < 4; ++j) { + if ((i == (n_bit - 1)) && ci_total == indicator_idv.size()) { + break; + } + if (indicator_idv[ci_total] == 1) { + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + x_train_mean += 2.0; + n_train_nomiss++; + } else { + x_train_mean += 1.0; + n_train_nomiss++; + } + } else { + if (b[2 * j + 1] == 1) { + n_train_nomiss++; + } else { + } + } + } else { + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + gsl_vector_set(x, ci_test, 2); + x_mean += 2.0; + } else { + gsl_vector_set(x, ci_test, 1); + x_mean += 1.0; + } + } else { + if (b[2 * j + 1] == 1) { + gsl_vector_set(x, ci_test, 0); + } else { + gsl_vector_set(x, ci_test, -9); + n_miss++; + } + } + ci_test++; + } + ci_total++; + } + } + + if (x->size == n_miss) { + cout << "snp " << rs << " has missing genotype for all " + << "individuals and will be ignored." << endl; + continue; + } + + x_mean /= (double)(x->size - n_miss); + x_train_mean /= (double)(n_train_nomiss); + + for (size_t i = 0; i < x->size; ++i) { + geno = gsl_vector_get(x, i); + if (geno == -9) { + gsl_vector_set(x, i, x_mean - x_train_mean); + } else { + gsl_vector_set(x, i, geno - x_train_mean); + } + } + + gsl_vector_scale(x, effect_size); + gsl_vector_add(y_prdt, x); + + ns_test++; + } + cout << endl; + + gsl_vector_free(x); + + infile.close(); + infile.clear(); + + return; } // Predict missing phenotypes using ridge regression. // Y_hat contains fixed effects -void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, - gsl_matrix *Y_full) { - gsl_vector *y_obs=gsl_vector_alloc (np_obs); - gsl_vector *y_miss=gsl_vector_alloc (np_miss); - gsl_matrix *H_oo=gsl_matrix_alloc (np_obs, np_obs); - gsl_matrix *H_mo=gsl_matrix_alloc (np_miss, np_obs); - gsl_vector *Hiy=gsl_vector_alloc (np_obs); - - size_t c_obs1=0, c_obs2=0, c_miss1=0, c_miss2=0; - - // Obtain H_oo, H_mo. - c_obs1=0; c_miss1=0; - for (vector::size_type i1=0; i1::size_type j1=0; j1::size_type i2=0; - i2::size_type j2=0; - j2::size_type i=0; - i::size_type j=0; j::size_type i=0; - i::size_type j=0; j::size_type i1 = 0; i1 < indicator_pheno.size(); ++i1) { + if (indicator_cvt[i1] == 0) { + continue; + } + for (vector::size_type j1 = 0; j1 < n_ph; ++j1) { + + c_obs2 = 0; + c_miss2 = 0; + for (vector::size_type i2 = 0; i2 < indicator_pheno.size(); ++i2) { + if (indicator_cvt[i2] == 0) { + continue; + } + for (vector::size_type j2 = 0; j2 < n_ph; j2++) { + + if (indicator_pheno[i2][j2] == 1) { + if (indicator_pheno[i1][j1] == 1) { + gsl_matrix_set( + H_oo, c_obs1, c_obs2, + gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2)); + } else { + gsl_matrix_set( + H_mo, c_miss1, c_obs2, + gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2)); + } + c_obs2++; + } else { + c_miss2++; + } + } + } + + if (indicator_pheno[i1][j1] == 1) { + c_obs1++; + } else { + c_miss1++; + } + } + } + + // Do LU decomposition of H_oo. + int sig; + gsl_permutation *pmt = gsl_permutation_alloc(np_obs); + LUDecomp(H_oo, pmt, &sig); + + // Obtain y_obs=y_full-y_hat. + // Add the fixed effects part to y_miss: y_miss=y_hat. + c_obs1 = 0; + c_miss1 = 0; + for (vector::size_type i = 0; i < indicator_pheno.size(); ++i) { + if (indicator_cvt[i] == 0) { + continue; + } + + for (vector::size_type j = 0; j < n_ph; ++j) { + if (indicator_pheno[i][j] == 1) { + gsl_vector_set(y_obs, c_obs1, gsl_matrix_get(Y_full, i, j) - + gsl_matrix_get(Y_hat, i, j)); + c_obs1++; + } else { + gsl_vector_set(y_miss, c_miss1, gsl_matrix_get(Y_hat, i, j)); + c_miss1++; + } + } + } + + LUSolve(H_oo, pmt, y_obs, Hiy); + + gsl_blas_dgemv(CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss); + + // Put back predicted y_miss to Y_full. + c_miss1 = 0; + for (vector::size_type i = 0; i < indicator_pheno.size(); ++i) { + if (indicator_cvt[i] == 0) { + continue; + } + + for (vector::size_type j = 0; j < n_ph; ++j) { + if (indicator_pheno[i][j] == 0) { + gsl_matrix_set(Y_full, i, j, gsl_vector_get(y_miss, c_miss1)); + c_miss1++; + } + } + } + + // Free matrices. + gsl_vector_free(y_obs); + gsl_vector_free(y_miss); + gsl_matrix_free(H_oo); + gsl_matrix_free(H_mo); + gsl_vector_free(Hiy); + + return; } - - -- cgit v1.2.3