From 3935ba39d30666dd7d4a831155631847c77b70c4 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 2 Aug 2017 08:46:58 +0000
Subject: Massive patch using LLVM coding style. It was generated with:
clang-format -style=LLVM -i *.cpp *.h
Please set your editor to replace tabs with spaces and use indentation of 2 spaces.
---
src/prdt.cpp | 988 ++++++++++++++++++++++++++++++-----------------------------
1 file changed, 499 insertions(+), 489 deletions(-)
(limited to 'src/prdt.cpp')
diff --git a/src/prdt.cpp b/src/prdt.cpp
index b29d150..3e7c004 100644
--- a/src/prdt.cpp
+++ b/src/prdt.cpp
@@ -16,527 +16,537 @@
along with this program. If not, see .
*/
-#include
-#include
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_vector.h"
+#include
+#include
#include
-#include
#include
-#include
-#include
+#include
+#include
#include
#include
-#include
-#include "gsl/gsl_vector.h"
-#include "gsl/gsl_matrix.h"
-#include "gsl/gsl_linalg.h"
-#include "gsl/gsl_blas.h"
+#include
+#include
-#include "io.h"
-#include "lapack.h"
#include "gzstream.h"
#include "io.h"
-#include "prdt.h"
+#include "io.h"
+#include "lapack.h"
#include "mathfunc.h"
+#include "prdt.h"
using namespace std;
-void PRDT::CopyFromParam (PARAM &cPar) {
- a_mode=cPar.a_mode;
- d_pace=cPar.d_pace;
+void PRDT::CopyFromParam(PARAM &cPar) {
+ a_mode = cPar.a_mode;
+ d_pace = cPar.d_pace;
- file_bfile=cPar.file_bfile;
- file_geno=cPar.file_geno;
- file_out=cPar.file_out;
- path_out=cPar.path_out;
+ file_bfile = cPar.file_bfile;
+ file_geno = cPar.file_geno;
+ file_out = cPar.file_out;
+ path_out = cPar.path_out;
- indicator_pheno=cPar.indicator_pheno;
- indicator_cvt=cPar.indicator_cvt;
- indicator_idv=cPar.indicator_idv;
+ indicator_pheno = cPar.indicator_pheno;
+ indicator_cvt = cPar.indicator_cvt;
+ indicator_idv = cPar.indicator_idv;
- snpInfo=cPar.snpInfo;
- mapRS2est=cPar.mapRS2est;
+ snpInfo = cPar.snpInfo;
+ mapRS2est = cPar.mapRS2est;
- time_eigen=0;
+ time_eigen = 0;
- n_ph=cPar.n_ph;
- np_obs=cPar.np_obs;
- np_miss=cPar.np_miss;
- ns_total=cPar.ns_total;
- ns_test=0;
+ n_ph = cPar.n_ph;
+ np_obs = cPar.np_obs;
+ np_miss = cPar.np_miss;
+ ns_total = cPar.ns_total;
+ ns_test = 0;
- return;
+ return;
}
-void PRDT::CopyToParam (PARAM &cPar) {
- cPar.ns_test=ns_test;
- cPar.time_eigen=time_eigen;
+void PRDT::CopyToParam(PARAM &cPar) {
+ cPar.ns_test = ns_test;
+ cPar.time_eigen = time_eigen;
- return;
+ return;
}
-void PRDT::WriteFiles (gsl_vector *y_prdt) {
- string file_str;
- file_str=path_out+"/"+file_out;
- file_str+=".";
- file_str+="prdt";
- file_str+=".txt";
-
- ofstream outfile (file_str.c_str(), ofstream::out);
- if (!outfile) {
- cout<<"error writing file: "<size2; j++) {
- outfile << gsl_matrix_get(Y_full,ci_test,j) <<
- "\t";
- }
- outfile<size2; j++) {
+ outfile << gsl_matrix_get(Y_full, ci_test, j) << "\t";
+ }
+ outfile << endl;
+ ci_test++;
+ }
+ }
+
+ outfile.close();
+ outfile.clear();
+ return;
}
-void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) {
- size_t ni_test=u_hat->size, ni_total=G->size1;
-
- gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test);
- gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test);
- gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test);
- gsl_vector *eval=gsl_vector_alloc (ni_test);
- gsl_vector *Utu=gsl_vector_alloc (ni_test);
- gsl_vector *w=gsl_vector_alloc (ni_total);
- gsl_permutation *pmt=gsl_permutation_alloc (ni_test);
-
- //center matrix G based on indicator_idv
- for (size_t i=0; isize; i++) {
- if (gsl_vector_get(eval,i)<1e-10) {
- gsl_vector_set(eval, i, 0);
- }
- }
-
- time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
-
- gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu);
- for (size_t i=0; isize; i++) {
- d=gsl_vector_get(eval, i);
- if (d!=0) {
- d=gsl_vector_get(Utu, i)/d;
- gsl_vector_set(Utu, i, d);
- }
- }
- gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval);
- gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
-
- // Free matrices.
- gsl_matrix_free(Goo);
- gsl_matrix_free(Gfo);
- gsl_matrix_free(U);
- gsl_vector_free(eval);
- gsl_vector_free(Utu);
- gsl_vector_free(w);
- gsl_permutation_free(pmt);
-
- return;
+void PRDT::AddBV(gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) {
+ size_t ni_test = u_hat->size, ni_total = G->size1;
+
+ gsl_matrix *Goo = gsl_matrix_alloc(ni_test, ni_test);
+ gsl_matrix *Gfo = gsl_matrix_alloc(ni_total - ni_test, ni_test);
+ gsl_matrix *U = gsl_matrix_alloc(ni_test, ni_test);
+ gsl_vector *eval = gsl_vector_alloc(ni_test);
+ gsl_vector *Utu = gsl_vector_alloc(ni_test);
+ gsl_vector *w = gsl_vector_alloc(ni_total);
+ gsl_permutation *pmt = gsl_permutation_alloc(ni_test);
+
+ // center matrix G based on indicator_idv
+ for (size_t i = 0; i < ni_total; i++) {
+ gsl_vector_set(w, i, indicator_idv[i]);
+ }
+ CenterMatrix(G, w);
+
+ // obtain Koo and Kfo
+ size_t o_i = 0, o_j = 0;
+ double d;
+ for (size_t i = 0; i < indicator_idv.size(); i++) {
+ o_j = 0;
+ for (size_t j = 0; j < indicator_idv.size(); j++) {
+ d = gsl_matrix_get(G, i, j);
+ if (indicator_idv[i] == 1 && indicator_idv[j] == 1) {
+ gsl_matrix_set(Goo, o_i, o_j, d);
+ }
+ if (indicator_idv[i] == 0 && indicator_idv[j] == 1) {
+ gsl_matrix_set(Gfo, i - o_i, o_j, d);
+ }
+ if (indicator_idv[j] == 1) {
+ o_j++;
+ }
+ }
+ if (indicator_idv[i] == 1) {
+ o_i++;
+ }
+ }
+
+ // matrix operations to get u_prdt
+ cout << "Start Eigen-Decomposition..." << endl;
+ clock_t time_start = clock();
+ EigenDecomp(Goo, U, eval, 0);
+ for (size_t i = 0; i < eval->size; i++) {
+ if (gsl_vector_get(eval, i) < 1e-10) {
+ gsl_vector_set(eval, i, 0);
+ }
+ }
+
+ time_eigen = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
+
+ gsl_blas_dgemv(CblasTrans, 1.0, U, u_hat, 0.0, Utu);
+ for (size_t i = 0; i < eval->size; i++) {
+ d = gsl_vector_get(eval, i);
+ if (d != 0) {
+ d = gsl_vector_get(Utu, i) / d;
+ gsl_vector_set(Utu, i, d);
+ }
+ }
+ gsl_blas_dgemv(CblasNoTrans, 1.0, U, Utu, 0.0, eval);
+ gsl_blas_dgemv(CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
+
+ // Free matrices.
+ gsl_matrix_free(Goo);
+ gsl_matrix_free(Gfo);
+ gsl_matrix_free(U);
+ gsl_vector_free(eval);
+ gsl_vector_free(Utu);
+ gsl_vector_free(w);
+ gsl_permutation_free(pmt);
+
+ return;
}
-void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) {
- igzstream infile (file_geno.c_str(), igzstream::in);
- if (!infile) {
- cout<<"error reading genotype file:"<size);
- gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size);
-
- ns_test=0;
-
- // Start reading genotypes and analyze.
- for (size_t t=0; tsize==n_miss) {
- cout << "snp " << rs << " has missing genotype for all " <<
- "individuals and will be ignored." << endl;
- continue;}
-
-
- x_mean/=(double)(x->size-n_miss);
- x_train_mean/=(double)(n_train_nomiss);
-
-
- for (size_t i=0; isize; ++i) {
- geno=gsl_vector_get(x, i);
- if (gsl_vector_get (x_miss, i)==0) {
- gsl_vector_set(x, i, x_mean-x_train_mean);
- } else {
- gsl_vector_set(x, i, geno-x_train_mean);
- }
- }
-
- gsl_vector_scale (x, effect_size);
- gsl_vector_add (y_prdt, x);
-
- ns_test++;
- }
- cout<size);
+ gsl_vector *x_miss = gsl_vector_alloc(y_prdt->size);
+
+ ns_test = 0;
+
+ // Start reading genotypes and analyze.
+ for (size_t t = 0; t < ns_total; ++t) {
+ !safeGetline(infile, line).eof();
+ if (t % d_pace == 0 || t == (ns_total - 1)) {
+ ProgressBar("Reading SNPs ", t, ns_total - 1);
+ }
+
+ ch_ptr = strtok((char *)line.c_str(), " , \t");
+ rs = ch_ptr;
+ ch_ptr = strtok(NULL, " , \t");
+ ch_ptr = strtok(NULL, " , \t");
+
+ if (mapRS2est.count(rs) == 0) {
+ continue;
+ } else {
+ effect_size = mapRS2est[rs];
+ }
+
+ x_mean = 0.0;
+ c_phen = 0;
+ n_miss = 0;
+ x_train_mean = 0;
+ n_train_nomiss = 0;
+
+ gsl_vector_set_zero(x_miss);
+
+ for (size_t i = 0; i < indicator_idv.size(); ++i) {
+ ch_ptr = strtok(NULL, " , \t");
+ if (indicator_idv[i] == 1) {
+ if (strcmp(ch_ptr, "NA") != 0) {
+ geno = atof(ch_ptr);
+ x_train_mean += geno;
+ n_train_nomiss++;
+ }
+ } else {
+ if (strcmp(ch_ptr, "NA") == 0) {
+ gsl_vector_set(x_miss, c_phen, 0.0);
+ n_miss++;
+ } else {
+ geno = atof(ch_ptr);
+
+ gsl_vector_set(x, c_phen, geno);
+ gsl_vector_set(x_miss, c_phen, 1.0);
+ x_mean += geno;
+ }
+ c_phen++;
+ }
+ }
+
+ if (x->size == n_miss) {
+ cout << "snp " << rs << " has missing genotype for all "
+ << "individuals and will be ignored." << endl;
+ continue;
+ }
+
+ x_mean /= (double)(x->size - n_miss);
+ x_train_mean /= (double)(n_train_nomiss);
+
+ for (size_t i = 0; i < x->size; ++i) {
+ geno = gsl_vector_get(x, i);
+ if (gsl_vector_get(x_miss, i) == 0) {
+ gsl_vector_set(x, i, x_mean - x_train_mean);
+ } else {
+ gsl_vector_set(x, i, geno - x_train_mean);
+ }
+ }
+
+ gsl_vector_scale(x, effect_size);
+ gsl_vector_add(y_prdt, x);
+
+ ns_test++;
+ }
+ cout << endl;
+
+ gsl_vector_free(x);
+ gsl_vector_free(x_miss);
+
+ infile.close();
+ infile.clear();
+
+ return;
}
-void PRDT::AnalyzePlink (gsl_vector *y_prdt) {
- string file_bed=file_bfile+".bed";
- ifstream infile (file_bed.c_str(), ios::binary);
- if (!infile) {
- cout<<"error reading bed file:"< b;
- string rs;
-
- size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
- double geno, x_mean, x_train_mean, effect_size;
-
- gsl_vector *x=gsl_vector_alloc (y_prdt->size);
-
- // Calculate n_bit and c, the number of bit for each SNP.
- if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;}
- else {n_bit=indicator_idv.size()/4+1; }
-
- // Print the first 3 magic numbers.
- for (size_t i=0; i<3; ++i) {
- infile.read(ch,1);
- b=ch[0];
- }
-
- ns_test=0;
-
- for (vector::size_type t=0; tsize==n_miss) {
- cout << "snp " << rs << " has missing genotype for all " <<
- "individuals and will be ignored."<size-n_miss);
- x_train_mean/=(double)(n_train_nomiss);
-
- for (size_t i=0; isize; ++i) {
- geno=gsl_vector_get(x, i);
- if (geno==-9) {
- gsl_vector_set(x, i, x_mean-x_train_mean);
- } else {
- gsl_vector_set(x, i, geno-x_train_mean);
- }
- }
-
- gsl_vector_scale (x, effect_size);
- gsl_vector_add (y_prdt, x);
-
- ns_test++;
- }
- cout< b;
+ string rs;
+
+ size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
+ double geno, x_mean, x_train_mean, effect_size;
+
+ gsl_vector *x = gsl_vector_alloc(y_prdt->size);
+
+ // Calculate n_bit and c, the number of bit for each SNP.
+ if (indicator_idv.size() % 4 == 0) {
+ n_bit = indicator_idv.size() / 4;
+ } else {
+ n_bit = indicator_idv.size() / 4 + 1;
+ }
+
+ // Print the first 3 magic numbers.
+ for (size_t i = 0; i < 3; ++i) {
+ infile.read(ch, 1);
+ b = ch[0];
+ }
+
+ ns_test = 0;
+
+ for (vector::size_type t = 0; t < snpInfo.size(); ++t) {
+ if (t % d_pace == 0 || t == snpInfo.size() - 1) {
+ ProgressBar("Reading SNPs ", t, snpInfo.size() - 1);
+ }
+
+ rs = snpInfo[t].rs_number;
+
+ if (mapRS2est.count(rs) == 0) {
+ continue;
+ } else {
+ effect_size = mapRS2est[rs];
+ }
+
+ // n_bit, and 3 is the number of magic numbers.
+ infile.seekg(t * n_bit + 3);
+
+ // Read genotypes.
+ x_mean = 0.0;
+ n_miss = 0;
+ ci_total = 0;
+ ci_test = 0;
+ x_train_mean = 0;
+ n_train_nomiss = 0;
+ for (size_t i = 0; i < n_bit; ++i) {
+ infile.read(ch, 1);
+ b = ch[0];
+
+ // Minor allele homozygous: 2.0; major: 0.0.
+ for (size_t j = 0; j < 4; ++j) {
+ if ((i == (n_bit - 1)) && ci_total == indicator_idv.size()) {
+ break;
+ }
+ if (indicator_idv[ci_total] == 1) {
+ if (b[2 * j] == 0) {
+ if (b[2 * j + 1] == 0) {
+ x_train_mean += 2.0;
+ n_train_nomiss++;
+ } else {
+ x_train_mean += 1.0;
+ n_train_nomiss++;
+ }
+ } else {
+ if (b[2 * j + 1] == 1) {
+ n_train_nomiss++;
+ } else {
+ }
+ }
+ } else {
+ if (b[2 * j] == 0) {
+ if (b[2 * j + 1] == 0) {
+ gsl_vector_set(x, ci_test, 2);
+ x_mean += 2.0;
+ } else {
+ gsl_vector_set(x, ci_test, 1);
+ x_mean += 1.0;
+ }
+ } else {
+ if (b[2 * j + 1] == 1) {
+ gsl_vector_set(x, ci_test, 0);
+ } else {
+ gsl_vector_set(x, ci_test, -9);
+ n_miss++;
+ }
+ }
+ ci_test++;
+ }
+ ci_total++;
+ }
+ }
+
+ if (x->size == n_miss) {
+ cout << "snp " << rs << " has missing genotype for all "
+ << "individuals and will be ignored." << endl;
+ continue;
+ }
+
+ x_mean /= (double)(x->size - n_miss);
+ x_train_mean /= (double)(n_train_nomiss);
+
+ for (size_t i = 0; i < x->size; ++i) {
+ geno = gsl_vector_get(x, i);
+ if (geno == -9) {
+ gsl_vector_set(x, i, x_mean - x_train_mean);
+ } else {
+ gsl_vector_set(x, i, geno - x_train_mean);
+ }
+ }
+
+ gsl_vector_scale(x, effect_size);
+ gsl_vector_add(y_prdt, x);
+
+ ns_test++;
+ }
+ cout << endl;
+
+ gsl_vector_free(x);
+
+ infile.close();
+ infile.clear();
+
+ return;
}
// Predict missing phenotypes using ridge regression.
// Y_hat contains fixed effects
-void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H,
- gsl_matrix *Y_full) {
- gsl_vector *y_obs=gsl_vector_alloc (np_obs);
- gsl_vector *y_miss=gsl_vector_alloc (np_miss);
- gsl_matrix *H_oo=gsl_matrix_alloc (np_obs, np_obs);
- gsl_matrix *H_mo=gsl_matrix_alloc (np_miss, np_obs);
- gsl_vector *Hiy=gsl_vector_alloc (np_obs);
-
- size_t c_obs1=0, c_obs2=0, c_miss1=0, c_miss2=0;
-
- // Obtain H_oo, H_mo.
- c_obs1=0; c_miss1=0;
- for (vector::size_type i1=0; i1::size_type j1=0; j1::size_type i2=0;
- i2::size_type j2=0;
- j2::size_type i=0;
- i::size_type j=0; j::size_type i=0;
- i::size_type j=0; j::size_type i1 = 0; i1 < indicator_pheno.size(); ++i1) {
+ if (indicator_cvt[i1] == 0) {
+ continue;
+ }
+ for (vector::size_type j1 = 0; j1 < n_ph; ++j1) {
+
+ c_obs2 = 0;
+ c_miss2 = 0;
+ for (vector::size_type i2 = 0; i2 < indicator_pheno.size(); ++i2) {
+ if (indicator_cvt[i2] == 0) {
+ continue;
+ }
+ for (vector::size_type j2 = 0; j2 < n_ph; j2++) {
+
+ if (indicator_pheno[i2][j2] == 1) {
+ if (indicator_pheno[i1][j1] == 1) {
+ gsl_matrix_set(
+ H_oo, c_obs1, c_obs2,
+ gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
+ } else {
+ gsl_matrix_set(
+ H_mo, c_miss1, c_obs2,
+ gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
+ }
+ c_obs2++;
+ } else {
+ c_miss2++;
+ }
+ }
+ }
+
+ if (indicator_pheno[i1][j1] == 1) {
+ c_obs1++;
+ } else {
+ c_miss1++;
+ }
+ }
+ }
+
+ // Do LU decomposition of H_oo.
+ int sig;
+ gsl_permutation *pmt = gsl_permutation_alloc(np_obs);
+ LUDecomp(H_oo, pmt, &sig);
+
+ // Obtain y_obs=y_full-y_hat.
+ // Add the fixed effects part to y_miss: y_miss=y_hat.
+ c_obs1 = 0;
+ c_miss1 = 0;
+ for (vector::size_type i = 0; i < indicator_pheno.size(); ++i) {
+ if (indicator_cvt[i] == 0) {
+ continue;
+ }
+
+ for (vector::size_type j = 0; j < n_ph; ++j) {
+ if (indicator_pheno[i][j] == 1) {
+ gsl_vector_set(y_obs, c_obs1, gsl_matrix_get(Y_full, i, j) -
+ gsl_matrix_get(Y_hat, i, j));
+ c_obs1++;
+ } else {
+ gsl_vector_set(y_miss, c_miss1, gsl_matrix_get(Y_hat, i, j));
+ c_miss1++;
+ }
+ }
+ }
+
+ LUSolve(H_oo, pmt, y_obs, Hiy);
+
+ gsl_blas_dgemv(CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
+
+ // Put back predicted y_miss to Y_full.
+ c_miss1 = 0;
+ for (vector::size_type i = 0; i < indicator_pheno.size(); ++i) {
+ if (indicator_cvt[i] == 0) {
+ continue;
+ }
+
+ for (vector::size_type j = 0; j < n_ph; ++j) {
+ if (indicator_pheno[i][j] == 0) {
+ gsl_matrix_set(Y_full, i, j, gsl_vector_get(y_miss, c_miss1));
+ c_miss1++;
+ }
+ }
+ }
+
+ // Free matrices.
+ gsl_vector_free(y_obs);
+ gsl_vector_free(y_miss);
+ gsl_matrix_free(H_oo);
+ gsl_matrix_free(H_mo);
+ gsl_vector_free(Hiy);
+
+ return;
}
-
-
--
cgit v1.2.3