aboutsummaryrefslogtreecommitdiff
path: root/src/prdt.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/prdt.cpp')
-rw-r--r--src/prdt.cpp988
1 files changed, 499 insertions, 489 deletions
diff --git a/src/prdt.cpp b/src/prdt.cpp
index b29d150..3e7c004 100644
--- a/src/prdt.cpp
+++ b/src/prdt.cpp
@@ -16,527 +16,537 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-#include <iostream>
-#include <sstream>
+#include "gsl/gsl_blas.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_vector.h"
+#include <bitset>
+#include <cmath>
#include <fstream>
-#include <string>
#include <iomanip>
-#include <bitset>
-#include <vector>
+#include <iostream>
+#include <sstream>
#include <stdio.h>
#include <stdlib.h>
-#include <cmath>
-#include "gsl/gsl_vector.h"
-#include "gsl/gsl_matrix.h"
-#include "gsl/gsl_linalg.h"
-#include "gsl/gsl_blas.h"
+#include <string>
+#include <vector>
-#include "io.h"
-#include "lapack.h"
#include "gzstream.h"
#include "io.h"
-#include "prdt.h"
+#include "io.h"
+#include "lapack.h"
#include "mathfunc.h"
+#include "prdt.h"
using namespace std;
-void PRDT::CopyFromParam (PARAM &cPar) {
- a_mode=cPar.a_mode;
- d_pace=cPar.d_pace;
+void PRDT::CopyFromParam(PARAM &cPar) {
+ a_mode = cPar.a_mode;
+ d_pace = cPar.d_pace;
- file_bfile=cPar.file_bfile;
- file_geno=cPar.file_geno;
- file_out=cPar.file_out;
- path_out=cPar.path_out;
+ file_bfile = cPar.file_bfile;
+ file_geno = cPar.file_geno;
+ file_out = cPar.file_out;
+ path_out = cPar.path_out;
- indicator_pheno=cPar.indicator_pheno;
- indicator_cvt=cPar.indicator_cvt;
- indicator_idv=cPar.indicator_idv;
+ indicator_pheno = cPar.indicator_pheno;
+ indicator_cvt = cPar.indicator_cvt;
+ indicator_idv = cPar.indicator_idv;
- snpInfo=cPar.snpInfo;
- mapRS2est=cPar.mapRS2est;
+ snpInfo = cPar.snpInfo;
+ mapRS2est = cPar.mapRS2est;
- time_eigen=0;
+ time_eigen = 0;
- n_ph=cPar.n_ph;
- np_obs=cPar.np_obs;
- np_miss=cPar.np_miss;
- ns_total=cPar.ns_total;
- ns_test=0;
+ n_ph = cPar.n_ph;
+ np_obs = cPar.np_obs;
+ np_miss = cPar.np_miss;
+ ns_total = cPar.ns_total;
+ ns_test = 0;
- return;
+ return;
}
-void PRDT::CopyToParam (PARAM &cPar) {
- cPar.ns_test=ns_test;
- cPar.time_eigen=time_eigen;
+void PRDT::CopyToParam(PARAM &cPar) {
+ cPar.ns_test = ns_test;
+ cPar.time_eigen = time_eigen;
- return;
+ return;
}
-void PRDT::WriteFiles (gsl_vector *y_prdt) {
- string file_str;
- file_str=path_out+"/"+file_out;
- file_str+=".";
- file_str+="prdt";
- file_str+=".txt";
-
- ofstream outfile (file_str.c_str(), ofstream::out);
- if (!outfile) {
- cout<<"error writing file: "<<file_str.c_str()<<endl;
- return;
- }
-
- size_t ci_test=0;
- for (size_t i=0; i<indicator_idv.size(); i++) {
- if (indicator_idv[i]==1) {
- outfile<<"NA"<<endl;
- } else {
- outfile<<gsl_vector_get (y_prdt, ci_test)<<endl;
- ci_test++;
- }
- }
-
- outfile.close();
- outfile.clear();
- return;
+void PRDT::WriteFiles(gsl_vector *y_prdt) {
+ string file_str;
+ file_str = path_out + "/" + file_out;
+ file_str += ".";
+ file_str += "prdt";
+ file_str += ".txt";
+
+ ofstream outfile(file_str.c_str(), ofstream::out);
+ if (!outfile) {
+ cout << "error writing file: " << file_str.c_str() << endl;
+ return;
+ }
+
+ size_t ci_test = 0;
+ for (size_t i = 0; i < indicator_idv.size(); i++) {
+ if (indicator_idv[i] == 1) {
+ outfile << "NA" << endl;
+ } else {
+ outfile << gsl_vector_get(y_prdt, ci_test) << endl;
+ ci_test++;
+ }
+ }
+
+ outfile.close();
+ outfile.clear();
+ return;
}
-void PRDT::WriteFiles (gsl_matrix *Y_full) {
- string file_str;
- file_str=path_out+"/"+file_out;
- file_str+=".prdt.txt";
-
- ofstream outfile (file_str.c_str(), ofstream::out);
- if (!outfile) {
- cout<<"error writing file: "<<file_str.c_str()<<endl;
- return;
- }
-
- size_t ci_test=0;
- for (size_t i=0; i<indicator_cvt.size(); i++) {
- if (indicator_cvt[i]==0) {
- outfile<<"NA"<<endl;
- } else {
- for (size_t j=0; j<Y_full->size2; j++) {
- outfile << gsl_matrix_get(Y_full,ci_test,j) <<
- "\t";
- }
- outfile<<endl;
- ci_test++;
- }
- }
-
- outfile.close();
- outfile.clear();
- return;
+void PRDT::WriteFiles(gsl_matrix *Y_full) {
+ string file_str;
+ file_str = path_out + "/" + file_out;
+ file_str += ".prdt.txt";
+
+ ofstream outfile(file_str.c_str(), ofstream::out);
+ if (!outfile) {
+ cout << "error writing file: " << file_str.c_str() << endl;
+ return;
+ }
+
+ size_t ci_test = 0;
+ for (size_t i = 0; i < indicator_cvt.size(); i++) {
+ if (indicator_cvt[i] == 0) {
+ outfile << "NA" << endl;
+ } else {
+ for (size_t j = 0; j < Y_full->size2; j++) {
+ outfile << gsl_matrix_get(Y_full, ci_test, j) << "\t";
+ }
+ outfile << endl;
+ ci_test++;
+ }
+ }
+
+ outfile.close();
+ outfile.clear();
+ return;
}
-void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) {
- size_t ni_test=u_hat->size, ni_total=G->size1;
-
- gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test);
- gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test);
- gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test);
- gsl_vector *eval=gsl_vector_alloc (ni_test);
- gsl_vector *Utu=gsl_vector_alloc (ni_test);
- gsl_vector *w=gsl_vector_alloc (ni_total);
- gsl_permutation *pmt=gsl_permutation_alloc (ni_test);
-
- //center matrix G based on indicator_idv
- for (size_t i=0; i<ni_total; i++) {
- gsl_vector_set(w, i, indicator_idv[i]);
- }
- CenterMatrix(G, w);
-
- //obtain Koo and Kfo
- size_t o_i=0, o_j=0;
- double d;
- for (size_t i=0; i<indicator_idv.size(); i++) {
- o_j=0;
- for (size_t j=0; j<indicator_idv.size(); j++) {
- d=gsl_matrix_get(G, i, j);
- if (indicator_idv[i]==1 && indicator_idv[j]==1) {
- gsl_matrix_set(Goo, o_i, o_j, d);
- }
- if (indicator_idv[i]==0 && indicator_idv[j]==1) {
- gsl_matrix_set(Gfo, i-o_i, o_j, d);
- }
- if (indicator_idv[j]==1) {o_j++;}
- }
- if (indicator_idv[i]==1) {o_i++;}
- }
-
- //matrix operations to get u_prdt
- cout<<"Start Eigen-Decomposition..."<<endl;
- clock_t time_start=clock();
- EigenDecomp (Goo, U, eval, 0);
- for (size_t i=0; i<eval->size; i++) {
- if (gsl_vector_get(eval,i)<1e-10) {
- gsl_vector_set(eval, i, 0);
- }
- }
-
- time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
-
- gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu);
- for (size_t i=0; i<eval->size; i++) {
- d=gsl_vector_get(eval, i);
- if (d!=0) {
- d=gsl_vector_get(Utu, i)/d;
- gsl_vector_set(Utu, i, d);
- }
- }
- gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval);
- gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
-
- // Free matrices.
- gsl_matrix_free(Goo);
- gsl_matrix_free(Gfo);
- gsl_matrix_free(U);
- gsl_vector_free(eval);
- gsl_vector_free(Utu);
- gsl_vector_free(w);
- gsl_permutation_free(pmt);
-
- return;
+void PRDT::AddBV(gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) {
+ size_t ni_test = u_hat->size, ni_total = G->size1;
+
+ gsl_matrix *Goo = gsl_matrix_alloc(ni_test, ni_test);
+ gsl_matrix *Gfo = gsl_matrix_alloc(ni_total - ni_test, ni_test);
+ gsl_matrix *U = gsl_matrix_alloc(ni_test, ni_test);
+ gsl_vector *eval = gsl_vector_alloc(ni_test);
+ gsl_vector *Utu = gsl_vector_alloc(ni_test);
+ gsl_vector *w = gsl_vector_alloc(ni_total);
+ gsl_permutation *pmt = gsl_permutation_alloc(ni_test);
+
+ // center matrix G based on indicator_idv
+ for (size_t i = 0; i < ni_total; i++) {
+ gsl_vector_set(w, i, indicator_idv[i]);
+ }
+ CenterMatrix(G, w);
+
+ // obtain Koo and Kfo
+ size_t o_i = 0, o_j = 0;
+ double d;
+ for (size_t i = 0; i < indicator_idv.size(); i++) {
+ o_j = 0;
+ for (size_t j = 0; j < indicator_idv.size(); j++) {
+ d = gsl_matrix_get(G, i, j);
+ if (indicator_idv[i] == 1 && indicator_idv[j] == 1) {
+ gsl_matrix_set(Goo, o_i, o_j, d);
+ }
+ if (indicator_idv[i] == 0 && indicator_idv[j] == 1) {
+ gsl_matrix_set(Gfo, i - o_i, o_j, d);
+ }
+ if (indicator_idv[j] == 1) {
+ o_j++;
+ }
+ }
+ if (indicator_idv[i] == 1) {
+ o_i++;
+ }
+ }
+
+ // matrix operations to get u_prdt
+ cout << "Start Eigen-Decomposition..." << endl;
+ clock_t time_start = clock();
+ EigenDecomp(Goo, U, eval, 0);
+ for (size_t i = 0; i < eval->size; i++) {
+ if (gsl_vector_get(eval, i) < 1e-10) {
+ gsl_vector_set(eval, i, 0);
+ }
+ }
+
+ time_eigen = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
+
+ gsl_blas_dgemv(CblasTrans, 1.0, U, u_hat, 0.0, Utu);
+ for (size_t i = 0; i < eval->size; i++) {
+ d = gsl_vector_get(eval, i);
+ if (d != 0) {
+ d = gsl_vector_get(Utu, i) / d;
+ gsl_vector_set(Utu, i, d);
+ }
+ }
+ gsl_blas_dgemv(CblasNoTrans, 1.0, U, Utu, 0.0, eval);
+ gsl_blas_dgemv(CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
+
+ // Free matrices.
+ gsl_matrix_free(Goo);
+ gsl_matrix_free(Gfo);
+ gsl_matrix_free(U);
+ gsl_vector_free(eval);
+ gsl_vector_free(Utu);
+ gsl_vector_free(w);
+ gsl_permutation_free(pmt);
+
+ return;
}
-void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) {
- igzstream infile (file_geno.c_str(), igzstream::in);
- if (!infile) {
- cout<<"error reading genotype file:"<<file_geno<<endl;
- return;
- }
-
- string line;
- char *ch_ptr;
- string rs;
-
- size_t n_miss, n_train_nomiss, c_phen;
- double geno, x_mean, x_train_mean, effect_size;
-
- gsl_vector *x=gsl_vector_alloc (y_prdt->size);
- gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size);
-
- ns_test=0;
-
- // Start reading genotypes and analyze.
- for (size_t t=0; t<ns_total; ++t) {
- !safeGetline(infile, line).eof();
- if (t%d_pace==0 || t==(ns_total-1)) {
- ProgressBar ("Reading SNPs ", t, ns_total-1);
- }
-
- ch_ptr=strtok ((char *)line.c_str(), " , \t");
- rs=ch_ptr;
- ch_ptr=strtok (NULL, " , \t");
- ch_ptr=strtok (NULL, " , \t");
-
- if (mapRS2est.count(rs)==0) {
- continue;
- } else {
- effect_size=mapRS2est[rs];
- }
-
- x_mean=0.0;
- c_phen=0;
- n_miss=0;
- x_train_mean=0;
- n_train_nomiss=0;
-
- gsl_vector_set_zero(x_miss);
-
- for (size_t i=0; i<indicator_idv.size(); ++i) {
- ch_ptr=strtok (NULL, " , \t");
- if (indicator_idv[i]==1) {
- if (strcmp(ch_ptr, "NA")!=0) {
- geno=atof(ch_ptr);
- x_train_mean+=geno;
- n_train_nomiss++;
- }
- } else {
- if (strcmp(ch_ptr, "NA")==0) {
- gsl_vector_set(x_miss, c_phen, 0.0);
- n_miss++;
- } else {
- geno=atof(ch_ptr);
-
- gsl_vector_set(x, c_phen, geno);
- gsl_vector_set(x_miss, c_phen, 1.0);
- x_mean+=geno;
- }
- c_phen++;
- }
- }
-
- if (x->size==n_miss) {
- cout << "snp " << rs << " has missing genotype for all " <<
- "individuals and will be ignored." << endl;
- continue;}
-
-
- x_mean/=(double)(x->size-n_miss);
- x_train_mean/=(double)(n_train_nomiss);
-
-
- for (size_t i=0; i<x->size; ++i) {
- geno=gsl_vector_get(x, i);
- if (gsl_vector_get (x_miss, i)==0) {
- gsl_vector_set(x, i, x_mean-x_train_mean);
- } else {
- gsl_vector_set(x, i, geno-x_train_mean);
- }
- }
-
- gsl_vector_scale (x, effect_size);
- gsl_vector_add (y_prdt, x);
-
- ns_test++;
- }
- cout<<endl;
-
- gsl_vector_free (x);
- gsl_vector_free (x_miss);
-
- infile.close();
- infile.clear();
-
- return;
+void PRDT::AnalyzeBimbam(gsl_vector *y_prdt) {
+ igzstream infile(file_geno.c_str(), igzstream::in);
+ if (!infile) {
+ cout << "error reading genotype file:" << file_geno << endl;
+ return;
+ }
+
+ string line;
+ char *ch_ptr;
+ string rs;
+
+ size_t n_miss, n_train_nomiss, c_phen;
+ double geno, x_mean, x_train_mean, effect_size;
+
+ gsl_vector *x = gsl_vector_alloc(y_prdt->size);
+ gsl_vector *x_miss = gsl_vector_alloc(y_prdt->size);
+
+ ns_test = 0;
+
+ // Start reading genotypes and analyze.
+ for (size_t t = 0; t < ns_total; ++t) {
+ !safeGetline(infile, line).eof();
+ if (t % d_pace == 0 || t == (ns_total - 1)) {
+ ProgressBar("Reading SNPs ", t, ns_total - 1);
+ }
+
+ ch_ptr = strtok((char *)line.c_str(), " , \t");
+ rs = ch_ptr;
+ ch_ptr = strtok(NULL, " , \t");
+ ch_ptr = strtok(NULL, " , \t");
+
+ if (mapRS2est.count(rs) == 0) {
+ continue;
+ } else {
+ effect_size = mapRS2est[rs];
+ }
+
+ x_mean = 0.0;
+ c_phen = 0;
+ n_miss = 0;
+ x_train_mean = 0;
+ n_train_nomiss = 0;
+
+ gsl_vector_set_zero(x_miss);
+
+ for (size_t i = 0; i < indicator_idv.size(); ++i) {
+ ch_ptr = strtok(NULL, " , \t");
+ if (indicator_idv[i] == 1) {
+ if (strcmp(ch_ptr, "NA") != 0) {
+ geno = atof(ch_ptr);
+ x_train_mean += geno;
+ n_train_nomiss++;
+ }
+ } else {
+ if (strcmp(ch_ptr, "NA") == 0) {
+ gsl_vector_set(x_miss, c_phen, 0.0);
+ n_miss++;
+ } else {
+ geno = atof(ch_ptr);
+
+ gsl_vector_set(x, c_phen, geno);
+ gsl_vector_set(x_miss, c_phen, 1.0);
+ x_mean += geno;
+ }
+ c_phen++;
+ }
+ }
+
+ if (x->size == n_miss) {
+ cout << "snp " << rs << " has missing genotype for all "
+ << "individuals and will be ignored." << endl;
+ continue;
+ }
+
+ x_mean /= (double)(x->size - n_miss);
+ x_train_mean /= (double)(n_train_nomiss);
+
+ for (size_t i = 0; i < x->size; ++i) {
+ geno = gsl_vector_get(x, i);
+ if (gsl_vector_get(x_miss, i) == 0) {
+ gsl_vector_set(x, i, x_mean - x_train_mean);
+ } else {
+ gsl_vector_set(x, i, geno - x_train_mean);
+ }
+ }
+
+ gsl_vector_scale(x, effect_size);
+ gsl_vector_add(y_prdt, x);
+
+ ns_test++;
+ }
+ cout << endl;
+
+ gsl_vector_free(x);
+ gsl_vector_free(x_miss);
+
+ infile.close();
+ infile.clear();
+
+ return;
}
-void PRDT::AnalyzePlink (gsl_vector *y_prdt) {
- string file_bed=file_bfile+".bed";
- ifstream infile (file_bed.c_str(), ios::binary);
- if (!infile) {
- cout<<"error reading bed file:"<<file_bed<<endl;
- return;
- }
-
- char ch[1];
- bitset<8> b;
- string rs;
-
- size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
- double geno, x_mean, x_train_mean, effect_size;
-
- gsl_vector *x=gsl_vector_alloc (y_prdt->size);
-
- // Calculate n_bit and c, the number of bit for each SNP.
- if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;}
- else {n_bit=indicator_idv.size()/4+1; }
-
- // Print the first 3 magic numbers.
- for (size_t i=0; i<3; ++i) {
- infile.read(ch,1);
- b=ch[0];
- }
-
- ns_test=0;
-
- for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
- if (t%d_pace==0 || t==snpInfo.size()-1) {
- ProgressBar ("Reading SNPs ", t, snpInfo.size()-1);
- }
-
- rs=snpInfo[t].rs_number;
-
- if (mapRS2est.count(rs)==0) {
- continue;
- } else {
- effect_size=mapRS2est[rs];
- }
-
- // n_bit, and 3 is the number of magic numbers.
- infile.seekg(t*n_bit+3);
-
- // Read genotypes.
- x_mean=0.0;
- n_miss=0;
- ci_total=0; ci_test=0; x_train_mean=0; n_train_nomiss=0;
- for (size_t i=0; i<n_bit; ++i) {
- infile.read(ch,1);
- b=ch[0];
-
- // Minor allele homozygous: 2.0; major: 0.0.
- for (size_t j=0; j<4; ++j) {
- if ((i==(n_bit-1)) &&
- ci_total==indicator_idv.size()) {
- break;
- }
- if (indicator_idv[ci_total]==1) {
- if (b[2*j]==0) {
- if (b[2*j+1]==0) {
- x_train_mean+=2.0;
- n_train_nomiss++;
- }
- else {
- x_train_mean+=1.0;
- n_train_nomiss++;
- }
- }
- else {
- if (b[2*j+1]==1) {
- n_train_nomiss++;
- }
- else {}
- }
- } else {
- if (b[2*j]==0) {
- if (b[2*j+1]==0) {
- gsl_vector_set(x,ci_test,2);
- x_mean+=2.0;
- }
- else {
- gsl_vector_set(x,ci_test,1);
- x_mean+=1.0;
- }
- }
- else {
- if (b[2*j+1]==1) {
- gsl_vector_set(x,ci_test,0);
- }
- else {
- gsl_vector_set(x,ci_test,-9);
- n_miss++;
- }
- }
- ci_test++;
- }
- ci_total++;
-
- }
- }
-
- if (x->size==n_miss) {
- cout << "snp " << rs << " has missing genotype for all " <<
- "individuals and will be ignored."<<endl;
- continue;
- }
-
- x_mean/=(double)(x->size-n_miss);
- x_train_mean/=(double)(n_train_nomiss);
-
- for (size_t i=0; i<x->size; ++i) {
- geno=gsl_vector_get(x, i);
- if (geno==-9) {
- gsl_vector_set(x, i, x_mean-x_train_mean);
- } else {
- gsl_vector_set(x, i, geno-x_train_mean);
- }
- }
-
- gsl_vector_scale (x, effect_size);
- gsl_vector_add (y_prdt, x);
-
- ns_test++;
- }
- cout<<endl;
-
- gsl_vector_free (x);
-
- infile.close();
- infile.clear();
-
- return;
+void PRDT::AnalyzePlink(gsl_vector *y_prdt) {
+ string file_bed = file_bfile + ".bed";
+ ifstream infile(file_bed.c_str(), ios::binary);
+ if (!infile) {
+ cout << "error reading bed file:" << file_bed << endl;
+ return;
+ }
+
+ char ch[1];
+ bitset<8> b;
+ string rs;
+
+ size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
+ double geno, x_mean, x_train_mean, effect_size;
+
+ gsl_vector *x = gsl_vector_alloc(y_prdt->size);
+
+ // Calculate n_bit and c, the number of bit for each SNP.
+ if (indicator_idv.size() % 4 == 0) {
+ n_bit = indicator_idv.size() / 4;
+ } else {
+ n_bit = indicator_idv.size() / 4 + 1;
+ }
+
+ // Print the first 3 magic numbers.
+ for (size_t i = 0; i < 3; ++i) {
+ infile.read(ch, 1);
+ b = ch[0];
+ }
+
+ ns_test = 0;
+
+ for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) {
+ if (t % d_pace == 0 || t == snpInfo.size() - 1) {
+ ProgressBar("Reading SNPs ", t, snpInfo.size() - 1);
+ }
+
+ rs = snpInfo[t].rs_number;
+
+ if (mapRS2est.count(rs) == 0) {
+ continue;
+ } else {
+ effect_size = mapRS2est[rs];
+ }
+
+ // n_bit, and 3 is the number of magic numbers.
+ infile.seekg(t * n_bit + 3);
+
+ // Read genotypes.
+ x_mean = 0.0;
+ n_miss = 0;
+ ci_total = 0;
+ ci_test = 0;
+ x_train_mean = 0;
+ n_train_nomiss = 0;
+ for (size_t i = 0; i < n_bit; ++i) {
+ infile.read(ch, 1);
+ b = ch[0];
+
+ // Minor allele homozygous: 2.0; major: 0.0.
+ for (size_t j = 0; j < 4; ++j) {
+ if ((i == (n_bit - 1)) && ci_total == indicator_idv.size()) {
+ break;
+ }
+ if (indicator_idv[ci_total] == 1) {
+ if (b[2 * j] == 0) {
+ if (b[2 * j + 1] == 0) {
+ x_train_mean += 2.0;
+ n_train_nomiss++;
+ } else {
+ x_train_mean += 1.0;
+ n_train_nomiss++;
+ }
+ } else {
+ if (b[2 * j + 1] == 1) {
+ n_train_nomiss++;
+ } else {
+ }
+ }
+ } else {
+ if (b[2 * j] == 0) {
+ if (b[2 * j + 1] == 0) {
+ gsl_vector_set(x, ci_test, 2);
+ x_mean += 2.0;
+ } else {
+ gsl_vector_set(x, ci_test, 1);
+ x_mean += 1.0;
+ }
+ } else {
+ if (b[2 * j + 1] == 1) {
+ gsl_vector_set(x, ci_test, 0);
+ } else {
+ gsl_vector_set(x, ci_test, -9);
+ n_miss++;
+ }
+ }
+ ci_test++;
+ }
+ ci_total++;
+ }
+ }
+
+ if (x->size == n_miss) {
+ cout << "snp " << rs << " has missing genotype for all "
+ << "individuals and will be ignored." << endl;
+ continue;
+ }
+
+ x_mean /= (double)(x->size - n_miss);
+ x_train_mean /= (double)(n_train_nomiss);
+
+ for (size_t i = 0; i < x->size; ++i) {
+ geno = gsl_vector_get(x, i);
+ if (geno == -9) {
+ gsl_vector_set(x, i, x_mean - x_train_mean);
+ } else {
+ gsl_vector_set(x, i, geno - x_train_mean);
+ }
+ }
+
+ gsl_vector_scale(x, effect_size);
+ gsl_vector_add(y_prdt, x);
+
+ ns_test++;
+ }
+ cout << endl;
+
+ gsl_vector_free(x);
+
+ infile.close();
+ infile.clear();
+
+ return;
}
// Predict missing phenotypes using ridge regression.
// Y_hat contains fixed effects
-void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H,
- gsl_matrix *Y_full) {
- gsl_vector *y_obs=gsl_vector_alloc (np_obs);
- gsl_vector *y_miss=gsl_vector_alloc (np_miss);
- gsl_matrix *H_oo=gsl_matrix_alloc (np_obs, np_obs);
- gsl_matrix *H_mo=gsl_matrix_alloc (np_miss, np_obs);
- gsl_vector *Hiy=gsl_vector_alloc (np_obs);
-
- size_t c_obs1=0, c_obs2=0, c_miss1=0, c_miss2=0;
-
- // Obtain H_oo, H_mo.
- c_obs1=0; c_miss1=0;
- for (vector<int>::size_type i1=0; i1<indicator_pheno.size(); ++i1) {
- if (indicator_cvt[i1]==0) {continue;}
- for (vector<int>::size_type j1=0; j1<n_ph; ++j1) {
-
- c_obs2=0; c_miss2=0;
- for (vector<int>::size_type i2=0;
- i2<indicator_pheno.size(); ++i2) {
- if (indicator_cvt[i2]==0) {continue;}
- for (vector<int>::size_type j2=0;
- j2<n_ph; j2++) {
-
- if (indicator_pheno[i2][j2]==1) {
- if (indicator_pheno[i1][j1]==1) {
- gsl_matrix_set(H_oo,c_obs1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) );
- } else {
- gsl_matrix_set (H_mo, c_miss1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) );
- }
- c_obs2++;
- } else {
- c_miss2++;
- }
- }
- }
-
- if (indicator_pheno[i1][j1]==1) {
- c_obs1++;
- } else {
- c_miss1++;
- }
- }
-
- }
-
- // Do LU decomposition of H_oo.
- int sig;
- gsl_permutation * pmt=gsl_permutation_alloc (np_obs);
- LUDecomp (H_oo, pmt, &sig);
-
- // Obtain y_obs=y_full-y_hat.
- // Add the fixed effects part to y_miss: y_miss=y_hat.
- c_obs1=0; c_miss1=0;
- for (vector<int>::size_type i=0;
- i<indicator_pheno.size(); ++i) {
- if (indicator_cvt[i]==0) {continue;}
-
- for (vector<int>::size_type j=0; j<n_ph; ++j) {
- if (indicator_pheno[i][j]==1) {
- gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j)-gsl_matrix_get (Y_hat, i, j) );
- c_obs1++;
- } else {
- gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, i, j) );
- c_miss1++;
- }
- }
- }
-
- LUSolve (H_oo, pmt, y_obs, Hiy);
-
- gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
-
- // Put back predicted y_miss to Y_full.
- c_miss1=0;
- for (vector<int>::size_type i=0;
- i<indicator_pheno.size(); ++i) {
- if (indicator_cvt[i]==0) {continue;}
-
- for (vector<int>::size_type j=0; j<n_ph; ++j) {
- if (indicator_pheno[i][j]==0) {
- gsl_matrix_set (Y_full, i, j, gsl_vector_get (y_miss, c_miss1) );
- c_miss1++;
- }
- }
- }
-
- // Free matrices.
- gsl_vector_free(y_obs);
- gsl_vector_free(y_miss);
- gsl_matrix_free(H_oo);
- gsl_matrix_free(H_mo);
- gsl_vector_free(Hiy);
-
- return;
+void PRDT::MvnormPrdt(const gsl_matrix *Y_hat, const gsl_matrix *H,
+ gsl_matrix *Y_full) {
+ gsl_vector *y_obs = gsl_vector_alloc(np_obs);
+ gsl_vector *y_miss = gsl_vector_alloc(np_miss);
+ gsl_matrix *H_oo = gsl_matrix_alloc(np_obs, np_obs);
+ gsl_matrix *H_mo = gsl_matrix_alloc(np_miss, np_obs);
+ gsl_vector *Hiy = gsl_vector_alloc(np_obs);
+
+ size_t c_obs1 = 0, c_obs2 = 0, c_miss1 = 0, c_miss2 = 0;
+
+ // Obtain H_oo, H_mo.
+ c_obs1 = 0;
+ c_miss1 = 0;
+ for (vector<int>::size_type i1 = 0; i1 < indicator_pheno.size(); ++i1) {
+ if (indicator_cvt[i1] == 0) {
+ continue;
+ }
+ for (vector<int>::size_type j1 = 0; j1 < n_ph; ++j1) {
+
+ c_obs2 = 0;
+ c_miss2 = 0;
+ for (vector<int>::size_type i2 = 0; i2 < indicator_pheno.size(); ++i2) {
+ if (indicator_cvt[i2] == 0) {
+ continue;
+ }
+ for (vector<int>::size_type j2 = 0; j2 < n_ph; j2++) {
+
+ if (indicator_pheno[i2][j2] == 1) {
+ if (indicator_pheno[i1][j1] == 1) {
+ gsl_matrix_set(
+ H_oo, c_obs1, c_obs2,
+ gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
+ } else {
+ gsl_matrix_set(
+ H_mo, c_miss1, c_obs2,
+ gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
+ }
+ c_obs2++;
+ } else {
+ c_miss2++;
+ }
+ }
+ }
+
+ if (indicator_pheno[i1][j1] == 1) {
+ c_obs1++;
+ } else {
+ c_miss1++;
+ }
+ }
+ }
+
+ // Do LU decomposition of H_oo.
+ int sig;
+ gsl_permutation *pmt = gsl_permutation_alloc(np_obs);
+ LUDecomp(H_oo, pmt, &sig);
+
+ // Obtain y_obs=y_full-y_hat.
+ // Add the fixed effects part to y_miss: y_miss=y_hat.
+ c_obs1 = 0;
+ c_miss1 = 0;
+ for (vector<int>::size_type i = 0; i < indicator_pheno.size(); ++i) {
+ if (indicator_cvt[i] == 0) {
+ continue;
+ }
+
+ for (vector<int>::size_type j = 0; j < n_ph; ++j) {
+ if (indicator_pheno[i][j] == 1) {
+ gsl_vector_set(y_obs, c_obs1, gsl_matrix_get(Y_full, i, j) -
+ gsl_matrix_get(Y_hat, i, j));
+ c_obs1++;
+ } else {
+ gsl_vector_set(y_miss, c_miss1, gsl_matrix_get(Y_hat, i, j));
+ c_miss1++;
+ }
+ }
+ }
+
+ LUSolve(H_oo, pmt, y_obs, Hiy);
+
+ gsl_blas_dgemv(CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
+
+ // Put back predicted y_miss to Y_full.
+ c_miss1 = 0;
+ for (vector<int>::size_type i = 0; i < indicator_pheno.size(); ++i) {
+ if (indicator_cvt[i] == 0) {
+ continue;
+ }
+
+ for (vector<int>::size_type j = 0; j < n_ph; ++j) {
+ if (indicator_pheno[i][j] == 0) {
+ gsl_matrix_set(Y_full, i, j, gsl_vector_get(y_miss, c_miss1));
+ c_miss1++;
+ }
+ }
+ }
+
+ // Free matrices.
+ gsl_vector_free(y_obs);
+ gsl_vector_free(y_miss);
+ gsl_matrix_free(H_oo);
+ gsl_matrix_free(H_mo);
+ gsl_vector_free(Hiy);
+
+ return;
}
-
-