about summary refs log tree commit diff
path: root/src/prdt.cpp
diff options
context:
space:
mode:
authorxiangzhou2014-09-22 11:06:02 -0400
committerxiangzhou2014-09-22 11:06:02 -0400
commit7762722f264adc402ea3b0f21923b18f072253ba (patch)
tree879ed22943d424b52bd04b4ee6fbdf51616dc9a9 /src/prdt.cpp
parent44faf98d2c6fe56c916cace02fe498fc1271bd9d (diff)
downloadpangemma-7762722f264adc402ea3b0f21923b18f072253ba.tar.gz
version 0.95alpha
Diffstat (limited to 'src/prdt.cpp')
-rw-r--r--src/prdt.cpp544
1 files changed, 544 insertions, 0 deletions
diff --git a/src/prdt.cpp b/src/prdt.cpp
new file mode 100644
index 0000000..2875119
--- /dev/null
+++ b/src/prdt.cpp
@@ -0,0 +1,544 @@
+/*
+ Genome-wide Efficient Mixed Model Association (GEMMA)
+ Copyright (C) 2011  Xiang Zhou
+ 
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <iomanip>
+#include <bitset>
+#include <vector>
+#include <stdio.h>
+#include <stdlib.h> 
+#include <cmath>
+#include "gsl/gsl_vector.h"
+#include "gsl/gsl_matrix.h"
+#include "gsl/gsl_linalg.h"
+#include "gsl/gsl_blas.h"
+
+
+#include "io.h"
+#include "lapack.h"  //for functions EigenDecomp
+#include "gzstream.h"
+
+#ifdef FORCE_FLOAT
+#include "io_float.h"
+#include "prdt_float.h"
+#include "mathfunc_float.h"
+#else
+#include "io.h"
+#include "prdt.h"
+#include "mathfunc.h"
+#endif
+
+using namespace std;
+
+
+
+
+void PRDT::CopyFromParam (PARAM &cPar) 
+{
+	a_mode=cPar.a_mode;
+	d_pace=cPar.d_pace;
+	
+	file_bfile=cPar.file_bfile;
+	file_geno=cPar.file_geno;
+	file_out=cPar.file_out;
+	path_out=cPar.path_out;
+	
+	indicator_pheno=cPar.indicator_pheno;	
+	indicator_cvt=cPar.indicator_cvt;
+	indicator_idv=cPar.indicator_idv;
+	
+	snpInfo=cPar.snpInfo;
+	mapRS2est=cPar.mapRS2est;
+	
+	time_eigen=0;
+	
+	n_ph=cPar.n_ph;
+	np_obs=cPar.np_obs;
+	np_miss=cPar.np_miss;
+	ns_total=cPar.ns_total;
+	ns_test=0;	
+	
+	return;
+}
+
+void PRDT::CopyToParam (PARAM &cPar) 
+{
+	cPar.ns_test=ns_test;
+	cPar.time_eigen=time_eigen;
+	
+	return;
+}               
+
+
+
+
+void PRDT::WriteFiles (gsl_vector *y_prdt) 
+{
+	string file_str;
+	file_str=path_out+"/"+file_out;
+	file_str+=".";
+	file_str+="prdt";
+	file_str+=".txt";
+	
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+	
+	size_t ci_test=0;
+	for (size_t i=0; i<indicator_idv.size(); i++) {
+		if (indicator_idv[i]==1) {
+			outfile<<"NA"<<endl;
+		} else {
+			outfile<<gsl_vector_get (y_prdt, ci_test)<<endl;
+			ci_test++;
+		}
+	}
+	
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+void PRDT::WriteFiles (gsl_matrix *Y_full) 
+{
+	string file_str;
+	file_str=path_out+"/"+file_out;
+	file_str+=".prdt.txt";
+	
+	ofstream outfile (file_str.c_str(), ofstream::out);
+	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
+	
+	size_t ci_test=0;
+	for (size_t i=0; i<indicator_cvt.size(); i++) {
+		if (indicator_cvt[i]==0) {
+			outfile<<"NA"<<endl;
+		} else {
+			for (size_t j=0; j<Y_full->size2; j++) {
+				outfile<<gsl_matrix_get (Y_full, ci_test, j)<<"\t";
+			}
+			outfile<<endl;
+			ci_test++;
+		}
+	}
+	
+	outfile.close();
+	outfile.clear();
+	return;
+}
+
+
+
+
+void PRDT::AddBV (gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) 
+{
+	size_t ni_test=u_hat->size, ni_total=G->size1;
+	
+	gsl_matrix *Goo=gsl_matrix_alloc (ni_test, ni_test);
+	gsl_matrix *Gfo=gsl_matrix_alloc (ni_total-ni_test, ni_test);
+	gsl_matrix *U=gsl_matrix_alloc (ni_test, ni_test); 
+	gsl_vector *eval=gsl_vector_alloc (ni_test);
+	gsl_vector *Utu=gsl_vector_alloc (ni_test);
+	gsl_vector *w=gsl_vector_alloc (ni_total);
+	gsl_permutation *pmt=gsl_permutation_alloc (ni_test);
+	
+	//center matrix G based on indicator_idv
+	for (size_t i=0; i<ni_total; i++) {
+		gsl_vector_set(w, i, indicator_idv[i]);
+	}
+	CenterMatrix(G, w);
+		
+	//obtain Koo and Kfo
+	size_t o_i=0, o_j=0;
+	double d;
+	for (size_t i=0; i<indicator_idv.size(); i++) {
+		o_j=0;
+		for (size_t j=0; j<indicator_idv.size(); j++) {
+			d=gsl_matrix_get(G, i, j);
+			if (indicator_idv[i]==1 && indicator_idv[j]==1) {
+				gsl_matrix_set(Goo, o_i, o_j, d);
+			}
+			if (indicator_idv[i]==0 && indicator_idv[j]==1) {
+				gsl_matrix_set(Gfo, i-o_i, o_j, d);
+			}
+			if (indicator_idv[j]==1) {o_j++;}
+		}
+		if (indicator_idv[i]==1) {o_i++;}
+	}
+		
+	//matrix operations to get u_prdt
+	cout<<"Start Eigen-Decomposition..."<<endl;
+	clock_t time_start=clock();
+	EigenDecomp (Goo, U, eval, 0);
+	for (size_t i=0; i<eval->size; i++) {
+		if (gsl_vector_get(eval,i)<1e-10) {gsl_vector_set(eval, i, 0);}
+	}
+
+	time_eigen=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);	
+	
+	gsl_blas_dgemv (CblasTrans, 1.0, U, u_hat, 0.0, Utu);
+	for (size_t i=0; i<eval->size; i++) {
+		d=gsl_vector_get(eval, i);
+		if (d!=0) {d=gsl_vector_get(Utu, i)/d; gsl_vector_set(Utu, i, d);}
+	}
+	gsl_blas_dgemv (CblasNoTrans, 1.0, U, Utu, 0.0, eval);
+	gsl_blas_dgemv (CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
+	
+	//free matrices
+	gsl_matrix_free(Goo);
+	gsl_matrix_free(Gfo);
+	gsl_matrix_free(U);
+	gsl_vector_free(eval);
+	gsl_vector_free(Utu);
+	gsl_vector_free(w);
+	gsl_permutation_free(pmt);
+
+	return;	
+}
+
+
+
+void PRDT::AnalyzeBimbam (gsl_vector *y_prdt) 
+{
+	igzstream infile (file_geno.c_str(), igzstream::in);
+//	ifstream infile (file_geno.c_str(), ifstream::in);
+	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
+	
+	string line;
+	char *ch_ptr;
+	string rs;
+	
+	size_t n_miss, n_train_nomiss, c_phen;
+	double geno, x_mean, x_train_mean, effect_size;
+	
+	gsl_vector *x=gsl_vector_alloc (y_prdt->size);
+	gsl_vector *x_miss=gsl_vector_alloc (y_prdt->size);
+	
+	ns_test=0;
+
+	//start reading genotypes and analyze	
+	for (size_t t=0; t<ns_total; ++t) {
+		!safeGetline(infile, line).eof();
+		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
+		
+		ch_ptr=strtok ((char *)line.c_str(), " , \t");
+		rs=ch_ptr;
+		ch_ptr=strtok (NULL, " , \t");
+		ch_ptr=strtok (NULL, " , \t");		
+		
+		if (mapRS2est.count(rs)==0) {continue;} else {effect_size=mapRS2est[rs];}
+		
+		x_mean=0.0; c_phen=0; n_miss=0; x_train_mean=0; n_train_nomiss=0;
+		gsl_vector_set_zero(x_miss);
+
+		for (size_t i=0; i<indicator_idv.size(); ++i) {
+			ch_ptr=strtok (NULL, " , \t");
+			if (indicator_idv[i]==1) {
+				if (strcmp(ch_ptr, "NA")!=0) {
+					geno=atof(ch_ptr); 			
+					x_train_mean+=geno;
+					n_train_nomiss++;
+				}
+			} else {
+				if (strcmp(ch_ptr, "NA")==0) {
+					gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;
+				} else {
+					geno=atof(ch_ptr); 	
+					
+					gsl_vector_set(x, c_phen, geno); 
+					gsl_vector_set(x_miss, c_phen, 1.0); 
+					x_mean+=geno;
+				}
+				c_phen++;
+			}
+		}
+
+		if (x->size==n_miss) {cout<<"snp "<<rs<<" has missing genotype for all individuals and will be ignored."<<endl; continue;}
+
+		x_mean/=(double)(x->size-n_miss);
+		x_train_mean/=(double)(n_train_nomiss);
+		
+		
+		for (size_t i=0; i<x->size; ++i) {
+			geno=gsl_vector_get(x, i);
+			if (gsl_vector_get (x_miss, i)==0) {
+				gsl_vector_set(x, i, x_mean-x_train_mean);
+			} else {
+				gsl_vector_set(x, i, geno-x_train_mean);
+			}
+		}
+
+		gsl_vector_scale (x, effect_size);
+		gsl_vector_add (y_prdt, x);
+		
+		ns_test++;
+	}	
+	cout<<endl;
+	
+	gsl_vector_free (x);
+	gsl_vector_free (x_miss);
+	
+	infile.close();
+	infile.clear();
+	
+	return;
+}
+
+
+
+
+
+
+
+void PRDT::AnalyzePlink (gsl_vector *y_prdt) 
+{
+	string file_bed=file_bfile+".bed";
+	ifstream infile (file_bed.c_str(), ios::binary);
+	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
+	
+	char ch[1];
+	bitset<8> b;	
+	string rs;
+	
+	size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
+	double geno, x_mean, x_train_mean, effect_size;
+	
+	gsl_vector *x=gsl_vector_alloc (y_prdt->size);
+	
+	//calculate n_bit and c, the number of bit for each snp
+	if (indicator_idv.size()%4==0) {n_bit=indicator_idv.size()/4;}
+	else {n_bit=indicator_idv.size()/4+1; }
+	
+	//print the first three majic numbers
+	for (size_t i=0; i<3; ++i) {
+		infile.read(ch,1);
+		b=ch[0];
+	}	
+	
+	ns_test=0;
+	
+	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
+		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
+		//if (indicator_snp[t]==0) {continue;}
+		
+		rs=snpInfo[t].rs_number;
+		
+		if (mapRS2est.count(rs)==0) {continue;} else {effect_size=mapRS2est[rs];}
+		
+		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
+		
+		//read genotypes
+		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; x_train_mean=0; n_train_nomiss=0;
+		for (size_t i=0; i<n_bit; ++i) {
+			infile.read(ch,1);
+			b=ch[0];
+			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
+				if ((i==(n_bit-1)) && ci_total==indicator_idv.size() ) {break;}
+				if (indicator_idv[ci_total]==1) {
+					if (b[2*j]==0) {
+						if (b[2*j+1]==0) {x_train_mean+=2.0; n_train_nomiss++;}
+						else {x_train_mean+=1.0; n_train_nomiss++;}
+					}
+					else {
+						if (b[2*j+1]==1) {n_train_nomiss++;}                                  
+						else {}
+					}
+				} else {
+					if (b[2*j]==0) {
+						if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
+						else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
+					}
+					else {
+						if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
+						else {gsl_vector_set(x, ci_test, -9); n_miss++; }
+					}
+					ci_test++;
+				}
+				ci_total++;
+				
+			}
+		}
+		
+		if (x->size==n_miss) {cout<<"snp "<<rs<<" has missing genotype for all individuals and will be ignored."<<endl; continue;}
+		
+		x_mean/=(double)(x->size-n_miss);
+		x_train_mean/=(double)(n_train_nomiss);
+		
+		for (size_t i=0; i<x->size; ++i) {
+			geno=gsl_vector_get(x, i);
+			if (geno==-9) {
+				gsl_vector_set(x, i, x_mean-x_train_mean);
+			} else {
+				gsl_vector_set(x, i, geno-x_train_mean);
+			}
+		}
+		
+		gsl_vector_scale (x, effect_size);
+		gsl_vector_add (y_prdt, x);
+		
+		ns_test++;
+	}	
+	cout<<endl;
+	
+	gsl_vector_free (x);
+	
+	infile.close();
+	infile.clear();	
+	
+	return;
+}
+
+
+
+
+//predict missing phenotypes using ridge regression
+//Y_hat contains fixed effects
+void PRDT::MvnormPrdt (const gsl_matrix *Y_hat, const gsl_matrix *H, gsl_matrix *Y_full) 
+{	
+	gsl_vector *y_obs=gsl_vector_alloc (np_obs);
+	gsl_vector *y_miss=gsl_vector_alloc (np_miss);
+	gsl_matrix *H_oo=gsl_matrix_alloc (np_obs, np_obs);
+	gsl_matrix *H_mo=gsl_matrix_alloc (np_miss, np_obs);
+	gsl_vector *Hiy=gsl_vector_alloc (np_obs);
+	
+	size_t c_obs1=0, c_obs2=0, c_miss1=0, c_miss2=0;
+	
+	//obtain H_oo, H_mo
+	c_obs1=0; c_miss1=0; 
+	for (vector<int>::size_type i1=0; i1<indicator_pheno.size(); ++i1) {
+		if (indicator_cvt[i1]==0) {continue;}
+		for (vector<int>::size_type j1=0; j1<n_ph; ++j1) {
+			
+			c_obs2=0; c_miss2=0;
+			for (vector<int>::size_type i2=0; i2<indicator_pheno.size(); ++i2) {
+				if (indicator_cvt[i2]==0) {continue;}
+				for (vector<int>::size_type j2=0; j2<n_ph; j2++) {
+					
+					if (indicator_pheno[i2][j2]==1) {
+						if (indicator_pheno[i1][j1]==1) {
+							gsl_matrix_set (H_oo, c_obs1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) );
+						} else {
+							gsl_matrix_set (H_mo, c_miss1, c_obs2, gsl_matrix_get (H, c_obs1+c_miss1, c_obs2+c_miss2) );
+						}
+						c_obs2++;
+					} else {
+						c_miss2++;
+					}
+				}				
+			}
+			
+			if (indicator_pheno[i1][j1]==1) {
+				c_obs1++;
+			} else {
+				c_miss1++;
+			}
+		}
+		
+	}	
+	
+	//do LU decomposition of H_oo
+	int sig;
+	gsl_permutation * pmt=gsl_permutation_alloc (np_obs);
+	LUDecomp (H_oo, pmt, &sig);
+	
+//	if (mode_temp==0) {
+		//obtain y_obs=y_full-y_hat
+		//add the fixed effects part to y_miss: y_miss=y_hat
+		c_obs1=0; c_miss1=0;
+		for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+			if (indicator_cvt[i]==0) {continue;}
+			
+			for (vector<int>::size_type j=0; j<n_ph; ++j) {
+				if (indicator_pheno[i][j]==1) {
+					gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j)-gsl_matrix_get (Y_hat, i, j) );
+					c_obs1++;
+				} else {
+					gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, i, j) );
+					c_miss1++;
+				}
+			}
+		}	
+		
+		LUSolve (H_oo, pmt, y_obs, Hiy);
+		
+		gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
+		
+		//put back predicted y_miss to Y_full
+		c_miss1=0;
+		for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+			if (indicator_cvt[i]==0) {continue;}
+			
+			for (vector<int>::size_type j=0; j<n_ph; ++j) {
+				if (indicator_pheno[i][j]==0) {
+					gsl_matrix_set (Y_full, i, j, gsl_vector_get (y_miss, c_miss1) );
+					c_miss1++;
+				}
+			}
+		}
+/*
+	} else {
+		for (size_t k=0; k<mode_temp; k++) {
+			c_obs1=0; c_miss1=0;
+			for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+				if (indicator_cvt[i]==0) {continue;}
+				
+				for (vector<int>::size_type j=0; j<2; ++j) {
+					if (indicator_pheno[i][j]==1) {
+						gsl_vector_set (y_obs, c_obs1, gsl_matrix_get (Y_full, i, j+k*2)-gsl_matrix_get (Y_hat, i, j) );
+						c_obs1++;
+					} else {
+						gsl_vector_set (y_miss, c_miss1, gsl_matrix_get (Y_hat, i, j) );
+						c_miss1++;
+					}
+				}
+			}	
+			
+			LUSolve (H_oo, pmt, y_obs, Hiy);
+			
+			gsl_blas_dgemv (CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
+			
+			//put back predicted y_miss to Y_full
+			c_miss1=0;
+			for (vector<int>::size_type i=0; i<indicator_pheno.size(); ++i) {
+				if (indicator_cvt[i]==0) {continue;}
+				
+				for (vector<int>::size_type j=0; j<2; ++j) {
+					if (indicator_pheno[i][j]==0) {
+						gsl_matrix_set (Y_full, i, j+k*2, gsl_vector_get (y_miss, c_miss1) );
+						c_miss1++;
+					}
+				}
+			}
+		}
+	}
+*/
+	//free matrices
+	gsl_vector_free(y_obs);
+	gsl_vector_free(y_miss);
+	gsl_matrix_free(H_oo);
+	gsl_matrix_free(H_mo);
+	gsl_vector_free(Hiy);
+	
+	return;
+}
+
+