about summary refs log tree commit diff
path: root/lm.cpp
diff options
context:
space:
mode:
authorxiangzhou2014-09-22 11:12:46 -0400
committerxiangzhou2014-09-22 11:12:46 -0400
commit9aa9f76cb16a0e71fea19dd125761034aaca4fd4 (patch)
tree450932dba530a3e5cbd5b0c0736eaad5ad9720a7 /lm.cpp
parent7762722f264adc402ea3b0f21923b18f072253ba (diff)
downloadpangemma-9aa9f76cb16a0e71fea19dd125761034aaca4fd4.tar.gz
version 0.95alpha
Diffstat (limited to 'lm.cpp')
-rw-r--r--lm.cpp572
1 files changed, 0 insertions, 572 deletions
diff --git a/lm.cpp b/lm.cpp
deleted file mode 100644
index 7577d0a..0000000
--- a/lm.cpp
+++ /dev/null
@@ -1,572 +0,0 @@
-/*
- Genome-wide Efficient Mixed Model Association (GEMMA)
- Copyright (C) 2011  Xiang Zhou
- 
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- 
- You should have received a copy of the GNU General Public License
- along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-
-#include <iomanip>
-#include <cmath>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h> 
-#include <bitset>
-#include <cstring>
-
-#include "gsl/gsl_vector.h"
-#include "gsl/gsl_matrix.h"
-#include "gsl/gsl_linalg.h"
-#include "gsl/gsl_blas.h"
-
-
-#include "gsl/gsl_cdf.h"
-#include "gsl/gsl_roots.h"
-#include "gsl/gsl_min.h"
-#include "gsl/gsl_integration.h"
-
-#include "gzstream.h"
-#include "lapack.h"
-
-#ifdef FORCE_FLOAT
-#include "lm_float.h"
-#else
-#include "lm.h"
-#endif
-
-
-using namespace std;
-
-
-
-
-
-void LM::CopyFromParam (PARAM &cPar) 
-{
-	a_mode=cPar.a_mode;
-	d_pace=cPar.d_pace;
-	
-	file_bfile=cPar.file_bfile;
-	file_geno=cPar.file_geno;
-	file_out=cPar.file_out;
-	path_out=cPar.path_out;
-	file_gene=cPar.file_gene;
-	
-	time_opt=0.0;
-	
-	ni_total=cPar.ni_total;
-	ns_total=cPar.ns_total;
-	ni_test=cPar.ni_test;
-	ns_test=cPar.ns_test;
-	n_cvt=cPar.n_cvt;
-	
-	ng_total=cPar.ng_total;
-	ng_test=0;
-	
-	indicator_idv=cPar.indicator_idv;	
-	indicator_snp=cPar.indicator_snp;	
-	snpInfo=cPar.snpInfo;
-	
-	return;
-}
-
-
-void LM::CopyToParam (PARAM &cPar) 
-{
-	cPar.time_opt=time_opt;	
-	
-	cPar.ng_test=ng_test;
-	
-	return;
-}
-
-
-
-void LM::WriteFiles () 
-{
-	string file_str;
-	file_str=path_out+"/"+file_out;
-	file_str+=".assoc.txt";
-
-	ofstream outfile (file_str.c_str(), ofstream::out);
-	if (!outfile) {cout<<"error writing file: "<<file_str.c_str()<<endl; return;}
-
-	if (!file_gene.empty()) {
-		outfile<<"geneID"<<"\t";
-		
-		if (a_mode==51) {
-			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl;
-		} else if (a_mode==52) {
-			outfile<<"p_lrt"<<endl;
-		} else if (a_mode==53) {
-			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl;
-		} else if (a_mode==54) {
-			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
-		} else {}
-				
-		for (vector<SUMSTAT>::size_type t=0; t<sumStat.size(); ++t) {	
-			outfile<<snpInfo[t].rs_number<<"\t";
-			
-			if (a_mode==51) {
-				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl;
-			} else if (a_mode==52) {
-				outfile<<scientific<<setprecision(6)<<"\t"<<sumStat[t].p_lrt<<endl;
-			} else if (a_mode==53) {
-				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_score<<endl;
-			} else if (a_mode==54) {
-				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl;
-			} else {}
-		}	
-	}  else {
-		outfile<<"chr"<<"\t"<<"rs"<<"\t"<<"ps"<<"\t"<<"n_miss"<<"\t"<<"allele1"<<"\t"<<"allele0"<<"\t"<<"af"<<"\t";
-		
-		if (a_mode==51) {
-			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<endl;
-		} else if (a_mode==52) {
-			outfile<<"p_lrt"<<endl;
-		} else if (a_mode==53) {
-			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_score"<<endl;
-		} else if (a_mode==54) {
-			outfile<<"beta"<<"\t"<<"se"<<"\t"<<"p_wald"<<"\t"<<"p_lrt"<<"\t"<<"p_score"<<endl;
-		} else {}
-		
-		size_t t=0;
-		for (size_t i=0; i<snpInfo.size(); ++i) {
-			if (indicator_snp[i]==0) {continue;}
-			
-			outfile<<snpInfo[i].chr<<"\t"<<snpInfo[i].rs_number<<"\t"<<snpInfo[i].base_position<<"\t"<<snpInfo[i].n_miss<<"\t"<<snpInfo[i].a_minor<<"\t"<<snpInfo[i].a_major<<"\t"<<fixed<<setprecision(3)<<snpInfo[i].maf<<"\t";
-			
-			if (a_mode==51) {
-				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<endl;
-			} else if (a_mode==52) {
-				outfile<<scientific<<setprecision(6)<<sumStat[t].p_lrt<<endl;
-			} else if (a_mode==53) {
-				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_score<<endl;
-			} else if (a_mode==54) {
-				outfile<<scientific<<setprecision(6)<<sumStat[t].beta<<"\t"<<sumStat[t].se<<"\t"<<sumStat[t].p_wald <<"\t"<<sumStat[t].p_lrt<<"\t"<<sumStat[t].p_score<<endl;
-			} else {}
-			t++;
-		}
-	}
-	
-		
-	outfile.close();
-	outfile.clear();
-	return;
-}
-
-
-
-
-
-void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *Wtx, const gsl_vector *y, const gsl_vector *x,  double &xPwy, double &xPwx)
-{
-	size_t c_size=Wty->size;
-	double d;
-	
-	gsl_vector *WtWiWtx=gsl_vector_alloc (c_size);
-	
-	gsl_blas_ddot (x, x, &xPwx);
-	gsl_blas_ddot (x, y, &xPwy);
-	gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);	
-	
-	gsl_blas_ddot (WtWiWtx, Wtx, &d);	
-	xPwx-=d;
-	
-	gsl_blas_ddot (WtWiWtx, Wty, &d);	
-	xPwy-=d;
-	
-	gsl_vector_free (WtWiWtx);
-	
-	return;
-}
-
-
-void CalcvPv(const gsl_matrix *WtWi, const gsl_vector *Wty, const gsl_vector *y, double &yPwy)
-{
-	size_t c_size=Wty->size;
-	double d;
-	
-	gsl_vector *WtWiWty=gsl_vector_alloc (c_size);
-	
-	gsl_blas_ddot (y, y, &yPwy);
-	gsl_blas_dgemv (CblasNoTrans, 1.0, WtWi, Wty, 0.0, WtWiWty);	
-	
-	gsl_blas_ddot (WtWiWty, Wty, &d);	
-	yPwy-=d;
-	
-	gsl_vector_free (WtWiWty);
-	
-	return;
-}
-
-
-
-//calculate p values and beta/se in a linear model
-void LmCalcP (const size_t test_mode, const double yPwy, const double xPwy, const double xPwx, const double df, const size_t n_size, double &beta, double &se, double &p_wald, double &p_lrt, double &p_score)
-{
-	double yPxy=yPwy-xPwy*xPwy/xPwx;
-	double se_wald, se_score;
-	
-	beta=xPwy/xPwx;
-	se_wald=sqrt(yPxy/(df*xPwx) );
-	se_score=sqrt(yPwy/((double)n_size*xPwx) );
-	
-	p_wald=gsl_cdf_fdist_Q (beta*beta/(se_wald*se_wald), 1.0, df);
-	p_score=gsl_cdf_fdist_Q (beta*beta/(se_score*se_score), 1.0, df);
-	p_lrt=gsl_cdf_chisq_Q ((double)n_size*(log(yPwy)-log(yPxy)), 1);
-	
-	if (test_mode==3) {se=se_score;} else {se=se_wald;}
-	
-	return;
-}
-
-
-
-
-void LM::AnalyzeGene (const gsl_matrix *W, const gsl_vector *x) 
-{
-	ifstream infile (file_gene.c_str(), ifstream::in);
-	if (!infile) {cout<<"error reading gene expression file:"<<file_gene<<endl; return;}
-	
-	clock_t time_start=clock();
-	
-	string line;
-	char *ch_ptr;
-	
-	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
-	int c_phen;
-	string rs; //gene id
-	double d;
-	
-	//calculate some basic quantities
-	double yPwy, xPwy, xPwx;
-	double df=(double)W->size1-(double)W->size2-1.0;
-
-	gsl_vector *y=gsl_vector_alloc (W->size1);
-
-	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
-	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);	
-	gsl_vector *Wty=gsl_vector_alloc (W->size2);
-	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
-	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
-
-	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
-	int sig;
-	LUDecomp (WtW, pmt, &sig);
-	LUInvert (WtW, pmt, WtWi);
-
-	gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx);
-	CalcvPv(WtWi, Wtx, x, xPwx);
-		
-	//header
-	getline(infile, line);
-	
-	for (size_t t=0; t<ng_total; t++) {
-		getline(infile, line);
-		if (t%d_pace==0 || t==ng_total-1) {ProgressBar ("Performing Analysis ", t, ng_total-1);}
-		ch_ptr=strtok ((char *)line.c_str(), " , \t");
-		rs=ch_ptr;
-		
-		c_phen=0; 
-		for (size_t i=0; i<indicator_idv.size(); ++i) {
-			ch_ptr=strtok (NULL, " , \t");
-			if (indicator_idv[i]==0) {continue;}
-			
-			d=atof(ch_ptr); 			
-			gsl_vector_set(y, c_phen, d);
-			
-			c_phen++;
-		}
-				
-		//calculate statistics		
-		time_start=clock();	
-	
-		gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty);
-		CalcvPv(WtWi, Wtx, Wty, x, y, xPwy, yPwy);
-		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);	
-	
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
-		
-		//store summary data
-		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
-		sumStat.push_back(SNPs);
-	}
-	cout<<endl;
-	
-	gsl_vector_free(y);
-
-	gsl_matrix_free(WtW);
-	gsl_matrix_free(WtWi);
-	gsl_vector_free(Wty);
-	gsl_vector_free(Wtx);
-	gsl_permutation_free(pmt);
-	
-	infile.close();
-	infile.clear();
-	
-	return;
-}
-
-
-
-
-void LM::AnalyzeBimbam (const gsl_matrix *W, const gsl_vector *y)
-{
-	igzstream infile (file_geno.c_str(), igzstream::in);
-	//	ifstream infile (file_geno.c_str(), ifstream::in);
-	if (!infile) {cout<<"error reading genotype file:"<<file_geno<<endl; return;}
-	
-	clock_t time_start=clock();
-	
-	string line;
-	char *ch_ptr;
-	
-	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
-	int n_miss, c_phen;
-	double geno, x_mean;
-	
-	//calculate some basic quantities
-	double yPwy, xPwy, xPwx;
-	double df=(double)W->size1-(double)W->size2-1.0;
-
-	gsl_vector *x=gsl_vector_alloc (W->size1);
-	gsl_vector *x_miss=gsl_vector_alloc (W->size1);
-
-	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
-	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);		
-	gsl_vector *Wty=gsl_vector_alloc (W->size2);
-	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
-	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
-
-	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
-	int sig;
-	LUDecomp (WtW, pmt, &sig);
-	LUInvert (WtW, pmt, WtWi);
-
-	gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
-	CalcvPv(WtWi, Wty, y, yPwy);
-	
-	//start reading genotypes and analyze	
-	for (size_t t=0; t<indicator_snp.size(); ++t) {
-		//if (t>1) {break;}
-		getline(infile, line);
-		if (t%d_pace==0 || t==(ns_total-1)) {ProgressBar ("Reading SNPs  ", t, ns_total-1);}
-		if (indicator_snp[t]==0) {continue;}
-		
-		ch_ptr=strtok ((char *)line.c_str(), " , \t");
-		ch_ptr=strtok (NULL, " , \t");
-		ch_ptr=strtok (NULL, " , \t");
-		
-		x_mean=0.0; c_phen=0; n_miss=0;
-		gsl_vector_set_zero(x_miss);
-		for (size_t i=0; i<ni_total; ++i) {
-			ch_ptr=strtok (NULL, " , \t");
-			if (indicator_idv[i]==0) {continue;}
-			
-			if (strcmp(ch_ptr, "NA")==0) {gsl_vector_set(x_miss, c_phen, 0.0); n_miss++;}
-			else {
-				geno=atof(ch_ptr); 				
-				
-				gsl_vector_set(x, c_phen, geno); 
-				gsl_vector_set(x_miss, c_phen, 1.0); 
-				x_mean+=geno;
-			}
-			c_phen++;
-		}	
-		
-		x_mean/=(double)(ni_test-n_miss);
-		
-		for (size_t i=0; i<ni_test; ++i) {
-			if (gsl_vector_get (x_miss, i)==0) {gsl_vector_set(x, i, x_mean);}
-			geno=gsl_vector_get(x, i);
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
-		}		
-		
-		//calculate statistics		
-		time_start=clock();		
-
-		gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx);		
-		CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);
-		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);
-		
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
-		
-		//store summary data
-		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
-		sumStat.push_back(SNPs);
-	}	
-	cout<<endl;
-
-	gsl_vector_free(x);
-	gsl_vector_free(x_miss);
-
-	gsl_matrix_free(WtW);
-	gsl_matrix_free(WtWi);
-	gsl_vector_free(Wty);
-	gsl_vector_free(Wtx);
-	gsl_permutation_free(pmt);
-	
-	infile.close();
-	infile.clear();
-	
-	return;
-}
-
-
-
-
-
-
-
-void LM::AnalyzePlink (const gsl_matrix *W, const gsl_vector *y) 
-{
-	string file_bed=file_bfile+".bed";
-	ifstream infile (file_bed.c_str(), ios::binary);
-	if (!infile) {cout<<"error reading bed file:"<<file_bed<<endl; return;}
-	
-	clock_t time_start=clock();
-	
-	char ch[1];
-	bitset<8> b;	
-	
-	double beta=0, se=0, p_wald=0, p_lrt=0, p_score=0;
-	int n_bit, n_miss, ci_total, ci_test;
-	double geno, x_mean;
-		
-	//calculate some basic quantities
-	double yPwy, xPwy, xPwx;
-	double df=(double)W->size1-(double)W->size2-1.0;
-
-	gsl_vector *x=gsl_vector_alloc (W->size1);
-
-	gsl_matrix *WtW=gsl_matrix_alloc (W->size2, W->size2);
-	gsl_matrix *WtWi=gsl_matrix_alloc (W->size2, W->size2);	
-	gsl_vector *Wty=gsl_vector_alloc (W->size2);
-	gsl_vector *Wtx=gsl_vector_alloc (W->size2);
-	gsl_permutation * pmt=gsl_permutation_alloc (W->size2);
-
-	gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW);
-	int sig;
-	LUDecomp (WtW, pmt, &sig);
-	LUInvert (WtW, pmt, WtWi);
-
-	gsl_blas_dgemv (CblasTrans, 1.0, W, y, 0.0, Wty);
-	CalcvPv(WtWi, Wty, y, yPwy);
-		
-	//calculate n_bit and c, the number of bit for each snp
-	if (ni_total%4==0) {n_bit=ni_total/4;}
-	else {n_bit=ni_total/4+1; }
-	
-	//print the first three majic numbers
-	for (int i=0; i<3; ++i) {
-		infile.read(ch,1);
-		b=ch[0];
-	}
-	
-	
-	for (vector<SNPINFO>::size_type t=0; t<snpInfo.size(); ++t) {
-		if (t%d_pace==0 || t==snpInfo.size()-1) {ProgressBar ("Reading SNPs  ", t, snpInfo.size()-1);}
-		if (indicator_snp[t]==0) {continue;}
-		
-		infile.seekg(t*n_bit+3);		//n_bit, and 3 is the number of magic numbers
-		
-		//read genotypes
-		x_mean=0.0;	n_miss=0; ci_total=0; ci_test=0; 
-		for (int i=0; i<n_bit; ++i) {
-			infile.read(ch,1);
-			b=ch[0];
-			for (size_t j=0; j<4; ++j) {                //minor allele homozygous: 2.0; major: 0.0;
-				if ((i==(n_bit-1)) && ci_total==(int)ni_total) {break;}
-				if (indicator_idv[ci_total]==0) {ci_total++; continue;}
-				
-				if (b[2*j]==0) {
-					if (b[2*j+1]==0) {gsl_vector_set(x, ci_test, 2); x_mean+=2.0; }
-					else {gsl_vector_set(x, ci_test, 1); x_mean+=1.0; }
-				}
-				else {
-					if (b[2*j+1]==1) {gsl_vector_set(x, ci_test, 0); }                                  
-					else {gsl_vector_set(x, ci_test, -9); n_miss++; }
-				}
-				
-				ci_total++;
-				ci_test++;
-			}
-		}
-		
-		x_mean/=(double)(ni_test-n_miss);
-		
-		for (size_t i=0; i<ni_test; ++i) {			
-			geno=gsl_vector_get(x,i);
-			if (geno==-9) {gsl_vector_set(x, i, x_mean); geno=x_mean;}
-			if (x_mean>1) {
-				gsl_vector_set(x, i, 2-geno);
-			}
-		}
-		
-		//calculate statistics		
-		time_start=clock();	
-		
-		gsl_blas_dgemv (CblasTrans, 1.0, W, x, 0.0, Wtx);
-		CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx);		
-		LmCalcP (a_mode-50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, p_lrt, p_score);    
-
-		time_opt+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0);
-		
-		//store summary data
-		SUMSTAT SNPs={beta, se, 0.0, 0.0, p_wald, p_lrt, p_score};
-		sumStat.push_back(SNPs);
-	}	
-	cout<<endl;
-	
-	gsl_vector_free(x);
-
-	gsl_matrix_free(WtW);
-	gsl_matrix_free(WtWi);	
-	gsl_vector_free(Wty);
-	gsl_vector_free(Wtx);
-	gsl_permutation_free(pmt);
-	
-	infile.close();
-	infile.clear();	
-	
-	return;
-}
-
-
-
-//make sure that both y and X are centered already
-void MatrixCalcLmLR (const gsl_matrix *X, const gsl_vector *y, vector<pair<size_t, double> > &pos_loglr) 
-{
-	double yty, xty, xtx, log_lr;
-	gsl_blas_ddot(y, y, &yty);
-
-	for (size_t i=0; i<X->size2; ++i) {
-	  gsl_vector_const_view X_col=gsl_matrix_const_column (X, i);
-	  gsl_blas_ddot(&X_col.vector, &X_col.vector, &xtx);
-	  gsl_blas_ddot(&X_col.vector, y, &xty);
-
-	  log_lr=0.5*(double)y->size*(log(yty)-log(yty-xty*xty/xtx));
-	  pos_loglr.push_back(make_pair(i,log_lr) );
-	}
-	
-	return;
-}