From 0dd4e05fc8babc1517de1d7981a99ad0a5241a5e Mon Sep 17 00:00:00 2001 From: Peter Carbonetto Date: Thu, 4 May 2017 14:43:12 -0500 Subject: Added new files shared by Xiang via email on May 4, 2017. --- src/bslmmdap.cpp | 1015 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/bslmmdap.h | 101 ++++++ src/ldr.cpp | 285 +++++++++++++++ src/ldr.h | 77 +++++ src/logistic.cpp | 783 +++++++++++++++++++++++++++++++++++++++++ src/logistic.h | 70 ++++ src/varcov.cpp | 482 ++++++++++++++++++++++++++ src/varcov.h | 72 ++++ 8 files changed, 2885 insertions(+) create mode 100644 src/bslmmdap.cpp create mode 100644 src/bslmmdap.h create mode 100644 src/ldr.cpp create mode 100644 src/ldr.h create mode 100644 src/logistic.cpp create mode 100644 src/logistic.h create mode 100644 src/varcov.cpp create mode 100644 src/varcov.h diff --git a/src/bslmmdap.cpp b/src/bslmmdap.cpp new file mode 100644 index 0000000..0bf0e7b --- /dev/null +++ b/src/bslmmdap.cpp @@ -0,0 +1,1015 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_randist.h" +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" + + + +#include "logistic.h" +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "bslmmdap_float.h" +#include "lmm_float.h" //for class FUNC_PARAM and MatrixCalcLR +#include "lm_float.h" +#include "mathfunc_float.h" //for function CenterVector +#else +#include "param.h" +#include "bslmmdap.h" +#include "lmm.h" +#include "lm.h" +#include "mathfunc.h" +#endif + +using namespace std; + + + + +void BSLMMDAP::CopyFromParam (PARAM &cPar) +{ + file_out=cPar.file_out; + path_out=cPar.path_out; + + time_UtZ=0.0; + time_Omega=0.0; + + h_min=cPar.h_min; + h_max=cPar.h_max; + h_ngrid=cPar.h_ngrid; + rho_min=cPar.rho_min; + rho_max=cPar.rho_max; + rho_ngrid=cPar.rho_ngrid; + + if (h_min<=0) {h_min=0.01;} + if (h_max>=1) {h_max=0.99;} + if (rho_min<=0) {rho_min=0.01;} + if (rho_max>=1) {rho_max=0.99;} + + trace_G=cPar.trace_G; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void BSLMMDAP::CopyToParam (PARAM &cPar) +{ + cPar.time_UtZ=time_UtZ; + cPar.time_Omega=time_Omega; + + return; +} + + + +//read hyp file +void ReadFile_hyb (const string &file_hyp, vector &vec_sa2, vector &vec_sb2, vector &vec_wab) +{ + vec_sa2.clear(); vec_sb2.clear(); vec_wab.clear(); + + igzstream infile (file_hyp.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open hyp file: "< &vec_rs, vector > > &BF) +{ + BF.clear(); vec_rs.clear(); + + igzstream infile (file_bf.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open bf file: "< vec_bf; + vector > mat_bf; + char *ch_ptr; + + size_t bf_size, flag_block; + + getline(infile, line); + + size_t t=0; + while (!safeGetline(infile, line).eof()) { + flag_block=0; + + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + rs=ch_ptr; + vec_rs.push_back(rs); + + ch_ptr=strtok (NULL, " , \t"); + if (t==0) { + block=ch_ptr; + } else { + if (strcmp(ch_ptr, block.c_str() )!=0) { + flag_block=1; + block=ch_ptr; + } + } + + ch_ptr=strtok (NULL, " , \t"); + while (ch_ptr!=NULL) { + vec_bf.push_back(atof(ch_ptr)); + ch_ptr=strtok (NULL, " , \t"); + } + + if (t==0) { + bf_size=vec_bf.size(); + } else { + if (bf_size!=vec_bf.size()) {cout<<"error! unequal row size in bf file."< &vec_rs, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel, size_t &kc, size_t &kd) +{ + igzstream infile (file_cat.c_str(), igzstream::in); + if (!infile) {cout<<"error! fail to open category file: "< > mapRS2catc; + map > mapRS2catd; + vector catc; + vector catd; + + //read the following lines to record mapRS2cat + while (!safeGetline(infile, line).eof()) { + ch_ptr=strtok ((char *)line.c_str(), " , \t"); + + if (header.rs_col==0) { + rs=chr+":"+pos; + } + + catc.clear(); catd.clear(); + + for (size_t i=0; i0) {mapRS2catc[rs]=catc;} + if (mapRS2catd.count(rs)==0 && kd>0) {mapRS2catd[rs]=catd;} + } + + //load into Ad and Ac + if (kc>0) { + Ac=gsl_matrix_alloc(vec_rs.size(), kc); + for (size_t i=0; i0) { + Ad=gsl_matrix_int_alloc(vec_rs.size(), kd); + + for (size_t i=0; i rcd; + int val; + for (size_t j=0; jsize1; i++) { + val = gsl_matrix_int_get(Ad, i, j); + rcd[val] = 1; + } + gsl_vector_int_set (dlevel, j, rcd.size()); + } + } + + infile.clear(); + infile.close(); + + return; +} + + + + + + + + +void BSLMMDAP::WriteResult (const gsl_matrix *Hyper, const gsl_matrix *BF) +{ + string file_bf, file_hyp; + file_bf=path_out+"/"+file_out; + file_bf+=".bf.txt"; + file_hyp=path_out+"/"+file_out; + file_hyp+=".hyp.txt"; + + ofstream outfile_bf, outfile_hyp; + + outfile_bf.open (file_bf.c_str(), ofstream::out); + outfile_hyp.open (file_hyp.c_str(), ofstream::out); + + if (!outfile_bf) {cout<<"error writing file: "<size1; i++) { + for (size_t j=0; jsize2; j++) { + outfile_hyp<size2; i++) { + outfile_bf<<"\t"<<"BF"<size2; j++) { + outfile_bf<<"\t"< &vec_rs, const gsl_matrix *Hyper, const gsl_vector *pip, const gsl_vector *coef) +{ + string file_gamma, file_hyp, file_coef; + file_gamma=path_out+"/"+file_out; + file_gamma+=".gamma.txt"; + file_hyp=path_out+"/"+file_out; + file_hyp+=".hyp.txt"; + file_coef=path_out+"/"+file_out; + file_coef+=".coef.txt"; + + ofstream outfile_gamma, outfile_hyp, outfile_coef; + + outfile_gamma.open (file_gamma.c_str(), ofstream::out); + outfile_hyp.open (file_hyp.c_str(), ofstream::out); + outfile_coef.open (file_coef.c_str(), ofstream::out); + + if (!outfile_gamma) {cout<<"error writing file: "<size1; i++) { + for (size_t j=0; jsize2; j++) { + outfile_hyp<size; i++) { + outfile_coef< &rank) +{ + size_t pos; + for (size_t i=0; isize); + + double logm=0.0; + double d, uy, Hi_yy=0, logdet_H=0.0; + for (size_t i=0; isize1, UtXgamma->size2); + gsl_matrix *Omega=gsl_matrix_alloc (UtXgamma->size2, UtXgamma->size2); + gsl_vector *XtHiy=gsl_vector_alloc (UtXgamma->size2); + gsl_vector *beta_hat=gsl_vector_alloc (UtXgamma->size2); + gsl_vector *weight_Hi=gsl_vector_alloc (UtXgamma->size1); + + gsl_matrix_memcpy (UtXgamma_eval, UtXgamma); + + logdet_H=0.0; P_yy=0.0; + for (size_t i=0; i vec_sa2, vec_sb2, logm_null; + + gsl_matrix *BF=gsl_matrix_alloc(ns_test, n_grid); + gsl_matrix *Xgamma=gsl_matrix_alloc(ni_test, 1); + gsl_matrix *Hyper=gsl_matrix_alloc(n_grid, 5); + + //compute tau by using yty + gsl_blas_ddot (Uty, Uty, &tau); + tau=(double)ni_test/tau; + + //set up grid values for sigma_a2 and sigma_b2 based on an approximately even grid for h and rho, and a fixed number of causals + size_t ij=0; + for (size_t i=0; i sum_pip; + map sum; + + int levels = gsl_vector_int_get(dlevel,0); + + for(int i=0;isize1;i++){ + int cat = gsl_matrix_int_get(Xd,i,0); + sum_pip[cat] += gsl_vector_get(pip_vec,i); + sum[cat] += 1; + } + + for(int i=0;isize1;i++){ + int cat = gsl_matrix_int_get(Xd,i,0); + gsl_vector_set(prior_vec,i,sum_pip[cat]/sum[cat]); + } + + //double baseline=0; + for(int i=0;i &vec_rs, const vector &vec_sa2, const vector &vec_sb2, const vector &wab, const vector > > &BF, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel) { + clock_t time_start; + + //set up BF + double h, rho, sigma_a2, sigma_b2, d, s, logm, logm_save; + size_t t1, t2; + size_t n_grid=wab.size(), ns_test=vec_rs.size(); + + gsl_vector *prior_vec=gsl_vector_alloc(ns_test); + gsl_matrix *Hyper=gsl_matrix_alloc(n_grid, 5); + gsl_vector *pip=gsl_vector_alloc(ns_test); + gsl_vector *coef=gsl_vector_alloc(kc+kd+1); + + //perform the EM algorithm + vector vec_wab, vec_wab_new; + + //initial values + for (size_t t=0; t1e-3) { + //update E_gamma + t1=0, t2=0; + for (size_t b=0; bsize; t++) { + s+=gsl_vector_get(pip, t); + } + s=s/(double)pip->size; + for (size_t t=0; tsize; t++) { + gsl_vector_set(prior_vec, t, s); + } + + gsl_vector_set (coef, 0, log(s/(1-s))); + } else if(kc==0 && kd!=0){//only discrete annotations + if(kd == 1){ + single_ct_regression(Ad, dlevel, pip, coef, prior_vec); + }else{ + logistic_cat_fit(coef, Ad, dlevel, pip, 0, 0); + logistic_cat_pred(coef, Ad, dlevel, prior_vec); + } + } else if (kc!=0 && kd==0) {//only continuous annotations + logistic_cont_fit(coef, Ac, pip, 0, 0); + logistic_cont_pred(coef, Ac, prior_vec); + } else if (kc!=0 && kd!=0) {//both continuous and categorical annotations + logistic_mixed_fit(coef, Ad, dlevel, Ac, pip, 0, 0); + logistic_mixed_pred(coef, Ad, dlevel, Ac, prior_vec); + } + + //compute marginal likelihood + logm=0; + + t1=0; + for (size_t b=0; b0) { + dif=logm-logm_save; + } + logm_save=logm; + it++; + + cout<<"iteration = "<10) {break;} + + if (a_mode==13) { + SampleZ (y, z_hat, z); + mean_z=CenterVector (z); + + time_start=clock(); + gsl_blas_dgemv (CblasTrans, 1.0, U, z, 0.0, Utz); + time_UtZ+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + //First proposal + if (cHyp_old.n_gamma==0 || cHyp_old.rho==0) { + logPost_old=CalcPosterior(Utz, K_eval, Utu_old, alpha_old, cHyp_old); + beta_old.clear(); + for (size_t i=0; isize; ++i) { + beta_old.push_back(gsl_vector_get(beta, i)); + } + gsl_matrix_free (UtXgamma); + gsl_vector_free (beta); + } + } + + + delete [] p_gamma; + beta_g.clear(); + + return; +} + +*/ + + + + + + +/* +//below fits MCMC for rho=1 +void BSLMM::CalcXtX (const gsl_matrix *X, const gsl_vector *y, const size_t s_size, gsl_matrix *XtX, gsl_vector *Xty) +{ + time_t time_start=clock(); + gsl_matrix_const_view X_sub=gsl_matrix_const_submatrix(X, 0, 0, X->size1, s_size); + gsl_matrix_view XtX_sub=gsl_matrix_submatrix(XtX, 0, 0, s_size, s_size); + gsl_vector_view Xty_sub=gsl_vector_subvector(Xty, 0, s_size); + +#ifdef WITH_LAPACK + lapack_dgemm ((char *)"T", (char *)"N", 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#else + gsl_blas_dgemm (CblasTrans, CblasNoTrans, 1.0, &X_sub.matrix, &X_sub.matrix, 0.0, &XtX_sub.matrix); +#endif + gsl_blas_dgemv(CblasTrans, 1.0, &X_sub.matrix, y, 0.0, &Xty_sub.vector); + + time_Omega+=(clock()-time_start)/(double(CLOCKS_PER_SEC)*60.0); + + return; +} + + + +double BSLMM::CalcPosterior (const double yty, class HYPBSLMM &cHyp) +{ + double logpost=0.0; + + //for quantitative traits, calculate pve and pge + //pve and pge for case/control data are calculted in CalcCC_PVEnZ + if (a_mode==11) { + cHyp.pve=0.0; + cHyp.pge=1.0; + } + + //calculate likelihood + if (a_mode==11) {logpost-=0.5*(double)ni_test*log(yty);} + else {logpost-=0.5*yty;} + + logpost+=((double)cHyp.n_gamma-1.0)*cHyp.logp+((double)ns_test-(double)cHyp.n_gamma)*log(1-exp(cHyp.logp)); + + return logpost; +} + + +double BSLMM::CalcPosterior (const gsl_matrix *Xgamma, const gsl_matrix *XtX, const gsl_vector *Xty, const double yty, const size_t s_size, gsl_vector *Xb, gsl_vector *beta, class HYPBSLMM &cHyp) +{ + double sigma_a2=cHyp.h/( (1-cHyp.h)*exp(cHyp.logp)*(double)ns_test); + double logpost=0.0; + double d, P_yy=yty, logdet_O=0.0; + + gsl_matrix_const_view Xgamma_sub=gsl_matrix_const_submatrix (Xgamma, 0, 0, Xgamma->size1, s_size); + gsl_matrix_const_view XtX_sub=gsl_matrix_const_submatrix (XtX, 0, 0, s_size, s_size); + gsl_vector_const_view Xty_sub=gsl_vector_const_subvector (Xty, 0, s_size); + + gsl_matrix *Omega=gsl_matrix_alloc (s_size, s_size); + gsl_matrix *M_temp=gsl_matrix_alloc (s_size, s_size); + gsl_vector *beta_hat=gsl_vector_alloc (s_size); + gsl_vector *Xty_temp=gsl_vector_alloc (s_size); + + gsl_vector_memcpy (Xty_temp, &Xty_sub.vector); + + //calculate Omega + gsl_matrix_memcpy (Omega, &XtX_sub.matrix); + gsl_matrix_scale (Omega, sigma_a2); + gsl_matrix_set_identity (M_temp); + gsl_matrix_add (Omega, M_temp); + + //calculate beta_hat + logdet_O=CholeskySolve(Omega, Xty_temp, beta_hat); + gsl_vector_scale (beta_hat, sigma_a2); + + gsl_blas_ddot (Xty_temp, beta_hat, &d); + P_yy-=d; + + //sample tau + double tau=1.0; + if (a_mode==11) {tau =gsl_ran_gamma (gsl_r, (double)ni_test/2.0, 2.0/P_yy); } + + //sample beta + for (size_t i=0; i. + */ + + +#ifndef __BSLMMDAP_H__ +#define __BSLMMDAP_H__ + +#include +#include +#include +#include + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + + +using namespace std; + + + + + + +class BSLMMDAP { + +public: + // IO related parameters + int a_mode; + size_t d_pace; + + string file_bfile; + string file_geno; + string file_out; + string path_out; + + // LMM related parameters + double pve_null; + double pheno_mean; + + // BSLMM MCMC related parameters + long int randseed; + double trace_G; + + HYPBSLMM cHyp_initial; + + // Summary statistics + size_t ni_total, ns_total; //number of total individuals and snps + size_t ni_test, ns_test; //number of individuals and snps used for analysis + + double h_min, h_max, rho_min, rho_max; + size_t h_ngrid, rho_ngrid; + + double time_UtZ; + double time_Omega; //time spent on optimization iterations + double time_Proposal; //time spent on constructing the proposal distribution for gamma (i.e. lmm or lm analysis) + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Main Functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + + void WriteResult (const gsl_matrix *Hyper, const gsl_matrix *BF); + void WriteResult (const vector &vec_rs, const gsl_matrix *Hyper, const gsl_vector *pip, const gsl_vector *coef); + double CalcMarginal (const gsl_vector *Uty, const gsl_vector *K_eval, const double sigma_b2, const double tau); + double CalcMarginal (const gsl_matrix *UtXgamma, const gsl_vector *Uty, const gsl_vector *K_eval, const double sigma_a2, const double sigma_b2, const double tau); + double CalcPrior (class HYPBSLMM &cHyp); + + void DAP_CalcBF (const gsl_matrix *U, const gsl_matrix *UtX, const gsl_vector *Uty, const gsl_vector *K_eval, const gsl_vector *y); + void DAP_EstimateHyper (const size_t kc, const size_t kd, const vector &vec_rs, const vector &vec_sa2, const vector &vec_sb2, const vector &wab, const vector > > &BF, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel); + +}; + +void ReadFile_hyb (const string &file_hyp, vector &vec_sa2, vector &vec_sb2, vector &vec_wab); +void ReadFile_bf (const string &file_bf, vector &vec_rs, vector > > &BF); +void ReadFile_cat (const string &file_cat, const vector &vec_rs, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel, size_t &kc, size_t &kd); + + +#endif + + diff --git a/src/ldr.cpp b/src/ldr.cpp new file mode 100644 index 0000000..e28490a --- /dev/null +++ b/src/ldr.cpp @@ -0,0 +1,285 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_eigen.h" +#include "gsl/gsl_randist.h" +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_roots.h" +#include "Eigen/Dense" + + + +#include "lapack.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "ldr_float.h" +#include "lm_float.h" +#include "mathfunc_float.h" //for function CenterVector +#else +#include "param.h" +#include "ldr.h" +#include "lm.h" +#include "mathfunc.h" +#endif + +using namespace std; +using namespace Eigen; + + + +void LDR::CopyFromParam (PARAM &cPar) +{ + a_mode=cPar.a_mode; + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + + ni_total=cPar.ni_total; + ns_total=cPar.ns_total; + ni_test=cPar.ni_test; + ns_test=cPar.ns_test; + n_cvt=cPar.n_cvt; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void LDR::CopyToParam (PARAM &cPar) +{ + //cPar.pheno_mean=pheno_mean; + //cPar.randseed=randseed; + + return; +} + + +/* +void BSLMM::WriteBV (const gsl_vector *bv) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".bv.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "< > &beta_g, const gsl_vector *alpha, const size_t w) +{ + string file_str; + file_str=path_out+"/"+file_out; + file_str+=".param.txt"; + + ofstream outfile (file_str.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "< > &Xt, const gsl_matrix *W_gsl, const gsl_vector *y_gsl) +{ + //save gsl_vector and gsl_matrix into eigen library formats + MatrixXd W(W_gsl->size1, W_gsl->size2); + VectorXd y(y_gsl->size); + VectorXd x_col(y_gsl->size); + + double d; + for (size_t i=0; isize1; i++) { + d=gsl_vector_get(y_gsl, i); + y(i)=d; + for (size_t j=0; jsize2; j++) { + W(i,j)=gsl_matrix_get(W_gsl, i, j); + } + } + + //initial VB values by lm + cout<. + */ + + +#ifndef __LDR_H__ +#define __LDR_H__ + +#include +#include +#include +#include + +#ifdef FORCE_FLOAT +#include "param_float.h" +#else +#include "param.h" +#endif + + +using namespace std; + + + + + + +class LDR { + +public: + // IO related parameters + int a_mode; + size_t d_pace; + + string file_bfile; + string file_geno; + string file_out; + string path_out; + + // Summary statistics + size_t ni_total, ns_total; //number of total individuals and snps + size_t ni_test, ns_test; //number of individuals and snps used for analysis + size_t n_cvt; //number of covariates + vector indicator_idv; //indicator for individuals (phenotypes), 0 missing, 1 available for analysis + vector indicator_snp; //sequence indicator for SNPs: 0 ignored because of (a) maf, (b) miss, (c) non-poly; 1 available for analysis + + vector snpInfo; //record SNP information + + // Not included in PARAM + gsl_rng *gsl_r; + + // Main Functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + + void VB(const vector > &Xt, const gsl_matrix *W_gsl, const gsl_vector *y_gsl); +}; + + + +#endif + + diff --git a/src/logistic.cpp b/src/logistic.cpp new file mode 100644 index 0000000..1b47946 --- /dev/null +++ b/src/logistic.cpp @@ -0,0 +1,783 @@ +#include +#include +#include +#include +#include +#include +#include +#include "logistic.h" + +// I need to bundle all the data that goes to the function to optimze together. +typedef struct{ + gsl_matrix_int *X; + gsl_vector_int *nlev; + gsl_vector *y; + gsl_matrix *Xc; // continuous covariates Matrix Nobs x Kc (NULL if not used) + double lambdaL1; + double lambdaL2; +}fix_parm_mixed_T; + + + + + + +double fLogit_mixed(gsl_vector *beta + ,gsl_matrix_int *X + ,gsl_vector_int *nlev + ,gsl_matrix *Xc + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2) +{ + int n = y->size; + // int k = X->size2; + int npar = beta->size; + double total = 0; + double aux = 0; + /* omp_set_num_threads(ompthr); */ + /* /\* Changed loop start at 1 instead of 0 to avoid regularization of beta_0*\/ */ + /* /\*#pragma omp parallel for reduction (+:total)*\/ */ + for(int i = 1; i < npar; ++i) + total += beta->data[i]*beta->data[i]; + total = (-total*lambdaL2/2); + /* /\*#pragma omp parallel for reduction (+:aux)*\/ */ + for(int i = 1; i < npar; ++i) + aux += (beta->data[i]>0 ? beta->data[i] : -beta->data[i]); + total = total-aux*lambdaL1; + /* #pragma omp parallel for schedule(static) shared(n,beta,X,nlev,y) reduction (+:total) */ + for(int i = 0; i < n; ++i) { + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < X->size2; ++k) { + if(gsl_matrix_int_get(X,i,k)>0) + Xbetai+=beta->data[gsl_matrix_int_get(X,i,k)-1+iParm]; + iParm+=nlev->data[k]-1; + } + for(int k = 0; k < (Xc->size2); ++k) + Xbetai+= gsl_matrix_get(Xc,i,k)*beta->data[iParm++]; + total += y->data[i]*Xbetai-gsl_sf_log_1plusx(gsl_sf_exp(Xbetai)); + } + return -total; +} + + +void logistic_mixed_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + ,gsl_matrix_int *X //Matrix Nobs x K + ,gsl_vector_int *nlev // Vector with number categories + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc (NULL if not used) + ,gsl_vector *yhat //Vector of prob. predicted by the logistic + ) +{ + for(int i = 0; i < X->size1; ++i) { + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < X->size2; ++k) { + if(gsl_matrix_int_get(X,i,k)>0) + Xbetai+=beta->data[gsl_matrix_int_get(X,i,k)-1+iParm]; + iParm+=nlev->data[k]-1; + } + // Adding the continuous + for(int k = 0; k < (Xc->size2); ++k) + Xbetai+= gsl_matrix_get(Xc,i,k)*beta->data[iParm++]; + yhat->data[i]=1/(1 + gsl_sf_exp(-Xbetai)); + } +} + + +/* The gradient of f, df = (df/dx, df/dy). */ +void +wgsl_mixed_optim_df (const gsl_vector *beta, void *params, + gsl_vector *out) +{ + fix_parm_mixed_T *p = (fix_parm_mixed_T *)params; + int n = p->y->size; + int K = p->X->size2; + int Kc = p->Xc->size2; + int npar = beta->size; + // Intitialize gradient out necessary? + for(int i = 0; i < npar; ++i) + out->data[i]= 0; + /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0 */ + for(int i = 1; i < npar; ++i) + out->data[i]= p->lambdaL2*beta->data[i]; + for(int i = 1; i < npar; ++i) + out->data[i]+= p->lambdaL1*((beta->data[i]>0)-(beta->data[i]<0)); + + for(int i = 0; i < n; ++i) { + double pn=0; + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < K; ++k) { + if(gsl_matrix_int_get(p->X,i,k)>0) + Xbetai+=beta->data[gsl_matrix_int_get(p->X,i,k)-1+iParm]; + iParm+=p->nlev->data[k]-1; + } + // Adding the continuous + for(int k = 0; k < Kc; ++k) + Xbetai+= gsl_matrix_get(p->Xc,i,k)*beta->data[iParm++]; + + pn= -( p->y->data[i] - 1/(1 + gsl_sf_exp(-Xbetai)) ); + + out->data[0]+= pn; + iParm=1; + for(int k = 0; k < K; ++k) { + if(gsl_matrix_int_get(p->X,i,k)>0) + out->data[gsl_matrix_int_get(p->X,i,k)-1+iParm]+=pn; + iParm+=p->nlev->data[k]-1; + } + // Adding the continuous + for(int k = 0; k < Kc; ++k) { + out->data[iParm++] += gsl_matrix_get(p->Xc,i,k)*pn; + } + } + +} + + +/* The Hessian of f */ +void +wgsl_mixed_optim_hessian (const gsl_vector *beta, void *params, + gsl_matrix *out) +{ + fix_parm_mixed_T *p = (fix_parm_mixed_T *)params; + int n = p->y->size; + int K = p->X->size2; + int Kc = p->Xc->size2; + int npar = beta->size; + gsl_vector *gn = gsl_vector_alloc(npar); /* gn */ + // Intitialize Hessian out necessary ??? + gsl_matrix_set_zero(out); + /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0*/ + for(int i = 1; i < npar; ++i) + gsl_matrix_set(out,i,i,(p->lambdaL2)); // Double check this + // L1 penalty not working yet, as not differentiable, I may need to do coordinate descent (as in glm_net) + + for(int i = 0; i < n; ++i) { + double pn=0; + double aux=0; + double Xbetai=beta->data[0]; + int iParm1=1; + for(int k = 0; k < K; ++k) { + if(gsl_matrix_int_get(p->X,i,k)>0) + Xbetai+=beta->data[gsl_matrix_int_get(p->X,i,k)-1+iParm1]; + iParm1+=p->nlev->data[k]-1; //-1? + } + // Adding the continuous + for(int k = 0; k < Kc; ++k) + Xbetai+= gsl_matrix_get(p->Xc,i,k)*beta->data[iParm1++]; + + pn= 1/(1 + gsl_sf_exp(-Xbetai)); + // Add a protection for pn very close to 0 or 1? + aux=pn*(1-pn); + + // Calculate sub-gradient vector gn + gsl_vector_set_zero(gn); + gn->data[0]= 1; + iParm1=1; + for(int k = 0; k < K; ++k) { + if(gsl_matrix_int_get(p->X,i,k)>0) + gn->data[gsl_matrix_int_get(p->X,i,k)-1+iParm1]=1; + iParm1+=p->nlev->data[k]-1; + } + // Adding the continuous + for(int k = 0; k < Kc; ++k) { + gn->data[iParm1++] = gsl_matrix_get(p->Xc,i,k); + } + + for(int k1=0;k1data[k1]!=0) + for(int k2=0;k2data[k2]!=0) + *gsl_matrix_ptr(out,k1,k2) += (aux * gn->data[k1] * gn->data[k2]); + } + gsl_vector_free(gn); +} + + +double wgsl_mixed_optim_f(gsl_vector *v, void *params) +{ + double mLogLik=0; + fix_parm_mixed_T *p = (fix_parm_mixed_T *)params; + mLogLik = fLogit_mixed(v,p->X,p->nlev,p->Xc,p->y,p->lambdaL1,p->lambdaL2); + return mLogLik; +} + + +/* Compute both f and df together. */ +void +wgsl_mixed_optim_fdf (gsl_vector *x, void *params, + double *f, gsl_vector *df) +{ + *f = wgsl_mixed_optim_f(x, params); + wgsl_mixed_optim_df(x, params, df); +} + + +int logistic_mixed_fit(gsl_vector *beta + ,gsl_matrix_int *X + ,gsl_vector_int *nlev + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc (NULL if not used) + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2) +{ + + double mLogLik=0; + fix_parm_mixed_T p; + int npar = beta->size; + int iter=0; + double maxchange=0; + + //Intializing fix parameters + p.X=X; + p.Xc=Xc; + p.nlev=nlev; + p.y=y; + p.lambdaL1=lambdaL1; + p.lambdaL2=lambdaL2; + + //Initial fit + //#ifdef _RPR_DEBUG_ + mLogLik = wgsl_mixed_optim_f(beta,&p); + //fprintf(stderr,"#Initial -log(Lik(0))=%lf\n",mLogLik); + //#endif //_RPR_DEBUG + + gsl_matrix *myH = gsl_matrix_alloc(npar,npar); /* Hessian matrix*/ + gsl_vector *stBeta = gsl_vector_alloc(npar); /* Direction to move */ + + gsl_vector *myG = gsl_vector_alloc(npar); /* Gradient*/ + gsl_vector *tau = gsl_vector_alloc(npar); /* tau for QR*/ + + for(iter=0;iter<100;iter++){ + wgsl_mixed_optim_hessian(beta,&p,myH); //Calculate Hessian + wgsl_mixed_optim_df(beta,&p,myG); //Calculate Gradient + gsl_linalg_QR_decomp(myH,tau); //Calculate next beta + gsl_linalg_QR_solve(myH,tau,myG,stBeta); + gsl_vector_sub(beta,stBeta); + + //Monitor convergence + maxchange=0; + for(int i=0;idata[i])) + maxchange=fabs(stBeta->data[i]); + +#ifdef _RPR_DEBUG_ + //mLogLik = wgsl_mixed_optim_f(beta,&p); + //fprintf(stderr,"#iter %d, -log(Lik(0))=%lf,%lf\n",(int)iter,mLogLik,maxchange); +#endif //_RPR_DEBUG + + if(maxchange<1E-4) + break; + } + +#ifdef _RPR_DEBUG_ + //for (int i = 0; i < npar; i++) + // fprintf(stderr,"#par_%d= %lf\n",i,beta->data[i]); +#endif //_RPR_DEBUG + + //Final fit + mLogLik = wgsl_mixed_optim_f(beta,&p); + //fprintf(stderr,"#Final %d) -log(Lik(0))=%lf, maxchange %lf\n",iter,mLogLik,maxchange); + + gsl_vector_free (tau); + gsl_vector_free (stBeta); + gsl_vector_free (myG); + gsl_matrix_free (myH); + + return 0; +} + +/***************/ +/* Categorical */ +/***************/ + +// I need to bundle all the data that goes to the function to optimze together. +typedef struct{ + gsl_matrix_int *X; + gsl_vector_int *nlev; + gsl_vector *y; + double lambdaL1; + double lambdaL2; +}fix_parm_cat_T; + + +double fLogit_cat(gsl_vector *beta + ,gsl_matrix_int *X + ,gsl_vector_int *nlev + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2) +{ + int n = y->size; + // int k = X->size2; + int npar = beta->size; + double total = 0; + double aux = 0; + /* omp_set_num_threads(ompthr); */ + /* /\* Changed loop start at 1 instead of 0 to avoid regularization of beta 0*\/ */ + /* /\*#pragma omp parallel for reduction (+:total)*\/ */ + for(int i = 1; i < npar; ++i) + total += beta->data[i]*beta->data[i]; + total = (-total*lambdaL2/2); + /* /\*#pragma omp parallel for reduction (+:aux)*\/ */ + for(int i = 1; i < npar; ++i) + aux += (beta->data[i]>0 ? beta->data[i] : -beta->data[i]); + total = total-aux*lambdaL1; + /* #pragma omp parallel for schedule(static) shared(n,beta,X,nlev,y) reduction (+:total) */ + for(int i = 0; i < n; ++i) { + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < X->size2; ++k) { + if(gsl_matrix_int_get(X,i,k)>0) + Xbetai+=beta->data[gsl_matrix_int_get(X,i,k)-1+iParm]; + iParm+=nlev->data[k]-1; + } + total += y->data[i]*Xbetai-gsl_sf_log_1plusx(gsl_sf_exp(Xbetai)); + } + return -total; +} + + +void logistic_cat_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + ,gsl_matrix_int *X //Matrix Nobs x K + ,gsl_vector_int *nlev // Vector with number categories + ,gsl_vector *yhat //Vector of prob. predicted by the logistic + ) +{ + for(int i = 0; i < X->size1; ++i) { + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < X->size2; ++k) { + if(gsl_matrix_int_get(X,i,k)>0) + Xbetai+=beta->data[gsl_matrix_int_get(X,i,k)-1+iParm]; + iParm+=nlev->data[k]-1; + } + yhat->data[i]=1/(1 + gsl_sf_exp(-Xbetai)); + } +} + + +/* The gradient of f, df = (df/dx, df/dy). */ +void +wgsl_cat_optim_df (const gsl_vector *beta, void *params, + gsl_vector *out) +{ + fix_parm_cat_T *p = (fix_parm_cat_T *)params; + int n = p->y->size; + int K = p->X->size2; + int npar = beta->size; + // Intitialize gradient out necessary? + for(int i = 0; i < npar; ++i) + out->data[i]= 0; + /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0 */ + for(int i = 1; i < npar; ++i) + out->data[i]= p->lambdaL2*beta->data[i]; + for(int i = 1; i < npar; ++i) + out->data[i]+= p->lambdaL1*((beta->data[i]>0)-(beta->data[i]<0)); + + for(int i = 0; i < n; ++i) { + double pn=0; + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < K; ++k) { + if(gsl_matrix_int_get(p->X,i,k)>0) + Xbetai+=beta->data[gsl_matrix_int_get(p->X,i,k)-1+iParm]; + iParm+=p->nlev->data[k]-1; + } + // total += y->data[i]*Xbetai-log(1+gsl_sf_exp(Xbetai)); + pn= -( p->y->data[i] - 1/(1 + gsl_sf_exp(-Xbetai)) ); + + out->data[0]+= pn; + iParm=1; + for(int k = 0; k < K; ++k) { + if(gsl_matrix_int_get(p->X,i,k)>0) + out->data[gsl_matrix_int_get(p->X,i,k)-1+iParm]+=pn; + iParm+=p->nlev->data[k]-1; + } + } +} + + +/* The Hessian of f */ +void +wgsl_cat_optim_hessian (const gsl_vector *beta, void *params, + gsl_matrix *out) +{ + fix_parm_cat_T *p = (fix_parm_cat_T *)params; + int n = p->y->size; + int K = p->X->size2; + int npar = beta->size; + // Intitialize Hessian out necessary ??? + gsl_matrix_set_zero(out); + /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0*/ + for(int i = 1; i < npar; ++i) + gsl_matrix_set(out,i,i,(p->lambdaL2)); // Double check this + // L1 penalty not working yet, as not differentiable, I may need to do coordinate descent (as in glm_net) + + + for(int i = 0; i < n; ++i) { + double pn=0; + double aux=0; + double Xbetai=beta->data[0]; + int iParm2=1; + int iParm1=1; + for(int k = 0; k < K; ++k) { + if(gsl_matrix_int_get(p->X,i,k)>0) + Xbetai+=beta->data[gsl_matrix_int_get(p->X,i,k)-1+iParm1]; + iParm1+=p->nlev->data[k]-1; //-1? + } + // total += y->data[i]*Xbetai-log(1+gsl_sf_exp(Xbetai)); + pn= 1/(1 + gsl_sf_exp(-Xbetai)); + // Add a protection for pn very close to 0 or 1? + aux=pn*(1-pn); + *gsl_matrix_ptr(out,0,0)+=aux; + iParm2=1; + for(int k2 = 0; k2 < K; ++k2) { + if(gsl_matrix_int_get(p->X,i,k2)>0) + *gsl_matrix_ptr(out,0,gsl_matrix_int_get(p->X,i,k2)-1+iParm2)+=aux; + iParm2+=p->nlev->data[k2]-1; //-1? + } + iParm1=1; + for(int k1 = 0; k1 < K; ++k1) { + if(gsl_matrix_int_get(p->X,i,k1)>0) + *gsl_matrix_ptr(out,gsl_matrix_int_get(p->X,i,k1)-1+iParm1,0)+=aux; + iParm2=1; + for(int k2 = 0; k2 < K; ++k2) { + if((gsl_matrix_int_get(p->X,i,k1)>0) && (gsl_matrix_int_get(p->X,i,k2)>0)) + *gsl_matrix_ptr(out + ,gsl_matrix_int_get(p->X,i,k1)-1+iParm1 + ,gsl_matrix_int_get(p->X,i,k2)-1+iParm2 + )+=aux; + iParm2+=p->nlev->data[k2]-1; //-1? + } + iParm1+=p->nlev->data[k1]-1; //-1? + } + } +} + + +double wgsl_cat_optim_f(gsl_vector *v, void *params) +{ + double mLogLik=0; + fix_parm_cat_T *p = (fix_parm_cat_T *)params; + mLogLik = fLogit_cat(v,p->X,p->nlev,p->y,p->lambdaL1,p->lambdaL2); + return mLogLik; +} + + +/* Compute both f and df together. */ +void +wgsl_cat_optim_fdf (gsl_vector *x, void *params, + double *f, gsl_vector *df) +{ + *f = wgsl_cat_optim_f(x, params); + wgsl_cat_optim_df(x, params, df); +} + + +int logistic_cat_fit(gsl_vector *beta + ,gsl_matrix_int *X + ,gsl_vector_int *nlev + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2) +{ + double mLogLik=0; + fix_parm_cat_T p; + int npar = beta->size; + int iter=0; + double maxchange=0; + + //Intializing fix parameters + p.X=X; + p.nlev=nlev; + p.y=y; + p.lambdaL1=lambdaL1; + p.lambdaL2=lambdaL2; + + //Initial fit + //#ifdef _RPR_DEBUG_ + mLogLik = wgsl_cat_optim_f(beta,&p); + //fprintf(stderr,"#Initial -log(Lik(0))=%lf\n",mLogLik); + //#endif //_RPR_DEBUG + + gsl_matrix *myH = gsl_matrix_alloc(npar,npar); /* Hessian matrix*/ + gsl_vector *stBeta = gsl_vector_alloc(npar); /* Direction to move */ + + gsl_vector *myG = gsl_vector_alloc(npar); /* Gradient*/ + gsl_vector *tau = gsl_vector_alloc(npar); /* tau for QR*/ + + for(iter=0;iter<100;iter++){ + wgsl_cat_optim_hessian(beta,&p,myH); //Calculate Hessian + wgsl_cat_optim_df(beta,&p,myG); //Calculate Gradient + gsl_linalg_QR_decomp(myH,tau); //Calculate next beta + gsl_linalg_QR_solve(myH,tau,myG,stBeta); + gsl_vector_sub(beta,stBeta); + + //Monitor convergence + maxchange=0; + for(int i=0;idata[i])) + maxchange=fabs(stBeta->data[i]); + +#ifdef _RPR_DEBUG_ + mLogLik = wgsl_cat_optim_f(beta,&p); + //fprintf(stderr,"#iter %d, -log(Lik(0))=%lf,%lf\n",(int)iter,mLogLik,maxchange); +#endif //_RPR_DEBUG + + if(maxchange<1E-4) + break; + } + +#ifdef _RPR_DEBUG_ + //for (int i = 0; i < npar; i++) + // fprintf(stderr,"#par_%d= %lf\n",i,beta->data[i]); +#endif //_RPR_DEBUG + + //Final fit + mLogLik = wgsl_cat_optim_f(beta,&p); + //fprintf(stderr,"#Final %d) -log(Lik(0))=%lf, maxchange %lf\n",iter,mLogLik,maxchange); + + gsl_vector_free (tau); + gsl_vector_free (stBeta); + gsl_vector_free (myG); + gsl_matrix_free (myH); + + return 0; +} + + +/***************/ +/* Continuous */ +/***************/ + +// I need to bundle all the data that goes to the function to optimze together. +typedef struct{ + gsl_matrix *Xc; // continuous covariates Matrix Nobs x Kc + gsl_vector *y; + double lambdaL1; + double lambdaL2; +}fix_parm_cont_T; + + +double fLogit_cont(gsl_vector *beta + ,gsl_matrix *Xc + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2) +{ + int n = y->size; + int npar = beta->size; + double total = 0; + double aux = 0; + /* omp_set_num_threads(ompthr); */ + /* /\* Changed loop start at 1 instead of 0 to avoid regularization of beta_0*\/ */ + /* /\*#pragma omp parallel for reduction (+:total)*\/ */ + for(int i = 1; i < npar; ++i) + total += beta->data[i]*beta->data[i]; + total = (-total*lambdaL2/2); + /* /\*#pragma omp parallel for reduction (+:aux)*\/ */ + for(int i = 1; i < npar; ++i) + aux += (beta->data[i]>0 ? beta->data[i] : -beta->data[i]); + total = total-aux*lambdaL1; + /* #pragma omp parallel for schedule(static) shared(n,beta,X,nlev,y) reduction (+:total) */ + for(int i = 0; i < n; ++i) { + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < (Xc->size2); ++k) + Xbetai+= gsl_matrix_get(Xc,i,k)*beta->data[iParm++]; + total += y->data[i]*Xbetai-gsl_sf_log_1plusx(gsl_sf_exp(Xbetai)); + } + return -total; +} + + +void logistic_cont_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc (NULL if not used) + ,gsl_vector *yhat //Vector of prob. predicted by the logistic + ) +{ + for(int i = 0; i < Xc->size1; ++i) { + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < (Xc->size2); ++k) + Xbetai+= gsl_matrix_get(Xc,i,k)*beta->data[iParm++]; + yhat->data[i]=1/(1 + gsl_sf_exp(-Xbetai)); + } +} + + +/* The gradient of f, df = (df/dx, df/dy). */ +void +wgsl_cont_optim_df (const gsl_vector *beta, void *params, + gsl_vector *out) +{ + fix_parm_cont_T *p = (fix_parm_cont_T *)params; + int n = p->y->size; + int Kc = p->Xc->size2; + int npar = beta->size; + // Intitialize gradient out necessary? + for(int i = 0; i < npar; ++i) + out->data[i]= 0; + /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0 */ + for(int i = 1; i < npar; ++i) + out->data[i]= p->lambdaL2*beta->data[i]; + for(int i = 1; i < npar; ++i) + out->data[i]+= p->lambdaL1*((beta->data[i]>0)-(beta->data[i]<0)); + + for(int i = 0; i < n; ++i) { + double pn=0; + double Xbetai=beta->data[0]; + int iParm=1; + for(int k = 0; k < Kc; ++k) + Xbetai+= gsl_matrix_get(p->Xc,i,k)*beta->data[iParm++]; + + pn= -( p->y->data[i] - 1/(1 + gsl_sf_exp(-Xbetai)) ); + + out->data[0]+= pn; + iParm=1; + // Adding the continuous + for(int k = 0; k < Kc; ++k) { + out->data[iParm++] += gsl_matrix_get(p->Xc,i,k)*pn; + } + } +} + + +/* The Hessian of f */ +void +wgsl_cont_optim_hessian (const gsl_vector *beta, void *params, + gsl_matrix *out) +{ + fix_parm_cont_T *p = (fix_parm_cont_T *)params; + int n = p->y->size; + int Kc = p->Xc->size2; + int npar = beta->size; + gsl_vector *gn = gsl_vector_alloc(npar); /* gn */ + // Intitialize Hessian out necessary ??? + gsl_matrix_set_zero(out); + /* Changed loop start at 1 instead of 0 to avoid regularization of beta 0*/ + for(int i = 1; i < npar; ++i) + gsl_matrix_set(out,i,i,(p->lambdaL2)); // Double check this + // L1 penalty not working yet, as not differentiable, I may need to do coordinate descent (as in glm_net) + + for(int i = 0; i < n; ++i) { + double pn=0; + double aux=0; + double Xbetai=beta->data[0]; + int iParm1=1; + for(int k = 0; k < Kc; ++k) + Xbetai+= gsl_matrix_get(p->Xc,i,k)*beta->data[iParm1++]; + + pn= 1/(1 + gsl_sf_exp(-Xbetai)); + // Add a protection for pn very close to 0 or 1? + aux=pn*(1-pn); + + // Calculate sub-gradient vector gn + gsl_vector_set_zero(gn); + gn->data[0]= 1; + iParm1=1; + for(int k = 0; k < Kc; ++k) { + gn->data[iParm1++] = gsl_matrix_get(p->Xc,i,k); + } + + for(int k1=0;k1data[k1]!=0) + for(int k2=0;k2data[k2]!=0) + *gsl_matrix_ptr(out,k1,k2) += (aux * gn->data[k1] * gn->data[k2]); + } + gsl_vector_free(gn); +} + + +double wgsl_cont_optim_f(gsl_vector *v, void *params) +{ + double mLogLik=0; + fix_parm_cont_T *p = (fix_parm_cont_T *)params; + mLogLik = fLogit_cont(v,p->Xc,p->y,p->lambdaL1,p->lambdaL2); + return mLogLik; +} + + +/* Compute both f and df together. */ +void +wgsl_cont_optim_fdf (gsl_vector *x, void *params, + double *f, gsl_vector *df) +{ + *f = wgsl_cont_optim_f(x, params); + wgsl_cont_optim_df(x, params, df); +} + + +int logistic_cont_fit(gsl_vector *beta + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc (NULL if not used) + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2) +{ + + double mLogLik=0; + fix_parm_cont_T p; + int npar = beta->size; + int iter=0; + double maxchange=0; + + //Intializing fix parameters + p.Xc=Xc; + p.y=y; + p.lambdaL1=lambdaL1; + p.lambdaL2=lambdaL2; + + //Initial fit + //#ifdef _RPR_DEBUG_ + mLogLik = wgsl_cont_optim_f(beta,&p); + //fprintf(stderr,"#Initial -log(Lik(0))=%lf\n",mLogLik); + //#endif //_RPR_DEBUG + + gsl_matrix *myH = gsl_matrix_alloc(npar,npar); /* Hessian matrix*/ + gsl_vector *stBeta = gsl_vector_alloc(npar); /* Direction to move */ + + gsl_vector *myG = gsl_vector_alloc(npar); /* Gradient*/ + gsl_vector *tau = gsl_vector_alloc(npar); /* tau for QR*/ + + for(iter=0;iter<100;iter++){ + wgsl_cont_optim_hessian(beta,&p,myH); //Calculate Hessian + wgsl_cont_optim_df(beta,&p,myG); //Calculate Gradient + gsl_linalg_QR_decomp(myH,tau); //Calculate next beta + gsl_linalg_QR_solve(myH,tau,myG,stBeta); + gsl_vector_sub(beta,stBeta); + + //Monitor convergence + maxchange=0; + for(int i=0;idata[i])) + maxchange=fabs(stBeta->data[i]); + +#ifdef _RPR_DEBUG_ + mLogLik = wgsl_cont_optim_f(beta,&p); + //fprintf(stderr,"#iter %d, -log(Lik(0))=%lf,%lf\n",(int)iter,mLogLik,maxchange); +#endif //_RPR_DEBUG + + if(maxchange<1E-4) + break; + } + +#ifdef _RPR_DEBUG_ + //for (int i = 0; i < npar; i++) + // fprintf(stderr,"#par_%d= %lf\n",i,beta->data[i]); +#endif //_RPR_DEBUG + + //Final fit + mLogLik = wgsl_cont_optim_f(beta,&p); + //fprintf(stderr,"#Final %d) -log(Lik(0))=%lf, maxchange %lf\n",iter,mLogLik,maxchange); + + gsl_vector_free (tau); + gsl_vector_free (stBeta); + gsl_vector_free (myG); + gsl_matrix_free (myH); + + return 0; +} + diff --git a/src/logistic.h b/src/logistic.h new file mode 100644 index 0000000..a68ee09 --- /dev/null +++ b/src/logistic.h @@ -0,0 +1,70 @@ +#ifndef LOGISTIC_H_ /* Include guard */ +#define LOGISTIC_H_ + +/* Mixed interface */ +void logistic_mixed_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc + ,gsl_matrix_int *X //Matrix Nobs x K + ,gsl_vector_int *nlev // Vector with number categories + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc + ,gsl_vector *yhat //Vector of prob. predicted by the logistic + ); + +int logistic_mixed_fit(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc + ,gsl_matrix_int *X //Matrix Nobs x K + ,gsl_vector_int *nlev // Vector with number categories + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc + ,gsl_vector *y //Vector of prob. to predict + ,double lambdaL1 // Regularization L1 0.0 if not used + ,double lambdaL2); // Regularization L2 0.0 if not used + +double fLogit_mixed(gsl_vector *beta + ,gsl_matrix_int *X + ,gsl_vector_int *nlev + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2); + + +/* Categorical only interface */ +void logistic_cat_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc + ,gsl_matrix_int *X //Matrix Nobs x K + ,gsl_vector_int *nlev // Vector with number categories + ,gsl_vector *yhat //Vector of prob. predicted by the logistic + ); + +int logistic_cat_fit(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc + ,gsl_matrix_int *X //Matrix Nobs x K + ,gsl_vector_int *nlev // Vector with number categories + ,gsl_vector *y //Vector of prob. to predict + ,double lambdaL1 // Regularization L1 0.0 if not used + ,double lambdaL2); // Regularization L2 0.0 if not used + +double fLogit_cat(gsl_vector *beta + ,gsl_matrix_int *X + ,gsl_vector_int *nlev + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2); + + +/* Continuous only interface */ +void logistic_cont_pred(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc + ,gsl_vector *yhat //Vector of prob. predicted by the logistic + ); + +int logistic_cont_fit(gsl_vector *beta // Vector of parameters length = 1 + Sum_k(C_k - 1) + Kc + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc + ,gsl_vector *y //Vector of prob. to predict + ,double lambdaL1 // Regularization L1 0.0 if not used + ,double lambdaL2); // Regularization L2 0.0 if not used + +double fLogit_cont(gsl_vector *beta + ,gsl_matrix *Xc // continuous covariates Matrix Nobs x Kc + ,gsl_vector *y + ,double lambdaL1 + ,double lambdaL2); + + +#endif // LOGISTIC_H_ diff --git a/src/varcov.cpp b/src/varcov.cpp new file mode 100644 index 0000000..fdc6f10 --- /dev/null +++ b/src/varcov.cpp @@ -0,0 +1,482 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_blas.h" +#include "gsl/gsl_cdf.h" + +#include "lapack.h" +#include "gzstream.h" + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "varcov_float.h" +#include "io_float.h" +#include "mathfunc_float.h" +#else +#include "param.h" +#include "varcov.h" +#include "io.h" +#include "mathfunc.h" +#endif + + +using namespace std; + + + + +void VARCOV::CopyFromParam (PARAM &cPar) +{ + d_pace=cPar.d_pace; + + file_bfile=cPar.file_bfile; + file_geno=cPar.file_geno; + file_out=cPar.file_out; + path_out=cPar.path_out; + + time_opt=0.0; + + window_cm=cPar.window_cm; + window_bp=cPar.window_bp; + window_ns=cPar.window_ns; + + indicator_idv=cPar.indicator_idv; + indicator_snp=cPar.indicator_snp; + snpInfo=cPar.snpInfo; + + return; +} + + +void VARCOV::CopyToParam (PARAM &cPar) +{ + cPar.time_opt=time_opt; + return; +} + + + +//chr rs ps n_idv allele1 allele0 af var window_size r2 (r2_11,n_11,r2_12,n_12...r2_1m,n_1m) +void VARCOV::WriteCov (const int flag, const vector &snpInfo_sub, const vector > &Cov_mat) +{ + string file_cov; + file_cov=path_out+"/"+file_out; + file_cov+=".cor.txt"; + + ofstream outfile; + + if (flag==0) { + outfile.open (file_cov.c_str(), ofstream::out); + if (!outfile) {cout<<"error writing file: "<0) { + return false; + } else { + if (c_bp<0) { + return true; + } else if (c_bp>0) { + return false; + } else { + return true; + } + } +} + + +// do not sort SNPs (because gzip files do not support random access) +// then calculate n_nb, the number of neighbours, for each snp +void VARCOV::CalcNB (vector &snpInfo_sort) +{ + // stable_sort(snpInfo_sort.begin(), snpInfo_sort.end(), CompareSNPinfo); + + size_t t2=0, n_nb=0; + for (size_t t=0; t > &X_mat, vector &cov_vec) +{ + cov_vec.clear(); + + double v1, v2, r; + vector x_vec=X_mat[0]; + + lapack_ddot(x_vec, x_vec, v1); + cov_vec.push_back(v1/(double)x_vec.size() ); + + for (size_t i=1; i > &X_mat, vector &cov_vec) +{ + cov_vec.clear(); + + int d1, d2, m1, m2, n1, n2, n12; + double m1d, m2d, m12d, v1d, v2d, v; + + vector x_vec=X_mat[0]; + + //calculate m2 + m2=0; n2=0; + for (size_t j=0; j snpInfo_sub; + CalcNB(snpInfo); + + size_t ni_test=0; + for (size_t i=0; i x_vec, cov_vec; + vector > X_mat, Cov_mat; + + for (size_t i=0; i=2) {break;} + + if (X_mat.size()==0) { + n_nb=snpInfo[t].n_nb+1; + } else { + n_nb=snpInfo[t].n_nb-n_nb+1; + } + + for (int i=0; isize; j++) { + x_vec[j]=gsl_vector_get(geno, j); + } + X_mat.push_back(x_vec); + + t2++; + } + + n_nb=snpInfo[t].n_nb; + + Calc_Cor(X_mat, cov_vec); + Cov_mat.push_back(cov_vec); + snpInfo_sub.push_back(snpInfo[t]); + + X_mat.erase(X_mat.begin()); + + //write out var/cov values + if (Cov_mat.size()==10000) { + WriteCov (1, snpInfo_sub, Cov_mat); + Cov_mat.clear(); + snpInfo_sub.clear(); + } + } + + if (Cov_mat.size()!=0) { + WriteCov (1, snpInfo_sub, Cov_mat); + Cov_mat.clear(); + snpInfo_sub.clear(); + } + + gsl_vector_free(geno); + + infile.close(); + infile.clear(); + + return; +} + + + + +void VARCOV::AnalyzePlink () +{ + string file_bed=file_bfile+".bed"; + ifstream infile (file_bed.c_str(), ios::binary); + if (!infile) {cout<<"error reading bed file:"< snpInfo_sub; + CalcNB(snpInfo); + + size_t ni_test=0; + for (size_t i=0; i x_vec, cov_vec; + vector > X_mat, Cov_mat; + + for (size_t i=0; i=2) {break;} + + if (X_mat.size()==0) { + n_nb=snpInfo[t].n_nb+1; + } else { + n_nb=snpInfo[t].n_nb-n_nb+1; + } + + for (int i=0; isize; j++) { + x_vec[j]=gsl_vector_get(geno, j); + } + X_mat.push_back(x_vec); + + t2++; + } + + n_nb=snpInfo[t].n_nb; + + Calc_Cor(X_mat, cov_vec); + Cov_mat.push_back(cov_vec); + snpInfo_sub.push_back(snpInfo[t]); + + X_mat.erase(X_mat.begin()); + + //write out var/cov values + if (Cov_mat.size()==10000) { + WriteCov (1, snpInfo_sub, Cov_mat); + Cov_mat.clear(); + snpInfo_sub.clear(); + } + } + + if (Cov_mat.size()!=0) { + WriteCov (1, snpInfo_sub, Cov_mat); + Cov_mat.clear(); + snpInfo_sub.clear(); + } + + gsl_vector_free(geno); + + infile.close(); + infile.clear(); + + return; +} diff --git a/src/varcov.h b/src/varcov.h new file mode 100644 index 0000000..b380b8c --- /dev/null +++ b/src/varcov.h @@ -0,0 +1,72 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright (C) 2011 Xiang Zhou + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __VARCOV_H__ +#define __VARCOV_H__ + +#include "gsl/gsl_vector.h" +#include "gsl/gsl_matrix.h" + + +#ifdef FORCE_FLOAT +#include "param_float.h" +#include "io_float.h" +#else +#include "param.h" +#include "io.h" +#endif + +using namespace std; + + + + +class VARCOV { + +public: + // IO related parameters + string file_out; + string path_out; + string file_geno; + string file_bfile; + int d_pace; + + vector indicator_idv; + vector indicator_snp; + + vector snpInfo; + + double time_opt; + + // Class specific parameters + double window_cm; + size_t window_bp; + size_t window_ns; + + // Main functions + void CopyFromParam (PARAM &cPar); + void CopyToParam (PARAM &cPar); + void CalcNB (vector &snpInfo_sort); + void WriteCov (const int flag, const vector &snpInfo_sub, const vector > &Cov_mat); + void AnalyzeBimbam (); + void AnalyzePlink (); +}; + +#endif + + -- cgit v1.2.3