/* Genome-wide Efficient Mixed Model Association (GEMMA) Copyright (C) 2011-2017, Xiang Zhou This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include "gsl/gsl_blas.h" #include "gsl/gsl_linalg.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" #include #include #include #include #include #include #include #include #include #include #include "gzstream.h" #include "gemma_io.h" #include "lapack.h" #include "mathfunc.h" #include "prdt.h" using namespace std; void PRDT::CopyFromParam(PARAM &cPar) { a_mode = cPar.a_mode; d_pace = cPar.d_pace; file_bfile = cPar.file_bfile; file_geno = cPar.file_geno; file_out = cPar.file_out; path_out = cPar.path_out; indicator_pheno = cPar.indicator_pheno; indicator_cvt = cPar.indicator_cvt; indicator_idv = cPar.indicator_idv; snpInfo = cPar.snpInfo; mapRS2est = cPar.mapRS2est; time_eigen = 0; n_ph = cPar.n_ph; np_obs = cPar.np_obs; np_miss = cPar.np_miss; ns_total = cPar.ns_total; ns_test = 0; return; } void PRDT::CopyToParam(PARAM &cPar) { cPar.ns_test = ns_test; cPar.time_eigen = time_eigen; return; } void PRDT::WriteFiles(gsl_vector *y_prdt) { string file_str; file_str = path_out + "/" + file_out; file_str += "."; file_str += "prdt"; file_str += ".txt"; ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { cout << "error writing file: " << file_str.c_str() << endl; return; } size_t ci_test = 0; for (size_t i = 0; i < indicator_idv.size(); i++) { if (indicator_idv[i] == 1) { outfile << "NA" << endl; } else { outfile << gsl_vector_get(y_prdt, ci_test) << endl; ci_test++; } } outfile.close(); outfile.clear(); return; } void PRDT::WriteFiles(gsl_matrix *Y_full) { string file_str; file_str = path_out + "/" + file_out; file_str += ".prdt.txt"; ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { cout << "error writing file: " << file_str.c_str() << endl; return; } size_t ci_test = 0; for (size_t i = 0; i < indicator_cvt.size(); i++) { if (indicator_cvt[i] == 0) { outfile << "NA" << endl; } else { for (size_t j = 0; j < Y_full->size2; j++) { outfile << gsl_matrix_get(Y_full, ci_test, j) << "\t"; } outfile << endl; ci_test++; } } outfile.close(); outfile.clear(); return; } void PRDT::AddBV(gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) { size_t ni_test = u_hat->size, ni_total = G->size1; gsl_matrix *Goo = gsl_matrix_alloc(ni_test, ni_test); gsl_matrix *Gfo = gsl_matrix_alloc(ni_total - ni_test, ni_test); gsl_matrix *U = gsl_matrix_alloc(ni_test, ni_test); gsl_vector *eval = gsl_vector_alloc(ni_test); gsl_vector *Utu = gsl_vector_alloc(ni_test); gsl_vector *w = gsl_vector_alloc(ni_total); gsl_permutation *pmt = gsl_permutation_alloc(ni_test); // center matrix G based on indicator_idv for (size_t i = 0; i < ni_total; i++) { gsl_vector_set(w, i, indicator_idv[i]); } CenterMatrix(G, w); // obtain Koo and Kfo size_t o_i = 0, o_j = 0; double d; for (size_t i = 0; i < indicator_idv.size(); i++) { o_j = 0; for (size_t j = 0; j < indicator_idv.size(); j++) { d = gsl_matrix_get(G, i, j); if (indicator_idv[i] == 1 && indicator_idv[j] == 1) { gsl_matrix_set(Goo, o_i, o_j, d); } if (indicator_idv[i] == 0 && indicator_idv[j] == 1) { gsl_matrix_set(Gfo, i - o_i, o_j, d); } if (indicator_idv[j] == 1) { o_j++; } } if (indicator_idv[i] == 1) { o_i++; } } // matrix operations to get u_prdt cout << "Start Eigen-Decomposition..." << endl; clock_t time_start = clock(); EigenDecomp(Goo, U, eval, 0); for (size_t i = 0; i < eval->size; i++) { if (gsl_vector_get(eval, i) < 1e-10) { gsl_vector_set(eval, i, 0); } } time_eigen = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); gsl_blas_dgemv(CblasTrans, 1.0, U, u_hat, 0.0, Utu); for (size_t i = 0; i < eval->size; i++) { d = gsl_vector_get(eval, i); if (d != 0) { d = gsl_vector_get(Utu, i) / d; gsl_vector_set(Utu, i, d); } } gsl_blas_dgemv(CblasNoTrans, 1.0, U, Utu, 0.0, eval); gsl_blas_dgemv(CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt); // Free matrices. gsl_matrix_free(Goo); gsl_matrix_free(Gfo); gsl_matrix_free(U); gsl_vector_free(eval); gsl_vector_free(Utu); gsl_vector_free(w); gsl_permutation_free(pmt); return; } void PRDT::AnalyzeBimbam(gsl_vector *y_prdt) { debug_msg("entering"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { cout << "error reading genotype file:" << file_geno << endl; return; } string line; char *ch_ptr; string rs; size_t n_miss, n_train_nomiss, c_phen; double geno, x_mean, x_train_mean, effect_size; gsl_vector *x = gsl_vector_alloc(y_prdt->size); gsl_vector *x_miss = gsl_vector_alloc(y_prdt->size); ns_test = 0; // Start reading genotypes and analyze. for (size_t t = 0; t < ns_total; ++t) { safeGetline(infile, line).eof(); if (t % d_pace == 0 || t == (ns_total - 1)) { ProgressBar("Reading SNPs ", t, ns_total - 1); } ch_ptr = strtok((char *)line.c_str(), " , \t"); rs = ch_ptr; ch_ptr = strtok(NULL, " , \t"); ch_ptr = strtok(NULL, " , \t"); if (mapRS2est.count(rs) == 0) { continue; } else { effect_size = mapRS2est[rs]; } x_mean = 0.0; c_phen = 0; n_miss = 0; x_train_mean = 0; n_train_nomiss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < indicator_idv.size(); ++i) { ch_ptr = strtok(NULL, " , \t"); if (indicator_idv[i] == 1) { if (strcmp(ch_ptr, "NA") != 0) { geno = atof(ch_ptr); x_train_mean += geno; n_train_nomiss++; } } else { if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(x_miss, c_phen, 0.0); n_miss++; } else { geno = atof(ch_ptr); gsl_vector_set(x, c_phen, geno); gsl_vector_set(x_miss, c_phen, 1.0); x_mean += geno; } c_phen++; } } if (x->size == n_miss) { cout << "snp " << rs << " has missing genotype for all " << "individuals and will be ignored." << endl; continue; } x_mean /= (double)(x->size - n_miss); x_train_mean /= (double)(n_train_nomiss); for (size_t i = 0; i < x->size; ++i) { geno = gsl_vector_get(x, i); if (gsl_vector_get(x_miss, i) == 0) { gsl_vector_set(x, i, x_mean - x_train_mean); } else { gsl_vector_set(x, i, geno - x_train_mean); } } gsl_vector_scale(x, effect_size); gsl_vector_add(y_prdt, x); ns_test++; } cout << endl; gsl_vector_free(x); gsl_vector_free(x_miss); infile.close(); infile.clear(); return; } void PRDT::AnalyzePlink(gsl_vector *y_prdt) { debug_msg("entering"); string file_bed = file_bfile + ".bed"; ifstream infile(file_bed.c_str(), ios::binary); if (!infile) { cout << "error reading bed file:" << file_bed << endl; return; } char ch[1]; bitset<8> b; string rs; size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss; double geno, x_mean, x_train_mean, effect_size; gsl_vector *x = gsl_vector_alloc(y_prdt->size); // Calculate n_bit and c, the number of bit for each SNP. if (indicator_idv.size() % 4 == 0) { n_bit = indicator_idv.size() / 4; } else { n_bit = indicator_idv.size() / 4 + 1; } // Print the first 3 magic numbers. for (size_t i = 0; i < 3; ++i) { infile.read(ch, 1); b = ch[0]; } ns_test = 0; for (vector::size_type t = 0; t < snpInfo.size(); ++t) { if (t % d_pace == 0 || t == snpInfo.size() - 1) { ProgressBar("Reading SNPs ", t, snpInfo.size() - 1); } rs = snpInfo[t].rs_number; if (mapRS2est.count(rs) == 0) { continue; } else { effect_size = mapRS2est[rs]; } // n_bit, and 3 is the number of magic numbers. infile.seekg(t * n_bit + 3); // Read genotypes. x_mean = 0.0; n_miss = 0; ci_total = 0; ci_test = 0; x_train_mean = 0; n_train_nomiss = 0; for (size_t i = 0; i < n_bit; ++i) { infile.read(ch, 1); b = ch[0]; // Minor allele homozygous: 2.0; major: 0.0. for (size_t j = 0; j < 4; ++j) { if ((i == (n_bit - 1)) && ci_total == indicator_idv.size()) { break; } if (indicator_idv[ci_total] == 1) { if (b[2 * j] == 0) { if (b[2 * j + 1] == 0) { x_train_mean += 2.0; n_train_nomiss++; } else { x_train_mean += 1.0; n_train_nomiss++; } } else { if (b[2 * j + 1] == 1) { n_train_nomiss++; } else { } } } else { if (b[2 * j] == 0) { if (b[2 * j + 1] == 0) { gsl_vector_set(x, ci_test, 2); x_mean += 2.0; } else { gsl_vector_set(x, ci_test, 1); x_mean += 1.0; } } else { if (b[2 * j + 1] == 1) { gsl_vector_set(x, ci_test, 0); } else { gsl_vector_set(x, ci_test, -9); n_miss++; } } ci_test++; } ci_total++; } } if (x->size == n_miss) { cout << "snp " << rs << " has missing genotype for all " << "individuals and will be ignored." << endl; continue; } x_mean /= (double)(x->size - n_miss); x_train_mean /= (double)(n_train_nomiss); for (size_t i = 0; i < x->size; ++i) { geno = gsl_vector_get(x, i); if (geno == -9) { gsl_vector_set(x, i, x_mean - x_train_mean); } else { gsl_vector_set(x, i, geno - x_train_mean); } } gsl_vector_scale(x, effect_size); gsl_vector_add(y_prdt, x); ns_test++; } cout << endl; gsl_vector_free(x); infile.close(); infile.clear(); return; } // Predict missing phenotypes using ridge regression. // Y_hat contains fixed effects void PRDT::MvnormPrdt(const gsl_matrix *Y_hat, const gsl_matrix *H, gsl_matrix *Y_full) { gsl_vector *y_obs = gsl_vector_alloc(np_obs); gsl_vector *y_miss = gsl_vector_alloc(np_miss); gsl_matrix *H_oo = gsl_matrix_alloc(np_obs, np_obs); gsl_matrix *H_mo = gsl_matrix_alloc(np_miss, np_obs); gsl_vector *Hiy = gsl_vector_alloc(np_obs); size_t c_obs1 = 0, c_obs2 = 0, c_miss1 = 0, c_miss2 = 0; // Obtain H_oo, H_mo. c_obs1 = 0; c_miss1 = 0; for (vector::size_type i1 = 0; i1 < indicator_pheno.size(); ++i1) { if (indicator_cvt[i1] == 0) { continue; } for (vector::size_type j1 = 0; j1 < n_ph; ++j1) { c_obs2 = 0; c_miss2 = 0; for (vector::size_type i2 = 0; i2 < indicator_pheno.size(); ++i2) { if (indicator_cvt[i2] == 0) { continue; } for (vector::size_type j2 = 0; j2 < n_ph; j2++) { if (indicator_pheno[i2][j2] == 1) { if (indicator_pheno[i1][j1] == 1) { gsl_matrix_set( H_oo, c_obs1, c_obs2, gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2)); } else { gsl_matrix_set( H_mo, c_miss1, c_obs2, gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2)); } c_obs2++; } else { c_miss2++; } } } if (indicator_pheno[i1][j1] == 1) { c_obs1++; } else { c_miss1++; } } } // Do LU decomposition of H_oo. int sig; gsl_permutation *pmt = gsl_permutation_alloc(np_obs); LUDecomp(H_oo, pmt, &sig); // Obtain y_obs=y_full-y_hat. // Add the fixed effects part to y_miss: y_miss=y_hat. c_obs1 = 0; c_miss1 = 0; for (vector::size_type i = 0; i < indicator_pheno.size(); ++i) { if (indicator_cvt[i] == 0) { continue; } for (vector::size_type j = 0; j < n_ph; ++j) { if (indicator_pheno[i][j] == 1) { gsl_vector_set(y_obs, c_obs1, gsl_matrix_get(Y_full, i, j) - gsl_matrix_get(Y_hat, i, j)); c_obs1++; } else { gsl_vector_set(y_miss, c_miss1, gsl_matrix_get(Y_hat, i, j)); c_miss1++; } } } LUSolve(H_oo, pmt, y_obs, Hiy); gsl_blas_dgemv(CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss); // Put back predicted y_miss to Y_full. c_miss1 = 0; for (vector::size_type i = 0; i < indicator_pheno.size(); ++i) { if (indicator_cvt[i] == 0) { continue; } for (vector::size_type j = 0; j < n_ph; ++j) { if (indicator_pheno[i][j] == 0) { gsl_matrix_set(Y_full, i, j, gsl_vector_get(y_miss, c_miss1)); c_miss1++; } } } // Free matrices. gsl_vector_free(y_obs); gsl_vector_free(y_miss); gsl_matrix_free(H_oo); gsl_matrix_free(H_mo); gsl_vector_free(Hiy); return; }