/*
Genome-wide Efficient Mixed Model Association (GEMMA)
Copyright (C) 2011-2017, Xiang Zhou
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include "gsl/gsl_blas.h"
#include "gsl/gsl_linalg.h"
#include "gsl/gsl_matrix.h"
#include "gsl/gsl_vector.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "gzstream.h"
#include "gemma_io.h"
#include "lapack.h"
#include "mathfunc.h"
#include "prdt.h"
using namespace std;
void PRDT::CopyFromParam(PARAM &cPar) {
a_mode = cPar.a_mode;
d_pace = cPar.d_pace;
file_bfile = cPar.file_bfile;
file_geno = cPar.file_geno;
file_out = cPar.file_out;
path_out = cPar.path_out;
indicator_pheno = cPar.indicator_pheno;
indicator_cvt = cPar.indicator_cvt;
indicator_idv = cPar.indicator_idv;
snpInfo = cPar.snpInfo;
mapRS2est = cPar.mapRS2est;
time_eigen = 0;
n_ph = cPar.n_ph;
np_obs = cPar.np_obs;
np_miss = cPar.np_miss;
ns_total = cPar.ns_total;
ns_test = 0;
return;
}
void PRDT::CopyToParam(PARAM &cPar) {
cPar.ns_test = ns_test;
cPar.time_eigen = time_eigen;
return;
}
void PRDT::WriteFiles(gsl_vector *y_prdt) {
string file_str;
file_str = path_out + "/" + file_out;
file_str += ".";
file_str += "prdt";
file_str += ".txt";
ofstream outfile(file_str.c_str(), ofstream::out);
if (!outfile) {
cout << "error writing file: " << file_str.c_str() << endl;
return;
}
size_t ci_test = 0;
for (size_t i = 0; i < indicator_idv.size(); i++) {
if (indicator_idv[i] == 1) {
outfile << "NA" << endl;
} else {
outfile << gsl_vector_get(y_prdt, ci_test) << endl;
ci_test++;
}
}
outfile.close();
outfile.clear();
return;
}
void PRDT::WriteFiles(gsl_matrix *Y_full) {
string file_str;
file_str = path_out + "/" + file_out;
file_str += ".prdt.txt";
ofstream outfile(file_str.c_str(), ofstream::out);
if (!outfile) {
cout << "error writing file: " << file_str.c_str() << endl;
return;
}
size_t ci_test = 0;
for (size_t i = 0; i < indicator_cvt.size(); i++) {
if (indicator_cvt[i] == 0) {
outfile << "NA" << endl;
} else {
for (size_t j = 0; j < Y_full->size2; j++) {
outfile << gsl_matrix_get(Y_full, ci_test, j) << "\t";
}
outfile << endl;
ci_test++;
}
}
outfile.close();
outfile.clear();
return;
}
void PRDT::AddBV(gsl_matrix *G, const gsl_vector *u_hat, gsl_vector *y_prdt) {
size_t ni_test = u_hat->size, ni_total = G->size1;
gsl_matrix *Goo = gsl_matrix_alloc(ni_test, ni_test);
gsl_matrix *Gfo = gsl_matrix_alloc(ni_total - ni_test, ni_test);
gsl_matrix *U = gsl_matrix_alloc(ni_test, ni_test);
gsl_vector *eval = gsl_vector_alloc(ni_test);
gsl_vector *Utu = gsl_vector_alloc(ni_test);
gsl_vector *w = gsl_vector_alloc(ni_total);
gsl_permutation *pmt = gsl_permutation_alloc(ni_test);
// center matrix G based on indicator_idv
for (size_t i = 0; i < ni_total; i++) {
gsl_vector_set(w, i, indicator_idv[i]);
}
CenterMatrix(G, w);
// obtain Koo and Kfo
size_t o_i = 0, o_j = 0;
double d;
for (size_t i = 0; i < indicator_idv.size(); i++) {
o_j = 0;
for (size_t j = 0; j < indicator_idv.size(); j++) {
d = gsl_matrix_get(G, i, j);
if (indicator_idv[i] == 1 && indicator_idv[j] == 1) {
gsl_matrix_set(Goo, o_i, o_j, d);
}
if (indicator_idv[i] == 0 && indicator_idv[j] == 1) {
gsl_matrix_set(Gfo, i - o_i, o_j, d);
}
if (indicator_idv[j] == 1) {
o_j++;
}
}
if (indicator_idv[i] == 1) {
o_i++;
}
}
// matrix operations to get u_prdt
cout << "Start Eigen-Decomposition..." << endl;
clock_t time_start = clock();
EigenDecomp(Goo, U, eval, 0);
for (size_t i = 0; i < eval->size; i++) {
if (gsl_vector_get(eval, i) < 1e-10) {
gsl_vector_set(eval, i, 0);
}
}
time_eigen = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
gsl_blas_dgemv(CblasTrans, 1.0, U, u_hat, 0.0, Utu);
for (size_t i = 0; i < eval->size; i++) {
d = gsl_vector_get(eval, i);
if (d != 0) {
d = gsl_vector_get(Utu, i) / d;
gsl_vector_set(Utu, i, d);
}
}
gsl_blas_dgemv(CblasNoTrans, 1.0, U, Utu, 0.0, eval);
gsl_blas_dgemv(CblasNoTrans, 1.0, Gfo, eval, 1.0, y_prdt);
// Free matrices.
gsl_matrix_free(Goo);
gsl_matrix_free(Gfo);
gsl_matrix_free(U);
gsl_vector_free(eval);
gsl_vector_free(Utu);
gsl_vector_free(w);
gsl_permutation_free(pmt);
return;
}
void PRDT::AnalyzeBimbam(gsl_vector *y_prdt) {
debug_msg("entering");
igzstream infile(file_geno.c_str(), igzstream::in);
if (!infile) {
cout << "error reading genotype file:" << file_geno << endl;
return;
}
string line;
char *ch_ptr;
string rs;
size_t n_miss, n_train_nomiss, c_phen;
double geno, x_mean, x_train_mean, effect_size;
gsl_vector *x = gsl_vector_alloc(y_prdt->size);
gsl_vector *x_miss = gsl_vector_alloc(y_prdt->size);
ns_test = 0;
// Start reading genotypes and analyze.
for (size_t t = 0; t < ns_total; ++t) {
safeGetline(infile, line).eof();
if (t % d_pace == 0 || t == (ns_total - 1)) {
ProgressBar("Reading SNPs ", t, ns_total - 1);
}
ch_ptr = strtok((char *)line.c_str(), " , \t");
rs = ch_ptr;
ch_ptr = strtok(NULL, " , \t");
ch_ptr = strtok(NULL, " , \t");
if (mapRS2est.count(rs) == 0) {
continue;
} else {
effect_size = mapRS2est[rs];
}
x_mean = 0.0;
c_phen = 0;
n_miss = 0;
x_train_mean = 0;
n_train_nomiss = 0;
gsl_vector_set_zero(x_miss);
for (size_t i = 0; i < indicator_idv.size(); ++i) {
ch_ptr = strtok(NULL, " , \t");
if (indicator_idv[i] == 1) {
if (strcmp(ch_ptr, "NA") != 0) {
geno = atof(ch_ptr);
x_train_mean += geno;
n_train_nomiss++;
}
} else {
if (strcmp(ch_ptr, "NA") == 0) {
gsl_vector_set(x_miss, c_phen, 0.0);
n_miss++;
} else {
geno = atof(ch_ptr);
gsl_vector_set(x, c_phen, geno);
gsl_vector_set(x_miss, c_phen, 1.0);
x_mean += geno;
}
c_phen++;
}
}
if (x->size == n_miss) {
cout << "snp " << rs << " has missing genotype for all "
<< "individuals and will be ignored." << endl;
continue;
}
x_mean /= (double)(x->size - n_miss);
x_train_mean /= (double)(n_train_nomiss);
for (size_t i = 0; i < x->size; ++i) {
geno = gsl_vector_get(x, i);
if (gsl_vector_get(x_miss, i) == 0) {
gsl_vector_set(x, i, x_mean - x_train_mean);
} else {
gsl_vector_set(x, i, geno - x_train_mean);
}
}
gsl_vector_scale(x, effect_size);
gsl_vector_add(y_prdt, x);
ns_test++;
}
cout << endl;
gsl_vector_free(x);
gsl_vector_free(x_miss);
infile.close();
infile.clear();
return;
}
void PRDT::AnalyzePlink(gsl_vector *y_prdt) {
debug_msg("entering");
string file_bed = file_bfile + ".bed";
ifstream infile(file_bed.c_str(), ios::binary);
if (!infile) {
cout << "error reading bed file:" << file_bed << endl;
return;
}
char ch[1];
bitset<8> b;
string rs;
size_t n_bit, n_miss, ci_total, ci_test, n_train_nomiss;
double geno, x_mean, x_train_mean, effect_size;
gsl_vector *x = gsl_vector_alloc(y_prdt->size);
// Calculate n_bit and c, the number of bit for each SNP.
if (indicator_idv.size() % 4 == 0) {
n_bit = indicator_idv.size() / 4;
} else {
n_bit = indicator_idv.size() / 4 + 1;
}
// Print the first 3 magic numbers.
for (size_t i = 0; i < 3; ++i) {
infile.read(ch, 1);
b = ch[0];
}
ns_test = 0;
for (vector::size_type t = 0; t < snpInfo.size(); ++t) {
if (t % d_pace == 0 || t == snpInfo.size() - 1) {
ProgressBar("Reading SNPs ", t, snpInfo.size() - 1);
}
rs = snpInfo[t].rs_number;
if (mapRS2est.count(rs) == 0) {
continue;
} else {
effect_size = mapRS2est[rs];
}
// n_bit, and 3 is the number of magic numbers.
infile.seekg(t * n_bit + 3);
// Read genotypes.
x_mean = 0.0;
n_miss = 0;
ci_total = 0;
ci_test = 0;
x_train_mean = 0;
n_train_nomiss = 0;
for (size_t i = 0; i < n_bit; ++i) {
infile.read(ch, 1);
b = ch[0];
// Minor allele homozygous: 2.0; major: 0.0.
for (size_t j = 0; j < 4; ++j) {
if ((i == (n_bit - 1)) && ci_total == indicator_idv.size()) {
break;
}
if (indicator_idv[ci_total] == 1) {
if (b[2 * j] == 0) {
if (b[2 * j + 1] == 0) {
x_train_mean += 2.0;
n_train_nomiss++;
} else {
x_train_mean += 1.0;
n_train_nomiss++;
}
} else {
if (b[2 * j + 1] == 1) {
n_train_nomiss++;
} else {
}
}
} else {
if (b[2 * j] == 0) {
if (b[2 * j + 1] == 0) {
gsl_vector_set(x, ci_test, 2);
x_mean += 2.0;
} else {
gsl_vector_set(x, ci_test, 1);
x_mean += 1.0;
}
} else {
if (b[2 * j + 1] == 1) {
gsl_vector_set(x, ci_test, 0);
} else {
gsl_vector_set(x, ci_test, -9);
n_miss++;
}
}
ci_test++;
}
ci_total++;
}
}
if (x->size == n_miss) {
cout << "snp " << rs << " has missing genotype for all "
<< "individuals and will be ignored." << endl;
continue;
}
x_mean /= (double)(x->size - n_miss);
x_train_mean /= (double)(n_train_nomiss);
for (size_t i = 0; i < x->size; ++i) {
geno = gsl_vector_get(x, i);
if (geno == -9) {
gsl_vector_set(x, i, x_mean - x_train_mean);
} else {
gsl_vector_set(x, i, geno - x_train_mean);
}
}
gsl_vector_scale(x, effect_size);
gsl_vector_add(y_prdt, x);
ns_test++;
}
cout << endl;
gsl_vector_free(x);
infile.close();
infile.clear();
return;
}
// Predict missing phenotypes using ridge regression.
// Y_hat contains fixed effects
void PRDT::MvnormPrdt(const gsl_matrix *Y_hat, const gsl_matrix *H,
gsl_matrix *Y_full) {
gsl_vector *y_obs = gsl_vector_alloc(np_obs);
gsl_vector *y_miss = gsl_vector_alloc(np_miss);
gsl_matrix *H_oo = gsl_matrix_alloc(np_obs, np_obs);
gsl_matrix *H_mo = gsl_matrix_alloc(np_miss, np_obs);
gsl_vector *Hiy = gsl_vector_alloc(np_obs);
size_t c_obs1 = 0, c_obs2 = 0, c_miss1 = 0, c_miss2 = 0;
// Obtain H_oo, H_mo.
c_obs1 = 0;
c_miss1 = 0;
for (vector::size_type i1 = 0; i1 < indicator_pheno.size(); ++i1) {
if (indicator_cvt[i1] == 0) {
continue;
}
for (vector::size_type j1 = 0; j1 < n_ph; ++j1) {
c_obs2 = 0;
c_miss2 = 0;
for (vector::size_type i2 = 0; i2 < indicator_pheno.size(); ++i2) {
if (indicator_cvt[i2] == 0) {
continue;
}
for (vector::size_type j2 = 0; j2 < n_ph; j2++) {
if (indicator_pheno[i2][j2] == 1) {
if (indicator_pheno[i1][j1] == 1) {
gsl_matrix_set(
H_oo, c_obs1, c_obs2,
gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
} else {
gsl_matrix_set(
H_mo, c_miss1, c_obs2,
gsl_matrix_get(H, c_obs1 + c_miss1, c_obs2 + c_miss2));
}
c_obs2++;
} else {
c_miss2++;
}
}
}
if (indicator_pheno[i1][j1] == 1) {
c_obs1++;
} else {
c_miss1++;
}
}
}
// Do LU decomposition of H_oo.
int sig;
gsl_permutation *pmt = gsl_permutation_alloc(np_obs);
LUDecomp(H_oo, pmt, &sig);
// Obtain y_obs=y_full-y_hat.
// Add the fixed effects part to y_miss: y_miss=y_hat.
c_obs1 = 0;
c_miss1 = 0;
for (vector::size_type i = 0; i < indicator_pheno.size(); ++i) {
if (indicator_cvt[i] == 0) {
continue;
}
for (vector::size_type j = 0; j < n_ph; ++j) {
if (indicator_pheno[i][j] == 1) {
gsl_vector_set(y_obs, c_obs1, gsl_matrix_get(Y_full, i, j) -
gsl_matrix_get(Y_hat, i, j));
c_obs1++;
} else {
gsl_vector_set(y_miss, c_miss1, gsl_matrix_get(Y_hat, i, j));
c_miss1++;
}
}
}
LUSolve(H_oo, pmt, y_obs, Hiy);
gsl_blas_dgemv(CblasNoTrans, 1.0, H_mo, Hiy, 1.0, y_miss);
// Put back predicted y_miss to Y_full.
c_miss1 = 0;
for (vector::size_type i = 0; i < indicator_pheno.size(); ++i) {
if (indicator_cvt[i] == 0) {
continue;
}
for (vector::size_type j = 0; j < n_ph; ++j) {
if (indicator_pheno[i][j] == 0) {
gsl_matrix_set(Y_full, i, j, gsl_vector_get(y_miss, c_miss1));
c_miss1++;
}
}
}
// Free matrices.
gsl_vector_free(y_obs);
gsl_vector_free(y_miss);
gsl_matrix_free(H_oo);
gsl_matrix_free(H_mo);
gsl_vector_free(Hiy);
return;
}