aboutsummaryrefslogtreecommitdiff
/*
    Genome-wide Efficient Mixed Model Association (GEMMA)
    Copyright (C) 2011-2017, Xiang Zhou

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <bitset>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <vector>

#include "gsl/gsl_blas.h"
#include "gsl/gsl_cdf.h"
#include "gsl/gsl_linalg.h"
#include "gsl/gsl_matrix.h"
#include "gsl/gsl_vector.h"

#include "gzstream.h"
#include "gemma_io.h"
#include "lapack.h"
#include "mathfunc.h"
#include "param.h"
#include "varcov.h"

using namespace std;

void VARCOV::CopyFromParam(PARAM &cPar) {
  d_pace = cPar.d_pace;

  file_bfile = cPar.file_bfile;
  file_geno = cPar.file_geno;
  file_out = cPar.file_out;
  path_out = cPar.path_out;

  time_opt = 0.0;

  window_cm = cPar.window_cm;
  window_bp = cPar.window_bp;
  window_ns = cPar.window_ns;

  indicator_idv = cPar.indicator_idv;
  indicator_snp = cPar.indicator_snp;
  snpInfo = cPar.snpInfo;

  return;
}

void VARCOV::CopyToParam(PARAM &cPar) {
  cPar.time_opt = time_opt;
  return;
}

void VARCOV::WriteCov(const int flag, const vector<SNPINFO> &snpInfo_sub,
                      const vector<vector<double>> &Cov_mat) {
  string file_cov;
  file_cov = path_out + "/" + file_out;
  file_cov += ".cor.txt";

  ofstream outfile;

  if (flag == 0) {
    outfile.open(file_cov.c_str(), ofstream::out);
    if (!outfile) {
      cout << "error writing file: " << file_cov << endl;
      return;
    }

    outfile << "chr"
            << "\t"
            << "rs"
            << "\t"
            << "ps"
            << "\t"
            << "n_mis"
            << "\t"
            << "n_obs"
            << "\t"
            << "allele1"
            << "\t"
            << "allele0"
            << "\t"
            << "af"
            << "\t"
            << "window_size"
            << "\t"
            << "var"
            << "\t"
            << "cor" << endl;
  } else {
    outfile.open(file_cov.c_str(), ofstream::app);
    if (!outfile) {
      cout << "error writing file: " << file_cov << endl;
      return;
    }

    for (size_t i = 0; i < Cov_mat.size(); i++) {
      outfile << snpInfo_sub[i].chr << "\t" << snpInfo_sub[i].rs_number << "\t"
              << snpInfo_sub[i].base_position << "\t" << snpInfo_sub[i].n_miss
              << "\t" << snpInfo_sub[i].n_idv << "\t" << snpInfo_sub[i].a_minor
              << "\t" << snpInfo_sub[i].a_major << "\t" << fixed
              << setprecision(3) << snpInfo_sub[i].maf << "\t"
              << Cov_mat[i].size() - 1 << "\t";
      outfile << scientific << setprecision(6) << Cov_mat[i][0] << "\t";

      if (Cov_mat[i].size() == 1) {
        outfile << "NA";
      } else {
        for (size_t j = 1; j < Cov_mat[i].size(); j++) {
          if (j == (Cov_mat[i].size() - 1)) {
            outfile << Cov_mat[i][j];
          } else {
            outfile << Cov_mat[i][j] << ",";
          }
        }
      }

      outfile << endl;
    }
  }

  outfile.close();
  outfile.clear();
  return;
}

bool CompareSNPinfo(const SNPINFO &snpInfo1, const SNPINFO &snpInfo2) {
  int c_chr = snpInfo1.chr.compare(snpInfo2.chr);
  long int c_bp = snpInfo1.base_position - snpInfo2.base_position;

  if (c_chr < 0) {
    return true;
  } else if (c_chr > 0) {
    return false;
  } else {
    if (c_bp < 0) {
      return true;
    } else if (c_bp > 0) {
      return false;
    } else {
      return true;
    }
  }
}

// Do not sort SNPs (because gzip files do not support random access)
// then calculate n_nb, the number of neighbours, for each SNP.
void VARCOV::CalcNB(vector<SNPINFO> &snpInfo_sort) {
  size_t t2 = 0, n_nb = 0;
  for (size_t t = 0; t < indicator_snp.size(); ++t) {
    if (indicator_snp[t] == 0) {
      continue;
    }

    if (snpInfo_sort[t].chr == "-9" ||
        (snpInfo_sort[t].cM == -9 && window_cm != 0) ||
        (snpInfo_sort[t].base_position == -9 && window_bp != 0)) {
      snpInfo_sort[t].n_nb = 0;
      continue;
    }

    if (t == indicator_snp.size() - 1) {
      snpInfo_sort[t].n_nb = 0;
      continue;
    }

    t2 = t + 1;
    n_nb = 0;

    while (t2 < indicator_snp.size() &&
           snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
           indicator_snp[t2] == 0) {
      t2++;
    }

    while (t2 < indicator_snp.size() &&
           snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
           (snpInfo_sort[t2].cM - snpInfo_sort[t].cM < window_cm ||
            window_cm == 0) &&
           (snpInfo_sort[t2].base_position - snpInfo_sort[t].base_position <
            (long int) window_bp ||
            window_bp == 0) &&
           (n_nb < window_ns || window_ns == 0)) {
      t2++;
      n_nb++;
      while (t2 < indicator_snp.size() &&
             snpInfo_sort[t2].chr == snpInfo_sort[t].chr &&
             indicator_snp[t2] == 0) {
        t2++;
      }
    }

    snpInfo_sort[t].n_nb = n_nb;
  }

  return;
}

// Vector double is centered to have mean 0.
void Calc_Cor(vector<vector<double>> &X_mat, vector<double> &cov_vec) {
  cov_vec.clear();

  double v1, v2, r;
  vector<double> x_vec = X_mat[0];

  lapack_ddot(x_vec, x_vec, v1);
  cov_vec.push_back(v1 / (double)x_vec.size());

  for (size_t i = 1; i < X_mat.size(); i++) {
    lapack_ddot(X_mat[i], x_vec, r);
    lapack_ddot(X_mat[i], X_mat[i], v2);
    r /= sqrt(v1 * v2);

    cov_vec.push_back(r);
  }

  return;
}

// Read the genotype file again, calculate r2 between pairs of SNPs
// within a window, output the file every 10K SNPs the output results
// are sorted based on chr and bp output format similar to assoc.txt
// files (remember n_miss is replaced by n_idv).
//
// r2 between the current SNP and every following SNPs within the
// window_size (which can vary if cM was used) read bimbam mean
// genotype file and calculate the covariance matrix for neighboring
// SNPs output values at 10000-SNP-interval.
void VARCOV::AnalyzeBimbam() {
  debug_msg("entering");
  igzstream infile(file_geno.c_str(), igzstream::in);
  if (!infile) {
    cout << "error reading genotype file:" << file_geno << endl;
    return;
  }

  // Calculate the number of right-hand-side neighbours for each SNP.
  vector<SNPINFO> snpInfo_sub;
  CalcNB(snpInfo);

  size_t ni_test = 0;
  for (size_t i = 0; i < indicator_idv.size(); i++) {
    ni_test += indicator_idv[i];
  }

  gsl_vector *geno = gsl_vector_alloc(ni_test);
  double geno_mean;

  vector<double> x_vec, cov_vec;
  vector<vector<double>> X_mat, Cov_mat;

  for (size_t i = 0; i < ni_test; i++) {
    x_vec.push_back(0);
  }

  WriteCov(0, snpInfo_sub, Cov_mat);

  size_t t2 = 0, inc;
  int n_nb = 0;

  for (size_t t = 0; t < indicator_snp.size(); ++t) {
    if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) {
      ProgressBar("Reading SNPs  ", t, indicator_snp.size() - 1);
    }
    if (indicator_snp[t] == 0) {
      continue;
    }

    if (X_mat.size() == 0) {
      n_nb = snpInfo[t].n_nb + 1;
    } else {
      n_nb = snpInfo[t].n_nb - n_nb + 1;
    }

    for (int i = 0; i < n_nb; i++) {
      if (X_mat.size() == 0) {
        t2 = t;
      }

      // Read a line of the snp is filtered out.
      inc = 0;
      while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) {
        t2++;
        inc++;
      }

      Bimbam_ReadOneSNP(inc, indicator_idv, infile, geno, geno_mean);
      gsl_vector_add_constant(geno, -1.0 * geno_mean);

      for (size_t j = 0; j < geno->size; j++) {
        x_vec[j] = gsl_vector_get(geno, j);
      }
      X_mat.push_back(x_vec);

      t2++;
    }

    n_nb = snpInfo[t].n_nb;

    Calc_Cor(X_mat, cov_vec);
    Cov_mat.push_back(cov_vec);
    snpInfo_sub.push_back(snpInfo[t]);

    X_mat.erase(X_mat.begin());

    // Write out var/cov values.
    if (Cov_mat.size() == 10000) {
      WriteCov(1, snpInfo_sub, Cov_mat);
      Cov_mat.clear();
      snpInfo_sub.clear();
    }
  }

  if (Cov_mat.size() != 0) {
    WriteCov(1, snpInfo_sub, Cov_mat);
    Cov_mat.clear();
    snpInfo_sub.clear();
  }

  gsl_vector_free(geno);

  infile.close();
  infile.clear();

  return;
}

void VARCOV::AnalyzePlink() {
  debug_msg("entering");
  string file_bed = file_bfile + ".bed";
  ifstream infile(file_bed.c_str(), ios::binary);
  if (!infile) {
    cout << "error reading bed file:" << file_bed << endl;
    return;
  }

  // Calculate the number of right-hand-side neighbours for each SNP.
  vector<SNPINFO> snpInfo_sub;
  CalcNB(snpInfo);

  size_t ni_test = 0;
  for (size_t i = 0; i < indicator_idv.size(); i++) {
    ni_test += indicator_idv[i];
  }

  gsl_vector *geno = gsl_vector_alloc(ni_test);
  double geno_mean;

  vector<double> x_vec, cov_vec;
  vector<vector<double>> X_mat, Cov_mat;

  for (size_t i = 0; i < ni_test; i++) {
    x_vec.push_back(0);
  }

  WriteCov(0, snpInfo_sub, Cov_mat);

  size_t t2 = 0, inc;
  int n_nb = 0;

  for (size_t t = 0; t < indicator_snp.size(); ++t) {
    if (t % d_pace == 0 || t == (indicator_snp.size() - 1)) {
      ProgressBar("Reading SNPs  ", t, indicator_snp.size() - 1);
    }
    if (indicator_snp[t] == 0) {
      continue;
    }

    if (X_mat.size() == 0) {
      n_nb = snpInfo[t].n_nb + 1;
    } else {
      n_nb = snpInfo[t].n_nb - n_nb + 1;
    }

    for (int i = 0; i < n_nb; i++) {
      if (X_mat.size() == 0) {
        t2 = t;
      }

      // Read a line if the SNP is filtered out.
      inc = 0;
      while (t2 < indicator_snp.size() && indicator_snp[t2] == 0) {
        t2++;
        inc++;
      }

      Plink_ReadOneSNP(t2, indicator_idv, infile, geno, geno_mean);
      gsl_vector_add_constant(geno, -1.0 * geno_mean);

      for (size_t j = 0; j < geno->size; j++) {
        x_vec[j] = gsl_vector_get(geno, j);
      }
      X_mat.push_back(x_vec);

      t2++;
    }

    n_nb = snpInfo[t].n_nb;

    Calc_Cor(X_mat, cov_vec);
    Cov_mat.push_back(cov_vec);
    snpInfo_sub.push_back(snpInfo[t]);

    X_mat.erase(X_mat.begin());

    // Write out var/cov values.
    if (Cov_mat.size() == 10000) {
      WriteCov(1, snpInfo_sub, Cov_mat);
      Cov_mat.clear();
      snpInfo_sub.clear();
    }
  }

  if (Cov_mat.size() != 0) {
    WriteCov(1, snpInfo_sub, Cov_mat);
    Cov_mat.clear();
    snpInfo_sub.clear();
  }

  gsl_vector_free(geno);

  infile.close();
  infile.clear();

  return;
}