about summary refs log tree commit diff
path: root/src/gemma_io.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gemma_io.cpp')
-rw-r--r--src/gemma_io.cpp410
1 files changed, 256 insertions, 154 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index 7d1f0ca..e630f64 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -2,7 +2,7 @@
     Genome-wide Efficient Mixed Model Association (GEMMA)
     Copyright © 2011-2017, Xiang Zhou
     Copyright © 2017, Peter Carbonetto
-    Copyright © 2017-2020, Pjotr Prins
+    Copyright © 2017-2025, Pjotr Prins
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -41,6 +41,7 @@
 #include "gsl/gsl_linalg.h"
 #include "gsl/gsl_matrix.h"
 #include "gsl/gsl_vector.h"
+#include <lmdb++.h>
 
 #include "checkpoint.h"
 #include "debug.h"
@@ -61,7 +62,7 @@ void ProgressBar(string str, double p, double total, double ratio) {
   const double progress = (100.0 * p / total);
   const uint barsize = (int)(progress / 2.0); // characters
   // cout << barsize << endl;
-  // cout << str << " ";
+  cout << str << " ";
   // cout << p << "/" << total << endl;
   assert(barsize < 101); // corrupted data somehow
   if (barsize > 0) {
@@ -153,6 +154,7 @@ std::istream &safeGetline(std::istream &is, std::string &t) {
 // Read SNP file. A single column of SNP names.
 bool ReadFile_snps(const string file_snps, set<string> &setSnps) {
   debug_msg("enter ReadFile_snps");
+  checkpoint("read-file-snps",file_snps);
   setSnps.clear();
 
   igzstream infile(file_snps.c_str(), igzstream::in);
@@ -182,6 +184,7 @@ bool ReadFile_snps(const string file_snps, set<string> &setSnps) {
 bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps) {
   debug_msg("entered");
   setSnps.clear();
+  checkpoint("read-file-header",file_snps);
 
   igzstream infile(file_snps.c_str(), igzstream::in);
   if (!infile) {
@@ -239,6 +242,7 @@ bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps) {
 // Read log file.
 bool ReadFile_log(const string &file_log, double &pheno_mean) {
   debug_msg("ReadFile_log");
+  checkpoint("read-file-log",file_log);
   ifstream infile(file_log.c_str(), ifstream::in);
   if (!infile) {
     cout << "error! fail to open log file: " << file_log << endl;
@@ -282,6 +286,8 @@ bool ReadFile_anno(const string &file_anno, map<string, string> &mapRS2chr,
                    map<string, long int> &mapRS2bp,
                    map<string, double> &mapRS2cM) {
   debug_msg("ReadFile_anno");
+  checkpoint("read-file-anno",file_anno);
+
   mapRS2chr.clear();
   mapRS2bp.clear();
 
@@ -345,6 +351,7 @@ bool ReadFile_anno(const string &file_anno, map<string, string> &mapRS2chr,
 bool ReadFile_column(const string &file_pheno, vector<int> &indicator_idv,
                      vector<double> &pheno, const int &p_column) {
   debug_msg("entered");
+  checkpoint("read-file-column",file_pheno);
   indicator_idv.clear();
   pheno.clear();
 
@@ -389,6 +396,7 @@ bool ReadFile_pheno(const string &file_pheno,
                     vector<vector<double>> &pheno,
                     const vector<size_t> &p_column) {
   debug_msg("entered");
+  checkpoint("read-file-pheno",file_pheno);
   indicator_pheno.clear();
   pheno.clear();
 
@@ -398,35 +406,36 @@ bool ReadFile_pheno(const string &file_pheno,
     return false;
   }
 
-  string line;
-  char *ch_ptr;
-
-  string id;
+  // string id;
   double p;
 
   vector<double> pheno_row;
-  vector<int> ind_pheno_row;
+
+  string line;
+  char *ch_ptr;
+
+  vector<int> indicator_pheno_row;
 
   size_t p_max = *max_element(p_column.begin(), p_column.end());
   map<size_t, size_t> mapP2c;
   for (size_t i = 0; i < p_column.size(); i++) {
     mapP2c[p_column[i]] = i;
     pheno_row.push_back(-9);
-    ind_pheno_row.push_back(0);
+    indicator_pheno_row.push_back(0);
   }
 
   while (!safeGetline(infile, line).eof()) {
     ch_ptr = strtok((char *)line.c_str(), " ,\t");
     size_t i = 0;
     while (i < p_max) {
-      enforce_msg(ch_ptr,"Number of phenotypes in pheno file do not match phenotypes in geno file");
+      enforce_msg(ch_ptr,"Number of phenotypes in pheno file do not match selected columns");
       if (mapP2c.count(i + 1) != 0) {
         if (strcmp(ch_ptr, "NA") == 0) {
-          ind_pheno_row[mapP2c[i + 1]] = 0;
+          indicator_pheno_row[mapP2c[i + 1]] = 0; // skip this trait row
           pheno_row[mapP2c[i + 1]] = -9;
         } else {
           p = atof(ch_ptr);
-          ind_pheno_row[mapP2c[i + 1]] = 1;
+          indicator_pheno_row[mapP2c[i + 1]] = 1; // use this trait row
           pheno_row[mapP2c[i + 1]] = p;
         }
       }
@@ -434,7 +443,7 @@ bool ReadFile_pheno(const string &file_pheno,
       ch_ptr = strtok(NULL, " ,\t");
     }
 
-    indicator_pheno.push_back(ind_pheno_row);
+    indicator_pheno.push_back(indicator_pheno_row);
     pheno.push_back(pheno_row);
   }
 
@@ -448,6 +457,7 @@ bool ReadFile_pheno(const string &file_pheno,
 bool ReadFile_cvt(const string &file_cvt, vector<int> &indicator_cvt,
                   vector<vector<double>> &cvt, size_t &n_cvt) {
   debug_msg("entered");
+  checkpoint("read-file-cvt",file_cvt);
   indicator_cvt.clear();
 
   ifstream infile(file_cvt.c_str(), ifstream::in);
@@ -515,6 +525,7 @@ bool ReadFile_cvt(const string &file_cvt, vector<int> &indicator_cvt,
 
 // Read .bim file.
 bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) {
+  checkpoint("read-file-bim",file_bim);
   debug_msg("entered");
   snpInfo.clear();
 
@@ -563,6 +574,7 @@ bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) {
 bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno,
                   vector<vector<double>> &pheno, map<string, int> &mapID2num,
                   const vector<size_t> &p_column) {
+  checkpoint("read-file-fam",file_fam);
   debug_msg("entered");
   indicator_pheno.clear();
   pheno.clear();
@@ -639,13 +651,14 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno,
   return true;
 }
 
+
 // Read bimbam mean genotype file, the first time, to obtain #SNPs for
 // analysis (ns_test) and total #SNP (ns_total).
 /*
 ## Modified (Mutated) Parameters:
 
 1. **`indicator_idv`** (vector<int>&)
-   - Actually **not modified** - only read from to determine which individuals to include
+   - **Not modified** - only read from to determine which individuals to include
 
 2. **`indicator_snp`** (vector<int>&)
    - **Modified**: Cleared at the start (`indicator_snp.clear()`)
@@ -678,14 +691,15 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno,
 The function returns a `bool` indicating success/failure, but the real outputs are these three modified reference parameters.
 */
 
-bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
-                   const gsl_matrix *W, const vector<int> &indicator_idv,
-                   vector<int> &indicator_snp, const double &maf_level,
-                   const double &miss_level, const double &hwe_level,
-                   const double &r2_level, map<string, string> &mapRS2chr,
-                   map<string, long int> &mapRS2bp,
-                   map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo,
-                   size_t &ns_test) {
+bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
+                          const gsl_matrix *W, const vector<int> &indicator_idv,
+                          vector<int> &indicator_snp, const double &maf_level,
+                          const double &miss_level, const double &hwe_level,
+                          const double &r2_level, map<string, string> &mapRS2chr,
+                          map<string, long int> &mapRS2bp,
+                          map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo,
+                          size_t &ns_test) {
+  checkpoint("read-file-geno",file_geno);
   debug_msg("entered");
   indicator_snp.clear();
   snpInfo.clear();
@@ -724,9 +738,8 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
   double cM;
   size_t file_pos;
 
-  double maf, geno, geno_old;
+  double geno, geno_old;
   size_t n_miss;
-  size_t n_0, n_1, n_2;
   double min_g = std::numeric_limits<float>::max();
   double max_g = std::numeric_limits<float>::min();
 
@@ -737,6 +750,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
   for (int i = 0; i < ni_total; ++i) {
     ni_test += indicator_idv[i];
   }
+  // assert(ni_test == ni_total); // we use indicator_idv?
   ns_test = 0;
 
   file_pos = 0;
@@ -783,15 +797,14 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
     }
 
     // Start on a new marker/SNP
-    maf = 0;
     n_miss = 0;
     flag_poly = 0;
     geno_old = -9;
-    n_0 = 0;
-    n_1 = 0;
-    n_2 = 0;
     c_idv = 0;
     gsl_vector_set_zero(genotype_miss);
+    double maf = 0.0;
+    size_t n_0=0,n_1=0,n_2=0;
+
     auto infilen = file_geno.c_str();
     for (int i = 0; i < ni_total; ++i) { // read genotypes
       ch_ptr = strtok_safe2(NULL, " ,\t",(string("To many individuals/strains/genometypes in pheno file for ")+infilen).c_str());
@@ -808,19 +821,12 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
       }
 
       geno = atof(ch_ptr);
-      if (geno >= 0 && geno <= 0.5) {
-        n_0++;
-      }
-      if (geno > 0.5 && geno < 1.5) {
-        n_1++;
-      }
-      if (geno >= 1.5 && geno <= 2.0) {
-        n_2++;
-      }
 
       gsl_vector_set(genotype, c_idv, geno);
-      if (geno < min_g) min_g = geno;
-      if (geno > max_g) max_g = geno;
+      if (geno < min_g)
+        min_g = geno;
+      if (geno > max_g)
+        max_g = geno;
 
       // going through genotypes with 0.0 < geno < 2.0
       if (flag_poly == 0) { // first init in marker
@@ -831,10 +837,20 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
         flag_poly = 1;      // genotypes differ
       }
 
+      // compute ratios
+      if (hwe_level != 0 && maf_level != -1) {
+        if (geno >= 0 && geno <= 0.5)
+          n_0++;
+        if (geno > 0.5 && geno < 1.5)
+          n_1++;
+        if (geno >= 1.5 && geno <= 2.0)
+          n_2++;
+      }
       maf += geno;
 
       c_idv++;
     }
+
     maf /= 2.0 * (double)(ni_test - n_miss);
 
     SNPINFO sInfo = {chr,    rs,
@@ -871,8 +887,6 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
         continue;
       }
     }
-
-
     // -r2 flag
     for (size_t i = 0; i < genotype->size; ++i) {
       if (gsl_vector_get(genotype_miss, i) == 1) {
@@ -880,7 +894,6 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
         gsl_vector_set(genotype, i, geno);
       }
     }
-
     gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx);
     gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
     gsl_blas_ddot(genotype, genotype, &v_x);
@@ -924,6 +937,7 @@ bool ReadFile_bed(const string &file_bed, const set<string> &setSnps,
                   const double &maf_level, const double &miss_level,
                   const double &hwe_level, const double &r2_level,
                   size_t &ns_test) {
+  checkpoint("read-file-bed",file_bed);
   debug_msg("entered");
   indicator_snp.clear();
   size_t ns_total = snpInfo.size();
@@ -1232,6 +1246,7 @@ void Plink_ReadOneSNP(const int pos, const vector<int> &indicator_idv,
 void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv,
                   map<string, int> &mapID2num, const size_t k_mode, bool &error,
                   gsl_matrix *G) {
+  checkpoint("read-file-kin",file_kin);
   debug_msg("entered");
   igzstream infile(file_kin.c_str(), igzstream::in);
   if (!infile) {
@@ -1335,7 +1350,7 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv,
 
   infile.close();
   infile.clear();
-  checkpoint("read-kinship-file",file_kin);
+  checkpoint("end-read-file-kin",file_kin);
 
   return;
 }
@@ -1461,6 +1476,197 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
   return;
 }
 
+
+// Read bimbam mean genotype file and calculate kinship matrix.
+gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco) {
+  checkpoint("mdb-kin",file_geno);
+  bool is_loco = !loco.empty();
+
+  // Convert loco string to what we use in the chrpos index
+  uint8_t loco_chr = 0;
+  if (is_loco) {
+    if (loco == "X") {
+      loco_chr = CHR_X;
+    } else if (loco == "Y") {
+      loco_chr = CHR_Y;
+    } else if (loco == "M") {
+      loco_chr = CHR_M;
+    } else {
+      try {
+        loco_chr = static_cast<uint8_t>(stoi(loco));
+      } catch (...) {
+        loco_chr = 0;
+      }
+    }
+  }
+
+  /* Create and open the LMDB environment: */
+  auto env = lmdb::env::create();
+
+  env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */
+  env.set_max_dbs(10);
+  env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664);
+  string format;
+
+  size_t ni_total = 0, num_markers = 0;
+  {
+    auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
+    auto info = lmdb::dbi::open(rtxn, "info");
+
+    string_view meta;
+    if (info.get(rtxn, "meta", meta)) {
+      cout << meta << endl;
+    } else {
+      cout << "meta not found!" << endl;
+    }
+    std::string_view view_int;
+    info.get(rtxn, "numsamples", view_int);
+    ni_total = lmdb::from_sv<size_t>(view_int);
+    string_view info_key,info_value;
+    info.get(rtxn,"format",info_value);
+    format = string(info_value);
+
+    MDB_stat stat;
+    mdb_stat(rtxn, info, &stat);
+    assert(stat.ms_entries == 5);
+  }
+
+  auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY);
+
+  auto geno_mdb = lmdb::dbi::open(rtxn, "geno");
+
+  MDB_stat stat;
+  mdb_stat(rtxn, geno_mdb, &stat);
+  // cout << "Number of records: " << stat.ms_entries << endl;
+  num_markers = stat.ms_entries;
+
+  gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(ni_total,ni_total);
+  gsl_matrix_set_zero(matrix_kin);
+  gsl_vector *geno = gsl_vector_safe_alloc(ni_total);
+  gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total);
+
+  // Xlarge contains inds x markers
+  gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE);
+  enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE");
+  if (is_check_mode())
+    gsl_matrix_set_zero(Xlarge);
+
+  // [ ] check XLarge is zeroed properly
+  // [ ] handle missing data
+
+  // For every marker read the genotype x inds
+  size_t ns_test = 0;
+  size_t display_pace = 1000;
+
+  cout << "## number of total individuals = " << ni_total << endl;
+  cout << "## number of analyzed individuals = " << ni_total << endl;
+  cout << "## number of analyzed SNPs/var = " << num_markers << endl;
+
+  auto cursor = lmdb::cursor::open(rtxn, geno_mdb);
+
+  auto mdb_fetch = MDB_FIRST;
+  for (size_t t = 0; t < num_markers; ++t) {
+    string line;
+    if (t % display_pace == 0) {
+      ProgressBar("Reading SNPs", t, num_markers - 1);
+    }
+    string_view key, value;
+    cursor.get(key, value, mdb_fetch);
+    mdb_fetch = MDB_NEXT;
+
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data());
+    auto chr = static_cast<uint8_t>(data[1]);
+
+    if (is_loco && loco_chr == chr)
+      continue;
+
+    // ---- Depending on the format we get different buffers - currently float and byte are supported:
+    vector<double> gs;
+    if (format == "Gb") {
+      size_t num_bytes = value.size() / sizeof(uint8_t);
+      assert(num_bytes == ni_total);
+      auto size = num_bytes;
+      const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data());
+      gs.reserve(size);
+      for (size_t i = 0; i < size; ++i) {
+        double g = static_cast<double>(gs_bbuf[i])/127.0;
+        gs.push_back(g);
+      }
+    } else {
+      size_t num_floats = value.size() / sizeof(float);
+      assert(num_floats == ni_total);
+      auto size = num_floats;
+      const float* gs_fbuf = reinterpret_cast<const float*>(value.data());
+      gs.reserve(size);
+      for (size_t i = 0; i < size; ++i) {
+        gs.push_back(static_cast<double>(gs_fbuf[i]));
+      }
+    }
+
+    size_t n_miss = 0;
+    double geno_mean = 0.0;
+    double geno_var = 0.0;
+    gsl_vector_set_all(geno_miss, 0);
+
+    for (size_t i = 0; i < ni_total; ++i) {
+      double d = gs[i];
+      gsl_vector_set(geno, i, d);
+      gsl_vector_set(geno_miss, i, 1);
+      geno_mean += d;
+      geno_var += d * d;
+    }
+
+    geno_mean /= (double)(ni_total - n_miss);
+    geno_var += geno_mean * geno_mean * (double)n_miss;
+    geno_var /= (double)ni_total;
+    geno_var -= geno_mean * geno_mean;
+
+    // impute missing values by plugging in the mean
+    for (size_t i = 0; i < ni_total; ++i) {
+      if (gsl_vector_get(geno_miss, i) == 0) {
+        gsl_vector_set(geno, i, geno_mean);
+      }
+    }
+
+    // subtract the mean (centering genotype values)
+    gsl_vector_add_constant(geno, -1.0 * geno_mean);
+
+    // scale the genotypes
+    if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2
+                                        // flag does this
+      gsl_vector_scale(geno, 1.0 / sqrt(geno_var));
+    }
+
+    // set the SNP column ns_test to copy geno into the compute matrix Xlarge
+    gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % K_BATCH_SIZE);
+    enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno));
+
+    ns_test++;
+
+    // Every K_BATCH_SIZE rows batch compute kinship matrix and return
+    // by adding to matrix_kin
+    if (ns_test % K_BATCH_SIZE == 0) {
+      fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+      gsl_matrix_set_zero(Xlarge);
+      write(matrix_kin,"K updated");
+    }
+  }
+  if (ns_test % K_BATCH_SIZE != 0) { // compute last batch
+    fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin);
+    write(matrix_kin,"K updated");
+  }
+  ProgressBar("Reading SNPs", num_markers, num_markers);
+  cout << endl;
+
+  enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test));
+
+  gsl_vector_free(geno);
+  gsl_vector_free(geno_miss);
+  gsl_matrix_free(Xlarge);
+
+  return matrix_kin;
+}
+
 // Read bimbam mean genotype file and calculate kinship matrix.
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
@@ -1470,7 +1676,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
   auto infilen = file_geno.c_str();
   igzstream infile(infilen, igzstream::in);
   enforce_msg(infilen, "error reading genotype file");
-  checkpoint("read-geno-file",file_geno);
+  checkpoint("bimbam-kin",file_geno);
 
   size_t n_miss;
   double geno_mean, geno_var;
@@ -1792,6 +1998,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp,
 bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv,
                    vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K,
                    const bool calc_K) {
+  checkpoint("read-file-geno",file_geno);
   debug_msg("entered");
   igzstream infile(file_geno.c_str(), igzstream::in);
   if (!infile) {
@@ -1893,121 +2100,12 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv,
   return true;
 }
 
-// Compact version of the above function, using uchar instead of
-// gsl_matrix.
-bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv,
-                   vector<int> &indicator_snp,
-                   vector<vector<unsigned char>> &Xt, gsl_matrix *K,
-                   const bool calc_K, const size_t ni_test,
-                   const size_t ns_test) {
-  debug_msg("entered");
-  igzstream infile(file_geno.c_str(), igzstream::in);
-  if (!infile) {
-    cout << "error reading genotype file:" << file_geno << endl;
-    return false;
-  }
-
-  Xt.clear();
-  vector<unsigned char> Xt_row;
-  for (size_t i = 0; i < ni_test; i++) {
-    Xt_row.push_back(0);
-  }
-
-  string line;
-  char *ch_ptr;
-
-  if (calc_K == true) {
-    gsl_matrix_set_zero(K);
-  }
-
-  gsl_vector *genotype = gsl_vector_safe_alloc(ni_test);
-  gsl_vector *genotype_miss = gsl_vector_safe_alloc(ni_test);
-  double geno, geno_mean;
-  size_t n_miss;
-
-  size_t ni_total = indicator_idv.size();
-  size_t ns_total = indicator_snp.size();
-
-  size_t c_idv = 0, c_snp = 0;
-
-  auto infilen = file_geno.c_str();
-  for (size_t i = 0; i < ns_total; ++i) {
-    safeGetline(infile, line).eof();
-    if (indicator_snp[i] == 0) {
-      continue;
-    }
-
-    ch_ptr = strtok_safe2((char *)line.c_str(), " ,\t",infilen);
-    ch_ptr = strtok_safe2(NULL, " ,\t",infilen);
-    ch_ptr = strtok_safe2(NULL, " ,\t",infilen);
-
-    c_idv = 0;
-    geno_mean = 0;
-    n_miss = 0;
-    gsl_vector_set_zero(genotype_miss);
-    for (uint j = 0; j < ni_total; ++j) {
-      ch_ptr = strtok_safe2(NULL, " ,\t",infilen);
-      if (indicator_idv[j] == 0) {
-        continue;
-      }
-
-      if (strcmp(ch_ptr, "NA") == 0) {
-        gsl_vector_set(genotype_miss, c_idv, 1);
-        n_miss++;
-      } else {
-        geno = atof(ch_ptr);
-        gsl_vector_set(genotype, c_idv, geno);
-        geno_mean += geno;
-      }
-      c_idv++;
-    }
-
-    geno_mean /= (double)(ni_test - n_miss);
-
-    for (size_t j = 0; j < genotype->size; ++j) {
-      if (gsl_vector_get(genotype_miss, j) == 1) {
-        geno = geno_mean;
-      } else {
-        geno = gsl_vector_get(genotype, j);
-      }
-
-      Xt_row[j] = Double02ToUchar(geno);
-      gsl_vector_set(genotype, j, (geno - geno_mean));
-    }
-    Xt.push_back(Xt_row);
-
-    if (calc_K == true) {
-      gsl_blas_dsyr(CblasUpper, 1.0, genotype, K);
-    }
-
-    c_snp++;
-  }
-
-  if (calc_K == true) {
-    gsl_matrix_scale(K, 1.0 / (double)ns_test);
-
-    for (size_t i = 0; i < genotype->size; ++i) {
-      for (size_t j = 0; j < i; ++j) {
-        geno = gsl_matrix_get(K, j, i);
-        gsl_matrix_set(K, i, j, geno);
-      }
-    }
-  }
-
-  gsl_vector_free(genotype);
-  gsl_vector_free(genotype_miss);
-
-  infile.clear();
-  infile.close();
-
-  return true;
-}
-
 // Read bimbam mean genotype file, the second time, recode "mean"
 // genotype and calculate K.
 bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv,
                   vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K,
                   const bool calc_K) {
+  checkpoint("read-file-bed",file_bed);
   debug_msg("entered");
   ifstream infile(file_bed.c_str(), ios::binary);
   if (!infile) {
@@ -2141,6 +2239,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv,
                   vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt,
                   gsl_matrix *K, const bool calc_K, const size_t ni_test,
                   const size_t ns_test) {
+  checkpoint("read-file-bed",file_bed);
   debug_msg("entered");
   ifstream infile(file_bed.c_str(), ios::binary);
   if (!infile) {
@@ -2277,6 +2376,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv,
 
 bool ReadFile_est(const string &file_est, const vector<size_t> &est_column,
                   map<string, double> &mapRS2est) {
+  checkpoint("read-file-est",file_est);
   debug_msg("entered");
   mapRS2est.clear();
 
@@ -2344,7 +2444,8 @@ bool ReadFile_est(const string &file_est, const vector<size_t> &est_column,
 }
 
 bool CountFileLines(const string &file_input, size_t &n_lines) {
-  debug_msg("entered");
+  checkpoint("count-lines",file_input);
+
   igzstream infile(file_input.c_str(), igzstream::in);
   if (!infile) {
     cout << "error! fail to open file: " << file_input << endl;
@@ -2361,6 +2462,7 @@ bool CountFileLines(const string &file_input, size_t &n_lines) {
 // Read gene expression file.
 bool ReadFile_gene(const string &file_gene, vector<double> &vec_read,
                    vector<SNPINFO> &snpInfo, size_t &ng_total) {
+  checkpoint("read-file-gene",file_gene);
   debug_msg("entered");
   vec_read.clear();
   ng_total = 0;