about summary refs log tree commit diff
path: root/src/param.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/param.cpp')
-rw-r--r--src/param.cpp176
1 files changed, 90 insertions, 86 deletions
diff --git a/src/param.cpp b/src/param.cpp
index f96e9c3..96a9cd2 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -2,7 +2,7 @@
     Genome-wide Efficient Mixed Model Association (GEMMA)
     Copyright © 2011-2017, Xiang Zhou
     Copyright © 2017, Peter Carbonetto
-    Copyright © 2017, Pjotr Prins
+    Copyright © 2017-2025, Pjotr Prins
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -40,6 +40,7 @@
 #include "mathfunc.h"
 #include "param.h"
 #include "fastblas.h"
+#include "checkpoint.h"
 
 using namespace std;
 
@@ -104,7 +105,9 @@ PARAM::PARAM(void)
       randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200),
       error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1),
       time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0),
-      time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {}
+      time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {
+    is_mdb = false;
+  }
 
 PARAM::~PARAM() {
   if (gsl_r)
@@ -113,6 +116,7 @@ PARAM::~PARAM() {
 
 // Read files: obtain ns_total, ng_total, ns_test, ni_test.
 void PARAM::ReadFiles(void) {
+  checkpoint_nofile("PARAM::ReadFiles");
   string file_str;
 
   // Read cat file.
@@ -145,22 +149,24 @@ void PARAM::ReadFiles(void) {
     }
   }
 
-  // Read SNP set.
-  if (!file_snps.empty()) {
-    if (ReadFile_snps(file_snps, setSnps) == false) {
-      error = true;
+  if (!is_mdb) {
+    // Read SNP set into setSnps (without filtering)
+    if (!file_snps.empty()) {
+      if (ReadFile_snps(file_snps, setSnps) == false) {
+        error = true;
+      }
+    } else {
+      setSnps.clear();
     }
-  } else {
-    setSnps.clear();
-  }
 
-  // Read KSNP set.
-  if (!file_ksnps.empty()) {
-    if (ReadFile_snps(file_ksnps, setKSnps) == false) {
-      error = true;
+    // Also read KSNP set. Reads all Snps later to be used by GRM.
+    if (!file_ksnps.empty()) {
+      if (ReadFile_snps(file_ksnps, setKSnps) == false) {
+        error = true;
+      }
+    } else {
+      setKSnps.clear();
     }
-  } else {
-    setKSnps.clear();
   }
 
   // For prediction.
@@ -180,13 +186,15 @@ void PARAM::ReadFiles(void) {
       }
     }
 
+
     if (!file_geno.empty()) {
-      if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) ==
-          false) {
+      // --- Read (multi-)column phenotypes into pheno and set the NA indicator
+      if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
         error = true;
       }
 
-      if (CountFileLines(file_geno, ns_total) == false) {
+      // --- compute ns_total by readin geno file
+      if (!is_mdb && is_check_mode() && CountFileLines(file_geno, ns_total) == false) {
         error = true;
       }
     }
@@ -215,8 +223,6 @@ void PARAM::ReadFiles(void) {
       indicator_idv.push_back(k);
     }
 
-    ns_test = 0;
-
     return;
   }
 
@@ -273,7 +279,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain
     // ni_test, save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
 
     // Obtain covariate matrix.
     auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt);
@@ -290,7 +296,7 @@ void PARAM::ReadFiles(void) {
   }
 
   // Read genotype and phenotype file for BIMBAM format.
-  if (!file_geno.empty()) {
+  if (!is_mdb && !file_geno.empty()) {
 
     // Annotation file before genotype file.
     if (!file_anno.empty()) {
@@ -299,14 +305,14 @@ void PARAM::ReadFiles(void) {
       }
     }
 
-    // Phenotype file before genotype file.
+    // Phenotype file before genotype file. Already done this(?!)
     if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
       error = true;
     }
 
     // Post-process covariates and phenotypes, obtain
     // ni_test, save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
 
     // Obtain covariate matrix.
     auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt);
@@ -314,9 +320,10 @@ void PARAM::ReadFiles(void) {
 
     trim_individuals(indicator_idv, ni_max);
     trim_individuals(indicator_cvt, ni_max);
-    if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp,
-                      maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
-                      mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) {
+    // The following reads the geno file to get the SNPs - only for BIMBAM
+    if (ReadFile_bimbam_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp,
+                                          maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
+                                          mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) {
       error = true;
       return;
     }
@@ -324,6 +331,17 @@ void PARAM::ReadFiles(void) {
 
     ns_total = indicator_snp.size();
   }
+  // lmdb-type genotype file:
+  if (is_mdb && !file_geno.empty()) {
+    if (!file_kin.empty()) { // GWA
+      // Phenotype file before genotype file. Already done this(?!)
+      if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
+        error = true;
+      }
+      ProcessInclusionIndicators();
+      ns_total = indicator_snp.size();
+    }
+  }
 
   // Read genotype file for multiple PLINK files.
   if (!file_mbfile.empty()) {
@@ -360,7 +378,7 @@ void PARAM::ReadFiles(void) {
 
         // Post-process covariates and phenotypes, obtain
         // ni_test, save all useful covariates.
-        ProcessCvtPhen();
+        ProcessInclusionIndicators();
 
         // Obtain covariate matrix.
         W3 = gsl_matrix_safe_alloc(ni_test, n_cvt);
@@ -387,8 +405,8 @@ void PARAM::ReadFiles(void) {
     infile.clear();
   }
 
-  // Read genotype and phenotype file for multiple BIMBAM files.
-  if (!file_mgeno.empty()) {
+  // Read genotype and phenotype file for multiple BIMBAM files. Note this goes untested.
+  if (!is_mdb && !file_mgeno.empty()) {
 
     // Annotation file before genotype file.
     if (!file_anno.empty()) {
@@ -404,7 +422,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain ni_test,
     // save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
 
     // Obtain covariate matrix.
     gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt);
@@ -420,7 +438,7 @@ void PARAM::ReadFiles(void) {
     string file_name;
     size_t ns_test_tmp;
     while (!safeGetline(infile, file_name).eof()) {
-      if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp,
+      if (ReadFile_bimbam_geno(file_name, setSnps, W4, indicator_idv, indicator_snp,
                         maf_level, miss_level, hwe_level, r2_level, mapRS2chr,
                         mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) {
         error = true;
@@ -457,7 +475,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain
     // ni_test, save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
 
     // Obtain covariate matrix.
     // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt);
@@ -480,7 +498,8 @@ void PARAM::ReadFiles(void) {
       ni_test += indicator_idv[i];
     }
 
-    enforce_msg(ni_test,"number of analyzed individuals equals 0.");
+    if (!is_mdb)
+      enforce_msg(ni_test,"number of analyzed individuals equals 0.");
   }
 
   // For ridge prediction, read phenotype only.
@@ -491,7 +510,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain
     // ni_test, save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
   }
 
   // Compute setKSnps when -loco is passed in
@@ -920,13 +939,18 @@ void PARAM::CheckParam(void) {
   enforce_fexists(file_gwasnps, "open file");
   enforce_fexists(file_anno, "open file");
 
+  if (file_geno.find(".mdb") != string::npos) {
+    is_mdb = true;
+  }
+
   if (!loco.empty()) {
     enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 9 || a_mode == 21 || a_mode == 22,
                 "LOCO only works with LMM and K");
     // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)");
     enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)");
-    enforce_msg(!file_anno.empty(),
-                "LOCO requires annotation file (-a switch)");
+    if (!is_mdb)
+      enforce_msg(!file_anno.empty(), "Without mdb LOCO requires annotation file (-a switch)");
+
     enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch");
     enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch");
   }
@@ -1078,13 +1102,15 @@ void PARAM::CheckData(void) {
     }
   }
 
-  enforce_msg(ni_test,"number of analyzed individuals equals 0.");
+  if (!is_mdb) {
+    enforce_msg(ni_test,"number of analyzed individuals equals 0.");
 
-  if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() &&
-      file_study.empty() && file_beta.empty() && file_bf.empty()) {
-    error = true;
-    cout << "error! number of analyzed individuals equals 0. " << endl;
-    return;
+    if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() &&
+        file_study.empty() && file_beta.empty() && file_bf.empty()) {
+      error = true;
+      cout << "error! number of analyzed individuals equals 0. " << endl;
+      return;
+    }
   }
 
   if (a_mode == 43) {
@@ -1103,7 +1129,7 @@ void PARAM::CheckData(void) {
   }
 
   // Output some information.
-  if (file_cor.empty() && file_mstudy.empty() && file_study.empty() &&
+  if (!is_mdb && file_cor.empty() && file_mstudy.empty() && file_study.empty() &&
       a_mode != 15 && a_mode != 27 && a_mode != 28) {
     cout << "## number of total individuals = " << ni_total << endl;
     if (a_mode == 43) {
@@ -1251,14 +1277,14 @@ void PARAM::CheckData(void) {
 
 void PARAM::PrintSummary() {
   if (n_ph == 1) {
-    cout << "pve estimate =" << pve_null << endl;
-    cout << "se(pve) =" << pve_se_null << endl;
+    cout << "##   pve estimate = " << pve_null << endl;
+    cout << "##   se(pve) = " << pve_se_null << endl;
   } else {
   }
   return;
 }
 
-void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
+void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
   string file_str;
 
   if (!file_bfile.empty()) {
@@ -1268,8 +1294,7 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
       error = true;
     }
   } else {
-    if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K,
-                      calc_K) == false) {
+      if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, calc_K) == false) {
       error = true;
     }
   }
@@ -1277,44 +1302,25 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) {
   return;
 }
 
-void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K,
-                          const bool calc_K) {
-  string file_str;
-
-  if (!file_bfile.empty()) {
-    file_str = file_bfile + ".bed";
-    if (ReadFile_bed(file_str, indicator_idv, indicator_snp, Xt, K, calc_K,
-                     ni_test, ns_test) == false) {
-      error = true;
-    }
-  } else {
-    if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K,
-                      ni_test, ns_test) == false) {
-      error = true;
-    }
-  }
-
-  return;
+gsl_matrix *PARAM::MdbCalcKin() {
+  return mdb_calc_kin(file_geno, a_mode - 20, loco);
 }
 
 void PARAM::CalcKin(gsl_matrix *matrix_kin) {
+  checkpoint_nofile("PARAM::CalcKin");
   string file_str;
-
   gsl_matrix_set_zero(matrix_kin);
 
   if (!file_bfile.empty()) {
     file_str = file_bfile + ".bed";
-    // enforce_msg(loco.empty(), "FIXME: LOCO nyi");
     if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
         false) {
       error = true;
     }
   } else {
-    file_str = file_geno;
-    if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,
-                  matrix_kin, ni_max == 0) == false) {
-      error = true;
-    }
+    // indicator_snp is an array of bool
+    error = !BimbamKin(file_geno, setKSnps, indicator_snp, a_mode - 20, d_pace,
+                       matrix_kin, ni_max == 0);
   }
 
   return;
@@ -1990,7 +1996,7 @@ void PARAM::CheckCvt() {
 }
 
 // Post-process phenotypes and covariates.
-void PARAM::ProcessCvtPhen() {
+void PARAM::ProcessInclusionIndicators() {
 
   // Convert indicator_pheno to indicator_idv.
   int k = 1;
@@ -2028,11 +2034,8 @@ void PARAM::ProcessCvtPhen() {
 
   // Obtain ni_test.
   ni_test = 0;
-  for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
-    if (indicator_idv[i] == 0) {
-      continue;
-    }
-    ni_test++;
+  for(auto &i : indicator_idv) {
+    ni_test += indicator_idv[i];
   }
 
   // If subsample number is set, perform a random sub-sampling
@@ -2071,13 +2074,14 @@ void PARAM::ProcessCvtPhen() {
   }
 
   // Check ni_test.
-  if (a_mode != M_BSLMM5)
-    enforce_msg(ni_test,"number of analyzed individuals equals 0.");
-  if (ni_test == 0 && a_mode != M_BSLMM5) {
-    error = true;
-    cout << "error! number of analyzed individuals equals 0. " << endl;
+  if (!is_mdb) {
+    if (a_mode != M_BSLMM5)
+      enforce_msg(ni_test,"number of analyzed individuals equals 0.");
+    if (ni_test == 0 && a_mode != M_BSLMM5) {
+      error = true;
+      cout << "error! number of analyzed individuals equals 0. " << endl;
+    }
   }
-
   // Check covariates to see if they are correlated with each
   // other, and to see if the intercept term is included.
   // After getting ni_test.