about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPjotr Prins2025-12-01 11:05:03 +0100
committerPjotr Prins2025-12-01 11:05:03 +0100
commit02149fb38d05b14f74caa458741b1512bbf6a8d5 (patch)
tree6cac95601c46b36d71cf403bed3ebc0a4345e0fc
parentb76a21467c8692f263c07aa4f013370a8d10ce20 (diff)
downloadpangemma-02149fb38d05b14f74caa458741b1512bbf6a8d5.tar.gz
Refactoring
-rw-r--r--src/gemma_io.cpp15
-rw-r--r--src/param.cpp32
-rw-r--r--src/param.h2
3 files changed, 24 insertions, 25 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index ef02429..cfa92d1 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -413,28 +413,28 @@ bool ReadFile_pheno(const string &file_pheno,
   double p;
 
   vector<double> pheno_row;
-  vector<int> ind_pheno_row;
+  vector<int> indicator_pheno_row;
 
   size_t p_max = *max_element(p_column.begin(), p_column.end());
   map<size_t, size_t> mapP2c;
   for (size_t i = 0; i < p_column.size(); i++) {
     mapP2c[p_column[i]] = i;
     pheno_row.push_back(-9);
-    ind_pheno_row.push_back(0);
+    indicator_pheno_row.push_back(0);
   }
 
   while (!safeGetline(infile, line).eof()) {
     ch_ptr = strtok((char *)line.c_str(), " ,\t");
     size_t i = 0;
     while (i < p_max) {
-      enforce_msg(ch_ptr,"Number of phenotypes in pheno file do not match phenotypes in geno file");
+      enforce_msg(ch_ptr,"Number of phenotypes in pheno file do not match selected columns");
       if (mapP2c.count(i + 1) != 0) {
         if (strcmp(ch_ptr, "NA") == 0) {
-          ind_pheno_row[mapP2c[i + 1]] = 0;
+          indicator_pheno_row[mapP2c[i + 1]] = 0; // skip this trait row
           pheno_row[mapP2c[i + 1]] = -9;
         } else {
           p = atof(ch_ptr);
-          ind_pheno_row[mapP2c[i + 1]] = 1;
+          indicator_pheno_row[mapP2c[i + 1]] = 1; // use this trait row
           pheno_row[mapP2c[i + 1]] = p;
         }
       }
@@ -442,7 +442,7 @@ bool ReadFile_pheno(const string &file_pheno,
       ch_ptr = strtok(NULL, " ,\t");
     }
 
-    indicator_pheno.push_back(ind_pheno_row);
+    indicator_pheno.push_back(indicator_pheno_row);
     pheno.push_back(pheno_row);
   }
 
@@ -2422,7 +2422,8 @@ bool ReadFile_est(const string &file_est, const vector<size_t> &est_column,
 }
 
 bool CountFileLines(const string &file_input, size_t &n_lines) {
-  debug_msg("entered");
+  checkpoint("count-lines",file_input);
+
   igzstream infile(file_input.c_str(), igzstream::in);
   if (!infile) {
     cout << "error! fail to open file: " << file_input << endl;
diff --git a/src/param.cpp b/src/param.cpp
index 955da8e..a81d18f 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -163,7 +163,7 @@ void PARAM::ReadFiles(void) {
       setSnps.clear();
     }
 
-    // Also read KSNP set. Snps used by GRM.
+    // Also read KSNP set. Reads all Snps later to be used by GRM.
     if (!file_ksnps.empty()) {
       if (ReadFile_snps(file_ksnps, setKSnps) == false) {
         error = true;
@@ -192,10 +192,12 @@ void PARAM::ReadFiles(void) {
 
 
     if (!file_geno.empty()) {
+      // --- Read (multi-)column phenotypes into pheno and set the NA indicator
       if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) {
         error = true;
       }
 
+      // --- compute ns_total by readin geno file
       if (!is_mdb && is_check_mode() && CountFileLines(file_geno, ns_total) == false) {
         error = true;
       }
@@ -225,7 +227,7 @@ void PARAM::ReadFiles(void) {
       indicator_idv.push_back(k);
     }
 
-    ns_test = 0;
+    ns_test = 0; // number of snps to test
 
     return;
   }
@@ -283,7 +285,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain
     // ni_test, save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
 
     // Obtain covariate matrix.
     auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt);
@@ -316,7 +318,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain
     // ni_test, save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
 
     // Obtain covariate matrix.
     auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt);
@@ -335,10 +337,9 @@ void PARAM::ReadFiles(void) {
 
     ns_total = indicator_snp.size();
   }
-
-  // Check MDB format
+  // lmdb-type genotype file:
   if (is_mdb && !file_geno.empty()) {
-    ns_total = -1;
+    ns_total = indicator_snp.size();
   }
 
   // Read genotype file for multiple PLINK files.
@@ -376,7 +377,7 @@ void PARAM::ReadFiles(void) {
 
         // Post-process covariates and phenotypes, obtain
         // ni_test, save all useful covariates.
-        ProcessCvtPhen();
+        ProcessInclusionIndicators();
 
         // Obtain covariate matrix.
         W3 = gsl_matrix_safe_alloc(ni_test, n_cvt);
@@ -420,7 +421,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain ni_test,
     // save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
 
     // Obtain covariate matrix.
     gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt);
@@ -473,7 +474,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain
     // ni_test, save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
 
     // Obtain covariate matrix.
     // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt);
@@ -508,7 +509,7 @@ void PARAM::ReadFiles(void) {
 
     // Post-process covariates and phenotypes, obtain
     // ni_test, save all useful covariates.
-    ProcessCvtPhen();
+    ProcessInclusionIndicators();
   }
 
   // Compute setKSnps when -loco is passed in
@@ -1989,7 +1990,7 @@ void PARAM::CheckCvt() {
 }
 
 // Post-process phenotypes and covariates.
-void PARAM::ProcessCvtPhen() {
+void PARAM::ProcessInclusionIndicators() {
 
   // Convert indicator_pheno to indicator_idv.
   int k = 1;
@@ -2027,11 +2028,8 @@ void PARAM::ProcessCvtPhen() {
 
   // Obtain ni_test.
   ni_test = 0;
-  for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
-    if (indicator_idv[i] == 0) {
-      continue;
-    }
-    ni_test++;
+  for(auto &i : indicator_idv) {
+    ni_test += indicator_idv[i];
   }
 
   // If subsample number is set, perform a random sub-sampling
diff --git a/src/param.h b/src/param.h
index 7cf4a6c..36c9a9f 100644
--- a/src/param.h
+++ b/src/param.h
@@ -344,7 +344,7 @@ public:
   void CopyA(size_t flag, gsl_matrix *A);
   void CopyGxe(gsl_vector *gxe);
   void CopyWeight(gsl_vector *w);
-  void ProcessCvtPhen();
+  void ProcessInclusionIndicators();
   void CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag);
   void CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag);
   gsl_matrix *MdbCalcKin();