about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPjotr Prins2025-12-01 08:31:39 +0100
committerPjotr Prins2025-12-01 08:31:39 +0100
commite0a32b97dd2e6b0ab769b6e7ec6d3d217eeb15a0 (patch)
treea16614b2046b59718eb8ef76bee98e181e4b7758
parente233c0d2b6b9744005c009af6b78683a24137a00 (diff)
downloadpangemma-e0a32b97dd2e6b0ab769b6e7ec6d3d217eeb15a0.tar.gz
Wiring in mdb GWA
-rw-r--r--src/gemma.cpp66
-rw-r--r--src/lmm.cpp54
-rw-r--r--src/lmm.h4
3 files changed, 95 insertions, 29 deletions
diff --git a/src/gemma.cpp b/src/gemma.cpp
index 1cced77..fb2227e 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -2591,20 +2591,24 @@ void GEMMA::BatchRun(PARAM &cPar) {
       cPar.a_mode == M_LMM4 || cPar.a_mode == M_LMM5 || cPar.a_mode == M_LMM9 ||
       cPar.a_mode == M_EIGEN) { // Fit LMM or mvLMM or eigen
     write(cPar.a_mode, "Mode");
-    gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph); // Y is ind x phenotypes
+    auto ni_test = cPar.ni_test;
+    auto n_ph = cPar.n_ph;
+    enforce(ni_test > 0);
+    enforce(n_ph > 0);
+    gsl_matrix *Y = gsl_matrix_safe_alloc(ni_test, n_ph); // Y is ind x phenotypes
     enforce_msg(Y, "allocate Y"); // just to be sure
-    gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt);    // W is ind x covariates
+    gsl_matrix *W = gsl_matrix_safe_alloc(ni_test, cPar.n_cvt);    // W is ind x covariates
     gsl_matrix *B = gsl_matrix_safe_alloc(Y->size2, W->size2);      // B is a d by c (effect size)
                                                           // matrix
     gsl_matrix *se_B = gsl_matrix_safe_alloc(Y->size2, W->size2);   // se
-    enforce(Y->size1 > 0);
-    gsl_matrix *K = gsl_matrix_safe_alloc(Y->size1, Y->size1);      // Kinship aka as G/GRM in the equation ind x ind
-    gsl_matrix *U = gsl_matrix_safe_alloc(Y->size1, Y->size1);
-    gsl_matrix *UtW = gsl_matrix_calloc(Y->size1, W->size2);        // Transformed ind x covariates
-    gsl_matrix *UtY = gsl_matrix_calloc(Y->size1, Y->size2);        // Transforemd ind x phenotypes
-    gsl_vector *eval = gsl_vector_calloc(Y->size1);                 // eigen values
-    gsl_vector *env = gsl_vector_safe_alloc(Y->size1);              // E environment
-    gsl_vector *weight = gsl_vector_safe_alloc(Y->size1);           // weights
+    enforce(ni_test > 0);
+    gsl_matrix *K = gsl_matrix_safe_alloc(ni_test, ni_test);      // Kinship aka as G/GRM in the equation ind x ind
+    gsl_matrix *U = gsl_matrix_safe_alloc(ni_test, ni_test);
+    gsl_matrix *UtW = gsl_matrix_calloc(ni_test, W->size2);        // Transformed ind x covariates
+    gsl_matrix *UtY = gsl_matrix_calloc(ni_test, Y->size2);        // Transforemd ind x phenotypes
+    gsl_vector *eval = gsl_vector_calloc(ni_test);                 // eigen values
+    gsl_vector *env = gsl_vector_safe_alloc(ni_test);              // E environment
+    gsl_vector *weight = gsl_vector_safe_alloc(ni_test);           // weights
     debug_msg("Started on LMM");
     assert_issue(is_issue(26), UtY->data[0] == 0.0);
 
@@ -2838,28 +2842,32 @@ void GEMMA::BatchRun(PARAM &cPar) {
           gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); // and its transformed
           write(&Y_col.vector, "Y_col");
 
-          if (!cPar.file_bfile.empty()) {
-            // PLINK analysis
-            if (cPar.file_gxe.empty()) {
-              cLmm.AnalyzePlink(U, eval, UtW, &UtY_col.vector, W,
-                                &Y_col.vector, cPar.setGWASnps);
+          if (cPar.is_mdb)
+            cLmm.mdb_calc_gwa(U, eval, UtW, &UtY_col.vector, W,
+                              &Y_col.vector, cPar.setGWASnps);
+          else
+            if (!cPar.file_bfile.empty()) {
+              // PLINK analysis
+              if (cPar.file_gxe.empty()) {
+                cLmm.AnalyzePlink(U, eval, UtW, &UtY_col.vector, W,
+                                  &Y_col.vector, cPar.setGWASnps);
+              }
+              else {
+                cLmm.AnalyzePlinkGXE(U, eval, UtW, &UtY_col.vector, W,
+                                     &Y_col.vector, env);
+              }
             }
             else {
-              cLmm.AnalyzePlinkGXE(U, eval, UtW, &UtY_col.vector, W,
-                                   &Y_col.vector, env);
+              // BIMBAM analysis
+
+              if (cPar.file_gxe.empty()) {
+                cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W,
+                                   &Y_col.vector, cPar.setGWASnps);
+              } else {
+                cLmm.AnalyzeBimbamGXE(U, eval, UtW, &UtY_col.vector, W,
+                                      &Y_col.vector, env);
+              }
             }
-          }
-          else {
-            // BIMBAM analysis
-
-            if (cPar.file_gxe.empty()) {
-              cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W,
-                                 &Y_col.vector, cPar.setGWASnps);
-            } else {
-              cLmm.AnalyzeBimbamGXE(U, eval, UtW, &UtY_col.vector, W,
-                                    &Y_col.vector, env);
-            }
-          }
           cLmm.WriteFiles(); // write output files
           cLmm.CopyToParam(cPar);
         } else {
diff --git a/src/lmm.cpp b/src/lmm.cpp
index f5b8a88..e58adc9 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -1867,6 +1867,60 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
 }
 
 /*
+  This is the mirror function of below AnalyzeBimbam, but uses mdb input instead.
+ */
+
+void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
+                          const gsl_matrix *UtW, const gsl_vector *Uty,
+                          const gsl_matrix *W, const gsl_vector *y,
+                          const set<string> gwasnps) {
+  debug_msg(file_geno);
+  auto infilen = file_geno.c_str();
+  checkpoint("start-read-geno-file",file_geno);
+
+  igzstream infile(infilen, igzstream::in);
+  enforce_msg(infile, "error reading genotype file");
+  size_t prev_line = 0;
+
+  std::vector <double> gs;
+  gs.resize(ni_total);
+
+  // fetch_snp is a callback function for every SNP row
+  std::function<SnpNameValues(size_t)>  fetch_snp = [&](size_t num) {
+    string line;
+    while (prev_line <= num) {
+      // also read SNPs that were skipped
+      safeGetline(infile, line);
+      prev_line++;
+    }
+    char *ch_ptr = strtok_safe2((char *)line.c_str(), " , \t",infilen);
+    // enforce_msg(ch_ptr, "Parsing BIMBAM genofile"); // ch_ptr should not be NULL
+
+    auto snp = string(ch_ptr);
+    ch_ptr = strtok_safe2(NULL, " , \t",infilen); // skip column
+    ch_ptr = strtok_safe2(NULL, " , \t",infilen); // skip column
+
+    gs.assign (ni_total,nan("")); // wipe values
+
+    for (size_t i = 0; i < ni_total; ++i) {
+      ch_ptr = strtok_safe2(NULL, " , \t",infilen);
+      if (strcmp(ch_ptr, "NA") != 0) {
+        gs[i] = atof(ch_ptr);
+        if (is_strict_mode() && gs[i] == 0.0)
+          enforce_is_float(std::string(ch_ptr)); // only allow for NA and valid numbers
+      }
+    }
+    return std::make_tuple(snp,gs);
+  };
+
+  LMM::Analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps);
+
+  infile.close();
+  infile.clear();
+}
+
+
+/*
 
 Looking at the `LMM::AnalyzeBimbam` function, here are the parameters that get modified:
 
diff --git a/src/lmm.h b/src/lmm.h
index 7b90510..cedbf38 100644
--- a/src/lmm.h
+++ b/src/lmm.h
@@ -100,6 +100,10 @@ public:
                const gsl_matrix *UtW, const gsl_vector *Uty,
                const gsl_matrix *W, const gsl_vector *y,
                const set<string> gwasnps);
+  void mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
+                     const gsl_matrix *UtW, const gsl_vector *Uty,
+                     const gsl_matrix *W, const gsl_vector *y,
+                     const set<string> gwasnps);
   void AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
                      const gsl_matrix *UtW, const gsl_vector *Uty,
                      const gsl_matrix *W, const gsl_vector *y,