Started generating output

author: Pjotr Prins 2025-12-02 09:04:28 +0100
committer: Pjotr Prins 2025-12-02 09:04:28 +0100
commit: a1e252bfc7fc874fb4c442288b4b33e232d5002e (patch)
tree: 240d7bd5910ee39f273221265b2064f7ec60fcd5
parent: db417c280c570dc153ec72b3874b11c23761f969 (diff)
download: pangemma-a1e252bfc7fc874fb4c442288b4b33e232d5002e.tar.gz
3 files changed, 43 insertions, 22 deletions
diff --git a/src/gemma.cpp b/src/gemma.cpp
index 69e338a..c7f7ab6 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -2842,9 +2842,10 @@ void GEMMA::BatchRun(PARAM &cPar) {
           gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); // and its transformed
           write(&Y_col.vector, "Y_col");
 
-          if (cPar.is_mdb)
+          if (cPar.is_mdb) {
             cLmm.mdb_calc_gwa(U, eval, UtW, &UtY_col.vector, W,
                               &Y_col.vector, cPar.setGWASnps);
+          }
           else
             if (!cPar.file_bfile.empty()) {
               // PLINK analysis
diff --git a/src/lmm.cpp b/src/lmm.cpp
index 1319d7c..1e5e229 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -108,11 +108,11 @@ void LMM::WriteFiles() {
 
   file_str = path_out + "/" + file_out;
   file_str += ".assoc.txt";
-  checkpoint("lmm-write-files",file_str.c_str());
+  checkpoint("lmm-write-files",file_str);
 
   ofstream outfile(file_str.c_str(), ofstream::out);
   if (!outfile) {
-    cout << "error writing file: " << file_str.c_str() << endl;
+    cout << "error writing file: " << file_str << endl;
     return;
   }
 
@@ -206,21 +206,29 @@ void LMM::WriteFiles() {
 
     common_header();
 
-    size_t t = 0;
-    for (size_t i = 0; i < snpInfo.size(); ++i) {
-      if (indicator_snp[i] == 0)
-        continue;
-      auto snp = snpInfo[i].rs_number;
-      if (process_gwasnps && setGWASnps.count(snp) == 0)
-        continue;
-      // cout << t << endl;
-      outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
-              << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t"
-              << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t"
-              << fixed << setprecision(3) << snpInfo[i].maf << "\t";
-
-      sumstats(sumStat[t]);
-      t++;
+    if (snpInfo.size()) {
+      size_t t = 0;
+      for (size_t i = 0; i < snpInfo.size(); ++i) {
+        if (indicator_snp[i] == 0)
+          continue;
+        auto snp = snpInfo[i].rs_number;
+        if (process_gwasnps && setGWASnps.count(snp) == 0)
+          continue;
+        // cout << t << endl;
+        outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t"
+                << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t"
+                << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t"
+                << fixed << setprecision(3) << snpInfo[i].maf << "\t";
+
+        sumstats(sumStat[t]);
+        t++;
+      }
+    }
+    else
+    {
+      for (auto &s : sumStat) {
+        sumstats(s);
+      }
     }
   }
 
@@ -2008,7 +2016,7 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
       continue;
     auto snp = get<1>(tup);
     auto gs = get<2>(tup);
-    cout << t << " SNP: " << snp << endl;
+    // cout << t << " SNP: " << snp << endl;
 
     // check whether SNP is included in gwasnps (used by LOCO)
     /*
@@ -2061,8 +2069,8 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
   if (c % msize)
     batch_compute(c % msize);
   ProgressBar("Reading SNPS", num_markers - 1, num_markers - 1);
-  // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl;
   cout << endl;
+  cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl;
   checkpoint_nofile("end-lmm-mdb-analyze");
 
   gsl_vector_safe_free(x);
@@ -2125,7 +2133,7 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
 
     if (result) {
       size_t num_floats = value.size() / sizeof(float);
-      cout << string(key) << ":" << num_floats << "," << ni_total << " " << endl;
+      // cout << string(key) << ":" << num_floats << "," << ni_total << " " << endl;
       assert(num_floats == ni_total);
       const float* gsbuf = reinterpret_cast<const float*>(value.data());
       auto snp = string(key);
@@ -2135,7 +2143,7 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
       for (size_t i = 0; i < size; ++i) {
         gs.push_back(static_cast<double>(gsbuf[i]));
       }
-      cout << "!!!!" << size << snp << ": " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl;
+      // cout << "!!!!" << size << snp << ": " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl;
     }
     // return { result,snp,gs };
     return make_tuple(result, snp, gs);
diff --git a/test/performance/releases.org b/test/performance/releases.org
index 082cdde..0b1c638 100644
--- a/test/performance/releases.org
+++ b/test/performance/releases.org
@@ -5,6 +5,18 @@
 
 Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
 
+Introducing mdb genotype format led to a 30% speed increase on the small mouse set:
+
+#+begin_src sh
+real    0m6.403s
+user    0m11.529s
+sys     0m6.325s
+#+end_src sh
+
+that may not look like much, but we are only starting!
+
+** Picking up the pieces
+
 We are facing a time regression.
 
 premake5 gmake2 && make verbose=1 config=release -j 8 gemma && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check
author	Pjotr Prins	2025-12-02 09:04:28 +0100
committer	Pjotr Prins	2025-12-02 09:04:28 +0100
commit	a1e252bfc7fc874fb4c442288b4b33e232d5002e (patch)
tree	240d7bd5910ee39f273221265b2064f7ec60fcd5
parent	db417c280c570dc153ec72b3874b11c23761f969 (diff)
download	pangemma-a1e252bfc7fc874fb4c442288b4b33e232d5002e.tar.gz