diff options
author | Pjotr Prins | 2025-01-03 01:35:06 -0600 |
---|---|---|
committer | Pjotr Prins | 2025-01-03 01:35:50 -0600 |
commit | 53530c209c17fe1f3b37513b53c2f0ef9a491dc3 (patch) | |
tree | 9860e4954fad775277bf0dad10a88c8381e02331 | |
parent | 1db223ea71837a0cdfcb881411a5a4cd685a7dae (diff) | |
download | pangemma-53530c209c17fe1f3b37513b53c2f0ef9a491dc3.tar.gz |
Adding support for checkpoints and relevant documentation
-rw-r--r-- | doc/code/pangemma.md | 23 | ||||
-rw-r--r-- | src/checkpoint.cpp | 34 | ||||
-rw-r--r-- | src/checkpoint.h | 31 | ||||
-rw-r--r-- | src/gemma.cpp | 13 | ||||
-rw-r--r-- | src/gemma_io.cpp | 3 | ||||
-rw-r--r-- | src/param.h | 3 |
6 files changed, 105 insertions, 2 deletions
diff --git a/doc/code/pangemma.md b/doc/code/pangemma.md index ac65f37..c359a56 100644 --- a/doc/code/pangemma.md +++ b/doc/code/pangemma.md @@ -106,6 +106,29 @@ Every propagator has state (too). I.e. it may be idle, computing and done. The runner visits the list of propagators and checks wether the inputs are complete and whether they have changed. On change computation has to happen updating the output cell. +## Setting check points in GEMMA + +GEMMA is quite stateful in its original design. We want to break the work up into chunks setting 'check points'. For example the actual kinship multiplication could start as 'start-compute-kinship' and end with 'compute-kinship' check points. To not have to deal with state too much we can simply let gemma run from the start of the program until 'compute-kinship' to have a kinship-propagator. The output will be a kinship file. Similarly we can run until 'filter-genotypes' that is a step earlier. The output of these propagators can be used by other pangemma propagators as input for comparison and continuation. All the original GEMMA does is act as a reference for alternative implementation of these chunks. Speed is not a concern though there may be opportunities to start compute after some of these check points (using intermediate output) down the line. + +So, let's start with a first check point implementation for 'read-bimbam-file'. + +## read-bimbam-file + +Reading the bimbam file happens in the `ReadFile_bim' function in `gemma_io.cpp'. Of course all it does is read a file - which is the same as any output. But just for the sake of a simple pilot we'll add the check point at the end of the function that will exit GEMMA. +We'll add a CLI switch `-checkpoint read-geno-file' which will force the exit. + +```C++ +checkpoint("read-geno-file",file_geno); +``` + +It passes in the outputfile (the infile in this case), that is used to feed the calling propagator. Some of the outfiles may be composed of multiple outputs - in that case we may add filenames. And exits with: + +``` +**** Checkpoint reached: read-geno-file (normal exit) +``` + +# Other + ## Example I created a very minimalistic example in Ruby with a simple round robin scheduler: diff --git a/src/checkpoint.cpp b/src/checkpoint.cpp new file mode 100644 index 0000000..6fcc1a2 --- /dev/null +++ b/src/checkpoint.cpp @@ -0,0 +1,34 @@ +/* + Copyright © 2025, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <checkpoint.h> + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <iostream> + +#include <param.h> + +using namespace std; + +void checkpoint(string msg, string filename) { + if (msg == checkpoint_name) { + cerr << "**** Checkpoint reached: " << msg << " (normal exit)" << endl; + exit(0); + } +} diff --git a/src/checkpoint.h b/src/checkpoint.h new file mode 100644 index 0000000..7a70855 --- /dev/null +++ b/src/checkpoint.h @@ -0,0 +1,31 @@ +/* + Checkpoints for pangemma propagators + + Copyright © 2015, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef __CHECKPOINT_H__ +#define __CHECKPOINT_H__ + +#include <string> + +using namespace std; + +extern string checkpoint_name; + +void checkpoint(string msg, string filename); + +#endif diff --git a/src/gemma.cpp b/src/gemma.cpp index c73c174..a50d8ab 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017-2021, Pjotr Prins + Copyright © 2017-2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -65,6 +65,8 @@ extern "C" { using namespace std; +string checkpoint_name; // -checkpoint switch is global + GEMMA::GEMMA(void) : version(GEMMA_VERSION), date(GEMMA_DATE), year(GEMMA_YEAR) {} void gemma_gsl_error_handler (const char * reason, @@ -726,6 +728,7 @@ void GEMMA::PrintHelp(size_t option) { cout << " -nind [num] read up to num individuals" << endl; cout << " -issue [num] enable tests relevant to issue tracker" << endl; cout << " -legacy run gemma in legacy mode" << endl; + cout << " -checkpoint [name] set checkpoint (see pangemma docs)" << endl; cout << endl; } @@ -1623,6 +1626,14 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { } else if (strcmp(argv[i], "-legacy") == 0) { debug_set_legacy_mode(true); warning_msg("you are running in legacy mode - support may drop in future versions of gemma"); + } else if (strcmp(argv[i], "-checkpoint") == 0) { + if (argv[i + 1] == NULL || argv[i + 1][0] == '-') { + continue; + } + ++i; + str.clear(); + str.assign(argv[i]); + checkpoint_name = str; } else { cout << "error! unrecognized option: " << argv[i] << endl; cPar.error = true; diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index b11bf88..698d3e2 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -42,6 +42,7 @@ #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" +#include "checkpoint.h" #include "debug.h" // #include "eigenlib.h" #include "fastblas.h" @@ -552,6 +553,7 @@ bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) { infile.close(); infile.clear(); + checkpoint("read-bimbam-file",file_bim); return true; } @@ -869,6 +871,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, infile.close(); infile.clear(); + checkpoint("read-geno-file",file_geno); return true; } diff --git a/src/param.h b/src/param.h index e747182..d3ce686 100644 --- a/src/param.h +++ b/src/param.h @@ -148,7 +148,8 @@ public: string file_weight, file_wsnp, file_wcat; string file_out; string file_bf, file_hyp; - string path_out; + string path_out; // -outdir switch + // string checkpoint; // -checkpoint switch is global string file_epm; // Estimated parameter file. string file_ebv; // Estimated breeding value file. |