35 files changed, 2282 insertions, 682 deletions
diff --git a/.gitignore b/.gitignore index ef520c8..86c3228 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,10 @@ +build/ +PanGemma.make +gemma.make +gemmalib.make +Makefile +gdb.txt +*.prof *.o *.tar.gz src/Eigen @@ -12,3 +19,4 @@ doc/manual.log doc/manual.out doc/manual.toc contrib/ +.aider* diff --git a/Makefile b/Makefile.orig index 1009a6e..1009a6e 100644 --- a/Makefile +++ b/Makefile.orig diff --git a/README.md b/README.md index e13ca36..3239ab9 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,11 @@ This repository is used to rewrite and modernize the original GEMMA tool. The idea is to upgrade the software while keeping it going, using ideas from Hanson and Sussman's book on *Software Design for Flexibility: How to Avoid Programming Yourself into a Corner*. It is work in progress (WIP). For more information see [PanGEMMA design](./doc/code/pangemma.md). -GEMMA is the original software toolkit for fast application of linear mixed models (LMMs) and related models to genome-wide association studies (GWAS) and other large-scale data sets. You can find the original code on [github](https://github.com/genetics-statistics/GEMMA). +GEMMA is the original software toolkit for fast application of linear mixed models (LMMs) and related models to genome-wide association studies (GWAS) and other large-scale data sets. You can find the original code on [github](https://github.com/genetics-statistics/GEMMA). It may even build from this repo for the time being. + +NOTE: As of December 2024, main software development has moved to [PanGEMMA](https://git.genenetwork.org/pangemma/about/)! +Pangemma is essentially a fork of GEMMA that is meant to scale up for pangenomics. We are also taking the opportunity to revamp the code base. GEMMA itself is in *maintenance* mode. + Check out [RELEASE-NOTES.md](./RELEASE-NOTES.md) to see what's new in each release. @@ -42,15 +46,10 @@ WIP ## Run GEMMA -GEMMA is run from the command line. To run gemma +Pangemma (for now) maintains a version of gemma and may support new features. Run the legacy version with: -```sh -gemma -h ``` - -a typical example would be - -```sh +gemma -h # compute Kinship matrix gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt \ -gk -o mouse_hs1940 @@ -60,44 +59,16 @@ gemma -g ./example/mouse_hs1940.geno.txt.gz \ -k ./output/mouse_hs1940.cXX.txt -lmm -o mouse_hs1940_CD8_lmm ``` -Above example files are in the git repo and can be downloaded from -[github](https://github.com/genetics-statistics/GEMMA/tree/master/example). - ### Debugging and optimization -GEMMA has a wide range of debugging options which can be viewed with +We use guix for debugging and development. Try something like ``` - DEBUG OPTIONS - - -check enable checks (slower) - -no-fpe-check disable hardware floating point checking - -strict strict mode will stop when there is a problem - -silence silent terminal display - -debug debug output - -debug-data debug data output - -debug-dump -debug-data, but store the data to files (grep write() calls for messages/names) - -nind [num] read up to num individuals - -issue [num] enable tests relevant to issue tracker - -legacy run gemma in legacy mode +LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib gdb --args ./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz \ + -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt \ + -k ./output/mouse_hs1940.cXX.txt -lmm -o mouse_hs1940_CD8_lmm ``` -typically when running gemma you should use -debug which includes -relevant checks.
When compiled for debugging the debug version of -GEMMA gives more information. - -For performance you may want to use the -no-check option. Also check -the build optimization notes in [INSTALL.md](INSTALL.md). - -## Help - -+ [The GEMMA manual](doc/manual.pdf). - -+ [Detailed example with HS mouse data](example/demo.txt). - -+ [Tutorial on GEMMA for genome-wide association -analysis](https://github.com/rcc-uchicago/genetic-data-analysis-2). - ## Citing PanGEMMA PanGEMMA is not published yet. @@ -132,35 +103,21 @@ studies.](https://doi.org/10.1101/042846) *Annals of Applied Statistics*, in pre ## License -PanGEMMA Copyright (C) 2012–2025, Pjotr Prins, Xiang Zhou and others (see the soure file headers and git log). - -The *PanGEMMA* and *GEMMA* source code repository is free software: you can redistribute it under the terms of the [GNU General Public License](http://www.gnu.org/licenses/gpl.html). All the files in this project are part of *GEMMA*. This project is distributed in the hope that it will be useful, but **without any warranty**; without even the implied warranty of **merchantability or fitness for a particular purpose**. See file [LICENSE](LICENSE) for the full text of the license. - -Both the source code for the [gzstream zlib wrapper](http://www.cs.unc.edu/Research/compgeom/gzstream/) and [shUnit2](https://github.com/genenetwork/shunit2) unit testing framework included in GEMMA are distributed under the [GNU Lesser General Public License](contrib/shunit2-2.0.3/doc/LGPL-2.1), either version 2.1 of the License, or (at your option) any later revision. - -The source code for the included [Catch](http://catch-lib.net) unit testing framework is distributed under the [Boost Software Licence version 1](https://github.com/philsquared/Catch/blob/master/LICENSE.txt). +Copyright (C) 2012–2025, Pjotr Prins & Xiang Zhou -## Optimizing performance +The *PanGEMMA* source code is free software: you can redistribute it under the terms of the [GNU General Public License](http://www.gnu.org/licenses/gpl.html). All the files in this project are part of *PanGEMMA*. This project is distributed in the hope that it will be useful, but **without any warranty**; without even the implied warranty of **merchantability or fitness for a particular purpose**. See file [LICENSE](LICENSE) for the full text of the license. -Precompiled binaries and libraries may not be optimal for your particular hardware. See [INSTALL.md](INSTALL.md) for speeding up tips. +The original source code for *GEMMA* that is part of *PanGEMMA* is distributed under the same GPL license. -## Building from source - -More information on source code, dependencies and installation can be found in [INSTALL.md](INSTALL.md). - -## Input data formats - -## Contributing code, reporting a PanGEMMA bug or issue - -WIP +The source code for the [gzstream zlib wrapper](http://www.cs.unc.edu/Research/compgeom/gzstream/) (still) included in GEMMA is distributed under the [GNU Lesser General Public License](contrib/shunit2-2.0.3/doc/LGPL-2.1), either version 2.1 of the License, or (at your option) any later revision. ## Code of conduct -By using GEMMA and communicating with its communtity you implicitely agree to abide by the [code of conduct](https://software-carpentry.org/conduct/) as published by the Software Carpentry initiative. +By being part of the PanGEMMA community and communicating with its members, you implicitly agree to abide by the [code of conduct](https://software-carpentry.org/conduct/) as published by the Software Carpentry initiative.
## Credits -The *PanGEMMA* software was developmed +The *PanGEMMA* software was developed by [Pjotr Prins](http://thebird.nl/)<br> Dept. of Genetics, Genomics and Informatics<br> @@ -172,5 +129,5 @@ The *GEMMA* software was developed by: Dept. of Biostatistics<br> University of Michigan<br> -with contributions from Peter Carbonetto, Tim Flutre, Matthew Stephens, +with (early) contributions from Peter Carbonetto, Tim Flutre, Matthew Stephens, and [others](https://github.com/genetics-statistics/GEMMA/graphs/contributors). diff --git a/doc/code/pangemma.md b/doc/code/pangemma.md index 1c3fed9..113b82c 100644 --- a/doc/code/pangemma.md +++ b/doc/code/pangemma.md @@ -1,43 +1,14 @@ # PanGEMMA -We are rewriting and modernizing the much beloved GEMMA tool that is, -for example, core to GeneNetwork.org. -The idea is to upgrade the software, and keep it going using ideas -from Hanson and Sussman's book on *Software Design for Flexibility: -How to Avoid Programming Yourself into a Corner*. -This is not the first attempt, in fact quite a few efforts have -started, but none really finished! - -We want to keep the heart of GEMMA beating while upgrading the -environment taking inspiration from fetal heart development: The human -heart is one of the first organs to form and function during -embryogenesis. By the end of gestational week 3, passive oxygen -diffusion becomes insufficient to support metabolism of the developing -embryo, and thus the fetal heart becomes vital for oxygen and nutrient -distribution. The initiation of the first heart beat via the -*primitive heart tube* begins at gestational day 22, followed by -active fetal blood circulation by the end of week 4. The start of -early heart development involves several types of progenitor cells -that are derived from the mesoderm, proepicardium, and neural crest. -This eventually leads to the formation of the 4-chambered heart by -gestational week 7 via heart looping and complex cellular interactions -in utero (e.g., [Tan and -Lewandowski](https://doi.org/10.1159/000501906)). - -What we will do is create components and wire them together, allowing -for sharing RAM between components. Each component may have multiple -implementations. We will introduce a DSL for orchestration and we may -introduce a propagator network to run components in parallel and test -them for correctness. At the same time, the core functionality of -GEMMA will keep going while we swap components in and out. See also -[propagators](https://groups.csail.mit.edu/mac/users/gjs/propagators/) -and [examples](https://github.com/namin/propagators/blob/master/examples/multiple-dwelling.scm). - -Propagators are [Unix pipes on steroids](https://github.com/pjotrp/ruby-propagator-network/blob/main/README.md). - -We want PanGEMMA to be able to run on high-performance computing (HPC) -architectures, including GPUs. This implies the core project can have -few dependencies and should easily compile from C. +We are rewriting and modernizing the much beloved GEMMA tool that is core to GeneNetwork.org. The idea is to upgrade the software, and keep it going using ideas from Hanson and Sussman's book on *Software Design for Flexibility: How to Avoid Programming Yourself into a Corner*. This is not the first attempt, in fact quite a few efforts have started, but none have really finished! For some reason we painted ourselves into a corner (indeed). 
+ +Importantly, we want to keep the heart of GEMMA beating while upgrading the environment taking inspiration from fetal heart development: The human heart is one of the first organs to form and function during embryogenesis. By the end of gestational week 3, passive oxygen diffusion becomes insufficient to support metabolism of the developing embryo, and thus the fetal heart becomes vital for oxygen and nutrient distribution. The initiation of the first heart beat via the *primitive heart tube* begins at gestational day 22, followed by active fetal blood circulation by the end of week 4. The start of early heart development involves several types of progenitor cells that are derived from the mesoderm, proepicardium, and neural crest. This eventually leads to the formation of the 4-chambered heart by gestational week 7 via heart looping and complex cellular interactions in utero (e.g., [Tan and Lewandowski](https://doi.org/10.1159/000501906)). + +What we will do is create components and wire them together, allowing for sharing memory between components using Linux mmap techniques --- mostly using lmdb that is supported on non-Linux platforms. Each component may have multiple implementations. We will introduce a DSL for orchestration and we may introduce a propagator network to run components in parallel and test them for correctness. At the same time, the core functionality of GEMMA will keep going while we swap components in and out. See also [propagators](https://groups.csail.mit.edu/mac/users/gjs/propagators/) and [examples](https://github.com/namin/propagators/blob/master/examples/multiple-dwelling.scm). + +I wrote a piece with examples on propagators too. Propagators are [Unix pipes on steroids](https://github.com/pjotrp/ruby-propagator-network/blob/main/README.md). + +We want PanGEMMA to be able to run on high-performance computing (HPC) architectures, including GPUs. This implies the core project can have few dependencies and should easily compile from C. # Innovation @@ -47,49 +18,28 @@ few dependencies and should easily compile from C. ## Breaking with the past -The original gemma source base is considered stable and will be -maintained - mostly to prevent bit rot. See -https://github.com/genetics-statistics/GEMMA. To move forward we -forked pangemma to be able to break with the past. +The original gemma source base is considered stable and will be maintained - mostly to prevent bit rot. See https://github.com/genetics-statistics/GEMMA. To move forward we forked pangemma to be able to break with the past. -Even so, pangemma is supposed to be able to run the same steps as the -original gemma. And hopefully improve things. +Even so, pangemma is supposed to be able to run the same steps as the original gemma. And hopefully improve things. ## Adapting the old gemma code -For running the old gemma code we need to break up the code base into -pieces to run in a propagator network (see below). -Initially we can use the file system to pass state around. That will -break up the implicit global state that gemma carries right now and -makes working with the code rather tricky. -Note that we don't have the goal of making gemma more efficient -because people can still use the old stable code base. -Essentially we are going to add flags to the binary that will run -gemma partially by exporting and importing intermediate state. +For running the old gemma code we need to break up the code base into pieces to run in a propagator network (see below). 
Initially we can use the file system to pass state around. That will break up the implicit global state that gemma carries right now, which makes working with the existing code base rather tricky. Note that we don't have the initial goal of making gemma more efficient because people can still use the old stable code base. Essentially we are going to add flags to the binary that will run gemma partially by exporting and importing intermediate state. -Later, when things work in a propagator network, we can create -alternatives that pass state around in memory. +Later, when things work in a propagator network, we can create alternatives that pass state around in memory and make pangemma more efficient. # A simple propagator network -We will create cells that hold basic computations and functions. We -won't do a full propagator setup, though we may do a full -implementation later. -For now we use a network of cells - essentially a dependency graph of -computations. Cells can tell other cells that they require them and -that allows for alternate paths. E.g. to create a kinship matrix: +We will create cells that hold basic computations and functions. We won't do a full propagator setup, though we may do a full implementation later. For now we use a network of cells - essentially a dependency graph of computations. Cells hold state. Cells can tell other cells that they require them and that allows for alternate paths. E.g. to create a kinship matrix: ``` (define-cell genotypes) -(define-cell kinship-matrix (e:kinship genotypes) +(define-cell kinship-matrix (e:calc-kinship genotypes)) (run) (content kinship-matrix) ``` -essentially e:kinship gets run when genotypes are available. It is -kinda reversed programming. Now say we want to add an input and a -filter: - +Essentially, e:calc-kinship gets run when genotypes are available. Now say we want to add an input and a filter: ``` (define-cell bimbam-genofile) @@ -122,84 +72,44 @@ runs one path and will run both. -This simple example shows how complex running logic can be -described without (1) predicting how people will use the software and -(2) no complex if/then statements. +This simple example shows how complex running logic can be described without (1) predicting how people will use the software and (2) writing complex if/then statements. -Why does this matter for GEMMA? It will allow us to run old parts of -GEMMA as part of the network in addition to new parts - and -potentially validate them. It also allows us to create multiple -implementations, such as for CPU and GPU, that can run in parallel and -validate each other's outcomes. +Why does this matter for GEMMA? It will allow us to run old parts of GEMMA as part of the network in addition to new parts - and potentially validate them. It also allows us to create multiple implementations, such as for CPU and GPU, that can run in parallel and validate each other's outcomes. ## Create the first cells -Let's start with default GEMMA behaviour and create the first cells to -get to exporting the kinship-matrix above. +Let's start with default GEMMA behaviour and create the first cells to get to exporting the kinship-matrix above. ``` (content kinship-matrix) ``` -We'll break this down by inspecting cell levels next. The genofile -cells contain a file name (string). -When the file is read we will capture the state in a file and carry -the file name of the new file forward. -In this setup cells simply represent file names (for state). -This allows us to simply read and write files in the C code.
-Later, when we wire up new propagators we may carry this state in -memory. The whole point of using this approach is that we really don't -have to care! - -Our first simple implementation of the cell will simply contain a -string referring to a file. -Cells can be works-in-progress and incrementally improved. +We'll break this down by inspecting cell levels next. The genofile cells contain a file name (string). When the file is read we will capture the state in a file and carry the file name of the new file forward. In this setup cells simply represent file names (for state). This allows us to simply read and write files in the C code. Later, when we wire up new propagators we may carry this state in memory. The whole point of using this approach is that we really don't have to care! + +Our first simple implementation of the cell will simply contain a string referring to a file. Cells can be works-in-progress and incrementally improved. ## Create propagators -A propagator contains a list of inputs and an output cell. So we wire -up the graph by adding inputs to propagators. -Every propagator has state (too), that is, it may be idle, computing, or done. +A propagator contains a list of inputs and an output cell. So we wire up the graph by adding inputs to propagators. Every propagator has state (too), that is, it may be idle, computing, or done. ## The runner -The runner visits the list of propagators and checks whether the -inputs are complete and whether they have changed. On detecting a -change, a computation has to be triggered to update the output cell. +The runner visits the list of propagators and checks whether the inputs are complete and whether they have changed. On detecting a change, a computation has to be triggered to update the output cell. ## Setting check points in GEMMA -GEMMA is quite stateful in its original design. We want to break the -work up into chunks setting 'check points'. For example the actual -kinship multiplication could start as 'start-compute-kinship' and end -with 'compute-kinship' check points. To not have to deal with state -too much we can simply let GEMMA run from the start of the program -until 'compute-kinship' to have a kinship-propagator. The output will -be a kinship file. Similarly we can run until 'filter-genotypes' that -is a step earlier. The output of these propagators can be used by -other PanGEMMA propagators as input for comparison and continuation. -All the original GEMMA does is act as a reference for alternative -implementation of these chunks. Speed is not a concern though there -may be opportunities to start compute after some of these check-points -(using intermediate output) down the line. +GEMMA is rather stateful in its original design. We want to break the work up into chunks setting 'check points'. For example the actual kinship multiplication could start as 'start-compute-kinship' and end with 'compute-kinship' check points. To not have to deal with state too much we can simply let GEMMA run from the start of the program until 'compute-kinship' to have a kinship-propagator. The output will be a kinship file. Similarly we can run until 'filter-genotypes' that is a step earlier. The output of these propagators can be used by other PanGEMMA propagators as input for comparison and continuation. All the original GEMMA does is act as a reference for alternative implementation of these chunks. 
Speed is not a concern, though there may be opportunities to start computing after some of these check-points (using intermediate output) down the line. So, let's start with a first check point implementation for 'read-bimbam-file'. ## read-bimbam-file -Reading the bimbam file happens in the `ReadFile_bim' function in -`gemma_io.cpp'. Of course all it does is read a file - which is the -same as any output. But just for the sake of a simple pilot we'll add -the check point at the end of the function that will exit GEMMA. -We'll add a CLI switch `-checkpoint read-geno-file' which will force the exit. +Reading the bimbam file happens in the `ReadFile_bim' function in `gemma_io.cpp'. Of course all it does is read a file - which is the same as any output. But just for the sake of a simple pilot we'll add the check point at the end of the function that will exit GEMMA. We'll add a CLI switch `-checkpoint read-geno-file' which will force the exit. ```C++ checkpoint("read-geno-file",file_geno); ``` -It passes in the outputfile (the infile in this case), that is used to -feed the calling propagator. Some of the outfiles may be composed of -multiple outputs - in that case we may add filenames. And exits with: - +It passes in the outputfile (the infile in this case), which is used to feed the calling propagator. Some of the outfiles may be composed of multiple outputs - in that case we may add filenames. And exits with: ``` **** Checkpoint reached: read-geno-file (normal exit) ``` @@ -238,46 +148,71 @@ It should be with a propagator infrastructure. The fun part is that both paths c ## Adding some propagator plumbing -I am highly resistant to writing C++ code. I probably have wrote most of lines of code in my life in C++, so I should know ;) -As we are still in the proof-of-concept stage I am going to use Ruby for the first plumbing (I do like that language, despite its warts) and maybe throw some scheme in for writing the new lmdb support. One fun side of propagators is that I don't need to worry too much about languages as people can trivially update any propagator in any way they like. We'll replace the Ruby propnet engine into something that can handle parallelism, e.g., when we want to start using actors to run computations. +I have become highly resistant to writing C++ code. I have probably written most of the lines of code in my life in C++, so I should know ;)
+As we are still in the proof-of-concept stage I may use Ruby for the first plumbing (I do like that language, despite its warts) and maybe throw some scheme in for writing the new lmdb support. One fun side of propagators is that I don't need to worry too much about languages as people can trivially update any propagator in any way they like. We'll replace the Ruby propnet engine with something that can handle parallelism, e.g., when we want to start using actors to run computations. Alright, to duplicate the above filtering routine we are going to add an alternative path for GEMMA's read-geno-file, splitting the two tasks into two steps: read-geno-file (with filtering) -> list-of-filtered-genotypes rb-read-geno-file -> rb-filter-geno-file -> '' -'read-geno-file' translates to GEMMA's read-geno-file and rb-read-geno-file is the alternate path. +'read-geno-file' translates to GEMMA's read-geno-file (maybe I should name it gemma-read-geno-file) and rb-read-geno-file is the alternate path.
list-of-filtered-genotypes will be a file that we can compare between the two paths. These steps should be so simple that anyone can replace them with, say, a Python version. So, what can go wrong? +# Adding our first converter + +We recently introduced an lmdb version of the genotype file. This new genotype format stores both normal and transposed forms of the genotype matrix, so they can be read directly from disk in an optimal layout. lmdb is memory mapped and can give great performance. # Other ## Example -I created a very minimalistic example in Ruby with a simple -round-robin scheduler: - -=> https://github.com/pjotrp/ruby-propagator-network/blob/main/propagator.rb +I created minimalistic examples in Ruby with 0MQ and a [simple round-robin scheduler](https://github.com/pjotrp/ruby-propagator-network/blob/main/). ## Running tasks in parallel -In principle propnets make it trivially easy to run tasks in parallel. -When inputs are complete the propagator goes into a -'compute' state and a process can be run (forked or on PBS) that -executes a command. The output file can be picked up to make sure the -propagator completes and switches to the 'done' state. Note that -actors and goblins would be perfect parallel companions to propagators -with the potential advantage of keeping results in RAM. - -We are not going there right now, but it is one of the important -reasons to build this setup. - -## Why guile and not ruby or python? - -Above example in Ruby is rather nice and we can build on that initially. -Ruby's multithreading capabilities (and that of Python), however, are -hampered by the layout of the interpreters and modules that can not be -re-entered. Even with the slow disappearance of the global interpreter -lock (GIL) these languages don't do great in that area. You can work -around it by using copy-on-write processes (like I did with the really -fast bioruby-vcf), but that gets clunky fast. +In principle propnets make it trivially easy to run tasks in parallel. When inputs are complete the propagator goes into a 'compute' state and a process can be run (forked or on PBS) that executes a command. The output file can be picked up to make sure the propagator completes and switches to the 'done' state. Note that actors and goblins would be perfect parallel companions to propagators with the potential advantage of keeping results in RAM. We are not going there right now, but it is one of the important reasons to build this setup. + +## Why guile and not Ruby or Python? + +The above example in Ruby is rather nice and we can build on that initially. Ruby's multithreading capabilities (and those of Python), however, are hampered by the layout of the interpreters and modules that cannot be re-entered. Even with the slow disappearance of the global interpreter lock (GIL) these languages don't do great in that area. You can work around it by using copy-on-write processes (like I did with the really fast bioruby-vcf), but that gets clunky fast. + +The main reason to choose Guile over Ruby (or Python) is the REPL. The debugging experience of the Lisp REPL is hard to beat. What we want is GEMMA as a resident 'server' that we can develop against. + +What we want to do in development/debugging is fire up GEMMA so it starts listening. Not exactly GEMMA as a server, but still responding to our requests. Next we want to go into a continuous cycle where we change some data and get a result. Then we want to be able to inspect the result and either go back or continue with the next computation.
This REPL-centric programming is not only addictive as a developer experience, but also highly efficient in terms of developer time. + +To make software listen interactively we need to be able to update it 'in place'. With Lisp code this is straightforward because partial code can get re-evaluated or recompiled on-the-fly. +With C++ code this is only possible if we make it listen to a socket and restart the socket server. And here +we are talking message passing! + +## Why Ruby and not guile? + +OK, I have a confession to make. Reading Lisp code does not come that naturally to me. I find writing Lisp easier than reading Lisp(!) Maybe it is because I have been reading Algol-type languages all my life, or maybe because Ruby is just a better fit to my brain. While Lisp is definitely the greater language, I just find it harder to parse. Even my own code! I met Matz once and he told me that Ruby was known as Matz's own Lisp. So, there we are. I need the Ruby syntax oddities as little helpers to make my brain disentangle code. + +## And again: why guile and not Ruby? + +Ruby's REPL is -- unfortunately -- not that useful. + +## The art of message passing + +Messaging to the rescue. If all code responds to messages we can create and destroy servers at will. No matter what language they are written in. That also ties in with propagator networks. A propagator cell is fired up and checks its inputs. Based on the inputs it does a (partial) computation. When the computation is completed it switches to a 'done' state. The done state signals downstream cells to start their computations using the cell values as inputs. + +In a REPL we can evaluate these cells. That is where Lisp comes in. But as these cells have a transparent messaging mechanism we should be able to use them in any language. That means we have a choice of languages to develop the full framework. + +## ZeroMQ (0MQ) + +ZeroMQ provides a minimalistic message passing interface in most languages. It will allow us to create cells and wire up a propagator network. + +## Sharing data in RAM + +There are several methods for sharing data between threads and processes. First we can use Linux copy-on-write for forked processes. The Linux kernel also has shared memory support and, more interestingly, memory mapping support --- which is also used in lmdb. Lmdb is multi-platform and works on non-POSIX systems. It is very fast, but may need tweaking, for example by disabling read-ahead with MDB_NORDAHEAD. We'll get to that. + +## Build system + +The build system for development relies on Guix. See the header of [guix.scm](../guix.scm). +Guix handles dependencies well, so it is a great base to work from. + +Build systems have to target different platforms. pangemma will mostly be run on Linux on AMD64, but there are always people who want more, and that requires support from the build system. Also we may need targets for debug and architecture-optimized builds. Some of that may be handled through Guix. + +For the actual build system we may roll our own. +All build tools I know are too messy, including CMake, meson and make itself. So, as a real hacker, we +can reinvent the wheel. One starting point is Arun's [g-exp blog](https://systemreboot.net/post/g-expressions-makings-of-a-make-killer), though it would require a non-Guix way of building things.
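To make the 'Create propagators' and 'The runner' sections above a bit more concrete, here is a minimal C++ sketch of the idea: cells hold a file name as state, a propagator has input cells, an output cell and an idle/computing/done state, and a round-robin runner fires any propagator whose inputs are complete. The names (`Cell`, `Propagator`, `run_network`) are purely illustrative and not part of the current sources; the real plumbing may well end up in Ruby or Guile as discussed above.

```C++
// Sketch only: cells hold a file name, propagators connect them, and a
// round-robin runner fires propagators whose inputs are complete.
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Cell {                        // state is just a file name for now
  std::string name;
  std::optional<std::string> value;  // empty until some propagator fills it
};

enum class State { Idle, Computing, Done };

struct Propagator {
  std::vector<Cell *> inputs;        // cells this propagator depends on
  Cell *output;                      // cell it fills in
  std::function<std::string()> compute;
  State state = State::Idle;

  bool ready() const {
    for (auto c : inputs)
      if (!c->value) return false;
    return true;
  }
};

// Visit the propagators until nothing changes any more.
void run_network(std::vector<Propagator> &net) {
  bool progress = true;
  while (progress) {
    progress = false;
    for (auto &p : net) {
      if (p.state == State::Idle && p.ready()) {
        p.state = State::Computing;
        p.output->value = p.compute();  // e.g. run gemma up to a check point
        p.state = State::Done;
        progress = true;
      }
    }
  }
}

int main() {
  Cell genotypes{"genotypes", "example/mouse_hs1940.geno.txt.gz"};
  Cell kinship{"kinship-matrix", std::nullopt};
  std::vector<Propagator> net;
  net.push_back({{&genotypes}, &kinship, [] {
                   // here we would shell out to 'gemma -gk' and return the output file
                   return std::string("output/mouse_hs1940.cXX.txt");
                 }});
  run_network(net);
  std::cout << kinship.name << " -> " << *kinship.value << std::endl;
}
```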
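The message passing idea needs very little code to try out. As a rough illustration only (not part of the current sources), a component could sit on a 0MQ REP socket and echo requests, in the spirit of the my-ping test binding in gemma_api.cpp; the endpoint below is made up.

```C++
// Toy 0MQ 'cell': a REP socket that echoes whatever request it receives.
// Illustration only; build against libzmq with -lzmq.
#include <zmq.h>
#include <cstdio>

int main() {
  void *ctx = zmq_ctx_new();
  void *sock = zmq_socket(ctx, ZMQ_REP);             // reply socket
  zmq_bind(sock, "tcp://127.0.0.1:5555");            // hypothetical endpoint

  char buf[256];
  int n = zmq_recv(sock, buf, sizeof(buf) - 1, 0);   // wait for one request
  if (n >= 0) {
    if (n > (int)sizeof(buf) - 1) n = sizeof(buf) - 1;  // message may be truncated
    buf[n] = '\0';
    printf("got message: %s\n", buf);
    zmq_send(sock, buf, n, 0);                       // echo it back ("ping" -> "ping")
  }

  zmq_close(sock);
  zmq_ctx_destroy(ctx);
  return 0;
}
```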
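Similarly, the 'Sharing data in RAM' notes above (lmdb, memory mapping, MDB_NORDAHEAD) translate to something like the following read-only sketch using the plain lmdb C API, which premake5.lua already links against. The database path and key are hypothetical and error handling is kept minimal.

```C++
// Sketch: open an lmdb environment read-only with read-ahead disabled and
// fetch one value. Path and key are made up for illustration.
#include <lmdb.h>
#include <cstdio>
#include <cstring>

int main() {
  MDB_env *env;
  mdb_env_create(&env);
  mdb_env_set_mapsize(env, 1UL << 30);               // 1 GiB map
  int rc = mdb_env_open(env, "output/mouse_hs1940.geno.mdb",
                        MDB_RDONLY | MDB_NORDAHEAD | MDB_NOSUBDIR, 0664);
  if (rc) { fprintf(stderr, "mdb_env_open: %s\n", mdb_strerror(rc)); return 1; }

  MDB_txn *txn;
  MDB_dbi dbi;
  mdb_txn_begin(env, nullptr, MDB_RDONLY, &txn);
  mdb_dbi_open(txn, nullptr, 0, &dbi);               // default (unnamed) database

  MDB_val key, data;
  key.mv_data = (void *)"rs3683945";                 // hypothetical SNP id as key
  key.mv_size = strlen("rs3683945");
  rc = mdb_get(txn, dbi, &key, &data);
  if (rc == 0)
    printf("value of %zu bytes mapped straight from disk\n", data.mv_size);
  else
    printf("lookup: %s\n", mdb_strerror(rc));

  mdb_txn_abort(txn);                                // read-only txn: just abort
  mdb_env_close(env);
  return 0;
}
```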
diff --git a/guix.scm b/guix.scm deleted file mode 100644 index c9c0410..0000000 --- a/guix.scm +++ /dev/null @@ -1,87 +0,0 @@ -;; To use this file to build HEAD of gemma: -;; -;; guix build -f guix.scm -;; -;; To get a development container (e.g., run in emacs shell). -;; -;; guix shell -C -D -f guix.scm -;; - -(use-modules - ((guix licenses) #:prefix license:) - (guix gexp) - (guix packages) - (guix git-download) - (guix build-system gnu) - (gnu packages algebra) - (gnu packages base) - (gnu packages compression) - ;; (gnu packages bioinformatics) - (gnu packages build-tools) - (gnu packages check) - ;; (gnu packages curl) - (gnu packages gdb) - (gnu packages guile) - ;; (gnu packages llvm) - (gnu packages maths) - ;; (gnu packages ninja) - ;; (gnu packages parallel) - (gnu packages perl) - ;; (gnu packages perl6) - ;; (gnu packages ruby) - (gnu packages pkg-config) - ;; (pjotr packages openblas) ;; we use this for the static builds - ;; (gnu packages shell) ;; for shunit2 - (srfi srfi-1) - (ice-9 popen) - (ice-9 rdelim)) - -(define %source-dir (dirname (current-filename))) - -(define %git-commit - (read-string (open-pipe "git show HEAD | head -1 | cut -d ' ' -f 2" OPEN_READ))) - -(define %gemma-version - (read-string (open-pipe "cat VERSION" OPEN_READ))) - -(define-public gemma-git - (package - (name "gemma-git") - (version (git-version "0.98.5" "HEAD" %git-commit)) - (source (local-file %source-dir #:recursive? #t)) - (build-system gnu-build-system) - (inputs - (list catch2 - gdb - gsl - openblas - zlib)) - ;; ("gsl-static" ,gsl-static) - ;; ("zlib:static" ,zlib "static") - (propagated-inputs - (list guile-3.0)) - (native-inputs ; for running tests - (list perl which)) - (arguments - `(#:phases - (modify-phases %standard-phases - (delete 'configure) - (delete 'validate-runpath) - (add-before 'build 'bin-mkdir - (lambda _ - (mkdir-p "bin") - )) - (replace 'install - (lambda* (#:key outputs #:allow-other-keys) - (let ((out (assoc-ref outputs "out"))) - (install-file "bin/gemma" (string-append out "/bin")))))) - #:tests? #t - #:parallel-tests? #f)) - (home-page "https://github.com/genetics-statistics") - (synopsis "Tool for genome-wide efficient mixed model association") - (description "Genome-wide Efficient Mixed Model Association (GEMMA) -provides a standard linear mixed model resolver with application in -genome-wide association studies (GWAS).") - (license license:gpl3))) - -gemma-git diff --git a/guix/guix.scm b/guix/guix.scm new file mode 100644 index 0000000..d8d593b --- /dev/null +++ b/guix/guix.scm @@ -0,0 +1,256 @@ +;; To use this file to build HEAD of gemma: +;; +;; guix build -f guix/guix.scm # default builds pangemma-git +;; +;; To get a development container (e.g., run in emacs shell). +;; +;; guix shell -C -D -F -f guix/guix.scm # pangemma-shell-git +;; +;; see premake5.lua for build/test instructions +;; +;; optimized for arch: +;; +;; guix shell --tune=native -C -D -F # pangemma-shell-git +;; +;; see premake5.lua header for examples. +;; +;; To optimize use guix --tune=march-type (e.g. 
--tune=native) + +(define-module (guix) + #:use-module ((guix licenses) #:prefix license:) + #:use-module (guix gexp) + #:use-module (guix packages) + #:use-module (guix git-download) + #:use-module (guix build-system gnu) + #:use-module (guix utils) + + #:use-module (gnu packages algebra) + #:use-module (gnu packages base) + #:use-module (gnu packages build-tools) + #:use-module (gnu packages compression) + #:use-module (gnu packages commencement) + #:use-module (gnu packages check) + #:use-module (gnu packages cpp) + #:use-module (gnu packages databases) + #:use-module (gnu packages gcc) + #:use-module (gnu packages gdb) + #:use-module (gnu packages guile) + #:use-module (gnu packages guile-xyz) + #:use-module (gnu packages maths) + #:use-module (gnu packages ninja) + #:use-module (gnu packages perl) + #:use-module (gnu packages pkg-config) + #:use-module (gnu packages ruby) + #:use-module (gnu packages time) + #:use-module (srfi srfi-1) + #:use-module (ice-9 popen) + #:use-module (ice-9 rdelim)) + +(define %source-dir (dirname (current-filename))) + +(define %git-commit + (read-string (open-pipe "git describe --always --tags --long|tr -d $'\n'" OPEN_READ))) + +(define %pangemma-version + (read-string (open-pipe "cat VERSION|tr -d $'\n'" OPEN_READ))) + +(define-public openblas-pangemma +;; we are fixating on an older openblas, for now + (package + (name "openblas-pangemma") + (version "0.3.21") + (source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/xianyi/OpenBLAS") + (commit (string-append "v" version)))) + (file-name (git-file-name name version)) + (sha256 + (base32 + "0yx1axiki12y0xz0d5s76vvl7ds36k0npv1sww08k2qslhz1g9qp")))) + (build-system gnu-build-system) + (properties `((tunable? . #t))) + (arguments + (list + #:tests? #f ;; skip tests + #:test-target "test" + ;; No default baseline is supplied for powerpc-linux. + #:substitutable? (not (target-ppc32?)) + #:make-flags + #~(list (string-append "PREFIX=" #$output) + (string-append "CFLAGS=-O3 -g -Wno-incompatible-pointer-types -Wno-error=implicit-function-declaration") + "COPT=" + "COMMON_OPT=" + "DYNAMIC_ARCH=" + "SHELL=bash" + "MAKE_NB_JOBS=0" ;use jobserver for submakes + + ;; This is the maximum number of threads OpenBLAS will ever use (that + ;; is, if $OPENBLAS_NUM_THREADS is greater than that, then NUM_THREADS + ;; is used.) If we don't set it, the makefile sets it to the number + ;; of cores of the build machine, which is obviously wrong. + "NUM_THREADS=128" + + ;; DYNAMIC_ARCH is only supported on some architectures. + ;; DYNAMIC_ARCH combined with TARGET=GENERIC provides a library + ;; which uses the optimizations for the detected CPU. This can + ;; be overridden at runtime with the environment variable + ;; OPENBLAS_CORETYPE=<type>, where "type" is a supported CPU + ;; type. On other architectures we target only the baseline CPU + ;; supported by Guix. + #$@(cond + ((or (target-x86-64?) + (target-x86-32?) + (target-ppc64le?) + (target-aarch64?)) + ;; Dynamic older enables a few extra CPU architectures + ;; on x86_64 that were released before 2010. + '("DYNAMIC_ARCH=1" "TARGET=GENERIC")) + ;; '("DYNAMIC_ARCH=" "TARGET_CORE=ZEN")) + ;; On some of these architectures the CPU type can't be detected. + ;; We list the oldest CPU core we want to have support for. + ;; On MIPS we force the "SICORTEX" TARGET, as for the other + ;; two available MIPS targets special extended instructions + ;; for Loongson cores are used. + ((target-mips64el?) + '("TARGET=SICORTEX")) + ((target-arm32?) 
+ '("TARGET=ARMV7")) + ((target-riscv64?) + '("TARGET=RISCV64_GENERIC")) + (else '()))) + ;; no configure script + #:phases + #~(modify-phases %standard-phases + (delete 'configure) + (add-before 'build 'set-extralib + (lambda* (#:key inputs #:allow-other-keys) + ;; Get libgfortran found when building in utest. + (setenv "FEXTRALIB" + (string-append + "-L" + (dirname + (search-input-file inputs "/lib/libgfortran.so"))))))))) + (inputs + (list `(,gfortran "lib"))) + (native-inputs + (list cunit gfortran perl)) + (home-page "https://www.openblas.net/") + (synopsis "Optimized BLAS library based on GotoBLAS") + (description + "OpenBLAS is a BLAS library forked from the GotoBLAS2-1.13 BSD version.") + (license license:bsd-3))) + +(define-public pangemma-base-git + "Pangemma base build package" + (package + (name "pangemma-git") + (version (git-version %pangemma-version "HEAD" %git-commit)) + (source (local-file %source-dir #:recursive? #t)) + (build-system gnu-build-system) + (inputs + (list gsl + openblas-pangemma + guile-3.0 + `(,guile-3.0 "debug") + ;; `(,guile-3.0 "dev") + guile-lmdb + lmdb + lmdbxx + pkg-config + ;; ninja + ;; ruby + time + zlib)) + ;; (propagated-inputs + ;; (list + ;; `("guile" ,guile-3.0-latest) + ;; `("guile-debug" ,guile-3.0-latest "debug") + ;; `("guile" ,guile-3.0-latest "dev"))) + + ;; ("gsl-static" ,gsl-static) + ;; ("zlib:static" ,zlib "static") + (arguments + `(#:phases + (modify-phases %standard-phases + (delete 'configure) + (delete 'validate-runpath) + (add-before 'build 'bin-mkdir + (lambda _ + (mkdir-p "bin") + )) + (replace 'install + (lambda* (#:key outputs #:allow-other-keys) + (let ((out (assoc-ref outputs "out"))) + (install-file "bin/gemma" (string-append out "/bin")))))) + #:tests? #t + #:parallel-tests? #f)) + (home-page "https://git.genenetwork.org/pangemma/") + (synopsis "Tool for genome-wide efficient mixed model association") + (description "New version of Genome-wide Efficient Mixed Model Association (PANGEMMA) +provides a standard linear mixed model resolver with application in +genome-wide association studies (GWAS).") + (license license:gpl3))) + +(define-public pangemma-shell-git + "Shell version for development" + (package + (inherit pangemma-base-git) + (name "pangemma-shell-git") + (build-system gnu-build-system) + (propagated-inputs + (modify-inputs (package-inputs pangemma-base-git) + (append which binutils coreutils gcc-toolchain premake5 gnu-make gdb gperftools ;; for the shell + ))) + (arguments + `(#:phases (modify-phases %standard-phases + (delete 'configure) + (delete 'build) + (delete 'package) + (delete 'check) + (delete 'install)))) + (description "Pangemma shell for development") + )) + +;; ---- legacy build ----------------------------------------------------------------- +(define-public gemma-git + "Original legacy gemma -- for as long as it compiles" + (package + (name "gemma-git") + (version (git-version %pangemma-version "HEAD" %git-commit)) + (source (local-file %source-dir #:recursive? 
#t)) + (build-system gnu-build-system) + (inputs + (list catch2 + gdb + gsl + openblas-pangemma + zlib)) + ;; ("gsl-static" ,gsl-static) + ;; ("zlib:static" ,zlib "static") + (native-inputs ; for running tests + (list perl which)) + (arguments + `(#:phases + (modify-phases %standard-phases + (delete 'configure) + (delete 'validate-runpath) + (add-before 'build 'bin-mkdir + (lambda _ + (mkdir-p "bin") + )) + (replace 'install + (lambda* (#:key outputs #:allow-other-keys) + (let ((out (assoc-ref outputs "out"))) + (install-file "bin/gemma" (string-append out "/bin")))))) + #:tests? #t + #:parallel-tests? #f)) + (home-page "https://github.com/genetics-statistics") + (synopsis "Tool for genome-wide efficient mixed model association") + (description "Genome-wide Efficient Mixed Model Association (GEMMA) +provides a standard linear mixed model resolver with application in +genome-wide association studies (GWAS).") + (license license:gpl3))) + +pangemma-shell-git diff --git a/premake5.lua b/premake5.lua new file mode 100644 index 0000000..9229800 --- /dev/null +++ b/premake5.lua @@ -0,0 +1,78 @@ +-- Build with +-- +-- make clean && rm build -rf +-- premake5 gmake && make verbose=1 gemmalib -j 8 +-- +-- Including bin +-- +-- premake5 gmake && make verbose=1 config=debug gemma -j 8 && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./test/runner +-- +-- Or +-- +-- premake5 gmake && make verbose=1 config=release -j 8 gemma && LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma +-- +-- Run +-- +-- LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Debug/gemma +-- +-- Next we start using the API from guile with +-- +-- env LD_LIBRARY_PATH=./build/bin/Debug/:$GUIX_ENVIRONMENT/lib guile +-- (load-extension "libgemmalib" "init_module") + +local pkg_cpp_flags = os.outputof("pkg-config --cflags openblas guile-3.0 gsl zlib") +local pkg_linker_flags = os.outputof("pkg-config --libs openblas guile-3.0 gsl zlib") + +workspace "PanGemma" + configurations { "Debug", "Release" } + + project "gemmalib" -- library for interactive development + kind "SharedLib" + defines { "OPENBLAS" } + language "C++" + objdir "build/" + targetdir "build/bin/%{cfg.buildcfg}" + + files { "src/*.h src/*.c src/**.hpp", "src/**.cpp" } + removefiles { "src/main.cpp" } + includedirs { "src/" } + + filter "configurations:Debug" + defines { "DEBUG" } + buildoptions { pkg_cpp_flags } + linkoptions { pkg_linker_flags } + symbols "On" + + filter "configurations:Release" + defines { "NDEBUG", "HAVE_INLINE" } + buildoptions { pkg_cpp_flags } + linkoptions { pkg_linker_flags } + buildoptions { "-pthread", "-Wall" } + optimize "Speed" + +project "gemma" + kind "ConsoleApp" + defines { "OPENBLAS" } + language "C++" + buildoptions { "-Wfatal-errors" } + objdir "build/" + targetdir "build/bin/%{cfg.buildcfg}" + + files { "src/*.h src/*.c src/**.hpp", "src/**.cpp" } + removefiles { "src/gemma_api.cpp" } + includedirs { "src/" } + links { "lmdb" } + + filter "configurations:Debug" + defines { "DEBUG" } + buildoptions { pkg_cpp_flags } + linkoptions { pkg_linker_flags } + links { "profiler" } + symbols "On" + + filter "configurations:Release" + defines { "NDEBUG", "HAVE_INLINE" } + buildoptions { "-pthread", "-Wall" } + buildoptions { pkg_cpp_flags } + linkoptions { pkg_linker_flags } + optimize "Speed" diff --git a/scripts/gen_version_info.cmd b/scripts/gen_version_info.cmd deleted file mode 100644 index d824687..0000000 --- a/scripts/gen_version_info.cmd +++ /dev/null @@ -1,16 +0,0 @@ -@echo off -rem 
https://stackoverflow.com/questions/3472631/how-do-i-get-the-day-month-and-year-from-a-windows-cmd-exe-script -FOR /F "skip=1 tokens=1-6" %%A IN ('WMIC Path Win32_LocalTime Get Day^,Hour^,Minute^,Month^,Second^,Year /Format:table') DO ( - if "%%B" NEQ "" ( - SET /A FDATE=%%F*10000+%%D*100+%%A - ) -) -set year=%FDATE:~0,4% -set /p version=<VERSION - -echo // version.h generated by GEMMA -rem https://stackoverflow.com/questions/7105433/windows-batch-echo-without-new-line -echo|set /p="#define GEMMA_VERSION "" -echo %version%" -echo #define GEMMA_DATE "%FDATE:~0,8%" -echo #define GEMMA_YEAR "%year%" diff --git a/scripts/gen_version_info.sh b/scripts/gen_version_info.sh deleted file mode 100755 index 9be81c6..0000000 --- a/scripts/gen_version_info.sh +++ /dev/null @@ -1,14 +0,0 @@ -#! /bin/bash -# -# Script to generate the version info of GEMMA and its environment -# in ./src/version.h - -DATE=$(date "+%Y-%m-%d") -YEAR=$(date "+%Y") -PROFILE=$1 - -echo // version.h generated by GEMMA $0 -echo \#define GEMMA_VERSION \"$(cat ./VERSION)\" -echo \#define GEMMA_DATE \"$DATE\" -echo \#define GEMMA_YEAR \"$YEAR\" -echo \#define GEMMA_PROFILE \"$PROFILE\" diff --git a/src/checkpoint.h b/src/checkpoint.h index c2457d9..05fd836 100644 --- a/src/checkpoint.h +++ b/src/checkpoint.h @@ -1,7 +1,7 @@ /* Checkpoints for pangemma propagators - Copyright © 2015, Pjotr Prins + Copyright © 2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,6 +26,14 @@ using namespace std; extern string checkpoint_name; +/* + Checkpoint functions display a message in debug mode when + reached. If the (unique) msg (tag) is equal to the global + checkpoint_name the program will stop. The idea is that we can + stop computation at any checkpoint. Later we can write the code to + restart at a checkpoint. 
+ */ + void checkpoint_run(string msg, string filename, string srcfilename, int line, string funcname); #define checkpoint(msg, fname) \ diff --git a/src/debug.cpp b/src/debug.cpp index 0aa4bfc..6cefcc7 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -139,6 +139,7 @@ inline int fedisableexcept(unsigned int excepts) #endif +/* void enable_segfpe() { if (!is_fpe_check_mode() || is_legacy_mode()) return; #ifdef __GNUC__ @@ -159,6 +160,9 @@ void disable_segfpe() { #endif #endif } +*/ + +#ifndef NDEBUG void write(const char *s, const char *msg) { if (!is_debug_data_mode() && !is_debug_dump_mode()) return; @@ -230,6 +234,8 @@ void write(const gsl_matrix *m, const char *msg) { cout << "}" << endl; } +#endif // NDEBUG + /* Helper function to make sure gsl allocations do their job because gsl_matrix_alloc does not initiatize values (behaviour that changed diff --git a/src/debug.h b/src/debug.h index 0489a81..a32bfd2 100644 --- a/src/debug.h +++ b/src/debug.h @@ -60,11 +60,22 @@ void disable_segfpe(); { auto x = m * n; \ enforce_msg(x / m == n, "multiply integer overflow"); } +#ifndef NDEBUG + void write(const double d, const char *msg = ""); void write(const char *s, const char *msg = ""); void write(const gsl_vector *v, const char *msg = ""); void write(const gsl_matrix *m, const char *msg = ""); +#else // NDEBUG + +inline void write(const double d, const char *msg = "") {}; +inline void write(const char *s, const char *msg = "") {}; +inline void write(const gsl_vector *v, const char *msg = "") {}; +inline void write(const gsl_matrix *m, const char *msg = "") {}; + +#endif // NDEBUG + gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols); int gsl_matrix_safe_memcpy (gsl_matrix *dest, const gsl_matrix *src); void gsl_matrix_safe_free (gsl_matrix *v); diff --git a/src/gemma.cpp b/src/gemma.cpp index 6c2b3f4..a39d67a 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -62,6 +62,7 @@ extern "C" { #include "vc.h" #include "debug.h" #include "version.h" +#include <guile_api.h> using namespace std; @@ -81,10 +82,12 @@ void gemma_gsl_error_handler (const char * reason, #include <openblas_config.h> #endif +using namespace gemmaguile; + void GEMMA::PrintHeader(void) { cout << - "GEMMA forked executable --- part of PanGEMMA " << version << " (" << date << ") by Xiang Zhou, Pjotr Prins and team (C) 2012-" << year << endl; + "Pangemma " << version << " --- GEMMA 0.98.5 compatible executable (" << date << ") with" << OPENBLAS_VERSION << "and guile " << global_guile_version() << endl << "by Xiang Zhou, Pjotr Prins and team (C) 2012-" << year << endl ; return; } @@ -1653,7 +1656,7 @@ void GEMMA::BatchRun(PARAM &cPar) { clock_t time_begin, time_start; time_begin = clock(); - if (is_check_mode()) enable_segfpe(); // fast NaN checking by default + // if (is_check_mode()) enable_segfpe(); // fast NaN checking by default // Read Files. cout << "Reading Files ... " << endl; @@ -1908,32 +1911,52 @@ void GEMMA::BatchRun(PARAM &cPar) { } // Generate Kinship matrix (optionally using LOCO) - if (cPar.a_mode == M_KIN || cPar.a_mode == M_KIN2) { + // if (cPar.a_mode == M_KIN || cPar.a_mode == M_KIN2) { + if (is_kinship_compute(cPar.a_mode)) { cout << "Calculating Relatedness Matrix ... 
" << endl; - gsl_matrix *G = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total); - enforce_msg(G, "allocate G"); // just to be sure + if (cPar.is_mdb) { + time_start = clock(); + auto K = cPar.MdbCalcKin(); + cPar.time_G = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + if (cPar.a_mode == M_KIN) { + cPar.WriteMatrix(K, "cXX"); + } else { // M_KIN2 + cPar.WriteMatrix(K, "sXX"); + } + + gsl_matrix_safe_free(K); + return; + } + + enforce(cPar.ni_total > 0); + gsl_matrix *K = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total); + enforce_msg(K, "allocate G"); // just to be sure time_start = clock(); - cPar.CalcKin(G); + if (cPar.error == true) { + cout << "error! failed to prepare for calculating relatedness matrix. " << endl; + return; + } + cPar.CalcKin(K); cPar.time_G = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); if (cPar.error == true) { - cout << "error! fail to calculate relatedness matrix. " << endl; + cout << "error! failed to calculate relatedness matrix. " << endl; return; } // Now we have the Kinship matrix test it - validate_K(G); + validate_K(K); if (cPar.a_mode == M_KIN) { - cPar.WriteMatrix(G, "cXX"); + cPar.WriteMatrix(K, "cXX"); } else { // M_KIN2 - cPar.WriteMatrix(G, "sXX"); + cPar.WriteMatrix(K, "sXX"); } - gsl_matrix_safe_free(G); + gsl_matrix_safe_free(K); } // Compute the LDSC weights (not implemented yet) @@ -1943,7 +1966,7 @@ void GEMMA::BatchRun(PARAM &cPar) { VARCOV cVarcov; cVarcov.CopyFromParam(cPar); - if (is_check_mode()) disable_segfpe(); // disable fast NaN checking for now + // if (is_check_mode()) disable_segfpe(); // disable fast NaN checking for now if (!cPar.file_bfile.empty()) { cVarcov.AnalyzePlink(); @@ -2058,7 +2081,7 @@ void GEMMA::BatchRun(PARAM &cPar) { VARCOV cVarcov; cVarcov.CopyFromParam(cPar); - if (is_check_mode()) disable_segfpe(); // fast NaN checking for now + // if (is_check_mode()) disable_segfpe(); // fast NaN checking for now if (!cPar.file_bfile.empty()) { cVarcov.AnalyzePlink(); @@ -2569,19 +2592,23 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.a_mode == M_LMM4 || cPar.a_mode == M_LMM5 || cPar.a_mode == M_LMM9 || cPar.a_mode == M_EIGEN) { // Fit LMM or mvLMM or eigen write(cPar.a_mode, "Mode"); - gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph); + auto n_ph = cPar.n_ph; // # of trait values + auto ni_test = cPar.ni_test; // # of individuals to test + enforce(n_ph > 0); + enforce(ni_test > 0); + gsl_matrix *Y = gsl_matrix_safe_alloc(ni_test, n_ph); // Y is ind x phenotypes enforce_msg(Y, "allocate Y"); // just to be sure - gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt); - gsl_matrix *B = gsl_matrix_safe_alloc(Y->size2, W->size2); // B is a d by c - // matrix - gsl_matrix *se_B = gsl_matrix_safe_alloc(Y->size2, W->size2); - gsl_matrix *G = gsl_matrix_safe_alloc(Y->size1, Y->size1); - gsl_matrix *U = gsl_matrix_safe_alloc(Y->size1, Y->size1); - gsl_matrix *UtW = gsl_matrix_calloc(Y->size1, W->size2); - gsl_matrix *UtY = gsl_matrix_calloc(Y->size1, Y->size2); - gsl_vector *eval = gsl_vector_calloc(Y->size1); - gsl_vector *env = gsl_vector_safe_alloc(Y->size1); - gsl_vector *weight = gsl_vector_safe_alloc(Y->size1); + gsl_matrix *W = gsl_matrix_safe_alloc(ni_test, cPar.n_cvt); // W is ind x covariates + gsl_matrix *B = gsl_matrix_safe_alloc(n_ph, W->size2); // B is a d by c (effect size) + gsl_matrix *se_B = gsl_matrix_safe_alloc(n_ph, W->size2); // se + enforce(ni_test > 0); + gsl_matrix *K = gsl_matrix_safe_alloc(ni_test, ni_test); // Kinship aka as G/GRM in the equation ind x ind + 
gsl_matrix *U = gsl_matrix_safe_alloc(ni_test, ni_test); + gsl_matrix *UtW = gsl_matrix_calloc(ni_test, W->size2); // Transformed ind x covariates + gsl_matrix *UtY = gsl_matrix_calloc(ni_test, n_ph); // Transformed ind x phenotypes + gsl_vector *eval = gsl_vector_calloc(ni_test); // eigen values + gsl_vector *env = gsl_vector_safe_alloc(ni_test); // E environment + gsl_vector *weight = gsl_vector_safe_alloc(ni_test); // weights debug_msg("Started on LMM"); assert_issue(is_issue(26), UtY->data[0] == 0.0); @@ -2592,38 +2619,38 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.CopyGxe(env); } - // read relatedness matrix G + // read relatedness matrix K if (!(cPar.file_kin).empty()) { ReadFile_kin(cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, - cPar.k_mode, cPar.error, G); + cPar.k_mode, cPar.error, K); debug_msg("Read K/GRM file"); if (cPar.error == true) { cout << "error! fail to read kinship/relatedness file. " << endl; return; } - // center matrix G - CenterMatrix(G); - validate_K(G); - write(G, "G"); + // center matrix K + CenterMatrix(K); + validate_K(K); + write(K, "K"); // is residual weights are provided, then if (!cPar.file_weight.empty()) { cPar.CopyWeight(weight); double d, wi, wj; - for (size_t i = 0; i < G->size1; i++) { + for (size_t i = 0; i < K->size1; i++) { wi = gsl_vector_get(weight, i); - for (size_t j = i; j < G->size2; j++) { + for (size_t j = i; j < K->size2; j++) { wj = gsl_vector_get(weight, j); - d = gsl_matrix_get(G, i, j); + d = gsl_matrix_get(K, i, j); if (wi <= 0 || wj <= 0) { d = 0; } else { d /= safe_sqrt(wi * wj); } - gsl_matrix_set(G, i, j, d); + gsl_matrix_set(K, i, j, d); if (j != i) { - gsl_matrix_set(G, j, i, d); + gsl_matrix_set(K, j, i, d); } } } @@ -2634,9 +2661,9 @@ void GEMMA::BatchRun(PARAM &cPar) { time_start = clock(); if (cPar.a_mode == M_EIGEN) { - cPar.trace_G = EigenDecomp_Zeroed(G, U, eval, 0); + cPar.trace_G = EigenDecomp_Zeroed(K, U, eval, 0); } else { - cPar.trace_G = EigenDecomp_Zeroed(G, U, eval, 0); + cPar.trace_G = EigenDecomp_Zeroed(K, U, eval, 0); } // write(eval,"eval"); @@ -2693,8 +2720,8 @@ void GEMMA::BatchRun(PARAM &cPar) { LMM cLmm; cLmm.CopyFromParam(cPar); - gsl_vector_view Y_col = gsl_matrix_column(Y, 0); - gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); + gsl_vector_view Y_col = gsl_matrix_column(Y, 0); // get a single phenotype column + gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); // and transformed column cLmm.AnalyzeGene(U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); // y is the predictor, not the phenotype @@ -2811,34 +2838,40 @@ void GEMMA::BatchRun(PARAM &cPar) { // if (is_check_mode()) disable_segfpe(); // disable fast NaN checking for now - gsl_vector_view Y_col = gsl_matrix_column(Y, 0); - gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); + gsl_vector_view Y_col = gsl_matrix_column(Y, 0); // get pheno column + gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); // and its transformed write(&Y_col.vector, "Y_col"); - if (!cPar.file_bfile.empty()) { - // PLINK analysis - if (cPar.file_gxe.empty()) { - cLmm.AnalyzePlink(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector, cPar.setGWASnps); - } - else { - cLmm.AnalyzePlinkGXE(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector, env); - } + if (cPar.is_mdb) { + cLmm.mdb_calc_gwa(U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, cPar.loco); + cLmm.CopyToParam(cPar); } else { - // BIMBAM analysis - - if (cPar.file_gxe.empty()) { - cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector, cPar.setGWASnps); - } else { - cLmm.AnalyzeBimbamGXE(U, 
eval, UtW, &UtY_col.vector, W, - &Y_col.vector, env); + if (!cPar.file_bfile.empty()) { + // PLINK analysis + if (cPar.file_gxe.empty()) { + cLmm.AnalyzePlink(U, eval, UtW, &UtY_col.vector, W, + &Y_col.vector, cPar.setGWASnps); + } + else { + cLmm.AnalyzePlinkGXE(U, eval, UtW, &UtY_col.vector, W, + &Y_col.vector, env); + } + } + else { + // BIMBAM analysis + + if (cPar.file_gxe.empty()) { + cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W, + &Y_col.vector, cPar.setGWASnps); + } else { + cLmm.AnalyzeBimbamGXE(U, eval, UtW, &UtY_col.vector, W, + &Y_col.vector, env); + } } + cLmm.WriteFiles(); // write output files + cLmm.CopyToParam(cPar); } - cLmm.WriteFiles(); - cLmm.CopyToParam(cPar); } else { debug_msg("fit mvLMM (multiple phenotypes)"); MVLMM cMvlmm; @@ -2873,7 +2906,7 @@ void GEMMA::BatchRun(PARAM &cPar) { gsl_matrix_safe_free(W); gsl_matrix_warn_free(B); // sometimes unused gsl_matrix_warn_free(se_B); - gsl_matrix_warn_free(G); + gsl_matrix_warn_free(K); gsl_matrix_safe_free(U); gsl_matrix_safe_free(UtW); gsl_matrix_safe_free(UtY); @@ -2898,7 +2931,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // run bvsr if rho==1 if (cPar.rho_min == 1 && cPar.rho_max == 1) { // read genotypes X (not UtX) - cPar.ReadGenotypes(UtX, G, false); + cPar.ReadBIMBAMGenotypes(UtX, G, false); // perform BSLMM analysis BSLMM cBslmm; @@ -2916,7 +2949,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // read relatedness matrix G if (!(cPar.file_kin).empty()) { - cPar.ReadGenotypes(UtX, G, false); + cPar.ReadBIMBAMGenotypes(UtX, G, false); // read relatedness matrix G ReadFile_kin(cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, @@ -2930,7 +2963,7 @@ void GEMMA::BatchRun(PARAM &cPar) { CenterMatrix(G); validate_K(G); } else { - cPar.ReadGenotypes(UtX, G, true); + cPar.ReadBIMBAMGenotypes(UtX, G, true); } // eigen-decomposition and calculate trace_G @@ -3008,7 +3041,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // run bvsr if rho==1 if (cPar.rho_min == 1 && cPar.rho_max == 1) { // read genotypes X (not UtX) - cPar.ReadGenotypes(UtX, G, false); + cPar.ReadBIMBAMGenotypes(UtX, G, false); // perform BSLMM analysis BSLMM cBslmm; @@ -3027,7 +3060,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // read relatedness matrix G if (!(cPar.file_kin).empty()) { - cPar.ReadGenotypes(UtX, G, false); + cPar.ReadBIMBAMGenotypes(UtX, G, false); // read relatedness matrix G ReadFile_kin(cPar.file_kin, cPar.indicator_idv, cPar.mapID2num, @@ -3042,7 +3075,7 @@ void GEMMA::BatchRun(PARAM &cPar) { validate_K(G); } else { - cPar.ReadGenotypes(UtX, G, true); + cPar.ReadBIMBAMGenotypes(UtX, G, true); } // eigen-decomposition and calculate trace_G @@ -3168,8 +3201,8 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { } outfile << "##" << endl; - outfile << "## GEMMA Version = " << version << " (" << date << ")" << endl; - outfile << "## Build profile = " << GEMMA_PROFILE << endl ; + outfile << "## PANGEMMA Version = " << version << " (" << date << ")" << endl; + // outfile << "## Build profile = " << GEMMA_PROFILE << endl ; #ifdef __GNUC__ outfile << "## GCC version = " << __GNUC__ << "." << __GNUC_MINOR__ << "." 
<< __GNUC_PATCHLEVEL__ << endl; #endif diff --git a/src/gemma.h b/src/gemma.h index b34a958..fa7c24d 100644 --- a/src/gemma.h +++ b/src/gemma.h @@ -64,6 +64,7 @@ public: void Assign(int argc, char **argv, PARAM &cPar); void BatchRun(PARAM &cPar); void WriteLog(int argc, char **argv, PARAM &cPar); + inline bool is_kinship_compute(uint a_mode) { return (a_mode == M_KIN || a_mode == M_KIN2); } }; #endif diff --git a/src/gemma_api.cpp b/src/gemma_api.cpp new file mode 100644 index 0000000..77c1d56 --- /dev/null +++ b/src/gemma_api.cpp @@ -0,0 +1,33 @@ +// Testing bindings, see README.md and +// https://www.gnu.org/savannah-checkouts/gnu/guile/docs/docs-2.0/guile-ref/Dynamic-Types.html + +#include <stdio.h> +#include <libguile.h> +// #include <libguile/boolean.h> +// #include <libguile/numbers.h> + +// extern SCM my_incrementing_function (SCM a, SCM flag); + +// extern "C" SCM my_ping(SCM i); + +static SCM my_ping(SCM i) { + return i; +} + + +SCM my_incrementing_function (SCM a, SCM flag) +{ + SCM result; + + if (scm_is_true (flag)) + result = scm_sum (a, scm_from_int (1)); + else + result = a; + + return result; +} + + +extern "C" void init_module() { + scm_c_define_gsubr("my-ping", 1, 0, 0, (scm_t_subr)my_ping); +} diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index f5a79a2..e630f64 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017-2020, Pjotr Prins + Copyright © 2017-2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -41,6 +41,7 @@ #include "gsl/gsl_linalg.h" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" +#include <lmdb++.h> #include "checkpoint.h" #include "debug.h" @@ -61,7 +62,7 @@ void ProgressBar(string str, double p, double total, double ratio) { const double progress = (100.0 * p / total); const uint barsize = (int)(progress / 2.0); // characters // cout << barsize << endl; - // cout << str << " "; + cout << str << " "; // cout << p << "/" << total << endl; assert(barsize < 101); // corrupted data somehow if (barsize > 0) { @@ -153,6 +154,7 @@ std::istream &safeGetline(std::istream &is, std::string &t) { // Read SNP file. A single column of SNP names. bool ReadFile_snps(const string file_snps, set<string> &setSnps) { debug_msg("enter ReadFile_snps"); + checkpoint("read-file-snps",file_snps); setSnps.clear(); igzstream infile(file_snps.c_str(), igzstream::in); @@ -182,6 +184,7 @@ bool ReadFile_snps(const string file_snps, set<string> &setSnps) { bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps) { debug_msg("entered"); setSnps.clear(); + checkpoint("read-file-header",file_snps); igzstream infile(file_snps.c_str(), igzstream::in); if (!infile) { @@ -239,6 +242,7 @@ bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps) { // Read log file. bool ReadFile_log(const string &file_log, double &pheno_mean) { debug_msg("ReadFile_log"); + checkpoint("read-file-log",file_log); ifstream infile(file_log.c_str(), ifstream::in); if (!infile) { cout << "error! 
fail to open log file: " << file_log << endl; @@ -282,6 +286,8 @@ bool ReadFile_anno(const string &file_anno, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM) { debug_msg("ReadFile_anno"); + checkpoint("read-file-anno",file_anno); + mapRS2chr.clear(); mapRS2bp.clear(); @@ -345,6 +351,7 @@ bool ReadFile_anno(const string &file_anno, map<string, string> &mapRS2chr, bool ReadFile_column(const string &file_pheno, vector<int> &indicator_idv, vector<double> &pheno, const int &p_column) { debug_msg("entered"); + checkpoint("read-file-column",file_pheno); indicator_idv.clear(); pheno.clear(); @@ -389,6 +396,7 @@ bool ReadFile_pheno(const string &file_pheno, vector<vector<double>> &pheno, const vector<size_t> &p_column) { debug_msg("entered"); + checkpoint("read-file-pheno",file_pheno); indicator_pheno.clear(); pheno.clear(); @@ -398,35 +406,36 @@ bool ReadFile_pheno(const string &file_pheno, return false; } - string line; - char *ch_ptr; - - string id; + // string id; double p; vector<double> pheno_row; - vector<int> ind_pheno_row; + + string line; + char *ch_ptr; + + vector<int> indicator_pheno_row; size_t p_max = *max_element(p_column.begin(), p_column.end()); map<size_t, size_t> mapP2c; for (size_t i = 0; i < p_column.size(); i++) { mapP2c[p_column[i]] = i; pheno_row.push_back(-9); - ind_pheno_row.push_back(0); + indicator_pheno_row.push_back(0); } while (!safeGetline(infile, line).eof()) { ch_ptr = strtok((char *)line.c_str(), " ,\t"); size_t i = 0; while (i < p_max) { - enforce_msg(ch_ptr,"Number of phenotypes in pheno file do not match phenotypes in geno file"); + enforce_msg(ch_ptr,"Number of phenotypes in pheno file do not match selected columns"); if (mapP2c.count(i + 1) != 0) { if (strcmp(ch_ptr, "NA") == 0) { - ind_pheno_row[mapP2c[i + 1]] = 0; + indicator_pheno_row[mapP2c[i + 1]] = 0; // skip this trait row pheno_row[mapP2c[i + 1]] = -9; } else { p = atof(ch_ptr); - ind_pheno_row[mapP2c[i + 1]] = 1; + indicator_pheno_row[mapP2c[i + 1]] = 1; // use this trait row pheno_row[mapP2c[i + 1]] = p; } } @@ -434,7 +443,7 @@ bool ReadFile_pheno(const string &file_pheno, ch_ptr = strtok(NULL, " ,\t"); } - indicator_pheno.push_back(ind_pheno_row); + indicator_pheno.push_back(indicator_pheno_row); pheno.push_back(pheno_row); } @@ -448,6 +457,7 @@ bool ReadFile_pheno(const string &file_pheno, bool ReadFile_cvt(const string &file_cvt, vector<int> &indicator_cvt, vector<vector<double>> &cvt, size_t &n_cvt) { debug_msg("entered"); + checkpoint("read-file-cvt",file_cvt); indicator_cvt.clear(); ifstream infile(file_cvt.c_str(), ifstream::in); @@ -515,6 +525,7 @@ bool ReadFile_cvt(const string &file_cvt, vector<int> &indicator_cvt, // Read .bim file. bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) { + checkpoint("read-file-bim",file_bim); debug_msg("entered"); snpInfo.clear(); @@ -563,6 +574,7 @@ bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) { bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno, vector<vector<double>> &pheno, map<string, int> &mapID2num, const vector<size_t> &p_column) { + checkpoint("read-file-fam",file_fam); debug_msg("entered"); indicator_pheno.clear(); pheno.clear(); @@ -639,16 +651,55 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno, return true; } + // Read bimbam mean genotype file, the first time, to obtain #SNPs for // analysis (ns_test) and total #SNP (ns_total). 
-bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, - const gsl_matrix *W, vector<int> &indicator_idv, - vector<int> &indicator_snp, const double &maf_level, - const double &miss_level, const double &hwe_level, - const double &r2_level, map<string, string> &mapRS2chr, - map<string, long int> &mapRS2bp, - map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, - size_t &ns_test) { +/* +## Modified (Mutated) Parameters: + +1. **`indicator_idv`** (vector<int>&) + - **Not modified** - only read from to determine which individuals to include + +2. **`indicator_snp`** (vector<int>&) + - **Modified**: Cleared at the start (`indicator_snp.clear()`) + - Populated with 0s and 1s to indicate whether each SNP passes filters + - One entry per SNP in the genotype file + +3. **`mapRS2chr`** (map<string, string>&) + - **Not modified** - only read from to get chromosome info + +4. **`mapRS2bp`** (map<string, long int>&) + - **Not modified** - only read from to get base pair positions + +5. **`mapRS2cM`** (map<string, double>&) + - **Not modified** - only read from to get centimorgan positions + +6. **`snpInfo`** (vector<SNPINFO>&) + - **Modified**: Cleared at the start (`snpInfo.clear()`) + - Populated with SNPINFO structures for each SNP containing metadata (chr, rs, position, alleles, missingness, MAF, etc.) + +7. **`ns_test`** (size_t&) + - **Modified**: Set to 0 initially + - Incremented for each SNP that passes all filters + - Returns the count of SNPs that will be used for testing + +## Summary of Mutated Outputs: +- **`indicator_snp`**: Binary indicators for which SNPs passed filtering +- **`snpInfo`**: Complete metadata for all SNPs in the file +- **`ns_test`**: Count of SNPs passing filters + +The function returns a `bool` indicating success/failure, but the real outputs are these three modified reference parameters. +*/ + +bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, + const gsl_matrix *W, const vector<int> &indicator_idv, + vector<int> &indicator_snp, const double &maf_level, + const double &miss_level, const double &hwe_level, + const double &r2_level, map<string, string> &mapRS2chr, + map<string, long int> &mapRS2bp, + map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, + size_t &ns_test) { + checkpoint("read-file-geno",file_geno); debug_msg("entered"); indicator_snp.clear(); snpInfo.clear(); @@ -661,7 +712,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); // Covariates gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); @@ -687,9 +738,8 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, double cM; size_t file_pos; - double maf, geno, geno_old; + double geno, geno_old; size_t n_miss; - size_t n_0, n_1, n_2; double min_g = std::numeric_limits<float>::max(); double max_g = std::numeric_limits<float>::min(); @@ -700,6 +750,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, for (int i = 0; i < ni_total; ++i) { ni_test += indicator_idv[i]; } + // assert(ni_test == ni_total); // we use indicator_idv? 
ns_test = 0; file_pos = 0; @@ -709,13 +760,14 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, // cerr << key << endl; // } while (!safe_get_line(infile, line).eof()) { - // cout << line; + // fetch SNP annotation ch_ptr = strtok_safe2((char *)line.c_str(), " ,\t",infilen); rs = ch_ptr; ch_ptr = strtok_safe2(NULL, " ,\t",infilen); minor = ch_ptr; ch_ptr = strtok_safe2(NULL, " ,\t",infilen); major = ch_ptr; + // genotypes get read below if (setSnps.size() != 0 && setSnps.count(rs) == 0) { // if SNP in geno but not in -snps we add an missing value @@ -745,21 +797,21 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, } // Start on a new marker/SNP - maf = 0; n_miss = 0; flag_poly = 0; geno_old = -9; - n_0 = 0; - n_1 = 0; - n_2 = 0; c_idv = 0; gsl_vector_set_zero(genotype_miss); + double maf = 0.0; + size_t n_0=0,n_1=0,n_2=0; + auto infilen = file_geno.c_str(); - for (int i = 0; i < ni_total; ++i) { + for (int i = 0; i < ni_total; ++i) { // read genotypes ch_ptr = strtok_safe2(NULL, " ,\t",(string("To many individuals/strains/genometypes in pheno file for ")+infilen).c_str()); if (indicator_idv[i] == 0) continue; + // annotate missing genotypes enforce_msg(ch_ptr,"Problem reading geno file (not enough genotypes in line)"); if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(genotype_miss, c_idv, 1); @@ -769,19 +821,12 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, } geno = atof(ch_ptr); - if (geno >= 0 && geno <= 0.5) { - n_0++; - } - if (geno > 0.5 && geno < 1.5) { - n_1++; - } - if (geno >= 1.5 && geno <= 2.0) { - n_2++; - } gsl_vector_set(genotype, c_idv, geno); - if (geno < min_g) min_g = geno; - if (geno > max_g) max_g = geno; + if (geno < min_g) + min_g = geno; + if (geno > max_g) + max_g = geno; // going through genotypes with 0.0 < geno < 2.0 if (flag_poly == 0) { // first init in marker @@ -792,10 +837,20 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, flag_poly = 1; // genotypes differ } + // compute ratios + if (hwe_level != 0 && maf_level != -1) { + if (geno >= 0 && geno <= 0.5) + n_0++; + if (geno > 0.5 && geno < 1.5) + n_1++; + if (geno >= 1.5 && geno <= 2.0) + n_2++; + } maf += geno; c_idv++; } + maf /= 2.0 * (double)(ni_test - n_miss); SNPINFO sInfo = {chr, rs, @@ -832,8 +887,6 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, continue; } } - - // -r2 flag for (size_t i = 0; i < genotype->size; ++i) { if (gsl_vector_get(genotype_miss, i) == 1) { @@ -841,7 +894,6 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, gsl_vector_set(genotype, i, geno); } } - gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); gsl_blas_ddot(genotype, genotype, &v_x); @@ -863,7 +915,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, if (max_g != 2.0) warning_msg("The maximum genotype value is not 2.0 - this is not the BIMBAM standard and will skew l_lme and effect sizes"); - gsl_vector_free(genotype); + gsl_vector_free(genotype); // we don't hang on to the genotypes gsl_vector_free(genotype_miss); gsl_matrix_free(WtW); gsl_matrix_free(WtWi); @@ -885,6 +937,7 @@ bool ReadFile_bed(const string &file_bed, const set<string> &setSnps, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, size_t &ns_test) { + checkpoint("read-file-bed",file_bed); debug_msg("entered"); indicator_snp.clear(); size_t ns_total = snpInfo.size(); @@ 
-1193,6 +1246,7 @@ void Plink_ReadOneSNP(const int pos, const vector<int> &indicator_idv, void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, map<string, int> &mapID2num, const size_t k_mode, bool &error, gsl_matrix *G) { + checkpoint("read-file-kin",file_kin); debug_msg("entered"); igzstream infile(file_kin.c_str(), igzstream::in); if (!infile) { @@ -1296,7 +1350,7 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, infile.close(); infile.clear(); - checkpoint("read-kinship-file",file_kin); + checkpoint("end-read-file-kin",file_kin); return; } @@ -1422,6 +1476,197 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { return; } + +// Read bimbam mean genotype file and calculate kinship matrix. +gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco) { + checkpoint("mdb-kin",file_geno); + bool is_loco = !loco.empty(); + + // Convert loco string to what we use in the chrpos index + uint8_t loco_chr = 0; + if (is_loco) { + if (loco == "X") { + loco_chr = CHR_X; + } else if (loco == "Y") { + loco_chr = CHR_Y; + } else if (loco == "M") { + loco_chr = CHR_M; + } else { + try { + loco_chr = static_cast<uint8_t>(stoi(loco)); + } catch (...) { + loco_chr = 0; + } + } + } + + /* Create and open the LMDB environment: */ + auto env = lmdb::env::create(); + + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); /* 10 GiB */ + env.set_max_dbs(10); + env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + string format; + + size_t ni_total = 0, num_markers = 0; + { + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto info = lmdb::dbi::open(rtxn, "info"); + + string_view meta; + if (info.get(rtxn, "meta", meta)) { + cout << meta << endl; + } else { + cout << "meta not found!" 
<< endl; + } + std::string_view view_int; + info.get(rtxn, "numsamples", view_int); + ni_total = lmdb::from_sv<size_t>(view_int); + string_view info_key,info_value; + info.get(rtxn,"format",info_value); + format = string(info_value); + + MDB_stat stat; + mdb_stat(rtxn, info, &stat); + assert(stat.ms_entries == 5); + } + + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); + + MDB_stat stat; + mdb_stat(rtxn, geno_mdb, &stat); + // cout << "Number of records: " << stat.ms_entries << endl; + num_markers = stat.ms_entries; + + gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(ni_total,ni_total); + gsl_matrix_set_zero(matrix_kin); + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); + + // Xlarge contains inds x markers + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE); + enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE"); + if (is_check_mode()) + gsl_matrix_set_zero(Xlarge); + + // [ ] check XLarge is zeroed properly + // [ ] handle missing data + + // For every marker read the genotype x inds + size_t ns_test = 0; + size_t display_pace = 1000; + + cout << "## number of total individuals = " << ni_total << endl; + cout << "## number of analyzed individuals = " << ni_total << endl; + cout << "## number of analyzed SNPs/var = " << num_markers << endl; + + auto cursor = lmdb::cursor::open(rtxn, geno_mdb); + + auto mdb_fetch = MDB_FIRST; + for (size_t t = 0; t < num_markers; ++t) { + string line; + if (t % display_pace == 0) { + ProgressBar("Reading SNPs", t, num_markers - 1); + } + string_view key, value; + cursor.get(key, value, mdb_fetch); + mdb_fetch = MDB_NEXT; + + const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data()); + auto chr = static_cast<uint8_t>(data[1]); + + if (is_loco && loco_chr == chr) + continue; + + // ---- Depending on the format we get different buffers - currently float and byte are supported: + vector<double> gs; + if (format == "Gb") { + size_t num_bytes = value.size() / sizeof(uint8_t); + assert(num_bytes == ni_total); + auto size = num_bytes; + const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + double g = static_cast<double>(gs_bbuf[i])/127.0; + gs.push_back(g); + } + } else { + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + auto size = num_floats; + const float* gs_fbuf = reinterpret_cast<const float*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + gs.push_back(static_cast<double>(gs_fbuf[i])); + } + } + + size_t n_miss = 0; + double geno_mean = 0.0; + double geno_var = 0.0; + gsl_vector_set_all(geno_miss, 0); + + for (size_t i = 0; i < ni_total; ++i) { + double d = gs[i]; + gsl_vector_set(geno, i, d); + gsl_vector_set(geno_miss, i, 1); + geno_mean += d; + geno_var += d * d; + } + + geno_mean /= (double)(ni_total - n_miss); + geno_var += geno_mean * geno_mean * (double)n_miss; + geno_var /= (double)ni_total; + geno_var -= geno_mean * geno_mean; + + // impute missing values by plugging in the mean + for (size_t i = 0; i < ni_total; ++i) { + if (gsl_vector_get(geno_miss, i) == 0) { + gsl_vector_set(geno, i, geno_mean); + } + } + + // subtract the mean (centering genotype values) + gsl_vector_add_constant(geno, -1.0 * geno_mean); + + // scale the genotypes + if (k_mode == 2 && geno_var != 0) { // some confusion here, -gk 2 + // flag does this + gsl_vector_scale(geno, 1.0 / 
sqrt(geno_var)); + } + + // set the SNP column ns_test to copy geno into the compute matrix Xlarge + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % K_BATCH_SIZE); + enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); + + ns_test++; + + // Every K_BATCH_SIZE rows batch compute kinship matrix and return + // by adding to matrix_kin + if (ns_test % K_BATCH_SIZE == 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + write(matrix_kin,"K updated"); + } + } + if (ns_test % K_BATCH_SIZE != 0) { // compute last batch + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + write(matrix_kin,"K updated"); + } + ProgressBar("Reading SNPs", num_markers, num_markers); + cout << endl; + + enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test)); + + gsl_vector_free(geno); + gsl_vector_free(geno_miss); + gsl_matrix_free(Xlarge); + + return matrix_kin; +} + // Read bimbam mean genotype file and calculate kinship matrix. bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, @@ -1431,6 +1676,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, auto infilen = file_geno.c_str(); igzstream infile(infilen, igzstream::in); enforce_msg(infilen, "error reading genotype file"); + checkpoint("bimbam-kin",file_geno); size_t n_miss; double geno_mean, geno_var; @@ -1752,12 +1998,14 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { + checkpoint("read-file-geno",file_geno); debug_msg("entered"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { cout << "error reading genotype file:" << file_geno << endl; return false; } + checkpoint("read-geno-file",file_geno); string line; char *ch_ptr; @@ -1852,121 +2100,12 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, return true; } -// Compact version of the above function, using uchar instead of -// gsl_matrix. 
-bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, - vector<int> &indicator_snp, - vector<vector<unsigned char>> &Xt, gsl_matrix *K, - const bool calc_K, const size_t ni_test, - const size_t ns_test) { - debug_msg("entered"); - igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return false; - } - - Xt.clear(); - vector<unsigned char> Xt_row; - for (size_t i = 0; i < ni_test; i++) { - Xt_row.push_back(0); - } - - string line; - char *ch_ptr; - - if (calc_K == true) { - gsl_matrix_set_zero(K); - } - - gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); - gsl_vector *genotype_miss = gsl_vector_safe_alloc(ni_test); - double geno, geno_mean; - size_t n_miss; - - size_t ni_total = indicator_idv.size(); - size_t ns_total = indicator_snp.size(); - - size_t c_idv = 0, c_snp = 0; - - auto infilen = file_geno.c_str(); - for (size_t i = 0; i < ns_total; ++i) { - safeGetline(infile, line).eof(); - if (indicator_snp[i] == 0) { - continue; - } - - ch_ptr = strtok_safe2((char *)line.c_str(), " ,\t",infilen); - ch_ptr = strtok_safe2(NULL, " ,\t",infilen); - ch_ptr = strtok_safe2(NULL, " ,\t",infilen); - - c_idv = 0; - geno_mean = 0; - n_miss = 0; - gsl_vector_set_zero(genotype_miss); - for (uint j = 0; j < ni_total; ++j) { - ch_ptr = strtok_safe2(NULL, " ,\t",infilen); - if (indicator_idv[j] == 0) { - continue; - } - - if (strcmp(ch_ptr, "NA") == 0) { - gsl_vector_set(genotype_miss, c_idv, 1); - n_miss++; - } else { - geno = atof(ch_ptr); - gsl_vector_set(genotype, c_idv, geno); - geno_mean += geno; - } - c_idv++; - } - - geno_mean /= (double)(ni_test - n_miss); - - for (size_t j = 0; j < genotype->size; ++j) { - if (gsl_vector_get(genotype_miss, j) == 1) { - geno = geno_mean; - } else { - geno = gsl_vector_get(genotype, j); - } - - Xt_row[j] = Double02ToUchar(geno); - gsl_vector_set(genotype, j, (geno - geno_mean)); - } - Xt.push_back(Xt_row); - - if (calc_K == true) { - gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); - } - - c_snp++; - } - - if (calc_K == true) { - gsl_matrix_scale(K, 1.0 / (double)ns_test); - - for (size_t i = 0; i < genotype->size; ++i) { - for (size_t j = 0; j < i; ++j) { - geno = gsl_matrix_get(K, j, i); - gsl_matrix_set(K, i, j, geno); - } - } - } - - gsl_vector_free(genotype); - gsl_vector_free(genotype_miss); - - infile.clear(); - infile.close(); - - return true; -} - // Read bimbam mean genotype file, the second time, recode "mean" // genotype and calculate K. 
bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { + checkpoint("read-file-bed",file_bed); debug_msg("entered"); ifstream infile(file_bed.c_str(), ios::binary); if (!infile) { @@ -2100,6 +2239,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, const size_t ns_test) { + checkpoint("read-file-bed",file_bed); debug_msg("entered"); ifstream infile(file_bed.c_str(), ios::binary); if (!infile) { @@ -2236,6 +2376,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, bool ReadFile_est(const string &file_est, const vector<size_t> &est_column, map<string, double> &mapRS2est) { + checkpoint("read-file-est",file_est); debug_msg("entered"); mapRS2est.clear(); @@ -2303,7 +2444,8 @@ bool ReadFile_est(const string &file_est, const vector<size_t> &est_column, } bool CountFileLines(const string &file_input, size_t &n_lines) { - debug_msg("entered"); + checkpoint("count-lines",file_input); + igzstream infile(file_input.c_str(), igzstream::in); if (!infile) { cout << "error! fail to open file: " << file_input << endl; @@ -2320,6 +2462,7 @@ bool CountFileLines(const string &file_input, size_t &n_lines) { // Read gene expression file. bool ReadFile_gene(const string &file_gene, vector<double> &vec_read, vector<SNPINFO> &snpInfo, size_t &ng_total) { + checkpoint("read-file-gene",file_gene); debug_msg("entered"); vec_read.clear(); ng_total = 0; diff --git a/src/gemma_io.h b/src/gemma_io.h index dd1d5c0..06cd0ee 100644 --- a/src/gemma_io.h +++ b/src/gemma_io.h @@ -26,12 +26,17 @@ #include <algorithm> #include <map> #include <vector> +#include <cstdint> #include "gzstream.h" #include "param.h" #define tab(col) ( col ? 
"\t" : "") +constexpr uint8_t CHR_X = 'X'; +constexpr uint8_t CHR_Y = 'Y'; +constexpr uint8_t CHR_M = 'M'; + using namespace std; void ProgressBar(string str, double p, double total, double ratio = -1.0); @@ -59,8 +64,8 @@ bool ReadFile_pheno(const string &file_pheno, bool ReadFile_column(const string &file_pheno, vector<int> &indicator_idv, vector<double> &pheno, const int &p_column); -bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, - const gsl_matrix *W, vector<int> &indicator_idv, +bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, + const gsl_matrix *W, const vector<int> &indicator_idv, vector<int> &indicator_snp, const double &maf_level, const double &miss_level, const double &hwe_level, const double &r2_level, map<string, string> &mapRS2chr, @@ -87,6 +92,8 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv, void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); +gsl_matrix *mdb_calc_kin(const string file_geno, const int k_mode, const string loco); + bool BimbamKin(const string file_geno, const set<string> ksnps, vector<int> &indicator_snp, const int k_mode, const int display_pace, gsl_matrix *matrix_kin, @@ -100,11 +107,6 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); -bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, - vector<int> &indicator_snp, - vector<vector<unsigned char>> &Xt, gsl_matrix *K, - const bool calc_K, const size_t ni_test, - const size_t ns_test); bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, diff --git a/src/guile/examples.scm b/src/guile/examples.scm new file mode 100644 index 0000000..8855a34 --- /dev/null +++ b/src/guile/examples.scm @@ -0,0 +1,6 @@ +(define-module (example) + #:use-module (srfi srfi-4) + #:export (make-floats)) + +(define (make-floats) + #f64( 1.0 2.2 3.3 4.0)) diff --git a/src/guile_api.cpp b/src/guile_api.cpp new file mode 100644 index 0000000..e3ef482 --- /dev/null +++ b/src/guile_api.cpp @@ -0,0 +1,44 @@ +#include <guile_api.h> + +namespace gemmaguile { + + void global_start_guile() { + scm_init_guile(); + } + + string global_guile_version() { + SCM version_scm = scm_version(); + char* c_str = scm_to_utf8_string(version_scm); + string version_str(c_str); + free(c_str); // Must free the allocated memory + + return version_str; + } + + SCM make_floats() + { + return scm_call_0(scm_c_public_ref("example", "make-floats")); + } + + void use_floats() + { + SCM vec = make_floats(); + + scm_t_array_handle handle; + size_t len; + ssize_t inc; + const SCM *elt; + + const double *ny = scm_f64vector_elements (vec,&handle,&len,&inc); + for (size_t i = 0; i < len; i++) { + printf("elem %zu = %f\n", i, ny[i]); + } + scm_array_handle_release (&handle); + } + + void start_test() { + scm_c_primitive_load("src/guile/examples.scm"); + use_floats(); + } + +} diff --git a/src/guile_api.h b/src/guile_api.h new file mode 100644 index 0000000..b413342 --- /dev/null +++ b/src/guile_api.h @@ -0,0 +1,14 @@ +#pragma once + +#include <string> +#include <libguile.h> + +using namespace std; + +namespace gemmaguile { + + void global_start_guile(); + string 
global_guile_version(); + void start_test(); + +}; diff --git a/src/lapack.cpp b/src/lapack.cpp index e1a63e1..2d10b18 100644 --- a/src/lapack.cpp +++ b/src/lapack.cpp @@ -196,7 +196,7 @@ void lapack_eigen_symmv(gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec, // disable fast NaN checking for now - dsyevr throws NaN errors, // but fixes them (apparently) - if (is_check_mode()) disable_segfpe(); + // if (is_check_mode()) disable_segfpe(); // DSYEVR - computes selected eigenvalues and, optionally, // eigenvectors of a real symmetric matrix @@ -223,7 +223,7 @@ void lapack_eigen_symmv(gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec, if (INFO != 0) cerr << "ERROR: value of INFO is " << INFO; enforce_msg(INFO == 0, "lapack_eigen_symmv failed"); - if (is_check_mode()) enable_segfpe(); // reinstate fast NaN checking + // if (is_check_mode()) enable_segfpe(); // reinstate fast NaN checking gsl_matrix_transpose(evec); diff --git a/src/lmm.cpp b/src/lmm.cpp index 85e92fe..2b730f3 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017-2022 Pjotr Prins + Copyright © 2017-2025 Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,10 +40,14 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_roots.h" #include "gsl/gsl_vector.h" +#include <lmdb++.h> +// #include <lmdb.h> +#include <sys/mman.h> #include "gzstream.h" #include "gemma.h" #include "gemma_io.h" +#include "checkpoint.h" #include "fastblas.h" #include "lapack.h" #include "lmm.h" @@ -54,6 +58,7 @@ using namespace std; void LMM::CopyFromParam(PARAM &cPar) { + checkpoint_nofile("lmm-copy-from-param"); a_mode = cPar.a_mode; d_pace = cPar.d_pace; @@ -90,10 +95,12 @@ void LMM::CopyFromParam(PARAM &cPar) { } void LMM::CopyToParam(PARAM &cPar) { + checkpoint_nofile("lmm-copy-to-param"); cPar.time_UtX = time_UtX; cPar.time_opt = time_opt; - - cPar.ng_test = ng_test; + cPar.ns_total = ns_total; + cPar.ns_test = ns_test; + cPar.ng_test = ng_test; // number of markers tested return; } @@ -104,10 +111,11 @@ void LMM::WriteFiles() { file_str = path_out + "/" + file_out; file_str += ".assoc.txt"; + checkpoint("lmm-write-files",file_str); ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { - cout << "error writing file: " << file_str.c_str() << endl; + cout << "error writing file: " << file_str << endl; return; } @@ -147,7 +155,7 @@ void LMM::WriteFiles() { outfile << scientific << setprecision(6); if (a_mode != M_LMM2) { - outfile << st.beta << "\t"; + outfile << scientific << st.beta << "\t"; outfile << st.se << "\t"; } @@ -201,21 +209,29 @@ void LMM::WriteFiles() { common_header(); - size_t t = 0; - for (size_t i = 0; i < snpInfo.size(); ++i) { - if (indicator_snp[i] == 0) - continue; - auto snp = snpInfo[i].rs_number; - if (process_gwasnps && setGWASnps.count(snp) == 0) - continue; - // cout << t << endl; - outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" - << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" - << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t" - << fixed << setprecision(3) << snpInfo[i].maf << "\t"; - - sumstats(sumStat[t]); - t++; + if (snpInfo.size()) { + size_t t = 0; + for (size_t i = 0; i < snpInfo.size(); ++i) { + if (indicator_snp[i] == 0) + continue; + auto snp = snpInfo[i].rs_number; + if (process_gwasnps && setGWASnps.count(snp) == 0) + 
continue; + // cout << t << endl; + outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" + << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" + << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t" + << fixed << setprecision(3) << snpInfo[i].maf << "\t"; + + sumstats(sumStat[t]); + t++; + } + } + else + { + for (auto &s : sumStat) { + sumstats(s); + } } } @@ -1210,6 +1226,142 @@ void LMM::CalcRLScore(const double l, const FUNC_PARAM ¶ms, double &beta, gsl_vector_safe_free(v_temp); } +/* + ## What `CalcUab` does: + +`CalcUab` **computes element-wise products of transformed covariates and phenotypes** to create sufficient statistics needed for linear mixed model calculations. + +### Purpose: + +Creates a matrix `Uab` containing all pairwise element-wise products of: +- Transformed covariates: columns of `UtW` (U^T × W) +- Transformed phenotype: vector `Uty` (U^T × y) + +These products are used later to efficiently compute variance components and test statistics without repeatedly accessing the original data. + +### Function Signature: +```cpp +void CalcUab(const gsl_matrix *UtW, // U^T × W (transformed covariates) + const gsl_vector *Uty, // U^T × y (transformed phenotype) + gsl_matrix *Uab) // OUTPUT: matrix of products +``` + +### Matrix Structure: + +Given: +- **n_cvt** covariates (columns in W) +- 1 phenotype (y) + +The function creates products for indices `a` and `b` where: +- `a, b ∈ {1, 2, ..., n_cvt, n_cvt+2}` (note: n_cvt+1 is **skipped**) +- Only computes for `b ≤ a` (lower triangular, symmetric) + +**Index mapping:** +- `1` to `n_cvt`: Covariate columns from UtW +- `n_cvt+1`: **SKIPPED** (reserved for genotype, added later) +- `n_cvt+2`: Phenotype (Uty) + +### Algorithm Walkthrough: + +```cpp +for (size_t a = 1; a <= n_cvt + 2; ++a) { + // Get column 'a' + if (a == n_cvt + 2) { + u_a = Uty; // Phenotype + } else { + u_a = UtW[:, a-1]; // Covariate a + } + + for (size_t b = a; b >= 1; --b) { + // Get column 'b' + if (b == n_cvt + 2) { + u_b = Uty; // Phenotype + } else { + u_b = UtW[:, b-1]; // Covariate b + } + + // Store element-wise product: u_a .* u_b + index_ab = GetabIndex(a, b, n_cvt); + Uab[:, index_ab] = u_a .* u_b; + } +} +``` + +### Example: + +With **n_cvt = 2** (2 covariates): + +**Columns stored in Uab:** +``` +GetabIndex(1, 1, 2) → UtW₁ .* UtW₁ (covariate 1 × covariate 1) +GetabIndex(2, 1, 2) → UtW₂ .* UtW₁ (covariate 2 × covariate 1) +GetabIndex(2, 2, 2) → UtW₂ .* UtW₂ (covariate 2 × covariate 2) +GetabIndex(4, 1, 2) → Uty .* UtW₁ (phenotype × covariate 1) +GetabIndex(4, 2, 2) → Uty .* UtW₂ (phenotype × covariate 2) +GetabIndex(4, 4, 2) → Uty .* Uty (phenotype × phenotype) + +Note: Index 3 (n_cvt+1) is skipped - reserved for genotype +``` + +### Why Skip `n_cvt+1`? + +```cpp +if (a == n_cvt + 1) continue; +if (b == n_cvt + 1) continue; +``` + +The slot `n_cvt+1` is **reserved for the genotype (SNP)** being tested, which is added by the overloaded version: + +```cpp +void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_vector *Utx, // ← GENOTYPE + gsl_matrix *Uab); +``` + +This design allows the base covariate products to be computed once, then genotype-specific products added for each SNP. 
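A minimal, self-contained sketch of the same "pairwise element-wise products" idea, using plain `std::vector` instead of GEMMA's `gsl_matrix`/`GetabIndex` machinery and without the reserved genotype slot (names and numbers are made up for illustration):

```cpp
#include <cstdio>
#include <vector>

// Element-wise product of two equal-length columns.
static std::vector<double> colprod(const std::vector<double> &a,
                                   const std::vector<double> &b) {
  std::vector<double> out(a.size());
  for (size_t i = 0; i < a.size(); ++i)
    out[i] = a[i] * b[i];
  return out;
}

int main() {
  // Two "transformed covariates" (think UtW columns) and one
  // "transformed phenotype" (think Uty), each of length 3.
  std::vector<std::vector<double>> cols = {
      {1.0, 1.0, 1.0},   // covariate 1 (intercept)
      {0.5, 2.0, 1.5},   // covariate 2
      {0.1, 0.4, 0.9}};  // phenotype

  // Store every product u_a .* u_b with b <= a (lower triangle incl. diagonal).
  std::vector<std::vector<double>> Uab;
  for (size_t a = 0; a < cols.size(); ++a)
    for (size_t b = 0; b <= a; ++b)
      Uab.push_back(colprod(cols[a], cols[b]));

  printf("stored %zu product columns\n", Uab.size()); // 3*(3+1)/2 = 6
  return 0;
}
```

The real `CalcUab` instead writes each product into a `gsl_matrix` column selected by `GetabIndex(a, b, n_cvt)` and deliberately leaves the `n_cvt+1` slot empty for the per-SNP genotype products.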
+ +### Index Function: + +```cpp +size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt) +``` + +Maps `(a, b)` pairs to column indices in `Uab`, ensuring: +- Only lower triangular entries (b ≤ a) +- Skips the genotype position (n_cvt+1) + +### Matrix Dimensions: + +```cpp +size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; +``` + +Number of unique products in the symmetric matrix (including diagonal), accounting for the skipped genotype position. + +### In Context (from `batch_compute`): + +```cpp +// Pre-compute covariate products once +CalcUab(UtW, Uty, Uab); + +// For each SNP: +for (size_t i = 0; i < batch_size; i++) { + gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); + gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector); + + // Add genotype-specific products + CalcUab(UtW, Uty, Utx, Uab); // Overloaded version + + // Use Uab for likelihood calculations + CalcLambda(..., Uab, ...); +} +``` + +### Summary: + +**`CalcUab` pre-computes all pairwise element-wise products** of transformed covariates and phenotype (UtW columns and Uty). These products are sufficient statistics used repeatedly in likelihood calculations for different SNPs, avoiding redundant computation. The design reserves space for genotype products to be added later per-SNP while reusing the fixed covariate products. +*/ + void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) { size_t index_ab; size_t n_cvt = UtW->size2; @@ -1470,6 +1622,53 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval, return; } +/* +Looking at the LMM::Analyze function, here are the parameters that get modified: +Modified (Mutated) Parameters: + +None of the function parameters are directly modified. + +All parameters are either: + + const gsl_matrix * - pointer to const (read-only) + const gsl_vector * - pointer to const (read-only) + const set<string> - const set passed by value (copy) + std::function<...>& - reference to a function, which is called but not modified itself + +What Actually Gets Modified: + +The function modifies member variables of the LMM class object: + + sumStat (member variable) + Modified: sumStat.push_back(SNPs) is called in the batch_compute lambda + Accumulates summary statistics (beta, se, lambda values, p-values) for each SNP that passes filters + This is the primary output of the analysis + Timing member variables (implied by member access): + time_UtX: time_UtX += (clock() - time_start) / ... + time_opt: time_opt += (clock() - time_start) / ... + These accumulate timing information for performance profiling + Member variables read but not modified: + indicator_snp - read to determine which SNPs to process + indicator_idv - read to determine which individuals to include + ni_total - number of total individuals + ni_test - number of individuals in test + n_cvt - number of covariates + l_mle_null, l_min, l_max, n_region, logl_mle_H0 - parameters for likelihood calculations + a_mode - analysis mode (M_LMM1, M_LMM2, etc.) + d_pace - for progress display + +Summary: + +From the outside caller's perspective: None of the parameters passed to Analyze() are mutated. All inputs are const. 
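A hedged sketch of that calling pattern — read-only inputs, results accumulated only in a member container — with made-up names (`Analyzer`, `Stat`), not GEMMA's actual types:

```cpp
#include <cstdio>
#include <vector>

struct Stat { double beta; };

class Analyzer {
public:
  explicit Analyzer(std::vector<double> pheno) : y(std::move(pheno)) {}

  // The genotype is read-only; the only side effect is appending to `results`
  // (the same way LMM::Analyze only grows its sumStat member).
  void analyze_one(const std::vector<double> &x) {
    double xy = 0.0, xx = 0.0;
    for (size_t i = 0; i < x.size(); ++i) { xy += x[i] * y[i]; xx += x[i] * x[i]; }
    results.push_back(Stat{xx > 0 ? xy / xx : 0.0}); // naive marginal effect
  }

  std::vector<Stat> results; // plays the role of LMM::sumStat
private:
  std::vector<double> y;
};

int main() {
  Analyzer a({0.1, 0.4, 0.9});
  a.analyze_one({0.0, 1.0, 2.0});
  printf("beta = %f, results = %zu\n", a.results[0].beta, a.results.size());
  return 0;
}
```

In GEMMA the equivalent member is `sumStat`, which `WriteFiles()` later serializes to the `.assoc.txt` output.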
+ +The actual output is stored in the member variable sumStat, which accumulates one SUMSTAT structure per analyzed SNP containing: + + beta - effect size + se - standard error + lambda_remle, lambda_mle - variance component estimates + p_wald, p_lrt, p_score - p-values from different tests + logl_H1 - log-likelihood under alternative hypothesis +*/ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, const gsl_matrix *U, const gsl_vector *eval, @@ -1478,12 +1677,9 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, const set<string> gwasnps) { clock_t time_start = clock(); - write(W, "W"); - write(y, "y"); + checkpoint_nofile("start-lmm-analyze"); // Subset/LOCO support bool process_gwasnps = gwasnps.size(); - if (process_gwasnps) - debug_msg("Analyze subset of SNPs (LOCO)"); // Calculate basic quantities. size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; @@ -1510,24 +1706,36 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, // start reading genotypes and analyze size_t c = 0; + /* + batch_compute(l) takes l SNPs that have been loaded into Xlarge, + transforms them all at once using the eigenvector matrix U, then + loops through each transformed SNP to compute association + statistics (beta, standard errors, p-values) and stores results in + sumStat. + */ auto batch_compute = [&](size_t l) { // using a C++ closure // Compute SNPs in batch, note the computations are independent per SNP // debug_msg("enter batch_compute"); + checkpoint_nofile("\nstart-batch_compute"); gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); gsl_matrix_view UtXlarge_sub = gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); time_start = clock(); + // Transforms all l SNPs in the batch at once: UtXlarge = U^T × Xlarge + // This is much faster than doing l separate matrix-vector products + // U is the eigenvector matrix from the spectral decomposition of the kinship matrix fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); gsl_matrix_set_zero(Xlarge); for (size_t i = 0; i < l; i++) { - // for every batch... + // for each snp batch item extract transformed genotype: gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector); + // Calculate design matrix components and compute sufficient statistics for the regression model CalcUab(UtW, Uty, Utx, Uab); time_start = clock(); @@ -1537,17 +1745,24 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, double p_lrt = 0.0, p_score = 0.0; double logl_H1 = 0.0; + // Run statistical tests based on analysis mode // 3 is before 1. 
if (a_mode == M_LMM3 || a_mode == M_LMM4 || a_mode == M_LMM9 ) { CalcRLScore(l_mle_null, param1, beta, se, p_score); } + // Computes Wald statistic for testing association if (a_mode == M_LMM1 || a_mode == M_LMM4) { // for univariate a_mode is 1 + // Estimates variance component (lambda) via REML CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); CalcRLWald(lambda_remle, param1, beta, se, p_wald); } + // Estimates variance component (lambda) via REML + // Likelihood Ratio Test (modes 2, 4, 9): + // Estimates variance component via MLE + // Compares log-likelihood under alternative vs null hypothesis if (a_mode == M_LMM2 || a_mode == M_LMM4 || a_mode == M_LMM9) { CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); @@ -1561,6 +1776,7 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, sumStat.push_back(SNPs); } // debug_msg("exit batch_compute"); + checkpoint_nofile("end-batch_compute"); }; const auto num_snps = indicator_snp.size(); @@ -1642,9 +1858,10 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } batch_compute(c % msize); - ProgressBar("Reading SNPs", num_snps - 1, num_snps - 1); + ProgressBar("Reading SNPS", num_snps - 1, num_snps - 1); // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl; cout << endl; + checkpoint_nofile("end-lmm-analyze"); gsl_vector_safe_free(x); gsl_vector_safe_free(x_miss); @@ -1657,12 +1874,474 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } +/* + This is the mirror function of LMM::Analyze and AnalyzeBimbam, but uses mdb input instead. + */ + + + +void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + size_t num_markers) { + vector<SUMSTAT2> sumstat2; + clock_t time_start = clock(); + + checkpoint_nofile("start-lmm-mdb-analyze"); + + // Calculate basic quantities. + size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; + + const size_t inds = U->size1; + enforce(inds == ni_test); + assert(inds > 0); + + gsl_vector *x = gsl_vector_safe_alloc(inds); // #inds + gsl_vector *x_miss = gsl_vector_safe_alloc(inds); + assert(ni_test == U->size2); + assert(ni_test > 0); + assert(ni_total > 0); + assert(n_index > 0); + gsl_vector *Utx = gsl_vector_safe_alloc(ni_test); + gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); + + const size_t msize = LMM_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(inds, msize); + gsl_matrix *UtXlarge = gsl_matrix_safe_alloc(inds, msize); + enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure + enforce(Xlarge->size1 == inds); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero(Uab); + CalcUab(UtW, Uty, Uab); + + // start reading genotypes and analyze + size_t c = 0; + + /* + batch_compute(l) takes l x SNPs that have been loaded into Xlarge, + transforms them all at once using the eigenvector matrix U, then + loops through each transformed SNP to compute association + statistics (beta, standard errors, p-values) and stores results in + sumStat. 
+ */ + auto batch_compute = [&](size_t l, const Markers &markers) { // using a C++ closure + // Compute SNPs in batch, note the computations are independent per SNP + debug_msg("enter batch_compute"); + assert(l>0); + assert(inds>0); + gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); + gsl_matrix_view UtXlarge_sub = + gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); + + time_start = clock(); + // Transforms all l SNPs in the batch at once: UtXlarge = U^T × Xlarge + // This is much faster than doing l separate matrix-vector products + // U is the eigenvector matrix from the spectral decomposition of the kinship matrix + fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, + &UtXlarge_sub.matrix); + time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + gsl_matrix_set_zero(Xlarge); + for (size_t i = 0; i < l; i++) { + // for each snp batch item extract transformed genotype: + gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); + gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector); + + // Calculate design matrix components and compute sufficient statistics for the regression model + CalcUab(UtW, Uty, Utx, Uab); + + time_start = clock(); + FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; + + double lambda_mle = 0.0, lambda_remle = 0.0, beta = 0.0, se = 0.0, p_wald = 0.0; + double p_lrt = 0.0, p_score = 0.0; + double logl_H1 = 0.0; + + // Run statistical tests based on analysis mode + // 3 is before 1. + if (a_mode == M_LMM3 || a_mode == M_LMM4 || a_mode == M_LMM9 ) { + CalcRLScore(l_mle_null, param1, beta, se, p_score); + } + + // Computes Wald statistic for testing association + if (a_mode == M_LMM1 || a_mode == M_LMM4) { + // for univariate a_mode is 1 + // Estimates variance component (lambda) via REML + CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald(lambda_remle, param1, beta, se, p_wald); + } + + // Estimates variance component (lambda) via REML + // Likelihood Ratio Test (modes 2, 4, 9): + // Estimates variance component via MLE + // Compares log-likelihood under alternative vs null hypothesis + if (a_mode == M_LMM2 || a_mode == M_LMM4 || a_mode == M_LMM9) { + CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); + } + + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + auto markerinfo = markers[i]; + // Store summary data. + SUMSTAT2 st = {markerinfo, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score, logl_H1}; + sumstat2.push_back(st); + } + }; + + /* + const auto num_markers = indicator_snp.size(); + enforce_msg(num_markers > 0,"Zero SNPs to process - data corrupt?"); + if (num_markers < 50) { + cerr << num_markers << " SNPs" << endl; + warning_msg("very few SNPs processed"); + } + */ + const size_t progress_step = (num_markers/50>d_pace ? 
num_markers/50 : d_pace); + Markers markers; + + assert(num_markers > 0); + for (size_t t = 0; t < num_markers; ++t) { + // for (size_t t = 0; t < 2; ++t) { + if (t % progress_step == 0 || t == (num_markers - 1)) { + ProgressBar("Reading markers", t, num_markers - 1); + } + // if (indicator_snp[t] == 0) + // continue; + + auto tup = fetch_snp(t); // use the callback + auto state = get<0>(tup); + if (state == SKIP) + continue; + if (state == LAST) + break; // marker loop because of LOCO + + auto markerinfo = get<1>(tup); + auto gs = get<2>(tup); + + markers.push_back(markerinfo); + + // drop missing idv and plug mean values for missing geno + double x_total = 0.0; // sum genotype values to compute x_mean + uint vpos = 0; // position in target vector + uint n_miss = 0; // count NA genotypes + gsl_vector_set_zero(x_miss); + for (size_t i = 0; i < ni_total; ++i) { + // get the genotypes per individual and compute stats per SNP + if (indicator_idv[i] == 0) // skip individual + continue; + + double geno = gs[i]; + if (isnan(geno)) { + gsl_vector_set(x_miss, vpos, 1.0); + n_miss++; + } else { + gsl_vector_set(x, vpos, geno); + x_total += geno; + } + vpos++; + } + enforce(vpos == ni_test); + + const double x_mean = x_total/(double)(ni_test - n_miss); + + // plug x_mean back into missing values + for (size_t i = 0; i < ni_test; ++i) { + if (gsl_vector_get(x_miss, i) == 1.0) { + gsl_vector_set(x, i, x_mean); + } + } + + enforce(x->size == ni_test); + + // copy genotype values for SNP into Xlarge cache + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize); + gsl_vector_safe_memcpy(&Xlarge_col.vector, x); + c++; // count markers going in + + if (c % msize == 0) { + batch_compute(msize,markers); + markers.clear(); + markers.reserve(msize); + } + } + + if (c % msize) + batch_compute(c % msize,markers); + ProgressBar("Reading markers", num_markers - 1, num_markers - 1); + cout << endl; + cout << "Counted markers " << c << " sumStat " << sumstat2.size() << endl; + checkpoint_nofile("end-lmm-mdb-analyze"); + + string file_str; + debug_msg("LMM::WriteFiles"); + file_str = path_out + "/" + file_out; + file_str += ".assoc.txt"; + checkpoint("lmm-write-files",file_str); + + ofstream outfile(file_str.c_str(), ofstream::out); + if (!outfile) { + cout << "error writing file: " << file_str << endl; + return; + } + + auto sumstats = [&] (SUMSTAT2 st) { + outfile << scientific << setprecision(6); + auto m = st.markerinfo; + auto name = m.name; + auto chr = m.chr; + string chr_s; + if (chr == CHR_X) + chr_s = "X"; + else if (chr == CHR_Y) + chr_s = "Y"; + else if (chr == CHR_M) + chr_s = "M"; + else + chr_s = to_string(chr); + + outfile << chr_s << "\t"; + outfile << name << "\t"; + outfile << m.pos << "\t"; + outfile << m.n_miss << "\t-\t-\t"; // n_miss column + allele1 + allele0 + outfile << fixed << setprecision(3) << m.maf << "\t"; + outfile << scientific << setprecision(6); + if (a_mode != M_LMM2) { + outfile << st.beta << "\t"; + outfile << st.se << "\t"; + } + + if (a_mode != M_LMM3 && a_mode != M_LMM9) + outfile << st.logl_H1 << "\t"; + + switch(a_mode) { + case M_LMM1: + outfile << st.lambda_remle << "\t" + << st.p_wald << endl; + break; + case M_LMM2: + case M_LMM9: + outfile << st.lambda_mle << "\t" + << st.p_lrt << endl; + break; + case M_LMM3: + outfile << st.p_score << endl; + break; + case M_LMM4: + outfile << st.lambda_remle << "\t" + << st.lambda_mle << "\t" + << st.p_wald << "\t" + << st.p_lrt << "\t" + << st.p_score << endl; + break; + } + }; + + for (auto &s : sumstat2) { + 
sumstats(s); + } + + gsl_vector_safe_free(x); + gsl_vector_safe_free(x_miss); + gsl_vector_safe_free(Utx); + gsl_matrix_safe_free(Uab); + gsl_vector_free(ab); // unused + + gsl_matrix_safe_free(Xlarge); + gsl_matrix_safe_free(UtXlarge); + +} + + +void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, const string loco) { + checkpoint("mdb-calc-gwa",file_geno); + bool is_loco = !loco.empty(); + + // Convert loco string to what we use in the chrpos index + uint8_t loco_chr; + if (is_loco) { + if (loco == "X") { + loco_chr = CHR_X; + } else if (loco == "Y") { + loco_chr = CHR_Y; + } else if (loco == "M") { + loco_chr = CHR_M; + } else { + try { + loco_chr = static_cast<uint8_t>(stoi(loco)); + } catch (...) { + loco_chr = 0; + } + } + } + + auto env = lmdb::env::create(); + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); + env.set_max_dbs(10); + env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + // Get mmap info using lmdb++ wrapper + MDB_envinfo info; + mdb_env_info(env.handle(), &info); + // Linux kernel aggressive readahead hints + madvise(info.me_mapaddr, info.me_mapsize, MADV_SEQUENTIAL); + madvise(info.me_mapaddr, info.me_mapsize, MADV_WILLNEED); + + std::cout << "## LMDB opened with optimized readahead; map size = " << (info.me_mapsize / 1024 / 1024) << " MB" << std::endl; + + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); + + auto marker_mdb = lmdb::dbi::open(rtxn, "marker"); + auto info_mdb = lmdb::dbi::open(rtxn, "info"); + string_view info_key,info_value; + info_mdb.get(rtxn,"format",info_value); + auto format = string(info_value); + + MDB_stat stat; + mdb_stat(rtxn, geno_mdb, &stat); + auto num_markers = stat.ms_entries; + + auto mdb_fetch = MDB_FIRST; + + auto cursor = lmdb::cursor::open(rtxn, geno_mdb); + cout << "## number of total individuals = " << ni_total << endl; + cout << "## number of analyzed individuals = " << ni_test << endl; + cout << "## number of analyzed SNPs/var = " << num_markers << endl; + + std::function<SnpNameValues2(size_t)> fetch_snp = [&](size_t num) { + + string_view key,value; + + auto mdb_success = cursor.get(key, value, mdb_fetch); + mdb_fetch = MDB_NEXT; + + // uint8_t chr; + vector<double> gs; + MarkerInfo markerinfo; + + if (mdb_success) { + size_t size = 0; + // ---- Depending on the format we get different buffers - currently float and byte are supported: + if (format == "Gb") { + size_t num_bytes = value.size() / sizeof(uint8_t); + assert(num_bytes == ni_total); + size = num_bytes; + const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + double g = static_cast<double>(gs_bbuf[i])/127.0; + gs.push_back(g); + } + } else { + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + size = num_floats; + const float* gs_fbuf = reinterpret_cast<const float*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + gs.push_back(static_cast<double>(gs_fbuf[i])); + } + } + // Start unpacking key chr+pos + if (key.size() != 10) { + cerr << "key.size=" << key.size() << endl; + throw std::runtime_error("Invalid key size"); + } + // "S>L>L>" + const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data()); + auto chr = static_cast<uint8_t>(data[1]); + // Extract big-endian uint32 + // uint32_t rest = static_cast<uint32_t>(data[2]); + uint32_t pos = 
(data[2] << 24) | (data[3] << 16) | + (data[4] << 8) | data[5]; + + uint32_t num = (data[6] << 24) | (data[7] << 16) | + (data[8] << 8) | data[9]; + + // printf("%#02x %#02x\n", chr, loco_chr); + + if (is_loco && loco_chr != chr) { + if (chr > loco_chr) + return make_tuple(LAST, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + else + return make_tuple(SKIP, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + } + + string_view value2; + marker_mdb.get(rtxn,key,value2); + auto marker = string(value2); + // 1 rs13476251 174792257 + // cout << static_cast<int>(chr) << ":" << pos2 << " line " << rest2 << ":" << marker << endl ; + + // compute maf and n_miss (NAs) + size_t n_miss = 0; // count NAs: FIXME + double maf = compute_maf(ni_total, ni_test, n_miss, gs.data(), indicator_idv); + + markerinfo = MarkerInfo { .name=marker,.chr=chr,.pos=pos,.line_no=num,.n_miss=n_miss,.maf=maf }; + + // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl; + } + return make_tuple(COMPUTE, markerinfo, gs); + }; + LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,num_markers); + + ns_total = ns_test = num_markers; // track global number of snps in original gemma (goes to cPar) +} + + +/* + +Looking at the `LMM::AnalyzeBimbam` function, here are the parameters that get modified: + +## Modified (Mutated) Parameters: + +**None of the parameters are modified.** + +All parameters are passed as: +- `const gsl_matrix *` - pointer to const matrix (read-only) +- `const gsl_vector *` - pointer to const vector (read-only) +- `const set<string>` - const set passed by value (copy) + +## What Actually Gets Modified: + +The function modifies **internal state** and **local variables** only: + +1. **Local variables modified**: + - `prev_line` - tracks current line position in file + - `gs` - vector that gets resized and reassigned for each SNP + - `infile` - file stream that gets opened, read from, and closed + +2. **Member variables** (implied by `this->`): + - `file_geno` - used to open the file (read-only here) + - `ni_total` - number of individuals (read-only here) + +3. **The lambda function `fetch_snp`** captures local variables by reference (`[&]`) and modifies: + - `prev_line` - incremented as lines are read + - `gs` - reassigned with new genotype values for each SNP + - `infile` - advanced through the file + +## Summary: + +**From the outside caller's perspective**: Nothing passed into `AnalyzeBimbam` gets mutated. All inputs are treated as read-only (`const`). + +The function's work happens through: +- Reading from the genotype file +- Calling `LMM::Analyze(fetch_snp, ...)` with a callback that reads SNP data +- Any mutations likely happen inside the `LMM::Analyze` method (which would need to be examined separately to see what it modifies) + +*/ + void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const set<string> gwasnps) { debug_msg(file_geno); auto infilen = file_geno.c_str(); + checkpoint("start-read-geno-file",file_geno); igzstream infile(infilen, igzstream::in); enforce_msg(infile, "error reading genotype file"); @@ -1942,6 +2621,153 @@ void MatrixCalcLR(const gsl_matrix *U, const gsl_matrix *UtX, return; } +/* + +## What `CalcLambda` does: + +`CalcLambda` **estimates the variance component ratio (lambda)** in a linear mixed model by finding the value that maximizes either the log-likelihood or log-restricted likelihood function. 
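As a hedged, toy illustration of the bracketing strategy described under *Algorithm Overview* below — log-spaced intervals, a sign change of the first derivative marks an interior maximum, then the root is refined — using a made-up concave objective instead of GEMMA's `LogRL_*` functions, and plain bisection standing in for the Brent/Newton refinement:

```cpp
#include <cmath>
#include <cstdio>

// Toy concave objective in log(lambda); its derivative has a single root at lambda = e.
static double f(double l)  { return -(std::log(l) - 1.0) * (std::log(l) - 1.0); }
static double df(double l) { return -2.0 * (std::log(l) - 1.0) / l; }

int main() {
  const double l_min = 1e-5, l_max = 1e5;
  const int n_region = 10;
  const double step = std::log(l_max / l_min) / n_region;

  double best_l = l_min, best_f = f(l_min);
  for (int i = 0; i < n_region; ++i) {
    double lo = l_min * std::exp(step * i);
    double hi = l_min * std::exp(step * (i + 1));
    if (df(lo) * df(hi) > 0) continue;  // no sign change: no interior maximum here
    for (int it = 0; it < 60; ++it) {   // bisection stands in for Brent/Newton
      double mid = std::sqrt(lo * hi);  // bisect in log space
      if (df(lo) * df(mid) <= 0) hi = mid; else lo = mid;
    }
    if (f(lo) > best_f) { best_f = f(lo); best_l = lo; }
  }
  if (f(l_max) > best_f) best_l = l_max; // boundary check
  printf("lambda ~= %g (expected e ~= 2.71828)\n", best_l);
  return 0;
}
```

GEMMA's actual implementation uses `gsl_root_fsolver_brent` followed by `gsl_root_fdfsolver_newton` on the derivative, as described below.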
+
+### Purpose:
+
+In LMM, the variance structure is:
+- **Var(y) = λτ⁻¹K + τ⁻¹I**
+
+Where:
+- **λ (lambda)**: The ratio of genetic variance to environmental variance (Vg/Ve)
+- **τ⁻¹**: The residual (environmental) variance
+- **K**: Kinship/relatedness matrix
+- **I**: Identity matrix
+
+Lambda determines how much of the phenotypic variance is explained by genetic relatedness vs random environmental effects.
+
+### Function Signature:
+```cpp
+void CalcLambda(const char func_name,   // 'R' for REML, 'L' for MLE
+                FUNC_PARAM &params,     // Model parameters
+                const double l_min,     // Min lambda to search
+                const double l_max,     // Max lambda to search
+                const size_t n_region,  // # intervals to divide search
+                double &lambda,         // OUTPUT: estimated lambda
+                double &logf)           // OUTPUT: log-likelihood at lambda
+```
+
+### Algorithm Overview:
+
+#### **Phase 1: Identify Candidate Intervals**
+```cpp
+for (size_t i = 0; i < n_region; ++i) {
+  lambda_l = l_min * exp(lambda_interval * i);
+  lambda_h = l_min * exp(lambda_interval * (i + 1.0));
+
+  dev1_l = LogRL_dev1(lambda_l, &params);  // First derivative at lower bound
+  dev1_h = LogRL_dev1(lambda_h, &params);  // First derivative at upper bound
+
+  if (dev1_l * dev1_h <= 0) {
+    lambda_lh.push_back(make_pair(lambda_l, lambda_h));  // Sign change = root
+  }
+}
+```
+- Divides the search range [l_min, l_max] into `n_region` intervals (log-spaced)
+- Evaluates the **first derivative** at interval boundaries
+- If the derivative changes sign, there's a local maximum in that interval
+
+#### **Phase 2: Find Maximum**
+
+**Case A: No sign changes found**
+```cpp
+if (lambda_lh.empty()) {
+  // Maximum is at a boundary
+  if (logf_l >= logf_h) {
+    lambda = l_min;
+  } else {
+    lambda = l_max;
+  }
+}
+```
+
+**Case B: Sign changes found (interior maximum)**
+
+For each candidate interval:
+
+1. **Brent's Method** (robust root-finding):
+   ```cpp
+   gsl_root_fsolver_brent
+   ```
+   - Finds where the first derivative = 0 (critical point)
+   - Doesn't require computing second derivatives
+   - Robust but slower
+
+2. **Newton-Raphson Method** (refinement):
+   ```cpp
+   gsl_root_fdfsolver_newton
+   ```
+   - Uses both first and second derivatives
+   - Refines the estimate from Brent's method
+   - Faster convergence near the root
+
+3. **Compare across intervals**:
+   ```cpp
+   if (logf < logf_l) {
+     logf = logf_l;
+     lambda = l;
+   }
+   ```
+   - Keeps track of the lambda with the highest log-likelihood
+   - Handles multiple local maxima
+
+#### **Phase 3: Check Boundaries**
+```cpp
+if (logf_l > logf) {
+  lambda = l_min;
+}
+if (logf_h > logf) {
+  lambda = l_max;
+}
+```
+- Ensures the global maximum isn't at a boundary
+
+### Function Types:
+
+- **`func_name = 'R'` or `'r'`**: REML (Restricted Maximum Likelihood)
+  - Uses `LogRL_f`, `LogRL_dev1`, `LogRL_dev2`
+  - Accounts for loss of degrees of freedom from fixed effects
+  - Preferred for variance component estimation
+
+- **`func_name = 'L'` or `'l'`**: MLE (Maximum Likelihood)
+  - Uses `LogL_f`, `LogL_dev1`, `LogL_dev2`
+  - Direct likelihood maximization
+  - Used for likelihood ratio tests
+
+### Error Handling:
+
+```cpp
+if (status == GSL_CONTINUE || status != GSL_SUCCESS) {
+  logf = nan("NAN");
+  lambda = nan("NAN");
+  return;
+}
+```
+- If optimization fails, returns NaN values
+- Warnings issued for non-convergence
+
+### In Context (from `batch_compute`):
+
+```cpp
+// REML for Wald test
+CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+CalcRLWald(lambda_remle, param1, beta, se, p_wald);
+
+// MLE for LRT test
+CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
+```
+
+### Summary:
+
+**`CalcLambda` finds the optimal variance component ratio (λ)** that maximizes the (restricted) log-likelihood for a given SNP's genotype data in a linear mixed model. This λ value quantifies the relative contribution of genetic relatedness vs environmental noise to phenotypic variation, and is essential for computing association test statistics (Wald, LRT) while properly accounting for population structure and relatedness.
+
+*/
+
+
 void CalcLambda(const char func_name, FUNC_PARAM &params, const double l_min,
                 const double l_max, const size_t n_region, double &lambda,
                 double &logf) {
@@ -2291,6 +3117,7 @@ void LMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
     cout << "error reading genotype file:" << file_geno << endl;
     return;
   }
+  checkpoint("start-read-geno-file",file_geno);
 
   clock_t time_start = clock();
 
diff --git a/src/lmm.h b/src/lmm.h
index 736c679..295602a 100644
--- a/src/lmm.h
+++ b/src/lmm.h
@@ -2,7 +2,7 @@
     Genome-wide Efficient Mixed Model Association (GEMMA)
     Copyright © 2011-2017, Xiang Zhou
     Copyright © 2017, Peter Carbonetto
-    Copyright © 2017, Pjotr Prins
+    Copyright © 2017-2025, Pjotr Prins
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -30,7 +30,7 @@
 
 using namespace std;
 
-#define LMM_BATCH_SIZE 20000 // used for batch processing
+#define LMM_BATCH_SIZE 5000 // used for batch processing
 
 class FUNC_PARAM {
 
@@ -44,7 +44,42 @@ public:
   size_t e_mode;
 };
 
-typedef std::tuple<string,std::vector<double> > SnpNameValues;
+
+// typedef tuple< string, uint16_t, uint32_t, uint32_t > MarkerInfo; // name, chr, pos, line
+struct MarkerInfo {
+  string name;
+  uint8_t chr;
+  size_t pos, line_no, n_miss;
+  double maf;
+} ;
+
+typedef vector<MarkerInfo> Markers;
+typedef tuple< string,vector<double> > SnpNameValues;
+
+enum MarkerState {
+  FAIL,
+  COMPUTE,
+  SKIP,
+  LAST
+};
+
+typedef tuple< MarkerState,MarkerInfo,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed)
+// Results for LMM.
+class SUMSTAT2 { +public: + MarkerInfo markerinfo; + double beta; // REML estimator for beta. + double se; // SE for beta. + double lambda_remle; // REML estimator for lambda. + double lambda_mle; // MLE estimator for lambda. + double p_wald; // p value from a Wald test. + double p_lrt; // p value from a likelihood ratio test. + double p_score; // p value from a score test. + double logl_H1; // log likelihood under the alternative + // hypothesis as a measure of goodness of fit, + // see https://github.com/genetics-statistics/GEMMA/issues/81 +}; + class LMM { @@ -100,6 +135,14 @@ public: const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const set<string> gwasnps); + void mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, size_t num_markers); + void mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const string loco); void AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, diff --git a/src/main.cpp b/src/main.cpp index deadc63..ee6d597 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,7 @@ #include <sstream> #include <sys/stat.h> #include <sys/types.h> +#include <guile_api.h> using namespace std; @@ -31,6 +32,9 @@ int main(int argc, char *argv[]) { gsl_set_error_handler (&gemma_gsl_error_handler); + gemmaguile::global_start_guile(); + gemmaguile::start_test(); + if (argc <= 1) { cGemma.PrintHeader(); cGemma.PrintHelp(0); diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp index 68d17a7..c82872e 100644 --- a/src/mathfunc.cpp +++ b/src/mathfunc.cpp @@ -494,6 +494,20 @@ void StandardizeVector(gsl_vector *y) { } // Calculate UtX (U gets transposed) +/* +CalcUtX transforms a genotype matrix by the eigenvector matrix from the kinship matrix decomposition. +Purpose: + +Computes UtX = U^T × X, where: + + U: Eigenvector matrix from the spectral decomposition of the kinship matrix K + X: Genotype matrix (individuals × SNPs) + UtX: Transformed genotype matrix + +This transformation rotates the genotype data into the eigenspace where the kinship-induced correlations become diagonal, simplifying likelihood calculations. + +CalcUtX transforms a genotype matrix into the eigenspace by computing U^T × X, where U contains the eigenvectors from the kinship matrix decomposition. This transformation diagonalizes the correlation structure induced by relatedness, making subsequent likelihood calculations tractable and efficient. The function overwrites its input with the transformed result, using a temporary copy to avoid data corruption. 
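A minimal stand-alone sketch of the same rotation, using plain GSL BLAS rather than GEMMA's own wrappers (editor's illustration; `rotate_into_eigenspace` is a hypothetical helper, and unlike `CalcUtX` it writes to a separate output matrix instead of overwriting its input):

```cpp
// Editor's sketch of the rotation UtX = U^T * X with plain GSL BLAS.
#include <gsl/gsl_blas.h>
#include <gsl/gsl_matrix.h>

// Rotate X (n x p) into the eigenspace of K = U D U^T, where U is n x n.
void rotate_into_eigenspace(const gsl_matrix *U, const gsl_matrix *X,
                            gsl_matrix *UtX) {
  // UtX = 1.0 * U^T * X + 0.0 * UtX
  gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, U, X, 0.0, UtX);
}

int main() {
  const size_t n = 3, p = 2;
  gsl_matrix *U   = gsl_matrix_calloc(n, n);
  gsl_matrix *X   = gsl_matrix_calloc(n, p);
  gsl_matrix *UtX = gsl_matrix_calloc(n, p);
  gsl_matrix_set_identity(U);        // trivial eigenvectors for the demo
  gsl_matrix_set_all(X, 1.0);        // dummy genotypes
  rotate_into_eigenspace(U, X, UtX); // with U = I this simply copies X
  gsl_matrix_free(U);
  gsl_matrix_free(X);
  gsl_matrix_free(UtX);
  return 0;
}
```

In the real code `U` comes from the eigen-decomposition of the kinship matrix `K`, and `CalcUtX` below overwrites `UtX` in place after taking a temporary copy, as described above.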
+*/ void CalcUtX(const gsl_matrix *U, gsl_matrix *UtX) { gsl_matrix *X = gsl_matrix_safe_alloc(UtX->size1, UtX->size2); gsl_matrix_safe_memcpy(X, UtX); @@ -631,3 +645,36 @@ double UcharToDouble02(const unsigned char c) { return (double)c * 0.01; } unsigned char Double02ToUchar(const double dosage) { return (int)(dosage * 100); } + + +std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs) { + size_t n_0 = 0; + size_t n_1 = 0; + size_t n_2 = 0; + for (size_t i = 0; i < ni_total; ++i) { // read genotypes + double geno = gsl_vector_get(gs, i); + if (geno >= 0 && geno <= 0.5) { + n_0++; + } + if (geno > 0.5 && geno < 1.5) { + n_1++; + } + if (geno >= 1.5 && geno <= 2.0) { + n_2++; + } + } + return {n_0,n_1,n_2}; +} + +double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const double *gs, const vector<int> &indicator) { + double maf = 0.0; + + for (size_t i = 0; i < ni_total; ++i) { // read genotypes + if (indicator[i]) { + double geno = gs[i]; + maf += geno; // 0..2 range + } + } + maf /= 2.0 * (double)(ni_test - n_miss); // Assumption is that geno value is between 0 and 2. FIXME + return maf; +} diff --git a/src/mathfunc.h b/src/mathfunc.h index 641d0a3..f7c9e6e 100644 --- a/src/mathfunc.h +++ b/src/mathfunc.h @@ -24,6 +24,7 @@ // #include "Eigen/Dense" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" +#include <tuple> #define CONDITIONED_MAXRATIO 2e+6 // based on http://mathworld.wolfram.com/ConditionNumber.html #define EIGEN_MINVALUE 1e-10 @@ -77,4 +78,8 @@ unsigned char Double02ToUchar(const double dosage); // void uchar_matrix_get_row(const vector<vector<unsigned char>> &X, // const size_t i_row, Eigen::VectorXd &x_row); +std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs); +double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const double *gs, const vector<int> &indicator); + + #endif diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index eb23900..1f64c3a 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -210,6 +210,31 @@ void MVLMM::WriteFiles() { } // Below are functions for EM algorithm. + +/* + src/mvlmm.cpp +> describe `EigenProc` + +The EigenProc function performs the following steps: + + 1 Eigen Decomposition of V_e: It computes the eigen decomposition of the matrix V_e to obtain the eigenvectors + and eigenvalues. This is used to transform V_e into a diagonal form. + 2 Calculate V_e_h and V_e_hi: It calculates the matrices V_e_h and V_e_hi, which are used to transform the + original matrices into a space where the eigenvalues are more manageable. V_e_h is a scaled version of the + eigenvectors, and V_e_hi is the inverse of V_e_h. + 3 Calculate Lambda: It computes the matrix Lambda as V_ehi * V_g * V_ehi, which is a transformed version of + V_g in the space defined by the eigenvectors of V_e. + 4 Eigen Decomposition of Lambda: It performs an eigen decomposition on Lambda to obtain the eigenvectors and + eigenvalues, which are used to further transform the matrices. + 5 Calculate UltVeh and UltVehi: It calculates UltVeh and UltVehi, which are used in subsequent calculations to + transform vectors and matrices into the space defined by the eigenvectors of Lambda. + 6 Return logdet_Ve: The function returns the logarithm of the determinant of V_e, which is used in likelihood + calculations. + +This function is crucial for transforming the problem into a space where the matrices are diagonal, simplifying +many calculations in the context of mixed models. 
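To summarize the six steps in matrix form (an editor's note: the names Q, Λ_e, U_ℓ are chosen here for the eigenvectors/eigenvalues of V_e and of Λ, and the exact definitions of `UltVeh` and `UltVehi` are inferred from their names and the description above, so treat this as a sketch rather than a transcript of the code; D_ℓ corresponds to the `D_l` vector in the signature):

```latex
V_e = Q \Lambda_e Q^\top , \qquad
V_e^{\pm 1/2} = Q \Lambda_e^{\pm 1/2} Q^\top , \qquad
\Lambda = V_e^{-1/2} V_g V_e^{-1/2} = U_\ell D_\ell U_\ell^\top ,

\text{UltVeh} = U_\ell^\top V_e^{1/2} , \qquad
\text{UltVehi} = U_\ell^\top V_e^{-1/2} , \qquad
\text{logdet\_Ve} = \log \det V_e .
```

With these definitions, UltVehi · V_g · UltVehiᵀ = D_ℓ and UltVehi · V_e · UltVehiᵀ = I, i.e. V_g and V_e are diagonalized simultaneously, which is the simplification the text above refers to.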
+ + */ double EigenProc(const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l, gsl_matrix *UltVeh, gsl_matrix *UltVehi) { size_t d_size = V_g->size1; diff --git a/src/param.cpp b/src/param.cpp index f96e9c3..96a9cd2 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017, Pjotr Prins + Copyright © 2017-2025, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,6 +40,7 @@ #include "mathfunc.h" #include "param.h" #include "fastblas.h" +#include "checkpoint.h" using namespace std; @@ -104,7 +105,9 @@ PARAM::PARAM(void) randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200), error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), - time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} + time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) { + is_mdb = false; + } PARAM::~PARAM() { if (gsl_r) @@ -113,6 +116,7 @@ PARAM::~PARAM() { // Read files: obtain ns_total, ng_total, ns_test, ni_test. void PARAM::ReadFiles(void) { + checkpoint_nofile("PARAM::ReadFiles"); string file_str; // Read cat file. @@ -145,22 +149,24 @@ void PARAM::ReadFiles(void) { } } - // Read SNP set. - if (!file_snps.empty()) { - if (ReadFile_snps(file_snps, setSnps) == false) { - error = true; + if (!is_mdb) { + // Read SNP set into setSnps (without filtering) + if (!file_snps.empty()) { + if (ReadFile_snps(file_snps, setSnps) == false) { + error = true; + } + } else { + setSnps.clear(); } - } else { - setSnps.clear(); - } - // Read KSNP set. - if (!file_ksnps.empty()) { - if (ReadFile_snps(file_ksnps, setKSnps) == false) { - error = true; + // Also read KSNP set. Reads all Snps later to be used by GRM. + if (!file_ksnps.empty()) { + if (ReadFile_snps(file_ksnps, setKSnps) == false) { + error = true; + } + } else { + setKSnps.clear(); } - } else { - setKSnps.clear(); } // For prediction. @@ -180,13 +186,15 @@ void PARAM::ReadFiles(void) { } } + if (!file_geno.empty()) { - if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == - false) { + // --- Read (multi-)column phenotypes into pheno and set the NA indicator + if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } - if (CountFileLines(file_geno, ns_total) == false) { + // --- compute ns_total by readin geno file + if (!is_mdb && is_check_mode() && CountFileLines(file_geno, ns_total) == false) { error = true; } } @@ -215,8 +223,6 @@ void PARAM::ReadFiles(void) { indicator_idv.push_back(k); } - ns_test = 0; - return; } @@ -273,7 +279,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -290,7 +296,7 @@ void PARAM::ReadFiles(void) { } // Read genotype and phenotype file for BIMBAM format. - if (!file_geno.empty()) { + if (!is_mdb && !file_geno.empty()) { // Annotation file before genotype file. if (!file_anno.empty()) { @@ -299,14 +305,14 @@ void PARAM::ReadFiles(void) { } } - // Phenotype file before genotype file. + // Phenotype file before genotype file. Already done this(?!) 
if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { error = true; } // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -314,9 +320,10 @@ void PARAM::ReadFiles(void) { trim_individuals(indicator_idv, ni_max); trim_individuals(indicator_cvt, ni_max); - if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, - maf_level, miss_level, hwe_level, r2_level, mapRS2chr, - mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { + // The following reads the geno file to get the SNPs - only for BIMBAM + if (ReadFile_bimbam_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, + maf_level, miss_level, hwe_level, r2_level, mapRS2chr, + mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { error = true; return; } @@ -324,6 +331,17 @@ void PARAM::ReadFiles(void) { ns_total = indicator_snp.size(); } + // lmdb-type genotype file: + if (is_mdb && !file_geno.empty()) { + if (!file_kin.empty()) { // GWA + // Phenotype file before genotype file. Already done this(?!) + if (ReadFile_pheno(file_pheno, indicator_pheno, pheno, p_column) == false) { + error = true; + } + ProcessInclusionIndicators(); + ns_total = indicator_snp.size(); + } + } // Read genotype file for multiple PLINK files. if (!file_mbfile.empty()) { @@ -360,7 +378,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. W3 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -387,8 +405,8 @@ void PARAM::ReadFiles(void) { infile.clear(); } - // Read genotype and phenotype file for multiple BIMBAM files. - if (!file_mgeno.empty()) { + // Read genotype and phenotype file for multiple BIMBAM files. Note this goes untested. + if (!is_mdb && !file_mgeno.empty()) { // Annotation file before genotype file. if (!file_anno.empty()) { @@ -404,7 +422,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain ni_test, // save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt); @@ -420,7 +438,7 @@ void PARAM::ReadFiles(void) { string file_name; size_t ns_test_tmp; while (!safeGetline(infile, file_name).eof()) { - if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, + if (ReadFile_bimbam_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) { error = true; @@ -457,7 +475,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. - ProcessCvtPhen(); + ProcessInclusionIndicators(); // Obtain covariate matrix. // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt); @@ -480,7 +498,8 @@ void PARAM::ReadFiles(void) { ni_test += indicator_idv[i]; } - enforce_msg(ni_test,"number of analyzed individuals equals 0."); + if (!is_mdb) + enforce_msg(ni_test,"number of analyzed individuals equals 0."); } // For ridge prediction, read phenotype only. @@ -491,7 +510,7 @@ void PARAM::ReadFiles(void) { // Post-process covariates and phenotypes, obtain // ni_test, save all useful covariates. 
- ProcessCvtPhen(); + ProcessInclusionIndicators(); } // Compute setKSnps when -loco is passed in @@ -920,13 +939,18 @@ void PARAM::CheckParam(void) { enforce_fexists(file_gwasnps, "open file"); enforce_fexists(file_anno, "open file"); + if (file_geno.find(".mdb") != string::npos) { + is_mdb = true; + } + if (!loco.empty()) { enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 9 || a_mode == 21 || a_mode == 22, "LOCO only works with LMM and K"); // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)"); - enforce_msg(!file_anno.empty(), - "LOCO requires annotation file (-a switch)"); + if (!is_mdb) + enforce_msg(!file_anno.empty(), "Without mdb LOCO requires annotation file (-a switch)"); + enforce_msg(file_ksnps.empty(), "LOCO does not allow -ksnps switch"); enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch"); } @@ -1078,13 +1102,15 @@ void PARAM::CheckData(void) { } } - enforce_msg(ni_test,"number of analyzed individuals equals 0."); + if (!is_mdb) { + enforce_msg(ni_test,"number of analyzed individuals equals 0."); - if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() && - file_study.empty() && file_beta.empty() && file_bf.empty()) { - error = true; - cout << "error! number of analyzed individuals equals 0. " << endl; - return; + if (ni_test == 0 && file_cor.empty() && file_mstudy.empty() && + file_study.empty() && file_beta.empty() && file_bf.empty()) { + error = true; + cout << "error! number of analyzed individuals equals 0. " << endl; + return; + } } if (a_mode == 43) { @@ -1103,7 +1129,7 @@ void PARAM::CheckData(void) { } // Output some information. - if (file_cor.empty() && file_mstudy.empty() && file_study.empty() && + if (!is_mdb && file_cor.empty() && file_mstudy.empty() && file_study.empty() && a_mode != 15 && a_mode != 27 && a_mode != 28) { cout << "## number of total individuals = " << ni_total << endl; if (a_mode == 43) { @@ -1251,14 +1277,14 @@ void PARAM::CheckData(void) { void PARAM::PrintSummary() { if (n_ph == 1) { - cout << "pve estimate =" << pve_null << endl; - cout << "se(pve) =" << pve_se_null << endl; + cout << "## pve estimate = " << pve_null << endl; + cout << "## se(pve) = " << pve_se_null << endl; } else { } return; } -void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { +void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { string file_str; if (!file_bfile.empty()) { @@ -1268,8 +1294,7 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { error = true; } } else { - if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, - calc_K) == false) { + if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, calc_K) == false) { error = true; } } @@ -1277,44 +1302,25 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { return; } -void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K, - const bool calc_K) { - string file_str; - - if (!file_bfile.empty()) { - file_str = file_bfile + ".bed"; - if (ReadFile_bed(file_str, indicator_idv, indicator_snp, Xt, K, calc_K, - ni_test, ns_test) == false) { - error = true; - } - } else { - if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, - ni_test, ns_test) == false) { - error = true; - } - } - - return; +gsl_matrix *PARAM::MdbCalcKin() { + return mdb_calc_kin(file_geno, a_mode - 20, loco); } void PARAM::CalcKin(gsl_matrix *matrix_kin) { + 
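  // Editor's note (not part of the patch): CalcKin dispatches on the input
  // type -- PLINK .bed files go through PlinkKin, anything else (BIMBAM text
  // genotypes) through BimbamKin. a_mode - 20 selects the kinship flavour
  // (-gk 1 centered, -gk 2 standardized), d_pace controls progress display,
  // and setKSnps restricts the SNPs used for K (e.g. with -loco).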
checkpoint_nofile("PARAM::CalcKin");
   string file_str;
 
-  gsl_matrix_set_zero(matrix_kin);
 
   if (!file_bfile.empty()) {
     file_str = file_bfile + ".bed";
-    // enforce_msg(loco.empty(), "FIXME: LOCO nyi");
     if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) ==
         false) {
       error = true;
     }
   } else {
-    file_str = file_geno;
-    if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace,
-                  matrix_kin, ni_max == 0) == false) {
-      error = true;
-    }
+    // indicator_snp is an array of bool
+    error = !BimbamKin(file_geno, setKSnps, indicator_snp, a_mode - 20, d_pace,
+                       matrix_kin, ni_max == 0);
   }
 
   return;
@@ -1990,7 +1996,7 @@ void PARAM::CheckCvt() {
 }
 
 // Post-process phenotypes and covariates.
-void PARAM::ProcessCvtPhen() {
+void PARAM::ProcessInclusionIndicators() {
 
   // Convert indicator_pheno to indicator_idv.
   int k = 1;
@@ -2028,11 +2034,8 @@ void PARAM::ProcessCvtPhen() {
 
   // Obtain ni_test.
   ni_test = 0;
-  for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
-    if (indicator_idv[i] == 0) {
-      continue;
-    }
-    ni_test++;
+  for (auto i : indicator_idv) {
+    ni_test += i;
   }
 
   // If subsample number is set, perform a random sub-sampling
@@ -2071,13 +2074,14 @@
 }
 
   // Check ni_test.
-  if (a_mode != M_BSLMM5)
-    enforce_msg(ni_test,"number of analyzed individuals equals 0.");
-  if (ni_test == 0 && a_mode != M_BSLMM5) {
-    error = true;
-    cout << "error! number of analyzed individuals equals 0. " << endl;
+  if (!is_mdb) {
+    if (a_mode != M_BSLMM5)
+      enforce_msg(ni_test,"number of analyzed individuals equals 0.");
+    if (ni_test == 0 && a_mode != M_BSLMM5) {
+      error = true;
+      cout << "error! number of analyzed individuals equals 0. " << endl;
+    }
   }
-
   // Check covariates to see if they are correlated with each
   // other, and to see if the intercept term is included.
   // After getting ni_test.
diff --git a/src/param.h b/src/param.h
index d3ce686..046f543 100644
--- a/src/param.h
+++ b/src/param.h
@@ -160,6 +160,8 @@ public:
   string file_ksnps;   // File SNPs for computing K
   string file_gwasnps; // File SNPs for computing GWAS
 
+  bool is_mdb;
+
   // QC-related parameters.
  double miss_level;
   double maf_level;
@@ -336,17 +338,16 @@ public:
   void CheckParam();
   void CheckData();
   void PrintSummary();
-  void ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K);
-  void ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K,
-                     const bool calc_K);
+  void ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K);
   void CheckCvt();
   void CopyCvt(gsl_matrix *W);
   void CopyA(size_t flag, gsl_matrix *A);
   void CopyGxe(gsl_vector *gxe);
   void CopyWeight(gsl_vector *w);
-  void ProcessCvtPhen();
+  void ProcessInclusionIndicators();
   void CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag);
   void CopyCvtPhen(gsl_matrix *W, gsl_matrix *Y, size_t flag);
+  gsl_matrix *MdbCalcKin();
   void CalcKin(gsl_matrix *matrix_kin);
   void CalcS(const map<string, double> &mapRS2wA,
              const map<string, double> &mapRS2wK, const gsl_matrix *W,
@@ -368,6 +369,9 @@ public:
              const map<string, double> &mapRS2z, gsl_vector *w, gsl_vector *z,
              vector<size_t> &vec_cat);
   void UpdateSNP(const map<string, double> &mapRS2wA);
+
+  inline bool is_loco() { return !loco.empty(); }
+
 };
 
 size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt);
diff --git a/src/vc.cpp b/src/vc.cpp
index 22aaea9..970c556 100644
--- a/src/vc.cpp
+++ b/src/vc.cpp
@@ -1021,7 +1021,7 @@ void ReadFile_cor(const string &file_cor, const vector<string> &vec_rs,
   string rs1, rs2;
   double d1, d2, d3, cor, var1, var2;
-  size_t n_nb, nsamp1, nsamp2, n12, bin_size = 10, bin;
+  size_t n_nb, nsamp1, nsamp2, n12, bin_size = 10, bin = 0;
   vector<vector<double>> mat_S, mat_Svar, mat_tmp;
   vector<double> vec_qvar, vec_tmp;
 
diff --git a/src/version.h b/src/version.h
index 3382003..02d4c09 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1,5 +1,4 @@
 // version.h generated by GEMMA scripts/gen_version_info.sh
-#define GEMMA_VERSION "0.0.1"
-#define GEMMA_DATE "2025-01-04"
+#define GEMMA_VERSION "1.0.0"
+#define GEMMA_DATE "2025-11-22"
 #define GEMMA_YEAR "2025"
-#define GEMMA_PROFILE "/gnu/store/ln160n2kzn791jwgv36yrxlxygjwl9hh-profile"
diff --git a/test/performance/releases.org b/test/performance/releases.org
index b208e54..b9c451d 100644
--- a/test/performance/releases.org
+++ b/test/performance/releases.org
@@ -1,5 +1,108 @@
 * GEMMA performance stats
 
+** GEMMA 1.00-pre1
+
+
+Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
+
+Introducing the mdb genotype format led to a 30% speed increase on the small mouse set:
+
+#+begin_src sh
+real 0m6.403s
+user 0m11.529s
+sys 0m6.325s
+#+end_src
+
+That may not look like much, but we are only starting!
+
+** Picking up the pieces
+
+We are facing a time regression.
+
+#+begin_src sh
+premake5 gmake && make verbose=1 config=release -j 8 gemma && time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g ./example/mouse_hs1940.geno.txt.mdb -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check
+#+end_src
+
+With openblas 0.3.21 we go a bit faster. Still 10% behind though; there is room for tweaking. It may also come down to a new SSD. I want to run some bigger files first.
+
+#+begin_src sh
+Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025
+Reading Files ...
+## number of total individuals = 1940
+## number of analyzed individuals = 1410
+## number of covariates = 1
+## number of phenotypes = 1
+## number of total SNPs/var = 12226
+## number of analyzed SNPs = 10768
+Start Eigen-Decomposition...
+pve estimate =0.608801
+se(pve) =0.032774
+================================================== 100%
+real 0m9.017s
+user 0m13.168s
+sys 0m5.919s
+#+end_src
+
+Before it was
+
+#+begin_src sh
+Pangemma --- GEMMA 0.98.5 compatible executable 1.0.0 (2025-11-22) with guile 3.0.9 by Xiang Zhou, Pjotr Prins and team (C) 2012-2025
+Reading Files ...
+## number of total individuals = 1940
+## number of analyzed individuals = 1410
+## number of covariates = 1
+## number of phenotypes = 1
+## number of total SNPs/var = 12226
+## number of analyzed SNPs = 10768
+Start Eigen-Decomposition...
+pve estimate =0.608801
+se(pve) =0.032774
+================================================== 100%
+real 0m16.772s
+user 0m25.443s
+sys 0m0.901s
+#+end_src
+
+The output looks the same. Good. So far the first difference is a much later openblas (0.3.30 over 0.3.9). In the source code we added checkpoints and more debugging, particularly write statements. I disabled the latter, but still no dice.
+
+When gemma is compiled with the profiler library and the run is prefixed with CPUPROFILE we get:
+
+#+begin_src sh
+premake5 gmake && make verbose=1 config=debug -j 8 gemma && time CPUPROFILE=gemma.prof LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k ./output/result.cXX.txt -lmm -no-check -debug
+CPUPROFILE=gemma.prof
+pprof --text build/bin/Debug/gemma gemma.prof
+
+    1007  49.2%  49.2%     1015  49.6% dot_compute
+      94   4.6%  53.8%       94   4.6% rpcc
+      74   3.6%  57.5%       74   3.6% gsl_vector_div
+      62   3.0%  60.5%       92   4.5% ____strtod_l_internal
+      42   2.1%  62.5%       42   2.1% dgemm_kernel_ZEN
+#+end_src
+
+This led me to try the newer openblas on the older gemma - and indeed, the regression comes from the openblas version. Even though it says 'OpenBLAS 0.3.30 DYNAMIC_ARCH NO_AFFINITY Zen MAX_THREADS=128', I suspect the dynamic arch is not really optimizing.
+
+Well, at least I found the problem. Time for a special openblas build like I used to do.
+
+
+*** Bigger run
+
+We translate this 10GB (gzip compressed) job from our pangenome precompute
+
+#+begin_src sh
+/bin/gemma -loco 3 -k /export2/data/wrk/services/gemma-wrapper/tmp/tmp/panlmm/93f6b39ec06c09fb9ba9ca628b5fb990921b6c60.3.cXX.txt.cXX.txt -o a3248cec40b3fe6b9e8672352b3ab2d7280c426c.3.assoc.txt -p pheno.json.txt -g pangenome-13M-genotypes.txt -a snps-matched.txt -lmm 9 -maf 0.1 -n 2 -outdir /export2/data/wrk/services/gemma-wrapper/tmp/tmp/panlmm/d20251126-4190721-c8bbo8
+#+end_src
+
+to
+
+#+begin_src sh
+time LD_LIBRARY_PATH=$GUIX_ENVIRONMENT/lib ./build/bin/Release/gemma -g tmp/pangenome-13M-genotypes.txt -p tmp/pheno.json.txt -n 1 -a tmp/snps-matched.txt -k tmp/93f6b39ec06c09fb9ba9ca628b5fb990921b6c60.3.cXX.txt.cXX.txt -lmm 9 -no-check
+real 20m4.687s
+user 23m42.508s
+sys 9m51.929s
+#+end_src
+
+On my AMD Ryzen 7 3700X it uses about 10GB of RAM. With the -debug switch it clapped out because of sqrt(NaN). There is a lot that can be gained with better IO and multi-core use.
+
 
 ** GEMMA 0.98.5-pre1
 
 Measurements taken on a recent AMD Ryzen 7 3700X 8-Core Processor @2.195GHz.
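A side note from the editor on the profiling session above: instead of sampling the whole run via the CPUPROFILE environment variable, gperftools can be scoped to one region programmatically. This is a sketch, not part of the patch; `run_lmm_analysis` is a hypothetical stand-in for the GWA loop, and the binary must be linked with -lprofiler.

```cpp
// Editor's sketch: profile only one region with gperftools' C API.
#include <cstdio>
#include <gperftools/profiler.h>

// Hypothetical stand-in for the hot LMM loop (dot_compute and friends).
static void run_lmm_analysis() {
  double sum = 0.0;
  for (long i = 1; i < 50000000; ++i) sum += 1.0 / (double)i;  // busy work
  std::printf("busy work result: %f\n", sum);
}

int main() {
  ProfilerStart("gemma-lmm.prof");  // samples are written to this file
  run_lmm_analysis();
  ProfilerStop();
  return 0;
}
```

The resulting profile is inspected the same way as above, e.g. `pprof --text ./gemma gemma-lmm.prof`.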
diff --git a/test/runner b/test/runner new file mode 100755 index 0000000..5002d80 --- /dev/null +++ b/test/runner @@ -0,0 +1,18 @@ +#!/bin/sh +# -*- mode: scheme; -*- +exec guile --debug -s "$0" "$@" +!# + +(define-module (test-runner) + #:use-module (ice-9 match) + #:use-module (srfi srfi-1) ; for last + #:use-module (srfi srfi-13) + #:use-module (srfi srfi-64) ; for tests + #:use-module (ice-9 rdelim) + ) + +(test-begin "all-tests") + +(load "test-uvlmm-integration.scm") + +(test-end "all-tests") diff --git a/test/test-mdb-integration.scm b/test/test-mdb-integration.scm new file mode 100755 index 0000000..006c241 --- /dev/null +++ b/test/test-mdb-integration.scm @@ -0,0 +1,51 @@ +#!/bin/sh +# -*- mode: scheme; -*- +exec guile --debug -s "$0" "$@" +!# + +(define-module (test-runner) + #:use-module (ice-9 match) + #:use-module (srfi srfi-1) ; for last + #:use-module (srfi srfi-13) + #:use-module (srfi srfi-64) ; for tests + #:use-module (ice-9 rdelim) + ) + +(define kinship-fn "./output/mouse_hs1940.cXX.txt") +(define gwa-fn "./output/mouse_hs1940.assoc.txt") + +(test-begin "uvlmm-mdb-kinship-run") + +(when (file-exists? kinship-fn) + (delete-file kinship-fn)) +(let [(err (system "./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.mdb -p ./example/mouse_hs1940.pheno.txt -gk -o mouse_hs1940 -debug"))] + (test-eqv 0 err)) + +(test-end "uvlmm-mdb-kinship-run") + +(test-begin "uvlmm-mdb-gwa-run") + +(when (file-exists? gwa-fn) + (delete-file gwa-fn)) +;; The following integration test runs gemma uvlmm and adds up the output column as a check. +;; It uses the kinship-run matrix from the earlier test +(let [(err (system (string-append "./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.mdb -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k " kinship-fn " -o mouse_hs1940 -lmm 9 -debug")))] + (test-eqv 0 err)) +(call-with-input-file gwa-fn + (lambda (port) + (read-line port) ; skip first line + (let* ((fields (string-split (read-line port) #\tab)) + (last-field (last fields))) + (test-eqv 208.0 (truncate (* 1000 (string->number last-field))))) + (test-eqv 5720672.0 + (let loop ((line (read-line port)) + (sum 208.0)) + (if (eof-object? line) + sum + (let* ((fields (string-split line #\tab)) + (last-field (last fields)) + (value (string->number last-field))) + (loop (read-line port) + (+ sum (truncate (* 1000 value)))))))))) + +(test-end "uvlmm-mdb-gwa-run") diff --git a/test/test-uvlmm-integration.scm b/test/test-uvlmm-integration.scm new file mode 100755 index 0000000..91eb14a --- /dev/null +++ b/test/test-uvlmm-integration.scm @@ -0,0 +1,52 @@ +#!/bin/sh +# -*- mode: scheme; -*- +exec guile --debug -s "$0" "$@" +!# + +(define-module (test-runner) + #:use-module (ice-9 match) + #:use-module (srfi srfi-1) ; for last + #:use-module (srfi srfi-13) + #:use-module (srfi srfi-64) ; for tests + #:use-module (ice-9 rdelim) + ) + +(define kinship-fn "./output/mouse_hs1940.cXX.txt") +(define gwa-fn "./output/mouse_hs1940.assoc.txt") + +(test-begin "uvlmm-bimbam-kinship-run") + +(when (file-exists? kinship-fn) + (delete-file kinship-fn)) +(let [(err (system "./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz -gk -p ./example/mouse_hs1940.pheno.txt -o mouse_hs1940 -debug"))] + (test-eqv 0 err)) + +(test-end "uvlmm-bimbam-kinship-run") + + +(test-begin "uvlmm-bimbam-gwa-run") + +(when (file-exists? gwa-fn) + (delete-file gwa-fn)) +;; The following integration test runs gemma uvlmm and adds up the output column as a check. 
+;; It uses the kinship-run matrix from the earlier test +(let [(err (system (string-append "./build/bin/Debug/gemma -g ./example/mouse_hs1940.geno.txt.gz -p ./example/mouse_hs1940.pheno.txt -n 1 -a ./example/mouse_hs1940.anno.txt -k " kinship-fn " -o mouse_hs1940 -lmm 9 -debug")))] + (test-eqv 0 err)) +(call-with-input-file gwa-fn + (lambda (port) + (read-line port) ; skip first line + (let* ((fields (string-split (read-line port) #\tab)) + (last-field (last fields))) + (test-eqv 208.0 (truncate (* 1000 (string->number last-field))))) + (test-eqv 5720672.0 + (let loop ((line (read-line port)) + (sum 208.0)) + (if (eof-object? line) + sum + (let* ((fields (string-split line #\tab)) + (last-field (last fields)) + (value (string->number last-field))) + (loop (read-line port) + (+ sum (truncate (* 1000 value)))))))))) + +(test-end "uvlmm-bimbam-gwa-run") |
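For readers who do not speak Guile: the checksum both integration tests compute is simple. Skip the header of the `.assoc.txt` output, take the last tab-separated field of every row, and accumulate trunc(1000 * value). An editor's sketch of the same check in C++ (file name and usage are illustrative only, not part of the patch):

```cpp
// Editor's sketch of the column checksum used by the Guile integration tests.
#include <cmath>
#include <fstream>
#include <iostream>
#include <string>

int main() {
  std::ifstream in("output/mouse_hs1940.assoc.txt");
  if (!in) { std::cerr << "cannot open assoc file\n"; return 1; }

  std::string line;
  std::getline(in, line);                    // skip the header row
  double sum = 0.0;
  while (std::getline(in, line)) {
    auto pos = line.find_last_of('\t');
    std::string last = (pos == std::string::npos) ? line : line.substr(pos + 1);
    sum += std::trunc(1000.0 * std::stod(last));  // last column, scaled and truncated
  }
  std::cout << "checksum = " << sum << std::endl;  // compare against a stored value
  return 0;
}
```

A stable total such as the 5720672.0 used above then acts as a cheap regression check on the whole association column.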
