diff options
42 files changed, 2547 insertions, 3462 deletions
@@ -1,6 +1,7 @@ *.o *.tar.gz src/Eigen +src/version.h example/output test/output ./output diff --git a/.travis.yml b/.travis.yml index ec2d049..ffd674f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,46 +1,52 @@ language: C++ -compiler: gcc matrix: + # OSX testing is under development + # allow_failures: + # - os: osx include: - os: linux + compiler: gcc addons: apt: sources: - ubuntu-toolchain-r-test packages: + # Our dev environment is a more recent GNU C++ and GSL2 - g++-4.9 + - libopenblas-dev + - zlib1g-dev + - libeigen3-dev + - libgsl0-dev + - liblapack-dev + # - gfortran-dev for static env: - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" - - os: linux - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-6 + - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9 && EIGEN_INCLUDE_PATH=/usr/include/eigen3" + - os: osx + compiler: clang env: - - MATRIX_EVAL="CC=gcc-6 && CXX=g++-6" + - MATRIX_EVAL="EIGEN_INCLUDE_PATH=/usr/local/include/eigen3" +# - os: linux +# addons: +# apt: +# sources: +# - ubuntu-toolchain-r-test +# packages: +# - g++-6 +# env: +# - MATRIX_EVAL="CC=gcc-6 && CXX=g++-6" before_install: - - sudo apt-get -qq update - - sudo apt-get install -y libopenblas-dev zlib1g-dev - - sudo apt-get install -y libeigen3-dev - - sudo apt-get install -y libgsl0-dev - - sudo apt-get install -y liblapack-dev - # for the static release version we need the following - # - sudo apt-get install -y gfortran-dev - - dpkg -l - - eval "${MATRIX_EVAL}" - - $CXX --version + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew cask uninstall oclint && brew update && brew install gsl openblas zlib eigen lapack ; fi script: + - echo $MATRIX_EVAL - eval "${MATRIX_EVAL}" - $CXX --version # build and test debug version - - make CXX=$CXX WITH_OPENBLAS=1 -j 4 - - time make CXX=$CXX WITH_OPENBLAS=1 check - - make clean - # build and test release version - - make CXX=$CXX FORCE_DYNAMIC=1 WITH_OPENBLAS=1 -j 4 - - time make CXX=$CXX WITH_OPENBLAS=1 DEBUG= check + - make CXX=$CXX 
EIGEN_INCLUDE_PATH=$EIGEN_INCLUDE_PATH WITH_LAPACK=1 OPENBLAS_LEGACY=1 WITH_GSLCBLAS=1 -j 4 -k + - time make CXX=$CXX EIGEN_INCLUDE_PATH=$EIGEN_INCLUDE_PATH WITH_LAPACK=1 OPENBLAS_LEGACY=1 WITH_GSLCBLAS=1 check + # - make clean + # build and test release version (integration test mostly) + # - make CXX=$CXX EIGEN_INCLUDE_PATH=$EIGEN_INCLUDE_PATH DEBUG= FORCE_DYNAMIC=1 WITH_OPENBLAS=1 OPENBLAS_LEGACY=1 -j 4 + # - time make CXX=$CXX DEBUG= WITH_OPENBLAS=1 fast-check # build static release (fast-check only) # - make clean # - make CXX=$CXX TRAVIS_CI=1 -j 4 fast-check @@ -14,7 +14,7 @@ GEMMA runs on Linux and MAC OSX and the runtime has the following dependencies: * C++ tool chain >= 4.9 -* GNU Science library (GSL) 1.x (GEMMA does not currently work with GSL >= 2). +* GNU Science library (GSL) 1.x (note that 2.x is not yet supported) * blas/openblas * lapack * [Eigen3 library](http://eigen.tuxfamily.org/dox/) @@ -65,12 +65,12 @@ if you get an Eigen error you may need to override the include path. E.g. to build GEMMA on GNU Guix with shared libs the following may work - make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 WITH_OPENBLAS=1 + make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 another example overriding optimization and LIB flags (so as to link against gslv1) would be - make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 WITH_OPENBLAS=1 GCC_FLAGS="-Wall" LIBS="$HOME/opt/gsl1/lib/libgsl.a $HOME/opt/gsl1/lib/libgslcblas.a -L$HOME/.guix-profile/lib -pthread -llapack -lblas -lz" + make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 GCC_FLAGS="-Wall -isystem/$HOME/opt/gsl1/include" LIBS="$HOME/opt/gsl1/lib/libgsl.a $HOME/opt/gsl1/lib/libgslcblas.a -L$HOME/.guix-profile/lib -pthread -llapack -lblas -lz" to run GEMMA tests @@ -86,7 +86,10 @@ You can run gemma in the debugger with, for example Note that if you get <optimized out> warnings on inspecting variables you should compile with GCC_FLAGS="" to disable optimizations (-O3). E.g. 
- make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 WITH_OPENBLAS=1 GCC_FLAGS= + make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 GCC_FLAGS= + +If you get older OpenBlas errors you may need to add +OPENBLAS_LEGACY=1. Other options, such as compiling with warnings, are listed in the Makefile. @@ -100,3 +103,72 @@ GEMMA includes the shunit2 test framework (version 2.0). or ./run_tests.sh + +## Optimizing performance + +### OpenBlas + +Linking against a built-from-source OpenBlas is a first optimization +step because it will optimize code for the local architecture (on my +workstation it easily doubles speed). When you check the output .log +file of GEMMA after a run, it will tell you how the linked-in OpenBlas +was compiled. + +It is worth checking that you use OpenBlas's lapack and cblas +interfaces instead of linking against default lapack and gslcblas +libs. + +To link a new version, compile OpenBlas as per +[instructions](http://www.openblas.net/). You can start with the +default: + + make + +and/or play with the switches (listed in OpenBlas Makefile.rule) + + make BINARY=64 NO_WARMUP=0 GEMM_MULTITHREAD_THRESHOLD=4 USE_THREAD=1 NO_AFFINITY=0 NO_LAPACK=1 NUM_THREADS=64 NO_SHARED=1 + +and you should see something like + + OpenBLAS build complete. (BLAS CBLAS LAPACK LAPACKE) + + OS ... Linux + Architecture ... x86_64 + BINARY ... 64bit + C compiler ... GCC (command line : gcc) + Fortran compiler ... GFORTRAN (command line : gfortran) + Library Name ... libopenblas_haswellp-r0.3.0.dev.a (Multi threaded; Max num-threads is 64) + +Note that OpenBlas by default uses a 32-bit integer API which can +overflow with large matrix sizes. We don't include LAPACK - the +OpenBlas version gives problems around eigenvalues for some reason. 
+ +We now have a static library which you can link using the full path +with using the GEMMA Makefile: + + time env OPENBLAS_NUM_THREADS=4 make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 LIBS="~/tmp/OpenBLAS/libopenblas_haswellp-r0.3.0.dev.a -lgsl -pthread -lz" -j 4 unittests + +Latest (INT64, no gslcblas): + + time env OPENBLAS_NUM_THREADS=4 make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 LIBS="~/opt/gsl2/lib/libgsl.a ~/tmp/OpenBLAS/libopenblas_haswellp-r0.3.0.dev.a -pthread -lz -llapack" OPENBLAS_INCLUDE_PATH=~/tmp/OpenBLAS/ -j 4 fast-check + + +### OpenBlas 64-bit API + +<i>Warning: This is work in progress (WIP)</i> + +OpenBlas supports a 64-bit API which allows for large matrices. Unfortunately +GEMMA does not support it yet, see https://github.com/genetics-statistics/GEMMA/issues/120 + +For testing we can build + + make BINARY=64 INTERFACE64=1 NO_WARMUP=1 USE_THREAD=0 NO_LAPACK=0 NO_SHARED=1 -j 4 + +This builds a 64-bit binary and API and no external LAPACK. This is a very conservative +setting for testing the 64-bit API. + +Note, for performance we want a 64-bit binary with threading. + + make EIGEN_INCLUDE_PATH=~/.guix-profile/include/eigen3 LIBS="~/opt/gsl2/lib/libgsl.a ~/tmp/OpenBLAS/libopenblas_haswell-r0.3.0.dev.a ~/.guix-profile/lib/libgfortran.a ~/.guix-profile/lib/libquadmath.a -pthread -lz" OPENBLAS_INCLUDE_PATH=~/tmp/OpenBLAS/ -j 4 fast-check + +Note we don't include standard lapack, because it is 32-bits. 
@@ -10,13 +10,13 @@ # # Examples: # -# Make GEMMA on Linux with OPENBLAS support: +# Make GEMMA on Linux without OPENBLAS support: # -# make WITH_OPENBLAS=1 +# make WITH_OPENBLAS= # # Disable debug info and checks (slightly faster release mode) # -# make WITH_OPENBLAS=1 DEBUG= +# make DEBUG= # # Force static compilation # @@ -26,22 +26,35 @@ # # make check # +# Run quick (development) tests with +# +# make fast-check +# +# Run full (lengthy) tests with +# +# make check-all +# # See also the INSTALL.md document in the source tree at # # https://github.com/genetics-statistics/GEMMA/blob/master/INSTALL.md +GEMMA_VERSION = $(shell cat ./VERSION) + # Set this variable to either LNX or MAC SYS = LNX # LNX|MAC (Linux is the default) # Leave blank after "=" to disable; put "= 1" to enable -DIST_NAME = gemma-0.97.3 -DEBUG = 1 # DEBUG mode, set DEBUG=0 for a release +DIST_NAME = gemma-$(GEMMA_VERSION) +DEBUG = 1 # DEBUG mode, set DEBUG=0 for a release SHOW_COMPILER_WARNINGS = -WITH_LAPACK = 1 -WITH_OPENBLAS = # Defaults to LAPACK - OPENBLAS may be faster -FORCE_STATIC = # Static linking of libraries -GCC_FLAGS = -O3 # extra flags -Wl,--allow-multiple-definition -TRAVIS_CI = # used by TRAVIS for testing -EIGEN_INCLUDE_PATH=/usr/include/eigen3 +WITH_OPENBLAS = 1 # Without OpenBlas uses LAPACK +WITH_LAPACK = # Force linking LAPACK (if OpenBlas lacks it) +WITH_GSLCBLAS = # Force linking gslcblas (if OpenBlas lacks it) +OPENBLAS_LEGACY = # Using older OpenBlas +FORCE_STATIC = # Static linking of libraries +GCC_FLAGS = -Wall -O3 -std=gnu++11 # extra flags -Wl,--allow-multiple-definition +TRAVIS_CI = # used by TRAVIS for testing +EIGEN_INCLUDE_PATH = /usr/include/eigen3 +OPENBLAS_INCLUDE_PATH = /usr/local/opt/openblas/include # -------------------------------------------------------------------- # Edit below this line with caution @@ -58,15 +71,26 @@ else CPP = g++ endif -ifdef OPENBLAS - WITH_LAPACK = # OPENBLAS usually includes LAPACK +ifeq ($(CPP), clang++) + # macOS Homebrew 
settings (as used on Travis-CI) + GCC_FLAGS=-O3 -std=c++11 -stdlib=libc++ -isystem/$(OPENBLAS_INCLUDE_PATH) -isystem//usr/local/include/eigen3 -Wl,-L/usr/local/opt/openblas/lib +endif + +ifdef WITH_OPENBLAS + OPENBLAS=1 + # WITH_LAPACK = # OPENBLAS usually includes LAPACK + CPPFLAGS += -DOPENBLAS -isystem/$(OPENBLAS_INCLUDE_PATH) + ifdef OPENBLAS_LEGACY + # Legacy version (mostly for Travis-CI) + CPPFLAGS += -DOPENBLAS_LEGACY + endif endif ifdef DEBUG - CPPFLAGS = -g $(GCC_FLAGS) -std=gnu++11 -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc + CPPFLAGS += -g $(GCC_FLAGS) -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc else # release mode - CPPFLAGS = -DNDEBUG $(GCC_FLAGS) -std=gnu++11 -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc + CPPFLAGS += -DNDEBUG $(GCC_FLAGS) -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc endif ifdef SHOW_COMPILER_WARNINGS @@ -74,27 +98,30 @@ ifdef SHOW_COMPILER_WARNINGS endif ifndef FORCE_STATIC - LIBS = -lgsl -lgslcblas -pthread -lz + LIBS = -lgsl -lopenblas -pthread -lz + ifdef WITH_GSLCBLAS + LIBS += -lgslcblas + else + LIBS += -lgfortran -lquadmath + endif else ifndef TRAVIS_CI # Travis static compile we cheat a little CPPFLAGS += -static endif endif -OUTPUT = $(BIN_DIR)/gemma - -SOURCES = $(SRC_DIR)/main.cpp +.PHONY: all -HDR = +OUTPUT = $(BIN_DIR)/gemma # Detailed libary paths, D for dynamic and S for static -LIBS_LNX_D_LAPACK = -llapack -LIBS_LNX_D_BLAS = -lblas -LIBS_LNX_D_OPENBLAS = -lopenblas +ifdef WITH_LAPACK + LIBS_LNX_D_LAPACK = -llapack +endif LIBS_MAC_D_LAPACK = -framework Accelerate # LIBS_LNX_S_LAPACK = /usr/lib/libgsl.a /usr/lib/libgslcblas.a /usr/lib/lapack/liblapack.a -lz -LIBS_LNX_S_LAPACK = /usr/lib/lapack/liblapack.a -lgfortran /usr/lib/atlas-base/libatlas.a /usr/lib/libblas/libblas.a -Wl,--allow-multiple-definition +# LIBS_LNX_S_LAPACK = /usr/lib/lapack/liblapack.a -lgfortran /usr/lib/atlas-base/libatlas.a /usr/lib/libblas/libblas.a -Wl,--allow-multiple-definition ifdef 
WITH_LAPACK ifeq ($(SYS), MAC) @@ -102,7 +129,7 @@ ifdef WITH_LAPACK else ifndef FORCE_STATIC ifdef WITH_OPENBLAS - LIBS += $(LIBS_LNX_D_OPENBLAS) + LIBS += -lopenblas else LIBS += $(LIBS_LNX_D_BLAS) endif @@ -113,7 +140,7 @@ ifdef WITH_LAPACK endif endif -HDR = $(wildcard src/*.h) +HDR = $(wildcard src/*.h) ./src/version.h SOURCES = $(wildcard src/*.cpp) # all @@ -121,17 +148,20 @@ OBJS = $(SOURCES:.cpp=.o) all: $(OUTPUT) +./src/version.h: + ./scripts/gen_version_info.sh > src/version.h + $(OUTPUT): $(OBJS) $(CPP) $(CPPFLAGS) $(OBJS) $(LIBS) -o $(OUTPUT) -$(OBJS) : $(HDR) +$(OBJS): $(HDR) -.cpp.o: - $(CPP) $(CPPFLAGS) $(HEADERS) -c $*.cpp -o $*.o .SUFFIXES : .cpp .c .o $(SUFFIXES) -unittests: all contrib/catch-1.9.7/catch.hpp $(TEST_SRC_DIR)/unittests-main.o $(TEST_SRC_DIR)/unittests-math.o +./bin/unittests-gemma: contrib/catch-1.9.7/catch.hpp $(TEST_SRC_DIR)/unittests-main.o $(TEST_SRC_DIR)/unittests-math.o $(OBJS) $(CPP) $(CPPFLAGS) $(TEST_SRC_DIR)/unittests-main.o $(TEST_SRC_DIR)/unittests-math.o $(filter-out src/main.o, $(OBJS)) $(LIBS) -o ./bin/unittests-gemma + +unittests: ./bin/unittests-gemma ./bin/unittests-gemma fast-check: all unittests @@ -154,16 +184,18 @@ check: fast-check slow-check check-all: check lengthy-check clean: + rm $(SRC_DIR)/version.h rm -vf $(SRC_DIR)/*.o rm -vf $(SRC_DIR)/*~ rm -vf $(TEST_SRC_DIR)/*.o rm -vf $(OUTPUT) rm -vf ./bin/unittests-gemma -DIST_COMMON = COPYING.txt README.txt Makefile +DIST_COMMON = *.md LICENSE VERSION Makefile DIST_SUBDIRS = src doc example bin -tar: +tar: version all + @echo "Creating $(DIST_NAME)" mkdir -p ./$(DIST_NAME) cp $(DIST_COMMON) ./$(DIST_NAME)/ cp -r $(DIST_SUBDIRS) ./$(DIST_NAME)/ diff --git a/Makefile.macosx b/Makefile.macosx index d2c1d90..fa7460e 100644 --- a/Makefile.macosx +++ b/Makefile.macosx @@ -30,16 +30,19 @@ # # https://github.com/genetics-statistics/GEMMA/blob/master/INSTALL.md +GEMMA_VERSION = $(shell cat ./VERSION) + # Set this variable to either LNX or MAC SYS = MAC # LNX|MAC (Linux 
is the default) # Leave blank after "=" to disable; put "= 1" to enable -DIST_NAME = gemma-0.97.2 +DIST_NAME = gemma-$(GEMMA_VERSION) DEBUG = # DEBUG mode, set DEBUG= for a release SHOW_COMPILER_WARNINGS = WITH_LAPACK = 1 WITH_OPENBLAS = # Defaults to LAPACK - OPENBLAS may be faster FORCE_STATIC = # Static linking of libraries -GCC_FLAGS = -O3 -I/usr/local/Cellar/gsl/2.4/include -I./eigen +GCC_FLAGS = -O3 -I/usr/local/Cellar/gsl/2.4/include -I./eigen \ + -I/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers TRAVIS_CI = # used by TRAVIS for testing EIGEN_INCLUDE_PATH= CXX = g++ @@ -86,7 +89,7 @@ LIBS += -framework Accelerate \ /usr/local/Cellar/gsl/2.4/lib/libgsl.a \ /usr/local/Cellar/gsl/2.4/lib/libgslcblas.a -HDR = $(wildcard src/*.h) +HDR = $(wildcard src/*.h) ./src/version.h SOURCES = $(wildcard src/*.cpp) # all @@ -94,6 +97,9 @@ OBJS = $(SOURCES:.cpp=.o) all: $(OUTPUT) +./src/version.h: + ./scripts/gen_version_info.sh > src/version.h + <print-% : ; @echo $* = $($*) $(OUTPUT): $(OBJS) @@ -129,6 +135,7 @@ check: fast-check slow-check check-all: check lengthy-check clean: + rm -vf $(SRC_DIR)/version.h rm -vf $(SRC_DIR)/*.o rm -vf $(SRC_DIR)/*~ rm -vf $(TEST_SRC_DIR)/*.o diff --git a/NEWS.md b/NEWS.md deleted file mode 100644 index 19d81d9..0000000 --- a/NEWS.md +++ /dev/null @@ -1,45 +0,0 @@ -## GEMMA 0.96.0 - -+ First stable release. - -## GEMMA 0.95.2 - -+ Resolved Issue #36. - -## GEMMA 0.95.1 - -+ Created first release of GEMMA 0.95a following request in Issue #33. - -## GEMMA 0.94.1 - -+ Fixed a bug (the predict option for multiple phenotype imputation -was not recoginzed with PLINK files). - -## GEMMA 0.94.0 - -+ Implemented the multivariate linear mixed model. - -## GEMMA 0.93 - -+ Implemented the Bayesian sparse linear mixed model. - -## GEMMA 0.92 - -+ Fixed a few typos. - -+ Now allows for missing values in the covariates file. - -+ Included REMLE estimate for lambda in the output .log file. 
- -+ Added small GWAS example dataset - -+ Added detailed user manual. - -## GEMMA 0.91 - -+ Fixed a bug (BIMBAM annotation file not recognized). - -## GEMMA 0.90 - -+ Initial pre-release. - @@ -19,7 +19,8 @@ also encourage contributions, for example, by forking the repository, making your changes to the code, and issuing a pull request. Currently, GEMMA is supported for 64-bit Mac OS X and Linux -platforms. *Windows is not currently supported.* If you are interested +platforms. *Windows is not currently supported.* though you can +run GEMMA in a Linux VM or [container](https://docs.docker.com/docker-for-windows/). If you are interested in helping to make GEMMA available on Windows platforms (e.g., by providing installation instructions for Windows, or by contributing Windows binaries) please post a note in the @@ -52,13 +53,46 @@ algorithm can be used to estimate variance components when individual-level data are available. For summary data, GEMMA uses the MQS algorithm to estimate variance components. -## Quick start +## Installation + +To install GEMMA you can + +1. Download the precompiled binaries (64-bit Linux and Mac only) + +2. Use existing package managers, see [INSTALL.md](INSTALL.md). + +3. Compile GEMMA from source, see [INSTALL.md](INSTALL.md). + +Compiling from source takes more work, but can potentially boost +performance of GEMMA when using specialized C++ compilers and +numerical libraries. + +### Precompiled binaries + +1. Fetch the [latest stable release][latest_release] and download the + file appropriate for your platform. + +2. For .tar.bz2 files unpack the tar ball + + tar xvjf gemma-$version-installer.tar.bz2 + + run the installer + + ./install.sh ~/gemma + + and run gemma + + ~/gemma/bin/gemma + +3. For .gz files run `gunzip gemma.linux.gz` or `gunzip +gemma.linux.gz` to unpack the file. + -1. Download and install the software. See [INSTALL.md](INSTALL.md). +## Quick start -2. Work through the demo. *Give more details here.* +1. 
Work through the demo. *Give more details here.* -3. Read the manual and run `gemma -h`. *Give more details here.* +2. Read the manual and run `gemma -h`. *Give more details here.* ## Citing GEMMA @@ -92,7 +126,7 @@ studies.](https://doi.org/10.1101/042846) *Annals of Applied Statistics*, in pre ## License -Copyright (C) 2012–2017, Xiang Zhou. +Copyright (C) 2012–2017, Xiang Zhou and team. The *GEMMA* source code repository is free software: you can redistribute it under the terms of the @@ -103,10 +137,10 @@ warranty**; without even the implied warranty of **merchantability or fitness for a particular purpose**. See file [LICENSE](LICENSE) for the full text of the license. -The source code for the -[shUnit2](https://github.com/genenetwork/shunit2) unit testing -framework, included in this repository [here](contrib/shunit2-2.0.3), is -distributed under the +Both the source code for the +[gzstream zlib wrapper](http://www.cs.unc.edu/Research/compgeom/gzstream/) +and [shUnit2](https://github.com/genenetwork/shunit2) unit testing +framework included in GEMMA are distributed under the [GNU Lesser General Public License](contrib/shunit2-2.0.3/doc/LGPL-2.1), either version 2.1 of the License, or (at your option) any later revision. @@ -115,64 +149,59 @@ The source code for the included [Catch](http://catch-lib.net) unit testing framework is distributed under the [Boost Software Licence version 1](https://github.com/philsquared/Catch/blob/master/LICENSE.txt). -## What's included - -This is the current structure of the GEMMA source repository: - -``` -├── LICENSE -├── Makefile -├── NEWS.md -├── README.md -├── bin -├── doc -├── example -└── src -``` +### Optimizing performance -*Write a paragraph here briefly explaining what is in each of the -subfolders; see Wilson et al "Good Enough Practices" paper for example -of this.* - -## Setup - -To install GEMMA you can - -1. 
Download the precompiled binaries (64-bit Linux and Mac only), see - [latest stable release][latest_release]. - -2. Use existing package managers, see [INSTALL.md](INSTALL.md). - -3. Compile GEMMA from source, see [INSTALL.md](INSTALL.md). - -Compiling from source takes more work, but can boost performance of -GEMMA when using specialized C++ compilers and numerical libraries. - -Source code and [latest stable release][latest_release] are available -from the Github repository. - -### Precompiled binaries - -1. Fetch the [latest stable release][latest_release] and download the -file appropriate for your platform: `gemma.linux.gz` for Linux, or -`gemma.macosx.gz` for Mac OS X. - -2. Run `gunzip gemma.linux.gz` or `gunzip gemma.linux.gz` to -unpack the file. - -3. Downloadable binaries are linked to static versions of the GSL, -LAPACK and BLAS libraries. There is no need to install these -libraries. +### Optimizing performance +Precompiled binaries and libraries may not be optimal for your particular +hardware. See [INSTALL.md](INSTALL.md) for speed-up tips. ### Building from source -*Note that GEMMA currently does not work with GSL 2.x. We recommend -linking to the latest version of GSL 1.x, which is GSL 1.16 as of this -writing.* - More information on source code, dependencies and installation can be found in [INSTALL.md](INSTALL.md). +## Reporting a GEMMA bug or issue + +For bugs GEMMA has an +[issue tracker](https://github.com/genetics-statistics/GEMMA/issues) +on github. For general support GEMMA has a mailing list at +[gemma-discussion](https://groups.google.com/forum/#!forum/gemma-discussion) + +Before posting an issue search the issue tracker and mailing list +first. It is likely someone may have encountered something +similar. Also try running the latest version of GEMMA to make sure it +has not been fixed already. Support/installation questions should be +aimed at the mailing list. The issue tracker is for development issues +around the software itself.
When reporting an issue include the output +of the program and the contents of the .log.txt file in the output +directory. + +### Check list: + +1. [X] I have found an issue with GEMMA +2. [ ] I have searched for it on the [issue tracker](https://github.com/genetics-statistics/GEMMA/issues?q=is%3Aissue) (incl. closed issues) +3. [ ] I have searched for it on the [mailing list](https://groups.google.com/forum/#!forum/gemma-discussion) +4. [ ] I have tried the latest [release](https://github.com/genetics-statistics/GEMMA/releases) of GEMMA +5. [ ] I have read and agreed to the code of conduct below +6. [ ] If it is a support/install question I have posted it to the [mailing list](https://groups.google.com/forum/#!forum/gemma-discussion) +7. [ ] If it is software development related I have posted a new issue on the [issue tracker](https://github.com/genetics-statistics/GEMMA/issues) or added to an existing one +8. [ ] In the message I have included the output of my GEMMA run +9. [ ] In the message I have included the relevant .log.txt file in the output directory +10. [ ] I have made available the data to reproduce the problem (optional) + +To find bugs the GEMMA software developers may ask to install a +development version of the software. They may also ask you for your +data and will treat it confidentially. Please always remember that +GEMMA is written and maintained by volunteers with good +intentions. Our time is valuable too. By helping us as much as +possible we can provide this tool for everyone to use. + +## Code of conduct + +By using GEMMA and communicating with its community you implicitly +agree to abide by the +[code of conduct](https://software-carpentry.org/conduct/) as +published by the Software Carpentry initiative. + ## Credits The *GEMMA* software was developed by: @@ -182,7 +211,8 @@ Dept.
of Biostatistics<br> University of Michigan<br> 2012-2017 -Peter Carbonetto, Tim Flutre, Matthew Stephens, Pjotr Prins and others -have also contributed to the development of this software. +Peter Carbonetto, Tim Flutre, Matthew Stephens, +[Pjotr Prins](http://thebird.nl/) and others have also contributed to +the development of this software. [latest_release]: https://github.com/genetics-statistics/GEMMA/releases "Most recent stable releases" diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md new file mode 100644 index 0000000..de7f3b5 --- /dev/null +++ b/RELEASE-NOTES.md @@ -0,0 +1,101 @@ +## ChangeLog v0.97 (2017/12/19) + +This is a massive bug fix release with many improvements. For contributions +see +[contributors](https://github.com/genetics-statistics/GEMMA/graphs/contributors) +and +[commits](https://github.com/genetics-statistics/GEMMA/commits/master). + +### Speedup of GEMMA by using optimized OpenBlas + +* Providing a binary release with OpenBlas optimization for Intel Haswell +* Dropped using standard lapack and gslcblas libs +* Fixed NaN bug with GSL2 and made recent libraries the default +* Minimized use of Eigenlib libraries (single threaded and slow compilation) +* -legacy switch provides v0.96 behaviour (incl. eigenlib) + +### Added Leave One Chromosome Out (LOCO) support for Bimbam (K and LMM) + +* See 449d882a3b33ef81ef4f0127c3932b01fa796dbb +* -snps [filename] option allows selecting a subset of SNPs for analysis +* -loco [chr] option for K and LMM computations +* added [gemma-wrapper](https://github.com/genetics-statistics/gemma-wrapper) to make using LOCO easy +* LOCO examples in https://github.com/genetics-statistics/GEMMA/blob/master/test/dev_test_suite.sh + +### Added checks for matrices + +* #72 and #45 implements + 1. Fail if K has negative eigen values + 2. Fail if K is not symmetric + 3. Fail if K is not positive definite + 4. Warn if eigen values are very small + 5.
Warn if K is ill conditioned +* Check for NaN values + +### Added test framework and unit tests + +* Added integration and unit tests, as well as + [Travis-CI](https://travis-ci.org/genenetwork/GEMMA) support +* Improved debug information and testing of input files + +### Other + +* #81 printing out beta and se(beta) under -lmm 2 as well as logl_H1 +* Improved README and INSTALL docs +* Added support info and code of conduct +* Reformatted the full source tree with 3935ba39d30666dd7d4a831155631847c77b70c4 +* Merged LMM computation for Plink and Bimbam formats +* Fixed progressbar issues +* #46 removed support for Oxford format +* Got rid of all compiler warnings +* Updated copyright banner, info and license information for included software +* Started a [discussion list](https://groups.google.com/forum/#!forum/gemma-discussion) + +See also [commits](https://github.com/genetics-statistics/GEMMA/commits/master). + +## GEMMA 0.96.0 + ++ First stable release. + +## GEMMA 0.95.2 + ++ Resolved Issue #36. + +## GEMMA 0.95.1 + ++ Created first release of GEMMA 0.95a following request in Issue #33. + +## GEMMA 0.94.1 + ++ Fixed a bug (the predict option for multiple phenotype imputation +was not recognized with PLINK files). + +## GEMMA 0.94.0 + ++ Implemented the multivariate linear mixed model. + +## GEMMA 0.93 + ++ Implemented the Bayesian sparse linear mixed model. + +## GEMMA 0.92 + ++ Fixed a few typos. + ++ Now allows for missing values in the covariates file. + ++ Included REMLE estimate for lambda in the output .log file. + ++ Added small GWAS example dataset + ++ Added detailed user manual. + +## GEMMA 0.91 + ++ Fixed a bug (BIMBAM annotation file not recognized). + +## GEMMA 0.90 + ++ Initial pre-release.
+ +See also https://github.com/genetics-statistics/GEMMA/releases @@ -0,0 +1 @@ +0.97 diff --git a/scripts/gen_version_info.sh b/scripts/gen_version_info.sh new file mode 100755 index 0000000..8a9e38d --- /dev/null +++ b/scripts/gen_version_info.sh @@ -0,0 +1,12 @@ +#! /bin/bash +# +# Script to generate the version info of GEMMA and its environment +# in ./src/version.h + +DATE=$(date "+%Y/%m/%d") +YEAR=$(date "+%Y") + +echo // version.h generated by GEMMA $0 +echo \#define GEMMA_VERSION \"$(cat ./VERSION)\" +echo \#define GEMMA_DATE \"$DATE\" +echo \#define GEMMA_YEAR \"$YEAR\" diff --git a/src/bslmmdap.cpp b/src/bslmmdap.cpp index 7aac1d4..e9900e3 100644 --- a/src/bslmmdap.cpp +++ b/src/bslmmdap.cpp @@ -116,16 +116,16 @@ void ReadFile_hyb(const string &file_hyp, vector<double> &vec_sa2, getline(infile, line); while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); vec_sa2.push_back(atof(ch_ptr)); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); vec_sb2.push_back(atof(ch_ptr)); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); vec_wab.push_back(atof(ch_ptr)); } @@ -152,7 +152,7 @@ void ReadFile_bf(const string &file_bf, vector<string> &vec_rs, vector<vector<double>> mat_bf; char *ch_ptr; - size_t bf_size, flag_block; + size_t bf_size = 0, flag_block; getline(infile, line); @@ -160,11 +160,11 @@ void ReadFile_bf(const string &file_bf, vector<string> &vec_rs, while (!safeGetline(infile, line).eof()) { flag_block = 0; - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; vec_rs.push_back(rs); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (t == 0) { block = ch_ptr; } else { @@ -223,7 +223,7 @@ 
void ReadFile_cat(const string &file_cat, const vector<string> &vec_rs, // Read header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); // Use the header to determine the number of categories. @@ -238,7 +238,7 @@ void ReadFile_cat(const string &file_cat, const vector<string> &vec_rs, // Read the following lines to record mapRS2cat. while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); if (header.rs_col == 0) { rs = chr + ":" + pos; @@ -248,6 +248,7 @@ void ReadFile_cat(const string &file_cat, const vector<string> &vec_rs, catd.clear(); for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } else if (header.chr_col != 0 && header.chr_col == i + 1) { @@ -658,13 +659,13 @@ void single_ct_regression(const gsl_matrix_int *Xd, sum_pip[i] = sum[i] = 0; } - for (int i = 0; i < Xd->size1; i++) { + for (size_t i = 0; i < Xd->size1; i++) { int cat = gsl_matrix_int_get(Xd, i, 0); sum_pip[cat] += gsl_vector_get(pip_vec, i); sum[cat] += 1; } - for (int i = 0; i < Xd->size1; i++) { + for (size_t i = 0; i < Xd->size1; i++) { int cat = gsl_matrix_int_get(Xd, i, 0); gsl_vector_set(prior_vec, i, sum_pip[cat] / sum[cat]); } @@ -683,10 +684,10 @@ void BSLMMDAP::DAP_EstimateHyper( const vector<double> &vec_sa2, const vector<double> &vec_sb2, const vector<double> &wab, const vector<vector<vector<double>>> &BF, gsl_matrix *Ac, gsl_matrix_int *Ad, gsl_vector_int *dlevel) { - clock_t time_start; + // clock_t time_start; // Set up BF. 
- double h, rho, sigma_a2, sigma_b2, d, s, logm, logm_save; + double h, rho, sigma_a2, sigma_b2, d, s, logm, logm_save = nan(""); size_t t1, t2; size_t n_grid = wab.size(), ns_test = vec_rs.size(); diff --git a/src/debug.cpp b/src/debug.cpp index 0d3c9cc..fd94f1e 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -1,3 +1,22 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ #include <cmath> #include <cstring> @@ -18,28 +37,140 @@ #include "debug.h" #include "mathfunc.h" -// Helper function called by macro validate_K(K, check) -void do_validate_K(const gsl_matrix *K, bool do_check, bool strict, const char *__file, int __line) { - if (do_check) { +static bool debug_mode = false; +static bool debug_check = true; // check data/algorithms +static bool debug_strict = false; // fail on error, more rigorous checks +static bool debug_quiet = false; +static uint debug_issue = 0; // track github issues +static bool debug_legacy = false; // legacy mode + +void debug_set_debug_mode(bool setting) { debug_mode = setting; } +void debug_set_no_check_mode(bool setting) {debug_check = !setting; } +void debug_set_strict_mode(bool setting) { debug_strict = setting; } +void debug_set_quiet_mode(bool setting) { debug_quiet = setting; } +void debug_set_issue(uint issue) { debug_issue = issue; } +void debug_set_legacy_mode(bool setting) { debug_legacy = setting; } + +bool is_debug_mode() { return debug_mode; }; +bool is_no_check_mode() { return !debug_check; }; +bool is_check_mode() { return debug_check; }; +bool is_strict_mode() { return debug_strict; }; +bool is_quiet_mode() { return debug_quiet; }; +bool is_issue(uint issue) { return issue == debug_issue; }; +bool is_legacy_mode() { return debug_legacy; }; + + +/* + Helper function to make sure gsl allocations do their job because + gsl_matrix_alloc does not initiatize values (behaviour that changed + in GSL2) we introduced a 'strict mode' by initializing the buffer + with NaNs. This happens when NO-CHECKS is not set (default) and with + DEBUG (i.e. -debug option). 
+*/ +gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols) { + gsl_matrix *m = gsl_matrix_alloc(rows,cols); + enforce_msg(m,"Not enough memory"); // just to be sure when there is no error handler set + if (is_check_mode() && is_debug_mode()) { + gsl_matrix_set_all(m, nan("")); + } + return m; +} + +int gsl_matrix_safe_memcpy (gsl_matrix *dest, const gsl_matrix *src) { + enforce(dest->size1 == src->size1); + enforce(dest->size2 == src->size2); + return gsl_matrix_memcpy(dest,src); +} + +void do_gsl_matrix_safe_free (gsl_matrix *m, const char *__pretty_function, const char *__file, int __line) { + enforce(m); + if (is_strict_mode() && is_check_mode() && is_debug_mode()) { + bool has_NaN = has_nan(m); + bool has_Inf = has_inf(m); + if (has_NaN || has_Inf) { + std::string msg = "Matrix (size "; + msg += std::to_string(m->size1); + msg += "x"; + msg += std::to_string(m->size2); + msg += ")"; + if (has_Inf) + warnfail_at_msg(is_strict_mode(),__pretty_function,__file,__line,(msg+" contains Infinite on free!").c_str()); + if (has_NaN) + warnfail_at_msg(is_strict_mode(),__pretty_function,__file,__line,(msg+" contains NaN on free!").c_str()); + } + } + return gsl_matrix_free(m); +} + +int gsl_vector_safe_memcpy (gsl_vector *dest, const gsl_vector *src) { + enforce(dest->size == src->size); + return gsl_vector_memcpy(dest,src); +} + +void do_gsl_vector_safe_free (gsl_vector *v, const char *__pretty_function, const char *__file, int __line) { + enforce(v); + if (is_strict_mode() && is_check_mode() && is_debug_mode()) { + bool has_NaN = has_nan(v); + bool has_Inf = has_inf(v); + if (has_NaN || has_Inf) { + std::string msg = "Vector (size "; + msg += std::to_string(v->size); + msg += ")"; + if (has_Inf) + warnfail_at_msg(is_strict_mode(),__pretty_function,__file,__line,(msg+" contains Infinite on free!").c_str()); + if (has_NaN) + warnfail_at_msg(is_strict_mode(),__pretty_function,__file,__line,(msg+" contains NaN on free!").c_str()); + } + } + return gsl_vector_free(v); +} 
+ +/* + Helper function to make sure gsl allocations do their job because + gsl_vector_alloc does not initiatize values (behaviour that changed + in GSL2) we introduced a 'strict mode' by initializing the buffer + with NaNs. This happens when NO-CHECKS is not set and with DEBUG + (i.e. -debug option). +*/ +gsl_vector *gsl_vector_safe_alloc(size_t n) { + gsl_vector *v = gsl_vector_alloc(n); + enforce_msg(v,"Not enough memory"); // just to be sure when there is no error handler set + if (is_check_mode() && is_debug_mode()) { + gsl_vector_set_all(v, nan("")); + } + return v; +} + +char *do_strtok_safe(char *tokenize, const char *delimiters, const char *__pretty_function, const char *__file, int __line) { + auto token = strtok(tokenize,delimiters); + if (token == NULL && (is_debug_mode() || is_strict_mode())) + fail_at_msg(__file,__line,string("strtok failed in ") + __pretty_function); + return token; +} + +// Helper function called by macro validate_K(K, check). K is validated +// unless -no-check option is used. +void do_validate_K(const gsl_matrix *K, const char *__pretty_function, const char *__file, int __line) { + if (is_check_mode()) { // debug_msg("Validating K"); auto eigenvalues = getEigenValues(K); - const uint count_small = count_small_values(eigenvalues,EIGEN_MINVALUE); + const uint count_small = count_abs_small_values(eigenvalues,EIGEN_MINVALUE); if (count_small>1) { std::string msg = "K has "; msg += std::to_string(count_small); msg += " eigenvalues close to zero"; warning_at_msg(__file,__line,msg); } - if (!isMatrixIllConditioned(eigenvalues)) + if (isMatrixIllConditioned(eigenvalues)) warning_at_msg(__file,__line,"K is ill conditioned!"); if (!isMatrixSymmetric(K)) - fail_at_msg(strict,__file,__line,"K is not symmetric!" ); + warnfail_at_msg(is_strict_mode(),__pretty_function,__file,__line,"K is not symmetric!" 
); const bool negative_values = has_negative_values_but_one(eigenvalues); if (negative_values) { warning_at_msg(__file,__line,"K has more than one negative eigenvalues!"); } if (count_small>1 && negative_values && !isMatrixPositiveDefinite(K)) - fail_at_msg(strict,__file,__line,"K is not positive definite!"); + warnfail_at_msg(is_strict_mode(),__pretty_function,__file,__line,"K is not positive definite!"); gsl_vector_free(eigenvalues); } } diff --git a/src/debug.h b/src/debug.h index 06ca5cb..208868e 100644 --- a/src/debug.h +++ b/src/debug.h @@ -1,3 +1,23 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + #ifndef __DEBUG_H__ #define __DEBUG_H__ @@ -10,43 +30,92 @@ void gemma_gsl_error_handler (const char * reason, const char * file, int line, int gsl_errno); +void debug_set_debug_mode(bool setting); +void debug_set_no_check_mode(bool setting); +void debug_set_strict_mode(bool setting); +void debug_set_quiet_mode(bool setting); +void debug_set_issue(uint issue); +void debug_set_legacy_mode(bool setting); + +bool is_debug_mode(); +bool is_no_check_mode(); +bool is_check_mode(); +bool is_strict_mode(); +bool is_quiet_mode(); +bool is_issue(uint issue); +bool is_legacy_mode(); + +#define check_int_mult_overflow(m,n) \ + { auto x = m * n; \ + enforce_msg(x / m == n, "multiply integer overflow"); } + +gsl_matrix *gsl_matrix_safe_alloc(size_t rows,size_t cols); +int gsl_matrix_safe_memcpy (gsl_matrix *dest, const gsl_matrix *src); +void gsl_matrix_safe_free (gsl_matrix *v); +void do_gsl_matrix_safe_free (gsl_matrix *m, const char *__pretty_function, const char *__file, int __line); + +gsl_vector *gsl_vector_safe_alloc(size_t n); +int gsl_vector_safe_memcpy (gsl_vector *dest, const gsl_vector *src); +void gsl_vector_safe_free (gsl_vector *v); +void do_gsl_vector_safe_free (gsl_vector *v, const char *__pretty_function, const char *__file, int __line); + +char *do_strtok_safe(char *tokenize, const char *delimiters, const char *__pretty_function, const char *__file, int __line); +#define strtok_safe(string,delimiters) do_strtok_safe(string,delimiters,__SHOW_FUNC,__FILE__,__LINE__) // Validation routines -void do_validate_K(const gsl_matrix *K, bool do_check, bool strict, const char *__file, int __line); +void do_validate_K(const gsl_matrix *K, const char*__pretty_func, const char *__file, int __line); #define ROUND(f) round(f * 10000.)/10000 -#define validate_K(K,check,strict) do_validate_K(K,check,strict,__FILE__,__LINE__) +#define validate_K(K) do_validate_K(K,__SHOW_FUNC,__FILE__,__LINE__) #define warning_at_msg(__file,__line,msg) cerr << "**** WARNING: " << msg << " 
in " << __file << " at line " << __line << endl; -inline void fail_at_msg(bool strict, const char *__file, int __line, const char *msg) { +inline void warnfail_at_msg(bool strict, const char *__function, const char *__file, int __line, const char *msg) { if (strict) std::cerr << "**** STRICT FAIL: "; else std::cerr << "**** WARNING: "; - std::cerr << msg << " in " << __file << " at line " << __line << std::endl; + std::cerr << msg << " in " << __file << " at line " << __line << " in " << __function << std::endl; if (strict) exit(1); } +inline void fail_at_msg(const char *__file, int __line, std::string msg) { + std::cerr << msg << " in " << __file << " at line " << __line << std::endl; + exit(1); +} + # ifndef __ASSERT_VOID_CAST # define __ASSERT_VOID_CAST (void) # endif +inline void fail_msg(const char *msg) { + std::cerr << "**** FAILED: " << msg << std::endl; + exit(5); +} + +inline void fail_msg(std::string msg) { + std::cerr << "**** FAILED: " << msg << std::endl; + exit(5); +} + #if defined NDEBUG + #define __SHOW_FUNC __func__ -#define warning_msg(msg) cerr << "**** WARNING: " << msg << endl; -#define debug_msg(msg) -#define assert_issue(is_issue, expr) + #define warning_msg(msg) cerr << "**** WARNING: " << msg << endl; + #define debug_msg(msg) + #define assert_issue(is_issue, expr) #else // DEBUG -#define warning_msg(msg) cerr << "**** WARNING: " << msg << " in " << __FILE__ << " at line " << __LINE__ << " in " << __FUNCTION__ << endl; -#define debug_msg(msg) cerr << "**** DEBUG: " << msg << " in " << __FILE__ << " at line " << __LINE__ << " in " << __FUNCTION__ << endl; -#define assert_issue(is_issue, expr) \ - ((is_issue) ? 
enforce_msg(expr,"FAIL: ISSUE assert") : __ASSERT_VOID_CAST(0)) + #define __SHOW_FUNC __func__ -#endif + #define warning_msg(msg) cerr << "**** WARNING: " << msg << " in " << __FILE__ << " at line " << __LINE__ << " in " << __func__ << endl; + #define debug_msg(msg) (is_debug_mode() && cerr << "**** DEBUG: " << msg << " in " << __FILE__ << " at line " << __LINE__ << " in " << __func__ << endl); + #define assert_issue(is_issue, expr) \ + ((is_issue) ? enforce_msg(expr,"FAIL: ISSUE assert") : __ASSERT_VOID_CAST(0)) + +#endif // NDEBUG // enforce works like assert but also when NDEBUG is set (i.e., it // always works). enforce_msg prints message instead of expr @@ -56,25 +125,23 @@ inline void __enforce_fail(const char *__assertion, const char *__file, unsigned int __line, const char *__function) { - std::cout << "ERROR: Enforce failed for " << __assertion << " in " << __file << " at line " << __line << " in " << __PRETTY_FUNCTION__ << std::endl; + std::cout << "ERROR: Enforce failed for " << __assertion << " in " << __file << " at line " << __line << " in " << __function << std::endl; exit(1); } -#define __ASSERT_FUNCTION __PRETTY_FUNCTION__ - #define enforce(expr) \ ((expr) \ ? __ASSERT_VOID_CAST(0) \ - : __enforce_fail(__STRING(expr), __FILE__, __LINE__, __ASSERT_FUNCTION)) + : __enforce_fail(__STRING(expr), __FILE__, __LINE__, __SHOW_FUNC)) #define enforce_msg(expr, msg) \ ((expr) ? __ASSERT_VOID_CAST(0) \ - : __enforce_fail(msg, __FILE__, __LINE__, __ASSERT_FUNCTION)) + : __enforce_fail(msg, __FILE__, __LINE__, __SHOW_FUNC)) #define enforce_str(expr, msg) \ ((expr) \ ? __ASSERT_VOID_CAST(0) \ - : __enforce_fail((msg).c_str(), __FILE__, __LINE__, __ASSERT_FUNCTION)) + : __enforce_fail((msg).c_str(), __FILE__, __LINE__, __SHOW_FUNC)) // Helpers to create a unique varname per MACRO #define COMBINE1(X, Y) X##Y @@ -85,6 +152,16 @@ inline void __enforce_fail(const char *__assertion, const char *__file, (COMBINE(res, __LINE__) == 0 \ ? 
__ASSERT_VOID_CAST(0) \ : __enforce_fail(gsl_strerror(COMBINE(res, __LINE__)), __FILE__, \ - __LINE__, __ASSERT_FUNCTION)) + __LINE__, __SHOW_FUNC)) + +#define enforce_fexists(fn, msg) \ + if (!fn.empty()) \ + enforce_msg(stat(fn.c_str(), &fileInfo) == 0, \ + ((std::string(__STRING(fn)) + " " + fn + ": " + msg).c_str())); + +#define gsl_matrix_safe_free(m) \ + do_gsl_matrix_safe_free(m,__SHOW_FUNC,__FILE__,__LINE__); +#define gsl_vector_safe_free(v) \ + do_gsl_vector_safe_free(v,__SHOW_FUNC,__FILE__,__LINE__); #endif diff --git a/src/eigenlib.cpp b/src/eigenlib.cpp index a8c545c..4d6aacc 100644 --- a/src/eigenlib.cpp +++ b/src/eigenlib.cpp @@ -17,16 +17,18 @@ */ #include "Eigen/Dense" -#include "gsl/gsl_linalg.h" +// #include "gsl/gsl_linalg.h" #include "gsl/gsl_matrix.h" -#include "gsl/gsl_vector.h" +// #include "gsl/gsl_vector.h" #include <cmath> #include <iostream> #include <vector> +#include <cblas.h> using namespace std; using namespace Eigen; + // On two different clusters, compare eigen vs lapack/gsl: // // dgemm, 5x or 0.5x faster or slower than lapack, 5x or 4x faster than gsl @@ -57,8 +59,6 @@ void eigenlib_dgemm(const char *TransA, const char *TransB, const double alpha, C_mat = alpha * A_mat.transpose() * B_mat.transpose() + beta * C_mat; } } - - return; } void eigenlib_dgemv(const char *TransA, const double alpha, const gsl_matrix *A, @@ -75,15 +75,12 @@ void eigenlib_dgemv(const char *TransA, const double alpha, const gsl_matrix *A, } else { y_vec = alpha * A_mat.transpose() * x_vec + beta * y_vec; } - - return; } void eigenlib_invert(gsl_matrix *A) { Map<Matrix<double, Dynamic, Dynamic, RowMajor>> A_mat(A->data, A->size1, A->size2); A_mat = A_mat.inverse(); - return; } void eigenlib_dsyr(const double alpha, const gsl_vector *b, gsl_matrix *A) { @@ -92,7 +89,6 @@ void eigenlib_dsyr(const double alpha, const gsl_vector *b, gsl_matrix *A) { Map<Matrix<double, Dynamic, 1>, 0, OuterStride<Dynamic>> b_vec( b->data, b->size, 
OuterStride<Dynamic>(b->stride)); A_mat = alpha * b_vec * b_vec.transpose() + A_mat; - return; } void eigenlib_eigensymm(const gsl_matrix *G, gsl_matrix *U, gsl_vector *eval) { @@ -108,5 +104,4 @@ void eigenlib_eigensymm(const gsl_matrix *G, gsl_matrix *U, gsl_vector *eval) { abort(); eval_vec = es.eigenvalues(); U_mat = es.eigenvectors(); - return; } diff --git a/src/eigenlib.h b/src/eigenlib.h index b29fa63..7fb69ad 100644 --- a/src/eigenlib.h +++ b/src/eigenlib.h @@ -19,9 +19,9 @@ #ifndef __EIGENLIB_H__ #define __EIGENLIB_H__ -#include <vector> +// #include <vector> -using namespace std; +// using namespace std; void eigenlib_dgemm(const char *TransA, const char *TransB, const double alpha, const gsl_matrix *A, const gsl_matrix *B, const double beta, diff --git a/src/fastblas.cpp b/src/fastblas.cpp new file mode 100644 index 0000000..362027c --- /dev/null +++ b/src/fastblas.cpp @@ -0,0 +1,239 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "gsl/gsl_matrix.h" +#include <algorithm> // std::min +#include <cmath> +#include <iomanip> +#include <vector> +#include <cblas.h> +#include "debug.h" +#include "fastblas.h" +#include "mathfunc.h" +#include <string.h> +#include "eigenlib.h" + +using namespace std; + +/* + Reasonably fast function to copy data from standard C array into + gsl_matrix. Avoid it for performance critical sections. +*/ +gsl_matrix *fast_copy(gsl_matrix *m, const double *mem) { + auto rows = m->size1; + auto cols = m->size2; + if (is_strict_mode()) { // slower correct version + for (size_t r=0; r<rows; r++) { + for (size_t c=0; c<cols; c++) { + gsl_matrix_set(m,r,c,mem[r*cols+c]); + } + } + } else { // faster goes by row + auto v = gsl_vector_calloc(cols); + enforce(v); // just to be sure + for (size_t r=0; r<rows; r++) { + assert(v->size == cols); + assert(v->block->size == cols); + assert(v->stride == 1); + memcpy(v->block->data,&mem[r*cols],cols*sizeof(double)); + gsl_matrix_set_row(m,r,v); + } + gsl_vector_free(v); + } + return m; +} + +/* + Helper function fast_cblas_dgemm runs the local dgemm +*/ +void fast_cblas_dgemm(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, + const size_t M, + const size_t N, + const size_t K, + const double alpha, + const double *A, + const size_t lda, + const double *B, + const size_t ldb, + const double beta, + double *C, + const size_t ldc) { +#ifndef NDEBUG + if (is_debug_mode()) { + #ifdef DISABLED + size_t i,j; + printf (" Top left corner of matrix A: \n"); + for (i=0; i<min(M,6); i++) { + for (j=0; j<min(K,6); j++) { + printf ("%12.0f", A[j+i*K]); + } + printf ("\n"); + } + + printf ("\n Top left corner of matrix B: \n"); + for (i=0; i<min(K,6); i++) { + for (j=0; j<min(N,6); j++) { + printf ("%12.0f", B[j+i*N]); + } + printf ("\n"); + } + + printf ("\n Top left corner of matrix C: \n"); + for (i=0; i<min(M,6); i++) { + for (j=0; j<min(N,6); j++) { + printf ("%12.5G", C[j+i*N]); 
+ } + printf ("\n"); + } + #endif + + cout << scientific << setprecision(3) << "* RowMajor " << Order << "\t" ; + cout << "transA " << TransA << "\t" ; + cout << "transB " << TransB << "\t" ; + cout << "m " << M << "\t" ; + cout << "n " << N << "\t" ; + cout << "k " << K << "\n" ; + cout << "* lda " << lda << "\t" ; + cout << "ldb " << ldb << "\t" ; + cout << "ldc " << ldc << "\t" ; + cout << "alpha " << alpha << "\t" ; + cout << "beta " << beta << "\n" ; + cout << "* A03 " << A[3] << "\t" ; + cout << "B03 " << B[3] << "\t" ; + cout << "C03 " << C[3] << "\t" ; + cout << "Asum " << sum(A,M,K) << "\t" ; + cout << "Bsum " << sum(B,K,N) << "\n" ; + cout << "Csum " << sum(C,M,N) << "\n" ; + } +#endif // NDEBUG + + // Check for (integer) overflows + enforce(M>0); + enforce(N>0); + enforce(K>0); + + // check_int_mult_overflow(560000,8000); // fails on default int (32-bits) + check_int_mult_overflow(M,K); + check_int_mult_overflow(N,K); + check_int_mult_overflow(M,N); + + cblas_dgemm(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc); + +#ifndef NDEBUG + #ifdef DISABLED + if (is_debug_mode()) { + printf (" Top left corner of matrix A (cols=k %i, rows=m %i): \n",K,M); + for (i=0; i<min(M,6); i++) { + for (j=0; j<min(K,6); j++) { + printf ("%12.0f", A[j+i*K]); + } + printf ("\n"); + } + + printf ("\n Top left corner of matrix B: \n"); + for (i=0; i<min(K,6); i++) { + for (j=0; j<min(N,6); j++) { + printf ("%12.0f", B[j+i*N]); + } + printf ("\n"); + } + + printf ("\n Top left corner of matrix C: \n"); + for (i=0; i<min(M,6); i++) { + for (j=0; j<min(N,6); j++) { + printf ("%12.5G", C[j+i*N]); + } + printf ("\n"); + } + } + #endif +#endif // NDEBUG +} + +/* + Helper function fast_cblas_dgemm converts a GEMMA layout to cblas_dgemm. 
+*/ +static void fast_cblas_dgemm(const char *TransA, const char *TransB, const double alpha, + const gsl_matrix *A, const gsl_matrix *B, const double beta, + gsl_matrix *C) { + // C++ is row-major + auto transA = (*TransA == 'N' || *TransA == 'n' ? CblasNoTrans : CblasTrans); + auto transB = (*TransB == 'N' || *TransB == 'n' ? CblasNoTrans : CblasTrans); + const size_t M = C->size1; + const size_t N = C->size2; + const size_t MA = (transA == CblasNoTrans) ? A->size1 : A->size2; + const size_t NA = (transA == CblasNoTrans) ? A->size2 : A->size1; + const size_t MBx = (transB == CblasNoTrans) ? B->size1 : B->size2; + const size_t NB = (transB == CblasNoTrans) ? B->size2 : B->size1; + + if (M == MA && N == NB && NA == MBx) { /* [MxN] = [MAxNA][MBxNB] */ + + auto K = NA; + + // Check for (integer) overflows + enforce(M>0); + enforce(N>0); + enforce(K>0); + + // check_int_mult_overflow(560000,8000); + check_int_mult_overflow(M,K); + check_int_mult_overflow(N,K); + check_int_mult_overflow(M,N); + + cblas_dgemm (CblasRowMajor, transA, transB, M, N, NA, + alpha, A->data, A->tda, B->data, B->tda, beta, + C->data, C->tda); + + } else { + fail_msg("Range error in dgemm"); + } +} + + +/* + Use the fast/supported way to call BLAS dgemm +*/ + +void fast_dgemm(const char *TransA, const char *TransB, const double alpha, + const gsl_matrix *A, const gsl_matrix *B, const double beta, + gsl_matrix *C) { + fast_cblas_dgemm(TransA,TransB,alpha,A,B,beta,C); + +#ifdef DISABLE + if (is_check_mode()) { + // ---- validate with original implementation + gsl_matrix *C1 = gsl_matrix_alloc(C->size1,C->size2); + eigenlib_dgemm(TransA,TransB,alpha,A,B,beta,C1); + enforce_msg(gsl_matrix_equal(C,C1),"dgemm outcomes are not equal for fast & eigenlib"); + gsl_matrix_free(C1); + } +#endif +} + +void fast_eigen_dgemm(const char *TransA, const char *TransB, const double alpha, + const gsl_matrix *A, const gsl_matrix *B, const double beta, + gsl_matrix *C) { + if (is_legacy_mode()) + 
eigenlib_dgemm(TransA,TransB,alpha,A,B,beta,C); + else + fast_cblas_dgemm(TransA,TransB,alpha,A,B,beta,C); +} diff --git a/src/fastblas.h b/src/fastblas.h new file mode 100644 index 0000000..6000983 --- /dev/null +++ b/src/fastblas.h @@ -0,0 +1,37 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __FASTBLAS_H__ +#define __FASTBLAS_H__ + +#include <assert.h> +#include <iostream> +#include "gsl/gsl_matrix.h" + +gsl_matrix *fast_copy(gsl_matrix *m, const double *mem); + +void fast_dgemm(const char *TransA, const char *TransB, const double alpha, + const gsl_matrix *A, const gsl_matrix *B, const double beta, + gsl_matrix *C); +void fast_eigen_dgemm(const char *TransA, const char *TransB, const double alpha, + const gsl_matrix *A, const gsl_matrix *B, const double beta, + gsl_matrix *C); + +#endif diff --git a/src/fastopenblas.h b/src/fastopenblas.h new file mode 100644 index 0000000..3dd8ef7 --- /dev/null +++ b/src/fastopenblas.h @@ -0,0 +1,44 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __FASTOPENBLAS_H__ +#define __FASTOPENBLAS_H__ + +#include <assert.h> +#include <iostream> +#include <cblas.h> // For OpenBlas +#include "gsl/gsl_matrix.h" + +void fast_cblas_dgemm(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, + const size_t M, + const size_t N, + const size_t K, + const double alpha, + const double *A, + const size_t lda, + const double *B, + const size_t ldb, + const double beta, + double *C, + const size_t ldc); + +#endif // __FASTOPENBLAS_H_ diff --git a/src/gemma.cpp b/src/gemma.cpp index 24173c3..edd79d7 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,6 +25,17 @@ #include <iostream> #include <string> #include <sys/stat.h> +#ifdef OPENBLAS +#pragma message "Compiling with OPENBLAS" +extern "C" { + // these functions are defined in cblas.h - but if we include that we + // conflicts with other BLAS includes + int openblas_get_num_threads(void); + int openblas_get_parallel(void); + char* openblas_get_config(void); + char* openblas_get_corename(void); +} +#endif #include "gsl/gsl_blas.h" #include "gsl/gsl_cdf.h" @@ -46,10 +59,11 @@ #include "varcov.h" #include "vc.h" #include "debug.h" +#include "version.h" using namespace std; -GEMMA::GEMMA(void) : version("0.97.3"), date("10/10/2017"), year("2017") {} +GEMMA::GEMMA(void) : version(GEMMA_VERSION), date(GEMMA_DATE), year(GEMMA_YEAR) {} void gemma_gsl_error_handler (const char * reason, const char * file, @@ -59,20 +73,14 @@ void gemma_gsl_error_handler (const char * reason, exit(22); } +#if defined(OPENBLAS) && !defined(OPENBLAS_LEGACY) +#include <openblas_config.h> 
+#endif + void GEMMA::PrintHeader(void) { - cout << endl; - cout << "*********************************************************" << endl; - cout << " Genome-wide Efficient Mixed Model Association (GEMMA) " << endl; - cout << " Version " << version << ", " << date - << " " << endl; - cout << " Visit http://www.xzlab.org/software.html For Updates " << endl; - cout << " (C) " << year << " Xiang Zhou " - << endl; - cout << " GNU General Public License " << endl; - cout << " For Help, Type ./gemma -h " << endl; - cout << "*********************************************************" << endl; - cout << endl; + cout << + "GEMMA " << version << " (" << date << ") by Xiang Zhou and team (C) 2012-" << year << endl; return; } @@ -141,22 +149,20 @@ void GEMMA::PrintLicense(void) { } void GEMMA::PrintHelp(size_t option) { + if (option == 0) { cout << endl; - cout << " GEMMA version " << version << ", released on " << date << endl; - cout << " implemented by Xiang Zhou" << endl; - cout << endl; - cout << " type ./gemma -h [num] for detailed helps" << endl; + cout << " type ./gemma -h [num] for detailed help" << endl; cout << " options: " << endl; - cout << " 1: quick guide" << endl; - cout << " 2: file I/O related" << endl; - cout << " 3: SNP QC" << endl; - cout << " 4: calculate relatedness matrix" << endl; - cout << " 5: perform eigen decomposition" << endl; - cout << " 6: perform variance component estimation" << endl; - cout << " 7: fit a linear model" << endl; - cout << " 8: fit a linear mixed model" << endl; - cout << " 9: fit a multivariate linear mixed model" << endl; + cout << " 1: quick guide" << endl; + cout << " 2: file I/O related" << endl; + cout << " 3: SNP QC" << endl; + cout << " 4: calculate relatedness matrix" << endl; + cout << " 5: perform eigen decomposition" << endl; + cout << " 6: perform variance component estimation" << endl; + cout << " 7: fit a linear model" << endl; + cout << " 8: fit a linear mixed model" << endl; + cout << " 9: fit a multivariate 
linear mixed model" << endl; cout << " 10: fit a Bayesian sparse linear mixed model" << endl; cout << " 11: obtain predicted values" << endl; cout << " 12: calculate snp variance covariance" << endl; @@ -310,11 +316,6 @@ void GEMMA::PrintHelp(size_t option) { cout << " rs#2, base_position, chr_number" << endl; cout << " ..." << endl; - // WJA added. - cout << " -oxford [prefix] " - << " specify input Oxford genotype bgen file prefix." << endl; - cout << " requires: *.bgen, *.sample files" << endl; - cout << " -gxe [filename] " << " specify input file that contains a column of environmental " "factor for g by e tests" @@ -429,8 +430,8 @@ void GEMMA::PrintHelp(size_t option) { "default 1)" << endl; cout << " -pace [num] " - << " specify terminal display update pace (default 100000 SNPs or " - "100000 iterations)." + << " specify terminal display update pace (default 1,000 SNPs or " + "1,000 iterations)." << endl; cout << " -outdir [path] " << " specify output directory path (default \"./output/\")" << endl; @@ -542,9 +543,13 @@ void GEMMA::PrintHelp(size_t option) { cout << " -lmax [num] " << " specify maximum value for lambda (default 1e+5)" << endl; cout - << " -region [num] " + << " -region [num] " << " specify the number of regions used to evaluate lambda (default 10)" << endl; + cout << " -loco [chr] " + << " leave one chromosome out (LOCO) by name (requires -a annotation " + "file)" + << endl; cout << endl; } @@ -715,9 +720,14 @@ void GEMMA::PrintHelp(size_t option) { cout << " -debug debug output" << endl; cout << " -nind [num] read up to num individuals" << endl; cout << " -issue [num] enable tests relevant to issue tracker" << endl; + cout << " -legacy run gemma in legacy mode" << endl; cout << endl; } + cout << "The GEMMA software is distributed under the GNU General Public v3" << endl; + cout << " -license show license information" << endl; + cout << + " see also http://www.xzlab.org/software.html, https://github.com/genetics-statistics" << endl; return; } 
@@ -759,8 +769,8 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { str.clear(); str.assign(argv[i]); cPar.file_mbfile = str; - } else if (strcmp(argv[i], "-silence") == 0) { - cPar.mode_silence = true; + } else if (strcmp(argv[i], "-silence") == 0 || strcmp(argv[i], "--quiet") == 0) { + debug_set_quiet_mode(true); } else if (strcmp(argv[i], "-g") == 0) { if (argv[i + 1] == NULL || argv[i + 1][0] == '-') { continue; @@ -793,18 +803,6 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { str.clear(); str.assign(argv[i]); cPar.file_anno = str; - } - - // WJA added. - else if (strcmp(argv[i], "-oxford") == 0 || - strcmp(argv[i], "--oxford") == 0 || strcmp(argv[i], "-x") == 0) { - if (argv[i + 1] == NULL || argv[i + 1][0] == '-') { - continue; - } - ++i; - str.clear(); - str.assign(argv[i]); - cPar.file_oxford = str; } else if (strcmp(argv[i], "-gxe") == 0) { if (argv[i + 1] == NULL || argv[i + 1][0] == '-') { continue; @@ -1373,8 +1371,9 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { ++i; str.clear(); str.assign(argv[i]); - cPar.issue = atoi(str.c_str()); // for testing purposes - enforce(cPar.issue > 0); + auto issue = atoi(str.c_str()); // for testing purposes + enforce(issue > 0); + debug_set_issue(issue); } else if (strcmp(argv[i], "-emp") == 0) { if (argv[i + 1] == NULL || argv[i + 1][0] == '-') { continue; @@ -1594,11 +1593,17 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) { str.assign(argv[i]); cPar.window_ns = atoi(str.c_str()); } else if (strcmp(argv[i], "-debug") == 0) { - cPar.mode_debug = true; + // cPar.mode_debug = true; + debug_set_debug_mode(true); } else if (strcmp(argv[i], "-no-check") == 0) { - cPar.mode_check = false; + // cPar.mode_check = false; + debug_set_no_check_mode(true); } else if (strcmp(argv[i], "-strict") == 0) { - cPar.mode_strict = true; + // cPar.mode_strict = true; + debug_set_strict_mode(true); + } else if (strcmp(argv[i], "-legacy") == 0) { + debug_set_legacy_mode(true); + warning_msg("you 
are running in legacy mode - support may drop in future versions of gemma"); } else { cout << "error! unrecognized option: " << argv[i] << endl; cPar.error = true; @@ -1635,7 +1640,7 @@ void GEMMA::BatchRun(PARAM &cPar) { if (cPar.a_mode == 41 || cPar.a_mode == 42) { gsl_vector *y_prdt; - y_prdt = gsl_vector_alloc(cPar.ni_total - cPar.ni_test); + y_prdt = gsl_vector_safe_alloc(cPar.ni_total - cPar.ni_test); // set to zero gsl_vector_set_zero(y_prdt); @@ -1647,8 +1652,8 @@ void GEMMA::BatchRun(PARAM &cPar) { if (!cPar.file_kin.empty() && !cPar.file_ebv.empty()) { cout << "Adding Breeding Values ... " << endl; - gsl_matrix *G = gsl_matrix_alloc(cPar.ni_total, cPar.ni_total); - gsl_vector *u_hat = gsl_vector_alloc(cPar.ni_test); + gsl_matrix *G = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total); + gsl_vector *u_hat = gsl_vector_safe_alloc(cPar.ni_test); // read kinship matrix and set u_hat vector<int> indicator_all; @@ -1671,8 +1676,8 @@ void GEMMA::BatchRun(PARAM &cPar) { // read u cPRDT.AddBV(G, u_hat, y_prdt); - gsl_matrix_free(G); - gsl_vector_free(u_hat); + gsl_matrix_safe_free(G); + gsl_vector_safe_free(u_hat); } // add beta @@ -1699,32 +1704,32 @@ void GEMMA::BatchRun(PARAM &cPar) { cPRDT.WriteFiles(y_prdt); - gsl_vector_free(y_prdt); + gsl_vector_safe_free(y_prdt); } // Prediction with kinship matrix only; for one or more phenotypes if (cPar.a_mode == 43) { // first, use individuals with full phenotypes to obtain estimates of Vg and // Ve - gsl_matrix *Y = gsl_matrix_alloc(cPar.ni_test, cPar.n_ph); - gsl_matrix *W = gsl_matrix_alloc(Y->size1, cPar.n_cvt); - gsl_matrix *G = gsl_matrix_alloc(Y->size1, Y->size1); - gsl_matrix *U = gsl_matrix_alloc(Y->size1, Y->size1); - gsl_matrix *UtW = gsl_matrix_alloc(Y->size1, W->size2); - gsl_matrix *UtY = gsl_matrix_alloc(Y->size1, Y->size2); - gsl_vector *eval = gsl_vector_alloc(Y->size1); + gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph); + gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt); + 
gsl_matrix *G = gsl_matrix_safe_alloc(Y->size1, Y->size1); + gsl_matrix *U = gsl_matrix_safe_alloc(Y->size1, Y->size1); + gsl_matrix *UtW = gsl_matrix_safe_alloc(Y->size1, W->size2); + gsl_matrix *UtY = gsl_matrix_safe_alloc(Y->size1, Y->size2); + gsl_vector *eval = gsl_vector_safe_alloc(Y->size1); - gsl_matrix *Y_full = gsl_matrix_alloc(cPar.ni_cvt, cPar.n_ph); - gsl_matrix *W_full = gsl_matrix_alloc(Y_full->size1, cPar.n_cvt); + gsl_matrix *Y_full = gsl_matrix_safe_alloc(cPar.ni_cvt, cPar.n_ph); + gsl_matrix *W_full = gsl_matrix_safe_alloc(Y_full->size1, cPar.n_cvt); // set covariates matrix W and phenotype matrix Y // an intercept should be included in W, cPar.CopyCvtPhen(W, Y, 0); cPar.CopyCvtPhen(W_full, Y_full, 1); - gsl_matrix *Y_hat = gsl_matrix_alloc(Y_full->size1, cPar.n_ph); - gsl_matrix *G_full = gsl_matrix_alloc(Y_full->size1, Y_full->size1); - gsl_matrix *H_full = gsl_matrix_alloc(Y_full->size1 * Y_hat->size2, + gsl_matrix *Y_hat = gsl_matrix_safe_alloc(Y_full->size1, cPar.n_ph); + gsl_matrix *G_full = gsl_matrix_safe_alloc(Y_full->size1, Y_full->size1); + gsl_matrix *H_full = gsl_matrix_safe_alloc(Y_full->size1 * Y_hat->size2, Y_full->size1 * Y_hat->size2); // read relatedness matrix G, and matrix G_full @@ -1745,7 +1750,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // center matrix G CenterMatrix(G); CenterMatrix(G_full); - validate_K(G,cPar.mode_check,cPar.mode_strict); + validate_K(G); // eigen-decomposition and calculate trace_G cout << "Start Eigen-Decomposition..." 
<< endl; @@ -1760,8 +1765,8 @@ void GEMMA::BatchRun(PARAM &cPar) { // calculate variance component and beta estimates // and then obtain predicted values if (cPar.n_ph == 1) { - gsl_vector *beta = gsl_vector_alloc(W->size2); - gsl_vector *se_beta = gsl_vector_alloc(W->size2); + gsl_vector *beta = gsl_vector_safe_alloc(W->size2); + gsl_vector *se_beta = gsl_vector_safe_alloc(W->size2); double lambda, logl, vg, ve; gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); @@ -1788,13 +1793,13 @@ void GEMMA::BatchRun(PARAM &cPar) { gsl_matrix_add(H_full, G_full); // free matrices - gsl_vector_free(beta); - gsl_vector_free(se_beta); + gsl_vector_safe_free(beta); + gsl_vector_safe_free(se_beta); } else { - gsl_matrix *Vg = gsl_matrix_alloc(cPar.n_ph, cPar.n_ph); - gsl_matrix *Ve = gsl_matrix_alloc(cPar.n_ph, cPar.n_ph); - gsl_matrix *B = gsl_matrix_alloc(cPar.n_ph, W->size2); - gsl_matrix *se_B = gsl_matrix_alloc(cPar.n_ph, W->size2); + gsl_matrix *Vg = gsl_matrix_safe_alloc(cPar.n_ph, cPar.n_ph); + gsl_matrix *Ve = gsl_matrix_safe_alloc(cPar.n_ph, cPar.n_ph); + gsl_matrix *B = gsl_matrix_safe_alloc(cPar.n_ph, W->size2); + gsl_matrix *se_B = gsl_matrix_safe_alloc(cPar.n_ph, W->size2); // obtain estimates CalcMvLmmVgVeBeta(eval, UtW, UtY, cPar.em_iter, cPar.nr_iter, @@ -1836,10 +1841,10 @@ void GEMMA::BatchRun(PARAM &cPar) { } // free matrices - gsl_matrix_free(Vg); - gsl_matrix_free(Ve); - gsl_matrix_free(B); - gsl_matrix_free(se_B); + gsl_matrix_safe_free(Vg); + gsl_matrix_safe_free(Ve); + gsl_matrix_safe_free(B); + gsl_matrix_safe_free(se_B); } PRDT cPRDT; @@ -1853,26 +1858,26 @@ void GEMMA::BatchRun(PARAM &cPar) { cPRDT.WriteFiles(Y_full); - gsl_matrix_free(Y); - gsl_matrix_free(W); - gsl_matrix_free(G); - gsl_matrix_free(U); - gsl_matrix_free(UtW); - gsl_matrix_free(UtY); - gsl_vector_free(eval); - - gsl_matrix_free(Y_full); - gsl_matrix_free(Y_hat); - gsl_matrix_free(W_full); - gsl_matrix_free(G_full); - gsl_matrix_free(H_full); + gsl_matrix_safe_free(Y); + 
gsl_matrix_safe_free(W); + gsl_matrix_safe_free(G); + gsl_matrix_safe_free(U); + gsl_matrix_safe_free(UtW); + gsl_matrix_safe_free(UtY); + gsl_vector_safe_free(eval); + + gsl_matrix_safe_free(Y_full); + gsl_matrix_safe_free(Y_hat); + gsl_matrix_safe_free(W_full); + gsl_matrix_safe_free(G_full); + gsl_matrix_safe_free(H_full); } // Generate Kinship matrix (optionally using LOCO) if (cPar.a_mode == 21 || cPar.a_mode == 22) { cout << "Calculating Relatedness Matrix ... " << endl; - gsl_matrix *G = gsl_matrix_alloc(cPar.ni_total, cPar.ni_total); + gsl_matrix *G = gsl_matrix_safe_alloc(cPar.ni_total, cPar.ni_total); enforce_msg(G, "allocate G"); // just to be sure time_start = clock(); @@ -1885,7 +1890,7 @@ void GEMMA::BatchRun(PARAM &cPar) { } // Now we have the Kinship matrix test it - validate_K(G,cPar.mode_check,cPar.mode_strict); + validate_K(G); if (cPar.a_mode == 21) { cPar.WriteMatrix(G, "cXX"); @@ -1893,7 +1898,7 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.WriteMatrix(G, "sXX"); } - gsl_matrix_free(G); + gsl_matrix_safe_free(G); } // Compute the LDSC weights (not implemented yet) @@ -1917,8 +1922,8 @@ void GEMMA::BatchRun(PARAM &cPar) { if (cPar.a_mode == 25 || cPar.a_mode == 26) { cout << "Calculating the S Matrix ... 
" << endl; - gsl_matrix *S = gsl_matrix_alloc(cPar.n_vc * 2, cPar.n_vc); - gsl_vector *ns = gsl_vector_alloc(cPar.n_vc + 1); + gsl_matrix *S = gsl_matrix_safe_alloc(cPar.n_vc * 2, cPar.n_vc); + gsl_vector *ns = gsl_vector_safe_alloc(cPar.n_vc + 1); gsl_matrix_set_zero(S); gsl_vector_set_zero(ns); @@ -1927,13 +1932,13 @@ void GEMMA::BatchRun(PARAM &cPar) { gsl_matrix_submatrix(S, cPar.n_vc, 0, cPar.n_vc, cPar.n_vc); gsl_vector_view ns_vec = gsl_vector_subvector(ns, 0, cPar.n_vc); - gsl_matrix *K = gsl_matrix_alloc(cPar.ni_test, cPar.n_vc * cPar.ni_test); - gsl_matrix *A = gsl_matrix_alloc(cPar.ni_test, cPar.n_vc * cPar.ni_test); + gsl_matrix *K = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_vc * cPar.ni_test); + gsl_matrix *A = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_vc * cPar.ni_test); gsl_matrix_set_zero(K); gsl_matrix_set_zero(A); - gsl_vector *y = gsl_vector_alloc(cPar.ni_test); - gsl_matrix *W = gsl_matrix_alloc(cPar.ni_test, cPar.n_cvt); + gsl_vector *y = gsl_vector_safe_alloc(cPar.ni_test); + gsl_matrix *W = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_cvt); cPar.CopyCvtPhen(W, y, 0); @@ -1957,22 +1962,22 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.WriteVector(ns, "size"); cPar.WriteVar("snps"); - gsl_matrix_free(S); - gsl_vector_free(ns); + gsl_matrix_safe_free(S); + gsl_vector_safe_free(ns); - gsl_matrix_free(A); - gsl_matrix_free(K); + gsl_matrix_safe_free(A); + gsl_matrix_safe_free(K); - gsl_vector_free(y); - gsl_matrix_free(K); + gsl_vector_safe_free(y); + gsl_matrix_safe_free(K); } // Compute the q vector, that is used for variance component estimation using // summary statistics if (cPar.a_mode == 27 || cPar.a_mode == 28) { - gsl_matrix *Vq = gsl_matrix_alloc(cPar.n_vc, cPar.n_vc); - gsl_vector *q = gsl_vector_alloc(cPar.n_vc); - gsl_vector *s = gsl_vector_alloc(cPar.n_vc + 1); + gsl_matrix *Vq = gsl_matrix_safe_alloc(cPar.n_vc, cPar.n_vc); + gsl_vector *q = gsl_vector_safe_alloc(cPar.n_vc); + gsl_vector *s = gsl_vector_safe_alloc(cPar.n_vc + 1); 
gsl_vector_set_zero(q); gsl_vector_set_zero(s); @@ -1988,8 +1993,8 @@ void GEMMA::BatchRun(PARAM &cPar) { vec_weight, vec_z2, cPar.ni_total, cPar.ns_total, cPar.ns_test); cout << "## number of total individuals = " << cPar.ni_total << endl; - cout << "## number of total SNPs = " << cPar.ns_total << endl; - cout << "## number of analyzed SNPs = " << cPar.ns_test << endl; + cout << "## number of total SNPs/var = " << cPar.ns_total << endl; + cout << "## number of analyzed SNPs/var = " << cPar.ns_test << endl; cout << "## number of variance components = " << cPar.n_vc << endl; cout << "Calculating the q vector ... " << endl; Calcq(cPar.n_block, vec_cat, vec_ni, vec_weight, vec_z2, Vq, q, @@ -2006,9 +2011,9 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.WriteMatrix(Vq, "Vq"); cPar.WriteVector(q, "q"); cPar.WriteVector(s, "size"); - gsl_matrix_free(Vq); - gsl_vector_free(q); - gsl_vector_free(s); + gsl_matrix_safe_free(Vq); + gsl_vector_safe_free(q); + gsl_vector_safe_free(s); } // Calculate SNP covariance. @@ -2028,8 +2033,8 @@ void GEMMA::BatchRun(PARAM &cPar) { // LM. 
if (cPar.a_mode == 51 || cPar.a_mode == 52 || cPar.a_mode == 53 || cPar.a_mode == 54) { // Fit LM - gsl_matrix *Y = gsl_matrix_alloc(cPar.ni_test, cPar.n_ph); - gsl_matrix *W = gsl_matrix_alloc(Y->size1, cPar.n_cvt); + gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph); + gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt); // set covariates matrix W and phenotype matrix Y // an intercept should be included in W, @@ -2047,8 +2052,6 @@ void GEMMA::BatchRun(PARAM &cPar) { &Y_col.vector); // y is the predictor, not the phenotype } else if (!cPar.file_bfile.empty()) { cLm.AnalyzePlink(W, &Y_col.vector); - } else if (!cPar.file_oxford.empty()) { - cLm.Analyzebgen(W, &Y_col.vector); } else { cLm.AnalyzeBimbam(W, &Y_col.vector); } @@ -2057,8 +2060,8 @@ void GEMMA::BatchRun(PARAM &cPar) { cLm.CopyToParam(cPar); } // release all matrices and vectors - gsl_matrix_free(Y); - gsl_matrix_free(W); + gsl_matrix_safe_free(Y); + gsl_matrix_safe_free(W); } // VC estimation with one or multiple kinship matrices @@ -2083,16 +2086,16 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.UpdateSNP(mapRS2wK); // Setup matrices and vectors. 
- gsl_matrix *S = gsl_matrix_alloc(cPar.n_vc * 2, cPar.n_vc); - gsl_matrix *Vq = gsl_matrix_alloc(cPar.n_vc, cPar.n_vc); - gsl_vector *q = gsl_vector_alloc(cPar.n_vc); - gsl_vector *s = gsl_vector_alloc(cPar.n_vc + 1); + gsl_matrix *S = gsl_matrix_safe_alloc(cPar.n_vc * 2, cPar.n_vc); + gsl_matrix *Vq = gsl_matrix_safe_alloc(cPar.n_vc, cPar.n_vc); + gsl_vector *q = gsl_vector_safe_alloc(cPar.n_vc); + gsl_vector *s = gsl_vector_safe_alloc(cPar.n_vc + 1); - gsl_matrix *K = gsl_matrix_alloc(cPar.ni_test, cPar.n_vc * cPar.ni_test); - gsl_matrix *A = gsl_matrix_alloc(cPar.ni_test, cPar.n_vc * cPar.ni_test); + gsl_matrix *K = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_vc * cPar.ni_test); + gsl_matrix *A = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_vc * cPar.ni_test); - gsl_vector *y = gsl_vector_alloc(cPar.ni_test); - gsl_matrix *W = gsl_matrix_alloc(cPar.ni_test, cPar.n_cvt); + gsl_vector *y = gsl_vector_safe_alloc(cPar.ni_test); + gsl_matrix *W = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_cvt); gsl_matrix_set_zero(K); gsl_matrix_set_zero(A); @@ -2120,8 +2123,8 @@ void GEMMA::BatchRun(PARAM &cPar) { cout << "Study Panel: " << endl; cout << "## number of total individuals = " << cPar.ni_study << endl; - cout << "## number of total SNPs = " << cPar.ns_study << endl; - cout << "## number of analyzed SNPs = " << cPar.ns_test << endl; + cout << "## number of total SNPs/var = " << cPar.ns_study << endl; + cout << "## number of analyzed SNPs/var = " << cPar.ns_test << endl; cout << "## number of variance components = " << cPar.n_vc << endl; // compute q @@ -2186,15 +2189,15 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.WriteVector(q, "q"); cPar.WriteVector(s, "size"); - gsl_matrix_free(S); - gsl_matrix_free(Vq); - gsl_vector_free(q); - gsl_vector_free(s); + gsl_matrix_safe_free(S); + gsl_matrix_safe_free(Vq); + gsl_vector_safe_free(q); + gsl_vector_safe_free(s); - gsl_matrix_free(A); - gsl_matrix_free(K); - gsl_vector_free(y); - gsl_matrix_free(W); + gsl_matrix_safe_free(A); + 
gsl_matrix_safe_free(K); + gsl_vector_safe_free(y); + gsl_matrix_safe_free(W); } else if (!cPar.file_study.empty() || !cPar.file_mstudy.empty()) { if (!cPar.file_study.empty()) { string sfile = cPar.file_study + ".size.txt"; @@ -2219,16 +2222,16 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.n_vc = cPar.n_vc - 1; - gsl_matrix *S = gsl_matrix_alloc(2 * cPar.n_vc, cPar.n_vc); - gsl_matrix *Vq = gsl_matrix_alloc(cPar.n_vc, cPar.n_vc); - // gsl_matrix *V=gsl_matrix_alloc (cPar.n_vc+1, + gsl_matrix *S = gsl_matrix_safe_alloc(2 * cPar.n_vc, cPar.n_vc); + gsl_matrix *Vq = gsl_matrix_safe_alloc(cPar.n_vc, cPar.n_vc); + // gsl_matrix *V=gsl_matrix_safe_alloc (cPar.n_vc+1, // (cPar.n_vc*(cPar.n_vc+1))/2*(cPar.n_vc+1) ); - // gsl_matrix *Vslope=gsl_matrix_alloc (n_lines+1, + // gsl_matrix *Vslope=gsl_matrix_safe_alloc (n_lines+1, // (n_lines*(n_lines+1))/2*(n_lines+1) ); - gsl_vector *q = gsl_vector_alloc(cPar.n_vc); - gsl_vector *s_study = gsl_vector_alloc(cPar.n_vc); - gsl_vector *s_ref = gsl_vector_alloc(cPar.n_vc); - gsl_vector *s = gsl_vector_alloc(cPar.n_vc + 1); + gsl_vector *q = gsl_vector_safe_alloc(cPar.n_vc); + gsl_vector *s_study = gsl_vector_safe_alloc(cPar.n_vc); + gsl_vector *s_ref = gsl_vector_safe_alloc(cPar.n_vc); + gsl_vector *s = gsl_vector_safe_alloc(cPar.n_vc + 1); gsl_matrix_set_zero(S); gsl_matrix_view S_mat = @@ -2270,7 +2273,7 @@ void GEMMA::BatchRun(PARAM &cPar) { assert(!has_nan(cPar.v_se_pve)); gsl_vector_view s_sub = gsl_vector_subvector(s, 0, cPar.n_vc); - gsl_vector_memcpy(&s_sub.vector, s_ref); + gsl_vector_safe_memcpy(&s_sub.vector, s_ref); gsl_vector_set(s, cPar.n_vc, cPar.ni_ref); cPar.WriteMatrix(S, "S"); @@ -2278,18 +2281,18 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.WriteVector(q, "q"); cPar.WriteVector(s, "size"); - gsl_matrix_free(S); - gsl_matrix_free(Vq); - // gsl_matrix_free (V); - // gsl_matrix_free (Vslope); - gsl_vector_free(q); - gsl_vector_free(s_study); - gsl_vector_free(s_ref); - gsl_vector_free(s); + gsl_matrix_safe_free(S); + 
gsl_matrix_safe_free(Vq); + // gsl_matrix_safe_free (V); + // gsl_matrix_safe_free (Vslope); + gsl_vector_safe_free(q); + gsl_vector_safe_free(s_study); + gsl_vector_safe_free(s_ref); + gsl_vector_safe_free(s); } else { - gsl_matrix *Y = gsl_matrix_alloc(cPar.ni_test, cPar.n_ph); - gsl_matrix *W = gsl_matrix_alloc(Y->size1, cPar.n_cvt); - gsl_matrix *G = gsl_matrix_alloc(Y->size1, Y->size1 * cPar.n_vc); + gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph); + gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt); + gsl_matrix *G = gsl_matrix_safe_alloc(Y->size1, Y->size1 * cPar.n_vc); // set covariates matrix W and phenotype matrix Y // an intercept should be included in W, @@ -2328,7 +2331,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // center matrix G CenterMatrix(G); - validate_K(G,cPar.mode_check,cPar.mode_strict); + validate_K(G); (cPar.v_traceG).clear(); double d = 0; @@ -2366,9 +2369,9 @@ void GEMMA::BatchRun(PARAM &cPar) { // the genotypes if (cPar.a_mode == 66 || cPar.a_mode == 67) { // read reference file first - gsl_matrix *S = gsl_matrix_alloc(cPar.n_vc, cPar.n_vc); - gsl_matrix *Svar = gsl_matrix_alloc(cPar.n_vc, cPar.n_vc); - gsl_vector *s_ref = gsl_vector_alloc(cPar.n_vc); + gsl_matrix *S = gsl_matrix_safe_alloc(cPar.n_vc, cPar.n_vc); + gsl_matrix *Svar = gsl_matrix_safe_alloc(cPar.n_vc, cPar.n_vc); + gsl_vector *s_ref = gsl_vector_safe_alloc(cPar.n_vc); gsl_matrix_set_zero(S); gsl_matrix_set_zero(Svar); @@ -2393,14 +2396,14 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.ObtainWeight(setSnps_beta, mapRS2wK); // set up matrices and vector - gsl_matrix *Xz = gsl_matrix_alloc(cPar.ni_test, cPar.n_vc); - gsl_matrix *XWz = gsl_matrix_alloc(cPar.ni_test, cPar.n_vc); + gsl_matrix *Xz = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_vc); + gsl_matrix *XWz = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_vc); gsl_matrix *XtXWz = - gsl_matrix_alloc(mapRS2wK.size(), cPar.n_vc * cPar.n_vc); - gsl_vector *w = gsl_vector_alloc(mapRS2wK.size()); - gsl_vector *w1 
= gsl_vector_alloc(mapRS2wK.size()); - gsl_vector *z = gsl_vector_alloc(mapRS2wK.size()); - gsl_vector *s_vec = gsl_vector_alloc(cPar.n_vc); + gsl_matrix_safe_alloc(mapRS2wK.size(), cPar.n_vc * cPar.n_vc); + gsl_vector *w = gsl_vector_safe_alloc(mapRS2wK.size()); + gsl_vector *w1 = gsl_vector_safe_alloc(mapRS2wK.size()); + gsl_vector *z = gsl_vector_safe_alloc(mapRS2wK.size()); + gsl_vector *s_vec = gsl_vector_safe_alloc(cPar.n_vc); vector<size_t> vec_cat, vec_size; vector<double> vec_z; @@ -2462,7 +2465,7 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.mindicator_snp, vec_cat, w1, z, Xz); } if (cPar.a_mode == 66) { - gsl_matrix_memcpy(XWz, Xz); + gsl_matrix_safe_memcpy(XWz, Xz); } else if (cPar.a_mode == 67) { cout << "Calculating XWz ... " << endl; @@ -2507,37 +2510,37 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.v_se_sigma2, cPar.v_enrich, cPar.v_se_enrich); assert(!has_nan(cPar.v_se_pve)); - gsl_matrix_free(S); - gsl_matrix_free(Svar); - gsl_vector_free(s_ref); - - gsl_matrix_free(Xz); - gsl_matrix_free(XWz); - gsl_matrix_free(XtXWz); - gsl_vector_free(w); - gsl_vector_free(w1); - gsl_vector_free(z); - gsl_vector_free(s_vec); + gsl_matrix_safe_free(S); + gsl_matrix_safe_free(Svar); + gsl_vector_safe_free(s_ref); + + gsl_matrix_safe_free(Xz); + gsl_matrix_safe_free(XWz); + gsl_matrix_safe_free(XtXWz); + gsl_vector_safe_free(w); + gsl_vector_safe_free(w1); + gsl_vector_safe_free(z); + gsl_vector_safe_free(s_vec); } // LMM or mvLMM or Eigen-Decomposition if (cPar.a_mode == 1 || cPar.a_mode == 2 || cPar.a_mode == 3 || cPar.a_mode == 4 || cPar.a_mode == 5 || cPar.a_mode == 31) { // Fit LMM or mvLMM or eigen - gsl_matrix *Y = gsl_matrix_alloc(cPar.ni_test, cPar.n_ph); + gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph); enforce_msg(Y, "allocate Y"); // just to be sure - gsl_matrix *W = gsl_matrix_alloc(Y->size1, cPar.n_cvt); - gsl_matrix *B = gsl_matrix_alloc(Y->size2, W->size2); // B is a d by c + gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt); 
+ gsl_matrix *B = gsl_matrix_safe_alloc(Y->size2, W->size2); // B is a d by c // matrix - gsl_matrix *se_B = gsl_matrix_alloc(Y->size2, W->size2); - gsl_matrix *G = gsl_matrix_alloc(Y->size1, Y->size1); - gsl_matrix *U = gsl_matrix_alloc(Y->size1, Y->size1); + gsl_matrix *se_B = gsl_matrix_safe_alloc(Y->size2, W->size2); + gsl_matrix *G = gsl_matrix_safe_alloc(Y->size1, Y->size1); + gsl_matrix *U = gsl_matrix_safe_alloc(Y->size1, Y->size1); gsl_matrix *UtW = gsl_matrix_calloc(Y->size1, W->size2); gsl_matrix *UtY = gsl_matrix_calloc(Y->size1, Y->size2); gsl_vector *eval = gsl_vector_calloc(Y->size1); - gsl_vector *env = gsl_vector_alloc(Y->size1); - gsl_vector *weight = gsl_vector_alloc(Y->size1); - assert_issue(cPar.issue == 26, UtY->data[0] == 0.0); + gsl_vector *env = gsl_vector_safe_alloc(Y->size1); + gsl_vector *weight = gsl_vector_safe_alloc(Y->size1); + assert_issue(is_issue(26), UtY->data[0] == 0.0); // set covariates matrix W and phenotype matrix Y // an intercept should be included in W, @@ -2557,7 +2560,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // center matrix G CenterMatrix(G); - validate_K(G,cPar.mode_check,cPar.mode_strict); + validate_K(G); // is residual weights are provided, then if (!cPar.file_weight.empty()) { @@ -2638,7 +2641,7 @@ void GEMMA::BatchRun(PARAM &cPar) { CalcUtX(U, W, UtW); CalcUtX(U, Y, UtY); - assert_issue(cPar.issue == 26, ROUND(UtY->data[0]) == -16.6143); + assert_issue(is_issue(26), ROUND(UtY->data[0]) == -16.6143); LMM cLmm; cLmm.CopyFromParam(cPar); @@ -2655,7 +2658,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // calculate UtW and Uty CalcUtX(U, W, UtW); CalcUtX(U, Y, UtY); - assert_issue(cPar.issue == 26, ROUND(UtY->data[0]) == -16.6143); + assert_issue(is_issue(26), ROUND(UtY->data[0]) == -16.6143); // calculate REMLE/MLE estimate and pve for univariate model if (cPar.n_ph == 1) { // one phenotype @@ -2663,31 +2666,27 @@ void GEMMA::BatchRun(PARAM &cPar) { gsl_vector_view se_beta = gsl_matrix_row(se_B, 0); gsl_vector_view UtY_col 
= gsl_matrix_column(UtY, 0); - assert_issue(cPar.issue == 26, ROUND(UtY->data[0]) == -16.6143); + assert_issue(is_issue(26), ROUND(UtY->data[0]) == -16.6143); CalcLambda('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max, cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0); assert(!std::isnan(UtY->data[0])); - assert(!std::isnan(B->data[0])); - assert(!std::isnan(se_B->data[0])); CalcLmmVgVeBeta(eval, UtW, &UtY_col.vector, cPar.l_mle_null, cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector, &se_beta.vector); assert(!std::isnan(UtY->data[0])); - assert(!std::isnan(B->data[0])); - assert(!std::isnan(se_B->data[0])); cPar.beta_mle_null.clear(); cPar.se_beta_mle_null.clear(); + assert(!std::isnan(B->data[0])); + assert(!std::isnan(se_B->data[0])); for (size_t i = 0; i < B->size2; i++) { cPar.beta_mle_null.push_back(gsl_matrix_get(B, 0, i)); cPar.se_beta_mle_null.push_back(gsl_matrix_get(se_B, 0, i)); } assert(!std::isnan(UtY->data[0])); - assert(!std::isnan(B->data[0])); - assert(!std::isnan(se_B->data[0])); assert(!std::isnan(cPar.beta_mle_null.front())); assert(!std::isnan(cPar.se_beta_mle_null.front())); @@ -2699,6 +2698,9 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.beta_remle_null.clear(); cPar.se_beta_remle_null.clear(); + assert(!std::isnan(B->data[0])); + assert(!std::isnan(se_B->data[0])); + for (size_t i = 0; i < B->size2; i++) { cPar.beta_remle_null.push_back(gsl_matrix_get(B, 0, i)); cPar.se_beta_remle_null.push_back(gsl_matrix_get(se_B, 0, i)); @@ -2710,14 +2712,14 @@ void GEMMA::BatchRun(PARAM &cPar) { // calculate and output residuals if (cPar.a_mode == 5) { - gsl_vector *Utu_hat = gsl_vector_alloc(Y->size1); - gsl_vector *Ute_hat = gsl_vector_alloc(Y->size1); - gsl_vector *u_hat = gsl_vector_alloc(Y->size1); - gsl_vector *e_hat = gsl_vector_alloc(Y->size1); - gsl_vector *y_hat = gsl_vector_alloc(Y->size1); + gsl_vector *Utu_hat = gsl_vector_safe_alloc(Y->size1); + gsl_vector *Ute_hat = gsl_vector_safe_alloc(Y->size1); + gsl_vector *u_hat = 
gsl_vector_safe_alloc(Y->size1); + gsl_vector *e_hat = gsl_vector_safe_alloc(Y->size1); + gsl_vector *y_hat = gsl_vector_safe_alloc(Y->size1); // obtain Utu and Ute - gsl_vector_memcpy(y_hat, &UtY_col.vector); + gsl_vector_safe_memcpy(y_hat, &UtY_col.vector); gsl_blas_dgemv(CblasNoTrans, -1.0, UtW, &beta.vector, 1.0, y_hat); double d, u, e; @@ -2738,9 +2740,9 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.WriteVector(u_hat, "residU"); cPar.WriteVector(e_hat, "residE"); - gsl_vector_free(u_hat); - gsl_vector_free(e_hat); - gsl_vector_free(y_hat); + gsl_vector_safe_free(u_hat); + gsl_vector_safe_free(e_hat); + gsl_vector_safe_free(y_hat); } } @@ -2755,18 +2757,18 @@ void GEMMA::BatchRun(PARAM &cPar) { gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); if (!cPar.file_bfile.empty()) { + // PLINK analysis if (cPar.file_gxe.empty()) { cLmm.AnalyzePlink(U, eval, UtW, &UtY_col.vector, W, - &Y_col.vector); - } else { + &Y_col.vector, cPar.setGWASnps); + } + else { cLmm.AnalyzePlinkGXE(U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, env); } } - // WJA added - else if (!cPar.file_oxford.empty()) { - cLmm.Analyzebgen(U, eval, UtW, &UtY_col.vector, W, &Y_col.vector); - } else { + else { + // BIMBAM analysis if (cPar.file_gxe.empty()) { cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W, &Y_col.vector, cPar.setGWASnps); @@ -2788,8 +2790,6 @@ void GEMMA::BatchRun(PARAM &cPar) { } else { cMvlmm.AnalyzePlinkGXE(U, eval, UtW, UtY, env); } - } else if (!cPar.file_oxford.empty()) { - cMvlmm.Analyzebgen(U, eval, UtW, UtY); } else { if (cPar.file_gxe.empty()) { cMvlmm.AnalyzeBimbam(U, eval, UtW, UtY); @@ -2805,24 +2805,24 @@ void GEMMA::BatchRun(PARAM &cPar) { } // release all matrices and vectors - gsl_matrix_free(Y); - gsl_matrix_free(W); - gsl_matrix_free(B); - gsl_matrix_free(se_B); - gsl_matrix_free(G); - gsl_matrix_free(U); - gsl_matrix_free(UtW); - gsl_matrix_free(UtY); - gsl_vector_free(eval); - gsl_vector_free(env); + gsl_matrix_safe_free(Y); + gsl_matrix_safe_free(W); + 
gsl_matrix_safe_free(B); + gsl_matrix_safe_free(se_B); + gsl_matrix_safe_free(G); + gsl_matrix_safe_free(U); + gsl_matrix_safe_free(UtW); + gsl_matrix_safe_free(UtY); + gsl_vector_safe_free(eval); + gsl_vector_safe_free(env); } // BSLMM if (cPar.a_mode == 11 || cPar.a_mode == 12 || cPar.a_mode == 13) { - gsl_vector *y = gsl_vector_alloc(cPar.ni_test); - gsl_matrix *W = gsl_matrix_alloc(y->size, cPar.n_cvt); - gsl_matrix *G = gsl_matrix_alloc(y->size, y->size); - gsl_matrix *UtX = gsl_matrix_alloc(y->size, cPar.ns_test); + gsl_vector *y = gsl_vector_safe_alloc(cPar.ni_test); + gsl_matrix *W = gsl_matrix_safe_alloc(y->size, cPar.n_cvt); + gsl_matrix *G = gsl_matrix_safe_alloc(y->size, y->size); + gsl_matrix *UtX = gsl_matrix_safe_alloc(y->size, cPar.ns_test); // set covariates matrix W and phenotype vector y // an intercept should be included in W, @@ -2845,10 +2845,10 @@ void GEMMA::BatchRun(PARAM &cPar) { cBslmm.CopyToParam(cPar); // else, if rho!=1 } else { - gsl_matrix *U = gsl_matrix_alloc(y->size, y->size); - gsl_vector *eval = gsl_vector_alloc(y->size); - gsl_matrix *UtW = gsl_matrix_alloc(y->size, W->size2); - gsl_vector *Uty = gsl_vector_alloc(y->size); + gsl_matrix *U = gsl_matrix_safe_alloc(y->size, y->size); + gsl_vector *eval = gsl_vector_safe_alloc(y->size); + gsl_matrix *UtW = gsl_matrix_safe_alloc(y->size, W->size2); + gsl_vector *Uty = gsl_vector_safe_alloc(y->size); // read relatedness matrix G if (!(cPar.file_kin).empty()) { @@ -2864,7 +2864,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // center matrix G CenterMatrix(G); - validate_K(G,cPar.mode_check,cPar.mode_strict); + validate_K(G); } else { cPar.ReadGenotypes(UtX, G, true); } @@ -2915,24 +2915,24 @@ void GEMMA::BatchRun(PARAM &cPar) { } // release all matrices and vectors - gsl_matrix_free(G); - gsl_matrix_free(U); - gsl_matrix_free(UtW); - gsl_vector_free(eval); - gsl_vector_free(Uty); + gsl_matrix_safe_free(G); + gsl_matrix_safe_free(U); + gsl_matrix_safe_free(UtW); + gsl_vector_safe_free(eval); 
+ gsl_vector_safe_free(Uty); } - gsl_matrix_free(W); - gsl_vector_free(y); - gsl_matrix_free(UtX); + gsl_matrix_safe_free(W); + gsl_vector_safe_free(y); + gsl_matrix_safe_free(UtX); } // BSLMM-DAP if (cPar.a_mode == 14 || cPar.a_mode == 15 || cPar.a_mode == 16) { if (cPar.a_mode == 14) { - gsl_vector *y = gsl_vector_alloc(cPar.ni_test); - gsl_matrix *W = gsl_matrix_alloc(y->size, cPar.n_cvt); - gsl_matrix *G = gsl_matrix_alloc(y->size, y->size); - gsl_matrix *UtX = gsl_matrix_alloc(y->size, cPar.ns_test); + gsl_vector *y = gsl_vector_safe_alloc(cPar.ni_test); + gsl_matrix *W = gsl_matrix_safe_alloc(y->size, cPar.n_cvt); + gsl_matrix *G = gsl_matrix_safe_alloc(y->size, y->size); + gsl_matrix *UtX = gsl_matrix_safe_alloc(y->size, cPar.ns_test); // set covariates matrix W and phenotype vector y // an intercept should be included in W, @@ -2956,10 +2956,10 @@ void GEMMA::BatchRun(PARAM &cPar) { cBslmm.CopyToParam(cPar); // else, if rho!=1 } else { - gsl_matrix *U = gsl_matrix_alloc(y->size, y->size); - gsl_vector *eval = gsl_vector_alloc(y->size); - gsl_matrix *UtW = gsl_matrix_alloc(y->size, W->size2); - gsl_vector *Uty = gsl_vector_alloc(y->size); + gsl_matrix *U = gsl_matrix_safe_alloc(y->size, y->size); + gsl_vector *eval = gsl_vector_safe_alloc(y->size); + gsl_matrix *UtW = gsl_matrix_safe_alloc(y->size, W->size2); + gsl_vector *Uty = gsl_vector_safe_alloc(y->size); // read relatedness matrix G if (!(cPar.file_kin).empty()) { @@ -2975,7 +2975,7 @@ void GEMMA::BatchRun(PARAM &cPar) { // center matrix G CenterMatrix(G); - validate_K(G,cPar.mode_check,cPar.mode_strict); + validate_K(G); } else { cPar.ReadGenotypes(UtX, G, true); @@ -3019,16 +3019,16 @@ void GEMMA::BatchRun(PARAM &cPar) { cBslmmDap.CopyToParam(cPar); // release all matrices and vectors - gsl_matrix_free(G); - gsl_matrix_free(U); - gsl_matrix_free(UtW); - gsl_vector_free(eval); - gsl_vector_free(Uty); + gsl_matrix_safe_free(G); + gsl_matrix_safe_free(U); + gsl_matrix_safe_free(UtW); + 
gsl_vector_safe_free(eval); + gsl_vector_safe_free(Uty); } - gsl_matrix_free(W); - gsl_vector_free(y); - gsl_matrix_free(UtX); + gsl_matrix_safe_free(W); + gsl_vector_safe_free(y); + gsl_matrix_safe_free(UtX); } else if (cPar.a_mode == 15) { // perform EM algorithm and estimate parameters vector<string> vec_rs; @@ -3045,9 +3045,9 @@ void GEMMA::BatchRun(PARAM &cPar) { } // load annotations - gsl_matrix *Ac; - gsl_matrix_int *Ad; - gsl_vector_int *dlevel; + gsl_matrix *Ac = NULL; + gsl_matrix_int *Ad = NULL; + gsl_vector_int *dlevel = NULL; size_t kc, kd; if (!cPar.file_cat.empty()) { ReadFile_cat(cPar.file_cat, vec_rs, Ac, Ad, dlevel, kc, kd); @@ -3057,7 +3057,7 @@ void GEMMA::BatchRun(PARAM &cPar) { } cout << "## number of blocks = " << BF.size() << endl; - cout << "## number of analyzed SNPs = " << vec_rs.size() << endl; + cout << "## number of analyzed SNPs/var = " << vec_rs.size() << endl; cout << "## grid size for hyperparameters = " << wab.size() << endl; cout << "## number of continuous annotations = " << kc << endl; cout << "## number of discrete annotations = " << kd << endl; @@ -3077,7 +3077,7 @@ void GEMMA::BatchRun(PARAM &cPar) { cPar.time_opt = (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); cBslmmDap.CopyToParam(cPar); - gsl_matrix_free(Ac); + gsl_matrix_safe_free(Ac); gsl_matrix_int_free(Ad); gsl_vector_int_free(dlevel); } else { @@ -3090,6 +3090,8 @@ void GEMMA::BatchRun(PARAM &cPar) { return; } +#include "Eigen/Dense" + void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { string file_str; file_str = cPar.path_out + "/" + cPar.file_out; @@ -3102,9 +3104,21 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { } outfile << "##" << endl; - outfile << "## GEMMA Version = " << version << endl; - outfile << "## GSL Version = " << GSL_VERSION << endl; - outfile << "## Eigen Version = " << EIGEN_WORLD_VERSION << "." << EIGEN_MAJOR_VERSION << "." 
<< EIGEN_MINOR_VERSION << endl; + outfile << "## GEMMA Version = " << version << " (" << date << ")" << endl; + outfile << "## GSL Version = " << GSL_VERSION << endl; + outfile << "## Eigen Version = " << EIGEN_WORLD_VERSION << "." << EIGEN_MAJOR_VERSION << "." << EIGEN_MINOR_VERSION << endl; +#ifdef OPENBLAS + + #ifndef OPENBLAS_LEGACY + outfile << "## OpenBlas =" << OPENBLAS_VERSION << " - " << openblas_get_config() << endl; + outfile << "## arch = " << openblas_get_corename() << endl; + outfile << "## threads = " << openblas_get_num_threads() << endl; + #else + outfile << "## OpenBlas = " << openblas_get_config() << endl; + #endif + string* pStr = new string[4] { "sequential", "threaded", "openmp" }; + outfile << "## parallel type = " << pStr[openblas_get_parallel()] << endl; +#endif outfile << "##" << endl; outfile << "## Command Line Input = "; @@ -3119,7 +3133,6 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { tm *ptm = localtime(&rawtime); outfile << "## Date = " << asctime(ptm); - // ptm->tm_year<<":"<<ptm->tm_month<<":"<<ptm->tm_day":"<<ptm->tm_hour<<":"<<ptm->tm_min<<endl; outfile << "##" << endl; outfile << "## Summary Statistics:" << endl; @@ -3129,11 +3142,6 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { << cPar.ni_study << endl; outfile << "## number of total individuals in the reference = " << cPar.ni_ref << endl; - // outfile<<"## number of total SNPs in the sample = "<<cPar.ns_study<<endl; - // outfile<<"## number of total SNPs in the reference panel = - // "<<cPar.ns_ref<<endl; - // outfile<<"## number of analyzed SNPs = "<<cPar.ns_test<<endl; - // outfile<<"## number of analyzed SNP pairs = "<<cPar.ns_pair<<endl; outfile << "## number of variance components = " << cPar.n_vc << endl; outfile << "## pve estimates = "; @@ -3183,11 +3191,11 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { << cPar.ni_study << endl; outfile << "## number of total individuals in the reference = " << cPar.ni_total << endl; - 
outfile << "## number of total SNPs in the sample = " << cPar.ns_study + outfile << "## number of total SNPs/var in the sample = " << cPar.ns_study << endl; - outfile << "## number of total SNPs in the reference panel = " + outfile << "## number of total SNPs/var in the reference panel = " << cPar.ns_total << endl; - outfile << "## number of analyzed SNPs = " << cPar.ns_test << endl; + outfile << "## number of analyzed SNPs/var = " << cPar.ns_test << endl; outfile << "## number of variance components = " << cPar.n_vc << endl; } else if (!cPar.file_beta.empty() && (cPar.a_mode == 66 || cPar.a_mode == 67)) { @@ -3195,9 +3203,9 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { << cPar.ni_total << endl; outfile << "## number of total individuals in the reference = " << cPar.ni_ref << endl; - outfile << "## number of total SNPs in the sample = " << cPar.ns_total + outfile << "## number of total SNPs/var in the sample = " << cPar.ns_total << endl; - outfile << "## number of analyzed SNPs = " << cPar.ns_test << endl; + outfile << "## number of analyzed SNPs/var = " << cPar.ns_test << endl; outfile << "## number of variance components = " << cPar.n_vc << endl; outfile << "## pve estimates = "; @@ -3267,10 +3275,10 @@ void GEMMA::WriteLog(int argc, char **argv, PARAM &cPar) { outfile << "## number of total genes = " << cPar.ng_total << endl; outfile << "## number of analyzed genes = " << cPar.ng_test << endl; } else if (cPar.file_epm.empty()) { - outfile << "## number of total SNPs = " << cPar.ns_total << endl; - outfile << "## number of analyzed SNPs = " << cPar.ns_test << endl; + outfile << "## number of total SNPs/var = " << cPar.ns_total << endl; + outfile << "## number of analyzed SNPs/var = " << cPar.ns_test << endl; } else { - outfile << "## number of analyzed SNPs = " << cPar.ns_test << endl; + outfile << "## number of analyzed SNPs/var = " << cPar.ns_test << endl; } if (cPar.a_mode == 13) { diff --git a/src/gemma.h b/src/gemma.h index cd1683a..4deab51 
100644 --- a/src/gemma.h +++ b/src/gemma.h @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,7 +42,8 @@ #include "gsl/gsl_vector.h" #include "debug.h" -#include "eigenlib.h" +// #include "eigenlib.h" +#include "fastblas.h" #include "gzstream.h" #include "io.h" #include "lapack.h" @@ -49,43 +52,24 @@ using namespace std; // Print progress bar. -void ProgressBar(string str, double p, double total) { - double progress = (100.0 * p / total); - int barsize = (int)(progress / 2.0); - char bar[51]; - - cout << str; - for (int i = 0; i < 50; i++) { - if (i < barsize) { - bar[i] = '='; - } else { - bar[i] = ' '; - } - cout << bar[i]; - } - cout << setprecision(2) << fixed << progress << "%\r" << flush; - - return; -} - -// Print progress bar with acceptance ratio. 
void ProgressBar(string str, double p, double total, double ratio) { - double progress = (100.0 * p / total); - int barsize = (int)(progress / 2.0); - char bar[51]; - - cout << str; - for (int i = 0; i < 50; i++) { - if (i < barsize) { - bar[i] = '='; - } else { - bar[i] = ' '; - } - cout << bar[i]; - } - cout << setprecision(2) << fixed << progress << "% " << ratio << "\r" - << flush; - return; + assert(p<=total); + assert(p>=0); + if (total <= 0.0) return; + const double progress = (100.0 * p / total); + const uint barsize = (int)(progress / 2.0); // characters + // cout << barsize << endl; + // cout << str << " "; + // cout << p << "/" << total << endl; + assert(barsize < 101); // corrupted data somehow + if (barsize > 0) { + cout << std::string(barsize,'='); + } + cout << std::string(50-barsize,' '); + cout << setprecision(0) << fixed << " " << progress << "%"; + if (ratio != -1.0) + cout << setprecision(2) << " " << ratio; + cout << "\r" << flush; } bool isBlankLine(char const *line) { @@ -177,7 +161,7 @@ bool ReadFile_snps_header(const string &file_snps, set<string> &setSnps) { // Read header. 
HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); if (header.rs_col == 0 && (header.chr_col == 0 || header.pos_col == 0)) { @@ -233,7 +217,7 @@ bool ReadFile_log(const string &file_log, double &pheno_mean) { size_t flag = 0; while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); ch_ptr = strtok(NULL, " , \t"); if (ch_ptr != NULL && strcmp(ch_ptr, "estimated") == 0) { @@ -241,7 +225,7 @@ bool ReadFile_log(const string &file_log, double &pheno_mean) { if (ch_ptr != NULL && strcmp(ch_ptr, "mean") == 0) { ch_ptr = strtok(NULL, " , \t"); if (ch_ptr != NULL && strcmp(ch_ptr, "=") == 0) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); pheno_mean = atof(ch_ptr); flag = 1; } @@ -339,7 +323,7 @@ bool ReadFile_column(const string &file_pheno, vector<int> &indicator_idv, string id; double p; while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (int i = 0; i < (p_column - 1); ++i) { ch_ptr = strtok(NULL, " , \t"); } @@ -511,17 +495,17 @@ bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) { string minor; while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); chr = ch_ptr; - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); rs = ch_ptr; - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); cM = atof(ch_ptr); - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); b_pos = atol(ch_ptr); - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); minor = ch_ptr; - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); major = ch_ptr; SNPINFO sInfo = {chr, rs, cM, b_pos, minor, major, 0, -9, -9, 0, 0, 0}; @@ -567,12 +551,12 @@ bool ReadFile_fam(const string 
&file_fam, vector<vector<int>> &indicator_pheno, } while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " \t"); - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); + ch_ptr = strtok_safe(NULL, " \t"); id = ch_ptr; - ch_ptr = strtok(NULL, " \t"); - ch_ptr = strtok(NULL, " \t"); - ch_ptr = strtok(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); ch_ptr = strtok(NULL, " \t"); size_t i = 0; @@ -620,7 +604,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, - size_t &ns_test, bool debug) { + size_t &ns_test) { debug_msg("entered"); indicator_snp.clear(); snpInfo.clear(); @@ -631,12 +615,12 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, return false; } - gsl_vector *genotype = gsl_vector_alloc(W->size1); - gsl_vector *genotype_miss = gsl_vector_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); + gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -674,11 +658,11 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, file_pos = 0; auto count_warnings = 0; while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char 
*)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); minor = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); major = ch_ptr; if (setSnps.size() != 0 && setSnps.count(rs) == 0) { @@ -693,7 +677,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, } if (mapRS2bp.count(rs) == 0) { - if (debug && count_warnings++ < 10) { + if (is_debug_mode() && count_warnings++ < 10) { std::string msg = "Can't figure out position for "; msg += rs; debug_msg(msg); @@ -719,7 +703,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, c_idv = 0; gsl_vector_set_zero(genotype_miss); for (int i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) continue; @@ -842,12 +826,12 @@ bool ReadFile_bed(const string &file_bed, const set<string> &setSnps, return false; } - gsl_vector *genotype = gsl_vector_alloc(W->size1); - gsl_vector *genotype_miss = gsl_vector_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); + gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -1029,13 +1013,13 @@ bool Bimbam_ReadOneSNP(const size_t inc, const vector<int> &indicator_idv, bool flag = false; for (size_t i = 0; i < inc; i++) { - !safeGetline(infile, 
line).eof(); + safeGetline(infile, line).eof(); } if (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); geno_mean = 0.0; double d; @@ -1043,7 +1027,7 @@ bool Bimbam_ReadOneSNP(const size_t inc, const vector<int> &indicator_idv, vector<size_t> geno_miss; for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } @@ -1159,9 +1143,7 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, size_t i_test = 0, i_total = 0, j_test = 0, j_total = 0; while (getline(infile, line)) { if (i_total == ni_total) { - cout << "error! number of rows in the kinship " - << "file is larger than the number of phentypes." << endl; - error = true; + fail_msg("number of rows in the kinship file is larger than the number of phentypes"); } if (indicator_idv[i_total] == 0) { @@ -1174,10 +1156,7 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, ch_ptr = strtok((char *)line.c_str(), " , \t"); while (ch_ptr != NULL) { if (j_total == ni_total) { - cout << "error! number of columns in the " - << "kinship file is larger than the number" - << " of phenotypes for row = " << i_total << endl; - error = true; + fail_msg(string("number of columns in the kinship file is larger than the number of individuals for row = ")+to_string(i_total)); } d = atof(ch_ptr); @@ -1190,18 +1169,14 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, ch_ptr = strtok(NULL, " , \t"); } if (j_total != ni_total) { - cout << "error! 
number of columns in the kinship " - << "file do not match the number of phentypes for " - << "row = " << i_total << endl; - error = true; + string msg = "number of columns in the kinship file does not match the number of individuals for row = " + to_string( i_total ); + fail_msg(msg); } i_total++; i_test++; } if (i_total != ni_total) { - cout << "error! number of rows in the kinship file do " - << "not match the number of phenotypes." << endl; - error = true; + fail_msg("number of rows in the kinship file does not match the number of individuals."); } } else { map<size_t, size_t> mapID2ID; @@ -1218,11 +1193,11 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, size_t n_id1, n_id2; while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); id1 = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); id2 = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); d = atof(ch_ptr); if (mapID2num.count(id1) == 0 || mapID2num.count(id2) == 0) { continue; @@ -1237,9 +1212,10 @@ void ReadFile_kin(const string &file_kin, vector<int> &indicator_idv, Cov_d = gsl_matrix_get(G, n_id1, n_id2); if (Cov_d != 0 && Cov_d != d) { - cout << "error! redundant and unequal terms in the " + cerr << "error! 
redundant and unequal terms in the " << "kinship file, for id1 = " << id1 << " and id2 = " << id2 << endl; + fail_msg(""); } else { gsl_matrix_set(G, n_id1, n_id2, d); gsl_matrix_set(G, n_id2, n_id1, d); @@ -1278,7 +1254,6 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv, infile.close(); infile.clear(); - return; } void ReadFile_eigenU(const string &file_ku, bool &error, gsl_matrix *U) { @@ -1354,7 +1329,7 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { error = true; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); d = atof(ch_ptr); ch_ptr = strtok(NULL, " , \t"); @@ -1391,12 +1366,12 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, bool process_ksnps = ksnps.size(); size_t ni_total = matrix_kin->size1; - gsl_vector *geno = gsl_vector_alloc(ni_total); - gsl_vector *geno_miss = gsl_vector_alloc(ni_total); + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); // Xlarge contains inds x markers const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(ni_total, msize); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); enforce_msg(Xlarge, "allocate Xlarge"); gsl_matrix_set_zero(Xlarge); @@ -1405,9 +1380,9 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, size_t ns_test = 0; for (size_t t = 0; t < indicator_snp.size(); ++t) { string line; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) continue; @@ -1421,7 +1396,12 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, uint token_num = 0; for (auto x = tokens; x != rend; x++) token_num++; - enforce_str(token_num == ni_total + 3, line + " count 
fields"); + if (token_num != ni_total+3) { + cerr << line << endl; + cerr << token_num << " != " << ni_total << endl; + warning_msg("Columns in geno file do not match # individuals"); + } + enforce_msg(token_num <= ni_total + 3,"not enough genotype fields"); } auto snp = *tokens; // first field @@ -1480,12 +1460,12 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, // compute kinship matrix and return in matrix_kin a SNP at a time if (ns_test % msize == 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } if (ns_test % msize != 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); } cout << endl; @@ -1531,14 +1511,14 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, double d, geno_mean, geno_var; size_t ni_total = matrix_kin->size1; - gsl_vector *geno = gsl_vector_alloc(ni_total); + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); size_t ns_test = 0; int n_bit; // Create a large matrix. const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(ni_total, msize); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); gsl_matrix_set_zero(Xlarge); // Calculate n_bit and c, the number of bit for each snp. 
@@ -1556,7 +1536,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, for (size_t t = 0; t < indicator_snp.size(); ++t) { if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) { continue; @@ -1626,13 +1606,13 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, ns_test++; if (ns_test % msize == 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } if (ns_test % msize != 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); } cout << endl; @@ -1659,7 +1639,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, // genotype and calculate K. bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, - const bool calc_K, bool debug) { + const bool calc_K) { debug_msg("entered"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { @@ -1674,8 +1654,8 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, gsl_matrix_set_zero(K); } - gsl_vector *genotype = gsl_vector_alloc(UtX->size1); - gsl_vector *genotype_miss = gsl_vector_alloc(UtX->size1); + gsl_vector *genotype = gsl_vector_safe_alloc(UtX->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(UtX->size1); double geno, geno_mean; size_t n_miss; @@ -1687,21 +1667,21 @@ bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, int c_idv = 0, c_snp = 0; for (int i = 0; i < ns_total; ++i) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (indicator_snp[i] == 0) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = 
strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); c_idv = 0; geno_mean = 0; n_miss = 0; gsl_vector_set_zero(genotype_miss); for (int j = 0; j < ni_total; ++j) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[j] == 0) { continue; } @@ -1764,7 +1744,7 @@ bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, - const size_t ns_test, bool debug) { + const size_t ns_test) { debug_msg("entered"); igzstream infile(file_geno.c_str(), igzstream::in); if (!infile) { @@ -1785,8 +1765,8 @@ bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, gsl_matrix_set_zero(K); } - gsl_vector *genotype = gsl_vector_alloc(ni_test); - gsl_vector *genotype_miss = gsl_vector_alloc(ni_test); + gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(ni_test); double geno, geno_mean; size_t n_miss; @@ -1796,21 +1776,21 @@ bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, size_t c_idv = 0, c_snp = 0; for (size_t i = 0; i < ns_total; ++i) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (indicator_snp[i] == 0) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); c_idv = 0; geno_mean = 0; n_miss = 0; gsl_vector_set_zero(genotype_miss); for (uint j = 0; j < ni_total; ++j) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[j] == 0) { continue; } @@ -1904,7 +1884,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, 
gsl_matrix_set_zero(K); } - gsl_vector *genotype = gsl_vector_alloc(UtX->size1); + gsl_vector *genotype = gsl_vector_safe_alloc(UtX->size1); double geno, geno_mean; size_t n_miss; @@ -2040,7 +2020,7 @@ bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, gsl_matrix_set_zero(K); } - gsl_vector *genotype = gsl_vector_alloc(ni_test); + gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); double geno, geno_mean; size_t n_miss; @@ -2160,22 +2140,26 @@ bool ReadFile_est(const string &file_est, const vector<size_t> &est_column, size_t n = *max_element(est_column.begin(), est_column.end()); while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); alpha = 0.0; beta = 0.0; gamma = 1.0; for (size_t i = 0; i < n + 1; ++i) { if (i == est_column[0] - 1) { + enforce(ch_ptr); rs = ch_ptr; } if (i == est_column[1] - 1) { + enforce(ch_ptr); alpha = atof(ch_ptr); } if (i == est_column[2] - 1) { + enforce(ch_ptr); beta = atof(ch_ptr); } if (i == est_column[3] - 1) { + enforce(ch_ptr); gamma = atof(ch_ptr); } if (i < n) { @@ -2237,7 +2221,7 @@ bool ReadFile_gene(const string &file_gene, vector<double> &vec_read, getline(infile, line); while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; ch_ptr = strtok(NULL, " , \t"); @@ -2274,759 +2258,6 @@ bool ReadFile_gene(const string &file_gene, vector<double> &vec_read, return true; } -// WJA Added -// Read Oxford sample file. -bool ReadFile_sample(const string &file_sample, - vector<vector<int>> &indicator_pheno, - vector<vector<double>> &pheno, - const vector<size_t> &p_column, vector<int> &indicator_cvt, - vector<vector<double>> &cvt, size_t &n_cvt) { - debug_msg("entered"); - indicator_pheno.clear(); - pheno.clear(); - indicator_cvt.clear(); - - igzstream infile(file_sample.c_str(), igzstream::in); - - if (!infile) { - cout << "error! 
fail to open sample file: " << file_sample << endl; - return false; - } - - string line; - char *ch_ptr; - - string id; - double p, d; - - vector<double> pheno_row; - vector<int> ind_pheno_row; - int flag_na = 0; - - size_t num_cols = 0; - size_t num_p_in_file = 0; - size_t num_cvt_in_file = 0; - - map<size_t, size_t> mapP2c; - for (size_t i = 0; i < p_column.size(); i++) { - mapP2c[p_column[i]] = i; - pheno_row.push_back(-9); - ind_pheno_row.push_back(0); - } - - // Read header line1. - if (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " \t"); - if (strcmp(ch_ptr, "ID_1") != 0) { - return false; - } - ch_ptr = strtok(NULL, " \t"); - if (strcmp(ch_ptr, "ID_2") != 0) { - return false; - } - ch_ptr = strtok(NULL, " \t"); - if (strcmp(ch_ptr, "missing") != 0) { - return false; - } - while (ch_ptr != NULL) { - num_cols++; - ch_ptr = strtok(NULL, " \t"); - } - num_cols--; - } - - vector<map<uint32_t, size_t>> cvt_factor_levels; - - char col_type[num_cols]; - - // Read header line2. 
- if (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " \t"); - if (strcmp(ch_ptr, "0") != 0) { - return false; - } - ch_ptr = strtok(NULL, " \t"); - if (strcmp(ch_ptr, "0") != 0) { - return false; - } - ch_ptr = strtok(NULL, " \t"); - if (strcmp(ch_ptr, "0") != 0) { - return false; - } - size_t it = 0; - ch_ptr = strtok(NULL, " \t"); - if (ch_ptr != NULL) - while (ch_ptr != NULL) { - col_type[it++] = ch_ptr[0]; - if (ch_ptr[0] == 'D') { - cvt_factor_levels.push_back(map<uint32_t, size_t>()); - num_cvt_in_file++; - } - if (ch_ptr[0] == 'C') { - num_cvt_in_file++; - } - if ((ch_ptr[0] == 'P') || (ch_ptr[0] == 'B')) { - num_p_in_file++; - } - ch_ptr = strtok(NULL, " \t"); - } - } - - while (!safeGetline(infile, line).eof()) { - - ch_ptr = strtok((char *)line.c_str(), " \t"); - - for (int it = 0; it < 3; it++) { - ch_ptr = strtok(NULL, " \t"); - } - - size_t i = 0; - size_t p_i = 0; - size_t fac_cvt_i = 0; - - while (i < num_cols) { - - if ((col_type[i] == 'P') || (col_type[i] == 'B')) { - if (mapP2c.count(p_i + 1) != 0) { - if (strcmp(ch_ptr, "NA") == 0) { - ind_pheno_row[mapP2c[p_i + 1]] = 0; - pheno_row[mapP2c[p_i + 1]] = -9; - } else { - p = atof(ch_ptr); - ind_pheno_row[mapP2c[p_i + 1]] = 1; - pheno_row[mapP2c[p_i + 1]] = p; - } - } - p_i++; - } - if (col_type[i] == 'D') { - - // NOTE THIS DOES NOT CHECK TO BE SURE LEVEL - // IS INTEGRAL i.e for atoi error. - if (strcmp(ch_ptr, "NA") != 0) { - uint32_t level = atoi(ch_ptr); - if (cvt_factor_levels[fac_cvt_i].count(level) == 0) { - cvt_factor_levels[fac_cvt_i][level] = - cvt_factor_levels[fac_cvt_i].size(); - } - } - fac_cvt_i++; - } - - ch_ptr = strtok(NULL, " \t"); - i++; - } - - indicator_pheno.push_back(ind_pheno_row); - pheno.push_back(pheno_row); - } - - // Close and reopen the file. - infile.close(); - infile.clear(); - - if (num_cvt_in_file > 0) { - igzstream infile2(file_sample.c_str(), igzstream::in); - - if (!infile2) { - cout << "error! 
fail to open sample file: " << file_sample << endl; - return false; - } - - // Skip header. - safeGetline(infile2, line); - safeGetline(infile2, line); - - // Pull in the covariates now we now the number of - // factor levels. - while (!safeGetline(infile2, line).eof()) { - - vector<double> v_d; - flag_na = 0; - ch_ptr = strtok((char *)line.c_str(), " \t"); - - for (int it = 0; it < 3; it++) { - ch_ptr = strtok(NULL, " \t"); - } - - size_t i = 0; - size_t fac_cvt_i = 0; - size_t num_fac_levels; - while (i < num_cols) { - - if (col_type[i] == 'C') { - if (strcmp(ch_ptr, "NA") == 0) { - flag_na = 1; - d = -9; - } else { - d = atof(ch_ptr); - } - - v_d.push_back(d); - } - - if (col_type[i] == 'D') { - - // NOTE THIS DOES NOT CHECK TO BE SURE - // LEVEL IS INTEGRAL i.e for atoi error. - num_fac_levels = cvt_factor_levels[fac_cvt_i].size(); - if (num_fac_levels > 1) { - if (strcmp(ch_ptr, "NA") == 0) { - flag_na = 1; - for (size_t it = 0; it < num_fac_levels - 1; it++) { - v_d.push_back(-9); - } - } else { - uint32_t level = atoi(ch_ptr); - for (size_t it = 0; it < num_fac_levels - 1; it++) { - cvt_factor_levels[fac_cvt_i][level] == it + 1 - ? v_d.push_back(1.0) - : v_d.push_back(0.0); - } - } - } - fac_cvt_i++; - } - - ch_ptr = strtok(NULL, " \t"); - i++; - } - - if (flag_na == 0) { - indicator_cvt.push_back(1); - } else { - indicator_cvt.push_back(0); - } - cvt.push_back(v_d); - } - - if (indicator_cvt.empty()) { - n_cvt = 0; - } else { - flag_na = 0; - for (vector<int>::size_type i = 0; i < indicator_cvt.size(); ++i) { - if (indicator_cvt[i] == 0) { - continue; - } - - if (flag_na == 0) { - flag_na = 1; - n_cvt = cvt[i].size(); - } - if (flag_na != 0 && n_cvt != cvt[i].size()) { - cout << "error! number of covariates in row " << i - << " do not match other rows." << endl; - return false; - } - } - } - - infile2.close(); - infile2.clear(); - } - return true; -} - -// WJA Added. -// Read bgen file, the first time. 
-bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, - const gsl_matrix *W, vector<int> &indicator_idv, - vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, - const double &maf_level, const double &miss_level, - const double &hwe_level, const double &r2_level, - size_t &ns_test) { - - debug_msg("entered"); - indicator_snp.clear(); - - ifstream infile(file_bgen.c_str(), ios::binary); - if (!infile) { - cout << "error reading bgen file:" << file_bgen << endl; - return false; - } - - gsl_vector *genotype = gsl_vector_alloc(W->size1); - gsl_vector *genotype_miss = gsl_vector_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); - gsl_permutation *pmt = gsl_permutation_alloc(W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp(WtW, pmt, &sig); - LUInvert(WtW, pmt, WtWi); - - // Read in header. 
- uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); - infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); - bgen_snp_block_offset -= 4; - infile.ignore(4 + bgen_header_length - 20); - bgen_snp_block_offset -= 4 + bgen_header_length - 20; - infile.read(reinterpret_cast<char *>(&bgen_flags), 4); - bgen_snp_block_offset -= 4; - bool CompressedSNPBlocks = bgen_flags & 0x1; - bool LongIds = bgen_flags & 0x4; - - if (!LongIds) { - return false; - } - - infile.ignore(bgen_snp_block_offset); - - ns_test = 0; - - size_t ns_total = static_cast<size_t>(bgen_nsnps); - - snpInfo.clear(); - string rs; - long int b_pos; - string chr; - string major; - string minor; - string id; - - double v_x, v_w; - int c_idv = 0; - - double maf, geno, geno_old; - size_t n_miss; - size_t n_0, n_1, n_2; - int flag_poly; - - double bgen_geno_prob_AA, bgen_geno_prob_AB; - double bgen_geno_prob_BB, bgen_geno_prob_non_miss; - - // Total number of samples in phenotype file. - size_t ni_total = indicator_idv.size(); - - // Number of samples to use in test. 
- size_t ni_test = 0; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - - for (size_t i = 0; i < ni_total; ++i) { - ni_test += indicator_idv[i]; - } - - for (size_t t = 0; t < ns_total; ++t) { - - id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char *>(&bgen_N), 4); - infile.read(reinterpret_cast<char *>(&bgen_LS), 2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char *>(&bgen_LR), 2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char *>(&bgen_LC), 2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); - - infile.read(reinterpret_cast<char *>(&bgen_LA), 4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char *>(&bgen_LB), 4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - // Should we switch according to MAF? 
- minor = bgen_B_allele; - major = bgen_A_allele; - b_pos = static_cast<long int>(bgen_SNP_pos); - - uint16_t unzipped_data[3 * bgen_N]; - - if (setSnps.size() != 0 && setSnps.count(rs) == 0) { - SNPINFO sInfo = { - "-9", rs, -9, -9, minor, major, static_cast<size_t>(-9), - -9, (long int)-9}; - - snpInfo.push_back(sInfo); - indicator_snp.push_back(0); - if (CompressedSNPBlocks) - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - else - bgen_P = 6 * bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if (CompressedSNPBlocks) { - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size = 6 * bgen_N; - - infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); - int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), - reinterpret_cast<uLongf *>(&unzipped_data_size), - reinterpret_cast<Bytef *>(zipped_data), - static_cast<uLong>(bgen_P)); - assert(result == Z_OK); - - } else { - bgen_P = 6 * bgen_N; - infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); - } - - maf = 0; - n_miss = 0; - flag_poly = 0; - geno_old = -9; - n_0 = 0; - n_1 = 0; - n_2 = 0; - c_idv = 0; - gsl_vector_set_zero(genotype_miss); - for (size_t i = 0; i < bgen_N; ++i) { - - // CHECK this set correctly! - if (indicator_idv[i] == 0) { - continue; - } - - bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; - bgen_geno_prob_AB = - static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; - bgen_geno_prob_BB = - static_cast<double>(unzipped_data[i * 3 + 2]) / 32768.0; - bgen_geno_prob_non_miss = - bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; - - // CHECK 0.1 OK. 
- if (bgen_geno_prob_non_miss < 0.9) { - gsl_vector_set(genotype_miss, c_idv, 1); - n_miss++; - c_idv++; - continue; - } - - bgen_geno_prob_AA /= bgen_geno_prob_non_miss; - bgen_geno_prob_AB /= bgen_geno_prob_non_miss; - bgen_geno_prob_BB /= bgen_geno_prob_non_miss; - - geno = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; - if (geno >= 0 && geno <= 0.5) { - n_0++; - } - if (geno > 0.5 && geno < 1.5) { - n_1++; - } - if (geno >= 1.5 && geno <= 2.0) { - n_2++; - } - - gsl_vector_set(genotype, c_idv, geno); - - // CHECK WHAT THIS DOES. - if (flag_poly == 0) { - geno_old = geno; - flag_poly = 2; - } - if (flag_poly == 2 && geno != geno_old) { - flag_poly = 1; - } - - maf += geno; - - c_idv++; - } - - maf /= 2.0 * static_cast<double>(ni_test - n_miss); - - SNPINFO sInfo = {chr, rs, -9, b_pos, - minor, major, n_miss, (double)n_miss / (double)ni_test, - maf}; - snpInfo.push_back(sInfo); - - if ((double)n_miss / (double)ni_test > miss_level) { - indicator_snp.push_back(0); - continue; - } - - if ((maf < maf_level || maf > (1.0 - maf_level)) && maf_level != -1) { - indicator_snp.push_back(0); - continue; - } - - if (flag_poly != 1) { - indicator_snp.push_back(0); - continue; - } - - if (hwe_level != 0 && maf_level != -1) { - if (CalcHWE(n_0, n_2, n_1) < hwe_level) { - indicator_snp.push_back(0); - continue; - } - } - - // Filter SNP if it is correlated with W - // unless W has only one column, of 1s. 
- for (size_t i = 0; i < genotype->size; ++i) { - if (gsl_vector_get(genotype_miss, i) == 1) { - geno = maf * 2.0; - gsl_vector_set(genotype, i, geno); - } - } - - gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); - gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - gsl_blas_ddot(genotype, genotype, &v_x); - gsl_blas_ddot(Wtx, WtWiWtx, &v_w); - - if (W->size2 != 1 && v_w / v_x >= r2_level) { - indicator_snp.push_back(0); - continue; - } - - indicator_snp.push_back(1); - ns_test++; - } - - return true; -} - -// Read oxford genotype file and calculate kinship matrix. -bool bgenKin(const string &file_oxford, vector<int> &indicator_snp, - const int k_mode, const int display_pace, gsl_matrix *matrix_kin) { - debug_msg("entered"); - string file_bgen = file_oxford; - ifstream infile(file_bgen.c_str(), ios::binary); - if (!infile) { - cout << "error reading bgen file:" << file_bgen << endl; - return false; - } - - // Read in header. - uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); - infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); - bgen_snp_block_offset -= 4; - infile.ignore(4 + bgen_header_length - 20); - bgen_snp_block_offset -= 4 + bgen_header_length - 20; - infile.read(reinterpret_cast<char *>(&bgen_flags), 4); - bgen_snp_block_offset -= 4; - bool CompressedSNPBlocks = bgen_flags & 0x1; - - infile.ignore(bgen_snp_block_offset); - - double bgen_geno_prob_AA, bgen_geno_prob_AB; - double bgen_geno_prob_BB, bgen_geno_prob_non_miss; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - 
std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - string id; - string rs; - string chr; - double genotype; - - size_t n_miss; - double d, geno_mean, geno_var; - - size_t ni_total = matrix_kin->size1; - gsl_vector *geno = gsl_vector_alloc(ni_total); - gsl_vector *geno_miss = gsl_vector_alloc(ni_total); - - size_t ns_test = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - - if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading bgen SNPs ", t, indicator_snp.size() - 1); - } - - id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char *>(&bgen_N), 4); - infile.read(reinterpret_cast<char *>(&bgen_LS), 2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char *>(&bgen_LR), 2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char *>(&bgen_LC), 2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); - - infile.read(reinterpret_cast<char *>(&bgen_LA), 4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char *>(&bgen_LB), 4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - uint16_t unzipped_data[3 * bgen_N]; - - if (indicator_snp[t] == 0) { - if (CompressedSNPBlocks) - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - else - bgen_P = 6 * bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if (CompressedSNPBlocks) { - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size = 6 * bgen_N; - - infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); - - int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), - reinterpret_cast<uLongf *>(&unzipped_data_size), - reinterpret_cast<Bytef *>(zipped_data), - 
static_cast<uLong>(bgen_P)); - assert(result == Z_OK); - - } else { - - bgen_P = 6 * bgen_N; - infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); - } - - geno_mean = 0.0; - n_miss = 0; - geno_var = 0.0; - gsl_vector_set_all(geno_miss, 0); - - for (size_t i = 0; i < bgen_N; ++i) { - - bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; - bgen_geno_prob_AB = - static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; - bgen_geno_prob_BB = - static_cast<double>(unzipped_data[i * 3 + 2]) / 32768.0; - // WJA - bgen_geno_prob_non_miss = - bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; - if (bgen_geno_prob_non_miss < 0.9) { - gsl_vector_set(geno_miss, i, 0.0); - n_miss++; - } else { - - bgen_geno_prob_AA /= bgen_geno_prob_non_miss; - bgen_geno_prob_AB /= bgen_geno_prob_non_miss; - bgen_geno_prob_BB /= bgen_geno_prob_non_miss; - - genotype = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; - - gsl_vector_set(geno, i, genotype); - gsl_vector_set(geno_miss, i, 1.0); - geno_mean += genotype; - geno_var += genotype * genotype; - } - } - - geno_mean /= (double)(ni_total - n_miss); - geno_var += geno_mean * geno_mean * (double)n_miss; - geno_var /= (double)ni_total; - geno_var -= geno_mean * geno_mean; - - for (size_t i = 0; i < ni_total; ++i) { - if (gsl_vector_get(geno_miss, i) == 0) { - gsl_vector_set(geno, i, geno_mean); - } - } - - gsl_vector_add_constant(geno, -1.0 * geno_mean); - - if (geno_var != 0) { - if (k_mode == 1) { - gsl_blas_dsyr(CblasUpper, 1.0, geno, matrix_kin); - } else if (k_mode == 2) { - gsl_blas_dsyr(CblasUpper, 1.0 / geno_var, geno, matrix_kin); - } else { - cout << "Unknown kinship mode." 
<< endl; - } - } - - ns_test++; - } - cout << endl; - - gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test); - - for (size_t i = 0; i < ni_total; ++i) { - for (size_t j = 0; j < i; ++j) { - d = gsl_matrix_get(matrix_kin, j, i); - gsl_matrix_set(matrix_kin, i, j, d); - } - } - - gsl_vector_free(geno); - gsl_vector_free(geno_miss); - - infile.close(); - infile.clear(); - - return true; -} - // Read header to determine which column contains which item. bool ReadHeader_io(const string &line, HEADER &header) { debug_msg("entered"); @@ -3314,7 +2545,7 @@ bool ReadFile_cat(const string &file_cat, map<string, size_t> &mapRS2cat, // Read header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); // Use the header to count the number of categories. @@ -3340,10 +2571,11 @@ bool ReadFile_cat(const string &file_cat, map<string, size_t> &mapRS2cat, // Read the following lines to record mapRS2cat. while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); i_cat = 0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } else if (header.chr_col != 0 && header.chr_col == i + 1) { @@ -3436,13 +2668,13 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, double d, geno_mean, geno_var; size_t ni_test = matrix_kin->size1; - gsl_vector *geno = gsl_vector_alloc(ni_test); - gsl_vector *geno_miss = gsl_vector_alloc(ni_test); + gsl_vector *geno = gsl_vector_safe_alloc(ni_test); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_test); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_matrix *WtW = 
gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -3459,21 +2691,21 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, // Create a large matrix. const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(ni_test, msize * n_vc); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_test, msize * n_vc); gsl_matrix_set_zero(Xlarge); size_t ns_test = 0; for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) continue; - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); rs = snpInfo[t].rs_number; // This line is new. 
@@ -3487,7 +2719,7 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, if (indicator_idv[i] == 0) { continue; } - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(geno_miss, i, 0); n_miss++; @@ -3536,7 +2768,7 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, ns_vec[0]++; if (ns_vec[0] % msize == 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } else if (mapRS2cat.count(rs) != 0) { @@ -3553,7 +2785,7 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); gsl_matrix_view kin_sub = gsl_matrix_submatrix( matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - eigenlib_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); gsl_matrix_set_zero(&X_sub.matrix); @@ -3569,7 +2801,7 @@ bool BimbamKinUncentered(const string &file_geno, const set<string> ksnps, gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); gsl_matrix_view kin_sub = gsl_matrix_submatrix(matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - eigenlib_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); } } @@ -3628,12 +2860,12 @@ bool PlinkKin(const string &file_bed, const int display_pace, size_t ni_test = matrix_kin->size1; size_t ni_total = indicator_idv.size(); - gsl_vector *geno = gsl_vector_alloc(ni_test); + gsl_vector *geno = gsl_vector_safe_alloc(ni_test); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *WtWiWtx = gsl_vector_alloc(W->size2); + gsl_vector *Wtx = 
gsl_vector_safe_alloc(W->size2); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); gsl_permutation *pmt = gsl_permutation_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -3653,7 +2885,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, // Create a large matrix. const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(ni_test, msize * n_vc); + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_test, msize * n_vc); gsl_matrix_set_zero(Xlarge); // Calculate n_bit and c, the number of bit for each SNP. @@ -3671,7 +2903,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, for (size_t t = 0; t < indicator_snp.size(); ++t) { if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); } if (indicator_snp[t] == 0) { continue; @@ -3762,7 +2994,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, ns_vec[0]++; if (ns_vec[0] % msize == 0) { - eigenlib_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); gsl_matrix_set_zero(Xlarge); } } else if (mapRS2cat.count(rs) != 0) { @@ -3779,7 +3011,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); gsl_matrix_view kin_sub = gsl_matrix_submatrix( matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - eigenlib_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); gsl_matrix_set_zero(&X_sub.matrix); @@ -3795,7 +3027,7 @@ bool PlinkKin(const string &file_bed, const int display_pace, gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); gsl_matrix_view kin_sub 
= gsl_matrix_submatrix(matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - eigenlib_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, &kin_sub.matrix); } } @@ -3852,8 +3084,8 @@ bool MFILEKin(const size_t mfile_mode, const string &file_mfile, string file_name; - gsl_matrix *kin_tmp = gsl_matrix_alloc(matrix_kin->size1, matrix_kin->size2); - gsl_vector *ns_tmp = gsl_vector_alloc(vector_ns->size); + gsl_matrix *kin_tmp = gsl_matrix_safe_alloc(matrix_kin->size1, matrix_kin->size2); + gsl_vector *ns_tmp = gsl_vector_safe_alloc(vector_ns->size); size_t l = 0; double d; @@ -3929,9 +3161,9 @@ bool ReadFile_wsnp(const string &file_wsnp, map<string, double> &mapRS2weight) { double weight; while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); weight = atof(ch_ptr); mapRS2weight[rs] = weight; } @@ -3960,17 +3192,18 @@ bool ReadFile_wsnp(const string &file_wcat, const size_t n_vc, // Read header. 
HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); while (!safeGetline(infile, line).eof()) { if (isBlankLine(line)) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); size_t t = 0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } else if (header.chr_col != 0 && header.chr_col == i + 1) { @@ -4046,13 +3279,12 @@ void ReadFile_beta(const string &file_beta, string type; string rs, chr, a1, a0, pos, cm; - double z = 0, beta = 0, se_beta = 0, chisq = 0, pvalue = 0, zsquare = 0, - af = 0, var_x = 0; + double z = 0, beta = 0, se_beta = 0, pvalue = 0, zsquare = 0; // af = 0; size_t n_total = 0, n_mis = 0, n_obs = 0, n_case = 0, n_control = 0; // Read header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); if (header.n_col == 0) { @@ -4074,21 +3306,22 @@ void ReadFile_beta(const string &file_beta, if (isBlankLine(line)) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); z = 0; beta = 0; se_beta = 0; - chisq = 0; + auto chisq = 0.0; pvalue = 0; n_total = 0; n_mis = 0; n_obs = 0; n_case = 0; n_control = 0; - af = 0; - var_x = 0; + // af = 0; + // auto var_x = 0.0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } @@ -4118,7 +3351,7 @@ void ReadFile_beta(const string &file_beta, se_beta = atof(ch_ptr); } if (header.chisq_col != 0 && header.chisq_col == i + 1) { - chisq = atof(ch_ptr); + chisq = atof(ch_ptr); } if (header.p_col != 0 && header.p_col == i + 1) { pvalue = atof(ch_ptr); @@ -4139,12 +3372,12 @@ void ReadFile_beta(const string &file_beta, if (header.ncontrol_col != 0 && header.ncontrol_col == i + 1) { n_control = atoi(ch_ptr); } - if (header.af_col 
!= 0 && header.af_col == i + 1) { - af = atof(ch_ptr); - } - if (header.var_col != 0 && header.var_col == i + 1) { - var_x = atof(ch_ptr); - } + // if (header.af_col != 0 && header.af_col == i + 1) { + // af = atof(ch_ptr); + // } + // if (header.var_col != 0 && header.var_col == i + 1) { + // var_x = atof(ch_ptr); + // } ch_ptr = strtok(NULL, " , \t"); } @@ -4177,9 +3410,9 @@ void ReadFile_beta(const string &file_beta, } // Obtain var_x. - if (header.var_col == 0 && header.af_col != 0) { - var_x = 2.0 * af * (1.0 - af); - } + // if (header.var_col == 0 && header.af_col != 0) { + // var_x = 2.0 * af * (1.0 - af); + // } // If the SNP is also present in cor file, then do calculations. if ((mapRS2wA.size() == 0 || mapRS2wA.count(rs) != 0) && @@ -4228,13 +3461,13 @@ void ReadFile_beta(const string &file_beta, const map<string, double> &mapRS2wA, string type; string rs, chr, a1, a0, pos, cm; - double z = 0, beta = 0, se_beta = 0, chisq = 0, pvalue = 0, af = 0, var_x = 0; + double z = 0, beta = 0, se_beta = 0; // pvalue = 0, chisq=0, af = 0 , var_x = 0; size_t n_total = 0, n_mis = 0, n_obs = 0, n_case = 0, n_control = 0; size_t ni_total = 0, ns_total = 0, ns_test = 0; // Read header. 
HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_io(line, header); if (header.n_col == 0) { @@ -4255,21 +3488,22 @@ void ReadFile_beta(const string &file_beta, const map<string, double> &mapRS2wA, if (isBlankLine(line)) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); z = 0; beta = 0; se_beta = 0; - chisq = 0; - pvalue = 0; + // chisq = 0; + // pvalue = 0; n_total = 0; n_mis = 0; n_obs = 0; n_case = 0; n_control = 0; - af = 0; - var_x = 0; + // af = 0; + // double var_x = 0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } @@ -4298,12 +3532,12 @@ void ReadFile_beta(const string &file_beta, const map<string, double> &mapRS2wA, if (header.sebeta_col != 0 && header.sebeta_col == i + 1) { se_beta = atof(ch_ptr); } - if (header.chisq_col != 0 && header.chisq_col == i + 1) { - chisq = atof(ch_ptr); - } - if (header.p_col != 0 && header.p_col == i + 1) { - pvalue = atof(ch_ptr); - } + // if (header.chisq_col != 0 && header.chisq_col == i + 1) { + // chisq = atof(ch_ptr); + // } + // if (header.p_col != 0 && header.p_col == i + 1) { + // pvalue = atof(ch_ptr); + // } if (header.n_col != 0 && header.n_col == i + 1) { n_total = atoi(ch_ptr); @@ -4321,12 +3555,13 @@ void ReadFile_beta(const string &file_beta, const map<string, double> &mapRS2wA, n_control = atoi(ch_ptr); } - if (header.af_col != 0 && header.af_col == i + 1) { - af = atof(ch_ptr); - } - if (header.var_col != 0 && header.var_col == i + 1) { - var_x = atof(ch_ptr); - } + // if (header.af_col != 0 && header.af_col == i + 1) { + // af = atof(ch_ptr); + // } + + // if (header.var_col != 0 && header.var_col == i + 1) { + // var_x = atof(ch_ptr); + // } ch_ptr = strtok(NULL, " , \t"); } @@ -4540,8 +3775,8 @@ void ReadFile_vector(const string &file_vec, gsl_vector *vec) { char *ch_ptr; for (size_t i = 0; i < vec->size; i++) 
{ - !safeGetline(infile, line).eof(); - ch_ptr = strtok((char *)line.c_str(), " , \t"); + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); gsl_vector_set(vec, i, atof(ch_ptr)); } @@ -4563,9 +3798,10 @@ void ReadFile_matrix(const string &file_mat, gsl_matrix *mat) { char *ch_ptr; for (size_t i = 0; i < mat->size1; i++) { - !safeGetline(infile, line).eof(); - ch_ptr = strtok((char *)line.c_str(), " , \t"); + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (size_t j = 0; j < mat->size2; j++) { + enforce(ch_ptr); gsl_matrix_set(mat, i, j, atof(ch_ptr)); ch_ptr = strtok(NULL, " , \t"); } @@ -4590,18 +3826,20 @@ void ReadFile_matrix(const string &file_mat, gsl_matrix *mat1, char *ch_ptr; for (size_t i = 0; i < mat1->size1; i++) { - !safeGetline(infile, line).eof(); - ch_ptr = strtok((char *)line.c_str(), " , \t"); + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (size_t j = 0; j < mat1->size2; j++) { + enforce(ch_ptr); gsl_matrix_set(mat1, i, j, atof(ch_ptr)); ch_ptr = strtok(NULL, " , \t"); } } for (size_t i = 0; i < mat2->size1; i++) { - !safeGetline(infile, line).eof(); - ch_ptr = strtok((char *)line.c_str(), " , \t"); + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (size_t j = 0; j < mat2->size2; j++) { + enforce(ch_ptr); gsl_matrix_set(mat2, i, j, atof(ch_ptr)); ch_ptr = strtok(NULL, " , \t"); } @@ -4621,7 +3859,7 @@ void ReadFile_study(const string &file_study, gsl_matrix *Vq_mat, string sfile = file_study + ".size.txt"; string qfile = file_study + ".q.txt"; - gsl_vector *s = gsl_vector_alloc(s_vec->size + 1); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); ReadFile_matrix(Vqfile, Vq_mat); ReadFile_vector(sfile, s); @@ -4646,7 +3884,7 @@ void ReadFile_ref(const string &file_ref, gsl_matrix *S_mat, string sfile = file_ref + ".size.txt"; string Sfile = file_ref + ".S.txt"; - gsl_vector 
*s = gsl_vector_alloc(s_vec->size + 1); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); ReadFile_vector(sfile, s); ReadFile_matrix(Sfile, S_mat, Svar_mat); @@ -4672,9 +3910,9 @@ void ReadFile_mstudy(const string &file_mstudy, gsl_matrix *Vq_mat, gsl_vector_set_zero(s_vec); ni = 0; - gsl_matrix *Vq_sub = gsl_matrix_alloc(Vq_mat->size1, Vq_mat->size2); - gsl_vector *q_sub = gsl_vector_alloc(q_vec->size); - gsl_vector *s = gsl_vector_alloc(s_vec->size + 1); + gsl_matrix *Vq_sub = gsl_matrix_safe_alloc(Vq_mat->size1, Vq_mat->size2); + gsl_vector *q_sub = gsl_vector_safe_alloc(q_vec->size); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); igzstream infile(file_mstudy.c_str(), igzstream::in); if (!infile) { @@ -4763,9 +4001,9 @@ void ReadFile_mref(const string &file_mref, gsl_matrix *S_mat, gsl_vector_set_zero(s_vec); ni = 0; - gsl_matrix *S_sub = gsl_matrix_alloc(S_mat->size1, S_mat->size2); - gsl_matrix *Svar_sub = gsl_matrix_alloc(Svar_mat->size1, Svar_mat->size2); - gsl_vector *s = gsl_vector_alloc(s_vec->size + 1); + gsl_matrix *S_sub = gsl_matrix_safe_alloc(S_mat->size1, S_mat->size2); + gsl_matrix *Svar_sub = gsl_matrix_safe_alloc(Svar_mat->size1, Svar_mat->size2); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); igzstream infile(file_mref.c_str(), igzstream::in); if (!infile) { @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -32,8 +34,8 @@ using namespace std; -void ProgressBar(string str, double p, double total); -void ProgressBar(string str, double p, double total, double ratio); +void ProgressBar(string str, double p, double total, double ratio = -1.0); + std::istream &safeGetline(std::istream &is, std::string &t); bool 
ReadFile_snps(const string file_snps, set<string> &setSnps); @@ -64,7 +66,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps, const double &r2_level, map<string, string> &mapRS2chr, map<string, long int> &mapRS2bp, map<string, double> &mapRS2cM, vector<SNPINFO> &snpInfo, - size_t &ns_test, bool debug); + size_t &ns_test); bool ReadFile_bed(const string &file_bed, const set<string> &setSnps, const gsl_matrix *W, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, @@ -94,7 +96,7 @@ bool PlinkKin(const string &file_bed, vector<int> &indicator_snp, bool ReadFile_geno(const string file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, - const bool calc_K, bool debug); + const bool calc_K); bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, const bool calc_K); @@ -102,7 +104,7 @@ bool ReadFile_geno(const string &file_geno, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, - const size_t ns_test, bool debug); + const size_t ns_test); bool ReadFile_bed(const string &file_bed, vector<int> &indicator_idv, vector<int> &indicator_snp, vector<vector<unsigned char>> &Xt, gsl_matrix *K, const bool calc_K, const size_t ni_test, @@ -176,16 +178,6 @@ void ReadFile_mstudy(const string &file_mstudy, gsl_matrix *Vq, gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni); void ReadFile_mref(const string &file_mref, gsl_matrix *S_mat, gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); - -// WJA added. 
-bool bgenKin(const string &file_geno, vector<int> &indicator_snp, - const int k_mode, const int display_pace, gsl_matrix *matrix_kin); -bool ReadFile_bgen(const string &file_bgen, const set<string> &setSnps, - const gsl_matrix *W, vector<int> &indicator_idv, - vector<int> &indicator_snp, vector<SNPINFO> &snpInfo, - const double &maf_level, const double &miss_level, - const double &hwe_level, const double &r2_level, - size_t &ns_test); bool ReadFile_sample(const string &file_sample, vector<vector<int>> &indicator_pheno, vector<vector<double>> &pheno, diff --git a/src/lapack.cpp b/src/lapack.cpp index ee0a497..d15446b 100644 --- a/src/lapack.cpp +++ b/src/lapack.cpp @@ -128,6 +128,10 @@ void lapack_dgemm(char *TransA, char *TransB, double alpha, const gsl_matrix *A, gsl_matrix *C_t = gsl_matrix_alloc(C->size2, C->size1); gsl_matrix_transpose_memcpy(C_t, C); + check_int_mult_overflow(M,K1); + check_int_mult_overflow(N,K1); + check_int_mult_overflow(M,N); + dgemm_(TransA, TransB, &M, &N, &K1, &alpha, A_t->data, &LDA, B_t->data, &LDB, &beta, C_t->data, &LDC); @@ -302,27 +306,6 @@ double LULndet(const gsl_matrix *LU) { return gsl_linalg_LU_lndet((gsl_matrix *)LU); } -/* -double LULndet(gsl_matrix_float *LU) { - gsl_matrix *LU_double = gsl_matrix_alloc(LU->size1, LU->size2); - double d; - - // Copy float matrix to double. - for (size_t i = 0; i < LU->size1; i++) { - for (size_t j = 0; j < LU->size2; j++) { - gsl_matrix_set(LU_double, i, j, gsl_matrix_float_get(LU, i, j)); - } - } - - // LU decomposition. - d = gsl_linalg_LU_lndet(LU_double); - - // Free matrix - gsl_matrix_free(LU_double); - return d; -} -*/ - // LU solve. 
void LUSolve(const gsl_matrix *LU, const gsl_permutation *p, const gsl_vector *b, gsl_vector *x) { diff --git a/src/ldr.cpp b/src/ldr.cpp index 3554efa..f70eb85 100644 --- a/src/ldr.cpp +++ b/src/ldr.cpp @@ -29,7 +29,7 @@ #include <stdio.h> #include <stdlib.h> -#include "Eigen/Dense" +// #include "Eigen/Dense" #include "gsl/gsl_blas.h" #include "gsl/gsl_cdf.h" #include "gsl/gsl_eigen.h" @@ -46,7 +46,7 @@ #include "param.h" using namespace std; -using namespace Eigen; +// using namespace Eigen; void LDR::CopyFromParam(PARAM &cPar) { a_mode = cPar.a_mode; @@ -70,8 +70,10 @@ void LDR::CopyFromParam(PARAM &cPar) { return; } + void LDR::CopyToParam(PARAM &cPar) { return; } +/* // X is a p by n matrix. void LDR::VB(const vector<vector<unsigned char>> &Xt, const gsl_matrix *W_gsl, const gsl_vector *y_gsl) { @@ -107,3 +109,4 @@ void LDR::VB(const vector<vector<unsigned char>> &Xt, const gsl_matrix *W_gsl, return; } +*/ @@ -39,7 +39,7 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_roots.h" -#include "eigenlib.h" +// #include "eigenlib.h" #include "gzstream.h" #include "lapack.h" #include "lm.h" @@ -55,8 +55,6 @@ void LM::CopyFromParam(PARAM &cPar) { file_out = cPar.file_out; path_out = cPar.path_out; file_gene = cPar.file_gene; - // WJA added - file_oxford = cPar.file_oxford; time_opt = 0.0; @@ -333,14 +331,14 @@ void LM::AnalyzeGene(const gsl_matrix *W, const gsl_vector *x) { for (size_t t = 0; t < ng_total; t++) { getline(infile, line); if (t % d_pace == 0 || t == ng_total - 1) { - ProgressBar("Performing Analysis ", t, ng_total - 1); + ProgressBar("Performing Analysis", t, ng_total - 1); } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; c_phen = 0; for (size_t i = 0; i < indicator_idv.size(); ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } @@ -381,232 +379,6 @@ void LM::AnalyzeGene(const gsl_matrix *W, const gsl_vector *x) { 
return; } -// WJA added -void LM::Analyzebgen(const gsl_matrix *W, const gsl_vector *y) { - debug_msg("entering"); - string file_bgen = file_oxford + ".bgen"; - ifstream infile(file_bgen.c_str(), ios::binary); - if (!infile) { - cout << "error reading bgen file:" << file_bgen << endl; - return; - } - - clock_t time_start = clock(); - - string line; - char *ch_ptr; - - double beta = 0, se = 0, p_wald = 0, p_lrt = 0, p_score = 0; - int n_miss, c_phen; - double geno, x_mean; - - // Calculate some basic quantities. - double yPwy, xPwy, xPwx; - double df = (double)W->size1 - (double)W->size2 - 1.0; - - gsl_vector *x = gsl_vector_alloc(W->size1); - gsl_vector *x_miss = gsl_vector_alloc(W->size1); - - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *Wty = gsl_vector_alloc(W->size2); - gsl_vector *Wtx = gsl_vector_alloc(W->size2); - gsl_permutation *pmt = gsl_permutation_alloc(W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp(WtW, pmt, &sig); - LUInvert(WtW, pmt, WtWi); - - gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); - CalcvPv(WtWi, Wty, y, yPwy); - - // Read in header. 
- uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); - infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); - bgen_snp_block_offset -= 4; - infile.ignore(4 + bgen_header_length - 20); - bgen_snp_block_offset -= 4 + bgen_header_length - 20; - infile.read(reinterpret_cast<char *>(&bgen_flags), 4); - bgen_snp_block_offset -= 4; - bool CompressedSNPBlocks = bgen_flags & 0x1; - - infile.ignore(bgen_snp_block_offset); - - double bgen_geno_prob_AA, bgen_geno_prob_AB; - double bgen_geno_prob_BB, bgen_geno_prob_non_miss; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - string id; - string rs; - string chr; - std::cout << "Warning: WJA hard coded SNP missingness " - << "threshold of 10%" << std::endl; - - // Start reading genotypes and analyze. - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); - } - - // Read SNP header. 
- id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char *>(&bgen_N), 4); - infile.read(reinterpret_cast<char *>(&bgen_LS), 2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char *>(&bgen_LR), 2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char *>(&bgen_LC), 2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); - - infile.read(reinterpret_cast<char *>(&bgen_LA), 4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char *>(&bgen_LB), 4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - uint16_t unzipped_data[3 * bgen_N]; - - if (indicator_snp[t] == 0) { - if (CompressedSNPBlocks) - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - else - bgen_P = 6 * bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if (CompressedSNPBlocks) { - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size = 6 * bgen_N; - - infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); - - int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), - reinterpret_cast<uLongf *>(&unzipped_data_size), - reinterpret_cast<Bytef *>(zipped_data), - static_cast<uLong>(bgen_P)); - assert(result == Z_OK); - - } else { - - bgen_P = 6 * bgen_N; - infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); - } - - x_mean = 0.0; - c_phen = 0; - n_miss = 0; - gsl_vector_set_zero(x_miss); - for (size_t i = 0; i < bgen_N; ++i) { - if (indicator_idv[i] == 0) { - continue; - } - - bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; - bgen_geno_prob_AB = - static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; - bgen_geno_prob_BB = - static_cast<double>(unzipped_data[i * 3 + 2]) / 32768.0; 
- - // WJA - bgen_geno_prob_non_miss = - bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; - if (bgen_geno_prob_non_miss < 0.9) { - gsl_vector_set(x_miss, c_phen, 0.0); - n_miss++; - } else { - bgen_geno_prob_AA /= bgen_geno_prob_non_miss; - bgen_geno_prob_AB /= bgen_geno_prob_non_miss; - bgen_geno_prob_BB /= bgen_geno_prob_non_miss; - - geno = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); - x_mean += geno; - } - c_phen++; - } - - x_mean /= static_cast<double>(ni_test - n_miss); - - for (size_t i = 0; i < ni_test; ++i) { - if (gsl_vector_get(x_miss, i) == 0) { - gsl_vector_set(x, i, x_mean); - } - geno = gsl_vector_get(x, i); - } - - // Calculate statistics. - time_start = clock(); - - gsl_blas_dgemv(CblasTrans, 1.0, W, x, 0.0, Wtx); - CalcvPv(WtWi, Wty, Wtx, y, x, xPwy, xPwx); - LmCalcP(a_mode - 50, yPwy, xPwy, xPwx, df, W->size1, beta, se, p_wald, - p_lrt, p_score); - - time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - // Store summary data. 
- SUMSTAT SNPs = {beta, se, 0.0, 0.0, p_wald, p_lrt, p_score, -0.0}; - sumStat.push_back(SNPs); - } - cout << endl; - - gsl_vector_free(x); - gsl_vector_free(x_miss); - - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(Wty); - gsl_vector_free(Wtx); - gsl_permutation_free(pmt); - - infile.close(); - infile.clear(); - - return; -} - void LM::AnalyzeBimbam(const gsl_matrix *W, const gsl_vector *y) { debug_msg("entering"); igzstream infile(file_geno.c_str(), igzstream::in); @@ -649,22 +421,22 @@ void LM::AnalyzeBimbam(const gsl_matrix *W, const gsl_vector *y) { for (size_t t = 0; t < indicator_snp.size(); ++t) { getline(infile, line); if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); + ProgressBar("Reading SNPs", t, ns_total - 1); } if (indicator_snp[t] == 0) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); x_mean = 0.0; c_phen = 0; n_miss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } @@ -775,7 +547,7 @@ void LM::AnalyzePlink(const gsl_matrix *W, const gsl_vector *y) { for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) { if (t % d_pace == 0 || t == snpInfo.size() - 1) { - ProgressBar("Reading SNPs ", t, snpInfo.size() - 1); + ProgressBar("Reading SNPs", t, snpInfo.size() - 1); } if (indicator_snp[t] == 0) { continue; @@ -67,9 +67,6 @@ public: void AnalyzeGene(const gsl_matrix *W, const gsl_vector *x); void AnalyzePlink(const gsl_matrix *W, const gsl_vector *y); void AnalyzeBimbam(const gsl_matrix *W, const gsl_vector *y); - // WJA added. 
- void Analyzebgen(const gsl_matrix *W, const gsl_vector *y); - void WriteFiles(); }; diff --git a/src/lmm.cpp b/src/lmm.cpp index 134fbf9..4198fab 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -13,7 +15,7 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <fstream> @@ -38,11 +40,14 @@ #include "gsl/gsl_roots.h" #include "gsl/gsl_vector.h" -#include "eigenlib.h" +// #include "eigenlib.h" + #include "gzstream.h" #include "io.h" +#include "fastblas.h" #include "lapack.h" #include "lmm.h" +#include "mathfunc.h" using namespace std; @@ -56,9 +61,6 @@ void LMM::CopyFromParam(PARAM &cPar) { path_out = cPar.path_out; file_gene = cPar.file_gene; - // WJA added. 
- file_oxford = cPar.file_oxford; - l_min = cPar.l_min; l_max = cPar.l_max; n_region = cPar.n_region; @@ -107,12 +109,13 @@ void LMM::WriteFiles() { } auto common_header = [&] () { - if (a_mode != 2) + if (a_mode != 2) { outfile << "beta" << "\t"; + outfile << "se" << "\t"; + } - outfile << "se" << "\t"; - - outfile << "logl_H1" << "\t"; // we may make this an option + if (!is_legacy_mode()) + outfile << "logl_H1" << "\t"; // we may make this an option switch(a_mode) { case 1: @@ -139,12 +142,13 @@ void LMM::WriteFiles() { auto sumstats = [&] (SUMSTAT st) { outfile << scientific << setprecision(6); - if (a_mode != 2) + if (a_mode != 2) { outfile << st.beta << "\t"; + outfile << st.se << "\t"; + } - outfile << st.se << "\t"; - - outfile << st.logl_H1 << "\t"; + if (!is_legacy_mode()) + outfile << st.logl_H1 << "\t"; switch(a_mode) { case 1: @@ -229,6 +233,7 @@ void CalcPab(const size_t n_cvt, const size_t e_mode, const gsl_vector *Hi_eval, gsl_matrix_const_column(Uab, index_ab); gsl_blas_ddot(Hi_eval, &Uab_col.vector, &p_ab); if (e_mode != 0) { + assert(false); p_ab = gsl_vector_get(ab, index_ab) - p_ab; } gsl_matrix_set(Pab, 0, index_ab, p_ab); @@ -364,16 +369,16 @@ double LogL_f(double l, void *params) { double f = 0.0, logdet_h = 0.0, d; size_t index_yy; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *v_temp = gsl_vector_alloc((p->eval)->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *v_temp = gsl_vector_safe_alloc((p->eval)->size); - gsl_vector_memcpy(v_temp, p->eval); + gsl_vector_safe_memcpy(v_temp, p->eval); gsl_vector_scale(v_temp, l); if (p->e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); @@ -390,11 +395,13 @@ double 
LogL_f(double l, void *params) { index_yy = GetabIndex(n_cvt + 2, n_cvt + 2, n_cvt); double P_yy = gsl_matrix_get(Pab, nc_total, index_yy); - f = c - 0.5 * logdet_h - 0.5 * (double)ni_test * log(P_yy); - gsl_matrix_free(Pab); - gsl_vector_free(Hi_eval); - gsl_vector_free(v_temp); + assert(!is_nan(P_yy)); + f = c - 0.5 * logdet_h - 0.5 * (double)ni_test * log(P_yy); + assert(!is_nan(f)); + gsl_matrix_safe_free(Pab); // FIXME + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(v_temp); return f; } @@ -414,23 +421,23 @@ double LogL_dev1(double l, void *params) { double dev1 = 0.0, trace_Hi = 0.0; size_t index_yy; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *v_temp = gsl_vector_alloc((p->eval)->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *v_temp = gsl_vector_safe_alloc((p->eval)->size); - gsl_vector_memcpy(v_temp, p->eval); + gsl_vector_safe_memcpy(v_temp, p->eval); gsl_vector_scale(v_temp, l); if (p->e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); - gsl_vector_memcpy(HiHi_eval, Hi_eval); + gsl_vector_safe_memcpy(HiHi_eval, Hi_eval); gsl_vector_mul(HiHi_eval, Hi_eval); gsl_vector_set_all(v_temp, 1.0); @@ -452,11 +459,11 @@ double LogL_dev1(double l, void *params) { double yPKPy = (P_yy - PP_yy) / l; dev1 = -0.5 * trace_HiK + 0.5 * (double)ni_test * yPKPy / P_yy; - gsl_matrix_free(Pab); - gsl_matrix_free(PPab); - gsl_vector_free(Hi_eval); - gsl_vector_free(HiHi_eval); - 
gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); // FIXME: may contain NaN + gsl_matrix_safe_free(PPab); // FIXME: may contain NaN + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(HiHi_eval); + gsl_vector_safe_free(v_temp); return dev1; } @@ -477,27 +484,27 @@ double LogL_dev2(double l, void *params) { double dev2 = 0.0, trace_Hi = 0.0, trace_HiHi = 0.0; size_t index_yy; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *v_temp = gsl_vector_alloc((p->eval)->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *v_temp = gsl_vector_safe_alloc((p->eval)->size); - gsl_vector_memcpy(v_temp, p->eval); + gsl_vector_safe_memcpy(v_temp, p->eval); gsl_vector_scale(v_temp, l); if (p->e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); - gsl_vector_memcpy(HiHi_eval, Hi_eval); + gsl_vector_safe_memcpy(HiHi_eval, Hi_eval); gsl_vector_mul(HiHi_eval, Hi_eval); - gsl_vector_memcpy(HiHiHi_eval, HiHi_eval); + gsl_vector_safe_memcpy(HiHiHi_eval, HiHi_eval); gsl_vector_mul(HiHiHi_eval, Hi_eval); gsl_vector_set_all(v_temp, 1.0); @@ -527,13 +534,13 @@ double LogL_dev2(double l, void *params) { 0.5 * (double)ni_test * (2.0 * yPKPKPy * P_yy - yPKPy * 
yPKPy) / (P_yy * P_yy); - gsl_matrix_free(Pab); - gsl_matrix_free(PPab); - gsl_matrix_free(PPPab); - gsl_vector_free(Hi_eval); - gsl_vector_free(HiHi_eval); - gsl_vector_free(HiHiHi_eval); - gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); // FIXME + gsl_matrix_safe_free(PPab); + gsl_matrix_safe_free(PPPab); + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(HiHi_eval); + gsl_vector_safe_free(HiHiHi_eval); + gsl_vector_safe_free(v_temp); return dev2; } @@ -554,27 +561,27 @@ void LogL_dev12(double l, void *params, double *dev1, double *dev2) { double trace_Hi = 0.0, trace_HiHi = 0.0; size_t index_yy; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *v_temp = gsl_vector_alloc((p->eval)->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *v_temp = gsl_vector_safe_alloc((p->eval)->size); - gsl_vector_memcpy(v_temp, p->eval); + gsl_vector_safe_memcpy(v_temp, p->eval); gsl_vector_scale(v_temp, l); if (p->e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); - gsl_vector_memcpy(HiHi_eval, Hi_eval); + gsl_vector_safe_memcpy(HiHi_eval, Hi_eval); gsl_vector_mul(HiHi_eval, Hi_eval); - gsl_vector_memcpy(HiHiHi_eval, HiHi_eval); + 
gsl_vector_safe_memcpy(HiHiHi_eval, HiHi_eval); gsl_vector_mul(HiHiHi_eval, Hi_eval); gsl_vector_set_all(v_temp, 1.0); @@ -607,13 +614,13 @@ void LogL_dev12(double l, void *params, double *dev1, double *dev2) { 0.5 * (double)ni_test * (2.0 * yPKPKPy * P_yy - yPKPy * yPKPy) / (P_yy * P_yy); - gsl_matrix_free(Pab); - gsl_matrix_free(PPab); - gsl_matrix_free(PPPab); - gsl_vector_free(Hi_eval); - gsl_vector_free(HiHi_eval); - gsl_vector_free(HiHiHi_eval); - gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); // FIXME: may contain NaN + gsl_matrix_safe_free(PPab); // FIXME: may contain NaN + gsl_matrix_safe_free(PPPab); // FIXME: may contain NaN + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(HiHi_eval); + gsl_vector_safe_free(HiHiHi_eval); + gsl_vector_safe_free(v_temp); return; } @@ -637,17 +644,17 @@ double LogRL_f(double l, void *params) { double f = 0.0, logdet_h = 0.0, logdet_hiw = 0.0, d; size_t index_ww; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *Iab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *v_temp = gsl_vector_alloc((p->eval)->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *Iab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *v_temp = gsl_vector_safe_alloc((p->eval)->size); - gsl_vector_memcpy(v_temp, p->eval); + gsl_vector_safe_memcpy(v_temp, p->eval); gsl_vector_scale(v_temp, l); if (p->e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); @@ -676,10 +683,10 @@ double LogRL_f(double l, void *params) { double c = 0.5 * df * (log(df) - log(2 * M_PI) - 1.0); f = c - 0.5 * logdet_h - 0.5 * logdet_hiw - 0.5 * df * log(P_yy); - gsl_matrix_free(Pab); - gsl_matrix_free(Iab); - 
gsl_vector_free(Hi_eval); - gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); + gsl_matrix_safe_free(Iab); + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(v_temp); return f; } @@ -702,23 +709,23 @@ double LogRL_dev1(double l, void *params) { double dev1 = 0.0, trace_Hi = 0.0; size_t index_ww; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *v_temp = gsl_vector_alloc((p->eval)->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *v_temp = gsl_vector_safe_alloc((p->eval)->size); - gsl_vector_memcpy(v_temp, p->eval); + gsl_vector_safe_memcpy(v_temp, p->eval); gsl_vector_scale(v_temp, l); if (p->e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); - gsl_vector_memcpy(HiHi_eval, Hi_eval); + gsl_vector_safe_memcpy(HiHi_eval, Hi_eval); gsl_vector_mul(HiHi_eval, Hi_eval); gsl_vector_set_all(v_temp, 1.0); @@ -750,11 +757,11 @@ double LogRL_dev1(double l, void *params) { dev1 = -0.5 * trace_PK + 0.5 * df * yPKPy / P_yy; - gsl_matrix_free(Pab); - gsl_matrix_free(PPab); - gsl_vector_free(Hi_eval); - gsl_vector_free(HiHi_eval); - gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); // FIXME: may contain NaN + gsl_matrix_safe_free(PPab); // FIXME: may contain NaN + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(HiHi_eval); + gsl_vector_safe_free(v_temp); return dev1; } @@ -778,27 +785,27 @@ double LogRL_dev2(double l, void *params) { double dev2 = 0.0, trace_Hi = 0.0, trace_HiHi = 
0.0; size_t index_ww; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *v_temp = gsl_vector_alloc((p->eval)->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *v_temp = gsl_vector_safe_alloc((p->eval)->size); - gsl_vector_memcpy(v_temp, p->eval); + gsl_vector_safe_memcpy(v_temp, p->eval); gsl_vector_scale(v_temp, l); if (p->e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); - gsl_vector_memcpy(HiHi_eval, Hi_eval); + gsl_vector_safe_memcpy(HiHi_eval, Hi_eval); gsl_vector_mul(HiHi_eval, Hi_eval); - gsl_vector_memcpy(HiHiHi_eval, HiHi_eval); + gsl_vector_safe_memcpy(HiHiHi_eval, HiHi_eval); gsl_vector_mul(HiHiHi_eval, Hi_eval); gsl_vector_set_all(v_temp, 1.0); @@ -838,13 +845,13 @@ double LogRL_dev2(double l, void *params) { dev2 = 0.5 * trace_PKPK - 0.5 * df * (2.0 * yPKPKPy * P_yy - yPKPy * yPKPy) / (P_yy * P_yy); - gsl_matrix_free(Pab); - gsl_matrix_free(PPab); - gsl_matrix_free(PPPab); - gsl_vector_free(Hi_eval); - gsl_vector_free(HiHi_eval); - gsl_vector_free(HiHiHi_eval); - gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); // FIXME + gsl_matrix_safe_free(PPab); + gsl_matrix_safe_free(PPPab); + gsl_vector_safe_free(Hi_eval); + 
gsl_vector_safe_free(HiHi_eval); + gsl_vector_safe_free(HiHiHi_eval); + gsl_vector_safe_free(v_temp); return dev2; } @@ -868,27 +875,27 @@ void LogRL_dev12(double l, void *params, double *dev1, double *dev2) { double trace_Hi = 0.0, trace_HiHi = 0.0; size_t index_ww; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_matrix *PPPab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *HiHiHi_eval = gsl_vector_alloc((p->eval)->size); - gsl_vector *v_temp = gsl_vector_alloc((p->eval)->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_matrix *PPPab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *HiHiHi_eval = gsl_vector_safe_alloc((p->eval)->size); + gsl_vector *v_temp = gsl_vector_safe_alloc((p->eval)->size); - gsl_vector_memcpy(v_temp, p->eval); + gsl_vector_safe_memcpy(v_temp, p->eval); gsl_vector_scale(v_temp, l); if (p->e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); - gsl_vector_memcpy(HiHi_eval, Hi_eval); + gsl_vector_safe_memcpy(HiHi_eval, Hi_eval); gsl_vector_mul(HiHi_eval, Hi_eval); - gsl_vector_memcpy(HiHiHi_eval, HiHi_eval); + gsl_vector_safe_memcpy(HiHiHi_eval, HiHi_eval); gsl_vector_mul(HiHiHi_eval, Hi_eval); gsl_vector_set_all(v_temp, 1.0); @@ -930,13 +937,13 @@ void LogRL_dev12(double l, void *params, double *dev1, double *dev2) { *dev2 = 0.5 * trace_PKPK - 0.5 * df * (2.0 * yPKPKPy * P_yy - yPKPy * yPKPy) / (P_yy * P_yy); - gsl_matrix_free(Pab); - gsl_matrix_free(PPab); - 
gsl_matrix_free(PPPab); - gsl_vector_free(Hi_eval); - gsl_vector_free(HiHi_eval); - gsl_vector_free(HiHiHi_eval); - gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); // FIXME + gsl_matrix_safe_free(PPab); + gsl_matrix_safe_free(PPPab); + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(HiHi_eval); + gsl_vector_safe_free(HiHiHi_eval); + gsl_vector_safe_free(v_temp); return; } @@ -948,16 +955,16 @@ void LMM::CalcRLWald(const double &l, const FUNC_PARAM &params, double &beta, int df = (int)ni_test - (int)n_cvt - 1; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc(params.eval->size); - gsl_vector *v_temp = gsl_vector_alloc(params.eval->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc(params.eval->size); + gsl_vector *v_temp = gsl_vector_safe_alloc(params.eval->size); - gsl_vector_memcpy(v_temp, params.eval); + gsl_vector_safe_memcpy(v_temp, params.eval); gsl_vector_scale(v_temp, l); if (params.e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); @@ -977,9 +984,9 @@ void LMM::CalcRLWald(const double &l, const FUNC_PARAM &params, double &beta, se = sqrt(1.0 / (tau * P_xx)); p_wald = gsl_cdf_fdist_Q((P_yy - Px_yy) * tau, 1.0, df); - gsl_matrix_free(Pab); - gsl_vector_free(Hi_eval); - gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(v_temp); return; } @@ -990,16 +997,16 @@ void LMM::CalcRLScore(const double &l, const FUNC_PARAM &params, double &beta, int df = (int)ni_test - (int)n_cvt - 1; - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc(params.eval->size); - gsl_vector *v_temp = gsl_vector_alloc(params.eval->size); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector
*Hi_eval = gsl_vector_safe_alloc(params.eval->size); + gsl_vector *v_temp = gsl_vector_safe_alloc(params.eval->size); - gsl_vector_memcpy(v_temp, params.eval); + gsl_vector_safe_memcpy(v_temp, params.eval); gsl_vector_scale(v_temp, l); if (params.e_mode == 0) { gsl_vector_set_all(Hi_eval, 1.0); } else { - gsl_vector_memcpy(Hi_eval, v_temp); + gsl_vector_safe_memcpy(Hi_eval, v_temp); } gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); @@ -1021,9 +1028,9 @@ void LMM::CalcRLScore(const double &l, const FUNC_PARAM &params, double &beta, p_score = gsl_cdf_fdist_Q((double)ni_test * P_xy * P_xy / (P_yy * P_xx), 1.0, df); - gsl_matrix_free(Pab); - gsl_vector_free(Hi_eval); - gsl_vector_free(v_temp); + gsl_matrix_safe_free(Pab); + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(v_temp); return; } @@ -1031,7 +1038,7 @@ void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) { size_t index_ab; size_t n_cvt = UtW->size2; - gsl_vector *u_a = gsl_vector_alloc(Uty->size); + gsl_vector *u_a = gsl_vector_safe_alloc(Uty->size); for (size_t a = 1; a <= n_cvt + 2; ++a) { if (a == n_cvt + 1) { @@ -1039,10 +1046,10 @@ void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) { } if (a == n_cvt + 2) { - gsl_vector_memcpy(u_a, Uty); + gsl_vector_safe_memcpy(u_a, Uty); } else { gsl_vector_const_view UtW_col = gsl_matrix_const_column(UtW, a - 1); - gsl_vector_memcpy(u_a, &UtW_col.vector); + gsl_vector_safe_memcpy(u_a, &UtW_col.vector); } for (size_t b = a; b >= 1; --b) { @@ -1054,17 +1061,17 @@ void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) { gsl_vector_view Uab_col = gsl_matrix_column(Uab, index_ab); if (b == n_cvt + 2) { - gsl_vector_memcpy(&Uab_col.vector, Uty); + gsl_vector_safe_memcpy(&Uab_col.vector, Uty); } else { gsl_vector_const_view UtW_col = gsl_matrix_const_column(UtW, b - 1); - gsl_vector_memcpy(&Uab_col.vector, &UtW_col.vector); + gsl_vector_safe_memcpy(&Uab_col.vector, &UtW_col.vector);
} gsl_vector_mul(&Uab_col.vector, u_a); } } - gsl_vector_free(u_a); + gsl_vector_safe_free(u_a); return; } @@ -1078,12 +1085,12 @@ void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty, gsl_vector_view Uab_col = gsl_matrix_column(Uab, index_ab); if (b == n_cvt + 2) { - gsl_vector_memcpy(&Uab_col.vector, Uty); + gsl_vector_safe_memcpy(&Uab_col.vector, Uty); } else if (b == n_cvt + 1) { - gsl_vector_memcpy(&Uab_col.vector, Utx); + gsl_vector_safe_memcpy(&Uab_col.vector, Utx); } else { gsl_vector_const_view UtW_col = gsl_matrix_const_column(UtW, b - 1); - gsl_vector_memcpy(&Uab_col.vector, &UtW_col.vector); + gsl_vector_safe_memcpy(&Uab_col.vector, &UtW_col.vector); } gsl_vector_mul(&Uab_col.vector, Utx); @@ -1097,8 +1104,8 @@ void Calcab(const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) { size_t n_cvt = W->size2; double d; - gsl_vector *v_a = gsl_vector_alloc(y->size); - gsl_vector *v_b = gsl_vector_alloc(y->size); + gsl_vector *v_a = gsl_vector_safe_alloc(y->size); + gsl_vector *v_b = gsl_vector_safe_alloc(y->size); for (size_t a = 1; a <= n_cvt + 2; ++a) { if (a == n_cvt + 1) { @@ -1106,10 +1113,10 @@ void Calcab(const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) { } if (a == n_cvt + 2) { - gsl_vector_memcpy(v_a, y); + gsl_vector_safe_memcpy(v_a, y); } else { gsl_vector_const_view W_col = gsl_matrix_const_column(W, a - 1); - gsl_vector_memcpy(v_a, &W_col.vector); + gsl_vector_safe_memcpy(v_a, &W_col.vector); } for (size_t b = a; b >= 1; --b) { @@ -1120,10 +1127,10 @@ void Calcab(const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) { index_ab = GetabIndex(a, b, n_cvt); if (b == n_cvt + 2) { - gsl_vector_memcpy(v_b, y); + gsl_vector_safe_memcpy(v_b, y); } else { gsl_vector_const_view W_col = gsl_matrix_const_column(W, b - 1); - gsl_vector_memcpy(v_b, &W_col.vector); + gsl_vector_safe_memcpy(v_b, &W_col.vector); } gsl_blas_ddot(v_a, v_b, &d); @@ -1131,8 +1138,8 @@ void Calcab(const gsl_matrix *W, const gsl_vector *y, gsl_vector *ab) { } } - 
gsl_vector_free(v_a); - gsl_vector_free(v_b); + gsl_vector_safe_free(v_a); + gsl_vector_safe_free(v_b); return; } @@ -1142,31 +1149,32 @@ void Calcab(const gsl_matrix *W, const gsl_vector *y, const gsl_vector *x, size_t n_cvt = W->size2; double d; - gsl_vector *v_b = gsl_vector_alloc(y->size); + gsl_vector *v_b = gsl_vector_safe_alloc(y->size); for (size_t b = 1; b <= n_cvt + 2; ++b) { index_ab = GetabIndex(n_cvt + 1, b, n_cvt); if (b == n_cvt + 2) { - gsl_vector_memcpy(v_b, y); + gsl_vector_safe_memcpy(v_b, y); } else if (b == n_cvt + 1) { - gsl_vector_memcpy(v_b, x); + gsl_vector_safe_memcpy(v_b, x); } else { gsl_vector_const_view W_col = gsl_matrix_const_column(W, b - 1); - gsl_vector_memcpy(v_b, &W_col.vector); + gsl_vector_safe_memcpy(v_b, &W_col.vector); } gsl_blas_ddot(x, v_b, &d); gsl_vector_set(ab, index_ab, d); } - gsl_vector_free(v_b); + gsl_vector_safe_free(v_b); return; } void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x) { + debug_msg(file_gene); igzstream infile(file_gene.c_str(), igzstream::in); if (!infile) { cout << "error reading gene expression file:" << file_gene << endl; @@ -1188,25 +1196,25 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval, // Calculate basic quantities. size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; - gsl_vector *y = gsl_vector_alloc(U->size1); - gsl_vector *Uty = gsl_vector_alloc(U->size2); - gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); + gsl_vector *y = gsl_vector_safe_alloc(U->size1); + gsl_vector *Uty = gsl_vector_safe_alloc(U->size2); + gsl_matrix *Uab = gsl_matrix_safe_alloc(U->size2, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); // Header. 
getline(infile, line); for (size_t t = 0; t < ng_total; t++) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % d_pace == 0 || t == ng_total - 1) { - ProgressBar("Performing Analysis ", t, ng_total - 1); + ProgressBar("Performing Analysis", t, ng_total - 1); } - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); rs = ch_ptr; c_phen = 0; for (size_t i = 0; i < indicator_idv.size(); ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } @@ -1260,10 +1268,10 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval, } cout << endl; - gsl_vector_free(y); - gsl_vector_free(Uty); - gsl_matrix_free(Uab); - gsl_vector_free(ab); + gsl_vector_safe_free(y); + gsl_vector_safe_free(Uty); + gsl_matrix_safe_free(Uab); + gsl_vector_safe_free(ab); infile.close(); infile.clear(); @@ -1271,35 +1279,37 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval, return; } -void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y, - const set<string> gwasnps) { - debug_msg("entering"); + +void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps) { clock_t time_start = clock(); - // LOCO support + // Subset/LOCO support bool process_gwasnps = gwasnps.size(); if (process_gwasnps) - debug_msg("AnalyzeBimbam w. LOCO"); + debug_msg("Analyze subset of SNPs (LOCO)"); // Calculate basic quantities. 
size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; const size_t inds = U->size1; - gsl_vector *x = gsl_vector_alloc(inds); // #inds - gsl_vector *x_miss = gsl_vector_alloc(inds); - gsl_vector *Utx = gsl_vector_alloc(U->size2); - gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); + enforce(inds == ni_test); + gsl_vector *x = gsl_vector_safe_alloc(inds); // #inds + gsl_vector *x_miss = gsl_vector_safe_alloc(inds); + gsl_vector *Utx = gsl_vector_safe_alloc(U->size2); + gsl_matrix *Uab = gsl_matrix_safe_alloc(U->size2, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); // Create a large matrix with LMM_BATCH_SIZE columns for batched processing // const size_t msize=(process_gwasnps ? 1 : LMM_BATCH_SIZE); const size_t msize = LMM_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(inds, msize); - gsl_matrix *UtXlarge = gsl_matrix_alloc(inds, msize); - + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(inds, msize); + gsl_matrix *UtXlarge = gsl_matrix_safe_alloc(inds, msize); enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure + enforce(Xlarge->size1 == inds); gsl_matrix_set_zero(Xlarge); gsl_matrix_set_zero(Uab); CalcUab(UtW, Uty, Uab); @@ -1307,9 +1317,6 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, // start reading genotypes and analyze size_t c = 0; - igzstream infile(file_geno.c_str(), igzstream::in); - enforce_msg(infile, "error reading genotype file"); - auto batch_compute = [&](size_t l) { // using a C++ closure // Compute SNPs in batch, note the computations are independent per SNP gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); @@ -1317,7 +1324,7 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, + fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); time_UtX += 
(clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); @@ -1325,15 +1332,15 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, for (size_t i = 0; i < l; i++) { // for every batch... gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); - gsl_vector_memcpy(Utx, &UtXlarge_col.vector); + gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector); CalcUab(UtW, Uty, Utx, Uab); time_start = clock(); FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; - double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0; - double p_lrt = 0, p_score = 0; + double lambda_mle = 0.0, lambda_remle = 0.0, beta = 0.0, se = 0.0, p_wald = 0.0; + double p_lrt = 0.0, p_score = 0.0; double logl_H1 = 0.0; // 3 is before 1. @@ -1361,183 +1368,200 @@ void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, } }; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - // for every SNP - string line; - safeGetline(infile, line); - if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); + const auto num_snps = indicator_snp.size(); + enforce_msg(num_snps > 0,"Zero SNPs to process - data corrupt?"); + if (num_snps < 50) { + cerr << num_snps << " SNPs" << endl; + warning_msg("very few SNPs processed"); + } + const size_t progress_step = (num_snps/50>d_pace ? 
num_snps/50 : d_pace); + + for (size_t t = 0; t < num_snps; ++t) { + if (t % progress_step == 0 || t == (num_snps - 1)) { + ProgressBar("Reading SNPs", t, num_snps - 1); } if (indicator_snp[t] == 0) continue; - char *ch_ptr = strtok((char *)line.c_str(), " , \t"); - auto snp = string(ch_ptr); + auto tup = fetch_snp(t); + auto snp = get<0>(tup); + auto gs = get<1>(tup); + // check whether SNP is included in gwasnps (used by LOCO) if (process_gwasnps && gwasnps.count(snp) == 0) continue; - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); - double x_mean = 0.0; - int c_phen = 0; - int n_miss = 0; + // drop missing idv and plug mean values for missing geno + double x_total = 0.0; // sum genotype values to compute x_mean + uint pos = 0; // position in target vector + uint n_miss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { // get the genotypes per individual and compute stats per SNP - ch_ptr = strtok(NULL, " , \t"); - if (indicator_idv[i] == 0) + if (indicator_idv[i] == 0) // skip individual continue; - if (strcmp(ch_ptr, "NA") == 0) { - gsl_vector_set(x_miss, c_phen, 0.0); + double geno = gs[i]; + if (std::isnan(geno)) { + gsl_vector_set(x_miss, pos, 1.0); n_miss++; } else { - double geno = atof(ch_ptr); - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); - x_mean += geno; + gsl_vector_set(x, pos, geno); + x_total += geno; } - c_phen++; + pos++; } + enforce(pos == ni_test); - x_mean /= (double)(ni_test - n_miss); + const double x_mean = x_total/(double)(ni_test - n_miss); + // plug x_mean back into missing values for (size_t i = 0; i < ni_test; ++i) { - if (gsl_vector_get(x_miss, i) == 0) { + if (gsl_vector_get(x_miss, i) == 1.0) { gsl_vector_set(x, i, x_mean); } } + + /* this is what below GxE does + for (size_t i = 0; i < ni_test; ++i) { + auto geno = gsl_vector_get(x, i); + if (std::isnan(geno)) { + gsl_vector_set(x, i, x_mean); + geno = x_mean; + } + if (x_mean > 1.0) { + gsl_vector_set(x, i, 2 
- geno); + } + } + */ + enforce(x->size == ni_test); + // copy genotype values for SNP into Xlarge cache gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize); - gsl_vector_memcpy(&Xlarge_col.vector, x); + gsl_vector_safe_memcpy(&Xlarge_col.vector, x); c++; // count SNPs going in if (c % msize == 0) batch_compute(msize); } batch_compute(c % msize); + ProgressBar("Reading SNPs", num_snps - 1, num_snps - 1); // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl; cout << endl; - gsl_vector_free(x); - gsl_vector_free(x_miss); - gsl_vector_free(Utx); - gsl_matrix_free(Uab); - gsl_vector_free(ab); + gsl_vector_safe_free(x); + gsl_vector_safe_free(x_miss); + gsl_vector_safe_free(Utx); + gsl_matrix_safe_free(Uab); + gsl_vector_safe_free(ab); - gsl_matrix_free(Xlarge); - gsl_matrix_free(UtXlarge); + gsl_matrix_safe_free(Xlarge); + gsl_matrix_safe_free(UtXlarge); - infile.close(); - infile.clear(); - - return; } -void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y) { - debug_msg("entering"); - string file_bed = file_bfile + ".bed"; - ifstream infile(file_bed.c_str(), ios::binary); - if (!infile) { - cout << "error reading bed file:" << file_bed << endl; - return; - } +void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps) { + debug_msg(file_geno); - clock_t time_start = clock(); + igzstream infile(file_geno.c_str(), igzstream::in); + enforce_msg(infile, "error reading genotype file"); + size_t prev_line = 0; - char ch[1]; - bitset<8> b; + std::vector <double> gs; + gs.resize(ni_total); - double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0; - double p_lrt = 0, p_score = 0; - double logl_H1 = 0.0; - int n_bit, n_miss, ci_total, ci_test; - double geno, x_mean; + // fetch_snp is a callback 
function for every SNP row + std::function<SnpNameValues(size_t)> fetch_snp = [&](size_t num) { + string line; + while (prev_line <= num) { + // also read SNPs that were skipped + safeGetline(infile, line); + prev_line++; + } + char *ch_ptr = strtok((char *)line.c_str(), " , \t"); + enforce_msg(ch_ptr, "Parsing BIMBAM genofile"); // ch_ptr should not be NULL - // Calculate basic quantities. - size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; + auto snp = string(ch_ptr); + ch_ptr = strtok_safe(NULL, " , \t"); // skip column + ch_ptr = strtok_safe(NULL, " , \t"); // skip column - gsl_vector *x = gsl_vector_alloc(U->size1); - gsl_vector *Utx = gsl_vector_alloc(U->size2); - gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); + gs.assign (ni_total,nan("")); // wipe values - // Create a large matrix. - size_t msize = LMM_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix_set_zero(Xlarge); + for (size_t i = 0; i < ni_total; ++i) { + ch_ptr = strtok(NULL, " , \t"); + enforce_msg(ch_ptr,line.c_str()); + if (strcmp(ch_ptr, "NA") != 0) + gs[i] = atof(ch_ptr); + } + return std::make_tuple(snp,gs); + }; - gsl_matrix_set_zero(Uab); - CalcUab(UtW, Uty, Uab); + LMM::Analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps); + + infile.close(); + infile.clear(); +} + +void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps) { + string file_bed = file_bfile + ".bed"; + debug_msg(file_bed); + ifstream infile(file_bed.c_str(), ios::binary); + enforce_msg(infile,"error reading genotype (.bed) file"); + + char ch[1]; // 1 byte buffer // Calculate n_bit and c, the number of bit for each SNP. - if (ni_total % 4 == 0) { - n_bit = ni_total / 4; - } else { - n_bit = ni_total / 4 + 1; - } + const size_t n_bit = (ni_total % 4 == 0 ? 
ni_total / 4 : ni_total / 4 + 1); - // Print the first three magic numbers. + // first three magic numbers. for (int i = 0; i < 3; ++i) { infile.read(ch, 1); - b = ch[0]; + // const bitset<8> b = ch[0]; b is never used } - size_t c = 0, t_last = 0; - for (size_t t = 0; t < snpInfo.size(); ++t) { - if (indicator_snp[t] == 0) - continue; - t_last++; - } - for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) { - if (t % d_pace == 0 || t == snpInfo.size() - 1) { - ProgressBar("Reading SNPs ", t, snpInfo.size() - 1); - } - if (indicator_snp[t] == 0) { - continue; - } + std::vector <double> gs; + gs.resize(ni_total); + // fetch_snp is a callback function for every SNP row + std::function<SnpNameValues(size_t)> fetch_snp = [&](size_t num) { + gs.assign (ni_total,nan("")); // wipe values // n_bit, and 3 is the number of magic numbers. + auto t = num; infile.seekg(t * n_bit + 3); - - // Read genotypes. - x_mean = 0.0; - n_miss = 0; - ci_total = 0; - ci_test = 0; - for (int i = 0; i < n_bit; ++i) { + auto ci_total = 0; + auto ci_test = 0; + // ---- for all genotypes + for (uint i = 0; i < n_bit; ++i) { infile.read(ch, 1); - b = ch[0]; + bitset<8> bset8 = ch[0]; // Minor allele homozygous: 2.0; major: 0.0. 
for (size_t j = 0; j < 4; ++j) { if ((i == (n_bit - 1)) && ci_total == (int)ni_total) { break; } - if (indicator_idv[ci_total] == 0) { + if (indicator_idv[ci_total] == 0) { // skip individual ci_total++; continue; } - if (b[2 * j] == 0) { - if (b[2 * j + 1] == 0) { - gsl_vector_set(x, ci_test, 2); - x_mean += 2.0; + if (bset8[2 * j] == 0) { + if (bset8[2 * j + 1] == 0) { + gs[ci_test] = 2.0; } else { - gsl_vector_set(x, ci_test, 1); - x_mean += 1.0; + gs[ci_test] = 1.0; } } else { - if (b[2 * j + 1] == 1) { - gsl_vector_set(x, ci_test, 0); + if (bset8[2 * j + 1] == 1) { + gs[ci_test] = 0.0; } else { - gsl_vector_set(x, ci_test, -9); - n_miss++; + gs[ci_test] = nan(""); // already set to NaN - originally was -9.0 } } @@ -1545,367 +1569,14 @@ void LMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, ci_test++; } } + string snp="unknown"; + return std::make_tuple(snp,gs); + }; - x_mean /= (double)(ni_test - n_miss); - - for (size_t i = 0; i < ni_test; ++i) { - geno = gsl_vector_get(x, i); - if (geno == -9) { - gsl_vector_set(x, i, x_mean); - geno = x_mean; - } - } - - gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize); - gsl_vector_memcpy(&Xlarge_col.vector, x); - c++; - - if (c % msize == 0 || c == t_last) { - size_t l = 0; - if (c % msize == 0) { - l = msize; - } else { - l = c % msize; - } - - gsl_matrix_view Xlarge_sub = - gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); - gsl_matrix_view UtXlarge_sub = - gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - - time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, - &UtXlarge_sub.matrix); - time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - gsl_matrix_set_zero(Xlarge); - - for (size_t i = 0; i < l; i++) { - gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); - gsl_vector_memcpy(Utx, &UtXlarge_col.vector); - - CalcUab(UtW, Uty, Utx, Uab); - - time_start = clock(); - FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, 
ab, 0}; - - // 3 is before 1, for beta. - if (a_mode == 3 || a_mode == 4) { - CalcRLScore(l_mle_null, param1, beta, se, p_score); - } - - if (a_mode == 1 || a_mode == 4) { - CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, - logl_H1); - CalcRLWald(lambda_remle, param1, beta, se, p_wald); - } - - if (a_mode == 2 || a_mode == 4) { - CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); - } - - time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - // Store summary data. - SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle, - p_wald, p_lrt, p_score, logl_H1}; - sumStat.push_back(SNPs); - } - } - } - cout << endl; - - gsl_vector_free(x); - gsl_vector_free(Utx); - gsl_matrix_free(Uab); - gsl_vector_free(ab); - - gsl_matrix_free(Xlarge); - gsl_matrix_free(UtXlarge); - - infile.close(); - infile.clear(); - - return; -} - -// WJA added. -void LMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y) { - debug_msg("entering"); - string file_bgen = file_oxford + ".bgen"; - ifstream infile(file_bgen.c_str(), ios::binary); - if (!infile) { - cout << "error reading bgen file:" << file_bgen << endl; - return; - } - - clock_t time_start = clock(); - double lambda_mle = 0, lambda_remle = 0, beta = 0, se = 0, p_wald = 0; - double p_lrt = 0, p_score = 0; - double logl_H1 = 0.0; - int n_miss, c_phen; - double geno, x_mean; - - // Calculate basic quantities. - size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; - - gsl_vector *x = gsl_vector_alloc(U->size1); - gsl_vector *x_miss = gsl_vector_alloc(U->size1); - gsl_vector *Utx = gsl_vector_alloc(U->size2); - gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); - - // Create a large matrix. 
- size_t msize = LMM_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix_set_zero(Xlarge); - - gsl_matrix_set_zero(Uab); - CalcUab(UtW, Uty, Uab); - - // Read in header. - uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); - infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); - bgen_snp_block_offset -= 4; - infile.ignore(4 + bgen_header_length - 20); - bgen_snp_block_offset -= 4 + bgen_header_length - 20; - infile.read(reinterpret_cast<char *>(&bgen_flags), 4); - bgen_snp_block_offset -= 4; - bool CompressedSNPBlocks = bgen_flags & 0x1; - - infile.ignore(bgen_snp_block_offset); - - double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB; - double bgen_geno_prob_non_miss; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - string id; - string rs; - string chr; - std::cout << "Warning: WJA hard coded SNP missingness " - << "threshold of 10%" << std::endl; - - // Start reading genotypes and analyze. - size_t c = 0, t_last = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (indicator_snp[t] == 0) { - continue; - } - t_last++; - } - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); - } - if (indicator_snp[t] == 0) { - continue; - } - - // Read SNP header. 
- id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char *>(&bgen_N), 4); - infile.read(reinterpret_cast<char *>(&bgen_LS), 2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char *>(&bgen_LR), 2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char *>(&bgen_LC), 2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); - - infile.read(reinterpret_cast<char *>(&bgen_LA), 4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char *>(&bgen_LB), 4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - uint16_t unzipped_data[3 * bgen_N]; - - if (indicator_snp[t] == 0) { - if (CompressedSNPBlocks) - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - else - bgen_P = 6 * bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if (CompressedSNPBlocks) { - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size = 6 * bgen_N; - - infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); - - int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), - reinterpret_cast<uLongf *>(&unzipped_data_size), - reinterpret_cast<Bytef *>(zipped_data), - static_cast<uLong>(bgen_P)); - assert(result == Z_OK); - - } else { - - bgen_P = 6 * bgen_N; - infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); - } - - x_mean = 0.0; - c_phen = 0; - n_miss = 0; - gsl_vector_set_zero(x_miss); - for (size_t i = 0; i < bgen_N; ++i) { - if (indicator_idv[i] == 0) { - continue; - } - - bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; - bgen_geno_prob_AB = - static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; - bgen_geno_prob_BB = - static_cast<double>(unzipped_data[i * 3 + 2]) / 32768.0; 
- - // WJA. - bgen_geno_prob_non_miss = - bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; - if (bgen_geno_prob_non_miss < 0.9) { - gsl_vector_set(x_miss, c_phen, 0.0); - n_miss++; - } else { - - bgen_geno_prob_AA /= bgen_geno_prob_non_miss; - bgen_geno_prob_AB /= bgen_geno_prob_non_miss; - bgen_geno_prob_BB /= bgen_geno_prob_non_miss; - - geno = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); - x_mean += geno; - } - c_phen++; - } - - x_mean /= static_cast<double>(ni_test - n_miss); - - for (size_t i = 0; i < ni_test; ++i) { - if (gsl_vector_get(x_miss, i) == 0) { - gsl_vector_set(x, i, x_mean); - } - geno = gsl_vector_get(x, i); - } - - gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize); - gsl_vector_memcpy(&Xlarge_col.vector, x); - c++; - - if (c % msize == 0 || c == t_last) { - size_t l = 0; - if (c % msize == 0) { - l = msize; - } else { - l = c % msize; - } - - gsl_matrix_view Xlarge_sub = - gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); - gsl_matrix_view UtXlarge_sub = - gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - - time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, - &UtXlarge_sub.matrix); - time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - gsl_matrix_set_zero(Xlarge); - - for (size_t i = 0; i < l; i++) { - gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); - gsl_vector_memcpy(Utx, &UtXlarge_col.vector); - - CalcUab(UtW, Uty, Utx, Uab); - - time_start = clock(); - FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; - - // 3 is before 1. 
- if (a_mode == 3 || a_mode == 4) { - CalcRLScore(l_mle_null, param1, beta, se, p_score); - } - - if (a_mode == 1 || a_mode == 4) { - CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, - logl_H1); - CalcRLWald(lambda_remle, param1, beta, se, p_wald); - } - - if (a_mode == 2 || a_mode == 4) { - CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); - p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); - } - - time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - // Store summary data. - SUMSTAT SNPs = {beta, se, lambda_remle, lambda_mle, - p_wald, p_lrt, p_score, logl_H1}; - sumStat.push_back(SNPs); - } - } - } - cout << endl; - - gsl_vector_free(x); - gsl_vector_free(x_miss); - gsl_vector_free(Utx); - gsl_matrix_free(Uab); - gsl_vector_free(ab); - - gsl_matrix_free(Xlarge); - gsl_matrix_free(UtXlarge); + LMM::Analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps); infile.close(); infile.clear(); - - return; } void MatrixCalcLR(const gsl_matrix *U, const gsl_matrix *UtX, @@ -1914,10 +1585,10 @@ void MatrixCalcLR(const gsl_matrix *U, const gsl_matrix *UtX, vector<pair<size_t, double>> &pos_loglr) { double logl_H0, logl_H1, log_lr, lambda0, lambda1; - gsl_vector *w = gsl_vector_alloc(Uty->size); - gsl_matrix *Utw = gsl_matrix_alloc(Uty->size, 1); - gsl_matrix *Uab = gsl_matrix_alloc(Uty->size, 6); - gsl_vector *ab = gsl_vector_alloc(6); + gsl_vector *w = gsl_vector_safe_alloc(Uty->size); + gsl_matrix *Utw = gsl_matrix_safe_alloc(Uty->size, 1); + gsl_matrix *Uab = gsl_matrix_safe_alloc(Uty->size, 6); + gsl_vector *ab = gsl_vector_safe_alloc(6); gsl_vector_set_zero(ab); gsl_vector_set_all(w, 1.0); @@ -1940,10 +1611,10 @@ void MatrixCalcLR(const gsl_matrix *U, const gsl_matrix *UtX, pos_loglr.push_back(make_pair(i, log_lr)); } - gsl_vector_free(w); - gsl_matrix_free(Utw); - gsl_matrix_free(Uab); - gsl_vector_free(ab); + gsl_vector_safe_free(w); + gsl_matrix_safe_free(Utw); + gsl_matrix_safe_free(Uab); + 
gsl_vector_safe_free(ab); return; } @@ -2122,8 +1793,8 @@ void CalcLambda(const char func_name, const gsl_vector *eval, size_t n_cvt = UtW->size2, ni_test = UtW->size1; size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; - gsl_matrix *Uab = gsl_matrix_alloc(ni_test, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); + gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); gsl_matrix_set_zero(Uab); CalcUab(UtW, Uty, Uab); @@ -2132,8 +1803,8 @@ void CalcLambda(const char func_name, const gsl_vector *eval, CalcLambda(func_name, param0, l_min, l_max, n_region, lambda, logl_H0); - gsl_matrix_free(Uab); - gsl_vector_free(ab); + gsl_matrix_safe_free(Uab); + gsl_vector_safe_free(ab); return; } @@ -2145,8 +1816,8 @@ void CalcPve(const gsl_vector *eval, const gsl_matrix *UtW, size_t n_cvt = UtW->size2, ni_test = UtW->size1; size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; - gsl_matrix *Uab = gsl_matrix_alloc(ni_test, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); + gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); gsl_matrix_set_zero(Uab); CalcUab(UtW, Uty, Uab); @@ -2158,8 +1829,8 @@ void CalcPve(const gsl_vector *eval, const gsl_matrix *UtW, pve = trace_G * lambda / (trace_G * lambda + 1.0); pve_se = trace_G / ((trace_G * lambda + 1.0) * (trace_G * lambda + 1.0)) * se; - gsl_matrix_free(Uab); - gsl_vector_free(ab); + gsl_matrix_safe_free(Uab); + gsl_vector_safe_free(ab); return; } @@ -2172,27 +1843,27 @@ void CalcLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW, size_t n_cvt = UtW->size2, ni_test = UtW->size1; size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; - gsl_matrix *Uab = gsl_matrix_alloc(ni_test, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); - gsl_matrix *Pab = gsl_matrix_alloc(n_cvt + 2, n_index); - gsl_vector *Hi_eval = gsl_vector_alloc(eval->size); - gsl_vector *v_temp = gsl_vector_alloc(eval->size); - 
gsl_matrix *HiW = gsl_matrix_alloc(eval->size, UtW->size2); - gsl_matrix *WHiW = gsl_matrix_alloc(UtW->size2, UtW->size2); - gsl_vector *WHiy = gsl_vector_alloc(UtW->size2); - gsl_matrix *Vbeta = gsl_matrix_alloc(UtW->size2, UtW->size2); + gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); + gsl_matrix *Pab = gsl_matrix_safe_alloc(n_cvt + 2, n_index); + gsl_vector *Hi_eval = gsl_vector_safe_alloc(eval->size); + gsl_vector *v_temp = gsl_vector_safe_alloc(eval->size); + gsl_matrix *HiW = gsl_matrix_safe_alloc(eval->size, UtW->size2); + gsl_matrix *WHiW = gsl_matrix_safe_alloc(UtW->size2, UtW->size2); + gsl_vector *WHiy = gsl_vector_safe_alloc(UtW->size2); + gsl_matrix *Vbeta = gsl_matrix_safe_alloc(UtW->size2, UtW->size2); gsl_matrix_set_zero(Uab); CalcUab(UtW, Uty, Uab); - gsl_vector_memcpy(v_temp, eval); + gsl_vector_safe_memcpy(v_temp, eval); gsl_vector_scale(v_temp, lambda); gsl_vector_set_all(Hi_eval, 1.0); gsl_vector_add_constant(v_temp, 1.0); gsl_vector_div(Hi_eval, v_temp); // Calculate beta. 
- gsl_matrix_memcpy(HiW, UtW); + gsl_matrix_safe_memcpy(HiW, UtW); for (size_t i = 0; i < UtW->size2; i++) { gsl_vector_view HiW_col = gsl_matrix_column(HiW, i); gsl_vector_mul(&HiW_col.vector, Hi_eval); @@ -2223,15 +1894,15 @@ void CalcLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW, gsl_vector_set(se_beta, i, sqrt(gsl_matrix_get(Vbeta, i, i))); } - gsl_matrix_free(Uab); - gsl_matrix_free(Pab); - gsl_vector_free(ab); - gsl_vector_free(Hi_eval); - gsl_vector_free(v_temp); - gsl_matrix_free(HiW); - gsl_matrix_free(WHiW); - gsl_vector_free(WHiy); - gsl_matrix_free(Vbeta); + gsl_matrix_safe_free(Uab); + gsl_matrix_safe_free(Pab); + gsl_vector_safe_free(ab); + gsl_vector_safe_free(Hi_eval); + gsl_vector_safe_free(v_temp); + gsl_matrix_safe_free(HiW); + gsl_matrix_safe_free(WHiW); + gsl_vector_safe_free(WHiy); + gsl_matrix_safe_free(Vbeta); gsl_permutation_free(pmt); return; @@ -2262,40 +1933,40 @@ void LMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval, // Calculate basic quantities. 
size_t n_index = (n_cvt + 2 + 2 + 1) * (n_cvt + 2 + 2) / 2; - gsl_vector *x = gsl_vector_alloc(U->size1); - gsl_vector *x_miss = gsl_vector_alloc(U->size1); - gsl_vector *Utx = gsl_vector_alloc(U->size2); - gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); + gsl_vector *x = gsl_vector_safe_alloc(U->size1); + gsl_vector *x_miss = gsl_vector_safe_alloc(U->size1); + gsl_vector *Utx = gsl_vector_safe_alloc(U->size2); + gsl_matrix *Uab = gsl_matrix_safe_alloc(U->size2, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); - gsl_matrix *UtW_expand = gsl_matrix_alloc(U->size1, UtW->size2 + 2); + gsl_matrix *UtW_expand = gsl_matrix_safe_alloc(U->size1, UtW->size2 + 2); gsl_matrix_view UtW_expand_mat = gsl_matrix_submatrix(UtW_expand, 0, 0, U->size1, UtW->size2); - gsl_matrix_memcpy(&UtW_expand_mat.matrix, UtW); + gsl_matrix_safe_memcpy(&UtW_expand_mat.matrix, UtW); gsl_vector_view UtW_expand_env = gsl_matrix_column(UtW_expand, UtW->size2); gsl_blas_dgemv(CblasTrans, 1.0, U, env, 0.0, &UtW_expand_env.vector); gsl_vector_view UtW_expand_x = gsl_matrix_column(UtW_expand, UtW->size2 + 1); // Start reading genotypes and analyze. 
for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); + ProgressBar("Reading SNPs", t, ns_total - 1); } if (indicator_snp[t] == 0) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); x_mean = 0.0; c_phen = 0; n_miss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } @@ -2372,13 +2043,13 @@ void LMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval, } cout << endl; - gsl_vector_free(x); - gsl_vector_free(x_miss); - gsl_vector_free(Utx); - gsl_matrix_free(Uab); - gsl_vector_free(ab); + gsl_vector_safe_free(x); + gsl_vector_safe_free(x_miss); + gsl_vector_safe_free(Utx); + gsl_matrix_safe_free(Uab); + gsl_vector_safe_free(ab); - gsl_matrix_free(UtW_expand); + gsl_matrix_safe_free(UtW_expand); infile.close(); infile.clear(); @@ -2390,8 +2061,8 @@ void LMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const gsl_vector *env) { - debug_msg("entering"); string file_bed = file_bfile + ".bed"; + debug_msg(file_bed); ifstream infile(file_bed.c_str(), ios::binary); if (!infile) { cout << "error reading bed file:" << file_bed << endl; @@ -2412,15 +2083,15 @@ void LMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval, // Calculate basic quantities. 
size_t n_index = (n_cvt + 2 + 2 + 1) * (n_cvt + 2 + 2) / 2; - gsl_vector *x = gsl_vector_alloc(U->size1); - gsl_vector *Utx = gsl_vector_alloc(U->size2); - gsl_matrix *Uab = gsl_matrix_alloc(U->size2, n_index); - gsl_vector *ab = gsl_vector_alloc(n_index); + gsl_vector *x = gsl_vector_safe_alloc(U->size1); + gsl_vector *Utx = gsl_vector_safe_alloc(U->size2); + gsl_matrix *Uab = gsl_matrix_safe_alloc(U->size2, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); - gsl_matrix *UtW_expand = gsl_matrix_alloc(U->size1, UtW->size2 + 2); + gsl_matrix *UtW_expand = gsl_matrix_safe_alloc(U->size1, UtW->size2 + 2); gsl_matrix_view UtW_expand_mat = gsl_matrix_submatrix(UtW_expand, 0, 0, U->size1, UtW->size2); - gsl_matrix_memcpy(&UtW_expand_mat.matrix, UtW); + gsl_matrix_safe_memcpy(&UtW_expand_mat.matrix, UtW); gsl_vector_view UtW_expand_env = gsl_matrix_column(UtW_expand, UtW->size2); gsl_blas_dgemv(CblasTrans, 1.0, U, env, 0.0, &UtW_expand_env.vector); gsl_vector_view UtW_expand_x = gsl_matrix_column(UtW_expand, UtW->size2 + 1); @@ -2440,7 +2111,7 @@ void LMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval, for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) { if (t % d_pace == 0 || t == snpInfo.size() - 1) { - ProgressBar("Reading SNPs ", t, snpInfo.size() - 1); + ProgressBar("Reading SNPs", t, snpInfo.size() - 1); } if (indicator_snp[t] == 0) { continue; @@ -2550,12 +2221,12 @@ void LMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval, } cout << endl; - gsl_vector_free(x); - gsl_vector_free(Utx); - gsl_matrix_free(Uab); - gsl_vector_free(ab); + gsl_vector_safe_free(x); + gsl_vector_safe_free(Utx); + gsl_matrix_safe_free(Uab); + gsl_vector_safe_free(ab); - gsl_matrix_free(UtW_expand); + gsl_matrix_safe_free(UtW_expand); infile.close(); infile.clear(); @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter 
Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,10 +25,12 @@ #include "gsl/gsl_vector.h" #include "io.h" #include "param.h" +#include <functional> +#include <tuple> using namespace std; -#define LMM_BATCH_SIZE 10000 // used for batch processing +#define LMM_BATCH_SIZE 20000 // used for batch processing class FUNC_PARAM { @@ -40,6 +44,8 @@ public: size_t e_mode; }; +typedef std::tuple<string,std::vector<double> > SnpNameValues; + class LMM { public: @@ -53,8 +59,6 @@ public: string path_out; string file_gene; - // WJA added - string file_oxford; // LMM related parameters double l_min; @@ -91,17 +95,19 @@ public: void AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Utx, const gsl_matrix *W, const gsl_vector *x); - void AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y); - // WJA added. 
- void Analyzebgen(const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_vector *Uty, - const gsl_matrix *W, const gsl_vector *y); + void Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps); void AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, const set<string> gwasnps); + void AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + const set<string> gwasnps); void AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_vector *Uty, const gsl_matrix *W, const gsl_vector *y, diff --git a/src/logistic.cpp b/src/logistic.cpp index 2dd0402..fd5f0d1 100644 --- a/src/logistic.cpp +++ b/src/logistic.cpp @@ -7,6 +7,7 @@ #include <stdio.h> #include "logistic.h" +#include "debug.h" // I need to bundle all the data that goes to the function to optimze // together. @@ -42,12 +43,12 @@ double fLogit_mixed(gsl_vector *beta, gsl_matrix_int *X, gsl_vector_int *nlev, for (int i = 0; i < n; ++i) { double Xbetai = beta->data[0]; int iParm = 1; - for (int k = 0; k < X->size2; ++k) { + for (size_t k = 0; k < X->size2; ++k) { if (gsl_matrix_int_get(X, i, k) > 0) Xbetai += beta->data[gsl_matrix_int_get(X, i, k) - 1 + iParm]; iParm += nlev->data[k] - 1; } - for (int k = 0; k < (Xc->size2); ++k) + for (size_t k = 0; k < (Xc->size2); ++k) Xbetai += gsl_matrix_get(Xc, i, k) * beta->data[iParm++]; total += y->data[i] * Xbetai - gsl_sf_log_1plusx(gsl_sf_exp(Xbetai)); } @@ -62,16 +63,16 @@ void logistic_mixed_pred(gsl_vector *beta, // Vector of parameters // obs x Kc (NULL if not used). gsl_vector *yhat) { // Vector of prob. 
predicted by // the logistic - for (int i = 0; i < X->size1; ++i) { + for (size_t i = 0; i < X->size1; ++i) { double Xbetai = beta->data[0]; int iParm = 1; - for (int k = 0; k < X->size2; ++k) { + for (size_t k = 0; k < X->size2; ++k) { if (gsl_matrix_int_get(X, i, k) > 0) Xbetai += beta->data[gsl_matrix_int_get(X, i, k) - 1 + iParm]; iParm += nlev->data[k] - 1; } // Adding the continuous. - for (int k = 0; k < (Xc->size2); ++k) + for (size_t k = 0; k < (Xc->size2); ++k) Xbetai += gsl_matrix_get(Xc, i, k) * beta->data[iParm++]; yhat->data[i] = 1 / (1 + gsl_sf_exp(-Xbetai)); } @@ -135,7 +136,7 @@ void wgsl_mixed_optim_hessian(const gsl_vector *beta, void *params, int K = p->X->size2; int Kc = p->Xc->size2; int npar = beta->size; - gsl_vector *gn = gsl_vector_alloc(npar); // gn + gsl_vector *gn = gsl_vector_safe_alloc(npar); // gn // Intitialize Hessian out necessary ??? gsl_matrix_set_zero(out); @@ -191,11 +192,8 @@ void wgsl_mixed_optim_hessian(const gsl_vector *beta, void *params, } double wgsl_mixed_optim_f(gsl_vector *v, void *params) { - double mLogLik = 0; fix_parm_mixed_T *p = (fix_parm_mixed_T *)params; - mLogLik = - fLogit_mixed(v, p->X, p->nlev, p->Xc, p->y, p->lambdaL1, p->lambdaL2); - return mLogLik; + return fLogit_mixed(v, p->X, p->nlev, p->Xc, p->y, p->lambdaL1, p->lambdaL2); } // Compute both f and df together. @@ -209,7 +207,7 @@ void wgsl_mixed_optim_fdf(gsl_vector *x, void *params, double *f, int logistic_mixed_fit(gsl_vector *beta, gsl_matrix_int *X, gsl_vector_int *nlev, gsl_matrix *Xc, gsl_vector *y, double lambdaL1, double lambdaL2) { - double mLogLik = 0; + // double mLogLik = 0; fix_parm_mixed_T p; int npar = beta->size; int iter = 0; @@ -224,13 +222,13 @@ int logistic_mixed_fit(gsl_vector *beta, gsl_matrix_int *X, p.lambdaL2 = lambdaL2; // Initial fit. - mLogLik = wgsl_mixed_optim_f(beta, &p); + // auto mLogLik = wgsl_mixed_optim_f(beta, &p); - gsl_matrix *myH = gsl_matrix_alloc(npar, npar); // Hessian matrix. 
- gsl_vector *stBeta = gsl_vector_alloc(npar); // Direction to move. + gsl_matrix *myH = gsl_matrix_safe_alloc(npar, npar); // Hessian matrix. + gsl_vector *stBeta = gsl_vector_safe_alloc(npar); // Direction to move. - gsl_vector *myG = gsl_vector_alloc(npar); // Gradient. - gsl_vector *tau = gsl_vector_alloc(npar); // tau for QR. + gsl_vector *myG = gsl_vector_safe_alloc(npar); // Gradient. + gsl_vector *tau = gsl_vector_safe_alloc(npar); // tau for QR. for (iter = 0; iter < 100; iter++) { wgsl_mixed_optim_hessian(beta, &p, myH); // Calculate Hessian. @@ -250,7 +248,7 @@ int logistic_mixed_fit(gsl_vector *beta, gsl_matrix_int *X, } // Final fit. - mLogLik = wgsl_mixed_optim_f(beta, &p); + // mLogLik = wgsl_mixed_optim_f(beta, &p); gsl_vector_free(tau); gsl_vector_free(stBeta); @@ -298,7 +296,7 @@ double fLogit_cat(gsl_vector *beta, gsl_matrix_int *X, gsl_vector_int *nlev, for (int i = 0; i < n; ++i) { double Xbetai = beta->data[0]; int iParm = 1; - for (int k = 0; k < X->size2; ++k) { + for (size_t k = 0; k < X->size2; ++k) { if (gsl_matrix_int_get(X, i, k) > 0) Xbetai += beta->data[gsl_matrix_int_get(X, i, k) - 1 + iParm]; iParm += nlev->data[k] - 1; @@ -314,10 +312,10 @@ void logistic_cat_pred(gsl_vector *beta, // Vector of parameters gsl_vector_int *nlev, // Vector with #categories gsl_vector *yhat) { // Vector of prob. predicted by // the logistic. 
- for (int i = 0; i < X->size1; ++i) { + for (size_t i = 0; i < X->size1; ++i) { double Xbetai = beta->data[0]; int iParm = 1; - for (int k = 0; k < X->size2; ++k) { + for (size_t k = 0; k < X->size2; ++k) { if (gsl_matrix_int_get(X, i, k) > 0) Xbetai += beta->data[gsl_matrix_int_get(X, i, k) - 1 + iParm]; iParm += nlev->data[k] - 1; @@ -440,7 +438,7 @@ void wgsl_cat_optim_fdf(gsl_vector *x, void *params, double *f, int logistic_cat_fit(gsl_vector *beta, gsl_matrix_int *X, gsl_vector_int *nlev, gsl_vector *y, double lambdaL1, double lambdaL2) { - double mLogLik = 0; + // double mLogLik = 0; fix_parm_cat_T p; int npar = beta->size; int iter = 0; @@ -453,14 +451,16 @@ int logistic_cat_fit(gsl_vector *beta, gsl_matrix_int *X, gsl_vector_int *nlev, p.lambdaL1 = lambdaL1; p.lambdaL2 = lambdaL2; +#ifdef _RPR_DEBUG_ // Initial fit. - mLogLik = wgsl_cat_optim_f(beta, &p); + auto mLogLik = wgsl_cat_optim_f(beta, &p); +#endif - gsl_matrix *myH = gsl_matrix_alloc(npar, npar); // Hessian matrix. - gsl_vector *stBeta = gsl_vector_alloc(npar); // Direction to move. + gsl_matrix *myH = gsl_matrix_safe_alloc(npar, npar); // Hessian matrix. + gsl_vector *stBeta = gsl_vector_safe_alloc(npar); // Direction to move. - gsl_vector *myG = gsl_vector_alloc(npar); // Gradient. - gsl_vector *tau = gsl_vector_alloc(npar); // tau for QR. + gsl_vector *myG = gsl_vector_safe_alloc(npar); // Gradient. + gsl_vector *tau = gsl_vector_safe_alloc(npar); // tau for QR. for (iter = 0; iter < 100; iter++) { wgsl_cat_optim_hessian(beta, &p, myH); // Calculate Hessian. @@ -484,7 +484,7 @@ int logistic_cat_fit(gsl_vector *beta, gsl_matrix_int *X, gsl_vector_int *nlev, } // Final fit. 
- mLogLik = wgsl_cat_optim_f(beta, &p); + // mLogLik = wgsl_cat_optim_f(beta, &p); gsl_vector_free(tau); gsl_vector_free(stBeta); @@ -507,7 +507,7 @@ typedef struct { double lambdaL2; } fix_parm_cont_T; -double fLogit_cont(gsl_vector *beta, gsl_matrix *Xc, gsl_vector *y, +double fLogit_cont(const gsl_vector *beta, const gsl_matrix *Xc, const gsl_vector *y, double lambdaL1, double lambdaL2) { int n = y->size; int npar = beta->size; @@ -531,7 +531,7 @@ double fLogit_cont(gsl_vector *beta, gsl_matrix *Xc, gsl_vector *y, for (int i = 0; i < n; ++i) { double Xbetai = beta->data[0]; int iParm = 1; - for (int k = 0; k < (Xc->size2); ++k) + for (size_t k = 0; k < (Xc->size2); ++k) Xbetai += gsl_matrix_get(Xc, i, k) * beta->data[iParm++]; total += y->data[i] * Xbetai - gsl_sf_log_1plusx(gsl_sf_exp(Xbetai)); } @@ -544,17 +544,17 @@ void logistic_cont_pred(gsl_vector *beta, // Vector of parameters // Nobs x Kc (NULL if not used). gsl_vector *yhat) { // Vector of prob. predicted by // the logistic. - for (int i = 0; i < Xc->size1; ++i) { + for (size_t i = 0; i < Xc->size1; ++i) { double Xbetai = beta->data[0]; int iParm = 1; - for (int k = 0; k < (Xc->size2); ++k) + for (size_t k = 0; k < (Xc->size2); ++k) Xbetai += gsl_matrix_get(Xc, i, k) * beta->data[iParm++]; yhat->data[i] = 1 / (1 + gsl_sf_exp(-Xbetai)); } } // The gradient of f, df = (df/dx, df/dy). -void wgsl_cont_optim_df(const gsl_vector *beta, void *params, gsl_vector *out) { +void wgsl_cont_optim_df(const gsl_vector *beta, const void *params, gsl_vector *out) { fix_parm_cont_T *p = (fix_parm_cont_T *)params; int n = p->y->size; int Kc = p->Xc->size2; @@ -596,7 +596,7 @@ void wgsl_cont_optim_hessian(const gsl_vector *beta, void *params, int n = p->y->size; int Kc = p->Xc->size2; int npar = beta->size; - gsl_vector *gn = gsl_vector_alloc(npar); // gn. + gsl_vector *gn = gsl_vector_safe_alloc(npar); // gn. // Intitialize Hessian out necessary ??? 
@@ -639,7 +639,7 @@ void wgsl_cont_optim_hessian(const gsl_vector *beta, void *params, gsl_vector_free(gn); } -double wgsl_cont_optim_f(gsl_vector *v, void *params) { +double wgsl_cont_optim_f(const gsl_vector *v, const void *params) { double mLogLik = 0; fix_parm_cont_T *p = (fix_parm_cont_T *)params; mLogLik = fLogit_cont(v, p->Xc, p->y, p->lambdaL1, p->lambdaL2); @@ -647,7 +647,7 @@ double wgsl_cont_optim_f(gsl_vector *v, void *params) { } // Compute both f and df together. -void wgsl_cont_optim_fdf(gsl_vector *x, void *params, double *f, +void wgsl_cont_optim_fdf(const gsl_vector *x, const void *params, double *f, gsl_vector *df) { *f = wgsl_cont_optim_f(x, params); wgsl_cont_optim_df(x, params, df); @@ -658,7 +658,6 @@ int logistic_cont_fit(gsl_vector *beta, // Nobs x Kc (NULL if not used). gsl_vector *y, double lambdaL1, double lambdaL2) { - double mLogLik = 0; fix_parm_cont_T p; int npar = beta->size; int iter = 0; @@ -670,14 +669,16 @@ int logistic_cont_fit(gsl_vector *beta, p.lambdaL1 = lambdaL1; p.lambdaL2 = lambdaL2; +#ifdef _RPR_DEBUG_ // Initial fit. - mLogLik = wgsl_cont_optim_f(beta, &p); + auto mLogLik = wgsl_cont_optim_f(beta, &p); +#endif - gsl_matrix *myH = gsl_matrix_alloc(npar, npar); // Hessian matrix. - gsl_vector *stBeta = gsl_vector_alloc(npar); // Direction to move. + gsl_matrix *myH = gsl_matrix_safe_alloc(npar, npar); // Hessian matrix. + gsl_vector *stBeta = gsl_vector_safe_alloc(npar); // Direction to move. - gsl_vector *myG = gsl_vector_alloc(npar); // Gradient. - gsl_vector *tau = gsl_vector_alloc(npar); // tau for QR. + gsl_vector *myG = gsl_vector_safe_alloc(npar); // Gradient. + gsl_vector *tau = gsl_vector_safe_alloc(npar); // tau for QR. for (iter = 0; iter < 100; iter++) { wgsl_cont_optim_hessian(beta, &p, myH); // Calculate Hessian. @@ -701,7 +702,7 @@ int logistic_cont_fit(gsl_vector *beta, } // Final fit. 
- mLogLik = wgsl_cont_optim_f(beta, &p); + // mLogLik = wgsl_cont_optim_f(beta, &p); gsl_vector_free(tau); gsl_vector_free(stBeta); diff --git a/src/logistic.h b/src/logistic.h index bebcbf6..c8c0cb3 100644 --- a/src/logistic.h +++ b/src/logistic.h @@ -58,8 +58,8 @@ int logistic_cont_fit(gsl_vector *beta, // Vector of parameters double lambdaL1, // Regularization L1, 0 if not used. double lambdaL2); // Regularization L2, 0 if not used. -double fLogit_cont(gsl_vector *beta, - gsl_matrix *Xc, // Continuous covariates matrix Nobs x Kc. - gsl_vector *y, double lambdaL1, double lambdaL2); +double fLogit_cont(const gsl_vector *beta, + const gsl_matrix *Xc, // Continuous covariates matrix Nobs x Kc. + const gsl_vector *y, double lambdaL1, double lambdaL2); #endif diff --git a/src/main.cpp b/src/main.cpp index 92c4d90..706ac35 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -33,19 +33,23 @@ int main(int argc, char *argv[]) { if (argc <= 1) { cGemma.PrintHeader(); + cGemma.PrintHelp(0); return EXIT_SUCCESS; } if (argc == 2 && argv[1][0] == '-' && argv[1][1] == 'h') { + cGemma.PrintHeader(); cGemma.PrintHelp(0); return EXIT_SUCCESS; } if (argc == 3 && argv[1][0] == '-' && argv[1][1] == 'h') { string str; str.assign(argv[2]); + cGemma.PrintHeader(); cGemma.PrintHelp(atoi(str.c_str())); return EXIT_SUCCESS; } if (argc == 2 && argv[1][0] == '-' && argv[1][1] == 'l') { + cGemma.PrintHeader(); cGemma.PrintLicense(); return EXIT_SUCCESS; } @@ -57,11 +61,14 @@ int main(int argc, char *argv[]) { mkdir((cPar.path_out).c_str(), S_IRWXU | S_IRGRP | S_IROTH); } + if (!is_quiet_mode()) + cGemma.PrintHeader(); + if (cPar.error == true) { return EXIT_FAILURE; } - if (cPar.mode_silence) { + if (is_quiet_mode()) { stringstream ss; cout.rdbuf(ss.rdbuf()); } diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp index 4203837..9076c47 100644 --- a/src/mathfunc.cpp +++ b/src/mathfunc.cpp @@ -1,19 +1,21 @@ /* - Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou 
- - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include <bitset> @@ -32,7 +34,7 @@ #include <tuple> #include <vector> -#include "Eigen/Dense" +// #include "Eigen/Dense" #include "gsl/gsl_version.h" @@ -40,6 +42,7 @@ #pragma message "GSL version " GSL_VERSION #endif +#include "gsl/gsl_sys.h" // for gsl_isnan, gsl_isinf, gsl_isfinite #include "gsl/gsl_blas.h" #include "gsl/gsl_cdf.h" #include "gsl/gsl_linalg.h" @@ -48,21 +51,49 @@ #include "gsl/gsl_eigen.h" #include "debug.h" -#include "eigenlib.h" +// #include "eigenlib.h" +#include "fastblas.h" #include "lapack.h" #include "mathfunc.h" using namespace std; -using namespace Eigen; +// using namespace Eigen; bool has_nan(const vector<double> v) { for (const auto& e: v) { - if (std::isnan(e)) + if (is_nan(e)) return true; } return false; } +bool has_nan(const gsl_vector *v) { + for (size_t i = 0; i < v->size; ++i) + if (gsl_isnan(gsl_vector_get(v,i))) return true; + return false; +} +bool has_inf(const gsl_vector *v) { + for (size_t i = 0; i < v->size; ++i) { + auto value = gsl_vector_get(v,i); + if (gsl_isinf(value) != 0) return true; + } + return false; +} +bool has_nan(const gsl_matrix *m) { + for (size_t i = 0; i < m->size1; ++i) + for (size_t j = 0; j < m->size2; ++j) + if (gsl_isnan(gsl_matrix_get(m,i,j))) return true; + return false; +} +bool has_inf(const gsl_matrix *m) { + for (size_t i = 0; i < m->size1; ++i) + for (size_t j = 0; j < m->size2; ++j) { + auto value = gsl_matrix_get(m,i,j); + if (gsl_isinf(value) != 0) return true; + } + return false; +} + // calculate variance of a vector double VectorVar(const gsl_vector *v) { double d, m = 0.0, m2 = 0.0; @@ -79,8 +110,8 @@ double VectorVar(const gsl_vector *v) { // Center the matrix G. 
void CenterMatrix(gsl_matrix *G) { double d; - gsl_vector *w = gsl_vector_alloc(G->size1); - gsl_vector *Gw = gsl_vector_alloc(G->size1); + gsl_vector *w = gsl_vector_safe_alloc(G->size1); + gsl_vector *Gw = gsl_vector_safe_alloc(G->size1); gsl_vector_set_all(w, 1.0); gsl_blas_dgemv(CblasNoTrans, 1.0, G, w, 0.0, Gw); @@ -95,8 +126,8 @@ void CenterMatrix(gsl_matrix *G) { } } - gsl_vector_free(w); - gsl_vector_free(Gw); + gsl_vector_safe_free(w); + gsl_vector_safe_free(Gw); return; } @@ -104,7 +135,7 @@ void CenterMatrix(gsl_matrix *G) { // Center the matrix G. void CenterMatrix(gsl_matrix *G, const gsl_vector *w) { double d, wtw; - gsl_vector *Gw = gsl_vector_alloc(G->size1); + gsl_vector *Gw = gsl_vector_safe_alloc(G->size1); gsl_blas_ddot(w, w, &wtw); gsl_blas_dgemv(CblasNoTrans, 1.0, G, w, 0.0, Gw); @@ -119,19 +150,19 @@ void CenterMatrix(gsl_matrix *G, const gsl_vector *w) { } } - gsl_vector_free(Gw); + gsl_vector_safe_free(Gw); return; } // Center the matrix G. void CenterMatrix(gsl_matrix *G, const gsl_matrix *W) { - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *WtWiWt = gsl_matrix_alloc(W->size2, G->size1); - gsl_matrix *GW = gsl_matrix_alloc(G->size1, W->size2); - gsl_matrix *WtGW = gsl_matrix_alloc(W->size2, W->size2); - gsl_matrix *Gtmp = gsl_matrix_alloc(G->size1, G->size1); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWiWt = gsl_matrix_safe_alloc(W->size2, G->size1); + gsl_matrix *GW = gsl_matrix_safe_alloc(G->size1, W->size2); + gsl_matrix *WtGW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *Gtmp = gsl_matrix_safe_alloc(G->size1, G->size1); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); @@ -155,12 +186,12 @@ void CenterMatrix(gsl_matrix *G, const gsl_matrix *W) { gsl_matrix_add(G, Gtmp); - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - 
gsl_matrix_free(WtWiWt); - gsl_matrix_free(GW); - gsl_matrix_free(WtGW); - gsl_matrix_free(Gtmp); + gsl_matrix_safe_free(WtW); + gsl_matrix_safe_free(WtWi); + gsl_matrix_safe_free(WtWiWt); + gsl_matrix_safe_free(GW); + gsl_matrix_safe_free(WtGW); + gsl_matrix_safe_free(Gtmp); return; } @@ -210,8 +241,8 @@ bool isMatrixSymmetric(const gsl_matrix *G) { auto m = G->data; // upper triangle auto size = G->size1; - for(auto c = 0; c < size; c++) { - for(auto r = 0; r < c; r++) { + for(size_t c = 0; c < size; c++) { + for(size_t r = 0; r < c; r++) { int x1 = c, y1 = r, x2 = r, y2 = c; auto idx1 = y1*size+x1, idx2 = y2*size+x2; // printf("(%d,%d %f - %d,%d %f)",x1,y1,m[idx1],x2,y2,m[idx2]); @@ -226,8 +257,8 @@ bool isMatrixSymmetric(const gsl_matrix *G) { bool isMatrixPositiveDefinite(const gsl_matrix *G) { enforce(G->size1 == G->size2); - auto G2 = gsl_matrix_alloc(G->size1, G->size2); - enforce_gsl(gsl_matrix_memcpy(G2,G)); + auto G2 = gsl_matrix_safe_alloc(G->size1, G->size2); + enforce_gsl(gsl_matrix_safe_memcpy(G2,G)); auto handler = gsl_set_error_handler_off(); #if GSL_MAJOR_VERSION >= 2 && GSL_MINOR_VERSION >= 3 auto s = gsl_linalg_cholesky_decomp1(G2); @@ -235,20 +266,24 @@ bool isMatrixPositiveDefinite(const gsl_matrix *G) { auto s = gsl_linalg_cholesky_decomp(G2); #endif gsl_set_error_handler(handler); + if (s == GSL_SUCCESS) { + gsl_matrix_safe_free(G2); + return true; + } gsl_matrix_free(G2); - return (s == GSL_SUCCESS); + return (false); } gsl_vector *getEigenValues(const gsl_matrix *G) { enforce(G->size1 == G->size2); - auto G2 = gsl_matrix_alloc(G->size1, G->size2); - enforce_gsl(gsl_matrix_memcpy(G2,G)); + auto G2 = gsl_matrix_safe_alloc(G->size1, G->size2); + enforce_gsl(gsl_matrix_safe_memcpy(G2,G)); auto eworkspace = gsl_eigen_symm_alloc(G->size1); enforce(eworkspace); - gsl_vector *eigenvalues = gsl_vector_alloc(G->size1); + gsl_vector *eigenvalues = gsl_vector_safe_alloc(G->size1); enforce_gsl(gsl_eigen_symm(G2, eigenvalues, eworkspace)); 
gsl_eigen_symm_free(eworkspace); - gsl_matrix_free(G2); + gsl_matrix_safe_free(G2); return eigenvalues; } @@ -256,11 +291,27 @@ gsl_vector *getEigenValues(const gsl_matrix *G) { // by default 1E-5. // Returns success, eigen min, eigen min-but-1, eigen max +tuple<double, double, double> minmax(const gsl_vector *v) { + auto min = v->data[0]; + auto min1 = min; + auto max = min; + for (size_t i=1; i<v->size; i++) { + auto value = std::abs(v->data[i]); + if (value < min) { + min1 = min; + min = value; + } + if (value > max) + max = value; + } + return std::make_tuple(min, min1, max); +} + tuple<double, double, double> abs_minmax(const gsl_vector *v) { auto min = std::abs(v->data[0]); - auto min1 = std::abs(v->data[0]); - auto max = std::abs(v->data[0]); - for (auto i=0; i<v->size; i++) { + auto min1 = min; + auto max = min; + for (size_t i=1; i<v->size; i++) { auto value = std::abs(v->data[i]); if (value < min) { min1 = min; @@ -276,7 +327,7 @@ tuple<double, double, double> abs_minmax(const gsl_vector *v) { // the lowest value bool has_negative_values_but_one(const gsl_vector *v) { bool one_skipped = false; - for (auto i=0; i<v->size; i++) { + for (size_t i=0; i<v->size; i++) { if (v->data[i] < 0.0) { if (one_skipped) return true; @@ -286,11 +337,12 @@ bool has_negative_values_but_one(const gsl_vector *v) { return false; } -uint count_small_values(const gsl_vector *v, double min) { +uint count_abs_small_values(const gsl_vector *v, double min) { uint count = 0; - for (auto i=0; i<v->size; i++) { - if (v->data[i] < min) + for (size_t i=0; i<v->size; i++) { + if (std::abs(v->data[i]) < min) { count += 1; + } } return count; } @@ -298,24 +350,35 @@ uint count_small_values(const gsl_vector *v, double min) { // Check for matrix being ill conditioned by taking the eigen values // and the ratio of max and min but one (min is expected to be zero). 
bool isMatrixIllConditioned(const gsl_vector *eigenvalues, double max_ratio) { - bool ret_valid = true; - auto t = abs_minmax(eigenvalues); auto absmin = get<0>(t); auto absmin1 = get<1>(t); auto absmax = get<2>(t); if (absmax/absmin1 > max_ratio) { #if !NDEBUG - cerr << "**** DEBUG: Eigenvalues [Min " << absmin << ", " << absmin1 << " ... " << absmax << " Max] Ratio " << absmax/absmin1 << endl; + cerr << "**** DEBUG: Ratio |eigenmax|/|eigenmin| suggests matrix is ill conditioned for double precision" << endl; + auto t = minmax(eigenvalues); + auto min = get<0>(t); + auto min1 = get<1>(t); + auto max = get<2>(t); + cerr << "**** DEBUG: Abs eigenvalues [Min " << absmin << ", " << absmin1 << " ... " << absmax << " Max] Ratio (" << absmax << "/" << absmin1 << ") = " << absmax/absmin1 << endl; + cerr << "**** DEBUG: Eigenvalues [Min " << min << ", " << min1 << " ... " << max << " Max]" << endl; #endif - ret_valid = false; + return true; } - return ret_valid; + return false; +} + +double sum(const double *m, size_t rows, size_t cols) { + double sum = 0.0; + for (size_t i = 0; i<rows*cols; i++) + sum += m[i]; + return sum; } double SumVector(const gsl_vector *v) { double sum = 0; - for (int i = 0; i < v->size; i++ ) { + for (size_t i = 0; i < v->size; i++ ) { sum += gsl_vector_get(v, i); } return( sum ); @@ -337,9 +400,9 @@ double CenterVector(gsl_vector *y) { // Center the vector y. 
void CenterVector(gsl_vector *y, const gsl_matrix *W) { - gsl_matrix *WtW = gsl_matrix_alloc(W->size2, W->size2); - gsl_vector *Wty = gsl_vector_alloc(W->size2); - gsl_vector *WtWiWty = gsl_vector_alloc(W->size2); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *Wty = gsl_vector_safe_alloc(W->size2); + gsl_vector *WtWiWty = gsl_vector_safe_alloc(W->size2); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); gsl_blas_dgemv(CblasTrans, 1.0, W, y, 0.0, Wty); @@ -351,9 +414,9 @@ void CenterVector(gsl_vector *y, const gsl_matrix *W) { gsl_blas_dgemv(CblasNoTrans, -1.0, W, WtWiWty, 1.0, y); - gsl_matrix_free(WtW); - gsl_vector_free(Wty); - gsl_vector_free(WtWiWty); + gsl_matrix_safe_free(WtW); + gsl_vector_safe_free(Wty); + gsl_vector_safe_free(WtWiWty); return; } @@ -379,22 +442,18 @@ void StandardizeVector(gsl_vector *y) { // Calculate UtX. void CalcUtX(const gsl_matrix *U, gsl_matrix *UtX) { - gsl_matrix *X = gsl_matrix_alloc(UtX->size1, UtX->size2); - gsl_matrix_memcpy(X, UtX); - eigenlib_dgemm("T", "N", 1.0, U, X, 0.0, UtX); - gsl_matrix_free(X); - - return; + gsl_matrix *X = gsl_matrix_safe_alloc(UtX->size1, UtX->size2); + gsl_matrix_safe_memcpy(X, UtX); + fast_dgemm("T", "N", 1.0, U, X, 0.0, UtX); + gsl_matrix_safe_free(X); } void CalcUtX(const gsl_matrix *U, const gsl_matrix *X, gsl_matrix *UtX) { - eigenlib_dgemm("T", "N", 1.0, U, X, 0.0, UtX); - return; + fast_dgemm("T", "N", 1.0, U, X, 0.0, UtX); } void CalcUtX(const gsl_matrix *U, const gsl_vector *x, gsl_vector *Utx) { gsl_blas_dgemv(CblasTrans, 1.0, U, x, 0.0, Utx); - return; } // Kronecker product. 
@@ -403,7 +462,7 @@ void Kronecker(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H) { for (size_t j = 0; j < K->size2; j++) { gsl_matrix_view H_sub = gsl_matrix_submatrix( H, i * V->size1, j * V->size2, V->size1, V->size2); - gsl_matrix_memcpy(&H_sub.matrix, V); + gsl_matrix_safe_memcpy(&H_sub.matrix, V); gsl_matrix_scale(&H_sub.matrix, gsl_matrix_get(K, i, j)); } } @@ -416,13 +475,13 @@ void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H) { for (size_t j = i; j < K->size2; j++) { gsl_matrix_view H_sub = gsl_matrix_submatrix( H, i * V->size1, j * V->size2, V->size1, V->size2); - gsl_matrix_memcpy(&H_sub.matrix, V); + gsl_matrix_safe_memcpy(&H_sub.matrix, V); gsl_matrix_scale(&H_sub.matrix, gsl_matrix_get(K, i, j)); if (i != j) { gsl_matrix_view H_sub_sym = gsl_matrix_submatrix( H, j * V->size1, i * V->size2, V->size1, V->size2); - gsl_matrix_memcpy(&H_sub_sym.matrix, &H_sub.matrix); + gsl_matrix_safe_memcpy(&H_sub_sym.matrix, &H_sub.matrix); } } } @@ -520,6 +579,7 @@ unsigned char Double02ToUchar(const double dosage) { return (int)(dosage * 100); } +/* void uchar_matrix_get_row(const vector<vector<unsigned char>> &X, const size_t i_row, VectorXd &x_row) { if (i_row < X.size()) { @@ -531,3 +591,5 @@ void uchar_matrix_get_row(const vector<vector<unsigned char>> &X, exit(1); } } + +*/ diff --git a/src/mathfunc.h b/src/mathfunc.h index 6e20b37..8258c22 100644 --- a/src/mathfunc.h +++ b/src/mathfunc.h @@ -1,25 +1,27 @@ /* - Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
+ This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef __MATHFUNC_H__ #define __MATHFUNC_H__ -#include "Eigen/Dense" +// #include "Eigen/Dense" #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" @@ -27,9 +29,16 @@ #define EIGEN_MINVALUE 1e-10 using namespace std; -using namespace Eigen; + +inline bool is_nan(double f) { + return (std::isnan(f)); +} bool has_nan(const vector<double> v); +bool has_nan(const gsl_vector *v); +bool has_inf(const gsl_vector *v); +bool has_nan(const gsl_matrix *m); +bool has_inf(const gsl_matrix *m); double VectorVar(const gsl_vector *v); void CenterMatrix(gsl_matrix *G); @@ -38,11 +47,12 @@ void CenterMatrix(gsl_matrix *G, const gsl_matrix *W); void StandardizeMatrix(gsl_matrix *G); double ScaleMatrix(gsl_matrix *G); bool has_negative_values_but_one(const gsl_vector *v); -uint count_small_values(const gsl_vector *v, double min); +uint count_abs_small_values(const gsl_vector *v, double min); bool isMatrixPositiveDefinite(const gsl_matrix *G); bool isMatrixIllConditioned(const gsl_vector *eigenvalues, double max_ratio=CONDITIONED_MAXRATIO); bool isMatrixSymmetric(const gsl_matrix *G); gsl_vector *getEigenValues(const gsl_matrix *G); +double sum(const double *m, size_t rows, size_t cols); double SumVector(const gsl_vector *v); double CenterVector(gsl_vector *y); void CenterVector(gsl_vector *y, const gsl_matrix *W); @@ -56,7 +66,7 @@ void KroneckerSym(const gsl_matrix *K, const gsl_matrix *V, gsl_matrix *H); double UcharToDouble02(const unsigned char c); unsigned char Double02ToUchar(const double dosage); -void uchar_matrix_get_row(const vector<vector<unsigned char>> &X, - const size_t i_row, VectorXd &x_row); +// void uchar_matrix_get_row(const vector<vector<unsigned char>> &X, +// const size_t i_row, Eigen::VectorXd &x_row); #endif diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index c5efb6e..eee562d 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -31,14 +31,13 @@ #include "gsl/gsl_blas.h" #include "gsl/gsl_cdf.h" -#include "gsl/gsl_integration.h" #include "gsl/gsl_linalg.h" 
#include "gsl/gsl_matrix.h" #include "gsl/gsl_min.h" #include "gsl/gsl_roots.h" #include "gsl/gsl_vector.h" -#include "eigenlib.h" +#include "fastblas.h" #include "gzstream.h" #include "io.h" #include "lapack.h" @@ -54,7 +53,6 @@ void MVLMM::CopyFromParam(PARAM &cPar) { file_bfile = cPar.file_bfile; file_geno = cPar.file_geno; - file_oxford = cPar.file_oxford; file_out = cPar.file_out; path_out = cPar.path_out; @@ -719,7 +717,7 @@ double MphCalcP(const gsl_vector *eval, const gsl_vector *x_vec, gsl_matrix *Vbeta) { size_t n_size = eval->size, c_size = W->size1, d_size = V_g->size1; size_t dc_size = d_size * c_size; - double delta, dl, d, d1, d2, dy, dx, dw, logdet_Ve, logdet_Q, p_value; + double delta, dl, d, d1, d2, dy, dx, dw; // logdet_Ve, logdet_Q, p_value; gsl_vector *D_l = gsl_vector_alloc(d_size); gsl_matrix *UltVeh = gsl_matrix_alloc(d_size, d_size); @@ -738,10 +736,12 @@ double MphCalcP(const gsl_vector *eval, const gsl_vector *x_vec, gsl_vector_set_zero(WHiy); // Eigen decomposition and calculate log|Ve|. - logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); + // double logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); + EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); // Calculate Qi and log|Q|. - logdet_Q = CalcQi(eval, D_l, W, Qi); + // double logdet_Q = CalcQi(eval, D_l, W, Qi); + CalcQi(eval, D_l, W, Qi); // Calculate UltVehiY. gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); @@ -799,7 +799,7 @@ double MphCalcP(const gsl_vector *eval, const gsl_vector *x_vec, // Calculate test statistic and p value. 
gsl_blas_ddot(D_l, xPy, &d); - p_value = gsl_cdf_chisq_Q(d, (double)d_size); + double p_value = gsl_cdf_chisq_Q(d, (double)d_size); gsl_vector_free(D_l); gsl_matrix_free(UltVeh); @@ -825,7 +825,7 @@ void MphCalcBeta(const gsl_vector *eval, const gsl_matrix *W, gsl_matrix *se_B) { size_t n_size = eval->size, c_size = W->size1, d_size = V_g->size1; size_t dc_size = d_size * c_size; - double delta, dl, d, dy, dw, logdet_Ve, logdet_Q; + double delta, dl, d, dy, dw; // , logdet_Ve, logdet_Q; gsl_vector *D_l = gsl_vector_alloc(d_size); gsl_matrix *UltVeh = gsl_matrix_alloc(d_size, d_size); @@ -840,10 +840,12 @@ void MphCalcBeta(const gsl_vector *eval, const gsl_matrix *W, gsl_vector_set_zero(WHiy); // Eigen decomposition and calculate log|Ve|. - logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); + // double logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); + EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); // Calculate Qi and log|Q|. - logdet_Q = CalcQi(eval, D_l, W, Qi); + // double logdet_Q = CalcQi(eval, D_l, W, Qi); + CalcQi(eval, D_l, W, Qi); // Calculate UltVehiY. gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); @@ -2878,13 +2880,15 @@ void MphInitial(const size_t em_iter, const double em_prec, gsl_vector_set_zero(XHiy); - double logdet_Ve, logdet_Q, dl, d, delta, dx, dy; + double dl, d, delta, dx, dy; // Eigen decomposition and calculate log|Ve|. - logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); + // double logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); + EigenProc(V_g, V_e, D_l, UltVeh, UltVehi); // Calculate Qi and log|Q|. - logdet_Q = CalcQi(eval, D_l, X, Qi); + // double logdet_Q = CalcQi(eval, D_l, X, Qi); + CalcQi(eval, D_l, X, Qi); // Calculate UltVehiY. gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY); @@ -2950,556 +2954,6 @@ double PCRT(const size_t mode, const size_t d_size, const double p_value, return p_crt; } -// WJA added. 
-void MVLMM::Analyzebgen(const gsl_matrix *U, const gsl_vector *eval, - const gsl_matrix *UtW, const gsl_matrix *UtY) { - debug_msg("entering"); - string file_bgen = file_oxford + ".bgen"; - ifstream infile(file_bgen.c_str(), ios::binary); - if (!infile) { - cout << "error reading bgen file:" << file_bgen << endl; - return; - } - - clock_t time_start = clock(); - time_UtX = 0; - time_opt = 0; - - string line; - - // Create a large matrix. - size_t msize = LMM_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix *UtXlarge = gsl_matrix_alloc(U->size1, msize); - gsl_matrix_set_zero(Xlarge); - - double logl_H0 = 0.0, logl_H1 = 0.0, p_wald = 0, p_lrt = 0, p_score = 0; - double crt_a, crt_b, crt_c; - int n_miss, c_phen; - double geno, x_mean; - size_t c = 0; - size_t n_size = UtY->size1, d_size = UtY->size2, c_size = UtW->size2; - - size_t dc_size = d_size * (c_size + 1), v_size = d_size * (d_size + 1) / 2; - - // Large matrices for EM. - gsl_matrix *U_hat = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *E_hat = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *OmegaU = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *OmegaE = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *UltVehiY = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *UltVehiBX = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *UltVehiU = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *UltVehiE = gsl_matrix_alloc(d_size, n_size); - - // Large matrices for NR. Each dxd block is H_k^{-1}. - gsl_matrix *Hi_all = gsl_matrix_alloc(d_size, d_size * n_size); - - // Each column is H_k^{-1}y_k. - gsl_matrix *Hiy_all = gsl_matrix_alloc(d_size, n_size); - - // Each dcxdc block is x_k\otimes H_k^{-1}. 
- gsl_matrix *xHi_all = gsl_matrix_alloc(dc_size, d_size * n_size); - gsl_matrix *Hessian = gsl_matrix_alloc(v_size * 2, v_size * 2); - gsl_vector *x = gsl_vector_alloc(n_size); - gsl_vector *x_miss = gsl_vector_alloc(n_size); - - gsl_matrix *Y = gsl_matrix_alloc(d_size, n_size); - gsl_matrix *X = gsl_matrix_alloc(c_size + 1, n_size); - gsl_matrix *V_g = gsl_matrix_alloc(d_size, d_size); - gsl_matrix *V_e = gsl_matrix_alloc(d_size, d_size); - gsl_matrix *B = gsl_matrix_alloc(d_size, c_size + 1); - gsl_vector *beta = gsl_vector_alloc(d_size); - gsl_matrix *Vbeta = gsl_matrix_alloc(d_size, d_size); - - // Null estimates for initial values. - gsl_matrix *V_g_null = gsl_matrix_alloc(d_size, d_size); - gsl_matrix *V_e_null = gsl_matrix_alloc(d_size, d_size); - gsl_matrix *B_null = gsl_matrix_alloc(d_size, c_size + 1); - gsl_matrix *se_B_null = gsl_matrix_alloc(d_size, c_size); - - gsl_matrix_view X_sub = gsl_matrix_submatrix(X, 0, 0, c_size, n_size); - gsl_matrix_view B_sub = gsl_matrix_submatrix(B, 0, 0, d_size, c_size); - gsl_matrix_view xHi_all_sub = - gsl_matrix_submatrix(xHi_all, 0, 0, d_size * c_size, d_size * n_size); - - gsl_matrix_transpose_memcpy(Y, UtY); - - gsl_matrix_transpose_memcpy(&X_sub.matrix, UtW); - - gsl_vector_view X_row = gsl_matrix_row(X, c_size); - gsl_vector_set_zero(&X_row.vector); - gsl_vector_view B_col = gsl_matrix_column(B, c_size); - gsl_vector_set_zero(&B_col.vector); - - MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min, - l_max, n_region, V_g, V_e, &B_sub.matrix); - logl_H0 = MphEM('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, - OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, - V_e, &B_sub.matrix); - logl_H0 = MphNR('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, - &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, - crt_c); - MphCalcBeta(eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, - se_B_null); - - c = 0; - Vg_remle_null.clear(); - 
Ve_remle_null.clear(); - for (size_t i = 0; i < d_size; i++) { - for (size_t j = i; j < d_size; j++) { - Vg_remle_null.push_back(gsl_matrix_get(V_g, i, j)); - Ve_remle_null.push_back(gsl_matrix_get(V_e, i, j)); - VVg_remle_null.push_back(gsl_matrix_get(Hessian, c, c)); - VVe_remle_null.push_back(gsl_matrix_get(Hessian, c + v_size, c + v_size)); - c++; - } - } - beta_remle_null.clear(); - se_beta_remle_null.clear(); - for (size_t i = 0; i < se_B_null->size1; i++) { - for (size_t j = 0; j < se_B_null->size2; j++) { - beta_remle_null.push_back(gsl_matrix_get(B, i, j)); - se_beta_remle_null.push_back(gsl_matrix_get(se_B_null, i, j)); - } - } - logl_remle_H0 = logl_H0; - - cout.setf(std::ios_base::fixed, std::ios_base::floatfield); - cout.precision(4); - - cout << "REMLE estimate for Vg in the null model: " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - cout << gsl_matrix_get(V_g, i, j) << "\t"; - } - cout << endl; - } - cout << "se(Vg): " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - c = GetIndex(i, j, d_size); - cout << sqrt(gsl_matrix_get(Hessian, c, c)) << "\t"; - } - cout << endl; - } - cout << "REMLE estimate for Ve in the null model: " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - cout << gsl_matrix_get(V_e, i, j) << "\t"; - } - cout << endl; - } - cout << "se(Ve): " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - c = GetIndex(i, j, d_size); - cout << sqrt(gsl_matrix_get(Hessian, c + v_size, c + v_size)) << "\t"; - } - cout << endl; - } - cout << "REMLE likelihood = " << logl_H0 << endl; - - logl_H0 = MphEM('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat, - OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, - V_e, &B_sub.matrix); - logl_H0 = MphNR('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all, - &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, - crt_c); - 
MphCalcBeta(eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix, - se_B_null); - - c = 0; - Vg_mle_null.clear(); - Ve_mle_null.clear(); - for (size_t i = 0; i < d_size; i++) { - for (size_t j = i; j < d_size; j++) { - Vg_mle_null.push_back(gsl_matrix_get(V_g, i, j)); - Ve_mle_null.push_back(gsl_matrix_get(V_e, i, j)); - VVg_mle_null.push_back(gsl_matrix_get(Hessian, c, c)); - VVe_mle_null.push_back(gsl_matrix_get(Hessian, c + v_size, c + v_size)); - c++; - } - } - beta_mle_null.clear(); - se_beta_mle_null.clear(); - for (size_t i = 0; i < se_B_null->size1; i++) { - for (size_t j = 0; j < se_B_null->size2; j++) { - beta_mle_null.push_back(gsl_matrix_get(B, i, j)); - se_beta_mle_null.push_back(gsl_matrix_get(se_B_null, i, j)); - } - } - logl_mle_H0 = logl_H0; - - cout << "MLE estimate for Vg in the null model: " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - cout << gsl_matrix_get(V_g, i, j) << "\t"; - } - cout << endl; - } - cout << "se(Vg): " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - c = GetIndex(i, j, d_size); - cout << sqrt(gsl_matrix_get(Hessian, c, c)) << "\t"; - } - cout << endl; - } - cout << "MLE estimate for Ve in the null model: " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - cout << gsl_matrix_get(V_e, i, j) << "\t"; - } - cout << endl; - } - cout << "se(Ve): " << endl; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = 0; j <= i; j++) { - c = GetIndex(i, j, d_size); - cout << sqrt(gsl_matrix_get(Hessian, c + v_size, c + v_size)) << "\t"; - } - cout << endl; - } - cout << "MLE likelihood = " << logl_H0 << endl; - - vector<double> v_beta, v_Vg, v_Ve, v_Vbeta; - for (size_t i = 0; i < d_size; i++) { - v_beta.push_back(0.0); - } - for (size_t i = 0; i < d_size; i++) { - for (size_t j = i; j < d_size; j++) { - v_Vg.push_back(0.0); - v_Ve.push_back(0.0); - v_Vbeta.push_back(0.0); - } - } - - gsl_matrix_memcpy(V_g_null, 
V_g); - gsl_matrix_memcpy(V_e_null, V_e); - gsl_matrix_memcpy(B_null, B); - - // Read in header. - uint32_t bgen_snp_block_offset; - uint32_t bgen_header_length; - uint32_t bgen_nsamples; - uint32_t bgen_nsnps; - uint32_t bgen_flags; - infile.read(reinterpret_cast<char *>(&bgen_snp_block_offset), 4); - infile.read(reinterpret_cast<char *>(&bgen_header_length), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsnps), 4); - bgen_snp_block_offset -= 4; - infile.read(reinterpret_cast<char *>(&bgen_nsamples), 4); - bgen_snp_block_offset -= 4; - infile.ignore(4 + bgen_header_length - 20); - bgen_snp_block_offset -= 4 + bgen_header_length - 20; - infile.read(reinterpret_cast<char *>(&bgen_flags), 4); - bgen_snp_block_offset -= 4; - bool CompressedSNPBlocks = bgen_flags & 0x1; - - infile.ignore(bgen_snp_block_offset); - - double bgen_geno_prob_AA, bgen_geno_prob_AB, bgen_geno_prob_BB; - double bgen_geno_prob_non_miss; - - uint32_t bgen_N; - uint16_t bgen_LS; - uint16_t bgen_LR; - uint16_t bgen_LC; - uint32_t bgen_SNP_pos; - uint32_t bgen_LA; - std::string bgen_A_allele; - uint32_t bgen_LB; - std::string bgen_B_allele; - uint32_t bgen_P; - size_t unzipped_data_size; - string id; - string rs; - string chr; - std::cout << "Warning: WJA hard coded SNP missingness threshold " - << "of 10%" << std::endl; - - // Start reading genotypes and analyze. - size_t csnp = 0, t_last = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (indicator_snp[t] == 0) { - continue; - } - t_last++; - } - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); - } - if (indicator_snp[t] == 0) { - continue; - } - - // Read SNP header. 
- id.clear(); - rs.clear(); - chr.clear(); - bgen_A_allele.clear(); - bgen_B_allele.clear(); - - infile.read(reinterpret_cast<char *>(&bgen_N), 4); - infile.read(reinterpret_cast<char *>(&bgen_LS), 2); - - id.resize(bgen_LS); - infile.read(&id[0], bgen_LS); - - infile.read(reinterpret_cast<char *>(&bgen_LR), 2); - rs.resize(bgen_LR); - infile.read(&rs[0], bgen_LR); - - infile.read(reinterpret_cast<char *>(&bgen_LC), 2); - chr.resize(bgen_LC); - infile.read(&chr[0], bgen_LC); - - infile.read(reinterpret_cast<char *>(&bgen_SNP_pos), 4); - - infile.read(reinterpret_cast<char *>(&bgen_LA), 4); - bgen_A_allele.resize(bgen_LA); - infile.read(&bgen_A_allele[0], bgen_LA); - - infile.read(reinterpret_cast<char *>(&bgen_LB), 4); - bgen_B_allele.resize(bgen_LB); - infile.read(&bgen_B_allele[0], bgen_LB); - - uint16_t unzipped_data[3 * bgen_N]; - - if (indicator_snp[t] == 0) { - if (CompressedSNPBlocks) - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - else - bgen_P = 6 * bgen_N; - - infile.ignore(static_cast<size_t>(bgen_P)); - - continue; - } - - if (CompressedSNPBlocks) { - - infile.read(reinterpret_cast<char *>(&bgen_P), 4); - uint8_t zipped_data[bgen_P]; - - unzipped_data_size = 6 * bgen_N; - - infile.read(reinterpret_cast<char *>(zipped_data), bgen_P); - - int result = uncompress(reinterpret_cast<Bytef *>(unzipped_data), - reinterpret_cast<uLongf *>(&unzipped_data_size), - reinterpret_cast<Bytef *>(zipped_data), - static_cast<uLong>(bgen_P)); - assert(result == Z_OK); - - } else { - - bgen_P = 6 * bgen_N; - infile.read(reinterpret_cast<char *>(unzipped_data), bgen_P); - } - - x_mean = 0.0; - c_phen = 0; - n_miss = 0; - gsl_vector_set_zero(x_miss); - for (size_t i = 0; i < bgen_N; ++i) { - if (indicator_idv[i] == 0) { - continue; - } - - bgen_geno_prob_AA = static_cast<double>(unzipped_data[i * 3]) / 32768.0; - bgen_geno_prob_AB = - static_cast<double>(unzipped_data[i * 3 + 1]) / 32768.0; - bgen_geno_prob_BB = - static_cast<double>(unzipped_data[i * 3 + 2]) / 
32768.0; - - // WJA. - bgen_geno_prob_non_miss = - bgen_geno_prob_AA + bgen_geno_prob_AB + bgen_geno_prob_BB; - if (bgen_geno_prob_non_miss < 0.9) { - gsl_vector_set(x_miss, c_phen, 0.0); - n_miss++; - } else { - - bgen_geno_prob_AA /= bgen_geno_prob_non_miss; - bgen_geno_prob_AB /= bgen_geno_prob_non_miss; - bgen_geno_prob_BB /= bgen_geno_prob_non_miss; - - geno = 2.0 * bgen_geno_prob_BB + bgen_geno_prob_AB; - - gsl_vector_set(x, c_phen, geno); - gsl_vector_set(x_miss, c_phen, 1.0); - x_mean += geno; - } - c_phen++; - } - - x_mean /= static_cast<double>(ni_test - n_miss); - - for (size_t i = 0; i < ni_test; ++i) { - if (gsl_vector_get(x_miss, i) == 0) { - gsl_vector_set(x, i, x_mean); - } - } - - gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, csnp % msize); - gsl_vector_memcpy(&Xlarge_col.vector, x); - csnp++; - - if (csnp % msize == 0 || csnp == t_last) { - size_t l = 0; - if (csnp % msize == 0) { - l = msize; - } else { - l = csnp % msize; - } - - gsl_matrix_view Xlarge_sub = - gsl_matrix_submatrix(Xlarge, 0, 0, Xlarge->size1, l); - gsl_matrix_view UtXlarge_sub = - gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); - - time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, - &UtXlarge_sub.matrix); - time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - gsl_matrix_set_zero(Xlarge); - - for (size_t i = 0; i < l; i++) { - gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); - gsl_vector_memcpy(&X_row.vector, &UtXlarge_col.vector); - - // Initial values. - gsl_matrix_memcpy(V_g, V_g_null); - gsl_matrix_memcpy(V_e, V_e_null); - gsl_matrix_memcpy(B, B_null); - - time_start = clock(); - - // 3 is before 1. 
- if (a_mode == 3 || a_mode == 4) { - p_score = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g_null, - V_e_null, UltVehiY, beta, Vbeta); - if (p_score < p_nr && crt == 1) { - logl_H1 = MphNR('R', 1, nr_prec * 10, eval, X, Y, Hi_all, xHi_all, - Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_score = PCRT(3, d_size, p_score, crt_a, crt_b, crt_c); - } - } - - if (a_mode == 2 || a_mode == 4) { - logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, X, Y, U_hat, - E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, - UltVehiE, V_g, V_e, B); - - // Calculate beta and Vbeta. - p_lrt = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, - UltVehiY, beta, Vbeta); - p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size); - - if (p_lrt < p_nr) { - logl_H1 = - MphNR('L', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all, - xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - - // Calculate beta and Vbeta. - p_lrt = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, - UltVehiY, beta, Vbeta); - p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size); - - if (crt == 1) { - p_lrt = PCRT(2, d_size, p_lrt, crt_a, crt_b, crt_c); - } - } - } - - if (a_mode == 1 || a_mode == 4) { - logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, X, Y, U_hat, - E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, - UltVehiE, V_g, V_e, B); - p_wald = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, - UltVehiY, beta, Vbeta); - - if (p_wald < p_nr) { - logl_H1 = - MphNR('R', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all, - xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c); - p_wald = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e, - UltVehiY, beta, Vbeta); - - if (crt == 1) { - p_wald = PCRT(1, d_size, p_wald, crt_a, crt_b, crt_c); - } - } - } - - time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); - - // Store summary data. 
- for (size_t i = 0; i < d_size; i++) { - v_beta[i] = gsl_vector_get(beta, i); - } - - c = 0; - for (size_t i = 0; i < d_size; i++) { - for (size_t j = i; j < d_size; j++) { - v_Vg[c] = gsl_matrix_get(V_g, i, j); - v_Ve[c] = gsl_matrix_get(V_e, i, j); - v_Vbeta[c] = gsl_matrix_get(Vbeta, i, j); - c++; - } - } - - MPHSUMSTAT SNPs = {v_beta, p_wald, p_lrt, p_score, v_Vg, v_Ve, v_Vbeta}; - sumStat.push_back(SNPs); - } - } - } - cout << endl; - - infile.close(); - infile.clear(); - - gsl_matrix_free(U_hat); - gsl_matrix_free(E_hat); - gsl_matrix_free(OmegaU); - gsl_matrix_free(OmegaE); - gsl_matrix_free(UltVehiY); - gsl_matrix_free(UltVehiBX); - gsl_matrix_free(UltVehiU); - gsl_matrix_free(UltVehiE); - - gsl_matrix_free(Hi_all); - gsl_matrix_free(Hiy_all); - gsl_matrix_free(xHi_all); - gsl_matrix_free(Hessian); - - gsl_vector_free(x); - gsl_vector_free(x_miss); - - gsl_matrix_free(Y); - gsl_matrix_free(X); - gsl_matrix_free(V_g); - gsl_matrix_free(V_e); - gsl_matrix_free(B); - gsl_vector_free(beta); - gsl_matrix_free(Vbeta); - - gsl_matrix_free(V_g_null); - gsl_matrix_free(V_e_null); - gsl_matrix_free(B_null); - gsl_matrix_free(se_B_null); - - gsl_matrix_free(Xlarge); - gsl_matrix_free(UtXlarge); - - return; -} - void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW, const gsl_matrix *UtY) { debug_msg("entering"); @@ -3739,24 +3193,24 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, t_last++; } for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); + ProgressBar("Reading SNPs", t, ns_total - 1); } if (indicator_snp[t] == 0) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); 
+ ch_ptr = strtok_safe(NULL, " , \t"); x_mean = 0.0; c_phen = 0; n_miss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } @@ -3801,8 +3255,8 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, - &UtXlarge_sub.matrix); + fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, + &UtXlarge_sub.matrix); time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); gsl_matrix_set_zero(Xlarge); @@ -4190,7 +3644,7 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, } for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) { if (t % d_pace == 0 || t == snpInfo.size() - 1) { - ProgressBar("Reading SNPs ", t, snpInfo.size() - 1); + ProgressBar("Reading SNPs", t, snpInfo.size() - 1); } if (indicator_snp[t] == 0) { continue; @@ -4268,7 +3722,7 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, gsl_matrix_submatrix(UtXlarge, 0, 0, UtXlarge->size1, l); time_start = clock(); - eigenlib_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, + fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, &UtXlarge_sub.matrix); time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); @@ -4416,7 +3870,7 @@ void CalcMvLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW, size_t n_size = UtY->size1, d_size = UtY->size2, c_size = UtW->size2; size_t dc_size = d_size * c_size, v_size = d_size * (d_size + 1) / 2; - double logl, crt_a, crt_b, crt_c; + double crt_a, crt_b, crt_c; // Large matrices for EM. gsl_matrix *U_hat = gsl_matrix_alloc(d_size, n_size); @@ -4448,10 +3902,10 @@ void CalcMvLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW, // Initial, EM, NR, and calculate B. 
MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, W, Y, l_min, l_max, n_region, V_g, V_e, B); - logl = MphEM('R', em_iter, em_prec, eval, W, Y, U_hat, E_hat, OmegaU, OmegaE, - UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); - logl = MphNR('R', nr_iter, nr_prec, eval, W, Y, Hi_all, xHi_all, Hiy_all, V_g, - V_e, Hessian, crt_a, crt_b, crt_c); + MphEM('R', em_iter, em_prec, eval, W, Y, U_hat, E_hat, OmegaU, OmegaE, + UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B); + MphNR('R', nr_iter, nr_prec, eval, W, Y, Hi_all, xHi_all, Hiy_all, V_g, + V_e, Hessian, crt_a, crt_b, crt_c); MphCalcBeta(eval, W, Y, V_g, V_e, UltVehiY, B, se_B); // Free matrices. @@ -4716,24 +4170,24 @@ void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval, // Start reading genotypes and analyze. for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % d_pace == 0 || t == (ns_total - 1)) { - ProgressBar("Reading SNPs ", t, ns_total - 1); + ProgressBar("Reading SNPs", t, ns_total - 1); } if (indicator_snp[t] == 0) { continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); x_mean = 0.0; c_phen = 0; n_miss = 0; gsl_vector_set_zero(x_miss); for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (indicator_idv[i] == 0) { continue; } @@ -5175,7 +4629,7 @@ void MVLMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval, for (vector<SNPINFO>::size_type t = 0; t < snpInfo.size(); ++t) { if (t % d_pace == 0 || t == snpInfo.size() - 1) { - ProgressBar("Reading SNPs ", t, snpInfo.size() - 1); + ProgressBar("Reading SNPs", t, snpInfo.size() - 1); } if (indicator_snp[t] == 0) { continue; diff --git a/src/param.cpp b/src/param.cpp index 
3b319e9..bf6c195 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,12 +18,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <iostream> +#include <iomanip> +#include <string> #include <algorithm> #include <cmath> #include <cstring> #include <fstream> -#include <iostream> -#include <string> #include <sys/stat.h> #include "gsl/gsl_blas.h" @@ -66,7 +69,7 @@ void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps, // (indicator_idv[x] == 1). This should match indicator_cvt etc. If // this gives problems with certain sets we can simply trim to size. -void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) { +void trim_individuals(vector<int> &idvs, size_t ni_max) { if (ni_max) { size_t count = 0; for (auto ind = idvs.begin(); ind != idvs.end(); ++ind) { @@ -76,7 +79,7 @@ void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) { break; } if (count != idvs.size()) { - if (debug) + if (is_debug_mode()) cout << "**** TEST MODE: trim individuals from " << idvs.size() << " to " << count << endl; idvs.resize(count); @@ -87,7 +90,7 @@ void trim_individuals(vector<int> &idvs, size_t ni_max, bool debug) { // ---- PARAM class implementation PARAM::PARAM(void) - : mode_silence(false), a_mode(0), k_mode(1), d_pace(100000), + : a_mode(0), k_mode(1), d_pace(DEFAULT_PACE), file_out("result"), path_out("./output/"), miss_level(0.05), maf_level(0.01), hwe_level(0), r2_level(0.9999), l_min(1e-5), l_max(1e5), n_region(10), p_nr(0.001), em_prec(0.0001), nr_prec(0.0001), @@ -97,7 +100,7 @@ PARAM::PARAM(void) rho_ngrid(10), s_min(0), s_max(300), w_step(100000), s_step(1000000), 
r_pace(10), w_pace(1000), n_accept(0), n_mh(10), geo_mean(2000.0), randseed(-1), window_cm(0), window_bp(0), window_ns(0), n_block(200), - error(false), ni_subsample(0), n_cvt(1), n_vc(1), n_cat(0), + error(false), ni_subsample(0), n_cvt(1), n_cat(0), n_vc(1), time_total(0.0), time_G(0.0), time_eigen(0.0), time_UtX(0.0), time_UtZ(0.0), time_opt(0.0), time_Omega(0.0) {} @@ -221,7 +224,7 @@ void PARAM::ReadFiles(void) { } else { n_cvt = 1; } - trim_individuals(indicator_cvt, ni_max, mode_debug); + trim_individuals(indicator_cvt, ni_max); if (!file_gxe.empty()) { if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) { @@ -234,38 +237,7 @@ void PARAM::ReadFiles(void) { } } - trim_individuals(indicator_idv, ni_max, mode_debug); - - // WJA added. - // Read genotype and phenotype file for bgen format. - if (!file_oxford.empty()) { - file_str = file_oxford + ".sample"; - if (ReadFile_sample(file_str, indicator_pheno, pheno, p_column, - indicator_cvt, cvt, n_cvt) == false) { - error = true; - } - if ((indicator_cvt).size() == 0) { - n_cvt = 1; - } - - // Post-process covariates and phenotypes, obtain - // ni_test, save all useful covariates. - ProcessCvtPhen(); - - // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); - - file_str = file_oxford + ".bgen"; - if (ReadFile_bgen(file_str, setSnps, W, indicator_idv, indicator_snp, - snpInfo, maf_level, miss_level, hwe_level, r2_level, - ns_test) == false) { - error = true; - } - gsl_matrix_free(W); - - ns_total = indicator_snp.size(); - } + trim_individuals(indicator_idv, ni_max); // Read genotype and phenotype file for PLINK format. if (!file_bfile.empty()) { @@ -297,16 +269,16 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. 
- gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + auto W1 = gsl_matrix_safe_alloc(ni_test, n_cvt); + CopyCvt(W1); file_str = file_bfile + ".bed"; - if (ReadFile_bed(file_str, setSnps, W, indicator_idv, indicator_snp, + if (ReadFile_bed(file_str, setSnps, W1, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test) == false) { error = true; } - gsl_matrix_free(W); + gsl_matrix_free(W1); ns_total = indicator_snp.size(); } @@ -330,17 +302,17 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + auto W2 = gsl_matrix_safe_alloc(ni_test, n_cvt); + CopyCvt(W2); - trim_individuals(indicator_idv, ni_max, mode_debug); - trim_individuals(indicator_cvt, ni_max, mode_debug); - if (ReadFile_geno(file_geno, setSnps, W, indicator_idv, indicator_snp, + trim_individuals(indicator_idv, ni_max); + trim_individuals(indicator_cvt, ni_max); + if (ReadFile_geno(file_geno, setSnps, W2, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, - mapRS2bp, mapRS2cM, snpInfo, ns_test, mode_debug) == false) { + mapRS2bp, mapRS2cM, snpInfo, ns_test) == false) { error = true; } - gsl_matrix_free(W); + gsl_matrix_free(W2); ns_total = indicator_snp.size(); } @@ -356,7 +328,7 @@ void PARAM::ReadFiles(void) { string file_name; size_t t = 0, ns_test_tmp = 0; - gsl_matrix *W; + gsl_matrix *W3 = NULL; while (!safeGetline(infile, file_name).eof()) { file_str = file_name + ".bim"; @@ -388,12 +360,12 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. 
- W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + W3 = gsl_matrix_safe_alloc(ni_test, n_cvt); + CopyCvt(W3); } file_str = file_name + ".bed"; - if (ReadFile_bed(file_str, setSnps, W, indicator_idv, indicator_snp, + if (ReadFile_bed(file_str, setSnps, W3, indicator_idv, indicator_snp, snpInfo, maf_level, miss_level, hwe_level, r2_level, ns_test_tmp) == false) { error = true; @@ -406,7 +378,7 @@ void PARAM::ReadFiles(void) { t++; } - gsl_matrix_free(W); + if (W3) gsl_matrix_free(W3); infile.close(); infile.clear(); @@ -432,8 +404,8 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + gsl_matrix *W4 = gsl_matrix_safe_alloc(ni_test, n_cvt); + CopyCvt(W4); igzstream infile(file_mgeno.c_str(), igzstream::in); if (!infile) { @@ -445,9 +417,9 @@ void PARAM::ReadFiles(void) { string file_name; size_t ns_test_tmp; while (!safeGetline(infile, file_name).eof()) { - if (ReadFile_geno(file_name, setSnps, W, indicator_idv, indicator_snp, + if (ReadFile_geno(file_name, setSnps, W4, indicator_idv, indicator_snp, maf_level, miss_level, hwe_level, r2_level, mapRS2chr, - mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp, mode_debug) == false) { + mapRS2bp, mapRS2cM, snpInfo, ns_test_tmp) == false) { error = true; } @@ -457,7 +429,7 @@ void PARAM::ReadFiles(void) { ns_total += indicator_snp.size(); } - gsl_matrix_free(W); + gsl_matrix_free(W4); infile.close(); infile.clear(); @@ -485,8 +457,8 @@ void PARAM::ReadFiles(void) { ProcessCvtPhen(); // Obtain covariate matrix. - gsl_matrix *W = gsl_matrix_alloc(ni_test, n_cvt); - CopyCvt(W); + // gsl_matrix *W5 = gsl_matrix_alloc(ni_test, n_cvt); + // CopyCvt(W5); if (ReadFile_gene(file_gene, vec_read, snpInfo, ng_total) == false) { error = true; @@ -741,19 +713,6 @@ void PARAM::CheckParam(void) { } } - if (!file_oxford.empty()) { - str = file_oxford + ".bgen"; - if (stat(str.c_str(), &fileInfo) == -1) { - cout << "error! 
fail to open .bgen file: " << str << endl; - error = true; - } - str = file_oxford + ".sample"; - if (stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open .sample file: " << str << endl; - error = true; - } - } - if ((!file_geno.empty() || !file_gene.empty())) { str = file_pheno; if (stat(str.c_str(), &fileInfo) == -1) { @@ -864,11 +823,6 @@ void PARAM::CheckParam(void) { flag++; } - // WJA added. - if (!file_oxford.empty()) { - flag++; - } - if (flag != 1 && a_mode != 15 && a_mode != 27 && a_mode != 28 && a_mode != 43 && a_mode != 5 && a_mode != 61 && a_mode != 62 && a_mode != 63 && a_mode != 66 && a_mode != 67) { @@ -942,14 +896,12 @@ void PARAM::CheckParam(void) { enforce_fexists(file_snps, "open file"); enforce_fexists(file_ksnps, "open file"); enforce_fexists(file_gwasnps, "open file"); - enforce_fexists(file_log, "open file"); enforce_fexists(file_anno, "open file"); if (!loco.empty()) { enforce_msg((a_mode >= 1 && a_mode <= 4) || a_mode == 21 || a_mode == 22, "LOCO only works with LMM and K"); - enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); - enforce_msg(file_oxford.empty(), "LOCO does not work with Oxford (yet)"); + // enforce_msg(file_bfile.empty(), "LOCO does not work with PLink (yet)"); enforce_msg(file_gxe.empty(), "LOCO does not support GXE (yet)"); enforce_msg(!file_anno.empty(), "LOCO requires annotation file (-a switch)"); @@ -957,54 +909,15 @@ void PARAM::CheckParam(void) { enforce_msg(file_gwasnps.empty(), "LOCO does not allow -gwasnps switch"); } - str = file_kin; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open relatedness matrix file: " << str << endl; - error = true; - } - - str = file_mk; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open relatedness matrix file: " << str << endl; - error = true; - } - - str = file_cvt; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! 
fail to open covariates file: " << str << endl; - error = true; - } - - str = file_gxe; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open environmental covariate file: " << str << endl; - error = true; - } - - str = file_weight; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open the residual weight file: " << str << endl; - error = true; - } - - str = file_epm; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open estimated parameter file: " << str << endl; - error = true; - } - - str = file_ebv; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open estimated breeding value file: " << str - << endl; - error = true; - } - - str = file_read; - if (!str.empty() && stat(str.c_str(), &fileInfo) == -1) { - cout << "error! fail to open total read file: " << str << endl; - error = true; - } + enforce_fexists(file_kin, "open file"); + enforce_fexists(file_mk, "open file"); + enforce_fexists(file_cvt, "open file"); + enforce_fexists(file_gxe, "open file"); + enforce_fexists(file_log, "open file"); + enforce_fexists(file_weight, "open file"); + enforce_fexists(file_epm, "open file"); + enforce_fexists(file_ebv, "open file"); + enforce_fexists(file_read, "open file"); // Check if files are compatible with analysis mode. if (k_mode == 2 && !file_geno.empty()) { @@ -1056,14 +969,6 @@ void PARAM::CheckParam(void) { void PARAM::CheckData(void) { - // WJA NOTE: I added this condition so that covariates can be added - // through sample, probably not exactly what is wanted. - if (file_oxford.empty()) { - if ((file_cvt).empty() || (indicator_cvt).size() == 0) { - n_cvt = 1; - } - } - if ((a_mode == 66 || a_mode == 67) && (v_pve.size() != n_vc)) { cout << "error! 
the number of pve estimates does not equal to " << "the number of categories in the cat file:" << v_pve.size() << " " @@ -1194,21 +1099,21 @@ void PARAM::CheckData(void) { cout << "## number of total genes = " << ng_total << endl; } else if (file_epm.empty() && a_mode != 43 && a_mode != 5) { if (!loco.empty()) - cout << "## leave one chromosome out (LOCO) = " << loco << endl; - cout << "## number of total SNPs = " << ns_total << endl; + cout << "## leave one chromosome out (LOCO) = " << setw(8) << loco << endl; + cout << "## number of total SNPs/var = " << setw(8) << ns_total << endl; if (setSnps.size()) - cout << "## number of considered SNPS = " << setSnps.size() << endl; + cout << "## number of considered SNPS = " << setw(8) << setSnps.size() << endl; if (setKSnps.size()) - cout << "## number of SNPS for K = " << setKSnps.size() << endl; + cout << "## number of SNPS for K = " << setw(8) << setKSnps.size() << endl; if (setGWASnps.size()) - cout << "## number of SNPS for GWAS = " << setGWASnps.size() << endl; - cout << "## number of analyzed SNPs = " << ns_test << endl; + cout << "## number of SNPS for GWAS = " << setw(8) << setGWASnps.size() << endl; + cout << "## number of analyzed SNPs = " << setw(8) << ns_test << endl; } else { } } // Set d_pace to 1000 for gene expression. 
- if (!file_gene.empty() && d_pace == 100000) { + if (!file_gene.empty() && d_pace == DEFAULT_PACE) { d_pace = 1000; } @@ -1340,7 +1245,7 @@ void PARAM::ReadGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_K) { } } else { if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, UtX, K, - calc_K, mode_debug) == false) { + calc_K) == false) { error = true; } } @@ -1360,7 +1265,7 @@ void PARAM::ReadGenotypes(vector<vector<unsigned char>> &Xt, gsl_matrix *K, } } else { if (ReadFile_geno(file_geno, indicator_idv, indicator_snp, Xt, K, calc_K, - ni_test, ns_test, mode_debug) == false) { + ni_test, ns_test) == false) { error = true; } } @@ -1375,18 +1280,11 @@ void PARAM::CalcKin(gsl_matrix *matrix_kin) { if (!file_bfile.empty()) { file_str = file_bfile + ".bed"; - enforce_msg(loco.empty(), "FIXME: LOCO nyi"); + // enforce_msg(loco.empty(), "FIXME: LOCO nyi"); if (PlinkKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == false) { error = true; } - } else if (!file_oxford.empty()) { - file_str = file_oxford + ".bgen"; - enforce_msg(loco.empty(), "FIXME: LOCO nyi"); - if (bgenKin(file_str, indicator_snp, a_mode - 20, d_pace, matrix_kin) == - false) { - error = true; - } } else { file_str = file_geno; if (BimbamKin(file_str, setKSnps, indicator_snp, a_mode - 20, d_pace, diff --git a/src/param.h b/src/param.h index ff279bd..9ad14b2 100644 --- a/src/param.h +++ b/src/param.h @@ -1,6 +1,8 @@ /* Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright (C) 2011-2017, Xiang Zhou + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,7 +28,8 @@ #include <set> #include <vector> -#define K_BATCH_SIZE 10000 // #snps used for batched K +#define K_BATCH_SIZE 20000 // #snps used for batched K +#define DEFAULT_PACE 1000 // for display only using namespace std; @@ 
-115,16 +118,16 @@ public: class PARAM { public: // IO-related parameters - bool mode_check = true; // run data checks (slower) - bool mode_strict = false; // exit on some data checks - bool mode_silence; - bool mode_debug = false; - uint issue; // enable tests for issue on github tracker + // bool mode_check = true; // run data checks (slower) + // bool mode_strict = false; // exit on some data checks + // bool mode_silence; + // bool mode_debug = false; + // uint issue; // enable tests for issue on github tracker uint a_mode; // Analysis mode, 1/2/3/4 for Frequentist tests int k_mode; // Kinship read mode: 1: n by n matrix, 2: id/id/k_value; vector<size_t> p_column; // Which phenotype column needs analysis. - size_t d_pace; // Display pace + size_t d_pace = DEFAULT_PACE; // Display pace (-pace switch) string file_bfile, file_mbfile; string file_geno, file_mgeno; @@ -155,9 +158,6 @@ public: string file_ksnps; // File SNPs for computing K string file_gwasnps; // File SNPs for computing GWAS - // WJA added. - string file_oxford; - // QC-related parameters. double miss_level; double maf_level; @@ -368,10 +368,4 @@ public: size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt); -// Helpers for checking parameters -#define enforce_fexists(fn, msg) \ - if (!fn.empty()) \ - enforce_msg(stat(fn.c_str(), &fileInfo) == 0, \ - ((std::string(__STRING(fn)) + ": " + msg).c_str())); - #endif diff --git a/src/prdt.cpp b/src/prdt.cpp index 9dc84bc..fc0abe8 100644 --- a/src/prdt.cpp +++ b/src/prdt.cpp @@ -227,7 +227,7 @@ void PRDT::AnalyzeBimbam(gsl_vector *y_prdt) { // Start reading genotypes and analyze. 
for (size_t t = 0; t < ns_total; ++t) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % d_pace == 0 || t == (ns_total - 1)) { ProgressBar("Reading SNPs ", t, ns_total - 1); } diff --git a/src/varcov.cpp b/src/varcov.cpp index 39c3523..e9c7295 100644 --- a/src/varcov.cpp +++ b/src/varcov.cpp @@ -198,7 +198,7 @@ void VARCOV::CalcNB(vector<SNPINFO> &snpInfo_sort) { (snpInfo_sort[t2].cM - snpInfo_sort[t].cM < window_cm || window_cm == 0) && (snpInfo_sort[t2].base_position - snpInfo_sort[t].base_position < - window_bp || + (long int) window_bp || window_bp == 0) && (n_nb < window_ns || window_ns == 0)) { t2++; @@ -41,7 +41,7 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_multiroots.h" -#include "Eigen/Dense" +// #include "Eigen/Dense" #include "eigenlib.h" #include "gzstream.h" @@ -53,7 +53,7 @@ #include "vc.h" using namespace std; -using namespace Eigen; +// using namespace Eigen; // In this file, X, Y are already transformed (i.e. UtX and UtY). void VC::CopyFromParam(PARAM &cPar) { @@ -663,7 +663,7 @@ void ReadFile_cor(const string &file_cor, const set<string> &setSnps, HEADER header; // Header. - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_vc(line, header); if (header.n_col == 0) { @@ -678,7 +678,7 @@ void ReadFile_cor(const string &file_cor, const set<string> &setSnps, while (!safeGetline(infile, line).eof()) { // do not read cor values this time; upto col_n-1. - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); n_total = 0; n_mis = 0; @@ -688,6 +688,7 @@ void ReadFile_cor(const string &file_cor, const set<string> &setSnps, d_cm = 0; d_pos = 0; for (size_t i = 0; i < header.coln - 1; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } @@ -822,7 +823,7 @@ void ReadFile_beta(const bool flag_priorscale, const string &file_beta, // Read header. 
HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_vc(line, header); if (header.n_col == 0) { @@ -844,7 +845,7 @@ void ReadFile_beta(const bool flag_priorscale, const string &file_beta, } while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); z = 0; beta = 0; @@ -857,6 +858,7 @@ void ReadFile_beta(const bool flag_priorscale, const string &file_beta, af = 0; var_x = 0; for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } @@ -1055,7 +1057,7 @@ void ReadFile_cor(const string &file_cor, const vector<string> &vec_rs, // Header. HEADER header; - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); ReadHeader_vc(line, header); while (!safeGetline(infile, line).eof()) { @@ -1063,8 +1065,9 @@ void ReadFile_cor(const string &file_cor, const vector<string> &vec_rs, // Do not read cor values this time; upto col_n-1. d_pos1 = 0; d_cm1 = 0; - ch_ptr = strtok((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); for (size_t i = 0; i < header.coln - 1; i++) { + enforce(ch_ptr); if (header.rs_col != 0 && header.rs_col == i + 1) { rs = ch_ptr; } @@ -1932,7 +1935,7 @@ void VC::CalcVCacl(const gsl_matrix *K, const gsl_matrix *W, size_t n1 = K->size1, n2 = K->size2; size_t n_vc = n2 / n1; - double d, y2_sum, tau_inv, se_tau_inv; + double d, y2_sum, tau_inv; // New matrices/vectors. gsl_matrix *K_scale = gsl_matrix_alloc(n1, n2); @@ -2131,7 +2134,7 @@ void VC::CalcVCacl(const gsl_matrix *K, const gsl_matrix *W, // Compute variance for tau_inv. gsl_blas_dgemv(CblasNoTrans, 1.0, V_mat, y_scale, 0.0, n1_vec); gsl_blas_ddot(y_scale, n1_vec, &d); - se_tau_inv = sqrt(2 * d) / (double)n1; + // auto se_tau_inv = sqrt(2 * d) / (double)n1; UNUSED // Transform pve back to the original scale and save data. 
v_pve.clear(); @@ -2238,7 +2241,7 @@ bool BimbamXwz(const string &file_geno, const int display_pace, gsl_vector_mul(wz, w); for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); } @@ -2246,9 +2249,9 @@ bool BimbamXwz(const string &file_geno, const int display_pace, continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); geno_mean = 0.0; n_miss = 0; @@ -2260,7 +2263,7 @@ bool BimbamXwz(const string &file_geno, const int display_pace, if (indicator_idv[i] == 0) { continue; } - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if (strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(geno_miss, i, 0); n_miss++; @@ -2491,7 +2494,7 @@ bool BimbamXtXwz(const string &file_geno, const int display_pace, gsl_vector *geno_miss = gsl_vector_alloc(ni_test); for (size_t t = 0; t < indicator_snp.size(); ++t) { - !safeGetline(infile, line).eof(); + safeGetline(infile, line).eof(); if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { ProgressBar("Reading SNPs ", t, indicator_snp.size() - 1); } @@ -2499,9 +2502,9 @@ bool BimbamXtXwz(const string &file_geno, const int display_pace, continue; } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); geno_mean = 0.0; n_miss = 0; @@ -2513,7 +2516,7 @@ bool BimbamXtXwz(const string &file_geno, const int display_pace, if (indicator_idv[i] == 0) { continue; } - ch_ptr = strtok(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); if 
(strcmp(ch_ptr, "NA") == 0) { gsl_vector_set(geno_miss, i, 0); n_miss++; diff --git a/test/dev_test_suite.sh b/test/dev_test_suite.sh index 0fc4423..0d3d8a0 100755 --- a/test/dev_test_suite.sh +++ b/test/dev_test_suite.sh @@ -1,29 +1,31 @@ #!/usr/bin/env bash gemma=../bin/gemma +# gemmaopts="-debug -strict" +gemmaopts="-debug" # Related to https://github.com/genetics-statistics/GEMMA/issues/78 testBXDStandardRelatednessMatrixKSingularError() { outn=BXDerr rm -f output/$outn.* - $gemma -g ../example/BXD_geno.txt.gz \ + $gemma $gemmaopts \ + -g ../example/BXD_geno.txt.gz \ -p ../example/BXD_pheno.txt \ -c ../example/BXD_covariates.txt \ -a ../example/BXD_snps.txt \ -gk \ - -debug -o $outn + -o $outn assertEquals 22 $? # should show singular error } testBXDStandardRelatednessMatrixK() { outn=BXD rm -f output/$outn.* - $gemma -g ../example/BXD_geno.txt.gz \ + $gemma $gemmaopts -g ../example/BXD_geno.txt.gz \ -p ../example/BXD_pheno.txt \ -c ../example/BXD_covariates2.txt \ -a ../example/BXD_snps.txt \ -gk \ - -debug \ -o $outn assertEquals 0 $? outfn=output/$outn.cXX.txt @@ -31,28 +33,43 @@ testBXDStandardRelatednessMatrixK() { assertEquals "-116.11" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` } +testBXDLMLikelihoodRatio() { + outn=BXD_LM_LR + $gemma $gemmaopts -g ../example/BXD_geno.txt.gz \ + -p ../example/BXD_pheno.txt \ + -c ../example/BXD_covariates2.txt \ + -a ../example/BXD_snps.txt \ + -k ./output/BXD.cXX.txt \ + -lm 4 -maf 0.1 \ + -o $outn + assertEquals 0 $? 
+ + outfn=output/$outn.assoc.txt + assertEquals "95134" `wc -w < $outfn` + assertEquals "3089042886.28" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` +} + testBXDLMMLikelihoodRatio() { outn=BXD_LMM_LR - $gemma -g ../example/BXD_geno.txt.gz \ + $gemma $gemmaopts -g ../example/BXD_geno.txt.gz \ -p ../example/BXD_pheno.txt \ -c ../example/BXD_covariates2.txt \ -a ../example/BXD_snps.txt \ -k ./output/BXD.cXX.txt \ -lmm 2 -maf 0.1 \ - -debug \ -o $outn assertEquals 0 $? outfn=output/$outn.assoc.txt - assertEquals "80498" `wc -w < $outfn` + assertEquals "73180" `wc -w < $outfn` assertEquals "3088458212.93" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` } testCenteredRelatednessMatrixKLOCO1() { outn=mouse_hs1940_LOCO1 rm -f output/$outn.* - $gemma -g ../example/mouse_hs1940.geno.txt.gz -p ../example/mouse_hs1940.pheno.txt \ - -a ../example/mouse_hs1940.anno.txt -snps ../example/mouse_hs1940_snps.txt -nind 400 -loco 1 -gk -debug -o $outn + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz -p ../example/mouse_hs1940.pheno.txt \ + -a ../example/mouse_hs1940.anno.txt -snps ../example/mouse_hs1940_snps.txt -nind 400 -loco 1 -gk -o $outn assertEquals 0 $? grep "total computation time" < output/$outn.log.txt outfn=output/$outn.cXX.txt @@ -65,7 +82,7 @@ testCenteredRelatednessMatrixKLOCO1() { testUnivariateLinearMixedModelLOCO1() { outn=mouse_hs1940_CD8_LOCO1_lmm rm -f output/$outn.* - $gemma -g ../example/mouse_hs1940.geno.txt.gz \ + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ -p ../example/mouse_hs1940.pheno.txt \ -n 1 \ -loco 1 \ @@ -73,7 +90,47 @@ testUnivariateLinearMixedModelLOCO1() { -k ./output/mouse_hs1940_LOCO1.cXX.txt \ -snps ../example/mouse_hs1940_snps.txt -lmm \ -nind 400 \ - -debug \ + -o $outn + assertEquals 0 $? + grep "total computation time" < output/$outn.log.txt + assertEquals 0 $? 
+ outfn=output/$outn.assoc.txt + assertEquals "68" `wc -l < $outfn` + assertEquals "15465346.22" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` +} + +testPlinkCenteredRelatednessMatrixKLOCO1() { + return 0 + outn=mouse_hs1940_Plink_LOCO1 + rm -f output/$outn.* + $gemma $gemmaopts -bfile ../example/mouse_hs1940 \ + -a ../example/mouse_hs1940.anno.txt \ + -snps ../example/mouse_hs1940_snps.txt \ + -nind 400 \ + -loco 1 \ + -gk \ + -o $outn + assertEquals 0 $? + grep "total computation time" < output/$outn.log.txt + outfn=output/$outn.cXX.txt + assertEquals 0 $? + assertEquals "400" `wc -l < $outfn` + assertEquals "0.312" `head -c 5 $outfn` + assertEquals "71.03" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` +} + + +testPlinkUnivariateLinearMixedModelLOCO1() { + return 0 + outn=mouse_hs1940_CD8_Plink_LOCO1_lmm + rm -f output/$outn.* + $gemma $gemmaopts -bfile ../example/mouse_hs1940 \ + -n 1 \ + -loco 1 \ + -k ./output/mouse_hs1940_Plink_LOCO1.cXX.txt \ + -a ../example/mouse_hs1940.anno.txt \ + -snps ../example/mouse_hs1940_snps.txt -lmm \ + -nind 400 \ -o $outn assertEquals 0 $? 
grep "total computation time" < output/$outn.log.txt diff --git a/test/src/unittests-math.cpp b/test/src/unittests-math.cpp index ac4c180..757c2dc 100644 --- a/test/src/unittests-math.cpp +++ b/test/src/unittests-math.cpp @@ -1,14 +1,23 @@ #include <catch.hpp> #include <iostream> #include "gsl/gsl_matrix.h" -#include "mathfunc.h" +#include <cblas.h> + #include <algorithm> #include <limits> #include <numeric> +#include "debug.h" +#include "mathfunc.h" +#include "fastblas.h" +#include "fastopenblas.h" + using namespace std; TEST_CASE( "Math functions", "[math]" ) { + debug_set_debug_mode(true); + debug_set_no_check_mode(false); + debug_set_strict_mode(true); double data[] = { 2,-1, 0, -1, 2,-1, 0,-1, 2}; @@ -51,3 +60,109 @@ TEST_CASE( "Math functions", "[math]" ) { REQUIRE (std::isnan(v3[2])); REQUIRE(has_nan(v3)); } + +TEST_CASE("cblas_dgemm", "[math]") { + double *A, *B, *C; + int m, n, k, i, j; + double alpha, beta; + + printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n" + " Intel(R) MKL function dgemm, where A, B, and C are matrices and \n" + " alpha and beta are double precision scalars\n\n"); + + m = 2000, k = 200, n = 1000; + printf (" Initializing data for matrix multiplication C=A*B for matrix \n" + " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n); + alpha = 1.0; beta = 0.0; + + printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n" + " performance \n\n"); + A = (double *)malloc( m*k*sizeof( double )); + B = (double *)malloc( k*n*sizeof( double )); + C = (double *)malloc( m*n*sizeof( double )); + + printf (" Intializing matrix data \n\n"); + for (i = 0; i < (m*k); i++) { + A[i] = (double)(i+1); + } + + for (i = 0; i < (k*n); i++) { + B[i] = (double)(-i-1); + } + + for (i = 0; i < (m*n); i++) { + C[i] = 0.0; + } + + printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n"); + assert(m==2000); + assert(k==200); + assert(n==1000); + //cblas_dgemm(CblasRowMajor, 
CblasNoTrans, CblasNoTrans, + // m, n, k, alpha, A, k, B, n, beta, C, n); + fast_cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + m, n, k, alpha, A, k, B, n, beta, C, n); + + REQUIRE(trunc(C[0]) == -2666620100.0 ); + REQUIRE(trunc(C[1]) == -2666640200.0 ); + REQUIRE(trunc(C[2003]) == -10627000400.0 ); + +} + +TEST_CASE("fast_dgemm", "[math]") { + double *A, *B, *C; + int m, n, k, i, j; + double alpha, beta; + + printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n" + " Intel(R) MKL function dgemm, where A, B, and C are matrices and \n" + " alpha and beta are double precision scalars\n\n"); + + m = 2000, k = 200, n = 1000; + printf (" Initializing data for matrix multiplication C=A*B for matrix \n" + " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n); + alpha = 1.0; beta = 0.0; + + printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n" + " performance \n\n"); + A = (double *)malloc( m*k*sizeof( double )); + B = (double *)malloc( k*n*sizeof( double )); + C = (double *)malloc( m*n*sizeof( double )); + + printf (" Intializing matrix data \n\n"); + for (i = 0; i < (m*k); i++) { + A[i] = (double)(i+1); + } + + for (i = 0; i < (k*n); i++) { + B[i] = (double)(-i-1); + } + + for (i = 0; i < (m*n); i++) { + C[i] = 0.0; + } + + printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n"); + // cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + // m, n, k, alpha, A, k, B, n, beta, C, n); + // eigenlib_dgemm(const char *TransA, const char *TransB, const double alpha, + // const gsl_matrix *A, const gsl_matrix *B, const double beta, + // gsl_matrix *C) { + gsl_matrix *AM = gsl_matrix_safe_alloc(m,k); // rows x cols + gsl_matrix *BM = gsl_matrix_safe_alloc(k,n); + gsl_matrix *CM = gsl_matrix_calloc(m,n); + + fast_copy(AM,A); + fast_copy(BM,B); + fast_copy(CM,C); + fast_dgemm("N","N",alpha,AM,BM,beta,CM); + printf ("\n Computations completed.\n\n"); + A = AM->data; + B = BM->data; + C = 
CM->data; + + REQUIRE(trunc(C[0]) == -2666620100.0 ); + REQUIRE(trunc(C[1]) == -2666640200.0 ); + REQUIRE(trunc(C[2003]) == -10627000400.0 ); + +} diff --git a/test/test_suite.sh b/test/test_suite.sh index 350fc27..7af33aa 100755 --- a/test/test_suite.sh +++ b/test/test_suite.sh @@ -1,13 +1,89 @@ #!/usr/bin/env bash gemma=../bin/gemma +gemmaopts="-debug" + +testBslmm1() { + outn=mouse_hs1940_CD8_bslmm + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ + -p ../example/mouse_hs1940.pheno.txt \ + -n 2 -a ../example/mouse_hs1940.anno.txt \ + -bslmm \ + -o $outn -w 1000 -s 10000 -seed 1 + assertEquals 0 $? + outfn1=output/$outn.hyp.txt + outfn2=output/$outn.param.txt + # assertEquals "45181" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.0f",(substr($x,,0,6))) } END { printf "%.0f",$sum }' $outfn1` + # assertEquals "4043967139.42" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn2` +} + +testBslmm2() { + outn=mouse_hs1940_CD8_train + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ + -p ../example/mouse_hs1940.pheno.txt \ + -n 2 \ + -a ../example/mouse_hs1940.anno.txt \ + -gk 1 -o $outn + assertEquals 0 $? + outfn=output/$outn.cXX.txt + assertEquals "579.66" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` +} + +testBslmm3() { + ## Fit a binary trait using a linear model + outn=mouse_hs1940_CD8_bslmm_cc1 + $gemma $gemmaopts \ + -g ../example/mouse_hs1940.geno.txt.gz \ + -p ../example/mouse_hs1940.pheno.txt \ + -n 4 \ + -a ../example/mouse_hs1940.anno.txt \ + -bslmm \ + -o $outn \ + -w 1000 -s 10000 -seed 1 + assertEquals 0 $? 
+ outfn=output/$outn.hyp.txt + # assertEquals "291" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.0f",(substr($x,,0,6))) } END { printf "%.0f",100*$sum }' $outfn` +} + +testBslmm4() { + outn=mouse_hs1940_CD8_prdt_k + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ + -p ../example/mouse_hs1940.pheno.txt \ + -n 2 \ + -epm ./output/mouse_hs1940_CD8_bslmm.param.txt \ + -emu ./output/mouse_hs1940_CD8_bslmm.log.txt \ + -ebv ./output/mouse_hs1940_CD8_bslmm.bv.txt \ + -k ./output/mouse_hs1940_CD8_train.cXX.txt \ + -predict \ + -o $outn + assertEquals 0 $? + outfn=output/$outn.prdt.txt + # assertEquals "-60.33" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` +} + +testBslmm5() { + ## Now, do prediction in the test set for the binary traits + ## If the traits were fitted using the linear model, then: + outn=mouse_hs1940_CD8_prdt_cc1 + $gemma $gemmaopts \ + -g ../example/mouse_hs1940.geno.txt.gz \ + -p ../example/mouse_hs1940.pheno.txt \ + -n 4 \ + -epm ./output/mouse_hs1940_CD8_bslmm_cc1.param.txt \ + -emu ./output/mouse_hs1940_CD8_bslmm_cc1.log.txt \ + -predict \ + -o $outn + assertEquals 0 $? + outfn=output/$outn.prdt.txt + assertEquals "550.67" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` +} testCenteredRelatednessMatrixKFullLOCO1() { outn=mouse_hs1940_full_LOCO1 - $gemma -g ../example/mouse_hs1940.geno.txt.gz \ + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ -p ../example/mouse_hs1940.pheno.txt \ -a ../example/mouse_hs1940.anno.txt \ - -loco 1 -gk -debug -o $outn + -loco 1 -gk -o $outn assertEquals 0 $? 
outfn=output/$outn.cXX.txt assertEquals "1940" `wc -l < $outfn` @@ -16,14 +92,13 @@ testCenteredRelatednessMatrixKFullLOCO1() { testUnivariateLinearMixedModelFullLOCO1() { outn=mouse_hs1940_CD8_full_LOCO1_lmm - $gemma -g ../example/mouse_hs1940.geno.txt.gz \ + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ -p ../example/mouse_hs1940.pheno.txt \ -n 1 \ -loco 1 \ -a ../example/mouse_hs1940.anno.txt \ -k ./output/mouse_hs1940_full_LOCO1.cXX.txt \ -lmm \ - -debug \ -o $outn assertEquals 0 $? grep "total computation time" < output/$outn.log.txt @@ -34,9 +109,9 @@ testUnivariateLinearMixedModelFullLOCO1() { } testCenteredRelatednessMatrixK() { - $gemma -g ../example/mouse_hs1940.geno.txt.gz \ + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ -p ../example/mouse_hs1940.pheno.txt \ - -gk -o mouse_hs1940 -debug + -gk -o mouse_hs1940 assertEquals 0 $? outfn=output/mouse_hs1940.cXX.txt assertEquals "1940" `wc -l < $outfn` @@ -46,14 +121,13 @@ testCenteredRelatednessMatrixK() { } testUnivariateLinearMixedModel() { - $gemma -g ../example/mouse_hs1940.geno.txt.gz \ + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ -p ../example/mouse_hs1940.pheno.txt \ -n 1 \ -a ../example/mouse_hs1940.anno.txt \ -k ./output/mouse_hs1940.cXX.txt \ -lmm \ - -o mouse_hs1940_CD8_lmm \ - -debug + -o mouse_hs1940_CD8_lmm assertEquals 0 $? grep "total computation time" < output/mouse_hs1940_CD8_lmm.log.txt assertEquals 0 $? 
@@ -62,14 +136,13 @@ testUnivariateLinearMixedModel() { assertEquals "4038540440.86" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` } -testMultivariateLinearMixedModel() { - $gemma -g ../example/mouse_hs1940.geno.txt.gz \ +testLinearMixedModelPhenotypes() { + $gemma $gemmaopts -g ../example/mouse_hs1940.geno.txt.gz \ -p ../example/mouse_hs1940.pheno.txt \ -n 1 6 \ -a ../example/mouse_hs1940.anno.txt \ -k ./output/mouse_hs1940.cXX.txt \ - -lmm -o mouse_hs1940_CD8MCH_lmm \ - -debug + -lmm -o mouse_hs1940_CD8MCH_lmm assertEquals 0 $? outfn=output/mouse_hs1940_CD8MCH_lmm.assoc.txt @@ -82,9 +155,8 @@ testPlinkStandardRelatednessMatrixK() { datadir=../example outfn=output/$testname.sXX.txt rm -f $outfn - $gemma -bfile $datadir/HLC \ - -gk 2 -o $testname \ - -debug + $gemma $gemmaopts -bfile $datadir/HLC \ + -gk 2 -o $testname assertEquals 0 $? assertEquals "427" `wc -l < $outfn` assertEquals "-358.07" `perl -nle 'foreach $x (split(/\s+/,$_)) { $sum += sprintf("%.2f",(substr($x,,0,6))) } END { printf "%.2f",$sum }' $outfn` @@ -92,15 +164,14 @@ testPlinkStandardRelatednessMatrixK() { # Test for https://github.com/genetics-statistics/GEMMA/issues/58 # fixed GSLv2 NaN's that appeared with covariates. -testPlinkMultivariateLinearMixedModel() { - testname=testPlinkMultivariateLinearMixedModel +testPlinkLinearMixedModelCovariates() { + testname=testPlinkLinearMixedModelCovariates datadir=../example - $gemma -bfile $datadir/HLC \ + $gemma $gemmaopts -bfile $datadir/HLC \ -k output/testPlinkStandardRelatednessMatrixK.sXX.txt \ -lmm 1 \ -maf 0.1 \ -c $datadir/HLC_covariates.txt \ - -debug \ -o $testname assertEquals 0 $? outfn=output/$testname.assoc.txt |