From 1b4bdcfabe6d618548114fecba93c8188fb4a7dc Mon Sep 17 00:00:00 2001 From: DannyArends Date: Wed, 28 Feb 2018 15:38:31 +0100 Subject: Windows script to generate src/version.h --- scripts/gen_version_info.cmd | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 scripts/gen_version_info.cmd diff --git a/scripts/gen_version_info.cmd b/scripts/gen_version_info.cmd new file mode 100644 index 0000000..d824687 --- /dev/null +++ b/scripts/gen_version_info.cmd @@ -0,0 +1,16 @@ +@echo off +rem https://stackoverflow.com/questions/3472631/how-do-i-get-the-day-month-and-year-from-a-windows-cmd-exe-script +FOR /F "skip=1 tokens=1-6" %%A IN ('WMIC Path Win32_LocalTime Get Day^,Hour^,Minute^,Month^,Second^,Year /Format:table') DO ( + if "%%B" NEQ "" ( + SET /A FDATE=%%F*10000+%%D*100+%%A + ) +) +set year=%FDATE:~0,4% +set /p version= src/version.h + $(VGEN) > src/version.h $(OUTPUT): $(OBJS) $(CPP) $(CPPFLAGS) $(OBJS) $(LIBS) -o $(OUTPUT) -- cgit v1.2.3 From 2fc39eb39792384f797bc29002a0a6b3ef2d50d3 Mon Sep 17 00:00:00 2001 From: DannyArends Date: Wed, 28 Feb 2018 16:07:43 +0100 Subject: Windows compilation paths, and restructuring the -isystem --- Makefile | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 23b834f..a1b9b76 100644 --- a/Makefile +++ b/Makefile @@ -65,8 +65,20 @@ OPENBLAS_LEGACY = # Using older OpenBlas FORCE_STATIC = # Static linking of libraries GCC_FLAGS = -Wall -O3 -std=gnu++11 # extra flags -Wl,--allow-multiple-definition TRAVIS_CI = # used by TRAVIS for testing -EIGEN_INCLUDE_PATH = /usr/include/eigen3 -OPENBLAS_INCLUDE_PATH = /usr/local/opt/openblas/include + +GSL_INCLUDE_PATH = +ifeq ($(SYS), WIN) + GSL_INCLUDE_PATH = -Ic:/GnuWin32/include + EIGEN_INCLUDE_PATH = -I../eigen-git-mirror + OPENBLAS_INCLUDE_PATH = -I../OpenBLAS-v0.2.19-Win64-int32/include +else + OPENBLAS_INCLUDE_PATH = -isystem//usr/local/opt/openblas/include + ifeq ($(SYS), MAC) + EIGEN_INCLUDE_PATH = -isystem//usr/local/include/eigen3 + else + EIGEN_INCLUDE_PATH = -isystem//usr/include/eigen3 + endif +endif # -------------------------------------------------------------------- # Edit below this line with caution @@ -85,13 +97,13 @@ endif ifeq ($(CPP), clang++) # macOS Homebrew settings (as used on Travis-CI) - GCC_FLAGS=-O3 -std=c++11 -stdlib=libc++ -isystem/$(OPENBLAS_INCLUDE_PATH) -isystem//usr/local/include/eigen3 -Wl,-L/usr/local/opt/openblas/lib + GCC_FLAGS=-O3 -std=c++11 -stdlib=libc++ $(OPENBLAS_INCLUDE_PATH) $(EIGEN_INCLUDE_PATH) -Wl,-L/usr/local/opt/openblas/lib endif ifdef WITH_OPENBLAS OPENBLAS=1 # WITH_LAPACK = # OPENBLAS usually includes LAPACK - CPPFLAGS += -DOPENBLAS -isystem/$(OPENBLAS_INCLUDE_PATH) + CPPFLAGS += -DOPENBLAS $(OPENBLAS_INCLUDE_PATH) ifdef OPENBLAS_LEGACY # Legacy version (mostly for Travis-CI) CPPFLAGS += -DOPENBLAS_LEGACY @@ -99,10 +111,14 @@ ifdef WITH_OPENBLAS endif ifdef DEBUG - CPPFLAGS += -g $(GCC_FLAGS) -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc + CPPFLAGS += -g $(GCC_FLAGS) $(GSL_INCLUDE_PATH) $(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc else # release mode - CPPFLAGS += -DNDEBUG $(GCC_FLAGS) -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc + CPPFLAGS += -DNDEBUG $(GCC_FLAGS) $(GSL_INCLUDE_PATH) $(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc +endif + +ifeq ($(SYS), WIN) + CPPFLAGS += -Duint="unsigned int" -D__CRT__NO_INLINE -D__STRING="__STRINGIFY" endif ifdef SHOW_COMPILER_WARNINGS -- cgit v1.2.3 From 421355faac3b1aae1fb165fd99ff8f393d3c492a Mon Sep 17 00:00:00 2001 From: DannyArends Date: Wed, 28 Feb 2018 16:22:03 +0100 Subject: io.h is a std lib function for MinGW64, renamed it to gemma_io.h --- src/gemma_io.cpp | 4080 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/gemma_io.h | 187 +++ src/io.cpp | 4080 ------------------------------------------------------ src/io.h | 187 --- 4 files changed, 4267 insertions(+), 4267 deletions(-) create mode 100644 src/gemma_io.cpp create mode 100644 src/gemma_io.h delete mode 100644 src/io.cpp delete mode 100644 src/io.h diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp new file mode 100644 index 0000000..d20b473 --- /dev/null +++ b/src/gemma_io.cpp @@ -0,0 +1,4080 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gsl/gsl_blas.h" +#include "gsl/gsl_cdf.h" +#include "gsl/gsl_linalg.h" +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_vector.h" + +#include "debug.h" +// #include "eigenlib.h" +#include "fastblas.h" +#include "gzstream.h" +#include "io.h" +#include "lapack.h" +#include "mathfunc.h" + +using namespace std; + +// Print progress bar. +void ProgressBar(string str, double p, double total, double ratio) { + assert(p<=total); + assert(p>=0); + if (total <= 0.0) return; + const double progress = (100.0 * p / total); + const uint barsize = (int)(progress / 2.0); // characters + // cout << barsize << endl; + // cout << str << " "; + // cout << p << "/" << total << endl; + assert(barsize < 101); // corrupted data somehow + if (barsize > 0) { + cout << std::string(barsize,'='); + } + cout << std::string(50-barsize,' '); + cout << setprecision(0) << fixed << " " << progress << "%"; + if (ratio != -1.0) + cout << setprecision(2) << " " << ratio; + cout << "\r" << flush; +} + +bool isBlankLine(char const *line) { + for (char const *cp = line; *cp; ++cp) { + if (!isspace(*cp)) + return false; + } + return true; +} + +bool isBlankLine(std::string const &line) { return isBlankLine(line.c_str()); } + +// In case files are ended with "\r" or "\r\n". +std::istream &safeGetline(std::istream &is, std::string &t) { + t.clear(); + + // The characters in the stream are read one-by-one using a + // std::streambuf. That is faster than reading them one-by-one + // using the std::istream. Code that uses streambuf this way must + // be guarded by a sentry object. The sentry object performs + // various tasks, such as thread synchronization and updating the + // stream state. + std::istream::sentry se(is, true); + std::streambuf *sb = is.rdbuf(); + + for (;;) { + int c = sb->sbumpc(); + switch (c) { + case '\n': + return is; + case '\r': + if (sb->sgetc() == '\n') + sb->sbumpc(); + return is; + case EOF: + + // Also handle the case when the last line has no line + // ending. + if (t.empty()) + is.setstate(std::ios::eofbit); + return is; + default: + t += (char)c; + } + } +} + +// Read SNP file. A single column of SNP names. +bool ReadFile_snps(const string file_snps, set &setSnps) { + debug_msg("entered"); + setSnps.clear(); + + igzstream infile(file_snps.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open snps file: " << file_snps << endl; + return false; + } + + string line; + char *ch_ptr; + + while (getline(infile, line)) { + ch_ptr = strtok((char *)line.c_str(), " , \t"); + enforce_msg(ch_ptr,"Problem reading SNP file"); + setSnps.insert(ch_ptr); + } + + infile.close(); + infile.clear(); + + return true; +} + +// Read SNP file using a header. The header determines how the +// values for each row are parsed. A valid header can be, for example, +// RS POS CHR +bool ReadFile_snps_header(const string &file_snps, set &setSnps) { + debug_msg("entered"); + setSnps.clear(); + + igzstream infile(file_snps.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open snps file: " << file_snps << endl; + return false; + } + + string line, rs, chr, pos; + char *ch_ptr; + + // Read header. + HEADER header; + safeGetline(infile, line).eof(); + ReadHeader_io(line, header); + + if (header.rs_col == 0 && (header.chr_col == 0 || header.pos_col == 0)) { + cout << "missing rs id in the header" << endl; + } + + while (!safeGetline(infile, line).eof()) { + if (isBlankLine(line)) { + continue; + } + ch_ptr = strtok((char *)line.c_str(), " , \t"); + enforce_msg(ch_ptr,"Problem reading SNP header"); + + for (size_t i = 0; i < header.coln; i++) { + enforce_msg(ch_ptr,"Problem reading SNP file"); + if (header.rs_col != 0 && header.rs_col == i + 1) { + rs = ch_ptr; + } + if (header.chr_col != 0 && header.chr_col == i + 1) { + chr = ch_ptr; + } + if (header.pos_col != 0 && header.pos_col == i + 1) { + pos = ch_ptr; + } + + ch_ptr = strtok(NULL, " , \t"); + } + + if (header.rs_col == 0) { + rs = chr + ":" + pos; + } + + setSnps.insert(rs); + } + + infile.close(); + infile.clear(); + + return true; +} + +// Read log file. +bool ReadFile_log(const string &file_log, double &pheno_mean) { + debug_msg("ReadFile_log"); + ifstream infile(file_log.c_str(), ifstream::in); + if (!infile) { + cout << "error! fail to open log file: " << file_log << endl; + return false; + } + + string line; + char *ch_ptr; + size_t flag = 0; + + while (getline(infile, line)) { + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok(NULL, " , \t"); + + if (ch_ptr != NULL && strcmp(ch_ptr, "estimated") == 0) { + ch_ptr = strtok(NULL, " , \t"); + if (ch_ptr != NULL && strcmp(ch_ptr, "mean") == 0) { + ch_ptr = strtok(NULL, " , \t"); + if (ch_ptr != NULL && strcmp(ch_ptr, "=") == 0) { + ch_ptr = strtok_safe(NULL, " , \t"); + pheno_mean = atof(ch_ptr); + flag = 1; + } + } + } + + if (flag == 1) { + break; + } + } + + infile.close(); + infile.clear(); + + return true; +} + +// Read bimbam annotation file which consists of rows of SNP, POS and CHR +bool ReadFile_anno(const string &file_anno, map &mapRS2chr, + map &mapRS2bp, + map &mapRS2cM) { + debug_msg("ReadFile_anno"); + mapRS2chr.clear(); + mapRS2bp.clear(); + + ifstream infile(file_anno.c_str(), ifstream::in); + if (!infile) { + cout << "error opening annotation file: " << file_anno << endl; + return false; + } + + string line; + + while (!safeGetline(infile, line).eof()) { + const char *ch_ptr = strtok((char *)line.c_str(), " , \t"); + enforce_str(ch_ptr, line + " Bad RS format"); + const string rs = ch_ptr; + enforce_str(rs != "", line + " Bad RS format"); + + ch_ptr = strtok(NULL, " , \t"); + enforce_str(ch_ptr, line + " Bad format"); + long b_pos; + if (strcmp(ch_ptr, "NA") == 0) { + b_pos = -9; + } else { + b_pos = atol(ch_ptr); + } + enforce_str(b_pos,line + " Bad pos format (is zero)"); + + string chr; + ch_ptr = strtok(NULL, " , \t"); + if (ch_ptr == NULL || strcmp(ch_ptr, "NA") == 0) { + chr = "-9"; + } else { + chr = ch_ptr; + enforce_str(chr != "", line + " Bad chr format"); + } + + double cM; + ch_ptr = strtok(NULL, " , \t"); + if (ch_ptr == NULL || strcmp(ch_ptr, "NA") == 0) { + cM = -9; + } else { + cM = atof(ch_ptr); + enforce_str(b_pos, line + "Bad cM format (is zero)"); + } + + mapRS2chr[rs] = chr; + mapRS2bp[rs] = b_pos; + mapRS2cM[rs] = cM; + } + + infile.close(); + infile.clear(); + + return true; +} + +// Read 1 column of phenotype. +bool ReadFile_column(const string &file_pheno, vector &indicator_idv, + vector &pheno, const int &p_column) { + debug_msg("entered"); + indicator_idv.clear(); + pheno.clear(); + + igzstream infile(file_pheno.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open phenotype file: " << file_pheno << endl; + return false; + } + + string line; + char *ch_ptr; + + string id; + double p; + while (!safeGetline(infile, line).eof()) { + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + for (int i = 0; i < (p_column - 1); ++i) { + ch_ptr = strtok(NULL, " , \t"); + } + enforce_msg(ch_ptr,"Problem reading PHENO column"); + if (strcmp(ch_ptr, "NA") == 0) { + indicator_idv.push_back(0); + pheno.push_back(-9); + } else { + // Pheno is different from pimass2. + p = atof(ch_ptr); + indicator_idv.push_back(1); + pheno.push_back(p); + } + } + + infile.close(); + infile.clear(); + + return true; +} + +// Read bimbam phenotype file, p_column=1, 2,... +bool ReadFile_pheno(const string &file_pheno, + vector> &indicator_pheno, + vector> &pheno, + const vector &p_column) { + debug_msg("entered"); + indicator_pheno.clear(); + pheno.clear(); + + igzstream infile(file_pheno.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open phenotype file: " << file_pheno << endl; + return false; + } + + string line; + char *ch_ptr; + + string id; + double p; + + vector pheno_row; + vector ind_pheno_row; + + size_t p_max = *max_element(p_column.begin(), p_column.end()); + map mapP2c; + for (size_t i = 0; i < p_column.size(); i++) { + mapP2c[p_column[i]] = i; + pheno_row.push_back(-9); + ind_pheno_row.push_back(0); + } + + while (!safeGetline(infile, line).eof()) { + ch_ptr = strtok((char *)line.c_str(), " , \t"); + size_t i = 0; + while (i < p_max) { + enforce_msg(ch_ptr,"Number of phenotypes out of range"); + if (mapP2c.count(i + 1) != 0) { + if (strcmp(ch_ptr, "NA") == 0) { + ind_pheno_row[mapP2c[i + 1]] = 0; + pheno_row[mapP2c[i + 1]] = -9; + } else { + p = atof(ch_ptr); + ind_pheno_row[mapP2c[i + 1]] = 1; + pheno_row[mapP2c[i + 1]] = p; + } + } + i++; + ch_ptr = strtok(NULL, " , \t"); + } + + indicator_pheno.push_back(ind_pheno_row); + pheno.push_back(pheno_row); + } + + infile.close(); + infile.clear(); + + return true; +} + +bool ReadFile_cvt(const string &file_cvt, vector &indicator_cvt, + vector> &cvt, size_t &n_cvt) { + debug_msg("entered"); + indicator_cvt.clear(); + + ifstream infile(file_cvt.c_str(), ifstream::in); + if (!infile) { + cout << "error! fail to open covariates file: " << file_cvt << endl; + return false; + } + + string line; + char *ch_ptr; + double d; + + int flag_na = 0; + + while (!safeGetline(infile, line).eof()) { + vector v_d; + flag_na = 0; + ch_ptr = strtok((char *)line.c_str(), " , \t"); + while (ch_ptr != NULL) { + if (strcmp(ch_ptr, "NA") == 0) { + flag_na = 1; + d = -9; + } else { + d = atof(ch_ptr); + } + + v_d.push_back(d); + ch_ptr = strtok(NULL, " , \t"); + } + if (flag_na == 0) { + indicator_cvt.push_back(1); + } else { + indicator_cvt.push_back(0); + } + cvt.push_back(v_d); + } + + if (indicator_cvt.empty()) { + n_cvt = 0; + } else { + flag_na = 0; + for (vector::size_type i = 0; i < indicator_cvt.size(); ++i) { + if (indicator_cvt[i] == 0) { + continue; + } + + if (flag_na == 0) { + flag_na = 1; + n_cvt = cvt[i].size(); + } + if (flag_na != 0 && n_cvt != cvt[i].size()) { + cout << "error! number of covariates in row " << i + << " do not match other rows." << endl; + return false; + } + } + } + + infile.close(); + infile.clear(); + + return true; +} + +// Read .bim file. +bool ReadFile_bim(const string &file_bim, vector &snpInfo) { + debug_msg("entered"); + snpInfo.clear(); + + ifstream infile(file_bim.c_str(), ifstream::in); + if (!infile) { + cout << "error opening .bim file: " << file_bim << endl; + return false; + } + + string line; + char *ch_ptr; + + string rs; + long int b_pos; + string chr; + double cM; + string major; + string minor; + + while (getline(infile, line)) { + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); + chr = ch_ptr; + ch_ptr = strtok_safe(NULL, " \t"); + rs = ch_ptr; + ch_ptr = strtok_safe(NULL, " \t"); + cM = atof(ch_ptr); + ch_ptr = strtok_safe(NULL, " \t"); + b_pos = atol(ch_ptr); + ch_ptr = strtok_safe(NULL, " \t"); + minor = ch_ptr; + ch_ptr = strtok_safe(NULL, " \t"); + major = ch_ptr; + + SNPINFO sInfo = {chr, rs, cM, b_pos, minor, major, 0, -9, -9, 0, 0, 0}; + snpInfo.push_back(sInfo); + } + + infile.close(); + infile.clear(); + return true; +} + +// Read .fam file (ignored with -p phenotypes switch) +bool ReadFile_fam(const string &file_fam, vector> &indicator_pheno, + vector> &pheno, map &mapID2num, + const vector &p_column) { + debug_msg("entered"); + indicator_pheno.clear(); + pheno.clear(); + mapID2num.clear(); + + igzstream infile(file_fam.c_str(), igzstream::in); + if (!infile) { + cout << "error opening .fam file: " << file_fam << endl; + return false; + } + + string line; + char *ch_ptr; + + string id; + int c = 0; + double p; + + vector pheno_row; + vector ind_pheno_row; + + size_t p_max = *max_element(p_column.begin(), p_column.end()); + map mapP2c; + for (size_t i = 0; i < p_column.size(); i++) { + mapP2c[p_column[i]] = i; + pheno_row.push_back(-9); + ind_pheno_row.push_back(0); + } + + while (!safeGetline(infile, line).eof()) { + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); + ch_ptr = strtok_safe(NULL, " \t"); + id = ch_ptr; + ch_ptr = strtok_safe(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); + ch_ptr = strtok_safe(NULL, " \t"); + ch_ptr = strtok(NULL, " \t"); + + size_t i = 0; + while (i < p_max) { + if (mapP2c.count(i + 1) != 0) { + enforce_msg(ch_ptr,"Problem reading FAM file (phenotypes out of range)"); + + if (strcmp(ch_ptr, "NA") == 0) { + ind_pheno_row[mapP2c[i + 1]] = 0; + pheno_row[mapP2c[i + 1]] = -9; + } else { + p = atof(ch_ptr); + + if (p == -9) { + ind_pheno_row[mapP2c[i + 1]] = 0; + pheno_row[mapP2c[i + 1]] = -9; + } else { + ind_pheno_row[mapP2c[i + 1]] = 1; + pheno_row[mapP2c[i + 1]] = p; + } + } + } + i++; + ch_ptr = strtok(NULL, " , \t"); + } + + indicator_pheno.push_back(ind_pheno_row); + pheno.push_back(pheno_row); + + mapID2num[id] = c; + c++; + } + + infile.close(); + infile.clear(); + return true; +} + +// Read bimbam mean genotype file, the first time, to obtain #SNPs for +// analysis (ns_test) and total #SNP (ns_total). +bool ReadFile_geno(const string &file_geno, const set &setSnps, + const gsl_matrix *W, vector &indicator_idv, + vector &indicator_snp, const double &maf_level, + const double &miss_level, const double &hwe_level, + const double &r2_level, map &mapRS2chr, + map &mapRS2bp, + map &mapRS2cM, vector &snpInfo, + size_t &ns_test) { + debug_msg("entered"); + indicator_snp.clear(); + snpInfo.clear(); + + igzstream infile(file_geno.c_str(), igzstream::in); + if (!infile) { + cout << "error reading genotype file:" << file_geno << endl; + return false; + } + + gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); + gsl_permutation *pmt = gsl_permutation_alloc(W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp(WtW, pmt, &sig); + + LUInvert(WtW, pmt, WtWi); // @@ + + double v_x, v_w; + int c_idv = 0; + + string line; + char *ch_ptr; + + string rs; + long int b_pos; + string chr; + string major; + string minor; + double cM; + size_t file_pos; + + double maf, geno, geno_old; + size_t n_miss; + size_t n_0, n_1, n_2; + int flag_poly; + + int ni_total = indicator_idv.size(); + int ni_test = 0; + for (int i = 0; i < ni_total; ++i) { + ni_test += indicator_idv[i]; + } + ns_test = 0; + + file_pos = 0; + auto count_warnings = 0; + while (!safeGetline(infile, line).eof()) { + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + rs = ch_ptr; + ch_ptr = strtok_safe(NULL, " , \t"); + minor = ch_ptr; + ch_ptr = strtok_safe(NULL, " , \t"); + major = ch_ptr; + + if (setSnps.size() != 0 && setSnps.count(rs) == 0) { + // if SNP in geno but not in -snps we add an missing value + SNPINFO sInfo = {"-9", rs, -9, -9, minor, major, + 0, -9, -9, 0, 0, file_pos}; + snpInfo.push_back(sInfo); + indicator_snp.push_back(0); + + file_pos++; + continue; + } + + if (mapRS2bp.count(rs) == 0) { + if (is_debug_mode() && count_warnings++ < 10) { + std::string msg = "Can't figure out position for "; + msg += rs; + debug_msg(msg); + if (count_warnings == 10) + debug_msg("Skipping similar warnings"); + } + chr = "-9"; + b_pos = -9; + cM = -9; + } else { + b_pos = mapRS2bp[rs]; + chr = mapRS2chr[rs]; + cM = mapRS2cM[rs]; + } + + maf = 0; + n_miss = 0; + flag_poly = 0; + geno_old = -9; + n_0 = 0; + n_1 = 0; + n_2 = 0; + c_idv = 0; + gsl_vector_set_zero(genotype_miss); + for (int i = 0; i < ni_total; ++i) { + ch_ptr = strtok_safe(NULL, " , \t"); + if (indicator_idv[i] == 0) + continue; + + if (strcmp(ch_ptr, "NA") == 0) { + gsl_vector_set(genotype_miss, c_idv, 1); + n_miss++; + c_idv++; + continue; + } + + geno = atof(ch_ptr); + if (geno >= 0 && geno <= 0.5) { + n_0++; + } + if (geno > 0.5 && geno < 1.5) { + n_1++; + } + if (geno >= 1.5 && geno <= 2.0) { + n_2++; + } + + gsl_vector_set(genotype, c_idv, geno); + + if (flag_poly == 0) { + geno_old = geno; + flag_poly = 2; + } + if (flag_poly == 2 && geno != geno_old) { + flag_poly = 1; + } + + maf += geno; + + c_idv++; + } + maf /= 2.0 * (double)(ni_test - n_miss); + + SNPINFO sInfo = {chr, rs, + cM, b_pos, + minor, major, + n_miss, (double)n_miss / (double)ni_test, + maf, ni_test - n_miss, + 0, file_pos}; + snpInfo.push_back(sInfo); + file_pos++; + + if ((double)n_miss / (double)ni_test > miss_level) { + indicator_snp.push_back(0); + continue; + } + + if ((maf < maf_level || maf > (1.0 - maf_level)) && maf_level != -1) { + indicator_snp.push_back(0); + continue; + } + + if (flag_poly != 1) { + indicator_snp.push_back(0); + continue; + } + + if (hwe_level != 0 && maf_level != -1) { + if (CalcHWE(n_0, n_2, n_1) < hwe_level) { + indicator_snp.push_back(0); + continue; + } + } + + // Filter SNP if it is correlated with W unless W has + // only one column, of 1s. + for (size_t i = 0; i < genotype->size; ++i) { + if (gsl_vector_get(genotype_miss, i) == 1) { + geno = maf * 2.0; + gsl_vector_set(genotype, i, geno); + } + } + + gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot(genotype, genotype, &v_x); + gsl_blas_ddot(Wtx, WtWiWtx, &v_w); + + if (W->size2 != 1 && v_w / v_x >= r2_level) { + indicator_snp.push_back(0); + continue; + } + + indicator_snp.push_back(1); + ns_test++; + } + + gsl_vector_free(genotype); + gsl_vector_free(genotype_miss); + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(Wtx); + gsl_vector_free(WtWiWtx); + gsl_permutation_free(pmt); + + infile.close(); + infile.clear(); + + return true; +} + +// Read bed file, the first time. +bool ReadFile_bed(const string &file_bed, const set &setSnps, + const gsl_matrix *W, vector &indicator_idv, + vector &indicator_snp, vector &snpInfo, + const double &maf_level, const double &miss_level, + const double &hwe_level, const double &r2_level, + size_t &ns_test) { + debug_msg("entered"); + indicator_snp.clear(); + size_t ns_total = snpInfo.size(); + + ifstream infile(file_bed.c_str(), ios::binary); + if (!infile) { + cout << "error reading bed file:" << file_bed << endl; + return false; + } + + gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); + gsl_permutation *pmt = gsl_permutation_alloc(W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp(WtW, pmt, &sig); + LUInvert(WtW, pmt, WtWi); + + double v_x, v_w, geno; + size_t c_idv = 0; + + char ch[1]; + bitset<8> b; + + size_t ni_total = indicator_idv.size(); + size_t ni_test = 0; + for (size_t i = 0; i < ni_total; ++i) { + ni_test += indicator_idv[i]; + } + ns_test = 0; + + // Calculate n_bit and c, the number of bit for each snp. + size_t n_bit; + if (ni_total % 4 == 0) { + n_bit = ni_total / 4; + } else { + n_bit = ni_total / 4 + 1; + } + + // Ignore the first three magic numbers. + for (int i = 0; i < 3; ++i) { + infile.read(ch, 1); + b = ch[0]; + } + + double maf; + size_t n_miss; + size_t n_0, n_1, n_2, c; + + // Start reading snps and doing association test. + for (size_t t = 0; t < ns_total; ++t) { + + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t * n_bit + 3); + + if (setSnps.size() != 0 && setSnps.count(snpInfo[t].rs_number) == 0) { + snpInfo[t].n_miss = -9; + snpInfo[t].missingness = -9; + snpInfo[t].maf = -9; + snpInfo[t].file_position = t; + indicator_snp.push_back(0); + continue; + } + + // Read genotypes. + c = 0; + maf = 0.0; + n_miss = 0; + n_0 = 0; + n_1 = 0; + n_2 = 0; + c_idv = 0; + gsl_vector_set_zero(genotype_miss); + for (size_t i = 0; i < n_bit; ++i) { + infile.read(ch, 1); + b = ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0; + for (size_t j = 0; j < 4; ++j) { + if ((i == (n_bit - 1)) && c == ni_total) { + break; + } + if (indicator_idv[c] == 0) { + c++; + continue; + } + c++; + + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + gsl_vector_set(genotype, c_idv, 2.0); + maf += 2.0; + n_2++; + } else { + gsl_vector_set(genotype, c_idv, 1.0); + maf += 1.0; + n_1++; + } + } else { + if (b[2 * j + 1] == 1) { + gsl_vector_set(genotype, c_idv, 0.0); + maf += 0.0; + n_0++; + } else { + gsl_vector_set(genotype_miss, c_idv, 1); + n_miss++; + } + } + c_idv++; + } + } + maf /= 2.0 * (double)(ni_test - n_miss); + + snpInfo[t].n_miss = n_miss; + snpInfo[t].missingness = (double)n_miss / (double)ni_test; + snpInfo[t].maf = maf; + snpInfo[t].n_idv = ni_test - n_miss; + snpInfo[t].n_nb = 0; + snpInfo[t].file_position = t; + + if ((double)n_miss / (double)ni_test > miss_level) { + indicator_snp.push_back(0); + continue; + } + + if ((maf < maf_level || maf > (1.0 - maf_level)) && maf_level != -1) { + indicator_snp.push_back(0); + continue; + } + + if ((n_0 + n_1) == 0 || (n_1 + n_2) == 0 || (n_2 + n_0) == 0) { + indicator_snp.push_back(0); + continue; + } + + if (hwe_level != 0 && maf_level != -1) { + if (CalcHWE(n_0, n_2, n_1) < hwe_level) { + indicator_snp.push_back(0); + continue; + } + } + + // Filter SNP if it is correlated with W unless W has + // only one column, of 1s. + for (size_t i = 0; i < genotype->size; ++i) { + if (gsl_vector_get(genotype_miss, i) == 1) { + geno = maf * 2.0; + gsl_vector_set(genotype, i, geno); + } + } + + gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); + gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_ddot(genotype, genotype, &v_x); + gsl_blas_ddot(Wtx, WtWiWtx, &v_w); + + if (W->size2 != 1 && v_w / v_x > r2_level) { + indicator_snp.push_back(0); + continue; + } + + indicator_snp.push_back(1); + ns_test++; + } + + gsl_vector_free(genotype); + gsl_vector_free(genotype_miss); + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(Wtx); + gsl_vector_free(WtWiWtx); + gsl_permutation_free(pmt); + + infile.close(); + infile.clear(); + + return true; +} + +// Read the genotype for one SNP; remember to read empty lines. +// Geno stores original genotypes without centering. +// Missing values are replaced by mean. +bool Bimbam_ReadOneSNP(const size_t inc, const vector &indicator_idv, + igzstream &infile, gsl_vector *geno, double &geno_mean) { + debug_msg("entered"); + size_t ni_total = indicator_idv.size(); + + string line; + char *ch_ptr; + bool flag = false; + + for (size_t i = 0; i < inc; i++) { + safeGetline(infile, line).eof(); + } + + if (!safeGetline(infile, line).eof()) { + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + + geno_mean = 0.0; + double d; + size_t c_idv = 0; + vector geno_miss; + + for (size_t i = 0; i < ni_total; ++i) { + ch_ptr = strtok_safe(NULL, " , \t"); + if (indicator_idv[i] == 0) { + continue; + } + + if (strcmp(ch_ptr, "NA") == 0) { + geno_miss.push_back(c_idv); + } else { + d = atof(ch_ptr); + gsl_vector_set(geno, c_idv, d); + geno_mean += d; + } + c_idv++; + } + + geno_mean /= (double)(c_idv - geno_miss.size()); + + for (size_t i = 0; i < geno_miss.size(); ++i) { + gsl_vector_set(geno, geno_miss[i], geno_mean); + } + flag = true; + } + + return flag; +} + +// For PLINK, store SNPs as double too. +void Plink_ReadOneSNP(const int pos, const vector &indicator_idv, + ifstream &infile, gsl_vector *geno, double &geno_mean) { + debug_msg("entered"); + size_t ni_total = indicator_idv.size(), n_bit; + if (ni_total % 4 == 0) { + n_bit = ni_total / 4; + } else { + n_bit = ni_total / 4 + 1; + } + + // n_bit, and 3 is the number of magic numbers. + infile.seekg(pos * n_bit + 3); + + // Read genotypes. + char ch[1]; + bitset<8> b; + + geno_mean = 0.0; + size_t c = 0, c_idv = 0; + vector geno_miss; + + for (size_t i = 0; i < n_bit; ++i) { + infile.read(ch, 1); + b = ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j = 0; j < 4; ++j) { + if ((i == (n_bit - 1)) && c == ni_total) { + break; + } + if (indicator_idv[c] == 0) { + c++; + continue; + } + c++; + + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + gsl_vector_set(geno, c_idv, 2); + geno_mean += 2.0; + } else { + gsl_vector_set(geno, c_idv, 1); + geno_mean += 1.0; + } + } else { + if (b[2 * j + 1] == 1) { + gsl_vector_set(geno, c_idv, 0); + geno_mean += 0.0; + } else { + geno_miss.push_back(c_idv); + } + } + + c_idv++; + } + } + + geno_mean /= (double)(c_idv - geno_miss.size()); + + for (size_t i = 0; i < geno_miss.size(); ++i) { + gsl_vector_set(geno, geno_miss[i], geno_mean); + } + + return; +} + +void ReadFile_kin(const string &file_kin, vector &indicator_idv, + map &mapID2num, const size_t k_mode, bool &error, + gsl_matrix *G) { + debug_msg("entered"); + igzstream infile(file_kin.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open kinship file: " << file_kin << endl; + error = true; + return; + } + + size_t ni_total = indicator_idv.size(); + + gsl_matrix_set_zero(G); + + string line; + char *ch_ptr; + double d; + + if (k_mode == 1) { + size_t i_test = 0, i_total = 0, j_test = 0, j_total = 0; + while (getline(infile, line)) { + if (i_total == ni_total) { + fail_msg("number of rows in the kinship file is larger than the number of phentypes"); + } + + if (indicator_idv[i_total] == 0) { + i_total++; + continue; + } + + j_total = 0; + j_test = 0; + ch_ptr = strtok((char *)line.c_str(), " , \t"); + while (ch_ptr != NULL) { + if (j_total == ni_total) { + fail_msg(string("number of columns in the kinship file is larger than the number of individuals for row = ")+to_string(i_total)); + } + + d = atof(ch_ptr); + if (indicator_idv[j_total] == 1) { + gsl_matrix_set(G, i_test, j_test, d); + j_test++; + } + j_total++; + + ch_ptr = strtok(NULL, " , \t"); + } + if (j_total != ni_total) { + string msg = "number of columns in the kinship file does not match the number of individuals for row = " + to_string( i_total ); + fail_msg(msg); + } + i_total++; + i_test++; + } + if (i_total != ni_total) { + fail_msg("number of rows in the kinship file does not match the number of individuals."); + } + } else { + map mapID2ID; + size_t c = 0; + for (size_t i = 0; i < indicator_idv.size(); i++) { + if (indicator_idv[i] == 1) { + mapID2ID[i] = c; + c++; + } + } + + string id1, id2; + double Cov_d; + size_t n_id1, n_id2; + + while (getline(infile, line)) { + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + id1 = ch_ptr; + ch_ptr = strtok_safe(NULL, " , \t"); + id2 = ch_ptr; + ch_ptr = strtok_safe(NULL, " , \t"); + d = atof(ch_ptr); + if (mapID2num.count(id1) == 0 || mapID2num.count(id2) == 0) { + continue; + } + if (indicator_idv[mapID2num[id1]] == 0 || + indicator_idv[mapID2num[id2]] == 0) { + continue; + } + + n_id1 = mapID2ID[mapID2num[id1]]; + n_id2 = mapID2ID[mapID2num[id2]]; + + Cov_d = gsl_matrix_get(G, n_id1, n_id2); + if (Cov_d != 0 && Cov_d != d) { + cerr << "error! redundant and unequal terms in the " + << "kinship file, for id1 = " << id1 << " and id2 = " << id2 + << endl; + fail_msg(""); + } else { + gsl_matrix_set(G, n_id1, n_id2, d); + gsl_matrix_set(G, n_id2, n_id1, d); + } + } + } + + infile.close(); + infile.clear(); + + return; +} + +void ReadFile_mk(const string &file_mk, vector &indicator_idv, + map &mapID2num, const size_t k_mode, bool &error, + gsl_matrix *G) { + debug_msg("entered"); + igzstream infile(file_mk.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open file: " << file_mk << endl; + error = true; + return; + } + + string file_kin, line; + + size_t i = 0; + while (getline(infile, line)) { + file_kin = line.c_str(); + gsl_matrix_view G_sub = + gsl_matrix_submatrix(G, 0, i * G->size1, G->size1, G->size1); + ReadFile_kin(file_kin, indicator_idv, mapID2num, k_mode, error, + &G_sub.matrix); + i++; + } + + infile.close(); + infile.clear(); +} + +void ReadFile_eigenU(const string &file_ku, bool &error, gsl_matrix *U) { + debug_msg("entered"); + igzstream infile(file_ku.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open the U file: " << file_ku << endl; + error = true; + return; + } + + size_t n_row = U->size1, n_col = U->size2, i_row = 0, i_col = 0; + + gsl_matrix_set_zero(U); + + string line; + char *ch_ptr; + double d; + + while (getline(infile, line)) { + if (i_row == n_row) { + cout << "error! number of rows in the U file is larger " + << "than expected." << endl; + error = true; + } + + i_col = 0; + ch_ptr = strtok((char *)line.c_str(), " , \t"); + while (ch_ptr != NULL) { + if (i_col == n_col) { + cout << "error! number of columns in the U file " + << "is larger than expected, for row = " << i_row << endl; + error = true; + } + + d = atof(ch_ptr); + gsl_matrix_set(U, i_row, i_col, d); + i_col++; + + ch_ptr = strtok(NULL, " , \t"); + } + + i_row++; + } + + infile.close(); + infile.clear(); + + return; +} + +void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { + debug_msg("entered"); + igzstream infile(file_kd.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open the D file: " << file_kd << endl; + error = true; + return; + } + + size_t n_row = eval->size, i_row = 0; + + gsl_vector_set_zero(eval); + + string line; + char *ch_ptr; + double d; + + while (getline(infile, line)) { + if (i_row == n_row) { + cout << "error! number of rows in the D file is larger " + << "than expected." << endl; + error = true; + } + + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + d = atof(ch_ptr); + + ch_ptr = strtok(NULL, " , \t"); + if (ch_ptr != NULL) { + cout << "error! number of columns in the D file is larger " + << "than expected, for row = " << i_row << endl; + error = true; + } + + gsl_vector_set(eval, i_row, d); + + i_row++; + } + + infile.close(); + infile.clear(); + + return; +} + +// Read bimbam mean genotype file and calculate kinship matrix. +bool BimbamKin(const string file_geno, const set ksnps, + vector &indicator_snp, const int k_mode, + const int display_pace, gsl_matrix *matrix_kin, + const bool test_nind) { + debug_msg("entered"); + igzstream infile(file_geno.c_str(), igzstream::in); + enforce_msg(infile, "error reading genotype file"); + + size_t n_miss; + double d, geno_mean, geno_var; + + // setKSnp and/or LOCO support + bool process_ksnps = ksnps.size(); + + size_t ni_total = matrix_kin->size1; + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); + + // Xlarge contains inds x markers + const size_t msize = K_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); + enforce_msg(Xlarge, "allocate Xlarge"); + + gsl_matrix_set_zero(Xlarge); + + // For every SNP read the genotype per individual + size_t ns_test = 0; + for (size_t t = 0; t < indicator_snp.size(); ++t) { + string line; + safeGetline(infile, line).eof(); + if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); + } + if (indicator_snp[t] == 0) + continue; + + std::regex_token_iterator rend; + regex split_on("[,[:blank:]]+"); + regex_token_iterator tokens(line.begin(), line.end(), + split_on, -1); + if (test_nind) { + // ascertain the number of genotype fields match + uint token_num = 0; + for (auto x = tokens; x != rend; x++) + token_num++; + if (token_num != ni_total+3) { + cerr << line << endl; + cerr << token_num << " != " << ni_total << endl; + warning_msg("Columns in geno file do not match # individuals"); + } + enforce_msg(token_num <= ni_total + 3,"not enough genotype fields"); + } + + auto snp = *tokens; // first field + // check whether SNP is included in ksnps (used by LOCO) + if (process_ksnps && ksnps.count(snp) == 0) + continue; + + tokens++; // skip nucleotide fields + tokens++; // skip nucleotide fields + + // calc SNP stats + geno_mean = 0.0; + n_miss = 0; + geno_var = 0.0; + gsl_vector_set_all(geno_miss, 0); + for (size_t i = 0; i < ni_total; ++i) { + tokens++; + enforce_str(tokens != rend, line + " number of fields"); + string field = *tokens; + if (field == "NA") { + gsl_vector_set(geno_miss, i, 0); + n_miss++; + } else { + d = stod(field); + // make sure genotype field contains a number + if (field != "0" && field != "0.0") + enforce_str(d != 0.0f, field); + gsl_vector_set(geno, i, d); + gsl_vector_set(geno_miss, i, 1); + geno_mean += d; + geno_var += d * d; + } + } + + geno_mean /= (double)(ni_total - n_miss); + geno_var += geno_mean * geno_mean * (double)n_miss; + geno_var /= (double)ni_total; + geno_var -= geno_mean * geno_mean; + + for (size_t i = 0; i < ni_total; ++i) { + if (gsl_vector_get(geno_miss, i) == 0) { + gsl_vector_set(geno, i, geno_mean); + } + } + + gsl_vector_add_constant(geno, -1.0 * geno_mean); + + if (k_mode == 2 && geno_var != 0) { + gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); + } + // set the SNP column ns_test + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); + enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); + + ns_test++; + + // compute kinship matrix and return in matrix_kin a SNP at a time + if (ns_test % msize == 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + } + } + if (ns_test % msize != 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + } + cout << endl; + + // scale the kinship matrix + enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test)); + + // and transpose + // FIXME: the following is very slow + + for (size_t i = 0; i < ni_total; ++i) { + for (size_t j = 0; j < i; ++j) { + d = gsl_matrix_get(matrix_kin, j, i); + gsl_matrix_set(matrix_kin, i, j, d); + } + } + // GSL is faster - and there are even faster methods + // enforce_gsl(gsl_matrix_transpose(matrix_kin)); + + gsl_vector_free(geno); + gsl_vector_free(geno_miss); + gsl_matrix_free(Xlarge); + + infile.close(); + infile.clear(); + + return true; +} + +bool PlinkKin(const string &file_bed, vector &indicator_snp, + const int k_mode, const int display_pace, + gsl_matrix *matrix_kin) { + debug_msg("entered"); + ifstream infile(file_bed.c_str(), ios::binary); + if (!infile) { + cout << "error reading bed file:" << file_bed << endl; + return false; + } + + char ch[1]; + bitset<8> b; + + size_t n_miss, ci_total; + double d, geno_mean, geno_var; + + size_t ni_total = matrix_kin->size1; + gsl_vector *geno = gsl_vector_safe_alloc(ni_total); + + size_t ns_test = 0; + int n_bit; + + // Create a large matrix. + const size_t msize = K_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); + gsl_matrix_set_zero(Xlarge); + + // Calculate n_bit and c, the number of bit for each snp. + if (ni_total % 4 == 0) { + n_bit = ni_total / 4; + } else { + n_bit = ni_total / 4 + 1; + } + + // print the first three magic numbers + for (int i = 0; i < 3; ++i) { + infile.read(ch, 1); + b = ch[0]; + } + + for (size_t t = 0; t < indicator_snp.size(); ++t) { + if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); + } + if (indicator_snp[t] == 0) { + continue; + } + + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t * n_bit + 3); + + // Read genotypes. + geno_mean = 0.0; + n_miss = 0; + ci_total = 0; + geno_var = 0.0; + for (int i = 0; i < n_bit; ++i) { + infile.read(ch, 1); + b = ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j = 0; j < 4; ++j) { + if ((i == (n_bit - 1)) && ci_total == ni_total) { + break; + } + + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + gsl_vector_set(geno, ci_total, 2.0); + geno_mean += 2.0; + geno_var += 4.0; + } else { + gsl_vector_set(geno, ci_total, 1.0); + geno_mean += 1.0; + geno_var += 1.0; + } + } else { + if (b[2 * j + 1] == 1) { + gsl_vector_set(geno, ci_total, 0.0); + } else { + gsl_vector_set(geno, ci_total, -9.0); + n_miss++; + } + } + + ci_total++; + } + } + + geno_mean /= (double)(ni_total - n_miss); + geno_var += geno_mean * geno_mean * (double)n_miss; + geno_var /= (double)ni_total; + geno_var -= geno_mean * geno_mean; + + for (size_t i = 0; i < ni_total; ++i) { + d = gsl_vector_get(geno, i); + if (d == -9.0) { + gsl_vector_set(geno, i, geno_mean); + } + } + + gsl_vector_add_constant(geno, -1.0 * geno_mean); + + if (k_mode == 2 && geno_var != 0) { + gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); + } + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); + gsl_vector_memcpy(&Xlarge_col.vector, geno); + + ns_test++; + + if (ns_test % msize == 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + } + } + + if (ns_test % msize != 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + } + + cout << endl; + + gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test); + + for (size_t i = 0; i < ni_total; ++i) { + for (size_t j = 0; j < i; ++j) { + d = gsl_matrix_get(matrix_kin, j, i); + gsl_matrix_set(matrix_kin, i, j, d); + } + } + + gsl_vector_free(geno); + gsl_matrix_free(Xlarge); + + infile.close(); + infile.clear(); + + return true; +} + +// Read bimbam mean genotype file, the second time, recode "mean" +// genotype and calculate K. +bool ReadFile_geno(const string file_geno, vector &indicator_idv, + vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, + const bool calc_K) { + debug_msg("entered"); + igzstream infile(file_geno.c_str(), igzstream::in); + if (!infile) { + cout << "error reading genotype file:" << file_geno << endl; + return false; + } + + string line; + char *ch_ptr; + + if (calc_K == true) { + gsl_matrix_set_zero(K); + } + + gsl_vector *genotype = gsl_vector_safe_alloc(UtX->size1); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(UtX->size1); + double geno, geno_mean; + size_t n_miss; + + int ni_total = (int)indicator_idv.size(); + int ns_total = (int)indicator_snp.size(); + int ni_test = UtX->size1; + int ns_test = UtX->size2; + + int c_idv = 0, c_snp = 0; + + for (int i = 0; i < ns_total; ++i) { + safeGetline(infile, line).eof(); + if (indicator_snp[i] == 0) { + continue; + } + + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + + c_idv = 0; + geno_mean = 0; + n_miss = 0; + gsl_vector_set_zero(genotype_miss); + for (int j = 0; j < ni_total; ++j) { + ch_ptr = strtok_safe(NULL, " , \t"); + if (indicator_idv[j] == 0) { + continue; + } + + if (strcmp(ch_ptr, "NA") == 0) { + gsl_vector_set(genotype_miss, c_idv, 1); + n_miss++; + } else { + geno = atof(ch_ptr); + gsl_vector_set(genotype, c_idv, geno); + geno_mean += geno; + } + c_idv++; + } + + geno_mean /= (double)(ni_test - n_miss); + + for (size_t i = 0; i < genotype->size; ++i) { + if (gsl_vector_get(genotype_miss, i) == 1) { + geno = 0; + } else { + geno = gsl_vector_get(genotype, i); + geno -= geno_mean; + } + + gsl_vector_set(genotype, i, geno); + gsl_matrix_set(UtX, i, c_snp, geno); + } + + if (calc_K == true) { + gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); + } + + c_snp++; + } + + if (calc_K == true) { + gsl_matrix_scale(K, 1.0 / (double)ns_test); + + for (size_t i = 0; i < genotype->size; ++i) { + for (size_t j = 0; j < i; ++j) { + geno = gsl_matrix_get(K, j, i); + gsl_matrix_set(K, i, j, geno); + } + } + } + + gsl_vector_free(genotype); + gsl_vector_free(genotype_miss); + + infile.clear(); + infile.close(); + + return true; +} + +// Compact version of the above function, using uchar instead of +// gsl_matrix. +bool ReadFile_geno(const string &file_geno, vector &indicator_idv, + vector &indicator_snp, + vector> &Xt, gsl_matrix *K, + const bool calc_K, const size_t ni_test, + const size_t ns_test) { + debug_msg("entered"); + igzstream infile(file_geno.c_str(), igzstream::in); + if (!infile) { + cout << "error reading genotype file:" << file_geno << endl; + return false; + } + + Xt.clear(); + vector Xt_row; + for (size_t i = 0; i < ni_test; i++) { + Xt_row.push_back(0); + } + + string line; + char *ch_ptr; + + if (calc_K == true) { + gsl_matrix_set_zero(K); + } + + gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); + gsl_vector *genotype_miss = gsl_vector_safe_alloc(ni_test); + double geno, geno_mean; + size_t n_miss; + + size_t ni_total = indicator_idv.size(); + size_t ns_total = indicator_snp.size(); + + size_t c_idv = 0, c_snp = 0; + + for (size_t i = 0; i < ns_total; ++i) { + safeGetline(infile, line).eof(); + if (indicator_snp[i] == 0) { + continue; + } + + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + + c_idv = 0; + geno_mean = 0; + n_miss = 0; + gsl_vector_set_zero(genotype_miss); + for (uint j = 0; j < ni_total; ++j) { + ch_ptr = strtok_safe(NULL, " , \t"); + if (indicator_idv[j] == 0) { + continue; + } + + if (strcmp(ch_ptr, "NA") == 0) { + gsl_vector_set(genotype_miss, c_idv, 1); + n_miss++; + } else { + geno = atof(ch_ptr); + gsl_vector_set(genotype, c_idv, geno); + geno_mean += geno; + } + c_idv++; + } + + geno_mean /= (double)(ni_test - n_miss); + + for (size_t j = 0; j < genotype->size; ++j) { + if (gsl_vector_get(genotype_miss, j) == 1) { + geno = geno_mean; + } else { + geno = gsl_vector_get(genotype, j); + } + + Xt_row[j] = Double02ToUchar(geno); + gsl_vector_set(genotype, j, (geno - geno_mean)); + } + Xt.push_back(Xt_row); + + if (calc_K == true) { + gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); + } + + c_snp++; + } + + if (calc_K == true) { + gsl_matrix_scale(K, 1.0 / (double)ns_test); + + for (size_t i = 0; i < genotype->size; ++i) { + for (size_t j = 0; j < i; ++j) { + geno = gsl_matrix_get(K, j, i); + gsl_matrix_set(K, i, j, geno); + } + } + } + + gsl_vector_free(genotype); + gsl_vector_free(genotype_miss); + + infile.clear(); + infile.close(); + + return true; +} + +// Read bimbam mean genotype file, the second time, recode "mean" +// genotype and calculate K. +bool ReadFile_bed(const string &file_bed, vector &indicator_idv, + vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, + const bool calc_K) { + debug_msg("entered"); + ifstream infile(file_bed.c_str(), ios::binary); + if (!infile) { + cout << "error reading bed file:" << file_bed << endl; + return false; + } + + char ch[1]; + bitset<8> b; + + size_t ni_total = indicator_idv.size(); + size_t ns_total = indicator_snp.size(); + size_t ni_test = UtX->size1; + size_t ns_test = UtX->size2; + int n_bit; + + if (ni_total % 4 == 0) { + n_bit = ni_total / 4; + } else { + n_bit = ni_total / 4 + 1; + } + + // Print the first three magic numbers. + for (int i = 0; i < 3; ++i) { + infile.read(ch, 1); + b = ch[0]; + } + + if (calc_K == true) { + gsl_matrix_set_zero(K); + } + + gsl_vector *genotype = gsl_vector_safe_alloc(UtX->size1); + + double geno, geno_mean; + size_t n_miss; + size_t c_idv = 0, c_snp = 0, c = 0; + + // Start reading snps and doing association test. + for (size_t t = 0; t < ns_total; ++t) { + if (indicator_snp[t] == 0) { + continue; + } + + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t * n_bit + 3); + + // Read genotypes. + c_idv = 0; + geno_mean = 0.0; + n_miss = 0; + c = 0; + for (int i = 0; i < n_bit; ++i) { + infile.read(ch, 1); + b = ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j = 0; j < 4; ++j) { + if ((i == (n_bit - 1)) && c == ni_total) { + break; + } + if (indicator_idv[c] == 0) { + c++; + continue; + } + c++; + + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + gsl_vector_set(genotype, c_idv, 2.0); + geno_mean += 2.0; + } else { + gsl_vector_set(genotype, c_idv, 1.0); + geno_mean += 1.0; + } + } else { + if (b[2 * j + 1] == 1) { + gsl_vector_set(genotype, c_idv, 0.0); + geno_mean += 0.0; + } else { + gsl_vector_set(genotype, c_idv, -9.0); + n_miss++; + } + } + c_idv++; + } + } + + geno_mean /= (double)(ni_test - n_miss); + + for (size_t i = 0; i < genotype->size; ++i) { + geno = gsl_vector_get(genotype, i); + if (geno == -9) { + geno = 0; + } else { + geno -= geno_mean; + } + + gsl_vector_set(genotype, i, geno); + gsl_matrix_set(UtX, i, c_snp, geno); + } + + if (calc_K == true) { + gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); + } + + c_snp++; + } + + if (calc_K == true) { + gsl_matrix_scale(K, 1.0 / (double)ns_test); + + for (size_t i = 0; i < genotype->size; ++i) { + for (size_t j = 0; j < i; ++j) { + geno = gsl_matrix_get(K, j, i); + gsl_matrix_set(K, i, j, geno); + } + } + } + + gsl_vector_free(genotype); + infile.clear(); + infile.close(); + + return true; +} + +// Compact version of the above function, using uchar instead of gsl_matrix. +bool ReadFile_bed(const string &file_bed, vector &indicator_idv, + vector &indicator_snp, vector> &Xt, + gsl_matrix *K, const bool calc_K, const size_t ni_test, + const size_t ns_test) { + debug_msg("entered"); + ifstream infile(file_bed.c_str(), ios::binary); + if (!infile) { + cout << "error reading bed file:" << file_bed << endl; + return false; + } + + Xt.clear(); + vector Xt_row; + for (size_t i = 0; i < ni_test; i++) { + Xt_row.push_back(0); + } + + char ch[1]; + bitset<8> b; + + size_t ni_total = indicator_idv.size(); + size_t ns_total = indicator_snp.size(); + int n_bit; + + if (ni_total % 4 == 0) { + n_bit = ni_total / 4; + } else { + n_bit = ni_total / 4 + 1; + } + + // Print the first three magic numbers. + for (int i = 0; i < 3; ++i) { + infile.read(ch, 1); + b = ch[0]; + } + + if (calc_K == true) { + gsl_matrix_set_zero(K); + } + + gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); + + double geno, geno_mean; + size_t n_miss; + size_t c_idv = 0, c_snp = 0, c = 0; + + // Start reading SNPs and doing association test. + for (size_t t = 0; t < ns_total; ++t) { + if (indicator_snp[t] == 0) { + continue; + } + + // n_bit, and 3 is the number of magic numbers. + infile.seekg(t * n_bit + 3); + + // Read genotypes. + c_idv = 0; + geno_mean = 0.0; + n_miss = 0; + c = 0; + for (int i = 0; i < n_bit; ++i) { + infile.read(ch, 1); + b = ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0. + for (size_t j = 0; j < 4; ++j) { + if ((i == (n_bit - 1)) && c == ni_total) { + break; + } + if (indicator_idv[c] == 0) { + c++; + continue; + } + c++; + + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + gsl_vector_set(genotype, c_idv, 2.0); + geno_mean += 2.0; + } else { + gsl_vector_set(genotype, c_idv, 1.0); + geno_mean += 1.0; + } + } else { + if (b[2 * j + 1] == 1) { + gsl_vector_set(genotype, c_idv, 0.0); + geno_mean += 0.0; + } else { + gsl_vector_set(genotype, c_idv, -9.0); + n_miss++; + } + } + c_idv++; + } + } + + geno_mean /= (double)(ni_test - n_miss); + + for (size_t i = 0; i < genotype->size; ++i) { + geno = gsl_vector_get(genotype, i); + if (geno == -9) { + geno = geno_mean; + } + + Xt_row[i] = Double02ToUchar(geno); + + geno -= geno_mean; + + gsl_vector_set(genotype, i, geno); + } + Xt.push_back(Xt_row); + + if (calc_K == true) { + gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); + } + + c_snp++; + } + + if (calc_K == true) { + gsl_matrix_scale(K, 1.0 / (double)ns_test); + + for (size_t i = 0; i < genotype->size; ++i) { + for (size_t j = 0; j < i; ++j) { + geno = gsl_matrix_get(K, j, i); + gsl_matrix_set(K, i, j, geno); + } + } + } + + gsl_vector_free(genotype); + infile.clear(); + infile.close(); + + return true; +} + +bool ReadFile_est(const string &file_est, const vector &est_column, + map &mapRS2est) { + debug_msg("entered"); + mapRS2est.clear(); + + ifstream infile(file_est.c_str(), ifstream::in); + if (!infile) { + cout << "error opening estimated parameter file: " << file_est << endl; + return false; + } + + string line; + char *ch_ptr; + + string rs; + double alpha, beta, gamma, d; + + // Header. + getline(infile, line); + + size_t n = *max_element(est_column.begin(), est_column.end()); + + while (getline(infile, line)) { + ch_ptr = strtok_safe((char *)line.c_str(), " \t"); + + alpha = 0.0; + beta = 0.0; + gamma = 1.0; + for (size_t i = 0; i < n + 1; ++i) { + if (i == est_column[0] - 1) { + enforce(ch_ptr); + rs = ch_ptr; + } + if (i == est_column[1] - 1) { + enforce(ch_ptr); + alpha = atof(ch_ptr); + } + if (i == est_column[2] - 1) { + enforce(ch_ptr); + beta = atof(ch_ptr); + } + if (i == est_column[3] - 1) { + enforce(ch_ptr); + gamma = atof(ch_ptr); + } + if (i < n) { + ch_ptr = strtok(NULL, " \t"); + } + } + + d = alpha + beta * gamma; + + if (mapRS2est.count(rs) == 0) { + mapRS2est[rs] = d; + } else { + cout << "the same SNP occurs more than once in estimated " + << "parameter file: " << rs << endl; + return false; + } + } + + infile.clear(); + infile.close(); + return true; +} + +bool CountFileLines(const string &file_input, size_t &n_lines) { + debug_msg("entered"); + igzstream infile(file_input.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open file: " << file_input << endl; + return false; + } + + n_lines = count(istreambuf_iterator(infile), + istreambuf_iterator(), '\n'); + infile.seekg(0, ios::beg); + + return true; +} + +// Read gene expression file. +bool ReadFile_gene(const string &file_gene, vector &vec_read, + vector &snpInfo, size_t &ng_total) { + debug_msg("entered"); + vec_read.clear(); + ng_total = 0; + + igzstream infile(file_gene.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open gene expression file: " << file_gene << endl; + return false; + } + + string line; + char *ch_ptr; + string rs; + + size_t n_idv = 0, t = 0; + + // Header. + getline(infile, line); + + while (getline(infile, line)) { + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + rs = ch_ptr; + + ch_ptr = strtok(NULL, " , \t"); + + t = 0; + while (ch_ptr != NULL) { + if (ng_total == 0) { + vec_read.push_back(0); + t++; + n_idv++; + } else { + vec_read[t] += atof(ch_ptr); + t++; + } + + ch_ptr = strtok(NULL, " , \t"); + } + + if (t != n_idv) { + cout << "error! number of columns doesn't match in row: " << ng_total + << endl; + return false; + } + + SNPINFO sInfo = {"-9", rs, -9, -9, "-9", "-9", 0, -9, -9, 0, 0, 0}; + snpInfo.push_back(sInfo); + + ng_total++; + } + + infile.close(); + infile.clear(); + + return true; +} + +// Read header to determine which column contains which item. +bool ReadHeader_io(const string &line, HEADER &header) { + debug_msg("entered"); + string rs_ptr[] = {"rs", "RS", "snp", "SNP", "snps", "SNPS", + "snpid", "SNPID", "rsid", "RSID", "MarkerName"}; + set rs_set(rs_ptr, rs_ptr + 11); // create a set of 11 items + string chr_ptr[] = {"chr", "CHR"}; + set chr_set(chr_ptr, chr_ptr + 2); + string pos_ptr[] = { + "ps", "PS", "pos", "POS", "base_position", "BASE_POSITION", "bp", "BP"}; + set pos_set(pos_ptr, pos_ptr + 8); + string cm_ptr[] = {"cm", "CM"}; + set cm_set(cm_ptr, cm_ptr + 2); + string a1_ptr[] = {"a1", "A1", "allele1", "ALLELE1", "Allele1", "INC_ALLELE"}; + set a1_set(a1_ptr, a1_ptr + 5); + string a0_ptr[] = {"a0", "A0", "allele0", "ALLELE0", "Allele0", "a2", + "A2", "allele2", "ALLELE2", "Allele2", "DEC_ALLELE"}; + set a0_set(a0_ptr, a0_ptr + 10); + + string z_ptr[] = {"z", "Z", "z_score", "Z_SCORE", "zscore", "ZSCORE"}; + set z_set(z_ptr, z_ptr + 6); + string beta_ptr[] = {"beta", "BETA", "b", "B"}; + set beta_set(beta_ptr, beta_ptr + 4); + string sebeta_ptr[] = {"se_beta", "SE_BETA", "se", "SE"}; + set sebeta_set(sebeta_ptr, sebeta_ptr + 4); + string chisq_ptr[] = {"chisq", "CHISQ", "chisquare", "CHISQUARE"}; + set chisq_set(chisq_ptr, chisq_ptr + 4); + string p_ptr[] = {"p", "P", "pvalue", "PVALUE", "p-value", "P-VALUE"}; + set p_set(p_ptr, p_ptr + 6); + + string n_ptr[] = {"n", "N", "ntotal", "NTOTAL", "n_total", "N_TOTAL"}; + set n_set(n_ptr, n_ptr + 6); + string nmis_ptr[] = {"nmis", "NMIS", "n_mis", "N_MIS", "n_miss", "N_MISS"}; + set nmis_set(nmis_ptr, nmis_ptr + 6); + string nobs_ptr[] = {"nobs", "NOBS", "n_obs", "N_OBS"}; + set nobs_set(nobs_ptr, nobs_ptr + 4); + string ncase_ptr[] = {"ncase", "NCASE", "n_case", "N_CASE"}; + set ncase_set(ncase_ptr, ncase_ptr + 4); + string ncontrol_ptr[] = {"ncontrol", "NCONTROL", "n_control", "N_CONTROL"}; + set ncontrol_set(ncontrol_ptr, ncontrol_ptr + 4); + + string af_ptr[] = {"af", + "AF", + "maf", + "MAF", + "f", + "F", + "allele_freq", + "ALLELE_FREQ", + "allele_frequency", + "ALLELE_FREQUENCY", + "Freq.Allele1.HapMapCEU", + "FreqAllele1HapMapCEU", + "Freq1.Hapmap"}; + set af_set(af_ptr, af_ptr + 13); + string var_ptr[] = {"var", "VAR"}; + set var_set(var_ptr, var_ptr + 2); + + string ws_ptr[] = {"window_size", "WINDOW_SIZE", "ws", "WS"}; + set ws_set(ws_ptr, ws_ptr + 4); + string cor_ptr[] = {"cor", "COR", "r", "R"}; + set cor_set(cor_ptr, cor_ptr + 4); + + header.rs_col = 0; + header.chr_col = 0; + header.pos_col = 0; + header.cm_col = 0; + header.a1_col = 0; + header.a0_col = 0; + header.z_col = 0; + header.beta_col = 0; + header.sebeta_col = 0; + header.chisq_col = 0; + header.p_col = 0; + header.n_col = 0; + header.nmis_col = 0; + header.nobs_col = 0; + header.ncase_col = 0; + header.ncontrol_col = 0; + header.af_col = 0; + header.var_col = 0; + header.ws_col = 0; + header.cor_col = 0; + header.coln = 0; + + char *ch_ptr; + string type; + size_t n_error = 0; + + ch_ptr = strtok((char *)line.c_str(), " , \t"); + while (ch_ptr != NULL) { + type = ch_ptr; + if (rs_set.count(type) != 0) { + if (header.rs_col == 0) { + header.rs_col = header.coln + 1; + } else { + cout << "error! more than two rs columns in the file." << endl; + n_error++; + } + } else if (chr_set.count(type) != 0) { + if (header.chr_col == 0) { + header.chr_col = header.coln + 1; + } else { + cout << "error! more than two chr columns in the file." << endl; + n_error++; + } + } else if (pos_set.count(type) != 0) { + if (header.pos_col == 0) { + header.pos_col = header.coln + 1; + } else { + cout << "error! more than two pos columns in the file." << endl; + n_error++; + } + } else if (cm_set.count(type) != 0) { + if (header.cm_col == 0) { + header.cm_col = header.coln + 1; + } else { + cout << "error! more than two cm columns in the file." << endl; + n_error++; + } + } else if (a1_set.count(type) != 0) { + if (header.a1_col == 0) { + header.a1_col = header.coln + 1; + } else { + cout << "error! more than two allele1 columns in the file." << endl; + n_error++; + } + } else if (a0_set.count(type) != 0) { + if (header.a0_col == 0) { + header.a0_col = header.coln + 1; + } else { + cout << "error! more than two allele0 columns in the file." << endl; + n_error++; + } + } else if (z_set.count(type) != 0) { + if (header.z_col == 0) { + header.z_col = header.coln + 1; + } else { + cout << "error! more than two z columns in the file." << endl; + n_error++; + } + } else if (beta_set.count(type) != 0) { + if (header.beta_col == 0) { + header.beta_col = header.coln + 1; + } else { + cout << "error! more than two beta columns in the file." << endl; + n_error++; + } + } else if (sebeta_set.count(type) != 0) { + if (header.sebeta_col == 0) { + header.sebeta_col = header.coln + 1; + } else { + cout << "error! more than two se_beta columns in the file." << endl; + n_error++; + } + } else if (chisq_set.count(type) != 0) { + if (header.chisq_col == 0) { + header.chisq_col = header.coln + 1; + } else { + cout << "error! more than two z columns in the file." << endl; + n_error++; + } + } else if (p_set.count(type) != 0) { + if (header.p_col == 0) { + header.p_col = header.coln + 1; + } else { + cout << "error! more than two p columns in the file." << endl; + n_error++; + } + } else if (n_set.count(type) != 0) { + if (header.n_col == 0) { + header.n_col = header.coln + 1; + } else { + cout << "error! more than two n_total columns in the file." << endl; + n_error++; + } + } else if (nmis_set.count(type) != 0) { + if (header.nmis_col == 0) { + header.nmis_col = header.coln + 1; + } else { + cout << "error! more than two n_mis columns in the file." << endl; + n_error++; + } + } else if (nobs_set.count(type) != 0) { + if (header.nobs_col == 0) { + header.nobs_col = header.coln + 1; + } else { + cout << "error! more than two n_obs columns in the file." << endl; + n_error++; + } + } else if (ncase_set.count(type) != 0) { + if (header.ncase_col == 0) { + header.ncase_col = header.coln + 1; + } else { + cout << "error! more than two n_case columns in the file." << endl; + n_error++; + } + } else if (ncontrol_set.count(type) != 0) { + if (header.ncontrol_col == 0) { + header.ncontrol_col = header.coln + 1; + } else { + cout << "error! more than two n_control columns in the file." << endl; + n_error++; + } + } else if (ws_set.count(type) != 0) { + if (header.ws_col == 0) { + header.ws_col = header.coln + 1; + } else { + cout << "error! more than two window_size columns in the file." << endl; + n_error++; + } + } else if (af_set.count(type) != 0) { + if (header.af_col == 0) { + header.af_col = header.coln + 1; + } else { + cout << "error! more than two af columns in the file." << endl; + n_error++; + } + } else if (cor_set.count(type) != 0) { + if (header.cor_col == 0) { + header.cor_col = header.coln + 1; + } else { + cout << "error! more than two cor columns in the file." << endl; + n_error++; + } + } else { + string str = ch_ptr; + string cat = str.substr(str.size() - 2, 2); + + if (cat == "_c" || cat == "_C") { + + // continuous + header.catc_col.insert(header.coln + 1); + } else { + + // discrete + header.catd_col.insert(header.coln + 1); + } + } + + ch_ptr = strtok(NULL, " , \t"); + header.coln++; + } + + if (header.cor_col != 0 && header.cor_col != header.coln) { + cout << "error! the cor column should be the last column." << endl; + n_error++; + } + + if (header.rs_col == 0) { + if (header.chr_col != 0 && header.pos_col != 0) { + cout << "missing an rs column. rs id will be replaced by chr:pos" << endl; + } else { + cout << "error! missing an rs column." << endl; + n_error++; + } + } + + if (n_error == 0) { + return true; + } else { + return false; + } +} + +// Read category file, record mapRS2 in the category file does not +// contain a null category so if a snp has 0 for all categories, then +// it is not included in the analysis. +bool ReadFile_cat(const string &file_cat, map &mapRS2cat, + size_t &n_vc) { + debug_msg("entered"); + mapRS2cat.clear(); + + igzstream infile(file_cat.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open category file: " << file_cat << endl; + return false; + } + + string line; + char *ch_ptr; + + string rs, chr, a1, a0, pos, cm; + size_t i_cat; + + // Read header. + HEADER header; + safeGetline(infile, line).eof(); + ReadHeader_io(line, header); + + // Use the header to count the number of categories. + n_vc = header.coln; + if (header.rs_col != 0) { + n_vc--; + } + if (header.chr_col != 0) { + n_vc--; + } + if (header.pos_col != 0) { + n_vc--; + } + if (header.cm_col != 0) { + n_vc--; + } + if (header.a1_col != 0) { + n_vc--; + } + if (header.a0_col != 0) { + n_vc--; + } + + // Read the following lines to record mapRS2cat. + while (!safeGetline(infile, line).eof()) { + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + + i_cat = 0; + for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); + if (header.rs_col != 0 && header.rs_col == i + 1) { + rs = ch_ptr; + } else if (header.chr_col != 0 && header.chr_col == i + 1) { + chr = ch_ptr; + } else if (header.pos_col != 0 && header.pos_col == i + 1) { + pos = ch_ptr; + } else if (header.cm_col != 0 && header.cm_col == i + 1) { + cm = ch_ptr; + } else if (header.a1_col != 0 && header.a1_col == i + 1) { + a1 = ch_ptr; + } else if (header.a0_col != 0 && header.a0_col == i + 1) { + a0 = ch_ptr; + } else if (atoi(ch_ptr) == 1 || atoi(ch_ptr) == 0) { + if (i_cat == 0) { + if (header.rs_col == 0) { + rs = chr + ":" + pos; + } + } + + if (atoi(ch_ptr) == 1 && mapRS2cat.count(rs) == 0) { + mapRS2cat[rs] = i_cat; + } + i_cat++; + } else { + } + + ch_ptr = strtok(NULL, " , \t"); + } + } + + infile.clear(); + infile.close(); + + return true; +} + +bool ReadFile_mcat(const string &file_mcat, map &mapRS2cat, + size_t &n_vc) { + debug_msg("entered"); + mapRS2cat.clear(); + + igzstream infile(file_mcat.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open mcategory file: " << file_mcat << endl; + return false; + } + + string file_name; + map mapRS2cat_tmp; + size_t n_vc_tmp, t = 0; + + while (!safeGetline(infile, file_name).eof()) { + mapRS2cat_tmp.clear(); + ReadFile_cat(file_name, mapRS2cat_tmp, n_vc_tmp); + mapRS2cat.insert(mapRS2cat_tmp.begin(), mapRS2cat_tmp.end()); + if (t == 0) { + n_vc = n_vc_tmp; + } else { + n_vc = max(n_vc, n_vc_tmp); + } + t++; + } + + return true; +} + +// Read bimbam mean genotype file and calculate kinship matrix; this +// time, the kinship matrix is not centered, and can contain multiple +// K matrix. +bool BimbamKinUncentered(const string &file_geno, const set ksnps, + const int display_pace, + const vector &indicator_idv, + const vector &indicator_snp, + const map &mapRS2weight, + const map &mapRS2cat, + const vector &snpInfo, const gsl_matrix *W, + gsl_matrix *matrix_kin, gsl_vector *vector_ns) { + debug_msg("entered"); + debug_msg("BimbamKinUncentered"); + igzstream infile(file_geno.c_str(), igzstream::in); + if (!infile) { + cout << "error reading genotype file:" << file_geno << endl; + return false; + } + + string line; + char *ch_ptr; + + size_t n_miss; + double d, geno_mean, geno_var; + + size_t ni_test = matrix_kin->size1; + gsl_vector *geno = gsl_vector_safe_alloc(ni_test); + gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_test); + + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); + gsl_permutation *pmt = gsl_permutation_alloc(W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp(WtW, pmt, &sig); + LUInvert(WtW, pmt, WtWi); + + size_t n_vc = matrix_kin->size2 / ni_test, i_vc; + string rs; + vector ns_vec; + for (size_t i = 0; i < n_vc; i++) { + ns_vec.push_back(0); + } + + // Create a large matrix. + const size_t msize = K_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_test, msize * n_vc); + gsl_matrix_set_zero(Xlarge); + + size_t ns_test = 0; + for (size_t t = 0; t < indicator_snp.size(); ++t) { + safeGetline(infile, line).eof(); + if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); + } + if (indicator_snp[t] == 0) + continue; + + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + ch_ptr = strtok_safe(NULL, " , \t"); + + rs = snpInfo[t].rs_number; // This line is new. + + geno_mean = 0.0; + n_miss = 0; + geno_var = 0.0; + gsl_vector_set_all(geno_miss, 0); + + size_t j = 0; + for (size_t i = 0; i < indicator_idv.size(); ++i) { + if (indicator_idv[i] == 0) { + continue; + } + ch_ptr = strtok_safe(NULL, " , \t"); + if (strcmp(ch_ptr, "NA") == 0) { + gsl_vector_set(geno_miss, i, 0); + n_miss++; + } else { + d = atof(ch_ptr); + gsl_vector_set(geno, j, d); + gsl_vector_set(geno_miss, j, 1); + geno_mean += d; + geno_var += d * d; + } + j++; + } + + geno_mean /= (double)(ni_test - n_miss); + geno_var += geno_mean * geno_mean * (double)n_miss; + geno_var /= (double)ni_test; + geno_var -= geno_mean * geno_mean; + + for (size_t i = 0; i < ni_test; ++i) { + if (gsl_vector_get(geno_miss, i) == 0) { + gsl_vector_set(geno, i, geno_mean); + } + } + + gsl_vector_add_constant(geno, -1.0 * geno_mean); + + gsl_blas_dgemv(CblasTrans, 1.0, W, geno, 0.0, Wtx); + gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_dgemv(CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno); + gsl_blas_ddot(geno, geno, &geno_var); + geno_var /= (double)ni_test; + + if (geno_var != 0 && + (mapRS2weight.size() == 0 || mapRS2weight.count(rs) != 0)) { + if (mapRS2weight.size() == 0) { + d = 1.0 / geno_var; + } else { + d = mapRS2weight.at(rs) / geno_var; + } + + gsl_vector_scale(geno, sqrt(d)); + if (n_vc == 1 || mapRS2cat.size() == 0) { + gsl_vector_view Xlarge_col = + gsl_matrix_column(Xlarge, ns_vec[0] % msize); + gsl_vector_memcpy(&Xlarge_col.vector, geno); + ns_vec[0]++; + + if (ns_vec[0] % msize == 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + } + } else if (mapRS2cat.count(rs) != 0) { + i_vc = mapRS2cat.at(rs); + + gsl_vector_view Xlarge_col = + gsl_matrix_column(Xlarge, msize * i_vc + ns_vec[i_vc] % msize); + gsl_vector_memcpy(&Xlarge_col.vector, geno); + + ns_vec[i_vc]++; + + if (ns_vec[i_vc] % msize == 0) { + gsl_matrix_view X_sub = + gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); + gsl_matrix_view kin_sub = gsl_matrix_submatrix( + matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); + fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + &kin_sub.matrix); + + gsl_matrix_set_zero(&X_sub.matrix); + } + } + } + ns_test++; + } + + for (size_t i_vc = 0; i_vc < n_vc; i_vc++) { + if (ns_vec[i_vc] % msize != 0) { + gsl_matrix_view X_sub = + gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); + gsl_matrix_view kin_sub = + gsl_matrix_submatrix(matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); + fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + &kin_sub.matrix); + } + } + + cout << endl; + + for (size_t t = 0; t < n_vc; t++) { + gsl_vector_set(vector_ns, t, ns_vec[t]); + + for (size_t i = 0; i < ni_test; ++i) { + for (size_t j = 0; j <= i; ++j) { + d = gsl_matrix_get(matrix_kin, j, i + ni_test * t); + d /= (double)ns_vec[t]; + gsl_matrix_set(matrix_kin, i, j + ni_test * t, d); + gsl_matrix_set(matrix_kin, j, i + ni_test * t, d); + } + } + } + + gsl_vector_free(geno); + gsl_vector_free(geno_miss); + + gsl_vector_free(Wtx); + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(WtWiWtx); + gsl_permutation_free(pmt); + + gsl_matrix_free(Xlarge); + + infile.close(); + infile.clear(); + + return true; +} + +bool PlinkKin(const string &file_bed, const int display_pace, + const vector &indicator_idv, + const vector &indicator_snp, + const map &mapRS2weight, + const map &mapRS2cat, + const vector &snpInfo, const gsl_matrix *W, + gsl_matrix *matrix_kin, gsl_vector *vector_ns) { + debug_msg("entered"); + ifstream infile(file_bed.c_str(), ios::binary); + if (!infile) { + cout << "error reading bed file:" << file_bed << endl; + return false; + } + + char ch[1]; + bitset<8> b; + + size_t n_miss, ci_total, ci_test; + double d, geno_mean, geno_var; + + size_t ni_test = matrix_kin->size1; + size_t ni_total = indicator_idv.size(); + gsl_vector *geno = gsl_vector_safe_alloc(ni_test); + + gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); + gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); + gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); + gsl_permutation *pmt = gsl_permutation_alloc(W->size2); + + gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); + int sig; + LUDecomp(WtW, pmt, &sig); + LUInvert(WtW, pmt, WtWi); + + size_t ns_test = 0; + int n_bit; + + size_t n_vc = matrix_kin->size2 / ni_test, i_vc; + string rs; + vector ns_vec; + for (size_t i = 0; i < n_vc; i++) { + ns_vec.push_back(0); + } + + // Create a large matrix. + const size_t msize = K_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_test, msize * n_vc); + gsl_matrix_set_zero(Xlarge); + + // Calculate n_bit and c, the number of bit for each SNP. + if (ni_total % 4 == 0) { + n_bit = ni_total / 4; + } else { + n_bit = ni_total / 4 + 1; + } + + // Print the first three magic numbers. + for (int i = 0; i < 3; ++i) { + infile.read(ch, 1); + b = ch[0]; + } + + for (size_t t = 0; t < indicator_snp.size(); ++t) { + if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { + ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); + } + if (indicator_snp[t] == 0) { + continue; + } + + // n_bit, and 3 is the number of magic numbers + infile.seekg(t * n_bit + 3); + + rs = snpInfo[t].rs_number; // This line is new. + + // Read genotypes. + geno_mean = 0.0; + n_miss = 0; + ci_total = 0; + geno_var = 0.0; + ci_test = 0; + for (int i = 0; i < n_bit; ++i) { + infile.read(ch, 1); + b = ch[0]; + + // Minor allele homozygous: 2.0; major: 0.0; + for (size_t j = 0; j < 4; ++j) { + if ((i == (n_bit - 1)) && ci_total == ni_total) { + break; + } + if (indicator_idv[ci_total] == 0) { + ci_total++; + continue; + } + + if (b[2 * j] == 0) { + if (b[2 * j + 1] == 0) { + gsl_vector_set(geno, ci_test, 2.0); + geno_mean += 2.0; + geno_var += 4.0; + } else { + gsl_vector_set(geno, ci_test, 1.0); + geno_mean += 1.0; + geno_var += 1.0; + } + } else { + if (b[2 * j + 1] == 1) { + gsl_vector_set(geno, ci_test, 0.0); + } else { + gsl_vector_set(geno, ci_test, -9.0); + n_miss++; + } + } + + ci_test++; + ci_total++; + } + } + + geno_mean /= (double)(ni_test - n_miss); + geno_var += geno_mean * geno_mean * (double)n_miss; + geno_var /= (double)ni_test; + geno_var -= geno_mean * geno_mean; + + for (size_t i = 0; i < ni_test; ++i) { + d = gsl_vector_get(geno, i); + if (d == -9.0) { + gsl_vector_set(geno, i, geno_mean); + } + } + + gsl_vector_add_constant(geno, -1.0 * geno_mean); + + gsl_blas_dgemv(CblasTrans, 1.0, W, geno, 0.0, Wtx); + gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); + gsl_blas_dgemv(CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno); + gsl_blas_ddot(geno, geno, &geno_var); + geno_var /= (double)ni_test; + + if (geno_var != 0 && + (mapRS2weight.size() == 0 || mapRS2weight.count(rs) != 0)) { + if (mapRS2weight.size() == 0) { + d = 1.0 / geno_var; + } else { + d = mapRS2weight.at(rs) / geno_var; + } + + gsl_vector_scale(geno, sqrt(d)); + if (n_vc == 1 || mapRS2cat.size() == 0) { + gsl_vector_view Xlarge_col = + gsl_matrix_column(Xlarge, ns_vec[0] % msize); + gsl_vector_memcpy(&Xlarge_col.vector, geno); + ns_vec[0]++; + + if (ns_vec[0] % msize == 0) { + fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); + gsl_matrix_set_zero(Xlarge); + } + } else if (mapRS2cat.count(rs) != 0) { + i_vc = mapRS2cat.at(rs); + + gsl_vector_view Xlarge_col = + gsl_matrix_column(Xlarge, msize * i_vc + ns_vec[i_vc] % msize); + gsl_vector_memcpy(&Xlarge_col.vector, geno); + + ns_vec[i_vc]++; + + if (ns_vec[i_vc] % msize == 0) { + gsl_matrix_view X_sub = + gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); + gsl_matrix_view kin_sub = gsl_matrix_submatrix( + matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); + fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + &kin_sub.matrix); + + gsl_matrix_set_zero(&X_sub.matrix); + } + } + } + ns_test++; + } + + for (size_t i_vc = 0; i_vc < n_vc; i_vc++) { + if (ns_vec[i_vc] % msize != 0) { + gsl_matrix_view X_sub = + gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); + gsl_matrix_view kin_sub = + gsl_matrix_submatrix(matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); + fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, + &kin_sub.matrix); + } + } + + cout << endl; + + for (size_t t = 0; t < n_vc; t++) { + gsl_vector_set(vector_ns, t, ns_vec[t]); + + for (size_t i = 0; i < ni_test; ++i) { + for (size_t j = 0; j <= i; ++j) { + d = gsl_matrix_get(matrix_kin, j, i + ni_test * t); + d /= (double)ns_vec[t]; + gsl_matrix_set(matrix_kin, i, j + ni_test * t, d); + gsl_matrix_set(matrix_kin, j, i + ni_test * t, d); + } + } + } + + gsl_vector_free(geno); + + gsl_vector_free(Wtx); + gsl_matrix_free(WtW); + gsl_matrix_free(WtWi); + gsl_vector_free(WtWiWtx); + gsl_permutation_free(pmt); + + gsl_matrix_free(Xlarge); + + infile.close(); + infile.clear(); + + return true; +} + +bool MFILEKin(const size_t mfile_mode, const string &file_mfile, + const set setKSnps, const int display_pace, + const vector &indicator_idv, + const vector> &mindicator_snp, + const map &mapRS2weight, + const map &mapRS2cat, + const vector> &msnpInfo, const gsl_matrix *W, + gsl_matrix *matrix_kin, gsl_vector *vector_ns) { + debug_msg("entered"); + size_t n_vc = vector_ns->size, ni_test = matrix_kin->size1; + gsl_matrix_set_zero(matrix_kin); + gsl_vector_set_zero(vector_ns); + + igzstream infile(file_mfile.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open mfile file: " << file_mfile << endl; + return false; + } + + string file_name; + + gsl_matrix *kin_tmp = gsl_matrix_safe_alloc(matrix_kin->size1, matrix_kin->size2); + gsl_vector *ns_tmp = gsl_vector_safe_alloc(vector_ns->size); + + size_t l = 0; + double d; + while (!safeGetline(infile, file_name).eof()) { + gsl_matrix_set_zero(kin_tmp); + gsl_vector_set_zero(ns_tmp); + + if (mfile_mode == 1) { + file_name += ".bed"; + PlinkKin(file_name, display_pace, indicator_idv, mindicator_snp[l], + mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); + } else { + BimbamKinUncentered(file_name, setKSnps, display_pace, indicator_idv, + mindicator_snp[l], mapRS2weight, mapRS2cat, + msnpInfo[l], W, kin_tmp, ns_tmp); + } + + // Add ns. + gsl_vector_add(vector_ns, ns_tmp); + + // Add kin. + for (size_t t = 0; t < n_vc; t++) { + for (size_t i = 0; i < ni_test; ++i) { + for (size_t j = 0; j <= i; ++j) { + d = gsl_matrix_get(matrix_kin, j, i + ni_test * t) + + gsl_matrix_get(kin_tmp, j, i + ni_test * t) * + gsl_vector_get(ns_tmp, t); + + gsl_matrix_set(matrix_kin, i, j + ni_test * t, d); + gsl_matrix_set(matrix_kin, j, i + ni_test * t, d); + } + } + } + l++; + } + + // Renormalize kin. + for (size_t t = 0; t < n_vc; t++) { + for (size_t i = 0; i < ni_test; ++i) { + for (size_t j = 0; j <= i; ++j) { + d = gsl_matrix_get(matrix_kin, j, i + ni_test * t) / + gsl_vector_get(vector_ns, t); + + gsl_matrix_set(matrix_kin, i, j + ni_test * t, d); + gsl_matrix_set(matrix_kin, j, i + ni_test * t, d); + } + } + } + cout << endl; + + infile.close(); + infile.clear(); + + gsl_matrix_free(kin_tmp); + gsl_vector_free(ns_tmp); + + return true; +} + +// Read var file, store mapRS2wsnp. +bool ReadFile_wsnp(const string &file_wsnp, map &mapRS2weight) { + debug_msg("entered"); + mapRS2weight.clear(); + + igzstream infile(file_wsnp.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open snp weight file: " << file_wsnp << endl; + return false; + } + + char *ch_ptr; + string line, rs; + double weight; + + while (!safeGetline(infile, line).eof()) { + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + rs = ch_ptr; + ch_ptr = strtok_safe(NULL, " , \t"); + weight = atof(ch_ptr); + mapRS2weight[rs] = weight; + } + + return true; +} + +bool ReadFile_wsnp(const string &file_wcat, const size_t n_vc, + map> &mapRS2wvector) { + debug_msg("entered"); + mapRS2wvector.clear(); + + igzstream infile(file_wcat.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open snp weight file: " << file_wcat << endl; + return false; + } + + char *ch_ptr; + vector weight; + for (size_t i = 0; i < n_vc; i++) { + weight.push_back(0.0); + } + + string line, rs, chr, a1, a0, pos, cm; + + // Read header. + HEADER header; + safeGetline(infile, line).eof(); + ReadHeader_io(line, header); + + while (!safeGetline(infile, line).eof()) { + if (isBlankLine(line)) { + continue; + } + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + + size_t t = 0; + for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); + if (header.rs_col != 0 && header.rs_col == i + 1) { + rs = ch_ptr; + } else if (header.chr_col != 0 && header.chr_col == i + 1) { + chr = ch_ptr; + } else if (header.pos_col != 0 && header.pos_col == i + 1) { + pos = ch_ptr; + } else if (header.cm_col != 0 && header.cm_col == i + 1) { + cm = ch_ptr; + } else if (header.a1_col != 0 && header.a1_col == i + 1) { + a1 = ch_ptr; + } else if (header.a0_col != 0 && header.a0_col == i + 1) { + a0 = ch_ptr; + } else { + weight[t] = atof(ch_ptr); + t++; + if (t > n_vc) { + cout << "error! Number of columns in the wcat file does not " + << "match that of cat file."; + return false; + } + } + + ch_ptr = strtok(NULL, " , \t"); + } + + if (t != n_vc) { + cout << "error! Number of columns in the wcat file does not " + << "match that of cat file."; + return false; + } + + if (header.rs_col == 0) { + rs = chr + ":" + pos; + } + + mapRS2wvector[rs] = weight; + } + + return true; +} + +// Read the beta file, save snp z scores in to z2_score, and save +// category into indicator_snp based on mapRS2var and set, and +// indicator_snp record the category number (from 1 to n_vc), and +// provide var if maf/var is not provided in the beta file notice that +// indicator_snp contains ns_test snps, instead of ns_total snps read +// the beta file for the second time, compute q, and Vq based on block +// jacknife use the mapRS2var to select snps (and to ), calculate q do +// a block-wise jacknife, and compute Vq +void ReadFile_beta(const string &file_beta, + const map &mapRS2cat, + const map &mapRS2wA, vector &vec_cat, + vector &vec_ni, vector &vec_weight, + vector &vec_z2, size_t &ni_total, size_t &ns_total, + size_t &ns_test) { + debug_msg("entered"); + vec_cat.clear(); + vec_ni.clear(); + vec_weight.clear(); + vec_z2.clear(); + ni_total = 0; + ns_total = 0; + ns_test = 0; + + igzstream infile(file_beta.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open beta file: " << file_beta << endl; + return; + } + + string line; + char *ch_ptr; + string type; + + string rs, chr, a1, a0, pos, cm; + double z = 0, beta = 0, se_beta = 0, pvalue = 0, zsquare = 0; // af = 0; + size_t n_total = 0, n_mis = 0, n_obs = 0, n_case = 0, n_control = 0; + + // Read header. + HEADER header; + safeGetline(infile, line).eof(); + ReadHeader_io(line, header); + + if (header.n_col == 0) { + if ((header.nobs_col == 0 && header.nmis_col == 0) && + (header.ncase_col == 0 && header.ncontrol_col == 0)) { + cout << "error! missing sample size in the beta file." << endl; + } else { + cout << "total sample size will be replaced by obs/mis sample size." + << endl; + } + } + + if (header.z_col == 0 && (header.beta_col == 0 || header.sebeta_col == 0) && + header.chisq_col == 0 && header.p_col == 0) { + cout << "error! missing z scores in the beta file." << endl; + } + + while (!safeGetline(infile, line).eof()) { + if (isBlankLine(line)) { + continue; + } + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + + z = 0; + beta = 0; + se_beta = 0; + auto chisq = 0.0; + pvalue = 0; + n_total = 0; + n_mis = 0; + n_obs = 0; + n_case = 0; + n_control = 0; + // af = 0; + // auto var_x = 0.0; + for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); + if (header.rs_col != 0 && header.rs_col == i + 1) { + rs = ch_ptr; + } + if (header.chr_col != 0 && header.chr_col == i + 1) { + chr = ch_ptr; + } + if (header.pos_col != 0 && header.pos_col == i + 1) { + pos = ch_ptr; + } + if (header.cm_col != 0 && header.cm_col == i + 1) { + cm = ch_ptr; + } + if (header.a1_col != 0 && header.a1_col == i + 1) { + a1 = ch_ptr; + } + if (header.a0_col != 0 && header.a0_col == i + 1) { + a0 = ch_ptr; + } + + if (header.z_col != 0 && header.z_col == i + 1) { + z = atof(ch_ptr); + } + if (header.beta_col != 0 && header.beta_col == i + 1) { + beta = atof(ch_ptr); + } + if (header.sebeta_col != 0 && header.sebeta_col == i + 1) { + se_beta = atof(ch_ptr); + } + if (header.chisq_col != 0 && header.chisq_col == i + 1) { + chisq = atof(ch_ptr); + } + if (header.p_col != 0 && header.p_col == i + 1) { + pvalue = atof(ch_ptr); + } + + if (header.n_col != 0 && header.n_col == i + 1) { + n_total = atoi(ch_ptr); + } + if (header.nmis_col != 0 && header.nmis_col == i + 1) { + n_mis = atoi(ch_ptr); + } + if (header.nobs_col != 0 && header.nobs_col == i + 1) { + n_obs = atoi(ch_ptr); + } + if (header.ncase_col != 0 && header.ncase_col == i + 1) { + n_case = atoi(ch_ptr); + } + if (header.ncontrol_col != 0 && header.ncontrol_col == i + 1) { + n_control = atoi(ch_ptr); + } + // if (header.af_col != 0 && header.af_col == i + 1) { + // af = atof(ch_ptr); + // } + // if (header.var_col != 0 && header.var_col == i + 1) { + // var_x = atof(ch_ptr); + // } + + ch_ptr = strtok(NULL, " , \t"); + } + + if (header.rs_col == 0) { + rs = chr + ":" + pos; + } + + if (header.n_col == 0) { + if (header.nmis_col != 0 && header.nobs_col != 0) { + n_total = n_mis + n_obs; + } else { + n_total = n_case + n_control; + } + } + + // Both z values and beta/se_beta have directions, while + // chisq/pvalue do not. + if (header.z_col != 0) { + zsquare = z * z; + } else if (header.beta_col != 0 && header.sebeta_col != 0) { + z = beta / se_beta; + zsquare = z * z; + } else if (header.chisq_col != 0) { + zsquare = chisq; + } else if (header.p_col != 0) { + zsquare = gsl_cdf_chisq_Qinv(pvalue, 1); + } else { + zsquare = 0; + } + + // Obtain var_x. + // if (header.var_col == 0 && header.af_col != 0) { + // var_x = 2.0 * af * (1.0 - af); + // } + + // If the SNP is also present in cor file, then do calculations. + if ((mapRS2wA.size() == 0 || mapRS2wA.count(rs) != 0) && + (mapRS2cat.size() == 0 || mapRS2cat.count(rs) != 0) && zsquare != 0) { + if (mapRS2cat.size() != 0) { + vec_cat.push_back(mapRS2cat.at(rs)); + } else { + vec_cat.push_back(0); + } + vec_ni.push_back(n_total); + if (mapRS2wA.size() == 0) { + vec_weight.push_back(1); + } else { + vec_weight.push_back(mapRS2wA.at(rs)); + } + vec_z2.push_back(zsquare); + + ni_total = max(ni_total, n_total); + ns_test++; + } + + ns_total++; + } + + infile.clear(); + infile.close(); + + return; +} + +void ReadFile_beta(const string &file_beta, const map &mapRS2wA, + map &mapRS2A1, + map &mapRS2z) { + debug_msg("entered"); + mapRS2A1.clear(); + mapRS2z.clear(); + + igzstream infile(file_beta.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open beta file: " << file_beta << endl; + return; + } + + string line; + char *ch_ptr; + string type; + + string rs, chr, a1, a0, pos, cm; + double z = 0, beta = 0, se_beta = 0; // pvalue = 0, chisq=0, af = 0 , var_x = 0; + size_t n_total = 0, n_mis = 0, n_obs = 0, n_case = 0, n_control = 0; + size_t ni_total = 0, ns_total = 0, ns_test = 0; + + // Read header. + HEADER header; + safeGetline(infile, line).eof(); + ReadHeader_io(line, header); + + if (header.n_col == 0) { + if ((header.nobs_col == 0 && header.nmis_col == 0) && + (header.ncase_col == 0 && header.ncontrol_col == 0)) { + cout << "error! missing sample size in the beta file." << endl; + } else { + cout << "total sample size will be replaced by obs/mis sample size." + << endl; + } + } + + if (header.z_col == 0 && (header.beta_col == 0 || header.sebeta_col == 0)) { + cout << "error! missing z scores in the beta file." << endl; + } + + while (!safeGetline(infile, line).eof()) { + if (isBlankLine(line)) { + continue; + } + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + + z = 0; + beta = 0; + se_beta = 0; + // chisq = 0; + // pvalue = 0; + n_total = 0; + n_mis = 0; + n_obs = 0; + n_case = 0; + n_control = 0; + // af = 0; + // double var_x = 0; + for (size_t i = 0; i < header.coln; i++) { + enforce(ch_ptr); + if (header.rs_col != 0 && header.rs_col == i + 1) { + rs = ch_ptr; + } + if (header.chr_col != 0 && header.chr_col == i + 1) { + chr = ch_ptr; + } + if (header.pos_col != 0 && header.pos_col == i + 1) { + pos = ch_ptr; + } + if (header.cm_col != 0 && header.cm_col == i + 1) { + cm = ch_ptr; + } + if (header.a1_col != 0 && header.a1_col == i + 1) { + a1 = ch_ptr; + } + if (header.a0_col != 0 && header.a0_col == i + 1) { + a0 = ch_ptr; + } + + if (header.z_col != 0 && header.z_col == i + 1) { + z = atof(ch_ptr); + } + if (header.beta_col != 0 && header.beta_col == i + 1) { + beta = atof(ch_ptr); + } + if (header.sebeta_col != 0 && header.sebeta_col == i + 1) { + se_beta = atof(ch_ptr); + } + // if (header.chisq_col != 0 && header.chisq_col == i + 1) { + // chisq = atof(ch_ptr); + // } + // if (header.p_col != 0 && header.p_col == i + 1) { + // pvalue = atof(ch_ptr); + // } + + if (header.n_col != 0 && header.n_col == i + 1) { + n_total = atoi(ch_ptr); + } + if (header.nmis_col != 0 && header.nmis_col == i + 1) { + n_mis = atoi(ch_ptr); + } + if (header.nobs_col != 0 && header.nobs_col == i + 1) { + n_obs = atoi(ch_ptr); + } + if (header.ncase_col != 0 && header.ncase_col == i + 1) { + n_case = atoi(ch_ptr); + } + if (header.ncontrol_col != 0 && header.ncontrol_col == i + 1) { + n_control = atoi(ch_ptr); + } + + // if (header.af_col != 0 && header.af_col == i + 1) { + // af = atof(ch_ptr); + // } + + // if (header.var_col != 0 && header.var_col == i + 1) { + // var_x = atof(ch_ptr); + // } + + ch_ptr = strtok(NULL, " , \t"); + } + + if (header.rs_col == 0) { + rs = chr + ":" + pos; + } + + if (header.n_col == 0) { + if (header.nmis_col != 0 && header.nobs_col != 0) { + n_total = n_mis + n_obs; + } else { + n_total = n_case + n_control; + } + } + + // Both z values and beta/se_beta have directions, while + // chisq/pvalue do not. + if (header.z_col != 0) { + z = z; + } else if (header.beta_col != 0 && header.sebeta_col != 0) { + z = beta / se_beta; + } else { + z = 0; + } + + // If the snp is also present in cor file, then do calculations. + if ((mapRS2wA.size() == 0 || mapRS2wA.count(rs) != 0)) { + mapRS2z[rs] = z; + mapRS2A1[rs] = a1; + + ni_total = max(ni_total, n_total); + ns_test++; + } + + ns_total++; + } + + infile.clear(); + infile.close(); + + return; +} + +void Calcq(const size_t n_block, const vector &vec_cat, + const vector &vec_ni, const vector &vec_weight, + const vector &vec_z2, gsl_matrix *Vq, gsl_vector *q, + gsl_vector *s) { + debug_msg("entered"); + gsl_matrix_set_zero(Vq); + gsl_vector_set_zero(q); + gsl_vector_set_zero(s); + + size_t cat, n_total; + double w, zsquare; + + vector vec_q, vec_s, n_snps; + for (size_t i = 0; i < q->size; i++) { + vec_q.push_back(0.0); + vec_s.push_back(0.0); + n_snps.push_back(0.0); + } + + vector> mat_q, mat_s; + for (size_t i = 0; i < n_block; i++) { + mat_q.push_back(vec_q); + mat_s.push_back(vec_s); + } + + // Compute q and s. + for (size_t i = 0; i < vec_cat.size(); i++) { + + // Extract quantities. + cat = vec_cat[i]; + n_total = vec_ni[i]; + w = vec_weight[i]; + zsquare = vec_z2[i]; + + // Compute q and s. + vec_q[cat] += (zsquare - 1.0) * w / (double)n_total; + vec_s[cat] += w; + n_snps[cat]++; + } + + // Update q; vec_q is used again for computing Vq below. + for (size_t i = 0; i < q->size; i++) { + if (vec_s[i] != 0) { + gsl_vector_set(q, i, vec_q[i] / vec_s[i]); + } + gsl_vector_set(s, i, vec_s[i]); + } + + // Compute Vq; divide SNPs in each category into evenly distributed + // blocks. + size_t t = 0, b = 0, n_snp = 0; + double d, m, n; + for (size_t l = 0; l < q->size; l++) { + n_snp = floor(n_snps[l] / n_block); + t = 0; + b = 0; + if (n_snp == 0) { + continue; + } + + // Initiate everything to zero. + for (size_t i = 0; i < n_block; i++) { + for (size_t j = 0; j < q->size; j++) { + mat_q[i][j] = 0; + mat_s[i][j] = 0; + } + } + + // Record values. + for (size_t i = 0; i < vec_cat.size(); i++) { + + // Extract quantities. + cat = vec_cat[i]; + n_total = vec_ni[i]; + w = vec_weight[i]; + zsquare = vec_z2[i]; + + // Save quantities for computing Vq (which is not divided by + // n_total). + mat_q[b][cat] += (zsquare - 1.0) * w; + mat_s[b][cat] += w; + + if (cat == l) { + if (b < n_block - 1) { + if (t < n_snp - 1) { + t++; + } else { + b++; + t = 0; + } + } else { + t++; + } + } + } + + // Center mat_q. + for (size_t i = 0; i < q->size; i++) { + m = 0; + n = 0; + for (size_t k = 0; k < n_block; k++) { + if (mat_s[k][i] != 0 && vec_s[i] != mat_s[k][i]) { + d = (vec_q[i] - mat_q[k][i]) / (vec_s[i] - mat_s[k][i]); + mat_q[k][i] = d; + m += d; + n++; + } + } + if (n != 0) { + m /= n; + } + + for (size_t k = 0; k < n_block; k++) { + if (mat_q[k][i] != 0) { + mat_q[k][i] -= m; + } + } + } + + // Compute Vq for l'th row and l'th column only. + for (size_t i = 0; i < q->size; i++) { + d = 0; + n = 0; + for (size_t k = 0; k < n_block; k++) { + if (mat_q[k][l] != 0 && mat_q[k][i] != 0) { + d += mat_q[k][l] * mat_q[k][i]; + n++; + } + } + if (n != 0) { + d /= n; + d *= n - 1; + } + d += gsl_matrix_get(Vq, i, l); + gsl_matrix_set(Vq, i, l, d); + if (i != l) { + gsl_matrix_set(Vq, l, i, d); + } + } + } + + // divide the off diagonal elements of Vq by 2 + for (size_t i = 0; i < q->size; i++) { + for (size_t j = i; j < q->size; j++) { + if (i == j) { + continue; + } + d = gsl_matrix_get(Vq, i, j); + gsl_matrix_set(Vq, i, j, d / 2); + gsl_matrix_set(Vq, j, i, d / 2); + } + } + + return; +} + +// Read vector file. +void ReadFile_vector(const string &file_vec, gsl_vector *vec) { + debug_msg("entered"); + igzstream infile(file_vec.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open vector file: " << file_vec << endl; + return; + } + + string line; + char *ch_ptr; + + for (size_t i = 0; i < vec->size; i++) { + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + gsl_vector_set(vec, i, atof(ch_ptr)); + } + + infile.clear(); + infile.close(); + + return; +} + +void ReadFile_matrix(const string &file_mat, gsl_matrix *mat) { + debug_msg("entered"); + igzstream infile(file_mat.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open matrix file: " << file_mat << endl; + return; + } + + string line; + char *ch_ptr; + + for (size_t i = 0; i < mat->size1; i++) { + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + for (size_t j = 0; j < mat->size2; j++) { + enforce(ch_ptr); + gsl_matrix_set(mat, i, j, atof(ch_ptr)); + ch_ptr = strtok(NULL, " , \t"); + } + } + + infile.clear(); + infile.close(); + + return; +} + +void ReadFile_matrix(const string &file_mat, gsl_matrix *mat1, + gsl_matrix *mat2) { + debug_msg("entered"); + igzstream infile(file_mat.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open matrix file: " << file_mat << endl; + return; + } + + string line; + char *ch_ptr; + + for (size_t i = 0; i < mat1->size1; i++) { + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + for (size_t j = 0; j < mat1->size2; j++) { + enforce(ch_ptr); + gsl_matrix_set(mat1, i, j, atof(ch_ptr)); + ch_ptr = strtok(NULL, " , \t"); + } + } + + for (size_t i = 0; i < mat2->size1; i++) { + safeGetline(infile, line).eof(); + ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); + for (size_t j = 0; j < mat2->size2; j++) { + enforce(ch_ptr); + gsl_matrix_set(mat2, i, j, atof(ch_ptr)); + ch_ptr = strtok(NULL, " , \t"); + } + } + + infile.clear(); + infile.close(); + + return; +} + +// Read study file. +void ReadFile_study(const string &file_study, gsl_matrix *Vq_mat, + gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) { + debug_msg("entered"); + string Vqfile = file_study + ".Vq.txt"; + string sfile = file_study + ".size.txt"; + string qfile = file_study + ".q.txt"; + + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); + + ReadFile_matrix(Vqfile, Vq_mat); + ReadFile_vector(sfile, s); + ReadFile_vector(qfile, q_vec); + + double d; + for (size_t i = 0; i < s_vec->size; i++) { + d = gsl_vector_get(s, i); + gsl_vector_set(s_vec, i, d); + } + ni = gsl_vector_get(s, s_vec->size); + + gsl_vector_free(s); + + return; +} + +// Read reference file. +void ReadFile_ref(const string &file_ref, gsl_matrix *S_mat, + gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) { + debug_msg("entered"); + string sfile = file_ref + ".size.txt"; + string Sfile = file_ref + ".S.txt"; + + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); + + ReadFile_vector(sfile, s); + ReadFile_matrix(Sfile, S_mat, Svar_mat); + + double d; + for (size_t i = 0; i < s_vec->size; i++) { + d = gsl_vector_get(s, i); + gsl_vector_set(s_vec, i, d); + } + ni = gsl_vector_get(s, s_vec->size); + + gsl_vector_free(s); + + return; +} + +// Read mstudy file. +void ReadFile_mstudy(const string &file_mstudy, gsl_matrix *Vq_mat, + gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) { + debug_msg("entered"); + gsl_matrix_set_zero(Vq_mat); + gsl_vector_set_zero(q_vec); + gsl_vector_set_zero(s_vec); + ni = 0; + + gsl_matrix *Vq_sub = gsl_matrix_safe_alloc(Vq_mat->size1, Vq_mat->size2); + gsl_vector *q_sub = gsl_vector_safe_alloc(q_vec->size); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); + + igzstream infile(file_mstudy.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open mstudy file: " << file_mstudy << endl; + return; + } + + string file_name; + double d1, d2, d; + + while (!safeGetline(infile, file_name).eof()) { + string Vqfile = file_name + ".Vq.txt"; + string sfile = file_name + ".size.txt"; + string qfile = file_name + ".q.txt"; + + ReadFile_matrix(Vqfile, Vq_sub); + ReadFile_vector(sfile, s); + ReadFile_vector(qfile, q_sub); + + ni = max(ni, (size_t)gsl_vector_get(s, s_vec->size)); + + for (size_t i = 0; i < s_vec->size; i++) { + d1 = gsl_vector_get(s, i); + if (d1 == 0) { + continue; + } + + d = gsl_vector_get(q_vec, i) + gsl_vector_get(q_sub, i) * d1; + gsl_vector_set(q_vec, i, d); + + d = gsl_vector_get(s_vec, i) + d1; + gsl_vector_set(s_vec, i, d); + + for (size_t j = i; j < s_vec->size; j++) { + d2 = gsl_vector_get(s, j); + if (d2 == 0) { + continue; + } + + d = gsl_matrix_get(Vq_mat, i, j) + + gsl_matrix_get(Vq_sub, i, j) * d1 * d2; + gsl_matrix_set(Vq_mat, i, j, d); + if (i != j) { + gsl_matrix_set(Vq_mat, j, i, d); + } + } + } + } + + for (size_t i = 0; i < s_vec->size; i++) { + d1 = gsl_vector_get(s_vec, i); + if (d1 == 0) { + continue; + } + + d = gsl_vector_get(q_vec, i); + gsl_vector_set(q_vec, i, d / d1); + + for (size_t j = i; j < s_vec->size; j++) { + d2 = gsl_vector_get(s_vec, j); + if (d2 == 0) { + continue; + } + + d = gsl_matrix_get(Vq_mat, i, j) / (d1 * d2); + gsl_matrix_set(Vq_mat, i, j, d); + if (i != j) { + gsl_matrix_set(Vq_mat, j, i, d); + } + } + } + + gsl_matrix_free(Vq_sub); + gsl_vector_free(q_sub); + gsl_vector_free(s); + + return; +} + +// Read reference file. +void ReadFile_mref(const string &file_mref, gsl_matrix *S_mat, + gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) { + debug_msg("entered"); + gsl_matrix_set_zero(S_mat); + gsl_matrix_set_zero(Svar_mat); + gsl_vector_set_zero(s_vec); + ni = 0; + + gsl_matrix *S_sub = gsl_matrix_safe_alloc(S_mat->size1, S_mat->size2); + gsl_matrix *Svar_sub = gsl_matrix_safe_alloc(Svar_mat->size1, Svar_mat->size2); + gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); + + igzstream infile(file_mref.c_str(), igzstream::in); + if (!infile) { + cout << "error! fail to open mref file: " << file_mref << endl; + return; + } + + string file_name; + double d1, d2, d; + + while (!safeGetline(infile, file_name).eof()) { + string sfile = file_name + ".size.txt"; + string Sfile = file_name + ".S.txt"; + + ReadFile_vector(sfile, s); + ReadFile_matrix(Sfile, S_sub, Svar_sub); + + // Update s_vec and ni. + for (size_t i = 0; i < s_vec->size; i++) { + d = gsl_vector_get(s, i) + gsl_vector_get(s_vec, i); + gsl_vector_set(s_vec, i, d); + } + ni = max(ni, (size_t)gsl_vector_get(s, s_vec->size)); + + // Update S and Svar from each file. + for (size_t i = 0; i < S_mat->size1; i++) { + d1 = gsl_vector_get(s, i); + for (size_t j = 0; j < S_mat->size2; j++) { + d2 = gsl_vector_get(s, j); + + d = gsl_matrix_get(S_sub, i, j) * d1 * d2; + gsl_matrix_set(S_sub, i, j, d); + d = gsl_matrix_get(Svar_sub, i, j) * d1 * d2 * d1 * d2; + gsl_matrix_set(Svar_sub, i, j, d); + } + } + + gsl_matrix_add(S_mat, S_sub); + gsl_matrix_add(Svar_mat, Svar_sub); + } + + // Final: update S and Svar. + for (size_t i = 0; i < S_mat->size1; i++) { + d1 = gsl_vector_get(s_vec, i); + if (d1 == 0) { + continue; + } + for (size_t j = i; j < S_mat->size2; j++) { + d2 = gsl_vector_get(s_vec, j); + if (d2 == 0) { + continue; + } + + d = gsl_matrix_get(S_mat, i, j) / (d1 * d2); + gsl_matrix_set(S_mat, i, j, d); + if (i != j) { + gsl_matrix_set(S_mat, j, i, d); + } + + d = gsl_matrix_get(Svar_mat, i, j) / (d1 * d2 * d1 * d2); + gsl_matrix_set(Svar_mat, i, j, d); + if (i != j) { + gsl_matrix_set(Svar_mat, j, i, d); + } + } + } + + // Free matrices. + gsl_matrix_free(S_sub); + gsl_matrix_free(Svar_sub); + gsl_vector_free(s); + + return; +} diff --git a/src/gemma_io.h b/src/gemma_io.h new file mode 100644 index 0000000..dd1d5c0 --- /dev/null +++ b/src/gemma_io.h @@ -0,0 +1,187 @@ +/* + Genome-wide Efficient Mixed Model Association (GEMMA) + Copyright © 2011-2017, Xiang Zhou + Copyright © 2017, Peter Carbonetto + Copyright © 2017, Pjotr Prins + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __IO_H__ +#define __IO_H__ + +#include "gsl/gsl_matrix.h" +#include "gsl/gsl_vector.h" +#include +#include +#include + +#include "gzstream.h" +#include "param.h" + +#define tab(col) ( col ? "\t" : "") + +using namespace std; + +void ProgressBar(string str, double p, double total, double ratio = -1.0); + +std::istream &safeGetline(std::istream &is, std::string &t); + +bool ReadFile_snps(const string file_snps, set &setSnps); +bool ReadFile_snps_header(const string &file_snps, set &setSnps); +bool ReadFile_log(const string &file_log, double &pheno_mean); + +bool ReadFile_bim(const string &file_bim, vector &snpInfo); +bool ReadFile_fam(const string &file_fam, vector> &indicator_pheno, + vector> &pheno, map &mapID2num, + const vector &p_column); + +bool ReadFile_cvt(const string &file_cvt, vector &indicator_cvt, + vector> &cvt, size_t &n_cvt); +bool ReadFile_anno(const string &file_bim, map &mapRS2chr, + map &mapRS2bp, + map &mapRS2cM); +bool ReadFile_pheno(const string &file_pheno, + vector> &indicator_pheno, + vector> &pheno, + const vector &p_column); +bool ReadFile_column(const string &file_pheno, vector &indicator_idv, + vector &pheno, const int &p_column); + +bool ReadFile_geno(const string &file_geno, const set &setSnps, + const gsl_matrix *W, vector &indicator_idv, + vector &indicator_snp, const double &maf_level, + const double &miss_level, const double &hwe_level, + const double &r2_level, map &mapRS2chr, + map &mapRS2bp, + map &mapRS2cM, vector &snpInfo, + size_t &ns_test); +bool ReadFile_bed(const string &file_bed, const set &setSnps, + const gsl_matrix *W, vector &indicator_idv, + vector &indicator_snp, vector &snpInfo, + const double &maf_level, const double &miss_level, + const double &hwe_level, const double &r2_level, + size_t &ns_test); +bool Bimbam_ReadOneSNP(const size_t inc, const vector &indicator_idv, + igzstream &infile, gsl_vector *geno, double &geno_mean); +void Plink_ReadOneSNP(const int pos, const vector &indicator_idv, + ifstream &infile, gsl_vector *geno, double &geno_mean); + +void ReadFile_kin(const string &file_kin, vector &indicator_idv, + map &mapID2num, const size_t k_mode, bool &error, + gsl_matrix *G); +void ReadFile_mk(const string &file_mk, vector &indicator_idv, + map &mapID2num, const size_t k_mode, bool &error, + gsl_matrix *G); +void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); +void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); + +bool BimbamKin(const string file_geno, const set ksnps, + vector &indicator_snp, const int k_mode, + const int display_pace, gsl_matrix *matrix_kin, + const bool test_nind); +bool PlinkKin(const string &file_bed, vector &indicator_snp, + const int k_mode, const int display_pace, gsl_matrix *matrix_kin); + +bool ReadFile_geno(const string file_geno, vector &indicator_idv, + vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, + const bool calc_K); +bool ReadFile_bed(const string &file_bed, vector &indicator_idv, + vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, + const bool calc_K); +bool ReadFile_geno(const string &file_geno, vector &indicator_idv, + vector &indicator_snp, + vector> &Xt, gsl_matrix *K, + const bool calc_K, const size_t ni_test, + const size_t ns_test); +bool ReadFile_bed(const string &file_bed, vector &indicator_idv, + vector &indicator_snp, vector> &Xt, + gsl_matrix *K, const bool calc_K, const size_t ni_test, + const size_t ns_test); + +bool ReadFile_est(const string &file_est, const vector &est_column, + map &mapRS2est); + +bool CountFileLines(const string &file_input, size_t &n_lines); + +bool ReadFile_gene(const string &file_gene, vector &vec_read, + vector &snpInfo, size_t &ng_total); + +bool ReadHeader_io(const string &line, HEADER &header); +bool ReadFile_cat(const string &file_cat, map &mapRS2cat, + size_t &n_vc); +bool ReadFile_mcat(const string &file_mcat, map &mapRS2cat, + size_t &n_vc); + +bool ReadFile_catc(const string &file_cat, + map> &mapRS2catc, size_t &n_cat); +bool ReadFile_mcatc(const string &file_mcat, + map> &mapRS2catc, size_t &n_cat); + +bool BimbamKinUncentered(const string &file_geno, const set ksnps, + const int display_pace, + const vector &indicator_idv, + const vector &indicator_snp, + const map &mapRS2weight, + const map &mapRS2cat, + const vector &snpInfo, const gsl_matrix *W, + gsl_matrix *matrix_kin, gsl_vector *vector_ns); +bool PlinkKin(const string &file_bed, const int display_pace, + const vector &indicator_idv, + const vector &indicator_snp, + const map &mapRS2weight, + const map &mapRS2cat, + const vector &snpInfo, const gsl_matrix *W, + gsl_matrix *matrix_kin, gsl_vector *vector_ns); +bool MFILEKin(const size_t mfile_mode, const string &file_mfile, + const set setKSnps, const int display_pace, + const vector &indicator_idv, + const vector> &mindicator_snp, + const map &mapRS2weight, + const map &mapRS2cat, + const vector> &msnpInfo, const gsl_matrix *W, + gsl_matrix *matrix_kin, gsl_vector *vector_ns); + +bool ReadFile_wsnp(const string &file_wsnp, map &mapRS2double); +bool ReadFile_wsnp(const string &file_wcat, const size_t n_vc, + map> &mapRS2vector); + +void ReadFile_beta(const string &file_beta, + const map &mapRS2cat, + const map &mapRS2wA, vector &vec_cat, + vector &vec_ni, vector &vec_weight, + vector &vec_z2, size_t &ni_total, size_t &ns_total, + size_t &ns_test); +void ReadFile_beta(const string &file_beta, const map &mapRS2wA, + map &mapRS2A1, map &mapRS2z); +void Calcq(const size_t n_block, const vector &vec_cat, + const vector &vec_ni, const vector &vec_weight, + const vector &vec_z2, gsl_matrix *Vq, gsl_vector *q, + gsl_vector *s); + +void ReadFile_study(const string &file_study, gsl_matrix *Vq, gsl_vector *q_vec, + gsl_vector *s_vec, size_t &ni); +void ReadFile_ref(const string &file_ref, gsl_matrix *S_mat, + gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); +void ReadFile_mstudy(const string &file_mstudy, gsl_matrix *Vq, + gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni); +void ReadFile_mref(const string &file_mref, gsl_matrix *S_mat, + gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); +bool ReadFile_sample(const string &file_sample, + vector> &indicator_pheno, + vector> &pheno, + const vector &p_column, vector &indicator_cvt, + vector> &cvt, size_t &n_cvt); + +#endif diff --git a/src/io.cpp b/src/io.cpp deleted file mode 100644 index d20b473..0000000 --- a/src/io.cpp +++ /dev/null @@ -1,4080 +0,0 @@ -/* - Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright © 2011-2017, Xiang Zhou - Copyright © 2017, Peter Carbonetto - Copyright © 2017, Pjotr Prins - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gsl/gsl_blas.h" -#include "gsl/gsl_cdf.h" -#include "gsl/gsl_linalg.h" -#include "gsl/gsl_matrix.h" -#include "gsl/gsl_vector.h" - -#include "debug.h" -// #include "eigenlib.h" -#include "fastblas.h" -#include "gzstream.h" -#include "io.h" -#include "lapack.h" -#include "mathfunc.h" - -using namespace std; - -// Print progress bar. -void ProgressBar(string str, double p, double total, double ratio) { - assert(p<=total); - assert(p>=0); - if (total <= 0.0) return; - const double progress = (100.0 * p / total); - const uint barsize = (int)(progress / 2.0); // characters - // cout << barsize << endl; - // cout << str << " "; - // cout << p << "/" << total << endl; - assert(barsize < 101); // corrupted data somehow - if (barsize > 0) { - cout << std::string(barsize,'='); - } - cout << std::string(50-barsize,' '); - cout << setprecision(0) << fixed << " " << progress << "%"; - if (ratio != -1.0) - cout << setprecision(2) << " " << ratio; - cout << "\r" << flush; -} - -bool isBlankLine(char const *line) { - for (char const *cp = line; *cp; ++cp) { - if (!isspace(*cp)) - return false; - } - return true; -} - -bool isBlankLine(std::string const &line) { return isBlankLine(line.c_str()); } - -// In case files are ended with "\r" or "\r\n". -std::istream &safeGetline(std::istream &is, std::string &t) { - t.clear(); - - // The characters in the stream are read one-by-one using a - // std::streambuf. That is faster than reading them one-by-one - // using the std::istream. Code that uses streambuf this way must - // be guarded by a sentry object. The sentry object performs - // various tasks, such as thread synchronization and updating the - // stream state. - std::istream::sentry se(is, true); - std::streambuf *sb = is.rdbuf(); - - for (;;) { - int c = sb->sbumpc(); - switch (c) { - case '\n': - return is; - case '\r': - if (sb->sgetc() == '\n') - sb->sbumpc(); - return is; - case EOF: - - // Also handle the case when the last line has no line - // ending. - if (t.empty()) - is.setstate(std::ios::eofbit); - return is; - default: - t += (char)c; - } - } -} - -// Read SNP file. A single column of SNP names. -bool ReadFile_snps(const string file_snps, set &setSnps) { - debug_msg("entered"); - setSnps.clear(); - - igzstream infile(file_snps.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open snps file: " << file_snps << endl; - return false; - } - - string line; - char *ch_ptr; - - while (getline(infile, line)) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); - enforce_msg(ch_ptr,"Problem reading SNP file"); - setSnps.insert(ch_ptr); - } - - infile.close(); - infile.clear(); - - return true; -} - -// Read SNP file using a header. The header determines how the -// values for each row are parsed. A valid header can be, for example, -// RS POS CHR -bool ReadFile_snps_header(const string &file_snps, set &setSnps) { - debug_msg("entered"); - setSnps.clear(); - - igzstream infile(file_snps.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open snps file: " << file_snps << endl; - return false; - } - - string line, rs, chr, pos; - char *ch_ptr; - - // Read header. - HEADER header; - safeGetline(infile, line).eof(); - ReadHeader_io(line, header); - - if (header.rs_col == 0 && (header.chr_col == 0 || header.pos_col == 0)) { - cout << "missing rs id in the header" << endl; - } - - while (!safeGetline(infile, line).eof()) { - if (isBlankLine(line)) { - continue; - } - ch_ptr = strtok((char *)line.c_str(), " , \t"); - enforce_msg(ch_ptr,"Problem reading SNP header"); - - for (size_t i = 0; i < header.coln; i++) { - enforce_msg(ch_ptr,"Problem reading SNP file"); - if (header.rs_col != 0 && header.rs_col == i + 1) { - rs = ch_ptr; - } - if (header.chr_col != 0 && header.chr_col == i + 1) { - chr = ch_ptr; - } - if (header.pos_col != 0 && header.pos_col == i + 1) { - pos = ch_ptr; - } - - ch_ptr = strtok(NULL, " , \t"); - } - - if (header.rs_col == 0) { - rs = chr + ":" + pos; - } - - setSnps.insert(rs); - } - - infile.close(); - infile.clear(); - - return true; -} - -// Read log file. -bool ReadFile_log(const string &file_log, double &pheno_mean) { - debug_msg("ReadFile_log"); - ifstream infile(file_log.c_str(), ifstream::in); - if (!infile) { - cout << "error! fail to open log file: " << file_log << endl; - return false; - } - - string line; - char *ch_ptr; - size_t flag = 0; - - while (getline(infile, line)) { - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - ch_ptr = strtok(NULL, " , \t"); - - if (ch_ptr != NULL && strcmp(ch_ptr, "estimated") == 0) { - ch_ptr = strtok(NULL, " , \t"); - if (ch_ptr != NULL && strcmp(ch_ptr, "mean") == 0) { - ch_ptr = strtok(NULL, " , \t"); - if (ch_ptr != NULL && strcmp(ch_ptr, "=") == 0) { - ch_ptr = strtok_safe(NULL, " , \t"); - pheno_mean = atof(ch_ptr); - flag = 1; - } - } - } - - if (flag == 1) { - break; - } - } - - infile.close(); - infile.clear(); - - return true; -} - -// Read bimbam annotation file which consists of rows of SNP, POS and CHR -bool ReadFile_anno(const string &file_anno, map &mapRS2chr, - map &mapRS2bp, - map &mapRS2cM) { - debug_msg("ReadFile_anno"); - mapRS2chr.clear(); - mapRS2bp.clear(); - - ifstream infile(file_anno.c_str(), ifstream::in); - if (!infile) { - cout << "error opening annotation file: " << file_anno << endl; - return false; - } - - string line; - - while (!safeGetline(infile, line).eof()) { - const char *ch_ptr = strtok((char *)line.c_str(), " , \t"); - enforce_str(ch_ptr, line + " Bad RS format"); - const string rs = ch_ptr; - enforce_str(rs != "", line + " Bad RS format"); - - ch_ptr = strtok(NULL, " , \t"); - enforce_str(ch_ptr, line + " Bad format"); - long b_pos; - if (strcmp(ch_ptr, "NA") == 0) { - b_pos = -9; - } else { - b_pos = atol(ch_ptr); - } - enforce_str(b_pos,line + " Bad pos format (is zero)"); - - string chr; - ch_ptr = strtok(NULL, " , \t"); - if (ch_ptr == NULL || strcmp(ch_ptr, "NA") == 0) { - chr = "-9"; - } else { - chr = ch_ptr; - enforce_str(chr != "", line + " Bad chr format"); - } - - double cM; - ch_ptr = strtok(NULL, " , \t"); - if (ch_ptr == NULL || strcmp(ch_ptr, "NA") == 0) { - cM = -9; - } else { - cM = atof(ch_ptr); - enforce_str(b_pos, line + "Bad cM format (is zero)"); - } - - mapRS2chr[rs] = chr; - mapRS2bp[rs] = b_pos; - mapRS2cM[rs] = cM; - } - - infile.close(); - infile.clear(); - - return true; -} - -// Read 1 column of phenotype. -bool ReadFile_column(const string &file_pheno, vector &indicator_idv, - vector &pheno, const int &p_column) { - debug_msg("entered"); - indicator_idv.clear(); - pheno.clear(); - - igzstream infile(file_pheno.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open phenotype file: " << file_pheno << endl; - return false; - } - - string line; - char *ch_ptr; - - string id; - double p; - while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - for (int i = 0; i < (p_column - 1); ++i) { - ch_ptr = strtok(NULL, " , \t"); - } - enforce_msg(ch_ptr,"Problem reading PHENO column"); - if (strcmp(ch_ptr, "NA") == 0) { - indicator_idv.push_back(0); - pheno.push_back(-9); - } else { - // Pheno is different from pimass2. - p = atof(ch_ptr); - indicator_idv.push_back(1); - pheno.push_back(p); - } - } - - infile.close(); - infile.clear(); - - return true; -} - -// Read bimbam phenotype file, p_column=1, 2,... -bool ReadFile_pheno(const string &file_pheno, - vector> &indicator_pheno, - vector> &pheno, - const vector &p_column) { - debug_msg("entered"); - indicator_pheno.clear(); - pheno.clear(); - - igzstream infile(file_pheno.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open phenotype file: " << file_pheno << endl; - return false; - } - - string line; - char *ch_ptr; - - string id; - double p; - - vector pheno_row; - vector ind_pheno_row; - - size_t p_max = *max_element(p_column.begin(), p_column.end()); - map mapP2c; - for (size_t i = 0; i < p_column.size(); i++) { - mapP2c[p_column[i]] = i; - pheno_row.push_back(-9); - ind_pheno_row.push_back(0); - } - - while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok((char *)line.c_str(), " , \t"); - size_t i = 0; - while (i < p_max) { - enforce_msg(ch_ptr,"Number of phenotypes out of range"); - if (mapP2c.count(i + 1) != 0) { - if (strcmp(ch_ptr, "NA") == 0) { - ind_pheno_row[mapP2c[i + 1]] = 0; - pheno_row[mapP2c[i + 1]] = -9; - } else { - p = atof(ch_ptr); - ind_pheno_row[mapP2c[i + 1]] = 1; - pheno_row[mapP2c[i + 1]] = p; - } - } - i++; - ch_ptr = strtok(NULL, " , \t"); - } - - indicator_pheno.push_back(ind_pheno_row); - pheno.push_back(pheno_row); - } - - infile.close(); - infile.clear(); - - return true; -} - -bool ReadFile_cvt(const string &file_cvt, vector &indicator_cvt, - vector> &cvt, size_t &n_cvt) { - debug_msg("entered"); - indicator_cvt.clear(); - - ifstream infile(file_cvt.c_str(), ifstream::in); - if (!infile) { - cout << "error! fail to open covariates file: " << file_cvt << endl; - return false; - } - - string line; - char *ch_ptr; - double d; - - int flag_na = 0; - - while (!safeGetline(infile, line).eof()) { - vector v_d; - flag_na = 0; - ch_ptr = strtok((char *)line.c_str(), " , \t"); - while (ch_ptr != NULL) { - if (strcmp(ch_ptr, "NA") == 0) { - flag_na = 1; - d = -9; - } else { - d = atof(ch_ptr); - } - - v_d.push_back(d); - ch_ptr = strtok(NULL, " , \t"); - } - if (flag_na == 0) { - indicator_cvt.push_back(1); - } else { - indicator_cvt.push_back(0); - } - cvt.push_back(v_d); - } - - if (indicator_cvt.empty()) { - n_cvt = 0; - } else { - flag_na = 0; - for (vector::size_type i = 0; i < indicator_cvt.size(); ++i) { - if (indicator_cvt[i] == 0) { - continue; - } - - if (flag_na == 0) { - flag_na = 1; - n_cvt = cvt[i].size(); - } - if (flag_na != 0 && n_cvt != cvt[i].size()) { - cout << "error! number of covariates in row " << i - << " do not match other rows." << endl; - return false; - } - } - } - - infile.close(); - infile.clear(); - - return true; -} - -// Read .bim file. -bool ReadFile_bim(const string &file_bim, vector &snpInfo) { - debug_msg("entered"); - snpInfo.clear(); - - ifstream infile(file_bim.c_str(), ifstream::in); - if (!infile) { - cout << "error opening .bim file: " << file_bim << endl; - return false; - } - - string line; - char *ch_ptr; - - string rs; - long int b_pos; - string chr; - double cM; - string major; - string minor; - - while (getline(infile, line)) { - ch_ptr = strtok_safe((char *)line.c_str(), " \t"); - chr = ch_ptr; - ch_ptr = strtok_safe(NULL, " \t"); - rs = ch_ptr; - ch_ptr = strtok_safe(NULL, " \t"); - cM = atof(ch_ptr); - ch_ptr = strtok_safe(NULL, " \t"); - b_pos = atol(ch_ptr); - ch_ptr = strtok_safe(NULL, " \t"); - minor = ch_ptr; - ch_ptr = strtok_safe(NULL, " \t"); - major = ch_ptr; - - SNPINFO sInfo = {chr, rs, cM, b_pos, minor, major, 0, -9, -9, 0, 0, 0}; - snpInfo.push_back(sInfo); - } - - infile.close(); - infile.clear(); - return true; -} - -// Read .fam file (ignored with -p phenotypes switch) -bool ReadFile_fam(const string &file_fam, vector> &indicator_pheno, - vector> &pheno, map &mapID2num, - const vector &p_column) { - debug_msg("entered"); - indicator_pheno.clear(); - pheno.clear(); - mapID2num.clear(); - - igzstream infile(file_fam.c_str(), igzstream::in); - if (!infile) { - cout << "error opening .fam file: " << file_fam << endl; - return false; - } - - string line; - char *ch_ptr; - - string id; - int c = 0; - double p; - - vector pheno_row; - vector ind_pheno_row; - - size_t p_max = *max_element(p_column.begin(), p_column.end()); - map mapP2c; - for (size_t i = 0; i < p_column.size(); i++) { - mapP2c[p_column[i]] = i; - pheno_row.push_back(-9); - ind_pheno_row.push_back(0); - } - - while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok_safe((char *)line.c_str(), " \t"); - ch_ptr = strtok_safe(NULL, " \t"); - id = ch_ptr; - ch_ptr = strtok_safe(NULL, " \t"); - ch_ptr = strtok_safe(NULL, " \t"); - ch_ptr = strtok_safe(NULL, " \t"); - ch_ptr = strtok(NULL, " \t"); - - size_t i = 0; - while (i < p_max) { - if (mapP2c.count(i + 1) != 0) { - enforce_msg(ch_ptr,"Problem reading FAM file (phenotypes out of range)"); - - if (strcmp(ch_ptr, "NA") == 0) { - ind_pheno_row[mapP2c[i + 1]] = 0; - pheno_row[mapP2c[i + 1]] = -9; - } else { - p = atof(ch_ptr); - - if (p == -9) { - ind_pheno_row[mapP2c[i + 1]] = 0; - pheno_row[mapP2c[i + 1]] = -9; - } else { - ind_pheno_row[mapP2c[i + 1]] = 1; - pheno_row[mapP2c[i + 1]] = p; - } - } - } - i++; - ch_ptr = strtok(NULL, " , \t"); - } - - indicator_pheno.push_back(ind_pheno_row); - pheno.push_back(pheno_row); - - mapID2num[id] = c; - c++; - } - - infile.close(); - infile.clear(); - return true; -} - -// Read bimbam mean genotype file, the first time, to obtain #SNPs for -// analysis (ns_test) and total #SNP (ns_total). -bool ReadFile_geno(const string &file_geno, const set &setSnps, - const gsl_matrix *W, vector &indicator_idv, - vector &indicator_snp, const double &maf_level, - const double &miss_level, const double &hwe_level, - const double &r2_level, map &mapRS2chr, - map &mapRS2bp, - map &mapRS2cM, vector &snpInfo, - size_t &ns_test) { - debug_msg("entered"); - indicator_snp.clear(); - snpInfo.clear(); - - igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return false; - } - - gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); - gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); - gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); - gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); - gsl_permutation *pmt = gsl_permutation_alloc(W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp(WtW, pmt, &sig); - - LUInvert(WtW, pmt, WtWi); // @@ - - double v_x, v_w; - int c_idv = 0; - - string line; - char *ch_ptr; - - string rs; - long int b_pos; - string chr; - string major; - string minor; - double cM; - size_t file_pos; - - double maf, geno, geno_old; - size_t n_miss; - size_t n_0, n_1, n_2; - int flag_poly; - - int ni_total = indicator_idv.size(); - int ni_test = 0; - for (int i = 0; i < ni_total; ++i) { - ni_test += indicator_idv[i]; - } - ns_test = 0; - - file_pos = 0; - auto count_warnings = 0; - while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - rs = ch_ptr; - ch_ptr = strtok_safe(NULL, " , \t"); - minor = ch_ptr; - ch_ptr = strtok_safe(NULL, " , \t"); - major = ch_ptr; - - if (setSnps.size() != 0 && setSnps.count(rs) == 0) { - // if SNP in geno but not in -snps we add an missing value - SNPINFO sInfo = {"-9", rs, -9, -9, minor, major, - 0, -9, -9, 0, 0, file_pos}; - snpInfo.push_back(sInfo); - indicator_snp.push_back(0); - - file_pos++; - continue; - } - - if (mapRS2bp.count(rs) == 0) { - if (is_debug_mode() && count_warnings++ < 10) { - std::string msg = "Can't figure out position for "; - msg += rs; - debug_msg(msg); - if (count_warnings == 10) - debug_msg("Skipping similar warnings"); - } - chr = "-9"; - b_pos = -9; - cM = -9; - } else { - b_pos = mapRS2bp[rs]; - chr = mapRS2chr[rs]; - cM = mapRS2cM[rs]; - } - - maf = 0; - n_miss = 0; - flag_poly = 0; - geno_old = -9; - n_0 = 0; - n_1 = 0; - n_2 = 0; - c_idv = 0; - gsl_vector_set_zero(genotype_miss); - for (int i = 0; i < ni_total; ++i) { - ch_ptr = strtok_safe(NULL, " , \t"); - if (indicator_idv[i] == 0) - continue; - - if (strcmp(ch_ptr, "NA") == 0) { - gsl_vector_set(genotype_miss, c_idv, 1); - n_miss++; - c_idv++; - continue; - } - - geno = atof(ch_ptr); - if (geno >= 0 && geno <= 0.5) { - n_0++; - } - if (geno > 0.5 && geno < 1.5) { - n_1++; - } - if (geno >= 1.5 && geno <= 2.0) { - n_2++; - } - - gsl_vector_set(genotype, c_idv, geno); - - if (flag_poly == 0) { - geno_old = geno; - flag_poly = 2; - } - if (flag_poly == 2 && geno != geno_old) { - flag_poly = 1; - } - - maf += geno; - - c_idv++; - } - maf /= 2.0 * (double)(ni_test - n_miss); - - SNPINFO sInfo = {chr, rs, - cM, b_pos, - minor, major, - n_miss, (double)n_miss / (double)ni_test, - maf, ni_test - n_miss, - 0, file_pos}; - snpInfo.push_back(sInfo); - file_pos++; - - if ((double)n_miss / (double)ni_test > miss_level) { - indicator_snp.push_back(0); - continue; - } - - if ((maf < maf_level || maf > (1.0 - maf_level)) && maf_level != -1) { - indicator_snp.push_back(0); - continue; - } - - if (flag_poly != 1) { - indicator_snp.push_back(0); - continue; - } - - if (hwe_level != 0 && maf_level != -1) { - if (CalcHWE(n_0, n_2, n_1) < hwe_level) { - indicator_snp.push_back(0); - continue; - } - } - - // Filter SNP if it is correlated with W unless W has - // only one column, of 1s. - for (size_t i = 0; i < genotype->size; ++i) { - if (gsl_vector_get(genotype_miss, i) == 1) { - geno = maf * 2.0; - gsl_vector_set(genotype, i, geno); - } - } - - gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); - gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - gsl_blas_ddot(genotype, genotype, &v_x); - gsl_blas_ddot(Wtx, WtWiWtx, &v_w); - - if (W->size2 != 1 && v_w / v_x >= r2_level) { - indicator_snp.push_back(0); - continue; - } - - indicator_snp.push_back(1); - ns_test++; - } - - gsl_vector_free(genotype); - gsl_vector_free(genotype_miss); - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(Wtx); - gsl_vector_free(WtWiWtx); - gsl_permutation_free(pmt); - - infile.close(); - infile.clear(); - - return true; -} - -// Read bed file, the first time. -bool ReadFile_bed(const string &file_bed, const set &setSnps, - const gsl_matrix *W, vector &indicator_idv, - vector &indicator_snp, vector &snpInfo, - const double &maf_level, const double &miss_level, - const double &hwe_level, const double &r2_level, - size_t &ns_test) { - debug_msg("entered"); - indicator_snp.clear(); - size_t ns_total = snpInfo.size(); - - ifstream infile(file_bed.c_str(), ios::binary); - if (!infile) { - cout << "error reading bed file:" << file_bed << endl; - return false; - } - - gsl_vector *genotype = gsl_vector_safe_alloc(W->size1); - gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1); - gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); - gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); - gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); - gsl_permutation *pmt = gsl_permutation_alloc(W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp(WtW, pmt, &sig); - LUInvert(WtW, pmt, WtWi); - - double v_x, v_w, geno; - size_t c_idv = 0; - - char ch[1]; - bitset<8> b; - - size_t ni_total = indicator_idv.size(); - size_t ni_test = 0; - for (size_t i = 0; i < ni_total; ++i) { - ni_test += indicator_idv[i]; - } - ns_test = 0; - - // Calculate n_bit and c, the number of bit for each snp. - size_t n_bit; - if (ni_total % 4 == 0) { - n_bit = ni_total / 4; - } else { - n_bit = ni_total / 4 + 1; - } - - // Ignore the first three magic numbers. - for (int i = 0; i < 3; ++i) { - infile.read(ch, 1); - b = ch[0]; - } - - double maf; - size_t n_miss; - size_t n_0, n_1, n_2, c; - - // Start reading snps and doing association test. - for (size_t t = 0; t < ns_total; ++t) { - - // n_bit, and 3 is the number of magic numbers. - infile.seekg(t * n_bit + 3); - - if (setSnps.size() != 0 && setSnps.count(snpInfo[t].rs_number) == 0) { - snpInfo[t].n_miss = -9; - snpInfo[t].missingness = -9; - snpInfo[t].maf = -9; - snpInfo[t].file_position = t; - indicator_snp.push_back(0); - continue; - } - - // Read genotypes. - c = 0; - maf = 0.0; - n_miss = 0; - n_0 = 0; - n_1 = 0; - n_2 = 0; - c_idv = 0; - gsl_vector_set_zero(genotype_miss); - for (size_t i = 0; i < n_bit; ++i) { - infile.read(ch, 1); - b = ch[0]; - - // Minor allele homozygous: 2.0; major: 0.0; - for (size_t j = 0; j < 4; ++j) { - if ((i == (n_bit - 1)) && c == ni_total) { - break; - } - if (indicator_idv[c] == 0) { - c++; - continue; - } - c++; - - if (b[2 * j] == 0) { - if (b[2 * j + 1] == 0) { - gsl_vector_set(genotype, c_idv, 2.0); - maf += 2.0; - n_2++; - } else { - gsl_vector_set(genotype, c_idv, 1.0); - maf += 1.0; - n_1++; - } - } else { - if (b[2 * j + 1] == 1) { - gsl_vector_set(genotype, c_idv, 0.0); - maf += 0.0; - n_0++; - } else { - gsl_vector_set(genotype_miss, c_idv, 1); - n_miss++; - } - } - c_idv++; - } - } - maf /= 2.0 * (double)(ni_test - n_miss); - - snpInfo[t].n_miss = n_miss; - snpInfo[t].missingness = (double)n_miss / (double)ni_test; - snpInfo[t].maf = maf; - snpInfo[t].n_idv = ni_test - n_miss; - snpInfo[t].n_nb = 0; - snpInfo[t].file_position = t; - - if ((double)n_miss / (double)ni_test > miss_level) { - indicator_snp.push_back(0); - continue; - } - - if ((maf < maf_level || maf > (1.0 - maf_level)) && maf_level != -1) { - indicator_snp.push_back(0); - continue; - } - - if ((n_0 + n_1) == 0 || (n_1 + n_2) == 0 || (n_2 + n_0) == 0) { - indicator_snp.push_back(0); - continue; - } - - if (hwe_level != 0 && maf_level != -1) { - if (CalcHWE(n_0, n_2, n_1) < hwe_level) { - indicator_snp.push_back(0); - continue; - } - } - - // Filter SNP if it is correlated with W unless W has - // only one column, of 1s. - for (size_t i = 0; i < genotype->size; ++i) { - if (gsl_vector_get(genotype_miss, i) == 1) { - geno = maf * 2.0; - gsl_vector_set(genotype, i, geno); - } - } - - gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx); - gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - gsl_blas_ddot(genotype, genotype, &v_x); - gsl_blas_ddot(Wtx, WtWiWtx, &v_w); - - if (W->size2 != 1 && v_w / v_x > r2_level) { - indicator_snp.push_back(0); - continue; - } - - indicator_snp.push_back(1); - ns_test++; - } - - gsl_vector_free(genotype); - gsl_vector_free(genotype_miss); - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(Wtx); - gsl_vector_free(WtWiWtx); - gsl_permutation_free(pmt); - - infile.close(); - infile.clear(); - - return true; -} - -// Read the genotype for one SNP; remember to read empty lines. -// Geno stores original genotypes without centering. -// Missing values are replaced by mean. -bool Bimbam_ReadOneSNP(const size_t inc, const vector &indicator_idv, - igzstream &infile, gsl_vector *geno, double &geno_mean) { - debug_msg("entered"); - size_t ni_total = indicator_idv.size(); - - string line; - char *ch_ptr; - bool flag = false; - - for (size_t i = 0; i < inc; i++) { - safeGetline(infile, line).eof(); - } - - if (!safeGetline(infile, line).eof()) { - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - ch_ptr = strtok_safe(NULL, " , \t"); - ch_ptr = strtok_safe(NULL, " , \t"); - - geno_mean = 0.0; - double d; - size_t c_idv = 0; - vector geno_miss; - - for (size_t i = 0; i < ni_total; ++i) { - ch_ptr = strtok_safe(NULL, " , \t"); - if (indicator_idv[i] == 0) { - continue; - } - - if (strcmp(ch_ptr, "NA") == 0) { - geno_miss.push_back(c_idv); - } else { - d = atof(ch_ptr); - gsl_vector_set(geno, c_idv, d); - geno_mean += d; - } - c_idv++; - } - - geno_mean /= (double)(c_idv - geno_miss.size()); - - for (size_t i = 0; i < geno_miss.size(); ++i) { - gsl_vector_set(geno, geno_miss[i], geno_mean); - } - flag = true; - } - - return flag; -} - -// For PLINK, store SNPs as double too. -void Plink_ReadOneSNP(const int pos, const vector &indicator_idv, - ifstream &infile, gsl_vector *geno, double &geno_mean) { - debug_msg("entered"); - size_t ni_total = indicator_idv.size(), n_bit; - if (ni_total % 4 == 0) { - n_bit = ni_total / 4; - } else { - n_bit = ni_total / 4 + 1; - } - - // n_bit, and 3 is the number of magic numbers. - infile.seekg(pos * n_bit + 3); - - // Read genotypes. - char ch[1]; - bitset<8> b; - - geno_mean = 0.0; - size_t c = 0, c_idv = 0; - vector geno_miss; - - for (size_t i = 0; i < n_bit; ++i) { - infile.read(ch, 1); - b = ch[0]; - - // Minor allele homozygous: 2.0; major: 0.0. - for (size_t j = 0; j < 4; ++j) { - if ((i == (n_bit - 1)) && c == ni_total) { - break; - } - if (indicator_idv[c] == 0) { - c++; - continue; - } - c++; - - if (b[2 * j] == 0) { - if (b[2 * j + 1] == 0) { - gsl_vector_set(geno, c_idv, 2); - geno_mean += 2.0; - } else { - gsl_vector_set(geno, c_idv, 1); - geno_mean += 1.0; - } - } else { - if (b[2 * j + 1] == 1) { - gsl_vector_set(geno, c_idv, 0); - geno_mean += 0.0; - } else { - geno_miss.push_back(c_idv); - } - } - - c_idv++; - } - } - - geno_mean /= (double)(c_idv - geno_miss.size()); - - for (size_t i = 0; i < geno_miss.size(); ++i) { - gsl_vector_set(geno, geno_miss[i], geno_mean); - } - - return; -} - -void ReadFile_kin(const string &file_kin, vector &indicator_idv, - map &mapID2num, const size_t k_mode, bool &error, - gsl_matrix *G) { - debug_msg("entered"); - igzstream infile(file_kin.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open kinship file: " << file_kin << endl; - error = true; - return; - } - - size_t ni_total = indicator_idv.size(); - - gsl_matrix_set_zero(G); - - string line; - char *ch_ptr; - double d; - - if (k_mode == 1) { - size_t i_test = 0, i_total = 0, j_test = 0, j_total = 0; - while (getline(infile, line)) { - if (i_total == ni_total) { - fail_msg("number of rows in the kinship file is larger than the number of phentypes"); - } - - if (indicator_idv[i_total] == 0) { - i_total++; - continue; - } - - j_total = 0; - j_test = 0; - ch_ptr = strtok((char *)line.c_str(), " , \t"); - while (ch_ptr != NULL) { - if (j_total == ni_total) { - fail_msg(string("number of columns in the kinship file is larger than the number of individuals for row = ")+to_string(i_total)); - } - - d = atof(ch_ptr); - if (indicator_idv[j_total] == 1) { - gsl_matrix_set(G, i_test, j_test, d); - j_test++; - } - j_total++; - - ch_ptr = strtok(NULL, " , \t"); - } - if (j_total != ni_total) { - string msg = "number of columns in the kinship file does not match the number of individuals for row = " + to_string( i_total ); - fail_msg(msg); - } - i_total++; - i_test++; - } - if (i_total != ni_total) { - fail_msg("number of rows in the kinship file does not match the number of individuals."); - } - } else { - map mapID2ID; - size_t c = 0; - for (size_t i = 0; i < indicator_idv.size(); i++) { - if (indicator_idv[i] == 1) { - mapID2ID[i] = c; - c++; - } - } - - string id1, id2; - double Cov_d; - size_t n_id1, n_id2; - - while (getline(infile, line)) { - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - id1 = ch_ptr; - ch_ptr = strtok_safe(NULL, " , \t"); - id2 = ch_ptr; - ch_ptr = strtok_safe(NULL, " , \t"); - d = atof(ch_ptr); - if (mapID2num.count(id1) == 0 || mapID2num.count(id2) == 0) { - continue; - } - if (indicator_idv[mapID2num[id1]] == 0 || - indicator_idv[mapID2num[id2]] == 0) { - continue; - } - - n_id1 = mapID2ID[mapID2num[id1]]; - n_id2 = mapID2ID[mapID2num[id2]]; - - Cov_d = gsl_matrix_get(G, n_id1, n_id2); - if (Cov_d != 0 && Cov_d != d) { - cerr << "error! redundant and unequal terms in the " - << "kinship file, for id1 = " << id1 << " and id2 = " << id2 - << endl; - fail_msg(""); - } else { - gsl_matrix_set(G, n_id1, n_id2, d); - gsl_matrix_set(G, n_id2, n_id1, d); - } - } - } - - infile.close(); - infile.clear(); - - return; -} - -void ReadFile_mk(const string &file_mk, vector &indicator_idv, - map &mapID2num, const size_t k_mode, bool &error, - gsl_matrix *G) { - debug_msg("entered"); - igzstream infile(file_mk.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open file: " << file_mk << endl; - error = true; - return; - } - - string file_kin, line; - - size_t i = 0; - while (getline(infile, line)) { - file_kin = line.c_str(); - gsl_matrix_view G_sub = - gsl_matrix_submatrix(G, 0, i * G->size1, G->size1, G->size1); - ReadFile_kin(file_kin, indicator_idv, mapID2num, k_mode, error, - &G_sub.matrix); - i++; - } - - infile.close(); - infile.clear(); -} - -void ReadFile_eigenU(const string &file_ku, bool &error, gsl_matrix *U) { - debug_msg("entered"); - igzstream infile(file_ku.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open the U file: " << file_ku << endl; - error = true; - return; - } - - size_t n_row = U->size1, n_col = U->size2, i_row = 0, i_col = 0; - - gsl_matrix_set_zero(U); - - string line; - char *ch_ptr; - double d; - - while (getline(infile, line)) { - if (i_row == n_row) { - cout << "error! number of rows in the U file is larger " - << "than expected." << endl; - error = true; - } - - i_col = 0; - ch_ptr = strtok((char *)line.c_str(), " , \t"); - while (ch_ptr != NULL) { - if (i_col == n_col) { - cout << "error! number of columns in the U file " - << "is larger than expected, for row = " << i_row << endl; - error = true; - } - - d = atof(ch_ptr); - gsl_matrix_set(U, i_row, i_col, d); - i_col++; - - ch_ptr = strtok(NULL, " , \t"); - } - - i_row++; - } - - infile.close(); - infile.clear(); - - return; -} - -void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) { - debug_msg("entered"); - igzstream infile(file_kd.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open the D file: " << file_kd << endl; - error = true; - return; - } - - size_t n_row = eval->size, i_row = 0; - - gsl_vector_set_zero(eval); - - string line; - char *ch_ptr; - double d; - - while (getline(infile, line)) { - if (i_row == n_row) { - cout << "error! number of rows in the D file is larger " - << "than expected." << endl; - error = true; - } - - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - d = atof(ch_ptr); - - ch_ptr = strtok(NULL, " , \t"); - if (ch_ptr != NULL) { - cout << "error! number of columns in the D file is larger " - << "than expected, for row = " << i_row << endl; - error = true; - } - - gsl_vector_set(eval, i_row, d); - - i_row++; - } - - infile.close(); - infile.clear(); - - return; -} - -// Read bimbam mean genotype file and calculate kinship matrix. -bool BimbamKin(const string file_geno, const set ksnps, - vector &indicator_snp, const int k_mode, - const int display_pace, gsl_matrix *matrix_kin, - const bool test_nind) { - debug_msg("entered"); - igzstream infile(file_geno.c_str(), igzstream::in); - enforce_msg(infile, "error reading genotype file"); - - size_t n_miss; - double d, geno_mean, geno_var; - - // setKSnp and/or LOCO support - bool process_ksnps = ksnps.size(); - - size_t ni_total = matrix_kin->size1; - gsl_vector *geno = gsl_vector_safe_alloc(ni_total); - gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total); - - // Xlarge contains inds x markers - const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); - enforce_msg(Xlarge, "allocate Xlarge"); - - gsl_matrix_set_zero(Xlarge); - - // For every SNP read the genotype per individual - size_t ns_test = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - string line; - safeGetline(infile, line).eof(); - if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); - } - if (indicator_snp[t] == 0) - continue; - - std::regex_token_iterator rend; - regex split_on("[,[:blank:]]+"); - regex_token_iterator tokens(line.begin(), line.end(), - split_on, -1); - if (test_nind) { - // ascertain the number of genotype fields match - uint token_num = 0; - for (auto x = tokens; x != rend; x++) - token_num++; - if (token_num != ni_total+3) { - cerr << line << endl; - cerr << token_num << " != " << ni_total << endl; - warning_msg("Columns in geno file do not match # individuals"); - } - enforce_msg(token_num <= ni_total + 3,"not enough genotype fields"); - } - - auto snp = *tokens; // first field - // check whether SNP is included in ksnps (used by LOCO) - if (process_ksnps && ksnps.count(snp) == 0) - continue; - - tokens++; // skip nucleotide fields - tokens++; // skip nucleotide fields - - // calc SNP stats - geno_mean = 0.0; - n_miss = 0; - geno_var = 0.0; - gsl_vector_set_all(geno_miss, 0); - for (size_t i = 0; i < ni_total; ++i) { - tokens++; - enforce_str(tokens != rend, line + " number of fields"); - string field = *tokens; - if (field == "NA") { - gsl_vector_set(geno_miss, i, 0); - n_miss++; - } else { - d = stod(field); - // make sure genotype field contains a number - if (field != "0" && field != "0.0") - enforce_str(d != 0.0f, field); - gsl_vector_set(geno, i, d); - gsl_vector_set(geno_miss, i, 1); - geno_mean += d; - geno_var += d * d; - } - } - - geno_mean /= (double)(ni_total - n_miss); - geno_var += geno_mean * geno_mean * (double)n_miss; - geno_var /= (double)ni_total; - geno_var -= geno_mean * geno_mean; - - for (size_t i = 0; i < ni_total; ++i) { - if (gsl_vector_get(geno_miss, i) == 0) { - gsl_vector_set(geno, i, geno_mean); - } - } - - gsl_vector_add_constant(geno, -1.0 * geno_mean); - - if (k_mode == 2 && geno_var != 0) { - gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); - } - // set the SNP column ns_test - gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); - enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); - - ns_test++; - - // compute kinship matrix and return in matrix_kin a SNP at a time - if (ns_test % msize == 0) { - fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); - gsl_matrix_set_zero(Xlarge); - } - } - if (ns_test % msize != 0) { - fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); - } - cout << endl; - - // scale the kinship matrix - enforce_gsl(gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test)); - - // and transpose - // FIXME: the following is very slow - - for (size_t i = 0; i < ni_total; ++i) { - for (size_t j = 0; j < i; ++j) { - d = gsl_matrix_get(matrix_kin, j, i); - gsl_matrix_set(matrix_kin, i, j, d); - } - } - // GSL is faster - and there are even faster methods - // enforce_gsl(gsl_matrix_transpose(matrix_kin)); - - gsl_vector_free(geno); - gsl_vector_free(geno_miss); - gsl_matrix_free(Xlarge); - - infile.close(); - infile.clear(); - - return true; -} - -bool PlinkKin(const string &file_bed, vector &indicator_snp, - const int k_mode, const int display_pace, - gsl_matrix *matrix_kin) { - debug_msg("entered"); - ifstream infile(file_bed.c_str(), ios::binary); - if (!infile) { - cout << "error reading bed file:" << file_bed << endl; - return false; - } - - char ch[1]; - bitset<8> b; - - size_t n_miss, ci_total; - double d, geno_mean, geno_var; - - size_t ni_total = matrix_kin->size1; - gsl_vector *geno = gsl_vector_safe_alloc(ni_total); - - size_t ns_test = 0; - int n_bit; - - // Create a large matrix. - const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, msize); - gsl_matrix_set_zero(Xlarge); - - // Calculate n_bit and c, the number of bit for each snp. - if (ni_total % 4 == 0) { - n_bit = ni_total / 4; - } else { - n_bit = ni_total / 4 + 1; - } - - // print the first three magic numbers - for (int i = 0; i < 3; ++i) { - infile.read(ch, 1); - b = ch[0]; - } - - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); - } - if (indicator_snp[t] == 0) { - continue; - } - - // n_bit, and 3 is the number of magic numbers. - infile.seekg(t * n_bit + 3); - - // Read genotypes. - geno_mean = 0.0; - n_miss = 0; - ci_total = 0; - geno_var = 0.0; - for (int i = 0; i < n_bit; ++i) { - infile.read(ch, 1); - b = ch[0]; - - // Minor allele homozygous: 2.0; major: 0.0. - for (size_t j = 0; j < 4; ++j) { - if ((i == (n_bit - 1)) && ci_total == ni_total) { - break; - } - - if (b[2 * j] == 0) { - if (b[2 * j + 1] == 0) { - gsl_vector_set(geno, ci_total, 2.0); - geno_mean += 2.0; - geno_var += 4.0; - } else { - gsl_vector_set(geno, ci_total, 1.0); - geno_mean += 1.0; - geno_var += 1.0; - } - } else { - if (b[2 * j + 1] == 1) { - gsl_vector_set(geno, ci_total, 0.0); - } else { - gsl_vector_set(geno, ci_total, -9.0); - n_miss++; - } - } - - ci_total++; - } - } - - geno_mean /= (double)(ni_total - n_miss); - geno_var += geno_mean * geno_mean * (double)n_miss; - geno_var /= (double)ni_total; - geno_var -= geno_mean * geno_mean; - - for (size_t i = 0; i < ni_total; ++i) { - d = gsl_vector_get(geno, i); - if (d == -9.0) { - gsl_vector_set(geno, i, geno_mean); - } - } - - gsl_vector_add_constant(geno, -1.0 * geno_mean); - - if (k_mode == 2 && geno_var != 0) { - gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); - } - gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); - gsl_vector_memcpy(&Xlarge_col.vector, geno); - - ns_test++; - - if (ns_test % msize == 0) { - fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); - gsl_matrix_set_zero(Xlarge); - } - } - - if (ns_test % msize != 0) { - fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); - } - - cout << endl; - - gsl_matrix_scale(matrix_kin, 1.0 / (double)ns_test); - - for (size_t i = 0; i < ni_total; ++i) { - for (size_t j = 0; j < i; ++j) { - d = gsl_matrix_get(matrix_kin, j, i); - gsl_matrix_set(matrix_kin, i, j, d); - } - } - - gsl_vector_free(geno); - gsl_matrix_free(Xlarge); - - infile.close(); - infile.clear(); - - return true; -} - -// Read bimbam mean genotype file, the second time, recode "mean" -// genotype and calculate K. -bool ReadFile_geno(const string file_geno, vector &indicator_idv, - vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, - const bool calc_K) { - debug_msg("entered"); - igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return false; - } - - string line; - char *ch_ptr; - - if (calc_K == true) { - gsl_matrix_set_zero(K); - } - - gsl_vector *genotype = gsl_vector_safe_alloc(UtX->size1); - gsl_vector *genotype_miss = gsl_vector_safe_alloc(UtX->size1); - double geno, geno_mean; - size_t n_miss; - - int ni_total = (int)indicator_idv.size(); - int ns_total = (int)indicator_snp.size(); - int ni_test = UtX->size1; - int ns_test = UtX->size2; - - int c_idv = 0, c_snp = 0; - - for (int i = 0; i < ns_total; ++i) { - safeGetline(infile, line).eof(); - if (indicator_snp[i] == 0) { - continue; - } - - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - ch_ptr = strtok_safe(NULL, " , \t"); - ch_ptr = strtok_safe(NULL, " , \t"); - - c_idv = 0; - geno_mean = 0; - n_miss = 0; - gsl_vector_set_zero(genotype_miss); - for (int j = 0; j < ni_total; ++j) { - ch_ptr = strtok_safe(NULL, " , \t"); - if (indicator_idv[j] == 0) { - continue; - } - - if (strcmp(ch_ptr, "NA") == 0) { - gsl_vector_set(genotype_miss, c_idv, 1); - n_miss++; - } else { - geno = atof(ch_ptr); - gsl_vector_set(genotype, c_idv, geno); - geno_mean += geno; - } - c_idv++; - } - - geno_mean /= (double)(ni_test - n_miss); - - for (size_t i = 0; i < genotype->size; ++i) { - if (gsl_vector_get(genotype_miss, i) == 1) { - geno = 0; - } else { - geno = gsl_vector_get(genotype, i); - geno -= geno_mean; - } - - gsl_vector_set(genotype, i, geno); - gsl_matrix_set(UtX, i, c_snp, geno); - } - - if (calc_K == true) { - gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); - } - - c_snp++; - } - - if (calc_K == true) { - gsl_matrix_scale(K, 1.0 / (double)ns_test); - - for (size_t i = 0; i < genotype->size; ++i) { - for (size_t j = 0; j < i; ++j) { - geno = gsl_matrix_get(K, j, i); - gsl_matrix_set(K, i, j, geno); - } - } - } - - gsl_vector_free(genotype); - gsl_vector_free(genotype_miss); - - infile.clear(); - infile.close(); - - return true; -} - -// Compact version of the above function, using uchar instead of -// gsl_matrix. -bool ReadFile_geno(const string &file_geno, vector &indicator_idv, - vector &indicator_snp, - vector> &Xt, gsl_matrix *K, - const bool calc_K, const size_t ni_test, - const size_t ns_test) { - debug_msg("entered"); - igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return false; - } - - Xt.clear(); - vector Xt_row; - for (size_t i = 0; i < ni_test; i++) { - Xt_row.push_back(0); - } - - string line; - char *ch_ptr; - - if (calc_K == true) { - gsl_matrix_set_zero(K); - } - - gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); - gsl_vector *genotype_miss = gsl_vector_safe_alloc(ni_test); - double geno, geno_mean; - size_t n_miss; - - size_t ni_total = indicator_idv.size(); - size_t ns_total = indicator_snp.size(); - - size_t c_idv = 0, c_snp = 0; - - for (size_t i = 0; i < ns_total; ++i) { - safeGetline(infile, line).eof(); - if (indicator_snp[i] == 0) { - continue; - } - - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - ch_ptr = strtok_safe(NULL, " , \t"); - ch_ptr = strtok_safe(NULL, " , \t"); - - c_idv = 0; - geno_mean = 0; - n_miss = 0; - gsl_vector_set_zero(genotype_miss); - for (uint j = 0; j < ni_total; ++j) { - ch_ptr = strtok_safe(NULL, " , \t"); - if (indicator_idv[j] == 0) { - continue; - } - - if (strcmp(ch_ptr, "NA") == 0) { - gsl_vector_set(genotype_miss, c_idv, 1); - n_miss++; - } else { - geno = atof(ch_ptr); - gsl_vector_set(genotype, c_idv, geno); - geno_mean += geno; - } - c_idv++; - } - - geno_mean /= (double)(ni_test - n_miss); - - for (size_t j = 0; j < genotype->size; ++j) { - if (gsl_vector_get(genotype_miss, j) == 1) { - geno = geno_mean; - } else { - geno = gsl_vector_get(genotype, j); - } - - Xt_row[j] = Double02ToUchar(geno); - gsl_vector_set(genotype, j, (geno - geno_mean)); - } - Xt.push_back(Xt_row); - - if (calc_K == true) { - gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); - } - - c_snp++; - } - - if (calc_K == true) { - gsl_matrix_scale(K, 1.0 / (double)ns_test); - - for (size_t i = 0; i < genotype->size; ++i) { - for (size_t j = 0; j < i; ++j) { - geno = gsl_matrix_get(K, j, i); - gsl_matrix_set(K, i, j, geno); - } - } - } - - gsl_vector_free(genotype); - gsl_vector_free(genotype_miss); - - infile.clear(); - infile.close(); - - return true; -} - -// Read bimbam mean genotype file, the second time, recode "mean" -// genotype and calculate K. -bool ReadFile_bed(const string &file_bed, vector &indicator_idv, - vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, - const bool calc_K) { - debug_msg("entered"); - ifstream infile(file_bed.c_str(), ios::binary); - if (!infile) { - cout << "error reading bed file:" << file_bed << endl; - return false; - } - - char ch[1]; - bitset<8> b; - - size_t ni_total = indicator_idv.size(); - size_t ns_total = indicator_snp.size(); - size_t ni_test = UtX->size1; - size_t ns_test = UtX->size2; - int n_bit; - - if (ni_total % 4 == 0) { - n_bit = ni_total / 4; - } else { - n_bit = ni_total / 4 + 1; - } - - // Print the first three magic numbers. - for (int i = 0; i < 3; ++i) { - infile.read(ch, 1); - b = ch[0]; - } - - if (calc_K == true) { - gsl_matrix_set_zero(K); - } - - gsl_vector *genotype = gsl_vector_safe_alloc(UtX->size1); - - double geno, geno_mean; - size_t n_miss; - size_t c_idv = 0, c_snp = 0, c = 0; - - // Start reading snps and doing association test. - for (size_t t = 0; t < ns_total; ++t) { - if (indicator_snp[t] == 0) { - continue; - } - - // n_bit, and 3 is the number of magic numbers. - infile.seekg(t * n_bit + 3); - - // Read genotypes. - c_idv = 0; - geno_mean = 0.0; - n_miss = 0; - c = 0; - for (int i = 0; i < n_bit; ++i) { - infile.read(ch, 1); - b = ch[0]; - - // Minor allele homozygous: 2.0; major: 0.0. - for (size_t j = 0; j < 4; ++j) { - if ((i == (n_bit - 1)) && c == ni_total) { - break; - } - if (indicator_idv[c] == 0) { - c++; - continue; - } - c++; - - if (b[2 * j] == 0) { - if (b[2 * j + 1] == 0) { - gsl_vector_set(genotype, c_idv, 2.0); - geno_mean += 2.0; - } else { - gsl_vector_set(genotype, c_idv, 1.0); - geno_mean += 1.0; - } - } else { - if (b[2 * j + 1] == 1) { - gsl_vector_set(genotype, c_idv, 0.0); - geno_mean += 0.0; - } else { - gsl_vector_set(genotype, c_idv, -9.0); - n_miss++; - } - } - c_idv++; - } - } - - geno_mean /= (double)(ni_test - n_miss); - - for (size_t i = 0; i < genotype->size; ++i) { - geno = gsl_vector_get(genotype, i); - if (geno == -9) { - geno = 0; - } else { - geno -= geno_mean; - } - - gsl_vector_set(genotype, i, geno); - gsl_matrix_set(UtX, i, c_snp, geno); - } - - if (calc_K == true) { - gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); - } - - c_snp++; - } - - if (calc_K == true) { - gsl_matrix_scale(K, 1.0 / (double)ns_test); - - for (size_t i = 0; i < genotype->size; ++i) { - for (size_t j = 0; j < i; ++j) { - geno = gsl_matrix_get(K, j, i); - gsl_matrix_set(K, i, j, geno); - } - } - } - - gsl_vector_free(genotype); - infile.clear(); - infile.close(); - - return true; -} - -// Compact version of the above function, using uchar instead of gsl_matrix. -bool ReadFile_bed(const string &file_bed, vector &indicator_idv, - vector &indicator_snp, vector> &Xt, - gsl_matrix *K, const bool calc_K, const size_t ni_test, - const size_t ns_test) { - debug_msg("entered"); - ifstream infile(file_bed.c_str(), ios::binary); - if (!infile) { - cout << "error reading bed file:" << file_bed << endl; - return false; - } - - Xt.clear(); - vector Xt_row; - for (size_t i = 0; i < ni_test; i++) { - Xt_row.push_back(0); - } - - char ch[1]; - bitset<8> b; - - size_t ni_total = indicator_idv.size(); - size_t ns_total = indicator_snp.size(); - int n_bit; - - if (ni_total % 4 == 0) { - n_bit = ni_total / 4; - } else { - n_bit = ni_total / 4 + 1; - } - - // Print the first three magic numbers. - for (int i = 0; i < 3; ++i) { - infile.read(ch, 1); - b = ch[0]; - } - - if (calc_K == true) { - gsl_matrix_set_zero(K); - } - - gsl_vector *genotype = gsl_vector_safe_alloc(ni_test); - - double geno, geno_mean; - size_t n_miss; - size_t c_idv = 0, c_snp = 0, c = 0; - - // Start reading SNPs and doing association test. - for (size_t t = 0; t < ns_total; ++t) { - if (indicator_snp[t] == 0) { - continue; - } - - // n_bit, and 3 is the number of magic numbers. - infile.seekg(t * n_bit + 3); - - // Read genotypes. - c_idv = 0; - geno_mean = 0.0; - n_miss = 0; - c = 0; - for (int i = 0; i < n_bit; ++i) { - infile.read(ch, 1); - b = ch[0]; - - // Minor allele homozygous: 2.0; major: 0.0. - for (size_t j = 0; j < 4; ++j) { - if ((i == (n_bit - 1)) && c == ni_total) { - break; - } - if (indicator_idv[c] == 0) { - c++; - continue; - } - c++; - - if (b[2 * j] == 0) { - if (b[2 * j + 1] == 0) { - gsl_vector_set(genotype, c_idv, 2.0); - geno_mean += 2.0; - } else { - gsl_vector_set(genotype, c_idv, 1.0); - geno_mean += 1.0; - } - } else { - if (b[2 * j + 1] == 1) { - gsl_vector_set(genotype, c_idv, 0.0); - geno_mean += 0.0; - } else { - gsl_vector_set(genotype, c_idv, -9.0); - n_miss++; - } - } - c_idv++; - } - } - - geno_mean /= (double)(ni_test - n_miss); - - for (size_t i = 0; i < genotype->size; ++i) { - geno = gsl_vector_get(genotype, i); - if (geno == -9) { - geno = geno_mean; - } - - Xt_row[i] = Double02ToUchar(geno); - - geno -= geno_mean; - - gsl_vector_set(genotype, i, geno); - } - Xt.push_back(Xt_row); - - if (calc_K == true) { - gsl_blas_dsyr(CblasUpper, 1.0, genotype, K); - } - - c_snp++; - } - - if (calc_K == true) { - gsl_matrix_scale(K, 1.0 / (double)ns_test); - - for (size_t i = 0; i < genotype->size; ++i) { - for (size_t j = 0; j < i; ++j) { - geno = gsl_matrix_get(K, j, i); - gsl_matrix_set(K, i, j, geno); - } - } - } - - gsl_vector_free(genotype); - infile.clear(); - infile.close(); - - return true; -} - -bool ReadFile_est(const string &file_est, const vector &est_column, - map &mapRS2est) { - debug_msg("entered"); - mapRS2est.clear(); - - ifstream infile(file_est.c_str(), ifstream::in); - if (!infile) { - cout << "error opening estimated parameter file: " << file_est << endl; - return false; - } - - string line; - char *ch_ptr; - - string rs; - double alpha, beta, gamma, d; - - // Header. - getline(infile, line); - - size_t n = *max_element(est_column.begin(), est_column.end()); - - while (getline(infile, line)) { - ch_ptr = strtok_safe((char *)line.c_str(), " \t"); - - alpha = 0.0; - beta = 0.0; - gamma = 1.0; - for (size_t i = 0; i < n + 1; ++i) { - if (i == est_column[0] - 1) { - enforce(ch_ptr); - rs = ch_ptr; - } - if (i == est_column[1] - 1) { - enforce(ch_ptr); - alpha = atof(ch_ptr); - } - if (i == est_column[2] - 1) { - enforce(ch_ptr); - beta = atof(ch_ptr); - } - if (i == est_column[3] - 1) { - enforce(ch_ptr); - gamma = atof(ch_ptr); - } - if (i < n) { - ch_ptr = strtok(NULL, " \t"); - } - } - - d = alpha + beta * gamma; - - if (mapRS2est.count(rs) == 0) { - mapRS2est[rs] = d; - } else { - cout << "the same SNP occurs more than once in estimated " - << "parameter file: " << rs << endl; - return false; - } - } - - infile.clear(); - infile.close(); - return true; -} - -bool CountFileLines(const string &file_input, size_t &n_lines) { - debug_msg("entered"); - igzstream infile(file_input.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open file: " << file_input << endl; - return false; - } - - n_lines = count(istreambuf_iterator(infile), - istreambuf_iterator(), '\n'); - infile.seekg(0, ios::beg); - - return true; -} - -// Read gene expression file. -bool ReadFile_gene(const string &file_gene, vector &vec_read, - vector &snpInfo, size_t &ng_total) { - debug_msg("entered"); - vec_read.clear(); - ng_total = 0; - - igzstream infile(file_gene.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open gene expression file: " << file_gene << endl; - return false; - } - - string line; - char *ch_ptr; - string rs; - - size_t n_idv = 0, t = 0; - - // Header. - getline(infile, line); - - while (getline(infile, line)) { - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - rs = ch_ptr; - - ch_ptr = strtok(NULL, " , \t"); - - t = 0; - while (ch_ptr != NULL) { - if (ng_total == 0) { - vec_read.push_back(0); - t++; - n_idv++; - } else { - vec_read[t] += atof(ch_ptr); - t++; - } - - ch_ptr = strtok(NULL, " , \t"); - } - - if (t != n_idv) { - cout << "error! number of columns doesn't match in row: " << ng_total - << endl; - return false; - } - - SNPINFO sInfo = {"-9", rs, -9, -9, "-9", "-9", 0, -9, -9, 0, 0, 0}; - snpInfo.push_back(sInfo); - - ng_total++; - } - - infile.close(); - infile.clear(); - - return true; -} - -// Read header to determine which column contains which item. -bool ReadHeader_io(const string &line, HEADER &header) { - debug_msg("entered"); - string rs_ptr[] = {"rs", "RS", "snp", "SNP", "snps", "SNPS", - "snpid", "SNPID", "rsid", "RSID", "MarkerName"}; - set rs_set(rs_ptr, rs_ptr + 11); // create a set of 11 items - string chr_ptr[] = {"chr", "CHR"}; - set chr_set(chr_ptr, chr_ptr + 2); - string pos_ptr[] = { - "ps", "PS", "pos", "POS", "base_position", "BASE_POSITION", "bp", "BP"}; - set pos_set(pos_ptr, pos_ptr + 8); - string cm_ptr[] = {"cm", "CM"}; - set cm_set(cm_ptr, cm_ptr + 2); - string a1_ptr[] = {"a1", "A1", "allele1", "ALLELE1", "Allele1", "INC_ALLELE"}; - set a1_set(a1_ptr, a1_ptr + 5); - string a0_ptr[] = {"a0", "A0", "allele0", "ALLELE0", "Allele0", "a2", - "A2", "allele2", "ALLELE2", "Allele2", "DEC_ALLELE"}; - set a0_set(a0_ptr, a0_ptr + 10); - - string z_ptr[] = {"z", "Z", "z_score", "Z_SCORE", "zscore", "ZSCORE"}; - set z_set(z_ptr, z_ptr + 6); - string beta_ptr[] = {"beta", "BETA", "b", "B"}; - set beta_set(beta_ptr, beta_ptr + 4); - string sebeta_ptr[] = {"se_beta", "SE_BETA", "se", "SE"}; - set sebeta_set(sebeta_ptr, sebeta_ptr + 4); - string chisq_ptr[] = {"chisq", "CHISQ", "chisquare", "CHISQUARE"}; - set chisq_set(chisq_ptr, chisq_ptr + 4); - string p_ptr[] = {"p", "P", "pvalue", "PVALUE", "p-value", "P-VALUE"}; - set p_set(p_ptr, p_ptr + 6); - - string n_ptr[] = {"n", "N", "ntotal", "NTOTAL", "n_total", "N_TOTAL"}; - set n_set(n_ptr, n_ptr + 6); - string nmis_ptr[] = {"nmis", "NMIS", "n_mis", "N_MIS", "n_miss", "N_MISS"}; - set nmis_set(nmis_ptr, nmis_ptr + 6); - string nobs_ptr[] = {"nobs", "NOBS", "n_obs", "N_OBS"}; - set nobs_set(nobs_ptr, nobs_ptr + 4); - string ncase_ptr[] = {"ncase", "NCASE", "n_case", "N_CASE"}; - set ncase_set(ncase_ptr, ncase_ptr + 4); - string ncontrol_ptr[] = {"ncontrol", "NCONTROL", "n_control", "N_CONTROL"}; - set ncontrol_set(ncontrol_ptr, ncontrol_ptr + 4); - - string af_ptr[] = {"af", - "AF", - "maf", - "MAF", - "f", - "F", - "allele_freq", - "ALLELE_FREQ", - "allele_frequency", - "ALLELE_FREQUENCY", - "Freq.Allele1.HapMapCEU", - "FreqAllele1HapMapCEU", - "Freq1.Hapmap"}; - set af_set(af_ptr, af_ptr + 13); - string var_ptr[] = {"var", "VAR"}; - set var_set(var_ptr, var_ptr + 2); - - string ws_ptr[] = {"window_size", "WINDOW_SIZE", "ws", "WS"}; - set ws_set(ws_ptr, ws_ptr + 4); - string cor_ptr[] = {"cor", "COR", "r", "R"}; - set cor_set(cor_ptr, cor_ptr + 4); - - header.rs_col = 0; - header.chr_col = 0; - header.pos_col = 0; - header.cm_col = 0; - header.a1_col = 0; - header.a0_col = 0; - header.z_col = 0; - header.beta_col = 0; - header.sebeta_col = 0; - header.chisq_col = 0; - header.p_col = 0; - header.n_col = 0; - header.nmis_col = 0; - header.nobs_col = 0; - header.ncase_col = 0; - header.ncontrol_col = 0; - header.af_col = 0; - header.var_col = 0; - header.ws_col = 0; - header.cor_col = 0; - header.coln = 0; - - char *ch_ptr; - string type; - size_t n_error = 0; - - ch_ptr = strtok((char *)line.c_str(), " , \t"); - while (ch_ptr != NULL) { - type = ch_ptr; - if (rs_set.count(type) != 0) { - if (header.rs_col == 0) { - header.rs_col = header.coln + 1; - } else { - cout << "error! more than two rs columns in the file." << endl; - n_error++; - } - } else if (chr_set.count(type) != 0) { - if (header.chr_col == 0) { - header.chr_col = header.coln + 1; - } else { - cout << "error! more than two chr columns in the file." << endl; - n_error++; - } - } else if (pos_set.count(type) != 0) { - if (header.pos_col == 0) { - header.pos_col = header.coln + 1; - } else { - cout << "error! more than two pos columns in the file." << endl; - n_error++; - } - } else if (cm_set.count(type) != 0) { - if (header.cm_col == 0) { - header.cm_col = header.coln + 1; - } else { - cout << "error! more than two cm columns in the file." << endl; - n_error++; - } - } else if (a1_set.count(type) != 0) { - if (header.a1_col == 0) { - header.a1_col = header.coln + 1; - } else { - cout << "error! more than two allele1 columns in the file." << endl; - n_error++; - } - } else if (a0_set.count(type) != 0) { - if (header.a0_col == 0) { - header.a0_col = header.coln + 1; - } else { - cout << "error! more than two allele0 columns in the file." << endl; - n_error++; - } - } else if (z_set.count(type) != 0) { - if (header.z_col == 0) { - header.z_col = header.coln + 1; - } else { - cout << "error! more than two z columns in the file." << endl; - n_error++; - } - } else if (beta_set.count(type) != 0) { - if (header.beta_col == 0) { - header.beta_col = header.coln + 1; - } else { - cout << "error! more than two beta columns in the file." << endl; - n_error++; - } - } else if (sebeta_set.count(type) != 0) { - if (header.sebeta_col == 0) { - header.sebeta_col = header.coln + 1; - } else { - cout << "error! more than two se_beta columns in the file." << endl; - n_error++; - } - } else if (chisq_set.count(type) != 0) { - if (header.chisq_col == 0) { - header.chisq_col = header.coln + 1; - } else { - cout << "error! more than two z columns in the file." << endl; - n_error++; - } - } else if (p_set.count(type) != 0) { - if (header.p_col == 0) { - header.p_col = header.coln + 1; - } else { - cout << "error! more than two p columns in the file." << endl; - n_error++; - } - } else if (n_set.count(type) != 0) { - if (header.n_col == 0) { - header.n_col = header.coln + 1; - } else { - cout << "error! more than two n_total columns in the file." << endl; - n_error++; - } - } else if (nmis_set.count(type) != 0) { - if (header.nmis_col == 0) { - header.nmis_col = header.coln + 1; - } else { - cout << "error! more than two n_mis columns in the file." << endl; - n_error++; - } - } else if (nobs_set.count(type) != 0) { - if (header.nobs_col == 0) { - header.nobs_col = header.coln + 1; - } else { - cout << "error! more than two n_obs columns in the file." << endl; - n_error++; - } - } else if (ncase_set.count(type) != 0) { - if (header.ncase_col == 0) { - header.ncase_col = header.coln + 1; - } else { - cout << "error! more than two n_case columns in the file." << endl; - n_error++; - } - } else if (ncontrol_set.count(type) != 0) { - if (header.ncontrol_col == 0) { - header.ncontrol_col = header.coln + 1; - } else { - cout << "error! more than two n_control columns in the file." << endl; - n_error++; - } - } else if (ws_set.count(type) != 0) { - if (header.ws_col == 0) { - header.ws_col = header.coln + 1; - } else { - cout << "error! more than two window_size columns in the file." << endl; - n_error++; - } - } else if (af_set.count(type) != 0) { - if (header.af_col == 0) { - header.af_col = header.coln + 1; - } else { - cout << "error! more than two af columns in the file." << endl; - n_error++; - } - } else if (cor_set.count(type) != 0) { - if (header.cor_col == 0) { - header.cor_col = header.coln + 1; - } else { - cout << "error! more than two cor columns in the file." << endl; - n_error++; - } - } else { - string str = ch_ptr; - string cat = str.substr(str.size() - 2, 2); - - if (cat == "_c" || cat == "_C") { - - // continuous - header.catc_col.insert(header.coln + 1); - } else { - - // discrete - header.catd_col.insert(header.coln + 1); - } - } - - ch_ptr = strtok(NULL, " , \t"); - header.coln++; - } - - if (header.cor_col != 0 && header.cor_col != header.coln) { - cout << "error! the cor column should be the last column." << endl; - n_error++; - } - - if (header.rs_col == 0) { - if (header.chr_col != 0 && header.pos_col != 0) { - cout << "missing an rs column. rs id will be replaced by chr:pos" << endl; - } else { - cout << "error! missing an rs column." << endl; - n_error++; - } - } - - if (n_error == 0) { - return true; - } else { - return false; - } -} - -// Read category file, record mapRS2 in the category file does not -// contain a null category so if a snp has 0 for all categories, then -// it is not included in the analysis. -bool ReadFile_cat(const string &file_cat, map &mapRS2cat, - size_t &n_vc) { - debug_msg("entered"); - mapRS2cat.clear(); - - igzstream infile(file_cat.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open category file: " << file_cat << endl; - return false; - } - - string line; - char *ch_ptr; - - string rs, chr, a1, a0, pos, cm; - size_t i_cat; - - // Read header. - HEADER header; - safeGetline(infile, line).eof(); - ReadHeader_io(line, header); - - // Use the header to count the number of categories. - n_vc = header.coln; - if (header.rs_col != 0) { - n_vc--; - } - if (header.chr_col != 0) { - n_vc--; - } - if (header.pos_col != 0) { - n_vc--; - } - if (header.cm_col != 0) { - n_vc--; - } - if (header.a1_col != 0) { - n_vc--; - } - if (header.a0_col != 0) { - n_vc--; - } - - // Read the following lines to record mapRS2cat. - while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - - i_cat = 0; - for (size_t i = 0; i < header.coln; i++) { - enforce(ch_ptr); - if (header.rs_col != 0 && header.rs_col == i + 1) { - rs = ch_ptr; - } else if (header.chr_col != 0 && header.chr_col == i + 1) { - chr = ch_ptr; - } else if (header.pos_col != 0 && header.pos_col == i + 1) { - pos = ch_ptr; - } else if (header.cm_col != 0 && header.cm_col == i + 1) { - cm = ch_ptr; - } else if (header.a1_col != 0 && header.a1_col == i + 1) { - a1 = ch_ptr; - } else if (header.a0_col != 0 && header.a0_col == i + 1) { - a0 = ch_ptr; - } else if (atoi(ch_ptr) == 1 || atoi(ch_ptr) == 0) { - if (i_cat == 0) { - if (header.rs_col == 0) { - rs = chr + ":" + pos; - } - } - - if (atoi(ch_ptr) == 1 && mapRS2cat.count(rs) == 0) { - mapRS2cat[rs] = i_cat; - } - i_cat++; - } else { - } - - ch_ptr = strtok(NULL, " , \t"); - } - } - - infile.clear(); - infile.close(); - - return true; -} - -bool ReadFile_mcat(const string &file_mcat, map &mapRS2cat, - size_t &n_vc) { - debug_msg("entered"); - mapRS2cat.clear(); - - igzstream infile(file_mcat.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open mcategory file: " << file_mcat << endl; - return false; - } - - string file_name; - map mapRS2cat_tmp; - size_t n_vc_tmp, t = 0; - - while (!safeGetline(infile, file_name).eof()) { - mapRS2cat_tmp.clear(); - ReadFile_cat(file_name, mapRS2cat_tmp, n_vc_tmp); - mapRS2cat.insert(mapRS2cat_tmp.begin(), mapRS2cat_tmp.end()); - if (t == 0) { - n_vc = n_vc_tmp; - } else { - n_vc = max(n_vc, n_vc_tmp); - } - t++; - } - - return true; -} - -// Read bimbam mean genotype file and calculate kinship matrix; this -// time, the kinship matrix is not centered, and can contain multiple -// K matrix. -bool BimbamKinUncentered(const string &file_geno, const set ksnps, - const int display_pace, - const vector &indicator_idv, - const vector &indicator_snp, - const map &mapRS2weight, - const map &mapRS2cat, - const vector &snpInfo, const gsl_matrix *W, - gsl_matrix *matrix_kin, gsl_vector *vector_ns) { - debug_msg("entered"); - debug_msg("BimbamKinUncentered"); - igzstream infile(file_geno.c_str(), igzstream::in); - if (!infile) { - cout << "error reading genotype file:" << file_geno << endl; - return false; - } - - string line; - char *ch_ptr; - - size_t n_miss; - double d, geno_mean, geno_var; - - size_t ni_test = matrix_kin->size1; - gsl_vector *geno = gsl_vector_safe_alloc(ni_test); - gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_test); - - gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); - gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); - gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); - gsl_permutation *pmt = gsl_permutation_alloc(W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp(WtW, pmt, &sig); - LUInvert(WtW, pmt, WtWi); - - size_t n_vc = matrix_kin->size2 / ni_test, i_vc; - string rs; - vector ns_vec; - for (size_t i = 0; i < n_vc; i++) { - ns_vec.push_back(0); - } - - // Create a large matrix. - const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_test, msize * n_vc); - gsl_matrix_set_zero(Xlarge); - - size_t ns_test = 0; - for (size_t t = 0; t < indicator_snp.size(); ++t) { - safeGetline(infile, line).eof(); - if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); - } - if (indicator_snp[t] == 0) - continue; - - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - ch_ptr = strtok_safe(NULL, " , \t"); - ch_ptr = strtok_safe(NULL, " , \t"); - - rs = snpInfo[t].rs_number; // This line is new. - - geno_mean = 0.0; - n_miss = 0; - geno_var = 0.0; - gsl_vector_set_all(geno_miss, 0); - - size_t j = 0; - for (size_t i = 0; i < indicator_idv.size(); ++i) { - if (indicator_idv[i] == 0) { - continue; - } - ch_ptr = strtok_safe(NULL, " , \t"); - if (strcmp(ch_ptr, "NA") == 0) { - gsl_vector_set(geno_miss, i, 0); - n_miss++; - } else { - d = atof(ch_ptr); - gsl_vector_set(geno, j, d); - gsl_vector_set(geno_miss, j, 1); - geno_mean += d; - geno_var += d * d; - } - j++; - } - - geno_mean /= (double)(ni_test - n_miss); - geno_var += geno_mean * geno_mean * (double)n_miss; - geno_var /= (double)ni_test; - geno_var -= geno_mean * geno_mean; - - for (size_t i = 0; i < ni_test; ++i) { - if (gsl_vector_get(geno_miss, i) == 0) { - gsl_vector_set(geno, i, geno_mean); - } - } - - gsl_vector_add_constant(geno, -1.0 * geno_mean); - - gsl_blas_dgemv(CblasTrans, 1.0, W, geno, 0.0, Wtx); - gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - gsl_blas_dgemv(CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno); - gsl_blas_ddot(geno, geno, &geno_var); - geno_var /= (double)ni_test; - - if (geno_var != 0 && - (mapRS2weight.size() == 0 || mapRS2weight.count(rs) != 0)) { - if (mapRS2weight.size() == 0) { - d = 1.0 / geno_var; - } else { - d = mapRS2weight.at(rs) / geno_var; - } - - gsl_vector_scale(geno, sqrt(d)); - if (n_vc == 1 || mapRS2cat.size() == 0) { - gsl_vector_view Xlarge_col = - gsl_matrix_column(Xlarge, ns_vec[0] % msize); - gsl_vector_memcpy(&Xlarge_col.vector, geno); - ns_vec[0]++; - - if (ns_vec[0] % msize == 0) { - fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); - gsl_matrix_set_zero(Xlarge); - } - } else if (mapRS2cat.count(rs) != 0) { - i_vc = mapRS2cat.at(rs); - - gsl_vector_view Xlarge_col = - gsl_matrix_column(Xlarge, msize * i_vc + ns_vec[i_vc] % msize); - gsl_vector_memcpy(&Xlarge_col.vector, geno); - - ns_vec[i_vc]++; - - if (ns_vec[i_vc] % msize == 0) { - gsl_matrix_view X_sub = - gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); - gsl_matrix_view kin_sub = gsl_matrix_submatrix( - matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, - &kin_sub.matrix); - - gsl_matrix_set_zero(&X_sub.matrix); - } - } - } - ns_test++; - } - - for (size_t i_vc = 0; i_vc < n_vc; i_vc++) { - if (ns_vec[i_vc] % msize != 0) { - gsl_matrix_view X_sub = - gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); - gsl_matrix_view kin_sub = - gsl_matrix_submatrix(matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, - &kin_sub.matrix); - } - } - - cout << endl; - - for (size_t t = 0; t < n_vc; t++) { - gsl_vector_set(vector_ns, t, ns_vec[t]); - - for (size_t i = 0; i < ni_test; ++i) { - for (size_t j = 0; j <= i; ++j) { - d = gsl_matrix_get(matrix_kin, j, i + ni_test * t); - d /= (double)ns_vec[t]; - gsl_matrix_set(matrix_kin, i, j + ni_test * t, d); - gsl_matrix_set(matrix_kin, j, i + ni_test * t, d); - } - } - } - - gsl_vector_free(geno); - gsl_vector_free(geno_miss); - - gsl_vector_free(Wtx); - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(WtWiWtx); - gsl_permutation_free(pmt); - - gsl_matrix_free(Xlarge); - - infile.close(); - infile.clear(); - - return true; -} - -bool PlinkKin(const string &file_bed, const int display_pace, - const vector &indicator_idv, - const vector &indicator_snp, - const map &mapRS2weight, - const map &mapRS2cat, - const vector &snpInfo, const gsl_matrix *W, - gsl_matrix *matrix_kin, gsl_vector *vector_ns) { - debug_msg("entered"); - ifstream infile(file_bed.c_str(), ios::binary); - if (!infile) { - cout << "error reading bed file:" << file_bed << endl; - return false; - } - - char ch[1]; - bitset<8> b; - - size_t n_miss, ci_total, ci_test; - double d, geno_mean, geno_var; - - size_t ni_test = matrix_kin->size1; - size_t ni_total = indicator_idv.size(); - gsl_vector *geno = gsl_vector_safe_alloc(ni_test); - - gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2); - gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); - gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2); - gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2); - gsl_permutation *pmt = gsl_permutation_alloc(W->size2); - - gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, W, W, 0.0, WtW); - int sig; - LUDecomp(WtW, pmt, &sig); - LUInvert(WtW, pmt, WtWi); - - size_t ns_test = 0; - int n_bit; - - size_t n_vc = matrix_kin->size2 / ni_test, i_vc; - string rs; - vector ns_vec; - for (size_t i = 0; i < n_vc; i++) { - ns_vec.push_back(0); - } - - // Create a large matrix. - const size_t msize = K_BATCH_SIZE; - gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_test, msize * n_vc); - gsl_matrix_set_zero(Xlarge); - - // Calculate n_bit and c, the number of bit for each SNP. - if (ni_total % 4 == 0) { - n_bit = ni_total / 4; - } else { - n_bit = ni_total / 4 + 1; - } - - // Print the first three magic numbers. - for (int i = 0; i < 3; ++i) { - infile.read(ch, 1); - b = ch[0]; - } - - for (size_t t = 0; t < indicator_snp.size(); ++t) { - if (t % display_pace == 0 || t == (indicator_snp.size() - 1)) { - ProgressBar("Reading SNPs", t, indicator_snp.size() - 1); - } - if (indicator_snp[t] == 0) { - continue; - } - - // n_bit, and 3 is the number of magic numbers - infile.seekg(t * n_bit + 3); - - rs = snpInfo[t].rs_number; // This line is new. - - // Read genotypes. - geno_mean = 0.0; - n_miss = 0; - ci_total = 0; - geno_var = 0.0; - ci_test = 0; - for (int i = 0; i < n_bit; ++i) { - infile.read(ch, 1); - b = ch[0]; - - // Minor allele homozygous: 2.0; major: 0.0; - for (size_t j = 0; j < 4; ++j) { - if ((i == (n_bit - 1)) && ci_total == ni_total) { - break; - } - if (indicator_idv[ci_total] == 0) { - ci_total++; - continue; - } - - if (b[2 * j] == 0) { - if (b[2 * j + 1] == 0) { - gsl_vector_set(geno, ci_test, 2.0); - geno_mean += 2.0; - geno_var += 4.0; - } else { - gsl_vector_set(geno, ci_test, 1.0); - geno_mean += 1.0; - geno_var += 1.0; - } - } else { - if (b[2 * j + 1] == 1) { - gsl_vector_set(geno, ci_test, 0.0); - } else { - gsl_vector_set(geno, ci_test, -9.0); - n_miss++; - } - } - - ci_test++; - ci_total++; - } - } - - geno_mean /= (double)(ni_test - n_miss); - geno_var += geno_mean * geno_mean * (double)n_miss; - geno_var /= (double)ni_test; - geno_var -= geno_mean * geno_mean; - - for (size_t i = 0; i < ni_test; ++i) { - d = gsl_vector_get(geno, i); - if (d == -9.0) { - gsl_vector_set(geno, i, geno_mean); - } - } - - gsl_vector_add_constant(geno, -1.0 * geno_mean); - - gsl_blas_dgemv(CblasTrans, 1.0, W, geno, 0.0, Wtx); - gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx); - gsl_blas_dgemv(CblasNoTrans, -1.0, W, WtWiWtx, 1.0, geno); - gsl_blas_ddot(geno, geno, &geno_var); - geno_var /= (double)ni_test; - - if (geno_var != 0 && - (mapRS2weight.size() == 0 || mapRS2weight.count(rs) != 0)) { - if (mapRS2weight.size() == 0) { - d = 1.0 / geno_var; - } else { - d = mapRS2weight.at(rs) / geno_var; - } - - gsl_vector_scale(geno, sqrt(d)); - if (n_vc == 1 || mapRS2cat.size() == 0) { - gsl_vector_view Xlarge_col = - gsl_matrix_column(Xlarge, ns_vec[0] % msize); - gsl_vector_memcpy(&Xlarge_col.vector, geno); - ns_vec[0]++; - - if (ns_vec[0] % msize == 0) { - fast_eigen_dgemm("N", "T", 1.0, Xlarge, Xlarge, 1.0, matrix_kin); - gsl_matrix_set_zero(Xlarge); - } - } else if (mapRS2cat.count(rs) != 0) { - i_vc = mapRS2cat.at(rs); - - gsl_vector_view Xlarge_col = - gsl_matrix_column(Xlarge, msize * i_vc + ns_vec[i_vc] % msize); - gsl_vector_memcpy(&Xlarge_col.vector, geno); - - ns_vec[i_vc]++; - - if (ns_vec[i_vc] % msize == 0) { - gsl_matrix_view X_sub = - gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); - gsl_matrix_view kin_sub = gsl_matrix_submatrix( - matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, - &kin_sub.matrix); - - gsl_matrix_set_zero(&X_sub.matrix); - } - } - } - ns_test++; - } - - for (size_t i_vc = 0; i_vc < n_vc; i_vc++) { - if (ns_vec[i_vc] % msize != 0) { - gsl_matrix_view X_sub = - gsl_matrix_submatrix(Xlarge, 0, msize * i_vc, ni_test, msize); - gsl_matrix_view kin_sub = - gsl_matrix_submatrix(matrix_kin, 0, ni_test * i_vc, ni_test, ni_test); - fast_eigen_dgemm("N", "T", 1.0, &X_sub.matrix, &X_sub.matrix, 1.0, - &kin_sub.matrix); - } - } - - cout << endl; - - for (size_t t = 0; t < n_vc; t++) { - gsl_vector_set(vector_ns, t, ns_vec[t]); - - for (size_t i = 0; i < ni_test; ++i) { - for (size_t j = 0; j <= i; ++j) { - d = gsl_matrix_get(matrix_kin, j, i + ni_test * t); - d /= (double)ns_vec[t]; - gsl_matrix_set(matrix_kin, i, j + ni_test * t, d); - gsl_matrix_set(matrix_kin, j, i + ni_test * t, d); - } - } - } - - gsl_vector_free(geno); - - gsl_vector_free(Wtx); - gsl_matrix_free(WtW); - gsl_matrix_free(WtWi); - gsl_vector_free(WtWiWtx); - gsl_permutation_free(pmt); - - gsl_matrix_free(Xlarge); - - infile.close(); - infile.clear(); - - return true; -} - -bool MFILEKin(const size_t mfile_mode, const string &file_mfile, - const set setKSnps, const int display_pace, - const vector &indicator_idv, - const vector> &mindicator_snp, - const map &mapRS2weight, - const map &mapRS2cat, - const vector> &msnpInfo, const gsl_matrix *W, - gsl_matrix *matrix_kin, gsl_vector *vector_ns) { - debug_msg("entered"); - size_t n_vc = vector_ns->size, ni_test = matrix_kin->size1; - gsl_matrix_set_zero(matrix_kin); - gsl_vector_set_zero(vector_ns); - - igzstream infile(file_mfile.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open mfile file: " << file_mfile << endl; - return false; - } - - string file_name; - - gsl_matrix *kin_tmp = gsl_matrix_safe_alloc(matrix_kin->size1, matrix_kin->size2); - gsl_vector *ns_tmp = gsl_vector_safe_alloc(vector_ns->size); - - size_t l = 0; - double d; - while (!safeGetline(infile, file_name).eof()) { - gsl_matrix_set_zero(kin_tmp); - gsl_vector_set_zero(ns_tmp); - - if (mfile_mode == 1) { - file_name += ".bed"; - PlinkKin(file_name, display_pace, indicator_idv, mindicator_snp[l], - mapRS2weight, mapRS2cat, msnpInfo[l], W, kin_tmp, ns_tmp); - } else { - BimbamKinUncentered(file_name, setKSnps, display_pace, indicator_idv, - mindicator_snp[l], mapRS2weight, mapRS2cat, - msnpInfo[l], W, kin_tmp, ns_tmp); - } - - // Add ns. - gsl_vector_add(vector_ns, ns_tmp); - - // Add kin. - for (size_t t = 0; t < n_vc; t++) { - for (size_t i = 0; i < ni_test; ++i) { - for (size_t j = 0; j <= i; ++j) { - d = gsl_matrix_get(matrix_kin, j, i + ni_test * t) + - gsl_matrix_get(kin_tmp, j, i + ni_test * t) * - gsl_vector_get(ns_tmp, t); - - gsl_matrix_set(matrix_kin, i, j + ni_test * t, d); - gsl_matrix_set(matrix_kin, j, i + ni_test * t, d); - } - } - } - l++; - } - - // Renormalize kin. - for (size_t t = 0; t < n_vc; t++) { - for (size_t i = 0; i < ni_test; ++i) { - for (size_t j = 0; j <= i; ++j) { - d = gsl_matrix_get(matrix_kin, j, i + ni_test * t) / - gsl_vector_get(vector_ns, t); - - gsl_matrix_set(matrix_kin, i, j + ni_test * t, d); - gsl_matrix_set(matrix_kin, j, i + ni_test * t, d); - } - } - } - cout << endl; - - infile.close(); - infile.clear(); - - gsl_matrix_free(kin_tmp); - gsl_vector_free(ns_tmp); - - return true; -} - -// Read var file, store mapRS2wsnp. -bool ReadFile_wsnp(const string &file_wsnp, map &mapRS2weight) { - debug_msg("entered"); - mapRS2weight.clear(); - - igzstream infile(file_wsnp.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open snp weight file: " << file_wsnp << endl; - return false; - } - - char *ch_ptr; - string line, rs; - double weight; - - while (!safeGetline(infile, line).eof()) { - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - rs = ch_ptr; - ch_ptr = strtok_safe(NULL, " , \t"); - weight = atof(ch_ptr); - mapRS2weight[rs] = weight; - } - - return true; -} - -bool ReadFile_wsnp(const string &file_wcat, const size_t n_vc, - map> &mapRS2wvector) { - debug_msg("entered"); - mapRS2wvector.clear(); - - igzstream infile(file_wcat.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open snp weight file: " << file_wcat << endl; - return false; - } - - char *ch_ptr; - vector weight; - for (size_t i = 0; i < n_vc; i++) { - weight.push_back(0.0); - } - - string line, rs, chr, a1, a0, pos, cm; - - // Read header. - HEADER header; - safeGetline(infile, line).eof(); - ReadHeader_io(line, header); - - while (!safeGetline(infile, line).eof()) { - if (isBlankLine(line)) { - continue; - } - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - - size_t t = 0; - for (size_t i = 0; i < header.coln; i++) { - enforce(ch_ptr); - if (header.rs_col != 0 && header.rs_col == i + 1) { - rs = ch_ptr; - } else if (header.chr_col != 0 && header.chr_col == i + 1) { - chr = ch_ptr; - } else if (header.pos_col != 0 && header.pos_col == i + 1) { - pos = ch_ptr; - } else if (header.cm_col != 0 && header.cm_col == i + 1) { - cm = ch_ptr; - } else if (header.a1_col != 0 && header.a1_col == i + 1) { - a1 = ch_ptr; - } else if (header.a0_col != 0 && header.a0_col == i + 1) { - a0 = ch_ptr; - } else { - weight[t] = atof(ch_ptr); - t++; - if (t > n_vc) { - cout << "error! Number of columns in the wcat file does not " - << "match that of cat file."; - return false; - } - } - - ch_ptr = strtok(NULL, " , \t"); - } - - if (t != n_vc) { - cout << "error! Number of columns in the wcat file does not " - << "match that of cat file."; - return false; - } - - if (header.rs_col == 0) { - rs = chr + ":" + pos; - } - - mapRS2wvector[rs] = weight; - } - - return true; -} - -// Read the beta file, save snp z scores in to z2_score, and save -// category into indicator_snp based on mapRS2var and set, and -// indicator_snp record the category number (from 1 to n_vc), and -// provide var if maf/var is not provided in the beta file notice that -// indicator_snp contains ns_test snps, instead of ns_total snps read -// the beta file for the second time, compute q, and Vq based on block -// jacknife use the mapRS2var to select snps (and to ), calculate q do -// a block-wise jacknife, and compute Vq -void ReadFile_beta(const string &file_beta, - const map &mapRS2cat, - const map &mapRS2wA, vector &vec_cat, - vector &vec_ni, vector &vec_weight, - vector &vec_z2, size_t &ni_total, size_t &ns_total, - size_t &ns_test) { - debug_msg("entered"); - vec_cat.clear(); - vec_ni.clear(); - vec_weight.clear(); - vec_z2.clear(); - ni_total = 0; - ns_total = 0; - ns_test = 0; - - igzstream infile(file_beta.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open beta file: " << file_beta << endl; - return; - } - - string line; - char *ch_ptr; - string type; - - string rs, chr, a1, a0, pos, cm; - double z = 0, beta = 0, se_beta = 0, pvalue = 0, zsquare = 0; // af = 0; - size_t n_total = 0, n_mis = 0, n_obs = 0, n_case = 0, n_control = 0; - - // Read header. - HEADER header; - safeGetline(infile, line).eof(); - ReadHeader_io(line, header); - - if (header.n_col == 0) { - if ((header.nobs_col == 0 && header.nmis_col == 0) && - (header.ncase_col == 0 && header.ncontrol_col == 0)) { - cout << "error! missing sample size in the beta file." << endl; - } else { - cout << "total sample size will be replaced by obs/mis sample size." - << endl; - } - } - - if (header.z_col == 0 && (header.beta_col == 0 || header.sebeta_col == 0) && - header.chisq_col == 0 && header.p_col == 0) { - cout << "error! missing z scores in the beta file." << endl; - } - - while (!safeGetline(infile, line).eof()) { - if (isBlankLine(line)) { - continue; - } - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - - z = 0; - beta = 0; - se_beta = 0; - auto chisq = 0.0; - pvalue = 0; - n_total = 0; - n_mis = 0; - n_obs = 0; - n_case = 0; - n_control = 0; - // af = 0; - // auto var_x = 0.0; - for (size_t i = 0; i < header.coln; i++) { - enforce(ch_ptr); - if (header.rs_col != 0 && header.rs_col == i + 1) { - rs = ch_ptr; - } - if (header.chr_col != 0 && header.chr_col == i + 1) { - chr = ch_ptr; - } - if (header.pos_col != 0 && header.pos_col == i + 1) { - pos = ch_ptr; - } - if (header.cm_col != 0 && header.cm_col == i + 1) { - cm = ch_ptr; - } - if (header.a1_col != 0 && header.a1_col == i + 1) { - a1 = ch_ptr; - } - if (header.a0_col != 0 && header.a0_col == i + 1) { - a0 = ch_ptr; - } - - if (header.z_col != 0 && header.z_col == i + 1) { - z = atof(ch_ptr); - } - if (header.beta_col != 0 && header.beta_col == i + 1) { - beta = atof(ch_ptr); - } - if (header.sebeta_col != 0 && header.sebeta_col == i + 1) { - se_beta = atof(ch_ptr); - } - if (header.chisq_col != 0 && header.chisq_col == i + 1) { - chisq = atof(ch_ptr); - } - if (header.p_col != 0 && header.p_col == i + 1) { - pvalue = atof(ch_ptr); - } - - if (header.n_col != 0 && header.n_col == i + 1) { - n_total = atoi(ch_ptr); - } - if (header.nmis_col != 0 && header.nmis_col == i + 1) { - n_mis = atoi(ch_ptr); - } - if (header.nobs_col != 0 && header.nobs_col == i + 1) { - n_obs = atoi(ch_ptr); - } - if (header.ncase_col != 0 && header.ncase_col == i + 1) { - n_case = atoi(ch_ptr); - } - if (header.ncontrol_col != 0 && header.ncontrol_col == i + 1) { - n_control = atoi(ch_ptr); - } - // if (header.af_col != 0 && header.af_col == i + 1) { - // af = atof(ch_ptr); - // } - // if (header.var_col != 0 && header.var_col == i + 1) { - // var_x = atof(ch_ptr); - // } - - ch_ptr = strtok(NULL, " , \t"); - } - - if (header.rs_col == 0) { - rs = chr + ":" + pos; - } - - if (header.n_col == 0) { - if (header.nmis_col != 0 && header.nobs_col != 0) { - n_total = n_mis + n_obs; - } else { - n_total = n_case + n_control; - } - } - - // Both z values and beta/se_beta have directions, while - // chisq/pvalue do not. - if (header.z_col != 0) { - zsquare = z * z; - } else if (header.beta_col != 0 && header.sebeta_col != 0) { - z = beta / se_beta; - zsquare = z * z; - } else if (header.chisq_col != 0) { - zsquare = chisq; - } else if (header.p_col != 0) { - zsquare = gsl_cdf_chisq_Qinv(pvalue, 1); - } else { - zsquare = 0; - } - - // Obtain var_x. - // if (header.var_col == 0 && header.af_col != 0) { - // var_x = 2.0 * af * (1.0 - af); - // } - - // If the SNP is also present in cor file, then do calculations. - if ((mapRS2wA.size() == 0 || mapRS2wA.count(rs) != 0) && - (mapRS2cat.size() == 0 || mapRS2cat.count(rs) != 0) && zsquare != 0) { - if (mapRS2cat.size() != 0) { - vec_cat.push_back(mapRS2cat.at(rs)); - } else { - vec_cat.push_back(0); - } - vec_ni.push_back(n_total); - if (mapRS2wA.size() == 0) { - vec_weight.push_back(1); - } else { - vec_weight.push_back(mapRS2wA.at(rs)); - } - vec_z2.push_back(zsquare); - - ni_total = max(ni_total, n_total); - ns_test++; - } - - ns_total++; - } - - infile.clear(); - infile.close(); - - return; -} - -void ReadFile_beta(const string &file_beta, const map &mapRS2wA, - map &mapRS2A1, - map &mapRS2z) { - debug_msg("entered"); - mapRS2A1.clear(); - mapRS2z.clear(); - - igzstream infile(file_beta.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open beta file: " << file_beta << endl; - return; - } - - string line; - char *ch_ptr; - string type; - - string rs, chr, a1, a0, pos, cm; - double z = 0, beta = 0, se_beta = 0; // pvalue = 0, chisq=0, af = 0 , var_x = 0; - size_t n_total = 0, n_mis = 0, n_obs = 0, n_case = 0, n_control = 0; - size_t ni_total = 0, ns_total = 0, ns_test = 0; - - // Read header. - HEADER header; - safeGetline(infile, line).eof(); - ReadHeader_io(line, header); - - if (header.n_col == 0) { - if ((header.nobs_col == 0 && header.nmis_col == 0) && - (header.ncase_col == 0 && header.ncontrol_col == 0)) { - cout << "error! missing sample size in the beta file." << endl; - } else { - cout << "total sample size will be replaced by obs/mis sample size." - << endl; - } - } - - if (header.z_col == 0 && (header.beta_col == 0 || header.sebeta_col == 0)) { - cout << "error! missing z scores in the beta file." << endl; - } - - while (!safeGetline(infile, line).eof()) { - if (isBlankLine(line)) { - continue; - } - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - - z = 0; - beta = 0; - se_beta = 0; - // chisq = 0; - // pvalue = 0; - n_total = 0; - n_mis = 0; - n_obs = 0; - n_case = 0; - n_control = 0; - // af = 0; - // double var_x = 0; - for (size_t i = 0; i < header.coln; i++) { - enforce(ch_ptr); - if (header.rs_col != 0 && header.rs_col == i + 1) { - rs = ch_ptr; - } - if (header.chr_col != 0 && header.chr_col == i + 1) { - chr = ch_ptr; - } - if (header.pos_col != 0 && header.pos_col == i + 1) { - pos = ch_ptr; - } - if (header.cm_col != 0 && header.cm_col == i + 1) { - cm = ch_ptr; - } - if (header.a1_col != 0 && header.a1_col == i + 1) { - a1 = ch_ptr; - } - if (header.a0_col != 0 && header.a0_col == i + 1) { - a0 = ch_ptr; - } - - if (header.z_col != 0 && header.z_col == i + 1) { - z = atof(ch_ptr); - } - if (header.beta_col != 0 && header.beta_col == i + 1) { - beta = atof(ch_ptr); - } - if (header.sebeta_col != 0 && header.sebeta_col == i + 1) { - se_beta = atof(ch_ptr); - } - // if (header.chisq_col != 0 && header.chisq_col == i + 1) { - // chisq = atof(ch_ptr); - // } - // if (header.p_col != 0 && header.p_col == i + 1) { - // pvalue = atof(ch_ptr); - // } - - if (header.n_col != 0 && header.n_col == i + 1) { - n_total = atoi(ch_ptr); - } - if (header.nmis_col != 0 && header.nmis_col == i + 1) { - n_mis = atoi(ch_ptr); - } - if (header.nobs_col != 0 && header.nobs_col == i + 1) { - n_obs = atoi(ch_ptr); - } - if (header.ncase_col != 0 && header.ncase_col == i + 1) { - n_case = atoi(ch_ptr); - } - if (header.ncontrol_col != 0 && header.ncontrol_col == i + 1) { - n_control = atoi(ch_ptr); - } - - // if (header.af_col != 0 && header.af_col == i + 1) { - // af = atof(ch_ptr); - // } - - // if (header.var_col != 0 && header.var_col == i + 1) { - // var_x = atof(ch_ptr); - // } - - ch_ptr = strtok(NULL, " , \t"); - } - - if (header.rs_col == 0) { - rs = chr + ":" + pos; - } - - if (header.n_col == 0) { - if (header.nmis_col != 0 && header.nobs_col != 0) { - n_total = n_mis + n_obs; - } else { - n_total = n_case + n_control; - } - } - - // Both z values and beta/se_beta have directions, while - // chisq/pvalue do not. - if (header.z_col != 0) { - z = z; - } else if (header.beta_col != 0 && header.sebeta_col != 0) { - z = beta / se_beta; - } else { - z = 0; - } - - // If the snp is also present in cor file, then do calculations. - if ((mapRS2wA.size() == 0 || mapRS2wA.count(rs) != 0)) { - mapRS2z[rs] = z; - mapRS2A1[rs] = a1; - - ni_total = max(ni_total, n_total); - ns_test++; - } - - ns_total++; - } - - infile.clear(); - infile.close(); - - return; -} - -void Calcq(const size_t n_block, const vector &vec_cat, - const vector &vec_ni, const vector &vec_weight, - const vector &vec_z2, gsl_matrix *Vq, gsl_vector *q, - gsl_vector *s) { - debug_msg("entered"); - gsl_matrix_set_zero(Vq); - gsl_vector_set_zero(q); - gsl_vector_set_zero(s); - - size_t cat, n_total; - double w, zsquare; - - vector vec_q, vec_s, n_snps; - for (size_t i = 0; i < q->size; i++) { - vec_q.push_back(0.0); - vec_s.push_back(0.0); - n_snps.push_back(0.0); - } - - vector> mat_q, mat_s; - for (size_t i = 0; i < n_block; i++) { - mat_q.push_back(vec_q); - mat_s.push_back(vec_s); - } - - // Compute q and s. - for (size_t i = 0; i < vec_cat.size(); i++) { - - // Extract quantities. - cat = vec_cat[i]; - n_total = vec_ni[i]; - w = vec_weight[i]; - zsquare = vec_z2[i]; - - // Compute q and s. - vec_q[cat] += (zsquare - 1.0) * w / (double)n_total; - vec_s[cat] += w; - n_snps[cat]++; - } - - // Update q; vec_q is used again for computing Vq below. - for (size_t i = 0; i < q->size; i++) { - if (vec_s[i] != 0) { - gsl_vector_set(q, i, vec_q[i] / vec_s[i]); - } - gsl_vector_set(s, i, vec_s[i]); - } - - // Compute Vq; divide SNPs in each category into evenly distributed - // blocks. - size_t t = 0, b = 0, n_snp = 0; - double d, m, n; - for (size_t l = 0; l < q->size; l++) { - n_snp = floor(n_snps[l] / n_block); - t = 0; - b = 0; - if (n_snp == 0) { - continue; - } - - // Initiate everything to zero. - for (size_t i = 0; i < n_block; i++) { - for (size_t j = 0; j < q->size; j++) { - mat_q[i][j] = 0; - mat_s[i][j] = 0; - } - } - - // Record values. - for (size_t i = 0; i < vec_cat.size(); i++) { - - // Extract quantities. - cat = vec_cat[i]; - n_total = vec_ni[i]; - w = vec_weight[i]; - zsquare = vec_z2[i]; - - // Save quantities for computing Vq (which is not divided by - // n_total). - mat_q[b][cat] += (zsquare - 1.0) * w; - mat_s[b][cat] += w; - - if (cat == l) { - if (b < n_block - 1) { - if (t < n_snp - 1) { - t++; - } else { - b++; - t = 0; - } - } else { - t++; - } - } - } - - // Center mat_q. - for (size_t i = 0; i < q->size; i++) { - m = 0; - n = 0; - for (size_t k = 0; k < n_block; k++) { - if (mat_s[k][i] != 0 && vec_s[i] != mat_s[k][i]) { - d = (vec_q[i] - mat_q[k][i]) / (vec_s[i] - mat_s[k][i]); - mat_q[k][i] = d; - m += d; - n++; - } - } - if (n != 0) { - m /= n; - } - - for (size_t k = 0; k < n_block; k++) { - if (mat_q[k][i] != 0) { - mat_q[k][i] -= m; - } - } - } - - // Compute Vq for l'th row and l'th column only. - for (size_t i = 0; i < q->size; i++) { - d = 0; - n = 0; - for (size_t k = 0; k < n_block; k++) { - if (mat_q[k][l] != 0 && mat_q[k][i] != 0) { - d += mat_q[k][l] * mat_q[k][i]; - n++; - } - } - if (n != 0) { - d /= n; - d *= n - 1; - } - d += gsl_matrix_get(Vq, i, l); - gsl_matrix_set(Vq, i, l, d); - if (i != l) { - gsl_matrix_set(Vq, l, i, d); - } - } - } - - // divide the off diagonal elements of Vq by 2 - for (size_t i = 0; i < q->size; i++) { - for (size_t j = i; j < q->size; j++) { - if (i == j) { - continue; - } - d = gsl_matrix_get(Vq, i, j); - gsl_matrix_set(Vq, i, j, d / 2); - gsl_matrix_set(Vq, j, i, d / 2); - } - } - - return; -} - -// Read vector file. -void ReadFile_vector(const string &file_vec, gsl_vector *vec) { - debug_msg("entered"); - igzstream infile(file_vec.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open vector file: " << file_vec << endl; - return; - } - - string line; - char *ch_ptr; - - for (size_t i = 0; i < vec->size; i++) { - safeGetline(infile, line).eof(); - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - gsl_vector_set(vec, i, atof(ch_ptr)); - } - - infile.clear(); - infile.close(); - - return; -} - -void ReadFile_matrix(const string &file_mat, gsl_matrix *mat) { - debug_msg("entered"); - igzstream infile(file_mat.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open matrix file: " << file_mat << endl; - return; - } - - string line; - char *ch_ptr; - - for (size_t i = 0; i < mat->size1; i++) { - safeGetline(infile, line).eof(); - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - for (size_t j = 0; j < mat->size2; j++) { - enforce(ch_ptr); - gsl_matrix_set(mat, i, j, atof(ch_ptr)); - ch_ptr = strtok(NULL, " , \t"); - } - } - - infile.clear(); - infile.close(); - - return; -} - -void ReadFile_matrix(const string &file_mat, gsl_matrix *mat1, - gsl_matrix *mat2) { - debug_msg("entered"); - igzstream infile(file_mat.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open matrix file: " << file_mat << endl; - return; - } - - string line; - char *ch_ptr; - - for (size_t i = 0; i < mat1->size1; i++) { - safeGetline(infile, line).eof(); - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - for (size_t j = 0; j < mat1->size2; j++) { - enforce(ch_ptr); - gsl_matrix_set(mat1, i, j, atof(ch_ptr)); - ch_ptr = strtok(NULL, " , \t"); - } - } - - for (size_t i = 0; i < mat2->size1; i++) { - safeGetline(infile, line).eof(); - ch_ptr = strtok_safe((char *)line.c_str(), " , \t"); - for (size_t j = 0; j < mat2->size2; j++) { - enforce(ch_ptr); - gsl_matrix_set(mat2, i, j, atof(ch_ptr)); - ch_ptr = strtok(NULL, " , \t"); - } - } - - infile.clear(); - infile.close(); - - return; -} - -// Read study file. -void ReadFile_study(const string &file_study, gsl_matrix *Vq_mat, - gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) { - debug_msg("entered"); - string Vqfile = file_study + ".Vq.txt"; - string sfile = file_study + ".size.txt"; - string qfile = file_study + ".q.txt"; - - gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); - - ReadFile_matrix(Vqfile, Vq_mat); - ReadFile_vector(sfile, s); - ReadFile_vector(qfile, q_vec); - - double d; - for (size_t i = 0; i < s_vec->size; i++) { - d = gsl_vector_get(s, i); - gsl_vector_set(s_vec, i, d); - } - ni = gsl_vector_get(s, s_vec->size); - - gsl_vector_free(s); - - return; -} - -// Read reference file. -void ReadFile_ref(const string &file_ref, gsl_matrix *S_mat, - gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) { - debug_msg("entered"); - string sfile = file_ref + ".size.txt"; - string Sfile = file_ref + ".S.txt"; - - gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); - - ReadFile_vector(sfile, s); - ReadFile_matrix(Sfile, S_mat, Svar_mat); - - double d; - for (size_t i = 0; i < s_vec->size; i++) { - d = gsl_vector_get(s, i); - gsl_vector_set(s_vec, i, d); - } - ni = gsl_vector_get(s, s_vec->size); - - gsl_vector_free(s); - - return; -} - -// Read mstudy file. -void ReadFile_mstudy(const string &file_mstudy, gsl_matrix *Vq_mat, - gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni) { - debug_msg("entered"); - gsl_matrix_set_zero(Vq_mat); - gsl_vector_set_zero(q_vec); - gsl_vector_set_zero(s_vec); - ni = 0; - - gsl_matrix *Vq_sub = gsl_matrix_safe_alloc(Vq_mat->size1, Vq_mat->size2); - gsl_vector *q_sub = gsl_vector_safe_alloc(q_vec->size); - gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); - - igzstream infile(file_mstudy.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open mstudy file: " << file_mstudy << endl; - return; - } - - string file_name; - double d1, d2, d; - - while (!safeGetline(infile, file_name).eof()) { - string Vqfile = file_name + ".Vq.txt"; - string sfile = file_name + ".size.txt"; - string qfile = file_name + ".q.txt"; - - ReadFile_matrix(Vqfile, Vq_sub); - ReadFile_vector(sfile, s); - ReadFile_vector(qfile, q_sub); - - ni = max(ni, (size_t)gsl_vector_get(s, s_vec->size)); - - for (size_t i = 0; i < s_vec->size; i++) { - d1 = gsl_vector_get(s, i); - if (d1 == 0) { - continue; - } - - d = gsl_vector_get(q_vec, i) + gsl_vector_get(q_sub, i) * d1; - gsl_vector_set(q_vec, i, d); - - d = gsl_vector_get(s_vec, i) + d1; - gsl_vector_set(s_vec, i, d); - - for (size_t j = i; j < s_vec->size; j++) { - d2 = gsl_vector_get(s, j); - if (d2 == 0) { - continue; - } - - d = gsl_matrix_get(Vq_mat, i, j) + - gsl_matrix_get(Vq_sub, i, j) * d1 * d2; - gsl_matrix_set(Vq_mat, i, j, d); - if (i != j) { - gsl_matrix_set(Vq_mat, j, i, d); - } - } - } - } - - for (size_t i = 0; i < s_vec->size; i++) { - d1 = gsl_vector_get(s_vec, i); - if (d1 == 0) { - continue; - } - - d = gsl_vector_get(q_vec, i); - gsl_vector_set(q_vec, i, d / d1); - - for (size_t j = i; j < s_vec->size; j++) { - d2 = gsl_vector_get(s_vec, j); - if (d2 == 0) { - continue; - } - - d = gsl_matrix_get(Vq_mat, i, j) / (d1 * d2); - gsl_matrix_set(Vq_mat, i, j, d); - if (i != j) { - gsl_matrix_set(Vq_mat, j, i, d); - } - } - } - - gsl_matrix_free(Vq_sub); - gsl_vector_free(q_sub); - gsl_vector_free(s); - - return; -} - -// Read reference file. -void ReadFile_mref(const string &file_mref, gsl_matrix *S_mat, - gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni) { - debug_msg("entered"); - gsl_matrix_set_zero(S_mat); - gsl_matrix_set_zero(Svar_mat); - gsl_vector_set_zero(s_vec); - ni = 0; - - gsl_matrix *S_sub = gsl_matrix_safe_alloc(S_mat->size1, S_mat->size2); - gsl_matrix *Svar_sub = gsl_matrix_safe_alloc(Svar_mat->size1, Svar_mat->size2); - gsl_vector *s = gsl_vector_safe_alloc(s_vec->size + 1); - - igzstream infile(file_mref.c_str(), igzstream::in); - if (!infile) { - cout << "error! fail to open mref file: " << file_mref << endl; - return; - } - - string file_name; - double d1, d2, d; - - while (!safeGetline(infile, file_name).eof()) { - string sfile = file_name + ".size.txt"; - string Sfile = file_name + ".S.txt"; - - ReadFile_vector(sfile, s); - ReadFile_matrix(Sfile, S_sub, Svar_sub); - - // Update s_vec and ni. - for (size_t i = 0; i < s_vec->size; i++) { - d = gsl_vector_get(s, i) + gsl_vector_get(s_vec, i); - gsl_vector_set(s_vec, i, d); - } - ni = max(ni, (size_t)gsl_vector_get(s, s_vec->size)); - - // Update S and Svar from each file. - for (size_t i = 0; i < S_mat->size1; i++) { - d1 = gsl_vector_get(s, i); - for (size_t j = 0; j < S_mat->size2; j++) { - d2 = gsl_vector_get(s, j); - - d = gsl_matrix_get(S_sub, i, j) * d1 * d2; - gsl_matrix_set(S_sub, i, j, d); - d = gsl_matrix_get(Svar_sub, i, j) * d1 * d2 * d1 * d2; - gsl_matrix_set(Svar_sub, i, j, d); - } - } - - gsl_matrix_add(S_mat, S_sub); - gsl_matrix_add(Svar_mat, Svar_sub); - } - - // Final: update S and Svar. - for (size_t i = 0; i < S_mat->size1; i++) { - d1 = gsl_vector_get(s_vec, i); - if (d1 == 0) { - continue; - } - for (size_t j = i; j < S_mat->size2; j++) { - d2 = gsl_vector_get(s_vec, j); - if (d2 == 0) { - continue; - } - - d = gsl_matrix_get(S_mat, i, j) / (d1 * d2); - gsl_matrix_set(S_mat, i, j, d); - if (i != j) { - gsl_matrix_set(S_mat, j, i, d); - } - - d = gsl_matrix_get(Svar_mat, i, j) / (d1 * d2 * d1 * d2); - gsl_matrix_set(Svar_mat, i, j, d); - if (i != j) { - gsl_matrix_set(Svar_mat, j, i, d); - } - } - } - - // Free matrices. - gsl_matrix_free(S_sub); - gsl_matrix_free(Svar_sub); - gsl_vector_free(s); - - return; -} diff --git a/src/io.h b/src/io.h deleted file mode 100644 index dd1d5c0..0000000 --- a/src/io.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - Genome-wide Efficient Mixed Model Association (GEMMA) - Copyright © 2011-2017, Xiang Zhou - Copyright © 2017, Peter Carbonetto - Copyright © 2017, Pjotr Prins - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -#ifndef __IO_H__ -#define __IO_H__ - -#include "gsl/gsl_matrix.h" -#include "gsl/gsl_vector.h" -#include -#include -#include - -#include "gzstream.h" -#include "param.h" - -#define tab(col) ( col ? "\t" : "") - -using namespace std; - -void ProgressBar(string str, double p, double total, double ratio = -1.0); - -std::istream &safeGetline(std::istream &is, std::string &t); - -bool ReadFile_snps(const string file_snps, set &setSnps); -bool ReadFile_snps_header(const string &file_snps, set &setSnps); -bool ReadFile_log(const string &file_log, double &pheno_mean); - -bool ReadFile_bim(const string &file_bim, vector &snpInfo); -bool ReadFile_fam(const string &file_fam, vector> &indicator_pheno, - vector> &pheno, map &mapID2num, - const vector &p_column); - -bool ReadFile_cvt(const string &file_cvt, vector &indicator_cvt, - vector> &cvt, size_t &n_cvt); -bool ReadFile_anno(const string &file_bim, map &mapRS2chr, - map &mapRS2bp, - map &mapRS2cM); -bool ReadFile_pheno(const string &file_pheno, - vector> &indicator_pheno, - vector> &pheno, - const vector &p_column); -bool ReadFile_column(const string &file_pheno, vector &indicator_idv, - vector &pheno, const int &p_column); - -bool ReadFile_geno(const string &file_geno, const set &setSnps, - const gsl_matrix *W, vector &indicator_idv, - vector &indicator_snp, const double &maf_level, - const double &miss_level, const double &hwe_level, - const double &r2_level, map &mapRS2chr, - map &mapRS2bp, - map &mapRS2cM, vector &snpInfo, - size_t &ns_test); -bool ReadFile_bed(const string &file_bed, const set &setSnps, - const gsl_matrix *W, vector &indicator_idv, - vector &indicator_snp, vector &snpInfo, - const double &maf_level, const double &miss_level, - const double &hwe_level, const double &r2_level, - size_t &ns_test); -bool Bimbam_ReadOneSNP(const size_t inc, const vector &indicator_idv, - igzstream &infile, gsl_vector *geno, double &geno_mean); -void Plink_ReadOneSNP(const int pos, const vector &indicator_idv, - ifstream &infile, gsl_vector *geno, double &geno_mean); - -void ReadFile_kin(const string &file_kin, vector &indicator_idv, - map &mapID2num, const size_t k_mode, bool &error, - gsl_matrix *G); -void ReadFile_mk(const string &file_mk, vector &indicator_idv, - map &mapID2num, const size_t k_mode, bool &error, - gsl_matrix *G); -void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U); -void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval); - -bool BimbamKin(const string file_geno, const set ksnps, - vector &indicator_snp, const int k_mode, - const int display_pace, gsl_matrix *matrix_kin, - const bool test_nind); -bool PlinkKin(const string &file_bed, vector &indicator_snp, - const int k_mode, const int display_pace, gsl_matrix *matrix_kin); - -bool ReadFile_geno(const string file_geno, vector &indicator_idv, - vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, - const bool calc_K); -bool ReadFile_bed(const string &file_bed, vector &indicator_idv, - vector &indicator_snp, gsl_matrix *UtX, gsl_matrix *K, - const bool calc_K); -bool ReadFile_geno(const string &file_geno, vector &indicator_idv, - vector &indicator_snp, - vector> &Xt, gsl_matrix *K, - const bool calc_K, const size_t ni_test, - const size_t ns_test); -bool ReadFile_bed(const string &file_bed, vector &indicator_idv, - vector &indicator_snp, vector> &Xt, - gsl_matrix *K, const bool calc_K, const size_t ni_test, - const size_t ns_test); - -bool ReadFile_est(const string &file_est, const vector &est_column, - map &mapRS2est); - -bool CountFileLines(const string &file_input, size_t &n_lines); - -bool ReadFile_gene(const string &file_gene, vector &vec_read, - vector &snpInfo, size_t &ng_total); - -bool ReadHeader_io(const string &line, HEADER &header); -bool ReadFile_cat(const string &file_cat, map &mapRS2cat, - size_t &n_vc); -bool ReadFile_mcat(const string &file_mcat, map &mapRS2cat, - size_t &n_vc); - -bool ReadFile_catc(const string &file_cat, - map> &mapRS2catc, size_t &n_cat); -bool ReadFile_mcatc(const string &file_mcat, - map> &mapRS2catc, size_t &n_cat); - -bool BimbamKinUncentered(const string &file_geno, const set ksnps, - const int display_pace, - const vector &indicator_idv, - const vector &indicator_snp, - const map &mapRS2weight, - const map &mapRS2cat, - const vector &snpInfo, const gsl_matrix *W, - gsl_matrix *matrix_kin, gsl_vector *vector_ns); -bool PlinkKin(const string &file_bed, const int display_pace, - const vector &indicator_idv, - const vector &indicator_snp, - const map &mapRS2weight, - const map &mapRS2cat, - const vector &snpInfo, const gsl_matrix *W, - gsl_matrix *matrix_kin, gsl_vector *vector_ns); -bool MFILEKin(const size_t mfile_mode, const string &file_mfile, - const set setKSnps, const int display_pace, - const vector &indicator_idv, - const vector> &mindicator_snp, - const map &mapRS2weight, - const map &mapRS2cat, - const vector> &msnpInfo, const gsl_matrix *W, - gsl_matrix *matrix_kin, gsl_vector *vector_ns); - -bool ReadFile_wsnp(const string &file_wsnp, map &mapRS2double); -bool ReadFile_wsnp(const string &file_wcat, const size_t n_vc, - map> &mapRS2vector); - -void ReadFile_beta(const string &file_beta, - const map &mapRS2cat, - const map &mapRS2wA, vector &vec_cat, - vector &vec_ni, vector &vec_weight, - vector &vec_z2, size_t &ni_total, size_t &ns_total, - size_t &ns_test); -void ReadFile_beta(const string &file_beta, const map &mapRS2wA, - map &mapRS2A1, map &mapRS2z); -void Calcq(const size_t n_block, const vector &vec_cat, - const vector &vec_ni, const vector &vec_weight, - const vector &vec_z2, gsl_matrix *Vq, gsl_vector *q, - gsl_vector *s); - -void ReadFile_study(const string &file_study, gsl_matrix *Vq, gsl_vector *q_vec, - gsl_vector *s_vec, size_t &ni); -void ReadFile_ref(const string &file_ref, gsl_matrix *S_mat, - gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); -void ReadFile_mstudy(const string &file_mstudy, gsl_matrix *Vq, - gsl_vector *q_vec, gsl_vector *s_vec, size_t &ni); -void ReadFile_mref(const string &file_mref, gsl_matrix *S_mat, - gsl_matrix *Svar_mat, gsl_vector *s_vec, size_t &ni); -bool ReadFile_sample(const string &file_sample, - vector> &indicator_pheno, - vector> &pheno, - const vector &p_column, vector &indicator_cvt, - vector> &cvt, size_t &n_cvt); - -#endif -- cgit v1.2.3 From 356f528e52b5cdd13812f64308cbedd87cc0c74f Mon Sep 17 00:00:00 2001 From: DannyArends Date: Wed, 28 Feb 2018 16:34:05 +0100 Subject: renamed import io.h to gemma_io.h --- src/bslmmdap.cpp | 2 +- src/gemma.cpp | 2 +- src/gemma_io.cpp | 2 +- src/lm.h | 2 +- src/lmm.cpp | 2 +- src/lmm.h | 2 +- src/mvlmm.cpp | 2 +- src/mvlmm.h | 2 +- src/param.cpp | 2 +- src/prdt.cpp | 3 +-- src/varcov.cpp | 2 +- src/varcov.h | 2 +- src/vc.cpp | 2 +- src/vc.h | 2 +- 14 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/bslmmdap.cpp b/src/bslmmdap.cpp index e9900e3..6220bb5 100644 --- a/src/bslmmdap.cpp +++ b/src/bslmmdap.cpp @@ -39,7 +39,7 @@ #include "gsl/gsl_vector.h" #include "bslmmdap.h" -#include "io.h" +#include "gemma_io.h" #include "lapack.h" #include "lm.h" #include "lmm.h" diff --git a/src/gemma.cpp b/src/gemma.cpp index edd79d7..5f742bd 100644 --- a/src/gemma.cpp +++ b/src/gemma.cpp @@ -48,7 +48,7 @@ extern "C" { #include "bslmm.h" #include "bslmmdap.h" #include "gemma.h" -#include "io.h" +#include "gemma_io.h" #include "lapack.h" #include "ldr.h" #include "lm.h" diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index d20b473..1876a7b 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -45,7 +45,7 @@ // #include "eigenlib.h" #include "fastblas.h" #include "gzstream.h" -#include "io.h" +#include "gemma_io.h" #include "lapack.h" #include "mathfunc.h" diff --git a/src/lm.h b/src/lm.h index 030e6f9..bcec0ab 100644 --- a/src/lm.h +++ b/src/lm.h @@ -21,7 +21,7 @@ #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" -#include "io.h" +#include "gemma_io.h" #include "param.h" using namespace std; diff --git a/src/lmm.cpp b/src/lmm.cpp index 4198fab..c0a9785 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -43,7 +43,7 @@ // #include "eigenlib.h" #include "gzstream.h" -#include "io.h" +#include "gemma_io.h" #include "fastblas.h" #include "lapack.h" #include "lmm.h" diff --git a/src/lmm.h b/src/lmm.h index e9740d5..f603599 100644 --- a/src/lmm.h +++ b/src/lmm.h @@ -23,7 +23,7 @@ #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" -#include "io.h" +#include "gemma_io.h" #include "param.h" #include #include diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp index eee562d..14f8b4a 100644 --- a/src/mvlmm.cpp +++ b/src/mvlmm.cpp @@ -39,7 +39,7 @@ #include "fastblas.h" #include "gzstream.h" -#include "io.h" +#include "gemma_io.h" #include "lapack.h" #include "lmm.h" #include "mvlmm.h" diff --git a/src/mvlmm.h b/src/mvlmm.h index 4329ad1..b92bd5e 100644 --- a/src/mvlmm.h +++ b/src/mvlmm.h @@ -21,7 +21,7 @@ #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" -#include "io.h" +#include "gemma_io.h" #include "param.h" using namespace std; diff --git a/src/param.cpp b/src/param.cpp index bf6c195..cd4274a 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -35,7 +35,7 @@ #include "gsl/gsl_vector.h" #include "eigenlib.h" -#include "io.h" +#include "gemma_io.h" #include "mathfunc.h" #include "param.h" diff --git a/src/prdt.cpp b/src/prdt.cpp index fc0abe8..42912a8 100644 --- a/src/prdt.cpp +++ b/src/prdt.cpp @@ -32,8 +32,7 @@ #include #include "gzstream.h" -#include "io.h" -#include "io.h" +#include "gemma_io.h" #include "lapack.h" #include "mathfunc.h" #include "prdt.h" diff --git a/src/varcov.cpp b/src/varcov.cpp index e9c7295..055d3b1 100644 --- a/src/varcov.cpp +++ b/src/varcov.cpp @@ -37,7 +37,7 @@ #include "gsl/gsl_vector.h" #include "gzstream.h" -#include "io.h" +#include "gemma_io.h" #include "lapack.h" #include "mathfunc.h" #include "param.h" diff --git a/src/varcov.h b/src/varcov.h index 47b4f9d..fa707d5 100644 --- a/src/varcov.h +++ b/src/varcov.h @@ -21,7 +21,7 @@ #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" -#include "io.h" +#include "gemma_io.h" #include "param.h" using namespace std; diff --git a/src/vc.cpp b/src/vc.cpp index 1a16c07..416a3e4 100644 --- a/src/vc.cpp +++ b/src/vc.cpp @@ -45,7 +45,7 @@ #include "eigenlib.h" #include "gzstream.h" -#include "io.h" +#include "gemma_io.h" #include "lapack.h" #include "lmm.h" #include "mathfunc.h" diff --git a/src/vc.h b/src/vc.h index 49397bb..3586433 100644 --- a/src/vc.h +++ b/src/vc.h @@ -21,7 +21,7 @@ #include "gsl/gsl_matrix.h" #include "gsl/gsl_vector.h" -#include "io.h" +#include "gemma_io.h" #include "param.h" using namespace std; -- cgit v1.2.3 From 3b6aafaa2f320f5bad2b76bf44b08257b4972ecd Mon Sep 17 00:00:00 2001 From: DannyArends Date: Wed, 28 Feb 2018 22:09:24 +0100 Subject: MinGW mkdir function doesn't allow for S_IRWXU, params --- src/main.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 706ac35..deadc63 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -58,7 +58,11 @@ int main(int argc, char *argv[]) { ifstream check_dir((cPar.path_out).c_str()); if (!check_dir) { - mkdir((cPar.path_out).c_str(), S_IRWXU | S_IRGRP | S_IROTH); + #ifdef WINDOWS + mkdir((cPar.path_out).c_str()); + #else + mkdir((cPar.path_out).c_str(), S_IRWXU | S_IRGRP | S_IROTH); + #endif } if (!is_quiet_mode()) -- cgit v1.2.3 From 9b6cc6e0955d9b6acc88ee8801729ae6fcc60d8b Mon Sep 17 00:00:00 2001 From: DannyArends Date: Wed, 28 Feb 2018 23:37:52 +0100 Subject: Makefile changes, to compile under windows --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index a1b9b76..235d1b6 100644 --- a/Makefile +++ b/Makefile @@ -68,9 +68,9 @@ TRAVIS_CI = # used by TRAVIS for testing GSL_INCLUDE_PATH = ifeq ($(SYS), WIN) - GSL_INCLUDE_PATH = -Ic:/GnuWin32/include - EIGEN_INCLUDE_PATH = -I../eigen-git-mirror - OPENBLAS_INCLUDE_PATH = -I../OpenBLAS-v0.2.19-Win64-int32/include + GSL_INCLUDE_PATH = -isystemc:/MinGW/include -LC:/MinGW/lib + EIGEN_INCLUDE_PATH = -isystem../eigen-git-mirror + OPENBLAS_INCLUDE_PATH = -isystem../OpenBLAS-v0.2.19-Win64-int32/include -L../OpenBLAS-v0.2.19-Win64-int32/lib else OPENBLAS_INCLUDE_PATH = -isystem//usr/local/opt/openblas/include ifeq ($(SYS), MAC) @@ -118,7 +118,7 @@ else endif ifeq ($(SYS), WIN) - CPPFLAGS += -Duint="unsigned int" -D__CRT__NO_INLINE -D__STRING="__STRINGIFY" + CPPFLAGS += -Duint="unsigned int" -D__CRT__NO_INLINE -D__STRING="__STRINGIFY" -DWINDOWS -DWITH_GSLCBLAS=1 endif ifdef SHOW_COMPILER_WARNINGS -- cgit v1.2.3 From f957ecc3a372298ffdd76af92c148c0a7b6aa16f Mon Sep 17 00:00:00 2001 From: DannyArends Date: Wed, 28 Feb 2018 23:40:34 +0100 Subject: Compilation instruction for GEMMA using the R-toolchain under windows --- doc/compile_GEMMA_win64.txt | 48 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 doc/compile_GEMMA_win64.txt diff --git a/doc/compile_GEMMA_win64.txt b/doc/compile_GEMMA_win64.txt new file mode 100644 index 0000000..84f16ec --- /dev/null +++ b/doc/compile_GEMMA_win64.txt @@ -0,0 +1,48 @@ +// install R 3.4.3 +https://cran.r-project.org/bin/windows/base/ + +// install Rtools 3.4 +https://cran.r-project.org/bin/windows/Rtools/ + +// Download openblas (v0.2.19-Win64-int32) +https://sourceforge.net/projects/openblas/files/v0.2.19/ + +// Make a place to store the files +mkdir Github +cd Github/ + +// Clone the required dependancies +git clone https://github.com/eigenteam/eigen-git-mirror.git +git clone https://github.com/genetics-statistics/GEMMA.git + +// Download and install gsl2.4, unzip/tar into Github +http://gnu.askapache.com/gsl/ + +// Download and install msys from http://downloads.sourceforge.net/mingw/MSYS-1.0.11.exe +cd c:/msys/1.0 +// Run mysys +msys.bat + +// Under msys, Compile GSL-2.4 inside the msys +cd /c/ +cd Github/gsl-2.4 +./configure --prefix=C:/MinGW +make -j 4 +make install + +// Building Gemma on the R tool chain under windows using CMD compile gemma +cd gemma +make -j 2 + +// Get all the DLLs from: +- MinGW DLLs: https://sourceforge.net/projects/openblas/files/v0.2.12/mingw64_dll.zip/download +- DLLs from the compiled gsl-2.4 +- DLLs from openBLAS + +// Required DLLs: ++ libgcc_s_seh-1.dll ++ libgfortran-3.dll ++ libgsl-23.dll ++ libgslcblas-0.dll ++ libopenblas.dll ++ libquadmath-0.dll -- cgit v1.2.3 From 64799368da26440c014ecfcf435f7ca3a44d8d15 Mon Sep 17 00:00:00 2001 From: DannyArends Date: Thu, 1 Mar 2018 15:55:10 +0100 Subject: Trying to fix the -isystem/ failure of travis --- Makefile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 235d1b6..e7fe655 100644 --- a/Makefile +++ b/Makefile @@ -68,15 +68,15 @@ TRAVIS_CI = # used by TRAVIS for testing GSL_INCLUDE_PATH = ifeq ($(SYS), WIN) - GSL_INCLUDE_PATH = -isystemc:/MinGW/include -LC:/MinGW/lib - EIGEN_INCLUDE_PATH = -isystem../eigen-git-mirror - OPENBLAS_INCLUDE_PATH = -isystem../OpenBLAS-v0.2.19-Win64-int32/include -L../OpenBLAS-v0.2.19-Win64-int32/lib + GSL_INCLUDE_PATH = -isystem/c:/MinGW/include -LC:/MinGW/lib + EIGEN_INCLUDE_PATH = ../eigen-git-mirror + OPENBLAS_INCLUDE_PATH = ../OpenBLAS-v0.2.19-Win64-int32/include -L../OpenBLAS-v0.2.19-Win64-int32/lib else - OPENBLAS_INCLUDE_PATH = -isystem//usr/local/opt/openblas/include + OPENBLAS_INCLUDE_PATH = /usr/local/opt/openblas/include ifeq ($(SYS), MAC) - EIGEN_INCLUDE_PATH = -isystem//usr/local/include/eigen3 + EIGEN_INCLUDE_PATH = /usr/local/include/eigen3 else - EIGEN_INCLUDE_PATH = -isystem//usr/include/eigen3 + EIGEN_INCLUDE_PATH = /usr/include/eigen3 endif endif @@ -97,13 +97,13 @@ endif ifeq ($(CPP), clang++) # macOS Homebrew settings (as used on Travis-CI) - GCC_FLAGS=-O3 -std=c++11 -stdlib=libc++ $(OPENBLAS_INCLUDE_PATH) $(EIGEN_INCLUDE_PATH) -Wl,-L/usr/local/opt/openblas/lib + GCC_FLAGS=-O3 -std=c++11 -stdlib=libc++ -isystem/$(OPENBLAS_INCLUDE_PATH) -isystem/$(EIGEN_INCLUDE_PATH) -Wl,-L/usr/local/opt/openblas/lib endif ifdef WITH_OPENBLAS OPENBLAS=1 # WITH_LAPACK = # OPENBLAS usually includes LAPACK - CPPFLAGS += -DOPENBLAS $(OPENBLAS_INCLUDE_PATH) + CPPFLAGS += -DOPENBLAS -isystem/$(OPENBLAS_INCLUDE_PATH) ifdef OPENBLAS_LEGACY # Legacy version (mostly for Travis-CI) CPPFLAGS += -DOPENBLAS_LEGACY @@ -111,10 +111,10 @@ ifdef WITH_OPENBLAS endif ifdef DEBUG - CPPFLAGS += -g $(GCC_FLAGS) $(GSL_INCLUDE_PATH) $(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc + CPPFLAGS += -g $(GCC_FLAGS) $(GSL_INCLUDE_PATH) -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc else # release mode - CPPFLAGS += -DNDEBUG $(GCC_FLAGS) $(GSL_INCLUDE_PATH) $(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc + CPPFLAGS += -DNDEBUG $(GCC_FLAGS) $(GSL_INCLUDE_PATH) -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc endif ifeq ($(SYS), WIN) -- cgit v1.2.3 From dd3ca7014de2d8f1a2cec36cc22ff93b1e929cc4 Mon Sep 17 00:00:00 2001 From: DannyArends Date: Thu, 1 Mar 2018 15:58:04 +0100 Subject: Early morning extra fwd slash failure, fixed in this commit --- Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index e7fe655..bdb58b2 100644 --- a/Makefile +++ b/Makefile @@ -68,7 +68,7 @@ TRAVIS_CI = # used by TRAVIS for testing GSL_INCLUDE_PATH = ifeq ($(SYS), WIN) - GSL_INCLUDE_PATH = -isystem/c:/MinGW/include -LC:/MinGW/lib + GSL_INCLUDE_PATH = -isystemc:/MinGW/include -LC:/MinGW/lib EIGEN_INCLUDE_PATH = ../eigen-git-mirror OPENBLAS_INCLUDE_PATH = ../OpenBLAS-v0.2.19-Win64-int32/include -L../OpenBLAS-v0.2.19-Win64-int32/lib else @@ -97,13 +97,13 @@ endif ifeq ($(CPP), clang++) # macOS Homebrew settings (as used on Travis-CI) - GCC_FLAGS=-O3 -std=c++11 -stdlib=libc++ -isystem/$(OPENBLAS_INCLUDE_PATH) -isystem/$(EIGEN_INCLUDE_PATH) -Wl,-L/usr/local/opt/openblas/lib + GCC_FLAGS=-O3 -std=c++11 -stdlib=libc++ -isystem$(OPENBLAS_INCLUDE_PATH) -isystem$(EIGEN_INCLUDE_PATH) -Wl,-L/usr/local/opt/openblas/lib endif ifdef WITH_OPENBLAS OPENBLAS=1 # WITH_LAPACK = # OPENBLAS usually includes LAPACK - CPPFLAGS += -DOPENBLAS -isystem/$(OPENBLAS_INCLUDE_PATH) + CPPFLAGS += -DOPENBLAS -isystem$(OPENBLAS_INCLUDE_PATH) ifdef OPENBLAS_LEGACY # Legacy version (mostly for Travis-CI) CPPFLAGS += -DOPENBLAS_LEGACY @@ -111,10 +111,10 @@ ifdef WITH_OPENBLAS endif ifdef DEBUG - CPPFLAGS += -g $(GCC_FLAGS) $(GSL_INCLUDE_PATH) -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc + CPPFLAGS += -g $(GCC_FLAGS) $(GSL_INCLUDE_PATH) -isystem$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc else # release mode - CPPFLAGS += -DNDEBUG $(GCC_FLAGS) $(GSL_INCLUDE_PATH) -isystem/$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc + CPPFLAGS += -DNDEBUG $(GCC_FLAGS) $(GSL_INCLUDE_PATH) -isystem$(EIGEN_INCLUDE_PATH) -Icontrib/catch-1.9.7 -Isrc endif ifeq ($(SYS), WIN) -- cgit v1.2.3 From 678552c172e16b7964affe70404d40deb6e749bf Mon Sep 17 00:00:00 2001 From: Peter Carbonetto Date: Tue, 6 Mar 2018 20:50:25 -0600 Subject: Added link to RCC workshop. --- README.md | 9 ++++++--- example/demo.txt | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 355a3a9..8c97aee 100644 --- a/README.md +++ b/README.md @@ -88,11 +88,14 @@ numerical libraries. gemma.linux.gz` to unpack the file. -## Quick start +## Help materials -1. Work through the demo. *Give more details here.* ++ [The GEMMA manual](doc/manual.pdf). -2. Read the manual and run `gemma -h`. *Give more details here.* ++ [Detailed example with HS mouse data](example/demo.txt). + ++ [Tutorial on GEMMA for genome-wide association +analysis](https://github.com/rcc-uchicago/genetic-data-analysis-2). ## Citing GEMMA diff --git a/example/demo.txt b/example/demo.txt index 9452e04..9b22175 100644 --- a/example/demo.txt +++ b/example/demo.txt @@ -98,7 +98,9 @@ chr rs ps n_miss allele1 allele0 af beta_1 beta_2 Vbeta_1_1 Vbeta_1_2 Vbeta_2_2 ## To fit BSLMM in the training set: ## To fit a quantitative trait -../bin/gemma -g mouse_hs1940.geno.txt.gz -p mouse_hs1940.pheno.txt -n 2 -a mouse_hs1940.anno.txt -bslmm -o mouse_hs1940_CD8_bslmm -w 1000 -s 10000 -seed 1 +../bin/gemma -g mouse_hs1940.geno.txt.gz -p mouse_hs1940.pheno.txt -n 2 \ + -a mouse_hs1940.anno.txt -bslmm -o mouse_hs1940_CD8_bslmm \ + -w 1000 -s 10000 -seed 1 # the following three files may be of most importance: # the *.hyp.txt contains a column for pve and pge -- cgit v1.2.3 From 1493da41a6161d6238cce5473eaffb73d7b33e2c Mon Sep 17 00:00:00 2001 From: Dylan Aïssi Date: Tue, 5 Jun 2018 23:03:43 +0200 Subject: Typo: sepcified -> specified --- src/param.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/param.cpp b/src/param.cpp index cd4274a..edee79d 100644 --- a/src/param.cpp +++ b/src/param.cpp @@ -521,7 +521,7 @@ void PARAM::CheckParam(void) { a_mode != 71) { cout << "error! unknown analysis mode: " << a_mode << ". make sure -gk or -eigen or -lmm or -bslmm -predict or " - << "-calccov is sepcified correctly." << endl; + << "-calccov is specified correctly." << endl; error = true; } if (miss_level > 1) { -- cgit v1.2.3 From 159f95233afd36c98335059e35cd6c51e4760d24 Mon Sep 17 00:00:00 2001 From: xiangzhou Date: Fri, 15 Jun 2018 07:38:39 -0400 Subject: explain how to deal with population stratification in the reference panel --- doc/manual.pdf | Bin 269308 -> 319480 bytes doc/manual.tex | 4 ++++ 2 files changed, 4 insertions(+) diff --git a/doc/manual.pdf b/doc/manual.pdf index b760cc1..1b7dc5d 100644 Binary files a/doc/manual.pdf and b/doc/manual.pdf differ diff --git a/doc/manual.tex b/doc/manual.tex index 1e042e7..8e5efe2 100644 --- a/doc/manual.tex +++ b/doc/manual.tex @@ -1373,6 +1373,10 @@ format. In addition, to fit MQS-LDW, you will need to add "-wcat specifies the LD score file, which can be provided in a gzip compressed format. +A feature of MQS based variance component estimation is that one only need to use a subset of samples to estimate certain quantities. Using a subset of samples dramatically improves computation speed while maintaining variance component estimation accuracy. To take this strategy, one can use ``-sample [num]" to use a fixed number of random samples to perform estimation. + +Instead of using the genotype data from the study, one can also use genotype data from a reference panel. For example, one can use the genotype data from the 1000 genomes project as the reference. However, any population stratification in the reference panel should be dealt with first. For example, the individuals with European ancestry in the 1000 genomes project come from five subpopulations: CEU, FIN, GBR, IBS, and TSI. MQS computes SNP correlations across all SNP pairs as it should be under the LMM assumption. Therefore, any population stratification in the reference panel would increase the overall SNP correlation estimate, leading to down-ward bias in the final heritability estimate. To address the population stratification in the reference panel, one can include a few dummy variables in the model fitting step as covariates. These covariates represent, for example, the five subpopulations, and are used to effectively center the genotype mean in each subpopulation separately. To do this, one can create a covariate file containing five columns (no header): the first column is all 1 representing the intercept; the second column is 1 for CEU and 0 for others; the third column is 1 for FIN and 0 for others; ...; while the fifth column is 1 for IBS and 0 for others. Afterwards, one can add "-c [filename]" to include this covariate file in the command line. + \subsubsection{Detailed Information} MQS-LDW uses an iterative procedure to update the variance -- cgit v1.2.3