diff options
Diffstat (limited to 'src/lmm.cpp')
| -rw-r--r-- | src/lmm.cpp | 484 |
1 files changed, 459 insertions, 25 deletions
diff --git a/src/lmm.cpp b/src/lmm.cpp index 87b9e1e..2b730f3 100644 --- a/src/lmm.cpp +++ b/src/lmm.cpp @@ -2,7 +2,7 @@ Genome-wide Efficient Mixed Model Association (GEMMA) Copyright © 2011-2017, Xiang Zhou Copyright © 2017, Peter Carbonetto - Copyright © 2017-2022 Pjotr Prins + Copyright © 2017-2025 Pjotr Prins This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -40,6 +40,9 @@ #include "gsl/gsl_min.h" #include "gsl/gsl_roots.h" #include "gsl/gsl_vector.h" +#include <lmdb++.h> +// #include <lmdb.h> +#include <sys/mman.h> #include "gzstream.h" #include "gemma.h" @@ -55,6 +58,7 @@ using namespace std; void LMM::CopyFromParam(PARAM &cPar) { + checkpoint_nofile("lmm-copy-from-param"); a_mode = cPar.a_mode; d_pace = cPar.d_pace; @@ -91,10 +95,12 @@ void LMM::CopyFromParam(PARAM &cPar) { } void LMM::CopyToParam(PARAM &cPar) { + checkpoint_nofile("lmm-copy-to-param"); cPar.time_UtX = time_UtX; cPar.time_opt = time_opt; - - cPar.ng_test = ng_test; + cPar.ns_total = ns_total; + cPar.ns_test = ns_test; + cPar.ng_test = ng_test; // number of markers tested return; } @@ -105,10 +111,11 @@ void LMM::WriteFiles() { file_str = path_out + "/" + file_out; file_str += ".assoc.txt"; + checkpoint("lmm-write-files",file_str); ofstream outfile(file_str.c_str(), ofstream::out); if (!outfile) { - cout << "error writing file: " << file_str.c_str() << endl; + cout << "error writing file: " << file_str << endl; return; } @@ -148,7 +155,7 @@ void LMM::WriteFiles() { outfile << scientific << setprecision(6); if (a_mode != M_LMM2) { - outfile << st.beta << "\t"; + outfile << scientific << st.beta << "\t"; outfile << st.se << "\t"; } @@ -202,21 +209,29 @@ void LMM::WriteFiles() { common_header(); - size_t t = 0; - for (size_t i = 0; i < snpInfo.size(); ++i) { - if (indicator_snp[i] == 0) - continue; - auto snp = snpInfo[i].rs_number; - if (process_gwasnps && setGWASnps.count(snp) == 0) - continue; - // cout << t << endl; - outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" - << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" - << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t" - << fixed << setprecision(3) << snpInfo[i].maf << "\t"; - - sumstats(sumStat[t]); - t++; + if (snpInfo.size()) { + size_t t = 0; + for (size_t i = 0; i < snpInfo.size(); ++i) { + if (indicator_snp[i] == 0) + continue; + auto snp = snpInfo[i].rs_number; + if (process_gwasnps && setGWASnps.count(snp) == 0) + continue; + // cout << t << endl; + outfile << snpInfo[i].chr << "\t" << snpInfo[i].rs_number << "\t" + << snpInfo[i].base_position << "\t" << snpInfo[i].n_miss << "\t" + << snpInfo[i].a_minor << "\t" << snpInfo[i].a_major << "\t" + << fixed << setprecision(3) << snpInfo[i].maf << "\t"; + + sumstats(sumStat[t]); + t++; + } + } + else + { + for (auto &s : sumStat) { + sumstats(s); + } } } @@ -1662,12 +1677,9 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, const set<string> gwasnps) { clock_t time_start = clock(); - write(W, "W"); - write(y, "y"); + checkpoint_nofile("start-lmm-analyze"); // Subset/LOCO support bool process_gwasnps = gwasnps.size(); - if (process_gwasnps) - debug_msg("Analyze subset of SNPs (LOCO)"); // Calculate basic quantities. size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; @@ -1704,6 +1716,7 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, auto batch_compute = [&](size_t l) { // using a C++ closure // Compute SNPs in batch, note the computations are independent per SNP // debug_msg("enter batch_compute"); + checkpoint_nofile("\nstart-batch_compute"); gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); gsl_matrix_view UtXlarge_sub = gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); @@ -1763,6 +1776,7 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, sumStat.push_back(SNPs); } // debug_msg("exit batch_compute"); + checkpoint_nofile("end-batch_compute"); }; const auto num_snps = indicator_snp.size(); @@ -1844,9 +1858,10 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } batch_compute(c % msize); - ProgressBar("Reading SNPs", num_snps - 1, num_snps - 1); + ProgressBar("Reading SNPS", num_snps - 1, num_snps - 1); // cout << "Counted SNPs " << c << " sumStat " << sumStat.size() << endl; cout << endl; + checkpoint_nofile("end-lmm-analyze"); gsl_vector_safe_free(x); gsl_vector_safe_free(x_miss); @@ -1860,6 +1875,425 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp, } /* + This is the mirror function of LMM::Analyze and AnalyzeBimbam, but uses mdb input instead. + */ + + + +void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp, + const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, + size_t num_markers) { + vector<SUMSTAT2> sumstat2; + clock_t time_start = clock(); + + checkpoint_nofile("start-lmm-mdb-analyze"); + + // Calculate basic quantities. + size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2; + + const size_t inds = U->size1; + enforce(inds == ni_test); + assert(inds > 0); + + gsl_vector *x = gsl_vector_safe_alloc(inds); // #inds + gsl_vector *x_miss = gsl_vector_safe_alloc(inds); + assert(ni_test == U->size2); + assert(ni_test > 0); + assert(ni_total > 0); + assert(n_index > 0); + gsl_vector *Utx = gsl_vector_safe_alloc(ni_test); + gsl_matrix *Uab = gsl_matrix_safe_alloc(ni_test, n_index); + gsl_vector *ab = gsl_vector_safe_alloc(n_index); + + const size_t msize = LMM_BATCH_SIZE; + gsl_matrix *Xlarge = gsl_matrix_safe_alloc(inds, msize); + gsl_matrix *UtXlarge = gsl_matrix_safe_alloc(inds, msize); + enforce_msg(Xlarge && UtXlarge, "Xlarge memory check"); // just to be sure + enforce(Xlarge->size1 == inds); + gsl_matrix_set_zero(Xlarge); + gsl_matrix_set_zero(Uab); + CalcUab(UtW, Uty, Uab); + + // start reading genotypes and analyze + size_t c = 0; + + /* + batch_compute(l) takes l x SNPs that have been loaded into Xlarge, + transforms them all at once using the eigenvector matrix U, then + loops through each transformed SNP to compute association + statistics (beta, standard errors, p-values) and stores results in + sumStat. + */ + auto batch_compute = [&](size_t l, const Markers &markers) { // using a C++ closure + // Compute SNPs in batch, note the computations are independent per SNP + debug_msg("enter batch_compute"); + assert(l>0); + assert(inds>0); + gsl_matrix_view Xlarge_sub = gsl_matrix_submatrix(Xlarge, 0, 0, inds, l); + gsl_matrix_view UtXlarge_sub = + gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l); + + time_start = clock(); + // Transforms all l SNPs in the batch at once: UtXlarge = U^T × Xlarge + // This is much faster than doing l separate matrix-vector products + // U is the eigenvector matrix from the spectral decomposition of the kinship matrix + fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0, + &UtXlarge_sub.matrix); + time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + gsl_matrix_set_zero(Xlarge); + for (size_t i = 0; i < l; i++) { + // for each snp batch item extract transformed genotype: + gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i); + gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector); + + // Calculate design matrix components and compute sufficient statistics for the regression model + CalcUab(UtW, Uty, Utx, Uab); + + time_start = clock(); + FUNC_PARAM param1 = {false, ni_test, n_cvt, eval, Uab, ab, 0}; + + double lambda_mle = 0.0, lambda_remle = 0.0, beta = 0.0, se = 0.0, p_wald = 0.0; + double p_lrt = 0.0, p_score = 0.0; + double logl_H1 = 0.0; + + // Run statistical tests based on analysis mode + // 3 is before 1. + if (a_mode == M_LMM3 || a_mode == M_LMM4 || a_mode == M_LMM9 ) { + CalcRLScore(l_mle_null, param1, beta, se, p_score); + } + + // Computes Wald statistic for testing association + if (a_mode == M_LMM1 || a_mode == M_LMM4) { + // for univariate a_mode is 1 + // Estimates variance component (lambda) via REML + CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1); + CalcRLWald(lambda_remle, param1, beta, se, p_wald); + } + + // Estimates variance component (lambda) via REML + // Likelihood Ratio Test (modes 2, 4, 9): + // Estimates variance component via MLE + // Compares log-likelihood under alternative vs null hypothesis + if (a_mode == M_LMM2 || a_mode == M_LMM4 || a_mode == M_LMM9) { + CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1); + p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1); + } + + time_opt += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0); + + auto markerinfo = markers[i]; + // Store summary data. + SUMSTAT2 st = {markerinfo, beta, se, lambda_remle, lambda_mle, p_wald, p_lrt, p_score, logl_H1}; + sumstat2.push_back(st); + } + }; + + /* + const auto num_markers = indicator_snp.size(); + enforce_msg(num_markers > 0,"Zero SNPs to process - data corrupt?"); + if (num_markers < 50) { + cerr << num_markers << " SNPs" << endl; + warning_msg("very few SNPs processed"); + } + */ + const size_t progress_step = (num_markers/50>d_pace ? num_markers/50 : d_pace); + Markers markers; + + assert(num_markers > 0); + for (size_t t = 0; t < num_markers; ++t) { + // for (size_t t = 0; t < 2; ++t) { + if (t % progress_step == 0 || t == (num_markers - 1)) { + ProgressBar("Reading markers", t, num_markers - 1); + } + // if (indicator_snp[t] == 0) + // continue; + + auto tup = fetch_snp(t); // use the callback + auto state = get<0>(tup); + if (state == SKIP) + continue; + if (state == LAST) + break; // marker loop because of LOCO + + auto markerinfo = get<1>(tup); + auto gs = get<2>(tup); + + markers.push_back(markerinfo); + + // drop missing idv and plug mean values for missing geno + double x_total = 0.0; // sum genotype values to compute x_mean + uint vpos = 0; // position in target vector + uint n_miss = 0; // count NA genotypes + gsl_vector_set_zero(x_miss); + for (size_t i = 0; i < ni_total; ++i) { + // get the genotypes per individual and compute stats per SNP + if (indicator_idv[i] == 0) // skip individual + continue; + + double geno = gs[i]; + if (isnan(geno)) { + gsl_vector_set(x_miss, vpos, 1.0); + n_miss++; + } else { + gsl_vector_set(x, vpos, geno); + x_total += geno; + } + vpos++; + } + enforce(vpos == ni_test); + + const double x_mean = x_total/(double)(ni_test - n_miss); + + // plug x_mean back into missing values + for (size_t i = 0; i < ni_test; ++i) { + if (gsl_vector_get(x_miss, i) == 1.0) { + gsl_vector_set(x, i, x_mean); + } + } + + enforce(x->size == ni_test); + + // copy genotype values for SNP into Xlarge cache + gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, c % msize); + gsl_vector_safe_memcpy(&Xlarge_col.vector, x); + c++; // count markers going in + + if (c % msize == 0) { + batch_compute(msize,markers); + markers.clear(); + markers.reserve(msize); + } + } + + if (c % msize) + batch_compute(c % msize,markers); + ProgressBar("Reading markers", num_markers - 1, num_markers - 1); + cout << endl; + cout << "Counted markers " << c << " sumStat " << sumstat2.size() << endl; + checkpoint_nofile("end-lmm-mdb-analyze"); + + string file_str; + debug_msg("LMM::WriteFiles"); + file_str = path_out + "/" + file_out; + file_str += ".assoc.txt"; + checkpoint("lmm-write-files",file_str); + + ofstream outfile(file_str.c_str(), ofstream::out); + if (!outfile) { + cout << "error writing file: " << file_str << endl; + return; + } + + auto sumstats = [&] (SUMSTAT2 st) { + outfile << scientific << setprecision(6); + auto m = st.markerinfo; + auto name = m.name; + auto chr = m.chr; + string chr_s; + if (chr == CHR_X) + chr_s = "X"; + else if (chr == CHR_Y) + chr_s = "Y"; + else if (chr == CHR_M) + chr_s = "M"; + else + chr_s = to_string(chr); + + outfile << chr_s << "\t"; + outfile << name << "\t"; + outfile << m.pos << "\t"; + outfile << m.n_miss << "\t-\t-\t"; // n_miss column + allele1 + allele0 + outfile << fixed << setprecision(3) << m.maf << "\t"; + outfile << scientific << setprecision(6); + if (a_mode != M_LMM2) { + outfile << st.beta << "\t"; + outfile << st.se << "\t"; + } + + if (a_mode != M_LMM3 && a_mode != M_LMM9) + outfile << st.logl_H1 << "\t"; + + switch(a_mode) { + case M_LMM1: + outfile << st.lambda_remle << "\t" + << st.p_wald << endl; + break; + case M_LMM2: + case M_LMM9: + outfile << st.lambda_mle << "\t" + << st.p_lrt << endl; + break; + case M_LMM3: + outfile << st.p_score << endl; + break; + case M_LMM4: + outfile << st.lambda_remle << "\t" + << st.lambda_mle << "\t" + << st.p_wald << "\t" + << st.p_lrt << "\t" + << st.p_score << endl; + break; + } + }; + + for (auto &s : sumstat2) { + sumstats(s); + } + + gsl_vector_safe_free(x); + gsl_vector_safe_free(x_miss); + gsl_vector_safe_free(Utx); + gsl_matrix_safe_free(Uab); + gsl_vector_free(ab); // unused + + gsl_matrix_safe_free(Xlarge); + gsl_matrix_safe_free(UtXlarge); + +} + + +void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval, + const gsl_matrix *UtW, const gsl_vector *Uty, + const gsl_matrix *W, const gsl_vector *y, const string loco) { + checkpoint("mdb-calc-gwa",file_geno); + bool is_loco = !loco.empty(); + + // Convert loco string to what we use in the chrpos index + uint8_t loco_chr; + if (is_loco) { + if (loco == "X") { + loco_chr = CHR_X; + } else if (loco == "Y") { + loco_chr = CHR_Y; + } else if (loco == "M") { + loco_chr = CHR_M; + } else { + try { + loco_chr = static_cast<uint8_t>(stoi(loco)); + } catch (...) { + loco_chr = 0; + } + } + } + + auto env = lmdb::env::create(); + env.set_mapsize(1UL * 1024UL * 1024UL * 1024UL * 1024UL); + env.set_max_dbs(10); + env.open(file_geno.c_str(), MDB_RDONLY | MDB_NOSUBDIR, 0664); + // Get mmap info using lmdb++ wrapper + MDB_envinfo info; + mdb_env_info(env.handle(), &info); + // Linux kernel aggressive readahead hints + madvise(info.me_mapaddr, info.me_mapsize, MADV_SEQUENTIAL); + madvise(info.me_mapaddr, info.me_mapsize, MADV_WILLNEED); + + std::cout << "## LMDB opened with optimized readahead; map size = " << (info.me_mapsize / 1024 / 1024) << " MB" << std::endl; + + auto rtxn = lmdb::txn::begin(env, nullptr, MDB_RDONLY); + auto geno_mdb = lmdb::dbi::open(rtxn, "geno"); + + auto marker_mdb = lmdb::dbi::open(rtxn, "marker"); + auto info_mdb = lmdb::dbi::open(rtxn, "info"); + string_view info_key,info_value; + info_mdb.get(rtxn,"format",info_value); + auto format = string(info_value); + + MDB_stat stat; + mdb_stat(rtxn, geno_mdb, &stat); + auto num_markers = stat.ms_entries; + + auto mdb_fetch = MDB_FIRST; + + auto cursor = lmdb::cursor::open(rtxn, geno_mdb); + cout << "## number of total individuals = " << ni_total << endl; + cout << "## number of analyzed individuals = " << ni_test << endl; + cout << "## number of analyzed SNPs/var = " << num_markers << endl; + + std::function<SnpNameValues2(size_t)> fetch_snp = [&](size_t num) { + + string_view key,value; + + auto mdb_success = cursor.get(key, value, mdb_fetch); + mdb_fetch = MDB_NEXT; + + // uint8_t chr; + vector<double> gs; + MarkerInfo markerinfo; + + if (mdb_success) { + size_t size = 0; + // ---- Depending on the format we get different buffers - currently float and byte are supported: + if (format == "Gb") { + size_t num_bytes = value.size() / sizeof(uint8_t); + assert(num_bytes == ni_total); + size = num_bytes; + const uint8_t* gs_bbuf = reinterpret_cast<const uint8_t*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + double g = static_cast<double>(gs_bbuf[i])/127.0; + gs.push_back(g); + } + } else { + size_t num_floats = value.size() / sizeof(float); + assert(num_floats == ni_total); + size = num_floats; + const float* gs_fbuf = reinterpret_cast<const float*>(value.data()); + gs.reserve(size); + for (size_t i = 0; i < size; ++i) { + gs.push_back(static_cast<double>(gs_fbuf[i])); + } + } + // Start unpacking key chr+pos + if (key.size() != 10) { + cerr << "key.size=" << key.size() << endl; + throw std::runtime_error("Invalid key size"); + } + // "S>L>L>" + const uint8_t* data = reinterpret_cast<const uint8_t*>(key.data()); + auto chr = static_cast<uint8_t>(data[1]); + // Extract big-endian uint32 + // uint32_t rest = static_cast<uint32_t>(data[2]); + uint32_t pos = (data[2] << 24) | (data[3] << 16) | + (data[4] << 8) | data[5]; + + uint32_t num = (data[6] << 24) | (data[7] << 16) | + (data[8] << 8) | data[9]; + + // printf("%#02x %#02x\n", chr, loco_chr); + + if (is_loco && loco_chr != chr) { + if (chr > loco_chr) + return make_tuple(LAST, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + else + return make_tuple(SKIP, MarkerInfo { .name="", .chr=chr, .pos=pos } , gs); + } + + string_view value2; + marker_mdb.get(rtxn,key,value2); + auto marker = string(value2); + // 1 rs13476251 174792257 + // cout << static_cast<int>(chr) << ":" << pos2 << " line " << rest2 << ":" << marker << endl ; + + // compute maf and n_miss (NAs) + size_t n_miss = 0; // count NAs: FIXME + double maf = compute_maf(ni_total, ni_test, n_miss, gs.data(), indicator_idv); + + markerinfo = MarkerInfo { .name=marker,.chr=chr,.pos=pos,.line_no=num,.n_miss=n_miss,.maf=maf }; + + // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl; + } + return make_tuple(COMPUTE, markerinfo, gs); + }; + LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,num_markers); + + ns_total = ns_test = num_markers; // track global number of snps in original gemma (goes to cPar) +} + + +/* Looking at the `LMM::AnalyzeBimbam` function, here are the parameters that get modified: |
