diff options
| author | Pjotr Prins | 2025-12-01 08:04:07 +0100 |
|---|---|---|
| committer | Pjotr Prins | 2025-12-01 08:04:07 +0100 |
| commit | dc63e81e9df26f607b0f83b86a7118f73a3d2cfa (patch) | |
| tree | 8bdb5b51233c61b26ce329a5559927c31c165a0d /src | |
| parent | ca03f0043f26b6019a613ae5fb0ea2cf45c52108 (diff) | |
| download | pangemma-dc63e81e9df26f607b0f83b86a7118f73a3d2cfa.tar.gz | |
Toyed a bit with SNP filtering. I think my unfiltered approach for computing GRM is probably more accurate and faster without any SNP filtering.
Diffstat (limited to 'src')
| -rw-r--r-- | src/gemma_io.cpp | 50 |
1 files changed, 35 insertions, 15 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index f176dac..ef02429 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -650,15 +650,14 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno, return true; } -double compute_maf(size_t ni_total, const gsl_vector *gs) { - double maf = 0; +#include <tuple> + +std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs) { size_t n_0 = 0; size_t n_1 = 0; size_t n_2 = 0; - for (int i = 0; i < ni_total; ++i) { // read genotypes - double geno; - gsl_vector_get(gs, i); + double geno = gsl_vector_get(gs, i); if (geno >= 0 && geno <= 0.5) { n_0++; } @@ -668,10 +667,17 @@ double compute_maf(size_t ni_total, const gsl_vector *gs) { if (geno >= 1.5 && geno <= 2.0) { n_2++; } + } + return {n_0,n_1,n_2}; +} + +double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const gsl_vector *gs) { + double maf = 0.0; + + for (int i = 0; i < ni_total; ++i) { // read genotypes + double geno = gsl_vector_get(gs, i); maf += geno; } - auto ni_test = ni_total; - auto n_miss = 0; maf /= 2.0 * (double)(ni_test - n_miss); return maf; } @@ -774,7 +780,7 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, for (int i = 0; i < ni_total; ++i) { ni_test += indicator_idv[i]; } - assert(ni_test == ni_total); // we use indicator_idv? + // assert(ni_test == ni_total); // we use indicator_idv? ns_test = 0; file_pos = 0; @@ -826,7 +832,8 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, geno_old = -9; c_idv = 0; gsl_vector_set_zero(genotype_miss); - + double maf = 0.0; + size_t n_0=0,n_1=0,n_2=0; auto infilen = file_geno.c_str(); for (int i = 0; i < ni_total; ++i) { // read genotypes @@ -846,8 +853,10 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, geno = atof(ch_ptr); gsl_vector_set(genotype, c_idv, geno); - if (geno < min_g) min_g = geno; - if (geno > max_g) max_g = geno; + if (geno < min_g) + min_g = geno; + if (geno > max_g) + max_g = geno; // going through genotypes with 0.0 < geno < 2.0 if (flag_poly == 0) { // first init in marker @@ -857,10 +866,22 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, if (flag_poly == 2 && geno != geno_old) { flag_poly = 1; // genotypes differ } + + // compute ratios + if (hwe_level != 0 && maf_level != -1) { + if (geno >= 0 && geno <= 0.5) + n_0++; + if (geno > 0.5 && geno < 1.5) + n_1++; + if (geno >= 1.5 && geno <= 2.0) + n_2++; + } + maf += geno; + c_idv++; } - double maf = compute_maf(ni_total,genotype); + maf /= 2.0 * (double)(ni_test - n_miss); SNPINFO sInfo = {chr, rs, cM, b_pos, @@ -890,14 +911,12 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps, } // -hwe flag - /* FIXME FIXME if (hwe_level != 0 && maf_level != -1) { if (CalcHWE(n_0, n_2, n_1) < hwe_level) { indicator_snp.push_back(0); continue; } } - */ // -r2 flag for (size_t i = 0; i < genotype->size; ++i) { if (gsl_vector_get(genotype_miss, i) == 1) { @@ -1534,7 +1553,8 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) // Xlarge contains inds x markers gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE); enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE"); - gsl_matrix_set_zero(Xlarge); + if (is_check_mode()) + gsl_matrix_set_zero(Xlarge); // [ ] check XLarge is zeroed properly // [ ] handle missing data |
