Toyed a bit with SNP filtering. I think my unfiltered approach for computing GRM is probably more accurate and faster without any SNP filtering.

author: Pjotr Prins 2025-12-01 08:04:07 +0100
committer: Pjotr Prins 2025-12-01 08:04:07 +0100
commit: dc63e81e9df26f607b0f83b86a7118f73a3d2cfa (patch)
tree: 8bdb5b51233c61b26ce329a5559927c31c165a0d /src
parent: ca03f0043f26b6019a613ae5fb0ea2cf45c52108 (diff)
download: pangemma-dc63e81e9df26f607b0f83b86a7118f73a3d2cfa.tar.gz
1 files changed, 35 insertions, 15 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index f176dac..ef02429 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -650,15 +650,14 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno,
   return true;
 }
 
-double compute_maf(size_t ni_total, const gsl_vector *gs) {
-  double maf = 0;
+#include <tuple>
+
+std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs) {
   size_t n_0 = 0;
   size_t n_1 = 0;
   size_t n_2 = 0;
-
   for (int i = 0; i < ni_total; ++i) { // read genotypes
-    double geno;
-    gsl_vector_get(gs, i);
+    double geno = gsl_vector_get(gs, i);
     if (geno >= 0 && geno <= 0.5) {
       n_0++;
     }
@@ -668,10 +667,17 @@ double compute_maf(size_t ni_total, const gsl_vector *gs) {
     if (geno >= 1.5 && geno <= 2.0) {
       n_2++;
     }
+  }
+  return {n_0,n_1,n_2};
+}
+
+double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const gsl_vector *gs) {
+  double maf = 0.0;
+
+  for (int i = 0; i < ni_total; ++i) { // read genotypes
+    double geno = gsl_vector_get(gs, i);
     maf += geno;
   }
-  auto ni_test = ni_total;
-  auto n_miss = 0;
   maf /= 2.0 * (double)(ni_test - n_miss);
   return maf;
 }
@@ -774,7 +780,7 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
   for (int i = 0; i < ni_total; ++i) {
     ni_test += indicator_idv[i];
   }
-  assert(ni_test == ni_total); // we use indicator_idv?
+  // assert(ni_test == ni_total); // we use indicator_idv?
   ns_test = 0;
 
   file_pos = 0;
@@ -826,7 +832,8 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
     geno_old = -9;
     c_idv = 0;
     gsl_vector_set_zero(genotype_miss);
-
+    double maf = 0.0;
+    size_t n_0=0,n_1=0,n_2=0;
 
     auto infilen = file_geno.c_str();
     for (int i = 0; i < ni_total; ++i) { // read genotypes
@@ -846,8 +853,10 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
       geno = atof(ch_ptr);
 
       gsl_vector_set(genotype, c_idv, geno);
-      if (geno < min_g) min_g = geno;
-      if (geno > max_g) max_g = geno;
+      if (geno < min_g)
+        min_g = geno;
+      if (geno > max_g)
+        max_g = geno;
 
       // going through genotypes with 0.0 < geno < 2.0
       if (flag_poly == 0) { // first init in marker
@@ -857,10 +866,22 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
       if (flag_poly == 2 && geno != geno_old) {
         flag_poly = 1;      // genotypes differ
       }
+
+      // compute ratios
+      if (hwe_level != 0 && maf_level != -1) {
+        if (geno >= 0 && geno <= 0.5)
+          n_0++;
+        if (geno > 0.5 && geno < 1.5)
+          n_1++;
+        if (geno >= 1.5 && geno <= 2.0)
+          n_2++;
+      }
+      maf += geno;
+
       c_idv++;
     }
 
-    double maf = compute_maf(ni_total,genotype);
+    maf /= 2.0 * (double)(ni_test - n_miss);
 
     SNPINFO sInfo = {chr,    rs,
                      cM,     b_pos,
@@ -890,14 +911,12 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
     }
 
     // -hwe flag
-    /* FIXME FIXME
     if (hwe_level != 0 && maf_level != -1) {
       if (CalcHWE(n_0, n_2, n_1) < hwe_level) {
         indicator_snp.push_back(0);
         continue;
       }
     }
-    */
     // -r2 flag
     for (size_t i = 0; i < genotype->size; ++i) {
       if (gsl_vector_get(genotype_miss, i) == 1) {
@@ -1534,7 +1553,8 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode)
   // Xlarge contains inds x markers
   gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE);
   enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE");
-  gsl_matrix_set_zero(Xlarge);
+  if (is_check_mode())
+    gsl_matrix_set_zero(Xlarge);
 
   // [ ] check XLarge is zeroed properly
   // [ ] handle missing data
author	Pjotr Prins	2025-12-01 08:04:07 +0100
committer	Pjotr Prins	2025-12-01 08:04:07 +0100
commit	dc63e81e9df26f607b0f83b86a7118f73a3d2cfa (patch)
tree	8bdb5b51233c61b26ce329a5559927c31c165a0d /src
parent	ca03f0043f26b6019a613ae5fb0ea2cf45c52108 (diff)
download	pangemma-dc63e81e9df26f607b0f83b86a7118f73a3d2cfa.tar.gz