about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorPjotr Prins2025-11-30 10:12:46 +0100
committerPjotr Prins2025-11-30 10:12:46 +0100
commitca03f0043f26b6019a613ae5fb0ea2cf45c52108 (patch)
treed5738a962efca65c17deacfa3fc665db48554695 /src
parentab434bbd38a61941bf32afcf91b7cc397462a53b (diff)
downloadpangemma-ca03f0043f26b6019a613ae5fb0ea2cf45c52108.tar.gz
Extracting MAF computation
Diffstat (limited to 'src')
-rw-r--r--src/gemma_io.cpp64
-rw-r--r--src/gemma_io.h2
-rw-r--r--src/param.cpp2
3 files changed, 43 insertions, 25 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index baa44b9..f176dac 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -650,6 +650,32 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno,
   return true;
 }
 
+double compute_maf(size_t ni_total, const gsl_vector *gs) {
+  double maf = 0;
+  size_t n_0 = 0;
+  size_t n_1 = 0;
+  size_t n_2 = 0;
+
+  for (int i = 0; i < ni_total; ++i) { // read genotypes
+    double geno;
+    gsl_vector_get(gs, i);
+    if (geno >= 0 && geno <= 0.5) {
+      n_0++;
+    }
+    if (geno > 0.5 && geno < 1.5) {
+      n_1++;
+    }
+    if (geno >= 1.5 && geno <= 2.0) {
+      n_2++;
+    }
+    maf += geno;
+  }
+  auto ni_test = ni_total;
+  auto n_miss = 0;
+  maf /= 2.0 * (double)(ni_test - n_miss);
+  return maf;
+}
+
 // Read bimbam mean genotype file, the first time, to obtain #SNPs for
 // analysis (ns_test) and total #SNP (ns_total).
 /*
@@ -736,9 +762,8 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
   double cM;
   size_t file_pos;
 
-  double maf, geno, geno_old;
+  double geno, geno_old;
   size_t n_miss;
-  size_t n_0, n_1, n_2;
   double min_g = std::numeric_limits<float>::max();
   double max_g = std::numeric_limits<float>::min();
 
@@ -749,6 +774,7 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
   for (int i = 0; i < ni_total; ++i) {
     ni_test += indicator_idv[i];
   }
+  assert(ni_test == ni_total); // we use indicator_idv?
   ns_test = 0;
 
   file_pos = 0;
@@ -795,15 +821,13 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
     }
 
     // Start on a new marker/SNP
-    maf = 0;
     n_miss = 0;
     flag_poly = 0;
     geno_old = -9;
-    n_0 = 0;
-    n_1 = 0;
-    n_2 = 0;
     c_idv = 0;
     gsl_vector_set_zero(genotype_miss);
+
+
     auto infilen = file_geno.c_str();
     for (int i = 0; i < ni_total; ++i) { // read genotypes
       ch_ptr = strtok_safe2(NULL, " ,\t",(string("To many individuals/strains/genometypes in pheno file for ")+infilen).c_str());
@@ -820,15 +844,6 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
       }
 
       geno = atof(ch_ptr);
-      if (geno >= 0 && geno <= 0.5) {
-        n_0++;
-      }
-      if (geno > 0.5 && geno < 1.5) {
-        n_1++;
-      }
-      if (geno >= 1.5 && geno <= 2.0) {
-        n_2++;
-      }
 
       gsl_vector_set(genotype, c_idv, geno);
       if (geno < min_g) min_g = geno;
@@ -842,12 +857,10 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
       if (flag_poly == 2 && geno != geno_old) {
         flag_poly = 1;      // genotypes differ
       }
-
-      maf += geno;
-
       c_idv++;
     }
-    maf /= 2.0 * (double)(ni_test - n_miss);
+
+    double maf = compute_maf(ni_total,genotype);
 
     SNPINFO sInfo = {chr,    rs,
                      cM,     b_pos,
@@ -877,14 +890,14 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
     }
 
     // -hwe flag
+    /* FIXME FIXME
     if (hwe_level != 0 && maf_level != -1) {
       if (CalcHWE(n_0, n_2, n_1) < hwe_level) {
         indicator_snp.push_back(0);
         continue;
       }
     }
-
-
+    */
     // -r2 flag
     for (size_t i = 0; i < genotype->size; ++i) {
       if (gsl_vector_get(genotype_miss, i) == 1) {
@@ -892,7 +905,6 @@ bool ReadFile_bimbam_geno(const string &file_geno, const set<string> &setSnps,
         gsl_vector_set(genotype, i, geno);
       }
     }
-
     gsl_blas_dgemv(CblasTrans, 1.0, W, genotype, 0.0, Wtx);
     gsl_blas_dgemv(CblasNoTrans, 1.0, WtWi, Wtx, 0.0, WtWiWtx);
     gsl_blas_ddot(genotype, genotype, &v_x);
@@ -1477,7 +1489,7 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
 
 
 // Read bimbam mean genotype file and calculate kinship matrix.
-gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode, const vector<int> &indicator_snp) {
+gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode) {
   checkpoint("mdb-kin",file_geno);
 
   /* Create and open the LMDB environment: */
@@ -1515,12 +1527,14 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode,
   num_markers = stat.ms_entries;
 
   gsl_matrix *matrix_kin = gsl_matrix_safe_alloc(ni_total,ni_total);
+  gsl_matrix_set_zero(matrix_kin);
   gsl_vector *geno = gsl_vector_safe_alloc(ni_total);
   gsl_vector *geno_miss = gsl_vector_safe_alloc(ni_total);
 
   // Xlarge contains inds x markers
   gsl_matrix *Xlarge = gsl_matrix_safe_alloc(ni_total, K_BATCH_SIZE);
   enforce_msg(Xlarge, "allocate Xlarge ni_total x K_BATCH_SIZE");
+  gsl_matrix_set_zero(Xlarge);
 
   // [ ] check XLarge is zeroed properly
   // [ ] handle missing data
@@ -1540,6 +1554,10 @@ gsl_matrix *mdb_calc_kin(const string file_geno, bool is_loco, const int k_mode,
     string_view key, value;
     cursor.get(key, value, mdb_fetch);
     mdb_fetch = MDB_NEXT;
+    /* FIXME!
+    if (indicator_snp[t] == 0)
+      continue;
+    */
     size_t num_floats = value.size() / sizeof(float);
     assert(num_floats == ni_total);
     const float* gs = reinterpret_cast<const float*>(value.data());
diff --git a/src/gemma_io.h b/src/gemma_io.h
index de74ada..4e9f4c1 100644
--- a/src/gemma_io.h
+++ b/src/gemma_io.h
@@ -87,7 +87,7 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv,
 void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U);
 void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval);
 
-gsl_matrix *mdb_calc_kin(const string file_geno, const bool is_loco, const int mode, const vector<int> &indicator_snp);
+gsl_matrix *mdb_calc_kin(const string file_geno, const bool is_loco, const int mode);
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
                const int display_pace, gsl_matrix *matrix_kin,
diff --git a/src/param.cpp b/src/param.cpp
index 1adee9e..f799ef7 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -1294,7 +1294,7 @@ void PARAM::ReadBIMBAMGenotypes(gsl_matrix *UtX, gsl_matrix *K, const bool calc_
 }
 
 gsl_matrix *PARAM::MdbCalcKin() {
-  return mdb_calc_kin(file_geno, is_loco(), a_mode - 20, indicator_snp);
+  return mdb_calc_kin(file_geno, is_loco(), a_mode - 20);
 }
 
 void PARAM::CalcKin(gsl_matrix *matrix_kin) {