about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorPjotr Prins2025-12-05 10:46:23 +0100
committerPjotr Prins2025-12-05 10:46:23 +0100
commit6255a45e28b32ee8116418d849db68395ec4e096 (patch)
tree353c9c32373ab00b539b2d261ff1b1f92f54b284 /src
parentd95db7c5f117bc945b224bd8815feb8c0076e7ba (diff)
downloadpangemma-6255a45e28b32ee8116418d849db68395ec4e096.tar.gz
Compute maf again
Diffstat (limited to 'src')
-rw-r--r--src/gemma.cpp1
-rw-r--r--src/gemma_io.cpp31
-rw-r--r--src/lmm.cpp24
-rw-r--r--src/lmm.h4
-rw-r--r--src/mathfunc.cpp31
-rw-r--r--src/mathfunc.h5
6 files changed, 57 insertions, 39 deletions
diff --git a/src/gemma.cpp b/src/gemma.cpp
index f7ebdce..3100b94 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -2845,6 +2845,7 @@ void GEMMA::BatchRun(PARAM &cPar) {
           if (cPar.is_mdb) {
             cLmm.mdb_calc_gwa(U, eval, UtW, &UtY_col.vector, W,
                               &Y_col.vector, cPar.setGWASnps);
+            cLmm.CopyToParam(cPar);
           }
           else {
             if (!cPar.file_bfile.empty()) {
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index 08f25ca..e183e3f 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -651,37 +651,6 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno,
   return true;
 }
 
-#include <tuple>
-
-std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs) {
-  size_t n_0 = 0;
-  size_t n_1 = 0;
-  size_t n_2 = 0;
-  for (size_t i = 0; i < ni_total; ++i) { // read genotypes
-    double geno = gsl_vector_get(gs, i);
-    if (geno >= 0 && geno <= 0.5) {
-      n_0++;
-    }
-    if (geno > 0.5 && geno < 1.5) {
-      n_1++;
-    }
-    if (geno >= 1.5 && geno <= 2.0) {
-      n_2++;
-    }
-  }
-  return {n_0,n_1,n_2};
-}
-
-double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const gsl_vector *gs) {
-  double maf = 0.0;
-
-  for (size_t i = 0; i < ni_total; ++i) { // read genotypes
-    double geno = gsl_vector_get(gs, i);
-    maf += geno;
-  }
-  maf /= 2.0 * (double)(ni_test - n_miss);
-  return maf;
-}
 
 // Read bimbam mean genotype file, the first time, to obtain #SNPs for
 // analysis (ns_test) and total #SNP (ns_total).
diff --git a/src/lmm.cpp b/src/lmm.cpp
index d13ce3c..66b8e37 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -96,8 +96,9 @@ void LMM::CopyToParam(PARAM &cPar) {
   checkpoint_nofile("lmm-copy-to-param");
   cPar.time_UtX = time_UtX;
   cPar.time_opt = time_opt;
-
-  cPar.ng_test = ng_test; // number of markers tested
+  cPar.ns_total = ns_total;
+  cPar.ns_test  = ns_test;
+  cPar.ng_test  = ng_test; // number of markers tested
 
   return;
 }
@@ -2034,8 +2035,8 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
     */
     // drop missing idv and plug mean values for missing geno
     double x_total = 0.0; // sum genotype values to compute x_mean
-    uint vpos = 0;         // position in target vector
-    uint n_miss = 0;
+    uint vpos = 0;        // position in target vector
+    uint n_miss = 0;      // count NA genotypes
     gsl_vector_set_zero(x_miss);
     for (size_t i = 0; i < ni_total; ++i) {
       // get the genotypes per individual and compute stats per SNP
@@ -2107,6 +2108,9 @@ void LMM::mdb_analyze(std::function< SnpNameValues2(size_t) >& fetch_snp,
     outfile << chr << "\t";
     outfile << name << "\t";
     outfile << pos << "\t";
+    auto n_miss = 0;
+    outfile << n_miss << "\t-\t-\t"; // n_miss column
+    // outfile << st.maf << "\t" ;
     if (a_mode != M_LMM2) {
       outfile << st.beta << "\t";
       outfile << st.se << "\t";
@@ -2240,18 +2244,26 @@ void LMM::mdb_calc_gwa(const gsl_matrix *U, const gsl_vector *eval,
       auto marker = string(value2);
       // 1       rs13476251      174792257
       // cout << static_cast<int>(chr) << ":" << pos2 << " line " << rest2 << ":" << marker << endl ;
-      markerinfo = make_tuple(marker,chr,pos,num);
 
       auto size = num_floats;
       gs.reserve(size);
       for (size_t i = 0; i < size; ++i) {
         gs.push_back(static_cast<double>(gsbuf[i]));
       }
-      // cout << "!!!!" << size << snp << ": " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl;
+
+      // compute maf and n_miss (NAs)
+      size_t n_miss = 0; // count NAs: FIXME
+      double maf = compute_maf(ni_total, ni_test, n_miss, gs.data());
+
+      markerinfo = make_tuple(marker,chr,pos,num);
+
+      // cout << "!!!!" << size << marker << ": af" << maf << " " << gs[0] << "," << gs[1] << "," << gs[2] << "," << gs[3] << endl;
     }
     return make_tuple(success, markerinfo, gs);
   };
   LMM::mdb_analyze(fetch_snp,U,eval,UtW,Uty,W,y,gwasnps,num_markers);
+
+  ns_total = ns_test = num_markers; // track global number of snps in original gemma (goes to cPar)
 }
 
 
diff --git a/src/lmm.h b/src/lmm.h
index 35064c7..715b9c0 100644
--- a/src/lmm.h
+++ b/src/lmm.h
@@ -45,10 +45,10 @@ public:
   size_t e_mode;
 };
 
-typedef tuple< string, uint16_t, uint32_t, uint32_t > MarkerChrPos;
+typedef tuple< string, uint16_t, uint32_t, uint32_t > MarkerChrPos; // name, chr, pos, line
 typedef vector<MarkerChrPos> Markers;
 typedef tuple< string,vector<double> > SnpNameValues;
-typedef tuple< bool,MarkerChrPos,vector<double> > SnpNameValues2;
+typedef tuple< bool,MarkerChrPos,vector<double> > SnpNameValues2; // success, markerinfo (maf and n_miss are computed)
 // Results for LMM.
 class SUMSTAT2 {
 public:
diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp
index a351509..f132e6e 100644
--- a/src/mathfunc.cpp
+++ b/src/mathfunc.cpp
@@ -645,3 +645,34 @@ double UcharToDouble02(const unsigned char c) { return (double)c * 0.01; }
 unsigned char Double02ToUchar(const double dosage) {
   return (int)(dosage * 100);
 }
+
+
+std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs) {
+  size_t n_0 = 0;
+  size_t n_1 = 0;
+  size_t n_2 = 0;
+  for (size_t i = 0; i < ni_total; ++i) { // read genotypes
+    double geno = gsl_vector_get(gs, i);
+    if (geno >= 0 && geno <= 0.5) {
+      n_0++;
+    }
+    if (geno > 0.5 && geno < 1.5) {
+      n_1++;
+    }
+    if (geno >= 1.5 && geno <= 2.0) {
+      n_2++;
+    }
+  }
+  return {n_0,n_1,n_2};
+}
+
+double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const double *gs) {
+  double maf = 0.0;
+
+  for (size_t i = 0; i < ni_total; ++i) { // read genotypes
+    double geno = gs[i];
+    maf += geno; // 0..2 range
+  }
+  maf /= 2.0 * (double)(ni_test - n_miss); // Assumption is that geno value is between 0 and 2. FIXME
+  return maf;
+}
diff --git a/src/mathfunc.h b/src/mathfunc.h
index 641d0a3..b408ea7 100644
--- a/src/mathfunc.h
+++ b/src/mathfunc.h
@@ -24,6 +24,7 @@
 // #include "Eigen/Dense"
 #include "gsl/gsl_matrix.h"
 #include "gsl/gsl_vector.h"
+#include <tuple>
 
 #define CONDITIONED_MAXRATIO 2e+6 // based on http://mathworld.wolfram.com/ConditionNumber.html
 #define EIGEN_MINVALUE 1e-10
@@ -77,4 +78,8 @@ unsigned char Double02ToUchar(const double dosage);
 // void uchar_matrix_get_row(const vector<vector<unsigned char>> &X,
 //                          const size_t i_row, Eigen::VectorXd &x_row);
 
+std::tuple<size_t, size_t, size_t> compute_ratio(size_t ni_total, const gsl_vector *gs);
+double compute_maf(size_t ni_total, size_t ni_test, size_t n_miss, const double *gs);
+
+
 #endif