Adding comments

author: Pjotr Prins 2025-11-23 12:44:20 +0100
committer: Pjotr Prins 2025-11-23 12:44:20 +0100
commit: ff196955e3ca5bdd671a44f852f42323d3828be2 (patch)
tree: 1a1a12dea794dff9df5f4ca991740cbaf48c992a /src
parent: 663778957ad37d5c7806d4f17c1f2e77b2b268fa (diff)
download: pangemma-ff196955e3ca5bdd671a44f852f42323d3828be2.tar.gz
4 files changed, 431 insertions, 3 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index 76f8a35..fddfd79 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -641,8 +641,45 @@ bool ReadFile_fam(const string &file_fam, vector<vector<int>> &indicator_pheno,
 
 // Read bimbam mean genotype file, the first time, to obtain #SNPs for
 // analysis (ns_test) and total #SNP (ns_total).
+/*
+## Modified (Mutated) Parameters:
+
+1. **`indicator_idv`** (vector<int>&)
+   - Actually **not modified** - only read from to determine which individuals to include
+
+2. **`indicator_snp`** (vector<int>&)
+   - **Modified**: Cleared at the start (`indicator_snp.clear()`)
+   - Populated with 0s and 1s to indicate whether each SNP passes filters
+   - One entry per SNP in the genotype file
+
+3. **`mapRS2chr`** (map<string, string>&)
+   - **Not modified** - only read from to get chromosome info
+
+4. **`mapRS2bp`** (map<string, long int>&)
+   - **Not modified** - only read from to get base pair positions
+
+5. **`mapRS2cM`** (map<string, double>&)
+   - **Not modified** - only read from to get centimorgan positions
+
+6. **`snpInfo`** (vector<SNPINFO>&)
+   - **Modified**: Cleared at the start (`snpInfo.clear()`)
+   - Populated with SNPINFO structures for each SNP containing metadata (chr, rs, position, alleles, missingness, MAF, etc.)
+
+7. **`ns_test`** (size_t&)
+   - **Modified**: Set to 0 initially
+   - Incremented for each SNP that passes all filters
+   - Returns the count of SNPs that will be used for testing
+
+## Summary of Mutated Outputs:
+- **`indicator_snp`**: Binary indicators for which SNPs passed filtering
+- **`snpInfo`**: Complete metadata for all SNPs in the file
+- **`ns_test`**: Count of SNPs passing filters
+
+The function returns a `bool` indicating success/failure, but the real outputs are these three modified reference parameters.
+*/
+
 bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
-                   const gsl_matrix *W, vector<int> &indicator_idv,
+                   const gsl_matrix *W, const vector<int> &indicator_idv,
                    vector<int> &indicator_snp, const double &maf_level,
                    const double &miss_level, const double &hwe_level,
                    const double &r2_level, map<string, string> &mapRS2chr,
diff --git a/src/gemma_io.h b/src/gemma_io.h
index dd1d5c0..5c14227 100644
--- a/src/gemma_io.h
+++ b/src/gemma_io.h
@@ -60,7 +60,7 @@ bool ReadFile_column(const string &file_pheno, vector<int> &indicator_idv,
                      vector<double> &pheno, const int &p_column);
 
 bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
-                   const gsl_matrix *W, vector<int> &indicator_idv,
+                   const gsl_matrix *W, const vector<int> &indicator_idv,
                    vector<int> &indicator_snp, const double &maf_level,
                    const double &miss_level, const double &hwe_level,
                    const double &r2_level, map<string, string> &mapRS2chr,
diff --git a/src/lmm.cpp b/src/lmm.cpp
index ccf880a..87b9e1e 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -1211,6 +1211,142 @@ void LMM::CalcRLScore(const double l, const FUNC_PARAM &params, double &beta,
   gsl_vector_safe_free(v_temp);
 }
 
+/*
+  ## What `CalcUab` does:
+
+`CalcUab` **computes element-wise products of transformed covariates and phenotypes** to create sufficient statistics needed for linear mixed model calculations.
+
+### Purpose:
+
+Creates a matrix `Uab` containing all pairwise element-wise products of:
+- Transformed covariates: columns of `UtW` (U^T × W)
+- Transformed phenotype: vector `Uty` (U^T × y)
+
+These products are used later to efficiently compute variance components and test statistics without repeatedly accessing the original data.
+
+### Function Signature:
+```cpp
+void CalcUab(const gsl_matrix *UtW,   // U^T × W (transformed covariates)
+             const gsl_vector *Uty,   // U^T × y (transformed phenotype)
+             gsl_matrix *Uab)         // OUTPUT: matrix of products
+```
+
+### Matrix Structure:
+
+Given:
+- **n_cvt** covariates (columns in W)
+- 1 phenotype (y)
+
+The function creates products for indices `a` and `b` where:
+- `a, b ∈ {1, 2, ..., n_cvt, n_cvt+2}` (note: n_cvt+1 is **skipped**)
+- Only computes for `b ≤ a` (lower triangular, symmetric)
+
+**Index mapping:**
+- `1` to `n_cvt`: Covariate columns from UtW
+- `n_cvt+1`: **SKIPPED** (reserved for genotype, added later)
+- `n_cvt+2`: Phenotype (Uty)
+
+### Algorithm Walkthrough:
+
+```cpp
+for (size_t a = 1; a <= n_cvt + 2; ++a) {
+  // Get column 'a'
+  if (a == n_cvt + 2) {
+    u_a = Uty;  // Phenotype
+  } else {
+    u_a = UtW[:, a-1];  // Covariate a
+  }
+
+  for (size_t b = a; b >= 1; --b) {
+    // Get column 'b'
+    if (b == n_cvt + 2) {
+      u_b = Uty;  // Phenotype
+    } else {
+      u_b = UtW[:, b-1];  // Covariate b
+    }
+
+    // Store element-wise product: u_a .* u_b
+    index_ab = GetabIndex(a, b, n_cvt);
+    Uab[:, index_ab] = u_a .* u_b;
+  }
+}
+```
+
+### Example:
+
+With **n_cvt = 2** (2 covariates):
+
+**Columns stored in Uab:**
+```
+GetabIndex(1, 1, 2) → UtW₁ .* UtW₁    (covariate 1 × covariate 1)
+GetabIndex(2, 1, 2) → UtW₂ .* UtW₁    (covariate 2 × covariate 1)
+GetabIndex(2, 2, 2) → UtW₂ .* UtW₂    (covariate 2 × covariate 2)
+GetabIndex(4, 1, 2) → Uty .* UtW₁     (phenotype × covariate 1)
+GetabIndex(4, 2, 2) → Uty .* UtW₂     (phenotype × covariate 2)
+GetabIndex(4, 4, 2) → Uty .* Uty      (phenotype × phenotype)
+
+Note: Index 3 (n_cvt+1) is skipped - reserved for genotype
+```
+
+### Why Skip `n_cvt+1`?
+
+```cpp
+if (a == n_cvt + 1) continue;
+if (b == n_cvt + 1) continue;
+```
+
+The slot `n_cvt+1` is **reserved for the genotype (SNP)** being tested, which is added by the overloaded version:
+
+```cpp
+void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty,
+             const gsl_vector *Utx,  // ← GENOTYPE
+             gsl_matrix *Uab);
+```
+
+This design allows the base covariate products to be computed once, then genotype-specific products added for each SNP.
+
+### Index Function:
+
+```cpp
+size_t GetabIndex(const size_t a, const size_t b, const size_t n_cvt)
+```
+
+Maps `(a, b)` pairs to column indices in `Uab`, ensuring:
+- Only lower triangular entries (b ≤ a)
+- Skips the genotype position (n_cvt+1)
+
+### Matrix Dimensions:
+
+```cpp
+size_t n_index = (n_cvt + 2 + 1) * (n_cvt + 2) / 2;
+```
+
+Number of unique products in the symmetric matrix (including diagonal), accounting for the skipped genotype position.
+
+### In Context (from `batch_compute`):
+
+```cpp
+// Pre-compute covariate products once
+CalcUab(UtW, Uty, Uab);
+
+// For each SNP:
+for (size_t i = 0; i < batch_size; i++) {
+  gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
+  gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector);
+
+  // Add genotype-specific products
+  CalcUab(UtW, Uty, Utx, Uab);  // Overloaded version
+
+  // Use Uab for likelihood calculations
+  CalcLambda(..., Uab, ...);
+}
+```
+
+### Summary:
+
+**`CalcUab` pre-computes all pairwise element-wise products** of transformed covariates and phenotype (UtW columns and Uty). These products are sufficient statistics used repeatedly in likelihood calculations for different SNPs, avoiding redundant computation. The design reserves space for genotype products to be added later per-SNP while reusing the fixed covariate products.
+*/
+
 void CalcUab(const gsl_matrix *UtW, const gsl_vector *Uty, gsl_matrix *Uab) {
   size_t index_ab;
   size_t n_cvt = UtW->size2;
@@ -1471,6 +1607,53 @@ void LMM::AnalyzeGene(const gsl_matrix *U, const gsl_vector *eval,
   return;
 }
 
+/*
+Looking at the LMM::Analyze function, here are the parameters that get modified:
+Modified (Mutated) Parameters:
+
+None of the function parameters are directly modified.
+
+All parameters are either:
+
+    const gsl_matrix * - pointer to const (read-only)
+    const gsl_vector * - pointer to const (read-only)
+    const set<string> - const set passed by value (copy)
+    std::function<...>& - reference to a function, which is called but not modified itself
+
+What Actually Gets Modified:
+
+The function modifies member variables of the LMM class object:
+
+    sumStat (member variable)
+        Modified: sumStat.push_back(SNPs) is called in the batch_compute lambda
+        Accumulates summary statistics (beta, se, lambda values, p-values) for each SNP that passes filters
+        This is the primary output of the analysis
+    Timing member variables (implied by member access):
+        time_UtX: time_UtX += (clock() - time_start) / ...
+        time_opt: time_opt += (clock() - time_start) / ...
+        These accumulate timing information for performance profiling
+    Member variables read but not modified:
+        indicator_snp - read to determine which SNPs to process
+        indicator_idv - read to determine which individuals to include
+        ni_total - number of total individuals
+        ni_test - number of individuals in test
+        n_cvt - number of covariates
+        l_mle_null, l_min, l_max, n_region, logl_mle_H0 - parameters for likelihood calculations
+        a_mode - analysis mode (M_LMM1, M_LMM2, etc.)
+        d_pace - for progress display
+
+Summary:
+
+From the outside caller's perspective: None of the parameters passed to Analyze() are mutated. All inputs are const.
+
+The actual output is stored in the member variable sumStat, which accumulates one SUMSTAT structure per analyzed SNP containing:
+
+    beta - effect size
+    se - standard error
+    lambda_remle, lambda_mle - variance component estimates
+    p_wald, p_lrt, p_score - p-values from different tests
+    logl_H1 - log-likelihood under alternative hypothesis
+*/
 
 void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
                   const gsl_matrix *U, const gsl_vector *eval,
@@ -1511,6 +1694,13 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
   // start reading genotypes and analyze
   size_t c = 0;
 
+  /*
+    batch_compute(l) takes l SNPs that have been loaded into Xlarge,
+    transforms them all at once using the eigenvector matrix U, then
+    loops through each transformed SNP to compute association
+    statistics (beta, standard errors, p-values) and stores results in
+    sumStat.
+  */
   auto batch_compute = [&](size_t l) { // using a C++ closure
     // Compute SNPs in batch, note the computations are independent per SNP
     // debug_msg("enter batch_compute");
@@ -1519,16 +1709,20 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
         gsl_matrix_submatrix(UtXlarge, 0, 0, inds, l);
 
     time_start = clock();
+    // Transforms all l SNPs in the batch at once: UtXlarge = U^T × Xlarge
+    // This is much faster than doing l separate matrix-vector products
+    // U is the eigenvector matrix from the spectral decomposition of the kinship matrix
     fast_dgemm("T", "N", 1.0, U, &Xlarge_sub.matrix, 0.0,
                    &UtXlarge_sub.matrix);
     time_UtX += (clock() - time_start) / (double(CLOCKS_PER_SEC) * 60.0);
 
     gsl_matrix_set_zero(Xlarge);
     for (size_t i = 0; i < l; i++) {
-      // for every batch...
+      // for each snp batch item extract transformed genotype:
       gsl_vector_view UtXlarge_col = gsl_matrix_column(UtXlarge, i);
       gsl_vector_safe_memcpy(Utx, &UtXlarge_col.vector);
 
+      // Calculate design matrix components and compute sufficient statistics for the regression model
       CalcUab(UtW, Uty, Utx, Uab);
 
       time_start = clock();
@@ -1538,17 +1732,24 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
       double p_lrt = 0.0, p_score = 0.0;
       double logl_H1 = 0.0;
 
+      // Run statistical tests based on analysis mode
       // 3 is before 1.
       if (a_mode == M_LMM3 || a_mode == M_LMM4 || a_mode == M_LMM9 ) {
         CalcRLScore(l_mle_null, param1, beta, se, p_score);
       }
 
+      // Computes Wald statistic for testing association
       if (a_mode == M_LMM1 || a_mode == M_LMM4) {
         // for univariate a_mode is 1
+        // Estimates variance component (lambda) via REML
         CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
         CalcRLWald(lambda_remle, param1, beta, se, p_wald);
       }
 
+      // Estimates variance component (lambda) via REML
+      // Likelihood Ratio Test (modes 2, 4, 9):
+      // Estimates variance component via MLE
+      // Compares log-likelihood under alternative vs null hypothesis
       if (a_mode == M_LMM2 || a_mode == M_LMM4 || a_mode == M_LMM9) {
         CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
         p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
@@ -1658,6 +1859,48 @@ void LMM::Analyze(std::function< SnpNameValues(size_t) >& fetch_snp,
 
 }
 
+/*
+
+Looking at the `LMM::AnalyzeBimbam` function, here are the parameters that get modified:
+
+## Modified (Mutated) Parameters:
+
+**None of the parameters are modified.**
+
+All parameters are passed as:
+- `const gsl_matrix *` - pointer to const matrix (read-only)
+- `const gsl_vector *` - pointer to const vector (read-only)
+- `const set<string>` - const set passed by value (copy)
+
+## What Actually Gets Modified:
+
+The function modifies **internal state** and **local variables** only:
+
+1. **Local variables modified**:
+   - `prev_line` - tracks current line position in file
+   - `gs` - vector that gets resized and reassigned for each SNP
+   - `infile` - file stream that gets opened, read from, and closed
+
+2. **Member variables** (implied by `this->`):
+   - `file_geno` - used to open the file (read-only here)
+   - `ni_total` - number of individuals (read-only here)
+
+3. **The lambda function `fetch_snp`** captures local variables by reference (`[&]`) and modifies:
+   - `prev_line` - incremented as lines are read
+   - `gs` - reassigned with new genotype values for each SNP
+   - `infile` - advanced through the file
+
+## Summary:
+
+**From the outside caller's perspective**: Nothing passed into `AnalyzeBimbam` gets mutated. All inputs are treated as read-only (`const`).
+
+The function's work happens through:
+- Reading from the genotype file
+- Calling `LMM::Analyze(fetch_snp, ...)` with a callback that reads SNP data
+- Any mutations likely happen inside the `LMM::Analyze` method (which would need to be examined separately to see what it modifies)
+
+*/
+
 void LMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
                         const gsl_matrix *UtW, const gsl_vector *Uty,
                         const gsl_matrix *W, const gsl_vector *y,
@@ -1944,6 +2187,153 @@ void MatrixCalcLR(const gsl_matrix *U, const gsl_matrix *UtX,
   return;
 }
 
+/*
+
+## What `CalcLambda` does:
+
+`CalcLambda` **estimates the variance component ratio (lambda)** in a linear mixed model by finding the value that maximizes either the log-likelihood or log-restricted likelihood function.
+
+### Purpose:
+
+In LMM, the variance structure is:
+- **Var(y) = λτσ² K + τσ² I**
+
+Where:
+- **λ (lambda)**: The ratio of genetic variance to environmental variance (Vg/Ve)
+- **K**: Kinship/relatedness matrix
+- **I**: Identity matrix
+
+Lambda determines how much of the phenotypic variance is explained by genetic relatedness vs random environmental effects.
+
+### Function Signature:
+```cpp
+void CalcLambda(const char func_name,      // 'R' for REML, 'L' for MLE
+                FUNC_PARAM &params,        // Model parameters
+                const double l_min,        // Min lambda to search
+                const double l_max,        // Max lambda to search
+                const size_t n_region,     // # intervals to divide search
+                double &lambda,            // OUTPUT: estimated lambda
+                double &logf)              // OUTPUT: log-likelihood at lambda
+```
+
+### Algorithm Overview:
+
+#### **Phase 1: Identify Candidate Intervals**
+```cpp
+for (size_t i = 0; i < n_region; ++i) {
+  lambda_l = l_min * exp(lambda_interval * i);
+  lambda_h = l_min * exp(lambda_interval * (i + 1.0));
+
+  dev1_l = LogRL_dev1(lambda_l, &params);  // First derivative at lower bound
+  dev1_h = LogRL_dev1(lambda_h, &params);  // First derivative at upper bound
+
+  if (dev1_l * dev1_h <= 0) {
+    lambda_lh.push_back(make_pair(lambda_l, lambda_h));  // Sign change = root
+  }
+}
+```
+- Divides the search range [l_min, l_max] into `n_region` intervals (log-spaced)
+- Evaluates the **first derivative** at interval boundaries
+- If the derivative changes sign, there's a local maximum in that interval
+
+#### **Phase 2: Find Maximum**
+
+**Case A: No sign changes found**
+```cpp
+if (lambda_lh.empty()) {
+  // Maximum is at a boundary
+  if (logf_l >= logf_h) {
+    lambda = l_min;
+  } else {
+    lambda = l_max;
+  }
+}
+```
+
+**Case B: Sign changes found (interior maximum)**
+
+For each candidate interval:
+
+1. **Brent's Method** (robust root-finding):
+   ```cpp
+   gsl_root_fsolver_brent
+   ```
+   - Finds where the first derivative = 0 (critical point)
+   - Doesn't require computing second derivatives
+   - Robust but slower
+
+2. **Newton-Raphson Method** (refinement):
+   ```cpp
+   gsl_root_fdfsolver_newton
+   ```
+   - Uses both first and second derivatives
+   - Refines the estimate from Brent's method
+   - Faster convergence near the root
+
+3. **Compare across intervals**:
+   ```cpp
+   if (logf < logf_l) {
+     logf = logf_l;
+     lambda = l;
+   }
+   ```
+   - Keeps track of the lambda with the highest log-likelihood
+   - Handles multiple local maxima
+
+#### **Phase 3: Check Boundaries**
+```cpp
+if (logf_l > logf) {
+  lambda = l_min;
+}
+if (logf_h > logf) {
+  lambda = l_max;
+}
+```
+- Ensures the global maximum isn't at a boundary
+
+### Function Types:
+
+- **`func_name = 'R'` or `'r'`**: REML (Restricted Maximum Likelihood)
+  - Uses `LogRL_f`, `LogRL_dev1`, `LogRL_dev2`
+  - Accounts for loss of degrees of freedom from fixed effects
+  - Preferred for variance component estimation
+
+- **`func_name = 'L'` or `'l'`**: MLE (Maximum Likelihood)
+  - Uses `LogL_f`, `LogL_dev1`, `LogL_dev2`
+  - Direct likelihood maximization
+  - Used for likelihood ratio tests
+
+### Error Handling:
+
+```cpp
+if (status == GSL_CONTINUE || status != GSL_SUCCESS) {
+  logf = nan("NAN");
+  lambda = nan("NAN");
+  return;
+}
+```
+- If optimization fails, returns NaN values
+- Warnings issued for non-convergence
+
+### In Context (from `batch_compute`):
+
+```cpp
+// REML for Wald test
+CalcLambda('R', param1, l_min, l_max, n_region, lambda_remle, logl_H1);
+CalcRLWald(lambda_remle, param1, beta, se, p_wald);
+
+// MLE for LRT test
+CalcLambda('L', param1, l_min, l_max, n_region, lambda_mle, logl_H1);
+p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_mle_H0), 1);
+```
+
+### Summary:
+
+**`CalcLambda` finds the optimal variance component ratio (λ)** that maximizes the (restricted) log-likelihood for a given SNP's genotype data in a linear mixed model. This λ value quantifies the relative contribution of genetic relatedness vs environmental noise to phenotypic variation, and is essential for computing association test statistics (Wald, LRT) while properly accounting for population structure and relatedness.
+
+*/
+
+
 void CalcLambda(const char func_name, FUNC_PARAM &params, const double l_min,
                 const double l_max, const size_t n_region, double &lambda,
                 double &logf) {
diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp
index 68d17a7..a1c6c29 100644
--- a/src/mathfunc.cpp
+++ b/src/mathfunc.cpp
@@ -494,6 +494,7 @@ void StandardizeVector(gsl_vector *y) {
 }
 
 // Calculate UtX (U gets transposed)
+// CalcUtX transforms a genotype matrix into the eigenspace by computing U^T × X, where U contains the eigenvectors from the kinship matrix decomposition. This transformation diagonalizes the correlation structure induced by relatedness, making subsequent likelihood calculations tractable and efficient. The function overwrites its input with the transformed result, using a temporary copy to avoid data corruption.
 void CalcUtX(const gsl_matrix *U, gsl_matrix *UtX) {
   gsl_matrix *X = gsl_matrix_safe_alloc(UtX->size1, UtX->size2);
   gsl_matrix_safe_memcpy(X, UtX);
author	Pjotr Prins	2025-11-23 12:44:20 +0100
committer	Pjotr Prins	2025-11-23 12:44:20 +0100
commit	ff196955e3ca5bdd671a44f852f42323d3828be2 (patch)
tree	1a1a12dea794dff9df5f4ca991740cbaf48c992a /src
parent	663778957ad37d5c7806d4f17c1f2e77b2b268fa (diff)
download	pangemma-ff196955e3ca5bdd671a44f852f42323d3828be2.tar.gz