Comments

author: Pjotr Prins 2025-11-23 14:09:24 +0100
committer: Pjotr Prins 2025-11-23 14:09:24 +0100
commit: 8229e4e18dddae7bf570ca37767ff39726eede45 (patch)
tree: 17986d394dbc3a03da9bebe6eba96119dff571dd /src
parent: ff196955e3ca5bdd671a44f852f42323d3828be2 (diff)
download: pangemma-8229e4e18dddae7bf570ca37767ff39726eede45.tar.gz
3 files changed, 35 insertions, 20 deletions
diff --git a/src/gemma.cpp b/src/gemma.cpp
index 4ef2059..b30c3c3 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -2572,19 +2572,19 @@ void GEMMA::BatchRun(PARAM &cPar) {
       cPar.a_mode == M_LMM4 || cPar.a_mode == M_LMM5 || cPar.a_mode == M_LMM9 ||
       cPar.a_mode == M_EIGEN) { // Fit LMM or mvLMM or eigen
     write(cPar.a_mode, "Mode");
-    gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph);
+    gsl_matrix *Y = gsl_matrix_safe_alloc(cPar.ni_test, cPar.n_ph); // Y is ind x phenotypes
     enforce_msg(Y, "allocate Y"); // just to be sure
-    gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt);
-    gsl_matrix *B = gsl_matrix_safe_alloc(Y->size2, W->size2); // B is a d by c
+    gsl_matrix *W = gsl_matrix_safe_alloc(Y->size1, cPar.n_cvt);    // W is ind x covariates
+    gsl_matrix *B = gsl_matrix_safe_alloc(Y->size2, W->size2);      // B is a d by c (effect size)
                                                           // matrix
-    gsl_matrix *se_B = gsl_matrix_safe_alloc(Y->size2, W->size2);
-    gsl_matrix *K = gsl_matrix_safe_alloc(Y->size1, Y->size1);
+    gsl_matrix *se_B = gsl_matrix_safe_alloc(Y->size2, W->size2);   // se
+    gsl_matrix *K = gsl_matrix_safe_alloc(Y->size1, Y->size1);      // Kinship aka as G/GRM in the equation ind x ind
     gsl_matrix *U = gsl_matrix_safe_alloc(Y->size1, Y->size1);
-    gsl_matrix *UtW = gsl_matrix_calloc(Y->size1, W->size2);
-    gsl_matrix *UtY = gsl_matrix_calloc(Y->size1, Y->size2);
-    gsl_vector *eval = gsl_vector_calloc(Y->size1);
-    gsl_vector *env = gsl_vector_safe_alloc(Y->size1);
-    gsl_vector *weight = gsl_vector_safe_alloc(Y->size1);
+    gsl_matrix *UtW = gsl_matrix_calloc(Y->size1, W->size2);        // Transformed ind x covariates
+    gsl_matrix *UtY = gsl_matrix_calloc(Y->size1, Y->size2);        // Transforemd ind x phenotypes
+    gsl_vector *eval = gsl_vector_calloc(Y->size1);                 // eigen values
+    gsl_vector *env = gsl_vector_safe_alloc(Y->size1);              // E environment
+    gsl_vector *weight = gsl_vector_safe_alloc(Y->size1);           // weights
     debug_msg("Started on LMM");
     assert_issue(is_issue(26), UtY->data[0] == 0.0);
 
@@ -2696,8 +2696,8 @@ void GEMMA::BatchRun(PARAM &cPar) {
       LMM cLmm;
       cLmm.CopyFromParam(cPar);
 
-      gsl_vector_view Y_col = gsl_matrix_column(Y, 0);
-      gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0);
+      gsl_vector_view Y_col = gsl_matrix_column(Y, 0); // get a single phenotype column
+      gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); // and transformed column
 
       cLmm.AnalyzeGene(U, eval, UtW, &UtY_col.vector, W,
                        &Y_col.vector); // y is the predictor, not the phenotype
@@ -2814,8 +2814,8 @@ void GEMMA::BatchRun(PARAM &cPar) {
 
           // if (is_check_mode()) disable_segfpe(); // disable fast NaN checking for now
 
-          gsl_vector_view Y_col = gsl_matrix_column(Y, 0);
-          gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0);
+          gsl_vector_view Y_col = gsl_matrix_column(Y, 0); // get pheno column
+          gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0); // and its transformed
           write(&Y_col.vector, "Y_col");
 
           if (!cPar.file_bfile.empty()) {
@@ -2840,7 +2840,7 @@ void GEMMA::BatchRun(PARAM &cPar) {
                                     &Y_col.vector, env);
             }
           }
-          cLmm.WriteFiles();
+          cLmm.WriteFiles(); // write output files
           cLmm.CopyToParam(cPar);
         } else {
           debug_msg("fit mvLMM (multiple phenotypes)");
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index fddfd79..7d1f0ca 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -698,7 +698,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
 
   gsl_vector *genotype = gsl_vector_safe_alloc(W->size1);
   gsl_vector *genotype_miss = gsl_vector_safe_alloc(W->size1);
-  gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2);
+  gsl_matrix *WtW = gsl_matrix_safe_alloc(W->size2, W->size2); // Covariates
   gsl_matrix *WtWi = gsl_matrix_safe_alloc(W->size2, W->size2);
   gsl_vector *Wtx = gsl_vector_safe_alloc(W->size2);
   gsl_vector *WtWiWtx = gsl_vector_safe_alloc(W->size2);
@@ -746,13 +746,14 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
   //      cerr << key << endl;
   // }
   while (!safe_get_line(infile, line).eof()) {
-    // cout << line;
+    // fetch SNP annotation
     ch_ptr = strtok_safe2((char *)line.c_str(), " ,\t",infilen);
     rs = ch_ptr;
     ch_ptr = strtok_safe2(NULL, " ,\t",infilen);
     minor = ch_ptr;
     ch_ptr = strtok_safe2(NULL, " ,\t",infilen);
     major = ch_ptr;
+    // genotypes get read below
 
     if (setSnps.size() != 0 && setSnps.count(rs) == 0) {
       // if SNP in geno but not in -snps we add an missing value
@@ -792,11 +793,12 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
     c_idv = 0;
     gsl_vector_set_zero(genotype_miss);
     auto infilen = file_geno.c_str();
-    for (int i = 0; i < ni_total; ++i) {
+    for (int i = 0; i < ni_total; ++i) { // read genotypes
       ch_ptr = strtok_safe2(NULL, " ,\t",(string("To many individuals/strains/genometypes in pheno file for ")+infilen).c_str());
       if (indicator_idv[i] == 0)
         continue;
 
+      // annotate missing genotypes
       enforce_msg(ch_ptr,"Problem reading geno file (not enough genotypes in line)");
       if (strcmp(ch_ptr, "NA") == 0) {
         gsl_vector_set(genotype_miss, c_idv, 1);
@@ -900,7 +902,7 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
   if (max_g != 2.0)
     warning_msg("The maximum genotype value is not 2.0 - this is not the BIMBAM standard and will skew l_lme and effect sizes");
 
-  gsl_vector_free(genotype);
+  gsl_vector_free(genotype); // we don't hang on to the genotypes
   gsl_vector_free(genotype_miss);
   gsl_matrix_free(WtW);
   gsl_matrix_free(WtWi);
diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp
index a1c6c29..a351509 100644
--- a/src/mathfunc.cpp
+++ b/src/mathfunc.cpp
@@ -494,7 +494,20 @@ void StandardizeVector(gsl_vector *y) {
 }
 
 // Calculate UtX (U gets transposed)
-// CalcUtX transforms a genotype matrix into the eigenspace by computing U^T × X, where U contains the eigenvectors from the kinship matrix decomposition. This transformation diagonalizes the correlation structure induced by relatedness, making subsequent likelihood calculations tractable and efficient. The function overwrites its input with the transformed result, using a temporary copy to avoid data corruption.
+/*
+CalcUtX transforms a genotype matrix by the eigenvector matrix from the kinship matrix decomposition.
+Purpose:
+
+Computes UtX = U^T × X, where:
+
+    U: Eigenvector matrix from the spectral decomposition of the kinship matrix K
+    X: Genotype matrix (individuals × SNPs)
+    UtX: Transformed genotype matrix
+
+This transformation rotates the genotype data into the eigenspace where the kinship-induced correlations become diagonal, simplifying likelihood calculations.
+
+CalcUtX transforms a genotype matrix into the eigenspace by computing U^T × X, where U contains the eigenvectors from the kinship matrix decomposition. This transformation diagonalizes the correlation structure induced by relatedness, making subsequent likelihood calculations tractable and efficient. The function overwrites its input with the transformed result, using a temporary copy to avoid data corruption.
+*/
 void CalcUtX(const gsl_matrix *U, gsl_matrix *UtX) {
   gsl_matrix *X = gsl_matrix_safe_alloc(UtX->size1, UtX->size2);
   gsl_matrix_safe_memcpy(X, UtX);
author	Pjotr Prins	2025-11-23 14:09:24 +0100
committer	Pjotr Prins	2025-11-23 14:09:24 +0100
commit	8229e4e18dddae7bf570ca37767ff39726eede45 (patch)
tree	17986d394dbc3a03da9bebe6eba96119dff571dd /src
parent	ff196955e3ca5bdd671a44f852f42323d3828be2 (diff)
download	pangemma-8229e4e18dddae7bf570ca37767ff39726eede45.tar.gz