about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorPjotr Prins2020-09-28 08:47:27 +0100
committerPjotr Prins2020-09-28 08:47:27 +0100
commit6233a881ceaf701c8ae4b075e4a3ac0f90b6ca84 (patch)
tree133359d06bb78e234e568c2c480ccdbb6dddd2fe /src
parent519c4d005098dec382344514a837d49c164a5372 (diff)
downloadpangemma-6233a881ceaf701c8ae4b075e4a3ac0f90b6ca84.tar.gz
Comments and improved error message
Diffstat (limited to 'src')
-rw-r--r--src/gemma_io.cpp13
-rw-r--r--src/mathfunc.cpp8
2 files changed, 17 insertions, 4 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index cd2a5d1..0eea17f 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -1428,6 +1428,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
     if (indicator_snp[t] == 0)
       continue;
 
+    // Using regular expressions is slow:
     // std::regex_token_iterator<std::string::iterator> rend;
     // regex split_on("[,[:blank:]]+");
     // regex_token_iterator<string::iterator> tokens(line.begin(), line.end(),
@@ -1438,10 +1439,10 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
       uint token_num = tokens.size();
       if (token_num != ni_total+3) {
         cerr << line << endl;
-        cerr << token_num << " != " << ni_total << endl;
-        warning_msg("Columns in geno file do not match # individuals");
+        cerr << token_num-3 << " != " << ni_total << endl;
+        warning_msg("Columns in geno file do not match # individuals in phenotypes");
       }
-      enforce_msg(token_num <= ni_total + 3,"not enough genotype fields");
+      enforce_msg(token_num <= ni_total + 3,"not enough genotype fields for marker");
     }
 
     auto token_i = tokens.begin();
@@ -1466,7 +1467,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
       auto field = *token_i;
       auto sfield = std::string(field);
       // cout << i << ":" << sfield << "," << endl;
-      if (strncmp(field,"NA",2)==0) {
+      if (strncmp(field,"NA",2)==0) { // missing value
         gsl_vector_set(geno_miss, i, 0);
         n_miss++;
       } else {
@@ -1486,17 +1487,21 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
     geno_var /= (double)ni_total;
     geno_var -= geno_mean * geno_mean;
 
+    // impute missing values by plugging in the mean
     for (size_t i = 0; i < ni_total; ++i) {
       if (gsl_vector_get(geno_miss, i) == 0) {
         gsl_vector_set(geno, i, geno_mean);
       }
     }
 
+    // subtract the mean
     gsl_vector_add_constant(geno, -1.0 * geno_mean);
 
+    // center the genotypes
     if (k_mode == 2 && geno_var != 0) { // centering
       gsl_vector_scale(geno, 1.0 / sqrt(geno_var));
     }
+    write(geno,"geno");
     // set the SNP column ns_test
     gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize);
     enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno));
diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp
index 9043e00..cd74e09 100644
--- a/src/mathfunc.cpp
+++ b/src/mathfunc.cpp
@@ -150,11 +150,19 @@ void CenterMatrix(gsl_matrix *G) {
   gsl_vector *Gw = gsl_vector_safe_alloc(G->size1);
   gsl_vector_set_all(w, 1.0);
 
+  //  y := alpha*A*x+ beta*y or Gw = G*w
   gsl_blas_dgemv(CblasNoTrans, 1.0, G, w, 0.0, Gw);
+
+  // int gsl_blas_dsyr2(CBLAS_UPLO_t Uplo, double alpha, const gsl_vector * x, const gsl_vector * y, gsl_matrix * A)
+  // compute the symmetric rank-2 update A = \alpha x y^T + \alpha y x^T + A of the symmetric matrix A. Since the matrix A is symmetric only its upper half or lower half need to be stored
+  // or G = (UpperTriangle) alpha*Gw*w' + alpha*w*Gw' + G
   gsl_blas_dsyr2(CblasUpper, -1.0 / (double)G->size1, Gw, w, G);
+  // compute dot product of vectors w.Gw and store in d
   gsl_blas_ddot(w, Gw, &d);
+  // G = (upper) alpha w*w' + G
   gsl_blas_dsyr(CblasUpper, d / ((double)G->size1 * (double)G->size1), w, G);
 
+  // Transpose the matrix
   for (size_t i = 0; i < G->size1; ++i) {
     for (size_t j = 0; j < i; ++j) {
       d = gsl_matrix_get(G, j, i);