aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/gemma_io.cpp13
-rw-r--r--src/mathfunc.cpp8
2 files changed, 17 insertions, 4 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index cd2a5d1..0eea17f 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -1428,6 +1428,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
if (indicator_snp[t] == 0)
continue;
+ // Using regular expressions is slow:
// std::regex_token_iterator<std::string::iterator> rend;
// regex split_on("[,[:blank:]]+");
// regex_token_iterator<string::iterator> tokens(line.begin(), line.end(),
@@ -1438,10 +1439,10 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
uint token_num = tokens.size();
if (token_num != ni_total+3) {
cerr << line << endl;
- cerr << token_num << " != " << ni_total << endl;
- warning_msg("Columns in geno file do not match # individuals");
+ cerr << token_num-3 << " != " << ni_total << endl;
+ warning_msg("Columns in geno file do not match # individuals in phenotypes");
}
- enforce_msg(token_num <= ni_total + 3,"not enough genotype fields");
+ enforce_msg(token_num <= ni_total + 3,"not enough genotype fields for marker");
}
auto token_i = tokens.begin();
@@ -1466,7 +1467,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
auto field = *token_i;
auto sfield = std::string(field);
// cout << i << ":" << sfield << "," << endl;
- if (strncmp(field,"NA",2)==0) {
+ if (strncmp(field,"NA",2)==0) { // missing value
gsl_vector_set(geno_miss, i, 0);
n_miss++;
} else {
@@ -1486,17 +1487,21 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
geno_var /= (double)ni_total;
geno_var -= geno_mean * geno_mean;
+ // impute missing values by plugging in the mean
for (size_t i = 0; i < ni_total; ++i) {
if (gsl_vector_get(geno_miss, i) == 0) {
gsl_vector_set(geno, i, geno_mean);
}
}
+ // subtract the mean
gsl_vector_add_constant(geno, -1.0 * geno_mean);
+ // center the genotypes
if (k_mode == 2 && geno_var != 0) { // centering
gsl_vector_scale(geno, 1.0 / sqrt(geno_var));
}
+ write(geno,"geno");
// set the SNP column ns_test
gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize);
enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno));
diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp
index 9043e00..cd74e09 100644
--- a/src/mathfunc.cpp
+++ b/src/mathfunc.cpp
@@ -150,11 +150,19 @@ void CenterMatrix(gsl_matrix *G) {
gsl_vector *Gw = gsl_vector_safe_alloc(G->size1);
gsl_vector_set_all(w, 1.0);
+ // y := alpha*A*x+ beta*y or Gw = G*w
gsl_blas_dgemv(CblasNoTrans, 1.0, G, w, 0.0, Gw);
+
+ // int gsl_blas_dsyr2(CBLAS_UPLO_t Uplo, double alpha, const gsl_vector * x, const gsl_vector * y, gsl_matrix * A)
+ // compute the symmetric rank-2 update A = \alpha x y^T + \alpha y x^T + A of the symmetric matrix A. Since the matrix A is symmetric only its upper half or lower half need to be stored
+ // or G = (UpperTriangle) alpha*Gw*w' + alpha*w*Gw' + G
gsl_blas_dsyr2(CblasUpper, -1.0 / (double)G->size1, Gw, w, G);
+ // compute dot product of vectors w.Gw and store in d
gsl_blas_ddot(w, Gw, &d);
+ // G = (upper) alpha w*w' + G
gsl_blas_dsyr(CblasUpper, d / ((double)G->size1 * (double)G->size1), w, G);
+ // Transpose the matrix
for (size_t i = 0; i < G->size1; ++i) {
for (size_t j = 0; j < i; ++j) {
d = gsl_matrix_get(G, j, i);