diff options
-rw-r--r-- | src/gemma_io.cpp | 13 | ||||
-rw-r--r-- | src/mathfunc.cpp | 8 |
2 files changed, 17 insertions, 4 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp index cd2a5d1..0eea17f 100644 --- a/src/gemma_io.cpp +++ b/src/gemma_io.cpp @@ -1428,6 +1428,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, if (indicator_snp[t] == 0) continue; + // Using regular expressions is slow: // std::regex_token_iterator<std::string::iterator> rend; // regex split_on("[,[:blank:]]+"); // regex_token_iterator<string::iterator> tokens(line.begin(), line.end(), @@ -1438,10 +1439,10 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, uint token_num = tokens.size(); if (token_num != ni_total+3) { cerr << line << endl; - cerr << token_num << " != " << ni_total << endl; - warning_msg("Columns in geno file do not match # individuals"); + cerr << token_num-3 << " != " << ni_total << endl; + warning_msg("Columns in geno file do not match # individuals in phenotypes"); } - enforce_msg(token_num <= ni_total + 3,"not enough genotype fields"); + enforce_msg(token_num <= ni_total + 3,"not enough genotype fields for marker"); } auto token_i = tokens.begin(); @@ -1466,7 +1467,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, auto field = *token_i; auto sfield = std::string(field); // cout << i << ":" << sfield << "," << endl; - if (strncmp(field,"NA",2)==0) { + if (strncmp(field,"NA",2)==0) { // missing value gsl_vector_set(geno_miss, i, 0); n_miss++; } else { @@ -1486,17 +1487,21 @@ bool BimbamKin(const string file_geno, const set<string> ksnps, geno_var /= (double)ni_total; geno_var -= geno_mean * geno_mean; + // impute missing values by plugging in the mean for (size_t i = 0; i < ni_total; ++i) { if (gsl_vector_get(geno_miss, i) == 0) { gsl_vector_set(geno, i, geno_mean); } } + // subtract the mean gsl_vector_add_constant(geno, -1.0 * geno_mean); + // center the genotypes if (k_mode == 2 && geno_var != 0) { // centering gsl_vector_scale(geno, 1.0 / sqrt(geno_var)); } + write(geno,"geno"); // set the SNP column ns_test gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize); enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno)); diff --git a/src/mathfunc.cpp b/src/mathfunc.cpp index 9043e00..cd74e09 100644 --- a/src/mathfunc.cpp +++ b/src/mathfunc.cpp @@ -150,11 +150,19 @@ void CenterMatrix(gsl_matrix *G) { gsl_vector *Gw = gsl_vector_safe_alloc(G->size1); gsl_vector_set_all(w, 1.0); + // y := alpha*A*x+ beta*y or Gw = G*w gsl_blas_dgemv(CblasNoTrans, 1.0, G, w, 0.0, Gw); + + // int gsl_blas_dsyr2(CBLAS_UPLO_t Uplo, double alpha, const gsl_vector * x, const gsl_vector * y, gsl_matrix * A) + // compute the symmetric rank-2 update A = \alpha x y^T + \alpha y x^T + A of the symmetric matrix A. Since the matrix A is symmetric only its upper half or lower half need to be stored + // or G = (UpperTriangle) alpha*Gw*w' + alpha*w*Gw' + G gsl_blas_dsyr2(CblasUpper, -1.0 / (double)G->size1, Gw, w, G); + // compute dot product of vectors w.Gw and store in d gsl_blas_ddot(w, Gw, &d); + // G = (upper) alpha w*w' + G gsl_blas_dsyr(CblasUpper, d / ((double)G->size1 * (double)G->size1), w, G); + // Transpose the matrix for (size_t i = 0; i < G->size1; ++i) { for (size_t j = 0; j < i; ++j) { d = gsl_matrix_get(G, j, i); |