aboutsummaryrefslogtreecommitdiff
path: root/src/gemma_io.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gemma_io.cpp')
-rw-r--r--src/gemma_io.cpp13
1 files changed, 9 insertions, 4 deletions
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index cd2a5d1..0eea17f 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -1428,6 +1428,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
if (indicator_snp[t] == 0)
continue;
+ // Using regular expressions is slow:
// std::regex_token_iterator<std::string::iterator> rend;
// regex split_on("[,[:blank:]]+");
// regex_token_iterator<string::iterator> tokens(line.begin(), line.end(),
@@ -1438,10 +1439,10 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
uint token_num = tokens.size();
if (token_num != ni_total+3) {
cerr << line << endl;
- cerr << token_num << " != " << ni_total << endl;
- warning_msg("Columns in geno file do not match # individuals");
+ cerr << token_num-3 << " != " << ni_total << endl;
+ warning_msg("Columns in geno file do not match # individuals in phenotypes");
}
- enforce_msg(token_num <= ni_total + 3,"not enough genotype fields");
+ enforce_msg(token_num <= ni_total + 3,"not enough genotype fields for marker");
}
auto token_i = tokens.begin();
@@ -1466,7 +1467,7 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
auto field = *token_i;
auto sfield = std::string(field);
// cout << i << ":" << sfield << "," << endl;
- if (strncmp(field,"NA",2)==0) {
+ if (strncmp(field,"NA",2)==0) { // missing value
gsl_vector_set(geno_miss, i, 0);
n_miss++;
} else {
@@ -1486,17 +1487,21 @@ bool BimbamKin(const string file_geno, const set<string> ksnps,
geno_var /= (double)ni_total;
geno_var -= geno_mean * geno_mean;
+ // impute missing values by plugging in the mean
for (size_t i = 0; i < ni_total; ++i) {
if (gsl_vector_get(geno_miss, i) == 0) {
gsl_vector_set(geno, i, geno_mean);
}
}
+ // subtract the mean
gsl_vector_add_constant(geno, -1.0 * geno_mean);
+ // center the genotypes
if (k_mode == 2 && geno_var != 0) { // centering
gsl_vector_scale(geno, 1.0 / sqrt(geno_var));
}
+ write(geno,"geno");
// set the SNP column ns_test
gsl_vector_view Xlarge_col = gsl_matrix_column(Xlarge, ns_test % msize);
enforce_gsl(gsl_vector_memcpy(&Xlarge_col.vector, geno));