diff options
author | Pjotr Prins | 2015-03-30 13:01:22 +0200 |
---|---|---|
committer | Pjotr Prins | 2015-03-30 13:01:22 +0200 |
commit | 8b88be4f48baa6cd0cc3c37a851144d5b1dc24af (patch) | |
tree | cb35abd3dd57a93aa02b10eaf3cac01c46d41ff7 /wqflask | |
parent | 6fc112431c0edb0ecae6cd5fa45716c349094a7f (diff) | |
download | genenetwork2-8b88be4f48baa6cd0cc3c37a851144d5b1dc24af.tar.gz |
Refactoring genotype normalization
Diffstat (limited to 'wqflask')
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 19 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 9 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 2 |
3 files changed, 17 insertions, 13 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py index 315fd824..49f32e3a 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -37,14 +37,15 @@ def normalize(ind_g): Run for every SNP list (for one individual) and return normalized SNP genotype values with missing data filled in """ - g1 = np.copy(ind_g) # avoid side effects - x = True - np.isnan(ind_g) # Matrix of True/False - m = ind_g[x].mean() # Global mean value - s = np.sqrt(ind_g[x].var()) # Global stddev - g1[np.isnan(ind_g)] = m # Plug-in mean values for missing data - if s == 0: - g1 = g1 - m # Subtract the mean + g = np.copy(ind_g) # copy to avoid side effects + missing = np.isnan(g) + values = g[True - missing] + mean = values.mean() # Global mean value + stddev = np.sqrt(values.var()) # Global stddev + g[missing] = mean # Plug-in mean values for missing data + if stddev == 0: + g = g - mean # Subtract the mean else: - g1 = (g1 - m) / s # Normalize the deviation - return g1 + g = (g - mean) / stddev # Normalize the deviation + return g diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index f0473f99..035f31e8 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -414,6 +414,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): info("genotype 2D matrix m (snps) is: %d" % (m)) assert m>n, "n should be larger than m (snps>inds)" keep = [] + mprint("G (before old normalize)",genotype_matrix) for counter in range(m): #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter])) #Checks if any values in column are not numbers @@ -435,10 +436,10 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): progress('kinship_old normalize genotype',counter,m) genotype_matrix = genotype_matrix[:,keep] - mprint("After kinship (old) genotype_matrix: ", genotype_matrix) - # kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m) - # return kinship_matrix,genotype_matrix - return kinship_full(genotype_matrix.T,uses),genotype_matrix + mprint("G (after old normalize)",genotype_matrix.T) + kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m) + return kinship_matrix,genotype_matrix + # return kinship_full(genotype_matrix.T,uses),genotype_matrix def GWAS(pheno_vector, genotype_matrix, diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 88e2a033..fc7a4b9d 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -106,6 +106,8 @@ if options.geno: if cmd == 'redis_new': # The main difference between redis_new and redis is that missing # phenotypes are handled by the first + if options.remove_missing_phenotypes: + raise Exception('Can not use --remove-missing-phenotypes with LMM2') Y = y G = g print "Original G",G.shape, "\n", G |