From 8b88be4f48baa6cd0cc3c37a851144d5b1dc24af Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 30 Mar 2015 13:01:22 +0200 Subject: Refactoring genotype normalization --- wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 19 ++++++++++--------- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 9 +++++---- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 2 ++ 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py index 315fd824..49f32e3a 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -37,14 +37,15 @@ def normalize(ind_g): Run for every SNP list (for one individual) and return normalized SNP genotype values with missing data filled in """ - g1 = np.copy(ind_g) # avoid side effects - x = True - np.isnan(ind_g) # Matrix of True/False - m = ind_g[x].mean() # Global mean value - s = np.sqrt(ind_g[x].var()) # Global stddev - g1[np.isnan(ind_g)] = m # Plug-in mean values for missing data - if s == 0: - g1 = g1 - m # Subtract the mean + g = np.copy(ind_g) # copy to avoid side effects + missing = np.isnan(g) + values = g[True - missing] + mean = values.mean() # Global mean value + stddev = np.sqrt(values.var()) # Global stddev + g[missing] = mean # Plug-in mean values for missing data + if stddev == 0: + g = g - mean # Subtract the mean else: - g1 = (g1 - m) / s # Normalize the deviation - return g1 + g = (g - mean) / stddev # Normalize the deviation + return g diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index f0473f99..035f31e8 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -414,6 +414,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): info("genotype 2D matrix m (snps) is: %d" % (m)) assert m>n, "n should be larger than m (snps>inds)" keep = [] + mprint("G (before old normalize)",genotype_matrix) for counter in range(m): #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter])) #Checks if any values in column are not numbers @@ -435,10 +436,10 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): progress('kinship_old normalize genotype',counter,m) genotype_matrix = genotype_matrix[:,keep] - mprint("After kinship (old) genotype_matrix: ", genotype_matrix) - # kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m) - # return kinship_matrix,genotype_matrix - return kinship_full(genotype_matrix.T,uses),genotype_matrix + mprint("G (after old normalize)",genotype_matrix.T) + kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m) + return kinship_matrix,genotype_matrix + # return kinship_full(genotype_matrix.T,uses),genotype_matrix def GWAS(pheno_vector, genotype_matrix, diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 88e2a033..fc7a4b9d 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -106,6 +106,8 @@ if options.geno: if cmd == 'redis_new': # The main difference between redis_new and redis is that missing # phenotypes are handled by the first + if options.remove_missing_phenotypes: + raise Exception('Can not use --remove-missing-phenotypes with LMM2') Y = y G = g print "Original G",G.shape, "\n", G -- cgit v1.2.3