Refactoring genotype normalization

author: Pjotr Prins 2015-03-30 13:01:22 +0200
committer: Pjotr Prins 2015-03-30 13:01:22 +0200
commit: 8b88be4f48baa6cd0cc3c37a851144d5b1dc24af (patch)
tree: cb35abd3dd57a93aa02b10eaf3cac01c46d41ff7 /wqflask
parent: 6fc112431c0edb0ecae6cd5fa45716c349094a7f (diff)
download: genenetwork2-8b88be4f48baa6cd0cc3c37a851144d5b1dc24af.tar.gz
3 files changed, 17 insertions, 13 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
index 315fd824..49f32e3a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -37,14 +37,15 @@ def normalize(ind_g):
     Run for every SNP list (for one individual) and return
     normalized SNP genotype values with missing data filled in
     """
-    g1 = np.copy(ind_g)          # avoid side effects
-    x = True - np.isnan(ind_g)   # Matrix of True/False
-    m = ind_g[x].mean()          # Global mean value
-    s = np.sqrt(ind_g[x].var())  # Global stddev
-    g1[np.isnan(ind_g)] = m      # Plug-in mean values for missing data
-    if s == 0:
-        g1 = g1 - m              # Subtract the mean
+    g = np.copy(ind_g)              # copy to avoid side effects
+    missing = np.isnan(g)
+    values = g[True - missing]
+    mean = values.mean()            # Global mean value
+    stddev = np.sqrt(values.var())  # Global stddev
+    g[missing] = mean               # Plug-in mean values for missing data
+    if stddev == 0:
+        g = g - mean                # Subtract the mean
     else:
-        g1 = (g1 - m) / s        # Normalize the deviation
-    return g1
+        g = (g - mean) / stddev     # Normalize the deviation
+    return g
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index f0473f99..035f31e8 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -414,6 +414,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
     info("genotype 2D matrix m (snps) is: %d" % (m))
     assert m>n, "n should be larger than m (snps>inds)"
     keep = []
+    mprint("G (before old normalize)",genotype_matrix)
     for counter in range(m):
         #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter]))
         #Checks if any values in column are not numbers
@@ -435,10 +436,10 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
         progress('kinship_old normalize genotype',counter,m)
         
     genotype_matrix = genotype_matrix[:,keep]
-    mprint("After kinship (old) genotype_matrix: ", genotype_matrix)
-    # kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m)
-    # return kinship_matrix,genotype_matrix
-    return kinship_full(genotype_matrix.T,uses),genotype_matrix
+    mprint("G (after old normalize)",genotype_matrix.T)
+    kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m)
+    return kinship_matrix,genotype_matrix
+    # return kinship_full(genotype_matrix.T,uses),genotype_matrix
 
 def GWAS(pheno_vector,
          genotype_matrix,
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 88e2a033..fc7a4b9d 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -106,6 +106,8 @@ if options.geno:
 if cmd == 'redis_new':
     # The main difference between redis_new and redis is that missing
     # phenotypes are handled by the first
+    if options.remove_missing_phenotypes:
+        raise Exception('Can not use --remove-missing-phenotypes with LMM2')
     Y = y
     G = g
     print "Original G",G.shape, "\n", G
author	Pjotr Prins	2015-03-30 13:01:22 +0200
committer	Pjotr Prins	2015-03-30 13:01:22 +0200
commit	8b88be4f48baa6cd0cc3c37a851144d5b1dc24af (patch)
tree	cb35abd3dd57a93aa02b10eaf3cac01c46d41ff7 /wqflask
parent	6fc112431c0edb0ecae6cd5fa45716c349094a7f (diff)
download	genenetwork2-8b88be4f48baa6cd0cc3c37a851144d5b1dc24af.tar.gz