Replace missing values with MAF

author: Pjotr Prins 2015-03-14 10:56:50 +0300
committer: Pjotr Prins 2015-03-14 10:56:50 +0300
commit: 30cc25c26263f10aa19548c70a18178f2b3ca59e (patch)
tree: 3d0d1bc649bdb66de604bbfe17d36446f204da16 /wqflask
parent: 5dfb2fdc0a739d86fc1b6c0230d43dcb05428092 (diff)
download: genenetwork2-30cc25c26263f10aa19548c70a18178f2b3ca59e.tar.gz
3 files changed, 42 insertions, 18 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
index 19b0957b..e2457f6b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -17,20 +17,35 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import numpy as np
+from collections import Counter
 
-def normalizeGenotype(g):
+def replace_missing_with_MAF(snp_g):
+    """
+    Replace the missing genotype with the minor allele frequency (MAF)
+    in the snp row
+    """
+    g1 = np.copy(snp_g)
+    cnt = Counter(g1)
+    print cnt
+    min_val = min(cnt.itervalues())
+    print "min_val=",min_val
+    l = [k for k, v in cnt.iteritems() if v == min_val and not np.isnan(k)]
+    print "l=",l[0]
+    return [l[0] if np.isnan(snp) else snp for snp in g1] 
+    
+def normalize(ind_g):
     """
     Run for every SNP list (for one individual) and return
     normalized SNP genotype values with missing data filled in
     """
-    g1 = np.copy(g)          # avoid side effects
-    x = True - np.isnan(g)   # Matrix of True/False
-    m = g[x].mean()          # Global mean value
-    s = np.sqrt(g[x].var())  # Global stddev
-    g1[np.isnan(g)] = m      # Plug-in mean values for missing data
+    g1 = np.copy(ind_g)          # avoid side effects
+    x = True - np.isnan(ind_g)   # Matrix of True/False
+    m = ind_g[x].mean()          # Global mean value
+    s = np.sqrt(ind_g[x].var())  # Global stddev
+    g1[np.isnan(ind_g)] = m      # Plug-in mean values for missing data
     if s == 0:
-        g1 = g1 - m          # Subtract the mean
+        g1 = g1 - m              # Subtract the mean
     else:
-        g1 = (g1 - m) / s    # Normalize the deviation
+        g1 = (g1 - m) / s        # Normalize the deviation
     return g1
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
index c22da761..bb620052 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -19,7 +19,7 @@
 import sys
 import numpy as np
 
-def removeMissingPhenotypes(y,g,verbose=False):
+def remove_missing(y,g,verbose=False):
     """
     Remove missing data from matrices, make sure the genotype data has
     individuals as rows
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 35f6e9a9..ffe25fcf 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -23,8 +23,8 @@ import tsvreader
 import numpy as np
 from lmm import gn2_load_redis, calculate_kinship
 from kinship import kinship, kinship_full
-from genotype import normalizeGenotype
-from phenotype import removeMissingPhenotypes
+import genotype
+import phenotype
 
 usage = """
 python runlmm.py [options] command
@@ -51,6 +51,9 @@ parser.add_option("--pheno",dest="pheno",
                   help="Phenotype file format 1.0")
 parser.add_option("--geno",dest="geno",
                   help="Genotype file format 1.0")
+parser.add_option("--maf-normalization",
+                  action="store_true", dest="maf_normalization", default=False,
+                  help="Apply MAF genotype normalization")
 parser.add_option("--skip-genotype-normalization",
                   action="store_true", dest="skip_genotype_normalization", default=False,
                   help="Skip genotype normalization")
@@ -97,10 +100,10 @@ if cmd == 'redis':
     # Emulating the redis setup of GN2
     gn = []
     for ind_g in g:
-        gn.append( normalizeGenotype(ind_g) )
+        gn.append( genotype.normalize(ind_g) )
     gnt = np.array(gn).T
     if y:
-        Y,G = removeMissingPhenotypes(y,gnt,options.verbose)
+        Y,G = phenotype.remove_missing(y,gnt,options.verbose)
         print "G",G.shape,G
     else:
         G = gnt
@@ -111,14 +114,20 @@ if cmd == 'redis':
     print round(ps[-1],4)
     assert(options.testing and round(ps[-1],4)==0.3461)
 elif cmd == 'kinship':
-    gn = []
+    G = g
+    print G.shape, "\n", G
+    if options.maf_normalization:
+        g1 = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+        print "MAF: ",g1
+    sys.exit()
     for ind_g in g:
         if len(gn)>=8000: break
         if options.skip_genotype_normalization:
-          gn.append(ind_g)
+            gn.append(ind_g)
         else:
-            gn.append( normalizeGenotype(ind_g) )
-    G = np.array(gn)
+            gn.append( genotype.normalize(ind_g) )
+        G = np.array(gn)
+            
     print G.shape, "\n", G
     K = kinship_full(G,options)
     print "first Kinship method",K.shape,"\n",K
@@ -128,7 +137,7 @@ elif cmd == 'kinship':
     print "third Kinship method",K3.shape,"\n",K3
     sys.exit(1)
     gnt = np.array(gn).T
-    Y,g = removeMissingPhenotypes(y,gnt,options.verbose)
+    Y,g = remove_missing_phenotypes(y,gnt,options.verbose)
     G = g
     print G.shape,G
     K = calculate_kinship(np.copy(G),temp_data=None,is_testing=options.testing)
author	Pjotr Prins	2015-03-14 10:56:50 +0300
committer	Pjotr Prins	2015-03-14 10:56:50 +0300
commit	30cc25c26263f10aa19548c70a18178f2b3ca59e (patch)
tree	3d0d1bc649bdb66de604bbfe17d36446f204da16 /wqflask
parent	5dfb2fdc0a739d86fc1b6c0230d43dcb05428092 (diff)
download	genenetwork2-30cc25c26263f10aa19548c70a18178f2b3ca59e.tar.gz