From 30cc25c26263f10aa19548c70a18178f2b3ca59e Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 10:56:50 +0300 Subject: Replace missing values with MAF --- wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 31 +++++++++++++++++++++-------- wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 2 +- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 27 ++++++++++++++++--------- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py index 19b0957b..e2457f6b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -17,20 +17,35 @@ # along with this program. If not, see . import numpy as np +from collections import Counter -def normalizeGenotype(g): +def replace_missing_with_MAF(snp_g): + """ + Replace the missing genotype with the minor allele frequency (MAF) + in the snp row + """ + g1 = np.copy(snp_g) + cnt = Counter(g1) + print cnt + min_val = min(cnt.itervalues()) + print "min_val=",min_val + l = [k for k, v in cnt.iteritems() if v == min_val and not np.isnan(k)] + print "l=",l[0] + return [l[0] if np.isnan(snp) else snp for snp in g1] + +def normalize(ind_g): """ Run for every SNP list (for one individual) and return normalized SNP genotype values with missing data filled in """ - g1 = np.copy(g) # avoid side effects - x = True - np.isnan(g) # Matrix of True/False - m = g[x].mean() # Global mean value - s = np.sqrt(g[x].var()) # Global stddev - g1[np.isnan(g)] = m # Plug-in mean values for missing data + g1 = np.copy(ind_g) # avoid side effects + x = True - np.isnan(ind_g) # Matrix of True/False + m = ind_g[x].mean() # Global mean value + s = np.sqrt(ind_g[x].var()) # Global stddev + g1[np.isnan(ind_g)] = m # Plug-in mean values for missing data if s == 0: - g1 = g1 - m # Subtract the mean + g1 = g1 - m # Subtract the mean else: - g1 = (g1 - m) / s # Normalize the deviation + g1 = (g1 - m) / s # Normalize the deviation return g1 diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py index c22da761..bb620052 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -19,7 +19,7 @@ import sys import numpy as np -def removeMissingPhenotypes(y,g,verbose=False): +def remove_missing(y,g,verbose=False): """ Remove missing data from matrices, make sure the genotype data has individuals as rows diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 35f6e9a9..ffe25fcf 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -23,8 +23,8 @@ import tsvreader import numpy as np from lmm import gn2_load_redis, calculate_kinship from kinship import kinship, kinship_full -from genotype import normalizeGenotype -from phenotype import removeMissingPhenotypes +import genotype +import phenotype usage = """ python runlmm.py [options] command @@ -51,6 +51,9 @@ parser.add_option("--pheno",dest="pheno", help="Phenotype file format 1.0") parser.add_option("--geno",dest="geno", help="Genotype file format 1.0") +parser.add_option("--maf-normalization", + action="store_true", dest="maf_normalization", default=False, + help="Apply MAF genotype normalization") parser.add_option("--skip-genotype-normalization", action="store_true", dest="skip_genotype_normalization", default=False, help="Skip genotype normalization") @@ -97,10 +100,10 @@ if cmd == 'redis': # Emulating the redis setup of GN2 gn = [] for ind_g in g: - gn.append( normalizeGenotype(ind_g) ) + gn.append( genotype.normalize(ind_g) ) gnt = np.array(gn).T if y: - Y,G = removeMissingPhenotypes(y,gnt,options.verbose) + Y,G = phenotype.remove_missing(y,gnt,options.verbose) print "G",G.shape,G else: G = gnt @@ -111,14 +114,20 @@ if cmd == 'redis': print round(ps[-1],4) assert(options.testing and round(ps[-1],4)==0.3461) elif cmd == 'kinship': - gn = [] + G = g + print G.shape, "\n", G + if options.maf_normalization: + g1 = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) + print "MAF: ",g1 + sys.exit() for ind_g in g: if len(gn)>=8000: break if options.skip_genotype_normalization: - gn.append(ind_g) + gn.append(ind_g) else: - gn.append( normalizeGenotype(ind_g) ) - G = np.array(gn) + gn.append( genotype.normalize(ind_g) ) + G = np.array(gn) + print G.shape, "\n", G K = kinship_full(G,options) print "first Kinship method",K.shape,"\n",K @@ -128,7 +137,7 @@ elif cmd == 'kinship': print "third Kinship method",K3.shape,"\n",K3 sys.exit(1) gnt = np.array(gn).T - Y,g = removeMissingPhenotypes(y,gnt,options.verbose) + Y,g = remove_missing_phenotypes(y,gnt,options.verbose) G = g print G.shape,G K = calculate_kinship(np.copy(G),temp_data=None,is_testing=options.testing) -- cgit v1.2.3