diff options
author | Pjotr Prins | 2015-03-13 11:48:58 +0300 |
---|---|---|
committer | Pjotr Prins | 2015-03-13 11:48:58 +0300 |
commit | 7c7ace427c8dfa81d3448250d9697b4e00418d34 (patch) | |
tree | b3646dd70619ba4be9099406fe0f4c57fec32c43 | |
parent | 7ec5303a6eca758236103e191f574bb5480a0ba6 (diff) | |
download | genenetwork2-7c7ace427c8dfa81d3448250d9697b4e00418d34.tar.gz |
& Refactoring
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 36 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 40 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 47 |
3 files changed, 85 insertions, 38 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py new file mode 100644 index 00000000..19b0957b --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -0,0 +1,36 @@ +# Genotype routines + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import numpy as np + +def normalizeGenotype(g): + """ + Run for every SNP list (for one individual) and return + normalized SNP genotype values with missing data filled in + """ + g1 = np.copy(g) # avoid side effects + x = True - np.isnan(g) # Matrix of True/False + m = g[x].mean() # Global mean value + s = np.sqrt(g[x].var()) # Global stddev + g1[np.isnan(g)] = m # Plug-in mean values for missing data + if s == 0: + g1 = g1 - m # Subtract the mean + else: + g1 = (g1 - m) / s # Normalize the deviation + return g1 + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py new file mode 100644 index 00000000..c22da761 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -0,0 +1,40 @@ +# Phenotype routines + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import sys +import numpy as np + +def removeMissingPhenotypes(y,g,verbose=False): + """ + Remove missing data from matrices, make sure the genotype data has + individuals as rows + """ + assert(y!=None) + assert(y.shape[0] == g.shape[0]) + + y1 = y + g1 = g + v = np.isnan(y) + keep = True - v + if v.sum(): + if verbose: + sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) + y1 = y[keep] + g1 = g[keep,:] + return y1,g1 + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 94533ecb..16e88419 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -23,6 +23,8 @@ import tsvreader import numpy as np from lmm import gn2_load_redis from kinship import kinship +from genotype import normalizeGenotype +from phenotype import removeMissingPhenotypes usage = """ python runlmm.py [options] command @@ -87,47 +89,16 @@ if options.pheno: if options.geno: g = tsvreader.geno(options.geno) print g.shape - -def normalizeGenotype(g1): - g = np.copy(g1) # avoid side effects - # Run for every SNP list (num individuals) - x = True - np.isnan(g) # Matrix of True/False - m = g[x].mean() # Global mean value - # print m - s = np.sqrt(g[x].var()) # Global stddev - # print s - g[np.isnan(g)] = m # Plug-in mean values for missing - if s == 0: - g = g - m # Subtract the mean - else: - g = (g - m) / s # Normalize the deviation - return g - -gn = [] -for snp in g: - gn.append( normalizeGenotype(snp) ) - -print("Before",g) - -gT = np.array(gn).T -print("After",gT) -G = np.array(gT) -print "G shape",G.shape - -# Remove individuals with missing phenotypes -if y != None: - v = np.isnan(y) - keep = True - v - if v.sum(): - if options.verbose: sys.stderr.write("runlmm.py: Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) - y = y[keep] - G = G[keep,:] - if k != None: k = k[keep,:][:,keep] if cmd == 'redis': # Emulating the redis setup of GN2 - print G - ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) + gn = [] + for ind_g in g: + gn.append( normalizeGenotype(ind_g) ) + gnt = np.array(gn).T + Y,G = removeMissingPhenotypes(y,gnt,options.verbose) + print "G",G.shape,G + ps, ts = gn2_load_redis('testrun','other',np.array(k),Y,G,options.testing) print np.array(ps) print round(ps[0],4) assert(options.testing and round(ps[0],4)==0.7262) |