From e492c9d3812ee2f6d6c7b640e216af09710b91d7 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 11 Mar 2015 14:36:36 +0300 Subject: Adding multi-core kinship.py --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 136 ++++++++++++++++++++++++++++ wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 4 + wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py | 2 +- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 59 +++++++----- 4 files changed, 177 insertions(+), 24 deletions(-) create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/kinship.py diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py new file mode 100644 index 00000000..dc2d717d --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -0,0 +1,136 @@ +# pylmm kinship calculation +# +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +# env PYTHONPATH=$pylmm_lib_path:./lib python $pylmm_lib_path/runlmm.py --pheno test.pheno --geno test9000.geno kinship --test + +import sys +import os +import numpy as np +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +from optmatrix import matrix_initialize, matrixMultT + + +def compute_W(job,G,n,compute_size): + """ + Read 1000 SNPs at a time into matrix and return the result + """ + W = np.ones((n,compute_size)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) + for j in range(0,compute_size): + row = job*compute_size + j + if row >= compute_size: + W = W[:,range(0,j)] + break + snp = G[job*compute_size+j] + # print snp.shape,snp + if snp.var() == 0: + continue + W[:,j] = snp # set row to list of SNPs + return W + +def compute_matrixMult(job,W,q = None): + """ + Compute Kinship(W)*j + + For every set of SNPs matrixMult is used to multiply matrices T(W)*W + """ + res = matrixMultT(W) + if not q: q=compute_matrixMult.q + q.put([job,res]) + return job + +def f_init(q): + compute_matrixMult.q = q + +# Calculate the kinship matrix from G (SNPs as rows!), returns K +# +def kinship(G,options): + numThreads = None + if options.numThreads: + numThreads = int(options.numThreads) + options.computeSize = 1000 + matrix_initialize(options.useBLAS) + + sys.stderr.write(str(G.shape)+"\n") + n = G.shape[1] # inds + inds = n + m = G.shape[0] # snps + snps = m + sys.stderr.write(str(m)+" SNPs\n") + + q = mp.Queue() + p = mp.Pool(numThreads, f_init, [q]) + iterations = snps/options.computeSize+1 + if options.testing: + iterations = 8 + # jobs = range(0,8) # range(0,iterations) + + results = [] + + K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals + + completed = 0 + for job in range(iterations): + if options.verbose: + sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) + W = compute_W(job,G,n,options.computeSize) + if numThreads == 1: + compute_matrixMult(job,W,q) + j,x = q.get() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + else: + results.append(p.apply_async(compute_matrixMult, (job,W))) + # Do we have a result? + try: + j,x = q.get_nowait() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + except Queue.Empty: + pass + + if numThreads == None or numThreads > 1: + for job in range(len(results)-completed): + j,x = q.get(True,15) + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + + print K.shape,K + K = K / float(snps) + outFile = 'runtest.kin' + if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) + np.savetxt(outFile,K) + + if options.saveKvaKve: + if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") + Kva,Kve = linalg.eigh(K) + if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + np.savetxt(outFile+".kva",Kva) + np.savetxt(outFile+".kve",Kve) + + + + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 58d7593d..65b989a8 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -406,6 +406,8 @@ def GWAS(pheno_vector, v = np.isnan(pheno_vector) if v.sum(): keep = True - v + print(pheno_vector.shape,pheno_vector) + print(keep.shape,keep) pheno_vector = pheno_vector[keep] #genotype_matrix = genotype_matrix[keep,:] #covariate_matrix = covariate_matrix[keep,:] @@ -437,6 +439,8 @@ def GWAS(pheno_vector, p_values.append(0) t_statistics.append(np.nan) continue + + print(genotype_matrix.shape,pheno_vector.shape,keep.shape) pheno_vector = pheno_vector[keep] covariate_matrix = covariate_matrix[keep,:] diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py index abb72081..5c71db6a 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py @@ -9,7 +9,7 @@ from scipy import stats useNumpy = None hasBLAS = None -def matrix_initialize(useBLAS): +def matrix_initialize(useBLAS=True): global useNumpy # module based variable if useBLAS and useNumpy == None: print get_info('blas_opt') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 738268be..bd6c55fc 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -22,17 +22,19 @@ import sys import tsvreader import numpy as np from lmm import gn2_load_redis +from kinship import kinship usage = """ python runlmm.py [options] command runlmm.py processing multiplexer reads standardised input formats - and calls the different routines + and calls the different routines (writes to stdout) Current commands are: parse : only parse input files redis : use Redis to call into GN2 + kinship : calculate (new) kinship matrix try --help for more information """ @@ -50,6 +52,13 @@ parser.add_option("--geno",dest="geno", parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") +parser.add_option("--blas", action="store_true", default=False, dest="useBLAS", help="Use BLAS instead of numpy matrix multiplication") +parser.add_option("-t", "--threads", + type="int", dest="numThreads", + help="Threads to use") +parser.add_option("--saveKvaKve", + action="store_true", dest="saveKvaKve", default=False, + help="Testing mode") parser.add_option("--test", action="store_true", dest="testing", default=False, help="Testing mode") @@ -63,6 +72,10 @@ if len(args) != 1: cmd = args[0] print "Command: ",cmd +k = None +y = None +g = None + if options.kinship: k = tsvreader.kinship(options.kinship) print k.shape @@ -74,7 +87,7 @@ if options.pheno: if options.geno: g = tsvreader.geno(options.geno) print g.shape - + def normalizeGenotype(G): # Run for every SNP list (num individuals) x = True - np.isnan(G) # Matrix of True/False @@ -89,32 +102,32 @@ def normalizeGenotype(G): G = (G - m) / s # Normalize the deviation return G -# g = g.reshape((1, -1))[0] print("Before",g) -gn = [] -for snp in g: - gn.append( normalizeGenotype(snp) ) - -gn = g -gn = np.array(gn) -print("After1",gn) -gnT = gn.T -print("After",gnT) -# G = gnT -G = gnT + +gT = g.T +print("After",gT) +G = np.array(gT) print "G shape",G.shape -# sys.exit(1) -# assert(G[0,0]==-2.25726341) # Remove individuals with missing phenotypes -v = np.isnan(y) -keep = True - v -if v.sum(): - if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) - y = y[keep] - G = G[keep,:] - k = k[keep,:][:,keep] +if y != None: + v = np.isnan(y) + keep = True - v + if v.sum(): + if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) + y = y[keep] + G = G[keep,:] + if k != None: k = k[keep,:][:,keep] if cmd == 'redis': ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) print np.array(ps) +elif cmd == 'kinship': + gn = [] + for snp in G.T: + gn.append( normalizeGenotype(snp) ) + G2 = np.array(gn) + print G2.shape,G2 + K = kinship(G2,options) +else: + print "Doing nothing" -- cgit v1.2.3 From 4bb1113a9c23882ad91634c7cb77537e47a09713 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 11 Mar 2015 14:44:21 +0300 Subject: runlmm.py: message --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index bd6c55fc..5b777580 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -114,7 +114,7 @@ if y != None: v = np.isnan(y) keep = True - v if v.sum(): - if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) + if options.verbose: sys.stderr.write("runlmm.py: Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) y = y[keep] G = G[keep,:] if k != None: k = k[keep,:][:,keep] -- cgit v1.2.3 From d7f25679f0369f9138300e23f49a958239e2a379 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 12 Mar 2015 11:09:08 +0300 Subject: process_plink.py: remove unused file --- wqflask/wqflask/my_pylmm/pyLMM/process_plink.py | 127 ------------------------ 1 file changed, 127 deletions(-) delete mode 100644 wqflask/wqflask/my_pylmm/pyLMM/process_plink.py diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py deleted file mode 100644 index e47c18e1..00000000 --- a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import absolute_import, print_function, division - -import sys -sys.path.append("../../..") - -print("sys.path: ", sys.path) - -import numpy as np - -import zlib -import cPickle as pickle -import redis -Redis = redis.Redis() - -import lmm - -class ProcessLmmChunk(object): - - def __init__(self): - self.get_snp_data() - self.get_lmm_vars() - - keep = self.trim_matrices() - - self.do_association(keep) - - print("p_values is: ", self.p_values) - - def get_snp_data(self): - plink_pickled = zlib.decompress(Redis.lpop("plink_inputs")) - plink_data = pickle.loads(plink_pickled) - - self.snps = np.array(plink_data['result']) - self.identifier = plink_data['identifier'] - - def get_lmm_vars(self): - lmm_vars_pickled = Redis.hget(self.identifier, "lmm_vars") - lmm_vars = pickle.loads(lmm_vars_pickled) - - self.pheno_vector = np.array(lmm_vars['pheno_vector']) - self.covariate_matrix = np.array(lmm_vars['covariate_matrix']) - self.kinship_matrix = np.array(lmm_vars['kinship_matrix']) - - def trim_matrices(self): - v = np.isnan(self.pheno_vector) - keep = True - v - keep = keep.reshape((len(keep),)) - - if v.sum(): - self.pheno_vector = self.pheno_vector[keep] - self.covariate_matrix = self.covariate_matrix[keep,:] - self.kinship_matrix = self.kinship_matrix[keep,:][:,keep] - - return keep - - def do_association(self, keep): - n = self.kinship_matrix.shape[0] - lmm_ob = lmm.LMM(self.pheno_vector, - self.kinship_matrix, - self.covariate_matrix) - lmm_ob.fit() - - self.p_values = [] - - for snp in self.snps: - snp = snp[0] - p_value, t_stat = lmm.human_association(snp, - n, - keep, - lmm_ob, - self.pheno_vector, - self.covariate_matrix, - self.kinship_matrix, - False) - - self.p_values.append(p_value) - - -#plink_pickled = zlib.decompress(Redis.lpop("plink_inputs")) -# -#plink_data = pickle.loads(plink_pickled) -#result = np.array(plink_data['result']) -#print("snp size is: ", result.shape) -#identifier = plink_data['identifier'] -# -#lmm_vars_pickled = Redis.hget(identifier, "lmm_vars") -#lmm_vars = pickle.loads(lmm_vars_pickled) -# -#pheno_vector = np.array(lmm_vars['pheno_vector']) -#covariate_matrix = np.array(lmm_vars['covariate_matrix']) -#kinship_matrix = np.array(lmm_vars['kinship_matrix']) -# -#v = np.isnan(pheno_vector) -#keep = True - v -#keep = keep.reshape((len(keep),)) -#print("keep is: ", keep) -# -#if v.sum(): -# pheno_vector = pheno_vector[keep] -# covariate_matrix = covariate_matrix[keep,:] -# kinship_matrix = kinship_matrix[keep,:][:,keep] -# -#n = kinship_matrix.shape[0] -#print("n is: ", n) -#lmm_ob = lmm.LMM(pheno_vector, -# kinship_matrix, -# covariate_matrix) -#lmm_ob.fit() -# -#p_values = [] -# -#for snp in result: -# snp = snp[0] -# p_value, t_stat = lmm.human_association(snp, -# n, -# keep, -# lmm_ob, -# pheno_vector, -# covariate_matrix, -# kinship_matrix, -# False) -# -# p_values.append(p_value) - - - - -- cgit v1.2.3 From 3fb9db1d8b1d975b2af1d56c8ea0dbd6b39c9001 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 12 Mar 2015 12:02:17 +0300 Subject: Debug info --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 65b989a8..48a931c5 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -143,6 +143,7 @@ def run_human(pheno_vector, timestamp = datetime.datetime.utcnow().isoformat() + # Pickle chunks of input SNPs (from Plink interator) and compress them #print("Starting adding loop") for part, result in enumerate(results): #data = pickle.dumps(result, pickle.HIGHEST_PROTOCOL) @@ -517,7 +518,7 @@ class LMM: Kve = [] self.nonmissing = x - print("this K is:", pf(K)) + print("this K is:", K.shape, pf(K)) if len(Kva) == 0 or len(Kve) == 0: if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) @@ -529,8 +530,8 @@ class LMM: self.K = K self.Kva = Kva self.Kve = Kve - print("self.Kva is: ", pf(self.Kva)) - print("self.Kve is: ", pf(self.Kve)) + print("self.Kva is: ", self.Kva.shape, pf(self.Kva)) + print("self.Kve is: ", self.Kve.shape, pf(self.Kve)) self.Y = Y self.X0 = X0 self.N = self.K.shape[0] -- cgit v1.2.3 From bd8e4f30cd1eb0a41399d8d94b516845da3c32a0 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 12 Mar 2015 17:10:54 +0300 Subject: Check G --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 5b777580..0b727383 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -120,6 +120,7 @@ if y != None: if k != None: k = k[keep,:][:,keep] if cmd == 'redis': + print G ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) print np.array(ps) elif cmd == 'kinship': -- cgit v1.2.3 From 494edec995a38279eb1da21d520b2cb249eb1079 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 12 Mar 2015 17:18:38 +0300 Subject: weird fix for lmm result - Python weirdness --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 0b727383..e64ca0e6 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -102,6 +102,12 @@ def normalizeGenotype(G): G = (G - m) / s # Normalize the deviation return G +gn = [] +for snp in g: + gn.append( normalizeGenotype(snp) ) + +gn = g + print("Before",g) gT = g.T -- cgit v1.2.3 From fa4227614042312a54a5d404af4235efc077ab3a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 12 Mar 2015 21:05:51 +0300 Subject: lmm: use deep-copy to avoid side-effects --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index e64ca0e6..d22e2912 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -88,29 +88,28 @@ if options.geno: g = tsvreader.geno(options.geno) print g.shape -def normalizeGenotype(G): +def normalizeGenotype(g1): + g = np.copy(g1) # avoid side effects # Run for every SNP list (num individuals) - x = True - np.isnan(G) # Matrix of True/False - m = G[x].mean() # Global mean value + x = True - np.isnan(g) # Matrix of True/False + m = g[x].mean() # Global mean value # print m - s = np.sqrt(G[x].var()) # Global stddev + s = np.sqrt(g[x].var()) # Global stddev # print s - G[np.isnan(G)] = m # Plug-in mean values for missing + g[np.isnan(g)] = m # Plug-in mean values for missing if s == 0: - G = G - m # Subtract the mean + g = g - m # Subtract the mean else: - G = (G - m) / s # Normalize the deviation - return G + g = (g - m) / s # Normalize the deviation + return g gn = [] for snp in g: gn.append( normalizeGenotype(snp) ) -gn = g - print("Before",g) -gT = g.T +gT = np.array(gn).T print("After",gT) G = np.array(gT) print "G shape",G.shape @@ -129,6 +128,7 @@ if cmd == 'redis': print G ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) print np.array(ps) + assert(options.testing and ps[0]==0.726205761108) elif cmd == 'kinship': gn = [] for snp in G.T: -- cgit v1.2.3 From 7ec5303a6eca758236103e191f574bb5480a0ba6 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 13 Mar 2015 09:57:49 +0300 Subject: & runlmm.py: added assertions for default run --- wqflask/wqflask/my_pylmm/pyLMM/chunks.py | 2 +- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py index abeffee7..9565fb96 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py @@ -93,4 +93,4 @@ if __name__ == '__main__': _main() print("\nConfirming doctests...") import doctest - doctest.testmod() \ No newline at end of file + doctest.testmod() diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index d22e2912..94533ecb 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -125,10 +125,14 @@ if y != None: if k != None: k = k[keep,:][:,keep] if cmd == 'redis': + # Emulating the redis setup of GN2 print G ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) print np.array(ps) - assert(options.testing and ps[0]==0.726205761108) + print round(ps[0],4) + assert(options.testing and round(ps[0],4)==0.7262) + print round(ps[-1],4) + assert(options.testing and round(ps[-1],4)==0.3461) elif cmd == 'kinship': gn = [] for snp in G.T: -- cgit v1.2.3 From 7c7ace427c8dfa81d3448250d9697b4e00418d34 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 13 Mar 2015 11:48:58 +0300 Subject: & Refactoring --- wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 36 ++++++++++++++++++++++ wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 40 ++++++++++++++++++++++++ wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 47 ++++++----------------------- 3 files changed, 85 insertions(+), 38 deletions(-) create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/genotype.py create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/phenotype.py diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py new file mode 100644 index 00000000..19b0957b --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -0,0 +1,36 @@ +# Genotype routines + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import numpy as np + +def normalizeGenotype(g): + """ + Run for every SNP list (for one individual) and return + normalized SNP genotype values with missing data filled in + """ + g1 = np.copy(g) # avoid side effects + x = True - np.isnan(g) # Matrix of True/False + m = g[x].mean() # Global mean value + s = np.sqrt(g[x].var()) # Global stddev + g1[np.isnan(g)] = m # Plug-in mean values for missing data + if s == 0: + g1 = g1 - m # Subtract the mean + else: + g1 = (g1 - m) / s # Normalize the deviation + return g1 + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py new file mode 100644 index 00000000..c22da761 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -0,0 +1,40 @@ +# Phenotype routines + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +import sys +import numpy as np + +def removeMissingPhenotypes(y,g,verbose=False): + """ + Remove missing data from matrices, make sure the genotype data has + individuals as rows + """ + assert(y!=None) + assert(y.shape[0] == g.shape[0]) + + y1 = y + g1 = g + v = np.isnan(y) + keep = True - v + if v.sum(): + if verbose: + sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) + y1 = y[keep] + g1 = g[keep,:] + return y1,g1 + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 94533ecb..16e88419 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -23,6 +23,8 @@ import tsvreader import numpy as np from lmm import gn2_load_redis from kinship import kinship +from genotype import normalizeGenotype +from phenotype import removeMissingPhenotypes usage = """ python runlmm.py [options] command @@ -87,47 +89,16 @@ if options.pheno: if options.geno: g = tsvreader.geno(options.geno) print g.shape - -def normalizeGenotype(g1): - g = np.copy(g1) # avoid side effects - # Run for every SNP list (num individuals) - x = True - np.isnan(g) # Matrix of True/False - m = g[x].mean() # Global mean value - # print m - s = np.sqrt(g[x].var()) # Global stddev - # print s - g[np.isnan(g)] = m # Plug-in mean values for missing - if s == 0: - g = g - m # Subtract the mean - else: - g = (g - m) / s # Normalize the deviation - return g - -gn = [] -for snp in g: - gn.append( normalizeGenotype(snp) ) - -print("Before",g) - -gT = np.array(gn).T -print("After",gT) -G = np.array(gT) -print "G shape",G.shape - -# Remove individuals with missing phenotypes -if y != None: - v = np.isnan(y) - keep = True - v - if v.sum(): - if options.verbose: sys.stderr.write("runlmm.py: Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) - y = y[keep] - G = G[keep,:] - if k != None: k = k[keep,:][:,keep] if cmd == 'redis': # Emulating the redis setup of GN2 - print G - ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) + gn = [] + for ind_g in g: + gn.append( normalizeGenotype(ind_g) ) + gnt = np.array(gn).T + Y,G = removeMissingPhenotypes(y,gnt,options.verbose) + print "G",G.shape,G + ps, ts = gn2_load_redis('testrun','other',np.array(k),Y,G,options.testing) print np.array(ps) print round(ps[0],4) assert(options.testing and round(ps[0],4)==0.7262) -- cgit v1.2.3 From 08054314542485da15fc2d2c1903d844348a4fe3 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 13 Mar 2015 11:50:43 +0300 Subject: runlmm.py: remove extraneous conversion --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 16e88419..69a80411 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -98,7 +98,7 @@ if cmd == 'redis': gnt = np.array(gn).T Y,G = removeMissingPhenotypes(y,gnt,options.verbose) print "G",G.shape,G - ps, ts = gn2_load_redis('testrun','other',np.array(k),Y,G,options.testing) + ps, ts = gn2_load_redis('testrun','other',k,Y,G,options.testing) print np.array(ps) print round(ps[0],4) assert(options.testing and round(ps[0],4)==0.7262) -- cgit v1.2.3 From 7b47dd4a9ca1227a0df4f3e7bf2358633944de0f Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 08:39:44 +0300 Subject: kinship: working on pylmm --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 10 ++++++++-- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 5 +++-- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 26 +++++++++++++++++++------- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index dc2d717d..353784aa 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -58,6 +58,12 @@ def compute_matrixMult(job,W,q = None): def f_init(q): compute_matrixMult.q = q +def kinship_full(G,options): + print G.shape + m = np.dot(G.T,G) + m = m/G.shape[0] + return m + # Calculate the kinship matrix from G (SNPs as rows!), returns K # def kinship(G,options): @@ -118,7 +124,7 @@ def kinship(G,options): # print j,K_j[:,0] K = K + K_j - print K.shape,K + print "kiship.kinship: ",K.shape,K K = K / float(snps) outFile = 'runtest.kin' if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) @@ -130,7 +136,7 @@ def kinship(G,options): if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) np.savetxt(outFile+".kva",Kva) np.savetxt(outFile+".kve",Kve) - + return K diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 48a931c5..36c3602f 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -326,7 +326,7 @@ def matrixMult(A,B): return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) -def calculate_kinship(genotype_matrix, temp_data, is_testing=False): +def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False): """ genotype_matrix is an n x m matrix encoding SNP minor alleles. @@ -361,7 +361,8 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False): genotype_matrix[:,counter] = (genotype_matrix[:,counter] - values_mean) / np.sqrt(vr) percent_complete = int(round((counter/m)*45)) - temp_data.store("percent_complete", percent_complete) + if temp_data != None: + temp_data.store("percent_complete", percent_complete) genotype_matrix = genotype_matrix[:,keep] print("genotype_matrix: ", pf(genotype_matrix)) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 69a80411..627cc7a4 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -21,8 +21,8 @@ from optparse import OptionParser import sys import tsvreader import numpy as np -from lmm import gn2_load_redis -from kinship import kinship +from lmm import gn2_load_redis, calculate_kinship +from kinship import kinship, kinship_full from genotype import normalizeGenotype from phenotype import removeMissingPhenotypes @@ -106,10 +106,22 @@ if cmd == 'redis': assert(options.testing and round(ps[-1],4)==0.3461) elif cmd == 'kinship': gn = [] - for snp in G.T: - gn.append( normalizeGenotype(snp) ) - G2 = np.array(gn) - print G2.shape,G2 - K = kinship(G2,options) + for ind_g in g: + if len(gn)>=8000: break + gn.append( normalizeGenotype(ind_g) ) + K = kinship_full(np.array(gn),options) + print "first Kinship method",K.shape,K + K = kinship(np.array(gn),options) + print "second Kinship method",K.shape,K + sys.exit(1) + gnt = np.array(gn).T + Y,g = removeMissingPhenotypes(y,gnt,options.verbose) + G = g + print G.shape,G + K = calculate_kinship(np.copy(G),None,options) + print G.shape,G + print "first Kinship method",K.shape,K + K = kinship(G.T,options) + assert(K[0][0]==1.28) else: print "Doing nothing" -- cgit v1.2.3 From 5dfb2fdc0a739d86fc1b6c0230d43dcb05428092 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 10:21:25 +0300 Subject: Testing kinship works on small G --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 9 +++++---- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 29 +++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index 353784aa..61da68fc 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -27,16 +27,17 @@ import Queue from optmatrix import matrix_initialize, matrixMultT -def compute_W(job,G,n,compute_size): +def compute_W(job,G,n,snps,compute_size): """ Read 1000 SNPs at a time into matrix and return the result """ W = np.ones((n,compute_size)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) for j in range(0,compute_size): row = job*compute_size + j - if row >= compute_size: + if row >= compute_size or row>=snps: W = W[:,range(0,j)] break + # print job,compute_size,j snp = G[job*compute_size+j] # print snp.shape,snp if snp.var() == 0: @@ -79,6 +80,7 @@ def kinship(G,options): m = G.shape[0] # snps snps = m sys.stderr.write(str(m)+" SNPs\n") + assert m>n, "n should be larger than m (snps>inds)" q = mp.Queue() p = mp.Pool(numThreads, f_init, [q]) @@ -95,7 +97,7 @@ def kinship(G,options): for job in range(iterations): if options.verbose: sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) - W = compute_W(job,G,n,options.computeSize) + W = compute_W(job,G,n,snps,options.computeSize) if numThreads == 1: compute_matrixMult(job,W,q) j,x = q.get() @@ -124,7 +126,6 @@ def kinship(G,options): # print j,K_j[:,0] K = K + K_j - print "kiship.kinship: ",K.shape,K K = K / float(snps) outFile = 'runtest.kin' if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 627cc7a4..35f6e9a9 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -51,6 +51,9 @@ parser.add_option("--pheno",dest="pheno", help="Phenotype file format 1.0") parser.add_option("--geno",dest="geno", help="Genotype file format 1.0") +parser.add_option("--skip-genotype-normalization", + action="store_true", dest="skip_genotype_normalization", default=False, + help="Skip genotype normalization") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") @@ -96,8 +99,11 @@ if cmd == 'redis': for ind_g in g: gn.append( normalizeGenotype(ind_g) ) gnt = np.array(gn).T - Y,G = removeMissingPhenotypes(y,gnt,options.verbose) - print "G",G.shape,G + if y: + Y,G = removeMissingPhenotypes(y,gnt,options.verbose) + print "G",G.shape,G + else: + G = gnt ps, ts = gn2_load_redis('testrun','other',k,Y,G,options.testing) print np.array(ps) print round(ps[0],4) @@ -108,17 +114,24 @@ elif cmd == 'kinship': gn = [] for ind_g in g: if len(gn)>=8000: break - gn.append( normalizeGenotype(ind_g) ) - K = kinship_full(np.array(gn),options) - print "first Kinship method",K.shape,K - K = kinship(np.array(gn),options) - print "second Kinship method",K.shape,K + if options.skip_genotype_normalization: + gn.append(ind_g) + else: + gn.append( normalizeGenotype(ind_g) ) + G = np.array(gn) + print G.shape, "\n", G + K = kinship_full(G,options) + print "first Kinship method",K.shape,"\n",K + K2 = calculate_kinship(np.copy(G.T),None,options) + print "GN2 Kinship method",K2.shape,"\n",K2 + K3 = kinship(G,options) + print "third Kinship method",K3.shape,"\n",K3 sys.exit(1) gnt = np.array(gn).T Y,g = removeMissingPhenotypes(y,gnt,options.verbose) G = g print G.shape,G - K = calculate_kinship(np.copy(G),None,options) + K = calculate_kinship(np.copy(G),temp_data=None,is_testing=options.testing) print G.shape,G print "first Kinship method",K.shape,K K = kinship(G.T,options) -- cgit v1.2.3 From 30cc25c26263f10aa19548c70a18178f2b3ca59e Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 10:56:50 +0300 Subject: Replace missing values with MAF --- wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 31 +++++++++++++++++++++-------- wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 2 +- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 27 ++++++++++++++++--------- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py index 19b0957b..e2457f6b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -17,20 +17,35 @@ # along with this program. If not, see . import numpy as np +from collections import Counter -def normalizeGenotype(g): +def replace_missing_with_MAF(snp_g): + """ + Replace the missing genotype with the minor allele frequency (MAF) + in the snp row + """ + g1 = np.copy(snp_g) + cnt = Counter(g1) + print cnt + min_val = min(cnt.itervalues()) + print "min_val=",min_val + l = [k for k, v in cnt.iteritems() if v == min_val and not np.isnan(k)] + print "l=",l[0] + return [l[0] if np.isnan(snp) else snp for snp in g1] + +def normalize(ind_g): """ Run for every SNP list (for one individual) and return normalized SNP genotype values with missing data filled in """ - g1 = np.copy(g) # avoid side effects - x = True - np.isnan(g) # Matrix of True/False - m = g[x].mean() # Global mean value - s = np.sqrt(g[x].var()) # Global stddev - g1[np.isnan(g)] = m # Plug-in mean values for missing data + g1 = np.copy(ind_g) # avoid side effects + x = True - np.isnan(ind_g) # Matrix of True/False + m = ind_g[x].mean() # Global mean value + s = np.sqrt(ind_g[x].var()) # Global stddev + g1[np.isnan(ind_g)] = m # Plug-in mean values for missing data if s == 0: - g1 = g1 - m # Subtract the mean + g1 = g1 - m # Subtract the mean else: - g1 = (g1 - m) / s # Normalize the deviation + g1 = (g1 - m) / s # Normalize the deviation return g1 diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py index c22da761..bb620052 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -19,7 +19,7 @@ import sys import numpy as np -def removeMissingPhenotypes(y,g,verbose=False): +def remove_missing(y,g,verbose=False): """ Remove missing data from matrices, make sure the genotype data has individuals as rows diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 35f6e9a9..ffe25fcf 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -23,8 +23,8 @@ import tsvreader import numpy as np from lmm import gn2_load_redis, calculate_kinship from kinship import kinship, kinship_full -from genotype import normalizeGenotype -from phenotype import removeMissingPhenotypes +import genotype +import phenotype usage = """ python runlmm.py [options] command @@ -51,6 +51,9 @@ parser.add_option("--pheno",dest="pheno", help="Phenotype file format 1.0") parser.add_option("--geno",dest="geno", help="Genotype file format 1.0") +parser.add_option("--maf-normalization", + action="store_true", dest="maf_normalization", default=False, + help="Apply MAF genotype normalization") parser.add_option("--skip-genotype-normalization", action="store_true", dest="skip_genotype_normalization", default=False, help="Skip genotype normalization") @@ -97,10 +100,10 @@ if cmd == 'redis': # Emulating the redis setup of GN2 gn = [] for ind_g in g: - gn.append( normalizeGenotype(ind_g) ) + gn.append( genotype.normalize(ind_g) ) gnt = np.array(gn).T if y: - Y,G = removeMissingPhenotypes(y,gnt,options.verbose) + Y,G = phenotype.remove_missing(y,gnt,options.verbose) print "G",G.shape,G else: G = gnt @@ -111,14 +114,20 @@ if cmd == 'redis': print round(ps[-1],4) assert(options.testing and round(ps[-1],4)==0.3461) elif cmd == 'kinship': - gn = [] + G = g + print G.shape, "\n", G + if options.maf_normalization: + g1 = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) + print "MAF: ",g1 + sys.exit() for ind_g in g: if len(gn)>=8000: break if options.skip_genotype_normalization: - gn.append(ind_g) + gn.append(ind_g) else: - gn.append( normalizeGenotype(ind_g) ) - G = np.array(gn) + gn.append( genotype.normalize(ind_g) ) + G = np.array(gn) + print G.shape, "\n", G K = kinship_full(G,options) print "first Kinship method",K.shape,"\n",K @@ -128,7 +137,7 @@ elif cmd == 'kinship': print "third Kinship method",K3.shape,"\n",K3 sys.exit(1) gnt = np.array(gn).T - Y,g = removeMissingPhenotypes(y,gnt,options.verbose) + Y,g = remove_missing_phenotypes(y,gnt,options.verbose) G = g print G.shape,G K = calculate_kinship(np.copy(G),temp_data=None,is_testing=options.testing) -- cgit v1.2.3 From 6e6cae20d29de14ab1d0a5dc8e38ebb9162aa639 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 11:28:36 +0300 Subject: More MAF --- wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 15 +++++++-------- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 16 +++++----------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py index e2457f6b..f5d9ee8c 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -18,20 +18,19 @@ import numpy as np from collections import Counter +import operator def replace_missing_with_MAF(snp_g): """ Replace the missing genotype with the minor allele frequency (MAF) in the snp row """ - g1 = np.copy(snp_g) - cnt = Counter(g1) - print cnt - min_val = min(cnt.itervalues()) - print "min_val=",min_val - l = [k for k, v in cnt.iteritems() if v == min_val and not np.isnan(k)] - print "l=",l[0] - return [l[0] if np.isnan(snp) else snp for snp in g1] + cnt = Counter(snp_g) + tuples = sorted(cnt.items(), key=operator.itemgetter(1)) + l2 = [t for t in tuples if not np.isnan(t[0])] + maf = l2[0][0] + res = np.array([maf if np.isnan(snp) else snp for snp in snp_g]) + return res def normalize(ind_g): """ diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index ffe25fcf..0b8830d4 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -115,18 +115,12 @@ if cmd == 'redis': assert(options.testing and round(ps[-1],4)==0.3461) elif cmd == 'kinship': G = g - print G.shape, "\n", G + print "Original G",G.shape, "\n", G if options.maf_normalization: - g1 = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) - print "MAF: ",g1 - sys.exit() - for ind_g in g: - if len(gn)>=8000: break - if options.skip_genotype_normalization: - gn.append(ind_g) - else: - gn.append( genotype.normalize(ind_g) ) - G = np.array(gn) + G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) + print "MAF replacements: \n",G + if not options.skip_genotype_normalization: + G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) print G.shape, "\n", G K = kinship_full(G,options) -- cgit v1.2.3 From 2c6d1fcba1138415ecb3ca447e09d06d660af0db Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 12:14:53 +0300 Subject: Working on kinship: GN2 and simple matrix multiplicatino agree --- wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 2 +- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 4 ++++ wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 4 ++-- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 23 +++++++++++------------ 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py index f5d9ee8c..315fd824 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -23,7 +23,7 @@ import operator def replace_missing_with_MAF(snp_g): """ Replace the missing genotype with the minor allele frequency (MAF) - in the snp row + in the snp row. It is rather slow! """ cnt = Counter(snp_g) tuples = sorted(cnt.items(), key=operator.itemgetter(1)) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index 61da68fc..c1750e1d 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -61,6 +61,10 @@ def f_init(q): def kinship_full(G,options): print G.shape + m = G.shape[0] # snps + n = G.shape[1] # inds + sys.stderr.write(str(m)+" SNPs\n") + assert m>n, "n should be larger than m (snps>inds)" m = np.dot(G.T,G) m = m/G.shape[0] return m diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 36c3602f..7bf77be5 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -340,8 +340,8 @@ def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False): print("genotype 2D matrix m (snps) is:", m) keep = [] for counter in range(m): - if is_testing and counter>8: - break + # if is_testing and counter>8: + # break #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter])) #Checks if any values in column are not numbers not_number = np.isnan(genotype_matrix[:,counter]) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 0b8830d4..80478368 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -116,28 +116,27 @@ if cmd == 'redis': elif cmd == 'kinship': G = g print "Original G",G.shape, "\n", G + if y: + gnt = np.array(gn).T + Y,g = phenotype.remove_missing(y,g.T,options.verbose) + G = g.T + print "Removed missing phenotypes",G.shape, "\n", G if options.maf_normalization: G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) print "MAF replacements: \n",G if not options.skip_genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) - print G.shape, "\n", G K = kinship_full(G,options) + print "Genotype",G.shape, "\n", G print "first Kinship method",K.shape,"\n",K - K2 = calculate_kinship(np.copy(G.T),None,options) + K2 = calculate_kinship(np.copy(G.T),temp_data=None,is_testing=options.testing) + print "Genotype",G.shape, "\n", G print "GN2 Kinship method",K2.shape,"\n",K2 - K3 = kinship(G,options) + + print "Genotype",G.shape, "\n", G + K3 = kinship(np.copy(G),options) print "third Kinship method",K3.shape,"\n",K3 - sys.exit(1) - gnt = np.array(gn).T - Y,g = remove_missing_phenotypes(y,gnt,options.verbose) - G = g - print G.shape,G - K = calculate_kinship(np.copy(G),temp_data=None,is_testing=options.testing) - print G.shape,G - print "first Kinship method",K.shape,K - K = kinship(G.T,options) assert(K[0][0]==1.28) else: print "Doing nothing" -- cgit v1.2.3 From 0533dc4c053165853faa0512229eef30a174c425 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 12:29:35 +0300 Subject: Removing all references to is_testing --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 7bf77be5..87b999e8 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -255,8 +255,8 @@ def run_other(pheno_vector, genotype_matrix, restricted_max_likelihood=True, refit=False, - tempdata=None, # <---- can not be None - is_testing=False): + tempdata=None # <---- can not be None + ): """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics @@ -268,9 +268,9 @@ def run_other(pheno_vector, """ print("In run_other") - print("REML=",restricted_max_likelihood," REFIT=",refit, "TESTING=",is_testing) + print("REML=",restricted_max_likelihood," REFIT=",refit) with Bench("Calculate Kinship"): - kinship_matrix = calculate_kinship(genotype_matrix, tempdata, is_testing) + kinship_matrix = calculate_kinship(genotype_matrix, tempdata) print("kinship_matrix: ", pf(kinship_matrix)) print("kinship_matrix.shape: ", pf(kinship_matrix.shape)) @@ -326,7 +326,7 @@ def matrixMult(A,B): return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) -def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False): +def calculate_kinship(genotype_matrix, temp_data=None): """ genotype_matrix is an n x m matrix encoding SNP minor alleles. @@ -340,8 +340,6 @@ def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False): print("genotype 2D matrix m (snps) is:", m) keep = [] for counter in range(m): - # if is_testing and counter>8: - # break #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter])) #Checks if any values in column are not numbers not_number = np.isnan(genotype_matrix[:,counter]) @@ -490,7 +488,7 @@ class LMM: is not done consistently. """ - def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True,is_testing=False): + def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True): """ The constructor takes a phenotype vector or array of size n. @@ -500,7 +498,6 @@ class LMM: When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. """ - self.is_testing = True if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1) self.verbose = verbose @@ -728,7 +725,7 @@ class LMM: pl.title(title) -def gn2_redis(key,species,is_testing=False): +def gn2_redis(key,species): json_params = Redis.get(key) params = json.loads(json_params) @@ -753,8 +750,7 @@ def gn2_redis(key,species,is_testing=False): genotype_matrix = geno, restricted_max_likelihood = params['restricted_max_likelihood'], refit = params['refit'], - tempdata = tempdata, - is_testing=is_testing) + tempdata = tempdata) results_key = "pylmm:results:" + params['temp_uuid'] @@ -779,7 +775,7 @@ def gn2_main(): gn2_redis(key,species) -def gn2_load_redis(key,species,kinship,pheno,geno,is_testing): +def gn2_load_redis(key,species,kinship,pheno,geno): print("Loading Redis from parsed data") params = dict(pheno_vector = pheno.tolist(), genotype_matrix = geno.tolist(), @@ -796,7 +792,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno,is_testing): Redis.set(key, json_params) Redis.expire(key, 60*60) - return gn2_redis(key,species,is_testing) + return gn2_redis(key,species) if __name__ == '__main__': print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!") -- cgit v1.2.3 From fc42cd5910bbc021e1fd51b579b2f83a421a33b8 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 12:47:25 +0300 Subject: Allow empty K to be passed in --- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 7 ++++++- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 28 ++++++++++++++++------------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 87b999e8..d6830787 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -338,6 +338,7 @@ def calculate_kinship(genotype_matrix, temp_data=None): m = genotype_matrix.shape[1] print("genotype 2D matrix n (inds) is:", n) print("genotype 2D matrix m (snps) is:", m) + assert m>n, "n should be larger than m (snps>inds)" keep = [] for counter in range(m): #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter])) @@ -777,9 +778,13 @@ def gn2_main(): def gn2_load_redis(key,species,kinship,pheno,geno): print("Loading Redis from parsed data") + if kinship == None: + k = None + else: + k = kinship.tolist() params = dict(pheno_vector = pheno.tolist(), genotype_matrix = geno.tolist(), - kinship_matrix= kinship.tolist(), + kinship_matrix= k, restricted_max_likelihood = True, refit = False, temp_uuid = "testrun_temp_uuid", diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 80478368..7a77ad0a 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -98,16 +98,20 @@ if options.geno: if cmd == 'redis': # Emulating the redis setup of GN2 - gn = [] - for ind_g in g: - gn.append( genotype.normalize(ind_g) ) - gnt = np.array(gn).T - if y: - Y,G = phenotype.remove_missing(y,gnt,options.verbose) - print "G",G.shape,G - else: - G = gnt - ps, ts = gn2_load_redis('testrun','other',k,Y,G,options.testing) + G = g + print "Original G",G.shape, "\n", G + if y != None: + gnt = np.array(g).T + Y,g = phenotype.remove_missing(y,g.T,options.verbose) + G = g.T + print "Removed missing phenotypes",G.shape, "\n", G + if options.maf_normalization: + G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) + print "MAF replacements: \n",G + if not options.skip_genotype_normalization: + G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) + + ps, ts = gn2_load_redis('testrun','other',k,Y,G.T) print np.array(ps) print round(ps[0],4) assert(options.testing and round(ps[0],4)==0.7262) @@ -116,8 +120,8 @@ if cmd == 'redis': elif cmd == 'kinship': G = g print "Original G",G.shape, "\n", G - if y: - gnt = np.array(gn).T + if y != None: + gnt = np.array(g).T Y,g = phenotype.remove_missing(y,g.T,options.verbose) G = g.T print "Removed missing phenotypes",G.shape, "\n", G -- cgit v1.2.3 From 60476f068766371c41eb17c2ad20f4f6ce7da2d5 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 13:58:02 +0300 Subject: Adding assertions --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 7a77ad0a..8c0b73eb 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -113,10 +113,19 @@ if cmd == 'redis': ps, ts = gn2_load_redis('testrun','other',k,Y,G.T) print np.array(ps) - print round(ps[0],4) - assert(options.testing and round(ps[0],4)==0.7262) - print round(ps[-1],4) - assert(options.testing and round(ps[-1],4)==0.3461) + # Test results + p1 = round(ps[0],4) + p2 = round(ps[-1],4) + sys.stderr.write(options.geno+"\n") + if options.geno == 'data/small.geno': + assert p1==0.0708, "p1=%f" % p1 + assert p2==0.1417, "p2=%f" % p2 + if options.geno == 'data/small_na.geno': + assert p1==0.0958, "p1=%f" % p1 + assert p2==0.0435, "p2=%f" % p2 + if options.geno == 'data/test8000.geno': + assert p1==0.8984, "p1=%f" % p1 + assert p2==0.9623, "p2=%f" % p2 elif cmd == 'kinship': G = g print "Original G",G.shape, "\n", G -- cgit v1.2.3 From 3c8c9103606736abf3e891cb000dc01e8d0c8a43 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 14:17:42 +0300 Subject: Adding tests --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 8c0b73eb..21d9240b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -143,13 +143,24 @@ elif cmd == 'kinship': K = kinship_full(G,options) print "Genotype",G.shape, "\n", G print "first Kinship method",K.shape,"\n",K - K2 = calculate_kinship(np.copy(G.T),temp_data=None,is_testing=options.testing) + K2 = calculate_kinship(np.copy(G.T),temp_data=None) print "Genotype",G.shape, "\n", G print "GN2 Kinship method",K2.shape,"\n",K2 print "Genotype",G.shape, "\n", G K3 = kinship(np.copy(G),options) print "third Kinship method",K3.shape,"\n",K3 - assert(K[0][0]==1.28) + sys.stderr.write(options.geno+"\n") + k1 = round(K[0][0],4) + k2 = round(K2[0][0],4) + k3 = round(K3[0][0],4) + if options.geno == 'data/small.geno': + assert k1==0.7939, "k1=%f" % k1 + assert k2==0.7939, "k1=%f" % k1 + assert k3==0.7939, "k1=%f" % k1 + if options.geno == 'data/small_na.geno': + assert k1==0.7172, "k1=%f" % k1 + assert k2==0.7172, "k1=%f" % k1 + assert k3==0.7172, "k1=%f" % k1 else: print "Doing nothing" -- cgit v1.2.3 From 24b44a2de4fb47c2ed6a6a3def3acce1c06bebf4 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 15:26:07 +0300 Subject: Conditional run of K calculation --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 2 +- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index c1750e1d..1dca1c57 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -59,7 +59,7 @@ def compute_matrixMult(job,W,q = None): def f_init(q): compute_matrixMult.q = q -def kinship_full(G,options): +def kinship_full(G): print G.shape m = G.shape[0] # snps n = G.shape[1] # inds diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 21d9240b..26485136 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -139,20 +139,21 @@ elif cmd == 'kinship': print "MAF replacements: \n",G if not options.skip_genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) - - K = kinship_full(G,options) - print "Genotype",G.shape, "\n", G - print "first Kinship method",K.shape,"\n",K - K2 = calculate_kinship(np.copy(G.T),temp_data=None) - print "Genotype",G.shape, "\n", G - print "GN2 Kinship method",K2.shape,"\n",K2 + if True: + K = kinship_full(G) + print "Genotype",G.shape, "\n", G + print "first Kinship method",K.shape,"\n",K + k1 = round(K[0][0],4) + K2 = calculate_kinship(np.copy(G.T),temp_data=None) + print "Genotype",G.shape, "\n", G + print "GN2 Kinship method",K2.shape,"\n",K2 + k2 = round(K2[0][0],4) + print "Genotype",G.shape, "\n", G K3 = kinship(np.copy(G),options) print "third Kinship method",K3.shape,"\n",K3 sys.stderr.write(options.geno+"\n") - k1 = round(K[0][0],4) - k2 = round(K2[0][0],4) k3 = round(K3[0][0],4) if options.geno == 'data/small.geno': assert k1==0.7939, "k1=%f" % k1 -- cgit v1.2.3 From 4931530a6c32eaf56d1586154bc7ea872c0de2ba Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 15:51:48 +0300 Subject: Tests and indentation --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 159 +++++++++++++++--------------- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 13 ++- 2 files changed, 89 insertions(+), 83 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index 1dca1c57..8caa753b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -57,91 +57,94 @@ def compute_matrixMult(job,W,q = None): return job def f_init(q): - compute_matrixMult.q = q + compute_matrixMult.q = q def kinship_full(G): - print G.shape - m = G.shape[0] # snps - n = G.shape[1] # inds - sys.stderr.write(str(m)+" SNPs\n") - assert m>n, "n should be larger than m (snps>inds)" - m = np.dot(G.T,G) - m = m/G.shape[0] - return m + """ + Calculate the Kinship matrix using a full dot multiplication + """ + print G.shape + m = G.shape[0] # snps + n = G.shape[1] # inds + sys.stderr.write(str(m)+" SNPs\n") + assert m>n, "n should be larger than m (snps>inds)" + m = np.dot(G.T,G) + m = m/G.shape[0] + return m # Calculate the kinship matrix from G (SNPs as rows!), returns K # def kinship(G,options): - numThreads = None - if options.numThreads: - numThreads = int(options.numThreads) - options.computeSize = 1000 - matrix_initialize(options.useBLAS) - - sys.stderr.write(str(G.shape)+"\n") - n = G.shape[1] # inds - inds = n - m = G.shape[0] # snps - snps = m - sys.stderr.write(str(m)+" SNPs\n") - assert m>n, "n should be larger than m (snps>inds)" - - q = mp.Queue() - p = mp.Pool(numThreads, f_init, [q]) - iterations = snps/options.computeSize+1 - if options.testing: + numThreads = None + if options.numThreads: + numThreads = int(options.numThreads) + options.computeSize = 1000 + matrix_initialize(options.useBLAS) + + sys.stderr.write(str(G.shape)+"\n") + n = G.shape[1] # inds + inds = n + m = G.shape[0] # snps + snps = m + sys.stderr.write(str(m)+" SNPs\n") + assert m>n, "n should be larger than m (snps>inds)" + + q = mp.Queue() + p = mp.Pool(numThreads, f_init, [q]) + iterations = snps/options.computeSize+1 + if options.testing: iterations = 8 - # jobs = range(0,8) # range(0,iterations) - - results = [] - - K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals - - completed = 0 - for job in range(iterations): - if options.verbose: - sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) - W = compute_W(job,G,n,snps,options.computeSize) - if numThreads == 1: - compute_matrixMult(job,W,q) - j,x = q.get() - if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") - K_j = x - # print j,K_j[:,0] - K = K + K_j - else: - results.append(p.apply_async(compute_matrixMult, (job,W))) - # Do we have a result? - try: - j,x = q.get_nowait() - if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") - K_j = x - # print j,K_j[:,0] - K = K + K_j - completed += 1 - except Queue.Empty: - pass - - if numThreads == None or numThreads > 1: - for job in range(len(results)-completed): - j,x = q.get(True,15) - if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") - K_j = x - # print j,K_j[:,0] - K = K + K_j - - K = K / float(snps) - outFile = 'runtest.kin' - if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) - np.savetxt(outFile,K) - - if options.saveKvaKve: - if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") - Kva,Kve = linalg.eigh(K) - if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) - np.savetxt(outFile+".kva",Kva) - np.savetxt(outFile+".kve",Kve) - return K + # jobs = range(0,8) # range(0,iterations) + + results = [] + + K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals + + completed = 0 + for job in range(iterations): + if options.verbose: + sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) + W = compute_W(job,G,n,snps,options.computeSize) + if numThreads == 1: + compute_matrixMult(job,W,q) + j,x = q.get() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + else: + results.append(p.apply_async(compute_matrixMult, (job,W))) + # Do we have a result? + try: + j,x = q.get_nowait() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + except Queue.Empty: + pass + + if numThreads == None or numThreads > 1: + for job in range(len(results)-completed): + j,x = q.get(True,15) + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + + K = K / float(snps) + outFile = 'runtest.kin' + if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) + np.savetxt(outFile,K) + + if options.saveKvaKve: + if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") + Kva,Kve = linalg.eigh(K) + if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + np.savetxt(outFile+".kva",Kva) + np.savetxt(outFile+".kve",Kve) + return K diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 26485136..547ac7f1 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -140,7 +140,7 @@ elif cmd == 'kinship': if not options.skip_genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) - if True: + if False: K = kinship_full(G) print "Genotype",G.shape, "\n", G print "first Kinship method",K.shape,"\n",K @@ -157,11 +157,14 @@ elif cmd == 'kinship': k3 = round(K3[0][0],4) if options.geno == 'data/small.geno': assert k1==0.7939, "k1=%f" % k1 - assert k2==0.7939, "k1=%f" % k1 - assert k3==0.7939, "k1=%f" % k1 + assert k2==0.7939, "k2=%f" % k2 + assert k3==0.7939, "k3=%f" % k3 if options.geno == 'data/small_na.geno': assert k1==0.7172, "k1=%f" % k1 - assert k2==0.7172, "k1=%f" % k1 - assert k3==0.7172, "k1=%f" % k1 + assert k2==0.7172, "k2=%f" % k2 + assert k3==0.7172, "k3=%f" % k3 + if options.geno == 'data/test8000.geno': + assert k3==1.4352, "k3=%f" % k3 + else: print "Doing nothing" -- cgit v1.2.3 From 7759c755cb8d525ba3739864021921888bdf47e9 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 16:17:03 +0300 Subject: Parallel K calculation is fixed --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 33 +++++++++++++++---------------- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 5 ++++- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index 8caa753b..a5497a77 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -26,20 +26,31 @@ import Queue from optmatrix import matrix_initialize, matrixMultT +def kinship_full(G): + """ + Calculate the Kinship matrix using a full dot multiplication + """ + print G.shape + m = G.shape[0] # snps + n = G.shape[1] # inds + sys.stderr.write(str(m)+" SNPs\n") + assert m>n, "n should be larger than m (snps>inds)" + m = np.dot(G.T,G) + m = m/G.shape[0] + return m def compute_W(job,G,n,snps,compute_size): """ Read 1000 SNPs at a time into matrix and return the result """ - W = np.ones((n,compute_size)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) + m = compute_size + W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) for j in range(0,compute_size): - row = job*compute_size + j - if row >= compute_size or row>=snps: + pos = job*m + j # real position + if pos >= snps: W = W[:,range(0,j)] break - # print job,compute_size,j snp = G[job*compute_size+j] - # print snp.shape,snp if snp.var() == 0: continue W[:,j] = snp # set row to list of SNPs @@ -59,18 +70,6 @@ def compute_matrixMult(job,W,q = None): def f_init(q): compute_matrixMult.q = q -def kinship_full(G): - """ - Calculate the Kinship matrix using a full dot multiplication - """ - print G.shape - m = G.shape[0] # snps - n = G.shape[1] # inds - sys.stderr.write(str(m)+" SNPs\n") - assert m>n, "n should be larger than m (snps>inds)" - m = np.dot(G.T,G) - m = m/G.shape[0] - return m # Calculate the kinship matrix from G (SNPs as rows!), returns K # diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 547ac7f1..a322f988 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -70,6 +70,9 @@ parser.add_option("--saveKvaKve", parser.add_option("--test", action="store_true", dest="testing", default=False, help="Testing mode") +parser.add_option("--test-kinship", + action="store_true", dest="test_kinship", default=False, + help="Testing mode for Kinship calculation") (options, args) = parser.parse_args() @@ -140,7 +143,7 @@ elif cmd == 'kinship': if not options.skip_genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) - if False: + if options.test_kinship: K = kinship_full(G) print "Genotype",G.shape, "\n", G print "first Kinship method",K.shape,"\n",K -- cgit v1.2.3 From 58f43e9e767890031c77e44d10939c48bc1c81fe Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 16:45:37 +0300 Subject: Kinship multi-core: don't overload the queue with jobs --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index a5497a77..967eca81 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -22,7 +22,8 @@ import sys import os import numpy as np import multiprocessing as mp # Multiprocessing is part of the Python stdlib -import Queue +import Queue +import time from optmatrix import matrix_initialize, matrixMultT @@ -90,6 +91,8 @@ def kinship(G,options): q = mp.Queue() p = mp.Pool(numThreads, f_init, [q]) + cpu_num = mp.cpu_count() + print "CPU cores:",cpu_num iterations = snps/options.computeSize+1 if options.testing: iterations = 8 @@ -105,6 +108,7 @@ def kinship(G,options): sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) W = compute_W(job,G,n,snps,options.computeSize) if numThreads == 1: + # Single-core compute_matrixMult(job,W,q) j,x = q.get() if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") @@ -112,25 +116,29 @@ def kinship(G,options): # print j,K_j[:,0] K = K + K_j else: + # Multi-core results.append(p.apply_async(compute_matrixMult, (job,W))) # Do we have a result? - try: - j,x = q.get_nowait() - if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") - K_j = x - # print j,K_j[:,0] - K = K + K_j - completed += 1 - except Queue.Empty: - pass - + while (len(results)-completed>cpu_num): + try: + j,x = q.get_nowait() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + except Queue.Empty: + pass + if numThreads == None or numThreads > 1: + # results contains the growing result set for job in range(len(results)-completed): j,x = q.get(True,15) if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") K_j = x # print j,K_j[:,0] K = K + K_j + completed += 1 K = K / float(snps) outFile = 'runtest.kin' -- cgit v1.2.3 From c784f2fb2e3e4ad6cb6d6425aa7ec1596c2bd132 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 16:52:47 +0300 Subject: Allow queue to be 2xCPU. Disable output files --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index 967eca81..1383c782 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -71,7 +71,6 @@ def compute_matrixMult(job,W,q = None): def f_init(q): compute_matrixMult.q = q - # Calculate the kinship matrix from G (SNPs as rows!), returns K # def kinship(G,options): @@ -119,7 +118,7 @@ def kinship(G,options): # Multi-core results.append(p.apply_async(compute_matrixMult, (job,W))) # Do we have a result? - while (len(results)-completed>cpu_num): + while (len(results)-completed>cpu_num*2): try: j,x = q.get_nowait() if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") @@ -141,16 +140,17 @@ def kinship(G,options): completed += 1 K = K / float(snps) - outFile = 'runtest.kin' - if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) - np.savetxt(outFile,K) - - if options.saveKvaKve: - if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") - Kva,Kve = linalg.eigh(K) - if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) - np.savetxt(outFile+".kva",Kva) - np.savetxt(outFile+".kve",Kve) + + # outFile = 'runtest.kin' + # if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) + # np.savetxt(outFile,K) + + # if options.saveKvaKve: + # if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") + # Kva,Kve = linalg.eigh(K) + # if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + # np.savetxt(outFile+".kva",Kva) + # np.savetxt(outFile+".kve",Kve) return K -- cgit v1.2.3 From 086ce413e270d2ed07a1a385f84e569e3a478a39 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 17:08:10 +0300 Subject: Allow the GC to do its job --- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index a322f988..469ba6c9 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -113,6 +113,8 @@ if cmd == 'redis': print "MAF replacements: \n",G if not options.skip_genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) + g = None + gnt = None ps, ts = gn2_load_redis('testrun','other',k,Y,G.T) print np.array(ps) @@ -142,6 +144,8 @@ elif cmd == 'kinship': print "MAF replacements: \n",G if not options.skip_genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) + g = None + gnt = None if options.test_kinship: K = kinship_full(G) -- cgit v1.2.3 From 38cb5d5c0606481cbc34bd1a1187264d1fc1b085 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 14 Mar 2015 18:28:02 +0300 Subject: Threading: Sleep while waiting --- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 1 + 1 file changed, 1 insertion(+) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index 1383c782..9ab48510 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -119,6 +119,7 @@ def kinship(G,options): results.append(p.apply_async(compute_matrixMult, (job,W))) # Do we have a result? while (len(results)-completed>cpu_num*2): + time.sleep(0.1) try: j,x = q.get_nowait() if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") -- cgit v1.2.3