diff options
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/chunks.py | 2 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 36 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 142 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 16 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py | 2 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 40 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/process_plink.py | 127 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 93 |
8 files changed, 281 insertions, 177 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py index abeffee7..9565fb96 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py @@ -93,4 +93,4 @@ if __name__ == '__main__': _main() print("\nConfirming doctests...") import doctest - doctest.testmod()
\ No newline at end of file + doctest.testmod() diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py new file mode 100644 index 00000000..19b0957b --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py @@ -0,0 +1,36 @@ +# Genotype routines + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import numpy as np + +def normalizeGenotype(g): + """ + Run for every SNP list (for one individual) and return + normalized SNP genotype values with missing data filled in + """ + g1 = np.copy(g) # avoid side effects + x = True - np.isnan(g) # Matrix of True/False + m = g[x].mean() # Global mean value + s = np.sqrt(g[x].var()) # Global stddev + g1[np.isnan(g)] = m # Plug-in mean values for missing data + if s == 0: + g1 = g1 - m # Subtract the mean + else: + g1 = (g1 - m) / s # Normalize the deviation + return g1 + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py new file mode 100644 index 00000000..353784aa --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -0,0 +1,142 @@ +# pylmm kinship calculation +# +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# env PYTHONPATH=$pylmm_lib_path:./lib python $pylmm_lib_path/runlmm.py --pheno test.pheno --geno test9000.geno kinship --test + +import sys +import os +import numpy as np +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +from optmatrix import matrix_initialize, matrixMultT + + +def compute_W(job,G,n,compute_size): + """ + Read 1000 SNPs at a time into matrix and return the result + """ + W = np.ones((n,compute_size)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) + for j in range(0,compute_size): + row = job*compute_size + j + if row >= compute_size: + W = W[:,range(0,j)] + break + snp = G[job*compute_size+j] + # print snp.shape,snp + if snp.var() == 0: + continue + W[:,j] = snp # set row to list of SNPs + return W + +def compute_matrixMult(job,W,q = None): + """ + Compute Kinship(W)*j + + For every set of SNPs matrixMult is used to multiply matrices T(W)*W + """ + res = matrixMultT(W) + if not q: q=compute_matrixMult.q + q.put([job,res]) + return job + +def f_init(q): + compute_matrixMult.q = q + +def kinship_full(G,options): + print G.shape + m = np.dot(G.T,G) + m = m/G.shape[0] + return m + +# Calculate the kinship matrix from G (SNPs as rows!), returns K +# +def kinship(G,options): + numThreads = None + if options.numThreads: + numThreads = int(options.numThreads) + options.computeSize = 1000 + matrix_initialize(options.useBLAS) + + sys.stderr.write(str(G.shape)+"\n") + n = G.shape[1] # inds + inds = n + m = G.shape[0] # snps + snps = m + sys.stderr.write(str(m)+" SNPs\n") + + q = mp.Queue() + p = mp.Pool(numThreads, f_init, [q]) + iterations = snps/options.computeSize+1 + if options.testing: + iterations = 8 + # jobs = range(0,8) # range(0,iterations) + + results = [] + + K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals + + completed = 0 + for job in range(iterations): + if options.verbose: + sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) + W = compute_W(job,G,n,options.computeSize) + if numThreads == 1: + compute_matrixMult(job,W,q) + j,x = q.get() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + else: + results.append(p.apply_async(compute_matrixMult, (job,W))) + # Do we have a result? + try: + j,x = q.get_nowait() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + except Queue.Empty: + pass + + if numThreads == None or numThreads > 1: + for job in range(len(results)-completed): + j,x = q.get(True,15) + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + + print "kiship.kinship: ",K.shape,K + K = K / float(snps) + outFile = 'runtest.kin' + if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) + np.savetxt(outFile,K) + + if options.saveKvaKve: + if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") + Kva,Kve = linalg.eigh(K) + if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + np.savetxt(outFile+".kva",Kva) + np.savetxt(outFile+".kve",Kve) + return K + + + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 58d7593d..36c3602f 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -143,6 +143,7 @@ def run_human(pheno_vector, timestamp = datetime.datetime.utcnow().isoformat() + # Pickle chunks of input SNPs (from Plink interator) and compress them #print("Starting adding loop") for part, result in enumerate(results): #data = pickle.dumps(result, pickle.HIGHEST_PROTOCOL) @@ -325,7 +326,7 @@ def matrixMult(A,B): return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) -def calculate_kinship(genotype_matrix, temp_data, is_testing=False): +def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False): """ genotype_matrix is an n x m matrix encoding SNP minor alleles. @@ -360,7 +361,8 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False): genotype_matrix[:,counter] = (genotype_matrix[:,counter] - values_mean) / np.sqrt(vr) percent_complete = int(round((counter/m)*45)) - temp_data.store("percent_complete", percent_complete) + if temp_data != None: + temp_data.store("percent_complete", percent_complete) genotype_matrix = genotype_matrix[:,keep] print("genotype_matrix: ", pf(genotype_matrix)) @@ -406,6 +408,8 @@ def GWAS(pheno_vector, v = np.isnan(pheno_vector) if v.sum(): keep = True - v + print(pheno_vector.shape,pheno_vector) + print(keep.shape,keep) pheno_vector = pheno_vector[keep] #genotype_matrix = genotype_matrix[keep,:] #covariate_matrix = covariate_matrix[keep,:] @@ -437,6 +441,8 @@ def GWAS(pheno_vector, p_values.append(0) t_statistics.append(np.nan) continue + + print(genotype_matrix.shape,pheno_vector.shape,keep.shape) pheno_vector = pheno_vector[keep] covariate_matrix = covariate_matrix[keep,:] @@ -513,7 +519,7 @@ class LMM: Kve = [] self.nonmissing = x - print("this K is:", pf(K)) + print("this K is:", K.shape, pf(K)) if len(Kva) == 0 or len(Kve) == 0: if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) @@ -525,8 +531,8 @@ class LMM: self.K = K self.Kva = Kva self.Kve = Kve - print("self.Kva is: ", pf(self.Kva)) - print("self.Kve is: ", pf(self.Kve)) + print("self.Kva is: ", self.Kva.shape, pf(self.Kva)) + print("self.Kve is: ", self.Kve.shape, pf(self.Kve)) self.Y = Y self.X0 = X0 self.N = self.K.shape[0] diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py index abb72081..5c71db6a 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py @@ -9,7 +9,7 @@ from scipy import stats useNumpy = None hasBLAS = None -def matrix_initialize(useBLAS): +def matrix_initialize(useBLAS=True): global useNumpy # module based variable if useBLAS and useNumpy == None: print get_info('blas_opt') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py new file mode 100644 index 00000000..c22da761 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -0,0 +1,40 @@ +# Phenotype routines + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import sys +import numpy as np + +def removeMissingPhenotypes(y,g,verbose=False): + """ + Remove missing data from matrices, make sure the genotype data has + individuals as rows + """ + assert(y!=None) + assert(y.shape[0] == g.shape[0]) + + y1 = y + g1 = g + v = np.isnan(y) + keep = True - v + if v.sum(): + if verbose: + sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) + y1 = y[keep] + g1 = g[keep,:] + return y1,g1 + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py deleted file mode 100644 index e47c18e1..00000000 --- a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import absolute_import, print_function, division - -import sys -sys.path.append("../../..") - -print("sys.path: ", sys.path) - -import numpy as np - -import zlib -import cPickle as pickle -import redis -Redis = redis.Redis() - -import lmm - -class ProcessLmmChunk(object): - - def __init__(self): - self.get_snp_data() - self.get_lmm_vars() - - keep = self.trim_matrices() - - self.do_association(keep) - - print("p_values is: ", self.p_values) - - def get_snp_data(self): - plink_pickled = zlib.decompress(Redis.lpop("plink_inputs")) - plink_data = pickle.loads(plink_pickled) - - self.snps = np.array(plink_data['result']) - self.identifier = plink_data['identifier'] - - def get_lmm_vars(self): - lmm_vars_pickled = Redis.hget(self.identifier, "lmm_vars") - lmm_vars = pickle.loads(lmm_vars_pickled) - - self.pheno_vector = np.array(lmm_vars['pheno_vector']) - self.covariate_matrix = np.array(lmm_vars['covariate_matrix']) - self.kinship_matrix = np.array(lmm_vars['kinship_matrix']) - - def trim_matrices(self): - v = np.isnan(self.pheno_vector) - keep = True - v - keep = keep.reshape((len(keep),)) - - if v.sum(): - self.pheno_vector = self.pheno_vector[keep] - self.covariate_matrix = self.covariate_matrix[keep,:] - self.kinship_matrix = self.kinship_matrix[keep,:][:,keep] - - return keep - - def do_association(self, keep): - n = self.kinship_matrix.shape[0] - lmm_ob = lmm.LMM(self.pheno_vector, - self.kinship_matrix, - self.covariate_matrix) - lmm_ob.fit() - - self.p_values = [] - - for snp in self.snps: - snp = snp[0] - p_value, t_stat = lmm.human_association(snp, - n, - keep, - lmm_ob, - self.pheno_vector, - self.covariate_matrix, - self.kinship_matrix, - False) - - self.p_values.append(p_value) - - -#plink_pickled = zlib.decompress(Redis.lpop("plink_inputs")) -# -#plink_data = pickle.loads(plink_pickled) -#result = np.array(plink_data['result']) -#print("snp size is: ", result.shape) -#identifier = plink_data['identifier'] -# -#lmm_vars_pickled = Redis.hget(identifier, "lmm_vars") -#lmm_vars = pickle.loads(lmm_vars_pickled) -# -#pheno_vector = np.array(lmm_vars['pheno_vector']) -#covariate_matrix = np.array(lmm_vars['covariate_matrix']) -#kinship_matrix = np.array(lmm_vars['kinship_matrix']) -# -#v = np.isnan(pheno_vector) -#keep = True - v -#keep = keep.reshape((len(keep),)) -#print("keep is: ", keep) -# -#if v.sum(): -# pheno_vector = pheno_vector[keep] -# covariate_matrix = covariate_matrix[keep,:] -# kinship_matrix = kinship_matrix[keep,:][:,keep] -# -#n = kinship_matrix.shape[0] -#print("n is: ", n) -#lmm_ob = lmm.LMM(pheno_vector, -# kinship_matrix, -# covariate_matrix) -#lmm_ob.fit() -# -#p_values = [] -# -#for snp in result: -# snp = snp[0] -# p_value, t_stat = lmm.human_association(snp, -# n, -# keep, -# lmm_ob, -# pheno_vector, -# covariate_matrix, -# kinship_matrix, -# False) -# -# p_values.append(p_value) - - - - diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 738268be..627cc7a4 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -21,18 +21,22 @@ from optparse import OptionParser import sys import tsvreader import numpy as np -from lmm import gn2_load_redis +from lmm import gn2_load_redis, calculate_kinship +from kinship import kinship, kinship_full +from genotype import normalizeGenotype +from phenotype import removeMissingPhenotypes usage = """ python runlmm.py [options] command runlmm.py processing multiplexer reads standardised input formats - and calls the different routines + and calls the different routines (writes to stdout) Current commands are: parse : only parse input files redis : use Redis to call into GN2 + kinship : calculate (new) kinship matrix try --help for more information """ @@ -50,6 +54,13 @@ parser.add_option("--geno",dest="geno", parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") +parser.add_option("--blas", action="store_true", default=False, dest="useBLAS", help="Use BLAS instead of numpy matrix multiplication") +parser.add_option("-t", "--threads", + type="int", dest="numThreads", + help="Threads to use") +parser.add_option("--saveKvaKve", + action="store_true", dest="saveKvaKve", default=False, + help="Testing mode") parser.add_option("--test", action="store_true", dest="testing", default=False, help="Testing mode") @@ -63,6 +74,10 @@ if len(args) != 1: cmd = args[0] print "Command: ",cmd +k = None +y = None +g = None + if options.kinship: k = tsvreader.kinship(options.kinship) print k.shape @@ -75,46 +90,38 @@ if options.geno: g = tsvreader.geno(options.geno) print g.shape -def normalizeGenotype(G): - # Run for every SNP list (num individuals) - x = True - np.isnan(G) # Matrix of True/False - m = G[x].mean() # Global mean value - # print m - s = np.sqrt(G[x].var()) # Global stddev - # print s - G[np.isnan(G)] = m # Plug-in mean values for missing - if s == 0: - G = G - m # Subtract the mean - else: - G = (G - m) / s # Normalize the deviation - return G - -# g = g.reshape((1, -1))[0] -print("Before",g) -gn = [] -for snp in g: - gn.append( normalizeGenotype(snp) ) - -gn = g -gn = np.array(gn) -print("After1",gn) -gnT = gn.T -print("After",gnT) -# G = gnT -G = gnT -print "G shape",G.shape -# sys.exit(1) -# assert(G[0,0]==-2.25726341) - -# Remove individuals with missing phenotypes -v = np.isnan(y) -keep = True - v -if v.sum(): - if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) - y = y[keep] - G = G[keep,:] - k = k[keep,:][:,keep] - if cmd == 'redis': - ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing) + # Emulating the redis setup of GN2 + gn = [] + for ind_g in g: + gn.append( normalizeGenotype(ind_g) ) + gnt = np.array(gn).T + Y,G = removeMissingPhenotypes(y,gnt,options.verbose) + print "G",G.shape,G + ps, ts = gn2_load_redis('testrun','other',k,Y,G,options.testing) print np.array(ps) + print round(ps[0],4) + assert(options.testing and round(ps[0],4)==0.7262) + print round(ps[-1],4) + assert(options.testing and round(ps[-1],4)==0.3461) +elif cmd == 'kinship': + gn = [] + for ind_g in g: + if len(gn)>=8000: break + gn.append( normalizeGenotype(ind_g) ) + K = kinship_full(np.array(gn),options) + print "first Kinship method",K.shape,K + K = kinship(np.array(gn),options) + print "second Kinship method",K.shape,K + sys.exit(1) + gnt = np.array(gn).T + Y,g = removeMissingPhenotypes(y,gnt,options.verbose) + G = g + print G.shape,G + K = calculate_kinship(np.copy(G),None,options) + print G.shape,G + print "first Kinship method",K.shape,K + K = kinship(G.T,options) + assert(K[0][0]==1.28) +else: + print "Doing nothing" |