From 73ed2f67bd0478473b887902ae96caaa0fca58d4 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 17 Mar 2015 13:10:49 +0300 Subject: GWAS: one result is missing --- wqflask/wqflask/my_pylmm/pyLMM/gwas.py | 33 +++++++++++++----------------- wqflask/wqflask/my_pylmm/pyLMM/input.py | 2 ++ wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 10 ++++++--- wqflask/wqflask/my_pylmm/pyLMM/lmm2.py | 17 ++++++++++------ wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 35 +++++++++++++++++--------------- 5 files changed, 53 insertions(+), 44 deletions(-) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py index 2a472717..20083bde 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py @@ -66,7 +66,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): """ matrix_initialize() cpu_num = mp.cpu_count() - numThreads = None + numThreads = None # for now use all available threads kfile2 = False reml = restricted_max_likelihood @@ -110,10 +110,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): # PS = [] # TS = [] count = 0 - - completed = 0 - last_j = 0 - # for snp_id in G: + jobs_running = 0 for snp in G: snp_id = (snp,'SNPID') count += 1 @@ -129,13 +126,12 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): j,lst = q.get() if verbose: sys.stderr.write("Job "+str(j)+" finished\n") - # for line in lines: - # out.write(line) res.append(lst) else: p.apply_async(compute_snp,(job,n,collect,lmm2,reml)) + jobs_running += 1 collect = [] - while job > completed: + while jobs_running: try: j,lst = q.get_nowait() if verbose: @@ -143,34 +139,33 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): # for line in lines: # out.write(line) res.append(lst) - completed += 1 + jobs_running -= 1 except Queue.Empty: pass - if job > completed + cpu_num*2: - time.sleep(0.1) + if jobs_running + cpu_num*2 > 0: + time.sleep(1.0) else: - if job >= completed: + if jobs_running > 0: break collect.append(snp_id) - if numThreads==1: + if numThreads==1 or count<1000: print "Running on 1 THREAD" compute_snp(count/1000,n,collect,lmm2,reml,q) j,lst = q.get() - # for line in lines: - # out.write(line) res.append(lst) else: - for job in range(int(count/1000)-completed): + print "count=",count," running=",jobs_running," collect=",len(collect) + for job in range(jobs_running): j,lst = q.get(True,15) # time out if verbose: sys.stderr.write("Job "+str(j)+" finished\n") res.append(lst) - # print res + if verbose: + print "res=",res[0][0:10] + print [len(res1) for res1 in res] ts = [item[0] for res1 in res for item in res1] ps = [item[1] for res1 in res for item in res1] - # ts = [item[1] for item in res] - # print ps return ts,ps diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py index f7b556a5..7063fedf 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/input.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py @@ -135,6 +135,8 @@ class plink: def normalizeGenotype(self,G): # print "Before",G # print G.shape + print "call input.normalizeGenotype" + raise "This should not be used" x = True - np.isnan(G) m = G[x].mean() s = np.sqrt(G[x].var()) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 2014ffb8..8a24d98b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -285,7 +285,7 @@ def run_other_old(pheno_vector, # with Bench("LMM_ob fitting"): # lmm_ob.fit() - print("genotype_matrix: ", genotype_matrix.shape) + print("run_other_old genotype_matrix: ", genotype_matrix.shape) print(genotype_matrix) with Bench("Doing GWAS"): @@ -320,6 +320,7 @@ def run_other_new(pheno_vector, # Adjust phenotypes Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True) print("Removed missing phenotypes",Y.shape) + # if options.maf_normalization: # G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) # print "MAF replacements: \n",G @@ -337,7 +338,7 @@ def run_other_new(pheno_vector, # with Bench("LMM_ob fitting"): # lmm_ob.fit() - print("genotype_matrix: ", G.shape) + print("run_other_new genotype_matrix: ", G.shape) print(G) with Bench("Doing GWAS"): @@ -388,7 +389,9 @@ def calculate_kinship_new(genotype_matrix, temp_data=None): Call the new kinship calculation where genotype_matrix contains inds (columns) by snps (rows). """ + print("call genotype.normalize") G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix) + print("call calculate_kinship_new") return kinship(G.T),G # G gets transposed, we'll turn this into an iterator (FIXME) def calculate_kinship_old(genotype_matrix, temp_data=None): @@ -399,6 +402,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): normalizes the resulting vectors and returns the RRM matrix. """ + print("call calculate_kinship_old") n = genotype_matrix.shape[0] m = genotype_matrix.shape[1] print("genotype 2D matrix n (inds) is:", n) @@ -429,7 +433,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): temp_data.store("percent_complete", percent_complete) genotype_matrix = genotype_matrix[:,keep] - print("genotype_matrix: ", pf(genotype_matrix)) + print("After kinship (old) genotype_matrix: ", pf(genotype_matrix)) kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m) return kinship_matrix,genotype_matrix diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py index d50b0111..d4b3ac82 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py @@ -22,6 +22,7 @@ from scipy.linalg import eigh, inv, det import scipy.stats as stats # t-tests from scipy import optimize from optmatrix import matrixMult +import kinship def calculateKinship(W,center=False): """ @@ -177,13 +178,17 @@ class LMM2: Kve = [] self.nonmissing = x - if len(Kva) == 0 or len(Kve) == 0: - if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) - begin = time.time() - Kva,Kve = eigh(K) - end = time.time() - if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) + print("this K is:", K.shape, K) + if len(Kva) == 0 or len(Kve) == 0: + # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) + begin = time.time() + # Kva,Kve = linalg.eigh(K) + Kva,Kve = kinship.kvakve(K) + end = time.time() + if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) + print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve)) + self.K = K self.Kva = Kva self.Kve = Kve diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 1af5a443..708c9185 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -54,9 +54,9 @@ parser.add_option("--geno",dest="geno", parser.add_option("--maf-normalization", action="store_true", dest="maf_normalization", default=False, help="Apply MAF genotype normalization") -parser.add_option("--skip-genotype-normalization", - action="store_true", dest="skip_genotype_normalization", default=False, - help="Skip genotype normalization") +parser.add_option("--genotype-normalization", + action="store_true", dest="genotype_normalization", default=False, + help="Force genotype normalization") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") @@ -100,7 +100,8 @@ if options.geno: print g.shape if cmd == 'redis_new': - # Emulating the redis setup of GN2 + # The main difference between redis_new and redis is that missing + # phenotypes are handled by the first Y = y G = g print "Original G",G.shape, "\n", G @@ -109,7 +110,7 @@ if cmd == 'redis_new': G = None ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True) print np.array(ps) - print sum(ps) + print len(ps),sum(ps) # Test results p1 = round(ps[0],4) p2 = round(ps[-1],4) @@ -118,12 +119,14 @@ if cmd == 'redis_new': assert p1==0.0708, "p1=%f" % p1 assert p2==0.1417, "p2=%f" % p2 if options.geno == 'data/small_na.geno': - assert p1==0.0958, "p1=%f" % p1 - assert p2==0.0435, "p2=%f" % p2 + assert p1==0.0897, "p1=%f" % p1 + assert p2==0.0405, "p2=%f" % p2 if options.geno == 'data/test8000.geno': assert p1==0.8984, "p1=%f" % p1 - assert p2==0.9623, "p2=%f" % p2 -if cmd == 'redis': + assert p2==0.9620, "p2=%f" % p2 + assert sum(ps) == 4070.02346579 + assert len(ps) == 8000 +elif cmd == 'redis': # Emulating the redis setup of GN2 G = g print "Original G",G.shape, "\n", G @@ -135,7 +138,7 @@ if cmd == 'redis': if options.maf_normalization: G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) print "MAF replacements: \n",G - if not options.skip_genotype_normalization: + if options.genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) g = None gnt = None @@ -144,8 +147,8 @@ if cmd == 'redis': G = None ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False) print np.array(ps) - print sum(ps) - # Test results + print len(ps),sum(ps) + # Test results 4070.02346579 p1 = round(ps[0],4) p2 = round(ps[-1],4) sys.stderr.write(options.geno+"\n") @@ -153,11 +156,11 @@ if cmd == 'redis': assert p1==0.0708, "p1=%f" % p1 assert p2==0.1417, "p2=%f" % p2 if options.geno == 'data/small_na.geno': - assert p1==0.0958, "p1=%f" % p1 - assert p2==0.0435, "p2=%f" % p2 + assert p1==0.0897, "p1=%f" % p1 + assert p2==0.0405, "p2=%f" % p2 if options.geno == 'data/test8000.geno': assert p1==0.8984, "p1=%f" % p1 - assert p2==0.9623, "p2=%f" % p2 + assert p2==0.8827, "p2=%f" % p2 elif cmd == 'kinship': G = g print "Original G",G.shape, "\n", G @@ -169,7 +172,7 @@ elif cmd == 'kinship': if options.maf_normalization: G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) print "MAF replacements: \n",G - if not options.skip_genotype_normalization: + if options.genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) g = None gnt = None -- cgit v1.2.3