From 99fef2888f02551191cf6031c2c7222fce27e360 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sat, 4 Apr 2015 12:33:07 +0200 Subject: Run works without transposes --- wqflask/wqflask/my_pylmm/pyLMM/gwas.py | 21 +++++++--- wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 24 +++++++---- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 65 +++++++++++++++++++---------- wqflask/wqflask/my_pylmm/pyLMM/lmm2.py | 43 ++++++++++--------- wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 35 +++++++++++++--- wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 21 +++++----- 6 files changed, 136 insertions(+), 73 deletions(-) (limited to 'wqflask') diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py index 8b344a90..ae3769d4 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py @@ -19,7 +19,7 @@ import pdb import time -# from utility import temp_data +import sys import lmm2 import os @@ -31,6 +31,18 @@ from lmm2 import LMM2 import multiprocessing as mp # Multiprocessing is part of the Python stdlib import Queue +# ---- A trick to decide on the environment: +try: + from wqflask.my_pylmm.pyLMM import chunks + from gn2 import uses +except ImportError: + sys.stderr.write("WARNING: LMM2 standalone version missing the Genenetwork2 environment\n") + has_gn2=False + from standalone import uses + +progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal') + + def formatResult(id,beta,betaSD,ts,ps): return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n" @@ -52,12 +64,11 @@ def compute_snp(j,n,snp_ids,lmm2,REML,q = None): def f_init(q): compute_snp.q = q -def gwas(Y,G,K,uses,restricted_max_likelihood=True,refit=False,verbose=True): +def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): """ GWAS. The G matrix should be n inds (cols) x m snps (rows) """ - progress,debug,info,mprint = uses('progress','debug','info','mprint') - + info("In gwas.gwas") matrix_initialize() cpu_num = mp.cpu_count() numThreads = None # for now use all available threads @@ -70,7 +81,7 @@ def gwas(Y,G,K,uses,restricted_max_likelihood=True,refit=False,verbose=True): m = G.shape[0] # snps snps = m info("%s SNPs",snps) - assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds) + assert snps>=inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds) # CREATE LMM object for association # if not kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=verbose) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index be12417e..1c157fd8 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -28,12 +28,21 @@ import time from optmatrix import matrix_initialize, matrixMultT -def kinship_full(G,uses): +# ---- A trick to decide on the environment: +try: + from wqflask.my_pylmm.pyLMM import chunks + from gn2 import uses, progress_set_func +except ImportError: + has_gn2=False + import standalone as handlers + from standalone import uses, progress_set_func + +progress,debug,info,mprint = uses('progress','debug','info','mprint') + +def kinship_full(G): """ Calculate the Kinship matrix using a full dot multiplication """ - info,mprint = uses('info','mprint') - # mprint("kinship_full G",G) m = G.shape[0] # snps n = G.shape[1] # inds @@ -78,8 +87,7 @@ def f_init(q): # Calculate the kinship matrix from G (SNPs as rows!), returns K # -def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False): - progress,debug,info,mprint = uses('progress','debug','info','mprint') +def kinship(G,computeSize=1000,numThreads=None,useBLAS=False): matrix_initialize(useBLAS) @@ -89,7 +97,7 @@ def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False): m = G.shape[0] # snps snps = m info("%i SNPs" % (m)) - assert snps>inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds) + assert snps>=inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds) q = mp.Queue() p = mp.Pool(numThreads, f_init, [q]) @@ -140,13 +148,11 @@ def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False): K = K / float(snps) return K -def kvakve(K,uses): +def kvakve(K): """ Obtain eigendecomposition for K and return Kva,Kve where Kva is cleaned of small values < 1e-6 (notably smaller than zero) """ - info,mprint = uses('info','mprint') - info("Obtaining eigendecomposition for %dx%d matrix" % (K.shape[0],K.shape[1]) ) Kva,Kve = linalg.eigh(K) mprint("Kva",Kva) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index e51742c4..82bd7f0b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -64,7 +64,6 @@ except ImportError: import standalone as handlers from standalone import uses, progress_set_func sys.stderr.write("WARNING: LMM standalone version missing the Genenetwork2 environment\n") - pass progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal') @@ -296,8 +295,8 @@ def run_other_old(pheno_vector, Bench().report() return p_values, t_stats -def run_other_new(pheno_vector, - genotype_matrix, +def run_other_new(n,m,pheno_vector, + geno, restricted_max_likelihood=True, refit=False): @@ -312,8 +311,7 @@ def run_other_new(pheno_vector, print("REML=",restricted_max_likelihood," REFIT=",refit) # Adjust phenotypes - Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True) - print("Removed missing phenotypes",Y.shape) + n,Y,keep = phenotype.remove_missing_new(n,pheno_vector) # if options.maf_normalization: # G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) @@ -321,8 +319,9 @@ def run_other_new(pheno_vector, # if not options.skip_genotype_normalization: # G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) + geno = geno[:,keep] with Bench("Calculate Kinship"): - K,G = calculate_kinship_new(G) + K,G = calculate_kinship_new(geno) print("kinship_matrix: ", pf(K)) print("kinship_matrix.shape: ", pf(K.shape)) @@ -337,9 +336,8 @@ def run_other_new(pheno_vector, with Bench("Doing GWAS"): t_stats, p_values = gwas.gwas(Y, - G.T, + G, K, - uses, restricted_max_likelihood=True, refit=False,verbose=True) Bench().report() @@ -378,18 +376,30 @@ def matrixMult(A,B): return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) +def calculate_kinship_new(genotype_matrix): + """ + Call the new kinship calculation where genotype_matrix contains + inds (columns) by snps (rows). + """ + assert type(genotype_matrix) is np.ndarray + info("call genotype.normalize") + G = np.apply_along_axis( genotype.normalize, axis=1, arr=genotype_matrix) + mprint("G",genotype_matrix) + info("call calculate_kinship_new") + return kinship(G),G # G gets transposed, we'll turn this into an iterator (FIXME) -def calculate_kinship_new(genotype_matrix, temp_data=None): +def calculate_kinship_iter(geno): """ Call the new kinship calculation where genotype_matrix contains inds (columns) by snps (rows). """ + assert type(genotype_matrix) is iter info("call genotype.normalize") G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix) info("call calculate_kinship_new") - return kinship(G.T,uses),G # G gets transposed, we'll turn this into an iterator (FIXME) + return kinship(G) -def calculate_kinship_old(genotype_matrix, temp_data=None): +def calculate_kinship_old(genotype_matrix): """ genotype_matrix is an n x m matrix encoding SNP minor alleles. @@ -430,7 +440,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): mprint("G (after old normalize)",genotype_matrix.T) kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m) return kinship_matrix,genotype_matrix - # return kinship_full(genotype_matrix.T,uses),genotype_matrix + # return kinship_full(genotype_matrix.T),genotype_matrix def GWAS(pheno_vector, genotype_matrix, @@ -586,7 +596,7 @@ class LMM: # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) begin = time.time() # Kva,Kve = linalg.eigh(K) - Kva,Kve = kvakve(K,uses) + Kva,Kve = kvakve(K) end = time.time() if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve)) @@ -794,12 +804,11 @@ class LMM: pl.ylabel("Probability of data") pl.title(title) - def run_gwas(species,n,m,k,y,geno,cov=None,reml=True,refit=False,inputfn=None,new_code=True): """ Invoke pylmm using genotype as a matrix or as a (SNP) iterator. """ - info("gwas_without_redis") + info("run_gwas") print('pheno', y) if species == "human" : @@ -813,8 +822,8 @@ def run_gwas(species,n,m,k,y,geno,cov=None,reml=True,refit=False,inputfn=None,ne print('geno', geno.shape, geno) if new_code: - ps, ts = run_other_new(pheno_vector = y, - genotype_matrix = geno, + ps, ts = run_other_new(n,m,pheno_vector = y, + geno = geno, restricted_max_likelihood = reml, refit = refit) else: @@ -849,10 +858,20 @@ def gwas_with_redis(key,species,new_code=True): v = np.array(v) return v + def narrayT(key): + m = narray(key) + if m is not None: + return m.T + return m + + # We are transposing before we enter run_gwas - this should happen on the webserver + # side (or when reading data from file) + k = narray('kinship_matrix') + g = narrayT('genotype_matrix') y = narray('pheno_vector') n = len(y) m = params['num_genotypes'] - ps,ts = run_gwas(species,n,m,narray('kinship_matrix'),y,narray('genotype_matrix'),narray('covariate_matrix'),params['restricted_max_likelihood'],params['refit'],params['input_file_name'],new_code) + ps,ts = run_gwas(species,n,m,k,y,g,narray('covariate_matrix'),params['restricted_max_likelihood'],params['refit'],params['input_file_name'],new_code) results_key = "pylmm:results:" + params['temp_uuid'] @@ -864,19 +883,19 @@ def gwas_with_redis(key,species,new_code=True): Redis.expire(results_key, 60*60) return ps, ts - def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True): """ - This function emulates current GN2 behaviour by pre-loading Redis + This function emulates current GN2 behaviour by pre-loading Redis (note the input + genotype is transposed to emulate GN2 (FIXME!) """ - print("Loading Redis from parsed data") + info("Loading Redis from parsed data") if kinship == None: k = None else: k = kinship.tolist() params = dict(pheno_vector = pheno.tolist(), - genotype_matrix = geno.tolist(), - num_genotypes = geno.shape[1], + genotype_matrix = geno.T.tolist(), + num_genotypes = geno.shape[0], kinship_matrix = k, covariate_matrix = None, input_file_name = None, diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py index d67e1205..358bf27e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py @@ -32,7 +32,6 @@ except ImportError: sys.stderr.write("WARNING: LMM2 standalone version missing the Genenetwork2 environment\n") has_gn2=False from standalone import uses - pass def calculateKinship(W,center=False): """ @@ -149,28 +148,32 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): class LMM2: - """ - This is a simple version of EMMA/fastLMM. - The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K) - and to optimize this model by finding the maximum-likelihood estimates for the model parameters. - There are three model parameters: heritability (h), covariate coefficients (beta) and the total - phenotypic variance (sigma). - Heritability as defined here is the proportion of the total variance (sigma) that is attributed to - the kinship matrix. - - For simplicity, we assume that everything being input is a numpy array. - If this is not the case, the module may throw an error as conversion from list to numpy array - is not done consistently. + """This is a simple version of EMMA/fastLMM. + + The main purpose of this module is to take a phenotype vector (Y), + a set of covariates (X) and a kinship matrix (K) and to optimize + this model by finding the maximum-likelihood estimates for the + model parameters. There are three model parameters: heritability + (h), covariate coefficients (beta) and the total phenotypic + variance (sigma). Heritability as defined here is the proportion + of the total variance (sigma) that is attributed to the kinship + matrix. + + For simplicity, we assume that everything being input is a numpy + array. If this is not the case, the module may throw an error as + conversion from list to numpy array is not done consistently. """ def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False): - """ - The constructor takes a phenotype vector or array Y of size n. - It takes a kinship matrix K of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached. - If they are not provided, the constructor will calculate them. - X0 is an optional covariate matrix of size n x q, where there are q covariates. - When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. + """The constructor takes a phenotype vector or array Y of size n. It + takes a kinship matrix K of size n x n. Kva and Kve can be + computed as Kva,Kve = linalg.eigh(K) and cached. If they are + not provided, the constructor will calculate them. X0 is an + optional covariate matrix of size n x q, where there are q + covariates. When this parameter is not provided, the + constructor will set X0 to an n x 1 matrix of all ones to + represent a mean effect. """ if X0 is None: @@ -194,7 +197,7 @@ class LMM2: # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) begin = time.time() # Kva,Kve = linalg.eigh(K) - Kva,Kve = kinship.kvakve(K,uses) + Kva,Kve = kinship.kvakve(K) end = time.time() if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve)) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py index 4c8175f7..7b652515 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -19,22 +19,47 @@ import sys import numpy as np -def remove_missing(y,g,verbose=False): +# ---- A trick to decide on the environment: +try: + from wqflask.my_pylmm.pyLMM import chunks + from gn2 import uses, progress_set_func +except ImportError: + has_gn2=False + import standalone as handlers + from standalone import uses, progress_set_func + +progress,debug,info,mprint = uses('progress','debug','info','mprint') + +def remove_missing(n,y,g): """ Remove missing data from matrices, make sure the genotype data has individuals as rows """ assert(y is not None) - assert(y.shape[0] == g.shape[0]) + assert y.shape[0] == g.shape[0],"y (n) %d, g (n,m) %s" % (y.shape[0],g.shape) y1 = y g1 = g v = np.isnan(y) keep = True - v if v.sum(): - if verbose: - sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) + info("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) y1 = y[keep] g1 = g[keep,:] - return y1,g1,keep + n = y1.shape[0] + return n,y1,g1,keep + +def remove_missing_new(n,y): + """ + Remove missing data. Returns new n,y,keep + """ + assert(y is not None) + y1 = y + v = np.isnan(y) + keep = True - v + if v.sum(): + info("runlmm.py: Cleaning the phenotype vector by removing %d individuals" % (v.sum())) + y1 = y[keep] + n = y1.shape[0] + return n,y1,keep diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 2d02e195..d248dee2 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -131,16 +131,15 @@ def check_results(ps,ts): if cmd == 'run': if options.remove_missing_phenotypes: raise Exception('Can not use --remove-missing-phenotypes with LMM2') - snp_iterator = tsvreader.geno_iter(options.geno) n = len(y) m = g.shape[1] - ps, ts = run_gwas('other',n,m,k,y,g.T) + ps, ts = run_gwas('other',n,m,k,y,g) # <--- pass in geno by SNP check_results(ps,ts) elif cmd == 'iterator': if options.remove_missing_phenotypes: raise Exception('Can not use --remove-missing-phenotypes with LMM2') - snp_iterator = tsvreader.geno_iter(options.geno) - ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,snp_iterator) + geno_iterator = tsvreader.geno_iter(options.geno) + ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,geno_iterator) check_results(ps,ts) elif cmd == 'redis_new': # The main difference between redis_new and redis is that missing @@ -150,10 +149,9 @@ elif cmd == 'redis_new': Y = y G = g print "Original G",G.shape, "\n", G - - gt = G.T - G = None - ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True) + # gt = G.T + # G = None + ps, ts = gn2_load_redis('testrun','other',k,Y,G,new_code=True) check_results(ps,ts) elif cmd == 'redis': # Emulating the redis setup of GN2 @@ -174,9 +172,10 @@ elif cmd == 'redis': g = None gnt = None - gt = G.T - G = None - ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False) + # gt = G.T + # G = None + mprint("G",G) + ps, ts = gn2_load_redis('testrun','other',k,Y,G, new_code=False) check_results(ps,ts) elif cmd == 'kinship': G = g -- cgit v1.2.3