aboutsummaryrefslogtreecommitdiff
path: root/wqflask
diff options
context:
space:
mode:
authorPjotr Prins2015-04-04 12:33:07 +0200
committerPjotr Prins2015-04-04 12:33:07 +0200
commit99fef2888f02551191cf6031c2c7222fce27e360 (patch)
treed77cf765d797521d7cc225ddabeb50c9bcbb19de /wqflask
parent163fe965bc1dcb807124c1c70c965d48bf2c2688 (diff)
downloadgenenetwork2-99fef2888f02551191cf6031c2c7222fce27e360.tar.gz
Run works without transposes
Diffstat (limited to 'wqflask')
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/gwas.py21
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/kinship.py24
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/lmm.py65
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/lmm2.py43
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/phenotype.py35
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/runlmm.py21
6 files changed, 136 insertions, 73 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index 8b344a90..ae3769d4 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -19,7 +19,7 @@
import pdb
import time
-# from utility import temp_data
+import sys
import lmm2
import os
@@ -31,6 +31,18 @@ from lmm2 import LMM2
import multiprocessing as mp # Multiprocessing is part of the Python stdlib
import Queue
+# ---- A trick to decide on the environment:
+try:
+ from wqflask.my_pylmm.pyLMM import chunks
+ from gn2 import uses
+except ImportError:
+ sys.stderr.write("WARNING: LMM2 standalone version missing the Genenetwork2 environment\n")
+ has_gn2=False
+ from standalone import uses
+
+progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal')
+
+
def formatResult(id,beta,betaSD,ts,ps):
return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
@@ -52,12 +64,11 @@ def compute_snp(j,n,snp_ids,lmm2,REML,q = None):
def f_init(q):
compute_snp.q = q
-def gwas(Y,G,K,uses,restricted_max_likelihood=True,refit=False,verbose=True):
+def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
"""
GWAS. The G matrix should be n inds (cols) x m snps (rows)
"""
- progress,debug,info,mprint = uses('progress','debug','info','mprint')
-
+ info("In gwas.gwas")
matrix_initialize()
cpu_num = mp.cpu_count()
numThreads = None # for now use all available threads
@@ -70,7 +81,7 @@ def gwas(Y,G,K,uses,restricted_max_likelihood=True,refit=False,verbose=True):
m = G.shape[0] # snps
snps = m
info("%s SNPs",snps)
- assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
+ assert snps>=inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
# CREATE LMM object for association
# if not kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=verbose)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index be12417e..1c157fd8 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -28,12 +28,21 @@ import time
from optmatrix import matrix_initialize, matrixMultT
-def kinship_full(G,uses):
+# ---- A trick to decide on the environment:
+try:
+ from wqflask.my_pylmm.pyLMM import chunks
+ from gn2 import uses, progress_set_func
+except ImportError:
+ has_gn2=False
+ import standalone as handlers
+ from standalone import uses, progress_set_func
+
+progress,debug,info,mprint = uses('progress','debug','info','mprint')
+
+def kinship_full(G):
"""
Calculate the Kinship matrix using a full dot multiplication
"""
- info,mprint = uses('info','mprint')
-
# mprint("kinship_full G",G)
m = G.shape[0] # snps
n = G.shape[1] # inds
@@ -78,8 +87,7 @@ def f_init(q):
# Calculate the kinship matrix from G (SNPs as rows!), returns K
#
-def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False):
- progress,debug,info,mprint = uses('progress','debug','info','mprint')
+def kinship(G,computeSize=1000,numThreads=None,useBLAS=False):
matrix_initialize(useBLAS)
@@ -89,7 +97,7 @@ def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False):
m = G.shape[0] # snps
snps = m
info("%i SNPs" % (m))
- assert snps>inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds)
+ assert snps>=inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds)
q = mp.Queue()
p = mp.Pool(numThreads, f_init, [q])
@@ -140,13 +148,11 @@ def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False):
K = K / float(snps)
return K
-def kvakve(K,uses):
+def kvakve(K):
"""
Obtain eigendecomposition for K and return Kva,Kve where Kva is cleaned
of small values < 1e-6 (notably smaller than zero)
"""
- info,mprint = uses('info','mprint')
-
info("Obtaining eigendecomposition for %dx%d matrix" % (K.shape[0],K.shape[1]) )
Kva,Kve = linalg.eigh(K)
mprint("Kva",Kva)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index e51742c4..82bd7f0b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -64,7 +64,6 @@ except ImportError:
import standalone as handlers
from standalone import uses, progress_set_func
sys.stderr.write("WARNING: LMM standalone version missing the Genenetwork2 environment\n")
- pass
progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal')
@@ -296,8 +295,8 @@ def run_other_old(pheno_vector,
Bench().report()
return p_values, t_stats
-def run_other_new(pheno_vector,
- genotype_matrix,
+def run_other_new(n,m,pheno_vector,
+ geno,
restricted_max_likelihood=True,
refit=False):
@@ -312,8 +311,7 @@ def run_other_new(pheno_vector,
print("REML=",restricted_max_likelihood," REFIT=",refit)
# Adjust phenotypes
- Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True)
- print("Removed missing phenotypes",Y.shape)
+ n,Y,keep = phenotype.remove_missing_new(n,pheno_vector)
# if options.maf_normalization:
# G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
@@ -321,8 +319,9 @@ def run_other_new(pheno_vector,
# if not options.skip_genotype_normalization:
# G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+ geno = geno[:,keep]
with Bench("Calculate Kinship"):
- K,G = calculate_kinship_new(G)
+ K,G = calculate_kinship_new(geno)
print("kinship_matrix: ", pf(K))
print("kinship_matrix.shape: ", pf(K.shape))
@@ -337,9 +336,8 @@ def run_other_new(pheno_vector,
with Bench("Doing GWAS"):
t_stats, p_values = gwas.gwas(Y,
- G.T,
+ G,
K,
- uses,
restricted_max_likelihood=True,
refit=False,verbose=True)
Bench().report()
@@ -378,18 +376,30 @@ def matrixMult(A,B):
return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
+def calculate_kinship_new(genotype_matrix):
+ """
+ Call the new kinship calculation where genotype_matrix contains
+ inds (columns) by snps (rows).
+ """
+ assert type(genotype_matrix) is np.ndarray
+ info("call genotype.normalize")
+ G = np.apply_along_axis( genotype.normalize, axis=1, arr=genotype_matrix)
+ mprint("G",genotype_matrix)
+ info("call calculate_kinship_new")
+ return kinship(G),G # G gets transposed, we'll turn this into an iterator (FIXME)
-def calculate_kinship_new(genotype_matrix, temp_data=None):
+def calculate_kinship_iter(geno):
"""
Call the new kinship calculation where genotype_matrix contains
inds (columns) by snps (rows).
"""
+ assert type(genotype_matrix) is iter
info("call genotype.normalize")
G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix)
info("call calculate_kinship_new")
- return kinship(G.T,uses),G # G gets transposed, we'll turn this into an iterator (FIXME)
+ return kinship(G)
-def calculate_kinship_old(genotype_matrix, temp_data=None):
+def calculate_kinship_old(genotype_matrix):
"""
genotype_matrix is an n x m matrix encoding SNP minor alleles.
@@ -430,7 +440,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
mprint("G (after old normalize)",genotype_matrix.T)
kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m)
return kinship_matrix,genotype_matrix
- # return kinship_full(genotype_matrix.T,uses),genotype_matrix
+ # return kinship_full(genotype_matrix.T),genotype_matrix
def GWAS(pheno_vector,
genotype_matrix,
@@ -586,7 +596,7 @@ class LMM:
# if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
begin = time.time()
# Kva,Kve = linalg.eigh(K)
- Kva,Kve = kvakve(K,uses)
+ Kva,Kve = kvakve(K)
end = time.time()
if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
@@ -794,12 +804,11 @@ class LMM:
pl.ylabel("Probability of data")
pl.title(title)
-
def run_gwas(species,n,m,k,y,geno,cov=None,reml=True,refit=False,inputfn=None,new_code=True):
"""
Invoke pylmm using genotype as a matrix or as a (SNP) iterator.
"""
- info("gwas_without_redis")
+ info("run_gwas")
print('pheno', y)
if species == "human" :
@@ -813,8 +822,8 @@ def run_gwas(species,n,m,k,y,geno,cov=None,reml=True,refit=False,inputfn=None,ne
print('geno', geno.shape, geno)
if new_code:
- ps, ts = run_other_new(pheno_vector = y,
- genotype_matrix = geno,
+ ps, ts = run_other_new(n,m,pheno_vector = y,
+ geno = geno,
restricted_max_likelihood = reml,
refit = refit)
else:
@@ -849,10 +858,20 @@ def gwas_with_redis(key,species,new_code=True):
v = np.array(v)
return v
+ def narrayT(key):
+ m = narray(key)
+ if m is not None:
+ return m.T
+ return m
+
+ # We are transposing before we enter run_gwas - this should happen on the webserver
+ # side (or when reading data from file)
+ k = narray('kinship_matrix')
+ g = narrayT('genotype_matrix')
y = narray('pheno_vector')
n = len(y)
m = params['num_genotypes']
- ps,ts = run_gwas(species,n,m,narray('kinship_matrix'),y,narray('genotype_matrix'),narray('covariate_matrix'),params['restricted_max_likelihood'],params['refit'],params['input_file_name'],new_code)
+ ps,ts = run_gwas(species,n,m,k,y,g,narray('covariate_matrix'),params['restricted_max_likelihood'],params['refit'],params['input_file_name'],new_code)
results_key = "pylmm:results:" + params['temp_uuid']
@@ -864,19 +883,19 @@ def gwas_with_redis(key,species,new_code=True):
Redis.expire(results_key, 60*60)
return ps, ts
-
def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True):
"""
- This function emulates current GN2 behaviour by pre-loading Redis
+ This function emulates current GN2 behaviour by pre-loading Redis (note the input
+ genotype is transposed to emulate GN2 (FIXME!)
"""
- print("Loading Redis from parsed data")
+ info("Loading Redis from parsed data")
if kinship == None:
k = None
else:
k = kinship.tolist()
params = dict(pheno_vector = pheno.tolist(),
- genotype_matrix = geno.tolist(),
- num_genotypes = geno.shape[1],
+ genotype_matrix = geno.T.tolist(),
+ num_genotypes = geno.shape[0],
kinship_matrix = k,
covariate_matrix = None,
input_file_name = None,
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
index d67e1205..358bf27e 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -32,7 +32,6 @@ except ImportError:
sys.stderr.write("WARNING: LMM2 standalone version missing the Genenetwork2 environment\n")
has_gn2=False
from standalone import uses
- pass
def calculateKinship(W,center=False):
"""
@@ -149,28 +148,32 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
class LMM2:
- """
- This is a simple version of EMMA/fastLMM.
- The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
- and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
- There are three model parameters: heritability (h), covariate coefficients (beta) and the total
- phenotypic variance (sigma).
- Heritability as defined here is the proportion of the total variance (sigma) that is attributed to
- the kinship matrix.
-
- For simplicity, we assume that everything being input is a numpy array.
- If this is not the case, the module may throw an error as conversion from list to numpy array
- is not done consistently.
+ """This is a simple version of EMMA/fastLMM.
+
+ The main purpose of this module is to take a phenotype vector (Y),
+ a set of covariates (X) and a kinship matrix (K) and to optimize
+ this model by finding the maximum-likelihood estimates for the
+ model parameters. There are three model parameters: heritability
+ (h), covariate coefficients (beta) and the total phenotypic
+ variance (sigma). Heritability as defined here is the proportion
+ of the total variance (sigma) that is attributed to the kinship
+ matrix.
+
+ For simplicity, we assume that everything being input is a numpy
+ array. If this is not the case, the module may throw an error as
+ conversion from list to numpy array is not done consistently.
"""
def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False):
- """
- The constructor takes a phenotype vector or array Y of size n.
- It takes a kinship matrix K of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
- If they are not provided, the constructor will calculate them.
- X0 is an optional covariate matrix of size n x q, where there are q covariates.
- When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
+ """The constructor takes a phenotype vector or array Y of size n. It
+ takes a kinship matrix K of size n x n. Kva and Kve can be
+ computed as Kva,Kve = linalg.eigh(K) and cached. If they are
+ not provided, the constructor will calculate them. X0 is an
+ optional covariate matrix of size n x q, where there are q
+ covariates. When this parameter is not provided, the
+ constructor will set X0 to an n x 1 matrix of all ones to
+ represent a mean effect.
"""
if X0 is None:
@@ -194,7 +197,7 @@ class LMM2:
# if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
begin = time.time()
# Kva,Kve = linalg.eigh(K)
- Kva,Kve = kinship.kvakve(K,uses)
+ Kva,Kve = kinship.kvakve(K)
end = time.time()
if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
index 4c8175f7..7b652515 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -19,22 +19,47 @@
import sys
import numpy as np
-def remove_missing(y,g,verbose=False):
+# ---- A trick to decide on the environment:
+try:
+ from wqflask.my_pylmm.pyLMM import chunks
+ from gn2 import uses, progress_set_func
+except ImportError:
+ has_gn2=False
+ import standalone as handlers
+ from standalone import uses, progress_set_func
+
+progress,debug,info,mprint = uses('progress','debug','info','mprint')
+
+def remove_missing(n,y,g):
"""
Remove missing data from matrices, make sure the genotype data has
individuals as rows
"""
assert(y is not None)
- assert(y.shape[0] == g.shape[0])
+ assert y.shape[0] == g.shape[0],"y (n) %d, g (n,m) %s" % (y.shape[0],g.shape)
y1 = y
g1 = g
v = np.isnan(y)
keep = True - v
if v.sum():
- if verbose:
- sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
+ info("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
y1 = y[keep]
g1 = g[keep,:]
- return y1,g1,keep
+ n = y1.shape[0]
+ return n,y1,g1,keep
+
+def remove_missing_new(n,y):
+ """
+ Remove missing data. Returns new n,y,keep
+ """
+ assert(y is not None)
+ y1 = y
+ v = np.isnan(y)
+ keep = True - v
+ if v.sum():
+ info("runlmm.py: Cleaning the phenotype vector by removing %d individuals" % (v.sum()))
+ y1 = y[keep]
+ n = y1.shape[0]
+ return n,y1,keep
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 2d02e195..d248dee2 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -131,16 +131,15 @@ def check_results(ps,ts):
if cmd == 'run':
if options.remove_missing_phenotypes:
raise Exception('Can not use --remove-missing-phenotypes with LMM2')
- snp_iterator = tsvreader.geno_iter(options.geno)
n = len(y)
m = g.shape[1]
- ps, ts = run_gwas('other',n,m,k,y,g.T)
+ ps, ts = run_gwas('other',n,m,k,y,g) # <--- pass in geno by SNP
check_results(ps,ts)
elif cmd == 'iterator':
if options.remove_missing_phenotypes:
raise Exception('Can not use --remove-missing-phenotypes with LMM2')
- snp_iterator = tsvreader.geno_iter(options.geno)
- ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,snp_iterator)
+ geno_iterator = tsvreader.geno_iter(options.geno)
+ ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,geno_iterator)
check_results(ps,ts)
elif cmd == 'redis_new':
# The main difference between redis_new and redis is that missing
@@ -150,10 +149,9 @@ elif cmd == 'redis_new':
Y = y
G = g
print "Original G",G.shape, "\n", G
-
- gt = G.T
- G = None
- ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True)
+ # gt = G.T
+ # G = None
+ ps, ts = gn2_load_redis('testrun','other',k,Y,G,new_code=True)
check_results(ps,ts)
elif cmd == 'redis':
# Emulating the redis setup of GN2
@@ -174,9 +172,10 @@ elif cmd == 'redis':
g = None
gnt = None
- gt = G.T
- G = None
- ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False)
+ # gt = G.T
+ # G = None
+ mprint("G",G)
+ ps, ts = gn2_load_redis('testrun','other',k,Y,G, new_code=False)
check_results(ps,ts)
elif cmd == 'kinship':
G = g