aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPjotr Prins2015-04-13 08:17:24 +0000
committerPjotr Prins2015-04-13 08:17:24 +0000
commitabbd30c0f73184ef22a0501c8e94436242b6b669 (patch)
tree651b5b6eaf5570db3402bca9cbcca09fe658bccd
parent102523493e2f8a7660c63f117f1d8dfd009eff02 (diff)
parent17f453e50ebac657d9f3096811d92bedc9bfc064 (diff)
downloadgenenetwork2-abbd30c0f73184ef22a0501c8e94436242b6b669.tar.gz
Merge branch 'lmm' of github.com:pjotrp/genenetwork2 into lmm
-rw-r--r--wqflask/wqflask/my_pylmm/README.md45
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/__init__.py2
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/genotype.py19
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/gn2.py98
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/gwas.py80
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/kinship.py83
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/lmm.py266
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/lmm2.py61
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/phenotype.py37
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/runlmm.py103
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/standalone.py110
-rw-r--r--wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py45
12 files changed, 658 insertions, 291 deletions
diff --git a/wqflask/wqflask/my_pylmm/README.md b/wqflask/wqflask/my_pylmm/README.md
index f6b0e72d..4845ec03 100644
--- a/wqflask/wqflask/my_pylmm/README.md
+++ b/wqflask/wqflask/my_pylmm/README.md
@@ -1,21 +1,30 @@
-# RELEASE NOTES
-
-## 0.50-gn2-pre1 release
-
-This is the first test release of multi-core pylmm into GN2. Both
-kinship calculation and GWAS have been made multi-threaded by
-introducing the Python multiprocessing module. Note that only
-run_other has been updated to use the new routines (so human is still
-handled the old way). I have taken care that we can still run both
-old-style and new-style LMM (through passing the 'new_code'
-boolean). This could be an option in the web server for users to
-select and test for any unexpected differences (of which there should
-be none, naturally ;).
-
-The current version can handle missing phenotypes, but as they are
-removed there is no way for GN2 to know what SNPs the P-values belong
-to. A future version will pass a SNP index to allow for missing
-phenotypes.
+# Genenetwork2/pylmm RELEASE NOTES
+
+## 0.50-gn2 (April 2nd, 2015)
+
+- Replaced the GN2 genotype normalization
+
+## 0.50-gn2-pre2 (March 18, 2015)
+
+- Added abstractions for progress meter and info/debug statements;
+ Redis perc_complete is now updated through a lambda
+
+## 0.50-gn2-pre1 (release, March 17, 2015)
+
+- This is the first test release of multi-core pylmm into GN2. Both
+ kinship calculation and GWAS have been made multi-threaded by
+ introducing the Python multiprocessing module. Note that only
+ run_other has been updated to use the new routines (so human is
+ still handled the old way). I have taken care that we can still run
+ both old-style and new-style LMM (through passing the 'new_code'
+ boolean). This could be an option in the web server for users to
+ select and test for any unexpected differences (of which there
+ should be none, naturally ;).
+
+- The current version can handle missing phenotypes, but as they are
+ removed there is no way for GN2 to know what SNPs the P-values
+ belong to. A future version will pass a SNP index to allow for
+ missing phenotypes.
\ No newline at end of file
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
index c40c3221..6ab60d02 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
@@ -1 +1 @@
-PYLMM_VERSION="0.50-gn2-pre1"
+PYLMM_VERSION="0.50-gn2-pre2"
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
index 315fd824..49f32e3a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -37,14 +37,15 @@ def normalize(ind_g):
Run for every SNP list (for one individual) and return
normalized SNP genotype values with missing data filled in
"""
- g1 = np.copy(ind_g) # avoid side effects
- x = True - np.isnan(ind_g) # Matrix of True/False
- m = ind_g[x].mean() # Global mean value
- s = np.sqrt(ind_g[x].var()) # Global stddev
- g1[np.isnan(ind_g)] = m # Plug-in mean values for missing data
- if s == 0:
- g1 = g1 - m # Subtract the mean
+ g = np.copy(ind_g) # copy to avoid side effects
+ missing = np.isnan(g)
+ values = g[True - missing]
+ mean = values.mean() # Global mean value
+ stddev = np.sqrt(values.var()) # Global stddev
+ g[missing] = mean # Plug-in mean values for missing data
+ if stddev == 0:
+ g = g - mean # Subtract the mean
else:
- g1 = (g1 - m) / s # Normalize the deviation
- return g1
+ g = (g - mean) / stddev # Normalize the deviation
+ return g
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gn2.py b/wqflask/wqflask/my_pylmm/pyLMM/gn2.py
new file mode 100644
index 00000000..7bceb089
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gn2.py
@@ -0,0 +1,98 @@
+# Genenetwork2 specific methods and callback handler
+#
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+#
+
+from __future__ import absolute_import, print_function, division
+
+import numpy as np
+import sys
+import logging
+
+# logging.basicConfig(level=logging.DEBUG)
+# np.set_printoptions()
+
+progress_location = None
+progress_current = None
+progress_prev_perc = None
+
+def progress_default_func(location,count,total):
+ global progress_current
+ value = round(count*100.0/total)
+ progress_current = value
+
+progress_func = progress_default_func
+
+def progress_set_func(func):
+ global progress_func
+ progress_func = func
+
+def progress(location, count, total):
+ global progress_location
+ global progress_prev_perc
+
+ perc = round(count*100.0/total)
+ if perc != progress_prev_perc and (location != progress_location or perc > 98 or perc > progress_prev_perc + 5):
+ progress_func(location, count, total)
+ logger.info("Progress: %s %d%%" % (location,perc))
+ progress_location = location
+ progress_prev_perc = perc
+
+def mprint(msg,data):
+ """
+ Array/matrix print function
+ """
+ m = np.array(data)
+ print(msg,m.shape,"=\n",m)
+
+def fatal(msg):
+ logger.critical(msg)
+ raise Exception(msg)
+
+def callbacks():
+ return dict(
+ write = sys.stdout.write,
+ writeln = print,
+ debug = logger.debug,
+ info = logger.info,
+ warning = logger.warning,
+ error = logger.error,
+ critical = logger.critical,
+ fatal = fatal,
+ progress = progress,
+ mprint = mprint
+ )
+
+def uses(*funcs):
+ """
+ Some sugar
+ """
+ return [callbacks()[func] for func in funcs]
+
+# ----- Minor test cases:
+
+if __name__ == '__main__':
+ # logging.basicConfig(level=logging.DEBUG)
+ logging.debug("Test %i" % (1))
+ d = callbacks()['debug']
+ d("TEST")
+ wrln = callbacks()['writeln']
+ wrln("Hello %i" % 34)
+ progress = callbacks()['progress']
+ progress("I am half way",50,100)
+ list = [0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15,
+ 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15,
+ 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15,
+ 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15,
+ 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15]
+ mprint("list",list)
+ matrix = [[1,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [2,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [3,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [4,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [5,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [6,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15]]
+ mprint("matrix",matrix)
+ ix,dx = uses("info","debug")
+ ix("ix")
+ dx("dx")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index b901c0e2..247a8729 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -20,7 +20,6 @@
import pdb
import time
import sys
-# from utility import temp_data
import lmm2
import os
@@ -32,16 +31,25 @@ from lmm2 import LMM2
import multiprocessing as mp # Multiprocessing is part of the Python stdlib
import Queue
+# ---- A trick to decide on the environment:
+try:
+ from wqflask.my_pylmm.pyLMM import chunks
+ from gn2 import uses
+except ImportError:
+ has_gn2=False
+ from standalone import uses
+
+progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal')
+
+
def formatResult(id,beta,betaSD,ts,ps):
return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
def compute_snp(j,n,snp_ids,lmm2,REML,q = None):
- # print("COMPUTE SNP",j,snp_ids,"\n")
result = []
for snp_id in snp_ids:
snp,id = snp_id
x = snp.reshape((n,1)) # all the SNPs
- # print "X=",x
# if refit:
# L.fit(X=snp,REML=REML)
ts,ps,beta,betaVar = lmm2.association(x,REML=REML,returnBeta=True)
@@ -51,33 +59,28 @@ def compute_snp(j,n,snp_ids,lmm2,REML,q = None):
q = compute_snp.q
q.put([j,result])
return j
- # PS.append(ps)
- # TS.append(ts)
- # return len(result)
- # compute.q.put(result)
- # return None
def f_init(q):
compute_snp.q = q
def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
"""
- Execute a GWAS. The G matrix should be n inds (cols) x m snps (rows)
+ GWAS. The G matrix should be n inds (cols) x m snps (rows)
"""
+ info("In gwas.gwas")
matrix_initialize()
cpu_num = mp.cpu_count()
numThreads = None # for now use all available threads
kfile2 = False
reml = restricted_max_likelihood
- sys.stderr.write(str(G.shape)+"\n")
+ mprint("G",G)
n = G.shape[1] # inds
inds = n
m = G.shape[0] # snps
snps = m
- sys.stderr.write(str(m)+" SNPs\n")
- # print "***** GWAS: G",G.shape,G
- assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
+ info("%s SNPs",snps)
+ assert snps>=inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
# CREATE LMM object for association
# if not kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=verbose)
@@ -85,19 +88,10 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
lmm2 = LMM2(Y,K) # ,Kva,Kve,X0,verbose=verbose)
if not refit:
- if verbose: sys.stderr.write("Computing fit for null model\n")
+ info("Computing fit for null model")
lmm2.fit() # follow GN model in run_other
- if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (lmm2.optH,lmm2.optSigma))
-
- # outFile = "test.out"
- # out = open(outFile,'w')
- out = sys.stderr
-
- def outputResult(id,beta,betaSD,ts,ps):
- out.write(formatResult(id,beta,betaSD,ts,ps))
- def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
-
- # printOutHead()
+ info("heritability=%0.3f, sigma=%0.3f" % (lmm2.optH,lmm2.optSigma))
+
res = []
# Set up the pool
@@ -106,26 +100,24 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
p = mp.Pool(numThreads, f_init, [q])
collect = []
- # Buffers for pvalues and t-stats
- # PS = []
- # TS = []
count = 0
job = 0
jobs_running = 0
+ jobs_completed = 0
for snp in G:
snp_id = (snp,'SNPID')
count += 1
if count % 1000 == 0:
job += 1
- if verbose:
- sys.stderr.write("Job %d At SNP %d\n" % (job,count))
+ debug("Job %d At SNP %d" % (job,count))
if numThreads == 1:
- print "Running on 1 THREAD"
+ debug("Running on 1 THREAD")
compute_snp(job,n,collect,lmm2,reml,q)
collect = []
j,lst = q.get()
- if verbose:
- sys.stderr.write("Job "+str(j)+" finished\n")
+ debug("Job "+str(j)+" finished")
+ jobs_completed += 1
+ progress("GWAS2",jobs_completed,snps/1000)
res.append((j,lst))
else:
p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
@@ -134,8 +126,9 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
while jobs_running > cpu_num:
try:
j,lst = q.get_nowait()
- if verbose:
- sys.stderr.write("Job "+str(j)+" finished\n")
+ debug("Job "+str(j)+" finished")
+ jobs_completed += 1
+ progress("GWAS2",jobs_completed,snps/1000)
res.append((j,lst))
jobs_running -= 1
except Queue.Empty:
@@ -150,24 +143,23 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
if numThreads==1 or count<1000 or len(collect)>0:
job += 1
- print "Collect final batch size %i job %i @%i: " % (len(collect), job, count)
+ debug("Collect final batch size %i job %i @%i: " % (len(collect), job, count))
compute_snp(job,n,collect,lmm2,reml,q)
collect = []
j,lst = q.get()
res.append((j,lst))
- print "count=",count," running=",jobs_running," collect=",len(collect)
+ debug("count=%i running=%i collect=%i" % (count,jobs_running,len(collect)))
for job in range(jobs_running):
j,lst = q.get(True,15) # time out
- if verbose:
- sys.stderr.write("Job "+str(j)+" finished\n")
+ debug("Job "+str(j)+" finished")
+ jobs_completed += 1
+ progress("GWAS2",jobs_completed,snps/1000)
res.append((j,lst))
- print "Before sort",[res1[0] for res1 in res]
+ mprint("Before sort",[res1[0] for res1 in res])
res = sorted(res,key=lambda x: x[0])
- # if verbose:
- # print "res=",res[0][0:10]
- print "After sort",[res1[0] for res1 in res]
- print [len(res1[1]) for res1 in res]
+ mprint("After sort",[res1[0] for res1 in res])
+ info([len(res1[1]) for res1 in res])
ts = [item[0] for j,res1 in res for item in res1]
ps = [item[1] for j,res1 in res for item in res1]
return ts,ps
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 0c43587e..1c157fd8 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -28,17 +28,30 @@ import time
from optmatrix import matrix_initialize, matrixMultT
+# ---- A trick to decide on the environment:
+try:
+ from wqflask.my_pylmm.pyLMM import chunks
+ from gn2 import uses, progress_set_func
+except ImportError:
+ has_gn2=False
+ import standalone as handlers
+ from standalone import uses, progress_set_func
+
+progress,debug,info,mprint = uses('progress','debug','info','mprint')
+
def kinship_full(G):
"""
Calculate the Kinship matrix using a full dot multiplication
"""
- print G.shape
+ # mprint("kinship_full G",G)
m = G.shape[0] # snps
n = G.shape[1] # inds
- sys.stderr.write(str(m)+" SNPs\n")
- assert m>n, "n should be larger than m (snps>inds)"
- m = np.dot(G.T,G)
+ info("%d SNPs",m)
+ assert m>n, "n should be larger than m (%d snps > %d inds)" % (m,n)
+ # m = np.dot(G.T,G)
+ m = matrixMultT(G.T)
m = m/G.shape[0]
+ # mprint("kinship_full K",m)
return m
def compute_W(job,G,n,snps,compute_size):
@@ -74,46 +87,38 @@ def f_init(q):
# Calculate the kinship matrix from G (SNPs as rows!), returns K
#
-def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True):
- numThreads = None
- if numThreads:
- numThreads = int(numThreads)
+def kinship(G,computeSize=1000,numThreads=None,useBLAS=False):
+
matrix_initialize(useBLAS)
-
- sys.stderr.write(str(G.shape)+"\n")
+
+ mprint("G",G)
n = G.shape[1] # inds
inds = n
m = G.shape[0] # snps
snps = m
- sys.stderr.write(str(m)+" SNPs\n")
- assert snps>inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds)
+ info("%i SNPs" % (m))
+ assert snps>=inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds)
q = mp.Queue()
p = mp.Pool(numThreads, f_init, [q])
cpu_num = mp.cpu_count()
- print "CPU cores:",cpu_num
- print snps,computeSize
+ info("CPU cores: %i" % cpu_num)
iterations = snps/computeSize+1
- # if testing:
- # iterations = 8
- # jobs = range(0,8) # range(0,iterations)
results = []
-
K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals
completed = 0
for job in range(iterations):
- if verbose:
- sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*computeSize)))
+ info("Processing job %d first %d SNPs" % (job, ((job+1)*computeSize)))
W = compute_W(job,G,n,snps,computeSize)
if numThreads == 1:
# Single-core
compute_matrixMult(job,W,q)
j,x = q.get()
- if verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ debug("Job "+str(j)+" finished")
+ progress("kinship",j,iterations)
K_j = x
- # print j,K_j[:,0]
K = K + K_j
else:
# Multi-core
@@ -123,52 +128,38 @@ def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True):
time.sleep(0.1)
try:
j,x = q.get_nowait()
- if verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ debug("Job "+str(j)+" finished")
K_j = x
- # print j,K_j[:,0]
K = K + K_j
completed += 1
+ progress("kinship",completed,iterations)
except Queue.Empty:
pass
if numThreads == None or numThreads > 1:
- # results contains the growing result set
for job in range(len(results)-completed):
j,x = q.get(True,15)
- if verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ debug("Job "+str(j)+" finished")
K_j = x
- # print j,K_j[:,0]
K = K + K_j
completed += 1
+ progress("kinship",completed,iterations)
K = K / float(snps)
-
- # outFile = 'runtest.kin'
- # if verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
- # np.savetxt(outFile,K)
-
- # if saveKvaKve:
- # if verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
- # Kva,Kve = linalg.eigh(K)
- # if verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
- # np.savetxt(outFile+".kva",Kva)
- # np.savetxt(outFile+".kve",Kve)
return K
-def kvakve(K, verbose=True):
+def kvakve(K):
"""
Obtain eigendecomposition for K and return Kva,Kve where Kva is cleaned
of small values < 1e-6 (notably smaller than zero)
"""
- if verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
-
+ info("Obtaining eigendecomposition for %dx%d matrix" % (K.shape[0],K.shape[1]) )
Kva,Kve = linalg.eigh(K)
- if verbose:
- print("Kva is: ", Kva.shape, Kva)
- print("Kve is: ", Kve.shape, Kve)
+ mprint("Kva",Kva)
+ mprint("Kve",Kve)
- if sum(Kva < 1e-6):
- if verbose: sys.stderr.write("Cleaning %d eigen values (Kva<0)\n" % (sum(Kva < 0)))
+ if sum(Kva < 0):
+ info("Cleaning %d eigen values (Kva<0)" % (sum(Kva < 0)))
Kva[Kva < 1e-6] = 1e-6
return Kva,Kve
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 53689071..b2067b27 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -56,12 +56,17 @@ import genotype
import phenotype
import gwas
+# ---- A trick to decide on the environment:
try:
from wqflask.my_pylmm.pyLMM import chunks
+ from gn2 import uses, progress_set_func
except ImportError:
- print("WARNING: Standalone version missing the Genenetwork2 environment\n")
has_gn2=False
- pass
+ import standalone as handlers
+ from standalone import uses, progress_set_func
+ sys.stderr.write("WARNING: LMM standalone version missing the Genenetwork2 environment\n")
+
+progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal')
#np.seterr('raise')
@@ -76,8 +81,7 @@ def run_human(pheno_vector,
covariate_matrix,
plink_input_file,
kinship_matrix,
- refit=False,
- tempdata=None):
+ refit=False):
v = np.isnan(pheno_vector)
keep = True - v
@@ -169,10 +173,7 @@ def run_human(pheno_vector,
#if count > 1000:
# break
count += 1
-
- percent_complete = (float(count) / total_snps) * 100
- #print("percent_complete: ", percent_complete)
- tempdata.store("percent_complete", percent_complete)
+ progress("human",count,total_snps)
#with Bench("actual association"):
ps, ts = human_association(snp,
@@ -260,23 +261,19 @@ def human_association(snp,
def run_other_old(pheno_vector,
genotype_matrix,
restricted_max_likelihood=True,
- refit=False,
- tempdata=None # <---- can not be None
- ):
+ refit=False):
"""Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
restricted_max_likelihood -- whether to use restricted max likelihood; True or False
refit -- whether to refit the variance component for each marker
- temp_data -- TempData object that stores the progress for each major step of the
- calculations ("calculate_kinship" and "GWAS" take the majority of time)
"""
print("Running the original LMM engine in run_other (old)")
print("REML=",restricted_max_likelihood," REFIT=",refit)
with Bench("Calculate Kinship"):
- kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata)
+ kinship_matrix,genotype_matrix = calculate_kinship_new(genotype_matrix)
print("kinship_matrix: ", pf(kinship_matrix))
print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
@@ -292,27 +289,22 @@ def run_other_old(pheno_vector,
with Bench("Doing GWAS"):
t_stats, p_values = GWAS(pheno_vector,
- genotype_matrix,
+ genotype_matrix.T,
kinship_matrix,
restricted_max_likelihood=True,
- refit=False,
- temp_data=tempdata)
+ refit=False)
Bench().report()
return p_values, t_stats
-def run_other_new(pheno_vector,
- genotype_matrix,
+def run_other_new(n,m,pheno_vector,
+ geno,
restricted_max_likelihood=True,
- refit=False,
- tempdata=None # <---- can not be None
- ):
+ refit=False):
"""Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
restricted_max_likelihood -- whether to use restricted max likelihood; True or False
refit -- whether to refit the variance component for each marker
- temp_data -- TempData object that stores the progress for each major step of the
- calculations ("calculate_kinship" and "GWAS" take the majority of time)
"""
@@ -320,8 +312,7 @@ def run_other_new(pheno_vector,
print("REML=",restricted_max_likelihood," REFIT=",refit)
# Adjust phenotypes
- Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True)
- print("Removed missing phenotypes",Y.shape)
+ n,Y,keep = phenotype.remove_missing_new(n,pheno_vector)
# if options.maf_normalization:
# G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
@@ -329,8 +320,9 @@ def run_other_new(pheno_vector,
# if not options.skip_genotype_normalization:
# G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+ geno = geno[:,keep]
with Bench("Calculate Kinship"):
- K,G = calculate_kinship(G, tempdata)
+ K,G = calculate_kinship_new(geno)
print("kinship_matrix: ", pf(K))
print("kinship_matrix.shape: ", pf(K.shape))
@@ -345,7 +337,7 @@ def run_other_new(pheno_vector,
with Bench("Doing GWAS"):
t_stats, p_values = gwas.gwas(Y,
- G.T,
+ G,
K,
restricted_max_likelihood=True,
refit=False,verbose=True)
@@ -385,18 +377,30 @@ def matrixMult(A,B):
return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
-
-def calculate_kinship_new(genotype_matrix, temp_data=None):
+def calculate_kinship_new(genotype_matrix):
+ """
+ Call the new kinship calculation where genotype_matrix contains
+ inds (columns) by snps (rows).
+ """
+ assert type(genotype_matrix) is np.ndarray
+ info("call genotype.normalize")
+ G = np.apply_along_axis( genotype.normalize, axis=1, arr=genotype_matrix)
+ mprint("G",genotype_matrix)
+ info("call calculate_kinship_new")
+ return kinship(G),G # G gets transposed, we'll turn this into an iterator (FIXME)
+
+def calculate_kinship_iter(geno):
"""
Call the new kinship calculation where genotype_matrix contains
inds (columns) by snps (rows).
"""
- print("call genotype.normalize")
+ assert type(genotype_matrix) is iter
+ info("call genotype.normalize")
G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix)
- print("call calculate_kinship_new")
- return kinship(G.T),G # G gets transposed, we'll turn this into an iterator (FIXME)
+ info("call calculate_kinship_new")
+ return kinship(G)
-def calculate_kinship_old(genotype_matrix, temp_data=None):
+def calculate_kinship_old(genotype_matrix):
"""
genotype_matrix is an n x m matrix encoding SNP minor alleles.
@@ -404,13 +408,15 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
normalizes the resulting vectors and returns the RRM matrix.
"""
- print("call calculate_kinship_old")
+ info("call calculate_kinship_old")
+ fatal("THE FUNCTION calculate_kinship_old IS OBSOLETE, use calculate_kinship_new instead - see Genotype Normalization Problem on Pjotr's blog")
n = genotype_matrix.shape[0]
m = genotype_matrix.shape[1]
- print("genotype 2D matrix n (inds) is:", n)
- print("genotype 2D matrix m (snps) is:", m)
+ info("genotype 2D matrix n (inds) is: %d" % (n))
+ info("genotype 2D matrix m (snps) is: %d" % (m))
assert m>n, "n should be larger than m (snps>inds)"
keep = []
+ mprint("G (before old normalize)",genotype_matrix)
for counter in range(m):
#print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter]))
#Checks if any values in column are not numbers
@@ -429,17 +435,13 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
continue
keep.append(counter)
genotype_matrix[:,counter] = (genotype_matrix[:,counter] - values_mean) / np.sqrt(vr)
-
- percent_complete = int(round((counter/m)*45))
- if temp_data != None:
- temp_data.store("percent_complete", percent_complete)
+ progress('kinship_old normalize genotype',counter,m)
genotype_matrix = genotype_matrix[:,keep]
- print("After kinship (old) genotype_matrix: ", pf(genotype_matrix))
+ mprint("G (after old normalize)",genotype_matrix.T)
kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m)
return kinship_matrix,genotype_matrix
-
-calculate_kinship = calculate_kinship_new # alias
+ # return kinship_full(genotype_matrix.T),genotype_matrix
def GWAS(pheno_vector,
genotype_matrix,
@@ -465,9 +467,9 @@ def GWAS(pheno_vector,
refit - refit the variance component for each SNP
"""
- if kinship_eigen_vals == None:
+ if kinship_eigen_vals is None:
kinship_eigen_vals = []
- if kinship_eigen_vectors == None:
+ if kinship_eigen_vectors is None:
kinship_eigen_vectors = []
n = genotype_matrix.shape[0]
@@ -537,9 +539,8 @@ def GWAS(pheno_vector,
lmm_ob.fit(X=x)
ts, ps, beta, betaVar = lmm_ob.association(x, REML=restricted_max_likelihood)
- percent_complete = 45 + int(round((counter/m)*55))
- temp_data.store("percent_complete", percent_complete)
-
+ progress("gwas_old",counter,m)
+
p_values.append(ps)
t_statistics.append(ts)
@@ -572,7 +573,7 @@ class LMM:
When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
"""
- if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1)
+ if X0 is None: X0 = np.ones(len(Y)).reshape(len(Y),1)
self.verbose = verbose
#x = Y != -9
@@ -665,7 +666,7 @@ class LMM:
REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True.
"""
- if X == None:
+ if X is None:
X = self.X0t
elif stack:
self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
@@ -804,45 +805,74 @@ class LMM:
pl.ylabel("Probability of data")
pl.title(title)
-
-def gn2_redis(key,species,new_code=True):
+def run_gwas(species,n,m,k,y,geno,cov=None,reml=True,refit=False,inputfn=None,new_code=True):
"""
- Invoke pylmm using Redis as a container. new_code runs the new
- version
+ Invoke pylmm using genotype as a matrix or as a (SNP) iterator.
"""
- json_params = Redis.get(key)
-
- params = json.loads(json_params)
-
- tempdata = temp_data.TempData(params['temp_uuid'])
-
-
- print('pheno', np.array(params['pheno_vector']))
+ info("run_gwas")
+ print('pheno', y)
if species == "human" :
- print('kinship', np.array(params['kinship_matrix']))
- ps, ts = run_human(pheno_vector = np.array(params['pheno_vector']),
- covariate_matrix = np.array(params['covariate_matrix']),
- plink_input_file = params['input_file_name'],
- kinship_matrix = np.array(params['kinship_matrix']),
- refit = params['refit'],
- tempdata = tempdata)
+ print('kinship', k )
+ ps, ts = run_human(pheno_vector = y,
+ covariate_matrix = cov,
+ plink_input_file = inputfn,
+ kinship_matrix = k,
+ refit = refit)
else:
- geno = np.array(params['genotype_matrix'])
print('geno', geno.shape, geno)
if new_code:
- ps, ts = run_other_new(pheno_vector = np.array(params['pheno_vector']),
- genotype_matrix = geno,
- restricted_max_likelihood = params['restricted_max_likelihood'],
- refit = params['refit'],
- tempdata = tempdata)
+ ps, ts = run_other_new(n,m,pheno_vector = y,
+ geno = geno,
+ restricted_max_likelihood = reml,
+ refit = refit)
else:
- ps, ts = run_other_old(pheno_vector = np.array(params['pheno_vector']),
+ ps, ts = run_other_old(pheno_vector = y,
genotype_matrix = geno,
- restricted_max_likelihood = params['restricted_max_likelihood'],
- refit = params['refit'],
- tempdata = tempdata)
+ restricted_max_likelihood = reml,
+ refit = refit)
+ return ps,ts
+
+def gwas_with_redis(key,species,new_code=True):
+ """
+ Invoke pylmm using Redis as a container. new_code runs the new
+ version. All the Redis code goes here!
+ """
+ json_params = Redis.get(key)
+
+ params = json.loads(json_params)
+
+ tempdata = temp_data.TempData(params['temp_uuid'])
+ def update_tempdata(loc,i,total):
+ """
+ This is the single method that updates Redis for percentage complete!
+ """
+ tempdata.store("percent_complete",round(i*100.0/total))
+ debug("Updating REDIS percent_complete=%d" % (round(i*100.0/total)))
+ progress_set_func(update_tempdata)
+
+ def narray(key):
+ print(key)
+ v = params[key]
+ if v is not None:
+ v = np.array(v)
+ return v
+
+ def narrayT(key):
+ m = narray(key)
+ if m is not None:
+ return m.T
+ return m
+
+ # We are transposing before we enter run_gwas - this should happen on the webserver
+ # side (or when reading data from file)
+ k = narray('kinship_matrix')
+ g = narrayT('genotype_matrix')
+ y = narray('pheno_vector')
+ n = len(y)
+ m = params['num_genotypes']
+ ps,ts = run_gwas(species,n,m,k,y,g,narray('covariate_matrix'),params['restricted_max_likelihood'],params['refit'],params['input_file_name'],new_code)
results_key = "pylmm:results:" + params['temp_uuid']
@@ -854,28 +884,56 @@ def gn2_redis(key,species,new_code=True):
Redis.expire(results_key, 60*60)
return ps, ts
-# This is the main function used by Genenetwork2 (with environment)
-def gn2_main():
- parser = argparse.ArgumentParser(description='Run pyLMM')
- parser.add_argument('-k', '--key')
- parser.add_argument('-s', '--species')
-
- opts = parser.parse_args()
-
- key = opts.key
- species = opts.species
+def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True):
+ """
+ This function emulates current GN2 behaviour by pre-loading Redis (note the input
+ genotype is transposed to emulate GN2 (FIXME!)
+ """
+ info("Loading Redis from parsed data")
+ if kinship == None:
+ k = None
+ else:
+ k = kinship.tolist()
+ params = dict(pheno_vector = pheno.tolist(),
+ genotype_matrix = geno.T.tolist(),
+ num_genotypes = geno.shape[0],
+ kinship_matrix = k,
+ covariate_matrix = None,
+ input_file_name = None,
+ restricted_max_likelihood = True,
+ refit = False,
+ temp_uuid = "testrun_temp_uuid",
+
+ # meta data
+ timestamp = datetime.datetime.now().isoformat())
+
+ json_params = json.dumps(params)
+ Redis.set(key, json_params)
+ Redis.expire(key, 60*60)
- gn2_redis(key,species)
+ return gwas_with_redis(key,species,new_code)
-def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True):
- print("Loading Redis from parsed data")
+def gn2_load_redis_iter(key,species,kinship,pheno,geno_iterator):
+ """
+ This function emulates GN2 behaviour by pre-loading Redis with
+ a SNP iterator, for this it sets a key for every genotype (SNP)
+ """
+ print("Loading Redis using a SNP iterator")
+ for i,genotypes in enumerate(geno_iterator):
+ gkey = key+'_geno_'+str(i)
+ Redis.set(gkey, genotypes)
+ Redis.expire(gkey, 60*60)
+
if kinship == None:
k = None
else:
k = kinship.tolist()
params = dict(pheno_vector = pheno.tolist(),
- genotype_matrix = geno.tolist(),
- kinship_matrix= k,
+ genotype_matrix = "iterator",
+ num_genotypes = i,
+ kinship_matrix = k,
+ covariate_matrix = None,
+ input_file_name = None,
restricted_max_likelihood = True,
refit = False,
temp_uuid = "testrun_temp_uuid",
@@ -887,9 +945,25 @@ def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True):
json_params = json.dumps(params)
Redis.set(key, json_params)
Redis.expire(key, 60*60)
+ return gwas_with_redis(key,species)
- return gn2_redis(key,species,new_code)
+# This is the main function used by Genenetwork2 (with environment)
+#
+# Note that this calling route will become OBSOLETE (we should use runlmm.py
+# instead)
+def gn2_main():
+ parser = argparse.ArgumentParser(description='Run pyLMM')
+ parser.add_argument('-k', '--key')
+ parser.add_argument('-s', '--species')
+ opts = parser.parse_args()
+
+ key = opts.key
+ species = opts.species
+
+ gwas_with_redis(key,species)
+
+
if __name__ == '__main__':
print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!")
if has_gn2:
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
index d4b3ac82..358bf27e 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -24,6 +24,15 @@ from scipy import optimize
from optmatrix import matrixMult
import kinship
+# ---- A trick to decide on the environment:
+try:
+ from wqflask.my_pylmm.pyLMM import chunks
+ from gn2 import uses
+except ImportError:
+ sys.stderr.write("WARNING: LMM2 standalone version missing the Genenetwork2 environment\n")
+ has_gn2=False
+ from standalone import uses
+
def calculateKinship(W,center=False):
"""
W is an n x m matrix encoding SNP minor alleles.
@@ -75,7 +84,7 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
print("genotype matrix n is:", n)
print("genotype matrix m is:", m)
- if X0 == None:
+ if X0 is None:
X0 = np.ones((n,1))
# Remove missing values in Y and adjust associated parameters
@@ -139,31 +148,35 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
class LMM2:
- """
- This is a simple version of EMMA/fastLMM.
- The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
- and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
- There are three model parameters: heritability (h), covariate coefficients (beta) and the total
- phenotypic variance (sigma).
- Heritability as defined here is the proportion of the total variance (sigma) that is attributed to
- the kinship matrix.
-
- For simplicity, we assume that everything being input is a numpy array.
- If this is not the case, the module may throw an error as conversion from list to numpy array
- is not done consistently.
+ """This is a simple version of EMMA/fastLMM.
+
+ The main purpose of this module is to take a phenotype vector (Y),
+ a set of covariates (X) and a kinship matrix (K) and to optimize
+ this model by finding the maximum-likelihood estimates for the
+ model parameters. There are three model parameters: heritability
+ (h), covariate coefficients (beta) and the total phenotypic
+ variance (sigma). Heritability as defined here is the proportion
+ of the total variance (sigma) that is attributed to the kinship
+ matrix.
+
+ For simplicity, we assume that everything being input is a numpy
+ array. If this is not the case, the module may throw an error as
+ conversion from list to numpy array is not done consistently.
"""
def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False):
- """
- The constructor takes a phenotype vector or array Y of size n.
- It takes a kinship matrix K of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
- If they are not provided, the constructor will calculate them.
- X0 is an optional covariate matrix of size n x q, where there are q covariates.
- When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
+ """The constructor takes a phenotype vector or array Y of size n. It
+ takes a kinship matrix K of size n x n. Kva and Kve can be
+ computed as Kva,Kve = linalg.eigh(K) and cached. If they are
+ not provided, the constructor will calculate them. X0 is an
+ optional covariate matrix of size n x q, where there are q
+ covariates. When this parameter is not provided, the
+ constructor will set X0 to an n x 1 matrix of all ones to
+ represent a mean effect.
"""
- if X0 == None:
+ if X0 is None:
X0 = np.ones(len(Y)).reshape(len(Y),1)
self.verbose = verbose
@@ -250,7 +263,7 @@ class LMM2:
REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True.
"""
- if X == None: X = self.X0t
+ if X is None: X = self.X0t
elif stack:
self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
X = self.X0t_stack
@@ -306,7 +319,7 @@ class LMM2:
Given this optimum, the function computes the LL and associated ML solutions.
"""
- if X == None: X = self.X0t
+ if X is None: X = self.X0t
else:
#X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
@@ -330,7 +343,7 @@ class LMM2:
def association(self,X,h=None,stack=True,REML=True,returnBeta=False):
"""
Calculates association statitics for the SNPs encoded in the vector X of size n.
- If h == None, the optimal h stored in optH is used.
+ If h is None, the optimal h stored in optH is used.
"""
if False:
@@ -348,7 +361,7 @@ class LMM2:
self.X0t_stack[:,(self.q)] = m
X = self.X0t_stack
- if h == None: h = self.optH
+ if h is None: h = self.optH
L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML)
q = len(beta)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
index 682ba371..7b652515 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -19,22 +19,47 @@
import sys
import numpy as np
-def remove_missing(y,g,verbose=False):
+# ---- A trick to decide on the environment:
+try:
+ from wqflask.my_pylmm.pyLMM import chunks
+ from gn2 import uses, progress_set_func
+except ImportError:
+ has_gn2=False
+ import standalone as handlers
+ from standalone import uses, progress_set_func
+
+progress,debug,info,mprint = uses('progress','debug','info','mprint')
+
+def remove_missing(n,y,g):
"""
Remove missing data from matrices, make sure the genotype data has
individuals as rows
"""
- assert(y!=None)
- assert(y.shape[0] == g.shape[0])
+ assert(y is not None)
+ assert y.shape[0] == g.shape[0],"y (n) %d, g (n,m) %s" % (y.shape[0],g.shape)
y1 = y
g1 = g
v = np.isnan(y)
keep = True - v
if v.sum():
- if verbose:
- sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
+ info("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
y1 = y[keep]
g1 = g[keep,:]
- return y1,g1,keep
+ n = y1.shape[0]
+ return n,y1,g1,keep
+
+def remove_missing_new(n,y):
+ """
+ Remove missing data. Returns new n,y,keep
+ """
+ assert(y is not None)
+ y1 = y
+ v = np.isnan(y)
+ keep = True - v
+ if v.sum():
+ info("runlmm.py: Cleaning the phenotype vector by removing %d individuals" % (v.sum()))
+ y1 = y[keep]
+ n = y1.shape[0]
+ return n,y1,keep
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 324c4f2c..52c3c80a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -21,10 +21,13 @@ from optparse import OptionParser
import sys
import tsvreader
import numpy as np
-from lmm import gn2_load_redis, calculate_kinship_old
+from lmm import gn2_load_redis, gn2_load_redis_iter, calculate_kinship_new, run_gwas
from kinship import kinship, kinship_full
import genotype
import phenotype
+from standalone import uses
+
+progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal')
usage = """
python runlmm.py [options] command
@@ -98,44 +101,64 @@ if options.pheno:
y = tsvreader.pheno(options.pheno)
print y.shape
-if options.geno:
+if options.geno and cmd != 'iterator':
g = tsvreader.geno(options.geno)
print g.shape
-if cmd == 'redis_new':
- # The main difference between redis_new and redis is that missing
- # phenotypes are handled by the first
- Y = y
- G = g
- print "Original G",G.shape, "\n", G
-
- gt = G.T
- G = None
- ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True)
+def check_results(ps,ts):
print np.array(ps)
print len(ps),sum(ps)
- # Test results
p1 = round(ps[0],4)
p2 = round(ps[-1],4)
- sys.stderr.write(options.geno+"\n")
if options.geno == 'data/small.geno':
- assert p1==0.0708, "p1=%f" % p1
- assert p2==0.1417, "p2=%f" % p2
+ info("Validating results for "+options.geno)
+ assert p1==0.7387, "p1=%f" % p1
+ assert p2==0.7387, "p2=%f" % p2
if options.geno == 'data/small_na.geno':
- assert p1==0.0897, "p1=%f" % p1
- assert p2==0.0405, "p2=%f" % p2
+ info("Validating results for "+options.geno)
+ assert p1==0.062, "p1=%f" % p1
+ assert p2==0.062, "p2=%f" % p2
if options.geno == 'data/test8000.geno':
- # assert p1==0.8984, "p1=%f" % p1
- # assert p2==0.9621, "p2=%f" % p2
+ info("Validating results for "+options.geno)
assert round(sum(ps)) == 4070
assert len(ps) == 8000
+ info("Run completed")
+
+if y is not None:
+ n = y.shape[0]
+
+if cmd == 'run':
+ if options.remove_missing_phenotypes:
+ raise Exception('Can not use --remove-missing-phenotypes with LMM2')
+ n = len(y)
+ m = g.shape[1]
+ ps, ts = run_gwas('other',n,m,k,y,g) # <--- pass in geno by SNP
+ check_results(ps,ts)
+elif cmd == 'iterator':
+ if options.remove_missing_phenotypes:
+ raise Exception('Can not use --remove-missing-phenotypes with LMM2')
+ geno_iterator = tsvreader.geno_iter(options.geno)
+ ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,geno_iterator)
+ check_results(ps,ts)
+elif cmd == 'redis_new':
+ # The main difference between redis_new and redis is that missing
+ # phenotypes are handled by the first
+ if options.remove_missing_phenotypes:
+ raise Exception('Can not use --remove-missing-phenotypes with LMM2')
+ Y = y
+ G = g
+ print "Original G",G.shape, "\n", G
+ # gt = G.T
+ # G = None
+ ps, ts = gn2_load_redis('testrun','other',k,Y,G,new_code=True)
+ check_results(ps,ts)
elif cmd == 'redis':
# Emulating the redis setup of GN2
G = g
print "Original G",G.shape, "\n", G
- if y != None and options.remove_missing_phenotypes:
+ if y is not None and options.remove_missing_phenotypes:
gnt = np.array(g).T
- Y,g,keep = phenotype.remove_missing(y,g.T,options.verbose)
+ n,Y,g,keep = phenotype.remove_missing(n,y,gnt)
G = g.T
print "Removed missing phenotypes",G.shape, "\n", G
else:
@@ -148,30 +171,16 @@ elif cmd == 'redis':
g = None
gnt = None
- gt = G.T
- G = None
- ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False)
- print np.array(ps)
- print len(ps),sum(ps)
- # Test results 4070.02346579
- p1 = round(ps[0],4)
- p2 = round(ps[-1],4)
- sys.stderr.write(options.geno+"\n")
- if options.geno == 'data/small.geno':
- assert p1==0.0708, "p1=%f" % p1
- assert p2==0.1417, "p2=%f" % p2
- if options.geno == 'data/small_na.geno':
- assert p1==0.0897, "p1=%f" % p1
- assert p2==0.0405, "p2=%f" % p2
- if options.geno == 'data/test8000.geno':
- assert round(sum(ps)) == 4070
- assert len(ps) == 8000
+ # gt = G.T
+ # G = None
+ ps, ts = gn2_load_redis('testrun','other',k,Y,G, new_code=False)
+ check_results(ps,ts)
elif cmd == 'kinship':
G = g
print "Original G",G.shape, "\n", G
if y != None and options.remove_missing_phenotypes:
gnt = np.array(g).T
- Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+ n,Y,g,keep = phenotype.remove_missing(n,y,g.T)
G = g.T
print "Removed missing phenotypes",G.shape, "\n", G
if options.maf_normalization:
@@ -187,20 +196,20 @@ elif cmd == 'kinship':
print "Genotype",G.shape, "\n", G
print "first Kinship method",K.shape,"\n",K
k1 = round(K[0][0],4)
- K2,G = calculate_kinship_old(np.copy(G).T,temp_data=None)
+ K2,G = calculate_kinship_new(np.copy(G))
print "Genotype",G.shape, "\n", G
print "GN2 Kinship method",K2.shape,"\n",K2
k2 = round(K2[0][0],4)
print "Genotype",G.shape, "\n", G
- K3 = kinship(G.T)
+ K3 = kinship(G)
print "third Kinship method",K3.shape,"\n",K3
sys.stderr.write(options.geno+"\n")
k3 = round(K3[0][0],4)
if options.geno == 'data/small.geno':
- assert k1==0.8, "k1=%f" % k1
- assert k2==0.7939, "k2=%f" % k2
- assert k3==0.7939, "k3=%f" % k3
+ assert k1==0.8333, "k1=%f" % k1
+ assert k2==0.9375, "k2=%f" % k2
+ assert k3==0.9375, "k3=%f" % k3
if options.geno == 'data/small_na.geno':
assert k1==0.8333, "k1=%f" % k1
assert k2==0.7172, "k2=%f" % k2
@@ -209,4 +218,4 @@ elif cmd == 'kinship':
assert k3==1.4352, "k3=%f" % k3
else:
- print "Doing nothing"
+ fatal("Doing nothing")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/standalone.py b/wqflask/wqflask/my_pylmm/pyLMM/standalone.py
new file mode 100644
index 00000000..40b2021d
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/standalone.py
@@ -0,0 +1,110 @@
+# Standalone specific methods and callback handler
+#
+# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# Set the log level with
+#
+# logging.basicConfig(level=logging.DEBUG)
+
+from __future__ import absolute_import, print_function, division
+
+import numpy as np
+import sys
+import logging
+
+# logger = logging.getLogger(__name__)
+logger = logging.getLogger('lmm2')
+logging.basicConfig(level=logging.DEBUG)
+np.set_printoptions(precision=3,suppress=True)
+
+progress_location = None
+progress_current = None
+progress_prev_perc = None
+
+def progress_default_func(location,count,total):
+ global progress_current
+ value = round(count*100.0/total)
+ progress_current = value
+
+progress_func = progress_default_func
+
+def progress_set_func(func):
+ global progress_func
+ progress_func = func
+
+def progress(location, count, total):
+ global progress_location
+ global progress_prev_perc
+
+ perc = round(count*100.0/total)
+ if perc != progress_prev_perc and (location != progress_location or perc > 98 or perc > progress_prev_perc + 5):
+ progress_func(location, count, total)
+ logger.info("Progress: %s %d%%" % (location,perc))
+ progress_location = location
+ progress_prev_perc = perc
+
+def mprint(msg,data):
+ """
+ Array/matrix print function
+ """
+ m = np.array(data)
+ if m.ndim == 1:
+ print(msg,m.shape,"=\n",m[0:3]," ... ",m[-3:])
+ if m.ndim == 2:
+ print(msg,m.shape,"=\n[",
+ m[0][0:3]," ... ",m[0][-3:],"\n ",
+ m[1][0:3]," ... ",m[1][-3:],"\n ...\n ",
+ m[-2][0:3]," ... ",m[-2][-3:],"\n ",
+ m[-1][0:3]," ... ",m[-1][-3:],"]")
+
+def fatal(msg):
+ logger.critical(msg)
+ raise Exception(msg)
+
+def callbacks():
+ return dict(
+ write = sys.stdout.write,
+ writeln = print,
+ debug = logger.debug,
+ info = logger.info,
+ warning = logger.warning,
+ error = logger.error,
+ critical = logger.critical,
+ fatal = fatal,
+ progress = progress,
+ mprint = mprint
+ )
+
+def uses(*funcs):
+ """
+ Some sugar
+ """
+ return [callbacks()[func] for func in funcs]
+
+# ----- Minor test cases:
+
+if __name__ == '__main__':
+ # logging.basicConfig(level=logging.DEBUG)
+ logging.debug("Test %i" % (1))
+ d = callbacks()['debug']
+ d("TEST")
+ wrln = callbacks()['writeln']
+ wrln("Hello %i" % 34)
+ progress = callbacks()['progress']
+ progress("I am half way",50,100)
+ list = [0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15,
+ 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15,
+ 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15,
+ 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15,
+ 0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15]
+ mprint("list",list)
+ matrix = [[1,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [2,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [3,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [4,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [5,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15],
+ [6,0.5,0.6096595 , -0.31559815, -0.52793285, 1.16573418e-15]]
+ mprint("matrix",matrix)
+ ix,dx = uses("info","debug")
+ ix("ix")
+ dx("dx")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py
index b24ffe8f..66b34ee2 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/tsvreader.py
@@ -75,3 +75,48 @@ def geno(fn):
G = np.array(G1)
return G
+def geno(fn):
+ G1 = []
+ for id,values in geno_iter(fn):
+ G1.append(values) # <--- slow
+ G = np.array(G1)
+ return G
+
+def geno_callback(fn,func):
+ hab_mapper = {'A':0,'H':1,'B':2,'-':3}
+ pylmm_mapper = [ 0.0, 0.5, 1.0, float('nan') ]
+
+ print fn
+ with open(fn,'r') as tsvin:
+ assert(tsvin.readline().strip() == "# Genotype format version 1.0")
+ tsvin.readline()
+ tsvin.readline()
+ tsvin.readline()
+ tsvin.readline()
+ tsv = csv.reader(tsvin, delimiter='\t')
+ for row in tsv:
+ id = row[0]
+ gs = list(row[1])
+ gs2 = [pylmm_mapper[hab_mapper[g]] for g in gs]
+ func(id,gs2)
+
+def geno_iter(fn):
+ """
+ Yield a tuple of snpid and values
+ """
+ hab_mapper = {'A':0,'H':1,'B':2,'-':3}
+ pylmm_mapper = [ 0.0, 0.5, 1.0, float('nan') ]
+
+ print fn
+ with open(fn,'r') as tsvin:
+ assert(tsvin.readline().strip() == "# Genotype format version 1.0")
+ tsvin.readline()
+ tsvin.readline()
+ tsvin.readline()
+ tsvin.readline()
+ tsv = csv.reader(tsvin, delimiter='\t')
+ for row in tsv:
+ id = row[0]
+ gs = list(row[1])
+ gs2 = [pylmm_mapper[hab_mapper[g]] for g in gs]
+ yield (id,gs2)