diff options
author | zsloan | 2015-04-06 15:58:11 +0000 |
---|---|---|
committer | zsloan | 2015-04-06 15:58:11 +0000 |
commit | 4d5f000e41a98283ed50eb0e775d135f78222be8 (patch) | |
tree | d7493d3e62df1b8947c7cbd85bb658678958ad76 /wqflask | |
parent | 03d43722b043deff164420e07d6c7d5e8c128cd6 (diff) | |
parent | 24fe675688467b77cb0b1e858e23f119027d08d8 (diff) | |
download | genenetwork2-4d5f000e41a98283ed50eb0e775d135f78222be8.tar.gz |
Merge branch 'master' of github.com:zsloan/genenetwork2
Diffstat (limited to 'wqflask')
-rw-r--r-- | wqflask/wqflask/my_pylmm/README.md | 21 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/__init__.py | 1 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/gwas.py | 173 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/input.py | 2 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 22 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 137 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm2.py | 410 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 2 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 75 |
9 files changed, 789 insertions, 54 deletions
diff --git a/wqflask/wqflask/my_pylmm/README.md b/wqflask/wqflask/my_pylmm/README.md new file mode 100644 index 00000000..f6b0e72d --- /dev/null +++ b/wqflask/wqflask/my_pylmm/README.md @@ -0,0 +1,21 @@ +# RELEASE NOTES + +## 0.50-gn2-pre1 release + +This is the first test release of multi-core pylmm into GN2. Both +kinship calculation and GWAS have been made multi-threaded by +introducing the Python multiprocessing module. Note that only +run_other has been updated to use the new routines (so human is still +handled the old way). I have taken care that we can still run both +old-style and new-style LMM (through passing the 'new_code' +boolean). This could be an option in the web server for users to +select and test for any unexpected differences (of which there should +be none, naturally ;). + +The current version can handle missing phenotypes, but as they are +removed there is no way for GN2 to know what SNPs the P-values belong +to. A future version will pass a SNP index to allow for missing +phenotypes. + + +
\ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py index e69de29b..c40c3221 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py @@ -0,0 +1 @@ +PYLMM_VERSION="0.50-gn2-pre1" diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py new file mode 100644 index 00000000..b901c0e2 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py @@ -0,0 +1,173 @@ +# pylmm-based GWAS calculation +# +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# Copyright (C) 2015 Pjotr Prins (pjotr.prins@thebird.nl) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +#!/usr/bin/python + +import pdb +import time +import sys +# from utility import temp_data +import lmm2 + +import os +import numpy as np +import input +from optmatrix import matrix_initialize +from lmm2 import LMM2 + +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +def formatResult(id,beta,betaSD,ts,ps): + return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n" + +def compute_snp(j,n,snp_ids,lmm2,REML,q = None): + # print("COMPUTE SNP",j,snp_ids,"\n") + result = [] + for snp_id in snp_ids: + snp,id = snp_id + x = snp.reshape((n,1)) # all the SNPs + # print "X=",x + # if refit: + # L.fit(X=snp,REML=REML) + ts,ps,beta,betaVar = lmm2.association(x,REML=REML,returnBeta=True) + # result.append(formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)) + result.append( (ts,ps) ) + if not q: + q = compute_snp.q + q.put([j,result]) + return j + # PS.append(ps) + # TS.append(ts) + # return len(result) + # compute.q.put(result) + # return None + +def f_init(q): + compute_snp.q = q + +def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True): + """ + Execute a GWAS. The G matrix should be n inds (cols) x m snps (rows) + """ + matrix_initialize() + cpu_num = mp.cpu_count() + numThreads = None # for now use all available threads + kfile2 = False + reml = restricted_max_likelihood + + sys.stderr.write(str(G.shape)+"\n") + n = G.shape[1] # inds + inds = n + m = G.shape[0] # snps + snps = m + sys.stderr.write(str(m)+" SNPs\n") + # print "***** GWAS: G",G.shape,G + assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds) + + # CREATE LMM object for association + # if not kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=verbose) + # else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=verbose,K2=K2) + + lmm2 = LMM2(Y,K) # ,Kva,Kve,X0,verbose=verbose) + if not refit: + if verbose: sys.stderr.write("Computing fit for null model\n") + lmm2.fit() # follow GN model in run_other + if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (lmm2.optH,lmm2.optSigma)) + + # outFile = "test.out" + # out = open(outFile,'w') + out = sys.stderr + + def outputResult(id,beta,betaSD,ts,ps): + out.write(formatResult(id,beta,betaSD,ts,ps)) + def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") + + # printOutHead() + res = [] + + # Set up the pool + # mp.set_start_method('spawn') + q = mp.Queue() + p = mp.Pool(numThreads, f_init, [q]) + collect = [] + + # Buffers for pvalues and t-stats + # PS = [] + # TS = [] + count = 0 + job = 0 + jobs_running = 0 + for snp in G: + snp_id = (snp,'SNPID') + count += 1 + if count % 1000 == 0: + job += 1 + if verbose: + sys.stderr.write("Job %d At SNP %d\n" % (job,count)) + if numThreads == 1: + print "Running on 1 THREAD" + compute_snp(job,n,collect,lmm2,reml,q) + collect = [] + j,lst = q.get() + if verbose: + sys.stderr.write("Job "+str(j)+" finished\n") + res.append((j,lst)) + else: + p.apply_async(compute_snp,(job,n,collect,lmm2,reml)) + jobs_running += 1 + collect = [] + while jobs_running > cpu_num: + try: + j,lst = q.get_nowait() + if verbose: + sys.stderr.write("Job "+str(j)+" finished\n") + res.append((j,lst)) + jobs_running -= 1 + except Queue.Empty: + time.sleep(0.1) + pass + if jobs_running > cpu_num*2: + time.sleep(1.0) + else: + break + + collect.append(snp_id) + + if numThreads==1 or count<1000 or len(collect)>0: + job += 1 + print "Collect final batch size %i job %i @%i: " % (len(collect), job, count) + compute_snp(job,n,collect,lmm2,reml,q) + collect = [] + j,lst = q.get() + res.append((j,lst)) + print "count=",count," running=",jobs_running," collect=",len(collect) + for job in range(jobs_running): + j,lst = q.get(True,15) # time out + if verbose: + sys.stderr.write("Job "+str(j)+" finished\n") + res.append((j,lst)) + + print "Before sort",[res1[0] for res1 in res] + res = sorted(res,key=lambda x: x[0]) + # if verbose: + # print "res=",res[0][0:10] + print "After sort",[res1[0] for res1 in res] + print [len(res1[1]) for res1 in res] + ts = [item[0] for j,res1 in res for item in res1] + ps = [item[1] for j,res1 in res for item in res1] + return ts,ps diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py index f7b556a5..7063fedf 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/input.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py @@ -135,6 +135,8 @@ class plink: def normalizeGenotype(self,G): # print "Before",G # print G.shape + print "call input.normalizeGenotype" + raise "This should not be used" x = True - np.isnan(G) m = G[x].mean() s = np.sqrt(G[x].var()) diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py index 28f2042d..0c43587e 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py @@ -21,6 +21,7 @@ import sys import os import numpy as np +from scipy import linalg import multiprocessing as mp # Multiprocessing is part of the Python stdlib import Queue import time @@ -85,12 +86,13 @@ def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True): m = G.shape[0] # snps snps = m sys.stderr.write(str(m)+" SNPs\n") - assert m>n, "n should be larger than m (snps>inds)" + assert snps>inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds) q = mp.Queue() p = mp.Pool(numThreads, f_init, [q]) cpu_num = mp.cpu_count() print "CPU cores:",cpu_num + print snps,computeSize iterations = snps/computeSize+1 # if testing: # iterations = 8 @@ -153,5 +155,23 @@ def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True): # np.savetxt(outFile+".kve",Kve) return K +def kvakve(K, verbose=True): + """ + Obtain eigendecomposition for K and return Kva,Kve where Kva is cleaned + of small values < 1e-6 (notably smaller than zero) + """ + if verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) + + Kva,Kve = linalg.eigh(K) + if verbose: + print("Kva is: ", Kva.shape, Kva) + print("Kve is: ", Kve.shape, Kve) + + if sum(Kva < 1e-6): + if verbose: sys.stderr.write("Cleaning %d eigen values (Kva<0)\n" % (sum(Kva < 0))) + Kva[Kva < 1e-6] = 1e-6 + return Kva,Kve + + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 937d3340..58ff809b 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -50,8 +50,10 @@ has_gn2=True from utility.benchmark import Bench from utility import temp_data -from kinship import kinship, kinship_full +from kinship import kinship, kinship_full, kvakve import genotype +import phenotype +import gwas try: from wqflask.my_pylmm.pyLMM import chunks @@ -254,7 +256,7 @@ def human_association(snp, # refit=False, # temp_data=None): -def run_other(pheno_vector, +def run_other_old(pheno_vector, genotype_matrix, restricted_max_likelihood=True, refit=False, @@ -270,7 +272,7 @@ def run_other(pheno_vector, """ - print("In run_other") + print("Running the original LMM engine in run_other (old)") print("REML=",restricted_max_likelihood," REFIT=",refit) with Bench("Calculate Kinship"): kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata) @@ -278,25 +280,79 @@ def run_other(pheno_vector, print("kinship_matrix: ", pf(kinship_matrix)) print("kinship_matrix.shape: ", pf(kinship_matrix.shape)) - with Bench("Create LMM object"): - lmm_ob = LMM(pheno_vector, kinship_matrix) + # with Bench("Create LMM object"): + # lmm_ob = LMM(pheno_vector, kinship_matrix) - with Bench("LMM_ob fitting"): - lmm_ob.fit() + # with Bench("LMM_ob fitting"): + # lmm_ob.fit() - print("genotype_matrix: ", genotype_matrix.shape) + print("run_other_old genotype_matrix: ", genotype_matrix.shape) print(genotype_matrix) with Bench("Doing GWAS"): t_stats, p_values = GWAS(pheno_vector, - genotype_matrix, - kinship_matrix, - restricted_max_likelihood=True, - refit=False, - temp_data=tempdata) + genotype_matrix, + kinship_matrix, + restricted_max_likelihood=True, + refit=False, + temp_data=tempdata) Bench().report() return p_values, t_stats +def run_other_new(pheno_vector, + genotype_matrix, + restricted_max_likelihood=True, + refit=False, + tempdata=None # <---- can not be None + ): + + """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics + + restricted_max_likelihood -- whether to use restricted max likelihood; True or False + refit -- whether to refit the variance component for each marker + temp_data -- TempData object that stores the progress for each major step of the + calculations ("calculate_kinship" and "GWAS" take the majority of time) + + """ + + print("Running the new LMM2 engine in run_other_new") + print("REML=",restricted_max_likelihood," REFIT=",refit) + + # Adjust phenotypes + Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True) + print("Removed missing phenotypes",Y.shape) + + # if options.maf_normalization: + # G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) + # print "MAF replacements: \n",G + # if not options.skip_genotype_normalization: + # G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) + + with Bench("Calculate Kinship"): + K,G = calculate_kinship(G, tempdata) + + print("kinship_matrix: ", pf(K)) + print("kinship_matrix.shape: ", pf(K.shape)) + + # with Bench("Create LMM object"): + # lmm_ob = lmm2.LMM2(Y,K) + # with Bench("LMM_ob fitting"): + # lmm_ob.fit() + + print("run_other_new genotype_matrix: ", G.shape) + print(G) + + with Bench("Doing GWAS"): + t_stats, p_values = gwas.gwas(Y, + G.T, + K, + restricted_max_likelihood=True, + refit=False,verbose=True) + Bench().report() + return p_values, t_stats + +# def matrixMult(A,B): +# return np.dot(A,B) def matrixMult(A,B): @@ -334,8 +390,10 @@ def calculate_kinship_new(genotype_matrix, temp_data=None): Call the new kinship calculation where genotype_matrix contains inds (columns) by snps (rows). """ + print("call genotype.normalize") G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix) - return kinship(G.T),G + print("call calculate_kinship_new") + return kinship(G.T),G # G gets transposed, we'll turn this into an iterator (FIXME) def calculate_kinship_old(genotype_matrix, temp_data=None): """ @@ -345,6 +403,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): normalizes the resulting vectors and returns the RRM matrix. """ + print("call calculate_kinship_old") n = genotype_matrix.shape[0] m = genotype_matrix.shape[1] print("genotype 2D matrix n (inds) is:", n) @@ -375,7 +434,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None): temp_data.store("percent_complete", percent_complete) genotype_matrix = genotype_matrix[:,keep] - print("genotype_matrix: ", pf(genotype_matrix)) + print("After kinship (old) genotype_matrix: ", pf(genotype_matrix)) kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m) return kinship_matrix,genotype_matrix @@ -533,11 +592,13 @@ class LMM: print("this K is:", K.shape, pf(K)) if len(Kva) == 0 or len(Kve) == 0: - if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) + # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) begin = time.time() - Kva,Kve = linalg.eigh(K) + # Kva,Kve = linalg.eigh(K) + Kva,Kve = kvakve(K) end = time.time() if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) + print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve)) self.K = K self.Kva = Kva @@ -547,10 +608,11 @@ class LMM: self.Y = Y self.X0 = X0 self.N = self.K.shape[0] - - if sum(self.Kva < 1e-6): - if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0))) - self.Kva[self.Kva < 1e-6] = 1e-6 + + # ----> Below moved to kinship.kvakve(K) + # if sum(self.Kva < 1e-6): + # if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0))) + # self.Kva[self.Kva < 1e-6] = 1e-6 self.transform() @@ -713,7 +775,10 @@ class LMM: ts = beta / np.sqrt(var * sigma) ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q)) - if not len(ts) == 1 or not len(ps) == 1: raise Exception("Something bad happened :(") + if not len(ts) == 1 or not len(ps) == 1: + print("ts=",ts) + print("ps=",ps) + raise Exception("Something bad happened :(") return ts.sum(),ps.sum() def plotFit(self,color='b-',title=''): @@ -739,7 +804,11 @@ class LMM: pl.title(title) -def gn2_redis(key,species): +def gn2_redis(key,species,new_code=True): + """ + Invoke pylmm using Redis as a container. new_code runs the new + version + """ json_params = Redis.get(key) params = json.loads(json_params) @@ -761,11 +830,18 @@ def gn2_redis(key,species): geno = np.array(params['genotype_matrix']) print('geno', geno.shape, geno) - ps, ts = run_other(pheno_vector = np.array(params['pheno_vector']), - genotype_matrix = geno, - restricted_max_likelihood = params['restricted_max_likelihood'], - refit = params['refit'], - tempdata = tempdata) + if new_code: + ps, ts = run_other_new(pheno_vector = np.array(params['pheno_vector']), + genotype_matrix = geno, + restricted_max_likelihood = params['restricted_max_likelihood'], + refit = params['refit'], + tempdata = tempdata) + else: + ps, ts = run_other_old(pheno_vector = np.array(params['pheno_vector']), + genotype_matrix = geno, + restricted_max_likelihood = params['restricted_max_likelihood'], + refit = params['refit'], + tempdata = tempdata) results_key = "pylmm:results:" + params['temp_uuid'] @@ -790,7 +866,7 @@ def gn2_main(): gn2_redis(key,species) -def gn2_load_redis(key,species,kinship,pheno,geno): +def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True): print("Loading Redis from parsed data") if kinship == None: k = None @@ -811,7 +887,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno): Redis.set(key, json_params) Redis.expire(key, 60*60) - return gn2_redis(key,species) + return gn2_redis(key,species,new_code) if __name__ == '__main__': print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!") @@ -821,4 +897,3 @@ if __name__ == '__main__': print("Run from runlmm.py instead") - diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py new file mode 100644 index 00000000..d4b3ac82 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py @@ -0,0 +1,410 @@ +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013,2014 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +import sys +import time +import numpy as np +from scipy.linalg import eigh, inv, det +import scipy.stats as stats # t-tests +from scipy import optimize +from optmatrix import matrixMult +import kinship + +def calculateKinship(W,center=False): + """ + W is an n x m matrix encoding SNP minor alleles. + + This function takes a matrix oF SNPs, imputes missing values with the maf, + normalizes the resulting vectors and returns the RRM matrix. + """ + n = W.shape[0] + m = W.shape[1] + keep = [] + for i in range(m): + mn = W[True - np.isnan(W[:,i]),i].mean() + W[np.isnan(W[:,i]),i] = mn + vr = W[:,i].var() + if vr == 0: continue + + keep.append(i) + W[:,i] = (W[:,i] - mn) / np.sqrt(vr) + + W = W[:,keep] + K = matrixMult(W,W.T) * 1.0/float(m) + if center: + P = np.diag(np.repeat(1,n)) - 1/float(n) * np.ones((n,n)) + S = np.trace(matrixMult(matrixMult(P,K),P)) + K_n = (n - 1)*K / S + return K_n + return K + +def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): + """ + + Performs a basic GWAS scan using the LMM. This function + uses the LMM module to assess association at each SNP and + does some simple cleanup, such as removing missing individuals + per SNP and re-computing the eigen-decomp + + Y - n x 1 phenotype vector + X - n x m SNP matrix (genotype matrix) + K - n x n kinship matrix + Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K + X0 - n x q covariate matrix + REML - use restricted maximum likelihood + refit - refit the variance component for each SNP + + """ + n = X.shape[0] + m = X.shape[1] + prins("Initialize GWAS") + print("genotype matrix n is:", n) + print("genotype matrix m is:", m) + + if X0 == None: + X0 = np.ones((n,1)) + + # Remove missing values in Y and adjust associated parameters + v = np.isnan(Y) + if v.sum(): + keep = True - v + keep = keep.reshape((-1,)) + Y = Y[keep] + X = X[keep,:] + X0 = X0[keep,:] + K = K[keep,:][:,keep] + Kva = [] + Kve = [] + + if len(Y) == 0: + return np.ones(m)*np.nan,np.ones(m)*np.nan + + L = LMM(Y,K,Kva,Kve,X0) + if not refit: L.fit() + + PS = [] + TS = [] + + n = X.shape[0] + m = X.shape[1] + + for i in range(m): + x = X[:,i].reshape((n,1)) + v = np.isnan(x).reshape((-1,)) + if v.sum(): + keep = True - v + xs = x[keep,:] + if xs.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + continue + + Ys = Y[keep] + X0s = X0[keep,:] + Ks = K[keep,:][:,keep] + Ls = LMM(Ys,Ks,X0=X0s) + if refit: + Ls.fit(X=xs) + else: + Ls.fit() + ts,ps = Ls.association(xs,REML=REML) + else: + if x.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + continue + + if refit: + L.fit(X=x) + ts,ps = L.association(x,REML=REML) + + PS.append(ps) + TS.append(ts) + + return TS,PS + +class LMM2: + + """ + This is a simple version of EMMA/fastLMM. + The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K) + and to optimize this model by finding the maximum-likelihood estimates for the model parameters. + There are three model parameters: heritability (h), covariate coefficients (beta) and the total + phenotypic variance (sigma). + Heritability as defined here is the proportion of the total variance (sigma) that is attributed to + the kinship matrix. + + For simplicity, we assume that everything being input is a numpy array. + If this is not the case, the module may throw an error as conversion from list to numpy array + is not done consistently. + + """ + def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False): + + """ + The constructor takes a phenotype vector or array Y of size n. + It takes a kinship matrix K of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached. + If they are not provided, the constructor will calculate them. + X0 is an optional covariate matrix of size n x q, where there are q covariates. + When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. + """ + + if X0 == None: + X0 = np.ones(len(Y)).reshape(len(Y),1) + self.verbose = verbose + + x = True - np.isnan(Y) + x = x.reshape(-1,) + if not x.sum() == len(Y): + if self.verbose: sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum())) + Y = Y[x] + K = K[x,:][:,x] + X0 = X0[x,:] + Kva = [] + Kve = [] + self.nonmissing = x + + print("this K is:", K.shape, K) + + if len(Kva) == 0 or len(Kve) == 0: + # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) + begin = time.time() + # Kva,Kve = linalg.eigh(K) + Kva,Kve = kinship.kvakve(K) + end = time.time() + if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) + print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve)) + + self.K = K + self.Kva = Kva + self.Kve = Kve + self.N = self.K.shape[0] + self.Y = Y.reshape((self.N,1)) + self.X0 = X0 + + if sum(self.Kva < 1e-6): + if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0))) + self.Kva[self.Kva < 1e-6] = 1e-6 + + self.transform() + + def transform(self): + + """ + Computes a transformation on the phenotype vector and the covariate matrix. + The transformation is obtained by left multiplying each parameter by the transpose of the + eigenvector matrix of K (the kinship). + """ + + self.Yt = matrixMult(self.Kve.T, self.Y) + self.X0t = matrixMult(self.Kve.T, self.X0) + self.X0t_stack = np.hstack([self.X0t, np.ones((self.N,1))]) + self.q = self.X0t.shape[1] + + def getMLSoln(self,h,X): + + """ + Obtains the maximum-likelihood estimates for the covariate coefficients (beta), + the total variance of the trait (sigma) and also passes intermediates that can + be utilized in other functions. The input parameter h is a value between 0 and 1 and represents + the heritability or the proportion of the total variance attributed to genetics. The X is the + covariate matrix. + """ + + S = 1.0/(h*self.Kva + (1.0 - h)) + Xt = X.T*S + XX = matrixMult(Xt,X) + XX_i = inv(XX) + beta = matrixMult(matrixMult(XX_i,Xt),self.Yt) + Yt = self.Yt - matrixMult(X,beta) + Q = np.dot(Yt.T*S,Yt) + sigma = Q * 1.0 / (float(self.N) - float(X.shape[1])) + return beta,sigma,Q,XX_i,XX + + def LL_brent(self,h,X=None,REML=False): + #brent will not be bounded by the specified bracket. + # I return a large number if we encounter h < 0 to avoid errors in LL computation during the search. + if h < 0: return 1e6 + return -self.LL(h,X,stack=False,REML=REML)[0] + + def LL(self,h,X=None,stack=True,REML=False): + + """ + Computes the log-likelihood for a given heritability (h). If X==None, then the + default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with + the input X. If stack is false, then X is used in place of X0t in the LL calculation. + REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True. + """ + + if X == None: X = self.X0t + elif stack: + self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] + X = self.X0t_stack + + n = float(self.N) + q = float(X.shape[1]) + beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X) + LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q) + LL = -0.5 * LL + + if REML: + LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(det(matrixMult(X.T,X))) - np.log(det(XX)) + LL = LL + 0.5*LL_REML_part + + + LL = LL.sum() + return LL,beta,sigma,XX_i + + def getMax(self,H, X=None,REML=False): + + """ + Helper functions for .fit(...). + This function takes a set of LLs computed over a grid and finds possible regions + containing a maximum. Within these regions, a Brent search is performed to find the + optimum. + + """ + n = len(self.LLs) + HOpt = [] + for i in range(1,n-2): + if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: + HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1]))) + if np.isnan(HOpt[-1]): HOpt[-1] = H[i-1] + #if np.isnan(HOpt[-1]): HOpt[-1] = self.LLs[i-1] + #if np.isnan(HOpt[-1][0]): HOpt[-1][0] = [self.LLs[i-1]] + + if len(HOpt) > 1: + if self.verbose: sys.stderr.write("NOTE: Found multiple optima. Returning first...\n") + return HOpt[0] + elif len(HOpt) == 1: return HOpt[0] + elif self.LLs[0] > self.LLs[n-1]: return H[0] + else: return H[n-1] + + + def fit(self,X=None,ngrids=100,REML=True): + + """ + Finds the maximum-likelihood solution for the heritability (h) given the current parameters. + X can be passed and will transformed and concatenated to X0t. Otherwise, X0t is used as + the covariate matrix. + + This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum. + Given this optimum, the function computes the LL and associated ML solutions. + """ + + if X == None: X = self.X0t + else: + #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)]) + self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] + X = self.X0t_stack + + H = np.array(range(ngrids)) / float(ngrids) + L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H]) + self.LLs = L + + hmax = self.getMax(H,X,REML) + L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML) + + self.H = H + self.optH = hmax.sum() + self.optLL = L + self.optBeta = beta + self.optSigma = sigma.sum() + + return hmax,beta,sigma,L + + def association(self,X,h=None,stack=True,REML=True,returnBeta=False): + """ + Calculates association statitics for the SNPs encoded in the vector X of size n. + If h == None, the optimal h stored in optH is used. + + """ + if False: + print "X=",X + print "h=",h + print "q=",self.q + print "self.Kve=",self.Kve + print "X0t_stack=",self.X0t_stack.shape,self.X0t_stack + + if stack: + # X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)]) + m = matrixMult(self.Kve.T,X) + # print "m=",m + m = m[:,0] + self.X0t_stack[:,(self.q)] = m + X = self.X0t_stack + + if h == None: h = self.optH + + L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML) + q = len(beta) + ts,ps = self.tstat(beta[q-1],betaVAR[q-1,q-1],sigma,q) + + if returnBeta: return ts,ps,beta[q-1].sum(),betaVAR[q-1,q-1].sum()*sigma + return ts,ps + + def tstat(self,beta,var,sigma,q,log=False): + + """ + Calculates a t-statistic and associated p-value given the estimate of beta and its standard error. + This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test. + """ + + ts = beta / np.sqrt(var * sigma) + #ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q)) + # sf == survival function - this is more accurate -- could also use logsf if the precision is not good enough + if log: + ps = 2.0 + (stats.t.logsf(np.abs(ts), self.N-q)) + else: + ps = 2.0*(stats.t.sf(np.abs(ts), self.N-q)) + if not len(ts) == 1 or not len(ps) == 1: + raise Exception("Something bad happened :(") + return ts.sum(),ps.sum() + + def plotFit(self,color='b-',title=''): + + """ + Simple function to visualize the likelihood space. It takes the LLs + calcualted over a grid and normalizes them by subtracting off the mean and exponentiating. + The resulting "probabilities" are normalized to one and plotted against heritability. + This can be seen as an approximation to the posterior distribuiton of heritability. + + For diagnostic purposes this lets you see if there is one distinct maximum or multiple + and what the variance of the parameter looks like. + """ + import matplotlib.pyplot as pl + + mx = self.LLs.max() + p = np.exp(self.LLs - mx) + p = p/p.sum() + + pl.plot(self.H,p,color) + pl.xlabel("Heritability") + pl.ylabel("Probability of data") + pl.title(title) + + def meanAndVar(self): + + mx = self.LLs.max() + p = np.exp(self.LLs - mx) + p = p/p.sum() + + mn = (self.H * p).sum() + vx = ((self.H - mn)**2 * p).sum() + + return mn,vx + diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py index bb620052..682ba371 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py @@ -36,5 +36,5 @@ def remove_missing(y,g,verbose=False): sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum())) y1 = y[keep] g1 = g[keep,:] - return y1,g1 + return y1,g1,keep diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py index 6bb79856..324c4f2c 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py @@ -21,7 +21,7 @@ from optparse import OptionParser import sys import tsvreader import numpy as np -from lmm import gn2_load_redis, calculate_kinship +from lmm import gn2_load_redis, calculate_kinship_old from kinship import kinship, kinship_full import genotype import phenotype @@ -54,9 +54,12 @@ parser.add_option("--geno",dest="geno", parser.add_option("--maf-normalization", action="store_true", dest="maf_normalization", default=False, help="Apply MAF genotype normalization") -parser.add_option("--skip-genotype-normalization", - action="store_true", dest="skip_genotype_normalization", default=False, - help="Skip genotype normalization") +parser.add_option("--genotype-normalization", + action="store_true", dest="genotype_normalization", default=False, + help="Force genotype normalization") +parser.add_option("--remove-missing-phenotypes", + action="store_true", dest="remove_missing_phenotypes", default=False, + help="Remove missing phenotypes") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") @@ -99,28 +102,58 @@ if options.geno: g = tsvreader.geno(options.geno) print g.shape -if cmd == 'redis': +if cmd == 'redis_new': + # The main difference between redis_new and redis is that missing + # phenotypes are handled by the first + Y = y + G = g + print "Original G",G.shape, "\n", G + + gt = G.T + G = None + ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True) + print np.array(ps) + print len(ps),sum(ps) + # Test results + p1 = round(ps[0],4) + p2 = round(ps[-1],4) + sys.stderr.write(options.geno+"\n") + if options.geno == 'data/small.geno': + assert p1==0.0708, "p1=%f" % p1 + assert p2==0.1417, "p2=%f" % p2 + if options.geno == 'data/small_na.geno': + assert p1==0.0897, "p1=%f" % p1 + assert p2==0.0405, "p2=%f" % p2 + if options.geno == 'data/test8000.geno': + # assert p1==0.8984, "p1=%f" % p1 + # assert p2==0.9621, "p2=%f" % p2 + assert round(sum(ps)) == 4070 + assert len(ps) == 8000 +elif cmd == 'redis': # Emulating the redis setup of GN2 G = g print "Original G",G.shape, "\n", G - if y != None: + if y != None and options.remove_missing_phenotypes: gnt = np.array(g).T - Y,g = phenotype.remove_missing(y,g.T,options.verbose) + Y,g,keep = phenotype.remove_missing(y,g.T,options.verbose) G = g.T print "Removed missing phenotypes",G.shape, "\n", G + else: + Y = y if options.maf_normalization: G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) print "MAF replacements: \n",G - if not options.skip_genotype_normalization: + if options.genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) g = None gnt = None gt = G.T G = None - ps, ts = gn2_load_redis('testrun','other',k,Y,gt) + ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False) print np.array(ps) - # Test results + print len(ps),sum(ps) + # Test results 4070.02346579 p1 = round(ps[0],4) p2 = round(ps[-1],4) sys.stderr.write(options.geno+"\n") @@ -128,15 +161,15 @@ if cmd == 'redis': assert p1==0.0708, "p1=%f" % p1 assert p2==0.1417, "p2=%f" % p2 if options.geno == 'data/small_na.geno': - assert p1==0.0958, "p1=%f" % p1 - assert p2==0.0435, "p2=%f" % p2 + assert p1==0.0897, "p1=%f" % p1 + assert p2==0.0405, "p2=%f" % p2 if options.geno == 'data/test8000.geno': - assert p1==0.8984, "p1=%f" % p1 - assert p2==0.9623, "p2=%f" % p2 + assert round(sum(ps)) == 4070 + assert len(ps) == 8000 elif cmd == 'kinship': G = g print "Original G",G.shape, "\n", G - if y != None: + if y != None and options.remove_missing_phenotypes: gnt = np.array(g).T Y,g = phenotype.remove_missing(y,g.T,options.verbose) G = g.T @@ -144,32 +177,32 @@ elif cmd == 'kinship': if options.maf_normalization: G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g ) print "MAF replacements: \n",G - if not options.skip_genotype_normalization: + if options.genotype_normalization: G = np.apply_along_axis( genotype.normalize, axis=1, arr=G) g = None gnt = None if options.test_kinship: - K = kinship_full(G) + K = kinship_full(np.copy(G)) print "Genotype",G.shape, "\n", G print "first Kinship method",K.shape,"\n",K k1 = round(K[0][0],4) - K2 = calculate_kinship(np.copy(G.T),temp_data=None) + K2,G = calculate_kinship_old(np.copy(G).T,temp_data=None) print "Genotype",G.shape, "\n", G print "GN2 Kinship method",K2.shape,"\n",K2 k2 = round(K2[0][0],4) print "Genotype",G.shape, "\n", G - K3 = kinship(np.copy(G),options) + K3 = kinship(G.T) print "third Kinship method",K3.shape,"\n",K3 sys.stderr.write(options.geno+"\n") k3 = round(K3[0][0],4) if options.geno == 'data/small.geno': - assert k1==0.7939, "k1=%f" % k1 + assert k1==0.8, "k1=%f" % k1 assert k2==0.7939, "k2=%f" % k2 assert k3==0.7939, "k3=%f" % k3 if options.geno == 'data/small_na.geno': - assert k1==0.7172, "k1=%f" % k1 + assert k1==0.8333, "k1=%f" % k1 assert k2==0.7172, "k2=%f" % k2 assert k3==0.7172, "k3=%f" % k3 if options.geno == 'data/test8000.geno': |