Diffstat (limited to 'wqflask')
-rwxr-xr-x   wqflask/wqflask/marker_regression/marker_regression.py  |   61
-rw-r--r--   wqflask/wqflask/my_pylmm/data/genofile_parser.py        |   85
-rw-r--r--   wqflask/wqflask/my_pylmm/pyLMM/lmm.py                   |  663
3 files changed, 462 insertions, 347 deletions
diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py index 92270eb2..13ec4280 100755 --- a/wqflask/wqflask/marker_regression/marker_regression.py +++ b/wqflask/wqflask/marker_regression/marker_regression.py @@ -17,6 +17,8 @@ import urllib import numpy as np +import json + from htmlgen import HTMLgen2 as HT from utility import Plot, Bunch from wqflask.interval_analyst import GeneUtil @@ -453,21 +455,43 @@ class MarkerRegression(object): def gen_data(self): """Todo: Fill this in here""" - prep_data.PrepData(self.vals, self.dataset.group.name) + json_data = open(os.path.join(webqtlConfig.NEWGENODIR + self.dataset.group.name + '.json')) + markers = json.load(json_data) + genotype_data = [marker['genotypes'] for marker in markers] + + no_val_samples = self.identify_empty_samples() + trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) + + #print("trimmed genotype data is:", pf(trimmed_genotype_data)) + + #for marker_object in genotype_data: + # print("marker_object:", pf(marker_object)) + + + #prep_data.PrepData(self.vals, genotype_data) pheno_vector = np.array([float(val) for val in self.vals if val!="x"]) - genotypes = np.genfromtxt(os.path.join(webqtlConfig.TMPDIR, - self.dataset.group.name + '.snps.new')) + genotypes = np.array(trimmed_genotype_data).T + print("genotypes is", pf(genotypes)) + #genotypes = np.genfromtxt(os.path.join(webqtlConfig.TMPDIR, + # self.dataset.group.name + '.snps.new')).T - print("genotypes is:", pf(genotypes)) + print("pheno_vector is:", pf(pheno_vector.shape)) + print("genotypes is:", pf(genotypes.shape)) kinship_matrix = lmm.calculateKinship(genotypes) print("kinship_matrix is:", pf(kinship_matrix)) - print("pheno_vector is:", pf(pheno_vector)) lmm_ob = lmm.LMM(pheno_vector, kinship_matrix) lmm_ob.fit() - + + t_stats, p_values = lmm.GWAS(pheno_vector, + genotypes, + kinship_matrix, + REML=True, + refit=False) + + print("p_values is:", pf(len(p_values))) #calculate QTL for each trait self.qtl_results = self.genotype.regression(strains = self.samples, @@ -633,6 +657,31 @@ class MarkerRegression(object): #return rv,tblobj,bottomInfo + def identify_empty_samples(self): + no_val_samples = [] + for sample_count, val in enumerate(self.vals): + if val == "x": + no_val_samples.append(sample_count) + return no_val_samples + #print("self.no_val_samples:", self.no_val_samples) + #nums = set(range(0, 176)) + #print("not included:", nums-self.empty_columns) + + def trim_genotypes(self, genotype_data, no_value_samples): + trimmed_genotype_data = [] + for marker in genotype_data: + new_genotypes = [] + for item_count, genotype in enumerate(marker): + if item_count in no_value_samples: + continue + try: + genotype = float(genotype) + except ValueError: + pass + new_genotypes.append(genotype) + trimmed_genotype_data.append(new_genotypes) + return trimmed_genotype_data + def plotIntMappingForPLINK(self, fd, canvas, offset= (80, 120, 20, 80), zoom = 1, startMb = None, endMb = None, showLocusForm = "",plinkResultDict={}): #calculating margins xLeftOffset, xRightOffset, yTopOffset, yBottomOffset = offset diff --git a/wqflask/wqflask/my_pylmm/data/genofile_parser.py b/wqflask/wqflask/my_pylmm/data/genofile_parser.py index 1dafecc8..ec8c521c 100644 --- a/wqflask/wqflask/my_pylmm/data/genofile_parser.py +++ b/wqflask/wqflask/my_pylmm/data/genofile_parser.py @@ -1,13 +1,31 @@ #!/usr/bin/python from __future__ import print_function, division, absolute_import -import csv +import sys 
+sys.path.append("..") import os import glob import traceback +import numpy as np +from pyLMM import lmm + +import simplejson as json + +from pprint import pformat as pf + class EmptyConfigurations(Exception): pass + + +class Marker(object): + def __init__(self): + self.name = None + self.chr = None + self.cM = None + self.Mb = None + self.genotypes = [] + class ConvertGenoFile(object): def __init__(self, input_file, output_file): @@ -15,6 +33,9 @@ class ConvertGenoFile(object): self.input_file = input_file self.output_file = output_file + self.mb_exists = False + self.markers = [] + self.latest_row_pos = None self.latest_col_pos = None @@ -23,7 +44,7 @@ class ConvertGenoFile(object): def convert(self): - self.prefer_config = { + self.haplotype_notation = { '@mat': "1", '@pat': "0", '@het': "0.5", @@ -31,36 +52,56 @@ class ConvertGenoFile(object): } self.configurations = {} - self.skipped_cols = 3 + #self.skipped_cols = 3 self.input_fh = open(self.input_file) - with open(self.output_file, "w") as self.output_fh: self.process_csv() - - - + + #def process_row(self, row): # counter = 0 # for char in row: # if char # counter += 1 - + def process_csv(self): for row_count, row in enumerate(self.process_rows()): #self.latest_row_pos = row_count - for item_count, item in enumerate(row.split()[self.skipped_cols:]): + row_items = row.split() + + this_marker = Marker() + this_marker.name = row_items[1] + this_marker.chr = row_items[0] + this_marker.cM = row_items[2] + if self.mb_exists: + this_marker.Mb = row_items[3] + genotypes = row_items[4:] + else: + genotypes = row_items[3:] + for item_count, genotype in enumerate(genotypes): + this_marker.genotypes.append(self.configurations[genotype.upper()]) + + #print("this_marker is:", pf(this_marker.__dict__)) + + self.markers.append(this_marker.__dict__) + + with open(self.output_file, 'w') as fh: + json.dump(self.markers, fh, indent=" ", sort_keys=True) + # print('configurations:', str(configurations)) - self.latest_col_pos = item_count + self.skipped_cols - self.latest_col_value = item - if item_count != 0: - self.output_fh.write(" ") - self.output_fh.write(self.configurations[item.upper()]) + #self.latest_col_pos = item_count + self.skipped_cols + #self.latest_col_value = item + + #if item_count != 0: + # self.output_fh.write(" ") + #self.output_fh.write(self.configurations[item.upper()]) - self.output_fh.write("\n") - + #self.output_fh.write("\n") + + def process_rows(self): for self.latest_row_pos, row in enumerate(self.input_fh): self.latest_row_value = row @@ -69,14 +110,14 @@ class ConvertGenoFile(object): continue if row.startswith('Chr'): if 'Mb' in row.split(): - self.skipped_cols = 4 + self.mb_exists = True continue if row.startswith('@'): key, _separater, value = row.partition(':') key = key.strip() value = value.strip() - if key in self.prefer_config: - self.configurations[value] = self.prefer_config[key] + if key in self.haplotype_notation: + self.configurations[value] = self.haplotype_notation[key] continue if not len(self.configurations): raise EmptyConfigurations @@ -87,17 +128,17 @@ class ConvertGenoFile(object): os.chdir(old_directory) for input_file in glob.glob("*.geno"): group_name = input_file.split('.')[0] - output_file = os.path.join(new_directory, group_name + ".snps") + output_file = os.path.join(new_directory, group_name + ".json") print("%s -> %s" % (input_file, output_file)) convertob = ConvertGenoFile(input_file, output_file) try: - convertob.convert() + convertob.convert() except EmptyConfigurations as why: print(" No 
config info? Continuing...") #excepted = True continue except Exception as why: - + print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %i at tabular column %i" % (convertob.latest_row_pos, diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 1ae663d4..015c2e14 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -1,109 +1,128 @@ -# pyLMM software Copyright 2012, Nicholas A. Furlotte -# Version 0.1 - -#License Details -#--------------- - -# The program is free for academic use. Please contact Nick Furlotte -# <nick.furlotte@gmail.com> if you are interested in using the software for -# commercial purposes. - -# The software must not be modified and distributed without prior -# permission of the author. -# Any instance of this software must retain the above copyright notice. - -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. from __future__ import absolute_import, print_function, division import sys import time import numpy as np -import numpy.linalg as linalg +from scipy import linalg from scipy import optimize from scipy import stats -#import matplotlib.pyplot as pl import pdb from pprint import pformat as pf +#np.seterr('raise') + +def matrixMult(A,B): + #return np.dot(A,B) + + print("A is:", pf(A.shape)) + print("B is:", pf(B.shape)) + + # If the matrices are in Fortran order then the computations will be faster + # when using dgemm. Otherwise, the function will copy the matrix and that takes time. + if not A.flags['F_CONTIGUOUS']: + AA = A.T + transA = True + else: + AA = A + transA = False + + if not B.flags['F_CONTIGUOUS']: + BB = B.T + transB = True + else: + BB = B + transB = False + + return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) + def calculateKinship(W): - """ - W is an n x m matrix encoding SNP minor alleles. 
- """ - n = W.shape[0] - m = W.shape[1] - keep = [] - for i in range(m): - mn = W[True - np.isnan(W[:,i]),i].mean() - W[np.isnan(W[:,i]),i] = mn - vr = W[:,i].var() - if vr == 0: continue - - keep.append(i) - W[:,i] = (W[:,i] - mn) / np.sqrt(vr) - - W = W[:,keep] - K = np.dot(W,W.T) * 1.0/float(m) - return K + """ + W is an n x m matrix encoding SNP minor alleles. + + This function takes a matrix oF SNPs, imputes missing values with the maf, + normalizes the resulting vectors and returns the RRM matrix. + """ + n = W.shape[0] + m = W.shape[1] + keep = [] + for i in range(m): + mn = W[True - np.isnan(W[:,i]),i].mean() + W[np.isnan(W[:,i]),i] = mn + vr = W[:,i].var() + if vr == 0: continue + + keep.append(i) + W[:,i] = (W[:,i] - mn) / np.sqrt(vr) + + W = W[:,keep] + K = matrixMult(W,W.T) * 1.0/float(m) + return K def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): - """ - Performs a basic GWAS scan using the LMM. This function - uses the LMM module to assess association at each SNP and - does some simple cleanup, such as removing missing individuals - per SNP and re-computing the eigen-decomp - - Y - n x 1 phenotype vector - X - n x m SNP matrix - K - n x n kinship matrix - Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K - X0 - n x q covariate matrix - REML - use restricted maximum likelihood - refit - refit the variance component for each SNP - """ - n = X.shape[0] - m = X.shape[1] - - if X0 == None: X0 = np.ones((n,1)) - - # Remove missing values in Y and adjust associated parameters - v = np.isnan(Y) - if v.sum(): - keep = True - v - Y = Y[keep] - X = X[keep,:] - X0 = X0[keep,:] - K = K[keep,:][:,keep] - Kva = [] - Kve = [] - - L = LMM(Y,K,Kva,Kve,X0) - if not refit: L.fit() - - PS = [] - TS = [] - - for i in range(m): - x = X[:,i].reshape((n,1)) - v = np.isnan(x).reshape((-1,)) - if v.sum(): + """ + Performs a basic GWAS scan using the LMM. 
This function + uses the LMM module to assess association at each SNP and + does some simple cleanup, such as removing missing individuals + per SNP and re-computing the eigen-decomp + + Y - n x 1 phenotype vector + X - n x m SNP matrix + K - n x n kinship matrix + Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K + X0 - n x q covariate matrix + REML - use restricted maximum likelihood + refit - refit the variance component for each SNP + """ + n = X.shape[0] + m = X.shape[1] + + if X0 == None: X0 = np.ones((n,1)) + + # Remove missing values in Y and adjust associated parameters + v = np.isnan(Y) + if v.sum(): + keep = True - v + Y = Y[keep] + X = X[keep,:] + X0 = X0[keep,:] + K = K[keep,:][:,keep] + Kva = [] + Kve = [] + + L = LMM(Y,K,Kva,Kve,X0) + if not refit: L.fit() + + PS = [] + TS = [] + + for i in range(m): + x = X[:,i].reshape((n,1)) + v = np.isnan(x).reshape((-1,)) + if v.sum(): keep = True - v xs = x[keep,:] - if xs.var() == 0: - PS.append(np.nan) - TS.append(np.nan) - continue + if xs.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + continue Ys = Y[keep] X0s = X0[keep,:] @@ -112,245 +131,251 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): if refit: Ls.fit(X=xs) else: Ls.fit() ts,ps = Ls.association(xs,REML=REML) - else: - if x.var() == 0: - PS.append(np.nan) - TS.append(np.nan) - continue + else: + if x.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + continue if refit: L.fit(X=x) ts,ps = L.association(x,REML=REML) + + PS.append(ps) + TS.append(ts) - PS.append(ps) - TS.append(ts) - - return TS,PS + return TS,PS class LMM: - """ - This is a simple version of EMMA/fastLMM. - The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K) - and to optimize this model by finding the maximum-likelihood estimates for the model parameters. - There are three model parameters: heritability (h), covariate coefficients (beta) and the total - phenotypic variance (sigma). - Heritability as defined here is the proportion of the total variance (sigma) that is attributed to - the kinship matrix. - - For simplicity, we assume that everything being input is a numpy array. - If this is not the case, the module may throw an error as conversion from list to numpy array - is not done consistently. - - """ - def __init__(self, Y, K, Kva=None, Kve=None, X0=None): - """ - The constructor takes a phenotype vector or array of size n. - It takes a kinship matrix of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached. - If they are not provided, the constructor will calculate them. - X0 is an optional covariate matrix of size n x q, where there are q covariates. - When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. 
- - """ - - if Kva is None: - Kva = [] - if Kve is None: - Kve = [] - - - if X0 == None: - X0 = np.ones(len(Y)).reshape(len(Y),1) - - print("Y is:", pf(Y)) - - for key, value in locals().iteritems(): - print(" %s - %s" % (key, type(value))) - - x = Y != -9 - print("x is:", pf(x)) - if not x.sum() == len(Y): - print("x.sum is:", pf(x.sum())) - print("len(Y) is:", pf(len(Y))) - sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum())) - Y = Y[x] - K = K[x,:][:,x] - X0 = X0[x,:] - Kva = [] - Kve = [] - self.nonmissing = x - - if len(Kva) == 0 or len(Kve) == 0: - sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) - begin = time.time() - Kva,Kve = linalg.eigh(K) - end = time.time() - sys.stderr.write("Total time: %0.3f\n" % (end - begin)) - self.K = K - self.Kva = Kva - self.Kve = Kve - self.Y = Y - self.X0 = X0 - self.N = self.K.shape[0] - - self.transform() - - def transform(self): + """ + This is a simple version of EMMA/fastLMM. + The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K) + and to optimize this model by finding the maximum-likelihood estimates for the model parameters. + There are three model parameters: heritability (h), covariate coefficients (beta) and the total + phenotypic variance (sigma). + Heritability as defined here is the proportion of the total variance (sigma) that is attributed to + the kinship matrix. + + For simplicity, we assume that everything being input is a numpy array. + If this is not the case, the module may throw an error as conversion from list to numpy array + is not done consistently. + + """ + def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False): + + """ + The constructor takes a phenotype vector or array of size n. + It takes a kinship matrix of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached. + If they are not provided, the constructor will calculate them. + X0 is an optional covariate matrix of size n x q, where there are q covariates. + When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. + """ + + if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1) + self.verbose = verbose + + #x = Y != -9 + x = True - np.isnan(Y) + if not x.sum() == len(Y): + if self.verbose: sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum())) + Y = Y[x] + K = K[x,:][:,x] + X0 = X0[x,:] + Kva = [] + Kve = [] + self.nonmissing = x + + if len(Kva) == 0 or len(Kve) == 0: + if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) + begin = time.time() + Kva,Kve = linalg.eigh(K) + end = time.time() + if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) + + self.K = K + self.Kva = Kva + self.Kve = Kve + self.Y = Y + self.X0 = X0 + self.N = self.K.shape[0] + + if sum(self.Kva < 1e-6): + if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0))) + self.Kva[self.Kva < 1e-6] = 1e-6 + + self.transform() + + def transform(self): """ Computes a transformation on the phenotype vector and the covariate matrix. - The transformation is obtained by left multiplying each parameter by the transpose of the + The transformation is obtained by left multiplying each parameter by the transpose of the eigenvector matrix of K (the kinship). 
""" - - print(len(self.Kve.T)) - print(len(self.Y)) - - self.Yt = np.dot(self.Kve.T, self.Y) - self.X0t = np.dot(self.Kve.T, self.X0) - - def getMLSoln(self,h,X): - - """ - Obtains the maximum-likelihood estimates for the covariate coefficients (beta), - the total variance of the trait (sigma) and also passes intermediates that can - be utilized in other functions. The input parameter h is a value between 0 and 1 and represents - the heritability or the proportion of the total variance attributed to genetics. The X is the - covariate matrix. - """ - - #print("h is", pf(h)) - #print("X is", pf(X)) - print("X.shape is", pf(X.shape)) - - S = 1.0/(h*self.Kva + (1.0 - h)) - Xt = X.T*S - XX = np.dot(Xt,X) - - - XX_i = linalg.inv(XX) - beta = np.dot(np.dot(XX_i,Xt),self.Yt) - Yt = self.Yt - np.dot(X,beta) - Q = np.dot(Yt.T*S,Yt) - sigma = Q * 1.0 / (float(len(self.Yt)) - float(X.shape[1])) - return beta,sigma,Q,XX_i,XX - - def LL_brent(self,h,X=None,REML=False): return -self.LL(h,X,stack=False,REML=REML)[0] - def LL(self,h,X=None,stack=True,REML=False): - - """ - Computes the log-likelihood for a given heritability (h). If X==None, then the - default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with - the input X. If stack is false, then X is used in place of X0t in the LL calculation. - REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True. - """ - - if X == None: X = self.X0t - elif stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) - - n = float(self.N) - q = float(X.shape[1]) - beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X) - LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q) - LL = -0.5 * LL - - if REML: - LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(linalg.det(np.dot(X.T,X))) - np.log(linalg.det(XX)) - LL = LL + 0.5*LL_REML_part - - return LL,beta,sigma,XX_i - - def getMax(self,H, X=None,REML=False): - - """ - Helper functions for .fit(...). - This function takes a set of LLs computed over a grid and finds possible regions - containing a maximum. Within these regions, a Brent search is performed to find the - optimum. - - """ - n = len(self.LLs) - HOpt = [] - for i in range(1,n-2): - if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1]))) - - if len(HOpt) > 1: - sys.stderr.write("ERR: Found multiple maximum. Returning first...\n") - return HOpt[0] - elif len(HOpt) == 1: return HOpt[0] - elif self.LLs[0] > self.LLs[n-1]: return H[0] - else: return H[n-1] - - def fit(self,X=None,ngrids=100,REML=True): - - """ - Finds the maximum-likelihood solution for the heritability (h) given the current parameters. - X can be passed and will transformed and concatenated to X0t. Otherwise, X0t is used as - the covariate matrix. - - This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum. - Given this optimum, the function computes the LL and associated ML solutions. 
- """ - - if X == None: X = self.X0t - else: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) - H = np.array(range(ngrids)) / float(ngrids) - L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H]) - self.LLs = L - - hmax = self.getMax(H,X,REML) - L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML) - - self.H = H - self.optH = hmax - self.optLL = L - self.optBeta = beta - self.optSigma = sigma - - return hmax,beta,sigma,L - - - def association(self,X, h = None, stack=True,REML=True): - - """ - Calculates association statitics for the SNPs encoded in the vector X of size n. - If h == None, the optimal h stored in optH is used. - - """ - if stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) - if h == None: h = self.optH - - L,beta,sigma,betaSTDERR = self.LL(h,X,stack=False,REML=REML) - q = len(beta) - ts,ps = self.tstat(beta[q-1],betaSTDERR[q-1,q-1],sigma,q) - return ts,ps - - def tstat(self,beta,stderr,sigma,q): - - """ - Calculates a t-statistic and associated p-value given the estimate of beta and its standard error. - This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test. - """ - - ts = beta / np.sqrt(stderr * sigma) - ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q)) - return ts,ps - - def plotFit(self,color='b-',title=''): - - """ - Simple function to visualize the likelihood space. It takes the LLs - calcualted over a grid and normalizes them by subtracting off the mean and exponentiating. - The resulting "probabilities" are normalized to one and plotted against heritability. - This can be seen as an approximation to the posterior distribuiton of heritability. - - For diagnostic purposes this lets you see if there is one distinct maximum or multiple - and what the variance of the parameter looks like. - """ - mx = self.LLs.max() - p = np.exp(self.LLs - mx) - p = p/p.sum() - - pl.plot(self.H,p,color) - pl.xlabel("Heritability") - pl.ylabel("Probability of data") - pl.title(title) + + self.Yt = matrixMult(self.Kve.T, self.Y) + self.X0t = matrixMult(self.Kve.T, self.X0) + self.X0t_stack = np.hstack([self.X0t, np.ones((self.N,1))]) + self.q = self.X0t.shape[1] + + def getMLSoln(self,h,X): + + """ + Obtains the maximum-likelihood estimates for the covariate coefficients (beta), + the total variance of the trait (sigma) and also passes intermediates that can + be utilized in other functions. The input parameter h is a value between 0 and 1 and represents + the heritability or the proportion of the total variance attributed to genetics. The X is the + covariate matrix. + """ + + S = 1.0/(h*self.Kva + (1.0 - h)) + Xt = X.T*S + XX = matrixMult(Xt,X) + XX_i = linalg.inv(XX) + beta = matrixMult(matrixMult(XX_i,Xt),self.Yt) + Yt = self.Yt - matrixMult(X,beta) + Q = np.dot(Yt.T*S,Yt) + sigma = Q * 1.0 / (float(self.N) - float(X.shape[1])) + return beta,sigma,Q,XX_i,XX + + def LL_brent(self,h,X=None,REML=False): + #brent will not be bounded by the specified bracket. + # I return a large number if we encounter h < 0 to avoid errors in LL computation during the search. + if h < 0: return 1e6 + return -self.LL(h,X,stack=False,REML=REML)[0] + + def LL(self,h,X=None,stack=True,REML=False): + + """ + Computes the log-likelihood for a given heritability (h). If X==None, then the + default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with + the input X. If stack is false, then X is used in place of X0t in the LL calculation. 
+ REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True. + """ + + if X == None: X = self.X0t + elif stack: + self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] + X = self.X0t_stack + + n = float(self.N) + q = float(X.shape[1]) + beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X) + LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q) + LL = -0.5 * LL + + if REML: + LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(linalg.det(matrixMult(X.T,X))) - np.log(linalg.det(XX)) + LL = LL + 0.5*LL_REML_part + + return LL,beta,sigma,XX_i + + def getMax(self,H, X=None,REML=False): + + """ + Helper functions for .fit(...). + This function takes a set of LLs computed over a grid and finds possible regions + containing a maximum. Within these regions, a Brent search is performed to find the + optimum. + + """ + n = len(self.LLs) + HOpt = [] + for i in range(1,n-2): + if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: + HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1]))) + if np.isnan(HOpt[-1][0]): HOpt[-1][0] = [self.LLs[i-1]] + + if len(HOpt) > 1: + if self.verbose: sys.stderr.write("NOTE: Found multiple optima. Returning first...\n") + return HOpt[0] + elif len(HOpt) == 1: return HOpt[0] + elif self.LLs[0] > self.LLs[n-1]: return H[0] + else: return H[n-1] + + def fit(self,X=None,ngrids=100,REML=True): + + """ + Finds the maximum-likelihood solution for the heritability (h) given the current parameters. + X can be passed and will transformed and concatenated to X0t. Otherwise, X0t is used as + the covariate matrix. + + This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum. + Given this optimum, the function computes the LL and associated ML solutions. + """ + + if X == None: X = self.X0t + else: + #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)]) + self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] + X = self.X0t_stack + + H = np.array(range(ngrids)) / float(ngrids) + L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H]) + self.LLs = L + + hmax = self.getMax(H,X,REML) + L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML) + + self.H = H + self.optH = hmax + self.optLL = L + self.optBeta = beta + self.optSigma = sigma + + return hmax,beta,sigma,L + + def association(self,X, h = None, stack=True,REML=True, returnBeta=False): + + """ + Calculates association statitics for the SNPs encoded in the vector X of size n. + If h == None, the optimal h stored in optH is used. + + """ + if stack: + #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)]) + self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] + X = self.X0t_stack + + if h == None: h = self.optH + + L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML) + q = len(beta) + ts,ps = self.tstat(beta[q-1],betaVAR[q-1,q-1],sigma,q) + + if returnBeta: return ts,ps,beta[q-1].sum(),betaVAR[q-1,q-1].sum()*sigma + return ts,ps + + def tstat(self,beta,var,sigma,q): + + """ + Calculates a t-statistic and associated p-value given the estimate of beta and its standard error. + This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test. 
+ """ + + ts = beta / np.sqrt(var * sigma) + ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q)) + if not len(ts) == 1 or not len(ps) == 1: raise Exception("Something bad happened :(") + return ts.sum(),ps.sum() + + def plotFit(self,color='b-',title=''): + + """ + Simple function to visualize the likelihood space. It takes the LLs + calcualted over a grid and normalizes them by subtracting off the mean and exponentiating. + The resulting "probabilities" are normalized to one and plotted against heritability. + This can be seen as an approximation to the posterior distribuiton of heritability. + + For diagnostic purposes this lets you see if there is one distinct maximum or multiple + and what the variance of the parameter looks like. + """ + import matplotlib.pyplot as pl + + mx = self.LLs.max() + p = np.exp(self.LLs - mx) + p = p/p.sum() + + pl.plot(self.H,p,color) + pl.xlabel("Heritability") + pl.ylabel("Probability of data") + pl.title(title)
\ No newline at end of file
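
The converter in genofile_parser.py now writes one JSON file per group (group_name + ".json") in place of the old .snps text format: each entry is the __dict__ of a Marker, and gen_data in marker_regression.py reads the "genotypes" field back out. A small sketch of that round trip follows; the path and the marker values are made up purely for illustration, and the "Mb" key is only present when the source .geno header actually has an Mb column.

    import json

    # Illustrative contents of a converted file, e.g. BXD.json
    # (marker names and values invented for the example):
    # [
    #   {"name": "rs31443144", "chr": "1", "cM": "1.50", "Mb": "3.010",
    #    "genotypes": ["1", "0", "0.5", "1"]},
    #   ...
    # ]

    with open("/path/to/BXD.json") as json_data:   # webqtlConfig.NEWGENODIR + group name in gen_data
        markers = json.load(json_data)

    genotype_data = [marker['genotypes'] for marker in markers]
    print(markers[0]['name'], markers[0]['chr'], len(genotype_data), "markers")

Note that the parser writes the strings taken from haplotype_notation ("1", "0", "0.5"), which is why trim_genotypes in marker_regression.py converts the values to floats before the genotype matrix is built.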
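
Taken together, the new gen_data path is: load the JSON genotypes, drop samples whose phenotype value is "x", build a kinship matrix, fit the LMM, then scan every marker with GWAS. Below is a minimal sketch of that call sequence against the pyLMM API added here, using synthetic NumPy data instead of a real .geno group; the shapes, the random values and the import path are assumptions for illustration only.

    import numpy as np
    from wqflask.my_pylmm.pyLMM import lmm   # "from pyLMM import lmm" in the parser above

    n, m = 50, 200                            # individuals x markers
    rng = np.random.RandomState(0)
    genotypes = rng.choice([0.0, 0.5, 1.0], size=(n, m))   # same encoding as haplotype_notation
    pheno_vector = rng.randn(n)

    kinship_matrix = lmm.calculateKinship(genotypes)        # n x n realized-relationship matrix

    lmm_ob = lmm.LMM(pheno_vector, kinship_matrix)
    lmm_ob.fit()                                            # grid search plus Brent refinement for h

    t_stats, p_values = lmm.GWAS(pheno_vector, genotypes, kinship_matrix,
                                 REML=True, refit=False)
    print(len(p_values), "p-values, one per marker")

This mirrors gen_data above, where the rows read from the JSON file are trimmed of empty samples and transposed to samples x markers before the same three calls.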
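
matrixMult exists to route the repeated products in transform, getMLSoln and LL through BLAS dgemm while avoiding the copy dgemm would otherwise make of C-ordered operands: a C-contiguous array's transpose is already Fortran-contiguous, so the code passes the transpose together with a trans_a/trans_b flag. The scipy.linalg.fblas name used in the diff is the older SciPy spelling; the sketch below shows the same trick through scipy.linalg.blas, which is an assumption about the reader's SciPy version rather than part of this commit.

    import numpy as np
    from scipy.linalg import blas            # exposed as scipy.linalg.fblas in older SciPy

    def matrix_mult(A, B):
        """Compute A.dot(B) with dgemm, handing BLAS Fortran-ordered views."""
        # For a C-ordered operand, pass its (F-ordered) transpose and set the
        # corresponding trans flag so dgemm still multiplies A by B.
        a, trans_a = (A, False) if A.flags['F_CONTIGUOUS'] else (A.T, True)
        b, trans_b = (B, False) if B.flags['F_CONTIGUOUS'] else (B.T, True)
        return blas.dgemm(alpha=1.0, a=a, b=b, trans_a=trans_a, trans_b=trans_b)

    A = np.random.randn(400, 300)
    B = np.random.randn(300, 200)
    assert np.allclose(matrix_mult(A, B), np.dot(A, B))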
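
For reference, the quantity evaluated over the heritability grid in fit and refined by the Brent search is the profiled log-likelihood computed in LL above. Writing \lambda_i for the eigenvalues of K, S = \mathrm{diag}\big(1/(h\lambda_i + 1 - h)\big), Q = (Y_t - X\beta)^{\top} S (Y_t - X\beta), q for the number of columns of X and \hat\sigma^2 = Q/(n - q) (the code's sigma), the code computes

    \ell(h) = -\tfrac{1}{2}\Big[\, n\log 2\pi \;+\; \sum_{i=1}^{n}\log\big(h\lambda_i + 1 - h\big) \;+\; n \;+\; n\log\tfrac{Q}{n} \Big]

and, when REML=True, adds the restricted-likelihood correction

    \ell_{\mathrm{REML}}(h) = \ell(h) + \tfrac{1}{2}\Big[\, q\log\big(2\pi\,\hat\sigma^{2}\big) + \log\det\big(X^{\top}X\big) - \log\det\big(X^{\top}S X\big) \Big].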