diff options
| author | Zachary Sloan | 2013-02-08 17:47:58 -0600 | 
|---|---|---|
| committer | Zachary Sloan | 2013-02-08 17:47:58 -0600 | 
| commit | bf6e7bd8e473a10d80044fa6bf778b261d5ee6ff (patch) | |
| tree | 2027003734f77b60108bfabb7a8e95dade541463 | |
| parent | 9b0264bf13e994298de95a4e08198336b6c97a38 (diff) | |
| download | genenetwork2-bf6e7bd8e473a10d80044fa6bf778b261d5ee6ff.tar.gz | |
Converted .geno files to json files and wrote code in marker_regression.py
that loads the json files, converts them into the relevant numpy arrays, and passes them into Nick's code (which is returning results that may or may not be correct, but is at least running)
| -rwxr-xr-x | wqflask/wqflask/marker_regression/marker_regression.py | 61 | ||||
| -rw-r--r-- | wqflask/wqflask/my_pylmm/data/genofile_parser.py | 85 | ||||
| -rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 663 | 
3 files changed, 462 insertions, 347 deletions
| diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py index 92270eb2..13ec4280 100755 --- a/wqflask/wqflask/marker_regression/marker_regression.py +++ b/wqflask/wqflask/marker_regression/marker_regression.py @@ -17,6 +17,8 @@ import urllib import numpy as np +import json + from htmlgen import HTMLgen2 as HT from utility import Plot, Bunch from wqflask.interval_analyst import GeneUtil @@ -453,21 +455,43 @@ class MarkerRegression(object): def gen_data(self): """Todo: Fill this in here""" - prep_data.PrepData(self.vals, self.dataset.group.name) + json_data = open(os.path.join(webqtlConfig.NEWGENODIR + self.dataset.group.name + '.json')) + markers = json.load(json_data) + genotype_data = [marker['genotypes'] for marker in markers] + + no_val_samples = self.identify_empty_samples() + trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples) + + #print("trimmed genotype data is:", pf(trimmed_genotype_data)) + + #for marker_object in genotype_data: + # print("marker_object:", pf(marker_object)) + + + #prep_data.PrepData(self.vals, genotype_data) pheno_vector = np.array([float(val) for val in self.vals if val!="x"]) - genotypes = np.genfromtxt(os.path.join(webqtlConfig.TMPDIR, - self.dataset.group.name + '.snps.new')) + genotypes = np.array(trimmed_genotype_data).T + print("genotypes is", pf(genotypes)) + #genotypes = np.genfromtxt(os.path.join(webqtlConfig.TMPDIR, + # self.dataset.group.name + '.snps.new')).T - print("genotypes is:", pf(genotypes)) + print("pheno_vector is:", pf(pheno_vector.shape)) + print("genotypes is:", pf(genotypes.shape)) kinship_matrix = lmm.calculateKinship(genotypes) print("kinship_matrix is:", pf(kinship_matrix)) - print("pheno_vector is:", pf(pheno_vector)) lmm_ob = lmm.LMM(pheno_vector, kinship_matrix) lmm_ob.fit() - + + t_stats, p_values = lmm.GWAS(pheno_vector, + genotypes, + kinship_matrix, + REML=True, + refit=False) + + print("p_values is:", pf(len(p_values))) #calculate QTL for each trait self.qtl_results = self.genotype.regression(strains = self.samples, @@ -633,6 +657,31 @@ class MarkerRegression(object): #return rv,tblobj,bottomInfo + def identify_empty_samples(self): + no_val_samples = [] + for sample_count, val in enumerate(self.vals): + if val == "x": + no_val_samples.append(sample_count) + return no_val_samples + #print("self.no_val_samples:", self.no_val_samples) + #nums = set(range(0, 176)) + #print("not included:", nums-self.empty_columns) + + def trim_genotypes(self, genotype_data, no_value_samples): + trimmed_genotype_data = [] + for marker in genotype_data: + new_genotypes = [] + for item_count, genotype in enumerate(marker): + if item_count in no_value_samples: + continue + try: + genotype = float(genotype) + except ValueError: + pass + new_genotypes.append(genotype) + trimmed_genotype_data.append(new_genotypes) + return trimmed_genotype_data + def plotIntMappingForPLINK(self, fd, canvas, offset= (80, 120, 20, 80), zoom = 1, startMb = None, endMb = None, showLocusForm = "",plinkResultDict={}): #calculating margins xLeftOffset, xRightOffset, yTopOffset, yBottomOffset = offset diff --git a/wqflask/wqflask/my_pylmm/data/genofile_parser.py b/wqflask/wqflask/my_pylmm/data/genofile_parser.py index 1dafecc8..ec8c521c 100644 --- a/wqflask/wqflask/my_pylmm/data/genofile_parser.py +++ b/wqflask/wqflask/my_pylmm/data/genofile_parser.py @@ -1,13 +1,31 @@ #!/usr/bin/python from __future__ import print_function, division, absolute_import -import csv +import sys +sys.path.append("..") import os import glob import traceback +import numpy as np +from pyLMM import lmm + +import simplejson as json + +from pprint import pformat as pf + class EmptyConfigurations(Exception): pass + + +class Marker(object): + def __init__(self): + self.name = None + self.chr = None + self.cM = None + self.Mb = None + self.genotypes = [] + class ConvertGenoFile(object): def __init__(self, input_file, output_file): @@ -15,6 +33,9 @@ class ConvertGenoFile(object): self.input_file = input_file self.output_file = output_file + self.mb_exists = False + self.markers = [] + self.latest_row_pos = None self.latest_col_pos = None @@ -23,7 +44,7 @@ class ConvertGenoFile(object): def convert(self): - self.prefer_config = { + self.haplotype_notation = { '@mat': "1", '@pat': "0", '@het': "0.5", @@ -31,36 +52,56 @@ class ConvertGenoFile(object): } self.configurations = {} - self.skipped_cols = 3 + #self.skipped_cols = 3 self.input_fh = open(self.input_file) - with open(self.output_file, "w") as self.output_fh: self.process_csv() - - - + + #def process_row(self, row): # counter = 0 # for char in row: # if char # counter += 1 - + def process_csv(self): for row_count, row in enumerate(self.process_rows()): #self.latest_row_pos = row_count - for item_count, item in enumerate(row.split()[self.skipped_cols:]): + row_items = row.split() + + this_marker = Marker() + this_marker.name = row_items[1] + this_marker.chr = row_items[0] + this_marker.cM = row_items[2] + if self.mb_exists: + this_marker.Mb = row_items[3] + genotypes = row_items[4:] + else: + genotypes = row_items[3:] + for item_count, genotype in enumerate(genotypes): + this_marker.genotypes.append(self.configurations[genotype.upper()]) + + #print("this_marker is:", pf(this_marker.__dict__)) + + self.markers.append(this_marker.__dict__) + + with open(self.output_file, 'w') as fh: + json.dump(self.markers, fh, indent=" ", sort_keys=True) + # print('configurations:', str(configurations)) - self.latest_col_pos = item_count + self.skipped_cols - self.latest_col_value = item - if item_count != 0: - self.output_fh.write(" ") - self.output_fh.write(self.configurations[item.upper()]) + #self.latest_col_pos = item_count + self.skipped_cols + #self.latest_col_value = item + + #if item_count != 0: + # self.output_fh.write(" ") + #self.output_fh.write(self.configurations[item.upper()]) - self.output_fh.write("\n") - + #self.output_fh.write("\n") + + def process_rows(self): for self.latest_row_pos, row in enumerate(self.input_fh): self.latest_row_value = row @@ -69,14 +110,14 @@ class ConvertGenoFile(object): continue if row.startswith('Chr'): if 'Mb' in row.split(): - self.skipped_cols = 4 + self.mb_exists = True continue if row.startswith('@'): key, _separater, value = row.partition(':') key = key.strip() value = value.strip() - if key in self.prefer_config: - self.configurations[value] = self.prefer_config[key] + if key in self.haplotype_notation: + self.configurations[value] = self.haplotype_notation[key] continue if not len(self.configurations): raise EmptyConfigurations @@ -87,17 +128,17 @@ class ConvertGenoFile(object): os.chdir(old_directory) for input_file in glob.glob("*.geno"): group_name = input_file.split('.')[0] - output_file = os.path.join(new_directory, group_name + ".snps") + output_file = os.path.join(new_directory, group_name + ".json") print("%s -> %s" % (input_file, output_file)) convertob = ConvertGenoFile(input_file, output_file) try: - convertob.convert() + convertob.convert() except EmptyConfigurations as why: print(" No config info? Continuing...") #excepted = True continue except Exception as why: - + print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %i at tabular column %i" % (convertob.latest_row_pos, diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 1ae663d4..015c2e14 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -1,109 +1,128 @@ -# pyLMM software Copyright 2012, Nicholas A. Furlotte -# Version 0.1 - -#License Details -#--------------- - -# The program is free for academic use. Please contact Nick Furlotte -# <nick.furlotte@gmail.com> if you are interested in using the software for -# commercial purposes. - -# The software must not be modified and distributed without prior -# permission of the author. -# Any instance of this software must retain the above copyright notice. - -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# pylmm is a python-based linear mixed-model solver with applications to GWAS + +# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. from __future__ import absolute_import, print_function, division import sys import time import numpy as np -import numpy.linalg as linalg +from scipy import linalg from scipy import optimize from scipy import stats -#import matplotlib.pyplot as pl import pdb from pprint import pformat as pf +#np.seterr('raise') + +def matrixMult(A,B): + #return np.dot(A,B) + + print("A is:", pf(A.shape)) + print("B is:", pf(B.shape)) + + # If the matrices are in Fortran order then the computations will be faster + # when using dgemm. Otherwise, the function will copy the matrix and that takes time. + if not A.flags['F_CONTIGUOUS']: + AA = A.T + transA = True + else: + AA = A + transA = False + + if not B.flags['F_CONTIGUOUS']: + BB = B.T + transB = True + else: + BB = B + transB = False + + return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB) + def calculateKinship(W): - """ - W is an n x m matrix encoding SNP minor alleles. - """ - n = W.shape[0] - m = W.shape[1] - keep = [] - for i in range(m): - mn = W[True - np.isnan(W[:,i]),i].mean() - W[np.isnan(W[:,i]),i] = mn - vr = W[:,i].var() - if vr == 0: continue - - keep.append(i) - W[:,i] = (W[:,i] - mn) / np.sqrt(vr) - - W = W[:,keep] - K = np.dot(W,W.T) * 1.0/float(m) - return K + """ + W is an n x m matrix encoding SNP minor alleles. + + This function takes a matrix oF SNPs, imputes missing values with the maf, + normalizes the resulting vectors and returns the RRM matrix. + """ + n = W.shape[0] + m = W.shape[1] + keep = [] + for i in range(m): + mn = W[True - np.isnan(W[:,i]),i].mean() + W[np.isnan(W[:,i]),i] = mn + vr = W[:,i].var() + if vr == 0: continue + + keep.append(i) + W[:,i] = (W[:,i] - mn) / np.sqrt(vr) + + W = W[:,keep] + K = matrixMult(W,W.T) * 1.0/float(m) + return K def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): - """ - Performs a basic GWAS scan using the LMM. This function - uses the LMM module to assess association at each SNP and - does some simple cleanup, such as removing missing individuals - per SNP and re-computing the eigen-decomp - - Y - n x 1 phenotype vector - X - n x m SNP matrix - K - n x n kinship matrix - Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K - X0 - n x q covariate matrix - REML - use restricted maximum likelihood - refit - refit the variance component for each SNP - """ - n = X.shape[0] - m = X.shape[1] - - if X0 == None: X0 = np.ones((n,1)) - - # Remove missing values in Y and adjust associated parameters - v = np.isnan(Y) - if v.sum(): - keep = True - v - Y = Y[keep] - X = X[keep,:] - X0 = X0[keep,:] - K = K[keep,:][:,keep] - Kva = [] - Kve = [] - - L = LMM(Y,K,Kva,Kve,X0) - if not refit: L.fit() - - PS = [] - TS = [] - - for i in range(m): - x = X[:,i].reshape((n,1)) - v = np.isnan(x).reshape((-1,)) - if v.sum(): + """ + Performs a basic GWAS scan using the LMM. This function + uses the LMM module to assess association at each SNP and + does some simple cleanup, such as removing missing individuals + per SNP and re-computing the eigen-decomp + + Y - n x 1 phenotype vector + X - n x m SNP matrix + K - n x n kinship matrix + Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K + X0 - n x q covariate matrix + REML - use restricted maximum likelihood + refit - refit the variance component for each SNP + """ + n = X.shape[0] + m = X.shape[1] + + if X0 == None: X0 = np.ones((n,1)) + + # Remove missing values in Y and adjust associated parameters + v = np.isnan(Y) + if v.sum(): + keep = True - v + Y = Y[keep] + X = X[keep,:] + X0 = X0[keep,:] + K = K[keep,:][:,keep] + Kva = [] + Kve = [] + + L = LMM(Y,K,Kva,Kve,X0) + if not refit: L.fit() + + PS = [] + TS = [] + + for i in range(m): + x = X[:,i].reshape((n,1)) + v = np.isnan(x).reshape((-1,)) + if v.sum(): keep = True - v xs = x[keep,:] - if xs.var() == 0: - PS.append(np.nan) - TS.append(np.nan) - continue + if xs.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + continue Ys = Y[keep] X0s = X0[keep,:] @@ -112,245 +131,251 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): if refit: Ls.fit(X=xs) else: Ls.fit() ts,ps = Ls.association(xs,REML=REML) - else: - if x.var() == 0: - PS.append(np.nan) - TS.append(np.nan) - continue + else: + if x.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + continue if refit: L.fit(X=x) ts,ps = L.association(x,REML=REML) + + PS.append(ps) + TS.append(ts) - PS.append(ps) - TS.append(ts) - - return TS,PS + return TS,PS class LMM: - """ - This is a simple version of EMMA/fastLMM. - The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K) - and to optimize this model by finding the maximum-likelihood estimates for the model parameters. - There are three model parameters: heritability (h), covariate coefficients (beta) and the total - phenotypic variance (sigma). - Heritability as defined here is the proportion of the total variance (sigma) that is attributed to - the kinship matrix. - - For simplicity, we assume that everything being input is a numpy array. - If this is not the case, the module may throw an error as conversion from list to numpy array - is not done consistently. - - """ - def __init__(self, Y, K, Kva=None, Kve=None, X0=None): - """ - The constructor takes a phenotype vector or array of size n. - It takes a kinship matrix of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached. - If they are not provided, the constructor will calculate them. - X0 is an optional covariate matrix of size n x q, where there are q covariates. - When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. - - """ - - if Kva is None: - Kva = [] - if Kve is None: - Kve = [] - - - if X0 == None: - X0 = np.ones(len(Y)).reshape(len(Y),1) - - print("Y is:", pf(Y)) - - for key, value in locals().iteritems(): - print(" %s - %s" % (key, type(value))) - - x = Y != -9 - print("x is:", pf(x)) - if not x.sum() == len(Y): - print("x.sum is:", pf(x.sum())) - print("len(Y) is:", pf(len(Y))) - sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum())) - Y = Y[x] - K = K[x,:][:,x] - X0 = X0[x,:] - Kva = [] - Kve = [] - self.nonmissing = x - - if len(Kva) == 0 or len(Kve) == 0: - sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) - begin = time.time() - Kva,Kve = linalg.eigh(K) - end = time.time() - sys.stderr.write("Total time: %0.3f\n" % (end - begin)) - self.K = K - self.Kva = Kva - self.Kve = Kve - self.Y = Y - self.X0 = X0 - self.N = self.K.shape[0] - - self.transform() - - def transform(self): + """ + This is a simple version of EMMA/fastLMM. + The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K) + and to optimize this model by finding the maximum-likelihood estimates for the model parameters. + There are three model parameters: heritability (h), covariate coefficients (beta) and the total + phenotypic variance (sigma). + Heritability as defined here is the proportion of the total variance (sigma) that is attributed to + the kinship matrix. + + For simplicity, we assume that everything being input is a numpy array. + If this is not the case, the module may throw an error as conversion from list to numpy array + is not done consistently. + + """ + def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False): + + """ + The constructor takes a phenotype vector or array of size n. + It takes a kinship matrix of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached. + If they are not provided, the constructor will calculate them. + X0 is an optional covariate matrix of size n x q, where there are q covariates. + When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. + """ + + if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1) + self.verbose = verbose + + #x = Y != -9 + x = True - np.isnan(Y) + if not x.sum() == len(Y): + if self.verbose: sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum())) + Y = Y[x] + K = K[x,:][:,x] + X0 = X0[x,:] + Kva = [] + Kve = [] + self.nonmissing = x + + if len(Kva) == 0 or len(Kve) == 0: + if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) + begin = time.time() + Kva,Kve = linalg.eigh(K) + end = time.time() + if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin)) + + self.K = K + self.Kva = Kva + self.Kve = Kve + self.Y = Y + self.X0 = X0 + self.N = self.K.shape[0] + + if sum(self.Kva < 1e-6): + if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0))) + self.Kva[self.Kva < 1e-6] = 1e-6 + + self.transform() + + def transform(self): """ Computes a transformation on the phenotype vector and the covariate matrix. - The transformation is obtained by left multiplying each parameter by the transpose of the + The transformation is obtained by left multiplying each parameter by the transpose of the eigenvector matrix of K (the kinship). """ - - print(len(self.Kve.T)) - print(len(self.Y)) - - self.Yt = np.dot(self.Kve.T, self.Y) - self.X0t = np.dot(self.Kve.T, self.X0) - - def getMLSoln(self,h,X): - - """ - Obtains the maximum-likelihood estimates for the covariate coefficients (beta), - the total variance of the trait (sigma) and also passes intermediates that can - be utilized in other functions. The input parameter h is a value between 0 and 1 and represents - the heritability or the proportion of the total variance attributed to genetics. The X is the - covariate matrix. - """ - - #print("h is", pf(h)) - #print("X is", pf(X)) - print("X.shape is", pf(X.shape)) - - S = 1.0/(h*self.Kva + (1.0 - h)) - Xt = X.T*S - XX = np.dot(Xt,X) - - - XX_i = linalg.inv(XX) - beta = np.dot(np.dot(XX_i,Xt),self.Yt) - Yt = self.Yt - np.dot(X,beta) - Q = np.dot(Yt.T*S,Yt) - sigma = Q * 1.0 / (float(len(self.Yt)) - float(X.shape[1])) - return beta,sigma,Q,XX_i,XX - - def LL_brent(self,h,X=None,REML=False): return -self.LL(h,X,stack=False,REML=REML)[0] - def LL(self,h,X=None,stack=True,REML=False): - - """ - Computes the log-likelihood for a given heritability (h). If X==None, then the - default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with - the input X. If stack is false, then X is used in place of X0t in the LL calculation. - REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True. - """ - - if X == None: X = self.X0t - elif stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) - - n = float(self.N) - q = float(X.shape[1]) - beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X) - LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q) - LL = -0.5 * LL - - if REML: - LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(linalg.det(np.dot(X.T,X))) - np.log(linalg.det(XX)) - LL = LL + 0.5*LL_REML_part - - return LL,beta,sigma,XX_i - - def getMax(self,H, X=None,REML=False): - - """ - Helper functions for .fit(...). - This function takes a set of LLs computed over a grid and finds possible regions - containing a maximum. Within these regions, a Brent search is performed to find the - optimum. - - """ - n = len(self.LLs) - HOpt = [] - for i in range(1,n-2): - if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1]))) - - if len(HOpt) > 1: - sys.stderr.write("ERR: Found multiple maximum. Returning first...\n") - return HOpt[0] - elif len(HOpt) == 1: return HOpt[0] - elif self.LLs[0] > self.LLs[n-1]: return H[0] - else: return H[n-1] - - def fit(self,X=None,ngrids=100,REML=True): - - """ - Finds the maximum-likelihood solution for the heritability (h) given the current parameters. - X can be passed and will transformed and concatenated to X0t. Otherwise, X0t is used as - the covariate matrix. - - This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum. - Given this optimum, the function computes the LL and associated ML solutions. - """ - - if X == None: X = self.X0t - else: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) - H = np.array(range(ngrids)) / float(ngrids) - L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H]) - self.LLs = L - - hmax = self.getMax(H,X,REML) - L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML) - - self.H = H - self.optH = hmax - self.optLL = L - self.optBeta = beta - self.optSigma = sigma - - return hmax,beta,sigma,L - - - def association(self,X, h = None, stack=True,REML=True): - - """ - Calculates association statitics for the SNPs encoded in the vector X of size n. - If h == None, the optimal h stored in optH is used. - - """ - if stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) - if h == None: h = self.optH - - L,beta,sigma,betaSTDERR = self.LL(h,X,stack=False,REML=REML) - q = len(beta) - ts,ps = self.tstat(beta[q-1],betaSTDERR[q-1,q-1],sigma,q) - return ts,ps - - def tstat(self,beta,stderr,sigma,q): - - """ - Calculates a t-statistic and associated p-value given the estimate of beta and its standard error. - This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test. - """ - - ts = beta / np.sqrt(stderr * sigma) - ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q)) - return ts,ps - - def plotFit(self,color='b-',title=''): - - """ - Simple function to visualize the likelihood space. It takes the LLs - calcualted over a grid and normalizes them by subtracting off the mean and exponentiating. - The resulting "probabilities" are normalized to one and plotted against heritability. - This can be seen as an approximation to the posterior distribuiton of heritability. - - For diagnostic purposes this lets you see if there is one distinct maximum or multiple - and what the variance of the parameter looks like. - """ - mx = self.LLs.max() - p = np.exp(self.LLs - mx) - p = p/p.sum() - - pl.plot(self.H,p,color) - pl.xlabel("Heritability") - pl.ylabel("Probability of data") - pl.title(title) + + self.Yt = matrixMult(self.Kve.T, self.Y) + self.X0t = matrixMult(self.Kve.T, self.X0) + self.X0t_stack = np.hstack([self.X0t, np.ones((self.N,1))]) + self.q = self.X0t.shape[1] + + def getMLSoln(self,h,X): + + """ + Obtains the maximum-likelihood estimates for the covariate coefficients (beta), + the total variance of the trait (sigma) and also passes intermediates that can + be utilized in other functions. The input parameter h is a value between 0 and 1 and represents + the heritability or the proportion of the total variance attributed to genetics. The X is the + covariate matrix. + """ + + S = 1.0/(h*self.Kva + (1.0 - h)) + Xt = X.T*S + XX = matrixMult(Xt,X) + XX_i = linalg.inv(XX) + beta = matrixMult(matrixMult(XX_i,Xt),self.Yt) + Yt = self.Yt - matrixMult(X,beta) + Q = np.dot(Yt.T*S,Yt) + sigma = Q * 1.0 / (float(self.N) - float(X.shape[1])) + return beta,sigma,Q,XX_i,XX + + def LL_brent(self,h,X=None,REML=False): + #brent will not be bounded by the specified bracket. + # I return a large number if we encounter h < 0 to avoid errors in LL computation during the search. + if h < 0: return 1e6 + return -self.LL(h,X,stack=False,REML=REML)[0] + + def LL(self,h,X=None,stack=True,REML=False): + + """ + Computes the log-likelihood for a given heritability (h). If X==None, then the + default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with + the input X. If stack is false, then X is used in place of X0t in the LL calculation. + REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True. + """ + + if X == None: X = self.X0t + elif stack: + self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] + X = self.X0t_stack + + n = float(self.N) + q = float(X.shape[1]) + beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X) + LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q) + LL = -0.5 * LL + + if REML: + LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(linalg.det(matrixMult(X.T,X))) - np.log(linalg.det(XX)) + LL = LL + 0.5*LL_REML_part + + return LL,beta,sigma,XX_i + + def getMax(self,H, X=None,REML=False): + + """ + Helper functions for .fit(...). + This function takes a set of LLs computed over a grid and finds possible regions + containing a maximum. Within these regions, a Brent search is performed to find the + optimum. + + """ + n = len(self.LLs) + HOpt = [] + for i in range(1,n-2): + if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: + HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1]))) + if np.isnan(HOpt[-1][0]): HOpt[-1][0] = [self.LLs[i-1]] + + if len(HOpt) > 1: + if self.verbose: sys.stderr.write("NOTE: Found multiple optima. Returning first...\n") + return HOpt[0] + elif len(HOpt) == 1: return HOpt[0] + elif self.LLs[0] > self.LLs[n-1]: return H[0] + else: return H[n-1] + + def fit(self,X=None,ngrids=100,REML=True): + + """ + Finds the maximum-likelihood solution for the heritability (h) given the current parameters. + X can be passed and will transformed and concatenated to X0t. Otherwise, X0t is used as + the covariate matrix. + + This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum. + Given this optimum, the function computes the LL and associated ML solutions. + """ + + if X == None: X = self.X0t + else: + #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)]) + self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] + X = self.X0t_stack + + H = np.array(range(ngrids)) / float(ngrids) + L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H]) + self.LLs = L + + hmax = self.getMax(H,X,REML) + L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML) + + self.H = H + self.optH = hmax + self.optLL = L + self.optBeta = beta + self.optSigma = sigma + + return hmax,beta,sigma,L + + def association(self,X, h = None, stack=True,REML=True, returnBeta=False): + + """ + Calculates association statitics for the SNPs encoded in the vector X of size n. + If h == None, the optimal h stored in optH is used. + + """ + if stack: + #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)]) + self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0] + X = self.X0t_stack + + if h == None: h = self.optH + + L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML) + q = len(beta) + ts,ps = self.tstat(beta[q-1],betaVAR[q-1,q-1],sigma,q) + + if returnBeta: return ts,ps,beta[q-1].sum(),betaVAR[q-1,q-1].sum()*sigma + return ts,ps + + def tstat(self,beta,var,sigma,q): + + """ + Calculates a t-statistic and associated p-value given the estimate of beta and its standard error. + This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test. + """ + + ts = beta / np.sqrt(var * sigma) + ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q)) + if not len(ts) == 1 or not len(ps) == 1: raise Exception("Something bad happened :(") + return ts.sum(),ps.sum() + + def plotFit(self,color='b-',title=''): + + """ + Simple function to visualize the likelihood space. It takes the LLs + calcualted over a grid and normalizes them by subtracting off the mean and exponentiating. + The resulting "probabilities" are normalized to one and plotted against heritability. + This can be seen as an approximation to the posterior distribuiton of heritability. + + For diagnostic purposes this lets you see if there is one distinct maximum or multiple + and what the variance of the parameter looks like. + """ + import matplotlib.pyplot as pl + + mx = self.LLs.max() + p = np.exp(self.LLs - mx) + p = p/p.sum() + + pl.plot(self.H,p,color) + pl.xlabel("Heritability") + pl.ylabel("Probability of data") + pl.title(title) \ No newline at end of file | 
