 -rwxr-xr-x  wqflask/wqflask/marker_regression/marker_regression.py |  61
 -rw-r--r--  wqflask/wqflask/my_pylmm/data/genofile_parser.py       |  85
 -rw-r--r--  wqflask/wqflask/my_pylmm/pyLMM/lmm.py                  | 663
 3 files changed, 462 insertions(+), 347 deletions(-)
diff --git a/wqflask/wqflask/marker_regression/marker_regression.py b/wqflask/wqflask/marker_regression/marker_regression.py
index 92270eb2..13ec4280 100755
--- a/wqflask/wqflask/marker_regression/marker_regression.py
+++ b/wqflask/wqflask/marker_regression/marker_regression.py
@@ -17,6 +17,8 @@ import urllib
import numpy as np
+import json
+
from htmlgen import HTMLgen2 as HT
from utility import Plot, Bunch
from wqflask.interval_analyst import GeneUtil
@@ -453,21 +455,43 @@ class MarkerRegression(object):
def gen_data(self):
"""Todo: Fill this in here"""
- prep_data.PrepData(self.vals, self.dataset.group.name)
+ json_data = open(os.path.join(webqtlConfig.NEWGENODIR + self.dataset.group.name + '.json'))
+ markers = json.load(json_data)
+ genotype_data = [marker['genotypes'] for marker in markers]
+
+ no_val_samples = self.identify_empty_samples()
+ trimmed_genotype_data = self.trim_genotypes(genotype_data, no_val_samples)
+
+ #print("trimmed genotype data is:", pf(trimmed_genotype_data))
+
+ #for marker_object in genotype_data:
+ # print("marker_object:", pf(marker_object))
+
+
+ #prep_data.PrepData(self.vals, genotype_data)
pheno_vector = np.array([float(val) for val in self.vals if val!="x"])
- genotypes = np.genfromtxt(os.path.join(webqtlConfig.TMPDIR,
- self.dataset.group.name + '.snps.new'))
+ genotypes = np.array(trimmed_genotype_data).T
+ print("genotypes is", pf(genotypes))
+ #genotypes = np.genfromtxt(os.path.join(webqtlConfig.TMPDIR,
+ # self.dataset.group.name + '.snps.new')).T
- print("genotypes is:", pf(genotypes))
+ print("pheno_vector is:", pf(pheno_vector.shape))
+ print("genotypes is:", pf(genotypes.shape))
kinship_matrix = lmm.calculateKinship(genotypes)
print("kinship_matrix is:", pf(kinship_matrix))
- print("pheno_vector is:", pf(pheno_vector))
lmm_ob = lmm.LMM(pheno_vector, kinship_matrix)
lmm_ob.fit()
-
+
+ t_stats, p_values = lmm.GWAS(pheno_vector,
+ genotypes,
+ kinship_matrix,
+ REML=True,
+ refit=False)
+
+ print("p_values is:", pf(len(p_values)))
#calculate QTL for each trait
self.qtl_results = self.genotype.regression(strains = self.samples,
@@ -633,6 +657,31 @@ class MarkerRegression(object):
#return rv,tblobj,bottomInfo
+ def identify_empty_samples(self):
+ no_val_samples = []
+ for sample_count, val in enumerate(self.vals):
+ if val == "x":
+ no_val_samples.append(sample_count)
+ return no_val_samples
+ #print("self.no_val_samples:", self.no_val_samples)
+ #nums = set(range(0, 176))
+ #print("not included:", nums-self.empty_columns)
+
+ def trim_genotypes(self, genotype_data, no_value_samples):
+ trimmed_genotype_data = []
+ for marker in genotype_data:
+ new_genotypes = []
+ for item_count, genotype in enumerate(marker):
+ if item_count in no_value_samples:
+ continue
+ try:
+ genotype = float(genotype)
+ except ValueError:
+ pass
+ new_genotypes.append(genotype)
+ trimmed_genotype_data.append(new_genotypes)
+ return trimmed_genotype_data
+
def plotIntMappingForPLINK(self, fd, canvas, offset= (80, 120, 20, 80), zoom = 1, startMb = None, endMb = None, showLocusForm = "",plinkResultDict={}):
#calculating margins
xLeftOffset, xRightOffset, yTopOffset, yBottomOffset = offset
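
The gen_data() changes above load the group's pre-parsed .json marker file, drop every sample whose phenotype is the placeholder "x", and hand the transposed genotype matrix to pyLMM. Below is a minimal, self-contained sketch of that trimming step; it mirrors identify_empty_samples() and trim_genotypes() from the hunk above, but the two markers and four phenotype values are invented illustration data.

import numpy as np

def identify_empty_samples(vals):
    """Indices of samples whose phenotype value is the placeholder 'x'."""
    return [i for i, val in enumerate(vals) if val == "x"]

def trim_genotypes(genotype_data, no_value_samples):
    """Drop the listed sample columns and coerce genotype codes to float."""
    skip = set(no_value_samples)
    trimmed = []
    for marker in genotype_data:
        new_genotypes = []
        for idx, genotype in enumerate(marker):
            if idx in skip:
                continue
            try:
                genotype = float(genotype)
            except ValueError:
                pass  # leave non-numeric codes (e.g. "NA") untouched, as the code above does
            new_genotypes.append(genotype)
        trimmed.append(new_genotypes)
    return trimmed

vals = ["1.2", "x", "0.8", "1.1"]                      # one missing phenotype
markers = [{"name": "m1", "genotypes": ["1", "0", "0.5", "1"]},
           {"name": "m2", "genotypes": ["0", "1", "1", "0.5"]}]
genotype_data = [marker["genotypes"] for marker in markers]

pheno_vector = np.array([float(val) for val in vals if val != "x"])
genotypes = np.array(trim_genotypes(genotype_data,
                                    identify_empty_samples(vals))).T
print(pheno_vector.shape, genotypes.shape)             # (3,) and (3, 2)

Trimming by column index keeps the genotype matrix aligned with the shortened phenotype vector, which is why the same "x" test drives both steps.
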
diff --git a/wqflask/wqflask/my_pylmm/data/genofile_parser.py b/wqflask/wqflask/my_pylmm/data/genofile_parser.py
index 1dafecc8..ec8c521c 100644
--- a/wqflask/wqflask/my_pylmm/data/genofile_parser.py
+++ b/wqflask/wqflask/my_pylmm/data/genofile_parser.py
@@ -1,13 +1,31 @@
#!/usr/bin/python
from __future__ import print_function, division, absolute_import
-import csv
+import sys
+sys.path.append("..")
import os
import glob
import traceback
+import numpy as np
+from pyLMM import lmm
+
+import simplejson as json
+
+from pprint import pformat as pf
+
class EmptyConfigurations(Exception): pass
+
+
+class Marker(object):
+ def __init__(self):
+ self.name = None
+ self.chr = None
+ self.cM = None
+ self.Mb = None
+ self.genotypes = []
+
class ConvertGenoFile(object):
def __init__(self, input_file, output_file):
@@ -15,6 +33,9 @@ class ConvertGenoFile(object):
self.input_file = input_file
self.output_file = output_file
+ self.mb_exists = False
+ self.markers = []
+
self.latest_row_pos = None
self.latest_col_pos = None
@@ -23,7 +44,7 @@ class ConvertGenoFile(object):
def convert(self):
- self.prefer_config = {
+ self.haplotype_notation = {
'@mat': "1",
'@pat': "0",
'@het': "0.5",
@@ -31,36 +52,56 @@ class ConvertGenoFile(object):
}
self.configurations = {}
- self.skipped_cols = 3
+ #self.skipped_cols = 3
self.input_fh = open(self.input_file)
-
with open(self.output_file, "w") as self.output_fh:
self.process_csv()
-
-
-
+
+
#def process_row(self, row):
# counter = 0
# for char in row:
# if char
# counter += 1
-
+
def process_csv(self):
for row_count, row in enumerate(self.process_rows()):
#self.latest_row_pos = row_count
- for item_count, item in enumerate(row.split()[self.skipped_cols:]):
+ row_items = row.split()
+
+ this_marker = Marker()
+ this_marker.name = row_items[1]
+ this_marker.chr = row_items[0]
+ this_marker.cM = row_items[2]
+ if self.mb_exists:
+ this_marker.Mb = row_items[3]
+ genotypes = row_items[4:]
+ else:
+ genotypes = row_items[3:]
+ for item_count, genotype in enumerate(genotypes):
+ this_marker.genotypes.append(self.configurations[genotype.upper()])
+
+ #print("this_marker is:", pf(this_marker.__dict__))
+
+ self.markers.append(this_marker.__dict__)
+
+ with open(self.output_file, 'w') as fh:
+ json.dump(self.markers, fh, indent=" ", sort_keys=True)
+
# print('configurations:', str(configurations))
- self.latest_col_pos = item_count + self.skipped_cols
- self.latest_col_value = item
- if item_count != 0:
- self.output_fh.write(" ")
- self.output_fh.write(self.configurations[item.upper()])
+ #self.latest_col_pos = item_count + self.skipped_cols
+ #self.latest_col_value = item
+
+ #if item_count != 0:
+ # self.output_fh.write(" ")
+ #self.output_fh.write(self.configurations[item.upper()])
- self.output_fh.write("\n")
-
+ #self.output_fh.write("\n")
+
+
def process_rows(self):
for self.latest_row_pos, row in enumerate(self.input_fh):
self.latest_row_value = row
@@ -69,14 +110,14 @@ class ConvertGenoFile(object):
continue
if row.startswith('Chr'):
if 'Mb' in row.split():
- self.skipped_cols = 4
+ self.mb_exists = True
continue
if row.startswith('@'):
key, _separater, value = row.partition(':')
key = key.strip()
value = value.strip()
- if key in self.prefer_config:
- self.configurations[value] = self.prefer_config[key]
+ if key in self.haplotype_notation:
+ self.configurations[value] = self.haplotype_notation[key]
continue
if not len(self.configurations):
raise EmptyConfigurations
@@ -87,17 +128,17 @@ class ConvertGenoFile(object):
os.chdir(old_directory)
for input_file in glob.glob("*.geno"):
group_name = input_file.split('.')[0]
- output_file = os.path.join(new_directory, group_name + ".snps")
+ output_file = os.path.join(new_directory, group_name + ".json")
print("%s -> %s" % (input_file, output_file))
convertob = ConvertGenoFile(input_file, output_file)
try:
- convertob.convert()
+ convertob.convert()
except EmptyConfigurations as why:
print(" No config info? Continuing...")
#excepted = True
continue
except Exception as why:
-
+
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %i at tabular column %i" % (convertob.latest_row_pos,
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 1ae663d4..015c2e14 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -1,109 +1,128 @@
-# pyLMM software Copyright 2012, Nicholas A. Furlotte
-# Version 0.1
-
-#License Details
-#---------------
-
-# The program is free for academic use. Please contact Nick Furlotte
-# <nick.furlotte@gmail.com> if you are interested in using the software for
-# commercial purposes.
-
-# The software must not be modified and distributed without prior
-# permission of the author.
-# Any instance of this software must retain the above copyright notice.
-
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# pylmm is a python-based linear mixed-model solver with applications to GWAS
+
+# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import absolute_import, print_function, division
import sys
import time
import numpy as np
-import numpy.linalg as linalg
+from scipy import linalg
from scipy import optimize
from scipy import stats
-#import matplotlib.pyplot as pl
import pdb
from pprint import pformat as pf
+#np.seterr('raise')
+
+def matrixMult(A,B):
+ #return np.dot(A,B)
+
+ print("A is:", pf(A.shape))
+ print("B is:", pf(B.shape))
+
+ # If the matrices are in Fortran order then the computations will be faster
+ # when using dgemm. Otherwise, the function will copy the matrix and that takes time.
+ if not A.flags['F_CONTIGUOUS']:
+ AA = A.T
+ transA = True
+ else:
+ AA = A
+ transA = False
+
+ if not B.flags['F_CONTIGUOUS']:
+ BB = B.T
+ transB = True
+ else:
+ BB = B
+ transB = False
+
+ return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
+
def calculateKinship(W):
- """
- W is an n x m matrix encoding SNP minor alleles.
- """
- n = W.shape[0]
- m = W.shape[1]
- keep = []
- for i in range(m):
- mn = W[True - np.isnan(W[:,i]),i].mean()
- W[np.isnan(W[:,i]),i] = mn
- vr = W[:,i].var()
- if vr == 0: continue
-
- keep.append(i)
- W[:,i] = (W[:,i] - mn) / np.sqrt(vr)
-
- W = W[:,keep]
- K = np.dot(W,W.T) * 1.0/float(m)
- return K
+ """
+ W is an n x m matrix encoding SNP minor alleles.
+
+ This function takes a matrix of SNPs, imputes missing values with the MAF,
+ normalizes the resulting vectors and returns the RRM matrix.
+ """
+ n = W.shape[0]
+ m = W.shape[1]
+ keep = []
+ for i in range(m):
+ mn = W[True - np.isnan(W[:,i]),i].mean()
+ W[np.isnan(W[:,i]),i] = mn
+ vr = W[:,i].var()
+ if vr == 0: continue
+
+ keep.append(i)
+ W[:,i] = (W[:,i] - mn) / np.sqrt(vr)
+
+ W = W[:,keep]
+ K = matrixMult(W,W.T) * 1.0/float(m)
+ return K
def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
- """
- Performs a basic GWAS scan using the LMM. This function
- uses the LMM module to assess association at each SNP and
- does some simple cleanup, such as removing missing individuals
- per SNP and re-computing the eigen-decomp
-
- Y - n x 1 phenotype vector
- X - n x m SNP matrix
- K - n x n kinship matrix
- Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K
- X0 - n x q covariate matrix
- REML - use restricted maximum likelihood
- refit - refit the variance component for each SNP
- """
- n = X.shape[0]
- m = X.shape[1]
-
- if X0 == None: X0 = np.ones((n,1))
-
- # Remove missing values in Y and adjust associated parameters
- v = np.isnan(Y)
- if v.sum():
- keep = True - v
- Y = Y[keep]
- X = X[keep,:]
- X0 = X0[keep,:]
- K = K[keep,:][:,keep]
- Kva = []
- Kve = []
-
- L = LMM(Y,K,Kva,Kve,X0)
- if not refit: L.fit()
-
- PS = []
- TS = []
-
- for i in range(m):
- x = X[:,i].reshape((n,1))
- v = np.isnan(x).reshape((-1,))
- if v.sum():
+ """
+ Performs a basic GWAS scan using the LMM. This function
+ uses the LMM module to assess association at each SNP and
+ does some simple cleanup, such as removing missing individuals
+ per SNP and re-computing the eigen-decomp
+
+ Y - n x 1 phenotype vector
+ X - n x m SNP matrix
+ K - n x n kinship matrix
+ Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K
+ X0 - n x q covariate matrix
+ REML - use restricted maximum likelihood
+ refit - refit the variance component for each SNP
+ """
+ n = X.shape[0]
+ m = X.shape[1]
+
+ if X0 == None: X0 = np.ones((n,1))
+
+ # Remove missing values in Y and adjust associated parameters
+ v = np.isnan(Y)
+ if v.sum():
+ keep = True - v
+ Y = Y[keep]
+ X = X[keep,:]
+ X0 = X0[keep,:]
+ K = K[keep,:][:,keep]
+ Kva = []
+ Kve = []
+
+ L = LMM(Y,K,Kva,Kve,X0)
+ if not refit: L.fit()
+
+ PS = []
+ TS = []
+
+ for i in range(m):
+ x = X[:,i].reshape((n,1))
+ v = np.isnan(x).reshape((-1,))
+ if v.sum():
keep = True - v
xs = x[keep,:]
- if xs.var() == 0:
- PS.append(np.nan)
- TS.append(np.nan)
- continue
+ if xs.var() == 0:
+ PS.append(np.nan)
+ TS.append(np.nan)
+ continue
Ys = Y[keep]
X0s = X0[keep,:]
@@ -112,245 +131,251 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
if refit: Ls.fit(X=xs)
else: Ls.fit()
ts,ps = Ls.association(xs,REML=REML)
- else:
- if x.var() == 0:
- PS.append(np.nan)
- TS.append(np.nan)
- continue
+ else:
+ if x.var() == 0:
+ PS.append(np.nan)
+ TS.append(np.nan)
+ continue
if refit: L.fit(X=x)
ts,ps = L.association(x,REML=REML)
+
+ PS.append(ps)
+ TS.append(ts)
- PS.append(ps)
- TS.append(ts)
-
- return TS,PS
+ return TS,PS
class LMM:
- """
- This is a simple version of EMMA/fastLMM.
- The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
- and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
- There are three model parameters: heritability (h), covariate coefficients (beta) and the total
- phenotypic variance (sigma).
- Heritability as defined here is the proportion of the total variance (sigma) that is attributed to
- the kinship matrix.
-
- For simplicity, we assume that everything being input is a numpy array.
- If this is not the case, the module may throw an error as conversion from list to numpy array
- is not done consistently.
-
- """
- def __init__(self, Y, K, Kva=None, Kve=None, X0=None):
- """
- The constructor takes a phenotype vector or array of size n.
- It takes a kinship matrix of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
- If they are not provided, the constructor will calculate them.
- X0 is an optional covariate matrix of size n x q, where there are q covariates.
- When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
-
- """
-
- if Kva is None:
- Kva = []
- if Kve is None:
- Kve = []
-
-
- if X0 == None:
- X0 = np.ones(len(Y)).reshape(len(Y),1)
-
- print("Y is:", pf(Y))
-
- for key, value in locals().iteritems():
- print(" %s - %s" % (key, type(value)))
-
- x = Y != -9
- print("x is:", pf(x))
- if not x.sum() == len(Y):
- print("x.sum is:", pf(x.sum()))
- print("len(Y) is:", pf(len(Y)))
- sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum()))
- Y = Y[x]
- K = K[x,:][:,x]
- X0 = X0[x,:]
- Kva = []
- Kve = []
- self.nonmissing = x
-
- if len(Kva) == 0 or len(Kve) == 0:
- sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
- begin = time.time()
- Kva,Kve = linalg.eigh(K)
- end = time.time()
- sys.stderr.write("Total time: %0.3f\n" % (end - begin))
- self.K = K
- self.Kva = Kva
- self.Kve = Kve
- self.Y = Y
- self.X0 = X0
- self.N = self.K.shape[0]
-
- self.transform()
-
- def transform(self):
+ """
+ This is a simple version of EMMA/fastLMM.
+ The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
+ and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
+ There are three model parameters: heritability (h), covariate coefficients (beta) and the total
+ phenotypic variance (sigma).
+ Heritability as defined here is the proportion of the total variance (sigma) that is attributed to
+ the kinship matrix.
+
+ For simplicity, we assume that everything being input is a numpy array.
+ If this is not the case, the module may throw an error as conversion from list to numpy array
+ is not done consistently.
+
+ """
+ def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False):
+
+ """
+ The constructor takes a phenotype vector or array of size n.
+ It takes a kinship matrix of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
+ If they are not provided, the constructor will calculate them.
+ X0 is an optional covariate matrix of size n x q, where there are q covariates.
+ When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
+ """
+
+ if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1)
+ self.verbose = verbose
+
+ #x = Y != -9
+ x = True - np.isnan(Y)
+ if not x.sum() == len(Y):
+ if self.verbose: sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum()))
+ Y = Y[x]
+ K = K[x,:][:,x]
+ X0 = X0[x,:]
+ Kva = []
+ Kve = []
+ self.nonmissing = x
+
+ if len(Kva) == 0 or len(Kve) == 0:
+ if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+ begin = time.time()
+ Kva,Kve = linalg.eigh(K)
+ end = time.time()
+ if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+
+ self.K = K
+ self.Kva = Kva
+ self.Kve = Kve
+ self.Y = Y
+ self.X0 = X0
+ self.N = self.K.shape[0]
+
+ if sum(self.Kva < 1e-6):
+ if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0)))
+ self.Kva[self.Kva < 1e-6] = 1e-6
+
+ self.transform()
+
+ def transform(self):
"""
Computes a transformation on the phenotype vector and the covariate matrix.
- The transformation is obtained by left multiplying each parameter by the transpose of the
+ The transformation is obtained by left multiplying each parameter by the transpose of the
eigenvector matrix of K (the kinship).
"""
-
- print(len(self.Kve.T))
- print(len(self.Y))
-
- self.Yt = np.dot(self.Kve.T, self.Y)
- self.X0t = np.dot(self.Kve.T, self.X0)
-
- def getMLSoln(self,h,X):
-
- """
- Obtains the maximum-likelihood estimates for the covariate coefficients (beta),
- the total variance of the trait (sigma) and also passes intermediates that can
- be utilized in other functions. The input parameter h is a value between 0 and 1 and represents
- the heritability or the proportion of the total variance attributed to genetics. The X is the
- covariate matrix.
- """
-
- #print("h is", pf(h))
- #print("X is", pf(X))
- print("X.shape is", pf(X.shape))
-
- S = 1.0/(h*self.Kva + (1.0 - h))
- Xt = X.T*S
- XX = np.dot(Xt,X)
-
-
- XX_i = linalg.inv(XX)
- beta = np.dot(np.dot(XX_i,Xt),self.Yt)
- Yt = self.Yt - np.dot(X,beta)
- Q = np.dot(Yt.T*S,Yt)
- sigma = Q * 1.0 / (float(len(self.Yt)) - float(X.shape[1]))
- return beta,sigma,Q,XX_i,XX
-
- def LL_brent(self,h,X=None,REML=False): return -self.LL(h,X,stack=False,REML=REML)[0]
- def LL(self,h,X=None,stack=True,REML=False):
-
- """
- Computes the log-likelihood for a given heritability (h). If X==None, then the
- default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with
- the input X. If stack is false, then X is used in place of X0t in the LL calculation.
- REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True.
- """
-
- if X == None: X = self.X0t
- elif stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)])
-
- n = float(self.N)
- q = float(X.shape[1])
- beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X)
- LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q)
- LL = -0.5 * LL
-
- if REML:
- LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(linalg.det(np.dot(X.T,X))) - np.log(linalg.det(XX))
- LL = LL + 0.5*LL_REML_part
-
- return LL,beta,sigma,XX_i
-
- def getMax(self,H, X=None,REML=False):
-
- """
- Helper functions for .fit(...).
- This function takes a set of LLs computed over a grid and finds possible regions
- containing a maximum. Within these regions, a Brent search is performed to find the
- optimum.
-
- """
- n = len(self.LLs)
- HOpt = []
- for i in range(1,n-2):
- if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
-
- if len(HOpt) > 1:
- sys.stderr.write("ERR: Found multiple maximum. Returning first...\n")
- return HOpt[0]
- elif len(HOpt) == 1: return HOpt[0]
- elif self.LLs[0] > self.LLs[n-1]: return H[0]
- else: return H[n-1]
-
- def fit(self,X=None,ngrids=100,REML=True):
-
- """
- Finds the maximum-likelihood solution for the heritability (h) given the current parameters.
- X can be passed and will transformed and concatenated to X0t. Otherwise, X0t is used as
- the covariate matrix.
-
- This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum.
- Given this optimum, the function computes the LL and associated ML solutions.
- """
-
- if X == None: X = self.X0t
- else: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)])
- H = np.array(range(ngrids)) / float(ngrids)
- L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H])
- self.LLs = L
-
- hmax = self.getMax(H,X,REML)
- L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML)
-
- self.H = H
- self.optH = hmax
- self.optLL = L
- self.optBeta = beta
- self.optSigma = sigma
-
- return hmax,beta,sigma,L
-
-
- def association(self,X, h = None, stack=True,REML=True):
-
- """
- Calculates association statitics for the SNPs encoded in the vector X of size n.
- If h == None, the optimal h stored in optH is used.
-
- """
- if stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)])
- if h == None: h = self.optH
-
- L,beta,sigma,betaSTDERR = self.LL(h,X,stack=False,REML=REML)
- q = len(beta)
- ts,ps = self.tstat(beta[q-1],betaSTDERR[q-1,q-1],sigma,q)
- return ts,ps
-
- def tstat(self,beta,stderr,sigma,q):
-
- """
- Calculates a t-statistic and associated p-value given the estimate of beta and its standard error.
- This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test.
- """
-
- ts = beta / np.sqrt(stderr * sigma)
- ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q))
- return ts,ps
-
- def plotFit(self,color='b-',title=''):
-
- """
- Simple function to visualize the likelihood space. It takes the LLs
- calcualted over a grid and normalizes them by subtracting off the mean and exponentiating.
- The resulting "probabilities" are normalized to one and plotted against heritability.
- This can be seen as an approximation to the posterior distribuiton of heritability.
-
- For diagnostic purposes this lets you see if there is one distinct maximum or multiple
- and what the variance of the parameter looks like.
- """
- mx = self.LLs.max()
- p = np.exp(self.LLs - mx)
- p = p/p.sum()
-
- pl.plot(self.H,p,color)
- pl.xlabel("Heritability")
- pl.ylabel("Probability of data")
- pl.title(title)
+
+ self.Yt = matrixMult(self.Kve.T, self.Y)
+ self.X0t = matrixMult(self.Kve.T, self.X0)
+ self.X0t_stack = np.hstack([self.X0t, np.ones((self.N,1))])
+ self.q = self.X0t.shape[1]
+
+ def getMLSoln(self,h,X):
+
+ """
+ Obtains the maximum-likelihood estimates for the covariate coefficients (beta),
+ the total variance of the trait (sigma) and also passes intermediates that can
+ be utilized in other functions. The input parameter h is a value between 0 and 1 and represents
+ the heritability or the proportion of the total variance attributed to genetics. The X is the
+ covariate matrix.
+ """
+
+ S = 1.0/(h*self.Kva + (1.0 - h))
+ Xt = X.T*S
+ XX = matrixMult(Xt,X)
+ XX_i = linalg.inv(XX)
+ beta = matrixMult(matrixMult(XX_i,Xt),self.Yt)
+ Yt = self.Yt - matrixMult(X,beta)
+ Q = np.dot(Yt.T*S,Yt)
+ sigma = Q * 1.0 / (float(self.N) - float(X.shape[1]))
+ return beta,sigma,Q,XX_i,XX
+
+ def LL_brent(self,h,X=None,REML=False):
+ #brent will not be bounded by the specified bracket.
+ # I return a large number if we encounter h < 0 to avoid errors in LL computation during the search.
+ if h < 0: return 1e6
+ return -self.LL(h,X,stack=False,REML=REML)[0]
+
+ def LL(self,h,X=None,stack=True,REML=False):
+
+ """
+ Computes the log-likelihood for a given heritability (h). If X==None, then the
+ default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with
+ the input X. If stack is false, then X is used in place of X0t in the LL calculation.
+ REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True.
+ """
+
+ if X == None: X = self.X0t
+ elif stack:
+ self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+ X = self.X0t_stack
+
+ n = float(self.N)
+ q = float(X.shape[1])
+ beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X)
+ LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q)
+ LL = -0.5 * LL
+
+ if REML:
+ LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(linalg.det(matrixMult(X.T,X))) - np.log(linalg.det(XX))
+ LL = LL + 0.5*LL_REML_part
+
+ return LL,beta,sigma,XX_i
+
+ def getMax(self,H, X=None,REML=False):
+
+ """
+ Helper functions for .fit(...).
+ This function takes a set of LLs computed over a grid and finds possible regions
+ containing a maximum. Within these regions, a Brent search is performed to find the
+ optimum.
+
+ """
+ n = len(self.LLs)
+ HOpt = []
+ for i in range(1,n-2):
+ if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]:
+ HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
+ if np.isnan(HOpt[-1][0]): HOpt[-1][0] = [self.LLs[i-1]]
+
+ if len(HOpt) > 1:
+ if self.verbose: sys.stderr.write("NOTE: Found multiple optima. Returning first...\n")
+ return HOpt[0]
+ elif len(HOpt) == 1: return HOpt[0]
+ elif self.LLs[0] > self.LLs[n-1]: return H[0]
+ else: return H[n-1]
+
+ def fit(self,X=None,ngrids=100,REML=True):
+
+ """
+ Finds the maximum-likelihood solution for the heritability (h) given the current parameters.
+ X can be passed and will be transformed and concatenated to X0t. Otherwise, X0t is used as
+ the covariate matrix.
+
+ This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum.
+ Given this optimum, the function computes the LL and associated ML solutions.
+ """
+
+ if X == None: X = self.X0t
+ else:
+ #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
+ self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+ X = self.X0t_stack
+
+ H = np.array(range(ngrids)) / float(ngrids)
+ L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H])
+ self.LLs = L
+
+ hmax = self.getMax(H,X,REML)
+ L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML)
+
+ self.H = H
+ self.optH = hmax
+ self.optLL = L
+ self.optBeta = beta
+ self.optSigma = sigma
+
+ return hmax,beta,sigma,L
+
+ def association(self,X, h = None, stack=True,REML=True, returnBeta=False):
+
+ """
+ Calculates association statistics for the SNPs encoded in the vector X of size n.
+ If h == None, the optimal h stored in optH is used.
+
+ """
+ if stack:
+ #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
+ self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+ X = self.X0t_stack
+
+ if h == None: h = self.optH
+
+ L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML)
+ q = len(beta)
+ ts,ps = self.tstat(beta[q-1],betaVAR[q-1,q-1],sigma,q)
+
+ if returnBeta: return ts,ps,beta[q-1].sum(),betaVAR[q-1,q-1].sum()*sigma
+ return ts,ps
+
+ def tstat(self,beta,var,sigma,q):
+
+ """
+ Calculates a t-statistic and associated p-value given the estimate of beta and its standard error.
+ This is actually an F-test, but when only one hypothesis is being tested, it reduces to a t-test.
+ """
+
+ ts = beta / np.sqrt(var * sigma)
+ ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q))
+ if not len(ts) == 1 or not len(ps) == 1: raise Exception("Something bad happened :(")
+ return ts.sum(),ps.sum()
+
+ def plotFit(self,color='b-',title=''):
+
+ """
+ Simple function to visualize the likelihood space. It takes the LLs
+ calculated over a grid and normalizes them by subtracting off the mean and exponentiating.
+ The resulting "probabilities" are normalized to one and plotted against heritability.
+ This can be seen as an approximation to the posterior distribution of heritability.
+
+ For diagnostic purposes this lets you see if there is one distinct maximum or multiple
+ and what the variance of the parameter looks like.
+ """
+ import matplotlib.pyplot as pl
+
+ mx = self.LLs.max()
+ p = np.exp(self.LLs - mx)
+ p = p/p.sum()
+
+ pl.plot(self.H,p,color)
+ pl.xlabel("Heritability")
+ pl.ylabel("Probability of data")
+ pl.title(title)
\ No newline at end of file
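
Taken together, the lmm.py changes relicense the module under the AGPL, route dense products through BLAS dgemm (matrixMult), clamp near-zero kinship eigenvalues, reuse a pre-allocated X0t_stack for the per-SNP covariate column, and put the diagnostics behind a verbose flag. Below is a rough end-to-end smoke test of the path that gen_data() now exercises: synthetic genotypes, calculateKinship(), LMM.fit(), then GWAS(). It assumes lmm.py is importable as a module named lmm and the 2013-era NumPy/SciPy this code targets; on current versions the "True - mask" boolean negation used above would need to become ~mask, and fblas.dgemm now lives at scipy.linalg.blas.dgemm.

import numpy as np
import lmm  # wqflask/wqflask/my_pylmm/pyLMM/lmm.py on the Python path

np.random.seed(0)
n_samples, n_snps = 50, 200
genotypes = np.random.choice([0.0, 0.5, 1.0], size=(n_samples, n_snps))
# give SNP 0 a real effect so the scan has something to find
pheno_vector = genotypes[:, 0] + np.random.normal(size=n_samples)

# calculateKinship() imputes and rescales its argument in place,
# so pass a copy to keep the raw genotypes for the GWAS scan
kinship_matrix = lmm.calculateKinship(genotypes.copy())

lmm_ob = lmm.LMM(pheno_vector, kinship_matrix)
hmax, beta, sigma, ll = lmm_ob.fit()
print("estimated heritability:", hmax)

t_stats, p_values = lmm.GWAS(pheno_vector, genotypes, kinship_matrix,
                             REML=True, refit=False)
# with a causal SNP 0, its p-value should usually be near the minimum
print("smallest p-value at SNP", int(np.argmin(p_values)))

GWAS() expects genotypes as an n x m (samples x SNPs) matrix, which is why gen_data() above transposes the per-marker genotype lists before calling it.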