diff options
author | Zachary Sloan | 2013-02-06 18:32:30 -0600 |
---|---|---|
committer | Zachary Sloan | 2013-02-06 18:32:30 -0600 |
commit | d75fc63891f617fbe8b2b030fdce80b1628c6a41 (patch) | |
tree | a99fab2d8a4d08aec26622fc92e4ac2f29c67f30 /wqflask | |
parent | 471480255a28ae8743f4b736057a8d54585b1eca (diff) | |
download | genenetwork2-d75fc63891f617fbe8b2b030fdce80b1628c6a41.tar.gz |
Edited genofile_parser.py to go through all genofiles and convert
them to the format used by nick's code
Diffstat (limited to 'wqflask')
-rw-r--r-- | wqflask/wqflask/my_pylmm/data/genofile_parser.py | 118 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/data/prep_data.py | 64 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/example.py | 58 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/__init__.py | 0 | ||||
-rw-r--r-- | wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 341 |
5 files changed, 581 insertions, 0 deletions
diff --git a/wqflask/wqflask/my_pylmm/data/genofile_parser.py b/wqflask/wqflask/my_pylmm/data/genofile_parser.py new file mode 100644 index 00000000..1dafecc8 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/data/genofile_parser.py @@ -0,0 +1,118 @@ +#!/usr/bin/python + +from __future__ import print_function, division, absolute_import +import csv +import os +import glob +import traceback + +class EmptyConfigurations(Exception): pass + +class ConvertGenoFile(object): + + def __init__(self, input_file, output_file): + + self.input_file = input_file + self.output_file = output_file + + self.latest_row_pos = None + self.latest_col_pos = None + + self.latest_row_value = None + self.latest_col_value = None + + def convert(self): + + self.prefer_config = { + '@mat': "1", + '@pat': "0", + '@het': "0.5", + '@unk': "NA" + } + + self.configurations = {} + self.skipped_cols = 3 + + self.input_fh = open(self.input_file) + + + with open(self.output_file, "w") as self.output_fh: + self.process_csv() + + + + #def process_row(self, row): + # counter = 0 + # for char in row: + # if char + # counter += 1 + + def process_csv(self): + for row_count, row in enumerate(self.process_rows()): + #self.latest_row_pos = row_count + + for item_count, item in enumerate(row.split()[self.skipped_cols:]): + # print('configurations:', str(configurations)) + self.latest_col_pos = item_count + self.skipped_cols + self.latest_col_value = item + if item_count != 0: + self.output_fh.write(" ") + self.output_fh.write(self.configurations[item.upper()]) + + self.output_fh.write("\n") + + def process_rows(self): + for self.latest_row_pos, row in enumerate(self.input_fh): + self.latest_row_value = row + # Take care of headers + if row.startswith('#'): + continue + if row.startswith('Chr'): + if 'Mb' in row.split(): + self.skipped_cols = 4 + continue + if row.startswith('@'): + key, _separater, value = row.partition(':') + key = key.strip() + value = value.strip() + if key in self.prefer_config: + self.configurations[value] = self.prefer_config[key] + continue + if not len(self.configurations): + raise EmptyConfigurations + yield row + + @classmethod + def process_all(cls, old_directory, new_directory): + os.chdir(old_directory) + for input_file in glob.glob("*.geno"): + group_name = input_file.split('.')[0] + output_file = os.path.join(new_directory, group_name + ".snps") + print("%s -> %s" % (input_file, output_file)) + convertob = ConvertGenoFile(input_file, output_file) + try: + convertob.convert() + except EmptyConfigurations as why: + print(" No config info? Continuing...") + #excepted = True + continue + except Exception as why: + + print(" Exception:", why) + print(traceback.print_exc()) + print(" Found in row %i at tabular column %i" % (convertob.latest_row_pos, + convertob.latest_col_pos)) + print(" Column is:", convertob.latest_col_value) + print(" Row is:", convertob.latest_row_value) + break + + +if __name__=="__main__": + Old_Geno_Directory = """/home/zas1024/gene/web/genotypes/""" + New_Geno_Directory = """/home/zas1024/gene/web/new_genotypes/""" + #Input_File = """/home/zas1024/gene/web/genotypes/BXD.geno""" + #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" + ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) + #ConvertGenoFiles(Geno_Directory) + + #process_csv(Input_File, Output_File)
\ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/data/prep_data.py b/wqflask/wqflask/my_pylmm/data/prep_data.py new file mode 100644 index 00000000..b7a133c2 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/data/prep_data.py @@ -0,0 +1,64 @@ +#!/usr/bin/python + +from __future__ import absolute_import, print_function, division +import numpy + + +class PrepData(object): + def __init__(self, exprs_file, snps_file): + self.exprs_file = exprs_file + self.snps_file = snps_file + self.empty_columns = set() + #self.identify_no_genotype_samples() + self.identify_empty_samples() + self.trim_files() + + def identify_empty_samples(self): + with open(self.exprs_file) as fh: + for line in fh: + for pos, item in enumerate(line.split()): + if item == "NA": + self.empty_columns.add(pos) + #print("self.empty_columns:", self.empty_columns) + nums = set(range(0, 176)) + print("not included:", nums-self.empty_columns) + + #def identify_no_genotype_samples(self): + # #for this_file in (self.exprs_file, self.snps_file): + # #with open(this_file) as fh: + # no_geno_samples = [] + # has_genotypes = False + # with open(self.snps_file) as fh: + # for line in fh: + # num_samples = len(line.split()) + # break + # for sample in range (num_samples): + # for line in fh: + # if line.split()[sample] != "NA": + # has_genotypes = True + # break + # if has_genotypes == False: + # no_geno_samples.append(sample) + # + # print(no_geno_samples) + + def trim_files(self): + for this_file in (self.exprs_file, self.snps_file): + input_file = open(this_file) + this_file_name_output = this_file + ".new" + with open(this_file_name_output, "w") as output: + for line in input_file: + data_wanted = [] + for pos, item in enumerate(line.split()): + if pos in self.empty_columns: + continue + else: + data_wanted.append("%2s" % (item)) + #print("data_wanted is", data_wanted) + output.write(" ".join(data_wanted) + "\n") + print("Done writing file:", this_file_name_output) + +if __name__=="__main__": + exprs_file = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.exprs.1""" + snps_file = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.snps.1000""" + PrepData(exprs_file, snps_file)
\ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/example.py b/wqflask/wqflask/my_pylmm/example.py new file mode 100644 index 00000000..0348d67b --- /dev/null +++ b/wqflask/wqflask/my_pylmm/example.py @@ -0,0 +1,58 @@ +#!/usr/bin/python + +from __future__ import absolute_import, print_function, division + +import sys +import time + +import numpy as np +from pyLMM import lmm + +from pprint import pformat as pf + + +Y = np.genfromtxt('data/mdp.exprs.1.new') +print("exprs is:", pf(Y.shape)) + +# Loading npdump and first 1000 snps for speed +#K = np.load('data/hmdp.liver.K.npdump') +#snps = np.load('data/hmdp.liver.snps.1000.npdump').T + +# These three lines will load all SNPs (from npdump or from txt) and +# calculate the kinship +snps = np.genfromtxt('data/mdp.snps.1000.new').T +print("snps is:", pf(snps.shape)) +#snps = snps[~np.isnan(snps).all(axis=1)] +#print ("snps is now:", pf(snps)) +np.savetxt("/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.snps.trimmed", snps, fmt='%s', delimiter=' ') +#snps = np.load('data/hmdp.liver.snps.npdump').T +K = lmm.calculateKinship(snps) +#print("K is:", pf(K)) +#print("Y is:", pf(Y.shape)) + +# Instantiate a LMM object for the phentoype Y and fit the null model +L = lmm.LMM(Y,K) +L.fit() + +# Manually calculate the association at one SNP +X = snps[:,0] +X[np.isnan(X)] = X[True - np.isnan(X)].mean() # Fill missing with MAF +X = X.reshape(len(X),1) +if X.var() == 0: ts,ps = (np.nan,np.nan) +else: ts,ps = L.association(X) + +# If I want to refit the variance component +L.fit(X=X) +ts,ps = L.association(X) + +# If I want to do a genome-wide scan over the 1000 SNPs. +# This call will use REML (REML = False means use ML). +# It will also refit the variance components for each SNP. +# Setting refit = False will cause the program to fit the model once +# and hold those variance component estimates for each SNP. +begin = time.time() +TS,PS = lmm.GWAS(Y,snps,K,REML=True,refit=False) +print("TS is:", pf(TS)) +print("PS is:", pf(PS)) +end = time.time() +sys.stderr.write("Total time for 1000 SNPs: %0.3f\n" % (end- begin))
\ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py new file mode 100644 index 00000000..7fe599c4 --- /dev/null +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -0,0 +1,341 @@ +# pyLMM software Copyright 2012, Nicholas A. Furlotte +# Version 0.1 + +#License Details +#--------------- + +# The program is free for academic use. Please contact Nick Furlotte +# <nick.furlotte@gmail.com> if you are interested in using the software for +# commercial purposes. + +# The software must not be modified and distributed without prior +# permission of the author. +# Any instance of this software must retain the above copyright notice. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import absolute_import, print_function, division + +import sys +import time +import numpy as np +import numpy.linalg as linalg +from scipy import optimize +from scipy import stats +#import matplotlib.pyplot as pl +import pdb + +from pprint import pformat as pf + +def calculateKinship(W): + """ + W is an n x m matrix encoding SNP minor alleles. + """ + n = W.shape[0] + m = W.shape[1] + keep = [] + for i in range(m): + mn = W[True - np.isnan(W[:,i]),i].mean() + W[np.isnan(W[:,i]),i] = mn + vr = W[:,i].var() + if vr == 0: continue + + keep.append(i) + W[:,i] = (W[:,i] - mn) / np.sqrt(vr) + + W = W[:,keep] + K = np.dot(W,W.T) * 1.0/float(m) + return K + +def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False): + """ + Performs a basic GWAS scan using the LMM. This function + uses the LMM module to assess association at each SNP and + does some simple cleanup, such as removing missing individuals + per SNP and re-computing the eigen-decomp + + Y - n x 1 phenotype vector + X - n x m SNP matrix + K - n x n kinship matrix + Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K + X0 - n x q covariate matrix + REML - use restricted maximum likelihood + refit - refit the variance component for each SNP + """ + n = X.shape[0] + m = X.shape[1] + + if X0 == None: X0 = np.ones((n,1)) + + # Remove missing values in Y and adjust associated parameters + v = np.isnan(Y) + if v.sum(): + keep = True - v + Y = Y[keep] + X = X[keep,:] + X0 = X0[keep,:] + K = K[keep,:][:,keep] + Kva = [] + Kve = [] + + L = LMM(Y,K,Kva,Kve,X0) + if not refit: L.fit() + + PS = [] + TS = [] + + for i in range(m): + x = X[:,i].reshape((n,1)) + v = np.isnan(x).reshape((-1,)) + if v.sum(): + keep = True - v + xs = x[keep,:] + if xs.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + continue + + Ys = Y[keep] + X0s = X0[keep,:] + Ks = K[keep,:][:,keep] + Ls = LMM(Ys,Ks,X0=X0s) + if refit: Ls.fit(X=xs) + else: Ls.fit() + ts,ps = Ls.association(xs,REML=REML) + else: + if x.var() == 0: + PS.append(np.nan) + TS.append(np.nan) + continue + + if refit: L.fit(X=x) + ts,ps = L.association(x,REML=REML) + + PS.append(ps) + TS.append(ts) + + return TS,PS + +class LMM: + + """ + This is a simple version of EMMA/fastLMM. + The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K) + and to optimize this model by finding the maximum-likelihood estimates for the model parameters. + There are three model parameters: heritability (h), covariate coefficients (beta) and the total + phenotypic variance (sigma). + Heritability as defined here is the proportion of the total variance (sigma) that is attributed to + the kinship matrix. + + For simplicity, we assume that everything being input is a numpy array. + If this is not the case, the module may throw an error as conversion from list to numpy array + is not done consistently. + + """ + def __init__(self,Y,K,Kva=[],Kve=[],X0=None): + + """ + The constructor takes a phenotype vector or array of size n. + It takes a kinship matrix of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached. + If they are not provided, the constructor will calculate them. + X0 is an optional covariate matrix of size n x q, where there are q covariates. + When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect. + """ + + if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1) + + x = Y != -9 + if not x.sum() == len(Y): + sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum())) + Y = Y[x] + K = K[x,:][:,x] + X0 = X0[x,:] + Kva = [] + Kve = [] + self.nonmissing = x + + if len(Kva) == 0 or len(Kve) == 0: + sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) ) + begin = time.time() + Kva,Kve = linalg.eigh(K) + end = time.time() + sys.stderr.write("Total time: %0.3f\n" % (end - begin)) + self.K = K + self.Kva = Kva + self.Kve = Kve + self.Y = Y + self.X0 = X0 + self.N = self.K.shape[0] + + self.transform() + + def transform(self): + + """ + Computes a transformation on the phenotype vector and the covariate matrix. + The transformation is obtained by left multiplying each parameter by the transpose of the + eigenvector matrix of K (the kinship). + """ + + print(len(self.Kve.T)) + print(len(self.Y)) + + self.Yt = np.dot(self.Kve.T, self.Y) + self.X0t = np.dot(self.Kve.T, self.X0) + + def getMLSoln(self,h,X): + + """ + Obtains the maximum-likelihood estimates for the covariate coefficients (beta), + the total variance of the trait (sigma) and also passes intermediates that can + be utilized in other functions. The input parameter h is a value between 0 and 1 and represents + the heritability or the proportion of the total variance attributed to genetics. The X is the + covariate matrix. + """ + + #print("h is", pf(h)) + #print("X is", pf(X)) + print("X.shape is", pf(X.shape)) + + S = 1.0/(h*self.Kva + (1.0 - h)) + Xt = X.T*S + XX = np.dot(Xt,X) + + + XX_i = linalg.inv(XX) + beta = np.dot(np.dot(XX_i,Xt),self.Yt) + Yt = self.Yt - np.dot(X,beta) + Q = np.dot(Yt.T*S,Yt) + sigma = Q * 1.0 / (float(len(self.Yt)) - float(X.shape[1])) + return beta,sigma,Q,XX_i,XX + + def LL_brent(self,h,X=None,REML=False): return -self.LL(h,X,stack=False,REML=REML)[0] + def LL(self,h,X=None,stack=True,REML=False): + + """ + Computes the log-likelihood for a given heritability (h). If X==None, then the + default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with + the input X. If stack is false, then X is used in place of X0t in the LL calculation. + REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True. + """ + + if X == None: X = self.X0t + elif stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) + + n = float(self.N) + q = float(X.shape[1]) + beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X) + LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q) + LL = -0.5 * LL + + if REML: + LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(linalg.det(np.dot(X.T,X))) - np.log(linalg.det(XX)) + LL = LL + 0.5*LL_REML_part + + return LL,beta,sigma,XX_i + + def getMax(self,H, X=None,REML=False): + + """ + Helper functions for .fit(...). + This function takes a set of LLs computed over a grid and finds possible regions + containing a maximum. Within these regions, a Brent search is performed to find the + optimum. + + """ + n = len(self.LLs) + HOpt = [] + for i in range(1,n-2): + if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1]))) + + if len(HOpt) > 1: + sys.stderr.write("ERR: Found multiple maximum. Returning first...\n") + return HOpt[0] + elif len(HOpt) == 1: return HOpt[0] + elif self.LLs[0] > self.LLs[n-1]: return H[0] + else: return H[n-1] + + def fit(self,X=None,ngrids=100,REML=True): + + """ + Finds the maximum-likelihood solution for the heritability (h) given the current parameters. + X can be passed and will transformed and concatenated to X0t. Otherwise, X0t is used as + the covariate matrix. + + This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum. + Given this optimum, the function computes the LL and associated ML solutions. + """ + + if X == None: X = self.X0t + else: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) + H = np.array(range(ngrids)) / float(ngrids) + L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H]) + self.LLs = L + + hmax = self.getMax(H,X,REML) + L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML) + + self.H = H + self.optH = hmax + self.optLL = L + self.optBeta = beta + self.optSigma = sigma + + return hmax,beta,sigma,L + + + def association(self,X, h = None, stack=True,REML=True): + + """ + Calculates association statitics for the SNPs encoded in the vector X of size n. + If h == None, the optimal h stored in optH is used. + + """ + if stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)]) + if h == None: h = self.optH + + L,beta,sigma,betaSTDERR = self.LL(h,X,stack=False,REML=REML) + q = len(beta) + ts,ps = self.tstat(beta[q-1],betaSTDERR[q-1,q-1],sigma,q) + return ts,ps + + def tstat(self,beta,stderr,sigma,q): + + """ + Calculates a t-statistic and associated p-value given the estimate of beta and its standard error. + This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test. + """ + + ts = beta / np.sqrt(stderr * sigma) + ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q)) + return ts,ps + + def plotFit(self,color='b-',title=''): + + """ + Simple function to visualize the likelihood space. It takes the LLs + calcualted over a grid and normalizes them by subtracting off the mean and exponentiating. + The resulting "probabilities" are normalized to one and plotted against heritability. + This can be seen as an approximation to the posterior distribuiton of heritability. + + For diagnostic purposes this lets you see if there is one distinct maximum or multiple + and what the variance of the parameter looks like. + """ + mx = self.LLs.max() + p = np.exp(self.LLs - mx) + p = p/p.sum() + + pl.plot(self.H,p,color) + pl.xlabel("Heritability") + pl.ylabel("Probability of data") + pl.title(title) |