Edited genofile_parser.py to go through all genofiles and convert

them to the format used by nick's code
author: Zachary Sloan 2013-02-06 18:32:30 -0600
committer: Zachary Sloan 2013-02-06 18:32:30 -0600
commit: d75fc63891f617fbe8b2b030fdce80b1628c6a41 (patch)
tree: a99fab2d8a4d08aec26622fc92e4ac2f29c67f30
parent: 471480255a28ae8743f4b736057a8d54585b1eca (diff)
download: genenetwork2-d75fc63891f617fbe8b2b030fdce80b1628c6a41.tar.gz
5 files changed, 581 insertions, 0 deletions
diff --git a/wqflask/wqflask/my_pylmm/data/genofile_parser.py b/wqflask/wqflask/my_pylmm/data/genofile_parser.py
new file mode 100644
index 00000000..1dafecc8
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/data/genofile_parser.py
@@ -0,0 +1,118 @@
+#!/usr/bin/python
+
+from __future__ import print_function, division, absolute_import
+import csv
+import os
+import glob
+import traceback
+
+class EmptyConfigurations(Exception): pass
+
+class ConvertGenoFile(object):
+
+    def __init__(self, input_file, output_file):
+        
+        self.input_file = input_file
+        self.output_file = output_file
+        
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+        
+        self.latest_row_value = None
+        self.latest_col_value = None
+        
+    def convert(self):
+
+        self.prefer_config = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+            }
+        
+        self.configurations = {}
+        self.skipped_cols = 3
+        
+        self.input_fh = open(self.input_file)
+        
+        
+        with open(self.output_file, "w") as self.output_fh:
+            self.process_csv()
+     
+            
+        
+    #def process_row(self, row):
+    #    counter = 0
+    #    for char in row:
+    #        if char 
+    #        counter += 1
+     
+    def process_csv(self):
+        for row_count, row in enumerate(self.process_rows()):
+            #self.latest_row_pos = row_count
+
+            for item_count, item in enumerate(row.split()[self.skipped_cols:]):
+                # print('configurations:', str(configurations))
+                self.latest_col_pos = item_count + self.skipped_cols
+                self.latest_col_value = item
+                if item_count != 0:
+                    self.output_fh.write(" ")
+                self.output_fh.write(self.configurations[item.upper()])
+                    
+            self.output_fh.write("\n")
+            
+    def process_rows(self):
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            self.latest_row_value = row
+            # Take care of headers
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.skipped_cols = 4
+                continue
+            if row.startswith('@'):
+                key, _separater, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key in self.prefer_config:
+                    self.configurations[value] = self.prefer_config[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        os.chdir(old_directory)
+        for input_file in glob.glob("*.geno"):
+            group_name = input_file.split('.')[0]
+            output_file = os.path.join(new_directory, group_name + ".snps")
+            print("%s -> %s" % (input_file, output_file))
+            convertob = ConvertGenoFile(input_file, output_file)
+            try:
+                convertob.convert() 
+            except EmptyConfigurations as why:
+                print("  No config info? Continuing...")
+                #excepted = True
+                continue
+            except Exception as why:
+            
+                print("  Exception:", why)
+                print(traceback.print_exc())
+                print("    Found in row %i at tabular column %i" % (convertob.latest_row_pos,
+                                                                convertob.latest_col_pos))
+                print("    Column is:", convertob.latest_col_value)
+                print("    Row is:", convertob.latest_row_value)
+                break
+
+
+if __name__=="__main__":
+    Old_Geno_Directory = """/home/zas1024/gene/web/genotypes/"""
+    New_Geno_Directory = """/home/zas1024/gene/web/new_genotypes/"""
+    #Input_File = """/home/zas1024/gene/web/genotypes/BXD.geno"""
+    #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" 
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
+    #ConvertGenoFiles(Geno_Directory)
+    
+    #process_csv(Input_File, Output_File)
\ No newline at end of file
diff --git a/wqflask/wqflask/my_pylmm/data/prep_data.py b/wqflask/wqflask/my_pylmm/data/prep_data.py
new file mode 100644
index 00000000..b7a133c2
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/data/prep_data.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+
+from __future__ import absolute_import, print_function, division
+import numpy
+
+    
+class PrepData(object):
+    def __init__(self, exprs_file, snps_file):
+        self.exprs_file = exprs_file
+        self.snps_file = snps_file
+        self.empty_columns = set()
+        #self.identify_no_genotype_samples()
+        self.identify_empty_samples()
+        self.trim_files()
+
+    def identify_empty_samples(self):
+        with open(self.exprs_file) as fh:
+            for line in fh:
+                for pos, item in enumerate(line.split()):
+                    if item == "NA":
+                        self.empty_columns.add(pos)
+        #print("self.empty_columns:", self.empty_columns)
+        nums = set(range(0, 176))
+        print("not included:", nums-self.empty_columns)
+            
+    #def identify_no_genotype_samples(self):
+    #    #for this_file in (self.exprs_file, self.snps_file):
+    #        #with open(this_file) as fh:
+    #        no_geno_samples = []
+    #        has_genotypes = False
+    #        with open(self.snps_file) as fh:
+    #            for line in fh:
+    #                num_samples = len(line.split())
+    #                break
+    #            for sample in range (num_samples):
+    #                for line in fh:
+    #                    if line.split()[sample] != "NA":
+    #                        has_genotypes = True
+    #                        break
+    #                if has_genotypes == False:
+    #                    no_geno_samples.append(sample)
+    #                    
+    #        print(no_geno_samples)
+
+    def trim_files(self):
+        for this_file in (self.exprs_file, self.snps_file):
+            input_file = open(this_file)
+            this_file_name_output = this_file + ".new"
+            with open(this_file_name_output, "w") as output:
+                for line in input_file:
+                    data_wanted = []
+                    for pos, item in enumerate(line.split()):
+                        if pos in self.empty_columns:
+                            continue
+                        else:
+                            data_wanted.append("%2s" % (item))
+                    #print("data_wanted is", data_wanted)
+                    output.write(" ".join(data_wanted) + "\n")
+            print("Done writing file:", this_file_name_output)
+
+if __name__=="__main__":
+    exprs_file = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.exprs.1"""
+    snps_file = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.snps.1000"""
+    PrepData(exprs_file, snps_file)
\ No newline at end of file
diff --git a/wqflask/wqflask/my_pylmm/example.py b/wqflask/wqflask/my_pylmm/example.py
new file mode 100644
index 00000000..0348d67b
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/example.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python
+
+from __future__ import absolute_import, print_function, division
+
+import sys
+import time
+
+import numpy as np
+from pyLMM import lmm
+
+from pprint import pformat as pf
+
+
+Y = np.genfromtxt('data/mdp.exprs.1.new')
+print("exprs is:", pf(Y.shape))
+
+# Loading npdump and first 1000 snps for speed
+#K = np.load('data/hmdp.liver.K.npdump')
+#snps = np.load('data/hmdp.liver.snps.1000.npdump').T
+
+# These three lines will load all SNPs (from npdump or from txt) and 
+# calculate the kinship
+snps = np.genfromtxt('data/mdp.snps.1000.new').T
+print("snps is:", pf(snps.shape))
+#snps = snps[~np.isnan(snps).all(axis=1)]
+#print ("snps is now:", pf(snps))
+np.savetxt("/home/zas1024/gene/wqflask/wqflask/pylmm/data/mdp.snps.trimmed", snps, fmt='%s', delimiter=' ')
+#snps = np.load('data/hmdp.liver.snps.npdump').T
+K = lmm.calculateKinship(snps)
+#print("K is:", pf(K))
+#print("Y is:", pf(Y.shape))
+
+# Instantiate a LMM object for the phentoype Y and fit the null model
+L = lmm.LMM(Y,K)
+L.fit()
+
+# Manually calculate the association at one SNP
+X = snps[:,0]
+X[np.isnan(X)] = X[True - np.isnan(X)].mean() # Fill missing with MAF
+X = X.reshape(len(X),1)
+if X.var() == 0: ts,ps = (np.nan,np.nan)
+else: ts,ps = L.association(X)
+
+# If I want to refit the variance component
+L.fit(X=X)
+ts,ps = L.association(X)
+
+# If I want to do a genome-wide scan over the 1000 SNPs.
+# This call will use REML (REML = False means use ML).
+# It will also refit the variance components for each SNP.
+# Setting refit = False will cause the program to fit the model once
+# and hold those variance component estimates for each SNP.
+begin = time.time()
+TS,PS = lmm.GWAS(Y,snps,K,REML=True,refit=False)
+print("TS is:", pf(TS))
+print("PS is:", pf(PS))
+end = time.time()
+sys.stderr.write("Total time for 1000 SNPs: %0.3f\n" % (end- begin))
\ No newline at end of file
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
new file mode 100644
index 00000000..7fe599c4
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -0,0 +1,341 @@
+# pyLMM software Copyright 2012, Nicholas A. Furlotte
+# Version 0.1
+
+#License Details
+#---------------
+
+# The program is free for academic use. Please contact Nick Furlotte
+# <nick.furlotte@gmail.com> if you are interested in using the software for
+# commercial purposes.
+
+# The software must not be modified and distributed without prior
+# permission of the author.
+# Any instance of this software must retain the above copyright notice.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import absolute_import, print_function, division
+
+import sys
+import time
+import numpy as np
+import numpy.linalg as linalg
+from scipy import optimize
+from scipy import stats
+#import matplotlib.pyplot as pl
+import pdb
+
+from pprint import pformat as pf
+
+def calculateKinship(W):
+    """
+       W is an n x m matrix encoding SNP minor alleles.
+    """
+    n = W.shape[0]
+    m = W.shape[1]
+    keep = []
+    for i in range(m):
+        mn = W[True - np.isnan(W[:,i]),i].mean()
+        W[np.isnan(W[:,i]),i] = mn
+        vr = W[:,i].var()
+        if vr == 0: continue
+
+        keep.append(i)
+        W[:,i] = (W[:,i] - mn) / np.sqrt(vr)
+
+    W = W[:,keep]
+    K = np.dot(W,W.T) * 1.0/float(m)
+    return K
+
+def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
+    """
+      Performs a basic GWAS scan using the LMM.  This function
+      uses the LMM module to assess association at each SNP and
+      does some simple cleanup, such as removing missing individuals
+      per SNP and re-computing the eigen-decomp
+
+      Y - n x 1 phenotype vector
+      X - n x m SNP matrix
+      K - n x n kinship matrix
+      Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K
+      X0 - n x q covariate matrix
+      REML - use restricted maximum likelihood
+      refit - refit the variance component for each SNP
+    """
+    n = X.shape[0]
+    m = X.shape[1]
+
+    if X0 == None: X0 = np.ones((n,1))
+
+    # Remove missing values in Y and adjust associated parameters
+    v = np.isnan(Y)
+    if v.sum():
+        keep = True - v
+        Y = Y[keep]
+        X = X[keep,:]
+        X0 = X0[keep,:]
+        K = K[keep,:][:,keep]
+        Kva = []
+        Kve = []
+
+    L = LMM(Y,K,Kva,Kve,X0)
+    if not refit: L.fit()
+
+    PS = []
+    TS = []
+
+    for i in range(m):
+        x = X[:,i].reshape((n,1))
+        v = np.isnan(x).reshape((-1,))
+        if v.sum():
+            keep = True - v
+            xs = x[keep,:]
+            if xs.var() == 0:
+                PS.append(np.nan)
+                TS.append(np.nan)
+                continue
+
+            Ys = Y[keep]
+            X0s = X0[keep,:]
+            Ks = K[keep,:][:,keep]
+            Ls = LMM(Ys,Ks,X0=X0s)
+            if refit: Ls.fit(X=xs)
+            else: Ls.fit()
+            ts,ps = Ls.association(xs,REML=REML)
+        else:
+            if x.var() == 0:
+                PS.append(np.nan)
+                TS.append(np.nan)
+                continue
+
+            if refit: L.fit(X=x)
+            ts,ps = L.association(x,REML=REML)
+
+        PS.append(ps)
+        TS.append(ts)
+
+    return TS,PS
+
+class LMM:
+
+    """
+          This is a simple version of EMMA/fastLMM.
+          The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
+          and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
+          There are three model parameters: heritability (h), covariate coefficients (beta) and the total
+          phenotypic variance (sigma).
+          Heritability as defined here is the proportion of the total variance (sigma) that is attributed to
+          the kinship matrix.
+
+          For simplicity, we assume that everything being input is a numpy array.
+          If this is not the case, the module may throw an error as conversion from list to numpy array
+          is not done consistently.
+
+    """
+    def __init__(self,Y,K,Kva=[],Kve=[],X0=None):
+
+        """
+        The constructor takes a phenotype vector or array of size n.
+        It takes a kinship matrix of size n x n.  Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
+        If they are not provided, the constructor will calculate them.
+        X0 is an optional covariate matrix of size n x q, where there are q covariates.
+        When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
+        """
+
+        if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1)
+
+        x = Y != -9
+        if not x.sum() == len(Y):
+            sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum()))
+            Y = Y[x]
+            K = K[x,:][:,x]
+            X0 = X0[x,:]
+            Kva = []
+            Kve = []
+        self.nonmissing = x
+
+        if len(Kva) == 0 or len(Kve) == 0:
+            sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+            begin = time.time()
+            Kva,Kve = linalg.eigh(K)
+            end = time.time()
+            sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+        self.K = K
+        self.Kva = Kva
+        self.Kve = Kve
+        self.Y = Y
+        self.X0 = X0
+        self.N = self.K.shape[0]
+
+        self.transform()
+
+    def transform(self):
+
+        """
+           Computes a transformation on the phenotype vector and the covariate matrix.
+           The transformation is obtained by left multiplying each parameter by the transpose of the
+           eigenvector matrix of K (the kinship).
+        """
+        
+        print(len(self.Kve.T))
+        print(len(self.Y))
+
+        self.Yt = np.dot(self.Kve.T, self.Y)
+        self.X0t = np.dot(self.Kve.T, self.X0)
+
+    def getMLSoln(self,h,X):
+
+        """
+           Obtains the maximum-likelihood estimates for the covariate coefficients (beta),
+           the total variance of the trait (sigma) and also passes intermediates that can
+           be utilized in other functions. The input parameter h is a value between 0 and 1 and represents
+           the heritability or the proportion of the total variance attributed to genetics.  The X is the
+           covariate matrix.
+        """
+
+        #print("h is", pf(h))
+        #print("X is", pf(X))
+        print("X.shape is", pf(X.shape))
+
+        S = 1.0/(h*self.Kva + (1.0 - h))
+        Xt = X.T*S
+        XX = np.dot(Xt,X)
+
+
+        XX_i = linalg.inv(XX)
+        beta =  np.dot(np.dot(XX_i,Xt),self.Yt)
+        Yt = self.Yt - np.dot(X,beta)
+        Q = np.dot(Yt.T*S,Yt)
+        sigma = Q * 1.0 / (float(len(self.Yt)) - float(X.shape[1]))
+        return beta,sigma,Q,XX_i,XX
+
+    def LL_brent(self,h,X=None,REML=False): return -self.LL(h,X,stack=False,REML=REML)[0]
+    def LL(self,h,X=None,stack=True,REML=False):
+
+        """
+           Computes the log-likelihood for a given heritability (h).  If X==None, then the
+           default X0t will be used.  If X is set and stack=True, then X0t will be matrix concatenated with
+           the input X.  If stack is false, then X is used in place of X0t in the LL calculation.
+           REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True.
+        """
+
+        if X == None: X = self.X0t
+        elif stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)])
+
+        n = float(self.N)
+        q = float(X.shape[1])
+        beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X)
+        LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q)
+        LL = -0.5 * LL
+
+        if REML:
+            LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(linalg.det(np.dot(X.T,X))) - np.log(linalg.det(XX))
+            LL = LL + 0.5*LL_REML_part
+
+        return LL,beta,sigma,XX_i
+
+    def getMax(self,H, X=None,REML=False):
+
+        """
+           Helper functions for .fit(...).
+           This function takes a set of LLs computed over a grid and finds possible regions
+           containing a maximum.  Within these regions, a Brent search is performed to find the
+           optimum.
+
+        """
+        n = len(self.LLs)
+        HOpt = []
+        for i in range(1,n-2):
+            if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
+
+        if len(HOpt) > 1:
+            sys.stderr.write("ERR: Found multiple maximum.  Returning first...\n")
+            return HOpt[0]
+        elif len(HOpt) == 1: return HOpt[0]
+        elif self.LLs[0] > self.LLs[n-1]: return H[0]
+        else: return H[n-1]
+
+    def fit(self,X=None,ngrids=100,REML=True):
+
+        """
+           Finds the maximum-likelihood solution for the heritability (h) given the current parameters.
+           X can be passed and will transformed and concatenated to X0t.  Otherwise, X0t is used as
+           the covariate matrix.
+
+           This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum.
+           Given this optimum, the function computes the LL and associated ML solutions.
+        """
+
+        if X == None: X = self.X0t
+        else: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)])
+        H = np.array(range(ngrids)) / float(ngrids)
+        L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H])
+        self.LLs = L
+
+        hmax = self.getMax(H,X,REML)
+        L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML)
+
+        self.H = H
+        self.optH = hmax
+        self.optLL = L
+        self.optBeta = beta
+        self.optSigma = sigma
+
+        return hmax,beta,sigma,L
+
+
+    def association(self,X, h = None, stack=True,REML=True):
+
+        """
+          Calculates association statitics for the SNPs encoded in the vector X of size n.
+          If h == None, the optimal h stored in optH is used.
+
+        """
+        if stack: X = np.hstack([self.X0t,np.dot(self.Kve.T, X)])
+        if h == None: h = self.optH
+
+        L,beta,sigma,betaSTDERR = self.LL(h,X,stack=False,REML=REML)
+        q  = len(beta)
+        ts,ps = self.tstat(beta[q-1],betaSTDERR[q-1,q-1],sigma,q)
+        return ts,ps
+
+    def tstat(self,beta,stderr,sigma,q):
+
+        """
+           Calculates a t-statistic and associated p-value given the estimate of beta and its standard error.
+           This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test.
+        """
+
+        ts = beta / np.sqrt(stderr * sigma)
+        ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q))
+        return ts,ps
+
+    def plotFit(self,color='b-',title=''):
+
+        """
+           Simple function to visualize the likelihood space.  It takes the LLs
+           calcualted over a grid and normalizes them by subtracting off the mean and exponentiating.
+           The resulting "probabilities" are normalized to one and plotted against heritability.
+           This can be seen as an approximation to the posterior distribuiton of heritability.
+
+           For diagnostic purposes this lets you see if there is one distinct maximum or multiple
+           and what the variance of the parameter looks like.
+        """
+        mx = self.LLs.max()
+        p = np.exp(self.LLs - mx)
+        p = p/p.sum()
+
+        pl.plot(self.H,p,color)
+        pl.xlabel("Heritability")
+        pl.ylabel("Probability of data")
+        pl.title(title)
author	Zachary Sloan	2013-02-06 18:32:30 -0600
committer	Zachary Sloan	2013-02-06 18:32:30 -0600
commit	d75fc63891f617fbe8b2b030fdce80b1628c6a41 (patch)
tree	a99fab2d8a4d08aec26622fc92e4ac2f29c67f30
parent	471480255a28ae8743f4b736057a8d54585b1eca (diff)
download	genenetwork2-d75fc63891f617fbe8b2b030fdce80b1628c6a41.tar.gz