author      Pjotr Prins    2015-03-16 17:34:40 +0300
committer   Pjotr Prins    2015-03-16 17:34:40 +0300
commit      cc419336f559a51ed03a669d19042e6fd21355ed (patch)
tree        a0af7aa95e5bcec5c6be9616e529689f423fbf15
parent      62f6db77b22e1fb28b3355d75a30f9ecf6c94d95 (diff)
download    genenetwork2-cc419336f559a51ed03a669d19042e6fd21355ed.tar.gz
Adding GWAS code
-rw-r--r--  wqflask/wqflask/my_pylmm/pyLMM/gwas.py        99
-rw-r--r--  wqflask/wqflask/my_pylmm/pyLMM/lmm.py         44
-rw-r--r--  wqflask/wqflask/my_pylmm/pyLMM/lmm2.py       405
-rw-r--r--  wqflask/wqflask/my_pylmm/pyLMM/phenotype.py    2
-rw-r--r--  wqflask/wqflask/my_pylmm/pyLMM/runlmm.py      26
5 files changed, 490 insertions, 86 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index 52455014..b9d5db61 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -21,79 +21,30 @@ import pdb
import time
import sys
# from utility import temp_data
-import lmm
-
+import lmm2
import os
import numpy as np
import input
from optmatrix import matrix_initialize
-# from lmm import LMM
+from lmm2 import LMM2
import multiprocessing as mp # Multiprocessing is part of the Python stdlib
import Queue
-def compute_snp(j,snp_ids,q = None):
- # print(j,len(snp_ids),"\n")
+def formatResult(id,beta,betaSD,ts,ps):
+ return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
+
+def compute_snp(j,n,snp_ids,lmm2,REML,q = None):
+ # print(j,snp_ids,"\n")
result = []
for snp_id in snp_ids:
- # j,snp_id = collect
snp,id = snp_id
- # id = collect[1]
- # result = []
- # Check SNPs for missing values
- x = snp[keep].reshape((n,1)) # all the SNPs
- v = np.isnan(x).reshape((-1,))
- if v.sum():
- # NOTE: this code appears to be unreachable!
- if verbose:
- sys.stderr.write("Found missing values in "+str(x))
- keeps = True - v
- xs = x[keeps,:]
- if keeps.sum() <= 1 or xs.var() <= 1e-6:
- # PS.append(np.nan)
- # TS.append(np.nan)
- # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
- # continue
- result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
- continue
-
- # Its ok to center the genotype - I used normalizeGenotype to
- # force the removal of missing genotypes as opposed to replacing them with MAF.
- if not normalizeGenotype:
- xs = (xs - xs.mean()) / np.sqrt(xs.var())
- Ys = Y[keeps]
- X0s = X0[keeps,:]
- Ks = K[keeps,:][:,keeps]
- if kfile2:
- K2s = K2[keeps,:][:,keeps]
- Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=verbose,K2=K2s)
- else:
- Ls = LMM(Ys,Ks,X0=X0s,verbose=verbose)
- if refit:
- Ls.fit(X=xs,REML=REML)
- else:
- #try:
- Ls.fit(REML=REML)
- #except: pdb.set_trace()
- ts,ps,beta,betaVar = Ls.association(xs,REML=REML,returnBeta=True)
- else:
- if x.var() == 0:
- # Note: this code appears to be unreachable!
-
- # PS.append(np.nan)
- # TS.append(np.nan)
- # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values
- result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
- continue
-
- if refit:
- L.fit(X=x,REML=REML)
- # This is where it happens
- ts,ps,beta,betaVar = L.association(x,REML=REML,returnBeta=True)
+ x = snp.reshape((n,1)) # all the SNPs
+ # if refit:
+ # L.fit(X=snp,REML=REML)
+ ts,ps,beta,betaVar = lmm2.association(x,REML=REML,returnBeta=True)
result.append(formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps))
- # compute_snp.q.put([j,formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)])
- # print [j,result[0]]," in result queue\n"
if not q:
q = compute_snp.q
q.put([j,result])
@@ -113,8 +64,9 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
"""
matrix_initialize()
cpu_num = mp.cpu_count()
- numThreads = 1
+ numThreads = None
kfile2 = False
+ reml = restricted_max_likelihood
sys.stderr.write(str(G.shape)+"\n")
n = G.shape[1] # inds
@@ -123,17 +75,17 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
snps = m
sys.stderr.write(str(m)+" SNPs\n")
# print "***** GWAS: G",G.shape,G
- assert m>n, "n should be larger than m (snps>inds)"
+ assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
# CREATE LMM object for association
# if not kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=verbose)
# else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=verbose,K2=K2)
- runlmm = lmm.LMM(Y,K) # ,Kva,Kve,X0,verbose=verbose)
+ lmm2 = LMM2(Y,K) # ,Kva,Kve,X0,verbose=verbose)
if not refit:
if verbose: sys.stderr.write("Computing fit for null model\n")
- runlmm.fit() # follow GN model in run_other
- if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (runlmm.optH,runlmm.optSigma))
+ lmm2.fit() # follow GN model in run_other
+ if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (lmm2.optH,lmm2.optSigma))
# outFile = "test.out"
# out = open(outFile,'w')
@@ -142,8 +94,6 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
def outputResult(id,beta,betaSD,ts,ps):
out.write(formatResult(id,beta,betaSD,ts,ps))
def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
- def formatResult(id,beta,betaSD,ts,ps):
- return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
printOutHead()
@@ -162,15 +112,15 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
last_j = 0
# for snp_id in G:
for snp in G:
- print snp.shape,snp
- snp_id = ('SNPID',snp)
+ snp_id = (snp,'SNPID')
count += 1
if count % 1000 == 0:
job = count/1000
if verbose:
sys.stderr.write("Job %d At SNP %d\n" % (job,count))
if numThreads == 1:
- compute_snp(job,collect,q)
+ print "Running on 1 THREAD"
+ compute_snp(job,n,collect,lmm2,reml,q)
collect = []
j,lines = q.get()
if verbose:
@@ -178,7 +128,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
for line in lines:
out.write(line)
else:
- p.apply_async(compute_snp,(job,collect))
+ p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
collect = []
while job > completed:
try:
@@ -205,6 +155,13 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
sys.stderr.write("Job "+str(j)+" finished\n")
for line in lines:
out.write(line)
+ else:
+ print "Running on 1 THREAD"
+ # print collect
+ compute_snp(count/1000,n,collect,lmm2,reml,q)
+ j,lines = q.get()
+ for line in lines:
+ out.write(line)
# print collect
ps = [item[1][3] for item in collect]
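
For orientation, here is a minimal, hedged usage sketch of the reworked gwas() entry point. The call signature and the SNP-major orientation of the genotype matrix are taken from the lmm.py call further down in this commit; the synthetic data, variable names, and the inline kinship construction (mirroring lmm2.calculateKinship) are illustrative assumptions, not part of the patch.

    # Hedged sketch (not part of the commit): drive gwas.gwas() with synthetic data.
    import numpy as np
    import gwas

    if __name__ == '__main__':                      # gwas() spawns worker processes
        inds, snps = 20, 200                        # the assert requires snps > inds
        rng = np.random.RandomState(0)
        geno = rng.binomial(2, 0.3, size=(inds, snps)).astype(np.float64)  # inds x snps
        W = (geno - geno.mean(axis=0)) / geno.std(axis=0)
        K = np.dot(W, W.T) / float(snps)            # RRM kinship, as in lmm2.calculateKinship
        Y = rng.normal(size=inds)
        # lmm.py passes the genotype matrix SNP-major: gwas.gwas(Y, G.T, K, ...)
        ts, ps = gwas.gwas(Y, geno.T, K,
                           restricted_max_likelihood=True, refit=False, verbose=True)
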
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 8c6d3c3c..c42f9fb7 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -51,6 +51,7 @@ from utility.benchmark import Bench
from utility import temp_data
from kinship import kinship, kinship_full, kvakve
import genotype
+import phenotype
import gwas
try:
@@ -315,32 +316,45 @@ def run_other_new(pheno_vector,
print("In run_other (new)")
print("REML=",restricted_max_likelihood," REFIT=",refit)
+
+ # Adjust phenotypes
+ Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True)
+ print("Removed missing phenotypes",Y.shape)
+ # if options.maf_normalization:
+ # G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+ # print "MAF replacements: \n",G
+ # if not options.skip_genotype_normalization:
+ # G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+
with Bench("Calculate Kinship"):
- kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata)
+ K,G = calculate_kinship(G, tempdata)
- print("kinship_matrix: ", pf(kinship_matrix))
- print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
+ print("kinship_matrix: ", pf(K))
+ print("kinship_matrix.shape: ", pf(K.shape))
- with Bench("Create LMM object"):
- lmm_ob = LMM(pheno_vector, kinship_matrix)
- with Bench("LMM_ob fitting"):
- lmm_ob.fit()
+ # with Bench("Create LMM object"):
+ # lmm_ob = lmm2.LMM2(Y,K)
+ # with Bench("LMM_ob fitting"):
+ # lmm_ob.fit()
- print("genotype_matrix: ", genotype_matrix.shape)
- print(genotype_matrix)
+ print("genotype_matrix: ", G.shape)
+ print(G)
with Bench("Doing GWAS"):
- t_stats, p_values = gwas.gwas(pheno_vector,
- genotype_matrix.T,
- kinship_matrix,
+ t_stats, p_values = gwas.gwas(Y,
+ G.T,
+ K,
restricted_max_likelihood=True,
refit=False,verbose=True)
Bench().report()
return p_values, t_stats
-run_other = run_other_old
+run_other = run_other_new
def matrixMult(A,B):
+ return np.dot(A,B)
+
+def matrixMult_old(A,B):
# If there is no fblas then we will revert to np.dot()
@@ -674,11 +688,15 @@ class LMM:
optimum.
"""
+ print("H=",H)
+ print("X=",X)
+ print("REML=",REML)
n = len(self.LLs)
HOpt = []
for i in range(1,n-2):
if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]:
HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
+ print("HOpt=",HOpt)
if np.isnan(HOpt[-1][0]):
HOpt[-1][0] = [self.LLs[i-1]]
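
The getMax() helper patched above (and duplicated in the new lmm2.py below) scans log-likelihoods computed on a heritability grid, then refines each bracketed local maximum with Brent's method. A generic sketch of that grid-then-Brent pattern, with a toy objective standing in for the real log-likelihood:

    # Hedged sketch of the grid-then-Brent pattern used by getMax(); f() is a
    # stand-in objective, not the pyLMM log-likelihood.
    import numpy as np
    from scipy import optimize

    def f(h):
        return -(h - 0.3) ** 2          # toy log-likelihood with its maximum at h = 0.3

    H = np.arange(100) / 100.0          # same grid construction as fit(): ngrids points in [0, 1)
    LLs = np.array([f(h) for h in H])

    candidates = []
    for i in range(1, len(H) - 2):
        if LLs[i - 1] < LLs[i] > LLs[i + 1]:        # local maximum on the grid
            # refine inside the bracketing grid points (brent minimizes, so negate)
            candidates.append(optimize.brent(lambda h: -f(h), brack=(H[i - 1], H[i + 1])))

    hmax = candidates[0] if candidates else (H[0] if LLs[0] > LLs[-1] else H[-1])
    print(hmax)                         # ~0.3
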
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
new file mode 100644
index 00000000..cba47a9b
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -0,0 +1,405 @@
+# pylmm is a python-based linear mixed-model solver with applications to GWAS
+
+# Copyright (C) 2013,2014 Nicholas A. Furlotte (nick.furlotte@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import time
+import numpy as np
+from scipy.linalg import eigh, inv, det
+import scipy.stats as stats # t-tests
+from scipy import optimize
+from optmatrix import matrixMult
+
+def calculateKinship(W,center=False):
+ """
+ W is an n x m matrix encoding SNP minor alleles.
+
+ This function takes a matrix of SNPs, imputes missing values with the MAF,
+ normalizes the resulting vectors and returns the RRM matrix.
+ """
+ n = W.shape[0]
+ m = W.shape[1]
+ keep = []
+ for i in range(m):
+ mn = W[True - np.isnan(W[:,i]),i].mean()
+ W[np.isnan(W[:,i]),i] = mn
+ vr = W[:,i].var()
+ if vr == 0: continue
+
+ keep.append(i)
+ W[:,i] = (W[:,i] - mn) / np.sqrt(vr)
+
+ W = W[:,keep]
+ K = matrixMult(W,W.T) * 1.0/float(m)
+ if center:
+ P = np.diag(np.repeat(1,n)) - 1/float(n) * np.ones((n,n))
+ S = np.trace(matrixMult(matrixMult(P,K),P))
+ K_n = (n - 1)*K / S
+ return K_n
+ return K
+
+def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
+ """
+
+ Performs a basic GWAS scan using the LMM. This function
+ uses the LMM module to assess association at each SNP and
+ does some simple cleanup, such as removing missing individuals
+ per SNP and re-computing the eigen-decomp
+
+ Y - n x 1 phenotype vector
+ X - n x m SNP matrix (genotype matrix)
+ K - n x n kinship matrix
+ Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K
+ X0 - n x q covariate matrix
+ REML - use restricted maximum likelihood
+ refit - refit the variance component for each SNP
+
+ """
+ n = X.shape[0]
+ m = X.shape[1]
+ prins("Initialize GWAS")
+ print("genotype matrix n is:", n)
+ print("genotype matrix m is:", m)
+
+ if X0 == None:
+ X0 = np.ones((n,1))
+
+ # Remove missing values in Y and adjust associated parameters
+ v = np.isnan(Y)
+ if v.sum():
+ keep = True - v
+ keep = keep.reshape((-1,))
+ Y = Y[keep]
+ X = X[keep,:]
+ X0 = X0[keep,:]
+ K = K[keep,:][:,keep]
+ Kva = []
+ Kve = []
+
+ if len(Y) == 0:
+ return np.ones(m)*np.nan,np.ones(m)*np.nan
+
+ L = LMM(Y,K,Kva,Kve,X0)
+ if not refit: L.fit()
+
+ PS = []
+ TS = []
+
+ n = X.shape[0]
+ m = X.shape[1]
+
+ for i in range(m):
+ x = X[:,i].reshape((n,1))
+ v = np.isnan(x).reshape((-1,))
+ if v.sum():
+ keep = True - v
+ xs = x[keep,:]
+ if xs.var() == 0:
+ PS.append(np.nan)
+ TS.append(np.nan)
+ continue
+
+ Ys = Y[keep]
+ X0s = X0[keep,:]
+ Ks = K[keep,:][:,keep]
+ Ls = LMM(Ys,Ks,X0=X0s)
+ if refit:
+ Ls.fit(X=xs)
+ else:
+ Ls.fit()
+ ts,ps = Ls.association(xs,REML=REML)
+ else:
+ if x.var() == 0:
+ PS.append(np.nan)
+ TS.append(np.nan)
+ continue
+
+ if refit:
+ L.fit(X=x)
+ ts,ps = L.association(x,REML=REML)
+
+ PS.append(ps)
+ TS.append(ts)
+
+ return TS,PS
+
+class LMM2:
+
+ """
+ This is a simple version of EMMA/fastLMM.
+ The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
+ and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
+ There are three model parameters: heritability (h), covariate coefficients (beta) and the total
+ phenotypic variance (sigma).
+ Heritability as defined here is the proportion of the total variance (sigma) that is attributed to
+ the kinship matrix.
+
+ For simplicity, we assume that everything being input is a numpy array.
+ If this is not the case, the module may throw an error as conversion from list to numpy array
+ is not done consistently.
+
+ """
+ def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False):
+
+ """
+ The constructor takes a phenotype vector or array Y of size n.
+ It takes a kinship matrix K of size n x n. Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
+ If they are not provided, the constructor will calculate them.
+ X0 is an optional covariate matrix of size n x q, where there are q covariates.
+ When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
+ """
+
+ if X0 == None:
+ X0 = np.ones(len(Y)).reshape(len(Y),1)
+ self.verbose = verbose
+
+ x = True - np.isnan(Y)
+ x = x.reshape(-1,)
+ if not x.sum() == len(Y):
+ if self.verbose: sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum()))
+ Y = Y[x]
+ K = K[x,:][:,x]
+ X0 = X0[x,:]
+ Kva = []
+ Kve = []
+ self.nonmissing = x
+
+ if len(Kva) == 0 or len(Kve) == 0:
+ if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+ begin = time.time()
+ Kva,Kve = eigh(K)
+ end = time.time()
+ if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+
+ self.K = K
+ self.Kva = Kva
+ self.Kve = Kve
+ self.N = self.K.shape[0]
+ self.Y = Y.reshape((self.N,1))
+ self.X0 = X0
+
+ if sum(self.Kva < 1e-6):
+ if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0)))
+ self.Kva[self.Kva < 1e-6] = 1e-6
+
+ self.transform()
+
+ def transform(self):
+
+ """
+ Computes a transformation on the phenotype vector and the covariate matrix.
+ The transformation is obtained by left multiplying each parameter by the transpose of the
+ eigenvector matrix of K (the kinship).
+ """
+
+ self.Yt = matrixMult(self.Kve.T, self.Y)
+ self.X0t = matrixMult(self.Kve.T, self.X0)
+ self.X0t_stack = np.hstack([self.X0t, np.ones((self.N,1))])
+ self.q = self.X0t.shape[1]
+
+ def getMLSoln(self,h,X):
+
+ """
+ Obtains the maximum-likelihood estimates for the covariate coefficients (beta),
+ the total variance of the trait (sigma) and also passes intermediates that can
+ be utilized in other functions. The input parameter h is a value between 0 and 1 and represents
+ the heritability or the proportion of the total variance attributed to genetics. The X is the
+ covariate matrix.
+ """
+
+ S = 1.0/(h*self.Kva + (1.0 - h))
+ Xt = X.T*S
+ XX = matrixMult(Xt,X)
+ XX_i = inv(XX)
+ beta = matrixMult(matrixMult(XX_i,Xt),self.Yt)
+ Yt = self.Yt - matrixMult(X,beta)
+ Q = np.dot(Yt.T*S,Yt)
+ sigma = Q * 1.0 / (float(self.N) - float(X.shape[1]))
+ return beta,sigma,Q,XX_i,XX
+
+ def LL_brent(self,h,X=None,REML=False):
+ #brent will not be bounded by the specified bracket.
+ # I return a large number if we encounter h < 0 to avoid errors in LL computation during the search.
+ if h < 0: return 1e6
+ return -self.LL(h,X,stack=False,REML=REML)[0]
+
+ def LL(self,h,X=None,stack=True,REML=False):
+
+ """
+ Computes the log-likelihood for a given heritability (h). If X==None, then the
+ default X0t will be used. If X is set and stack=True, then X0t will be matrix concatenated with
+ the input X. If stack is false, then X is used in place of X0t in the LL calculation.
+ REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True.
+ """
+
+ if X == None: X = self.X0t
+ elif stack:
+ self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+ X = self.X0t_stack
+
+ n = float(self.N)
+ q = float(X.shape[1])
+ beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X)
+ LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q)
+ LL = -0.5 * LL
+
+ if REML:
+ LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(det(matrixMult(X.T,X))) - np.log(det(XX))
+ LL = LL + 0.5*LL_REML_part
+
+
+ LL = LL.sum()
+ return LL,beta,sigma,XX_i
+
+ def getMax(self,H, X=None,REML=False):
+
+ """
+ Helper functions for .fit(...).
+ This function takes a set of LLs computed over a grid and finds possible regions
+ containing a maximum. Within these regions, a Brent search is performed to find the
+ optimum.
+
+ """
+ n = len(self.LLs)
+ HOpt = []
+ for i in range(1,n-2):
+ if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]:
+ HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
+ if np.isnan(HOpt[-1]): HOpt[-1] = H[i-1]
+ #if np.isnan(HOpt[-1]): HOpt[-1] = self.LLs[i-1]
+ #if np.isnan(HOpt[-1][0]): HOpt[-1][0] = [self.LLs[i-1]]
+
+ if len(HOpt) > 1:
+ if self.verbose: sys.stderr.write("NOTE: Found multiple optima. Returning first...\n")
+ return HOpt[0]
+ elif len(HOpt) == 1: return HOpt[0]
+ elif self.LLs[0] > self.LLs[n-1]: return H[0]
+ else: return H[n-1]
+
+
+ def fit(self,X=None,ngrids=100,REML=True):
+
+ """
+ Finds the maximum-likelihood solution for the heritability (h) given the current parameters.
+ X can be passed and will transformed and concatenated to X0t. Otherwise, X0t is used as
+ the covariate matrix.
+
+ This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum.
+ Given this optimum, the function computes the LL and associated ML solutions.
+ """
+
+ if X == None: X = self.X0t
+ else:
+ #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
+ self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+ X = self.X0t_stack
+
+ H = np.array(range(ngrids)) / float(ngrids)
+ L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H])
+ self.LLs = L
+
+ hmax = self.getMax(H,X,REML)
+ L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML)
+
+ self.H = H
+ self.optH = hmax.sum()
+ self.optLL = L
+ self.optBeta = beta
+ self.optSigma = sigma.sum()
+
+ return hmax,beta,sigma,L
+
+ def association(self,X,h=None,stack=True,REML=True,returnBeta=False):
+ """
+ Calculates association statistics for the SNPs encoded in the vector X of size n.
+ If h == None, the optimal h stored in optH is used.
+
+ """
+ if False:
+ print "X=",X
+ print "h=",h
+ print "q=",self.q
+ print "self.Kve=",self.Kve
+ print "X0t_stack=",self.X0t_stack.shape,self.X0t_stack
+
+ if stack:
+ # X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
+ m = matrixMult(self.Kve.T,X)
+ # print "m=",m
+ m = m[:,0]
+ self.X0t_stack[:,(self.q)] = m
+ X = self.X0t_stack
+
+ if h == None: h = self.optH
+
+ L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML)
+ q = len(beta)
+ ts,ps = self.tstat(beta[q-1],betaVAR[q-1,q-1],sigma,q)
+
+ if returnBeta: return ts,ps,beta[q-1].sum(),betaVAR[q-1,q-1].sum()*sigma
+ return ts,ps
+
+ def tstat(self,beta,var,sigma,q,log=False):
+
+ """
+ Calculates a t-statistic and associated p-value given the estimate of beta and its standard error.
+ This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test.
+ """
+
+ ts = beta / np.sqrt(var * sigma)
+ #ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q))
+ # sf == survival function - this is more accurate -- could also use logsf if the precision is not good enough
+ if log:
+ ps = 2.0 + (stats.t.logsf(np.abs(ts), self.N-q))
+ else:
+ ps = 2.0*(stats.t.sf(np.abs(ts), self.N-q))
+ if not len(ts) == 1 or not len(ps) == 1:
+ raise Exception("Something bad happened :(")
+ return ts.sum(),ps.sum()
+
+ def plotFit(self,color='b-',title=''):
+
+ """
+ Simple function to visualize the likelihood space. It takes the LLs
+ calculated over a grid and normalizes them by subtracting off the mean and exponentiating.
+ The resulting "probabilities" are normalized to one and plotted against heritability.
+ This can be seen as an approximation to the posterior distribution of heritability.
+
+ For diagnostic purposes this lets you see if there is one distinct maximum or multiple
+ and what the variance of the parameter looks like.
+ """
+ import matplotlib.pyplot as pl
+
+ mx = self.LLs.max()
+ p = np.exp(self.LLs - mx)
+ p = p/p.sum()
+
+ pl.plot(self.H,p,color)
+ pl.xlabel("Heritability")
+ pl.ylabel("Probability of data")
+ pl.title(title)
+
+ def meanAndVar(self):
+
+ mx = self.LLs.max()
+ p = np.exp(self.LLs - mx)
+ p = p/p.sum()
+
+ mn = (self.H * p).sum()
+ vx = ((self.H - mn)**2 * p).sum()
+
+ return mn,vx
+
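
To make the new class concrete, here is a hedged sketch of the way gwas.py drives LMM2: build the object from a phenotype vector and kinship matrix, fit the null model once, then call association() per SNP. The synthetic data and names are illustrative, and the sketch assumes the 2015-era NumPy this module targets.

    # Hedged sketch: fit the null model and test one SNP with LMM2 (synthetic data).
    import numpy as np
    from lmm2 import LMM2

    n = 50
    rng = np.random.RandomState(1)
    geno = rng.binomial(2, 0.4, size=(n, 200)).astype(np.float64)   # inds x snps
    W = (geno - geno.mean(axis=0)) / geno.std(axis=0)
    K = np.dot(W, W.T) / float(geno.shape[1])      # kinship, as in calculateKinship()
    Y = rng.normal(size=n)

    model = LMM2(Y, K, verbose=True)
    model.fit()                                    # null model: no SNP in the design
    print("h^2 = %0.3f, sigma = %0.3f" % (model.optH, model.optSigma))

    x = geno[:, 0].reshape((n, 1))                 # one SNP, n x 1, as in compute_snp()
    ts, ps, beta, betaVar = model.association(x, REML=True, returnBeta=True)
    print("F-stat %0.3f, p-value %0.3g, beta %0.3f" % (ts, ps, beta))
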
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
index bb620052..682ba371 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -36,5 +36,5 @@ def remove_missing(y,g,verbose=False):
sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
y1 = y[keep]
g1 = g[keep,:]
- return y1,g1
+ return y1,g1,keep
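
remove_missing() now returns the mask of retained individuals as a third value, which lets a caller subset other per-individual structures (such as a kinship matrix) to match. A small illustrative sketch of the new three-value calling convention, using made-up data:

    # Hedged sketch of the new three-value return from phenotype.remove_missing().
    import numpy as np
    import phenotype

    y = np.array([1.2, np.nan, 0.7, 2.1])                  # phenotype with one missing value
    g = np.arange(12, dtype=np.float64).reshape((4, 3))    # individuals x snps
    K = np.eye(4)

    y1, g1, keep = phenotype.remove_missing(y, g, verbose=True)
    K1 = K[keep, :][:, keep]            # align the kinship matrix with the kept individuals
    print(y1.shape, g1.shape, K1.shape) # (3,), (3, 3), (3, 3)
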
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index f17f1bd1..4268f3be 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -99,13 +99,37 @@ if options.geno:
g = tsvreader.geno(options.geno)
print g.shape
+if cmd == 'redis_new':
+ # Emulating the redis setup of GN2
+ Y = y
+ G = g
+ print "Original G",G.shape, "\n", G
+
+ gt = G.T
+ G = None
+ ps, ts = gn2_load_redis('testrun','other',k,Y,gt)
+ print np.array(ps)
+ print sum(ps)
+ # Test results
+ p1 = round(ps[0],4)
+ p2 = round(ps[-1],4)
+ sys.stderr.write(options.geno+"\n")
+ if options.geno == 'data/small.geno':
+ assert p1==0.0708, "p1=%f" % p1
+ assert p2==0.1417, "p2=%f" % p2
+ if options.geno == 'data/small_na.geno':
+ assert p1==0.0958, "p1=%f" % p1
+ assert p2==0.0435, "p2=%f" % p2
+ if options.geno == 'data/test8000.geno':
+ assert p1==0.8984, "p1=%f" % p1
+ assert p2==0.9623, "p2=%f" % p2
if cmd == 'redis':
# Emulating the redis setup of GN2
G = g
print "Original G",G.shape, "\n", G
if y != None:
gnt = np.array(g).T
- Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+ Y,g,keep = phenotype.remove_missing(y,g.T,options.verbose)
G = g.T
print "Removed missing phenotypes",G.shape, "\n", G
if options.maf_normalization: