Merge pull request #3 from pjotrp/master

0.50-gn2-pre1
author: zsloan 2015-04-06 10:50:59 -0500
committer: zsloan 2015-04-06 10:50:59 -0500
commit: 24fe675688467b77cb0b1e858e23f119027d08d8 (patch)
tree: fdc0ea6440f342f9bd2a3ed8946998859efebc9b
parent: 5fb4245a3d08f7107cf885639fc275a6d73ea82d (diff)
parent: 67d7d982f96cfd0f3dcb9806ecc6d2f947af9dc9 (diff)
download: genenetwork2-24fe675688467b77cb0b1e858e23f119027d08d8.tar.gz
9 files changed, 789 insertions, 54 deletions
diff --git a/wqflask/wqflask/my_pylmm/README.md b/wqflask/wqflask/my_pylmm/README.md
new file mode 100644
index 00000000..f6b0e72d
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/README.md
@@ -0,0 +1,21 @@
+# RELEASE NOTES
+
+## 0.50-gn2-pre1 release
+
+This is the first test release of multi-core pylmm into GN2. Both
+kinship calculation and GWAS have been made multi-threaded by
+introducing the Python multiprocessing module. Note that only
+run_other has been updated to use the new routines (so human is still
+handled the old way). I have taken care that we can still run both
+old-style and new-style LMM (through passing the 'new_code'
+boolean). This could be an option in the web server for users to
+select and test for any unexpected differences (of which there should
+be none, naturally ;).
+
+The current version can handle missing phenotypes, but as they are
+removed there is no way for GN2 to know what SNPs the P-values belong
+to. A future version will pass a SNP index to allow for missing
+phenotypes.
+
+
+  
\ No newline at end of file
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
index e69de29b..c40c3221 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
@@ -0,0 +1 @@
+PYLMM_VERSION="0.50-gn2-pre1"
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
new file mode 100644
index 00000000..b901c0e2
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -0,0 +1,173 @@
+# pylmm-based GWAS calculation
+#
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#!/usr/bin/python
+
+import pdb
+import time
+import sys
+# from utility import temp_data
+import lmm2
+
+import os
+import numpy as np
+import input
+from optmatrix import matrix_initialize
+from lmm2 import LMM2
+
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue 
+
+def formatResult(id,beta,betaSD,ts,ps):
+   return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
+
+def compute_snp(j,n,snp_ids,lmm2,REML,q = None):
+   # print("COMPUTE SNP",j,snp_ids,"\n")
+   result = []
+   for snp_id in snp_ids:
+      snp,id = snp_id
+      x = snp.reshape((n,1))  # all the SNPs
+      # print "X=",x
+      # if refit:
+      #    L.fit(X=snp,REML=REML)
+      ts,ps,beta,betaVar = lmm2.association(x,REML=REML,returnBeta=True)
+      # result.append(formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps))
+      result.append( (ts,ps) )
+   if not q:
+      q = compute_snp.q
+   q.put([j,result])
+   return j
+      # PS.append(ps)
+      # TS.append(ts)
+      # return len(result)
+      # compute.q.put(result)
+      # return None
+
+def f_init(q):
+   compute_snp.q = q
+
+def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
+   """
+   Execute a GWAS. The G matrix should be n inds (cols) x m snps (rows)
+   """
+   matrix_initialize()
+   cpu_num = mp.cpu_count()
+   numThreads = None # for now use all available threads
+   kfile2 = False
+   reml = restricted_max_likelihood
+
+   sys.stderr.write(str(G.shape)+"\n")
+   n = G.shape[1] # inds
+   inds = n
+   m = G.shape[0] # snps
+   snps = m
+   sys.stderr.write(str(m)+" SNPs\n")
+   # print "***** GWAS: G",G.shape,G
+   assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
+
+   # CREATE LMM object for association
+   # if not kfile2:  L = LMM(Y,K,Kva,Kve,X0,verbose=verbose)
+   # else:  L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=verbose,K2=K2)
+
+   lmm2 = LMM2(Y,K) # ,Kva,Kve,X0,verbose=verbose)
+   if not refit:
+      if verbose: sys.stderr.write("Computing fit for null model\n")
+      lmm2.fit()  # follow GN model in run_other
+      if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (lmm2.optH,lmm2.optSigma))
+      
+   # outFile = "test.out"
+   # out = open(outFile,'w')
+   out = sys.stderr
+
+   def outputResult(id,beta,betaSD,ts,ps):
+      out.write(formatResult(id,beta,betaSD,ts,ps))
+   def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
+
+   # printOutHead()
+   res = []
+
+   # Set up the pool
+   # mp.set_start_method('spawn')
+   q = mp.Queue()
+   p = mp.Pool(numThreads, f_init, [q])
+   collect = []
+
+   # Buffers for pvalues and t-stats
+   # PS = []
+   # TS = []
+   count = 0
+   job = 0
+   jobs_running = 0
+   for snp in G:
+      snp_id = (snp,'SNPID')
+      count += 1
+      if count % 1000 == 0:
+         job += 1
+         if verbose:
+            sys.stderr.write("Job %d At SNP %d\n" % (job,count))
+         if numThreads == 1:
+            print "Running on 1 THREAD"
+            compute_snp(job,n,collect,lmm2,reml,q)
+            collect = []
+            j,lst = q.get()
+            if verbose:
+               sys.stderr.write("Job "+str(j)+" finished\n")
+            res.append((j,lst))
+         else:
+            p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
+            jobs_running += 1
+            collect = []
+            while jobs_running > cpu_num:
+               try:
+                  j,lst = q.get_nowait()
+                  if verbose:
+                     sys.stderr.write("Job "+str(j)+" finished\n")
+                  res.append((j,lst))
+                  jobs_running -= 1
+               except Queue.Empty:
+                  time.sleep(0.1)
+                  pass
+               if jobs_running > cpu_num*2:
+                  time.sleep(1.0)
+               else:
+                  break
+
+      collect.append(snp_id)
+
+   if numThreads==1 or count<1000 or len(collect)>0:
+      job += 1
+      print "Collect final batch size %i job %i @%i: " % (len(collect), job, count)
+      compute_snp(job,n,collect,lmm2,reml,q)
+      collect = []
+      j,lst = q.get()
+      res.append((j,lst))
+   print "count=",count," running=",jobs_running," collect=",len(collect)
+   for job in range(jobs_running):
+      j,lst = q.get(True,15) # time out
+      if verbose:
+         sys.stderr.write("Job "+str(j)+" finished\n")
+      res.append((j,lst))
+
+   print "Before sort",[res1[0] for res1 in res]
+   res = sorted(res,key=lambda x: x[0])
+   # if verbose:
+   #    print "res=",res[0][0:10]
+   print "After sort",[res1[0] for res1 in res]
+   print [len(res1[1]) for res1 in res]
+   ts = [item[0] for j,res1 in res for item in res1]
+   ps = [item[1] for j,res1 in res for item in res1]
+   return ts,ps
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py
index f7b556a5..7063fedf 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/input.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py
@@ -135,6 +135,8 @@ class plink:
     def normalizeGenotype(self,G):
         # print "Before",G
         # print G.shape
+        print "call input.normalizeGenotype"
+        raise "This should not be used"
         x = True - np.isnan(G)
         m = G[x].mean()
         s = np.sqrt(G[x].var())
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 28f2042d..0c43587e 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -21,6 +21,7 @@
 import sys
 import os
 import numpy as np
+from scipy import linalg
 import multiprocessing as mp # Multiprocessing is part of the Python stdlib
 import Queue
 import time
@@ -85,12 +86,13 @@ def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True):
    m = G.shape[0] # snps
    snps = m
    sys.stderr.write(str(m)+" SNPs\n")
-   assert m>n, "n should be larger than m (snps>inds)"
+   assert snps>inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds)
 
    q = mp.Queue()
    p = mp.Pool(numThreads, f_init, [q])
    cpu_num = mp.cpu_count()
    print "CPU cores:",cpu_num
+   print snps,computeSize
    iterations = snps/computeSize+1
    # if testing:
    #    iterations = 8
@@ -153,5 +155,23 @@ def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True):
    #    np.savetxt(outFile+".kve",Kve)
    return K      
 
+def kvakve(K, verbose=True):
+   """
+   Obtain eigendecomposition for K and return Kva,Kve where Kva is cleaned
+   of small values < 1e-6 (notably smaller than zero)
+   """
+   if verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+   
+   Kva,Kve = linalg.eigh(K)
+   if verbose:
+      print("Kva is: ", Kva.shape, Kva)
+      print("Kve is: ", Kve.shape, Kve)
+
+   if sum(Kva < 1e-6):
+      if verbose: sys.stderr.write("Cleaning %d eigen values (Kva<0)\n" % (sum(Kva < 0)))
+      Kva[Kva < 1e-6] = 1e-6
+   return Kva,Kve
+
+
 
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 937d3340..58ff809b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -50,8 +50,10 @@ has_gn2=True
 
 from utility.benchmark import Bench
 from utility import temp_data
-from kinship import kinship, kinship_full
+from kinship import kinship, kinship_full, kvakve
 import genotype
+import phenotype
+import gwas
 
 try:
     from wqflask.my_pylmm.pyLMM import chunks
@@ -254,7 +256,7 @@ def human_association(snp,
 #        refit=False,
 #        temp_data=None):
     
-def run_other(pheno_vector,
+def run_other_old(pheno_vector,
         genotype_matrix,
         restricted_max_likelihood=True,
         refit=False,
@@ -270,7 +272,7 @@ def run_other(pheno_vector,
     
     """
     
-    print("In run_other")
+    print("Running the original LMM engine in run_other (old)")
     print("REML=",restricted_max_likelihood," REFIT=",refit)
     with Bench("Calculate Kinship"):
         kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata)
@@ -278,25 +280,79 @@ def run_other(pheno_vector,
     print("kinship_matrix: ", pf(kinship_matrix))
     print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
     
-    with Bench("Create LMM object"):
-        lmm_ob = LMM(pheno_vector, kinship_matrix)
+    # with Bench("Create LMM object"):
+    #     lmm_ob = LMM(pheno_vector, kinship_matrix)
     
-    with Bench("LMM_ob fitting"):
-        lmm_ob.fit()
+    # with Bench("LMM_ob fitting"):
+    #     lmm_ob.fit()
 
-    print("genotype_matrix: ", genotype_matrix.shape)
+    print("run_other_old genotype_matrix: ", genotype_matrix.shape)
     print(genotype_matrix)
 
     with Bench("Doing GWAS"):
         t_stats, p_values = GWAS(pheno_vector,
-                                genotype_matrix,
-                                kinship_matrix,
-                                restricted_max_likelihood=True,
-                                refit=False,
-                                temp_data=tempdata)
+                                      genotype_matrix,
+                                      kinship_matrix,
+                                      restricted_max_likelihood=True,
+                                      refit=False,
+                                      temp_data=tempdata)
     Bench().report()
     return p_values, t_stats
 
+def run_other_new(pheno_vector,
+        genotype_matrix,
+        restricted_max_likelihood=True,
+        refit=False,
+        tempdata=None      # <---- can not be None
+        ):
+    
+    """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
+    
+    restricted_max_likelihood -- whether to use restricted max likelihood; True or False
+    refit -- whether to refit the variance component for each marker
+    temp_data -- TempData object that stores the progress for each major step of the
+    calculations ("calculate_kinship" and "GWAS" take the majority of time) 
+    
+    """
+    
+    print("Running the new LMM2 engine in run_other_new")
+    print("REML=",restricted_max_likelihood," REFIT=",refit)
+
+    # Adjust phenotypes
+    Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True)
+    print("Removed missing phenotypes",Y.shape)
+
+    # if options.maf_normalization:
+    #     G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+    #     print "MAF replacements: \n",G
+    # if not options.skip_genotype_normalization:
+    # G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+
+    with Bench("Calculate Kinship"):
+        K,G = calculate_kinship(G, tempdata)
+    
+    print("kinship_matrix: ", pf(K))
+    print("kinship_matrix.shape: ", pf(K.shape))
+
+    # with Bench("Create LMM object"):
+    #     lmm_ob = lmm2.LMM2(Y,K)
+    # with Bench("LMM_ob fitting"):
+    #     lmm_ob.fit()
+
+    print("run_other_new genotype_matrix: ", G.shape)
+    print(G)
+
+    with Bench("Doing GWAS"):
+        t_stats, p_values = gwas.gwas(Y,
+                                      G.T,
+                                      K,
+                                      restricted_max_likelihood=True,
+                                      refit=False,verbose=True)
+    Bench().report()
+    return p_values, t_stats
+
+# def matrixMult(A,B):
+#     return np.dot(A,B)
 
 def matrixMult(A,B):
 
@@ -334,8 +390,10 @@ def calculate_kinship_new(genotype_matrix, temp_data=None):
     Call the new kinship calculation where genotype_matrix contains
     inds (columns) by snps (rows).
     """
+    print("call genotype.normalize")
     G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix)
-    return kinship(G.T),G
+    print("call calculate_kinship_new")
+    return kinship(G.T),G # G gets transposed, we'll turn this into an iterator (FIXME)
 
 def calculate_kinship_old(genotype_matrix, temp_data=None):
     """
@@ -345,6 +403,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
     normalizes the resulting vectors and returns the RRM matrix.
     
     """
+    print("call calculate_kinship_old")
     n = genotype_matrix.shape[0]
     m = genotype_matrix.shape[1]
     print("genotype 2D matrix n (inds) is:", n)
@@ -375,7 +434,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
             temp_data.store("percent_complete", percent_complete)
         
     genotype_matrix = genotype_matrix[:,keep]
-    print("genotype_matrix: ", pf(genotype_matrix))
+    print("After kinship (old) genotype_matrix: ", pf(genotype_matrix))
     kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m)
     return kinship_matrix,genotype_matrix
 
@@ -533,11 +592,13 @@ class LMM:
        print("this K is:", K.shape, pf(K))
        
        if len(Kva) == 0 or len(Kve) == 0:
-          if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+          # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
           begin = time.time()
-          Kva,Kve = linalg.eigh(K)
+          # Kva,Kve = linalg.eigh(K)
+          Kva,Kve = kvakve(K)
           end = time.time()
           if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+          print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
 
        self.K = K
        self.Kva = Kva
@@ -547,10 +608,11 @@ class LMM:
        self.Y = Y
        self.X0 = X0
        self.N = self.K.shape[0]
- 
-       if sum(self.Kva < 1e-6):
-          if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0)))
-          self.Kva[self.Kva < 1e-6] = 1e-6
+
+       # ----> Below moved to kinship.kvakve(K)
+       # if sum(self.Kva < 1e-6):
+       #    if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0)))
+       #    self.Kva[self.Kva < 1e-6] = 1e-6
  
        self.transform()
 
@@ -713,7 +775,10 @@ class LMM:
  
           ts = beta / np.sqrt(var * sigma)        
           ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q))
-          if not len(ts) == 1 or not len(ps) == 1: raise Exception("Something bad happened :(")
+          if not len(ts) == 1 or not len(ps) == 1:
+              print("ts=",ts)
+              print("ps=",ps)
+              raise Exception("Something bad happened :(")
           return ts.sum(),ps.sum()
 
     def plotFit(self,color='b-',title=''):
@@ -739,7 +804,11 @@ class LMM:
        pl.title(title)
 
 
-def gn2_redis(key,species):
+def gn2_redis(key,species,new_code=True):
+    """
+    Invoke pylmm using Redis as a container. new_code runs the new
+    version
+    """
     json_params = Redis.get(key)
     
     params = json.loads(json_params)
@@ -761,11 +830,18 @@ def gn2_redis(key,species):
         geno = np.array(params['genotype_matrix'])
         print('geno', geno.shape, geno)
 
-        ps, ts = run_other(pheno_vector = np.array(params['pheno_vector']),
-                  genotype_matrix = geno,
-                  restricted_max_likelihood = params['restricted_max_likelihood'],
-                  refit = params['refit'],
-                  tempdata = tempdata)
+        if new_code:
+            ps, ts = run_other_new(pheno_vector = np.array(params['pheno_vector']),
+                               genotype_matrix = geno,
+                               restricted_max_likelihood = params['restricted_max_likelihood'],
+                               refit = params['refit'],
+                               tempdata = tempdata)
+        else:
+            ps, ts = run_other_old(pheno_vector = np.array(params['pheno_vector']),
+                               genotype_matrix = geno,
+                               restricted_max_likelihood = params['restricted_max_likelihood'],
+                               refit = params['refit'],
+                               tempdata = tempdata)
         
     results_key = "pylmm:results:" + params['temp_uuid']
 
@@ -790,7 +866,7 @@ def gn2_main():
 
     gn2_redis(key,species)
 
-def gn2_load_redis(key,species,kinship,pheno,geno):
+def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True):
     print("Loading Redis from parsed data")
     if kinship == None:
         k = None
@@ -811,7 +887,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno):
     Redis.set(key, json_params)
     Redis.expire(key, 60*60)
 
-    return gn2_redis(key,species)
+    return gn2_redis(key,species,new_code)
     
 if __name__ == '__main__':
     print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!")
@@ -821,4 +897,3 @@ if __name__ == '__main__':
         print("Run from runlmm.py instead")
 
 
-
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
new file mode 100644
index 00000000..d4b3ac82
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -0,0 +1,410 @@
+# pylmm is a python-based linear mixed-model solver with applications to GWAS
+
+# Copyright (C) 2013,2014  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import time
+import numpy as np
+from scipy.linalg import eigh, inv, det
+import scipy.stats as stats # t-tests
+from scipy import optimize
+from optmatrix import matrixMult
+import kinship
+
+def calculateKinship(W,center=False):
+      """
+	 W is an n x m matrix encoding SNP minor alleles.
+
+	 This function takes a matrix oF SNPs, imputes missing values with the maf,
+	 normalizes the resulting vectors and returns the RRM matrix.
+      """
+      n = W.shape[0]
+      m = W.shape[1]
+      keep = []
+      for i in range(m):
+	 mn = W[True - np.isnan(W[:,i]),i].mean()
+	 W[np.isnan(W[:,i]),i] = mn
+	 vr = W[:,i].var()
+	 if vr == 0: continue
+
+	 keep.append(i)
+	 W[:,i] = (W[:,i] - mn) / np.sqrt(vr)
+
+      W = W[:,keep]
+      K = matrixMult(W,W.T) * 1.0/float(m)
+      if center:
+	 P = np.diag(np.repeat(1,n)) - 1/float(n) * np.ones((n,n))
+	 S = np.trace(matrixMult(matrixMult(P,K),P))
+	 K_n = (n - 1)*K / S
+	 return K_n
+      return K
+
+def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
+      """
+
+        Performs a basic GWAS scan using the LMM.  This function
+        uses the LMM module to assess association at each SNP and 
+        does some simple cleanup, such as removing missing individuals 
+        per SNP and re-computing the eigen-decomp
+
+	Y - n x 1 phenotype vector 
+        X - n x m SNP matrix (genotype matrix)
+	K - n x n kinship matrix
+        Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K
+        X0 - n x q covariate matrix
+	REML - use restricted maximum likelihood 
+        refit - refit the variance component for each SNP
+
+      """
+      n = X.shape[0]
+      m = X.shape[1]
+      prins("Initialize GWAS")
+      print("genotype matrix n is:", n)
+      print("genotype matrix m is:", m)
+
+      if X0 == None: 
+         X0 = np.ones((n,1))
+      
+      # Remove missing values in Y and adjust associated parameters
+      v = np.isnan(Y)
+      if v.sum():
+	 keep = True - v
+	 keep = keep.reshape((-1,))
+	 Y = Y[keep]
+	 X = X[keep,:]
+	 X0 = X0[keep,:]
+	 K = K[keep,:][:,keep]
+	 Kva = []
+	 Kve = []
+
+      if len(Y) == 0: 
+         return np.ones(m)*np.nan,np.ones(m)*np.nan
+
+      L = LMM(Y,K,Kva,Kve,X0)
+      if not refit: L.fit()
+
+      PS = []
+      TS = []
+
+      n = X.shape[0]
+      m = X.shape[1]
+
+      for i in range(m):
+	 x = X[:,i].reshape((n,1))
+	 v = np.isnan(x).reshape((-1,))
+	 if v.sum():
+	    keep = True - v
+	    xs = x[keep,:]
+	    if xs.var() == 0: 
+	       PS.append(np.nan) 
+	       TS.append(np.nan) 
+	       continue
+
+	    Ys = Y[keep]
+	    X0s = X0[keep,:]
+	    Ks = K[keep,:][:,keep]
+	    Ls = LMM(Ys,Ks,X0=X0s)
+	    if refit: 
+               Ls.fit(X=xs)
+	    else: 
+               Ls.fit()
+	    ts,ps = Ls.association(xs,REML=REML)
+	 else: 
+	    if x.var() == 0: 
+	       PS.append(np.nan) 
+	       TS.append(np.nan) 
+	       continue
+
+	    if refit: 
+               L.fit(X=x)
+	    ts,ps = L.association(x,REML=REML)
+	    
+	 PS.append(ps)
+	 TS.append(ts)
+
+      return TS,PS
+
+class LMM2:
+
+   """
+	 This is a simple version of EMMA/fastLMM.  
+	 The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
+	 and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
+	 There are three model parameters: heritability (h), covariate coefficients (beta) and the total
+	 phenotypic variance (sigma).
+	 Heritability as defined here is the proportion of the total variance (sigma) that is attributed to 
+	 the kinship matrix.
+
+	 For simplicity, we assume that everything being input is a numpy array.
+	 If this is not the case, the module may throw an error as conversion from list to numpy array
+	 is not done consistently.
+
+   """
+   def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False):
+
+      """
+      The constructor takes a phenotype vector or array Y of size n.
+      It takes a kinship matrix K of size n x n.  Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
+      If they are not provided, the constructor will calculate them.
+      X0 is an optional covariate matrix of size n x q, where there are q covariates.
+      When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
+      """
+
+      if X0 == None: 
+	 X0 = np.ones(len(Y)).reshape(len(Y),1)
+      self.verbose = verbose
+
+      x = True - np.isnan(Y)
+      x = x.reshape(-1,)
+      if not x.sum() == len(Y):
+	 if self.verbose: sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum()))
+	 Y = Y[x]
+	 K = K[x,:][:,x]
+	 X0 = X0[x,:]
+	 Kva = []
+	 Kve = []
+      self.nonmissing = x
+
+      print("this K is:", K.shape, K)
+      
+      if len(Kva) == 0 or len(Kve) == 0:
+          # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+          begin = time.time()
+          # Kva,Kve = linalg.eigh(K)
+          Kva,Kve = kinship.kvakve(K)
+          end = time.time()
+          if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+          print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
+
+      self.K = K
+      self.Kva = Kva
+      self.Kve = Kve
+      self.N = self.K.shape[0]
+      self.Y = Y.reshape((self.N,1))
+      self.X0 = X0
+
+      if sum(self.Kva < 1e-6):
+         if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0)))
+         self.Kva[self.Kva < 1e-6] = 1e-6
+
+      self.transform()
+
+   def transform(self):
+
+      """
+	 Computes a transformation on the phenotype vector and the covariate matrix.
+	 The transformation is obtained by left multiplying each parameter by the transpose of the 
+	 eigenvector matrix of K (the kinship).
+      """
+
+      self.Yt = matrixMult(self.Kve.T, self.Y)
+      self.X0t = matrixMult(self.Kve.T, self.X0)
+      self.X0t_stack = np.hstack([self.X0t, np.ones((self.N,1))])
+      self.q = self.X0t.shape[1]
+
+   def getMLSoln(self,h,X):
+
+      """
+	 Obtains the maximum-likelihood estimates for the covariate coefficients (beta),
+	 the total variance of the trait (sigma) and also passes intermediates that can 
+	 be utilized in other functions. The input parameter h is a value between 0 and 1 and represents
+	 the heritability or the proportion of the total variance attributed to genetics.  The X is the 
+	 covariate matrix.
+      """
+   
+      S = 1.0/(h*self.Kva + (1.0 - h))
+      Xt = X.T*S
+      XX = matrixMult(Xt,X)
+      XX_i = inv(XX)
+      beta =  matrixMult(matrixMult(XX_i,Xt),self.Yt)
+      Yt = self.Yt - matrixMult(X,beta)
+      Q = np.dot(Yt.T*S,Yt)
+      sigma = Q * 1.0 / (float(self.N) - float(X.shape[1]))
+      return beta,sigma,Q,XX_i,XX
+
+   def LL_brent(self,h,X=None,REML=False): 
+      #brent will not be bounded by the specified bracket.
+      # I return a large number if we encounter h < 0 to avoid errors in LL computation during the search.
+      if h < 0: return 1e6
+      return -self.LL(h,X,stack=False,REML=REML)[0]
+	 
+   def LL(self,h,X=None,stack=True,REML=False):
+
+      """
+	 Computes the log-likelihood for a given heritability (h).  If X==None, then the 
+	 default X0t will be used.  If X is set and stack=True, then X0t will be matrix concatenated with
+	 the input X.  If stack is false, then X is used in place of X0t in the LL calculation.
+	 REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True.
+      """
+
+      if X == None: X = self.X0t
+      elif stack: 
+	 self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+	 X = self.X0t_stack
+
+      n = float(self.N)
+      q = float(X.shape[1])
+      beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X)
+      LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q)
+      LL = -0.5 * LL
+
+      if REML:
+	 LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(det(matrixMult(X.T,X))) - np.log(det(XX))
+	 LL = LL + 0.5*LL_REML_part
+
+
+      LL = LL.sum()
+      return LL,beta,sigma,XX_i
+
+   def getMax(self,H, X=None,REML=False):
+
+      """
+	 Helper functions for .fit(...).  
+	 This function takes a set of LLs computed over a grid and finds possible regions 
+	 containing a maximum.  Within these regions, a Brent search is performed to find the 
+	 optimum.
+
+      """
+      n = len(self.LLs)
+      HOpt = []
+      for i in range(1,n-2):
+          if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]:
+	    HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
+	    if np.isnan(HOpt[-1]): HOpt[-1] = H[i-1]
+	    #if np.isnan(HOpt[-1]): HOpt[-1] = self.LLs[i-1]
+	    #if np.isnan(HOpt[-1][0]): HOpt[-1][0] = [self.LLs[i-1]]
+
+      if len(HOpt) > 1: 
+	 if self.verbose: sys.stderr.write("NOTE: Found multiple optima.  Returning first...\n")
+	 return HOpt[0]
+      elif len(HOpt) == 1: return HOpt[0]
+      elif self.LLs[0] > self.LLs[n-1]: return H[0]
+      else: return H[n-1]
+
+
+   def fit(self,X=None,ngrids=100,REML=True):
+
+      """
+	 Finds the maximum-likelihood solution for the heritability (h) given the current parameters.
+	 X can be passed and will transformed and concatenated to X0t.  Otherwise, X0t is used as 
+	 the covariate matrix.
+
+	 This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum.
+	 Given this optimum, the function computes the LL and associated ML solutions.
+      """
+      
+      if X == None: X = self.X0t
+      else: 
+	 #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
+	 self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+	 X = self.X0t_stack
+
+      H = np.array(range(ngrids)) / float(ngrids)
+      L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H])
+      self.LLs = L
+
+      hmax = self.getMax(H,X,REML)
+      L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML)
+      
+      self.H = H
+      self.optH = hmax.sum()
+      self.optLL = L
+      self.optBeta = beta
+      self.optSigma = sigma.sum()
+
+      return hmax,beta,sigma,L
+
+   def association(self,X,h=None,stack=True,REML=True,returnBeta=False):
+      """
+	Calculates association statitics for the SNPs encoded in the vector X of size n.
+	If h == None, the optimal h stored in optH is used.
+
+      """
+      if False:
+         print "X=",X
+         print "h=",h
+         print "q=",self.q
+         print "self.Kve=",self.Kve
+         print "X0t_stack=",self.X0t_stack.shape,self.X0t_stack
+      
+      if stack:
+	 # X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
+         m = matrixMult(self.Kve.T,X)
+         # print "m=",m
+         m = m[:,0]
+         self.X0t_stack[:,(self.q)] = m
+	 X = self.X0t_stack
+	 
+      if h == None: h = self.optH
+
+      L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML)
+      q  = len(beta)
+      ts,ps = self.tstat(beta[q-1],betaVAR[q-1,q-1],sigma,q)
+      
+      if returnBeta: return ts,ps,beta[q-1].sum(),betaVAR[q-1,q-1].sum()*sigma
+      return ts,ps
+
+   def tstat(self,beta,var,sigma,q,log=False): 
+
+	 """
+	    Calculates a t-statistic and associated p-value given the estimate of beta and its standard error.
+	    This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test.
+	 """
+
+	 ts = beta / np.sqrt(var * sigma)	 
+	 #ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q))
+	 # sf == survival function - this is more accurate -- could also use logsf if the precision is not good enough
+	 if log:
+	    ps = 2.0 + (stats.t.logsf(np.abs(ts), self.N-q))
+	 else:
+	    ps = 2.0*(stats.t.sf(np.abs(ts), self.N-q))
+	 if not len(ts) == 1 or not len(ps) == 1: 
+	    raise Exception("Something bad happened :(")
+	 return ts.sum(),ps.sum()
+
+   def plotFit(self,color='b-',title=''):
+
+      """
+	 Simple function to visualize the likelihood space.  It takes the LLs 
+	 calcualted over a grid and normalizes them by subtracting off the mean and exponentiating.
+	 The resulting "probabilities" are normalized to one and plotted against heritability.
+	 This can be seen as an approximation to the posterior distribuiton of heritability.
+
+	 For diagnostic purposes this lets you see if there is one distinct maximum or multiple 
+	 and what the variance of the parameter looks like.
+      """
+      import matplotlib.pyplot as pl
+
+      mx = self.LLs.max()
+      p = np.exp(self.LLs - mx)
+      p = p/p.sum()
+
+      pl.plot(self.H,p,color)
+      pl.xlabel("Heritability")
+      pl.ylabel("Probability of data")
+      pl.title(title)
+   
+   def meanAndVar(self):
+
+      mx = self.LLs.max()
+      p = np.exp(self.LLs - mx)
+      p = p/p.sum()
+
+      mn = (self.H * p).sum()
+      vx = ((self.H - mn)**2 * p).sum()
+
+      return mn,vx
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
index bb620052..682ba371 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -36,5 +36,5 @@ def remove_missing(y,g,verbose=False):
             sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
         y1 = y[keep]
         g1 = g[keep,:]
-    return y1,g1
+    return y1,g1,keep
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 6bb79856..324c4f2c 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -21,7 +21,7 @@ from optparse import OptionParser
 import sys
 import tsvreader
 import numpy as np
-from lmm import gn2_load_redis, calculate_kinship
+from lmm import gn2_load_redis, calculate_kinship_old
 from kinship import kinship, kinship_full
 import genotype
 import phenotype
@@ -54,9 +54,12 @@ parser.add_option("--geno",dest="geno",
 parser.add_option("--maf-normalization",
                   action="store_true", dest="maf_normalization", default=False,
                   help="Apply MAF genotype normalization")
-parser.add_option("--skip-genotype-normalization",
-                  action="store_true", dest="skip_genotype_normalization", default=False,
-                  help="Skip genotype normalization")
+parser.add_option("--genotype-normalization",
+                  action="store_true", dest="genotype_normalization", default=False,
+                  help="Force genotype normalization")
+parser.add_option("--remove-missing-phenotypes",
+                  action="store_true", dest="remove_missing_phenotypes", default=False,
+                  help="Remove missing phenotypes")
 parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
                   help="don't print status messages to stdout")
@@ -99,28 +102,58 @@ if options.geno:
     g = tsvreader.geno(options.geno)
     print g.shape
 
-if cmd == 'redis':
+if cmd == 'redis_new':
+    # The main difference between redis_new and redis is that missing
+    # phenotypes are handled by the first
+    Y = y
+    G = g
+    print "Original G",G.shape, "\n", G
+
+    gt = G.T
+    G = None
+    ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True)
+    print np.array(ps)
+    print len(ps),sum(ps)
+    # Test results
+    p1 = round(ps[0],4)
+    p2 = round(ps[-1],4)
+    sys.stderr.write(options.geno+"\n")
+    if options.geno == 'data/small.geno':
+        assert p1==0.0708, "p1=%f" % p1
+        assert p2==0.1417, "p2=%f" % p2
+    if options.geno == 'data/small_na.geno':
+        assert p1==0.0897, "p1=%f" % p1
+        assert p2==0.0405, "p2=%f" % p2
+    if options.geno == 'data/test8000.geno':
+        # assert p1==0.8984, "p1=%f" % p1
+        # assert p2==0.9621, "p2=%f" % p2
+        assert round(sum(ps)) == 4070
+        assert len(ps) == 8000
+elif cmd == 'redis':
     # Emulating the redis setup of GN2
     G = g
     print "Original G",G.shape, "\n", G
-    if y != None:
+    if y != None and options.remove_missing_phenotypes:
         gnt = np.array(g).T
-        Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+        Y,g,keep = phenotype.remove_missing(y,g.T,options.verbose)
         G = g.T
         print "Removed missing phenotypes",G.shape, "\n", G
+    else:
+        Y = y
     if options.maf_normalization:
         G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
         print "MAF replacements: \n",G
-    if not options.skip_genotype_normalization:
+    if options.genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
     g = None
     gnt = None
 
     gt = G.T
     G = None
-    ps, ts = gn2_load_redis('testrun','other',k,Y,gt)
+    ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False)
     print np.array(ps)
-    # Test results
+    print len(ps),sum(ps)
+    # Test results 4070.02346579
     p1 = round(ps[0],4)
     p2 = round(ps[-1],4)
     sys.stderr.write(options.geno+"\n")
@@ -128,15 +161,15 @@ if cmd == 'redis':
         assert p1==0.0708, "p1=%f" % p1
         assert p2==0.1417, "p2=%f" % p2
     if options.geno == 'data/small_na.geno':
-        assert p1==0.0958, "p1=%f" % p1
-        assert p2==0.0435, "p2=%f" % p2
+        assert p1==0.0897, "p1=%f" % p1
+        assert p2==0.0405, "p2=%f" % p2
     if options.geno == 'data/test8000.geno':
-        assert p1==0.8984, "p1=%f" % p1
-        assert p2==0.9623, "p2=%f" % p2
+        assert round(sum(ps)) == 4070
+        assert len(ps) == 8000
 elif cmd == 'kinship':
     G = g
     print "Original G",G.shape, "\n", G
-    if y != None:
+    if y != None and options.remove_missing_phenotypes:
         gnt = np.array(g).T
         Y,g = phenotype.remove_missing(y,g.T,options.verbose)
         G = g.T
@@ -144,32 +177,32 @@ elif cmd == 'kinship':
     if options.maf_normalization:
         G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
         print "MAF replacements: \n",G
-    if not options.skip_genotype_normalization:
+    if options.genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
     g = None
     gnt = None
 
     if options.test_kinship:
-        K = kinship_full(G)
+        K = kinship_full(np.copy(G))
         print "Genotype",G.shape, "\n", G
         print "first Kinship method",K.shape,"\n",K
         k1 = round(K[0][0],4)
-        K2 = calculate_kinship(np.copy(G.T),temp_data=None)
+        K2,G = calculate_kinship_old(np.copy(G).T,temp_data=None)
         print "Genotype",G.shape, "\n", G
         print "GN2 Kinship method",K2.shape,"\n",K2
         k2 = round(K2[0][0],4)
     
     print "Genotype",G.shape, "\n", G
-    K3 = kinship(np.copy(G),options)
+    K3 = kinship(G.T)
     print "third Kinship method",K3.shape,"\n",K3
     sys.stderr.write(options.geno+"\n")
     k3 = round(K3[0][0],4)
     if options.geno == 'data/small.geno':
-        assert k1==0.7939, "k1=%f" % k1
+        assert k1==0.8, "k1=%f" % k1
         assert k2==0.7939, "k2=%f" % k2
         assert k3==0.7939, "k3=%f" % k3
     if options.geno == 'data/small_na.geno':
-        assert k1==0.7172, "k1=%f" % k1
+        assert k1==0.8333, "k1=%f" % k1
         assert k2==0.7172, "k2=%f" % k2
         assert k3==0.7172, "k3=%f" % k3
     if options.geno == 'data/test8000.geno':
author	zsloan	2015-04-06 10:50:59 -0500
committer	zsloan	2015-04-06 10:50:59 -0500
commit	24fe675688467b77cb0b1e858e23f119027d08d8 (patch)
tree	fdc0ea6440f342f9bd2a3ed8946998859efebc9b
parent	5fb4245a3d08f7107cf885639fc275a6d73ea82d (diff)
parent	67d7d982f96cfd0f3dcb9806ecc6d2f947af9dc9 (diff)
download	genenetwork2-24fe675688467b77cb0b1e858e23f119027d08d8.tar.gz