From dbc048f06a50837f59ce4d2ee7cfe84ab8e6062f Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sun, 15 Mar 2015 14:41:14 +0300
Subject: Comment

---
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 5aa27106..1fa7a895 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -334,7 +334,7 @@ def calculate_kinship_new(genotype_matrix, temp_data=None):
     inds (columns) by snps (rows).
     """
     G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix)
-    return kinship(G.T),G
+    return kinship(G.T),G # G gets transposed, we'll turn this into an iterator (FIXME)
 
 def calculate_kinship_old(genotype_matrix, temp_data=None):
     """
-- 
cgit 1.4.1


From 8463812f328973190b5a5160bdcff47a7f1cd12d Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Mon, 16 Mar 2015 12:41:43 +0300
Subject: Refactoring eigenvalue KvaKve

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 20 ++++++++++++++++++++
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py     | 13 +++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 28f2042d..bb4f5ace 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -21,10 +21,12 @@
 import sys
 import os
 import numpy as np
+from scipy import linalg
 import multiprocessing as mp # Multiprocessing is part of the Python stdlib
 import Queue
 import time
 
+
 from optmatrix import matrix_initialize, matrixMultT
 
 def kinship_full(G):
@@ -153,5 +155,23 @@ def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True):
    #    np.savetxt(outFile+".kve",Kve)
    return K      
 
+def kvakve(K, verbose=True):
+   """
+   Obtain eigendecomposition for K and return Kva,Kve where Kva is cleaned
+   of small values < 1e-6 (notably smaller than zero)
+   """
+   if verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+   
+   Kva,Kve = linalg.eigh(K)
+   if verbose:
+      print("self.Kva is: ", Kva.shape, Kva)
+      print("self.Kve is: ", Kve.shape, Kve)
+
+   if sum(Kva < 1e-6):
+      if verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(Kva < 1e-6)))
+      Kva[Kva < 1e-6] = 1e-6
+   return Kva,Kve
+
+
 
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 1fa7a895..17fe952a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -49,7 +49,7 @@ has_gn2=True
 
 from utility.benchmark import Bench
 from utility import temp_data
-from kinship import kinship, kinship_full
+from kinship import kinship, kinship_full, kvakve
 import genotype
 
 try:
@@ -532,11 +532,12 @@ class LMM:
        print("this K is:", K.shape, pf(K))
        
        if len(Kva) == 0 or len(Kve) == 0:
-          if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
-          begin = time.time()
-          Kva,Kve = linalg.eigh(K)
-          end = time.time()
-          if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+          # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+          # begin = time.time()
+          # Kva,Kve = linalg.eigh(K)
+          # end = time.time()
+          # if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+          Kva,Kve = kvakve(K)
 
        self.K = K
        self.Kva = Kva
-- 
cgit 1.4.1


From 3cff3091379a2e8835027c272ada0d8d4314624e Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Mon, 16 Mar 2015 13:15:21 +0300
Subject: Output added

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py |  4 ++--
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py     | 16 +++++++++-------
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py  |  1 +
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index bb4f5ace..cd2445f1 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -164,8 +164,8 @@ def kvakve(K, verbose=True):
    
    Kva,Kve = linalg.eigh(K)
    if verbose:
-      print("self.Kva is: ", Kva.shape, Kva)
-      print("self.Kve is: ", Kve.shape, Kve)
+      print("Kva is: ", Kva.shape, Kva)
+      print("Kve is: ", Kve.shape, Kve)
 
    if sum(Kva < 1e-6):
       if verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(Kva < 1e-6)))
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 17fe952a..9fd05b42 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -533,11 +533,12 @@ class LMM:
        
        if len(Kva) == 0 or len(Kve) == 0:
           # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
-          # begin = time.time()
+          begin = time.time()
           # Kva,Kve = linalg.eigh(K)
-          # end = time.time()
-          # if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
           Kva,Kve = kvakve(K)
+          end = time.time()
+          if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+          print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
 
        self.K = K
        self.Kva = Kva
@@ -547,10 +548,11 @@ class LMM:
        self.Y = Y
        self.X0 = X0
        self.N = self.K.shape[0]
- 
-       if sum(self.Kva < 1e-6):
-          if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0)))
-          self.Kva[self.Kva < 1e-6] = 1e-6
+
+       # ----> Below moved to kinship.kvakve(K)
+       # if sum(self.Kva < 1e-6):
+       #    if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0)))
+       #    self.Kva[self.Kva < 1e-6] = 1e-6
  
        self.transform()
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 6bb79856..cef0cdc4 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -120,6 +120,7 @@ if cmd == 'redis':
     G = None
     ps, ts = gn2_load_redis('testrun','other',k,Y,gt)
     print np.array(ps)
+    print sum(ps)
     # Test results
     p1 = round(ps[0],4)
     p2 = round(ps[-1],4)
-- 
cgit 1.4.1


From 62f6db77b22e1fb28b3355d75a30f9ecf6c94d95 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Mon, 16 Mar 2015 15:34:38 +0300
Subject: Adding multi-core GWAS

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py    | 213 ++++++++++++++++++++++++++++++
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py |   4 +-
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py     |  62 +++++++--
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py  |   8 +-
 4 files changed, 271 insertions(+), 16 deletions(-)
 create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/gwas.py

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
new file mode 100644
index 00000000..52455014
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -0,0 +1,213 @@
+# pylmm-based GWAS calculation
+#
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#!/usr/bin/python
+
+import pdb
+import time
+import sys
+# from utility import temp_data
+import lmm
+
+
+import os
+import numpy as np
+import input
+from optmatrix import matrix_initialize
+# from lmm import LMM
+
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue 
+
+def compute_snp(j,snp_ids,q = None):
+   # print(j,len(snp_ids),"\n")
+   result = []
+   for snp_id in snp_ids:
+      # j,snp_id = collect
+      snp,id = snp_id
+      # id = collect[1]
+      # result = []
+      # Check SNPs for missing values
+      x = snp[keep].reshape((n,1))  # all the SNPs
+      v = np.isnan(x).reshape((-1,))
+      if v.sum():
+         # NOTE: this code appears to be unreachable!
+         if verbose:
+            sys.stderr.write("Found missing values in "+str(x))
+         keeps = True - v
+         xs = x[keeps,:]
+         if keeps.sum() <= 1 or xs.var() <= 1e-6: 
+            # PS.append(np.nan)
+            # TS.append(np.nan)
+            # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
+            # continue
+            result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
+            continue
+
+         # Its ok to center the genotype -  I used normalizeGenotype to 
+         # force the removal of missing genotypes as opposed to replacing them with MAF.
+         if not normalizeGenotype:
+            xs = (xs - xs.mean()) / np.sqrt(xs.var())
+         Ys = Y[keeps]
+         X0s = X0[keeps,:]
+         Ks = K[keeps,:][:,keeps]
+         if kfile2:
+            K2s = K2[keeps,:][:,keeps]
+            Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=verbose,K2=K2s)
+         else:
+            Ls = LMM(Ys,Ks,X0=X0s,verbose=verbose)
+         if refit:
+           Ls.fit(X=xs,REML=REML)
+         else:
+            #try:
+            Ls.fit(REML=REML)
+            #except: pdb.set_trace()
+         ts,ps,beta,betaVar = Ls.association(xs,REML=REML,returnBeta=True)
+      else:
+         if x.var() == 0:
+            # Note: this code appears to be unreachable!
+
+            # PS.append(np.nan)
+            # TS.append(np.nan)
+            # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values
+            result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
+            continue
+
+         if refit:
+            L.fit(X=x,REML=REML)
+         # This is where it happens
+         ts,ps,beta,betaVar = L.association(x,REML=REML,returnBeta=True)
+      result.append(formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps))
+      # compute_snp.q.put([j,formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)])
+   # print [j,result[0]]," in result queue\n"
+   if not q:
+      q = compute_snp.q
+   q.put([j,result])
+   return j
+      # PS.append(ps)
+      # TS.append(ts)
+      # return len(result)
+      # compute.q.put(result)
+      # return None
+
+def f_init(q):
+   compute_snp.q = q
+
+def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
+   """
+   Execute a GWAS. The G matrix should be n inds (cols) x m snps (rows)
+   """
+   matrix_initialize()
+   cpu_num = mp.cpu_count()
+   numThreads = 1
+   kfile2 = False
+
+   sys.stderr.write(str(G.shape)+"\n")
+   n = G.shape[1] # inds
+   inds = n
+   m = G.shape[0] # snps
+   snps = m
+   sys.stderr.write(str(m)+" SNPs\n")
+   # print "***** GWAS: G",G.shape,G
+   assert m>n, "n should be larger than m (snps>inds)"
+
+   # CREATE LMM object for association
+   # if not kfile2:  L = LMM(Y,K,Kva,Kve,X0,verbose=verbose)
+   # else:  L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=verbose,K2=K2)
+
+   runlmm = lmm.LMM(Y,K) # ,Kva,Kve,X0,verbose=verbose)
+   if not refit:
+      if verbose: sys.stderr.write("Computing fit for null model\n")
+      runlmm.fit()  # follow GN model in run_other
+      if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (runlmm.optH,runlmm.optSigma))
+      
+   # outFile = "test.out"
+   # out = open(outFile,'w')
+   out = sys.stderr
+
+   def outputResult(id,beta,betaSD,ts,ps):
+      out.write(formatResult(id,beta,betaSD,ts,ps))
+   def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
+   def formatResult(id,beta,betaSD,ts,ps):
+      return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
+
+   printOutHead()
+
+   # Set up the pool
+   # mp.set_start_method('spawn')
+   q = mp.Queue()
+   p = mp.Pool(numThreads, f_init, [q])
+   collect = []
+
+   # Buffers for pvalues and t-stats
+   # PS = []
+   # TS = []
+   count = 0
+
+   completed = 0
+   last_j = 0
+   # for snp_id in G:
+   for snp in G:
+      print snp.shape,snp
+      snp_id = ('SNPID',snp)
+      count += 1
+      if count % 1000 == 0:
+         job = count/1000
+         if verbose:
+            sys.stderr.write("Job %d At SNP %d\n" % (job,count))
+         if numThreads == 1:
+            compute_snp(job,collect,q)
+            collect = []
+            j,lines = q.get()
+            if verbose:
+               sys.stderr.write("Job "+str(j)+" finished\n")
+            for line in lines:
+               out.write(line)
+         else:
+            p.apply_async(compute_snp,(job,collect))
+            collect = []
+            while job > completed:
+               try:
+                  j,lines = q.get_nowait()
+                  if verbose:
+                     sys.stderr.write("Job "+str(j)+" finished\n")
+                  for line in lines:
+                     out.write(line)
+                  completed += 1
+               except Queue.Empty:
+                  pass
+               if job > completed + cpu_num + 5:
+                  time.sleep(1)
+               else:
+                  if job >= completed:
+                    break
+
+      collect.append(snp_id)
+
+   if not numThreads or numThreads > 1:
+      for job in range(int(count/1000)-completed):
+         j,lines = q.get(True,15) # time out
+         if verbose:
+            sys.stderr.write("Job "+str(j)+" finished\n")
+         for line in lines:
+            out.write(line)
+
+   # print collect
+   ps = [item[1][3] for item in collect]
+   ts = [item[1][2] for item in collect]
+   print ps
+   return ts,ps
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index cd2445f1..00f48939 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -26,7 +26,6 @@ import multiprocessing as mp # Multiprocessing is part of the Python stdlib
 import Queue
 import time
 
-
 from optmatrix import matrix_initialize, matrixMultT
 
 def kinship_full(G):
@@ -87,12 +86,13 @@ def kinship(G,computeSize=1000,numThreads=None,useBLAS=False,verbose=True):
    m = G.shape[0] # snps
    snps = m
    sys.stderr.write(str(m)+" SNPs\n")
-   assert m>n, "n should be larger than m (snps>inds)"
+   assert snps>inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds)
 
    q = mp.Queue()
    p = mp.Pool(numThreads, f_init, [q])
    cpu_num = mp.cpu_count()
    print "CPU cores:",cpu_num
+   print snps,computeSize
    iterations = snps/computeSize+1
    # if testing:
    #    iterations = 8
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 9fd05b42..8c6d3c3c 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -51,6 +51,7 @@ from utility.benchmark import Bench
 from utility import temp_data
 from kinship import kinship, kinship_full, kvakve
 import genotype
+import gwas
 
 try:
     from wqflask.my_pylmm.pyLMM import chunks
@@ -253,7 +254,7 @@ def human_association(snp,
 #        refit=False,
 #        temp_data=None):
     
-def run_other(pheno_vector,
+def run_other_old(pheno_vector,
         genotype_matrix,
         restricted_max_likelihood=True,
         refit=False,
@@ -269,7 +270,7 @@ def run_other(pheno_vector,
     
     """
     
-    print("In run_other")
+    print("In run_other (old)")
     print("REML=",restricted_max_likelihood," REFIT=",refit)
     with Bench("Calculate Kinship"):
         kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata)
@@ -277,9 +278,51 @@ def run_other(pheno_vector,
     print("kinship_matrix: ", pf(kinship_matrix))
     print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
     
+    # with Bench("Create LMM object"):
+    #     lmm_ob = LMM(pheno_vector, kinship_matrix)
+    
+    # with Bench("LMM_ob fitting"):
+    #     lmm_ob.fit()
+
+    print("genotype_matrix: ", genotype_matrix.shape)
+    print(genotype_matrix)
+
+    with Bench("Doing GWAS"):
+        t_stats, p_values = GWAS(pheno_vector,
+                                      genotype_matrix,
+                                      kinship_matrix,
+                                      restricted_max_likelihood=True,
+                                      refit=False,
+                                      temp_data=tempdata)
+    Bench().report()
+    return p_values, t_stats
+
+def run_other_new(pheno_vector,
+        genotype_matrix,
+        restricted_max_likelihood=True,
+        refit=False,
+        tempdata=None      # <---- can not be None
+        ):
+    
+    """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
+    
+    restricted_max_likelihood -- whether to use restricted max likelihood; True or False
+    refit -- whether to refit the variance component for each marker
+    temp_data -- TempData object that stores the progress for each major step of the
+    calculations ("calculate_kinship" and "GWAS" take the majority of time) 
+    
+    """
+    
+    print("In run_other (new)")
+    print("REML=",restricted_max_likelihood," REFIT=",refit)
+    with Bench("Calculate Kinship"):
+        kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata)
+    
+    print("kinship_matrix: ", pf(kinship_matrix))
+    print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
+
     with Bench("Create LMM object"):
         lmm_ob = LMM(pheno_vector, kinship_matrix)
-    
     with Bench("LMM_ob fitting"):
         lmm_ob.fit()
 
@@ -287,15 +330,15 @@ def run_other(pheno_vector,
     print(genotype_matrix)
 
     with Bench("Doing GWAS"):
-        t_stats, p_values = GWAS(pheno_vector,
-                                genotype_matrix,
-                                kinship_matrix,
-                                restricted_max_likelihood=True,
-                                refit=False,
-                                temp_data=tempdata)
+        t_stats, p_values = gwas.gwas(pheno_vector,
+                                      genotype_matrix.T,
+                                      kinship_matrix,
+                                      restricted_max_likelihood=True,
+                                      refit=False,verbose=True)
     Bench().report()
     return p_values, t_stats
 
+run_other = run_other_old
 
 def matrixMult(A,B):
 
@@ -821,4 +864,3 @@ if __name__ == '__main__':
         print("Run from runlmm.py instead")
 
 
-
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index cef0cdc4..f17f1bd1 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -21,7 +21,7 @@ from optparse import OptionParser
 import sys
 import tsvreader
 import numpy as np
-from lmm import gn2_load_redis, calculate_kinship
+from lmm import gn2_load_redis, calculate_kinship_old
 from kinship import kinship, kinship_full
 import genotype
 import phenotype
@@ -151,17 +151,17 @@ elif cmd == 'kinship':
     gnt = None
 
     if options.test_kinship:
-        K = kinship_full(G)
+        K = kinship_full(np.copy(G))
         print "Genotype",G.shape, "\n", G
         print "first Kinship method",K.shape,"\n",K
         k1 = round(K[0][0],4)
-        K2 = calculate_kinship(np.copy(G.T),temp_data=None)
+        K2,G = calculate_kinship_old(np.copy(G).T,temp_data=None)
         print "Genotype",G.shape, "\n", G
         print "GN2 Kinship method",K2.shape,"\n",K2
         k2 = round(K2[0][0],4)
     
     print "Genotype",G.shape, "\n", G
-    K3 = kinship(np.copy(G),options)
+    K3 = kinship(G.T)
     print "third Kinship method",K3.shape,"\n",K3
     sys.stderr.write(options.geno+"\n")
     k3 = round(K3[0][0],4)
-- 
cgit 1.4.1


From cc419336f559a51ed03a669d19042e6fd21355ed Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Mon, 16 Mar 2015 17:34:40 +0300
Subject: Adding GWAS code

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py      |  99 ++-----
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py       |  44 ++-
 wqflask/wqflask/my_pylmm/pyLMM/lmm2.py      | 405 ++++++++++++++++++++++++++++
 wqflask/wqflask/my_pylmm/pyLMM/phenotype.py |   2 +-
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py    |  26 +-
 5 files changed, 490 insertions(+), 86 deletions(-)
 create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/lmm2.py

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index 52455014..b9d5db61 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -21,79 +21,30 @@ import pdb
 import time
 import sys
 # from utility import temp_data
-import lmm
-
+import lmm2
 
 import os
 import numpy as np
 import input
 from optmatrix import matrix_initialize
-# from lmm import LMM
+from lmm2 import LMM2
 
 import multiprocessing as mp # Multiprocessing is part of the Python stdlib
 import Queue 
 
-def compute_snp(j,snp_ids,q = None):
-   # print(j,len(snp_ids),"\n")
+def formatResult(id,beta,betaSD,ts,ps):
+   return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
+
+def compute_snp(j,n,snp_ids,lmm2,REML,q = None):
+   # print(j,snp_ids,"\n")
    result = []
    for snp_id in snp_ids:
-      # j,snp_id = collect
       snp,id = snp_id
-      # id = collect[1]
-      # result = []
-      # Check SNPs for missing values
-      x = snp[keep].reshape((n,1))  # all the SNPs
-      v = np.isnan(x).reshape((-1,))
-      if v.sum():
-         # NOTE: this code appears to be unreachable!
-         if verbose:
-            sys.stderr.write("Found missing values in "+str(x))
-         keeps = True - v
-         xs = x[keeps,:]
-         if keeps.sum() <= 1 or xs.var() <= 1e-6: 
-            # PS.append(np.nan)
-            # TS.append(np.nan)
-            # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
-            # continue
-            result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
-            continue
-
-         # Its ok to center the genotype -  I used normalizeGenotype to 
-         # force the removal of missing genotypes as opposed to replacing them with MAF.
-         if not normalizeGenotype:
-            xs = (xs - xs.mean()) / np.sqrt(xs.var())
-         Ys = Y[keeps]
-         X0s = X0[keeps,:]
-         Ks = K[keeps,:][:,keeps]
-         if kfile2:
-            K2s = K2[keeps,:][:,keeps]
-            Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=verbose,K2=K2s)
-         else:
-            Ls = LMM(Ys,Ks,X0=X0s,verbose=verbose)
-         if refit:
-           Ls.fit(X=xs,REML=REML)
-         else:
-            #try:
-            Ls.fit(REML=REML)
-            #except: pdb.set_trace()
-         ts,ps,beta,betaVar = Ls.association(xs,REML=REML,returnBeta=True)
-      else:
-         if x.var() == 0:
-            # Note: this code appears to be unreachable!
-
-            # PS.append(np.nan)
-            # TS.append(np.nan)
-            # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values
-            result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
-            continue
-
-         if refit:
-            L.fit(X=x,REML=REML)
-         # This is where it happens
-         ts,ps,beta,betaVar = L.association(x,REML=REML,returnBeta=True)
+      x = snp.reshape((n,1))  # all the SNPs
+      # if refit:
+      #    L.fit(X=snp,REML=REML)
+      ts,ps,beta,betaVar = lmm2.association(x,REML=REML,returnBeta=True)
       result.append(formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps))
-      # compute_snp.q.put([j,formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)])
-   # print [j,result[0]]," in result queue\n"
    if not q:
       q = compute_snp.q
    q.put([j,result])
@@ -113,8 +64,9 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    """
    matrix_initialize()
    cpu_num = mp.cpu_count()
-   numThreads = 1
+   numThreads = None
    kfile2 = False
+   reml = restricted_max_likelihood
 
    sys.stderr.write(str(G.shape)+"\n")
    n = G.shape[1] # inds
@@ -123,17 +75,17 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    snps = m
    sys.stderr.write(str(m)+" SNPs\n")
    # print "***** GWAS: G",G.shape,G
-   assert m>n, "n should be larger than m (snps>inds)"
+   assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
 
    # CREATE LMM object for association
    # if not kfile2:  L = LMM(Y,K,Kva,Kve,X0,verbose=verbose)
    # else:  L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=verbose,K2=K2)
 
-   runlmm = lmm.LMM(Y,K) # ,Kva,Kve,X0,verbose=verbose)
+   lmm2 = LMM2(Y,K) # ,Kva,Kve,X0,verbose=verbose)
    if not refit:
       if verbose: sys.stderr.write("Computing fit for null model\n")
-      runlmm.fit()  # follow GN model in run_other
-      if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (runlmm.optH,runlmm.optSigma))
+      lmm2.fit()  # follow GN model in run_other
+      if verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (lmm2.optH,lmm2.optSigma))
       
    # outFile = "test.out"
    # out = open(outFile,'w')
@@ -142,8 +94,6 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    def outputResult(id,beta,betaSD,ts,ps):
       out.write(formatResult(id,beta,betaSD,ts,ps))
    def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
-   def formatResult(id,beta,betaSD,ts,ps):
-      return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
 
    printOutHead()
 
@@ -162,15 +112,15 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    last_j = 0
    # for snp_id in G:
    for snp in G:
-      print snp.shape,snp
-      snp_id = ('SNPID',snp)
+      snp_id = (snp,'SNPID')
       count += 1
       if count % 1000 == 0:
          job = count/1000
          if verbose:
             sys.stderr.write("Job %d At SNP %d\n" % (job,count))
          if numThreads == 1:
-            compute_snp(job,collect,q)
+            print "Running on 1 THREAD"
+            compute_snp(job,n,collect,lmm2,reml,q)
             collect = []
             j,lines = q.get()
             if verbose:
@@ -178,7 +128,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
             for line in lines:
                out.write(line)
          else:
-            p.apply_async(compute_snp,(job,collect))
+            p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
             collect = []
             while job > completed:
                try:
@@ -205,6 +155,13 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
             sys.stderr.write("Job "+str(j)+" finished\n")
          for line in lines:
             out.write(line)
+   else:
+      print "Running on 1 THREAD"
+      # print collect
+      compute_snp(count/1000,n,collect,lmm2,reml,q)
+      j,lines = q.get()
+      for line in lines:
+         out.write(line)
 
    # print collect
    ps = [item[1][3] for item in collect]
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 8c6d3c3c..c42f9fb7 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -51,6 +51,7 @@ from utility.benchmark import Bench
 from utility import temp_data
 from kinship import kinship, kinship_full, kvakve
 import genotype
+import phenotype
 import gwas
 
 try:
@@ -315,32 +316,45 @@ def run_other_new(pheno_vector,
     
     print("In run_other (new)")
     print("REML=",restricted_max_likelihood," REFIT=",refit)
+
+    # Adjust phenotypes
+    Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True)
+    print("Removed missing phenotypes",Y.shape)
+    # if options.maf_normalization:
+    #     G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+    #     print "MAF replacements: \n",G
+    # if not options.skip_genotype_normalization:
+    # G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+
     with Bench("Calculate Kinship"):
-        kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata)
+        K,G = calculate_kinship(G, tempdata)
     
-    print("kinship_matrix: ", pf(kinship_matrix))
-    print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
+    print("kinship_matrix: ", pf(K))
+    print("kinship_matrix.shape: ", pf(K.shape))
 
-    with Bench("Create LMM object"):
-        lmm_ob = LMM(pheno_vector, kinship_matrix)
-    with Bench("LMM_ob fitting"):
-        lmm_ob.fit()
+    # with Bench("Create LMM object"):
+    #     lmm_ob = lmm2.LMM2(Y,K)
+    # with Bench("LMM_ob fitting"):
+    #     lmm_ob.fit()
 
-    print("genotype_matrix: ", genotype_matrix.shape)
-    print(genotype_matrix)
+    print("genotype_matrix: ", G.shape)
+    print(G)
 
     with Bench("Doing GWAS"):
-        t_stats, p_values = gwas.gwas(pheno_vector,
-                                      genotype_matrix.T,
-                                      kinship_matrix,
+        t_stats, p_values = gwas.gwas(Y,
+                                      G.T,
+                                      K,
                                       restricted_max_likelihood=True,
                                       refit=False,verbose=True)
     Bench().report()
     return p_values, t_stats
 
-run_other = run_other_old
+run_other = run_other_new
 
 def matrixMult(A,B):
+    return np.dot(A,B)
+
+def matrixMult_old(A,B):
 
     # If there is no fblas then we will revert to np.dot()
 
@@ -674,11 +688,15 @@ class LMM:
            optimum.
   
         """
+        print("H=",H)
+        print("X=",X)
+        print("REML=",REML)
         n = len(self.LLs)
         HOpt = []
         for i in range(1,n-2):
             if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: 
                 HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
+                print("HOpt=",HOpt)
                 if np.isnan(HOpt[-1][0]):
                     HOpt[-1][0] = [self.LLs[i-1]]
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
new file mode 100644
index 00000000..cba47a9b
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -0,0 +1,405 @@
+# pylmm is a python-based linear mixed-model solver with applications to GWAS
+
+# Copyright (C) 2013,2014  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import time
+import numpy as np
+from scipy.linalg import eigh, inv, det
+import scipy.stats as stats # t-tests
+from scipy import optimize
+from optmatrix import matrixMult
+
+def calculateKinship(W,center=False):
+      """
+	 W is an n x m matrix encoding SNP minor alleles.
+
+	 This function takes a matrix oF SNPs, imputes missing values with the maf,
+	 normalizes the resulting vectors and returns the RRM matrix.
+      """
+      n = W.shape[0]
+      m = W.shape[1]
+      keep = []
+      for i in range(m):
+	 mn = W[True - np.isnan(W[:,i]),i].mean()
+	 W[np.isnan(W[:,i]),i] = mn
+	 vr = W[:,i].var()
+	 if vr == 0: continue
+
+	 keep.append(i)
+	 W[:,i] = (W[:,i] - mn) / np.sqrt(vr)
+
+      W = W[:,keep]
+      K = matrixMult(W,W.T) * 1.0/float(m)
+      if center:
+	 P = np.diag(np.repeat(1,n)) - 1/float(n) * np.ones((n,n))
+	 S = np.trace(matrixMult(matrixMult(P,K),P))
+	 K_n = (n - 1)*K / S
+	 return K_n
+      return K
+
+def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
+      """
+
+        Performs a basic GWAS scan using the LMM.  This function
+        uses the LMM module to assess association at each SNP and 
+        does some simple cleanup, such as removing missing individuals 
+        per SNP and re-computing the eigen-decomp
+
+	Y - n x 1 phenotype vector 
+        X - n x m SNP matrix (genotype matrix)
+	K - n x n kinship matrix
+        Kva,Kve = linalg.eigh(K) - or the eigen vectors and values for K
+        X0 - n x q covariate matrix
+	REML - use restricted maximum likelihood 
+        refit - refit the variance component for each SNP
+
+      """
+      n = X.shape[0]
+      m = X.shape[1]
+      prins("Initialize GWAS")
+      print("genotype matrix n is:", n)
+      print("genotype matrix m is:", m)
+
+      if X0 == None: 
+         X0 = np.ones((n,1))
+      
+      # Remove missing values in Y and adjust associated parameters
+      v = np.isnan(Y)
+      if v.sum():
+	 keep = True - v
+	 keep = keep.reshape((-1,))
+	 Y = Y[keep]
+	 X = X[keep,:]
+	 X0 = X0[keep,:]
+	 K = K[keep,:][:,keep]
+	 Kva = []
+	 Kve = []
+
+      if len(Y) == 0: 
+         return np.ones(m)*np.nan,np.ones(m)*np.nan
+
+      L = LMM(Y,K,Kva,Kve,X0)
+      if not refit: L.fit()
+
+      PS = []
+      TS = []
+
+      n = X.shape[0]
+      m = X.shape[1]
+
+      for i in range(m):
+	 x = X[:,i].reshape((n,1))
+	 v = np.isnan(x).reshape((-1,))
+	 if v.sum():
+	    keep = True - v
+	    xs = x[keep,:]
+	    if xs.var() == 0: 
+	       PS.append(np.nan) 
+	       TS.append(np.nan) 
+	       continue
+
+	    Ys = Y[keep]
+	    X0s = X0[keep,:]
+	    Ks = K[keep,:][:,keep]
+	    Ls = LMM(Ys,Ks,X0=X0s)
+	    if refit: 
+               Ls.fit(X=xs)
+	    else: 
+               Ls.fit()
+	    ts,ps = Ls.association(xs,REML=REML)
+	 else: 
+	    if x.var() == 0: 
+	       PS.append(np.nan) 
+	       TS.append(np.nan) 
+	       continue
+
+	    if refit: 
+               L.fit(X=x)
+	    ts,ps = L.association(x,REML=REML)
+	    
+	 PS.append(ps)
+	 TS.append(ts)
+
+      return TS,PS
+
+class LMM2:
+
+   """
+	 This is a simple version of EMMA/fastLMM.  
+	 The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
+	 and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
+	 There are three model parameters: heritability (h), covariate coefficients (beta) and the total
+	 phenotypic variance (sigma).
+	 Heritability as defined here is the proportion of the total variance (sigma) that is attributed to 
+	 the kinship matrix.
+
+	 For simplicity, we assume that everything being input is a numpy array.
+	 If this is not the case, the module may throw an error as conversion from list to numpy array
+	 is not done consistently.
+
+   """
+   def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False):
+
+      """
+      The constructor takes a phenotype vector or array Y of size n.
+      It takes a kinship matrix K of size n x n.  Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
+      If they are not provided, the constructor will calculate them.
+      X0 is an optional covariate matrix of size n x q, where there are q covariates.
+      When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
+      """
+
+      if X0 == None: 
+	 X0 = np.ones(len(Y)).reshape(len(Y),1)
+      self.verbose = verbose
+
+      x = True - np.isnan(Y)
+      x = x.reshape(-1,)
+      if not x.sum() == len(Y):
+	 if self.verbose: sys.stderr.write("Removing %d missing values from Y\n" % ((True - x).sum()))
+	 Y = Y[x]
+	 K = K[x,:][:,x]
+	 X0 = X0[x,:]
+	 Kva = []
+	 Kve = []
+      self.nonmissing = x
+
+      if len(Kva) == 0 or len(Kve) == 0:
+	 if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+	 begin = time.time()
+	 Kva,Kve = eigh(K)
+	 end = time.time()
+	 if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+      
+      self.K = K
+      self.Kva = Kva
+      self.Kve = Kve
+      self.N = self.K.shape[0]
+      self.Y = Y.reshape((self.N,1))
+      self.X0 = X0
+
+      if sum(self.Kva < 1e-6):
+         if self.verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(self.Kva < 0)))
+         self.Kva[self.Kva < 1e-6] = 1e-6
+
+      self.transform()
+
+   def transform(self):
+
+      """
+	 Computes a transformation on the phenotype vector and the covariate matrix.
+	 The transformation is obtained by left multiplying each parameter by the transpose of the 
+	 eigenvector matrix of K (the kinship).
+      """
+
+      self.Yt = matrixMult(self.Kve.T, self.Y)
+      self.X0t = matrixMult(self.Kve.T, self.X0)
+      self.X0t_stack = np.hstack([self.X0t, np.ones((self.N,1))])
+      self.q = self.X0t.shape[1]
+
+   def getMLSoln(self,h,X):
+
+      """
+	 Obtains the maximum-likelihood estimates for the covariate coefficients (beta),
+	 the total variance of the trait (sigma) and also passes intermediates that can 
+	 be utilized in other functions. The input parameter h is a value between 0 and 1 and represents
+	 the heritability or the proportion of the total variance attributed to genetics.  The X is the 
+	 covariate matrix.
+      """
+   
+      S = 1.0/(h*self.Kva + (1.0 - h))
+      Xt = X.T*S
+      XX = matrixMult(Xt,X)
+      XX_i = inv(XX)
+      beta =  matrixMult(matrixMult(XX_i,Xt),self.Yt)
+      Yt = self.Yt - matrixMult(X,beta)
+      Q = np.dot(Yt.T*S,Yt)
+      sigma = Q * 1.0 / (float(self.N) - float(X.shape[1]))
+      return beta,sigma,Q,XX_i,XX
+
+   def LL_brent(self,h,X=None,REML=False): 
+      #brent will not be bounded by the specified bracket.
+      # I return a large number if we encounter h < 0 to avoid errors in LL computation during the search.
+      if h < 0: return 1e6
+      return -self.LL(h,X,stack=False,REML=REML)[0]
+	 
+   def LL(self,h,X=None,stack=True,REML=False):
+
+      """
+	 Computes the log-likelihood for a given heritability (h).  If X==None, then the 
+	 default X0t will be used.  If X is set and stack=True, then X0t will be matrix concatenated with
+	 the input X.  If stack is false, then X is used in place of X0t in the LL calculation.
+	 REML is computed by adding additional terms to the standard LL and can be computed by setting REML=True.
+      """
+
+      if X == None: X = self.X0t
+      elif stack: 
+	 self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+	 X = self.X0t_stack
+
+      n = float(self.N)
+      q = float(X.shape[1])
+      beta,sigma,Q,XX_i,XX = self.getMLSoln(h,X)
+      LL = n*np.log(2*np.pi) + np.log(h*self.Kva + (1.0-h)).sum() + n + n*np.log(1.0/n * Q)
+      LL = -0.5 * LL
+
+      if REML:
+	 LL_REML_part = q*np.log(2.0*np.pi*sigma) + np.log(det(matrixMult(X.T,X))) - np.log(det(XX))
+	 LL = LL + 0.5*LL_REML_part
+
+
+      LL = LL.sum()
+      return LL,beta,sigma,XX_i
+
+   def getMax(self,H, X=None,REML=False):
+
+      """
+	 Helper functions for .fit(...).  
+	 This function takes a set of LLs computed over a grid and finds possible regions 
+	 containing a maximum.  Within these regions, a Brent search is performed to find the 
+	 optimum.
+
+      """
+      n = len(self.LLs)
+      HOpt = []
+      for i in range(1,n-2):
+          if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]:
+	    HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
+	    if np.isnan(HOpt[-1]): HOpt[-1] = H[i-1]
+	    #if np.isnan(HOpt[-1]): HOpt[-1] = self.LLs[i-1]
+	    #if np.isnan(HOpt[-1][0]): HOpt[-1][0] = [self.LLs[i-1]]
+
+      if len(HOpt) > 1: 
+	 if self.verbose: sys.stderr.write("NOTE: Found multiple optima.  Returning first...\n")
+	 return HOpt[0]
+      elif len(HOpt) == 1: return HOpt[0]
+      elif self.LLs[0] > self.LLs[n-1]: return H[0]
+      else: return H[n-1]
+
+
+   def fit(self,X=None,ngrids=100,REML=True):
+
+      """
+	 Finds the maximum-likelihood solution for the heritability (h) given the current parameters.
+	 X can be passed and will transformed and concatenated to X0t.  Otherwise, X0t is used as 
+	 the covariate matrix.
+
+	 This function calculates the LLs over a grid and then uses .getMax(...) to find the optimum.
+	 Given this optimum, the function computes the LL and associated ML solutions.
+      """
+      
+      if X == None: X = self.X0t
+      else: 
+	 #X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
+	 self.X0t_stack[:,(self.q)] = matrixMult(self.Kve.T,X)[:,0]
+	 X = self.X0t_stack
+
+      H = np.array(range(ngrids)) / float(ngrids)
+      L = np.array([self.LL(h,X,stack=False,REML=REML)[0] for h in H])
+      self.LLs = L
+
+      hmax = self.getMax(H,X,REML)
+      L,beta,sigma,betaSTDERR = self.LL(hmax,X,stack=False,REML=REML)
+      
+      self.H = H
+      self.optH = hmax.sum()
+      self.optLL = L
+      self.optBeta = beta
+      self.optSigma = sigma.sum()
+
+      return hmax,beta,sigma,L
+
+   def association(self,X,h=None,stack=True,REML=True,returnBeta=False):
+      """
+	Calculates association statitics for the SNPs encoded in the vector X of size n.
+	If h == None, the optimal h stored in optH is used.
+
+      """
+      if False:
+            print "X=",X
+            print "h=",h
+            print "q=",self.q
+            print "self.Kve=",self.Kve
+            print "X0t_stack=",self.X0t_stack.shape,self.X0t_stack
+      
+      if stack:
+	 # X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
+         m = matrixMult(self.Kve.T,X)
+         # print "m=",m
+         m = m[:,0]
+         self.X0t_stack[:,(self.q)] = m
+	 X = self.X0t_stack
+	 
+      if h == None: h = self.optH
+
+      L,beta,sigma,betaVAR = self.LL(h,X,stack=False,REML=REML)
+      q  = len(beta)
+      ts,ps = self.tstat(beta[q-1],betaVAR[q-1,q-1],sigma,q)
+      
+      if returnBeta: return ts,ps,beta[q-1].sum(),betaVAR[q-1,q-1].sum()*sigma
+      return ts,ps
+
+   def tstat(self,beta,var,sigma,q,log=False): 
+
+	 """
+	    Calculates a t-statistic and associated p-value given the estimate of beta and its standard error.
+	    This is actually an F-test, but when only one hypothesis is being performed, it reduces to a t-test.
+	 """
+
+	 ts = beta / np.sqrt(var * sigma)	 
+	 #ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q))
+	 # sf == survival function - this is more accurate -- could also use logsf if the precision is not good enough
+	 if log:
+	    ps = 2.0 + (stats.t.logsf(np.abs(ts), self.N-q))
+	 else:
+	    ps = 2.0*(stats.t.sf(np.abs(ts), self.N-q))
+	 if not len(ts) == 1 or not len(ps) == 1: 
+	    raise Exception("Something bad happened :(")
+	 return ts.sum(),ps.sum()
+
+   def plotFit(self,color='b-',title=''):
+
+      """
+	 Simple function to visualize the likelihood space.  It takes the LLs 
+	 calcualted over a grid and normalizes them by subtracting off the mean and exponentiating.
+	 The resulting "probabilities" are normalized to one and plotted against heritability.
+	 This can be seen as an approximation to the posterior distribuiton of heritability.
+
+	 For diagnostic purposes this lets you see if there is one distinct maximum or multiple 
+	 and what the variance of the parameter looks like.
+      """
+      import matplotlib.pyplot as pl
+
+      mx = self.LLs.max()
+      p = np.exp(self.LLs - mx)
+      p = p/p.sum()
+
+      pl.plot(self.H,p,color)
+      pl.xlabel("Heritability")
+      pl.ylabel("Probability of data")
+      pl.title(title)
+   
+   def meanAndVar(self):
+
+      mx = self.LLs.max()
+      p = np.exp(self.LLs - mx)
+      p = p/p.sum()
+
+      mn = (self.H * p).sum()
+      vx = ((self.H - mn)**2 * p).sum()
+
+      return mn,vx
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
index bb620052..682ba371 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -36,5 +36,5 @@ def remove_missing(y,g,verbose=False):
             sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
         y1 = y[keep]
         g1 = g[keep,:]
-    return y1,g1
+    return y1,g1,keep
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index f17f1bd1..4268f3be 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -99,13 +99,37 @@ if options.geno:
     g = tsvreader.geno(options.geno)
     print g.shape
 
+if cmd == 'redis_new':
+    # Emulating the redis setup of GN2
+    Y = y
+    G = g
+    print "Original G",G.shape, "\n", G
+
+    gt = G.T
+    G = None
+    ps, ts = gn2_load_redis('testrun','other',k,Y,gt)
+    print np.array(ps)
+    print sum(ps)
+    # Test results
+    p1 = round(ps[0],4)
+    p2 = round(ps[-1],4)
+    sys.stderr.write(options.geno+"\n")
+    if options.geno == 'data/small.geno':
+        assert p1==0.0708, "p1=%f" % p1
+        assert p2==0.1417, "p2=%f" % p2
+    if options.geno == 'data/small_na.geno':
+        assert p1==0.0958, "p1=%f" % p1
+        assert p2==0.0435, "p2=%f" % p2
+    if options.geno == 'data/test8000.geno':
+        assert p1==0.8984, "p1=%f" % p1
+        assert p2==0.9623, "p2=%f" % p2
 if cmd == 'redis':
     # Emulating the redis setup of GN2
     G = g
     print "Original G",G.shape, "\n", G
     if y != None:
         gnt = np.array(g).T
-        Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+        Y,g,keep = phenotype.remove_missing(y,g.T,options.verbose)
         G = g.T
         print "Removed missing phenotypes",G.shape, "\n", G
     if options.maf_normalization:
-- 
cgit 1.4.1


From 886934b9cf18cf092c5d6cd0667d860aa30e64b8 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Mon, 16 Mar 2015 18:08:00 +0300
Subject: GWAS: results get returned

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py | 56 +++++++++++++++++++---------------
 wqflask/wqflask/my_pylmm/pyLMM/lmm2.py | 10 +++---
 2 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index b9d5db61..f8d77ab6 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -36,15 +36,17 @@ def formatResult(id,beta,betaSD,ts,ps):
    return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
 
 def compute_snp(j,n,snp_ids,lmm2,REML,q = None):
-   # print(j,snp_ids,"\n")
+   # print("COMPUTE SNP",j,snp_ids,"\n")
    result = []
    for snp_id in snp_ids:
       snp,id = snp_id
       x = snp.reshape((n,1))  # all the SNPs
+      # print "X=",x
       # if refit:
       #    L.fit(X=snp,REML=REML)
       ts,ps,beta,betaVar = lmm2.association(x,REML=REML,returnBeta=True)
-      result.append(formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps))
+      # result.append(formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps))
+      result.append( (ts,ps) )
    if not q:
       q = compute_snp.q
    q.put([j,result])
@@ -95,7 +97,8 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
       out.write(formatResult(id,beta,betaSD,ts,ps))
    def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
 
-   printOutHead()
+   # printOutHead()
+   res = []
 
    # Set up the pool
    # mp.set_start_method('spawn')
@@ -114,6 +117,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    for snp in G:
       snp_id = (snp,'SNPID')
       count += 1
+      # print count,snp_id
       if count % 1000 == 0:
          job = count/1000
          if verbose:
@@ -122,21 +126,23 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
             print "Running on 1 THREAD"
             compute_snp(job,n,collect,lmm2,reml,q)
             collect = []
-            j,lines = q.get()
+            j,lst = q.get()
             if verbose:
                sys.stderr.write("Job "+str(j)+" finished\n")
-            for line in lines:
-               out.write(line)
+            # for line in lines:
+            #    out.write(line)
+            res.append(lst)
          else:
             p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
             collect = []
             while job > completed:
                try:
-                  j,lines = q.get_nowait()
+                  j,lst = q.get_nowait()
                   if verbose:
                      sys.stderr.write("Job "+str(j)+" finished\n")
-                  for line in lines:
-                     out.write(line)
+                  # for line in lines:
+                  #    out.write(line)
+                  res.append(lst)
                   completed += 1
                except Queue.Empty:
                   pass
@@ -148,23 +154,23 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
 
       collect.append(snp_id)
 
-   if not numThreads or numThreads > 1:
+   if numThreads==1:
+      print "Running on 1 THREAD"
+      compute_snp(count/1000,n,collect,lmm2,reml,q)
+      j,lst = q.get()
+      # for line in lines:
+      #    out.write(line)
+      res.append(lst)
+   else:
       for job in range(int(count/1000)-completed):
-         j,lines = q.get(True,15) # time out
+         j,lst = q.get(True,15) # time out
          if verbose:
             sys.stderr.write("Job "+str(j)+" finished\n")
-         for line in lines:
-            out.write(line)
-   else:
-      print "Running on 1 THREAD"
-      # print collect
-      compute_snp(count/1000,n,collect,lmm2,reml,q)
-      j,lines = q.get()
-      for line in lines:
-         out.write(line)
-
-   # print collect
-   ps = [item[1][3] for item in collect]
-   ts = [item[1][2] for item in collect]
-   print ps
+         res.append(lst)
+
+   # print res
+   ts = [item[0] for res1 in res for item in res1]
+   ps = [item[1] for res1 in res for item in res1]
+   # ts = [item[1] for item in res]
+   # print ps
    return ts,ps
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
index cba47a9b..d50b0111 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -329,11 +329,11 @@ class LMM2:
 
       """
       if False:
-            print "X=",X
-            print "h=",h
-            print "q=",self.q
-            print "self.Kve=",self.Kve
-            print "X0t_stack=",self.X0t_stack.shape,self.X0t_stack
+         print "X=",X
+         print "h=",h
+         print "q=",self.q
+         print "self.Kve=",self.Kve
+         print "X0t_stack=",self.X0t_stack.shape,self.X0t_stack
       
       if stack:
 	 # X = np.hstack([self.X0t,matrixMult(self.Kve.T, X)])
-- 
cgit 1.4.1


From 7268ed46c9bce6fea58b55723c325426bdeeb617 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 10:08:41 +0300
Subject: GWAS multi-core: Tweak queue

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index f8d77ab6..2a472717 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -146,8 +146,8 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
                   completed += 1
                except Queue.Empty:
                   pass
-               if job > completed + cpu_num + 5:
-                  time.sleep(1)
+               if job > completed + cpu_num*2:
+                  time.sleep(0.1)
                else:
                   if job >= completed:
                     break
-- 
cgit 1.4.1


From 7c958796142e9fe0cc3155d10581d68ae2bbc5df Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 10:39:58 +0300
Subject: lmm.py: bring to original state

---
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index c42f9fb7..994b6be7 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -688,15 +688,11 @@ class LMM:
            optimum.
   
         """
-        print("H=",H)
-        print("X=",X)
-        print("REML=",REML)
         n = len(self.LLs)
         HOpt = []
         for i in range(1,n-2):
             if self.LLs[i-1] < self.LLs[i] and self.LLs[i] > self.LLs[i+1]: 
                 HOpt.append(optimize.brent(self.LL_brent,args=(X,REML),brack=(H[i-1],H[i+1])))
-                print("HOpt=",HOpt)
                 if np.isnan(HOpt[-1][0]):
                     HOpt[-1][0] = [self.LLs[i-1]]
 
-- 
cgit 1.4.1


From 46170b2741d006db89ce69128feff51c6e0e7752 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 11:05:30 +0300
Subject: lmm1: fixed a few regressions introduced while debugging

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py |  2 +-
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py     | 42 ++++++++++++++++++++-----------
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py  |  4 +--
 3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 00f48939..0c43587e 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -168,7 +168,7 @@ def kvakve(K, verbose=True):
       print("Kve is: ", Kve.shape, Kve)
 
    if sum(Kva < 1e-6):
-      if verbose: sys.stderr.write("Cleaning %d eigen values\n" % (sum(Kva < 1e-6)))
+      if verbose: sys.stderr.write("Cleaning %d eigen values (Kva<0)\n" % (sum(Kva < 0)))
       Kva[Kva < 1e-6] = 1e-6
    return Kva,Kve
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 994b6be7..2014ffb8 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -271,7 +271,7 @@ def run_other_old(pheno_vector,
     
     """
     
-    print("In run_other (old)")
+    print("Running the original LMM engine in run_other (old)")
     print("REML=",restricted_max_likelihood," REFIT=",refit)
     with Bench("Calculate Kinship"):
         kinship_matrix,genotype_matrix = calculate_kinship(genotype_matrix, tempdata)
@@ -314,7 +314,7 @@ def run_other_new(pheno_vector,
     
     """
     
-    print("In run_other (new)")
+    print("Running the new LMM2 engine in run_other_new")
     print("REML=",restricted_max_likelihood," REFIT=",refit)
 
     # Adjust phenotypes
@@ -349,12 +349,10 @@ def run_other_new(pheno_vector,
     Bench().report()
     return p_values, t_stats
 
-run_other = run_other_new
+# def matrixMult(A,B):
+#     return np.dot(A,B)
 
 def matrixMult(A,B):
-    return np.dot(A,B)
-
-def matrixMult_old(A,B):
 
     # If there is no fblas then we will revert to np.dot()
 
@@ -772,7 +770,10 @@ class LMM:
  
           ts = beta / np.sqrt(var * sigma)        
           ps = 2.0*(1.0 - stats.t.cdf(np.abs(ts), self.N-q))
-          if not len(ts) == 1 or not len(ps) == 1: raise Exception("Something bad happened :(")
+          if not len(ts) == 1 or not len(ps) == 1:
+              print("ts=",ts)
+              print("ps=",ps)
+              raise Exception("Something bad happened :(")
           return ts.sum(),ps.sum()
 
     def plotFit(self,color='b-',title=''):
@@ -798,7 +799,11 @@ class LMM:
        pl.title(title)
 
 
-def gn2_redis(key,species):
+def gn2_redis(key,species,new_code=True):
+    """
+    Invoke pylmm using Redis as a container. new_code runs the new
+    version
+    """
     json_params = Redis.get(key)
     
     params = json.loads(json_params)
@@ -818,11 +823,18 @@ def gn2_redis(key,species):
                   refit = params['refit'],
                   tempdata = tempdata)
     else:
-        ps, ts = run_other(pheno_vector = np.array(params['pheno_vector']),
-                  genotype_matrix = geno,
-                  restricted_max_likelihood = params['restricted_max_likelihood'],
-                  refit = params['refit'],
-                  tempdata = tempdata)
+        if new_code:
+            ps, ts = run_other_new(pheno_vector = np.array(params['pheno_vector']),
+                               genotype_matrix = geno,
+                               restricted_max_likelihood = params['restricted_max_likelihood'],
+                               refit = params['refit'],
+                               tempdata = tempdata)
+        else:
+            ps, ts = run_other_old(pheno_vector = np.array(params['pheno_vector']),
+                               genotype_matrix = geno,
+                               restricted_max_likelihood = params['restricted_max_likelihood'],
+                               refit = params['refit'],
+                               tempdata = tempdata)
         
     results_key = "pylmm:results:" + params['temp_uuid']
 
@@ -847,7 +859,7 @@ def gn2_main():
 
     gn2_redis(key,species)
 
-def gn2_load_redis(key,species,kinship,pheno,geno):
+def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True):
     print("Loading Redis from parsed data")
     if kinship == None:
         k = None
@@ -868,7 +880,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno):
     Redis.set(key, json_params)
     Redis.expire(key, 60*60)
 
-    return gn2_redis(key,species)
+    return gn2_redis(key,species,new_code)
     
 if __name__ == '__main__':
     print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!")
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 4268f3be..1af5a443 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -107,7 +107,7 @@ if cmd == 'redis_new':
 
     gt = G.T
     G = None
-    ps, ts = gn2_load_redis('testrun','other',k,Y,gt)
+    ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True)
     print np.array(ps)
     print sum(ps)
     # Test results
@@ -142,7 +142,7 @@ if cmd == 'redis':
 
     gt = G.T
     G = None
-    ps, ts = gn2_load_redis('testrun','other',k,Y,gt)
+    ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False)
     print np.array(ps)
     print sum(ps)
     # Test results
-- 
cgit 1.4.1


From 73ed2f67bd0478473b887902ae96caaa0fca58d4 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 13:10:49 +0300
Subject: GWAS: one result is missing

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py   | 33 +++++++++++++-----------------
 wqflask/wqflask/my_pylmm/pyLMM/input.py  |  2 ++
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py    | 10 ++++++---
 wqflask/wqflask/my_pylmm/pyLMM/lmm2.py   | 17 ++++++++++------
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 35 +++++++++++++++++---------------
 5 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index 2a472717..20083bde 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -66,7 +66,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    """
    matrix_initialize()
    cpu_num = mp.cpu_count()
-   numThreads = None
+   numThreads = None # for now use all available threads
    kfile2 = False
    reml = restricted_max_likelihood
 
@@ -110,10 +110,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    # PS = []
    # TS = []
    count = 0
-
-   completed = 0
-   last_j = 0
-   # for snp_id in G:
+   jobs_running = 0
    for snp in G:
       snp_id = (snp,'SNPID')
       count += 1
@@ -129,13 +126,12 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
             j,lst = q.get()
             if verbose:
                sys.stderr.write("Job "+str(j)+" finished\n")
-            # for line in lines:
-            #    out.write(line)
             res.append(lst)
          else:
             p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
+            jobs_running += 1
             collect = []
-            while job > completed:
+            while jobs_running:
                try:
                   j,lst = q.get_nowait()
                   if verbose:
@@ -143,34 +139,33 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
                   # for line in lines:
                   #    out.write(line)
                   res.append(lst)
-                  completed += 1
+                  jobs_running -= 1
                except Queue.Empty:
                   pass
-               if job > completed + cpu_num*2:
-                  time.sleep(0.1)
+               if jobs_running + cpu_num*2 > 0:
+                  time.sleep(1.0)
                else:
-                  if job >= completed:
+                  if jobs_running > 0:
                     break
 
       collect.append(snp_id)
 
-   if numThreads==1:
+   if numThreads==1 or count<1000:
       print "Running on 1 THREAD"
       compute_snp(count/1000,n,collect,lmm2,reml,q)
       j,lst = q.get()
-      # for line in lines:
-      #    out.write(line)
       res.append(lst)
    else:
-      for job in range(int(count/1000)-completed):
+      print "count=",count," running=",jobs_running," collect=",len(collect)
+      for job in range(jobs_running):
          j,lst = q.get(True,15) # time out
          if verbose:
             sys.stderr.write("Job "+str(j)+" finished\n")
          res.append(lst)
 
-   # print res
+   if verbose:
+      print "res=",res[0][0:10]
+   print [len(res1) for res1 in res]
    ts = [item[0] for res1 in res for item in res1]
    ps = [item[1] for res1 in res for item in res1]
-   # ts = [item[1] for item in res]
-   # print ps
    return ts,ps
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py
index f7b556a5..7063fedf 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/input.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py
@@ -135,6 +135,8 @@ class plink:
     def normalizeGenotype(self,G):
         # print "Before",G
         # print G.shape
+        print "call input.normalizeGenotype"
+        raise "This should not be used"
         x = True - np.isnan(G)
         m = G[x].mean()
         s = np.sqrt(G[x].var())
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 2014ffb8..8a24d98b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -285,7 +285,7 @@ def run_other_old(pheno_vector,
     # with Bench("LMM_ob fitting"):
     #     lmm_ob.fit()
 
-    print("genotype_matrix: ", genotype_matrix.shape)
+    print("run_other_old genotype_matrix: ", genotype_matrix.shape)
     print(genotype_matrix)
 
     with Bench("Doing GWAS"):
@@ -320,6 +320,7 @@ def run_other_new(pheno_vector,
     # Adjust phenotypes
     Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True)
     print("Removed missing phenotypes",Y.shape)
+
     # if options.maf_normalization:
     #     G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
     #     print "MAF replacements: \n",G
@@ -337,7 +338,7 @@ def run_other_new(pheno_vector,
     # with Bench("LMM_ob fitting"):
     #     lmm_ob.fit()
 
-    print("genotype_matrix: ", G.shape)
+    print("run_other_new genotype_matrix: ", G.shape)
     print(G)
 
     with Bench("Doing GWAS"):
@@ -388,7 +389,9 @@ def calculate_kinship_new(genotype_matrix, temp_data=None):
     Call the new kinship calculation where genotype_matrix contains
     inds (columns) by snps (rows).
     """
+    print("call genotype.normalize")
     G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix)
+    print("call calculate_kinship_new")
     return kinship(G.T),G # G gets transposed, we'll turn this into an iterator (FIXME)
 
 def calculate_kinship_old(genotype_matrix, temp_data=None):
@@ -399,6 +402,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
     normalizes the resulting vectors and returns the RRM matrix.
     
     """
+    print("call calculate_kinship_old")
     n = genotype_matrix.shape[0]
     m = genotype_matrix.shape[1]
     print("genotype 2D matrix n (inds) is:", n)
@@ -429,7 +433,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
             temp_data.store("percent_complete", percent_complete)
         
     genotype_matrix = genotype_matrix[:,keep]
-    print("genotype_matrix: ", pf(genotype_matrix))
+    print("After kinship (old) genotype_matrix: ", pf(genotype_matrix))
     kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m)
     return kinship_matrix,genotype_matrix
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
index d50b0111..d4b3ac82 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -22,6 +22,7 @@ from scipy.linalg import eigh, inv, det
 import scipy.stats as stats # t-tests
 from scipy import optimize
 from optmatrix import matrixMult
+import kinship
 
 def calculateKinship(W,center=False):
       """
@@ -177,13 +178,17 @@ class LMM2:
 	 Kve = []
       self.nonmissing = x
 
-      if len(Kva) == 0 or len(Kve) == 0:
-	 if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
-	 begin = time.time()
-	 Kva,Kve = eigh(K)
-	 end = time.time()
-	 if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+      print("this K is:", K.shape, K)
       
+      if len(Kva) == 0 or len(Kve) == 0:
+          # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+          begin = time.time()
+          # Kva,Kve = linalg.eigh(K)
+          Kva,Kve = kinship.kvakve(K)
+          end = time.time()
+          if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+          print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
+
       self.K = K
       self.Kva = Kva
       self.Kve = Kve
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 1af5a443..708c9185 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -54,9 +54,9 @@ parser.add_option("--geno",dest="geno",
 parser.add_option("--maf-normalization",
                   action="store_true", dest="maf_normalization", default=False,
                   help="Apply MAF genotype normalization")
-parser.add_option("--skip-genotype-normalization",
-                  action="store_true", dest="skip_genotype_normalization", default=False,
-                  help="Skip genotype normalization")
+parser.add_option("--genotype-normalization",
+                  action="store_true", dest="genotype_normalization", default=False,
+                  help="Force genotype normalization")
 parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
                   help="don't print status messages to stdout")
@@ -100,7 +100,8 @@ if options.geno:
     print g.shape
 
 if cmd == 'redis_new':
-    # Emulating the redis setup of GN2
+    # The main difference between redis_new and redis is that missing
+    # phenotypes are handled by the first
     Y = y
     G = g
     print "Original G",G.shape, "\n", G
@@ -109,7 +110,7 @@ if cmd == 'redis_new':
     G = None
     ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True)
     print np.array(ps)
-    print sum(ps)
+    print len(ps),sum(ps)
     # Test results
     p1 = round(ps[0],4)
     p2 = round(ps[-1],4)
@@ -118,12 +119,14 @@ if cmd == 'redis_new':
         assert p1==0.0708, "p1=%f" % p1
         assert p2==0.1417, "p2=%f" % p2
     if options.geno == 'data/small_na.geno':
-        assert p1==0.0958, "p1=%f" % p1
-        assert p2==0.0435, "p2=%f" % p2
+        assert p1==0.0897, "p1=%f" % p1
+        assert p2==0.0405, "p2=%f" % p2
     if options.geno == 'data/test8000.geno':
         assert p1==0.8984, "p1=%f" % p1
-        assert p2==0.9623, "p2=%f" % p2
-if cmd == 'redis':
+        assert p2==0.9620, "p2=%f" % p2
+        assert sum(ps) == 4070.02346579
+        assert len(ps) == 8000
+elif cmd == 'redis':
     # Emulating the redis setup of GN2
     G = g
     print "Original G",G.shape, "\n", G
@@ -135,7 +138,7 @@ if cmd == 'redis':
     if options.maf_normalization:
         G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
         print "MAF replacements: \n",G
-    if not options.skip_genotype_normalization:
+    if options.genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
     g = None
     gnt = None
@@ -144,8 +147,8 @@ if cmd == 'redis':
     G = None
     ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False)
     print np.array(ps)
-    print sum(ps)
-    # Test results
+    print len(ps),sum(ps)
+    # Test results 4070.02346579
     p1 = round(ps[0],4)
     p2 = round(ps[-1],4)
     sys.stderr.write(options.geno+"\n")
@@ -153,11 +156,11 @@ if cmd == 'redis':
         assert p1==0.0708, "p1=%f" % p1
         assert p2==0.1417, "p2=%f" % p2
     if options.geno == 'data/small_na.geno':
-        assert p1==0.0958, "p1=%f" % p1
-        assert p2==0.0435, "p2=%f" % p2
+        assert p1==0.0897, "p1=%f" % p1
+        assert p2==0.0405, "p2=%f" % p2
     if options.geno == 'data/test8000.geno':
         assert p1==0.8984, "p1=%f" % p1
-        assert p2==0.9623, "p2=%f" % p2
+        assert p2==0.8827, "p2=%f" % p2
 elif cmd == 'kinship':
     G = g
     print "Original G",G.shape, "\n", G
@@ -169,7 +172,7 @@ elif cmd == 'kinship':
     if options.maf_normalization:
         G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
         print "MAF replacements: \n",G
-    if not options.skip_genotype_normalization:
+    if options.genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
     g = None
     gnt = None
-- 
cgit 1.4.1


From 5cee365cc2d8bf8cafca018b9a04e4821616899d Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 13:27:38 +0300
Subject: GWAS: should be correct now

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py   | 26 ++++++++++++--------------
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py |  6 +++---
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index 20083bde..ce7842f7 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -131,37 +131,35 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
             p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
             jobs_running += 1
             collect = []
-            while jobs_running:
+            while jobs_running > cpu_num:
                try:
                   j,lst = q.get_nowait()
                   if verbose:
                      sys.stderr.write("Job "+str(j)+" finished\n")
-                  # for line in lines:
-                  #    out.write(line)
                   res.append(lst)
                   jobs_running -= 1
                except Queue.Empty:
+                  time.sleep(0.1)
                   pass
-               if jobs_running + cpu_num*2 > 0:
+               if jobs_running > cpu_num*2:
                   time.sleep(1.0)
                else:
-                  if jobs_running > 0:
-                    break
+                  break
 
       collect.append(snp_id)
 
-   if numThreads==1 or count<1000:
+   if numThreads==1 or count<1000 or len(collect)>0:
       print "Running on 1 THREAD"
       compute_snp(count/1000,n,collect,lmm2,reml,q)
+      collect = []
       j,lst = q.get()
       res.append(lst)
-   else:
-      print "count=",count," running=",jobs_running," collect=",len(collect)
-      for job in range(jobs_running):
-         j,lst = q.get(True,15) # time out
-         if verbose:
-            sys.stderr.write("Job "+str(j)+" finished\n")
-         res.append(lst)
+   print "count=",count," running=",jobs_running," collect=",len(collect)
+   for job in range(jobs_running):
+      j,lst = q.get(True,15) # time out
+      if verbose:
+         sys.stderr.write("Job "+str(j)+" finished\n")
+      res.append(lst)
 
    if verbose:
       print "res=",res[0][0:10]
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 708c9185..e301ef1a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -122,9 +122,9 @@ if cmd == 'redis_new':
         assert p1==0.0897, "p1=%f" % p1
         assert p2==0.0405, "p2=%f" % p2
     if options.geno == 'data/test8000.geno':
-        assert p1==0.8984, "p1=%f" % p1
-        assert p2==0.9620, "p2=%f" % p2
-        assert sum(ps) == 4070.02346579
+        # assert p1==0.8984, "p1=%f" % p1
+        # assert p2==0.9621, "p2=%f" % p2
+        assert round(sum(ps)) == 4070
         assert len(ps) == 8000
 elif cmd == 'redis':
     # Emulating the redis setup of GN2
-- 
cgit 1.4.1


From 62f287b551a3b207eadffb52d6de1e0ef64a1a08 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 16:28:08 +0300
Subject: runlmm: Explicit handling of missing phenotypes

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index e301ef1a..324c4f2c 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -57,6 +57,9 @@ parser.add_option("--maf-normalization",
 parser.add_option("--genotype-normalization",
                   action="store_true", dest="genotype_normalization", default=False,
                   help="Force genotype normalization")
+parser.add_option("--remove-missing-phenotypes",
+                  action="store_true", dest="remove_missing_phenotypes", default=False,
+                  help="Remove missing phenotypes")
 parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
                   help="don't print status messages to stdout")
@@ -130,11 +133,13 @@ elif cmd == 'redis':
     # Emulating the redis setup of GN2
     G = g
     print "Original G",G.shape, "\n", G
-    if y != None:
+    if y != None and options.remove_missing_phenotypes:
         gnt = np.array(g).T
         Y,g,keep = phenotype.remove_missing(y,g.T,options.verbose)
         G = g.T
         print "Removed missing phenotypes",G.shape, "\n", G
+    else:
+        Y = y
     if options.maf_normalization:
         G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
         print "MAF replacements: \n",G
@@ -159,12 +164,12 @@ elif cmd == 'redis':
         assert p1==0.0897, "p1=%f" % p1
         assert p2==0.0405, "p2=%f" % p2
     if options.geno == 'data/test8000.geno':
-        assert p1==0.8984, "p1=%f" % p1
-        assert p2==0.8827, "p2=%f" % p2
+        assert round(sum(ps)) == 4070
+        assert len(ps) == 8000
 elif cmd == 'kinship':
     G = g
     print "Original G",G.shape, "\n", G
-    if y != None:
+    if y != None and options.remove_missing_phenotypes:
         gnt = np.array(g).T
         Y,g = phenotype.remove_missing(y,g.T,options.verbose)
         G = g.T
@@ -193,11 +198,11 @@ elif cmd == 'kinship':
     sys.stderr.write(options.geno+"\n")
     k3 = round(K3[0][0],4)
     if options.geno == 'data/small.geno':
-        assert k1==0.7939, "k1=%f" % k1
+        assert k1==0.8, "k1=%f" % k1
         assert k2==0.7939, "k2=%f" % k2
         assert k3==0.7939, "k3=%f" % k3
     if options.geno == 'data/small_na.geno':
-        assert k1==0.7172, "k1=%f" % k1
+        assert k1==0.8333, "k1=%f" % k1
         assert k2==0.7172, "k2=%f" % k2
         assert k3==0.7172, "k3=%f" % k3
     if options.geno == 'data/test8000.geno':
-- 
cgit 1.4.1


From 42c94e94414da06d85dbb6f53e77eb660624b5d8 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 16:35:20 +0300
Subject: README: Started release information and versioning of module

---
 wqflask/wqflask/my_pylmm/README.md         | 9 +++++++++
 wqflask/wqflask/my_pylmm/pyLMM/__init__.py | 1 +
 2 files changed, 10 insertions(+)
 create mode 100644 wqflask/wqflask/my_pylmm/README.md

diff --git a/wqflask/wqflask/my_pylmm/README.md b/wqflask/wqflask/my_pylmm/README.md
new file mode 100644
index 00000000..0c809610
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/README.md
@@ -0,0 +1,9 @@
+# RELEASE NOTES
+
+## 0.50-gn2-pre1
+
+1. This is the first test release of multi-core pylmm into GN2. Both
+   Kinship K calculation and GWAS have been made multi-threaded by
+   introducing the Python multiprocessing module. Note that only
+   run_other has been updated to use the new routines.
+  
\ No newline at end of file
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
index e69de29b..c40c3221 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/__init__.py
@@ -0,0 +1 @@
+PYLMM_VERSION="0.50-gn2-pre1"
-- 
cgit 1.4.1


From c4132c7ec5e0d720c4229563c9edddbcfc2e7fa6 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 17:03:53 +0300
Subject: GWAS: sort results so we always get the same list

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index ce7842f7..b901c0e2 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -110,13 +110,13 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    # PS = []
    # TS = []
    count = 0
+   job = 0
    jobs_running = 0
    for snp in G:
       snp_id = (snp,'SNPID')
       count += 1
-      # print count,snp_id
       if count % 1000 == 0:
-         job = count/1000
+         job += 1
          if verbose:
             sys.stderr.write("Job %d At SNP %d\n" % (job,count))
          if numThreads == 1:
@@ -126,7 +126,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
             j,lst = q.get()
             if verbose:
                sys.stderr.write("Job "+str(j)+" finished\n")
-            res.append(lst)
+            res.append((j,lst))
          else:
             p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
             jobs_running += 1
@@ -136,7 +136,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
                   j,lst = q.get_nowait()
                   if verbose:
                      sys.stderr.write("Job "+str(j)+" finished\n")
-                  res.append(lst)
+                  res.append((j,lst))
                   jobs_running -= 1
                except Queue.Empty:
                   time.sleep(0.1)
@@ -149,21 +149,25 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
       collect.append(snp_id)
 
    if numThreads==1 or count<1000 or len(collect)>0:
-      print "Running on 1 THREAD"
-      compute_snp(count/1000,n,collect,lmm2,reml,q)
+      job += 1
+      print "Collect final batch size %i job %i @%i: " % (len(collect), job, count)
+      compute_snp(job,n,collect,lmm2,reml,q)
       collect = []
       j,lst = q.get()
-      res.append(lst)
+      res.append((j,lst))
    print "count=",count," running=",jobs_running," collect=",len(collect)
    for job in range(jobs_running):
       j,lst = q.get(True,15) # time out
       if verbose:
          sys.stderr.write("Job "+str(j)+" finished\n")
-      res.append(lst)
-
-   if verbose:
-      print "res=",res[0][0:10]
-   print [len(res1) for res1 in res]
-   ts = [item[0] for res1 in res for item in res1]
-   ps = [item[1] for res1 in res for item in res1]
+      res.append((j,lst))
+
+   print "Before sort",[res1[0] for res1 in res]
+   res = sorted(res,key=lambda x: x[0])
+   # if verbose:
+   #    print "res=",res[0][0:10]
+   print "After sort",[res1[0] for res1 in res]
+   print [len(res1[1]) for res1 in res]
+   ts = [item[0] for j,res1 in res for item in res1]
+   ps = [item[1] for j,res1 in res for item in res1]
    return ts,ps
-- 
cgit 1.4.1


From e615b83d34bcc991e671241db71c6ad0c0267479 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 17:15:56 +0300
Subject: Release 0.50-gn2-pre1

---
 wqflask/wqflask/my_pylmm/README.md | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/README.md b/wqflask/wqflask/my_pylmm/README.md
index 0c809610..f6b0e72d 100644
--- a/wqflask/wqflask/my_pylmm/README.md
+++ b/wqflask/wqflask/my_pylmm/README.md
@@ -1,9 +1,21 @@
 # RELEASE NOTES
 
-## 0.50-gn2-pre1
+## 0.50-gn2-pre1 release
+
+This is the first test release of multi-core pylmm into GN2. Both
+kinship calculation and GWAS have been made multi-threaded by
+introducing the Python multiprocessing module. Note that only
+run_other has been updated to use the new routines (so human is still
+handled the old way). I have taken care that we can still run both
+old-style and new-style LMM (through passing the 'new_code'
+boolean). This could be an option in the web server for users to
+select and test for any unexpected differences (of which there should
+be none, naturally ;).
+
+The current version can handle missing phenotypes, but as they are
+removed there is no way for GN2 to know what SNPs the P-values belong
+to. A future version will pass a SNP index to allow for missing
+phenotypes.
+
 
-1. This is the first test release of multi-core pylmm into GN2. Both
-   Kinship K calculation and GWAS have been made multi-threaded by
-   introducing the Python multiprocessing module. Note that only
-   run_other has been updated to use the new routines.
   
\ No newline at end of file
-- 
cgit 1.4.1