From e492c9d3812ee2f6d6c7b640e216af09710b91d7 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 11 Mar 2015 14:36:36 +0300
Subject: Adding multi-core kinship.py

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py   | 136 ++++++++++++++++++++++++++++
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py       |   4 +
 wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py |   2 +-
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py    |  59 +++++++-----
 4 files changed, 177 insertions(+), 24 deletions(-)
 create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/kinship.py

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
new file mode 100644
index 00000000..dc2d717d
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -0,0 +1,136 @@
+# pylmm kinship calculation
+#
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# env PYTHONPATH=$pylmm_lib_path:./lib python $pylmm_lib_path/runlmm.py --pheno test.pheno --geno test9000.geno kinship --test
+
+import sys
+import os
+import numpy as np
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue 
+
+from optmatrix import matrix_initialize, matrixMultT
+
+
+def compute_W(job,G,n,compute_size):
+   """
+   Read 1000 SNPs at a time into matrix and return the result
+   """
+   W = np.ones((n,compute_size)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs)
+   for j in range(0,compute_size):
+      row = job*compute_size + j
+      if row >= compute_size:
+         W = W[:,range(0,j)]
+         break
+      snp = G[job*compute_size+j]
+      # print snp.shape,snp
+      if snp.var() == 0:
+         continue
+      W[:,j] = snp  # set row to list of SNPs
+   return W
+
+def compute_matrixMult(job,W,q = None):
+   """
+   Compute Kinship(W)*j
+
+   For every set of SNPs matrixMult is used to multiply matrices T(W)*W
+   """
+   res = matrixMultT(W)
+   if not q: q=compute_matrixMult.q
+   q.put([job,res])
+   return job
+
+def f_init(q):
+    compute_matrixMult.q = q
+
+# Calculate the kinship matrix from G (SNPs as rows!), returns K
+#
+def kinship(G,options):
+    numThreads = None
+    if options.numThreads:
+        numThreads = int(options.numThreads)
+    options.computeSize = 1000
+    matrix_initialize(options.useBLAS)
+    
+    sys.stderr.write(str(G.shape)+"\n")
+    n = G.shape[1] # inds
+    inds = n
+    m = G.shape[0] # snps
+    snps = m
+    sys.stderr.write(str(m)+" SNPs\n")
+
+    q = mp.Queue()
+    p = mp.Pool(numThreads, f_init, [q])
+    iterations = snps/options.computeSize+1
+    if options.testing:
+      iterations = 8
+    # jobs = range(0,8) # range(0,iterations)
+
+    results = []
+
+    K = np.zeros((n,n))  # The Kinship matrix has dimension individuals x individuals
+
+    completed = 0
+    for job in range(iterations):
+       if options.verbose:
+          sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
+       W = compute_W(job,G,n,options.computeSize)
+       if numThreads == 1:
+          compute_matrixMult(job,W,q)
+          j,x = q.get()
+          if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+          K_j = x
+          # print j,K_j[:,0]
+          K = K + K_j
+       else:
+          results.append(p.apply_async(compute_matrixMult, (job,W)))
+          # Do we have a result?
+          try: 
+             j,x = q.get_nowait()
+             if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+             K_j = x
+             # print j,K_j[:,0]
+             K = K + K_j
+             completed += 1
+          except Queue.Empty:
+             pass
+
+    if numThreads == None or numThreads > 1:
+       for job in range(len(results)-completed):
+          j,x = q.get(True,15)
+          if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+          K_j = x
+          # print j,K_j[:,0]
+          K = K + K_j
+
+    print K.shape,K
+    K = K / float(snps)
+    outFile = 'runtest.kin'
+    if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
+    np.savetxt(outFile,K)
+
+    if options.saveKvaKve:
+       if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
+       Kva,Kve = linalg.eigh(K)
+       if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
+       np.savetxt(outFile+".kva",Kva)
+       np.savetxt(outFile+".kve",Kve)
+      
+
+
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 58d7593d..65b989a8 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -406,6 +406,8 @@ def GWAS(pheno_vector,
     v = np.isnan(pheno_vector)
     if v.sum():
         keep = True - v
+        print(pheno_vector.shape,pheno_vector)
+        print(keep.shape,keep)
         pheno_vector = pheno_vector[keep]
         #genotype_matrix = genotype_matrix[keep,:]
         #covariate_matrix = covariate_matrix[keep,:]
@@ -437,6 +439,8 @@ def GWAS(pheno_vector,
                 p_values.append(0)
                 t_statistics.append(np.nan)
                 continue
+            
+            print(genotype_matrix.shape,pheno_vector.shape,keep.shape)
 
             pheno_vector = pheno_vector[keep]
             covariate_matrix = covariate_matrix[keep,:]
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
index abb72081..5c71db6a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
@@ -9,7 +9,7 @@ from scipy import stats
 useNumpy = None
 hasBLAS = None
 
-def matrix_initialize(useBLAS):
+def matrix_initialize(useBLAS=True): 
     global useNumpy  # module based variable
     if useBLAS and useNumpy == None:
         print get_info('blas_opt')
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 738268be..bd6c55fc 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -22,17 +22,19 @@ import sys
 import tsvreader
 import numpy as np
 from lmm import gn2_load_redis
+from kinship import kinship
 
 usage = """
 python runlmm.py [options] command
 
   runlmm.py processing multiplexer reads standardised input formats
-  and calls the different routines
+  and calls the different routines (writes to stdout)
 
   Current commands are:
 
     parse        : only parse input files
     redis        : use Redis to call into GN2
+    kinship      : calculate (new) kinship matrix
 
   try --help for more information
 """
@@ -50,6 +52,13 @@ parser.add_option("--geno",dest="geno",
 parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
                   help="don't print status messages to stdout")
+parser.add_option("--blas", action="store_true", default=False, dest="useBLAS", help="Use BLAS instead of numpy matrix multiplication")
+parser.add_option("-t", "--threads",
+                  type="int", dest="numThreads", 
+                  help="Threads to use")
+parser.add_option("--saveKvaKve",
+                  action="store_true", dest="saveKvaKve", default=False,
+                  help="Testing mode")
 parser.add_option("--test",
                   action="store_true", dest="testing", default=False,
                   help="Testing mode")
@@ -63,6 +72,10 @@ if len(args) != 1:
 cmd = args[0]
 print "Command: ",cmd
 
+k = None
+y = None
+g = None
+
 if options.kinship:
     k = tsvreader.kinship(options.kinship)
     print k.shape
@@ -74,7 +87,7 @@ if options.pheno:
 if options.geno:
     g = tsvreader.geno(options.geno)
     print g.shape
-
+    
 def normalizeGenotype(G):
     # Run for every SNP list (num individuals)
     x = True - np.isnan(G)  # Matrix of True/False
@@ -89,32 +102,32 @@ def normalizeGenotype(G):
         G = (G - m) / s     # Normalize the deviation
     return G
 
-# g = g.reshape((1, -1))[0]
 print("Before",g)
-gn = []
-for snp in g:
-    gn.append( normalizeGenotype(snp) )
-
-gn = g
-gn = np.array(gn)
-print("After1",gn)
-gnT = gn.T
-print("After",gnT)
-# G = gnT
-G = gnT
+
+gT = g.T
+print("After",gT)
+G = np.array(gT)
 print "G shape",G.shape
-# sys.exit(1)
-# assert(G[0,0]==-2.25726341)
 
 # Remove individuals with missing phenotypes
-v = np.isnan(y)
-keep = True - v
-if v.sum():
-   if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
-   y  = y[keep]
-   G = G[keep,:]
-   k = k[keep,:][:,keep]
+if y != None:
+    v = np.isnan(y)
+    keep = True - v
+    if v.sum():
+       if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
+       y = y[keep]
+       G = G[keep,:]
+       if k != None: k = k[keep,:][:,keep]
 
 if cmd == 'redis':
     ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
     print np.array(ps)
+elif cmd == 'kinship':
+    gn = []
+    for snp in G.T:
+        gn.append( normalizeGenotype(snp) )
+    G2 = np.array(gn)
+    print G2.shape,G2
+    K = kinship(G2,options)
+else:
+    print "Doing nothing"
-- 
cgit 1.4.1


From 4bb1113a9c23882ad91634c7cb77537e47a09713 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 11 Mar 2015 14:44:21 +0300
Subject: runlmm.py: message

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index bd6c55fc..5b777580 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -114,7 +114,7 @@ if y != None:
     v = np.isnan(y)
     keep = True - v
     if v.sum():
-       if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
+       if options.verbose: sys.stderr.write("runlmm.py: Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
        y = y[keep]
        G = G[keep,:]
        if k != None: k = k[keep,:][:,keep]
-- 
cgit 1.4.1


From d7f25679f0369f9138300e23f49a958239e2a379 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 12 Mar 2015 11:09:08 +0300
Subject: process_plink.py: remove unused file

---
 wqflask/wqflask/my_pylmm/pyLMM/process_plink.py | 127 ------------------------
 1 file changed, 127 deletions(-)
 delete mode 100644 wqflask/wqflask/my_pylmm/pyLMM/process_plink.py

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
deleted file mode 100644
index e47c18e1..00000000
--- a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from __future__ import absolute_import, print_function, division
-
-import sys
-sys.path.append("../../..")
-
-print("sys.path: ", sys.path)
-
-import numpy as np
-
-import zlib
-import cPickle as pickle
-import redis
-Redis = redis.Redis()
-
-import lmm
-
-class ProcessLmmChunk(object):
-    
-    def __init__(self):
-        self.get_snp_data()
-        self.get_lmm_vars()
-        
-        keep = self.trim_matrices()
-        
-        self.do_association(keep)
-        
-        print("p_values is: ", self.p_values)
-        
-    def get_snp_data(self):
-        plink_pickled = zlib.decompress(Redis.lpop("plink_inputs"))
-        plink_data = pickle.loads(plink_pickled)
-        
-        self.snps = np.array(plink_data['result'])
-        self.identifier = plink_data['identifier']
-        
-    def get_lmm_vars(self):
-        lmm_vars_pickled = Redis.hget(self.identifier, "lmm_vars")
-        lmm_vars = pickle.loads(lmm_vars_pickled)
-        
-        self.pheno_vector = np.array(lmm_vars['pheno_vector'])
-        self.covariate_matrix = np.array(lmm_vars['covariate_matrix'])
-        self.kinship_matrix = np.array(lmm_vars['kinship_matrix'])
-        
-    def trim_matrices(self):
-        v = np.isnan(self.pheno_vector)
-        keep = True - v
-        keep = keep.reshape((len(keep),))
-        
-        if v.sum():
-            self.pheno_vector = self.pheno_vector[keep]
-            self.covariate_matrix = self.covariate_matrix[keep,:]
-            self.kinship_matrix = self.kinship_matrix[keep,:][:,keep]
-
-        return keep
-    
-    def do_association(self, keep):
-        n = self.kinship_matrix.shape[0]
-        lmm_ob = lmm.LMM(self.pheno_vector,
-                    self.kinship_matrix,
-                    self.covariate_matrix)
-        lmm_ob.fit()
-    
-        self.p_values = []
-        
-        for snp in self.snps:
-            snp = snp[0]
-            p_value, t_stat = lmm.human_association(snp,
-                                        n,
-                                        keep,
-                                        lmm_ob,
-                                        self.pheno_vector,
-                                        self.covariate_matrix,
-                                        self.kinship_matrix,
-                                        False)
-        
-            self.p_values.append(p_value)
-            
-
-#plink_pickled = zlib.decompress(Redis.lpop("plink_inputs"))
-#
-#plink_data = pickle.loads(plink_pickled)
-#result = np.array(plink_data['result'])
-#print("snp size is: ", result.shape)
-#identifier = plink_data['identifier']
-#
-#lmm_vars_pickled = Redis.hget(identifier, "lmm_vars")
-#lmm_vars = pickle.loads(lmm_vars_pickled)
-#
-#pheno_vector = np.array(lmm_vars['pheno_vector'])
-#covariate_matrix = np.array(lmm_vars['covariate_matrix'])
-#kinship_matrix = np.array(lmm_vars['kinship_matrix'])
-#
-#v = np.isnan(pheno_vector)
-#keep = True - v
-#keep = keep.reshape((len(keep),))
-#print("keep is: ", keep)
-#
-#if v.sum():
-#    pheno_vector = pheno_vector[keep]
-#    covariate_matrix = covariate_matrix[keep,:]
-#    kinship_matrix = kinship_matrix[keep,:][:,keep]
-#
-#n = kinship_matrix.shape[0]
-#print("n is: ", n)
-#lmm_ob = lmm.LMM(pheno_vector,
-#            kinship_matrix,
-#            covariate_matrix)
-#lmm_ob.fit()
-#
-#p_values = []
-#
-#for snp in result:
-#    snp = snp[0]
-#    p_value, t_stat = lmm.human_association(snp,
-#                                n,
-#                                keep,
-#                                lmm_ob,
-#                                pheno_vector,
-#                                covariate_matrix,
-#                                kinship_matrix,
-#                                False)
-#
-#    p_values.append(p_value)
-    
-
-
-
-- 
cgit 1.4.1


From 3fb9db1d8b1d975b2af1d56c8ea0dbd6b39c9001 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 12 Mar 2015 12:02:17 +0300
Subject: Debug info

---
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 65b989a8..48a931c5 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -143,6 +143,7 @@ def run_human(pheno_vector,
         
         timestamp = datetime.datetime.utcnow().isoformat()
 
+        # Pickle chunks of input SNPs (from Plink interator) and compress them
         #print("Starting adding loop")
         for part, result in enumerate(results):
             #data = pickle.dumps(result, pickle.HIGHEST_PROTOCOL)
@@ -517,7 +518,7 @@ class LMM:
           Kve = []
        self.nonmissing = x
  
-       print("this K is:", pf(K))
+       print("this K is:", K.shape, pf(K))
        
        if len(Kva) == 0 or len(Kve) == 0:
           if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
@@ -529,8 +530,8 @@ class LMM:
        self.K = K
        self.Kva = Kva
        self.Kve = Kve
-       print("self.Kva is: ", pf(self.Kva))
-       print("self.Kve is: ", pf(self.Kve))
+       print("self.Kva is: ", self.Kva.shape, pf(self.Kva))
+       print("self.Kve is: ", self.Kve.shape, pf(self.Kve))
        self.Y = Y
        self.X0 = X0
        self.N = self.K.shape[0]
-- 
cgit 1.4.1


From bd8e4f30cd1eb0a41399d8d94b516845da3c32a0 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 12 Mar 2015 17:10:54 +0300
Subject: Check G

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 1 +
 1 file changed, 1 insertion(+)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 5b777580..0b727383 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -120,6 +120,7 @@ if y != None:
        if k != None: k = k[keep,:][:,keep]
 
 if cmd == 'redis':
+    print G
     ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
     print np.array(ps)
 elif cmd == 'kinship':
-- 
cgit 1.4.1


From 494edec995a38279eb1da21d520b2cb249eb1079 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 12 Mar 2015 17:18:38 +0300
Subject: weird fix for lmm result - Python weirdness

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 0b727383..e64ca0e6 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -102,6 +102,12 @@ def normalizeGenotype(G):
         G = (G - m) / s     # Normalize the deviation
     return G
 
+gn = []
+for snp in g:
+    gn.append( normalizeGenotype(snp) )
+
+gn = g
+
 print("Before",g)
 
 gT = g.T
-- 
cgit 1.4.1


From fa4227614042312a54a5d404af4235efc077ab3a Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 12 Mar 2015 21:05:51 +0300
Subject: lmm: use deep-copy to avoid side-effects

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index e64ca0e6..d22e2912 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -88,29 +88,28 @@ if options.geno:
     g = tsvreader.geno(options.geno)
     print g.shape
     
-def normalizeGenotype(G):
+def normalizeGenotype(g1):
+    g = np.copy(g1)         # avoid side effects
     # Run for every SNP list (num individuals)
-    x = True - np.isnan(G)  # Matrix of True/False
-    m = G[x].mean()         # Global mean value
+    x = True - np.isnan(g)  # Matrix of True/False
+    m = g[x].mean()         # Global mean value
     # print m
-    s = np.sqrt(G[x].var()) # Global stddev
+    s = np.sqrt(g[x].var()) # Global stddev
     # print s
-    G[np.isnan(G)] = m      # Plug-in mean values for missing
+    g[np.isnan(g)] = m      # Plug-in mean values for missing
     if s == 0:
-        G = G - m           # Subtract the mean
+        g = g - m           # Subtract the mean
     else:
-        G = (G - m) / s     # Normalize the deviation
-    return G
+        g = (g - m) / s     # Normalize the deviation
+    return g
 
 gn = []
 for snp in g:
     gn.append( normalizeGenotype(snp) )
 
-gn = g
-
 print("Before",g)
 
-gT = g.T
+gT = np.array(gn).T
 print("After",gT)
 G = np.array(gT)
 print "G shape",G.shape
@@ -129,6 +128,7 @@ if cmd == 'redis':
     print G
     ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
     print np.array(ps)
+    assert(options.testing and ps[0]==0.726205761108)
 elif cmd == 'kinship':
     gn = []
     for snp in G.T:
-- 
cgit 1.4.1


From 7ec5303a6eca758236103e191f574bb5480a0ba6 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 13 Mar 2015 09:57:49 +0300
Subject: & runlmm.py: added assertions for default run

---
 wqflask/wqflask/my_pylmm/pyLMM/chunks.py | 2 +-
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
index abeffee7..9565fb96 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
@@ -93,4 +93,4 @@ if __name__ == '__main__':
     _main()
     print("\nConfirming doctests...")
     import doctest
-    doctest.testmod()
\ No newline at end of file
+    doctest.testmod()
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index d22e2912..94533ecb 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -125,10 +125,14 @@ if y != None:
        if k != None: k = k[keep,:][:,keep]
 
 if cmd == 'redis':
+    # Emulating the redis setup of GN2
     print G
     ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
     print np.array(ps)
-    assert(options.testing and ps[0]==0.726205761108)
+    print round(ps[0],4)
+    assert(options.testing and round(ps[0],4)==0.7262)
+    print round(ps[-1],4)
+    assert(options.testing and round(ps[-1],4)==0.3461)
 elif cmd == 'kinship':
     gn = []
     for snp in G.T:
-- 
cgit 1.4.1


From 7c7ace427c8dfa81d3448250d9697b4e00418d34 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 13 Mar 2015 11:48:58 +0300
Subject: & Refactoring

---
 wqflask/wqflask/my_pylmm/pyLMM/genotype.py  | 36 ++++++++++++++++++++++
 wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 40 ++++++++++++++++++++++++
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py    | 47 ++++++-----------------------
 3 files changed, 85 insertions(+), 38 deletions(-)
 create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/genotype.py
 create mode 100644 wqflask/wqflask/my_pylmm/pyLMM/phenotype.py

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
new file mode 100644
index 00000000..19b0957b
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -0,0 +1,36 @@
+# Genotype routines
+
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+
+def normalizeGenotype(g):
+    """
+    Run for every SNP list (for one individual) and return
+    normalized SNP genotype values with missing data filled in
+    """
+    g1 = np.copy(g)          # avoid side effects
+    x = True - np.isnan(g)   # Matrix of True/False
+    m = g[x].mean()          # Global mean value
+    s = np.sqrt(g[x].var())  # Global stddev
+    g1[np.isnan(g)] = m      # Plug-in mean values for missing data
+    if s == 0:
+        g1 = g1 - m          # Subtract the mean
+    else:
+        g1 = (g1 - m) / s    # Normalize the deviation
+    return g1
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
new file mode 100644
index 00000000..c22da761
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -0,0 +1,40 @@
+# Phenotype routines
+
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import numpy as np
+
+def removeMissingPhenotypes(y,g,verbose=False):
+    """
+    Remove missing data from matrices, make sure the genotype data has
+    individuals as rows
+    """
+    assert(y!=None)
+    assert(y.shape[0] == g.shape[0])
+
+    y1 = y
+    g1 = g
+    v = np.isnan(y)
+    keep = True - v
+    if v.sum():
+        if verbose: 
+            sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
+        y1 = y[keep]
+        g1 = g[keep,:]
+    return y1,g1
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 94533ecb..16e88419 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -23,6 +23,8 @@ import tsvreader
 import numpy as np
 from lmm import gn2_load_redis
 from kinship import kinship
+from genotype import normalizeGenotype
+from phenotype import removeMissingPhenotypes
 
 usage = """
 python runlmm.py [options] command
@@ -87,47 +89,16 @@ if options.pheno:
 if options.geno:
     g = tsvreader.geno(options.geno)
     print g.shape
-    
-def normalizeGenotype(g1):
-    g = np.copy(g1)         # avoid side effects
-    # Run for every SNP list (num individuals)
-    x = True - np.isnan(g)  # Matrix of True/False
-    m = g[x].mean()         # Global mean value
-    # print m
-    s = np.sqrt(g[x].var()) # Global stddev
-    # print s
-    g[np.isnan(g)] = m      # Plug-in mean values for missing
-    if s == 0:
-        g = g - m           # Subtract the mean
-    else:
-        g = (g - m) / s     # Normalize the deviation
-    return g
-
-gn = []
-for snp in g:
-    gn.append( normalizeGenotype(snp) )
-
-print("Before",g)
-
-gT = np.array(gn).T
-print("After",gT)
-G = np.array(gT)
-print "G shape",G.shape
-
-# Remove individuals with missing phenotypes
-if y != None:
-    v = np.isnan(y)
-    keep = True - v
-    if v.sum():
-       if options.verbose: sys.stderr.write("runlmm.py: Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
-       y = y[keep]
-       G = G[keep,:]
-       if k != None: k = k[keep,:][:,keep]
 
 if cmd == 'redis':
     # Emulating the redis setup of GN2
-    print G
-    ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
+    gn = []
+    for ind_g in g:
+        gn.append( normalizeGenotype(ind_g) )
+    gnt = np.array(gn).T
+    Y,G = removeMissingPhenotypes(y,gnt,options.verbose)
+    print "G",G.shape,G
+    ps, ts = gn2_load_redis('testrun','other',np.array(k),Y,G,options.testing)
     print np.array(ps)
     print round(ps[0],4)
     assert(options.testing and round(ps[0],4)==0.7262)
-- 
cgit 1.4.1


From 08054314542485da15fc2d2c1903d844348a4fe3 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 13 Mar 2015 11:50:43 +0300
Subject: runlmm.py: remove extraneous conversion

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 16e88419..69a80411 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -98,7 +98,7 @@ if cmd == 'redis':
     gnt = np.array(gn).T
     Y,G = removeMissingPhenotypes(y,gnt,options.verbose)
     print "G",G.shape,G
-    ps, ts = gn2_load_redis('testrun','other',np.array(k),Y,G,options.testing)
+    ps, ts = gn2_load_redis('testrun','other',k,Y,G,options.testing)
     print np.array(ps)
     print round(ps[0],4)
     assert(options.testing and round(ps[0],4)==0.7262)
-- 
cgit 1.4.1


From 7b47dd4a9ca1227a0df4f3e7bf2358633944de0f Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 08:39:44 +0300
Subject: kinship: working on pylmm

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 10 ++++++++--
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py     |  5 +++--
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py  | 26 +++++++++++++++++++-------
 3 files changed, 30 insertions(+), 11 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index dc2d717d..353784aa 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -58,6 +58,12 @@ def compute_matrixMult(job,W,q = None):
 def f_init(q):
     compute_matrixMult.q = q
 
+def kinship_full(G,options):
+    print G.shape
+    m = np.dot(G.T,G)
+    m = m/G.shape[0]
+    return m
+
 # Calculate the kinship matrix from G (SNPs as rows!), returns K
 #
 def kinship(G,options):
@@ -118,7 +124,7 @@ def kinship(G,options):
           # print j,K_j[:,0]
           K = K + K_j
 
-    print K.shape,K
+    print "kiship.kinship: ",K.shape,K
     K = K / float(snps)
     outFile = 'runtest.kin'
     if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
@@ -130,7 +136,7 @@ def kinship(G,options):
        if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
        np.savetxt(outFile+".kva",Kva)
        np.savetxt(outFile+".kve",Kve)
-      
+    return K      
 
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 48a931c5..36c3602f 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -326,7 +326,7 @@ def matrixMult(A,B):
     return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
 
 
-def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
+def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False):
     """
     genotype_matrix is an n x m matrix encoding SNP minor alleles.
     
@@ -361,7 +361,8 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
         genotype_matrix[:,counter] = (genotype_matrix[:,counter] - values_mean) / np.sqrt(vr)
         
         percent_complete = int(round((counter/m)*45))
-        temp_data.store("percent_complete", percent_complete)
+        if temp_data != None:
+            temp_data.store("percent_complete", percent_complete)
         
     genotype_matrix = genotype_matrix[:,keep]
     print("genotype_matrix: ", pf(genotype_matrix))
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 69a80411..627cc7a4 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -21,8 +21,8 @@ from optparse import OptionParser
 import sys
 import tsvreader
 import numpy as np
-from lmm import gn2_load_redis
-from kinship import kinship
+from lmm import gn2_load_redis, calculate_kinship
+from kinship import kinship, kinship_full
 from genotype import normalizeGenotype
 from phenotype import removeMissingPhenotypes
 
@@ -106,10 +106,22 @@ if cmd == 'redis':
     assert(options.testing and round(ps[-1],4)==0.3461)
 elif cmd == 'kinship':
     gn = []
-    for snp in G.T:
-        gn.append( normalizeGenotype(snp) )
-    G2 = np.array(gn)
-    print G2.shape,G2
-    K = kinship(G2,options)
+    for ind_g in g:
+        if len(gn)>=8000: break
+        gn.append( normalizeGenotype(ind_g) )
+    K = kinship_full(np.array(gn),options)
+    print "first Kinship method",K.shape,K
+    K = kinship(np.array(gn),options)
+    print "second Kinship method",K.shape,K
+    sys.exit(1)
+    gnt = np.array(gn).T
+    Y,g = removeMissingPhenotypes(y,gnt,options.verbose)
+    G = g
+    print G.shape,G
+    K = calculate_kinship(np.copy(G),None,options)
+    print G.shape,G
+    print "first Kinship method",K.shape,K
+    K = kinship(G.T,options)
+    assert(K[0][0]==1.28)
 else:
     print "Doing nothing"
-- 
cgit 1.4.1


From 5dfb2fdc0a739d86fc1b6c0230d43dcb05428092 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 10:21:25 +0300
Subject: Testing kinship works on small G

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py |  9 +++++----
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py  | 29 +++++++++++++++++++++--------
 2 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 353784aa..61da68fc 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -27,16 +27,17 @@ import Queue
 from optmatrix import matrix_initialize, matrixMultT
 
 
-def compute_W(job,G,n,compute_size):
+def compute_W(job,G,n,snps,compute_size):
    """
    Read 1000 SNPs at a time into matrix and return the result
    """
    W = np.ones((n,compute_size)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs)
    for j in range(0,compute_size):
       row = job*compute_size + j
-      if row >= compute_size:
+      if row >= compute_size or row>=snps:
          W = W[:,range(0,j)]
          break
+      # print job,compute_size,j
       snp = G[job*compute_size+j]
       # print snp.shape,snp
       if snp.var() == 0:
@@ -79,6 +80,7 @@ def kinship(G,options):
     m = G.shape[0] # snps
     snps = m
     sys.stderr.write(str(m)+" SNPs\n")
+    assert m>n, "n should be larger than m (snps>inds)"
 
     q = mp.Queue()
     p = mp.Pool(numThreads, f_init, [q])
@@ -95,7 +97,7 @@ def kinship(G,options):
     for job in range(iterations):
        if options.verbose:
           sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
-       W = compute_W(job,G,n,options.computeSize)
+       W = compute_W(job,G,n,snps,options.computeSize)
        if numThreads == 1:
           compute_matrixMult(job,W,q)
           j,x = q.get()
@@ -124,7 +126,6 @@ def kinship(G,options):
           # print j,K_j[:,0]
           K = K + K_j
 
-    print "kiship.kinship: ",K.shape,K
     K = K / float(snps)
     outFile = 'runtest.kin'
     if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 627cc7a4..35f6e9a9 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -51,6 +51,9 @@ parser.add_option("--pheno",dest="pheno",
                   help="Phenotype file format 1.0")
 parser.add_option("--geno",dest="geno",
                   help="Genotype file format 1.0")
+parser.add_option("--skip-genotype-normalization",
+                  action="store_true", dest="skip_genotype_normalization", default=False,
+                  help="Skip genotype normalization")
 parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
                   help="don't print status messages to stdout")
@@ -96,8 +99,11 @@ if cmd == 'redis':
     for ind_g in g:
         gn.append( normalizeGenotype(ind_g) )
     gnt = np.array(gn).T
-    Y,G = removeMissingPhenotypes(y,gnt,options.verbose)
-    print "G",G.shape,G
+    if y:
+        Y,G = removeMissingPhenotypes(y,gnt,options.verbose)
+        print "G",G.shape,G
+    else:
+        G = gnt
     ps, ts = gn2_load_redis('testrun','other',k,Y,G,options.testing)
     print np.array(ps)
     print round(ps[0],4)
@@ -108,17 +114,24 @@ elif cmd == 'kinship':
     gn = []
     for ind_g in g:
         if len(gn)>=8000: break
-        gn.append( normalizeGenotype(ind_g) )
-    K = kinship_full(np.array(gn),options)
-    print "first Kinship method",K.shape,K
-    K = kinship(np.array(gn),options)
-    print "second Kinship method",K.shape,K
+        if options.skip_genotype_normalization:
+          gn.append(ind_g)
+        else:
+            gn.append( normalizeGenotype(ind_g) )
+    G = np.array(gn)
+    print G.shape, "\n", G
+    K = kinship_full(G,options)
+    print "first Kinship method",K.shape,"\n",K
+    K2 = calculate_kinship(np.copy(G.T),None,options)
+    print "GN2 Kinship method",K2.shape,"\n",K2
+    K3 = kinship(G,options)
+    print "third Kinship method",K3.shape,"\n",K3
     sys.exit(1)
     gnt = np.array(gn).T
     Y,g = removeMissingPhenotypes(y,gnt,options.verbose)
     G = g
     print G.shape,G
-    K = calculate_kinship(np.copy(G),None,options)
+    K = calculate_kinship(np.copy(G),temp_data=None,is_testing=options.testing)
     print G.shape,G
     print "first Kinship method",K.shape,K
     K = kinship(G.T,options)
-- 
cgit 1.4.1


From 30cc25c26263f10aa19548c70a18178f2b3ca59e Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 10:56:50 +0300
Subject: Replace missing values with MAF

---
 wqflask/wqflask/my_pylmm/pyLMM/genotype.py  | 31 +++++++++++++++++++++--------
 wqflask/wqflask/my_pylmm/pyLMM/phenotype.py |  2 +-
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py    | 27 ++++++++++++++++---------
 3 files changed, 42 insertions(+), 18 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
index 19b0957b..e2457f6b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -17,20 +17,35 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import numpy as np
+from collections import Counter
 
-def normalizeGenotype(g):
+def replace_missing_with_MAF(snp_g):
+    """
+    Replace the missing genotype with the minor allele frequency (MAF)
+    in the snp row
+    """
+    g1 = np.copy(snp_g)
+    cnt = Counter(g1)
+    print cnt
+    min_val = min(cnt.itervalues())
+    print "min_val=",min_val
+    l = [k for k, v in cnt.iteritems() if v == min_val and not np.isnan(k)]
+    print "l=",l[0]
+    return [l[0] if np.isnan(snp) else snp for snp in g1] 
+    
+def normalize(ind_g):
     """
     Run for every SNP list (for one individual) and return
     normalized SNP genotype values with missing data filled in
     """
-    g1 = np.copy(g)          # avoid side effects
-    x = True - np.isnan(g)   # Matrix of True/False
-    m = g[x].mean()          # Global mean value
-    s = np.sqrt(g[x].var())  # Global stddev
-    g1[np.isnan(g)] = m      # Plug-in mean values for missing data
+    g1 = np.copy(ind_g)          # avoid side effects
+    x = True - np.isnan(ind_g)   # Matrix of True/False
+    m = ind_g[x].mean()          # Global mean value
+    s = np.sqrt(ind_g[x].var())  # Global stddev
+    g1[np.isnan(ind_g)] = m      # Plug-in mean values for missing data
     if s == 0:
-        g1 = g1 - m          # Subtract the mean
+        g1 = g1 - m              # Subtract the mean
     else:
-        g1 = (g1 - m) / s    # Normalize the deviation
+        g1 = (g1 - m) / s        # Normalize the deviation
     return g1
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
index c22da761..bb620052 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -19,7 +19,7 @@
 import sys
 import numpy as np
 
-def removeMissingPhenotypes(y,g,verbose=False):
+def remove_missing(y,g,verbose=False):
     """
     Remove missing data from matrices, make sure the genotype data has
     individuals as rows
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 35f6e9a9..ffe25fcf 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -23,8 +23,8 @@ import tsvreader
 import numpy as np
 from lmm import gn2_load_redis, calculate_kinship
 from kinship import kinship, kinship_full
-from genotype import normalizeGenotype
-from phenotype import removeMissingPhenotypes
+import genotype
+import phenotype
 
 usage = """
 python runlmm.py [options] command
@@ -51,6 +51,9 @@ parser.add_option("--pheno",dest="pheno",
                   help="Phenotype file format 1.0")
 parser.add_option("--geno",dest="geno",
                   help="Genotype file format 1.0")
+parser.add_option("--maf-normalization",
+                  action="store_true", dest="maf_normalization", default=False,
+                  help="Apply MAF genotype normalization")
 parser.add_option("--skip-genotype-normalization",
                   action="store_true", dest="skip_genotype_normalization", default=False,
                   help="Skip genotype normalization")
@@ -97,10 +100,10 @@ if cmd == 'redis':
     # Emulating the redis setup of GN2
     gn = []
     for ind_g in g:
-        gn.append( normalizeGenotype(ind_g) )
+        gn.append( genotype.normalize(ind_g) )
     gnt = np.array(gn).T
     if y:
-        Y,G = removeMissingPhenotypes(y,gnt,options.verbose)
+        Y,G = phenotype.remove_missing(y,gnt,options.verbose)
         print "G",G.shape,G
     else:
         G = gnt
@@ -111,14 +114,20 @@ if cmd == 'redis':
     print round(ps[-1],4)
     assert(options.testing and round(ps[-1],4)==0.3461)
 elif cmd == 'kinship':
-    gn = []
+    G = g
+    print G.shape, "\n", G
+    if options.maf_normalization:
+        g1 = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+        print "MAF: ",g1
+    sys.exit()
     for ind_g in g:
         if len(gn)>=8000: break
         if options.skip_genotype_normalization:
-          gn.append(ind_g)
+            gn.append(ind_g)
         else:
-            gn.append( normalizeGenotype(ind_g) )
-    G = np.array(gn)
+            gn.append( genotype.normalize(ind_g) )
+        G = np.array(gn)
+            
     print G.shape, "\n", G
     K = kinship_full(G,options)
     print "first Kinship method",K.shape,"\n",K
@@ -128,7 +137,7 @@ elif cmd == 'kinship':
     print "third Kinship method",K3.shape,"\n",K3
     sys.exit(1)
     gnt = np.array(gn).T
-    Y,g = removeMissingPhenotypes(y,gnt,options.verbose)
+    Y,g = remove_missing_phenotypes(y,gnt,options.verbose)
     G = g
     print G.shape,G
     K = calculate_kinship(np.copy(G),temp_data=None,is_testing=options.testing)
-- 
cgit 1.4.1


From 6e6cae20d29de14ab1d0a5dc8e38ebb9162aa639 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 11:28:36 +0300
Subject: More MAF

---
 wqflask/wqflask/my_pylmm/pyLMM/genotype.py | 15 +++++++--------
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py   | 16 +++++-----------
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
index e2457f6b..f5d9ee8c 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -18,20 +18,19 @@
 
 import numpy as np
 from collections import Counter
+import operator
 
 def replace_missing_with_MAF(snp_g):
     """
     Replace the missing genotype with the minor allele frequency (MAF)
     in the snp row
     """
-    g1 = np.copy(snp_g)
-    cnt = Counter(g1)
-    print cnt
-    min_val = min(cnt.itervalues())
-    print "min_val=",min_val
-    l = [k for k, v in cnt.iteritems() if v == min_val and not np.isnan(k)]
-    print "l=",l[0]
-    return [l[0] if np.isnan(snp) else snp for snp in g1] 
+    cnt = Counter(snp_g)
+    tuples = sorted(cnt.items(), key=operator.itemgetter(1))
+    l2 = [t for t in tuples if not np.isnan(t[0])]
+    maf = l2[0][0]
+    res = np.array([maf if np.isnan(snp) else snp for snp in snp_g])
+    return res
     
 def normalize(ind_g):
     """
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index ffe25fcf..0b8830d4 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -115,18 +115,12 @@ if cmd == 'redis':
     assert(options.testing and round(ps[-1],4)==0.3461)
 elif cmd == 'kinship':
     G = g
-    print G.shape, "\n", G
+    print "Original G",G.shape, "\n", G
     if options.maf_normalization:
-        g1 = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
-        print "MAF: ",g1
-    sys.exit()
-    for ind_g in g:
-        if len(gn)>=8000: break
-        if options.skip_genotype_normalization:
-            gn.append(ind_g)
-        else:
-            gn.append( genotype.normalize(ind_g) )
-        G = np.array(gn)
+        G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+        print "MAF replacements: \n",G
+    if not options.skip_genotype_normalization:
+        G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
             
     print G.shape, "\n", G
     K = kinship_full(G,options)
-- 
cgit 1.4.1


From 2c6d1fcba1138415ecb3ca447e09d06d660af0db Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 12:14:53 +0300
Subject: Working on kinship: GN2 and simple matrix multiplicatino agree

---
 wqflask/wqflask/my_pylmm/pyLMM/genotype.py |  2 +-
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py  |  4 ++++
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py      |  4 ++--
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py   | 23 +++++++++++------------
 4 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
index f5d9ee8c..315fd824 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -23,7 +23,7 @@ import operator
 def replace_missing_with_MAF(snp_g):
     """
     Replace the missing genotype with the minor allele frequency (MAF)
-    in the snp row
+    in the snp row. It is rather slow!
     """
     cnt = Counter(snp_g)
     tuples = sorted(cnt.items(), key=operator.itemgetter(1))
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 61da68fc..c1750e1d 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -61,6 +61,10 @@ def f_init(q):
 
 def kinship_full(G,options):
     print G.shape
+    m = G.shape[0] # snps
+    n = G.shape[1] # inds
+    sys.stderr.write(str(m)+" SNPs\n")
+    assert m>n, "n should be larger than m (snps>inds)"
     m = np.dot(G.T,G)
     m = m/G.shape[0]
     return m
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 36c3602f..7bf77be5 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -340,8 +340,8 @@ def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False):
     print("genotype 2D matrix m (snps) is:", m)
     keep = []
     for counter in range(m):
-        if is_testing and counter>8:
-            break
+        # if is_testing and counter>8:
+        #     break
         #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter]))
         #Checks if any values in column are not numbers
         not_number = np.isnan(genotype_matrix[:,counter])
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 0b8830d4..80478368 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -116,28 +116,27 @@ if cmd == 'redis':
 elif cmd == 'kinship':
     G = g
     print "Original G",G.shape, "\n", G
+    if y:
+        gnt = np.array(gn).T
+        Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+        G = g.T
+        print "Removed missing phenotypes",G.shape, "\n", G
     if options.maf_normalization:
         G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
         print "MAF replacements: \n",G
     if not options.skip_genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
             
-    print G.shape, "\n", G
     K = kinship_full(G,options)
+    print "Genotype",G.shape, "\n", G
     print "first Kinship method",K.shape,"\n",K
-    K2 = calculate_kinship(np.copy(G.T),None,options)
+    K2 = calculate_kinship(np.copy(G.T),temp_data=None,is_testing=options.testing)
+    print "Genotype",G.shape, "\n", G
     print "GN2 Kinship method",K2.shape,"\n",K2
-    K3 = kinship(G,options)
+
+    print "Genotype",G.shape, "\n", G
+    K3 = kinship(np.copy(G),options)
     print "third Kinship method",K3.shape,"\n",K3
-    sys.exit(1)
-    gnt = np.array(gn).T
-    Y,g = remove_missing_phenotypes(y,gnt,options.verbose)
-    G = g
-    print G.shape,G
-    K = calculate_kinship(np.copy(G),temp_data=None,is_testing=options.testing)
-    print G.shape,G
-    print "first Kinship method",K.shape,K
-    K = kinship(G.T,options)
     assert(K[0][0]==1.28)
 else:
     print "Doing nothing"
-- 
cgit 1.4.1


From 0533dc4c053165853faa0512229eef30a174c425 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 12:29:35 +0300
Subject: Removing all references to is_testing

---
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 7bf77be5..87b999e8 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -255,8 +255,8 @@ def run_other(pheno_vector,
         genotype_matrix,
         restricted_max_likelihood=True,
         refit=False,
-        tempdata=None,      # <---- can not be None
-        is_testing=False):
+        tempdata=None      # <---- can not be None
+        ):
     
     """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics
     
@@ -268,9 +268,9 @@ def run_other(pheno_vector,
     """
     
     print("In run_other")
-    print("REML=",restricted_max_likelihood," REFIT=",refit, "TESTING=",is_testing)
+    print("REML=",restricted_max_likelihood," REFIT=",refit)
     with Bench("Calculate Kinship"):
-        kinship_matrix = calculate_kinship(genotype_matrix, tempdata, is_testing)
+        kinship_matrix = calculate_kinship(genotype_matrix, tempdata)
     
     print("kinship_matrix: ", pf(kinship_matrix))
     print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
@@ -326,7 +326,7 @@ def matrixMult(A,B):
     return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
 
 
-def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False):
+def calculate_kinship(genotype_matrix, temp_data=None):
     """
     genotype_matrix is an n x m matrix encoding SNP minor alleles.
     
@@ -340,8 +340,6 @@ def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False):
     print("genotype 2D matrix m (snps) is:", m)
     keep = []
     for counter in range(m):
-        # if is_testing and counter>8:
-        #     break
         #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter]))
         #Checks if any values in column are not numbers
         not_number = np.isnan(genotype_matrix[:,counter])
@@ -490,7 +488,7 @@ class LMM:
           is not done consistently.
  
     """
-    def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True,is_testing=False):
+    def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=True):
  
        """
        The constructor takes a phenotype vector or array of size n.
@@ -500,7 +498,6 @@ class LMM:
        When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
        """
 
-       self.is_testing = True
        if X0 == None: X0 = np.ones(len(Y)).reshape(len(Y),1)
        self.verbose = verbose
  
@@ -728,7 +725,7 @@ class LMM:
        pl.title(title)
 
 
-def gn2_redis(key,species,is_testing=False):
+def gn2_redis(key,species):
     json_params = Redis.get(key)
     
     params = json.loads(json_params)
@@ -753,8 +750,7 @@ def gn2_redis(key,species,is_testing=False):
                   genotype_matrix = geno,
                   restricted_max_likelihood = params['restricted_max_likelihood'],
                   refit = params['refit'],
-                  tempdata = tempdata,
-                  is_testing=is_testing)
+                  tempdata = tempdata)
         
     results_key = "pylmm:results:" + params['temp_uuid']
 
@@ -779,7 +775,7 @@ def gn2_main():
 
     gn2_redis(key,species)
 
-def gn2_load_redis(key,species,kinship,pheno,geno,is_testing):
+def gn2_load_redis(key,species,kinship,pheno,geno):
     print("Loading Redis from parsed data")
     params = dict(pheno_vector = pheno.tolist(),
                   genotype_matrix = geno.tolist(),
@@ -796,7 +792,7 @@ def gn2_load_redis(key,species,kinship,pheno,geno,is_testing):
     Redis.set(key, json_params)
     Redis.expire(key, 60*60)
 
-    return gn2_redis(key,species,is_testing)
+    return gn2_redis(key,species)
     
 if __name__ == '__main__':
     print("WARNING: Calling pylmm from lmm.py will become OBSOLETE, use runlmm.py instead!")
-- 
cgit 1.4.1


From fc42cd5910bbc021e1fd51b579b2f83a421a33b8 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 12:47:25 +0300
Subject: Allow empty K to be passed in

---
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py    |  7 ++++++-
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 28 ++++++++++++++++------------
 2 files changed, 22 insertions(+), 13 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 87b999e8..d6830787 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -338,6 +338,7 @@ def calculate_kinship(genotype_matrix, temp_data=None):
     m = genotype_matrix.shape[1]
     print("genotype 2D matrix n (inds) is:", n)
     print("genotype 2D matrix m (snps) is:", m)
+    assert m>n, "n should be larger than m (snps>inds)"
     keep = []
     for counter in range(m):
         #print("type of genotype_matrix[:,counter]:", pf(genotype_matrix[:,counter]))
@@ -777,9 +778,13 @@ def gn2_main():
 
 def gn2_load_redis(key,species,kinship,pheno,geno):
     print("Loading Redis from parsed data")
+    if kinship == None:
+        k = None
+    else:
+        k = kinship.tolist()
     params = dict(pheno_vector = pheno.tolist(),
                   genotype_matrix = geno.tolist(),
-                  kinship_matrix= kinship.tolist(),
+                  kinship_matrix= k,
                   restricted_max_likelihood = True,
                   refit = False,
                   temp_uuid = "testrun_temp_uuid",
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 80478368..7a77ad0a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -98,16 +98,20 @@ if options.geno:
 
 if cmd == 'redis':
     # Emulating the redis setup of GN2
-    gn = []
-    for ind_g in g:
-        gn.append( genotype.normalize(ind_g) )
-    gnt = np.array(gn).T
-    if y:
-        Y,G = phenotype.remove_missing(y,gnt,options.verbose)
-        print "G",G.shape,G
-    else:
-        G = gnt
-    ps, ts = gn2_load_redis('testrun','other',k,Y,G,options.testing)
+    G = g
+    print "Original G",G.shape, "\n", G
+    if y != None:
+        gnt = np.array(g).T
+        Y,g = phenotype.remove_missing(y,g.T,options.verbose)
+        G = g.T
+        print "Removed missing phenotypes",G.shape, "\n", G
+    if options.maf_normalization:
+        G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
+        print "MAF replacements: \n",G
+    if not options.skip_genotype_normalization:
+        G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+
+    ps, ts = gn2_load_redis('testrun','other',k,Y,G.T)
     print np.array(ps)
     print round(ps[0],4)
     assert(options.testing and round(ps[0],4)==0.7262)
@@ -116,8 +120,8 @@ if cmd == 'redis':
 elif cmd == 'kinship':
     G = g
     print "Original G",G.shape, "\n", G
-    if y:
-        gnt = np.array(gn).T
+    if y != None:
+        gnt = np.array(g).T
         Y,g = phenotype.remove_missing(y,g.T,options.verbose)
         G = g.T
         print "Removed missing phenotypes",G.shape, "\n", G
-- 
cgit 1.4.1


From 60476f068766371c41eb17c2ad20f4f6ce7da2d5 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 13:58:02 +0300
Subject: Adding assertions

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 7a77ad0a..8c0b73eb 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -113,10 +113,19 @@ if cmd == 'redis':
 
     ps, ts = gn2_load_redis('testrun','other',k,Y,G.T)
     print np.array(ps)
-    print round(ps[0],4)
-    assert(options.testing and round(ps[0],4)==0.7262)
-    print round(ps[-1],4)
-    assert(options.testing and round(ps[-1],4)==0.3461)
+    # Test results
+    p1 = round(ps[0],4)
+    p2 = round(ps[-1],4)
+    sys.stderr.write(options.geno+"\n")
+    if options.geno == 'data/small.geno':
+        assert p1==0.0708, "p1=%f" % p1
+        assert p2==0.1417, "p2=%f" % p2
+    if options.geno == 'data/small_na.geno':
+        assert p1==0.0958, "p1=%f" % p1
+        assert p2==0.0435, "p2=%f" % p2
+    if options.geno == 'data/test8000.geno':
+        assert p1==0.8984, "p1=%f" % p1
+        assert p2==0.9623, "p2=%f" % p2
 elif cmd == 'kinship':
     G = g
     print "Original G",G.shape, "\n", G
-- 
cgit 1.4.1


From 3c8c9103606736abf3e891cb000dc01e8d0c8a43 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 14:17:42 +0300
Subject: Adding tests

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 8c0b73eb..21d9240b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -143,13 +143,24 @@ elif cmd == 'kinship':
     K = kinship_full(G,options)
     print "Genotype",G.shape, "\n", G
     print "first Kinship method",K.shape,"\n",K
-    K2 = calculate_kinship(np.copy(G.T),temp_data=None,is_testing=options.testing)
+    K2 = calculate_kinship(np.copy(G.T),temp_data=None)
     print "Genotype",G.shape, "\n", G
     print "GN2 Kinship method",K2.shape,"\n",K2
 
     print "Genotype",G.shape, "\n", G
     K3 = kinship(np.copy(G),options)
     print "third Kinship method",K3.shape,"\n",K3
-    assert(K[0][0]==1.28)
+    sys.stderr.write(options.geno+"\n")
+    k1 = round(K[0][0],4)
+    k2 = round(K2[0][0],4)
+    k3 = round(K3[0][0],4)
+    if options.geno == 'data/small.geno':
+        assert k1==0.7939, "k1=%f" % k1
+        assert k2==0.7939, "k1=%f" % k1
+        assert k3==0.7939, "k1=%f" % k1
+    if options.geno == 'data/small_na.geno':
+        assert k1==0.7172, "k1=%f" % k1
+        assert k2==0.7172, "k1=%f" % k1
+        assert k3==0.7172, "k1=%f" % k1
 else:
     print "Doing nothing"
-- 
cgit 1.4.1


From 24b44a2de4fb47c2ed6a6a3def3acce1c06bebf4 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 15:26:07 +0300
Subject: Conditional run of K calculation

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py |  2 +-
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py  | 19 ++++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index c1750e1d..1dca1c57 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -59,7 +59,7 @@ def compute_matrixMult(job,W,q = None):
 def f_init(q):
     compute_matrixMult.q = q
 
-def kinship_full(G,options):
+def kinship_full(G):
     print G.shape
     m = G.shape[0] # snps
     n = G.shape[1] # inds
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 21d9240b..26485136 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -139,20 +139,21 @@ elif cmd == 'kinship':
         print "MAF replacements: \n",G
     if not options.skip_genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
-            
-    K = kinship_full(G,options)
-    print "Genotype",G.shape, "\n", G
-    print "first Kinship method",K.shape,"\n",K
-    K2 = calculate_kinship(np.copy(G.T),temp_data=None)
-    print "Genotype",G.shape, "\n", G
-    print "GN2 Kinship method",K2.shape,"\n",K2
 
+    if True:
+        K = kinship_full(G)
+        print "Genotype",G.shape, "\n", G
+        print "first Kinship method",K.shape,"\n",K
+        k1 = round(K[0][0],4)
+        K2 = calculate_kinship(np.copy(G.T),temp_data=None)
+        print "Genotype",G.shape, "\n", G
+        print "GN2 Kinship method",K2.shape,"\n",K2
+        k2 = round(K2[0][0],4)
+    
     print "Genotype",G.shape, "\n", G
     K3 = kinship(np.copy(G),options)
     print "third Kinship method",K3.shape,"\n",K3
     sys.stderr.write(options.geno+"\n")
-    k1 = round(K[0][0],4)
-    k2 = round(K2[0][0],4)
     k3 = round(K3[0][0],4)
     if options.geno == 'data/small.geno':
         assert k1==0.7939, "k1=%f" % k1
-- 
cgit 1.4.1


From 4931530a6c32eaf56d1586154bc7ea872c0de2ba Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 15:51:48 +0300
Subject: Tests and indentation

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 159 +++++++++++++++---------------
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py  |  13 ++-
 2 files changed, 89 insertions(+), 83 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 1dca1c57..8caa753b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -57,91 +57,94 @@ def compute_matrixMult(job,W,q = None):
    return job
 
 def f_init(q):
-    compute_matrixMult.q = q
+   compute_matrixMult.q = q
 
 def kinship_full(G):
-    print G.shape
-    m = G.shape[0] # snps
-    n = G.shape[1] # inds
-    sys.stderr.write(str(m)+" SNPs\n")
-    assert m>n, "n should be larger than m (snps>inds)"
-    m = np.dot(G.T,G)
-    m = m/G.shape[0]
-    return m
+   """
+   Calculate the Kinship matrix using a full dot multiplication
+   """
+   print G.shape
+   m = G.shape[0] # snps
+   n = G.shape[1] # inds
+   sys.stderr.write(str(m)+" SNPs\n")
+   assert m>n, "n should be larger than m (snps>inds)"
+   m = np.dot(G.T,G)
+   m = m/G.shape[0]
+   return m
 
 # Calculate the kinship matrix from G (SNPs as rows!), returns K
 #
 def kinship(G,options):
-    numThreads = None
-    if options.numThreads:
-        numThreads = int(options.numThreads)
-    options.computeSize = 1000
-    matrix_initialize(options.useBLAS)
-    
-    sys.stderr.write(str(G.shape)+"\n")
-    n = G.shape[1] # inds
-    inds = n
-    m = G.shape[0] # snps
-    snps = m
-    sys.stderr.write(str(m)+" SNPs\n")
-    assert m>n, "n should be larger than m (snps>inds)"
-
-    q = mp.Queue()
-    p = mp.Pool(numThreads, f_init, [q])
-    iterations = snps/options.computeSize+1
-    if options.testing:
+   numThreads = None
+   if options.numThreads:
+      numThreads = int(options.numThreads)
+   options.computeSize = 1000
+   matrix_initialize(options.useBLAS)
+   
+   sys.stderr.write(str(G.shape)+"\n")
+   n = G.shape[1] # inds
+   inds = n
+   m = G.shape[0] # snps
+   snps = m
+   sys.stderr.write(str(m)+" SNPs\n")
+   assert m>n, "n should be larger than m (snps>inds)"
+
+   q = mp.Queue()
+   p = mp.Pool(numThreads, f_init, [q])
+   iterations = snps/options.computeSize+1
+   if options.testing:
       iterations = 8
-    # jobs = range(0,8) # range(0,iterations)
-
-    results = []
-
-    K = np.zeros((n,n))  # The Kinship matrix has dimension individuals x individuals
-
-    completed = 0
-    for job in range(iterations):
-       if options.verbose:
-          sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
-       W = compute_W(job,G,n,snps,options.computeSize)
-       if numThreads == 1:
-          compute_matrixMult(job,W,q)
-          j,x = q.get()
-          if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
-          K_j = x
-          # print j,K_j[:,0]
-          K = K + K_j
-       else:
-          results.append(p.apply_async(compute_matrixMult, (job,W)))
-          # Do we have a result?
-          try: 
-             j,x = q.get_nowait()
-             if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
-             K_j = x
-             # print j,K_j[:,0]
-             K = K + K_j
-             completed += 1
-          except Queue.Empty:
-             pass
-
-    if numThreads == None or numThreads > 1:
-       for job in range(len(results)-completed):
-          j,x = q.get(True,15)
-          if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
-          K_j = x
-          # print j,K_j[:,0]
-          K = K + K_j
-
-    K = K / float(snps)
-    outFile = 'runtest.kin'
-    if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
-    np.savetxt(outFile,K)
-
-    if options.saveKvaKve:
-       if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
-       Kva,Kve = linalg.eigh(K)
-       if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
-       np.savetxt(outFile+".kva",Kva)
-       np.savetxt(outFile+".kve",Kve)
-    return K      
+   # jobs = range(0,8) # range(0,iterations)
+
+   results = []
+
+   K = np.zeros((n,n))  # The Kinship matrix has dimension individuals x individuals
+
+   completed = 0
+   for job in range(iterations):
+      if options.verbose:
+         sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
+      W = compute_W(job,G,n,snps,options.computeSize)
+      if numThreads == 1:
+         compute_matrixMult(job,W,q)
+         j,x = q.get()
+         if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+         K_j = x
+         # print j,K_j[:,0]
+         K = K + K_j
+      else:
+         results.append(p.apply_async(compute_matrixMult, (job,W)))
+         # Do we have a result?
+         try: 
+            j,x = q.get_nowait()
+            if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+            K_j = x
+            # print j,K_j[:,0]
+            K = K + K_j
+            completed += 1
+         except Queue.Empty:
+            pass
+
+   if numThreads == None or numThreads > 1:
+      for job in range(len(results)-completed):
+         j,x = q.get(True,15)
+         if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+         K_j = x
+         # print j,K_j[:,0]
+         K = K + K_j
+
+   K = K / float(snps)
+   outFile = 'runtest.kin'
+   if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
+   np.savetxt(outFile,K)
+
+   if options.saveKvaKve:
+      if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
+      Kva,Kve = linalg.eigh(K)
+      if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
+      np.savetxt(outFile+".kva",Kva)
+      np.savetxt(outFile+".kve",Kve)
+   return K      
 
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 26485136..547ac7f1 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -140,7 +140,7 @@ elif cmd == 'kinship':
     if not options.skip_genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
 
-    if True:
+    if False:
         K = kinship_full(G)
         print "Genotype",G.shape, "\n", G
         print "first Kinship method",K.shape,"\n",K
@@ -157,11 +157,14 @@ elif cmd == 'kinship':
     k3 = round(K3[0][0],4)
     if options.geno == 'data/small.geno':
         assert k1==0.7939, "k1=%f" % k1
-        assert k2==0.7939, "k1=%f" % k1
-        assert k3==0.7939, "k1=%f" % k1
+        assert k2==0.7939, "k2=%f" % k2
+        assert k3==0.7939, "k3=%f" % k3
     if options.geno == 'data/small_na.geno':
         assert k1==0.7172, "k1=%f" % k1
-        assert k2==0.7172, "k1=%f" % k1
-        assert k3==0.7172, "k1=%f" % k1
+        assert k2==0.7172, "k2=%f" % k2
+        assert k3==0.7172, "k3=%f" % k3
+    if options.geno == 'data/test8000.geno':
+        assert k3==1.4352, "k3=%f" % k3
+
 else:
     print "Doing nothing"
-- 
cgit 1.4.1


From 7759c755cb8d525ba3739864021921888bdf47e9 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 16:17:03 +0300
Subject: Parallel K calculation is fixed

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 33 +++++++++++++++----------------
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py  |  5 ++++-
 2 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 8caa753b..a5497a77 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -26,20 +26,31 @@ import Queue
 
 from optmatrix import matrix_initialize, matrixMultT
 
+def kinship_full(G):
+   """
+   Calculate the Kinship matrix using a full dot multiplication
+   """
+   print G.shape
+   m = G.shape[0] # snps
+   n = G.shape[1] # inds
+   sys.stderr.write(str(m)+" SNPs\n")
+   assert m>n, "n should be larger than m (snps>inds)"
+   m = np.dot(G.T,G)
+   m = m/G.shape[0]
+   return m
 
 def compute_W(job,G,n,snps,compute_size):
    """
    Read 1000 SNPs at a time into matrix and return the result
    """
-   W = np.ones((n,compute_size)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs)
+   m = compute_size
+   W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs)
    for j in range(0,compute_size):
-      row = job*compute_size + j
-      if row >= compute_size or row>=snps:
+      pos = job*m + j # real position
+      if pos >= snps:
          W = W[:,range(0,j)]
          break
-      # print job,compute_size,j
       snp = G[job*compute_size+j]
-      # print snp.shape,snp
       if snp.var() == 0:
          continue
       W[:,j] = snp  # set row to list of SNPs
@@ -59,18 +70,6 @@ def compute_matrixMult(job,W,q = None):
 def f_init(q):
    compute_matrixMult.q = q
 
-def kinship_full(G):
-   """
-   Calculate the Kinship matrix using a full dot multiplication
-   """
-   print G.shape
-   m = G.shape[0] # snps
-   n = G.shape[1] # inds
-   sys.stderr.write(str(m)+" SNPs\n")
-   assert m>n, "n should be larger than m (snps>inds)"
-   m = np.dot(G.T,G)
-   m = m/G.shape[0]
-   return m
 
 # Calculate the kinship matrix from G (SNPs as rows!), returns K
 #
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 547ac7f1..a322f988 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -70,6 +70,9 @@ parser.add_option("--saveKvaKve",
 parser.add_option("--test",
                   action="store_true", dest="testing", default=False,
                   help="Testing mode")
+parser.add_option("--test-kinship",
+                  action="store_true", dest="test_kinship", default=False,
+                  help="Testing mode for Kinship calculation")
 
 (options, args) = parser.parse_args()
 
@@ -140,7 +143,7 @@ elif cmd == 'kinship':
     if not options.skip_genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
 
-    if False:
+    if options.test_kinship:
         K = kinship_full(G)
         print "Genotype",G.shape, "\n", G
         print "first Kinship method",K.shape,"\n",K
-- 
cgit 1.4.1


From 58f43e9e767890031c77e44d10939c48bc1c81fe Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 16:45:37 +0300
Subject: Kinship multi-core: don't overload the queue with jobs

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index a5497a77..967eca81 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -22,7 +22,8 @@ import sys
 import os
 import numpy as np
 import multiprocessing as mp # Multiprocessing is part of the Python stdlib
-import Queue 
+import Queue
+import time
 
 from optmatrix import matrix_initialize, matrixMultT
 
@@ -90,6 +91,8 @@ def kinship(G,options):
 
    q = mp.Queue()
    p = mp.Pool(numThreads, f_init, [q])
+   cpu_num = mp.cpu_count()
+   print "CPU cores:",cpu_num
    iterations = snps/options.computeSize+1
    if options.testing:
       iterations = 8
@@ -105,6 +108,7 @@ def kinship(G,options):
          sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
       W = compute_W(job,G,n,snps,options.computeSize)
       if numThreads == 1:
+         # Single-core
          compute_matrixMult(job,W,q)
          j,x = q.get()
          if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
@@ -112,25 +116,29 @@ def kinship(G,options):
          # print j,K_j[:,0]
          K = K + K_j
       else:
+         # Multi-core
          results.append(p.apply_async(compute_matrixMult, (job,W)))
          # Do we have a result?
-         try: 
-            j,x = q.get_nowait()
-            if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
-            K_j = x
-            # print j,K_j[:,0]
-            K = K + K_j
-            completed += 1
-         except Queue.Empty:
-            pass
-
+         while (len(results)-completed>cpu_num):
+            try: 
+               j,x = q.get_nowait()
+               if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+               K_j = x
+               # print j,K_j[:,0]
+               K = K + K_j
+               completed += 1
+            except Queue.Empty:
+               pass
+         
    if numThreads == None or numThreads > 1:
+      # results contains the growing result set
       for job in range(len(results)-completed):
          j,x = q.get(True,15)
          if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
          K_j = x
          # print j,K_j[:,0]
          K = K + K_j
+         completed += 1
 
    K = K / float(snps)
    outFile = 'runtest.kin'
-- 
cgit 1.4.1


From c784f2fb2e3e4ad6cb6d6425aa7ec1596c2bd132 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 16:52:47 +0300
Subject: Allow queue to be 2xCPU. Disable output files

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 967eca81..1383c782 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -71,7 +71,6 @@ def compute_matrixMult(job,W,q = None):
 def f_init(q):
    compute_matrixMult.q = q
 
-
 # Calculate the kinship matrix from G (SNPs as rows!), returns K
 #
 def kinship(G,options):
@@ -119,7 +118,7 @@ def kinship(G,options):
          # Multi-core
          results.append(p.apply_async(compute_matrixMult, (job,W)))
          # Do we have a result?
-         while (len(results)-completed>cpu_num):
+         while (len(results)-completed>cpu_num*2):
             try: 
                j,x = q.get_nowait()
                if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
@@ -141,16 +140,17 @@ def kinship(G,options):
          completed += 1
 
    K = K / float(snps)
-   outFile = 'runtest.kin'
-   if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
-   np.savetxt(outFile,K)
-
-   if options.saveKvaKve:
-      if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
-      Kva,Kve = linalg.eigh(K)
-      if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
-      np.savetxt(outFile+".kva",Kva)
-      np.savetxt(outFile+".kve",Kve)
+   
+   # outFile = 'runtest.kin'
+   # if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
+   # np.savetxt(outFile,K)
+
+   # if options.saveKvaKve:
+   #    if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
+   #    Kva,Kve = linalg.eigh(K)
+   #    if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
+   #    np.savetxt(outFile+".kva",Kva)
+   #    np.savetxt(outFile+".kve",Kve)
    return K      
 
 
-- 
cgit 1.4.1


From 086ce413e270d2ed07a1a385f84e569e3a478a39 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 17:08:10 +0300
Subject: Allow the GC to do its job

---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index a322f988..469ba6c9 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -113,6 +113,8 @@ if cmd == 'redis':
         print "MAF replacements: \n",G
     if not options.skip_genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+    g = None
+    gnt = None
 
     ps, ts = gn2_load_redis('testrun','other',k,Y,G.T)
     print np.array(ps)
@@ -142,6 +144,8 @@ elif cmd == 'kinship':
         print "MAF replacements: \n",G
     if not options.skip_genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
+    g = None
+    gnt = None
 
     if options.test_kinship:
         K = kinship_full(G)
-- 
cgit 1.4.1


From 38cb5d5c0606481cbc34bd1a1187264d1fc1b085 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 14 Mar 2015 18:28:02 +0300
Subject: Threading: Sleep while waiting

---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py | 1 +
 1 file changed, 1 insertion(+)

(limited to 'wqflask')

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index 1383c782..9ab48510 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -119,6 +119,7 @@ def kinship(G,options):
          results.append(p.apply_async(compute_matrixMult, (job,W)))
          # Do we have a result?
          while (len(results)-completed>cpu_num*2):
+            time.sleep(0.1)
             try: 
                j,x = q.get_nowait()
                if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
-- 
cgit 1.4.1