8 files changed, 281 insertions, 177 deletions
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
index abeffee7..9565fb96 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/chunks.py
@@ -93,4 +93,4 @@ if __name__ == '__main__':
     _main()
     print("\nConfirming doctests...")
     import doctest
-    doctest.testmod()
\ No newline at end of file
+    doctest.testmod()
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/genotype.py b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
new file mode 100644
index 00000000..19b0957b
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/genotype.py
@@ -0,0 +1,36 @@
+# Genotype routines
+
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+
+def normalizeGenotype(g):
+    """
+    Run for every SNP list (for one individual) and return
+    normalized SNP genotype values with missing data filled in
+    """
+    g1 = np.copy(g)          # avoid side effects
+    x = True - np.isnan(g)   # Matrix of True/False
+    m = g[x].mean()          # Global mean value
+    s = np.sqrt(g[x].var())  # Global stddev
+    g1[np.isnan(g)] = m      # Plug-in mean values for missing data
+    if s == 0:
+        g1 = g1 - m          # Subtract the mean
+    else:
+        g1 = (g1 - m) / s    # Normalize the deviation
+    return g1
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
new file mode 100644
index 00000000..353784aa
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -0,0 +1,142 @@
+# pylmm kinship calculation
+#
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# env PYTHONPATH=$pylmm_lib_path:./lib python $pylmm_lib_path/runlmm.py --pheno test.pheno --geno test9000.geno kinship --test
+
+import sys
+import os
+import numpy as np
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue 
+
+from optmatrix import matrix_initialize, matrixMultT
+
+
+def compute_W(job,G,n,compute_size):
+   """
+   Read 1000 SNPs at a time into matrix and return the result
+   """
+   W = np.ones((n,compute_size)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs)
+   for j in range(0,compute_size):
+      row = job*compute_size + j
+      if row >= compute_size:
+         W = W[:,range(0,j)]
+         break
+      snp = G[job*compute_size+j]
+      # print snp.shape,snp
+      if snp.var() == 0:
+         continue
+      W[:,j] = snp  # set row to list of SNPs
+   return W
+
+def compute_matrixMult(job,W,q = None):
+   """
+   Compute Kinship(W)*j
+
+   For every set of SNPs matrixMult is used to multiply matrices T(W)*W
+   """
+   res = matrixMultT(W)
+   if not q: q=compute_matrixMult.q
+   q.put([job,res])
+   return job
+
+def f_init(q):
+    compute_matrixMult.q = q
+
+def kinship_full(G,options):
+    print G.shape
+    m = np.dot(G.T,G)
+    m = m/G.shape[0]
+    return m
+
+# Calculate the kinship matrix from G (SNPs as rows!), returns K
+#
+def kinship(G,options):
+    numThreads = None
+    if options.numThreads:
+        numThreads = int(options.numThreads)
+    options.computeSize = 1000
+    matrix_initialize(options.useBLAS)
+    
+    sys.stderr.write(str(G.shape)+"\n")
+    n = G.shape[1] # inds
+    inds = n
+    m = G.shape[0] # snps
+    snps = m
+    sys.stderr.write(str(m)+" SNPs\n")
+
+    q = mp.Queue()
+    p = mp.Pool(numThreads, f_init, [q])
+    iterations = snps/options.computeSize+1
+    if options.testing:
+      iterations = 8
+    # jobs = range(0,8) # range(0,iterations)
+
+    results = []
+
+    K = np.zeros((n,n))  # The Kinship matrix has dimension individuals x individuals
+
+    completed = 0
+    for job in range(iterations):
+       if options.verbose:
+          sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
+       W = compute_W(job,G,n,options.computeSize)
+       if numThreads == 1:
+          compute_matrixMult(job,W,q)
+          j,x = q.get()
+          if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+          K_j = x
+          # print j,K_j[:,0]
+          K = K + K_j
+       else:
+          results.append(p.apply_async(compute_matrixMult, (job,W)))
+          # Do we have a result?
+          try: 
+             j,x = q.get_nowait()
+             if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+             K_j = x
+             # print j,K_j[:,0]
+             K = K + K_j
+             completed += 1
+          except Queue.Empty:
+             pass
+
+    if numThreads == None or numThreads > 1:
+       for job in range(len(results)-completed):
+          j,x = q.get(True,15)
+          if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+          K_j = x
+          # print j,K_j[:,0]
+          K = K + K_j
+
+    print "kiship.kinship: ",K.shape,K
+    K = K / float(snps)
+    outFile = 'runtest.kin'
+    if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
+    np.savetxt(outFile,K)
+
+    if options.saveKvaKve:
+       if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
+       Kva,Kve = linalg.eigh(K)
+       if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
+       np.savetxt(outFile+".kva",Kva)
+       np.savetxt(outFile+".kve",Kve)
+    return K      
+
+
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 58d7593d..36c3602f 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -143,6 +143,7 @@ def run_human(pheno_vector,
         
         timestamp = datetime.datetime.utcnow().isoformat()
 
+        # Pickle chunks of input SNPs (from Plink interator) and compress them
         #print("Starting adding loop")
         for part, result in enumerate(results):
             #data = pickle.dumps(result, pickle.HIGHEST_PROTOCOL)
@@ -325,7 +326,7 @@ def matrixMult(A,B):
     return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
 
 
-def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
+def calculate_kinship(genotype_matrix, temp_data=None, is_testing=False):
     """
     genotype_matrix is an n x m matrix encoding SNP minor alleles.
     
@@ -360,7 +361,8 @@ def calculate_kinship(genotype_matrix, temp_data, is_testing=False):
         genotype_matrix[:,counter] = (genotype_matrix[:,counter] - values_mean) / np.sqrt(vr)
         
         percent_complete = int(round((counter/m)*45))
-        temp_data.store("percent_complete", percent_complete)
+        if temp_data != None:
+            temp_data.store("percent_complete", percent_complete)
         
     genotype_matrix = genotype_matrix[:,keep]
     print("genotype_matrix: ", pf(genotype_matrix))
@@ -406,6 +408,8 @@ def GWAS(pheno_vector,
     v = np.isnan(pheno_vector)
     if v.sum():
         keep = True - v
+        print(pheno_vector.shape,pheno_vector)
+        print(keep.shape,keep)
         pheno_vector = pheno_vector[keep]
         #genotype_matrix = genotype_matrix[keep,:]
         #covariate_matrix = covariate_matrix[keep,:]
@@ -437,6 +441,8 @@ def GWAS(pheno_vector,
                 p_values.append(0)
                 t_statistics.append(np.nan)
                 continue
+            
+            print(genotype_matrix.shape,pheno_vector.shape,keep.shape)
 
             pheno_vector = pheno_vector[keep]
             covariate_matrix = covariate_matrix[keep,:]
@@ -513,7 +519,7 @@ class LMM:
           Kve = []
        self.nonmissing = x
  
-       print("this K is:", pf(K))
+       print("this K is:", K.shape, pf(K))
        
        if len(Kva) == 0 or len(Kve) == 0:
           if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
@@ -525,8 +531,8 @@ class LMM:
        self.K = K
        self.Kva = Kva
        self.Kve = Kve
-       print("self.Kva is: ", pf(self.Kva))
-       print("self.Kve is: ", pf(self.Kve))
+       print("self.Kva is: ", self.Kva.shape, pf(self.Kva))
+       print("self.Kve is: ", self.Kve.shape, pf(self.Kve))
        self.Y = Y
        self.X0 = X0
        self.N = self.K.shape[0]
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
index abb72081..5c71db6a 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/optmatrix.py
@@ -9,7 +9,7 @@ from scipy import stats
 useNumpy = None
 hasBLAS = None
 
-def matrix_initialize(useBLAS):
+def matrix_initialize(useBLAS=True): 
     global useNumpy  # module based variable
     if useBLAS and useNumpy == None:
         print get_info('blas_opt')
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
new file mode 100644
index 00000000..c22da761
--- /dev/null
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -0,0 +1,40 @@
+# Phenotype routines
+
+# Copyright (C) 2013  Nicholas A. Furlotte (nick.furlotte@gmail.com)
+# Copyright (C) 2015  Pjotr Prins (pjotr.prins@thebird.nl)
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import numpy as np
+
+def removeMissingPhenotypes(y,g,verbose=False):
+    """
+    Remove missing data from matrices, make sure the genotype data has
+    individuals as rows
+    """
+    assert(y!=None)
+    assert(y.shape[0] == g.shape[0])
+
+    y1 = y
+    g1 = g
+    v = np.isnan(y)
+    keep = True - v
+    if v.sum():
+        if verbose: 
+            sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
+        y1 = y[keep]
+        g1 = g[keep,:]
+    return y1,g1
+
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py b/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
deleted file mode 100644
index e47c18e1..00000000
--- a/wqflask/wqflask/my_pylmm/pyLMM/process_plink.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from __future__ import absolute_import, print_function, division
-
-import sys
-sys.path.append("../../..")
-
-print("sys.path: ", sys.path)
-
-import numpy as np
-
-import zlib
-import cPickle as pickle
-import redis
-Redis = redis.Redis()
-
-import lmm
-
-class ProcessLmmChunk(object):
-    
-    def __init__(self):
-        self.get_snp_data()
-        self.get_lmm_vars()
-        
-        keep = self.trim_matrices()
-        
-        self.do_association(keep)
-        
-        print("p_values is: ", self.p_values)
-        
-    def get_snp_data(self):
-        plink_pickled = zlib.decompress(Redis.lpop("plink_inputs"))
-        plink_data = pickle.loads(plink_pickled)
-        
-        self.snps = np.array(plink_data['result'])
-        self.identifier = plink_data['identifier']
-        
-    def get_lmm_vars(self):
-        lmm_vars_pickled = Redis.hget(self.identifier, "lmm_vars")
-        lmm_vars = pickle.loads(lmm_vars_pickled)
-        
-        self.pheno_vector = np.array(lmm_vars['pheno_vector'])
-        self.covariate_matrix = np.array(lmm_vars['covariate_matrix'])
-        self.kinship_matrix = np.array(lmm_vars['kinship_matrix'])
-        
-    def trim_matrices(self):
-        v = np.isnan(self.pheno_vector)
-        keep = True - v
-        keep = keep.reshape((len(keep),))
-        
-        if v.sum():
-            self.pheno_vector = self.pheno_vector[keep]
-            self.covariate_matrix = self.covariate_matrix[keep,:]
-            self.kinship_matrix = self.kinship_matrix[keep,:][:,keep]
-
-        return keep
-    
-    def do_association(self, keep):
-        n = self.kinship_matrix.shape[0]
-        lmm_ob = lmm.LMM(self.pheno_vector,
-                    self.kinship_matrix,
-                    self.covariate_matrix)
-        lmm_ob.fit()
-    
-        self.p_values = []
-        
-        for snp in self.snps:
-            snp = snp[0]
-            p_value, t_stat = lmm.human_association(snp,
-                                        n,
-                                        keep,
-                                        lmm_ob,
-                                        self.pheno_vector,
-                                        self.covariate_matrix,
-                                        self.kinship_matrix,
-                                        False)
-        
-            self.p_values.append(p_value)
-            
-
-#plink_pickled = zlib.decompress(Redis.lpop("plink_inputs"))
-#
-#plink_data = pickle.loads(plink_pickled)
-#result = np.array(plink_data['result'])
-#print("snp size is: ", result.shape)
-#identifier = plink_data['identifier']
-#
-#lmm_vars_pickled = Redis.hget(identifier, "lmm_vars")
-#lmm_vars = pickle.loads(lmm_vars_pickled)
-#
-#pheno_vector = np.array(lmm_vars['pheno_vector'])
-#covariate_matrix = np.array(lmm_vars['covariate_matrix'])
-#kinship_matrix = np.array(lmm_vars['kinship_matrix'])
-#
-#v = np.isnan(pheno_vector)
-#keep = True - v
-#keep = keep.reshape((len(keep),))
-#print("keep is: ", keep)
-#
-#if v.sum():
-#    pheno_vector = pheno_vector[keep]
-#    covariate_matrix = covariate_matrix[keep,:]
-#    kinship_matrix = kinship_matrix[keep,:][:,keep]
-#
-#n = kinship_matrix.shape[0]
-#print("n is: ", n)
-#lmm_ob = lmm.LMM(pheno_vector,
-#            kinship_matrix,
-#            covariate_matrix)
-#lmm_ob.fit()
-#
-#p_values = []
-#
-#for snp in result:
-#    snp = snp[0]
-#    p_value, t_stat = lmm.human_association(snp,
-#                                n,
-#                                keep,
-#                                lmm_ob,
-#                                pheno_vector,
-#                                covariate_matrix,
-#                                kinship_matrix,
-#                                False)
-#
-#    p_values.append(p_value)
-    
-
-
-
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 738268be..627cc7a4 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -21,18 +21,22 @@ from optparse import OptionParser
 import sys
 import tsvreader
 import numpy as np
-from lmm import gn2_load_redis
+from lmm import gn2_load_redis, calculate_kinship
+from kinship import kinship, kinship_full
+from genotype import normalizeGenotype
+from phenotype import removeMissingPhenotypes
 
 usage = """
 python runlmm.py [options] command
 
   runlmm.py processing multiplexer reads standardised input formats
-  and calls the different routines
+  and calls the different routines (writes to stdout)
 
   Current commands are:
 
     parse        : only parse input files
     redis        : use Redis to call into GN2
+    kinship      : calculate (new) kinship matrix
 
   try --help for more information
 """
@@ -50,6 +54,13 @@ parser.add_option("--geno",dest="geno",
 parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
                   help="don't print status messages to stdout")
+parser.add_option("--blas", action="store_true", default=False, dest="useBLAS", help="Use BLAS instead of numpy matrix multiplication")
+parser.add_option("-t", "--threads",
+                  type="int", dest="numThreads", 
+                  help="Threads to use")
+parser.add_option("--saveKvaKve",
+                  action="store_true", dest="saveKvaKve", default=False,
+                  help="Testing mode")
 parser.add_option("--test",
                   action="store_true", dest="testing", default=False,
                   help="Testing mode")
@@ -63,6 +74,10 @@ if len(args) != 1:
 cmd = args[0]
 print "Command: ",cmd
 
+k = None
+y = None
+g = None
+
 if options.kinship:
     k = tsvreader.kinship(options.kinship)
     print k.shape
@@ -75,46 +90,38 @@ if options.geno:
     g = tsvreader.geno(options.geno)
     print g.shape
 
-def normalizeGenotype(G):
-    # Run for every SNP list (num individuals)
-    x = True - np.isnan(G)  # Matrix of True/False
-    m = G[x].mean()         # Global mean value
-    # print m
-    s = np.sqrt(G[x].var()) # Global stddev
-    # print s
-    G[np.isnan(G)] = m      # Plug-in mean values for missing
-    if s == 0:
-        G = G - m           # Subtract the mean
-    else:
-        G = (G - m) / s     # Normalize the deviation
-    return G
-
-# g = g.reshape((1, -1))[0]
-print("Before",g)
-gn = []
-for snp in g:
-    gn.append( normalizeGenotype(snp) )
-
-gn = g
-gn = np.array(gn)
-print("After1",gn)
-gnT = gn.T
-print("After",gnT)
-# G = gnT
-G = gnT
-print "G shape",G.shape
-# sys.exit(1)
-# assert(G[0,0]==-2.25726341)
-
-# Remove individuals with missing phenotypes
-v = np.isnan(y)
-keep = True - v
-if v.sum():
-   if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
-   y  = y[keep]
-   G = G[keep,:]
-   k = k[keep,:][:,keep]
-
 if cmd == 'redis':
-    ps, ts = gn2_load_redis('testrun','other',np.array(k),y,G,options.testing)
+    # Emulating the redis setup of GN2
+    gn = []
+    for ind_g in g:
+        gn.append( normalizeGenotype(ind_g) )
+    gnt = np.array(gn).T
+    Y,G = removeMissingPhenotypes(y,gnt,options.verbose)
+    print "G",G.shape,G
+    ps, ts = gn2_load_redis('testrun','other',k,Y,G,options.testing)
     print np.array(ps)
+    print round(ps[0],4)
+    assert(options.testing and round(ps[0],4)==0.7262)
+    print round(ps[-1],4)
+    assert(options.testing and round(ps[-1],4)==0.3461)
+elif cmd == 'kinship':
+    gn = []
+    for ind_g in g:
+        if len(gn)>=8000: break
+        gn.append( normalizeGenotype(ind_g) )
+    K = kinship_full(np.array(gn),options)
+    print "first Kinship method",K.shape,K
+    K = kinship(np.array(gn),options)
+    print "second Kinship method",K.shape,K
+    sys.exit(1)
+    gnt = np.array(gn).T
+    Y,g = removeMissingPhenotypes(y,gnt,options.verbose)
+    G = g
+    print G.shape,G
+    K = calculate_kinship(np.copy(G),None,options)
+    print G.shape,G
+    print "first Kinship method",K.shape,K
+    K = kinship(G.T,options)
+    assert(K[0][0]==1.28)
+else:
+    print "Doing nothing"