From 73ed2f67bd0478473b887902ae96caaa0fca58d4 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 17 Mar 2015 13:10:49 +0300
Subject: GWAS: one result is missing

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py   | 33 +++++++++++++-----------------
 wqflask/wqflask/my_pylmm/pyLMM/input.py  |  2 ++
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py    | 10 ++++++---
 wqflask/wqflask/my_pylmm/pyLMM/lmm2.py   | 17 ++++++++++------
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py | 35 +++++++++++++++++---------------
 5 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index 2a472717..20083bde 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -66,7 +66,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    """
    matrix_initialize()
    cpu_num = mp.cpu_count()
-   numThreads = None
+   numThreads = None # for now use all available threads
    kfile2 = False
    reml = restricted_max_likelihood
 
@@ -110,10 +110,7 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    # PS = []
    # TS = []
    count = 0
-
-   completed = 0
-   last_j = 0
-   # for snp_id in G:
+   jobs_running = 0
    for snp in G:
       snp_id = (snp,'SNPID')
       count += 1
@@ -129,13 +126,12 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
             j,lst = q.get()
             if verbose:
                sys.stderr.write("Job "+str(j)+" finished\n")
-            # for line in lines:
-            #    out.write(line)
             res.append(lst)
          else:
             p.apply_async(compute_snp,(job,n,collect,lmm2,reml))
+            jobs_running += 1
             collect = []
-            while job > completed:
+            while jobs_running:
                try:
                   j,lst = q.get_nowait()
                   if verbose:
@@ -143,34 +139,33 @@ def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
                   # for line in lines:
                   #    out.write(line)
                   res.append(lst)
-                  completed += 1
+                  jobs_running -= 1
                except Queue.Empty:
                   pass
-               if job > completed + cpu_num*2:
-                  time.sleep(0.1)
+               if jobs_running + cpu_num*2 > 0:
+                  time.sleep(1.0)
                else:
-                  if job >= completed:
+                  if jobs_running > 0:
                     break
 
       collect.append(snp_id)
 
-   if numThreads==1:
+   if numThreads==1 or count<1000:
       print "Running on 1 THREAD"
       compute_snp(count/1000,n,collect,lmm2,reml,q)
       j,lst = q.get()
-      # for line in lines:
-      #    out.write(line)
       res.append(lst)
    else:
-      for job in range(int(count/1000)-completed):
+      print "count=",count," running=",jobs_running," collect=",len(collect)
+      for job in range(jobs_running):
          j,lst = q.get(True,15) # time out
          if verbose:
             sys.stderr.write("Job "+str(j)+" finished\n")
          res.append(lst)
 
-   # print res
+   if verbose:
+      print "res=",res[0][0:10]
+   print [len(res1) for res1 in res]
    ts = [item[0] for res1 in res for item in res1]
    ps = [item[1] for res1 in res for item in res1]
-   # ts = [item[1] for item in res]
-   # print ps
    return ts,ps
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/input.py b/wqflask/wqflask/my_pylmm/pyLMM/input.py
index f7b556a5..7063fedf 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/input.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/input.py
@@ -135,6 +135,8 @@ class plink:
     def normalizeGenotype(self,G):
         # print "Before",G
         # print G.shape
+        print "call input.normalizeGenotype"
+        raise "This should not be used"
         x = True - np.isnan(G)
         m = G[x].mean()
         s = np.sqrt(G[x].var())
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 2014ffb8..8a24d98b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -285,7 +285,7 @@ def run_other_old(pheno_vector,
     # with Bench("LMM_ob fitting"):
     #     lmm_ob.fit()
 
-    print("genotype_matrix: ", genotype_matrix.shape)
+    print("run_other_old genotype_matrix: ", genotype_matrix.shape)
     print(genotype_matrix)
 
     with Bench("Doing GWAS"):
@@ -320,6 +320,7 @@ def run_other_new(pheno_vector,
     # Adjust phenotypes
     Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True)
     print("Removed missing phenotypes",Y.shape)
+
     # if options.maf_normalization:
     #     G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
     #     print "MAF replacements: \n",G
@@ -337,7 +338,7 @@ def run_other_new(pheno_vector,
     # with Bench("LMM_ob fitting"):
     #     lmm_ob.fit()
 
-    print("genotype_matrix: ", G.shape)
+    print("run_other_new genotype_matrix: ", G.shape)
     print(G)
 
     with Bench("Doing GWAS"):
@@ -388,7 +389,9 @@ def calculate_kinship_new(genotype_matrix, temp_data=None):
     Call the new kinship calculation where genotype_matrix contains
     inds (columns) by snps (rows).
     """
+    print("call genotype.normalize")
     G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix)
+    print("call calculate_kinship_new")
     return kinship(G.T),G # G gets transposed, we'll turn this into an iterator (FIXME)
 
 def calculate_kinship_old(genotype_matrix, temp_data=None):
@@ -399,6 +402,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
     normalizes the resulting vectors and returns the RRM matrix.
     
     """
+    print("call calculate_kinship_old")
     n = genotype_matrix.shape[0]
     m = genotype_matrix.shape[1]
     print("genotype 2D matrix n (inds) is:", n)
@@ -429,7 +433,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
             temp_data.store("percent_complete", percent_complete)
         
     genotype_matrix = genotype_matrix[:,keep]
-    print("genotype_matrix: ", pf(genotype_matrix))
+    print("After kinship (old) genotype_matrix: ", pf(genotype_matrix))
     kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m)
     return kinship_matrix,genotype_matrix
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
index d50b0111..d4b3ac82 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -22,6 +22,7 @@ from scipy.linalg import eigh, inv, det
 import scipy.stats as stats # t-tests
 from scipy import optimize
 from optmatrix import matrixMult
+import kinship
 
 def calculateKinship(W,center=False):
       """
@@ -177,13 +178,17 @@ class LMM2:
 	 Kve = []
       self.nonmissing = x
 
-      if len(Kva) == 0 or len(Kve) == 0:
-	 if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
-	 begin = time.time()
-	 Kva,Kve = eigh(K)
-	 end = time.time()
-	 if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+      print("this K is:", K.shape, K)
       
+      if len(Kva) == 0 or len(Kve) == 0:
+          # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
+          begin = time.time()
+          # Kva,Kve = linalg.eigh(K)
+          Kva,Kve = kinship.kvakve(K)
+          end = time.time()
+          if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
+          print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
+
       self.K = K
       self.Kva = Kva
       self.Kve = Kve
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 1af5a443..708c9185 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -54,9 +54,9 @@ parser.add_option("--geno",dest="geno",
 parser.add_option("--maf-normalization",
                   action="store_true", dest="maf_normalization", default=False,
                   help="Apply MAF genotype normalization")
-parser.add_option("--skip-genotype-normalization",
-                  action="store_true", dest="skip_genotype_normalization", default=False,
-                  help="Skip genotype normalization")
+parser.add_option("--genotype-normalization",
+                  action="store_true", dest="genotype_normalization", default=False,
+                  help="Force genotype normalization")
 parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
                   help="don't print status messages to stdout")
@@ -100,7 +100,8 @@ if options.geno:
     print g.shape
 
 if cmd == 'redis_new':
-    # Emulating the redis setup of GN2
+    # The main difference between redis_new and redis is that redis_new
+    # handles missing phenotypes
     Y = y
     G = g
     print "Original G",G.shape, "\n", G
@@ -109,7 +110,7 @@ if cmd == 'redis_new':
     G = None
     ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True)
     print np.array(ps)
-    print sum(ps)
+    print len(ps),sum(ps)
     # Test results
     p1 = round(ps[0],4)
     p2 = round(ps[-1],4)
@@ -118,12 +119,14 @@ if cmd == 'redis_new':
         assert p1==0.0708, "p1=%f" % p1
         assert p2==0.1417, "p2=%f" % p2
     if options.geno == 'data/small_na.geno':
-        assert p1==0.0958, "p1=%f" % p1
-        assert p2==0.0435, "p2=%f" % p2
+        assert p1==0.0897, "p1=%f" % p1
+        assert p2==0.0405, "p2=%f" % p2
     if options.geno == 'data/test8000.geno':
         assert p1==0.8984, "p1=%f" % p1
-        assert p2==0.9623, "p2=%f" % p2
-if cmd == 'redis':
+        assert p2==0.9620, "p2=%f" % p2
+        assert sum(ps) == 4070.02346579
+        assert len(ps) == 8000
+elif cmd == 'redis':
     # Emulating the redis setup of GN2
     G = g
     print "Original G",G.shape, "\n", G
@@ -135,7 +138,7 @@ if cmd == 'redis':
     if options.maf_normalization:
         G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
         print "MAF replacements: \n",G
-    if not options.skip_genotype_normalization:
+    if options.genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
     g = None
     gnt = None
@@ -144,8 +147,8 @@ if cmd == 'redis':
     G = None
     ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False)
     print np.array(ps)
-    print sum(ps)
-    # Test results
+    print len(ps),sum(ps)
+    # Test results (for reference: redis_new asserts sum(ps) == 4070.02346579 on data/test8000.geno)
     p1 = round(ps[0],4)
     p2 = round(ps[-1],4)
     sys.stderr.write(options.geno+"\n")
@@ -153,11 +156,11 @@ if cmd == 'redis':
         assert p1==0.0708, "p1=%f" % p1
         assert p2==0.1417, "p2=%f" % p2
     if options.geno == 'data/small_na.geno':
-        assert p1==0.0958, "p1=%f" % p1
-        assert p2==0.0435, "p2=%f" % p2
+        assert p1==0.0897, "p1=%f" % p1
+        assert p2==0.0405, "p2=%f" % p2
     if options.geno == 'data/test8000.geno':
         assert p1==0.8984, "p1=%f" % p1
-        assert p2==0.9623, "p2=%f" % p2
+        assert p2==0.8827, "p2=%f" % p2
 elif cmd == 'kinship':
     G = g
     print "Original G",G.shape, "\n", G
@@ -169,7 +172,7 @@ elif cmd == 'kinship':
     if options.maf_normalization:
         G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
         print "MAF replacements: \n",G
-    if not options.skip_genotype_normalization:
+    if options.genotype_normalization:
         G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
     g = None
     gnt = None
-- 
cgit v1.2.3