From 99fef2888f02551191cf6031c2c7222fce27e360 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sat, 4 Apr 2015 12:33:07 +0200
Subject: Run works without transposes

---
 wqflask/wqflask/my_pylmm/pyLMM/gwas.py      | 21 +++++++---
 wqflask/wqflask/my_pylmm/pyLMM/kinship.py   | 24 +++++++----
 wqflask/wqflask/my_pylmm/pyLMM/lmm.py       | 65 +++++++++++++++++++----------
 wqflask/wqflask/my_pylmm/pyLMM/lmm2.py      | 43 ++++++++++---------
 wqflask/wqflask/my_pylmm/pyLMM/phenotype.py | 35 +++++++++++++---
 wqflask/wqflask/my_pylmm/pyLMM/runlmm.py    | 21 +++++-----
 6 files changed, 136 insertions(+), 73 deletions(-)

diff --git a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
index 8b344a90..ae3769d4 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/gwas.py
@@ -19,7 +19,7 @@
 
 import pdb
 import time
-# from utility import temp_data
+import sys
 import lmm2
 
 import os
@@ -31,6 +31,18 @@ from lmm2 import LMM2
 import multiprocessing as mp # Multiprocessing is part of the Python stdlib
 import Queue 
 
+# ---- A trick to decide on the environment:
+try:
+    from wqflask.my_pylmm.pyLMM import chunks
+    from gn2 import uses
+except ImportError:
+    sys.stderr.write("WARNING: LMM2 standalone version missing the Genenetwork2 environment\n")
+    has_gn2=False
+    from standalone import uses
+
+progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal')
+
+
 def formatResult(id,beta,betaSD,ts,ps):
    return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
 
@@ -52,12 +64,11 @@ def compute_snp(j,n,snp_ids,lmm2,REML,q = None):
 def f_init(q):
    compute_snp.q = q
 
-def gwas(Y,G,K,uses,restricted_max_likelihood=True,refit=False,verbose=True):
+def gwas(Y,G,K,restricted_max_likelihood=True,refit=False,verbose=True):
    """
    GWAS. The G matrix should be n inds (cols) x m snps (rows)
    """
-   progress,debug,info,mprint = uses('progress','debug','info','mprint')
-
+   info("In gwas.gwas")
    matrix_initialize()
    cpu_num = mp.cpu_count()
    numThreads = None # for now use all available threads
@@ -70,7 +81,7 @@ def gwas(Y,G,K,uses,restricted_max_likelihood=True,refit=False,verbose=True):
    m = G.shape[0] # snps
    snps = m
    info("%s SNPs",snps)
-   assert snps>inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
+   assert snps>=inds, "snps should be larger than inds (snps=%d,inds=%d)" % (snps,inds)
 
    # CREATE LMM object for association
    # if not kfile2:  L = LMM(Y,K,Kva,Kve,X0,verbose=verbose)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
index be12417e..1c157fd8 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/kinship.py
@@ -28,12 +28,21 @@ import time
 
 from optmatrix import matrix_initialize, matrixMultT
 
-def kinship_full(G,uses):
+# ---- A trick to decide on the environment:
+try:
+    from wqflask.my_pylmm.pyLMM import chunks
+    from gn2 import uses, progress_set_func
+except ImportError:
+    has_gn2=False
+    import standalone as handlers
+    from standalone import uses, progress_set_func
+
+progress,debug,info,mprint = uses('progress','debug','info','mprint')
+
+def kinship_full(G):
    """
    Calculate the Kinship matrix using a full dot multiplication
    """
-   info,mprint = uses('info','mprint')
-   
    # mprint("kinship_full G",G)
    m = G.shape[0] # snps
    n = G.shape[1] # inds
@@ -78,8 +87,7 @@ def f_init(q):
 
 # Calculate the kinship matrix from G (SNPs as rows!), returns K
 #
-def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False):
-   progress,debug,info,mprint = uses('progress','debug','info','mprint')
+def kinship(G,computeSize=1000,numThreads=None,useBLAS=False):
 
    matrix_initialize(useBLAS)
 
@@ -89,7 +97,7 @@ def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False):
    m = G.shape[0] # snps
    snps = m
    info("%i SNPs" % (m))
-   assert snps>inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds)
+   assert snps>=inds, "snps should be larger than inds (%i snps, %i inds)" % (snps,inds)
 
    q = mp.Queue()
    p = mp.Pool(numThreads, f_init, [q])
@@ -140,13 +148,11 @@ def kinship(G,uses,computeSize=1000,numThreads=None,useBLAS=False):
    K = K / float(snps)
    return K      
 
-def kvakve(K,uses):
+def kvakve(K):
    """
    Obtain eigendecomposition for K and return Kva,Kve where Kva is cleaned
    of small values < 1e-6 (notably smaller than zero)
    """
-   info,mprint = uses('info','mprint')
-
    info("Obtaining eigendecomposition for %dx%d matrix" % (K.shape[0],K.shape[1]) )
    Kva,Kve = linalg.eigh(K)
    mprint("Kva",Kva)
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index e51742c4..82bd7f0b 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -64,7 +64,6 @@ except ImportError:
     import standalone as handlers
     from standalone import uses, progress_set_func
     sys.stderr.write("WARNING: LMM standalone version missing the Genenetwork2 environment\n")
-    pass
 
 progress,mprint,debug,info,fatal = uses('progress','mprint','debug','info','fatal')
 
@@ -296,8 +295,8 @@ def run_other_old(pheno_vector,
     Bench().report()
     return p_values, t_stats
 
-def run_other_new(pheno_vector,
-        genotype_matrix,
+def run_other_new(n,m,pheno_vector,
+        geno,
         restricted_max_likelihood=True,
         refit=False):
     
@@ -312,8 +311,7 @@ def run_other_new(pheno_vector,
     print("REML=",restricted_max_likelihood," REFIT=",refit)
 
     # Adjust phenotypes
-    Y,G,keep = phenotype.remove_missing(pheno_vector,genotype_matrix,verbose=True)
-    print("Removed missing phenotypes",Y.shape)
+    n,Y,keep = phenotype.remove_missing_new(n,pheno_vector)
 
     # if options.maf_normalization:
     #     G = np.apply_along_axis( genotype.replace_missing_with_MAF, axis=0, arr=g )
@@ -321,8 +319,9 @@ def run_other_new(pheno_vector,
     # if not options.skip_genotype_normalization:
     # G = np.apply_along_axis( genotype.normalize, axis=1, arr=G)
 
+    geno = geno[:,keep]
     with Bench("Calculate Kinship"):
-        K,G = calculate_kinship_new(G)
+        K,G = calculate_kinship_new(geno)
     
     print("kinship_matrix: ", pf(K))
     print("kinship_matrix.shape: ", pf(K.shape))
@@ -337,9 +336,8 @@ def run_other_new(pheno_vector,
 
     with Bench("Doing GWAS"):
         t_stats, p_values = gwas.gwas(Y,
-                                      G.T,
+                                      G,
                                       K,
-                                      uses,
                                       restricted_max_likelihood=True,
                                       refit=False,verbose=True)
     Bench().report()
@@ -378,18 +376,30 @@ def matrixMult(A,B):
 
     return linalg.fblas.dgemm(alpha=1.,a=AA,b=BB,trans_a=transA,trans_b=transB)
 
+def calculate_kinship_new(genotype_matrix):
+    """ 
+    Call the new kinship calculation where genotype_matrix contains
+    inds (columns) by snps (rows).
+    """
+    assert type(genotype_matrix) is np.ndarray
+    info("call genotype.normalize")
+    G = np.apply_along_axis( genotype.normalize, axis=1, arr=genotype_matrix)
+    mprint("G",genotype_matrix)
+    info("call calculate_kinship_new")
+    return kinship(G),G # G gets transposed, we'll turn this into an iterator (FIXME)
 
-def calculate_kinship_new(genotype_matrix, temp_data=None):
+def calculate_kinship_iter(geno):
     """ 
     Call the new kinship calculation where genotype_matrix contains
     inds (columns) by snps (rows).
     """
+    assert type(genotype_matrix) is iter
     info("call genotype.normalize")
     G = np.apply_along_axis( genotype.normalize, axis=0, arr=genotype_matrix)
     info("call calculate_kinship_new")
-    return kinship(G.T,uses),G # G gets transposed, we'll turn this into an iterator (FIXME)
+    return kinship(G)
 
-def calculate_kinship_old(genotype_matrix, temp_data=None):
+def calculate_kinship_old(genotype_matrix):
     """
     genotype_matrix is an n x m matrix encoding SNP minor alleles.
     
@@ -430,7 +440,7 @@ def calculate_kinship_old(genotype_matrix, temp_data=None):
     mprint("G (after old normalize)",genotype_matrix.T)
     kinship_matrix = np.dot(genotype_matrix, genotype_matrix.T) * 1.0/float(m)
     return kinship_matrix,genotype_matrix
-    # return kinship_full(genotype_matrix.T,uses),genotype_matrix
+    # return kinship_full(genotype_matrix.T),genotype_matrix
 
 def GWAS(pheno_vector,
          genotype_matrix,
@@ -586,7 +596,7 @@ class LMM:
           # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
           begin = time.time()
           # Kva,Kve = linalg.eigh(K)
-          Kva,Kve = kvakve(K,uses)
+          Kva,Kve = kvakve(K)
           end = time.time()
           if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
           print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
@@ -794,12 +804,11 @@ class LMM:
        pl.ylabel("Probability of data")
        pl.title(title)
 
-
 def run_gwas(species,n,m,k,y,geno,cov=None,reml=True,refit=False,inputfn=None,new_code=True):
     """
     Invoke pylmm using genotype as a matrix or as a (SNP) iterator.
     """
-    info("gwas_without_redis")
+    info("run_gwas")
     print('pheno', y)
     
     if species == "human" :
@@ -813,8 +822,8 @@ def run_gwas(species,n,m,k,y,geno,cov=None,reml=True,refit=False,inputfn=None,ne
         print('geno', geno.shape, geno)
 
         if new_code:
-            ps, ts = run_other_new(pheno_vector = y,
-                                   genotype_matrix = geno,
+            ps, ts = run_other_new(n,m,pheno_vector = y,
+                                   geno = geno,
                                    restricted_max_likelihood = reml,
                                    refit = refit)
         else:
@@ -849,10 +858,20 @@ def gwas_with_redis(key,species,new_code=True):
             v = np.array(v)
         return v
 
+    def narrayT(key):
+        m = narray(key)
+        if m is not None:
+            return m.T
+        return m
+    
+    # We are transposing before we enter run_gwas - this should happen on the webserver
+    # side (or when reading data from file)
+    k = narray('kinship_matrix')
+    g = narrayT('genotype_matrix')
     y = narray('pheno_vector')
     n = len(y)
     m = params['num_genotypes']
-    ps,ts = run_gwas(species,n,m,narray('kinship_matrix'),y,narray('genotype_matrix'),narray('covariate_matrix'),params['restricted_max_likelihood'],params['refit'],params['input_file_name'],new_code)
+    ps,ts = run_gwas(species,n,m,k,y,g,narray('covariate_matrix'),params['restricted_max_likelihood'],params['refit'],params['input_file_name'],new_code)
         
     results_key = "pylmm:results:" + params['temp_uuid']
 
@@ -864,19 +883,19 @@ def gwas_with_redis(key,species,new_code=True):
     Redis.expire(results_key, 60*60)
     return ps, ts
 
-
 def gn2_load_redis(key,species,kinship,pheno,geno,new_code=True):
     """
-    This function emulates current GN2 behaviour by pre-loading Redis
+    This function emulates current GN2 behaviour by pre-loading Redis (note the input
+    genotype is transposed to emulate GN2 (FIXME!)
     """
-    print("Loading Redis from parsed data")
+    info("Loading Redis from parsed data")
     if kinship == None:
         k = None
     else:
         k = kinship.tolist()
     params = dict(pheno_vector = pheno.tolist(),
-                  genotype_matrix = geno.tolist(),
-                  num_genotypes = geno.shape[1],
+                  genotype_matrix = geno.T.tolist(),
+                  num_genotypes = geno.shape[0],
                   kinship_matrix = k,
                   covariate_matrix = None,
                   input_file_name = None,
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
index d67e1205..358bf27e 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm2.py
@@ -32,7 +32,6 @@ except ImportError:
     sys.stderr.write("WARNING: LMM2 standalone version missing the Genenetwork2 environment\n")
     has_gn2=False
     from standalone import uses
-    pass
 
 def calculateKinship(W,center=False):
       """
@@ -149,28 +148,32 @@ def GWAS(Y, X, K, Kva=[], Kve=[], X0=None, REML=True, refit=False):
 
 class LMM2:
 
-   """
-	 This is a simple version of EMMA/fastLMM.  
-	 The main purpose of this module is to take a phenotype vector (Y), a set of covariates (X) and a kinship matrix (K)
-	 and to optimize this model by finding the maximum-likelihood estimates for the model parameters.
-	 There are three model parameters: heritability (h), covariate coefficients (beta) and the total
-	 phenotypic variance (sigma).
-	 Heritability as defined here is the proportion of the total variance (sigma) that is attributed to 
-	 the kinship matrix.
-
-	 For simplicity, we assume that everything being input is a numpy array.
-	 If this is not the case, the module may throw an error as conversion from list to numpy array
-	 is not done consistently.
+   """This is a simple version of EMMA/fastLMM.
+
+   The main purpose of this module is to take a phenotype vector (Y),
+   a set of covariates (X) and a kinship matrix (K) and to optimize
+   this model by finding the maximum-likelihood estimates for the
+   model parameters.  There are three model parameters: heritability
+   (h), covariate coefficients (beta) and the total phenotypic
+   variance (sigma).  Heritability as defined here is the proportion
+   of the total variance (sigma) that is attributed to the kinship
+   matrix.
+
+   For simplicity, we assume that everything being input is a numpy
+   array.  If this is not the case, the module may throw an error as
+   conversion from list to numpy array is not done consistently.
 
    """
    def __init__(self,Y,K,Kva=[],Kve=[],X0=None,verbose=False):
 
-      """
-      The constructor takes a phenotype vector or array Y of size n.
-      It takes a kinship matrix K of size n x n.  Kva and Kve can be computed as Kva,Kve = linalg.eigh(K) and cached.
-      If they are not provided, the constructor will calculate them.
-      X0 is an optional covariate matrix of size n x q, where there are q covariates.
-      When this parameter is not provided, the constructor will set X0 to an n x 1 matrix of all ones to represent a mean effect.
+      """The constructor takes a phenotype vector or array Y of size n. It
+      takes a kinship matrix K of size n x n.  Kva and Kve can be
+      computed as Kva,Kve = linalg.eigh(K) and cached.  If they are
+      not provided, the constructor will calculate them.  X0 is an
+      optional covariate matrix of size n x q, where there are q
+      covariates.  When this parameter is not provided, the
+      constructor will set X0 to an n x 1 matrix of all ones to
+      represent a mean effect.
       """
 
       if X0 is None: 
@@ -194,7 +197,7 @@ class LMM2:
           # if self.verbose: sys.stderr.write("Obtaining eigendecomposition for %dx%d matrix\n" % (K.shape[0],K.shape[1]) )
           begin = time.time()
           # Kva,Kve = linalg.eigh(K)
-          Kva,Kve = kinship.kvakve(K,uses)
+          Kva,Kve = kinship.kvakve(K)
           end = time.time()
           if self.verbose: sys.stderr.write("Total time: %0.3f\n" % (end - begin))
           print("sum(Kva),sum(Kve)=",sum(Kva),sum(Kve))
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
index 4c8175f7..7b652515 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/phenotype.py
@@ -19,22 +19,47 @@
 import sys
 import numpy as np
 
-def remove_missing(y,g,verbose=False):
+# ---- A trick to decide on the environment:
+try:
+    from wqflask.my_pylmm.pyLMM import chunks
+    from gn2 import uses, progress_set_func
+except ImportError:
+    has_gn2=False
+    import standalone as handlers
+    from standalone import uses, progress_set_func
+
+progress,debug,info,mprint = uses('progress','debug','info','mprint')
+
+def remove_missing(n,y,g):
     """
     Remove missing data from matrices, make sure the genotype data has
     individuals as rows
     """
     assert(y is not None)
-    assert(y.shape[0] == g.shape[0])
+    assert y.shape[0] == g.shape[0],"y (n) %d, g (n,m) %s" % (y.shape[0],g.shape)
 
     y1 = y
     g1 = g
     v = np.isnan(y)
     keep = True - v
     if v.sum():
-        if verbose: 
-            sys.stderr.write("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
+        info("runlmm.py: Cleaning the phenotype vector and genotype matrix by removing %d individuals...\n" % (v.sum()))
         y1 = y[keep]
         g1 = g[keep,:]
-    return y1,g1,keep
+        n = y1.shape[0]
+    return n,y1,g1,keep
+
+def remove_missing_new(n,y):
+    """
+    Remove missing data. Returns new n,y,keep
+    """
+    assert(y is not None)
+    y1 = y
+    v = np.isnan(y)
+    keep = True - v
+    if v.sum():
+        info("runlmm.py: Cleaning the phenotype vector by removing %d individuals" % (v.sum()))
+        y1 = y[keep]
+        n = y1.shape[0]
+    return n,y1,keep
 
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
index 2d02e195..d248dee2 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/runlmm.py
@@ -131,16 +131,15 @@ def check_results(ps,ts):
 if cmd == 'run':
     if options.remove_missing_phenotypes:
         raise Exception('Can not use --remove-missing-phenotypes with LMM2')
-    snp_iterator =  tsvreader.geno_iter(options.geno)
     n = len(y)
     m = g.shape[1]
-    ps, ts = run_gwas('other',n,m,k,y,g.T)
+    ps, ts = run_gwas('other',n,m,k,y,g)  # <--- pass in geno by SNP
     check_results(ps,ts)
 elif cmd == 'iterator':
     if options.remove_missing_phenotypes:
         raise Exception('Can not use --remove-missing-phenotypes with LMM2')
-    snp_iterator =  tsvreader.geno_iter(options.geno)
-    ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,snp_iterator)
+    geno_iterator =  tsvreader.geno_iter(options.geno)
+    ps, ts = gn2_load_redis_iter('testrun_iter','other',k,y,geno_iterator)
     check_results(ps,ts)
 elif cmd == 'redis_new':
     # The main difference between redis_new and redis is that missing
@@ -150,10 +149,9 @@ elif cmd == 'redis_new':
     Y = y
     G = g
     print "Original G",G.shape, "\n", G
-
-    gt = G.T
-    G = None
-    ps, ts = gn2_load_redis('testrun','other',k,Y,gt,new_code=True)
+    # gt = G.T
+    # G = None
+    ps, ts = gn2_load_redis('testrun','other',k,Y,G,new_code=True)
     check_results(ps,ts)
 elif cmd == 'redis':
     # Emulating the redis setup of GN2
@@ -174,9 +172,10 @@ elif cmd == 'redis':
     g = None
     gnt = None
 
-    gt = G.T
-    G = None
-    ps, ts = gn2_load_redis('testrun','other',k,Y,gt, new_code=False)
+    # gt = G.T
+    # G = None
+    mprint("G",G)
+    ps, ts = gn2_load_redis('testrun','other',k,Y,G, new_code=False)
     check_results(ps,ts)
 elif cmd == 'kinship':
     G = g
-- 
cgit v1.2.3