author    Zachary Sloan  2015-02-11 16:42:33 +0000
committer Zachary Sloan  2015-02-11 16:42:33 +0000
commit    a4721c98188cff260633bc54251e3273a7b5f3de (patch)
tree      7a09c613281c461eaca5e190c500db8ca0c5988b /wqflask
parent    e48ec8826ec48bd4ba53f681796a235737ab0d29 (diff)
download  genenetwork2-a4721c98188cff260633bc54251e3273a7b5f3de.tar.gz
Added pop-up for empty searches
Replaced pyLMM code with one version of Pyotr's updated, faster code.
Began fixing search to work for combined searches.
Made the login screen look better and fixed it for empty fields.
Diffstat (limited to 'wqflask')
-rwxr-xr-xwqflask/wqflask/do_search.py15
-rwxr-xr-xwqflask/wqflask/my_pylmm/pylmmGWAS.py378
-rwxr-xr-xwqflask/wqflask/my_pylmm/pylmmKinship.py169
-rwxr-xr-xwqflask/wqflask/parser.py4
-rwxr-xr-xwqflask/wqflask/search_results.py4
-rwxr-xr-xwqflask/wqflask/static/new/javascript/dataset_select_menu.coffee11
-rw-r--r--wqflask/wqflask/static/new/javascript/dataset_select_menu.js14
-rwxr-xr-xwqflask/wqflask/templates/index_page.html50
-rwxr-xr-xwqflask/wqflask/templates/new_security/login_user.html2
9 files changed, 432 insertions, 215 deletions
diff --git a/wqflask/wqflask/do_search.py b/wqflask/wqflask/do_search.py
index fd7bd916..29f3de98 100755
--- a/wqflask/wqflask/do_search.py
+++ b/wqflask/wqflask/do_search.py
@@ -370,13 +370,24 @@ class WikiSearch(MrnaAssaySearch):
DoSearch.search_types['WIKI'] = "WikiSearch"
- def run(self):
+ def get_where_clause(self):
where_clause = """%s.symbol = GeneRIF.symbol
and GeneRIF.versionId=0 and GeneRIF.display>0
and (GeneRIF.comment REGEXP '%s' or GeneRIF.initial = '%s')
""" % (self.dataset.type,
"[[:<:]]"+str(self.search_term[0])+"[[:>:]]",
- str(self.search_term[0]))
+ str(self.search_term[0]))
+ return where_clause
+
+ def run(self):
+ #where_clause = """%s.symbol = GeneRIF.symbol
+ # and GeneRIF.versionId=0 and GeneRIF.display>0
+ # and (GeneRIF.comment REGEXP '%s' or GeneRIF.initial = '%s')
+ # """ % (self.dataset.type,
+ # "[[:<:]]"+str(self.search_term[0])+"[[:>:]]",
+ # str(self.search_term[0]))
+
+ where_clause = self.get_where_clause()
from_clause = ", GeneRIF "
query = self.compile_final_query(from_clause, where_clause)
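
The refactor above splits the SQL construction out of run() into get_where_clause(), so other search paths (the combined searches mentioned in the commit message) can reuse the same clause. A minimal, hypothetical sketch of that reuse; combine_where_clauses and its searches argument are illustrative names, not part of this commit:

    def combine_where_clauses(searches):
        # Each search object exposes get_where_clause(), as WikiSearch does
        # after this refactor, so a combined search can AND the per-term
        # clauses together into a single WHERE condition.
        clauses = [s.get_where_clause() for s in searches]
        return " AND ".join("(%s)" % clause for clause in clauses)
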
diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/pylmmGWAS.py
index 54a230de..74b58dd6 100755
--- a/wqflask/wqflask/my_pylmm/pylmmGWAS.py
+++ b/wqflask/wqflask/my_pylmm/pylmmGWAS.py
@@ -3,60 +3,80 @@
# pylmm is a python-based linear mixed-model solver with applications to GWAS
# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com)
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#The program is free for academic use. Please contact Nick Furlotte
+#<nick.furlotte@gmail.com> if you are interested in using the software for
+#commercial purposes.
+
+#The software must not be modified and distributed without prior
+#permission of the author.
+
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pdb
import time
+import sys
+
+def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
-def printOutHead():
- out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
+def formatResult(id,beta,betaSD,ts,ps):
+ return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
+
def outputResult(id,beta,betaSD,ts,ps):
- out.write("\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n")
+ out.write(formatResult(id,beta,betaSD,ts,ps))
from optparse import OptionParser,OptionGroup
-usage = """usage: %prog [options] --kfile kinshipFile --[t | b | p]file plinkFileBase outfileBase
+usage = """usage: %prog [options] --kfile kinshipFile --[tfile | bfile] plinkFileBase outfileBase
-This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value.
-The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values.
+This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value.
+The input files are all standard plink formatted, with the first two columns specifying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values.
Basic usage:
python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile
-NOTE: The current running version only supports binary PED files (PLINK). It is simple to convert between ped or tped and bed using PLINK. Sorry for the inconvinience.
-
- """
+ """
parser = OptionParser(usage=usage)
basicGroup = OptionGroup(parser, "Basic Options")
advancedGroup = OptionGroup(parser, "Advanced Options")
+experimentalGroup = OptionGroup(parser, "Experimental Options")
-basicGroup.add_option("--pfile", dest="pfile",
- help="The base for a PLINK ped file")
+#basicGroup.add_option("--pfile", dest="pfile",
+# help="The base for a PLINK ped file")
basicGroup.add_option("--tfile", dest="tfile",
help="The base for a PLINK tped file")
basicGroup.add_option("--bfile", dest="bfile",
help="The base for a PLINK binary bed file")
basicGroup.add_option("--phenofile", dest="phenoFile", default=None,
- help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. The order does not matter for this file. ")
+ help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. This file should be in plink format. ")
+
+# EMMA Options
+basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None,
+ help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the columns and snps on the rows.")
+basicGroup.add_option("--emmaPHENO", dest="emmaPheno", default=None,
+ help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each phenotype as one row.")
+basicGroup.add_option("--emmaCOV", dest="emmaCov", default=None,
+ help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each covariate as one row.")
+
basicGroup.add_option("--kfile", dest="kfile",
help="The location of a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.")
basicGroup.add_option("--covfile", dest="covfile",
help="The location of a covariate file file. This is a plink formatted covariate file.")
basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0)
+
+
advancedGroup.add_option("--removeMissingGenotypes",
action="store_false", dest="normalizeGenotype", default=True,
help="By default the program replaces missing genotypes with the minor allele frequency. This option overrides that behavior making the program remove missing individuals. NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.")
@@ -69,16 +89,23 @@ advancedGroup.add_option("--REML",
help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).")
#advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.")
advancedGroup.add_option("--eigen", dest="eigenfile",
- help="The location of the precomputed eigendecomposition for the kinship file")
+ help="The location of the precomputed eigendecomposition for the kinship file. These can be computed with pylmmKinship.py.")
advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true",
help="This option only applies when --cofile is used. When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.")
+basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use")
+
advancedGroup.add_option("-v", "--verbose",
action="store_true", dest="verbose", default=False,
help="Print extra info")
+# Experimental Group Options
+experimentalGroup.add_option("--kfile2", dest="kfile2",
+ help="The location of a second kinship file. This file has the same format as the first kinship. This might be used if you want to correct for another form of confounding.")
+
parser.add_option_group(basicGroup)
parser.add_option_group(advancedGroup)
+parser.add_option_group(experimentalGroup)
(options, args) = parser.parse_args()
@@ -89,68 +116,108 @@ from scipy import linalg
from pylmm.lmm import LMM
from pylmm import input
-if len(args) != 1:
- parser.error("Incorrect number of arguments")
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue
+
+if len(args) != 1:
+ parser.print_help()
+ sys.exit()
+
outFile = args[0]
-if not options.pfile and not options.tfile and not options.bfile:
- parser.error("You must provide at least one PLINK input file base")
+if not options.tfile and not options.bfile and not options.emmaFile:
+#if not options.pfile and not options.tfile and not options.bfile:
+ parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an EMMA formatted file (--emmaSNP).")
if not options.kfile:
- parser.error("Please provide a pre-computed kinship file")
+ parser.error("Please provide a pre-computed kinship file")
+
+numThreads = None
+if options.numThreads:
+ numThreads = int(options.numThreads)
# READING PLINK input
-if options.verbose:
- sys.stderr.write("Reading PLINK input...\n")
-if options.bfile:
- IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
-elif options.tfile:
- IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
-elif options.pfile:
- IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
-else:
- parser.error("You must provide at least one PLINK input file base")
-
-if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos'):
- parser.error("No .pheno file exist for %s" % (options.phenoFile or IN.fbase + '.phenos'))
+if options.verbose: sys.stderr.write("Reading SNP input...\n")
+if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+#elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+elif options.emmaFile: IN = input.plink(options.emmaFile,type='emma', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+else: parser.error("You must provide at least one PLINK input file base")
+
+if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos') and not os.path.isfile(options.emmaPheno):
+ parser.error("No .pheno file exist for %s. Please provide a phenotype file using the --phenofile or --emmaPHENO argument." % (options.phenoFile or IN.fbase + '.phenos'))
+
+# Read the emma phenotype file if provided.
+# Format should be rows are phenotypes and columns are individuals.
+if options.emmaPheno:
+ f = open(options.emmaPheno,'r')
+ P = []
+ for line in f:
+ v = line.strip().split()
+ p = []
+ for x in v:
+ try:
+ p.append(float(x))
+ except: p.append(np.nan)
+ P.append(p)
+ f.close()
+ IN.phenos = np.array(P).T
# READING Covariate File
-if options.covfile:
- if options.verbose:
- sys.stderr.write("Reading covariate file...\n")
- # Read the covariate file -- write this into input.plink
- P = IN.getCovariates(options.covfile)
-
- if options.noMean:
- X0 = P
- else:
- X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P])
-
- if np.isnan(X0).sum():
- parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.")
-else:
- X0 = np.ones((IN.phenos.shape[0],1))
+if options.covfile:
+ if options.verbose:
+ sys.stderr.write("Reading covariate file...\n")
+ P = IN.getCovariates(options.covfile)
+ if options.noMean:
+ X0 = P
+ else:
+ X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P])
+elif options.emmaCov:
+ if options.verbose:
+ sys.stderr.write("Reading covariate file...\n")
+ P = IN.getCovariatesEMMA(options.emmaCov)
+ if options.noMean:
+ X0 = P
+ else:
+ X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P])
+else:
+ X0 = np.ones((IN.phenos.shape[0],1))
+
+if np.isnan(X0).sum():
+ parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.")
# READING Kinship - major bottleneck for large datasets
-if options.verbose:
- sys.stderr.write("Reading kinship...\n")
+if options.verbose: sys.stderr.write("Reading kinship...\n")
begin = time.time()
# This method seems to be the fastest and works if you already know the size of the matrix
if options.kfile[-3:] == '.gz':
- import gzip
- f = gzip.open(options.kfile,'r')
- F = f.read() # might exhaust mem if the file is huge
- K = np.fromstring(F,sep=' ') # Assume that space separated
- f.close()
-else:
- K = np.fromfile(open(options.kfile,'r'),sep=" ")
+ import gzip
+ f = gzip.open(options.kfile,'r')
+ F = f.read() # might exhaust mem if the file is huge
+ K = np.fromstring(F,sep=' ') # Assume that space separated
+ f.close()
+else:
+ K = np.fromfile(open(options.kfile,'r'),sep=" ")
K.resize((len(IN.indivs),len(IN.indivs)))
end = time.time()
# Other slower ways
#K = np.loadtxt(options.kfile)
#K = np.genfromtxt(options.kfile)
-if options.verbose:
- sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin))
-
+if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin))
+
+if options.kfile2:
+ if options.verbose: sys.stderr.write("Reading second kinship...\n")
+ begin = time.time()
+ # This method seems to be the fastest and works if you already know the size of the matrix
+ if options.kfile2[-3:] == '.gz':
+ import gzip
+ f = gzip.open(options.kfile2,'r')
+ F = f.read() # might exhaust mem if the file is huge
+ K2 = np.fromstring(F,sep=' ') # Assume that space separated
+ f.close()
+ else: K2 = np.fromfile(open(options.kfile2,'r'),sep=" ")
+ K2.resize((len(IN.indivs),len(IN.indivs)))
+ end = time.time()
+ if options.verbose: sys.stderr.write("Read the %d x %d second kinship matrix in %0.3fs \n" % (K2.shape[0],K2.shape[1],end-begin))
# PROCESS the phenotype data -- Remove missing phenotype values
# Keep will now index into the "full" data to select what we keep (either everything or a subset of non-missing data)
@@ -158,81 +225,122 @@ Y = IN.phenos[:,options.pheno]
v = np.isnan(Y)
keep = True - v
if v.sum():
- if options.verbose:
- sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
- Y = Y[keep]
- X0 = X0[keep,:]
- K = K[keep,:][:,keep]
- Kva = []
- Kve = []
+ if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
+ Y = Y[keep]
+ X0 = X0[keep,:]
+ K = K[keep,:][:,keep]
+ if options.kfile2: K2 = K2[keep,:][:,keep]
+ Kva = []
+ Kve = []
# Only load the decomposition if we did not remove individuals.
# Otherwise it would not be correct and we would have to compute it again.
if not v.sum() and options.eigenfile:
- if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n")
- Kva = np.load(options.eigenfile + ".Kva")
- Kve = np.load(options.eigenfile + ".Kve")
-else:
- Kva = []
- Kve = []
+ if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n")
+ Kva = np.load(options.eigenfile + ".Kva")
+ Kve = np.load(options.eigenfile + ".Kve")
+else:
+ Kva = []
+ Kve = []
# CREATE LMM object for association
n = K.shape[0]
-L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose)
-# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here
-if not options.refit:
- if options.verbose: sys.stderr.write("Computing fit for null model\n")
- L.fit()
- if options.verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma))
+if not options.kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose)
+else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=options.verbose,K2=K2)
+# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here
+if not options.refit:
+ if options.verbose: sys.stderr.write("Computing fit for null model\n")
+ L.fit()
+ if options.verbose and not options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma))
+ if options.verbose and options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f, w=%0.3f\n" % (L.optH,L.optSigma,L.optW))
+
+def compute_snp(collect):
+ snp = collect[0]
+ id = collect[1]
+ # result = []
+ # Check SNPs for missing values
+ x = snp[keep].reshape((n,1)) # all the SNPs
+ v = np.isnan(x).reshape((-1,))
+ if v.sum():
+ keeps = True - v
+ xs = x[keeps,:]
+ if keeps.sum() <= 1 or xs.var() <= 1e-6:
+ # PS.append(np.nan)
+ # TS.append(np.nan)
+ # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
+ # continue
+ return formatResult(id,np.nan,np.nan,np.nan,np.nan)
+
+ # It's ok to center the genotype - I used options.normalizeGenotype to
+ # force the removal of missing genotypes as opposed to replacing them with MAF.
+ if not options.normalizeGenotype:
+ xs = (xs - xs.mean()) / np.sqrt(xs.var())
+ Ys = Y[keeps]
+ X0s = X0[keeps,:]
+ Ks = K[keeps,:][:,keeps]
+ if options.kfile2:
+ K2s = K2[keeps,:][:,keeps]
+ Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=options.verbose,K2=K2s)
+ else:
+ Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose)
+ if options.refit:
+ Ls.fit(X=xs,REML=options.REML)
+ else:
+ #try:
+ Ls.fit(REML=options.REML)
+ #except: pdb.set_trace()
+ ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True)
+ else:
+ if x.var() == 0:
+ # PS.append(np.nan)
+ # TS.append(np.nan)
+ # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values
+ return formatResult(id,np.nan,np.nan,np.nan,np.nan)
+ # continue
+
+ if options.refit:
+ L.fit(X=x,REML=options.REML)
+ # This is where it happens
+ ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True)
+
+ return formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)
+ # PS.append(ps)
+ # TS.append(ts)
+ # return len(result)
+ # compute.q.put(result)
+ # return None
+
+def f_init(q):
+ compute_snp.q = q
+
+# Set up the pool
+# mp.set_start_method('spawn')
+q = mp.Queue()
+p = mp.Pool(numThreads, f_init, [q])
+collect = []
+
# Buffers for pvalues and t-stats
-PS = []
-TS = []
+# PS = []
+# TS = []
count = 0
out = open(outFile,'w')
printOutHead()
-for snp,id in IN:
- count += 1
- if options.verbose and count % 1000 == 0:
- sys.stderr.write("At SNP %d\n" % count)
-
- x = snp[keep].reshape((n,1))
- #x[[1,50,100,200,3000],:] = np.nan
- v = np.isnan(x).reshape((-1,))
- # Check SNPs for missing values
- if v.sum():
- keeps = True - v
- xs = x[keeps,:]
- if keeps.sum() <= 1 or xs.var() <= 1e-6:
- PS.append(np.nan)
- TS.append(np.nan)
- outputResult(id,np.nan,np.nan,np.nan,np.nan)
- continue
-
- # Its ok to center the genotype - I used options.normalizeGenotype to
- # force the removal of missing genotypes as opposed to replacing them with MAF.
- if not options.normalizeGenotype: xs = (xs - xs.mean()) / np.sqrt(xs.var())
- Ys = Y[keeps]
- X0s = X0[keeps,:]
- Ks = K[keeps,:][:,keeps]
- Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose)
- if options.refit: Ls.fit(X=xs,REML=options.REML)
- else:
- #try:
- Ls.fit(REML=options.REML)
- #except: pdb.set_trace()
- ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True)
- else:
- if x.var() == 0:
- PS.append(np.nan)
- TS.append(np.nan)
- outputResult(id,np.nan,np.nan,np.nan,np.nan)
- continue
-
- if options.refit: L.fit(X=x,REML=options.REML)
- ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True)
-
- outputResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)
- PS.append(ps)
- TS.append(ts)
+for snp_id in IN:
+ count += 1
+ if count % 1000 == 0:
+ if options.verbose:
+ sys.stderr.write("At SNP %d\n" % count)
+ # if count>8000 :
+ # break # for testing only
+ if count % 100 == 0:
+ for line in p.imap(compute_snp,collect):
+ out.write(line)
+ collect = []
+
+ collect.append(snp_id)
+for line in p.imap(compute_snp,collect):
+ out.write(line)
+
+
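
The rewritten pylmmGWAS.py driver above distributes the per-SNP association tests over a multiprocessing pool: SNPs are collected into batches of 100 and handed to pool.imap(compute_snp, ...), and the results are written to the output file. A standalone sketch of that batching pattern, with a stand-in worker in place of the real LMM association test (all names here are illustrative, not pylmm's API):

    import multiprocessing as mp

    def compute_snp(item):
        # Stand-in for the real per-SNP association test; returns one
        # tab-separated output line, as formatResult() does above.
        snp_id, value = item
        return "%s\t%f\n" % (snp_id, value * value)

    if __name__ == '__main__':
        pool = mp.Pool(4)                      # worker processes (cf. --nthreads)
        out = open("results.txt", "w")
        batch = []
        for i in range(1000):                  # stands in for "for snp_id in IN"
            batch.append(("snp%d" % i, float(i)))
            if len(batch) == 100:              # flush a full batch of 100 SNPs
                for line in pool.imap(compute_snp, batch):
                    out.write(line)            # imap yields results in input order
                batch = []
        for line in pool.imap(compute_snp, batch):  # leftover partial batch
            out.write(line)
        pool.close()
        pool.join()
        out.close()

Because imap preserves input order, the output lines stay aligned with the SNP stream even though the work runs in parallel.
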
diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/pylmmKinship.py
index cfba2936..c9a73ece 100755
--- a/wqflask/wqflask/my_pylmm/pylmmKinship.py
+++ b/wqflask/wqflask/my_pylmm/pylmmKinship.py
@@ -23,10 +23,12 @@
#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import sys
+import pdb
+
from optparse import OptionParser,OptionGroup
-usage = """usage: %prog [options] --[t | b | p]file plinkFileBase outfile
+usage = """usage: %prog [options] --[tfile | bfile] plinkFileBase outfile
-NOTE: The current running version only supports binary PED files (PLINK). It is simple to convert between ped or tped and bed using PLINK. Sorry for the inconvinience.
"""
parser = OptionParser(usage=usage)
@@ -34,15 +36,20 @@ parser = OptionParser(usage=usage)
basicGroup = OptionGroup(parser, "Basic Options")
#advancedGroup = OptionGroup(parser, "Advanced Options")
-basicGroup.add_option("--pfile", dest="pfile",
- help="The base for a PLINK ped file")
+#basicGroup.add_option("--pfile", dest="pfile",
+# help="The base for a PLINK ped file")
basicGroup.add_option("--tfile", dest="tfile",
help="The base for a PLINK tped file")
basicGroup.add_option("--bfile", dest="bfile",
help="The base for a PLINK binary ped file")
+basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None,
+ help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the rows and snps on the columns.")
+basicGroup.add_option("--emmaNumSNPs", dest="numSNPs", type="int", default=0,
+ help="When providing the emmaSNP file you need to specify how many snps are in the file")
basicGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.")
basicGroup.add_option("-n", default=1000,dest="computeSize", type="int", help="The maximum number of SNPs to read into memory at once (default 1000). This is important when there is a large number of SNPs, because memory could be an issue.")
+basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use")
basicGroup.add_option("-v", "--verbose",
action="store_true", dest="verbose", default=False,
@@ -52,68 +59,128 @@ parser.add_option_group(basicGroup)
#parser.add_option_group(advancedGroup)
(options, args) = parser.parse_args()
-if len(args) != 1: parser.error("Incorrect number of arguments")
+if len(args) != 1:
+ parser.print_help()
+ sys.exit()
+
outFile = args[0]
import sys
import os
import numpy as np
from scipy import linalg
-from pyLMM.lmm import calculate_kinship
-from pyLMM import input
-
-if not options.pfile and not options.tfile and not options.bfile:
- parser.error("You must provide at least one PLINK input file base")
-
+from pylmm.lmm import calculateKinship
+from pylmm import input
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue
+
+if not options.tfile and not options.bfile and not options.emmaFile:
+ parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).")
+
+numThreads = None
+if options.numThreads:
+ numThreads = int(options.numThreads)
+
if options.verbose: sys.stderr.write("Reading PLINK input...\n")
if options.bfile: IN = input.plink(options.bfile,type='b')
elif options.tfile: IN = input.plink(options.tfile,type='t')
-elif options.pfile: IN = input.plink(options.pfile,type='p')
-else: parser.error("You must provide at least one PLINK input file base")
+#elif options.pfile: IN = input.plink(options.pfile,type='p')
+elif options.emmaFile:
+ if not options.numSNPs: parser.error("You must provide the number of SNPs when specifying an emma formatted file.")
+ IN = input.plink(options.emmaFile,type='emma')
+else: parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).")
+
+def compute_W(job):
+ """
+ Read 1000 SNPs at a time into matrix and return the result
+ """
+ n = len(IN.indivs)
+ m = options.computeSize
+ W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs)
+ for j in range(0,options.computeSize):
+ row = job*m + j
+ if row >= IN.numSNPs:
+ W = W[:,range(0,j)]
+ break
+ snp,id = IN.next()
+ if snp.var() == 0:
+ continue
+ W[:,j] = snp # store this SNP as column j of W
+ return W
+
+def compute_dgemm(job,W):
+ """
+ Compute Kinship(W)*j
+
+ For every set of SNPs dgemm is used to multiply matrices T(W)*W
+ """
+ res = None
+ try:
+ res = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False)
+ except AttributeError:
+ res = np.dot(W,W.T)
+ compute_dgemm.q.put([job,res])
+ return job
+
+def f_init(q):
+ compute_dgemm.q = q
n = len(IN.indivs)
-m = options.computeSize
-W = np.ones((n,m)) * np.nan
+# m = options.computeSize
+# jobsize=m
IN.getSNPIterator()
-i = 0
-K = None
-while i < IN.numSNPs:
- j = 0
- while j < options.computeSize and i < IN.numSNPs:
- snp,id = IN.next()
- if snp.var() == 0:
- i += 1
- continue
- W[:,j] = snp
-
- i += 1
- j += 1
- if j < options.computeSize: W = W[:,range(0,j)]
-
- if options.verbose:
- sys.stderr.write("Processing first %d SNPs\n" % i)
- if K == None:
- try:
- K = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculateKinship(W) * j
- except AttributeError:
- K = np.dot(W,W.T)
- else:
- try:
- K_j = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculateKinship(W) * j
- except AttributeError:
- K_j = np.dot(W,W.T)
- K = K + K_j
-
+# Annoying hack to get around the fact that it is expensive to determine the number of SNPs in an emma file
+if options.emmaFile: IN.numSNPs = options.numSNPs
+# i = 0
+
+# mp.set_start_method('spawn')
+q = mp.Queue()
+p = mp.Pool(numThreads, f_init, [q])
+iterations = IN.numSNPs/options.computeSize+1
+# jobs = range(0,8) # range(0,iterations)
+
+results = []
+
+K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals
+
+completed = 0
+
+# for job in range(8):
+for job in range(iterations):
+ if options.verbose:
+ sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
+ W = compute_W(job)
+ results.append(p.apply_async(compute_dgemm, (job,W)))
+ # Do we have a result?
+ try:
+ j,x = q.get_nowait()
+ if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ K_j = x
+ # print j,K_j[:,0]
+ K = K + K_j
+ completed += 1
+ except Queue.Empty:
+ pass
+
+for job in range(len(results)-completed):
+ j,x = q.get()
+ if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ K_j = x
+ # print j,K_j[:,0]
+ K = K + K_j
+
K = K / float(IN.numSNPs)
if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
np.savetxt(outFile,K)
if options.saveEig:
- if options.verbose:
- sys.stderr.write("Obtaining Eigendecomposition\n")
- Kva,Kve = linalg.eigh(K)
- if options.verbose:
- sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
- np.savetxt(outFile+".kva",Kva)
- np.savetxt(outFile+".kve",Kve)
+ if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
+ Kva,Kve = linalg.eigh(K)
+ if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
+ np.savetxt(outFile+".kva",Kva)
+ np.savetxt(outFile+".kve",Kve)
+
+
+
+
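
pylmmKinship.py above builds the kinship matrix incrementally: each job loads up to -n SNPs into an individuals-by-SNPs block W, the workers compute W·Wᵀ (via dgemm when available, np.dot otherwise), and the partial products are summed and finally divided by the total SNP count. A serial sketch of that accumulation, assuming the genotypes arrive as an iterator of per-SNP numpy vectors (the function and argument names are illustrative):

    import numpy as np

    def kinship_in_blocks(snp_iter, n_indivs, n_snps, block=1000):
        # K accumulates the sum of W.dot(W.T) over blocks of SNP columns,
        # then is scaled by the total SNP count, mirroring
        # K = K / float(IN.numSNPs) in the script above.
        K = np.zeros((n_indivs, n_indivs))
        W = np.zeros((n_indivs, block))
        j = 0
        for snp in snp_iter:              # each snp: length-n_indivs genotype vector
            if snp.var() == 0:
                continue                  # monomorphic SNPs add nothing
            W[:, j] = snp
            j += 1
            if j == block:
                K += W.dot(W.T)
                j = 0
        if j:                             # leftover partial block
            K += W[:, :j].dot(W[:, :j].T)
        return K / float(n_snps)

For a (num_snps x n_indivs) genotype array G with no monomorphic SNPs, kinship_in_blocks(iter(G), G.shape[1], G.shape[0]) matches G.T.dot(G) / G.shape[0].
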
diff --git a/wqflask/wqflask/parser.py b/wqflask/wqflask/parser.py
index be92014d..5c89bc01 100755
--- a/wqflask/wqflask/parser.py
+++ b/wqflask/wqflask/parser.py
@@ -26,7 +26,7 @@ from pprint import pformat as pf
def parse(pstring):
"""
- returned item serach_term is always a list, even if only one element
+ returned item search_term is always a list, even if only one element
"""
pstring = re.split(r"""(?:(\w+\s*=\s*[\('"\[][^)'"]*[\)\]'"]) | # LRS=(1 2 3), cisLRS=[4 5 6], etc
(\w+\s*[=:\>\<][\w\*]+) | # wiki=bar, GO:foobar, etc
@@ -42,6 +42,8 @@ def parse(pstring):
print("separators:", separators)
+
+
for item in pstring:
splat = re.split(separators, item)
print("splat is:", splat)
diff --git a/wqflask/wqflask/search_results.py b/wqflask/wqflask/search_results.py
index 9d2d0b88..c6521106 100755
--- a/wqflask/wqflask/search_results.py
+++ b/wqflask/wqflask/search_results.py
@@ -212,9 +212,9 @@ class SearchResultPage(object):
#
# #search_gene
# #search_geno
- # #searhch_pheno
+ # #search_pheno
# #search_mrn
- # #searhc_publish
+ # #search_publish
def search(self):
diff --git a/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee b/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee
index af53bd09..83264da3 100755
--- a/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee
+++ b/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee
@@ -118,5 +118,12 @@ $ ->
console.log("Calling:", populate_function)
window[populate_function]()
-
- $("#make_default").click(make_default) \ No newline at end of file
+ check_search_term = ->
+ search_term = $('#tfor').val()
+ console.log("search_term:", search_term)
+ if (search_term == "")
+ alert("Please enter one or more search terms or search equations.")
+ return false
+
+ $("#make_default").click(make_default)
+ $("#btsearch").click(check_search_term) \ No newline at end of file
diff --git a/wqflask/wqflask/static/new/javascript/dataset_select_menu.js b/wqflask/wqflask/static/new/javascript/dataset_select_menu.js
index 6f63b715..5bd54359 100644
--- a/wqflask/wqflask/static/new/javascript/dataset_select_menu.js
+++ b/wqflask/wqflask/static/new/javascript/dataset_select_menu.js
@@ -1,6 +1,6 @@
// Generated by CoffeeScript 1.8.0
$(function() {
- var apply_default, dataset_info, group_info, make_default, open_window, populate_dataset, populate_group, populate_species, populate_type, process_json, redo_dropdown;
+ var apply_default, check_search_term, dataset_info, group_info, make_default, open_window, populate_dataset, populate_group, populate_species, populate_type, process_json, redo_dropdown;
process_json = function(data) {
window.jdata = data;
populate_species();
@@ -133,5 +133,15 @@ $(function() {
}
return _results;
};
- return $("#make_default").click(make_default);
+ check_search_term = function() {
+ var search_term;
+ search_term = $('#tfor').val();
+ console.log("search_term:", search_term);
+ if (search_term === "") {
+ alert("Please enter one or more search terms or search equations.");
+ return false;
+ }
+ };
+ $("#make_default").click(make_default);
+ return $("#btsearch").click(check_search_term);
});
diff --git a/wqflask/wqflask/templates/index_page.html b/wqflask/wqflask/templates/index_page.html
index eb4f883c..538a6799 100755
--- a/wqflask/wqflask/templates/index_page.html
+++ b/wqflask/wqflask/templates/index_page.html
@@ -18,7 +18,7 @@
<div class="row" style="width: 1400px !important;">
- <div class="col-xs-5 col-xs-5">
+ <div class="col-xs-5">
<section id="search">
<div class="page-header">
<h1>Select and search</h1>
@@ -29,31 +29,42 @@
<div class="form-group">
<label for="species" class="col-xs-1 control-label" style="width: 65px !important;">Species:</label>
- <div class="col-xs-4 controls">
- <select name="species" id="species" class="form-control selectpicker span3" style="width: 300px !important;"></select>
+ <div class="col-xs-10 controls">
+ <div class="col-xs-8">
+ <select name="species" id="species" class="form-control selectpicker span3" style="width: 280px !important;"></select>
+ </div>
+ <div class="col-xs-4">
+ <input id="make_default" class="btn btn-default form-control" value="Make Default">
+ </div>
</div>
</div>
<div class="form-group">
<label for="group" class="col-xs-1 control-label" style="width: 65px !important;">Group:</label>
<div class="col-xs-4 controls input-append">
- <select name="group" id="group" class="form-control selectpicker span3" style="width: 300px !important;"></select>
- <i class="icon-question-sign"></i>
+ <div class="col-xs-8">
+ <select name="group" id="group" class="form-control selectpicker span3" style="width: 280px !important;"></select>
+ <i class="icon-question-sign"></i>
+ </div>
</div>
</div>
<div class="form-group">
<label for="tissue" class="col-xs-1 control-label" style="width: 65px !important;">Type:</label>
<div class="col-xs-4 controls">
- <select name="type" id="type" class="form-control selectpicker span3" style="width: 300px !important;"></select>
+ <div class="col-xs-8">
+ <select name="type" id="type" class="form-control selectpicker span3" style="width: 280px !important;"></select>
+ </div>
</div>
</div>
<div class="form-group">
<label for="dataset" class="col-xs-1 control-label" style="width: 65px !important;">Dataset:</label>
<div class="col-xs-4 controls input-append">
- <select name="dataset" id="dataset" class="form-control selectpicker span5" style="width: 450px !important;"></select>
- <i class="icon-question-sign"></i>
+ <div class="col-xs-8">
+ <select name="dataset" id="dataset" class="form-control selectpicker span5" style="width: 450px !important;"></select>
+ <i class="icon-question-sign"></i>
+ </div>
</div>
</div>
@@ -65,9 +76,11 @@
<!-- GET ANY SEARCH -->
<div class="form-group">
- <label for="tfor" class="col-xs-1 control-label" style="width: 65px !important;">Search for:</label>
+ <label for="tfor" class="col-xs-1 control-label" style="padding-left: 0px; padding-right: 0px; width: 65px !important;">Search for:</label>
<div class="col-xs-10 controls">
- <textarea name="search_terms" rows="2" class="form-control search-query" style="width: 450px !important;" id="tfor"></textarea>
+ <div class="col-xs-8">
+ <textarea name="search_terms" rows="2" class="form-control search-query" style="max-width: 550px; width: 450px !important;" id="tfor"></textarea>
+ </div>
</div>
</div>
@@ -75,12 +88,14 @@
<div class="form-group">
<label for="tfor" class="col-xs-1 control-label" style="width: 65px !important;"></label>
<div class="col-xs-10 controls">
- <p>Enter terms, genes, ID numbers in the
- <b>Search</b> field<br>
- Use <b>*</b> or <b>?</b> wildcards (Cyp*a?,
- synap*)<br>
- Use <b>quotes</b> for terms such as <i>"tyrosine
- kinase"</i></p>
+ <div class="col-xs-10">
+ <p>Enter terms, genes, ID numbers in the
+ <b>Search</b> field<br>
+ Use <b>*</b> or <b>?</b> wildcards (Cyp*a?,
+ synap*)<br>
+ Use <b>quotes</b> for terms such as <i>"tyrosine
+ kinase"</i></p>
+ </div>
</div>
</div>
@@ -92,9 +107,6 @@
<div class="col-xs-3 controls" style="width: 100px !important;">
<input id="btsearch" type="submit" class="btn btn-primary form-control" value="Search">
</div>
- <div class="col-xs-4 controls" style="width: 150px !important;">
- <input id="make_default" class="btn btn-default form-control" value="Make Default">
- </div>
</div>
<input type="hidden" name="FormID" value="searchResult" class="form-control">
diff --git a/wqflask/wqflask/templates/new_security/login_user.html b/wqflask/wqflask/templates/new_security/login_user.html
index 8ad0c79e..81e8b53a 100755
--- a/wqflask/wqflask/templates/new_security/login_user.html
+++ b/wqflask/wqflask/templates/new_security/login_user.html
@@ -7,7 +7,7 @@
{{ flash_me() }}
<div class="page-header">
- <h1>Login</h1>
+ <h1>&nbsp;&nbsp;&nbsp;&nbsp;Login</h1>
</div>