aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xwqflask/wqflask/do_search.py15
-rwxr-xr-xwqflask/wqflask/my_pylmm/pylmmGWAS.py378
-rwxr-xr-xwqflask/wqflask/my_pylmm/pylmmKinship.py169
-rwxr-xr-xwqflask/wqflask/parser.py4
-rwxr-xr-xwqflask/wqflask/search_results.py4
-rwxr-xr-xwqflask/wqflask/static/new/javascript/dataset_select_menu.coffee11
-rw-r--r--wqflask/wqflask/static/new/javascript/dataset_select_menu.js14
-rwxr-xr-xwqflask/wqflask/templates/index_page.html50
-rwxr-xr-xwqflask/wqflask/templates/new_security/login_user.html2
9 files changed, 432 insertions, 215 deletions
diff --git a/wqflask/wqflask/do_search.py b/wqflask/wqflask/do_search.py
index fd7bd916..29f3de98 100755
--- a/wqflask/wqflask/do_search.py
+++ b/wqflask/wqflask/do_search.py
@@ -370,13 +370,24 @@ class WikiSearch(MrnaAssaySearch):
DoSearch.search_types['WIKI'] = "WikiSearch"
- def run(self):
+ def get_where_clause(self):
where_clause = """%s.symbol = GeneRIF.symbol
and GeneRIF.versionId=0 and GeneRIF.display>0
and (GeneRIF.comment REGEXP '%s' or GeneRIF.initial = '%s')
""" % (self.dataset.type,
"[[:<:]]"+str(self.search_term[0])+"[[:>:]]",
- str(self.search_term[0]))
+ str(self.search_term[0]))
+ return where_clause
+
+ def run(self):
+ #where_clause = """%s.symbol = GeneRIF.symbol
+ # and GeneRIF.versionId=0 and GeneRIF.display>0
+ # and (GeneRIF.comment REGEXP '%s' or GeneRIF.initial = '%s')
+ # """ % (self.dataset.type,
+ # "[[:<:]]"+str(self.search_term[0])+"[[:>:]]",
+ # str(self.search_term[0]))
+
+ where_clause = self.get_where_clause()
from_clause = ", GeneRIF "
query = self.compile_final_query(from_clause, where_clause)
diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/pylmmGWAS.py
index 54a230de..74b58dd6 100755
--- a/wqflask/wqflask/my_pylmm/pylmmGWAS.py
+++ b/wqflask/wqflask/my_pylmm/pylmmGWAS.py
@@ -3,60 +3,80 @@
# pylmm is a python-based linear mixed-model solver with applications to GWAS
# Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com)
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#The program is free for academic use. Please contact Nick Furlotte
+#<nick.furlotte@gmail.com> if you are interested in using the software for
+#commercial purposes.
+
+#The software must not be modified and distributed without prior
+#permission of the author.
+
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pdb
import time
+import sys
+
+def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
-def printOutHead():
- out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n")
+def formatResult(id,beta,betaSD,ts,ps):
+ return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n"
+
def outputResult(id,beta,betaSD,ts,ps):
- out.write("\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n")
+ out.write(formatResult(id,beta,betaSD,ts,ps))
from optparse import OptionParser,OptionGroup
-usage = """usage: %prog [options] --kfile kinshipFile --[t | b | p]file plinkFileBase outfileBase
+usage = """usage: %prog [options] --kfile kinshipFile --[tfile | bfile] plinkFileBase outfileBase
-This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value.
-The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values.
+This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value.
+The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values.
Basic usage:
python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile
-NOTE: The current running version only supports binary PED files (PLINK). It is simple to convert between ped or tped and bed using PLINK. Sorry for the inconvinience.
-
- """
+ """
parser = OptionParser(usage=usage)
basicGroup = OptionGroup(parser, "Basic Options")
advancedGroup = OptionGroup(parser, "Advanced Options")
+experimentalGroup = OptionGroup(parser, "Experimental Options")
-basicGroup.add_option("--pfile", dest="pfile",
- help="The base for a PLINK ped file")
+#basicGroup.add_option("--pfile", dest="pfile",
+# help="The base for a PLINK ped file")
basicGroup.add_option("--tfile", dest="tfile",
help="The base for a PLINK tped file")
basicGroup.add_option("--bfile", dest="bfile",
help="The base for a PLINK binary bed file")
basicGroup.add_option("--phenofile", dest="phenoFile", default=None,
- help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. The order does not matter for this file. ")
+ help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. This file should be in plink format. ")
+
+# EMMA Options
+basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None,
+ help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the columns and snps on the rows.")
+basicGroup.add_option("--emmaPHENO", dest="emmaPheno", default=None,
+ help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each phenotype as one row.")
+basicGroup.add_option("--emmaCOV", dest="emmaCov", default=None,
+ help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each covariate as one row.")
+
basicGroup.add_option("--kfile", dest="kfile",
help="The location of a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.")
basicGroup.add_option("--covfile", dest="covfile",
help="The location of a covariate file file. This is a plink formatted covariate file.")
basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0)
+
+
advancedGroup.add_option("--removeMissingGenotypes",
action="store_false", dest="normalizeGenotype", default=True,
help="By default the program replaces missing genotypes with the minor allele frequency. This option overrides that behavior making the program remove missing individuals. NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.")
@@ -69,16 +89,23 @@ advancedGroup.add_option("--REML",
help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).")
#advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.")
advancedGroup.add_option("--eigen", dest="eigenfile",
- help="The location of the precomputed eigendecomposition for the kinship file")
+ help="The location of the precomputed eigendecomposition for the kinship file. These can be computed with pylmmKinship.py.")
advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true",
help="This option only applies when --cofile is used. When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.")
+basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use")
+
advancedGroup.add_option("-v", "--verbose",
action="store_true", dest="verbose", default=False,
help="Print extra info")
+# Experimental Group Options
+experimentalGroup.add_option("--kfile2", dest="kfile2",
+ help="The location of a second kinship file. This file has the same format as the first kinship. This might be used if you want to correct for another form of confounding.")
+
parser.add_option_group(basicGroup)
parser.add_option_group(advancedGroup)
+parser.add_option_group(experimentalGroup)
(options, args) = parser.parse_args()
@@ -89,68 +116,108 @@ from scipy import linalg
from pylmm.lmm import LMM
from pylmm import input
-if len(args) != 1:
- parser.error("Incorrect number of arguments")
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue
+
+if len(args) != 1:
+ parser.print_help()
+ sys.exit()
+
outFile = args[0]
-if not options.pfile and not options.tfile and not options.bfile:
- parser.error("You must provide at least one PLINK input file base")
+if not options.tfile and not options.bfile and not options.emmaFile:
+#if not options.pfile and not options.tfile and not options.bfile:
+ parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an EMMA formatted file (--emmaSNP).")
if not options.kfile:
- parser.error("Please provide a pre-computed kinship file")
+ parser.error("Please provide a pre-computed kinship file")
+
+numThreads = None
+if options.numThreads:
+ numThreads = int(options.numThreads)
# READING PLINK input
-if options.verbose:
- sys.stderr.write("Reading PLINK input...\n")
-if options.bfile:
- IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
-elif options.tfile:
- IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
-elif options.pfile:
- IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
-else:
- parser.error("You must provide at least one PLINK input file base")
-
-if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos'):
- parser.error("No .pheno file exist for %s" % (options.phenoFile or IN.fbase + '.phenos'))
+if options.verbose: sys.stderr.write("Reading SNP input...\n")
+if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+#elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+elif options.emmaFile: IN = input.plink(options.emmaFile,type='emma', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype)
+else: parser.error("You must provide at least one PLINK input file base")
+
+if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos') and not os.path.isfile(options.emmaPheno):
+ parser.error("No .pheno file exist for %s. Please provide a phenotype file using the --phenofile or --emmaPHENO argument." % (options.phenoFile or IN.fbase + '.phenos'))
+
+# Read the emma phenotype file if provided.
+# Format should be rows are phenotypes and columns are individuals.
+if options.emmaPheno:
+ f = open(options.emmaPheno,'r')
+ P = []
+ for line in f:
+ v = line.strip().split()
+ p = []
+ for x in v:
+ try:
+ p.append(float(x))
+ except: p.append(np.nan)
+ P.append(p)
+ f.close()
+ IN.phenos = np.array(P).T
# READING Covariate File
-if options.covfile:
- if options.verbose:
- sys.stderr.write("Reading covariate file...\n")
- # Read the covariate file -- write this into input.plink
- P = IN.getCovariates(options.covfile)
-
- if options.noMean:
- X0 = P
- else:
- X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P])
-
- if np.isnan(X0).sum():
- parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.")
-else:
- X0 = np.ones((IN.phenos.shape[0],1))
+if options.covfile:
+ if options.verbose:
+ sys.stderr.write("Reading covariate file...\n")
+ P = IN.getCovariates(options.covfile)
+ if options.noMean:
+ X0 = P
+ else:
+ X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P])
+elif options.emmaCov:
+ if options.verbose:
+ sys.stderr.write("Reading covariate file...\n")
+ P = IN.getCovariatesEMMA(options.emmaCov)
+ if options.noMean:
+ X0 = P
+ else:
+ X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P])
+else:
+ X0 = np.ones((IN.phenos.shape[0],1))
+
+if np.isnan(X0).sum():
+ parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.")
# READING Kinship - major bottleneck for large datasets
-if options.verbose:
- sys.stderr.write("Reading kinship...\n")
+if options.verbose: sys.stderr.write("Reading kinship...\n")
begin = time.time()
# This method seems to be the fastest and works if you already know the size of the matrix
if options.kfile[-3:] == '.gz':
- import gzip
- f = gzip.open(options.kfile,'r')
- F = f.read() # might exhaust mem if the file is huge
- K = np.fromstring(F,sep=' ') # Assume that space separated
- f.close()
-else:
- K = np.fromfile(open(options.kfile,'r'),sep=" ")
+ import gzip
+ f = gzip.open(options.kfile,'r')
+ F = f.read() # might exhaust mem if the file is huge
+ K = np.fromstring(F,sep=' ') # Assume that space separated
+ f.close()
+else:
+ K = np.fromfile(open(options.kfile,'r'),sep=" ")
K.resize((len(IN.indivs),len(IN.indivs)))
end = time.time()
# Other slower ways
#K = np.loadtxt(options.kfile)
#K = np.genfromtxt(options.kfile)
-if options.verbose:
- sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin))
-
+if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin))
+
+if options.kfile2:
+ if options.verbose: sys.stderr.write("Reading second kinship...\n")
+ begin = time.time()
+ # This method seems to be the fastest and works if you already know the size of the matrix
+ if options.kfile2[-3:] == '.gz':
+ import gzip
+ f = gzip.open(options.kfile2,'r')
+ F = f.read() # might exhaust mem if the file is huge
+ K2 = np.fromstring(F,sep=' ') # Assume that space separated
+ f.close()
+ else: K2 = np.fromfile(open(options.kfile2,'r'),sep=" ")
+ K2.resize((len(IN.indivs),len(IN.indivs)))
+ end = time.time()
+ if options.verbose: sys.stderr.write("Read the %d x %d second kinship matrix in %0.3fs \n" % (K2.shape[0],K2.shape[1],end-begin))
# PROCESS the phenotype data -- Remove missing phenotype values
# Keep will now index into the "full" data to select what we keep (either everything or a subset of non missing data
@@ -158,81 +225,122 @@ Y = IN.phenos[:,options.pheno]
v = np.isnan(Y)
keep = True - v
if v.sum():
- if options.verbose:
- sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
- Y = Y[keep]
- X0 = X0[keep,:]
- K = K[keep,:][:,keep]
- Kva = []
- Kve = []
+ if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum()))
+ Y = Y[keep]
+ X0 = X0[keep,:]
+ K = K[keep,:][:,keep]
+ if options.kfile2: K2 = K2[keep,:][:,keep]
+ Kva = []
+ Kve = []
# Only load the decomposition if we did not remove individuals.
# Otherwise it would not be correct and we would have to compute it again.
if not v.sum() and options.eigenfile:
- if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n")
- Kva = np.load(options.eigenfile + ".Kva")
- Kve = np.load(options.eigenfile + ".Kve")
-else:
- Kva = []
- Kve = []
+ if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n")
+ Kva = np.load(options.eigenfile + ".Kva")
+ Kve = np.load(options.eigenfile + ".Kve")
+else:
+ Kva = []
+ Kve = []
# CREATE LMM object for association
n = K.shape[0]
-L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose)
-# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here
-if not options.refit:
- if options.verbose: sys.stderr.write("Computing fit for null model\n")
- L.fit()
- if options.verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma))
+if not options.kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose)
+else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=options.verbose,K2=K2)
+# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here
+if not options.refit:
+ if options.verbose: sys.stderr.write("Computing fit for null model\n")
+ L.fit()
+ if options.verbose and not options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma))
+ if options.verbose and options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f, w=%0.3f\n" % (L.optH,L.optSigma,L.optW))
+
+def compute_snp(collect):
+ snp = collect[0]
+ id = collect[1]
+ # result = []
+ # Check SNPs for missing values
+ x = snp[keep].reshape((n,1)) # all the SNPs
+ v = np.isnan(x).reshape((-1,))
+ if v.sum():
+ keeps = True - v
+ xs = x[keeps,:]
+ if keeps.sum() <= 1 or xs.var() <= 1e-6:
+ # PS.append(np.nan)
+ # TS.append(np.nan)
+ # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan))
+ # continue
+ return formatResult(id,np.nan,np.nan,np.nan,np.nan)
+
+ # Its ok to center the genotype - I used options.normalizeGenotype to
+ # force the removal of missing genotypes as opposed to replacing them with MAF.
+ if not options.normalizeGenotype:
+ xs = (xs - xs.mean()) / np.sqrt(xs.var())
+ Ys = Y[keeps]
+ X0s = X0[keeps,:]
+ Ks = K[keeps,:][:,keeps]
+ if options.kfile2:
+ K2s = K2[keeps,:][:,keeps]
+ Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=options.verbose,K2=K2s)
+ else:
+ Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose)
+ if options.refit:
+ Ls.fit(X=xs,REML=options.REML)
+ else:
+ #try:
+ Ls.fit(REML=options.REML)
+ #except: pdb.set_trace()
+ ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True)
+ else:
+ if x.var() == 0:
+ # PS.append(np.nan)
+ # TS.append(np.nan)
+ # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values
+ return formatResult(id,np.nan,np.nan,np.nan,np.nan)
+ # continue
+
+ if options.refit:
+ L.fit(X=x,REML=options.REML)
+ # This is where it happens
+ ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True)
+
+ return formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)
+ # PS.append(ps)
+ # TS.append(ts)
+ # return len(result)
+ # compute.q.put(result)
+ # return None
+
+def f_init(q):
+ compute_snp.q = q
+
+# Set up the pool
+# mp.set_start_method('spawn')
+q = mp.Queue()
+p = mp.Pool(numThreads, f_init, [q])
+collect = []
+
# Buffers for pvalues and t-stats
-PS = []
-TS = []
+# PS = []
+# TS = []
count = 0
out = open(outFile,'w')
printOutHead()
-for snp,id in IN:
- count += 1
- if options.verbose and count % 1000 == 0:
- sys.stderr.write("At SNP %d\n" % count)
-
- x = snp[keep].reshape((n,1))
- #x[[1,50,100,200,3000],:] = np.nan
- v = np.isnan(x).reshape((-1,))
- # Check SNPs for missing values
- if v.sum():
- keeps = True - v
- xs = x[keeps,:]
- if keeps.sum() <= 1 or xs.var() <= 1e-6:
- PS.append(np.nan)
- TS.append(np.nan)
- outputResult(id,np.nan,np.nan,np.nan,np.nan)
- continue
-
- # Its ok to center the genotype - I used options.normalizeGenotype to
- # force the removal of missing genotypes as opposed to replacing them with MAF.
- if not options.normalizeGenotype: xs = (xs - xs.mean()) / np.sqrt(xs.var())
- Ys = Y[keeps]
- X0s = X0[keeps,:]
- Ks = K[keeps,:][:,keeps]
- Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose)
- if options.refit: Ls.fit(X=xs,REML=options.REML)
- else:
- #try:
- Ls.fit(REML=options.REML)
- #except: pdb.set_trace()
- ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True)
- else:
- if x.var() == 0:
- PS.append(np.nan)
- TS.append(np.nan)
- outputResult(id,np.nan,np.nan,np.nan,np.nan)
- continue
-
- if options.refit: L.fit(X=x,REML=options.REML)
- ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True)
-
- outputResult(id,beta,np.sqrt(betaVar).sum(),ts,ps)
- PS.append(ps)
- TS.append(ts)
+for snp_id in IN:
+ count += 1
+ if count % 1000 == 0:
+ if options.verbose:
+ sys.stderr.write("At SNP %d\n" % count)
+ # if count>8000 :
+ # break # for testing only
+ if count % 100 == 0:
+ for line in p.imap(compute_snp,collect):
+ out.write(line)
+ collect = []
+
+ collect.append(snp_id)
+for line in p.imap(compute_snp,collect):
+ out.write(line)
+
+
diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/pylmmKinship.py
index cfba2936..c9a73ece 100755
--- a/wqflask/wqflask/my_pylmm/pylmmKinship.py
+++ b/wqflask/wqflask/my_pylmm/pylmmKinship.py
@@ -23,10 +23,12 @@
#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import sys
+import pdb
+
from optparse import OptionParser,OptionGroup
-usage = """usage: %prog [options] --[t | b | p]file plinkFileBase outfile
+usage = """usage: %prog [options] --[tfile | bfile] plinkFileBase outfile
-NOTE: The current running version only supports binary PED files (PLINK). It is simple to convert between ped or tped and bed using PLINK. Sorry for the inconvinience.
"""
parser = OptionParser(usage=usage)
@@ -34,15 +36,20 @@ parser = OptionParser(usage=usage)
basicGroup = OptionGroup(parser, "Basic Options")
#advancedGroup = OptionGroup(parser, "Advanced Options")
-basicGroup.add_option("--pfile", dest="pfile",
- help="The base for a PLINK ped file")
+#basicGroup.add_option("--pfile", dest="pfile",
+# help="The base for a PLINK ped file")
basicGroup.add_option("--tfile", dest="tfile",
help="The base for a PLINK tped file")
basicGroup.add_option("--bfile", dest="bfile",
help="The base for a PLINK binary ped file")
+basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None,
+ help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the rows and snps on the columns.")
+basicGroup.add_option("--emmaNumSNPs", dest="numSNPs", type="int", default=0,
+ help="When providing the emmaSNP file you need to specify how many snps are in the file")
basicGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.")
basicGroup.add_option("-n", default=1000,dest="computeSize", type="int", help="The maximum number of SNPs to read into memory at once (default 1000). This is important when there is a large number of SNPs, because memory could be an issue.")
+basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use")
basicGroup.add_option("-v", "--verbose",
action="store_true", dest="verbose", default=False,
@@ -52,68 +59,128 @@ parser.add_option_group(basicGroup)
#parser.add_option_group(advancedGroup)
(options, args) = parser.parse_args()
-if len(args) != 1: parser.error("Incorrect number of arguments")
+if len(args) != 1:
+ parser.print_help()
+ sys.exit()
+
outFile = args[0]
import sys
import os
import numpy as np
from scipy import linalg
-from pyLMM.lmm import calculate_kinship
-from pyLMM import input
-
-if not options.pfile and not options.tfile and not options.bfile:
- parser.error("You must provide at least one PLINK input file base")
-
+from pylmm.lmm import calculateKinship
+from pylmm import input
+import multiprocessing as mp # Multiprocessing is part of the Python stdlib
+import Queue
+
+if not options.tfile and not options.bfile and not options.emmaFile:
+ parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).")
+
+numThreads = None
+if options.numThreads:
+ numThreads = int(options.numThreads)
+
if options.verbose: sys.stderr.write("Reading PLINK input...\n")
if options.bfile: IN = input.plink(options.bfile,type='b')
elif options.tfile: IN = input.plink(options.tfile,type='t')
-elif options.pfile: IN = input.plink(options.pfile,type='p')
-else: parser.error("You must provide at least one PLINK input file base")
+#elif options.pfile: IN = input.plink(options.pfile,type='p')
+elif options.emmaFile:
+ if not options.numSNPs: parser.error("You must provide the number of SNPs when specifying an emma formatted file.")
+ IN = input.plink(options.emmaFile,type='emma')
+else: parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).")
+
+def compute_W(job):
+ """
+ Read 1000 SNPs at a time into matrix and return the result
+ """
+ n = len(IN.indivs)
+ m = options.computeSize
+ W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs)
+ for j in range(0,options.computeSize):
+ row = job*m + j
+ if row >= IN.numSNPs:
+ W = W[:,range(0,j)]
+ break
+ snp,id = IN.next()
+ if snp.var() == 0:
+ continue
+ W[:,j] = snp # set row to list of SNPs
+ return W
+
+def compute_dgemm(job,W):
+ """
+ Compute Kinship(W)*j
+
+ For every set of SNPs dgemm is used to multiply matrices T(W)*W
+ """
+ res = None
+ try:
+ res = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False)
+ except AttributeError:
+ res = np.dot(W,W.T)
+ compute_dgemm.q.put([job,res])
+ return job
+
+def f_init(q):
+ compute_dgemm.q = q
n = len(IN.indivs)
-m = options.computeSize
-W = np.ones((n,m)) * np.nan
+# m = options.computeSize
+# jobsize=m
IN.getSNPIterator()
-i = 0
-K = None
-while i < IN.numSNPs:
- j = 0
- while j < options.computeSize and i < IN.numSNPs:
- snp,id = IN.next()
- if snp.var() == 0:
- i += 1
- continue
- W[:,j] = snp
-
- i += 1
- j += 1
- if j < options.computeSize: W = W[:,range(0,j)]
-
- if options.verbose:
- sys.stderr.write("Processing first %d SNPs\n" % i)
- if K == None:
- try:
- K = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculateKinship(W) * j
- except AttributeError:
- K = np.dot(W,W.T)
- else:
- try:
- K_j = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculateKinship(W) * j
- except AttributeError:
- K_j = np.dot(W,W.T)
- K = K + K_j
-
+# Annoying hack to get around the fact that it is expensive to determine the number of SNPs in an emma file
+if options.emmaFile: IN.numSNPs = options.numSNPs
+# i = 0
+
+# mp.set_start_method('spawn')
+q = mp.Queue()
+p = mp.Pool(numThreads, f_init, [q])
+iterations = IN.numSNPs/options.computeSize+1
+# jobs = range(0,8) # range(0,iterations)
+
+results = []
+
+K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals
+
+completed = 0
+
+# for job in range(8):
+for job in range(iterations):
+ if options.verbose:
+ sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize)))
+ W = compute_W(job)
+ results.append(p.apply_async(compute_dgemm, (job,W)))
+ # Do we have a result?
+ try:
+ j,x = q.get_nowait()
+ if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ K_j = x
+ # print j,K_j[:,0]
+ K = K + K_j
+ completed += 1
+ except Queue.Empty:
+ pass
+
+for job in range(len(results)-completed):
+ j,x = q.get()
+ if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n")
+ K_j = x
+ # print j,K_j[:,0]
+ K = K + K_j
+
K = K / float(IN.numSNPs)
if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile)
np.savetxt(outFile,K)
if options.saveEig:
- if options.verbose:
- sys.stderr.write("Obtaining Eigendecomposition\n")
- Kva,Kve = linalg.eigh(K)
- if options.verbose:
- sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
- np.savetxt(outFile+".kva",Kva)
- np.savetxt(outFile+".kve",Kve)
+ if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n")
+ Kva,Kve = linalg.eigh(K)
+ if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile)
+ np.savetxt(outFile+".kva",Kva)
+ np.savetxt(outFile+".kve",Kve)
+
+
+
+
diff --git a/wqflask/wqflask/parser.py b/wqflask/wqflask/parser.py
index be92014d..5c89bc01 100755
--- a/wqflask/wqflask/parser.py
+++ b/wqflask/wqflask/parser.py
@@ -26,7 +26,7 @@ from pprint import pformat as pf
def parse(pstring):
"""
- returned item serach_term is always a list, even if only one element
+ returned item search_term is always a list, even if only one element
"""
pstring = re.split(r"""(?:(\w+\s*=\s*[\('"\[][^)'"]*[\)\]'"]) | # LRS=(1 2 3), cisLRS=[4 5 6], etc
(\w+\s*[=:\>\<][\w\*]+) | # wiki=bar, GO:foobar, etc
@@ -42,6 +42,8 @@ def parse(pstring):
print("separators:", separators)
+
+
for item in pstring:
splat = re.split(separators, item)
print("splat is:", splat)
diff --git a/wqflask/wqflask/search_results.py b/wqflask/wqflask/search_results.py
index 9d2d0b88..c6521106 100755
--- a/wqflask/wqflask/search_results.py
+++ b/wqflask/wqflask/search_results.py
@@ -212,9 +212,9 @@ class SearchResultPage(object):
#
# #search_gene
# #search_geno
- # #searhch_pheno
+ # #search_pheno
# #search_mrn
- # #searhc_publish
+ # #search_publish
def search(self):
diff --git a/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee b/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee
index af53bd09..83264da3 100755
--- a/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee
+++ b/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee
@@ -118,5 +118,12 @@ $ ->
console.log("Calling:", populate_function)
window[populate_function]()
-
- $("#make_default").click(make_default) \ No newline at end of file
+ check_search_term = ->
+ search_term = $('#tfor').val()
+ console.log("search_term:", search_term)
+ if (search_term == "")
+ alert("Please enter one or more search terms or search equations.")
+ return false
+
+ $("#make_default").click(make_default)
+ $("#btsearch").click(check_search_term) \ No newline at end of file
diff --git a/wqflask/wqflask/static/new/javascript/dataset_select_menu.js b/wqflask/wqflask/static/new/javascript/dataset_select_menu.js
index 6f63b715..5bd54359 100644
--- a/wqflask/wqflask/static/new/javascript/dataset_select_menu.js
+++ b/wqflask/wqflask/static/new/javascript/dataset_select_menu.js
@@ -1,6 +1,6 @@
// Generated by CoffeeScript 1.8.0
$(function() {
- var apply_default, dataset_info, group_info, make_default, open_window, populate_dataset, populate_group, populate_species, populate_type, process_json, redo_dropdown;
+ var apply_default, check_search_term, dataset_info, group_info, make_default, open_window, populate_dataset, populate_group, populate_species, populate_type, process_json, redo_dropdown;
process_json = function(data) {
window.jdata = data;
populate_species();
@@ -133,5 +133,15 @@ $(function() {
}
return _results;
};
- return $("#make_default").click(make_default);
+ check_search_term = function() {
+ var search_term;
+ search_term = $('#tfor').val();
+ console.log("search_term:", search_term);
+ if (search_term === "") {
+ alert("Please enter one or more search terms or search equations.");
+ return false;
+ }
+ };
+ $("#make_default").click(make_default);
+ return $("#btsearch").click(check_search_term);
});
diff --git a/wqflask/wqflask/templates/index_page.html b/wqflask/wqflask/templates/index_page.html
index eb4f883c..538a6799 100755
--- a/wqflask/wqflask/templates/index_page.html
+++ b/wqflask/wqflask/templates/index_page.html
@@ -18,7 +18,7 @@
<div class="row" style="width: 1400px !important;">
- <div class="col-xs-5 col-xs-5">
+ <div class="col-xs-5">
<section id="search">
<div class="page-header">
<h1>Select and search</h1>
@@ -29,31 +29,42 @@
<div class="form-group">
<label for="species" class="col-xs-1 control-label" style="width: 65px !important;">Species:</label>
- <div class="col-xs-4 controls">
- <select name="species" id="species" class="form-control selectpicker span3" style="width: 300px !important;"></select>
+ <div class="col-xs-10 controls">
+ <div class="col-xs-8">
+ <select name="species" id="species" class="form-control selectpicker span3" style="width: 280px !important;"></select>
+ </div>
+ <div class="col-xs-4">
+ <input id="make_default" class="btn btn-default form-control" value="Make Default">
+ </div>
</div>
</div>
<div class="form-group">
<label for="group" class="col-xs-1 control-label" style="width: 65px !important;">Group:</label>
<div class="col-xs-4 controls input-append">
- <select name="group" id="group" class="form-control selectpicker span3" style="width: 300px !important;"></select>
- <i class="icon-question-sign"></i>
+ <div class="col-xs-8">
+ <select name="group" id="group" class="form-control selectpicker span3" style="width: 280px !important;"></select>
+ <i class="icon-question-sign"></i>
+ </div>
</div>
</div>
<div class="form-group">
<label for="tissue" class="col-xs-1 control-label" style="width: 65px !important;">Type:</label>
<div class="col-xs-4 controls">
- <select name="type" id="type" class="form-control selectpicker span3" style="width: 300px !important;"></select>
+ <div class="col-xs-8">
+ <select name="type" id="type" class="form-control selectpicker span3" style="width: 280px !important;"></select>
+ </div>
</div>
</div>
<div class="form-group">
<label for="dataset" class="col-xs-1 control-label" style="width: 65px !important;">Dataset:</label>
<div class="col-xs-4 controls input-append">
- <select name="dataset" id="dataset" class="form-control selectpicker span5" style="width: 450px !important;"></select>
- <i class="icon-question-sign"></i>
+ <div class="col-xs-8">
+ <select name="dataset" id="dataset" class="form-control selectpicker span5" style="width: 450px !important;"></select>
+ <i class="icon-question-sign"></i>
+ </div>
</div>
</div>
@@ -65,9 +76,11 @@
<!-- GET ANY SEARCH -->
<div class="form-group">
- <label for="tfor" class="col-xs-1 control-label" style="width: 65px !important;">Search for:</label>
+ <label for="tfor" class="col-xs-1 control-label" style="padding-left: 0px; padding-right: 0px; width: 65px !important;">Search for:</label>
<div class="col-xs-10 controls">
- <textarea name="search_terms" rows="2" class="form-control search-query" style="width: 450px !important;" id="tfor"></textarea>
+ <div class="col-xs-8">
+ <textarea name="search_terms" rows="2" class="form-control search-query" style="max-width: 550px; width: 450px !important;" id="tfor"></textarea>
+ </div>
</div>
</div>
@@ -75,12 +88,14 @@
<div class="form-group">
<label for="tfor" class="col-xs-1 control-label" style="width: 65px !important;"></label>
<div class="col-xs-10 controls">
- <p>Enter terms, genes, ID numbers in the
- <b>Search</b> field<br>
- Use <b>*</b> or <b>?</b> wildcards (Cyp*a?,
- synap*)<br>
- Use <b>quotes</b> for terms such as <i>"tyrosine
- kinase"</i></p>
+ <div class="col-xs-10">
+ <p>Enter terms, genes, ID numbers in the
+ <b>Search</b> field<br>
+ Use <b>*</b> or <b>?</b> wildcards (Cyp*a?,
+ synap*)<br>
+ Use <b>quotes</b> for terms such as <i>"tyrosine
+ kinase"</i></p>
+ </div>
</div>
</div>
@@ -92,9 +107,6 @@
<div class="col-xs-3 controls" style="width: 100px !important;">
<input id="btsearch" type="submit" class="btn btn-primary form-control" value="Search">
</div>
- <div class="col-xs-4 controls" style="width: 150px !important;">
- <input id="make_default" class="btn btn-default form-control" value="Make Default">
- </div>
</div>
<input type="hidden" name="FormID" value="searchResult" class="form-control">
diff --git a/wqflask/wqflask/templates/new_security/login_user.html b/wqflask/wqflask/templates/new_security/login_user.html
index 8ad0c79e..81e8b53a 100755
--- a/wqflask/wqflask/templates/new_security/login_user.html
+++ b/wqflask/wqflask/templates/new_security/login_user.html
@@ -7,7 +7,7 @@
{{ flash_me() }}
<div class="page-header">
- <h1>Login</h1>
+ <h1>&nbsp;&nbsp;&nbsp;&nbsp;Login</h1>
</div>