diff options
-rwxr-xr-x | wqflask/wqflask/do_search.py | 15 | ||||
-rwxr-xr-x | wqflask/wqflask/my_pylmm/pylmmGWAS.py | 378 | ||||
-rwxr-xr-x | wqflask/wqflask/my_pylmm/pylmmKinship.py | 169 | ||||
-rwxr-xr-x | wqflask/wqflask/parser.py | 4 | ||||
-rwxr-xr-x | wqflask/wqflask/search_results.py | 4 | ||||
-rwxr-xr-x | wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee | 11 | ||||
-rw-r--r-- | wqflask/wqflask/static/new/javascript/dataset_select_menu.js | 14 | ||||
-rwxr-xr-x | wqflask/wqflask/templates/index_page.html | 50 | ||||
-rwxr-xr-x | wqflask/wqflask/templates/new_security/login_user.html | 2 |
9 files changed, 432 insertions, 215 deletions
diff --git a/wqflask/wqflask/do_search.py b/wqflask/wqflask/do_search.py index fd7bd916..29f3de98 100755 --- a/wqflask/wqflask/do_search.py +++ b/wqflask/wqflask/do_search.py @@ -370,13 +370,24 @@ class WikiSearch(MrnaAssaySearch): DoSearch.search_types['WIKI'] = "WikiSearch" - def run(self): + def get_where_clause(self): where_clause = """%s.symbol = GeneRIF.symbol and GeneRIF.versionId=0 and GeneRIF.display>0 and (GeneRIF.comment REGEXP '%s' or GeneRIF.initial = '%s') """ % (self.dataset.type, "[[:<:]]"+str(self.search_term[0])+"[[:>:]]", - str(self.search_term[0])) + str(self.search_term[0])) + return where_clause + + def run(self): + #where_clause = """%s.symbol = GeneRIF.symbol + # and GeneRIF.versionId=0 and GeneRIF.display>0 + # and (GeneRIF.comment REGEXP '%s' or GeneRIF.initial = '%s') + # """ % (self.dataset.type, + # "[[:<:]]"+str(self.search_term[0])+"[[:>:]]", + # str(self.search_term[0])) + + where_clause = self.get_where_clause() from_clause = ", GeneRIF " query = self.compile_final_query(from_clause, where_clause) diff --git a/wqflask/wqflask/my_pylmm/pylmmGWAS.py b/wqflask/wqflask/my_pylmm/pylmmGWAS.py index 54a230de..74b58dd6 100755 --- a/wqflask/wqflask/my_pylmm/pylmmGWAS.py +++ b/wqflask/wqflask/my_pylmm/pylmmGWAS.py @@ -3,60 +3,80 @@ # pylmm is a python-based linear mixed-model solver with applications to GWAS # Copyright (C) 2013 Nicholas A. Furlotte (nick.furlotte@gmail.com) -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. +#The program is free for academic use. Please contact Nick Furlotte +#<nick.furlotte@gmail.com> if you are interested in using the software for +#commercial purposes. + +#The software must not be modified and distributed without prior +#permission of the author. + +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +#EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +#NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pdb import time +import sys + +def printOutHead(): out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") -def printOutHead(): - out.write("\t".join(["SNP_ID","BETA","BETA_SD","F_STAT","P_VALUE"]) + "\n") +def formatResult(id,beta,betaSD,ts,ps): + return "\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n" + def outputResult(id,beta,betaSD,ts,ps): - out.write("\t".join([str(x) for x in [id,beta,betaSD,ts,ps]]) + "\n") + out.write(formatResult(id,beta,betaSD,ts,ps)) from optparse import OptionParser,OptionGroup -usage = """usage: %prog [options] --kfile kinshipFile --[t | b | p]file plinkFileBase outfileBase +usage = """usage: %prog [options] --kfile kinshipFile --[tfile | bfile] plinkFileBase outfileBase -This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value. -The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values. +This program provides basic genome-wide association (GWAS) functionality. You provide a phenotype and genotype file as well as a pre-computed (use pylmmKinship.py) kinship matrix and the program outputs a result file with information about each SNP, including the association p-value. +The input file are all standard plink formatted with the first two columns specifiying the individual and family ID. For the phenotype file, we accept either NA or -9 to denote missing values. Basic usage: python pylmmGWAS.py -v --bfile plinkFile --kfile preComputedKinship.kin --phenofile plinkFormattedPhenotypeFile resultFile -NOTE: The current running version only supports binary PED files (PLINK). It is simple to convert between ped or tped and bed using PLINK. Sorry for the inconvinience. - - """ + """ parser = OptionParser(usage=usage) basicGroup = OptionGroup(parser, "Basic Options") advancedGroup = OptionGroup(parser, "Advanced Options") +experimentalGroup = OptionGroup(parser, "Experimental Options") -basicGroup.add_option("--pfile", dest="pfile", - help="The base for a PLINK ped file") +#basicGroup.add_option("--pfile", dest="pfile", +# help="The base for a PLINK ped file") basicGroup.add_option("--tfile", dest="tfile", help="The base for a PLINK tped file") basicGroup.add_option("--bfile", dest="bfile", help="The base for a PLINK binary bed file") basicGroup.add_option("--phenofile", dest="phenoFile", default=None, - help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. The order does not matter for this file. ") + help="Without this argument the program will look for a file with .pheno that has the plinkFileBase root. If you want to specify an alternative phenotype file, then use this argument. This file should be in plink format. ") + +# EMMA Options +basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the columns and snps on the rows.") +basicGroup.add_option("--emmaPHENO", dest="emmaPheno", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each phenotype as one row.") +basicGroup.add_option("--emmaCOV", dest="emmaCov", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with each covariate as one row.") + basicGroup.add_option("--kfile", dest="kfile", help="The location of a kinship file. This is an nxn plain text file and can be computed with the pylmmKinship program.") basicGroup.add_option("--covfile", dest="covfile", help="The location of a covariate file file. This is a plink formatted covariate file.") basicGroup.add_option("-p", type="int", dest="pheno", help="The phenotype index to be used in association.", default=0) + + advancedGroup.add_option("--removeMissingGenotypes", action="store_false", dest="normalizeGenotype", default=True, help="By default the program replaces missing genotypes with the minor allele frequency. This option overrides that behavior making the program remove missing individuals. NOTE: This can increase running time due to the need to recompute the eigendecomposition for each SNP with missing values.") @@ -69,16 +89,23 @@ advancedGroup.add_option("--REML", help="Use restricted maximum-likelihood (REML) (default is maximum-likelihood).") #advancedGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") advancedGroup.add_option("--eigen", dest="eigenfile", - help="The location of the precomputed eigendecomposition for the kinship file") + help="The location of the precomputed eigendecomposition for the kinship file. These can be computed with pylmmKinship.py.") advancedGroup.add_option("--noMean", dest="noMean", default=False,action="store_true", help="This option only applies when --cofile is used. When covfile is provided, the program will automatically add a global mean covariate to the model unless this option is specified.") +basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") + advancedGroup.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="Print extra info") +# Experimental Group Options +experimentalGroup.add_option("--kfile2", dest="kfile2", + help="The location of a second kinship file. This file has the same format as the first kinship. This might be used if you want to correct for another form of confounding.") + parser.add_option_group(basicGroup) parser.add_option_group(advancedGroup) +parser.add_option_group(experimentalGroup) (options, args) = parser.parse_args() @@ -89,68 +116,108 @@ from scipy import linalg from pylmm.lmm import LMM from pylmm import input -if len(args) != 1: - parser.error("Incorrect number of arguments") +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +if len(args) != 1: + parser.print_help() + sys.exit() + outFile = args[0] -if not options.pfile and not options.tfile and not options.bfile: - parser.error("You must provide at least one PLINK input file base") +if not options.tfile and not options.bfile and not options.emmaFile: +#if not options.pfile and not options.tfile and not options.bfile: + parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an EMMA formatted file (--emmaSNP).") if not options.kfile: - parser.error("Please provide a pre-computed kinship file") + parser.error("Please provide a pre-computed kinship file") + +numThreads = None +if options.numThreads: + numThreads = int(options.numThreads) # READING PLINK input -if options.verbose: - sys.stderr.write("Reading PLINK input...\n") -if options.bfile: - IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -elif options.tfile: - IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -elif options.pfile: - IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) -else: - parser.error("You must provide at least one PLINK input file base") - -if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos'): - parser.error("No .pheno file exist for %s" % (options.phenoFile or IN.fbase + '.phenos')) +if options.verbose: sys.stderr.write("Reading SNP input...\n") +if options.bfile: IN = input.plink(options.bfile,type='b', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +elif options.tfile: IN = input.plink(options.tfile,type='t', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +#elif options.pfile: IN = input.plink(options.pfile,type='p', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +elif options.emmaFile: IN = input.plink(options.emmaFile,type='emma', phenoFile=options.phenoFile,normGenotype=options.normalizeGenotype) +else: parser.error("You must provide at least one PLINK input file base") + +if not os.path.isfile(options.phenoFile or IN.fbase + '.phenos') and not os.path.isfile(options.emmaPheno): + parser.error("No .pheno file exist for %s. Please provide a phenotype file using the --phenofile or --emmaPHENO argument." % (options.phenoFile or IN.fbase + '.phenos')) + +# Read the emma phenotype file if provided. +# Format should be rows are phenotypes and columns are individuals. +if options.emmaPheno: + f = open(options.emmaPheno,'r') + P = [] + for line in f: + v = line.strip().split() + p = [] + for x in v: + try: + p.append(float(x)) + except: p.append(np.nan) + P.append(p) + f.close() + IN.phenos = np.array(P).T # READING Covariate File -if options.covfile: - if options.verbose: - sys.stderr.write("Reading covariate file...\n") - # Read the covariate file -- write this into input.plink - P = IN.getCovariates(options.covfile) - - if options.noMean: - X0 = P - else: - X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) - - if np.isnan(X0).sum(): - parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.") -else: - X0 = np.ones((IN.phenos.shape[0],1)) +if options.covfile: + if options.verbose: + sys.stderr.write("Reading covariate file...\n") + P = IN.getCovariates(options.covfile) + if options.noMean: + X0 = P + else: + X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) +elif options.emmaCov: + if options.verbose: + sys.stderr.write("Reading covariate file...\n") + P = IN.getCovariatesEMMA(options.emmaCov) + if options.noMean: + X0 = P + else: + X0 = np.hstack([np.ones((IN.phenos.shape[0],1)),P]) +else: + X0 = np.ones((IN.phenos.shape[0],1)) + +if np.isnan(X0).sum(): + parser.error("The covariate file %s contains missing values. At this time we are not dealing with this case. Either remove those individuals with missing values or replace them in some way.") # READING Kinship - major bottleneck for large datasets -if options.verbose: - sys.stderr.write("Reading kinship...\n") +if options.verbose: sys.stderr.write("Reading kinship...\n") begin = time.time() # This method seems to be the fastest and works if you already know the size of the matrix if options.kfile[-3:] == '.gz': - import gzip - f = gzip.open(options.kfile,'r') - F = f.read() # might exhaust mem if the file is huge - K = np.fromstring(F,sep=' ') # Assume that space separated - f.close() -else: - K = np.fromfile(open(options.kfile,'r'),sep=" ") + import gzip + f = gzip.open(options.kfile,'r') + F = f.read() # might exhaust mem if the file is huge + K = np.fromstring(F,sep=' ') # Assume that space separated + f.close() +else: + K = np.fromfile(open(options.kfile,'r'),sep=" ") K.resize((len(IN.indivs),len(IN.indivs))) end = time.time() # Other slower ways #K = np.loadtxt(options.kfile) #K = np.genfromtxt(options.kfile) -if options.verbose: - sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin)) - +if options.verbose: sys.stderr.write("Read the %d x %d kinship matrix in %0.3fs \n" % (K.shape[0],K.shape[1],end-begin)) + +if options.kfile2: + if options.verbose: sys.stderr.write("Reading second kinship...\n") + begin = time.time() + # This method seems to be the fastest and works if you already know the size of the matrix + if options.kfile2[-3:] == '.gz': + import gzip + f = gzip.open(options.kfile2,'r') + F = f.read() # might exhaust mem if the file is huge + K2 = np.fromstring(F,sep=' ') # Assume that space separated + f.close() + else: K2 = np.fromfile(open(options.kfile2,'r'),sep=" ") + K2.resize((len(IN.indivs),len(IN.indivs))) + end = time.time() + if options.verbose: sys.stderr.write("Read the %d x %d second kinship matrix in %0.3fs \n" % (K2.shape[0],K2.shape[1],end-begin)) # PROCESS the phenotype data -- Remove missing phenotype values # Keep will now index into the "full" data to select what we keep (either everything or a subset of non missing data @@ -158,81 +225,122 @@ Y = IN.phenos[:,options.pheno] v = np.isnan(Y) keep = True - v if v.sum(): - if options.verbose: - sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) - Y = Y[keep] - X0 = X0[keep,:] - K = K[keep,:][:,keep] - Kva = [] - Kve = [] + if options.verbose: sys.stderr.write("Cleaning the phenotype vector by removing %d individuals...\n" % (v.sum())) + Y = Y[keep] + X0 = X0[keep,:] + K = K[keep,:][:,keep] + if options.kfile2: K2 = K2[keep,:][:,keep] + Kva = [] + Kve = [] # Only load the decomposition if we did not remove individuals. # Otherwise it would not be correct and we would have to compute it again. if not v.sum() and options.eigenfile: - if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n") - Kva = np.load(options.eigenfile + ".Kva") - Kve = np.load(options.eigenfile + ".Kve") -else: - Kva = [] - Kve = [] + if options.verbose: sys.stderr.write("Loading pre-computed eigendecomposition...\n") + Kva = np.load(options.eigenfile + ".Kva") + Kve = np.load(options.eigenfile + ".Kve") +else: + Kva = [] + Kve = [] # CREATE LMM object for association n = K.shape[0] -L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose) -# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here -if not options.refit: - if options.verbose: sys.stderr.write("Computing fit for null model\n") - L.fit() - if options.verbose: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma)) +if not options.kfile2: L = LMM(Y,K,Kva,Kve,X0,verbose=options.verbose) +else: L = LMM_withK2(Y,K,Kva,Kve,X0,verbose=options.verbose,K2=K2) +# Fit the null model -- if refit is true we will refit for each SNP, so no reason to run here +if not options.refit: + if options.verbose: sys.stderr.write("Computing fit for null model\n") + L.fit() + if options.verbose and not options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f\n" % (L.optH,L.optSigma)) + if options.verbose and options.kfile2: sys.stderr.write("\t heritability=%0.3f, sigma=%0.3f, w=%0.3f\n" % (L.optH,L.optSigma,L.optW)) + +def compute_snp(collect): + snp = collect[0] + id = collect[1] + # result = [] + # Check SNPs for missing values + x = snp[keep].reshape((n,1)) # all the SNPs + v = np.isnan(x).reshape((-1,)) + if v.sum(): + keeps = True - v + xs = x[keeps,:] + if keeps.sum() <= 1 or xs.var() <= 1e-6: + # PS.append(np.nan) + # TS.append(np.nan) + # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) + # continue + return formatResult(id,np.nan,np.nan,np.nan,np.nan) + + # Its ok to center the genotype - I used options.normalizeGenotype to + # force the removal of missing genotypes as opposed to replacing them with MAF. + if not options.normalizeGenotype: + xs = (xs - xs.mean()) / np.sqrt(xs.var()) + Ys = Y[keeps] + X0s = X0[keeps,:] + Ks = K[keeps,:][:,keeps] + if options.kfile2: + K2s = K2[keeps,:][:,keeps] + Ls = LMM_withK2(Ys,Ks,X0=X0s,verbose=options.verbose,K2=K2s) + else: + Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose) + if options.refit: + Ls.fit(X=xs,REML=options.REML) + else: + #try: + Ls.fit(REML=options.REML) + #except: pdb.set_trace() + ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True) + else: + if x.var() == 0: + # PS.append(np.nan) + # TS.append(np.nan) + # result.append(formatResult(id,np.nan,np.nan,np.nan,np.nan)) # writes nan values + return formatResult(id,np.nan,np.nan,np.nan,np.nan) + # continue + + if options.refit: + L.fit(X=x,REML=options.REML) + # This is where it happens + ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True) + + return formatResult(id,beta,np.sqrt(betaVar).sum(),ts,ps) + # PS.append(ps) + # TS.append(ts) + # return len(result) + # compute.q.put(result) + # return None + +def f_init(q): + compute_snp.q = q + +# Set up the pool +# mp.set_start_method('spawn') +q = mp.Queue() +p = mp.Pool(numThreads, f_init, [q]) +collect = [] + # Buffers for pvalues and t-stats -PS = [] -TS = [] +# PS = [] +# TS = [] count = 0 out = open(outFile,'w') printOutHead() -for snp,id in IN: - count += 1 - if options.verbose and count % 1000 == 0: - sys.stderr.write("At SNP %d\n" % count) - - x = snp[keep].reshape((n,1)) - #x[[1,50,100,200,3000],:] = np.nan - v = np.isnan(x).reshape((-1,)) - # Check SNPs for missing values - if v.sum(): - keeps = True - v - xs = x[keeps,:] - if keeps.sum() <= 1 or xs.var() <= 1e-6: - PS.append(np.nan) - TS.append(np.nan) - outputResult(id,np.nan,np.nan,np.nan,np.nan) - continue - - # Its ok to center the genotype - I used options.normalizeGenotype to - # force the removal of missing genotypes as opposed to replacing them with MAF. - if not options.normalizeGenotype: xs = (xs - xs.mean()) / np.sqrt(xs.var()) - Ys = Y[keeps] - X0s = X0[keeps,:] - Ks = K[keeps,:][:,keeps] - Ls = LMM(Ys,Ks,X0=X0s,verbose=options.verbose) - if options.refit: Ls.fit(X=xs,REML=options.REML) - else: - #try: - Ls.fit(REML=options.REML) - #except: pdb.set_trace() - ts,ps,beta,betaVar = Ls.association(xs,REML=options.REML,returnBeta=True) - else: - if x.var() == 0: - PS.append(np.nan) - TS.append(np.nan) - outputResult(id,np.nan,np.nan,np.nan,np.nan) - continue - - if options.refit: L.fit(X=x,REML=options.REML) - ts,ps,beta,betaVar = L.association(x,REML=options.REML,returnBeta=True) - - outputResult(id,beta,np.sqrt(betaVar).sum(),ts,ps) - PS.append(ps) - TS.append(ts) +for snp_id in IN: + count += 1 + if count % 1000 == 0: + if options.verbose: + sys.stderr.write("At SNP %d\n" % count) + # if count>8000 : + # break # for testing only + if count % 100 == 0: + for line in p.imap(compute_snp,collect): + out.write(line) + collect = [] + + collect.append(snp_id) +for line in p.imap(compute_snp,collect): + out.write(line) + + diff --git a/wqflask/wqflask/my_pylmm/pylmmKinship.py b/wqflask/wqflask/my_pylmm/pylmmKinship.py index cfba2936..c9a73ece 100755 --- a/wqflask/wqflask/my_pylmm/pylmmKinship.py +++ b/wqflask/wqflask/my_pylmm/pylmmKinship.py @@ -23,10 +23,12 @@ #NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import sys +import pdb + from optparse import OptionParser,OptionGroup -usage = """usage: %prog [options] --[t | b | p]file plinkFileBase outfile +usage = """usage: %prog [options] --[tfile | bfile] plinkFileBase outfile -NOTE: The current running version only supports binary PED files (PLINK). It is simple to convert between ped or tped and bed using PLINK. Sorry for the inconvinience. """ parser = OptionParser(usage=usage) @@ -34,15 +36,20 @@ parser = OptionParser(usage=usage) basicGroup = OptionGroup(parser, "Basic Options") #advancedGroup = OptionGroup(parser, "Advanced Options") -basicGroup.add_option("--pfile", dest="pfile", - help="The base for a PLINK ped file") +#basicGroup.add_option("--pfile", dest="pfile", +# help="The base for a PLINK ped file") basicGroup.add_option("--tfile", dest="tfile", help="The base for a PLINK tped file") basicGroup.add_option("--bfile", dest="bfile", help="The base for a PLINK binary ped file") +basicGroup.add_option("--emmaSNP", dest="emmaFile", default=None, + help="For backwards compatibility with emma, we allow for \"EMMA\" file formats. This is just a text file with individuals on the rows and snps on the columns.") +basicGroup.add_option("--emmaNumSNPs", dest="numSNPs", type="int", default=0, + help="When providing the emmaSNP file you need to specify how many snps are in the file") basicGroup.add_option("-e", "--efile", dest="saveEig", help="Save eigendecomposition to this file.") basicGroup.add_option("-n", default=1000,dest="computeSize", type="int", help="The maximum number of SNPs to read into memory at once (default 1000). This is important when there is a large number of SNPs, because memory could be an issue.") +basicGroup.add_option("-t", "--nthreads", dest="numThreads", help="maximum number of threads to use") basicGroup.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, @@ -52,68 +59,128 @@ parser.add_option_group(basicGroup) #parser.add_option_group(advancedGroup) (options, args) = parser.parse_args() -if len(args) != 1: parser.error("Incorrect number of arguments") +if len(args) != 1: + parser.print_help() + sys.exit() + outFile = args[0] import sys import os import numpy as np from scipy import linalg -from pyLMM.lmm import calculate_kinship -from pyLMM import input - -if not options.pfile and not options.tfile and not options.bfile: - parser.error("You must provide at least one PLINK input file base") - +from pylmm.lmm import calculateKinship +from pylmm import input +import multiprocessing as mp # Multiprocessing is part of the Python stdlib +import Queue + +if not options.tfile and not options.bfile and not options.emmaFile: + parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") + +numThreads = None +if options.numThreads: + numThreads = int(options.numThreads) + if options.verbose: sys.stderr.write("Reading PLINK input...\n") if options.bfile: IN = input.plink(options.bfile,type='b') elif options.tfile: IN = input.plink(options.tfile,type='t') -elif options.pfile: IN = input.plink(options.pfile,type='p') -else: parser.error("You must provide at least one PLINK input file base") +#elif options.pfile: IN = input.plink(options.pfile,type='p') +elif options.emmaFile: + if not options.numSNPs: parser.error("You must provide the number of SNPs when specifying an emma formatted file.") + IN = input.plink(options.emmaFile,type='emma') +else: parser.error("You must provide at least one PLINK input file base (--tfile or --bfile) or an emma formatted file (--emmaSNP).") + +def compute_W(job): + """ + Read 1000 SNPs at a time into matrix and return the result + """ + n = len(IN.indivs) + m = options.computeSize + W = np.ones((n,m)) * np.nan # W matrix has dimensions individuals x SNPs (initially all NaNs) + for j in range(0,options.computeSize): + row = job*m + j + if row >= IN.numSNPs: + W = W[:,range(0,j)] + break + snp,id = IN.next() + if snp.var() == 0: + continue + W[:,j] = snp # set row to list of SNPs + return W + +def compute_dgemm(job,W): + """ + Compute Kinship(W)*j + + For every set of SNPs dgemm is used to multiply matrices T(W)*W + """ + res = None + try: + res = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) + except AttributeError: + res = np.dot(W,W.T) + compute_dgemm.q.put([job,res]) + return job + +def f_init(q): + compute_dgemm.q = q n = len(IN.indivs) -m = options.computeSize -W = np.ones((n,m)) * np.nan +# m = options.computeSize +# jobsize=m IN.getSNPIterator() -i = 0 -K = None -while i < IN.numSNPs: - j = 0 - while j < options.computeSize and i < IN.numSNPs: - snp,id = IN.next() - if snp.var() == 0: - i += 1 - continue - W[:,j] = snp - - i += 1 - j += 1 - if j < options.computeSize: W = W[:,range(0,j)] - - if options.verbose: - sys.stderr.write("Processing first %d SNPs\n" % i) - if K == None: - try: - K = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculateKinship(W) * j - except AttributeError: - K = np.dot(W,W.T) - else: - try: - K_j = linalg.fblas.dgemm(alpha=1.,a=W.T,b=W.T,trans_a=True,trans_b=False) # calculateKinship(W) * j - except AttributeError: - K_j = np.dot(W,W.T) - K = K + K_j - +# Annoying hack to get around the fact that it is expensive to determine the number of SNPs in an emma file +if options.emmaFile: IN.numSNPs = options.numSNPs +# i = 0 + +# mp.set_start_method('spawn') +q = mp.Queue() +p = mp.Pool(numThreads, f_init, [q]) +iterations = IN.numSNPs/options.computeSize+1 +# jobs = range(0,8) # range(0,iterations) + +results = [] + +K = np.zeros((n,n)) # The Kinship matrix has dimension individuals x individuals + +completed = 0 + +# for job in range(8): +for job in range(iterations): + if options.verbose: + sys.stderr.write("Processing job %d first %d SNPs\n" % (job, ((job+1)*options.computeSize))) + W = compute_W(job) + results.append(p.apply_async(compute_dgemm, (job,W))) + # Do we have a result? + try: + j,x = q.get_nowait() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + completed += 1 + except Queue.Empty: + pass + +for job in range(len(results)-completed): + j,x = q.get() + if options.verbose: sys.stderr.write("Job "+str(j)+" finished\n") + K_j = x + # print j,K_j[:,0] + K = K + K_j + K = K / float(IN.numSNPs) if options.verbose: sys.stderr.write("Saving Kinship file to %s\n" % outFile) np.savetxt(outFile,K) if options.saveEig: - if options.verbose: - sys.stderr.write("Obtaining Eigendecomposition\n") - Kva,Kve = linalg.eigh(K) - if options.verbose: - sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) - np.savetxt(outFile+".kva",Kva) - np.savetxt(outFile+".kve",Kve) + if options.verbose: sys.stderr.write("Obtaining Eigendecomposition\n") + Kva,Kve = linalg.eigh(K) + if options.verbose: sys.stderr.write("Saving eigendecomposition to %s.[kva | kve]\n" % outFile) + np.savetxt(outFile+".kva",Kva) + np.savetxt(outFile+".kve",Kve) + + + + diff --git a/wqflask/wqflask/parser.py b/wqflask/wqflask/parser.py index be92014d..5c89bc01 100755 --- a/wqflask/wqflask/parser.py +++ b/wqflask/wqflask/parser.py @@ -26,7 +26,7 @@ from pprint import pformat as pf def parse(pstring): """ - returned item serach_term is always a list, even if only one element + returned item search_term is always a list, even if only one element """ pstring = re.split(r"""(?:(\w+\s*=\s*[\('"\[][^)'"]*[\)\]'"]) | # LRS=(1 2 3), cisLRS=[4 5 6], etc (\w+\s*[=:\>\<][\w\*]+) | # wiki=bar, GO:foobar, etc @@ -42,6 +42,8 @@ def parse(pstring): print("separators:", separators) + + for item in pstring: splat = re.split(separators, item) print("splat is:", splat) diff --git a/wqflask/wqflask/search_results.py b/wqflask/wqflask/search_results.py index 9d2d0b88..c6521106 100755 --- a/wqflask/wqflask/search_results.py +++ b/wqflask/wqflask/search_results.py @@ -212,9 +212,9 @@ class SearchResultPage(object): # # #search_gene # #search_geno - # #searhch_pheno + # #search_pheno # #search_mrn - # #searhc_publish + # #search_publish def search(self): diff --git a/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee b/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee index af53bd09..83264da3 100755 --- a/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee +++ b/wqflask/wqflask/static/new/javascript/dataset_select_menu.coffee @@ -118,5 +118,12 @@ $ -> console.log("Calling:", populate_function) window[populate_function]() - - $("#make_default").click(make_default)
\ No newline at end of file + check_search_term = -> + search_term = $('#tfor').val() + console.log("search_term:", search_term) + if (search_term == "") + alert("Please enter one or more search terms or search equations.") + return false + + $("#make_default").click(make_default) + $("#btsearch").click(check_search_term)
\ No newline at end of file diff --git a/wqflask/wqflask/static/new/javascript/dataset_select_menu.js b/wqflask/wqflask/static/new/javascript/dataset_select_menu.js index 6f63b715..5bd54359 100644 --- a/wqflask/wqflask/static/new/javascript/dataset_select_menu.js +++ b/wqflask/wqflask/static/new/javascript/dataset_select_menu.js @@ -1,6 +1,6 @@ // Generated by CoffeeScript 1.8.0 $(function() { - var apply_default, dataset_info, group_info, make_default, open_window, populate_dataset, populate_group, populate_species, populate_type, process_json, redo_dropdown; + var apply_default, check_search_term, dataset_info, group_info, make_default, open_window, populate_dataset, populate_group, populate_species, populate_type, process_json, redo_dropdown; process_json = function(data) { window.jdata = data; populate_species(); @@ -133,5 +133,15 @@ $(function() { } return _results; }; - return $("#make_default").click(make_default); + check_search_term = function() { + var search_term; + search_term = $('#tfor').val(); + console.log("search_term:", search_term); + if (search_term === "") { + alert("Please enter one or more search terms or search equations."); + return false; + } + }; + $("#make_default").click(make_default); + return $("#btsearch").click(check_search_term); }); diff --git a/wqflask/wqflask/templates/index_page.html b/wqflask/wqflask/templates/index_page.html index eb4f883c..538a6799 100755 --- a/wqflask/wqflask/templates/index_page.html +++ b/wqflask/wqflask/templates/index_page.html @@ -18,7 +18,7 @@ <div class="row" style="width: 1400px !important;"> - <div class="col-xs-5 col-xs-5"> + <div class="col-xs-5"> <section id="search"> <div class="page-header"> <h1>Select and search</h1> @@ -29,31 +29,42 @@ <div class="form-group"> <label for="species" class="col-xs-1 control-label" style="width: 65px !important;">Species:</label> - <div class="col-xs-4 controls"> - <select name="species" id="species" class="form-control selectpicker span3" style="width: 300px !important;"></select> + <div class="col-xs-10 controls"> + <div class="col-xs-8"> + <select name="species" id="species" class="form-control selectpicker span3" style="width: 280px !important;"></select> + </div> + <div class="col-xs-4"> + <input id="make_default" class="btn btn-default form-control" value="Make Default"> + </div> </div> </div> <div class="form-group"> <label for="group" class="col-xs-1 control-label" style="width: 65px !important;">Group:</label> <div class="col-xs-4 controls input-append"> - <select name="group" id="group" class="form-control selectpicker span3" style="width: 300px !important;"></select> - <i class="icon-question-sign"></i> + <div class="col-xs-8"> + <select name="group" id="group" class="form-control selectpicker span3" style="width: 280px !important;"></select> + <i class="icon-question-sign"></i> + </div> </div> </div> <div class="form-group"> <label for="tissue" class="col-xs-1 control-label" style="width: 65px !important;">Type:</label> <div class="col-xs-4 controls"> - <select name="type" id="type" class="form-control selectpicker span3" style="width: 300px !important;"></select> + <div class="col-xs-8"> + <select name="type" id="type" class="form-control selectpicker span3" style="width: 280px !important;"></select> + </div> </div> </div> <div class="form-group"> <label for="dataset" class="col-xs-1 control-label" style="width: 65px !important;">Dataset:</label> <div class="col-xs-4 controls input-append"> - <select name="dataset" id="dataset" class="form-control selectpicker span5" style="width: 450px !important;"></select> - <i class="icon-question-sign"></i> + <div class="col-xs-8"> + <select name="dataset" id="dataset" class="form-control selectpicker span5" style="width: 450px !important;"></select> + <i class="icon-question-sign"></i> + </div> </div> </div> @@ -65,9 +76,11 @@ <!-- GET ANY SEARCH --> <div class="form-group"> - <label for="tfor" class="col-xs-1 control-label" style="width: 65px !important;">Search for:</label> + <label for="tfor" class="col-xs-1 control-label" style="padding-left: 0px; padding-right: 0px; width: 65px !important;">Search for:</label> <div class="col-xs-10 controls"> - <textarea name="search_terms" rows="2" class="form-control search-query" style="width: 450px !important;" id="tfor"></textarea> + <div class="col-xs-8"> + <textarea name="search_terms" rows="2" class="form-control search-query" style="max-width: 550px; width: 450px !important;" id="tfor"></textarea> + </div> </div> </div> @@ -75,12 +88,14 @@ <div class="form-group"> <label for="tfor" class="col-xs-1 control-label" style="width: 65px !important;"></label> <div class="col-xs-10 controls"> - <p>Enter terms, genes, ID numbers in the - <b>Search</b> field<br> - Use <b>*</b> or <b>?</b> wildcards (Cyp*a?, - synap*)<br> - Use <b>quotes</b> for terms such as <i>"tyrosine - kinase"</i></p> + <div class="col-xs-10"> + <p>Enter terms, genes, ID numbers in the + <b>Search</b> field<br> + Use <b>*</b> or <b>?</b> wildcards (Cyp*a?, + synap*)<br> + Use <b>quotes</b> for terms such as <i>"tyrosine + kinase"</i></p> + </div> </div> </div> @@ -92,9 +107,6 @@ <div class="col-xs-3 controls" style="width: 100px !important;"> <input id="btsearch" type="submit" class="btn btn-primary form-control" value="Search"> </div> - <div class="col-xs-4 controls" style="width: 150px !important;"> - <input id="make_default" class="btn btn-default form-control" value="Make Default"> - </div> </div> <input type="hidden" name="FormID" value="searchResult" class="form-control"> diff --git a/wqflask/wqflask/templates/new_security/login_user.html b/wqflask/wqflask/templates/new_security/login_user.html index 8ad0c79e..81e8b53a 100755 --- a/wqflask/wqflask/templates/new_security/login_user.html +++ b/wqflask/wqflask/templates/new_security/login_user.html @@ -7,7 +7,7 @@ {{ flash_me() }} <div class="page-header"> - <h1>Login</h1> + <h1> Login</h1> </div> |