diff options
Diffstat (limited to 'web/webqtl/compareCorrelates')
-rwxr-xr-x | web/webqtl/compareCorrelates/MultipleCorrelationPage.py | 108 | ||||
-rwxr-xr-x | web/webqtl/compareCorrelates/__init__.py | 0 | ||||
-rwxr-xr-x | web/webqtl/compareCorrelates/correlation.py | 359 | ||||
-rwxr-xr-x | web/webqtl/compareCorrelates/htmlModule.py | 279 | ||||
-rwxr-xr-x | web/webqtl/compareCorrelates/multitrait.py | 1121 | ||||
-rwxr-xr-x | web/webqtl/compareCorrelates/trait.py | 1074 |
6 files changed, 2941 insertions, 0 deletions
diff --git a/web/webqtl/compareCorrelates/MultipleCorrelationPage.py b/web/webqtl/compareCorrelates/MultipleCorrelationPage.py new file mode 100755 index 00000000..6a464ab6 --- /dev/null +++ b/web/webqtl/compareCorrelates/MultipleCorrelationPage.py @@ -0,0 +1,108 @@ +# Copyright (C) University of Tennessee Health Science Center, Memphis, TN. +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License +# as published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero General Public License for more details. +# +# This program is available from Source Forge: at GeneNetwork Project +# (sourceforge.net/projects/genenetwork/). +# +# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) +# at rwilliams@uthsc.edu and xzhou15@uthsc.edu +# +# +# +# This module is used by GeneNetwork project (www.genenetwork.org) +# +# Created by GeneNetwork Core Team 2010/08/10 +# +# Last updated by GeneNetwork Core Team 2010/10/20 + +from base.templatePage import templatePage +from utility import webqtlUtil +from base.webqtlTrait import webqtlTrait +from base import webqtlConfig +import multitrait + +# XZ, 09/09/2008: After adding several traits to collection, click "Compare Correlates" button, +# XZ, 09/09/2008: This class will generate what you see. +# XZ, 09/09/2008: This class just collect the input, then pass them to multitrait.py +######################################### +# Multiple Correlation Page +######################################### +class MultipleCorrelationPage(templatePage): + + def __init__(self,fd): + + templatePage.__init__(self, fd) + + if not self.openMysql(): + return + if not fd.genotype: + fd.readData() + + self.searchResult = fd.formdata.getvalue('searchResult') + if not self.searchResult: + heading = 'Compare Correlates' + detail = ['You need to select at least two traits in order to generate correlation matrix.'] + self.error(heading=heading,detail=detail) + print 'Content-type: text/html\n' + self.write() + return + if type("1") == type(self.searchResult): + self.searchResult = [self.searchResult] + + if self.searchResult: + if len(self.searchResult) > 100: + heading = 'Compare Correlates' + detail = ['In order to display Compare Correlates properly, Do not select more than %d traits for Compare Correlates.' % 100] + self.error(heading=heading,detail=detail) + print 'Content-type: text/html\n' + self.write() + return + else: + pass + + traitList = [] + for item in self.searchResult: + thisTrait = webqtlTrait(fullname=item, cursor=self.cursor) + thisTrait.retrieveInfo() + traitList.append(thisTrait) + else: + heading = 'Compare Correlates' + detail = [HT.Font('Error : ',color='red'),HT.Font('Error occurs while retrieving data from database.',color='black')] + self.error(heading=heading,detail=detail) + print 'Content-type: text/html\n' + self.write() + return + + + ########## + filename= webqtlUtil.genRandStr("mult_") + fp = open(webqtlConfig.IMGDIR+filename, 'wb') + fp.write('%s\n' % fd.RISet) + for thisTrait in traitList: + fp.write("%s,%s,%s\n" % (thisTrait.db.type,thisTrait.db.id,thisTrait.name)) + fp.close() + fd.formdata["filename"] = filename + + params = {"filename":filename, "targetDatabase":"", + "threshold":0.5, "subsetSize":10, + "correlation":"pearson", "subsetCount":10, + "firstRun":"1"} + results = [] + txtOutputFileName = "" + + self.dict['body'] = multitrait.TraitCorrelationPage(fd, params, self.cursor, traitList, results, + fd.RISet,txtOutputFileName).dict['body'] + self.dict['title'] = 'Compare Correlates' + + + + diff --git a/web/webqtl/compareCorrelates/__init__.py b/web/webqtl/compareCorrelates/__init__.py new file mode 100755 index 00000000..e69de29b --- /dev/null +++ b/web/webqtl/compareCorrelates/__init__.py diff --git a/web/webqtl/compareCorrelates/correlation.py b/web/webqtl/compareCorrelates/correlation.py new file mode 100755 index 00000000..f2ea55b3 --- /dev/null +++ b/web/webqtl/compareCorrelates/correlation.py @@ -0,0 +1,359 @@ +# Copyright (C) University of Tennessee Health Science Center, Memphis, TN. +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License +# as published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero General Public License for more details. +# +# This program is available from Source Forge: at GeneNetwork Project +# (sourceforge.net/projects/genenetwork/). +# +# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) +# at rwilliams@uthsc.edu and xzhou15@uthsc.edu +# +# +# +# This module is used by GeneNetwork project (www.genenetwork.org) +# +# Created by GeneNetwork Core Team 2010/08/10 +# +# Last updated by GeneNetwork Core Team 2010/10/20 + +# correlation.py +# functions for computing correlations for traits +# +# Originally, this code was designed to compute Pearson product-moment +# coefficents. The basic function calcPearson scans the strain data +# for the two traits and drops data for a strain unless both traits have it. +# If there are less than six strains left, we conclude that there's +# insufficent data and drop the correlation. +# +# In addition, this code can compute Spearman rank-order coefficents using +# the calcSpearman function. + +#Xiaodong changed the dependancy structure +import numarray +import numarray.ma as MA +import time + +import trait + +# strainDataUnion : StrainData -> StrainData -> array, array +def strainDataUnion(s1, s2): + # build lists of values that both have + # and make sure that both sets of values are in the same order + s1p = [] + s2p = [] + sortedKeys = s1.keys() + sortedKeys.sort() + for s in sortedKeys: + if s2.has_key(s): + s1p.append(s1[s]) + s2p.append(s2[s]) + + return (numarray.array(s1p, numarray.Float64), + numarray.array(s2p, numarray.Float64)) + +# calcCorrelationHelper : array -> array -> float +def calcCorrelationHelper(s1p, s2p): + # if the traits share less than six strains, then we don't + # bother with the correlations + if len(s1p) < 6: + return 0.0 + + # subtract by x-bar and y-bar elementwise + #oldS1P = s1p.copy() + #oldS2P = s2p.copy() + + s1p = (s1p - numarray.average(s1p)).astype(numarray.Float64) + s2p = (s2p - numarray.average(s2p)).astype(numarray.Float64) + + # square for the variances + s1p_2 = numarray.sum(s1p**2) + s2p_2 = numarray.sum(s2p**2) + + try: + corr = (numarray.sum(s1p*s2p)/ + numarray.sqrt(s1p_2 * s2p_2)) + except ZeroDivisionError: + corr = 0.0 + + return corr + +# calcSpearman : Trait -> Trait -> float +def calcSpearman(trait1, trait2): + s1p, s2p = strainDataUnion(trait1.strainData, + trait2.strainData) + s1p = rankArray(s1p) + s2p = rankArray(s2p) + return calcCorrelationHelper(s1p, s2p) + +# calcPearson : Trait -> Trait -> float +def calcPearson(trait1, trait2): + # build lists of values that both have + # and make sure that both sets of values are in the same order + s1p, s2p = strainDataUnion(trait1.strainData, + trait2.strainData) + + return calcCorrelationHelper(s1p, s2p) + +# buildPearsonCorrelationMatrix: (listof n traits) -> int s -> n x s matrix, n x s matrix +#def buildPearsonCorrelationMatrix(traits, sc): +# dim = (len(traits), sc) +# matrix = numarray.zeros(dim, MA.Float64) +# testMatrix = numarray.zeros(dim, MA.Float64) + +# for i in range(len(traits)): +# sd = traits[i].strainData +# for key in sd.keys(): +# matrix[i,int(key) - 1] = sd[key] +# testMatrix[i,int(key) - 1] = 1 + +def buildPearsonCorrelationMatrix(traits, commonStrains): + dim = (len(traits), len(commonStrains)) + matrix = numarray.zeros(dim, MA.Float64) + testMatrix = numarray.zeros(dim, MA.Float64) + + for i in range(len(traits)): + sd = traits[i].strainData + keys = sd.keys() + for j in range(0, len(commonStrains)): + if keys.__contains__(commonStrains[j]): + matrix[i,j] = sd[commonStrains[j]] + testMatrix[i,j] = 1 + + return matrix, testMatrix + +# buildSpearmanCorrelationMatrix: (listof n traits) -> int s -> n x s matrix, n x s matrix +def buildSpearmanCorrelationMatrix(traits, sc): + dim = (len(traits), sc) + matrix = numarray.zeros(dim, MA.Float64) + testMatrix = numarray.zeros(dim, MA.Float64) + + def customCmp(a, b): + return cmp(a[1], b[1]) + + for i in range(len(traits)): + # copy strain data to a temporary list and turn it into + # (strain, expression) pairs + sd = traits[i].strainData + tempList = [] + for key in sd.keys(): + tempList.append((key, sd[key])) + + # sort the temporary list by expression + tempList.sort(customCmp) + + for j in range(len(tempList)): + # k is the strain id minus 1 + # 1-based strain id -> 0-based column index + k = int(tempList[j][0]) - 1 + + # j is the rank of the particular strain + matrix[i,k] = j + + testMatrix[i,k] = 1 + + return matrix, testMatrix + +def findLargestStrain(traits, sc): + strainMaxes = [] + for i in range(len(traits)): + keys = traits[i].strainData.keys() + strainMaxes.append(max(keys)) + + return max(strainMaxes) + +def findCommonStrains(traits1, traits2): + commonStrains = [] + strains1 = [] + strains2 = [] + + for trait in traits1: + keys = trait.strainData.keys() + for key in keys: + if not strains1.__contains__(key): + strains1.append(key) + + for trait in traits2: + keys = trait.strainData.keys() + for key in keys: + if not strains2.__contains__(key): + strains2.append(key) + + for strain in strains1: + if strains2.__contains__(strain): + commonStrains.append(strain) + + return commonStrains + +def calcPearsonMatrix(traits1, traits2, sc, strainThreshold=6, + verbose = 0): + return calcMatrixHelper(buildPearsonCorrelationMatrix, + traits1, traits2, sc, strainThreshold, + verbose) + +def calcProbeSetPearsonMatrix(cursor, freezeId, traits2, strainThreshold=6, + verbose = 0): + + cursor.execute('select ProbeSetId from ProbeSetXRef where ProbeSetFreezeId = %s order by ProbeSetId' % freezeId) + ProbeSetIds = cursor.fetchall() + + results = [] + i=0 + while i<len(ProbeSetIds): + ProbeSetId1 = ProbeSetIds[i][0] + if (i+4999) < len(ProbeSetIds): + ProbeSetId2 = ProbeSetIds[i+4999][0] + else: + ProbeSetId2 = ProbeSetIds[len(ProbeSetIds)-1][0] + + traits1 = trait.queryPopulatedProbeSetTraits2(cursor, freezeId, ProbeSetId1, ProbeSetId2) # XZ,09/10/2008: add module name 'trait.' + SubMatrix = calcMatrixHelper(buildPearsonCorrelationMatrix, + traits1, traits2, 1000, strainThreshold, + verbose) + results.append(SubMatrix) + i += 5000 + + returnValue = numarray.zeros((len(ProbeSetIds), len(traits2)), MA.Float64) + row = 0 + col = 0 + for SubMatrix in results: + for i in range(0, len(SubMatrix)): + for j in range(0, len(traits2)): + returnValue[row,col] = SubMatrix[i,j] + col += 1 + col = 0 + row +=1 + + return returnValue + + + +# note: this code DOES NOT WORK, especially in cases where +# there are missing observations (e.g. when comparing traits +# from different probesetfreezes) +def calcSpearmanMatrix(traits1, traits2, sc, strainThreshold=6, + verbose=0): + return calcMatrixHelper(buildSpearmanCorrelationMatrix, + traits1, traits2, sc, strainThreshold, + verbose) + +def calcMatrixHelper(builder, traits1, traits2, sc, strainThreshold, + verbose): + + # intelligently figure out strain count + step0 = time.time() + #localSC = max(findLargestStrain(traits1, sc), + # findLargestStrain(traits2, sc)) + + commonStrains = findCommonStrains(traits1, traits2) + + buildStart = time.time() + matrix1, test1 = builder(traits1, commonStrains) + matrix2, test2 = builder(traits2, commonStrains) + buildTime = time.time() - buildStart + + step1 = time.time() + + ns = numarray.innerproduct(test1, test2) + + # mask all ns less than strainThreshold so the correlation values + # end up masked + # ns is now a MaskedArray and so all ops involving ns will be + # MaskedArrays + ns = MA.masked_less(ns, strainThreshold, copy=0) + + # divide-by-zero errors are automatically masked + #ns = -1.0/ns + + step2 = time.time() + + # see comment above to find out where this ridiculously cool + # matrix algebra comes from + xs = numarray.innerproduct(matrix1, test2) + ys = numarray.innerproduct(test1, matrix2) + xys = numarray.innerproduct(matrix1, matrix2) + + # use in-place operations to try to speed things up + numarray.power(matrix1, 2, matrix1) + numarray.power(matrix2, 2, matrix2) + + x2s = numarray.innerproduct(matrix1, test2) + y2s = numarray.innerproduct(test1, matrix2) + + step3 = time.time() + + # parens below are very important + # the instant we touch ns, arrays become masked and + # computation is much, much slower + top = ns*xys - (xs*ys) + bottom1 = ns*x2s - (xs*xs) + bottom2 = ns*y2s - (ys*ys) + bottom = MA.sqrt(bottom1*bottom2) + + # mask catches floating point divide-by-zero problems here + corrs = top / bottom + + step4 = time.time() + + # we define undefined correlations as zero even though there + # is a mathematical distinction + returnValue = MA.filled(corrs, 0.0) + + step5 = time.time() + + #print ("calcMatrixHelper: %.2f s, %.2f s, %.2f s, %.2f s, %.2f s, %.2f s, total: %.2f s" + # %(buildTime, + # buildStart - step0, + # step2 - step1, + # step3 - step2, + # step4 - step3, + # step5 - step4, + # step5 - step0)) + + if verbose: + print "Matrix 1:", matrix1 + print "Matrix 2:", matrix2 + print "Ns:", ns + print "Xs", xs + print "Ys", ys + print "XYs:", xys + print "Top:", top + print "Bottom 1:", bottom1 + print "Bottom 2:", bottom2 + print "Bottom:", bottom + print "Corrs:", corrsa + + + return returnValue + + + +# rankArray: listof float -> listof float +# to generate a companion list to alof with +# the actual value of each element replaced by the +# value's rank +def rankArray(floatArray): + # first we save the original index of each element + tmpAlof = [] + returnArray = numarray.zeros(len(floatArray), numarray.Float64) + i = 0 + for i in range(len(floatArray)): + tmpAlof.append((i,floatArray[i])) + + # now we sort by the data value + def customCmp(a,b): return cmp(a[1],b[1]) + tmpAlof.sort(customCmp) + + # finally we use the new rank data to populate the + # return array + for i in range(len(floatArray)): + returnArray[tmpAlof[i][0]] = i+1 + + return returnArray diff --git a/web/webqtl/compareCorrelates/htmlModule.py b/web/webqtl/compareCorrelates/htmlModule.py new file mode 100755 index 00000000..ebba3b86 --- /dev/null +++ b/web/webqtl/compareCorrelates/htmlModule.py @@ -0,0 +1,279 @@ +# Copyright (C) University of Tennessee Health Science Center, Memphis, TN. +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU Affero General Public License +# as published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero General Public License for more details. +# +# This program is available from Source Forge: at GeneNetwork Project +# (sourceforge.net/projects/genenetwork/). +# +# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) +# at rwilliams@uthsc.edu and xzhou15@uthsc.edu +# +# +# +# This module is used by GeneNetwork project (www.genenetwork.org) +# +# Created by GeneNetwork Core Team 2010/08/10 +# +# Last updated by GeneNetwork Core Team 2010/10/20 + +import sys +import string +import os +import MySQLdb +import cgi + +from htmlgen import HTMLgen2 as HT + +from base import webqtlConfig + + +# XZ 08/14/2008: When I tried to replace 'from webqtlConfig import *' with 'import webqtlConfig' +# XZ 08/14/2008: I found some problems. I discussed with Hongqiang and the below is conclusion. +# XZ 08/14/2008: The program uses webqtlConfig.DB_NAME, webqtlConfig.MYSQL_SERVER and so on +# XZ 08/14/2008: without 'import webqtlConfig'. This program will not work. +# XZ 08/14/2008: CONFIG_htmlpath doesn't exist in webqtlConfig.py +# XZ 08/14/2008: Hongqian said this was done by Fan Zhang, and this program was not tested. +# XZ 08/14/2008: So nobody realize these bugs. + +# XZ, 09/09/2008: This function is not called any where. +# XZ, 09/09/2008: Actually, I don't think this function works. +def genHeaderFooter(i=1,title='',basehref='',js1='',js2='',layer='',body=''): + """ + generate footer and header HTML code + default is header + i = 0 is footer+header + i = 1 is header + i = 2 is footer + """ + try: + temp_file = CONFIG_htmlpath + 'beta-template.html' + fp = open(temp_file, 'rb') + template = fp.read() + fp.close() + template = template % (title,basehref,js1,js2,layer,body, "") + header,footer = string.split(template,'<!-- split from Here -->') + if i == 0: + return header + footer + elif i == 1: + return header + elif i == 2: + return footer + else: + return "" + except: + if i == 0: + return "header + footer" + elif i == 1: + return "header" + elif i == 2: + return "footer" + else: + return "" + +# XZ, 09/09/2008: This function is only used in multitrait.py where it is called with value assigned to db. +# XZ, 09/09/2008: So the try-except block is not executed. +# XZ, 09/09/2008: This explains why no error was generated even without 'import webqtlConfig' +def genDatabaseMenu(db = None, public =1, RISetgp = 'BXD', selectname = 'database', selected = ""): + """ + generate database Menu + public = 0 : search3.html databases Menu + public = 1 : search.html databases Menu + """ + if not db: + try: + # import MySQLdb + # con = MySQLdb.Connect(db='db_webqtl') + # Modified by Fan Zhang + con = MySQLdb.Connect(db=webqtlConfig.DB_NAME,host=webqtlConfig.MYSQL_SERVER, user=webqtlConfig.DB_USER,passwd=webqtlConfig.DB_PASSWD) + db = con.cursor() + except: + return "Connect MySQL Server Error" + else: + pass + + databaseMenu = HT.Select(name=selectname) + nmenu = 0 + + # here's a hack: bxd and bxd300 can be correlated against each other + # if either of those are the group, we put in special SQL that pulls both + if RISetgp in ("BXD", "BXD300"): + ibsNameQry = '(InbredSet.Name = "BXD" OR InbredSet.Name = "BXD300")' + else: + ibsNameQry = 'InbredSet.Name = "%s"' % RISetgp + + #Publish Database + db.execute(''' + SelecT + PublishFreeze.FullName, + PublishFreeze.Name + from + PublishFreeze, + InbredSet + where + PublishFreeze.InbredSetId = InbredSet.Id and + %s + ''' % ibsNameQry) + for item in db.fetchall(): + databaseMenu.append(item) + nmenu += 1 + + #Genome Database + db.execute(''' + SelecT + GenoFreeze.FullName, + GenoFreeze.Name + from + GenoFreeze,InbredSet + where + GenoFreeze.InbredSetId = InbredSet.Id and + %s + ''' % ibsNameQry) + for item in db.fetchall(): + databaseMenu.append(item) + nmenu += 1 + + #Microarray Database + db.execute('SelecT Id, Name from Tissue') + for item in db.fetchall(): + TId, TName = item + databaseMenuSub = HT.Optgroup(label = '%s ------' % TName) + db.execute(''' + SelecT + ProbeSetFreeze.FullName, + ProbeSetFreeze.Name + from + ProbeSetFreeze, + ProbeFreeze, + InbredSet + where + ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and + ProbeFreeze.TissueId = %d and + ProbeSetFreeze.public > %d and + ProbeFreeze.InbredSetId = InbredSet.Id and + %s + order by + ProbeSetFreeze.CreateTime desc, + ProbeSetFreeze.AvgId + ''' % (TId,public,ibsNameQry)) + for item2 in db.fetchall(): + databaseMenuSub.append(item2) + nmenu += 1 + databaseMenu.append(databaseMenuSub) + + if nmenu: + if selected: + databaseMenu.selected.append(selected) + return str(databaseMenu) + else: + return '' + + +# XZ, 09/09/2008: This function is not called any where. +# XZ, 09/09/2008: Actually, I don't think this function works. +# XZ, 09/09/2008: There is no 'DataForm' file now. It should be webqtlForm.py +def genRISample(): + import glob + import reaper + import random + import math + import webqtlUtil + risets = filter(lambda X:X.find('F2')<0, map(os.path.basename, glob.glob(os.path.join(CONFIG_genodir, "*.geno")))) + risets = map(lambda X:X.split('.')[0], risets) + risets.remove("BayXSha") + risets.sort() + body = HT.Blockquote() + NPerRow = 6 + for item in risets: + values = [] + if item == 'AXBXA': item2='AXB/BXA' + elif item == 'HXBBXH': item2='HXB/BXH' + else: item2=item + body.append(HT.Paragraph(item2, Class='subtitle')) + tbl = HT.TableLite(Class="collap") + dataset = reaper.Dataset() + dataset.read(os.path.join(CONFIG_genodir, "%s.geno"%item)) + prgy = webqtlUtil.ParInfo[item] + list(dataset.prgy) + + mean = random.random()*100 + variance = random.random()*500 + variables = [] + while len(variables) < len(prgy): + S = 2 + while (S>=1): + U1= random.random() + U2= random.random() + V1= 2*U1-1.0 + V2= 2*U2-1.0 + S=V1*V1+V2*V2 + X= math.sqrt(-2 * math.log(S) / S) * V1 + Y= math.sqrt(-2 * math.log(S) / S) * V2 + variables.append(mean + math.sqrt(variance) * X) + variables.append(mean + math.sqrt(variance) * Y) + + tempTR = HT.TR() + for i, strain in enumerate(prgy): + if i and i%NPerRow==0: + tbl.append(tempTR) + tempTR = HT.TR() + if random.random() < 0.2: + variable = 'X' + else: + variable = "%2.3f" % variables[i] + + tempTR.append(HT.TD(strain, Class="strains", width=80)) + tempTR.append(HT.TD(variable, Class="values", width=60)) + values.append(variable) + + for j in range(NPerRow-i%NPerRow-1): + tempTR.append(HT.TD()) + tbl.append(tempTR) + body.append(tbl) + body.append(HT.Paragraph("Copy the following line to paste into the GeneNetwork entry box:")) + body.append(HT.Code(string.join(values, " "))) + body.append(HT.HR(width="90%")) + return body + +if __name__ == "__main__": + if os.environ.has_key('SCRIPT_FILENAME'): + script_filename = os.environ['SCRIPT_FILENAME'] + else: + script_filename = '' + #Used as cgi script + if script_filename and script_filename[-2:] == 'py': + print 'Content-type: text/html\n' + formdata = cgi.FieldStorage() + sys.stderr = sys.stdout + try: + getID = string.lower(formdata.getvalue('get')) + except: + getID = '' + #Used as command + else: + if len(sys.argv) >= 2: + getID = string.lower(sys.argv[1]) + else: + getID = '' + + if getID == 'headerfooter': + print genHeaderFooter(0) + elif getID == 'header': + print genHeaderFooter(1) + elif getID == 'footer': + print genHeaderFooter(2) + elif getID == 'databasemenu': + print genDatabaseMenu(public=0) + elif getID == 'datasample': + print genRISample() + else: + print genHeaderFooter(0) +else: + pass + diff --git a/web/webqtl/compareCorrelates/multitrait.py b/web/webqtl/compareCorrelates/multitrait.py new file mode 100755 index 00000000..047620af --- /dev/null +++ b/web/webqtl/compareCorrelates/multitrait.py @@ -0,0 +1,1121 @@ +# multitrait.py +# a tool to analyze the correlations between several different traits and the traits +# in a given dataset +# +# Parameters: +# correlation -- either "pearson" or "spearman" depending on which ones we want to use +# +# filename -- an input file containing the traits to analyze +# +# progress -- if set, this parameter outputs a static progress page +# and uses a META redirect to trigger the real computation +# +# targetDatabaseType: +# one of "ProbeSet", "Publish", "Genotype" depending on the type of database +# we will use for the analysis +# +# targetDatabaseId: +# the id (*Freeze.Id in the database) of the particular database we will analyze +# +# threshold -- a float between 0 and 1 to determine which coefficents we wil l consider +# +# firstRun -- either 0 or 1 +# whether to automatically pick reasonable defaults for the other three parameters +# +# outputType -- either "html" or "text" +# +# Author: Stephen Pitts +# June 15, 2004 + +#Xiaodong changed the dependancy structure + +import copy +import sys +import cgi +import os +import os.path +import math +import time +import numarray +import tempfile +import string +import cgitb #all tracebacks come out as HTMLified CGI,useful when we have a random crash in the middle + +from base import templatePage +from base.webqtlTrait import webqtlTrait +from utility import webqtlUtil +from base import webqtlConfig +import trait +import correlation +import htmlModule + +cgitb.enable() + + +# where this program's data files are +RootDir = webqtlConfig.IMGDIR # XZ, 09/10/2008: add module name 'webqtlConfig.' +RootDirURL = "/image/" # XZ, 09/10/2008: This parameter is not used in this module + +tempfile.tempdir = RootDir +tempfile.template = "multitrait" + +# MultitraitException: used if something goes wrong +# maybe in the future we should make exceptions more granular +class MultitraitException(Exception): + def __init__(self, message): + self.message = message + + def __repr__(self): + return "MultitraitException: %s" % self.message + +# buildParamDict: Cursor -> ParamDict +# to process and validate CGI arguments +# see the comment at the top of this file for valid cgi +# parameters +def buildParamDict(cursor, fd): + params = {} + fs = fd.formdata #cgi.FieldStorage() + params["progress"] = fs.getfirst("progress", "0") + params["filename"] = fs.getfirst("filename", "") + if params["filename"] == "": + raise MultitraitException("Required parameter filename missing.") + + params["targetDatabase"] = fs.getfirst("targetDatabase", "U74Av2RMA_Raw_ProbeSet_March04") + params["firstRun"] = webqtlUtil.safeInt(fs.getfirst("firstRun", "0"),0) + params["threshold"] = webqtlUtil.safeFloat(fs.getfirst("threshold", "0.5"), 0.5) + params["subsetSize"] = webqtlUtil.safeInt(fs.getfirst("subsetSize", "10"), 10) + + if params["subsetSize"] < -1: + params["subsetSize"] = -1 + + params["correlation"] = fs.getfirst("correlation", "pearson") + params["subsetCount"] = webqtlUtil.safeInt(fs.getfirst("subsetCount", 10), 10) + + if params["subsetCount"] < -1: + params["subsetCount"] = -1 + + #params["outputType"] = fs.getfirst("outputType", "html") + + #if params["outputType"] not in ("html", "text"): + # params["outputType"] = "html" + + if params["correlation"] not in ("pearson", "spearman"): + params["correlation"] = "pearson" + + params["correlationName"] = params["correlation"].capitalize() + + # one of two cases: + # 1) We have just come from a submit, so there are a bunch of display* + # but no displaySets. Thus, the code down there converts the display* + # to displaySets so the GET request doesn't get too long + # 2) We have just been redirected from a progress page which already has + # a converted displaySets for us. + + displaySets = webqtlUtil.safeInt(fs.getfirst("displaySets","0"), 0) + + if displaySets == 0: + for key in fs.keys(): + if key[:7] == "display": + #print "Hit display key %s<br>" % key + try: + whichSet = int(key[7:]) + + # prevent malicious attacks + whichSet = min(whichSet, 512) + displaySets += pow(2, whichSet) + + except ValueError: pass + + params["displaySets"] = displaySets + #print "In the beginning, display sets was %s: %s<br>" % (displaySets, + # str(binaryDecompose(displaySets))) + + # if we are just gonna display a progress page, then there's no + # reason to look up detailed database information + #if params["progress"] == "1": + # return params + + a,b = trait.dbNameToTypeId(cursor, params["targetDatabase"]) # XZ, 09/10/2008: add module name + params["targetDatabaseType"] = a + params["targetDatabaseId"] = b + params["targetDatabaseName"] = params["targetDatabase"] + + return params + +# readInputFile: DB cursor -> string -> string, (arrayof Trait) +def readInputFile(cursor, filename): + """ + To read an input file with n lines in the following format + <databasetype>,<databaseid>,<traitname> + and retrieve and populate traits with appropriate data + from the database + + Also, for our purposes. we store the database type and + database id in fields attached to the trait instances. We use + this information to generate Javascript popups with trait + information. + + In addition, we read the strain of mice that the traits are + from so we can only show those databases to correlate against. + """ + handle = open(filename) + line = handle.readline() + inbredSetName = line.strip() + line = handle.readline() + traits = [] + +# XZ, 09/10/2008: In this while loop block, I changed the original variable name 'trait' to 'oneTrait' + while line != "": + line = line.strip() + dbType, dbId, tName = line.split(",") + + if dbType == "ProbeSet": + oneTrait = trait.queryProbeSetTraitByName(cursor, tName) # XZ, 09/10/2008: add module name + oneTrait.populateDataId(cursor, dbId) + oneTrait.dbName = trait.dbTypeIdToName(cursor, dbType, dbId) # XZ, 09/10/2008: add module name + elif dbType == "Geno": + speciesId = trait.getSpeciesIdByDbTypeId(cursor, dbType, dbId) + oneTrait = trait.queryGenotypeTraitByName(cursor, speciesId, tName) # XZ, 09/10/2008: add module name + oneTrait.populateDataId(cursor, dbId) + oneTrait.dbName = trait.dbTypeIdToName(cursor, dbType, dbId) # XZ, 09/10/2008: add module name + elif dbType == "Publish": + oneTrait = trait.queryPublishTraitByName(cursor, dbId, tName) # XZ, 09/10/2008: add module name + oneTrait.populateDataId(cursor, dbId) + oneTrait.dbName = trait.dbTypeIdToName(cursor, dbType, dbId) # XZ, 09/10/2008: add module name + elif dbType == "Temp": + oneTrait = trait.queryTempTraitByName(cursor, tName) # XZ, 09/10/2008: add module name + oneTrait.populateDataId(cursor, dbId) + oneTrait.dbName = "Temp" + + oneTrait.populateStrainData(cursor) + traits.append(oneTrait) + + line = handle.readline() + + return inbredSetName, traits + +# loadDatabase: Cursor -> ParamDict -> arrayof Trait +def loadDatabase(cursor, p): + """ + To load a set of traits as specified by the + targetDatabaseId + and targetDatabaseType parameters + + Cursor should be a fastCursor from the webqtl library (i.e. + a MySQLdb SSCursor). + + Calling populateStrainData 20,000 or so times on a ProbeSet + is really inefficent, so I wrote an optimized queryPopulatedProbeSetTraits + in the trait module that uses a join to get all of the rows in + bulk, store the resultset on the server, and do all sorts of nice buffering. + It's about two or three times faster. + """ + if p["targetDatabaseType"] == "ProbeSet": # XZ, 09/10/2008: add module name + dbTraits = trait.queryPopulatedProbeSetTraits(cursor, + p["targetDatabaseId"]) + elif p["targetDatabaseType"] == "Publish": # XZ, 09/10/2008: add module name + dbTraits = trait.queryPublishTraits(cursor, + p["targetDatabaseId"]) + psd = trait.PublishTrait.populateStrainData + elif p["targetDatabaseType"] == "Geno": # XZ, 09/10/2008: add module name + dbTraits = trait.queryGenotypeTraits(cursor, + p["targetDatabaseId"]) + psd = trait.GenotypeTrait.populateStrainData + else: + print "Unknown target database type %s" % p["targetDatabaseType"] + + if p["targetDatabaseType"] != "ProbeSet": + map(psd, dbTraits, [cursor]*len(dbTraits)) + + return dbTraits + +def runProbeSetCorrelations(cursor, p, traits): + """ + To run the correlations between the traits and the database. + This function computes a correlation coefficent between each + trait and every entry in the database, and partitions the database + into a disjoint array of arrays which it returns. + + The length of the return array is 2^n, where n is the length of + the trait array. Which constitutent element a of the return array + a given trait ends up in is determined by the following formula + i = i_02^0 + ... + i_(n-1)2^(n-1) + where i_0 is 1 if corr(a,trait 0) >= threshold and 0 otherwise + + Since most of the several thousand database traits will end up + with i=0, we don't return them, so the first element of the + return array will be empty. + + A particular element of subarray j of the return array contains + a 2-tuple (trait,kvalues). The variable trait is obviously the + particular database trait that matches the user traits l_1, ..., l_m + to which subarray j corresponds. kvalues is a list of the correlation + values linking trait to l_1, ..., l_m, so the length of kvalues is + the number of 1s in the binary representation of j (there must be + a better way to describe this length). + + The return array is an array of 2-tuples. The first element of + each tuple is the index of the particular subarray, and the second + element is the subarray itself. The array is sorted in descending + order by the number of 1's in the binary representation of the + index so the first few subarrays are the ones that correspond to + the largest sets. Each subarray is then sorted by the average of + the magnitude of the individual correlation values. + """ + + kMin = p["threshold"] + traitArrays = {} + + # TODO: Add Spearman support + freezeId = p["targetDatabaseId"] + if p["correlation"] == "pearson": + correlations = correlation.calcProbeSetPearsonMatrix(cursor, freezeId, traits) #XZ, 09/10/2008: add module name + else: + correlations = correlation.calcProbeSetSpearmanMatrix(freezeId, traits) #XZ, 09/10/2008: add module name + + # now we test all of the correlations in bulk + test = numarray.absolute(correlations) + test = numarray.greater_equal(test, kMin) + test = test.astype(numarray.Int8) + #print test + + db = trait.queryProbeSetTraits(cursor, freezeId) #XZ, 09/10/2008: add module name + for i in range(len(db)): + cIndex = 0 + prods = [] + for j in range(len(traits)): + if test[i,j] == 1: + cIndex += pow(2, j) + prods.append(correlations[i,j]) + if cIndex != 0: + if not traitArrays.has_key(cIndex): + traitArrays[cIndex] = [] + + traitArrays[cIndex].append((db[i], prods)) + + + # sort each inner list of traitArrays + # so the matched traits appear in descending order by the + # average magnitude of the correlation + def customCmp(traitPair, traitPair2): + magAvg1 = numarray.average(map(abs, traitPair[1])) + magAvg2 = numarray.average(map(abs, traitPair2[1])) + + # invert the sign to get descending order + return -cmp(magAvg1, magAvg2) + + for traitArray in traitArrays.values(): + traitArray.sort(customCmp) + + # sort the outer list of traitArrays + traitArrays2 = [] + i = 0 + for key in traitArrays.keys(): + a = traitArrays[key] + if len(a) > 0: + traitArrays2.append((key,a,len(binaryDecompose(key)), + len(a))) + + # we sort by the number of 1's in the binary output + # and then by the size of the list, both in descending order + def customCmp2(aL,bL): + a = -cmp(aL[2], bL[2]) + if a == 0: + return -cmp(aL[3], bL[3]) + else: + return a + + traitArrays2.sort(customCmp2) + + return traitArrays2 + +def runCorrelations(p, strainCount, traits, db): + """ + To run the correlations between the traits and the database. + This function computes a correlation coefficent between each + trait and every entry in the database, and partitions the database + into a disjoint array of arrays which it returns. + + The length of the return array is 2^n, where n is the length of + the trait array. Which constitutent element a of the return array + a given trait ends up in is determined by the following formula + i = i_02^0 + ... + i_(n-1)2^(n-1) + where i_0 is 1 if corr(a,trait 0) >= threshold and 0 otherwise + + Since most of the several thousand database traits will end up + with i=0, we don't return them, so the first element of the + return array will be empty. + + A particular element of subarray j of the return array contains + a 2-tuple (trait,kvalues). The variable trait is obviously the + particular database trait that matches the user traits l_1, ..., l_m + to which subarray j corresponds. kvalues is a list of the correlation + values linking trait to l_1, ..., l_m, so the length of kvalues is + the number of 1s in the binary representation of j (there must be + a better way to describe this length). + + The return array is an array of 2-tuples. The first element of + each tuple is the index of the particular subarray, and the second + element is the subarray itself. The array is sorted in descending + order by the number of 1's in the binary representation of the + index so the first few subarrays are the ones that correspond to + the largest sets. Each subarray is then sorted by the average of + the magnitude of the individual correlation values. + """ + kMin = p["threshold"] + traitArrays = {} + + # TODO: Add Spearman support + if p["correlation"] == "pearson": + correlations = correlation.calcPearsonMatrix(db, traits, strainCount) #XZ, 09/10/2008: add module name + else: + correlations = correlation.calcSpearmanMatrix(db, traits, strainCount) #XZ, 09/10/2008: add module name + + # now we test all of the correlations in bulk + test = numarray.absolute(correlations) + test = numarray.greater_equal(test, kMin) + test = test.astype(numarray.Int8) + #print test + + + for i in range(len(db)): + cIndex = 0 + prods = [] + for j in range(len(traits)): + if test[i,j] == 1: + cIndex += pow(2, j) + prods.append(correlations[i,j]) + if cIndex != 0: + if not traitArrays.has_key(cIndex): + traitArrays[cIndex] = [] + + traitArrays[cIndex].append((db[i], prods)) + + # sort each inner list of traitArrays + # so the matched traits appear in descending order by the + # average magnitude of the correlation + def customCmp(traitPair, traitPair2): + magAvg1 = numarray.average(map(abs, traitPair[1])) + magAvg2 = numarray.average(map(abs, traitPair2[1])) + + # invert the sign to get descending order + return -cmp(magAvg1, magAvg2) + + for traitArray in traitArrays.values(): + traitArray.sort(customCmp) + + # sort the outer list of traitArrays + traitArrays2 = [] + i = 0 + for key in traitArrays.keys(): + a = traitArrays[key] + if len(a) > 0: + traitArrays2.append((key,a,len(binaryDecompose(key)), + len(a))) + + # we sort by the number of 1's in the binary output + # and then by the size of the list, both in descending order + def customCmp2(aL,bL): + a = -cmp(aL[2], bL[2]) + if a == 0: + return -cmp(aL[3], bL[3]) + else: + return a + + traitArrays2.sort(customCmp2) + + return traitArrays2 + + +# XZ, 09/09/2008: In multiple trait correlation result page, +# XZ, 09/09/2008: click "Download a text version of the above results in CSV format" + +# TraitCorrelationText: a class to display trait correlations +# as textual output +class TraitCorrelationText: + # build a text shell to describe the given trait correlations + # this method sets self.output; use str(self) to actually + # get the text page + # + # traits is a list of traits and traitArray is a + # list of 3-tuples: index, traits', garbage + # where index is a binary-encoded description of which subset of + # traits the list traits' matches + # + # traits' is a list of 3-tuples as well: trait, correlations, garbage + # where trait is a particular trait and correlations is a list of float + # correlations (matching traits above) + def __init__(self, p, traits, traitArray): + output = "Correlation Comparison\n" + output += "from WebQTL and the University of Tennessee Health Science Center\n" + output += "initiated at " + time.asctime(time.gmtime()) + " UTC\n\n" + + output += self.showOptionPanel(p) + output += self.showSelectedTraits(traits) + output += self.showSummaryCorrelationResults(p, traits, traitArray) + output += self.showDetailedCorrelationResults(p, traits, traitArray) + + self.output = output + + # showOptionPanel: ParamDict -> string + # to display the options used to run this correlation + def showOptionPanel(self, params): + output = "Correlation Comparison Options:\n" + output += "Target database,%s\n" % params["targetDatabase"] + output += "Correlation type,%s\n" % params["correlationName"] + output += "Threshold,%f\n" % params["threshold"] + #output += "Subsets to Show,%d\n" % params["subsetCount"] + #output += "Traits to Show Per Subset,%d\n\n" % params["subsetSize"] + return output + + # showSelectedTraits: (listof Trait) -> string + # to display the traits compared with the database + # note: we can't use tabular output because the traits could be of + # different types and produce different fields + def showSelectedTraits(self, traits): + output = "Selected Traits:\n" + for trait in traits: + output += '"' + trait.longName() + '"' + "\n" + output += "\n" + return output + + # showSummaryCorrelationResults: ParamDict -> (listof Trait) -> + # TraitArray -> string + # see comment for __init__ for a description of TraitArray + # + # to show a summary (sets and sizes) of the correlation results + # as well as an X to indicate whether they will be included + # in the detailed output + def showSummaryCorrelationResults(self, p, traits, traitArray): + output = "Correlation Comparison Summary:\n" + + #if p["subsetCount"] != -1: + # ourSubsetCount = min(p["subsetCount"], len(traitArray)) + #else: + + ourSubsetCount = len(traitArray) + + displayDecomposition = binaryDecompose(p["displaySets"]) + for j in range(ourSubsetCount): + i = traitArray[j][0] + traitSubarray = traitArray[j][1] + if len(traitSubarray) == 0: + continue + + targetTraits = decomposeIndex(traits, i) + traitDesc = string.join(map(trait.Trait.shortName, targetTraits), # XZ, 09/10/2008: add module name + ", ") + if j in displayDecomposition: + checked = "X" + else: + checked = "" + + output += '"%s","%s","%d"\n' % (checked, traitDesc, len(traitSubarray)) + + output += "\n" + return output + + # showDetailedCorrelationResults: ParamDict -> (listof Trait) -> + # TraitArray -> string + # + # to show a detailed list of the correlation results; that is, + # to completely enumerate each subset of traitArray using the + # filtering parameters in p + def showDetailedCorrelationResults(self, p, traits, traitArray): + output = "Correlation Comparison Details:\n" + displayDecomposition = binaryDecompose(p["displaySets"]) + displayDecomposition.sort() + + def formatCorr(c): + return "%.4f" % c + + for j in displayDecomposition: + i = traitArray[j][0] + traitSubarray = traitArray[j][1] + + if len(traitSubarray) == 0: + continue + + targetTraits = decomposeIndex(traits, i) + extraColumnHeaders = map(trait.Trait.shortName, targetTraits) # XZ, 09/10/2008: add module name + traitDesc = string.join(extraColumnHeaders, ", ") + + #if(p["subsetSize"] != -1 and len(traitSubarray) > p["subsetSize"]): + # traitDesc += ",(showing top %s of %s)" % (p["subsetSize"], + # len(traitSubarray)) + # traitSubarray = traitSubarray[0:p["subsetSize"]] + + output += "%s\n" % traitDesc + output += traitSubarray[0][0].csvHeader([], extraColumnHeaders) + output += "\n" + for oneTrait, corr in traitSubarray:#XZ, 09/10/2008: change original variable name 'trait' to 'oneTrait' + corr = map(formatCorr, corr) + output += oneTrait.csvRow([], corr) + "\n" + + output += "\n" + + return output + + # __str__ : string + # to return self.output as the string representation of this page + # self.output is built in __init__ + def __str__(self): + return self.output + +# TraitCorrelationPage: a class to display trait correlations +# for now this is just one HTML file, so we don't even write it +# to a temporary file somewhere +class TraitCorrelationPage(templatePage.templatePage): + """ + Using the templatePage class, we build an HTML shell for + the core data here: the trait correlation lists. + + The way templatePage works, we build the page in pieces in + the __init__ method and later on use the inherited write + method to render the page. + """ + def __init__(self, fd, p, cursor, traits, traitArray, inbredSetName, txtFilename): + + templatePage.templatePage.__init__(self, fd) + + self.dict["title"] = "Correlation Comparison" + self.dict["basehref"] = "" + # NL: deleted js1 content part, since it has not been used in this project + self.dict["js1"] = "" + self.dict["js2"] = "" + + body = "<td><h1>Correlation Comparison</h1>" + body += "<p>Run at %s UTC</p>" % time.asctime(time.gmtime()) + body += """ + <p>The correlation comparison tool identifies intersecting sets of traits that are +correlated with your selections at a specified threshold. A correlation comparison +involves the following steps:</p> +<ol> +<li><p> +<b>Correlate:</b> +Choose a <i>Target Database</i>, a <i>Correlation Type</i>, and a <i>Correlation +Threshold</i>. For your initial correlation, leave <i>Number of Subsets to Show</i> and +<i>Traits to Show per Subset</i> at their default values of 10. Using the Correlation +Options panel, you can adjust the <i>Correlation Threshold</i>, <i>Number of Subsets to +Show</i>, and <i>Traits to Show per Subset</i>. +</p></li> + +<li><p> +<b>Add to Collection:</b> +You can use the check boxes in the <i>Correlation +Comparison Details</i> panel and the buttons at the bottom of the page to add these +results to your selections page for further analysis in WebQTL. +</p></li> + +<li><p> +<b>Filter:</b> +Using the <i>Correlation Comparison Summary</i> panel, choose which +subsets you would like to display for export. Note that if you change the +parameters in the <i>Correlation Options</i> panel, you will need to re-apply your filter. +</p></li> + +<li><p> +<b>Export:</b> +Once you are satisfied with your report, use the export link at +the bottom of the page to save the report as a comma-separated (CSV) text file +which you can then import into Excel or another tool. Note: the exported report +will list all subsets in the summary view and only those traits in the subsets +you have selected in the Filter step. +</p></li> +</ol> +""" + +# body += """ +# <p>The correlation +# comparison tool identifies the intersecting sets of traits that are +# correlated with your selections. A correlation comparison involves +# the following steps:</p> +# <ol> +# <li><p><b>Correlate:</b> Choose a <i>Target Database</i>, a <i>Correlation Type</i>, and a <i>Correlation Threshold</i>. +# For the initial correlation, leave <i>Subsets to Show</i> and <i>Traits to Show per Subset</i> +# at their default values of 10.</p></li> +# <li><p><b>Refine Correlation:</b> Using the <i>Correlation Options</i> panel, +# adjust the <i>Correlation Threshold</i>, <i>Subsets to Show</i>, and <i>Traits to +# Show per Subset</i> until you have a reasonable number of traits.</p></li> +# <li><p><b>Filter:</b> Using the <i>Correlation Comparison Summary</i> panel, choose which subsets you would +# like to see. Note that if you change the parameters in the <i>Correlation Options</i> panel, you will +# loose the filter you have selected.</p></li> +# <li><p><b>Export:</b> Once you are satisfied with your report, use the export +# link at the bottom of the page to save the report as a comma-separated (CSV) text file which +# you can then import into Excel or another tool. Note: the exported report +# will show all subsets in the summary view and all traits in each subset you have +# selected in the Filter step. +# <li><p><b>Shopping Cart:</b> In addition, you can use the +# check boxes in the <i>Correlation Comparison Details</i> panel and the +# buttons at the bottom of the page to add the traits you have found to the shopping cart.</p> +# </li> +# </ol> +# """ + + body += self.showOptionPanel(p, cursor, inbredSetName) + body += self.showSelectedTraits(traits, p, inbredSetName) + + if p["firstRun"] == 0: + body += self.showCorrelationResults(p, inbredSetName, traits, traitArray) + + exportParams = copy.copy(p) + exportParams["outputType"] = "text" + + body += (''' + <h2>Export these results</h2> + <p> + <a href="/image/%s">Download a text version of the above results in CSV format</a>. This text version differs from + the version you see on this page in two ways. First, the summary view shows all subsets. Second, the details + view shows all traits in the subsets that you have selected. + </p> + ''' + % txtFilename) + + + + body += "</td>" + self.dict["body"] = body + + + # showOptionPanel: ParamDict -> Cursor -> String -> String + # to build an option panel for the multitrait correlation + # we expect the database list to be a list of 2-tuples + # the first element of each tuple is the short name + # and the second element of the tuple is the long name + def showOptionPanel(self, params, cursor, inbredSetName): + output = ''' + <h2>Correlation Options</h2> + <FORM METHOD="POST" ACTION="%s%s" ENCTYPE="multipart/form-data"> + <INPUT TYPE="hidden" NAME="FormID" VALUE="compCorr2"> + <input type="hidden" name="filename" value="%s"> + <input type="hidden" name="firstRun" value="0"> + <input type="hidden" name="progress" value="1"> + <table> + <tr> + <td>Target Database:</td><td> + ''' % (webqtlConfig.CGIDIR, webqtlConfig.SCRIPTFILE, params["filename"]) + + output += htmlModule.genDatabaseMenu(db = cursor, + public=0, + RISetgp = inbredSetName, + selectname="targetDatabase", + selected=params["targetDatabase"]) + output += "</td></tr>" + + corrSelected = ["",""] + + if params["correlation"] == "pearson": + corrSelected[0] = "SELECTED" + else: + corrSelected[1] = "SELECTED" + + output += (''' + <tr> + <td>Correlation Method:</td> + <td><select name="correlation"> + <option value="pearson" %s>Pearson</option> + <!--<option value="spearman" %s>Spearman</option>--> + </select></td></tr> + ''' % (corrSelected[0], corrSelected[1])) + output += ('<tr><td>Correlation Threshold:</td><td><input name="threshold" value="%s" /></td></tr>' + % params["threshold"]) + output += ('<tr><td>Subsets to Show (-1 to show all subsets):</td><td><input name="subsetCount" value="%s" /></td></tr>' + % params["subsetCount"]) + output += ('<tr><td>Traits to Show per Subset (-1 to show all traits):</td><td><input name="subsetSize" value="%s" /></td></tr>' + % params["subsetSize"]) + + # a cosmetic change to hopefully make this form a bit easier to use +# if params["firstRun"] == 1: +# applyName = "Correlate" +# else: +# applyName = "Refine Correlation" + + output += ''' + <tr> + <td colspan="2"><input class="button" type="submit" value="Correlate" /></td> + </tr> + </table> + </form> + ''' + + return output + + # showSelectedTraits: listof Trait -> string + # to show a list of the selected traits + def showSelectedTraits(self, traits, p, inbredSetName): + output = ''' + <form action="%s%s" method="post" name="showDatabase"> + <INPUT TYPE="hidden" NAME="FormID" VALUE="showDatabase"> + + <input type="hidden" name="incparentsf1" value="ON"> + <input type="hidden" name="ShowStrains" value="ON"> + <input type="hidden" name="ShowLine" value="ON"> + <input type="hidden" name="database" value=""> + <input type="hidden" name="ProbeSetID" value=""> + <input type="hidden" name="RISet" value="%s"> + <input type="hidden" name="CellID" value=""> + <input type="hidden" name="database2" value=""> + <input type="hidden" name="rankOrder" value=""> + <input type="hidden" name="ProbeSetID2" value=""> + <input type="hidden" name="CellID2" value=""> + ''' % (webqtlConfig.CGIDIR, webqtlConfig.SCRIPTFILE, inbredSetName) + + output += "<h2>Selected Traits</h2>" + output += '<table cellpadding="2" cellspacing="0"><tr bgcolor="FFFFFF"><th>Database</th><th>Trait</th></tr>' + flip = 1 + colors = ["FFFFFF", "cccccc"] + + for trait in traits: + # we take advantage of the secret dbName attribute that + # loadDatabase fills in + descriptionString = trait.genHTML() + if trait.db.type == 'Publish' and trait.confidential: + descriptionString = trait.genHTML(privilege=self.privilege, userName=self.userName, authorized_users=trait.authorized_users) + output += ''' + <tr bgcolor="%s"><td><a href="/dbdoc/%s.html">%s</a></td> + <td><a href="javascript:showDatabase2('%s', '%s', '')">%s</a></td> + </tr> + ''' % (colors[flip], trait.db.name, trait.db.name, trait.db.name, trait.name, descriptionString) + flip = not flip + + output += "</table></form>" + return output + + + # showSummaryCorrelationResults + # show just the number of traits in each subarray + def showSummaryCorrelationResults(self, p, traits, traitArray): + output = ''' + <form action="%s%s" method="post"> + <INPUT TYPE="hidden" NAME="FormID" VALUE="compCorr2"> + <input type="hidden" name="filename" value="%s"> + <input type="hidden" name="firstRun" value="0"> + <input type="hidden" name="progress" value="1"> + <input type="hidden" name="correlation" value="%s"> + <input type="hidden" name="threshold" value="%s"> + <input type="hidden" name="rankOrder" value=""> + <input type="hidden" name="subsetCount" value="%s"> + <input type="hidden" name="subsetSize" value="%s"> + <input type="hidden" name="targetDatabase" value="%s"> + ''' % (webqtlConfig.CGIDIR, webqtlConfig.SCRIPTFILE, p["filename"], p["correlation"], p["threshold"], + p["subsetCount"], p["subsetSize"], p["targetDatabase"]) + + output += ''' + <table cellpadding="2" cellspacing="0"> + <tr> + <th>Trait Subsets</th> + <th colspan="2">Intersecting Set Size</th> + </tr> + ''' + # figure out a scale for the summary graph + # for now we set max = 300 pixels wide + if p["subsetCount"] != -1: + ourSubsetCount = min(p["subsetCount"], len(traitArray)) + else: + ourSubsetCount = len(traitArray) + + screenWidth = 600 + lengths = [] + for j in range(ourSubsetCount): + lengths.append(len(traitArray[j][1])) + maxLength = max(lengths) + + displayDecomposition = binaryDecompose(p["displaySets"]) + flip = 0 + colors = ["FFFFFF", "cccccc"] + + for j in range(ourSubsetCount): + i = traitArray[j][0] + traitSubarray = traitArray[j][1] + + if len(traitSubarray) == 0: + continue + + targetTraits = decomposeIndex(traits, i) + traitDesc = string.join(map(webqtlTrait.displayName, targetTraits), + ", ") + + if j in displayDecomposition: + checked = "CHECKED" + else: + checked = "" + + barWidth = (len(traitSubarray) * screenWidth) / maxLength + output += ('''<tr bgcolor="%s"> + <td><input type="checkbox" name="display%d" value="1" %s>%s</input></td> + <td>%s</td> + <td><img src="/images/blue.png" width="%d" height="25"></td></tr>''' + % (colors[flip], j, checked, traitDesc, len(traitSubarray), barWidth)) + flip = not flip + + output += ''' + <tr> + <td colspan="3"> + <input class="button" type="submit" value="Filter" /></td> + </tr> + </table></form> + ''' + return output + + # showDetailedCorrelationResults + # actually show the traits in each subarray + def showDetailedCorrelationResults(self, p, inbredSetName, traits, + traitArray): + output = "<h2>Correlation Comparison Details</h2>" + + # the hidden form below powers all of the JavaScript links, + # the shopping cart links, and the correlation plot links + + output += ''' + <form action="%s%s" method="post"> + <input type="hidden" name="database" value="%s"> + <input type="hidden" name="FormID" value="showDatabase"> + <input type="hidden" name="traitfile" value=""> + <input type="hidden" name="incparentsf1" value="ON"> + <input type="hidden" name="ShowStrains" value="ON"> + <input type="hidden" name="ProbeSetID" value=""> + <input type="hidden" name="ShowLine" value="ON"> + <input type="hidden" name="RISet" value="%s"> + <input type="hidden" name="CellID" value=""> + <input type="hidden" name="database2" value=""> + <input type="hidden" name="rankOrder" value=""> + <input type="hidden" name="ProbeSetID2" value=""> + <input type="hidden" name="CellID2" value=""> + ''' % (webqtlConfig.CGIDIR, webqtlConfig.SCRIPTFILE, p["targetDatabase"], inbredSetName) + + + displayDecomposition = binaryDecompose(p["displaySets"]) + + # necessary to ensure that subset order is the same in the + # summary and the detailed view + displayDecomposition.sort() + + # here's a trick: the first trait we show must have the widest row because it correlates + # with the largest set of input traits + firstSubset = traitArray[displayDecomposition[0]] + firstTrait = firstSubset[1][0][0] + extraColumnCount = firstSubset[2] + totalColumnCount = 1 + len(firstTrait.row()) + extraColumnCount + + output += "<table cellpadding=2 cellspacing=0>\n" + for j in displayDecomposition: + i = traitArray[j][0] + traitSubarray = traitArray[j][1] + + # we don't display trait combinations for which there are + # no correlations + if len(traitSubarray) == 0: + continue + + # generate a description of the traits that this particular array + # matches highly + targetTraits = decomposeIndex(traits, i) + extraColumnHeaders = map(webqtlTrait.displayName, targetTraits) + traitDesc = string.join(extraColumnHeaders, ", ") + + # massage extraColumnHeaders so that they can be wrapped + for i in range(len(extraColumnHeaders)): + ech = extraColumnHeaders[i] + ech = ech.replace("-", " ") + ech = ech.replace("_", " ") + extraColumnHeaders[i] = ech + + # pad extraColumnHeaders if we have less columns than the max + paddingNeeded = extraColumnCount - len(extraColumnHeaders) + if paddingNeeded > 0: + extraColumnHeaders.extend(paddingNeeded * [" "]) + + # we limit the output to the top ones + if(p["subsetSize"] != -1 and len(traitSubarray) > p["subsetSize"]): + traitDesc += " (showing top %s of %s)" % (p["subsetSize"], len(traitSubarray)) + traitSubarray = traitSubarray[0:p["subsetSize"]] + + # combine that description with actual database traits themselves + # and the correlation values + output += '<tr><td colspan="%d"><h3>%s</h3></td></tr>' % (totalColumnCount, traitDesc) + #output += '<h3>%s</h3>\n<table cellpadding=2 cellspacing=0>\n'% traitDesc + + # we assume that every trait in traitSubarray is the same type + # of trait + flip = 0 + colors = ["FFFFFF", "cccccc"] + + output += traitSubarray[0][0].tableRowHeader([" "], extraColumnHeaders, colors[0]) + + for traitPair in traitSubarray: + corr = [] + traitPair[0].dbName = p['targetDatabase'] + trait = traitPair[0] + + for i in range(len(traitPair[1])): + corrValue = traitPair[1][i] + corrPlotLink = (''' + <a href="javascript:showCorrelationPlot2(db='%s',ProbeSetID='%s',CellID='',db2='%s',ProbeSetID2='%s',CellID2='',rank='%s')">%.2f</a> + ''' % (p["targetDatabaseName"], trait.name, targetTraits[i].db.name, targetTraits[i].name, "0", corrValue)) + corr.append(corrPlotLink) + + corr.extend(paddingNeeded * [" "]) + + checkbox = ('<INPUT TYPE="checkbox" NAME="searchResult" VALUE="%s:%s" />' + % (p["targetDatabaseName"], trait.name)) + flip = not flip + output += traitPair[0].tableRow([checkbox], corr, colors[flip]) + + #output += "</table>" + i += 1 + output += '<tr><td colspan="%d"> </td></tr>' % totalColumnCount + + output += "</table>" + + # print form buttons if there were checkboxes above + output += ''' + <div align="left"> + <INPUT TYPE="button" NAME="addselect" CLASS="button" VALUE="Add to Collection" + onClick="addRmvSelection('%s',this.form, 'addToSelection');"> + <INPUT TYPE="button" NAME="selectall" CLASS="button" VALUE="Select All" onClick="checkAll(this.form);"> + <INPUT TYPE="reset" CLASS="button" VALUE="Select None"> + </div> + </form> + ''' % inbredSetName + + return output + + # showCorrelationResults: ParamDict -> listof Trait -> tupleof (int,arrayof trait) -> String + # to build an output display for the multitrait correlation results + def showCorrelationResults(self, p, inbredSetName, traits, traitArray): + output = ''' + <h2>Correlation Comparison Summary</h2> + <p> + %s correlations were computed for each of the selected traits with each trait in + the <a href="/dbdoc/%s.html">%s</a> database. + Subsets of database traits for which correlations were higher than %s + or lower than -%s are shown below based on which traits + they correlated highly with. The top %s subsets, ranked by the number of input traits that + they correspond with, are shown, and at most %s traits in each subset are shown. </p> + ''' % (p["correlationName"], + p["targetDatabase"], p["targetDatabaseName"], + p["threshold"], p["threshold"], p["subsetCount"], + p["subsetSize"]) + + + totalTraits = 0 + for j in range(len(traitArray)): + totalTraits += len(traitArray[j][1]) + + if totalTraits == 0: + output += """ + <p> + No shared corrrelates were found with your given traits at this + threshold. You may wish to lower the correlation threshold or choose different traits. + </p> + """ + else: + output += self.showSummaryCorrelationResults(p, traits, traitArray) + output += self.showDetailedCorrelationResults(p, inbredSetName, + traits, traitArray) + + return output + +# decomposeIndex: (listof Trait) -> Int -> +# (listof Trait) +# to use i to partition T into a sublist +# each bit in i controls the inclusion or exclusion of a trait +def decomposeIndex(traits, i): + targetTraits = [] + + for j in range(len(traits)): + # look, mom, a bitwise and! + # expression below tests whether the jth bit is + # set in i + # see runCorrelation for how we decompose the + # array index + if (i & pow(2,j)) == pow(2,j): + targetTraits.append(traits[j]) + + return targetTraits + +# binaryDecompose: int -> (listof int) +# to decompose a number into its constituent powers of 2 +# returns a list of the exponents a_1...a_n such that the input m +# is m = 2^a_1 + ... + 2^a_n +def binaryDecompose(n): + if n == 0: + return [] + + # we start with the highest power of 2 <= this number + # and work our way down, subtracting powers of 2 + start = long(math.floor(math.log(n)/math.log(2))) + + exponents = [] + while start >= 0: + if n >= long(math.pow(2, start)): + n -= math.pow(2,start) + exponents.append(start) + start -= 1 + return exponents + +# powerOf : int -> int -> boolean +# to determine whether m is a power of n; +# more precisely, whether there exists z in Z s.t. +# n^z = m +def powerOf(m, n): + trialZ = math.floor(math.log(m)/math.log(n)) + return pow(n,trialZ) == m + + +class compCorrPage(templatePage.templatePage): + def __init__(self,fd): + templatePage.templatePage.__init__(self, fd) + + if not self.openMysql(): + return + + cursor = self.cursor + params = buildParamDict(cursor, fd) + + # get the input data + inbredSetName, traits = readInputFile(cursor, RootDir + params["filename"]) + + # and what we are comparing the data to + dbTraits = [] + if params["targetDatabaseType"] != "ProbeSet": + dbTraits = loadDatabase(cursor, params) + + + # run the comparison itself + strainCount = trait.queryStrainCount(cursor) # XZ, 09/10/2008: add module name + if params["targetDatabaseType"] == "ProbeSet": + results = runProbeSetCorrelations(cursor, params, traits) + else: + results = runCorrelations(params, strainCount, traits, dbTraits) + + # try to be smart about what to output: + # we want to limit the number of traits shown, at least initially + # and since traitArray is already sorted with most interesting + # subsets first, we simply pick up the first 500 or so traits + # that we find + if params["displaySets"] == 0: + selectedTraits = 0 + for j in range(len(results)): + #print "Scanning subarray %d" % j + if selectedTraits <= 200: + params["displaySets"] += pow(2, j) + selectedTraits += len(results[j][1]) + + traitList = [] + for oneTrait in traits: # XZ, 09/10/2008: change the original variable name 'trait' to 'oneTrait' + traitName = oneTrait.dbName+'::'+oneTrait.name # XZ, 09/10/2008: change the original variable name 'trait' to 'oneTrait' + aTrait = webqtlTrait(cursor=self.cursor, fullname=traitName) + traitList.append(aTrait) + + # and generate some output + txtOutputFilename = tempfile.mktemp() + txtOutputHandle = open(txtOutputFilename, "w") + txtOutput = TraitCorrelationText(params, traits, results) + txtOutputHandle.write(str(txtOutput)) + txtOutputHandle.close() + txtOutputFilename = os.path.split(txtOutputFilename)[1] + + self.dict['body'] = TraitCorrelationPage(fd, params, cursor, traitList, + results, inbredSetName, + txtOutputFilename).dict['body'] diff --git a/web/webqtl/compareCorrelates/trait.py b/web/webqtl/compareCorrelates/trait.py new file mode 100755 index 00000000..ff1f8119 --- /dev/null +++ b/web/webqtl/compareCorrelates/trait.py @@ -0,0 +1,1074 @@ +#Trait.py +# +#--Individual functions are already annotated, more or less. +# +#Classes: +#RawPoint +#Trait +#ProbeSetTrait +#GenotypeTrait +#PublishTrait +#TempTrait +#-KA + +# trait.py: a data structure to represent a trait +import time +import string + +CONFIG_pubMedLinkURL = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=%s&dopt=Abstract" + +# RawPoint: to store information about the relationship between two particular +# traits +# RawPoint represents directly the input file +class RawPoint: + def __init__(self, i, j): + self.i = i + self.j = j + + def __eq__(self, other): + return (self.i == other.i and + self.j == other.j and + self.spearman == other.spearman and + self.pearson == other.pearson) + + def __str__(self): + return "(%s,%s,%s,%s)" % (self.i, self.j, self.spearman, self.pearson) + +def tdEscapeList(cols, align="left"): + """ + A helper function used by tableRow + in Trait that will convert a list of strings into a set of + table cells enclosed by <td>%s</td> tags + """ + html = "" + for col in cols: + html += '<td style="text-align: %s">%s</td>' % (align, col) + return html + +def thEscapeList(cols): + """ + A helper function used by tableRowHeader + in Trait that will convert a list of strings into a set of + table cells enclosed by <td>%s</td> tags + """ + html = "" + for col in cols: + html += "<th>%s</th>" % col + return html + +def commaEscapeList(cols): + """ + A helper function used by csvHeader and csvRow. + Really it's just a wrapper for string.join + """ + return '"' + string.join(cols, '","') + '"' + + +class Trait: + """ + A trait represents an attribute of an object. In the WebQTL database, traits are stored + as ProbeSets; that is, the average values of a set of probes are stored. + """ + def __init__(self, id="", name="", description="", symbol="", href=""): + self.id = id + self.name = name + self.dbName = "" + self.symbol = symbol + self.href = href + self.strainData = {} + + def populateDataId(self, cursor, freezeId): + """ + Retrieve the dataId for trait data corresponding to the given database + The way to do this depends on the particular type of trait, so we leave implementation + to subclasses. + """ + raise NotImplementedError + + def populateStrainData(self, cursor): + """ + Load this trait full of train data corresponding to the data id + The data id can either come from populateDataId + or can be set manually by the user of this class. + Xiaodong added: The way to do this depends on the particular type of trait, + so we leave implementation to subclasses. + + """ + raise NotImplementedError + + def shortName(self): + """ + To return a short name for this trait; this name should be + appropriate for a row or column title + """ + return self.name + + def nameNoDB(self): + """ + To return only the short name without the database attached + """ + strArray = self.shortName().split('::') + + return strArray[1] + + def datasetName(self): + """ + To return only the name of the dataset + """ + strArray = self.shortName().split('::') + + return strArray[0].strip() + + def longName(self): + """ + To return a long name for this trait; this name should be + appropriate for a key to a table + """ + return self.shortName() + + def __str__(self): + return self.shortName() + + def tableRowHelper(self, beforeCols, afterCols, color, thisRow): + """ + tableRowHelper: (arrayof String) -. String + To generate a table row to represent this object, appending + the additional information in beforeCols and afterCols + to the beginning and the end + """ + thisRow[0] = '<a href="%s">%s</a>' % (self.traitInfoLink(), + self.name) + html = '<tr bgcolor="%s">' % color + html += tdEscapeList(beforeCols + thisRow) + html += tdEscapeList(afterCols, align="right") + html += "</tr>" + + return html + + + def header(self): + """ + header: (listof String) + To generate a list of strings describing each piece of data + returned by row + """ + raise NotImplementedError + + def row(self): + """ + row: (listof String) + To generate a list of strings describing this object. The + elements of this list should be described by header() + """ + raise NotImplementedError + + def tableRowHeader(self, beforeCols, afterCols, color): + """ + tableRowHeader: (arrayof String) -> (arrayof String) -> String + To generate a table row header to represent this object, + appending the additional information in beforeCols and + afterCols to the beginning and end + """ + html = '<tr bgcolor="%s">' % color + html += thEscapeList(beforeCols + self.header() + + afterCols) + html += "</tr>" + return html + + def csvHeader(self, beforeCols, afterCols): + return commaEscapeList(beforeCols + self.header() + afterCols) + + def csvRow(self, beforeCols, afterCols): + return commaEscapeList(beforeCols + self.row() + afterCols) + + + def traitInfoLink(self): + """ + To build a trait info link to show information about this + trait. We assume that the database attribute is properly set + on the hidden form on the page where this link will go. + """ + return "javascript:showDatabase2('%s','%s','')" % (self.dbName, self.name) + +# ProbeSetTrait: a trait with data from a probeset +class ProbeSetTrait(Trait): + def __init__(self, id="", name="", description="", symbol="", href="", + chromosome="", MB="", GeneId=""): + Trait.__init__(self, id=id, name=name, href=href) + self.description = description + self.symbol = symbol + self.chromosome = chromosome + self.MB = MB + self.GeneId = GeneId + + def populateDataId(self, cursor, freezeId): + """ + Look up the data id for this trait given which + freeze it came from. + """ + cursor.execute(''' + SELECT + ProbeSetXRef.DataId + FROM + ProbeSetXRef + WHERE + ProbeSetId = %s AND + ProbeSetFreezeId = %s + ''' % (self.id, freezeId)) + + # we hope that there's only one record here + row = cursor.fetchone() + self.dataId = row[0] + + #XZ, 03/03/2009: Xiaodong implemented this fuction + def populateStrainData(self, cursor): + cursor.execute(''' + SELECT + ProbeSetData.StrainId, + ProbeSetData.value + FROM + ProbeSetData + WHERE + ProbeSetData.Id = %s''' % self.dataId) + for row in cursor.fetchall(): + self.strainData[int(row[0])] = float(row[1]) + + + def shortName(self): + """ + An improved string method that uses the gene symbol where + we have it + """ + if self.symbol != "": + return self.symbol + else: + return Trait.shortName(self) + + def longName(self): + """ + We use several bits of genetic information to give + useful information about this trait and where it is + """ + if self.chromosome != "": + chrPart = " (%s on Chr %s @ %s Mb)" % (self.symbol, + self.chromosome, + self.MB) + else: + chrPart = "" + + return "%s%s: %s" % (self.name, chrPart, self.description) + + def header(self): + return ["Name", "Symbol", "Description", + "Chr", "Position (Mb)"] + + def row(self): + if type(self.MB) is float: + MB = "%.2f" % self.MB + else: + MB = "" + + return [self.name, self.symbol, self.description, + self.chromosome, MB] + + def tableRow(self, beforeCols, afterCols, color): + """ + tableRow: (arrayof String) -> (arrayof String) -> String + To generate a table row to represent this object, appending + the additional information in beforeCols and afterCols to the + beginning and end + """ + thisRow = self.row() + + # trim description + if len(thisRow[2]) > 20: + thisRow[2] = thisRow[2][:20] + "..." + + # add NCBI info link + thisRow[1] = self.ncbiInfoLink() + + return self.tableRowHelper(beforeCols, afterCols, color, + thisRow) + + + def ncbiInfoLink(self): + """ + ncbiInfoLink :: String + To generate an NCBI info link for this trait. If we have a GeneId, + then we can go straight to the gene. If not, then we generate a search + link based on the gene symbol. If we have none of them, then we don't + generate a link at all. + """ + if self.GeneId != "": + cmd = "cmd=Retrieve&dopt=Graphics&list_uids=%s" % self.GeneId + elif self.symbol != "": + cmd = "cmd=Search&term=%s" % self.symbol + else: + return "" + + return ''' + <a target="_new" + href="http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&%s"> + %s</a> ''' % (cmd, self.symbol) + + +# GenotypeTrait: a trait with data from the genotype +class GenotypeTrait(Trait): + def __init__(self, id="", name="", href="", chromosome="", MB=""): + Trait.__init__(self, id=id, name=name, href=href) + self.chromosome = chromosome + self.MB = MB + + def populateDataId(self, cursor, freezeId): + """ + Look up the data id for this trait from the + genotype. + """ + cursor.execute(''' + SELECT + GenoXRef.DataId + FROM + GenoXRef + WHERE + GenoId = %s AND + GenoFreezeId = %s + ''' % (self.id, freezeId)) + + # we hope that there's only one record here + row = cursor.fetchone() + self.dataId = row[0] + + #XZ, 03/03/2009: Xiaodong implemented this fuction + def populateStrainData(self, cursor): + cursor.execute(''' + SELECT + GenoData.StrainId, + GenoData.value + FROM + GenoData + WHERE + GenoData.Id = %s''' % self.dataId) + for row in cursor.fetchall(): + self.strainData[int(row[0])] = float(row[1]) + + def header(self): + return ["Locus", "Chr", "Position (Mb)"] + + def row(self): + return [self.name, self.chromosome, "%.3f" % self.MB] + + def tableRow(self, beforeCols, afterCols, color): + return self.tableRowHelper(beforeCols, afterCols, color, self.row()) + +# PublishTrait: a trait with data from publications +class PublishTrait(Trait): + def __init__(self, id="", name="", href="", authors="", title="", + phenotype="", year=""): + Trait.__init__(self, id=id, name=name, href=href) + self.authors = authors + self.title = title + self.phenotype = phenotype + self.year = year + + def populateDataId(self, cursor, freezeId): + """ + Look up the data id for this trait from the + published set. For the moment, we assume that there's + only one publish freeze. + """ + cursor.execute(''' + SELECT + PublishXRef.DataId + FROM + PublishXRef, PublishFreeze + WHERE + PublishFreeze.Id = %s AND + PublishFreeze.InbredSetId = PublishXRef.InbredSetId AND + PublishXRef.Id = %s + ''' % (freezeId, self.id)) + + # we hope that there's only one record here + row = cursor.fetchone() + self.dataId = row[0] + + #XZ, 03/03/2009: Xiaodong implemented this fuction + def populateStrainData(self, cursor): + cursor.execute(''' + SELECT + PublishData.StrainId, + PublishData.value + FROM + PublishData + WHERE + PublishData.Id = %s''' % self.dataId) + for row in cursor.fetchall(): + self.strainData[int(row[0])] = float(row[1]) + + + def longName(self): + """ + A more intelligent string function that uses + information about the publication from which this trait came + """ + return "%s: %s by %s" % (self.name, self.title, self.authors) + + def header(self): + return ["Record", "Phenotype", "Authors", "Year", "URL"] + + def row(self): + return [self.name, + self.phenotype, + self.authors, + str(self.year), + ""] + + def tableRow(self, beforeCols, afterCols, color): + """ + tableRow: (arrayof String) -> (arrayof String) -> String + To generate a table row to represent this object, appending + the additional information in beforeCols and afterCols to the + beginning and end + """ + thisRow = self.row() + + # for multiple authors, use "et. al" after first two + authors = thisRow[2].split(",") + if len(authors) > 2: + thisRow[2] = string.join(authors[:2], ",") + ", et al" + + # clip phenotype to 20 chars + if len(thisRow[1]) > 20: + thisRow[1] = thisRow[1][:20] + "..." + + # add Pub Med URL + thisRow[4] = '<a href="%s" target="_new">Pub Med</a>' % (CONFIG_pubMedLinkURL % self.href) + + return self.tableRowHelper(beforeCols, afterCols, color, + thisRow) + + +# TempTrait: a trait with data generate by user and stored in temp table +class TempTrait(Trait): + def __init__(self, id="", name="", href="", description=""): + Trait.__init__(self, id=id, name=name, href=href) + self.description = description + + def populateDataId(self, cursor, freeezeId): + """ + Look up the data id for this trait from the Temp table, freezeId isn't used, + it just for fixing the inherit + """ + cursor.execute(''' + SELECT + DataId + FROM + Temp + WHERE + Id=%s + ''' % (self.id)) + + # we hope that there's only one record here + row = cursor.fetchone() + self.dataId = row[0] + + #XZ, 03/03/2009: Xiaodong implemented this fuction + def populateStrainData(self, cursor): + cursor.execute(''' + SELECT + TempData.StrainId, + TempData.value + FROM + TempData + WHERE + TempData.Id = %s''' % self.dataId) + for row in cursor.fetchall(): + self.strainData[int(row[0])] = float(row[1]) + + + def row(self): + return [self.id, + self.name, + self.description, + ""] + + + def longName(self): + """ + For temp trait, the description always contents whole useful information + """ + return self.description + + +# queryGenotypeTraitByName : Cursor -> string -> GenotypeTrait +def queryGenotypeTraitByName(cursor, speciesId, name): + qry = ''' + SELECT + Geno.Id, + Geno.Name, + Geno.Chr, + Geno.Mb + FROM + Geno + WHERE + Geno.SpeciesId = %s and Geno.Name = "%s" ''' % (speciesId, name) + + cursor.execute(qry) + row = cursor.fetchone() + return GenotypeTrait(id=row[0], name=row[1], + chromosome=row[2], MB=row[3]) + +# queryPublishTraitByName : Cursor -> string -> PublishTrait +def queryPublishTraitByName(cursor, freezeId, name): + qry = ''' + SELECT + PublishXRef.Id, + Phenotype.Id, + Publication.Authors, + Publication.Title, + Publication.Year, + Publication.PubMed_ID + FROM + Publication, PublishXRef, Phenotype, PublishFreeze + WHERE + PublishFreeze.Id = %s AND + PublishFreeze.InbredSetId = PublishXRef.InbredSetId AND + PublishXRef.Id = %s AND + PublishXRef.PublicationId = Publication.Id AND + PublishXRef.PhenotypeId = Phenotype.Id + ''' % (freezeId, name) + + cursor.execute(qry) + if cursor.rowcount == 0: + return None + else: + row = cursor.fetchone() + + return PublishTrait(id=row[0], name='%s'%row[0], + authors=row[2], title=row[3], + year=row[4], href=row[5]) + + +def queryTempTraitByName(cursor, name): + name=name.strip() + qry = ''' + SELECT + Temp.Id, + Temp.Name, + Temp.description + FROM + Temp + WHERE + Temp.Name= "%s" + ''' % (name) + + cursor.execute(qry) + if cursor.rowcount == 0: + return None + else: + row = cursor.fetchone() + return TempTrait(id=row[0], name=row[1], description=row[2], href='') + +# queryPopulatedProbeSetTraits: Cursor -> Integer -> dictof Trait +# to retrieve an entire probeset fully populated with data +# this query can take 15+ sec the old way (22,000 traits * 35 strains = half +# a million records) +# so we ask for the data in bulk +# +# cursor should be SSCursor for MySQL so rows are stored on the server side +# and tuples are used +# we explicitly close the cursor here as well +#XZ, 03/04/2009: It seems to me that this function is never be executed. +#XZ: Although it can be called from multitrait.loadDatabase, +#XZ: but the loadDatabase function will not be called +#XZ: if the targetDatabaseType is probeset. +#XZ: The probeset traits of target database are retrieved by execute +#XZ: queryPopulatedProbeSetTraits2 from correlation.calcProbeSetPearsonMatrix +def queryPopulatedProbeSetTraits(cursor, freezeId): + step1 = time.time() + traits = queryProbeSetTraits(cursor, freezeId) + traitDict = {} + for trait in traits: + traitDict[trait.id] = trait + + step2 = time.time() + print + #XZ, 03/04/2009: Xiaodong changed Data to ProbeSetData + cursor.execute(''' + SELECT + ProbeSetXRef.ProbeSetId, + ProbeSetData.StrainId, + ProbeSetData.value + FROM + ProbeSetXRef + Left Join ProbeSetData ON + ProbeSetXRef.DataId = ProbeSetData.Id + WHERE + ProbeSetXRef.ProbeSetFreezeId = %s + ''' % freezeId) + + step3 = time.time() + totalrows = 0 + somerows = cursor.fetchmany(1000) + while len(somerows) > 0: + totalrows += len(somerows) + for row in somerows: + # this line of code can execute more than one million times + traitDict[row[0]].strainData[int(row[1])] = row[2] + somerows = cursor.fetchmany(1000) + + #cursor.close() + step4 = time.time() + + time1 = step2 - step1 + time2 = step3 - step2 + time3 = step4 - step3 + time4 = step4 - step1 + #print "%f %f %f %f %d rows" % (round(time1, 2), + # round(time2, 2), + # round(time3, 2), + # round(time4, 2), + # totalrows) + #print "Fetched %d traits" % len(traits) + return traits + + +# queryPopulatedProbeSetTraits2: Cursor -> Integer -> dictof Trait +# to retrieve probeset fully populated whose ProbeSetId in a range +# a special ProbeSetId with data +# this query can take 15+ sec the old way (22,000 traits * 35 strains = half +# a million records) +# so we ask for the data in bulk +# +# cursor should be SSCursor for MySQL so rows are stored on the server side +# and tuples are used +# we explicitly close the cursor here as well +def queryPopulatedProbeSetTraits2(cursor, freezeId, ProbeSetId1, ProbeSetId2): + step1 = time.time() + traits = queryProbeSetTraits2(cursor, freezeId, ProbeSetId1, ProbeSetId2) + traitDict = {} + for trait in traits: + traitDict[trait.id] = trait + + step2 = time.time() + print + #XZ, 03/04/2009: Xiaodong changed Data to ProbeSetData + cursor.execute(''' + SELECT + ProbeSetXRef.ProbeSetId, + ProbeSetData.StrainId, + ProbeSetData.value + FROM + ProbeSetXRef + Left Join ProbeSetData ON + ProbeSetXRef.DataId = ProbeSetData.Id + WHERE + ProbeSetXRef.ProbeSetFreezeId = %s AND + ProbeSetXRef.ProbeSetId >= %s AND + ProbeSetXRef.ProbeSetId <= %s + ''' % (freezeId, ProbeSetId1, ProbeSetId2)) + + step3 = time.time() + totalrows = 0 + somerows = cursor.fetchmany(1000) + while len(somerows) > 0: + totalrows += len(somerows) + for row in somerows: + # this line of code can execute more than one million times + traitDict[row[0]].strainData[int(row[1])] = row[2] + somerows = cursor.fetchmany(1000) + + #cursor.close() + step4 = time.time() + + time1 = step2 - step1 + time2 = step3 - step2 + time3 = step4 - step3 + time4 = step4 - step1 + #print "%f %f %f %f %d rows" % (round(time1, 2), + # round(time2, 2), + # round(time3, 2), + # round(time4, 2), + # totalrows) + #print "Fetched %d traits" % len(traits) + return traits + + +# def noneFilter : string or none -> string +# to replace a possible None by an empty string +def noneFilter(x): + if x is None: + return "" + else: + return x + +# queryProbeSetTraits: Cursor -> Integer -> dictof Trait +def queryProbeSetTraits(cursor, freezeId): + """ + To locate all of the traits in a particular probeset + """ + qry = ''' + SELECT + ProbeSet.Id, + ProbeSet.Name, + ProbeSet.description, + ProbeSet.symbol, + ProbeSet.Chr, + ProbeSet.Mb, + ProbeSet.GeneId, + ProbeSetXRef.DataId + FROM + ProbeSet, + ProbeSetXRef + WHERE + ProbeSetXRef.ProbeSetId = ProbeSet.Id AND + ProbeSetXRef.ProbeSetFreezeId = %s + ORDER BY ProbeSet.Id + ''' % freezeId + + cursor.execute(qry) + rows = cursor.fetchall() + traits = [] + + for row in rows: + row = map(noneFilter, row) + trait = ProbeSetTrait(id=row[0], name=row[1], + description=row[2], + chromosome=row[4], + MB=row[5], + symbol=row[3], + GeneId=row[6]) + trait.dataId = row[7] + traits.append(trait) + + return traits + + +# queryProbeSetTraits2: Cursor -> Integer -> dictof Trait +def queryProbeSetTraits2(cursor, freezeId, ProbeSetId1, ProbeSetId2): + """ + To locate all of the traits in a particular probeset + """ + qry = ''' + SELECT + ProbeSet.Id, + ProbeSet.Name, + ProbeSet.description, + ProbeSet.symbol, + ProbeSet.Chr, + ProbeSet.Mb, + ProbeSet.GeneId, + ProbeSetXRef.DataId + FROM + ProbeSet, + ProbeSetXRef + WHERE + ProbeSetXRef.ProbeSetId = ProbeSet.Id AND + ProbeSetXRef.ProbeSetFreezeId = %s AND + ProbeSet.Id >= %s AND + ProbeSet.Id <= %s + ORDER BY ProbeSet.Id + ''' % (freezeId, ProbeSetId1, ProbeSetId2) + + cursor.execute(qry) + rows = cursor.fetchall() + traits = [] + + for row in rows: + row = map(noneFilter, row) + trait = ProbeSetTrait(id=row[0], name=row[1], + description=row[2], + chromosome=row[4], + MB=row[5], + symbol=row[3], + GeneId=row[6]) + trait.dataId = row[7] + traits.append(trait) + + return traits + + +# queryPublishTraits : Cursor -> arrayof Trait +def queryPublishTraits(cursor, freezeId): + """ + To locate all published traits + """ + qry = ''' + SELECT + Publication.Id, + Publication.Name, + Publication.PhenoType, + Publication.Authors, + Publication.Title, + Publication.Year, + Publication.PubMed_ID, + PublishXRef.DataId + FROM + Publication, + PublishXRef + WHERE + PublishXRef.PublishFreezeId = %s AND + PublishXRef.PublishId = Publication.Id + ''' % freezeId + + qry = ''' + SELECT + Publication.Id, + PublishXRef.Id, + Phenotype.Pre_publication_description, + Phenotype.Post_publication_description, + Publication.Authors, + Publication.Title, + Publication.Year, + Publication.PubMed_ID, + PublishXRef.DataId + FROM + Publication, PublishXRef, Phenotype, PublishFreeze + WHERE + PublishFreeze.Id = %s AND + PublishFreeze.InbredSetId = PublishXRef.InbredSetId AND + PublishXRef.PublicationId = Publication.Id AND + PublishXRef.PhenotypeId = Phenotype.Id + ''' % freezeId + cursor.execute(qry) + rows = cursor.fetchall() + traits = [] + for row in rows: + PhenotypeString = row[3] + if not row[7] and row[2]: + PhenotypeString = row[2] + trait = PublishTrait(id=row[0], name= '%s' %row[1], + phenotype=PhenotypeString, + authors=row[4], + title=row[5], + year=row[6], + href=row[7]) + trait.dataId = row[8] + traits.append(trait) + + return traits + +# queryGenotypeTraits : Cursor -> arrayof Trait +def queryGenotypeTraits(cursor, freezeId): + """ + To locate all traits in the genotype + """ + qry = ''' + SELECT + Geno.Id, + Geno.Name, + Geno.Chr, + GenoXRef.DataId, + Geno.Mb + FROM + Geno, + GenoXRef + WHERE + GenoXRef.GenoId = Geno.Id + AND GenoXRef.GenoFreezeId = %s + ''' % freezeId + cursor.execute(qry) + rows = cursor.fetchall() + traits = [] + + for row in rows: + trait = GenotypeTrait(id=row[0], name=row[1], + chromosome=row[2], MB=row[4]) + trait.dataId = row[3] + traits.append(trait) + + return traits + +# queryProbeSetTraitByName : Cursor -> string -> Trait +# to find a particular trait given its name +def queryProbeSetTraitByName(cursor, name): + qry = ''' + SELECT + ProbeSet.Id, + ProbeSet.Name, + ProbeSet.description, + ProbeSet.symbol, + ProbeSet.Chr, + ProbeSet.Mb, + ProbeSet.GeneId + FROM + ProbeSet + WHERE + ProbeSet.Name = "%s" + ''' % name + #print qry + cursor.execute(qry) + row = cursor.fetchone() + + # convert a MySQL NULL value to an empty string + # for gene symbol + if row[3] is None: + sym = "" + else: + sym = row[3] + + return ProbeSetTrait(id=row[0], name=row[1], description=row[2], + symbol=sym, chromosome=row[4], MB=row[5], + GeneId=row[6]) + + +# queryTraits : Cursor -> string -> string -> arrayof Traits +# to find all of the traits whose descriptions match a certain string in a +# particular database +def queryTraits(cursor, dbId, queryString): + # we do this in two steps: + # first we get the data id for the matching traits + qry = ''' + SELECT + ProbeSet.Id, + ProbeSet.Name, + ProbeSet.description, + ProbeSetXRef.DataId + FROM + ProbeSet, + ProbeSetXRef + WHERE + ProbeSetXRef.ProbeSetFreezeId = %s AND + ProbeSet.Id = ProbeSetXRef.ProbeSetId AND + ProbeSet.description LIKE "%%%s%%" + ''' % (dbId, queryString) + # print qry + cursor.execute(qry) + + if cursor.rowcount == 0: + print "No traits found" + return [] + else: + print "%s traits found" % (cursor.rowcount) + + # maybe fetchall is bad; we will see + traits = [] + for row in cursor.fetchall(): + myTrait = Trait(row[0], row[1], row[2]) + myTrait.dataId = row[3] + traits.append(myTrait) + + # second we pull all of the strain data for each trait + print "Retrieving individual trait data..." + for trait in traits: + trait.populateStrainData(cursor, trait.dataId) + print "%s (%s) -- %s" % (trait.name, trait.id, trait.description) + + print "done" + return traits + +# queryProbeSetFreezes : Cursor -> arrayof String,String tuples +# to return the short and long name for each ProbeSetFreeze +# this function is designed specifically for building +# a database selector +def queryProbeSetFreezes(cursor): + cursor.execute(""" + SELECT + ProbeSetFreeze.Name, + ProbeSetFreeze.FullName + FROM + ProbeSetFreeze + ORDER BY + ProbeSetFreeze.Name + """) + + # for now, fetchall returns the data as a list of tuples + # which is what we want + return list(cursor.fetchall()) + +# queryProbeSetFreezeIdName: Cursor -> String -> String, String +# this function returns the +# id and the long name of a probesetfreeze given its name +# once again, it's designed specifically for building +# the database selector +def queryProbeSetFreezeIdName(cursor, name): + qry = (''' + SELECT + ProbeSetFreeze.Id, + ProbeSetFreeze.FullName + FROM + ProbeSetFreeze + WHERE + ProbeSetFreeze.Name = "%s" + ''' % name) + #print qry + cursor.execute(qry) + + row = cursor.fetchone() + return row + +# queryProbeSetFreezeName: Cursor -> String -> String +# to return the name of a probe set freeze given its id +def queryProbeSetFreezeName(cursor, id): + cursor.execute(''' + SELECT + ProbeSetFreeze.FullName + FROM + ProbeSetFreeze + WHERE + ProbeSetFreeze.Id = %s + ''' % id) + + row = cursor.fetchone() + return row[0] + +# dbNameToTypeId : Cursor -> String -> (String, String) +# to convert a database name to a (type, id) pair +def dbNameToTypeId(cursor, name): + types = ["ProbeSet", "Geno", "Publish"] + for type_ in types: + count = cursor.execute(''' + SELECT + %sFreeze.Id + FROM + %sFreeze + WHERE + Name = "%s" + ''' % (type_, type_, name)) + + if count != 0: + id = cursor.fetchone()[0] + return type_, id + + return None, None + +# dbTypeIdToName : Cursor -> String -> String -> String +# to convert a database (type,id) pair into a name +def dbTypeIdToName(cursor, dbType, dbId): + cursor.execute(''' + SELECT + %sFreeze.Name + FROM + %sFreeze + WHERE + Id = %s + ''' % (dbType, dbType, dbId)) + + row = cursor.fetchone() + return row[0] + +#XZ, July 21, 2010: I add this function +def getSpeciesIdByDbTypeId (cursor, dbType, dbId): + cursor.execute(''' + SELECT + SpeciesId + FROM + InbredSet, %sFreeze + WHERE + %sFreeze.Id = %s + and InbredSetId = InbredSet.Id + ''' % (dbType, dbType, dbId)) + + row = cursor.fetchone() + return row[0] + + +# queryStrainCount : Cursor -> int +# return the number of strains in the database +def queryStrainCount(cursor): + cursor.execute(''' + SELECT + Max(Strain.Id) + FROM + Strain + ''') + return (cursor.fetchone())[0] |