aboutsummaryrefslogtreecommitdiff
path: root/web/webqtl/compareCorrelates/multitrait.py
diff options
context:
space:
mode:
Diffstat (limited to 'web/webqtl/compareCorrelates/multitrait.py')
-rwxr-xr-xweb/webqtl/compareCorrelates/multitrait.py1121
1 files changed, 1121 insertions, 0 deletions
diff --git a/web/webqtl/compareCorrelates/multitrait.py b/web/webqtl/compareCorrelates/multitrait.py
new file mode 100755
index 00000000..047620af
--- /dev/null
+++ b/web/webqtl/compareCorrelates/multitrait.py
@@ -0,0 +1,1121 @@
+# multitrait.py
+# a tool to analyze the correlations between several different traits and the traits
+# in a given dataset
+#
+# Parameters:
+# correlation -- either "pearson" or "spearman" depending on which ones we want to use
+#
+# filename -- an input file containing the traits to analyze
+#
+# progress -- if set, this parameter outputs a static progress page
+# and uses a META redirect to trigger the real computation
+#
+# targetDatabaseType:
+# one of "ProbeSet", "Publish", "Genotype" depending on the type of database
+# we will use for the analysis
+#
+# targetDatabaseId:
+# the id (*Freeze.Id in the database) of the particular database we will analyze
+#
+# threshold -- a float between 0 and 1 to determine which coefficents we wil l consider
+#
+# firstRun -- either 0 or 1
+# whether to automatically pick reasonable defaults for the other three parameters
+#
+# outputType -- either "html" or "text"
+#
+# Author: Stephen Pitts
+# June 15, 2004
+
+#Xiaodong changed the dependancy structure
+
+import copy
+import sys
+import cgi
+import os
+import os.path
+import math
+import time
+import numarray
+import tempfile
+import string
+import cgitb #all tracebacks come out as HTMLified CGI,useful when we have a random crash in the middle
+
+from base import templatePage
+from base.webqtlTrait import webqtlTrait
+from utility import webqtlUtil
+from base import webqtlConfig
+import trait
+import correlation
+import htmlModule
+
+cgitb.enable()
+
+
+# where this program's data files are
+RootDir = webqtlConfig.IMGDIR # XZ, 09/10/2008: add module name 'webqtlConfig.'
+RootDirURL = "/image/" # XZ, 09/10/2008: This parameter is not used in this module
+
+tempfile.tempdir = RootDir
+tempfile.template = "multitrait"
+
+# MultitraitException: used if something goes wrong
+# maybe in the future we should make exceptions more granular
+class MultitraitException(Exception):
+ def __init__(self, message):
+ self.message = message
+
+ def __repr__(self):
+ return "MultitraitException: %s" % self.message
+
+# buildParamDict: Cursor -> ParamDict
+# to process and validate CGI arguments
+# see the comment at the top of this file for valid cgi
+# parameters
+def buildParamDict(cursor, fd):
+ params = {}
+ fs = fd.formdata #cgi.FieldStorage()
+ params["progress"] = fs.getfirst("progress", "0")
+ params["filename"] = fs.getfirst("filename", "")
+ if params["filename"] == "":
+ raise MultitraitException("Required parameter filename missing.")
+
+ params["targetDatabase"] = fs.getfirst("targetDatabase", "U74Av2RMA_Raw_ProbeSet_March04")
+ params["firstRun"] = webqtlUtil.safeInt(fs.getfirst("firstRun", "0"),0)
+ params["threshold"] = webqtlUtil.safeFloat(fs.getfirst("threshold", "0.5"), 0.5)
+ params["subsetSize"] = webqtlUtil.safeInt(fs.getfirst("subsetSize", "10"), 10)
+
+ if params["subsetSize"] < -1:
+ params["subsetSize"] = -1
+
+ params["correlation"] = fs.getfirst("correlation", "pearson")
+ params["subsetCount"] = webqtlUtil.safeInt(fs.getfirst("subsetCount", 10), 10)
+
+ if params["subsetCount"] < -1:
+ params["subsetCount"] = -1
+
+ #params["outputType"] = fs.getfirst("outputType", "html")
+
+ #if params["outputType"] not in ("html", "text"):
+ # params["outputType"] = "html"
+
+ if params["correlation"] not in ("pearson", "spearman"):
+ params["correlation"] = "pearson"
+
+ params["correlationName"] = params["correlation"].capitalize()
+
+ # one of two cases:
+ # 1) We have just come from a submit, so there are a bunch of display*
+ # but no displaySets. Thus, the code down there converts the display*
+ # to displaySets so the GET request doesn't get too long
+ # 2) We have just been redirected from a progress page which already has
+ # a converted displaySets for us.
+
+ displaySets = webqtlUtil.safeInt(fs.getfirst("displaySets","0"), 0)
+
+ if displaySets == 0:
+ for key in fs.keys():
+ if key[:7] == "display":
+ #print "Hit display key %s<br>" % key
+ try:
+ whichSet = int(key[7:])
+
+ # prevent malicious attacks
+ whichSet = min(whichSet, 512)
+ displaySets += pow(2, whichSet)
+
+ except ValueError: pass
+
+ params["displaySets"] = displaySets
+ #print "In the beginning, display sets was %s: %s<br>" % (displaySets,
+ # str(binaryDecompose(displaySets)))
+
+ # if we are just gonna display a progress page, then there's no
+ # reason to look up detailed database information
+ #if params["progress"] == "1":
+ # return params
+
+ a,b = trait.dbNameToTypeId(cursor, params["targetDatabase"]) # XZ, 09/10/2008: add module name
+ params["targetDatabaseType"] = a
+ params["targetDatabaseId"] = b
+ params["targetDatabaseName"] = params["targetDatabase"]
+
+ return params
+
+# readInputFile: DB cursor -> string -> string, (arrayof Trait)
+def readInputFile(cursor, filename):
+ """
+ To read an input file with n lines in the following format
+ <databasetype>,<databaseid>,<traitname>
+ and retrieve and populate traits with appropriate data
+ from the database
+
+ Also, for our purposes. we store the database type and
+ database id in fields attached to the trait instances. We use
+ this information to generate Javascript popups with trait
+ information.
+
+ In addition, we read the strain of mice that the traits are
+ from so we can only show those databases to correlate against.
+ """
+ handle = open(filename)
+ line = handle.readline()
+ inbredSetName = line.strip()
+ line = handle.readline()
+ traits = []
+
+# XZ, 09/10/2008: In this while loop block, I changed the original variable name 'trait' to 'oneTrait'
+ while line != "":
+ line = line.strip()
+ dbType, dbId, tName = line.split(",")
+
+ if dbType == "ProbeSet":
+ oneTrait = trait.queryProbeSetTraitByName(cursor, tName) # XZ, 09/10/2008: add module name
+ oneTrait.populateDataId(cursor, dbId)
+ oneTrait.dbName = trait.dbTypeIdToName(cursor, dbType, dbId) # XZ, 09/10/2008: add module name
+ elif dbType == "Geno":
+ speciesId = trait.getSpeciesIdByDbTypeId(cursor, dbType, dbId)
+ oneTrait = trait.queryGenotypeTraitByName(cursor, speciesId, tName) # XZ, 09/10/2008: add module name
+ oneTrait.populateDataId(cursor, dbId)
+ oneTrait.dbName = trait.dbTypeIdToName(cursor, dbType, dbId) # XZ, 09/10/2008: add module name
+ elif dbType == "Publish":
+ oneTrait = trait.queryPublishTraitByName(cursor, dbId, tName) # XZ, 09/10/2008: add module name
+ oneTrait.populateDataId(cursor, dbId)
+ oneTrait.dbName = trait.dbTypeIdToName(cursor, dbType, dbId) # XZ, 09/10/2008: add module name
+ elif dbType == "Temp":
+ oneTrait = trait.queryTempTraitByName(cursor, tName) # XZ, 09/10/2008: add module name
+ oneTrait.populateDataId(cursor, dbId)
+ oneTrait.dbName = "Temp"
+
+ oneTrait.populateStrainData(cursor)
+ traits.append(oneTrait)
+
+ line = handle.readline()
+
+ return inbredSetName, traits
+
+# loadDatabase: Cursor -> ParamDict -> arrayof Trait
+def loadDatabase(cursor, p):
+ """
+ To load a set of traits as specified by the
+ targetDatabaseId
+ and targetDatabaseType parameters
+
+ Cursor should be a fastCursor from the webqtl library (i.e.
+ a MySQLdb SSCursor).
+
+ Calling populateStrainData 20,000 or so times on a ProbeSet
+ is really inefficent, so I wrote an optimized queryPopulatedProbeSetTraits
+ in the trait module that uses a join to get all of the rows in
+ bulk, store the resultset on the server, and do all sorts of nice buffering.
+ It's about two or three times faster.
+ """
+ if p["targetDatabaseType"] == "ProbeSet": # XZ, 09/10/2008: add module name
+ dbTraits = trait.queryPopulatedProbeSetTraits(cursor,
+ p["targetDatabaseId"])
+ elif p["targetDatabaseType"] == "Publish": # XZ, 09/10/2008: add module name
+ dbTraits = trait.queryPublishTraits(cursor,
+ p["targetDatabaseId"])
+ psd = trait.PublishTrait.populateStrainData
+ elif p["targetDatabaseType"] == "Geno": # XZ, 09/10/2008: add module name
+ dbTraits = trait.queryGenotypeTraits(cursor,
+ p["targetDatabaseId"])
+ psd = trait.GenotypeTrait.populateStrainData
+ else:
+ print "Unknown target database type %s" % p["targetDatabaseType"]
+
+ if p["targetDatabaseType"] != "ProbeSet":
+ map(psd, dbTraits, [cursor]*len(dbTraits))
+
+ return dbTraits
+
+def runProbeSetCorrelations(cursor, p, traits):
+ """
+ To run the correlations between the traits and the database.
+ This function computes a correlation coefficent between each
+ trait and every entry in the database, and partitions the database
+ into a disjoint array of arrays which it returns.
+
+ The length of the return array is 2^n, where n is the length of
+ the trait array. Which constitutent element a of the return array
+ a given trait ends up in is determined by the following formula
+ i = i_02^0 + ... + i_(n-1)2^(n-1)
+ where i_0 is 1 if corr(a,trait 0) >= threshold and 0 otherwise
+
+ Since most of the several thousand database traits will end up
+ with i=0, we don't return them, so the first element of the
+ return array will be empty.
+
+ A particular element of subarray j of the return array contains
+ a 2-tuple (trait,kvalues). The variable trait is obviously the
+ particular database trait that matches the user traits l_1, ..., l_m
+ to which subarray j corresponds. kvalues is a list of the correlation
+ values linking trait to l_1, ..., l_m, so the length of kvalues is
+ the number of 1s in the binary representation of j (there must be
+ a better way to describe this length).
+
+ The return array is an array of 2-tuples. The first element of
+ each tuple is the index of the particular subarray, and the second
+ element is the subarray itself. The array is sorted in descending
+ order by the number of 1's in the binary representation of the
+ index so the first few subarrays are the ones that correspond to
+ the largest sets. Each subarray is then sorted by the average of
+ the magnitude of the individual correlation values.
+ """
+
+ kMin = p["threshold"]
+ traitArrays = {}
+
+ # TODO: Add Spearman support
+ freezeId = p["targetDatabaseId"]
+ if p["correlation"] == "pearson":
+ correlations = correlation.calcProbeSetPearsonMatrix(cursor, freezeId, traits) #XZ, 09/10/2008: add module name
+ else:
+ correlations = correlation.calcProbeSetSpearmanMatrix(freezeId, traits) #XZ, 09/10/2008: add module name
+
+ # now we test all of the correlations in bulk
+ test = numarray.absolute(correlations)
+ test = numarray.greater_equal(test, kMin)
+ test = test.astype(numarray.Int8)
+ #print test
+
+ db = trait.queryProbeSetTraits(cursor, freezeId) #XZ, 09/10/2008: add module name
+ for i in range(len(db)):
+ cIndex = 0
+ prods = []
+ for j in range(len(traits)):
+ if test[i,j] == 1:
+ cIndex += pow(2, j)
+ prods.append(correlations[i,j])
+ if cIndex != 0:
+ if not traitArrays.has_key(cIndex):
+ traitArrays[cIndex] = []
+
+ traitArrays[cIndex].append((db[i], prods))
+
+
+ # sort each inner list of traitArrays
+ # so the matched traits appear in descending order by the
+ # average magnitude of the correlation
+ def customCmp(traitPair, traitPair2):
+ magAvg1 = numarray.average(map(abs, traitPair[1]))
+ magAvg2 = numarray.average(map(abs, traitPair2[1]))
+
+ # invert the sign to get descending order
+ return -cmp(magAvg1, magAvg2)
+
+ for traitArray in traitArrays.values():
+ traitArray.sort(customCmp)
+
+ # sort the outer list of traitArrays
+ traitArrays2 = []
+ i = 0
+ for key in traitArrays.keys():
+ a = traitArrays[key]
+ if len(a) > 0:
+ traitArrays2.append((key,a,len(binaryDecompose(key)),
+ len(a)))
+
+ # we sort by the number of 1's in the binary output
+ # and then by the size of the list, both in descending order
+ def customCmp2(aL,bL):
+ a = -cmp(aL[2], bL[2])
+ if a == 0:
+ return -cmp(aL[3], bL[3])
+ else:
+ return a
+
+ traitArrays2.sort(customCmp2)
+
+ return traitArrays2
+
+def runCorrelations(p, strainCount, traits, db):
+ """
+ To run the correlations between the traits and the database.
+ This function computes a correlation coefficent between each
+ trait and every entry in the database, and partitions the database
+ into a disjoint array of arrays which it returns.
+
+ The length of the return array is 2^n, where n is the length of
+ the trait array. Which constitutent element a of the return array
+ a given trait ends up in is determined by the following formula
+ i = i_02^0 + ... + i_(n-1)2^(n-1)
+ where i_0 is 1 if corr(a,trait 0) >= threshold and 0 otherwise
+
+ Since most of the several thousand database traits will end up
+ with i=0, we don't return them, so the first element of the
+ return array will be empty.
+
+ A particular element of subarray j of the return array contains
+ a 2-tuple (trait,kvalues). The variable trait is obviously the
+ particular database trait that matches the user traits l_1, ..., l_m
+ to which subarray j corresponds. kvalues is a list of the correlation
+ values linking trait to l_1, ..., l_m, so the length of kvalues is
+ the number of 1s in the binary representation of j (there must be
+ a better way to describe this length).
+
+ The return array is an array of 2-tuples. The first element of
+ each tuple is the index of the particular subarray, and the second
+ element is the subarray itself. The array is sorted in descending
+ order by the number of 1's in the binary representation of the
+ index so the first few subarrays are the ones that correspond to
+ the largest sets. Each subarray is then sorted by the average of
+ the magnitude of the individual correlation values.
+ """
+ kMin = p["threshold"]
+ traitArrays = {}
+
+ # TODO: Add Spearman support
+ if p["correlation"] == "pearson":
+ correlations = correlation.calcPearsonMatrix(db, traits, strainCount) #XZ, 09/10/2008: add module name
+ else:
+ correlations = correlation.calcSpearmanMatrix(db, traits, strainCount) #XZ, 09/10/2008: add module name
+
+ # now we test all of the correlations in bulk
+ test = numarray.absolute(correlations)
+ test = numarray.greater_equal(test, kMin)
+ test = test.astype(numarray.Int8)
+ #print test
+
+
+ for i in range(len(db)):
+ cIndex = 0
+ prods = []
+ for j in range(len(traits)):
+ if test[i,j] == 1:
+ cIndex += pow(2, j)
+ prods.append(correlations[i,j])
+ if cIndex != 0:
+ if not traitArrays.has_key(cIndex):
+ traitArrays[cIndex] = []
+
+ traitArrays[cIndex].append((db[i], prods))
+
+ # sort each inner list of traitArrays
+ # so the matched traits appear in descending order by the
+ # average magnitude of the correlation
+ def customCmp(traitPair, traitPair2):
+ magAvg1 = numarray.average(map(abs, traitPair[1]))
+ magAvg2 = numarray.average(map(abs, traitPair2[1]))
+
+ # invert the sign to get descending order
+ return -cmp(magAvg1, magAvg2)
+
+ for traitArray in traitArrays.values():
+ traitArray.sort(customCmp)
+
+ # sort the outer list of traitArrays
+ traitArrays2 = []
+ i = 0
+ for key in traitArrays.keys():
+ a = traitArrays[key]
+ if len(a) > 0:
+ traitArrays2.append((key,a,len(binaryDecompose(key)),
+ len(a)))
+
+ # we sort by the number of 1's in the binary output
+ # and then by the size of the list, both in descending order
+ def customCmp2(aL,bL):
+ a = -cmp(aL[2], bL[2])
+ if a == 0:
+ return -cmp(aL[3], bL[3])
+ else:
+ return a
+
+ traitArrays2.sort(customCmp2)
+
+ return traitArrays2
+
+
+# XZ, 09/09/2008: In multiple trait correlation result page,
+# XZ, 09/09/2008: click "Download a text version of the above results in CSV format"
+
+# TraitCorrelationText: a class to display trait correlations
+# as textual output
+class TraitCorrelationText:
+ # build a text shell to describe the given trait correlations
+ # this method sets self.output; use str(self) to actually
+ # get the text page
+ #
+ # traits is a list of traits and traitArray is a
+ # list of 3-tuples: index, traits', garbage
+ # where index is a binary-encoded description of which subset of
+ # traits the list traits' matches
+ #
+ # traits' is a list of 3-tuples as well: trait, correlations, garbage
+ # where trait is a particular trait and correlations is a list of float
+ # correlations (matching traits above)
+ def __init__(self, p, traits, traitArray):
+ output = "Correlation Comparison\n"
+ output += "from WebQTL and the University of Tennessee Health Science Center\n"
+ output += "initiated at " + time.asctime(time.gmtime()) + " UTC\n\n"
+
+ output += self.showOptionPanel(p)
+ output += self.showSelectedTraits(traits)
+ output += self.showSummaryCorrelationResults(p, traits, traitArray)
+ output += self.showDetailedCorrelationResults(p, traits, traitArray)
+
+ self.output = output
+
+ # showOptionPanel: ParamDict -> string
+ # to display the options used to run this correlation
+ def showOptionPanel(self, params):
+ output = "Correlation Comparison Options:\n"
+ output += "Target database,%s\n" % params["targetDatabase"]
+ output += "Correlation type,%s\n" % params["correlationName"]
+ output += "Threshold,%f\n" % params["threshold"]
+ #output += "Subsets to Show,%d\n" % params["subsetCount"]
+ #output += "Traits to Show Per Subset,%d\n\n" % params["subsetSize"]
+ return output
+
+ # showSelectedTraits: (listof Trait) -> string
+ # to display the traits compared with the database
+ # note: we can't use tabular output because the traits could be of
+ # different types and produce different fields
+ def showSelectedTraits(self, traits):
+ output = "Selected Traits:\n"
+ for trait in traits:
+ output += '"' + trait.longName() + '"' + "\n"
+ output += "\n"
+ return output
+
+ # showSummaryCorrelationResults: ParamDict -> (listof Trait) ->
+ # TraitArray -> string
+ # see comment for __init__ for a description of TraitArray
+ #
+ # to show a summary (sets and sizes) of the correlation results
+ # as well as an X to indicate whether they will be included
+ # in the detailed output
+ def showSummaryCorrelationResults(self, p, traits, traitArray):
+ output = "Correlation Comparison Summary:\n"
+
+ #if p["subsetCount"] != -1:
+ # ourSubsetCount = min(p["subsetCount"], len(traitArray))
+ #else:
+
+ ourSubsetCount = len(traitArray)
+
+ displayDecomposition = binaryDecompose(p["displaySets"])
+ for j in range(ourSubsetCount):
+ i = traitArray[j][0]
+ traitSubarray = traitArray[j][1]
+ if len(traitSubarray) == 0:
+ continue
+
+ targetTraits = decomposeIndex(traits, i)
+ traitDesc = string.join(map(trait.Trait.shortName, targetTraits), # XZ, 09/10/2008: add module name
+ ", ")
+ if j in displayDecomposition:
+ checked = "X"
+ else:
+ checked = ""
+
+ output += '"%s","%s","%d"\n' % (checked, traitDesc, len(traitSubarray))
+
+ output += "\n"
+ return output
+
+ # showDetailedCorrelationResults: ParamDict -> (listof Trait) ->
+ # TraitArray -> string
+ #
+ # to show a detailed list of the correlation results; that is,
+ # to completely enumerate each subset of traitArray using the
+ # filtering parameters in p
+ def showDetailedCorrelationResults(self, p, traits, traitArray):
+ output = "Correlation Comparison Details:\n"
+ displayDecomposition = binaryDecompose(p["displaySets"])
+ displayDecomposition.sort()
+
+ def formatCorr(c):
+ return "%.4f" % c
+
+ for j in displayDecomposition:
+ i = traitArray[j][0]
+ traitSubarray = traitArray[j][1]
+
+ if len(traitSubarray) == 0:
+ continue
+
+ targetTraits = decomposeIndex(traits, i)
+ extraColumnHeaders = map(trait.Trait.shortName, targetTraits) # XZ, 09/10/2008: add module name
+ traitDesc = string.join(extraColumnHeaders, ", ")
+
+ #if(p["subsetSize"] != -1 and len(traitSubarray) > p["subsetSize"]):
+ # traitDesc += ",(showing top %s of %s)" % (p["subsetSize"],
+ # len(traitSubarray))
+ # traitSubarray = traitSubarray[0:p["subsetSize"]]
+
+ output += "%s\n" % traitDesc
+ output += traitSubarray[0][0].csvHeader([], extraColumnHeaders)
+ output += "\n"
+ for oneTrait, corr in traitSubarray:#XZ, 09/10/2008: change original variable name 'trait' to 'oneTrait'
+ corr = map(formatCorr, corr)
+ output += oneTrait.csvRow([], corr) + "\n"
+
+ output += "\n"
+
+ return output
+
+ # __str__ : string
+ # to return self.output as the string representation of this page
+ # self.output is built in __init__
+ def __str__(self):
+ return self.output
+
+# TraitCorrelationPage: a class to display trait correlations
+# for now this is just one HTML file, so we don't even write it
+# to a temporary file somewhere
+class TraitCorrelationPage(templatePage.templatePage):
+ """
+ Using the templatePage class, we build an HTML shell for
+ the core data here: the trait correlation lists.
+
+ The way templatePage works, we build the page in pieces in
+ the __init__ method and later on use the inherited write
+ method to render the page.
+ """
+ def __init__(self, fd, p, cursor, traits, traitArray, inbredSetName, txtFilename):
+
+ templatePage.templatePage.__init__(self, fd)
+
+ self.dict["title"] = "Correlation Comparison"
+ self.dict["basehref"] = ""
+ # NL: deleted js1 content part, since it has not been used in this project
+ self.dict["js1"] = ""
+ self.dict["js2"] = ""
+
+ body = "<td><h1>Correlation Comparison</h1>"
+ body += "<p>Run at %s UTC</p>" % time.asctime(time.gmtime())
+ body += """
+ <p>The correlation comparison tool identifies intersecting sets of traits that are
+correlated with your selections at a specified threshold. A correlation comparison
+involves the following steps:</p>
+<ol>
+<li><p>
+<b>Correlate:</b>
+Choose a <i>Target Database</i>, a <i>Correlation Type</i>, and a <i>Correlation
+Threshold</i>. For your initial correlation, leave <i>Number of Subsets to Show</i> and
+<i>Traits to Show per Subset</i> at their default values of 10. Using the Correlation
+Options panel, you can adjust the <i>Correlation Threshold</i>, <i>Number of Subsets to
+Show</i>, and <i>Traits to Show per Subset</i>.
+</p></li>
+
+<li><p>
+<b>Add to Collection:</b>
+You can use the check boxes in the <i>Correlation
+Comparison Details</i> panel and the buttons at the bottom of the page to add these
+results to your selections page for further analysis in WebQTL.
+</p></li>
+
+<li><p>
+<b>Filter:</b>
+Using the <i>Correlation Comparison Summary</i> panel, choose which
+subsets you would like to display for export. Note that if you change the
+parameters in the <i>Correlation Options</i> panel, you will need to re-apply your filter.
+</p></li>
+
+<li><p>
+<b>Export:</b>
+Once you are satisfied with your report, use the export link at
+the bottom of the page to save the report as a comma-separated (CSV) text file
+which you can then import into Excel or another tool. Note: the exported report
+will list all subsets in the summary view and only those traits in the subsets
+you have selected in the Filter step.
+</p></li>
+</ol>
+"""
+
+# body += """
+# <p>The correlation
+# comparison tool identifies the intersecting sets of traits that are
+# correlated with your selections. A correlation comparison involves
+# the following steps:</p>
+# <ol>
+# <li><p><b>Correlate:</b> Choose a <i>Target Database</i>, a <i>Correlation Type</i>, and a <i>Correlation Threshold</i>.
+# For the initial correlation, leave <i>Subsets to Show</i> and <i>Traits to Show per Subset</i>
+# at their default values of 10.</p></li>
+# <li><p><b>Refine Correlation:</b> Using the <i>Correlation Options</i> panel,
+# adjust the <i>Correlation Threshold</i>, <i>Subsets to Show</i>, and <i>Traits to
+# Show per Subset</i> until you have a reasonable number of traits.</p></li>
+# <li><p><b>Filter:</b> Using the <i>Correlation Comparison Summary</i> panel, choose which subsets you would
+# like to see. Note that if you change the parameters in the <i>Correlation Options</i> panel, you will
+# loose the filter you have selected.</p></li>
+# <li><p><b>Export:</b> Once you are satisfied with your report, use the export
+# link at the bottom of the page to save the report as a comma-separated (CSV) text file which
+# you can then import into Excel or another tool. Note: the exported report
+# will show all subsets in the summary view and all traits in each subset you have
+# selected in the Filter step.
+# <li><p><b>Shopping Cart:</b> In addition, you can use the
+# check boxes in the <i>Correlation Comparison Details</i> panel and the
+# buttons at the bottom of the page to add the traits you have found to the shopping cart.</p>
+# </li>
+# </ol>
+# """
+
+ body += self.showOptionPanel(p, cursor, inbredSetName)
+ body += self.showSelectedTraits(traits, p, inbredSetName)
+
+ if p["firstRun"] == 0:
+ body += self.showCorrelationResults(p, inbredSetName, traits, traitArray)
+
+ exportParams = copy.copy(p)
+ exportParams["outputType"] = "text"
+
+ body += ('''
+ <h2>Export these results</h2>
+ <p>
+ <a href="/image/%s">Download a text version of the above results in CSV format</a>. This text version differs from
+ the version you see on this page in two ways. First, the summary view shows all subsets. Second, the details
+ view shows all traits in the subsets that you have selected.
+ </p>
+ '''
+ % txtFilename)
+
+
+
+ body += "</td>"
+ self.dict["body"] = body
+
+
+ # showOptionPanel: ParamDict -> Cursor -> String -> String
+ # to build an option panel for the multitrait correlation
+ # we expect the database list to be a list of 2-tuples
+ # the first element of each tuple is the short name
+ # and the second element of the tuple is the long name
+ def showOptionPanel(self, params, cursor, inbredSetName):
+ output = '''
+ <h2>Correlation Options</h2>
+ <FORM METHOD="POST" ACTION="%s%s" ENCTYPE="multipart/form-data">
+ <INPUT TYPE="hidden" NAME="FormID" VALUE="compCorr2">
+ <input type="hidden" name="filename" value="%s">
+ <input type="hidden" name="firstRun" value="0">
+ <input type="hidden" name="progress" value="1">
+ <table>
+ <tr>
+ <td>Target Database:</td><td>
+ ''' % (webqtlConfig.CGIDIR, webqtlConfig.SCRIPTFILE, params["filename"])
+
+ output += htmlModule.genDatabaseMenu(db = cursor,
+ public=0,
+ RISetgp = inbredSetName,
+ selectname="targetDatabase",
+ selected=params["targetDatabase"])
+ output += "</td></tr>"
+
+ corrSelected = ["",""]
+
+ if params["correlation"] == "pearson":
+ corrSelected[0] = "SELECTED"
+ else:
+ corrSelected[1] = "SELECTED"
+
+ output += ('''
+ <tr>
+ <td>Correlation Method:</td>
+ <td><select name="correlation">
+ <option value="pearson" %s>Pearson</option>
+ <!--<option value="spearman" %s>Spearman</option>-->
+ </select></td></tr>
+ ''' % (corrSelected[0], corrSelected[1]))
+ output += ('<tr><td>Correlation Threshold:</td><td><input name="threshold" value="%s" /></td></tr>'
+ % params["threshold"])
+ output += ('<tr><td>Subsets to Show (-1 to show all subsets):</td><td><input name="subsetCount" value="%s" /></td></tr>'
+ % params["subsetCount"])
+ output += ('<tr><td>Traits to Show per Subset (-1 to show all traits):</td><td><input name="subsetSize" value="%s" /></td></tr>'
+ % params["subsetSize"])
+
+ # a cosmetic change to hopefully make this form a bit easier to use
+# if params["firstRun"] == 1:
+# applyName = "Correlate"
+# else:
+# applyName = "Refine Correlation"
+
+ output += '''
+ <tr>
+ <td colspan="2"><input class="button" type="submit" value="Correlate" /></td>
+ </tr>
+ </table>
+ </form>
+ '''
+
+ return output
+
+ # showSelectedTraits: listof Trait -> string
+ # to show a list of the selected traits
+ def showSelectedTraits(self, traits, p, inbredSetName):
+ output = '''
+ <form action="%s%s" method="post" name="showDatabase">
+ <INPUT TYPE="hidden" NAME="FormID" VALUE="showDatabase">
+
+ <input type="hidden" name="incparentsf1" value="ON">
+ <input type="hidden" name="ShowStrains" value="ON">
+ <input type="hidden" name="ShowLine" value="ON">
+ <input type="hidden" name="database" value="">
+ <input type="hidden" name="ProbeSetID" value="">
+ <input type="hidden" name="RISet" value="%s">
+ <input type="hidden" name="CellID" value="">
+ <input type="hidden" name="database2" value="">
+ <input type="hidden" name="rankOrder" value="">
+ <input type="hidden" name="ProbeSetID2" value="">
+ <input type="hidden" name="CellID2" value="">
+ ''' % (webqtlConfig.CGIDIR, webqtlConfig.SCRIPTFILE, inbredSetName)
+
+ output += "<h2>Selected Traits</h2>"
+ output += '<table cellpadding="2" cellspacing="0"><tr bgcolor="FFFFFF"><th>Database</th><th>Trait</th></tr>'
+ flip = 1
+ colors = ["FFFFFF", "cccccc"]
+
+ for trait in traits:
+ # we take advantage of the secret dbName attribute that
+ # loadDatabase fills in
+ descriptionString = trait.genHTML()
+ if trait.db.type == 'Publish' and trait.confidential:
+ descriptionString = trait.genHTML(privilege=self.privilege, userName=self.userName, authorized_users=trait.authorized_users)
+ output += '''
+ <tr bgcolor="%s"><td><a href="/dbdoc/%s.html">%s</a></td>
+ <td><a href="javascript:showDatabase2('%s', '%s', '')">%s</a></td>
+ </tr>
+ ''' % (colors[flip], trait.db.name, trait.db.name, trait.db.name, trait.name, descriptionString)
+ flip = not flip
+
+ output += "</table></form>"
+ return output
+
+
+ # showSummaryCorrelationResults
+ # show just the number of traits in each subarray
+ def showSummaryCorrelationResults(self, p, traits, traitArray):
+ output = '''
+ <form action="%s%s" method="post">
+ <INPUT TYPE="hidden" NAME="FormID" VALUE="compCorr2">
+ <input type="hidden" name="filename" value="%s">
+ <input type="hidden" name="firstRun" value="0">
+ <input type="hidden" name="progress" value="1">
+ <input type="hidden" name="correlation" value="%s">
+ <input type="hidden" name="threshold" value="%s">
+ <input type="hidden" name="rankOrder" value="">
+ <input type="hidden" name="subsetCount" value="%s">
+ <input type="hidden" name="subsetSize" value="%s">
+ <input type="hidden" name="targetDatabase" value="%s">
+ ''' % (webqtlConfig.CGIDIR, webqtlConfig.SCRIPTFILE, p["filename"], p["correlation"], p["threshold"],
+ p["subsetCount"], p["subsetSize"], p["targetDatabase"])
+
+ output += '''
+ <table cellpadding="2" cellspacing="0">
+ <tr>
+ <th>Trait Subsets</th>
+ <th colspan="2">Intersecting Set Size</th>
+ </tr>
+ '''
+ # figure out a scale for the summary graph
+ # for now we set max = 300 pixels wide
+ if p["subsetCount"] != -1:
+ ourSubsetCount = min(p["subsetCount"], len(traitArray))
+ else:
+ ourSubsetCount = len(traitArray)
+
+ screenWidth = 600
+ lengths = []
+ for j in range(ourSubsetCount):
+ lengths.append(len(traitArray[j][1]))
+ maxLength = max(lengths)
+
+ displayDecomposition = binaryDecompose(p["displaySets"])
+ flip = 0
+ colors = ["FFFFFF", "cccccc"]
+
+ for j in range(ourSubsetCount):
+ i = traitArray[j][0]
+ traitSubarray = traitArray[j][1]
+
+ if len(traitSubarray) == 0:
+ continue
+
+ targetTraits = decomposeIndex(traits, i)
+ traitDesc = string.join(map(webqtlTrait.displayName, targetTraits),
+ ", ")
+
+ if j in displayDecomposition:
+ checked = "CHECKED"
+ else:
+ checked = ""
+
+ barWidth = (len(traitSubarray) * screenWidth) / maxLength
+ output += ('''<tr bgcolor="%s">
+ <td><input type="checkbox" name="display%d" value="1" %s>%s</input></td>
+ <td>%s</td>
+ <td><img src="/images/blue.png" width="%d" height="25"></td></tr>'''
+ % (colors[flip], j, checked, traitDesc, len(traitSubarray), barWidth))
+ flip = not flip
+
+ output += '''
+ <tr>
+ <td colspan="3">
+ <input class="button" type="submit" value="Filter" /></td>
+ </tr>
+ </table></form>
+ '''
+ return output
+
+ # showDetailedCorrelationResults
+ # actually show the traits in each subarray
+ def showDetailedCorrelationResults(self, p, inbredSetName, traits,
+ traitArray):
+ output = "<h2>Correlation Comparison Details</h2>"
+
+ # the hidden form below powers all of the JavaScript links,
+ # the shopping cart links, and the correlation plot links
+
+ output += '''
+ <form action="%s%s" method="post">
+ <input type="hidden" name="database" value="%s">
+ <input type="hidden" name="FormID" value="showDatabase">
+ <input type="hidden" name="traitfile" value="">
+ <input type="hidden" name="incparentsf1" value="ON">
+ <input type="hidden" name="ShowStrains" value="ON">
+ <input type="hidden" name="ProbeSetID" value="">
+ <input type="hidden" name="ShowLine" value="ON">
+ <input type="hidden" name="RISet" value="%s">
+ <input type="hidden" name="CellID" value="">
+ <input type="hidden" name="database2" value="">
+ <input type="hidden" name="rankOrder" value="">
+ <input type="hidden" name="ProbeSetID2" value="">
+ <input type="hidden" name="CellID2" value="">
+ ''' % (webqtlConfig.CGIDIR, webqtlConfig.SCRIPTFILE, p["targetDatabase"], inbredSetName)
+
+
+ displayDecomposition = binaryDecompose(p["displaySets"])
+
+ # necessary to ensure that subset order is the same in the
+ # summary and the detailed view
+ displayDecomposition.sort()
+
+ # here's a trick: the first trait we show must have the widest row because it correlates
+ # with the largest set of input traits
+ firstSubset = traitArray[displayDecomposition[0]]
+ firstTrait = firstSubset[1][0][0]
+ extraColumnCount = firstSubset[2]
+ totalColumnCount = 1 + len(firstTrait.row()) + extraColumnCount
+
+ output += "<table cellpadding=2 cellspacing=0>\n"
+ for j in displayDecomposition:
+ i = traitArray[j][0]
+ traitSubarray = traitArray[j][1]
+
+ # we don't display trait combinations for which there are
+ # no correlations
+ if len(traitSubarray) == 0:
+ continue
+
+ # generate a description of the traits that this particular array
+ # matches highly
+ targetTraits = decomposeIndex(traits, i)
+ extraColumnHeaders = map(webqtlTrait.displayName, targetTraits)
+ traitDesc = string.join(extraColumnHeaders, ", ")
+
+ # massage extraColumnHeaders so that they can be wrapped
+ for i in range(len(extraColumnHeaders)):
+ ech = extraColumnHeaders[i]
+ ech = ech.replace("-", " ")
+ ech = ech.replace("_", " ")
+ extraColumnHeaders[i] = ech
+
+ # pad extraColumnHeaders if we have less columns than the max
+ paddingNeeded = extraColumnCount - len(extraColumnHeaders)
+ if paddingNeeded > 0:
+ extraColumnHeaders.extend(paddingNeeded * ["&nbsp;"])
+
+ # we limit the output to the top ones
+ if(p["subsetSize"] != -1 and len(traitSubarray) > p["subsetSize"]):
+ traitDesc += " (showing top %s of %s)" % (p["subsetSize"], len(traitSubarray))
+ traitSubarray = traitSubarray[0:p["subsetSize"]]
+
+ # combine that description with actual database traits themselves
+ # and the correlation values
+ output += '<tr><td colspan="%d"><h3>%s</h3></td></tr>' % (totalColumnCount, traitDesc)
+ #output += '<h3>%s</h3>\n<table cellpadding=2 cellspacing=0>\n'% traitDesc
+
+ # we assume that every trait in traitSubarray is the same type
+ # of trait
+ flip = 0
+ colors = ["FFFFFF", "cccccc"]
+
+ output += traitSubarray[0][0].tableRowHeader(["&nbsp;"], extraColumnHeaders, colors[0])
+
+ for traitPair in traitSubarray:
+ corr = []
+ traitPair[0].dbName = p['targetDatabase']
+ trait = traitPair[0]
+
+ for i in range(len(traitPair[1])):
+ corrValue = traitPair[1][i]
+ corrPlotLink = ('''
+ <a href="javascript:showCorrelationPlot2(db='%s',ProbeSetID='%s',CellID='',db2='%s',ProbeSetID2='%s',CellID2='',rank='%s')">%.2f</a>
+ ''' % (p["targetDatabaseName"], trait.name, targetTraits[i].db.name, targetTraits[i].name, "0", corrValue))
+ corr.append(corrPlotLink)
+
+ corr.extend(paddingNeeded * ["&nbsp;"])
+
+ checkbox = ('<INPUT TYPE="checkbox" NAME="searchResult" VALUE="%s:%s" />'
+ % (p["targetDatabaseName"], trait.name))
+ flip = not flip
+ output += traitPair[0].tableRow([checkbox], corr, colors[flip])
+
+ #output += "</table>"
+ i += 1
+ output += '<tr><td colspan="%d">&nbsp;</td></tr>' % totalColumnCount
+
+ output += "</table>"
+
+ # print form buttons if there were checkboxes above
+ output += '''
+ <div align="left">
+ <INPUT TYPE="button" NAME="addselect" CLASS="button" VALUE="Add to Collection"
+ onClick="addRmvSelection('%s',this.form, 'addToSelection');">
+ <INPUT TYPE="button" NAME="selectall" CLASS="button" VALUE="Select All" onClick="checkAll(this.form);">
+ <INPUT TYPE="reset" CLASS="button" VALUE="Select None">
+ </div>
+ </form>
+ ''' % inbredSetName
+
+ return output
+
+ # showCorrelationResults: ParamDict -> listof Trait -> tupleof (int,arrayof trait) -> String
+ # to build an output display for the multitrait correlation results
+ def showCorrelationResults(self, p, inbredSetName, traits, traitArray):
+ output = '''
+ <h2>Correlation Comparison Summary</h2>
+ <p>
+ %s correlations were computed for each of the selected traits with each trait in
+ the <a href="/dbdoc/%s.html">%s</a> database.
+ Subsets of database traits for which correlations were higher than %s
+ or lower than -%s are shown below based on which traits
+ they correlated highly with. The top %s subsets, ranked by the number of input traits that
+ they correspond with, are shown, and at most %s traits in each subset are shown. </p>
+ ''' % (p["correlationName"],
+ p["targetDatabase"], p["targetDatabaseName"],
+ p["threshold"], p["threshold"], p["subsetCount"],
+ p["subsetSize"])
+
+
+ totalTraits = 0
+ for j in range(len(traitArray)):
+ totalTraits += len(traitArray[j][1])
+
+ if totalTraits == 0:
+ output += """
+ <p>
+ No shared corrrelates were found with your given traits at this
+ threshold. You may wish to lower the correlation threshold or choose different traits.
+ </p>
+ """
+ else:
+ output += self.showSummaryCorrelationResults(p, traits, traitArray)
+ output += self.showDetailedCorrelationResults(p, inbredSetName,
+ traits, traitArray)
+
+ return output
+
+# decomposeIndex: (listof Trait) -> Int ->
+# (listof Trait)
+# to use i to partition T into a sublist
+# each bit in i controls the inclusion or exclusion of a trait
+def decomposeIndex(traits, i):
+ targetTraits = []
+
+ for j in range(len(traits)):
+ # look, mom, a bitwise and!
+ # expression below tests whether the jth bit is
+ # set in i
+ # see runCorrelation for how we decompose the
+ # array index
+ if (i & pow(2,j)) == pow(2,j):
+ targetTraits.append(traits[j])
+
+ return targetTraits
+
+# binaryDecompose: int -> (listof int)
+# to decompose a number into its constituent powers of 2
+# returns a list of the exponents a_1...a_n such that the input m
+# is m = 2^a_1 + ... + 2^a_n
+def binaryDecompose(n):
+ if n == 0:
+ return []
+
+ # we start with the highest power of 2 <= this number
+ # and work our way down, subtracting powers of 2
+ start = long(math.floor(math.log(n)/math.log(2)))
+
+ exponents = []
+ while start >= 0:
+ if n >= long(math.pow(2, start)):
+ n -= math.pow(2,start)
+ exponents.append(start)
+ start -= 1
+ return exponents
+
+# powerOf : int -> int -> boolean
+# to determine whether m is a power of n;
+# more precisely, whether there exists z in Z s.t.
+# n^z = m
+def powerOf(m, n):
+ trialZ = math.floor(math.log(m)/math.log(n))
+ return pow(n,trialZ) == m
+
+
+class compCorrPage(templatePage.templatePage):
+ def __init__(self,fd):
+ templatePage.templatePage.__init__(self, fd)
+
+ if not self.openMysql():
+ return
+
+ cursor = self.cursor
+ params = buildParamDict(cursor, fd)
+
+ # get the input data
+ inbredSetName, traits = readInputFile(cursor, RootDir + params["filename"])
+
+ # and what we are comparing the data to
+ dbTraits = []
+ if params["targetDatabaseType"] != "ProbeSet":
+ dbTraits = loadDatabase(cursor, params)
+
+
+ # run the comparison itself
+ strainCount = trait.queryStrainCount(cursor) # XZ, 09/10/2008: add module name
+ if params["targetDatabaseType"] == "ProbeSet":
+ results = runProbeSetCorrelations(cursor, params, traits)
+ else:
+ results = runCorrelations(params, strainCount, traits, dbTraits)
+
+ # try to be smart about what to output:
+ # we want to limit the number of traits shown, at least initially
+ # and since traitArray is already sorted with most interesting
+ # subsets first, we simply pick up the first 500 or so traits
+ # that we find
+ if params["displaySets"] == 0:
+ selectedTraits = 0
+ for j in range(len(results)):
+ #print "Scanning subarray %d" % j
+ if selectedTraits <= 200:
+ params["displaySets"] += pow(2, j)
+ selectedTraits += len(results[j][1])
+
+ traitList = []
+ for oneTrait in traits: # XZ, 09/10/2008: change the original variable name 'trait' to 'oneTrait'
+ traitName = oneTrait.dbName+'::'+oneTrait.name # XZ, 09/10/2008: change the original variable name 'trait' to 'oneTrait'
+ aTrait = webqtlTrait(cursor=self.cursor, fullname=traitName)
+ traitList.append(aTrait)
+
+ # and generate some output
+ txtOutputFilename = tempfile.mktemp()
+ txtOutputHandle = open(txtOutputFilename, "w")
+ txtOutput = TraitCorrelationText(params, traits, results)
+ txtOutputHandle.write(str(txtOutput))
+ txtOutputHandle.close()
+ txtOutputFilename = os.path.split(txtOutputFilename)[1]
+
+ self.dict['body'] = TraitCorrelationPage(fd, params, cursor, traitList,
+ results, inbredSetName,
+ txtOutputFilename).dict['body']