aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLei Yan2013-06-19 23:04:21 +0000
committerLei Yan2013-06-19 23:04:21 +0000
commit615b861dfd05c04df2e1a753dd135b07c1d88a94 (patch)
tree318586811f2781069e6df0f5910e3b12975efee5
parent82bd8b61b4aed0b0ae07477afae37a846fab35c2 (diff)
downloadgenenetwork2-615b861dfd05c04df2e1a753dd135b07c1d88a94.tar.gz
Moved the normalize_values function to separate file corr_result_helpers.py
Added docstring test to normalize_values Number of overlapping samples column now displays correctly in the correlation results page
-rwxr-xr-xwqflask/base/trait.py7
-rw-r--r--wqflask/utility/corr_result_helpers.py30
-rw-r--r--wqflask/wqflask/correlation/show_corr_results.py183
3 files changed, 44 insertions, 176 deletions
diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py
index 801d32c2..3429d9c1 100755
--- a/wqflask/base/trait.py
+++ b/wqflask/base/trait.py
@@ -286,7 +286,6 @@ class GeneralTrait(object):
escape(self.dataset.name),
escape(self.name))
trait_info = g.db.execute(query).fetchone()
- #print("trait_info is: ", pf(trait_info))
#XZ, 05/08/2009: We also should use Geno.Id to find marker instead of just using Geno.Name
# to avoid the problem of same marker name from different species.
elif self.dataset.type == 'Geno':
@@ -319,7 +318,6 @@ class GeneralTrait(object):
#XZ: assign SQL query result to trait attributes.
for i, field in enumerate(self.dataset.display_fields):
- print(" mike: {} -> {} - {}".format(field, type(trait_info[i]), trait_info[i]))
setattr(self, field, trait_info[i])
if self.dataset.type == 'Publish':
@@ -329,9 +327,6 @@ class GeneralTrait(object):
self.homologeneid = None
- print("self.geneid is:", self.geneid)
- print(" type:", type(self.geneid))
- print("self.dataset.group.name is:", self.dataset.group.name)
if self.dataset.type == 'ProbeSet' and self.dataset.group and self.geneid:
#XZ, 05/26/2010: From time to time, this query get error message because some geneid values in database are not number.
#XZ: So I have to test if geneid is number before execute the query.
@@ -356,7 +351,6 @@ class GeneralTrait(object):
InbredSet.SpeciesId = Species.Id AND
Species.TaxonomyId = Homologene.TaxonomyId
""" % (escape(str(self.geneid)), escape(self.dataset.group.name))
- print("-> query is:", query)
result = g.db.execute(query).fetchone()
#else:
# result = None
@@ -388,7 +382,6 @@ class GeneralTrait(object):
Geno.Name = '{}' and
Geno.SpeciesId = Species.Id
""".format(self.dataset.group.species, self.locus)
- print("query is:", query)
result = g.db.execute(query).fetchone()
self.locus_chr = result[0]
self.locus_mb = result[1]
diff --git a/wqflask/utility/corr_result_helpers.py b/wqflask/utility/corr_result_helpers.py
new file mode 100644
index 00000000..edf32449
--- /dev/null
+++ b/wqflask/utility/corr_result_helpers.py
@@ -0,0 +1,30 @@
+def normalize_values(a_values, b_values):
+ """
+ Trim two lists of values to contain only the values they both share
+
+ Given two lists of sample values, trim each list so that it contains
+ only the samples that contain a value in both lists. Also returns
+ the number of such samples.
+
+ >>> normalize_values([2.3, None, None, 3.2, 4.1, 5], [3.4, 7.2, 1.3, None, 6.2, 4.1])
+ ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3)
+
+ """
+
+ min_length = min(len(a_values), len(b_values))
+ a_new = []
+ b_new = []
+ for counter in range(min_length):
+ if a_values[counter] and b_values[counter]:
+ a_new.append(a_values[counter])
+ b_new.append(b_values[counter])
+
+ num_overlap = len(a_new)
+ assert num_overlap == len(b_new), "Lengths should be the same"
+
+ return a_new, b_new, num_overlap
+
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod() \ No newline at end of file
diff --git a/wqflask/wqflask/correlation/show_corr_results.py b/wqflask/wqflask/correlation/show_corr_results.py
index 3b1ac87d..1410ae0c 100644
--- a/wqflask/wqflask/correlation/show_corr_results.py
+++ b/wqflask/wqflask/correlation/show_corr_results.py
@@ -20,6 +20,9 @@
from __future__ import absolute_import, print_function, division
+import sys
+sys.path.append(".")
+
import gc
import string
import cPickle
@@ -43,7 +46,7 @@ from utility.TDCell import TDCell
from base.trait import GeneralTrait
from base import data_set
from base.templatePage import templatePage
-from utility import webqtlUtil, helper_functions
+from utility import webqtlUtil, helper_functions, corr_result_helpers
from dbFunction import webqtlDatabaseFunction
import utility.webqtlUtil #this is for parallel computing only.
from wqflask.correlation import correlationFunction
@@ -122,22 +125,24 @@ class CorrelationResults(object):
self.correlation_data = {}
for trait, values in self.target_dataset.trait_data.iteritems():
- this_trait_values = []
- target_values = []
+ this_trait_vals = []
+ target_vals = []
for index, sample in enumerate(self.target_dataset.samplelist):
if sample in self.sample_data:
sample_value = self.sample_data[sample]
target_sample_value = values[index]
- this_trait_values.append(sample_value)
- target_values.append(target_sample_value)
+ this_trait_vals.append(sample_value)
+ target_vals.append(target_sample_value)
+
+ this_trait_vals, target_vals, num_overlap = corr_result_helpers.normalize_values(
+ this_trait_vals, target_vals)
- this_trait_values, target_values, num_overlap = normalize_values(this_trait_values,
- target_values)
+ print("num_overlap:", num_overlap)
if self.corr_method == 'pearson':
- sample_r, sample_p = scipy.stats.pearsonr(this_trait_values, target_values)
+ sample_r, sample_p = scipy.stats.pearsonr(this_trait_vals, target_vals)
else:
- sample_r, sample_p = scipy.stats.spearmanr(this_trait_values, target_values)
+ sample_r, sample_p = scipy.stats.spearmanr(this_trait_vals, target_vals)
self.correlation_data[trait] = [sample_r, sample_p, num_overlap]
@@ -940,163 +945,3 @@ class CorrelationResults(object):
return traitList
-
- def createExcelFileWithTitleAndFooter(self, workbook=None, identification=None, db=None, returnNumber=None):
-
- worksheet = workbook.add_worksheet()
-
- titleStyle = workbook.add_format(align = 'left', bold = 0, size=14, border = 1, border_color="gray")
-
- ##Write title Info
- # Modified by Hongqiang Li
- worksheet.write([1, 0], "Citations: Please see %s/reference.html" % webqtlConfig.PORTADDR, titleStyle)
- worksheet.write([1, 0], "Citations: Please see %s/reference.html" % webqtlConfig.PORTADDR, titleStyle)
- worksheet.write([2, 0], "Trait : %s" % identification, titleStyle)
- worksheet.write([3, 0], "Database : %s" % db.fullname, titleStyle)
- worksheet.write([4, 0], "Date : %s" % time.strftime("%B %d, %Y", time.gmtime()), titleStyle)
- worksheet.write([5, 0], "Time : %s GMT" % time.strftime("%H:%M ", time.gmtime()), titleStyle)
- worksheet.write([6, 0], "Status of data ownership: Possibly unpublished data; please see %s/statusandContact.html for details on sources, ownership, and usage of these data." % webqtlConfig.PORTADDR, titleStyle)
- #Write footer info
- worksheet.write([9 + returnNumber, 0], "Funding for The GeneNetwork: NIAAA (U01AA13499, U24AA13513), NIDA, NIMH, and NIAAA (P20-DA21131), NCI MMHCC (U01CA105417), and NCRR (U01NR 105417)", titleStyle)
- worksheet.write([10 + returnNumber, 0], "PLEASE RETAIN DATA SOURCE INFORMATION WHENEVER POSSIBLE", titleStyle)
-
- return worksheet
-
-
- def getTableHeaderForGeno(self, method=None, worksheet=None, newrow=None, headingStyle=None):
-
- tblobj_header = []
-
- if method in ["1","3","4"]:
- tblobj_header = [[THCell(HT.TD(' ', Class="fs13 fwb ffl b1 cw cbrb"), sort=0),
- THCell(HT.TD('Record', HT.BR(), 'ID', HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text='Record ID', idx=1),
- THCell(HT.TD('Location', HT.BR(), 'Chr and Mb', HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text='Location (Chr and Mb)', idx=2),
- THCell(HT.TD(HT.Href(
- text = HT.Span('Sample',HT.BR(), 'r', HT.Sup(' ?', style="color:#f00"),HT.BR(), Class="fs13 fwb ffl cw"),
- target = '_blank',
- url = "/correlationAnnotation.html#genetic_r"),
- Class="fs13 fwb ffl b1 cw cbrb", nowrap='ON'), text="Sample r", idx=3),
- THCell(HT.TD('N',HT.BR(),'Cases',HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text="N Cases", idx=4),
- THCell(HT.TD(HT.Href(
- text = HT.Span('Sample',HT.BR(), 'p(r)', HT.Sup(' ?', style="color:#f00"),HT.BR(), Class="fs13 fwb ffl cw"),
- target = '_blank',
- url = "/correlationAnnotation.html#genetic_p_r"),
- Class="fs13 fwb ffl b1 cw cbrb", nowrap='ON'), text="Sample p(r)", idx=5)]]
-
- for ncol, item in enumerate(['Record ID', 'Location (Chr, Mb)', 'Sample r', 'N Cases', 'Sample p(r)']):
- worksheet.write([newrow, ncol], item, headingStyle)
- worksheet.set_column([ncol, ncol], 2*len(item))
- else:
- tblobj_header = [[THCell(HT.TD(' ', Class="fs13 fwb ffl b1 cw cbrb"), sort=0),
- THCell(HT.TD('Record', HT.BR(), 'ID', HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text='Record ID', idx=1),
- THCell(HT.TD('Location', HT.BR(), 'Chr and Mb', HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text='Location (Chr and Mb)', idx=2),
- THCell(HT.TD(HT.Href(
- text = HT.Span('Sample',HT.BR(), 'rho', HT.Sup(' ?', style="color:#f00"),HT.BR(), Class="fs13 fwb ffl cw"),
- target = '_blank',
- url = "/correlationAnnotation.html#genetic_rho"),
- Class="fs13 fwb ffl b1 cw cbrb", nowrap='ON'), text="Sample rho", idx=3),
- THCell(HT.TD('N',HT.BR(),'Cases',HT.BR(), Class="fs13 fwb ffl b1 cw cbrb"), text="N Cases", idx=4),
- THCell(HT.TD(HT.Href(
- text = HT.Span('Sample',HT.BR(), 'p(rho)', HT.Sup(' ?', style="color:#f00"),HT.BR(), Class="fs13 fwb ffl cw"),
- target = '_blank',
- url = "/correlationAnnotation.html#genetic_p_rho"),
- Class="fs13 fwb ffl b1 cw cbrb", nowrap='ON'), text="Sample p(rho)", idx=5)]]
-
- for ncol, item in enumerate(['Record ID', 'Location (Chr, Mb)', 'Sample rho', 'N Cases', 'Sample p(rho)']):
- worksheet.write([newrow, ncol], item, headingStyle)
- worksheet.set_column([ncol, ncol], 2*len(item))
-
-
- return tblobj_header, worksheet
-
-
- def getTableBodyForGeno(self, traitList, formName=None, worksheet=None, newrow=None, corrScript=None):
-
- tblobj_body = []
-
- for thisTrait in traitList:
- tr = []
-
- trId = str(thisTrait)
-
- corrScript.append('corrArray["%s"] = {corr:%1.4f};' % (trId, thisTrait.corr))
-
- tr.append(TDCell(HT.TD(HT.Input(type="checkbox", Class="checkbox", name="searchResult",value=trId, onClick="highlight(this)"), nowrap="on", Class="fs12 fwn ffl b1 c222"), text=trId))
-
- tr.append(TDCell(HT.TD(HT.Href(text=thisTrait.name,url="javascript:showTrait('%s', '%s')" % (formName, thisTrait.name), Class="fs12 fwn ffl"),align="left", Class="fs12 fwn ffl b1 c222"), text=thisTrait.name, val=thisTrait.name.upper()))
-
- #XZ: trait_location_value is used for sorting
- trait_location_repr = '--'
- trait_location_value = 1000000
-
- if thisTrait.chr and thisTrait.mb:
- try:
- trait_location_value = int(thisTrait.chr)*1000 + thisTrait.mb
- except:
- if thisTrait.chr.upper() == 'X':
- trait_location_value = 20*1000 + thisTrait.mb
- else:
- trait_location_value = ord(str(thisTrait.chr).upper()[0])*1000 + thisTrait.mb
-
- trait_location_repr = 'Chr%s: %.6f' % (thisTrait.chr, float(thisTrait.mb) )
-
- tr.append(TDCell(HT.TD(trait_location_repr, Class="fs12 fwn b1 c222", nowrap="on"), trait_location_repr, trait_location_value))
-
-
- repr='%3.3f' % thisTrait.corr
- tr.append(TDCell(HT.TD(HT.Href(text=repr, url="javascript:showCorrPlot('%s', '%s')" % (formName, thisTrait.name), Class="fs12 fwn ffl"), Class="fs12 fwn ffl b1 c222", nowrap='ON', align='right'),repr,abs(thisTrait.corr)))
-
- repr = '%d' % thisTrait.nOverlap
- tr.append(TDCell(HT.TD(repr, Class="fs12 fwn ffl b1 c222",align='right'),repr,thisTrait.nOverlap))
-
- repr = webqtlUtil.SciFloat(thisTrait.corrPValue)
- tr.append(TDCell(HT.TD(repr,nowrap='ON', Class="fs12 fwn ffl b1 c222", align='right'),repr,thisTrait.corrPValue))
-
- tblobj_body.append(tr)
-
- for ncol, item in enumerate([thisTrait.name, trait_location_repr, thisTrait.corr, thisTrait.nOverlap, thisTrait.corrPValue]):
- worksheet.write([newrow, ncol], item)
- newrow += 1
-
- return tblobj_body, worksheet, corrScript
-
-def normalize_values(values_1, values_2):
- N = min(len(values_1), len(values_2))
- X = []
- Y = []
- for i in range(N):
- if values_1[i]!= None and values_2[i]!= None:
- X.append(values_1[i])
- Y.append(values_2[i])
- num_overlap = len(X)
-
- return (X, Y, num_overlap)
-
-
-def cal_correlation(values_1, values_2):
- N = min(len(values_1), len(values_2))
- X = []
- Y = []
- for i in range(N):
- if values_1[i]!= None and values_2[i]!= None:
- X.append(values_1[i])
- Y.append(values_2[i])
- NN = len(X)
- if NN <6:
- return (0.0,NN)
- sx = reduce(lambda x,y:x+y,X,0.0)
- sy = reduce(lambda x,y:x+y,Y,0.0)
- x_mean = sx/NN
- y_mean = sy/NN
- xyd = 0.0
- sxd = 0.0
- syd = 0.0
- for i in range(NN):
- xyd += (X[i] - x_mean)*(Y[i] - y_mean)
- sxd += (X[i] - x_mean)*(X[i] - x_mean)
- syd += (Y[i] - y_mean)*(Y[i] - y_mean)
- try:
- corr = xyd/(sqrt(sxd)*sqrt(syd))
- except:
- corr = 0
- return (corr, NN)