aboutsummaryrefslogtreecommitdiff
path: root/wqflask/base/mrna_assay_tissue_data.py
blob: e99ef7b539f2ef2c20075530c65e66ec7a9baea3 (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from __future__ import absolute_import, print_function, division

import collections

from flask import g

from utility import db_tools
from utility import Bunch

from MySQLdb import escape_string as escape

from pprint import pformat as pf

from utility.logger import getLogger
logger = getLogger(__name__ )

class MrnaAssayTissueData(object):
    """Tissue probe-set annotation for a set of gene symbols.

    On construction, one row per gene symbol is selected from
    ``TissueProbeSetXRef`` (restricted to ``TissueProbeSetFreezeId=1``):
    the row whose ``Mean`` is the highest for that symbol.  The selected
    rows are stored in ``self.data``, a ``defaultdict`` of ``Bunch``
    objects keyed by the *lower-cased* symbol, each carrying ``gene_id``,
    ``data_id``, ``chr``, ``mb``, ``description`` and
    ``probe_target_description``.
    """

    def __init__(self, gene_symbols=None):
        """Load the annotation rows for ``gene_symbols``.

        gene_symbols -- sized iterable of gene symbol strings; ``None``
                        entries are ignored.  ``None`` (the default) is
                        treated as an empty list.
        """
        if gene_symbols is None:
            gene_symbols = []
        # From here on only the normalised value is used; the original code
        # kept using the raw argument and raised TypeError on the default
        # ``gene_symbols=None``.
        self.gene_symbols = gene_symbols
        # Initialised to False and not modified anywhere in this class.
        self.have_data = False

        self.data = collections.defaultdict(Bunch)

        query =  '''select t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, t.description, t.Probe_Target_Description
                        from (
                        select Symbol, max(Mean) as maxmean
                        from TissueProbeSetXRef
                        where TissueProbeSetFreezeId=1 and '''

        # The inner join is needed to get, for each symbol group, the single
        # record with the highest mean value.  Because the
        # TissueProbeSetFreezeId table is small, the performance of the
        # inner join is acceptable.
        if not gene_symbols:
            query +=  '''Symbol!='' and Symbol Is Not Null group by Symbol)
                as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
                and t.Mean = x.maxmean;
                    '''
        else:
            # NOTE(review): the IN clause is interpolated into the SQL text.
            # Whether db_tools.create_in_clause escapes its values is not
            # visible from here -- confirm before passing untrusted symbols.
            in_clause = db_tools.create_in_clause(gene_symbols)

            #ZS: This was in the query, not sure why: http://docs.python.org/2/library/string.html?highlight=lower#string.lower

            query += ''' Symbol in {} group by Symbol)
                as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
                and t.Mean = x.maxmean;
                    '''.format(in_clause)
        logger.sql(query)
        results = g.db.execute(query).fetchall()

        # A set gives constant-time membership tests in the loop below.
        lower_symbols = {
            gene_symbol.lower()
            for gene_symbol in gene_symbols
            if gene_symbol is not None
        }

        # Only symbols that were explicitly requested are kept.
        # NOTE(review): with an empty gene_symbols the query above selects
        # every non-empty symbol, yet this filter then discards all rows, so
        # self.data stays empty -- confirm that this is intended.
        for result in results:
            symbol = result[0].lower()
            if symbol not in lower_symbols:
                continue

            entry = self.data[symbol]
            entry.gene_id = result.GeneId
            entry.data_id = result.DataId
            entry.chr = result.Chr
            entry.mb = result.Mb
            entry.description = result.description
            entry.probe_target_description = result.Probe_Target_Description

        # Debug output kept from the original implementation.
        print("self.data: ", pf(self.data))

    def get_symbol_values_pairs(self):
        """Return the tissue expression values for every loaded symbol.

        The ``data_id`` of each entry in ``self.data`` is used to look up
        ``TissueProbeSetData.value`` rows.  The result is a dict whose keys
        are gene symbols and whose values are lists of the expression
        values returned for that symbol.

        Attention! All keys are lower case!

        Returns an empty dict, without querying the database, when
        ``self.data`` holds no entries.
        """
        id_list = [entry.data_id for entry in self.data.values()]
        if not id_list:
            return {}

        query = """SELECT TissueProbeSetXRef.Symbol, TissueProbeSetData.value
                       FROM TissueProbeSetXRef, TissueProbeSetData
                       WHERE TissueProbeSetData.Id IN {} and
                             TissueProbeSetXRef.DataId = TissueProbeSetData.Id""".format(db_tools.create_in_clause(id_list))

        logger.sql('tissue query',query)

        symbol_values_dict = {}
        for result in g.db.execute(query).fetchall():
            # Group the values by lower-cased symbol, preserving row order.
            symbol_values_dict.setdefault(result.Symbol.lower(), []).append(result.value)

        return symbol_values_dict


    #def get_tissue_values(self, data_id):
    #    """Gets the tissue values for a particular gene"""
    #
    #    tissue_values=[]
    #
    #    query = """SELECT value, id
    #               FROM TissueProbeSetData
    #               WHERE Id IN {}""".format(db_tools.create_in_clause(data_id))
    #
    #    #try :
    #    results = g.db.execute(query).fetchall()
    #    for result in results:
    #        tissue_values.append(result.value)
    #    #symbol_values_dict[symbol] = value_list
    #    #except:
    #    #    symbol_values_pairs[symbol] = None
    #
    #    return tissue_values

########################################################################################################
#input: cursor, symbolList (list), dataIdDict(Dict): key is symbol
#output: SymbolValuePairDict(dictionary):one dictionary of Symbol and Value Pair.
#        key is symbol, value is one list of expression values of one probeSet.
#function: wrapper function for getSymbolValuePairDict function
#          build gene symbol list if necessary, cut it into small lists if necessary,
#          then call getSymbolValuePairDict function and merge the results.
########################################################################################################

#def get_trait_symbol_and_tissue_values(symbol_list=None):
#    tissue_data = MrnaAssayTissueData(gene_symbols=symbol_list)
#
#    #symbolList,
#    #geneIdDict,
#    #dataIdDict,
#    #ChrDict,
#    #MbDict,
#    #descDict,
#    #pTargetDescDict = getTissueProbeSetXRefInfo(
#    #                    GeneNameLst=GeneNameLst,TissueProbeSetFreezeId=TissueProbeSetFreezeId)
#
#    if len(tissue_data.gene_symbols):
#        return get_symbol_values_pairs(tissue_data)