aboutsummaryrefslogtreecommitdiff
path: root/wqflask/base/mrna_assay_tissue_data.py
blob: 0220d73b08f18fcb0a48abb3e22e474a2e87e72f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import collections

from flask import g

from utility import db_tools
from utility import Bunch

from utility.db_tools import escape
from gn3.db_utils import database_connector


from utility.logger import getLogger
logger = getLogger(__name__ )

class MrnaAssayTissueData(object):

    def __init__(self, gene_symbols=None):
        self.gene_symbols = gene_symbols
        if self.gene_symbols == None:
            self.gene_symbols = []

        self.data = collections.defaultdict(Bunch)

        query =  '''select t.Symbol, t.GeneId, t.DataId, t.Chr, t.Mb, t.description, t.Probe_Target_Description
                        from (
                        select Symbol, max(Mean) as maxmean
                        from TissueProbeSetXRef
                        where TissueProbeSetFreezeId=1 and '''

        # Note that inner join is necessary in this query to get distinct record in one symbol group
        # with highest mean value
        # Due to the limit size of TissueProbeSetFreezeId table in DB,
        # performance of inner join is acceptable.MrnaAssayTissueData(gene_symbols=symbol_list)
        if len(gene_symbols) == 0:
            query +=  '''Symbol!='' and Symbol Is Not Null group by Symbol)
                as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
                and t.Mean = x.maxmean;
                    '''
        else:
            in_clause = db_tools.create_in_clause(gene_symbols)

            #ZS: This was in the query, not sure why: http://docs.python.org/2/library/string.html?highlight=lower#string.lower
            query += ''' Symbol in {} group by Symbol)
                as x inner join TissueProbeSetXRef as t on t.Symbol = x.Symbol
                and t.Mean = x.maxmean;
                    '''.format(in_clause)


        # lower_symbols = []
        lower_symbols = {}
        for gene_symbol in gene_symbols:
            # lower_symbols[gene_symbol.lower()] = True
            if gene_symbol != None:
                lower_symbols[gene_symbol.lower()] = True

        import time
        # initial_time = time.time()
        # conn,cursor = database_connector()
        # cursor.execute(query)
        # for result in cursor.fetchall():
        #     symbol = result[0]
        #     self.data[symbol].gene_id = result[1]
        #     self.data[symbol].data_id = result[2]
        #     self.data[symbol].chr = result[3]
        #     self.data[symbol].mb = result[4]
        #     self.data[symbol].description = result[5]
        #     self.data[symbol].probe_target_description = result[6]


        # print("my loop takes>>>>",time.time()-initial_time)
        # conn.close()
        # r

        # takes 5 seconds
        initial_time = time.time()
        results = list(g.db.execute(query).fetchall())
        for result in results:
            symbol = result[0]
            # if  symbol  is not None
            # exists = lower_symbols.get(symbol.lower())
            # if symbol.lower() in lower_symbols:
            if symbol  is not None and lower_symbols.get(symbol.lower()):

                symbol = symbol.lower()

                self.data[symbol].gene_id = result.GeneId
                self.data[symbol].data_id = result.DataId
                self.data[symbol].chr = result.Chr
                self.data[symbol].mb = result.Mb
                self.data[symbol].description = result.description
                self.data[symbol].probe_target_description = result.Probe_Target_Description
        print("time taken in the loop is",time.time()-initial_time)

    ###########################################################################
    #Input: cursor, symbolList (list), dataIdDict(Dict)
    #output: symbolValuepairDict (dictionary):one dictionary of Symbol and Value Pair,
    #        key is symbol, value is one list of expression values of one probeSet;
    #function: get one dictionary whose key is gene symbol and value is tissue expression data (list type).
    #Attention! All keys are lower case!
    ###########################################################################

    def get_symbol_values_pairs(self):
        id_list = [self.data[symbol].data_id for symbol in self.data]
        
        symbol_values_dict = {}

        if len(id_list) > 0:
            query = """SELECT TissueProbeSetXRef.Symbol, TissueProbeSetData.value
                       FROM TissueProbeSetXRef, TissueProbeSetData
                       WHERE TissueProbeSetData.Id IN {} and
                             TissueProbeSetXRef.DataId = TissueProbeSetData.Id""".format(db_tools.create_in_clause(id_list))


            results = g.db.execute(query).fetchall()
            for result in results:
                if result.Symbol.lower() not in symbol_values_dict:
                    symbol_values_dict[result.Symbol.lower()] = [result.value]
                else:
                    symbol_values_dict[result.Symbol.lower()].append(result.value)

        return symbol_values_dict