aboutsummaryrefslogtreecommitdiff
path: root/gn2/base/mrna_assay_tissue_data.py
blob: 7b7914aa6a59637d59fa791474440b151de1371f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import collections

from gn2.utility import Bunch


class MrnaAssayTissueData:

    def __init__(self, conn, gene_symbols=None):
        self.gene_symbols = gene_symbols
        self.conn = conn
        if self.gene_symbols is None:
            self.gene_symbols = []

        self.data = collections.defaultdict(Bunch)
        results = ()
        # Note that inner join is necessary in this query to get
        # distinct record in one symbol group with highest mean value
        # Due to the limit size of TissueProbeSetFreezeId table in DB,
        # performance of inner join is
        # acceptable.MrnaAssayTissueData(gene_symbols=symbol_list)
        with conn.cursor() as cursor:
            if len(self.gene_symbols) == 0:
                cursor.execute(
                    "SELECT t.Symbol, t.GeneId, t.DataId, "
                    "t.Chr, t.Mb, t.description, "
                    "t.Probe_Target_Description FROM (SELECT Symbol, "
                    "max(Mean) AS maxmean "
                    "FROM TissueProbeSetXRef WHERE "
                    "TissueProbeSetFreezeId=1 AND "
                    "Symbol != '' AND Symbol IS NOT "
                    "Null GROUP BY Symbol) "
                    "AS x INNER JOIN "
                    "TissueProbeSetXRef AS t ON "
                    "t.Symbol = x.Symbol "
                    "AND t.Mean = x.maxmean")
            else:
                cursor.execute(
                    "SELECT t.Symbol, t.GeneId, t.DataId, "
                    "t.Chr, t.Mb, t.description, "
                    "t.Probe_Target_Description FROM (SELECT Symbol, "
                    "max(Mean) AS maxmean "
                    "FROM TissueProbeSetXRef WHERE "
                    "TissueProbeSetFreezeId=1 AND "
                    "Symbol IN "
                    f"({', '.join(['%s'] * len(self.gene_symbols))}) "
                    "GROUP BY Symbol) AS x INNER JOIN "
                    "TissueProbeSetXRef AS t ON t.Symbol = x.Symbol "
                    "AND t.Mean = x.maxmean",
                    tuple(self.gene_symbols))
            results = list(cursor.fetchall())
        lower_symbols = {}
        for gene_symbol in self.gene_symbols:
            if gene_symbol is not None:
                lower_symbols[gene_symbol.lower()] = True

        for result in results:
            (symbol, gene_id, data_id, _chr, _mb,
             descr, probeset_target_descr) = result
            if symbol is not None and lower_symbols.get(symbol.lower()):
                symbol = symbol.lower()
                self.data[symbol].gene_id = gene_id
                self.data[symbol].data_id = data_id
                self.data[symbol].chr = _chr
                self.data[symbol].mb = _mb
                self.data[symbol].description = descr
                (self.data[symbol]
                 .probe_target_description) = probeset_target_descr


    def get_symbol_values_pairs(self):
        """Get one dictionary whose key is gene symbol and value is
        tissue expression data (list type).  All keys are lower case.

        The output is a symbolValuepairDict (dictionary): one
        dictionary of Symbol and Value Pair; key is symbol, value is
        one list of expression values of one probeSet;

        """
        id_list = [self.data[symbol].data_id for symbol in self.data]

        symbol_values_dict = {}

        if len(id_list) > 0:
            results = []
            with self.conn.cursor() as cursor:

                cursor.execute(
                    "SELECT TissueProbeSetXRef.Symbol, TissueProbeSetData.value "
                    "FROM TissueProbeSetXRef, TissueProbeSetData"
                    f" WHERE TissueProbeSetData.Id IN ({', '.join(['%s'] * len(id_list))})"
                    " AND TissueProbeSetXRef.DataId = TissueProbeSetData.Id"
                    ,tuple(id_list))

                results = cursor.fetchall()
                for result in results:
                    (symbol, value) = result
                    if symbol.lower() not in symbol_values_dict:
                        symbol_values_dict[symbol.lower()] = [value]
                    else:
                        symbol_values_dict[symbol.lower()].append(
                            value)
        return symbol_values_dict