gn2/wqflask/interval_analyst/GeneUtil.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155

import string

from gn2.wqflask.database import database_connection

from gn2.utility.tools import flat_files, get_setting

def load_homology(chr_name, start_mb, end_mb, source_file):
    homology_list = []
    with open(flat_files("homology/") + source_file) as h_file:
        current_chr = 0
        for line in h_file:
            line_items = line.split()
            this_dict = {
                "ref_chr": line_items[2][3:],
                "ref_strand": line_items[4],
                "ref_start": float(line_items[5])/1000000,
                "ref_end": float(line_items[6])/1000000,
                "query_chr": line_items[7][3:],
                "query_strand": line_items[9],
                "query_start": float(line_items[10])/1000000,
                "query_end": float(line_items[11])/1000000
            }

            if str(this_dict["ref_chr"]) == str(chr_name) and \
                    this_dict["ref_start"] < end_mb and \
                    this_dict["ref_end"] > start_mb:
                homology_list.append(this_dict)

    return homology_list

def loadGenes(chrName, diffCol, startMb, endMb, species='mouse'):
    assembly_map = {
        "mouse": "mm10",
        "rat": "rn7"
    }

    def append_assembly(fetch_fields, species):
        query_fields = []
        for field in fetch_fields:
            if field in ['Chr', 'TxStart', 'TxEnd', 'Strand']:
                query_fields.append(field + "_" + assembly_map[species])
            else:
                query_fields.append(field)

        return query_fields


    fetchFields = ['SpeciesId', 'Id', 'GeneSymbol', 'GeneDescription', 'Chr', 'TxStart', 'TxEnd',
                   'Strand', 'GeneID', 'NM_ID', 'kgID', 'GenBankID', 'UnigenID', 'ProteinID', 'AlignID',
                   'exonCount', 'exonStarts', 'exonEnds', 'cdsStart', 'cdsEnd']

    # List All Species in the Gene Table
    speciesDict = {}
    results = []
    with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
        cursor.execute("SELECT Species.Name, GeneList081722.SpeciesId "
                       "FROM Species, GeneList081722 WHERE "
                       "GeneList081722.SpeciesId = Species.Id "
                       "GROUP BY GeneList081722.SpeciesId")
        results = cursor.fetchall()
        for item in results:
            if item[0] == "rat":
                speciesDict[item[0]] = (item[1], "rn7")
            else:
                speciesDict[item[0]] = (item[1], "mm10")

        # List current Species and other Species
        speciesId, assembly = speciesDict[species]
        otherSpecies = [[X, speciesDict[X][0], speciesDict[X][1]] for X in list(speciesDict.keys())]
        otherSpecies.remove([species, speciesId, assembly])
        query_fields = append_assembly(fetchFields, species)

        cursor.execute(f"SELECT {', '.join(query_fields)} FROM GeneList081722 "
                       "WHERE SpeciesId = %s AND "
                       f"Chr_{assembly}" + " = %s AND "
                       f"((TxStart_{assembly}" + " > %s and " + f"TxStart_{assembly}" + " <= %s) "
                       f"OR (TxEnd_{assembly}" + " > %s and " + f"TxEnd_{assembly}" + " <= %s)) "
                       f"ORDER BY TxStart_{assembly}",
                       (speciesId, chrName,
                        startMb, endMb,
                        startMb, endMb))
        results = cursor.fetchall()

        GeneList = []
        if results:
            for result in results:
                newdict = {}
                for j, item in enumerate(fetchFields):
                    newdict[item] = result[j]
                # count SNPs if possible
                if diffCol and species == 'mouse':
                    cursor.execute(
                        "SELECT count(*) FROM BXDSnpPosition "
                        "WHERE Chr = %s AND "
                        "Mb >= %s AND Mb < %s "
                        "AND StrainId1 = %s AND StrainId2 = %s",
                    (chrName, f"{newdict['TxStart']:2.6f}",
                     f"{newdict['TxEnd']:2.6f}",
                     diffCol[0], diffCol[1],))
                    newdict["snpCount"] = cursor.fetchone()[0]
                    newdict["snpDensity"] = (
                        newdict["snpCount"] /
                        (newdict["TxEnd"] - newdict["TxStart"]) / 1000.0)
                else:
                    newdict["snpDensity"] = newdict["snpCount"] = 0
                try:
                    newdict['GeneLength'] = 1000.0 * \
                        (newdict['TxEnd'] - newdict['TxStart'])
                except:
                    pass
                # load gene from other Species by the same name
                for item in otherSpecies:
                    othSpec, othSpecId, othSpecAssembly = item
                    newdict2 = {}
                    query_fields = append_assembly(fetchFields, othSpec)
                    cursor.execute(
                        f"SELECT {', '.join(query_fields)} FROM GeneList081722 WHERE "
                        "SpeciesId = %s AND "
                        "geneSymbol= %s LIMIT 1",
                        (othSpecId,
                         newdict["GeneSymbol"]))
                    resultsOther = cursor.fetchone()
                    if resultsOther:
                        for j, item in enumerate(fetchFields):
                            newdict2[item] = resultsOther[j]

                        # count SNPs if possible, could be a separate function
                        if diffCol and othSpec == 'mouse':
                            cursor.execute(
                                "SELECT count(*) FROM BXDSnpPosition "
                                "WHERE Chr = %s AND Mb >= %s AND "
                                "Mb < %s AND StrainId1 = %s "
                                "AND StrainId2 = %s",
                                (chrName, f"{newdict['TxStart']:2.6f}",
                                 f"{newdict['TxEnd']:2.6f}",
                                 diffCol[0], diffCol[1]))
                            if snp_count := cursor.fetchone():
                                newdict2["snpCount"] = snp_count[0]

                            newdict2["snpDensity"] = (
                                newdict2["snpCount"]
                                / (newdict2["TxEnd"] - newdict2["TxStart"])
                                / 1000.0)
                        else:
                            newdict2["snpDensity"] = newdict2["snpCount"] = 0
                        try:
                            newdict2['GeneLength'] = (
                                1000.0 * (newdict2['TxEnd'] - newdict2['TxStart']))
                        except:
                            pass

                    newdict['%sGene' % othSpec] = newdict2

                GeneList.append(newdict)
        return GeneList