genenetwork2 - GeneNetwork (2nd generation)

import string

from gn2.wqflask.database import database_connection

from gn2.utility.tools import flat_files, get_setting

def load_homology(chr_name, start_mb, end_mb, source_file):
    homology_list = []
    with open(flat_files("homology/") + source_file) as h_file:
        current_chr = 0
        for line in h_file:
            line_items = line.split()
            this_dict = {
                "ref_chr": line_items[2][3:],
                "ref_strand": line_items[4],
                "ref_start": float(line_items[5])/1000000,
                "ref_end": float(line_items[6])/1000000,
                "query_chr": line_items[7][3:],
                "query_strand": line_items[9],
                "query_start": float(line_items[10])/1000000,
                "query_end": float(line_items[11])/1000000
            }

            if str(this_dict["ref_chr"]) == str(chr_name) and \
                    this_dict["ref_start"] < end_mb and \
                    this_dict["ref_end"] > start_mb:
                homology_list.append(this_dict)

    return homology_list

def loadGenes(chrName, diffCol, startMb, endMb, species='mouse'):
    assembly_map = {
        "mouse": "mm10",
        "rat": "rn7"
    }

    def append_assembly(fetch_fields, species):
        query_fields = []
        for field in fetch_fields:
            if field in ['Chr', 'TxStart', 'TxEnd', 'Strand']:
                query_fields.append(field + "_" + assembly_map[species])
            else:
                query_fields.append(field)

        return query_fields


    fetchFields = ['SpeciesId', 'Id', 'GeneSymbol', 'GeneDescription', 'Chr', 'TxStart', 'TxEnd',
                   'Strand', 'GeneID', 'NM_ID', 'kgID', 'GenBankID', 'UnigenID', 'ProteinID', 'AlignID',
                   'exonCount', 'exonStarts', 'exonEnds', 'cdsStart', 'cdsEnd']

    # List All Species in the Gene Table
    speciesDict = {}
    results = []
    with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
        cursor.execute("SELECT Species.Name, GeneList081722.SpeciesId "
                       "FROM Species, GeneList081722 WHERE "
                       "GeneList081722.SpeciesId = Species.Id "
                       "GROUP BY GeneList081722.SpeciesId")
        results = cursor.fetchall()
        for item in results:
            if item[0] == "rat":
                speciesDict[item[0]] = (item[1], "rn7")
            else:
                speciesDict[item[0]] = (item[1], "mm10")

        # List current Species and other Species
        speciesId, assembly = speciesDict[species]
        otherSpecies = [[X, speciesDict[X][0], speciesDict[X][1]] for X in list(speciesDict.keys())]
        otherSpecies.remove([species, speciesId, assembly])
        query_fields = append_assembly(fetchFields, species)

        cursor.execute(f"SELECT {', '.join(query_fields)} FROM GeneList081722 "
                       "WHERE SpeciesId = %s AND "
                       f"Chr_{assembly}" + " = %s AND "
                       f"((TxStart_{assembly}" + " > %s and " + f"TxStart_{assembly}" + " <= %s) "
                       f"OR (TxEnd_{assembly}" + " > %s and " + f"TxEnd_{assembly}" + " <= %s)) "
                       f"ORDER BY TxStart_{assembly}",
                       (speciesId, chrName,
                        startMb, endMb,
                        startMb, endMb))
        results = cursor.fetchall()

        GeneList = []
        if results:
            for result in results:
                newdict = {}
                for j, item in enumerate(fetchFields):
                    newdict[item] = result[j]
                # count SNPs if possible
                if diffCol and species == 'mouse':
                    cursor.execute(
                        "SELECT count(*) FROM BXDSnpPosition "
                        "WHERE Chr = %s AND "
                        "Mb >= %s AND Mb < %s "
                        "AND StrainId1 = %s AND StrainId2 = %s",
                    (chrName, f"{newdict['TxStart']:2.6f}",
                     f"{newdict['TxEnd']:2.6f}",
                     diffCol[0], diffCol[1],))
                    newdict["snpCount"] = cursor.fetchone()[0]
                    newdict["snpDensity"] = (
                        newdict["snpCount"] /
                        (newdict["TxEnd"] - newdict["TxStart"]) / 1000.0)
                else:
                    newdict["snpDensity"] = newdict["snpCount"] = 0
                try:
                    newdict['GeneLength'] = 1000.0 * \
                        (newdict['TxEnd'] - newdict['TxStart'])
                except:
                    pass
                # load gene from other Species by the same name
                for item in otherSpecies:
                    othSpec, othSpecId, othSpecAssembly = item
                    newdict2 = {}
                    query_fields = append_assembly(fetchFields, othSpec)
                    cursor.execute(
                        f"SELECT {', '.join(query_fields)} FROM GeneList081722 WHERE "
                        "SpeciesId = %s AND "
                        "geneSymbol= %s LIMIT 1",
                        (othSpecId,
                         newdict["GeneSymbol"]))
                    resultsOther = cursor.fetchone()
                    if resultsOther:
                        for j, item in enumerate(fetchFields):
                            newdict2[item] = resultsOther[j]

                        # count SNPs if possible, could be a separate function
                        if diffCol and othSpec == 'mouse':
                            cursor.execute(
                                "SELECT count(*) FROM BXDSnpPosition "
                                "WHERE Chr = %s AND Mb >= %s AND "
                                "Mb < %s AND StrainId1 = %s "
                                "AND StrainId2 = %s",
                                (chrName, f"{newdict['TxStart']:2.6f}",
                                 f"{newdict['TxEnd']:2.6f}",
                                 diffCol[0], diffCol[1]))
                            if snp_count := cursor.fetchone():
                                newdict2["snpCount"] = snp_count[0]

                            newdict2["snpDensity"] = (
                                newdict2["snpCount"]
                                / (newdict2["TxEnd"] - newdict2["TxStart"])
                                / 1000.0)
                        else:
                            newdict2["snpDensity"] = newdict2["snpCount"] = 0
                        try:
                            newdict2['GeneLength'] = (
                                1000.0 * (newdict2['TxEnd'] - newdict2['TxStart']))
                        except:
                            pass

                    newdict['%sGene' % othSpec] = newdict2

                GeneList.append(newdict)
        return GeneList