about summary refs log tree commit diff
path: root/gn2/wqflask/interval_analyst/GeneUtil.py
diff options
context:
space:
mode:
authorArun Isaac2023-12-29 18:55:37 +0000
committerArun Isaac2023-12-29 19:01:46 +0000
commit204a308be0f741726b9a620d88fbc22b22124c81 (patch)
treeb3cf66906674020b530c844c2bb4982c8a0e2d39 /gn2/wqflask/interval_analyst/GeneUtil.py
parent83062c75442160427b50420161bfcae2c5c34c84 (diff)
downloadgenenetwork2-204a308be0f741726b9a620d88fbc22b22124c81.tar.gz
Namespace all modules under gn2.
We move all modules under a gn2 directory. This is important for
"correct" packaging and deployment as a Guix service.
Diffstat (limited to 'gn2/wqflask/interval_analyst/GeneUtil.py')
-rw-r--r--gn2/wqflask/interval_analyst/GeneUtil.py155
1 files changed, 155 insertions, 0 deletions
diff --git a/gn2/wqflask/interval_analyst/GeneUtil.py b/gn2/wqflask/interval_analyst/GeneUtil.py
new file mode 100644
index 00000000..d3b00e31
--- /dev/null
+++ b/gn2/wqflask/interval_analyst/GeneUtil.py
@@ -0,0 +1,155 @@
+import string
+
+from gn2.wqflask.database import database_connection
+
+from gn2.utility.tools import flat_files, get_setting
+
+def load_homology(chr_name, start_mb, end_mb, source_file):
+    homology_list = []
+    with open(flat_files("homology/") + source_file) as h_file:
+        current_chr = 0
+        for line in h_file:
+            line_items = line.split()
+            this_dict = {
+                "ref_chr": line_items[2][3:],
+                "ref_strand": line_items[4],
+                "ref_start": float(line_items[5])/1000000,
+                "ref_end": float(line_items[6])/1000000,
+                "query_chr": line_items[7][3:],
+                "query_strand": line_items[9],
+                "query_start": float(line_items[10])/1000000,
+                "query_end": float(line_items[11])/1000000
+            }
+
+            if str(this_dict["ref_chr"]) == str(chr_name) and \
+                    this_dict["ref_start"] < end_mb and \
+                    this_dict["ref_end"] > start_mb:
+                homology_list.append(this_dict)
+
+    return homology_list
+
+def loadGenes(chrName, diffCol, startMb, endMb, species='mouse'):
+    assembly_map = {
+        "mouse": "mm10",
+        "rat": "rn7"
+    }
+
+    def append_assembly(fetch_fields, species):
+        query_fields = []
+        for field in fetch_fields:
+            if field in ['Chr', 'TxStart', 'TxEnd', 'Strand']:
+                query_fields.append(field + "_" + assembly_map[species])
+            else:
+                query_fields.append(field)
+
+        return query_fields
+
+
+    fetchFields = ['SpeciesId', 'Id', 'GeneSymbol', 'GeneDescription', 'Chr', 'TxStart', 'TxEnd',
+                   'Strand', 'GeneID', 'NM_ID', 'kgID', 'GenBankID', 'UnigenID', 'ProteinID', 'AlignID',
+                   'exonCount', 'exonStarts', 'exonEnds', 'cdsStart', 'cdsEnd']
+
+    # List All Species in the Gene Table
+    speciesDict = {}
+    results = []
+    with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
+        cursor.execute("SELECT Species.Name, GeneList081722.SpeciesId "
+                       "FROM Species, GeneList081722 WHERE "
+                       "GeneList081722.SpeciesId = Species.Id "
+                       "GROUP BY GeneList081722.SpeciesId")
+        results = cursor.fetchall()
+        for item in results:
+            if item[0] == "rat":
+                speciesDict[item[0]] = (item[1], "rn7")
+            else:
+                speciesDict[item[0]] = (item[1], "mm10")
+
+        # List current Species and other Species
+        speciesId, assembly = speciesDict[species]
+        otherSpecies = [[X, speciesDict[X][0], speciesDict[X][1]] for X in list(speciesDict.keys())]
+        otherSpecies.remove([species, speciesId, assembly])
+        query_fields = append_assembly(fetchFields, species)
+
+        cursor.execute(f"SELECT {', '.join(query_fields)} FROM GeneList081722 "
+                       "WHERE SpeciesId = %s AND "
+                       f"Chr_{assembly}" + " = %s AND "
+                       f"((TxStart_{assembly}" + " > %s and " + f"TxStart_{assembly}" + " <= %s) "
+                       f"OR (TxEnd_{assembly}" + " > %s and " + f"TxEnd_{assembly}" + " <= %s)) "
+                       f"ORDER BY TxStart_{assembly}",
+                       (speciesId, chrName,
+                        startMb, endMb,
+                        startMb, endMb))
+        results = cursor.fetchall()
+
+        GeneList = []
+        if results:
+            for result in results:
+                newdict = {}
+                for j, item in enumerate(fetchFields):
+                    newdict[item] = result[j]
+                # count SNPs if possible
+                if diffCol and species == 'mouse':
+                    cursor.execute(
+                        "SELECT count(*) FROM BXDSnpPosition "
+                        "WHERE Chr = %s AND "
+                        "Mb >= %s AND Mb < %s "
+                        "AND StrainId1 = %s AND StrainId2 = %s",
+                    (chrName, f"{newdict['TxStart']:2.6f}",
+                     f"{newdict['TxEnd']:2.6f}",
+                     diffCol[0], diffCol[1],))
+                    newdict["snpCount"] = cursor.fetchone()[0]
+                    newdict["snpDensity"] = (
+                        newdict["snpCount"] /
+                        (newdict["TxEnd"] - newdict["TxStart"]) / 1000.0)
+                else:
+                    newdict["snpDensity"] = newdict["snpCount"] = 0
+                try:
+                    newdict['GeneLength'] = 1000.0 * \
+                        (newdict['TxEnd'] - newdict['TxStart'])
+                except:
+                    pass
+                # load gene from other Species by the same name
+                for item in otherSpecies:
+                    othSpec, othSpecId, othSpecAssembly = item
+                    newdict2 = {}
+                    query_fields = append_assembly(fetchFields, othSpec)
+                    cursor.execute(
+                        f"SELECT {', '.join(query_fields)} FROM GeneList081722 WHERE "
+                        "SpeciesId = %s AND "
+                        "geneSymbol= %s LIMIT 1",
+                        (othSpecId,
+                         newdict["GeneSymbol"]))
+                    resultsOther = cursor.fetchone()
+                    if resultsOther:
+                        for j, item in enumerate(fetchFields):
+                            newdict2[item] = resultsOther[j]
+
+                        # count SNPs if possible, could be a separate function
+                        if diffCol and othSpec == 'mouse':
+                            cursor.execute(
+                                "SELECT count(*) FROM BXDSnpPosition "
+                                "WHERE Chr = %s AND Mb >= %s AND "
+                                "Mb < %s AND StrainId1 = %s "
+                                "AND StrainId2 = %s",
+                                (chrName, f"{newdict['TxStart']:2.6f}",
+                                 f"{newdict['TxEnd']:2.6f}",
+                                 diffCol[0], diffCol[1]))
+                            if snp_count := cursor.fetchone():
+                                newdict2["snpCount"] = snp_count[0]
+
+                            newdict2["snpDensity"] = (
+                                newdict2["snpCount"]
+                                / (newdict2["TxEnd"] - newdict2["TxStart"])
+                                / 1000.0)
+                        else:
+                            newdict2["snpDensity"] = newdict2["snpCount"] = 0
+                        try:
+                            newdict2['GeneLength'] = (
+                                1000.0 * (newdict2['TxEnd'] - newdict2['TxStart']))
+                        except:
+                            pass
+
+                    newdict['%sGene' % othSpec] = newdict2
+
+                GeneList.append(newdict)
+        return GeneList