aboutsummaryrefslogtreecommitdiff
path: root/gn2/wqflask/interval_analyst/GeneUtil.py
diff options
context:
space:
mode:
authorArun Isaac2023-12-29 18:55:37 +0000
committerArun Isaac2023-12-29 19:01:46 +0000
commit204a308be0f741726b9a620d88fbc22b22124c81 (patch)
treeb3cf66906674020b530c844c2bb4982c8a0e2d39 /gn2/wqflask/interval_analyst/GeneUtil.py
parent83062c75442160427b50420161bfcae2c5c34c84 (diff)
downloadgenenetwork2-204a308be0f741726b9a620d88fbc22b22124c81.tar.gz
Namespace all modules under gn2.
We move all modules under a gn2 directory. This is important for "correct" packaging and deployment as a Guix service.
Diffstat (limited to 'gn2/wqflask/interval_analyst/GeneUtil.py')
-rw-r--r--gn2/wqflask/interval_analyst/GeneUtil.py155
1 files changed, 155 insertions, 0 deletions
diff --git a/gn2/wqflask/interval_analyst/GeneUtil.py b/gn2/wqflask/interval_analyst/GeneUtil.py
new file mode 100644
index 00000000..d3b00e31
--- /dev/null
+++ b/gn2/wqflask/interval_analyst/GeneUtil.py
@@ -0,0 +1,155 @@
+import string
+
+from gn2.wqflask.database import database_connection
+
+from gn2.utility.tools import flat_files, get_setting
+
+def load_homology(chr_name, start_mb, end_mb, source_file):
+ homology_list = []
+ with open(flat_files("homology/") + source_file) as h_file:
+ current_chr = 0
+ for line in h_file:
+ line_items = line.split()
+ this_dict = {
+ "ref_chr": line_items[2][3:],
+ "ref_strand": line_items[4],
+ "ref_start": float(line_items[5])/1000000,
+ "ref_end": float(line_items[6])/1000000,
+ "query_chr": line_items[7][3:],
+ "query_strand": line_items[9],
+ "query_start": float(line_items[10])/1000000,
+ "query_end": float(line_items[11])/1000000
+ }
+
+ if str(this_dict["ref_chr"]) == str(chr_name) and \
+ this_dict["ref_start"] < end_mb and \
+ this_dict["ref_end"] > start_mb:
+ homology_list.append(this_dict)
+
+ return homology_list
+
+def loadGenes(chrName, diffCol, startMb, endMb, species='mouse'):
+ assembly_map = {
+ "mouse": "mm10",
+ "rat": "rn7"
+ }
+
+ def append_assembly(fetch_fields, species):
+ query_fields = []
+ for field in fetch_fields:
+ if field in ['Chr', 'TxStart', 'TxEnd', 'Strand']:
+ query_fields.append(field + "_" + assembly_map[species])
+ else:
+ query_fields.append(field)
+
+ return query_fields
+
+
+ fetchFields = ['SpeciesId', 'Id', 'GeneSymbol', 'GeneDescription', 'Chr', 'TxStart', 'TxEnd',
+ 'Strand', 'GeneID', 'NM_ID', 'kgID', 'GenBankID', 'UnigenID', 'ProteinID', 'AlignID',
+ 'exonCount', 'exonStarts', 'exonEnds', 'cdsStart', 'cdsEnd']
+
+ # List All Species in the Gene Table
+ speciesDict = {}
+ results = []
+ with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
+ cursor.execute("SELECT Species.Name, GeneList081722.SpeciesId "
+ "FROM Species, GeneList081722 WHERE "
+ "GeneList081722.SpeciesId = Species.Id "
+ "GROUP BY GeneList081722.SpeciesId")
+ results = cursor.fetchall()
+ for item in results:
+ if item[0] == "rat":
+ speciesDict[item[0]] = (item[1], "rn7")
+ else:
+ speciesDict[item[0]] = (item[1], "mm10")
+
+ # List current Species and other Species
+ speciesId, assembly = speciesDict[species]
+ otherSpecies = [[X, speciesDict[X][0], speciesDict[X][1]] for X in list(speciesDict.keys())]
+ otherSpecies.remove([species, speciesId, assembly])
+ query_fields = append_assembly(fetchFields, species)
+
+ cursor.execute(f"SELECT {', '.join(query_fields)} FROM GeneList081722 "
+ "WHERE SpeciesId = %s AND "
+ f"Chr_{assembly}" + " = %s AND "
+ f"((TxStart_{assembly}" + " > %s and " + f"TxStart_{assembly}" + " <= %s) "
+ f"OR (TxEnd_{assembly}" + " > %s and " + f"TxEnd_{assembly}" + " <= %s)) "
+ f"ORDER BY TxStart_{assembly}",
+ (speciesId, chrName,
+ startMb, endMb,
+ startMb, endMb))
+ results = cursor.fetchall()
+
+ GeneList = []
+ if results:
+ for result in results:
+ newdict = {}
+ for j, item in enumerate(fetchFields):
+ newdict[item] = result[j]
+ # count SNPs if possible
+ if diffCol and species == 'mouse':
+ cursor.execute(
+ "SELECT count(*) FROM BXDSnpPosition "
+ "WHERE Chr = %s AND "
+ "Mb >= %s AND Mb < %s "
+ "AND StrainId1 = %s AND StrainId2 = %s",
+ (chrName, f"{newdict['TxStart']:2.6f}",
+ f"{newdict['TxEnd']:2.6f}",
+ diffCol[0], diffCol[1],))
+ newdict["snpCount"] = cursor.fetchone()[0]
+ newdict["snpDensity"] = (
+ newdict["snpCount"] /
+ (newdict["TxEnd"] - newdict["TxStart"]) / 1000.0)
+ else:
+ newdict["snpDensity"] = newdict["snpCount"] = 0
+ try:
+ newdict['GeneLength'] = 1000.0 * \
+ (newdict['TxEnd'] - newdict['TxStart'])
+ except:
+ pass
+ # load gene from other Species by the same name
+ for item in otherSpecies:
+ othSpec, othSpecId, othSpecAssembly = item
+ newdict2 = {}
+ query_fields = append_assembly(fetchFields, othSpec)
+ cursor.execute(
+ f"SELECT {', '.join(query_fields)} FROM GeneList081722 WHERE "
+ "SpeciesId = %s AND "
+ "geneSymbol= %s LIMIT 1",
+ (othSpecId,
+ newdict["GeneSymbol"]))
+ resultsOther = cursor.fetchone()
+ if resultsOther:
+ for j, item in enumerate(fetchFields):
+ newdict2[item] = resultsOther[j]
+
+ # count SNPs if possible, could be a separate function
+ if diffCol and othSpec == 'mouse':
+ cursor.execute(
+ "SELECT count(*) FROM BXDSnpPosition "
+ "WHERE Chr = %s AND Mb >= %s AND "
+ "Mb < %s AND StrainId1 = %s "
+ "AND StrainId2 = %s",
+ (chrName, f"{newdict['TxStart']:2.6f}",
+ f"{newdict['TxEnd']:2.6f}",
+ diffCol[0], diffCol[1]))
+ if snp_count := cursor.fetchone():
+ newdict2["snpCount"] = snp_count[0]
+
+ newdict2["snpDensity"] = (
+ newdict2["snpCount"]
+ / (newdict2["TxEnd"] - newdict2["TxStart"])
+ / 1000.0)
+ else:
+ newdict2["snpDensity"] = newdict2["snpCount"] = 0
+ try:
+ newdict2['GeneLength'] = (
+ 1000.0 * (newdict2['TxEnd'] - newdict2['TxStart']))
+ except:
+ pass
+
+ newdict['%sGene' % othSpec] = newdict2
+
+ GeneList.append(newdict)
+ return GeneList