From 204a308be0f741726b9a620d88fbc22b22124c81 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Fri, 29 Dec 2023 18:55:37 +0000 Subject: Namespace all modules under gn2. We move all modules under a gn2 directory. This is important for "correct" packaging and deployment as a Guix service. --- gn2/wqflask/interval_analyst/GeneUtil.py | 155 +++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 gn2/wqflask/interval_analyst/GeneUtil.py (limited to 'gn2/wqflask/interval_analyst/GeneUtil.py') diff --git a/gn2/wqflask/interval_analyst/GeneUtil.py b/gn2/wqflask/interval_analyst/GeneUtil.py new file mode 100644 index 00000000..d3b00e31 --- /dev/null +++ b/gn2/wqflask/interval_analyst/GeneUtil.py @@ -0,0 +1,155 @@ +import string + +from gn2.wqflask.database import database_connection + +from gn2.utility.tools import flat_files, get_setting + +def load_homology(chr_name, start_mb, end_mb, source_file): + homology_list = [] + with open(flat_files("homology/") + source_file) as h_file: + current_chr = 0 + for line in h_file: + line_items = line.split() + this_dict = { + "ref_chr": line_items[2][3:], + "ref_strand": line_items[4], + "ref_start": float(line_items[5])/1000000, + "ref_end": float(line_items[6])/1000000, + "query_chr": line_items[7][3:], + "query_strand": line_items[9], + "query_start": float(line_items[10])/1000000, + "query_end": float(line_items[11])/1000000 + } + + if str(this_dict["ref_chr"]) == str(chr_name) and \ + this_dict["ref_start"] < end_mb and \ + this_dict["ref_end"] > start_mb: + homology_list.append(this_dict) + + return homology_list + +def loadGenes(chrName, diffCol, startMb, endMb, species='mouse'): + assembly_map = { + "mouse": "mm10", + "rat": "rn7" + } + + def append_assembly(fetch_fields, species): + query_fields = [] + for field in fetch_fields: + if field in ['Chr', 'TxStart', 'TxEnd', 'Strand']: + query_fields.append(field + "_" + assembly_map[species]) + else: + query_fields.append(field) + + return query_fields + + + fetchFields = ['SpeciesId', 'Id', 'GeneSymbol', 'GeneDescription', 'Chr', 'TxStart', 'TxEnd', + 'Strand', 'GeneID', 'NM_ID', 'kgID', 'GenBankID', 'UnigenID', 'ProteinID', 'AlignID', + 'exonCount', 'exonStarts', 'exonEnds', 'cdsStart', 'cdsEnd'] + + # List All Species in the Gene Table + speciesDict = {} + results = [] + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute("SELECT Species.Name, GeneList081722.SpeciesId " + "FROM Species, GeneList081722 WHERE " + "GeneList081722.SpeciesId = Species.Id " + "GROUP BY GeneList081722.SpeciesId") + results = cursor.fetchall() + for item in results: + if item[0] == "rat": + speciesDict[item[0]] = (item[1], "rn7") + else: + speciesDict[item[0]] = (item[1], "mm10") + + # List current Species and other Species + speciesId, assembly = speciesDict[species] + otherSpecies = [[X, speciesDict[X][0], speciesDict[X][1]] for X in list(speciesDict.keys())] + otherSpecies.remove([species, speciesId, assembly]) + query_fields = append_assembly(fetchFields, species) + + cursor.execute(f"SELECT {', '.join(query_fields)} FROM GeneList081722 " + "WHERE SpeciesId = %s AND " + f"Chr_{assembly}" + " = %s AND " + f"((TxStart_{assembly}" + " > %s and " + f"TxStart_{assembly}" + " <= %s) " + f"OR (TxEnd_{assembly}" + " > %s and " + f"TxEnd_{assembly}" + " <= %s)) " + f"ORDER BY TxStart_{assembly}", + (speciesId, chrName, + startMb, endMb, + startMb, endMb)) + results = cursor.fetchall() + + GeneList = [] + if results: + for result in results: + newdict = {} + for j, item in enumerate(fetchFields): + newdict[item] = result[j] + # count SNPs if possible + if diffCol and species == 'mouse': + cursor.execute( + "SELECT count(*) FROM BXDSnpPosition " + "WHERE Chr = %s AND " + "Mb >= %s AND Mb < %s " + "AND StrainId1 = %s AND StrainId2 = %s", + (chrName, f"{newdict['TxStart']:2.6f}", + f"{newdict['TxEnd']:2.6f}", + diffCol[0], diffCol[1],)) + newdict["snpCount"] = cursor.fetchone()[0] + newdict["snpDensity"] = ( + newdict["snpCount"] / + (newdict["TxEnd"] - newdict["TxStart"]) / 1000.0) + else: + newdict["snpDensity"] = newdict["snpCount"] = 0 + try: + newdict['GeneLength'] = 1000.0 * \ + (newdict['TxEnd'] - newdict['TxStart']) + except: + pass + # load gene from other Species by the same name + for item in otherSpecies: + othSpec, othSpecId, othSpecAssembly = item + newdict2 = {} + query_fields = append_assembly(fetchFields, othSpec) + cursor.execute( + f"SELECT {', '.join(query_fields)} FROM GeneList081722 WHERE " + "SpeciesId = %s AND " + "geneSymbol= %s LIMIT 1", + (othSpecId, + newdict["GeneSymbol"])) + resultsOther = cursor.fetchone() + if resultsOther: + for j, item in enumerate(fetchFields): + newdict2[item] = resultsOther[j] + + # count SNPs if possible, could be a separate function + if diffCol and othSpec == 'mouse': + cursor.execute( + "SELECT count(*) FROM BXDSnpPosition " + "WHERE Chr = %s AND Mb >= %s AND " + "Mb < %s AND StrainId1 = %s " + "AND StrainId2 = %s", + (chrName, f"{newdict['TxStart']:2.6f}", + f"{newdict['TxEnd']:2.6f}", + diffCol[0], diffCol[1])) + if snp_count := cursor.fetchone(): + newdict2["snpCount"] = snp_count[0] + + newdict2["snpDensity"] = ( + newdict2["snpCount"] + / (newdict2["TxEnd"] - newdict2["TxStart"]) + / 1000.0) + else: + newdict2["snpDensity"] = newdict2["snpCount"] = 0 + try: + newdict2['GeneLength'] = ( + 1000.0 * (newdict2['TxEnd'] - newdict2['TxStart'])) + except: + pass + + newdict['%sGene' % othSpec] = newdict2 + + GeneList.append(newdict) + return GeneList -- cgit v1.2.3