From a51f95bea5fa9a3b767aaebf75adfa706cf7940f Mon Sep 17 00:00:00 2001 From: zsloan Date: Thu, 10 Mar 2022 00:45:11 +0000 Subject: Add code generating the new genotype files Also made a large number of other fixes that proved necessary during testing --- wqflask/maintenance/gen_ind_genofiles.py | 114 +++++++++++++++++++++++-------- 1 file changed, 85 insertions(+), 29 deletions(-) diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py index 9a97626d..e705119f 100644 --- a/wqflask/maintenance/gen_ind_genofiles.py +++ b/wqflask/maintenance/gen_ind_genofiles.py @@ -2,6 +2,7 @@ # python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.json # python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.2.geno BXD.4.geno BXD.5.geno +import json import os import sys from typing import List @@ -28,23 +29,37 @@ def main(args): # The source group(s) we're generating the .geno files from # This can be passed as either a specific .geno file (or set of files as multiple arguments), # or as a JSON file containing a set of .geno files (and their corresponding file names and sample lists) + geno_json = {} + source_files = [] if ".json" in args[4]: - source_files = [geno_dir + genofile['location'] for genofile in json.load(args[4])['genofile']] + geno_json = json.load(open(geno_dir + args[4], "r")) + par_f1s = { + "mat": geno_json['mat'], + "pat": geno_json['pat'], + "f1s": geno_json['f1s'] + } + + # List of file titles and locations from JSON + source_files = [{'title': genofile['title'], 'location': geno_dir + genofile['location']} for genofile in geno_json['genofile']] else: - source_files = [geno_dir + group + ".geno" if ".geno" not in group else group for group in args[4:]] + par_f1s = {} + # List of files directly taken from command line arguments, with titles just set to the filename + for group in args[4:]: + file_name = geno_dir + group + ".geno" if ".geno" not in group else group + source_files.append({'title': file_name[:-5], 'location': file_name}) if len(source_files) > 1: # Generate a JSON file pointing to the new target genotype files, in situations where there are multiple source .geno files - target_json_loc = out_dir + args[3].split(".")[:-1] + ".json" + target_json_loc = out_dir + ".".join(args[3].split(".")[:-1]) + ".json" target_json = {'genofile': []} # Generate the output .geno files for source_file in source_files: - filename, samples = generate_new_genofile(source_file, target_file) + filename, samples = generate_new_genofile(source_file['location'], target_file, par_f1s, out_dir) target_json['genofile'].append({ 'location': filename.split("/")[-1], - 'title': filename.split("/")[-1], + 'title': source_file['title'], 'sample_list': samples }) @@ -55,20 +70,59 @@ def get_strain_for_sample(sample): "SELECT CaseAttributeXRefNew.Value " "FROM CaseAttributeXRefNew, Strain " "WHERE CaseAttributeXRefNew.CaseAttributeId=11 " - "AND CaseAttributeXRef.New.StrainId = Strain.Id " + "AND CaseAttributeXRefNew.StrainId = Strain.Id " "AND Strain.Name = %(name)s" ) - with conn.cursor() as cursor: - return cursor.execute(query, {"name": name}).fetchone()[0] + with conn().cursor() as cursor: + cursor.execute(query, {"name": sample.strip()}) + return cursor.fetchone()[0] -def generate_new_genofiles(source_genofile, target_genofile): - base_samples = group_samples(source_genofile) - base_genotypes = strain_genotypes(source_genofile) +def generate_new_genofile(source_genofile, target_genofile, par_f1s, out_dir): + source_samples = group_samples(source_genofile) + source_genotypes = strain_genotypes(source_genofile) target_samples = group_samples(target_genofile) - strain_pos_map = map_strain_pos_to_target_group(base_samples, target_samples) + strain_pos_map = map_strain_pos_to_target_group(source_samples, target_samples, par_f1s) + if len(source_genofile.split("/")[-1].split(".")) > 2: + # The number in the source genofile; for example 4 in BXD.4.geno + source_num = source_genofile.split("/")[-1].split(".")[-2] + target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + "." + source_num + ".geno" + else: + target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + ".geno" + + file_location = out_dir + target_filename + + with open(file_location, "w") as fh: + for metadata in ["name", "type", "mat", "pat", "het", "unk"]: + fh.write("@" + metadata + ":" + source_genotypes[metadata] + "\n") + + header_line = ["Chr", "Locus", "cM", "Mb"] + target_samples + fh.write("\t".join(header_line)) + + for marker in source_genotypes['markers']: + line_items = [ + marker['Chr'], + marker['Locus'], + marker['cM'], + marker['Mb'] + ] + + for pos in strain_pos_map: + if isinstance(pos, int): + line_items.append(marker['genotypes'][pos]) + else: + if pos in ["mat", "pat"]: + line_items.append(source_genotypes[pos]) + elif pos == "f1s": + line_items.append("H") + else: + line_items.append("U") -def map_strain_pos_to_target_group(base_samples, target_samples): + fh.write("\t".join(line_items) + "\n") + + return file_location, target_samples + +def map_strain_pos_to_target_group(source_samples, target_samples, par_f1s): """ Retrieve corresponding strain position for each sample in the target group @@ -82,7 +136,14 @@ def map_strain_pos_to_target_group(base_samples, target_samples): pos_map = [] for sample in target_samples: sample_strain = get_strain_for_sample(sample) - pos_map.append(base_samples.index(sample_strain)) + if sample_strain in source_samples: + pos_map.append(source_samples.index(sample_strain)) + else: + val = "U" + for key in par_f1s.keys(): + if sample_strain in par_f1s[key]: + val = key + pos_map.append(val) return pos_map @@ -128,27 +189,21 @@ def strain_genotypes(strain_genofile: str) -> List: geno_start_col = None header_columns = [] sample_list = [] - marker_genotypes = [] - with open(file_location, "r") as source_geno: + markers = [] + with open(strain_genofile, "r") as source_geno: for i, line in enumerate(source_geno): if line[0] == "@": - if "@type" in line: - geno_dict['type'] = line.split(":")[1] - if "@mat" in line: - geno_dict['mat'] = line.split(":")[1] - elif "@pat" in line: - geno_dict['pat'] = line.split(":")[1] - elif "@het" in line: - geno_dict['het'] = line.split(":")[1] - elif "@unk" in line: - geno_dict['unk'] = line.split(":")[1] + metadata_type = line[1:].split(":")[0] + if metadata_type in ['name', 'type', 'mat', 'pat', 'het', 'unk']: + geno_dict[metadata_type] = line.split(":")[1].strip() + + continue # Skip other header lines if line[0] == "#" or not len(line): continue line_items = line.split("\t") - if "Chr" in line_items: # Header row # Get the first column index containing genotypes header_columns = line_items @@ -169,9 +224,10 @@ def strain_genotypes(strain_genofile: str) -> List: 'cM': line_items[header_columns.index("cM")], 'genotypes': [item.strip() for item in line_items][geno_start_col:] } - marker_genotypes.append(this_marker) - geno_dict['genotypes'] = marker_genotypes + markers.append(this_marker) + + geno_dict['markers'] = markers return geno_dict -- cgit v1.2.3