diff options
author | zsloan | 2023-10-24 17:41:51 +0000 |
---|---|---|
committer | zsloan | 2023-10-24 17:41:51 +0000 |
commit | 554da75ce6617ee5d91bc82085cfa2857fbc7637 (patch) | |
tree | 48f1ffbc518bcd0e5899826279034806f0232787 | |
parent | 79ce392c5a482a0460e8a91344a0e63c9e9a8085 (diff) | |
download | genenetwork2-554da75ce6617ee5d91bc82085cfa2857fbc7637.tar.gz |
Add script for converting the HXBBXH genotype file provided by Hao.
Paths and sample/chromosome lists are hard-coded because this was probably a one-time task, but I am still committing it since it's good to have it around in case we need to use it again.
-rw-r--r-- | scripts/convert_hxbbxh_to_geno.py | 60 |
1 files changed, 60 insertions, 0 deletions
# scripts/convert_hxbbxh_to_geno.py
"""Convert per-chromosome HXB/BXH genotype files into a single ``.geno`` file.

For every chromosome listed in ``chromosomes`` the script looks for an input
file in the input directory, splits each line on whitespace, transposes the
resulting table and strips double quotes from every cell.  Chromosomes whose
file does not exist are skipped.

After transposing, the first row carries the sample names (entries ``[4:-1]``)
and every following row is treated as one marker::

    [locus ("<prefix>:<bp>"), chromosome, cM, <unused>, genotype..., <ignored>]

Genotype codes are translated with ``base_dict`` and the sample columns are
re-ordered to follow ``sample_list``.  The result is written, tab separated,
to ``HXBBXH_new.geno`` in the output directory.

Running the script without arguments uses the hard-coded default directories
below; ``--input-dir`` and ``--output-dir`` override them.
"""
import argparse
import csv  # not used by the conversion; kept because the original script imports it
import os

# Default locations, used when no command-line override is given.
input_dir = "/export2/local/home/zas1024/gn2-zach/hxbbxh-genotypes/hao/error0.001"
output_dir = "/export2/local/home/zas1024/gn2-zach/hxbbxh-genotypes/hao/output"

# Input genotype code -> letter written to the .geno file.  The letters match
# the @mat / @pat / @het declarations in ``header_rows``.
base_dict = {
    '0': 'B',
    '2': 'D',
    '1': 'H'
}

chromosomes = [
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
    "12", "13", "14", "15", "16", "17", "18", "19", "20", "X", "Y",
]

# Desired output order of the sample columns.
sample_list = [
    "BXH2", "BXH3", "BXH5", "BXH6", "BXH8", "BXH9", "BXH10", "BXH11",
    "BXH12", "BXH12a", "BXH13", "HXB1", "HXB2", "HXB3", "HXB4", "HXB5",
    "HXB7", "HXB9", "HXB10", "HXB13", "HXB14", "HXB15", "HXB16", "HXB17",
    "HXB18", "HXB19", "HXB20", "HXB21", "HXB22", "HXB23", "HXB24", "HXB25",
    "HXB26", "HXB27", "HXB29", "HXB30", "HXB31",
]

# Metadata lines emitted at the top of the .geno file, one single-cell row each.
header_rows = [
    ["#type riset or intercross"],
    ["@type:riset"],
    ["@name:HXB/BXH"],
    ["#abbreviation of maternal or paternal parents"],
    ["@mat:B"],
    ["@pat:D"],
    ["#heterozygous , optional, default is \"H\""],
    ["@het:H"],
    ["#Unknown , optional, default is \"U\""],
    ["@unk:U"]
]

output_name = "HXBBXH_new.geno"


def input_path(directory, chromosome):
    """Return the expected input file path for *chromosome* inside *directory*."""
    return os.path.join(
        directory,
        f"HXB_genotype_chr{chromosome}_dup_removed_smoothed_by_rqtl_error0.001_dup_removed_again_012.csv",
    )


def read_transposed(path):
    """Read *path*, split each line on whitespace and return the transposed table.

    Double quotes are removed from every cell.  ``zip`` stops at the shortest
    line, so cells beyond that length are dropped (same as the original script).
    """
    with open(path, "r") as the_file:
        lines = [line.split() for line in the_file]
    return [[item.replace('"', '') for item in col] for col in zip(*lines)]


def _clean_sample_name(raw):
    """Strip the ``_mRatNor1`` marker and anything after the first underscore."""
    return raw.replace("_mRatNor1", "").split("_")[0]


def build_geno_rows(tables, samples=None, codes=None):
    """Build the complete list of ``.geno`` rows from transposed input tables.

    Args:
        tables: iterable of transposed tables as returned by
            ``read_transposed`` (one per chromosome, in output order).
        samples: desired sample order; defaults to ``sample_list``.
        codes: genotype code -> letter mapping; defaults to ``base_dict``.

    Returns:
        A list of rows (lists of strings): the metadata rows, one column
        header row, then one row per marker.

    Raises:
        KeyError: if a genotype cell holds a code missing from *codes*.
        IndexError: if the first table is empty or a marker row is too short.
    """
    samples = sample_list if samples is None else samples
    codes = base_dict if codes is None else codes

    rows = [list(line) for line in header_rows]
    file_samples = []
    kept_columns = []

    for table in tables:
        # The sample layout is taken from the first table that provides one;
        # the column header row is emitted at that point only.
        if not file_samples:
            # NOTE(review): the last entry is dropped exactly as the original
            # script did -- presumably a trailing non-sample line; confirm
            # against the input files.
            file_samples = [_clean_sample_name(raw) for raw in table[0][4:-1]]

            # Map each name to its first position (what list.index returned).
            first_column = {}
            for column, name in enumerate(file_samples):
                first_column.setdefault(name, column)

            # Keep only the wanted samples present in the file, in the wanted
            # order, remembering where each one sits in the input.
            kept_names = [name for name in samples if name in first_column]
            kept_columns = [first_column[name] for name in kept_names]
            rows.append(["Chr", "Locus", "cM", "Mb"] + kept_names)

        for marker in table[1:]:
            # The locus looks like "<prefix>:<bp>"; convert base pairs to Mb.
            megabases = str(float(marker[0].split(":")[1]) / 1000000)
            genotypes = marker[4:-1]
            rows.append(
                [marker[1], marker[0], marker[2], megabases]
                + [codes[genotypes[column]] for column in kept_columns]
            )

    return rows


def write_geno(rows, path):
    """Write *rows* to *path*, tab separated, one row per line."""
    with open(path, "w") as out_file:
        out_file.writelines("\t".join(line) + "\n" for line in rows)


def main(argv=None):
    """Run the conversion; *argv* defaults to the process command line."""
    parser = argparse.ArgumentParser(
        description="Convert per-chromosome HXB/BXH genotype files to a .geno file."
    )
    parser.add_argument("--input-dir", default=input_dir,
                        help="directory holding the per-chromosome input files")
    parser.add_argument("--output-dir", default=output_dir,
                        help="directory in which the .geno file is written")
    args = parser.parse_args(argv)

    candidates = (input_path(args.input_dir, chromosome) for chromosome in chromosomes)
    # Missing chromosome files are skipped, as in the original script.
    tables = (read_transposed(path) for path in candidates if os.path.isfile(path))

    write_geno(build_geno_rows(tables), os.path.join(args.output_dir, output_name))


if __name__ == "__main__":
    main()