Add script for converting the HXBBXH genotype file provided by Hao.

Paths and sample/chromosome lists are hard-coded just because this was probably a one-time thing, but still commiting it since it's good to have it around in case we need to use it again
author: zsloan 2023-10-24 17:41:51 +0000
committer: zsloan 2023-10-24 17:41:51 +0000
commit: 554da75ce6617ee5d91bc82085cfa2857fbc7637 (patch)
tree: 48f1ffbc518bcd0e5899826279034806f0232787
parent: 79ce392c5a482a0460e8a91344a0e63c9e9a8085 (diff)
download: genenetwork2-554da75ce6617ee5d91bc82085cfa2857fbc7637.tar.gz
1 files changed, 60 insertions, 0 deletions
diff --git a/scripts/convert_hxbbxh_to_geno.py b/scripts/convert_hxbbxh_to_geno.py
new file mode 100644
index 00000000..a907838f
--- /dev/null
+++ b/scripts/convert_hxbbxh_to_geno.py
@@ -0,0 +1,60 @@
+import csv
+import os
+
+input_dir = "/export2/local/home/zas1024/gn2-zach/hxbbxh-genotypes/hao/error0.001"
+output_dir = "/export2/local/home/zas1024/gn2-zach/hxbbxh-genotypes/hao/output"
+
+base_dict = {
+    '0': 'B',
+    '2': 'D',
+    '1': 'H'
+}
+
+chromosomes = ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","X","Y"]
+sample_list = ["BXH2","BXH3","BXH5","BXH6","BXH8","BXH9","BXH10","BXH11","BXH12","BXH12a","BXH13","HXB1","HXB2","HXB3","HXB4","HXB5","HXB7","HXB9","HXB10","HXB13","HXB14","HXB15","HXB16","HXB17","HXB18","HXB19","HXB20","HXB21","HXB22","HXB23","HXB24","HXB25","HXB26","HXB27","HXB29","HXB30","HXB31"]
+
+row_list = [
+    ["#type riset or intercross"],
+    ["@type:riset"],
+    ["@name:HXB/BXH"],
+    ["#abbreviation of maternal or paternal parents"],
+    ["@mat:B"],
+    ["@pat:D"],
+    ["#heterozygous , optional, default is \"H\""],
+    ["@het:H"],
+    ["#Unknown , optional, default is \"U\""],
+    ["@unk:U"]
+]
+
+file_sample_list = []
+trimmed_samples = []
+
+# This is a convoluted way to fix the order of samples to be the same as in GN
+sample_mapping = []
+
+for chromosome in chromosomes:
+    f = os.path.join(input_dir, f"HXB_genotype_chr{chromosome}_dup_removed_smoothed_by_rqtl_error0.001_dup_removed_again_012.csv")
+    if os.path.isfile(f):
+        with open(f, "r") as the_file:
+            all_rows = [row.split() for row in the_file]
+            all_rows = [[item.replace('"', '') for item in col] for col in zip(*all_rows)]
+
+            if not len(file_sample_list):
+                file_sample_list = [sample.replace("_mRatNor1", "").split("_")[0] for sample in all_rows[0][4:-1]]
+                for sample in sample_list:
+                    if sample in file_sample_list:
+                        trimmed_samples.append(sample)
+                        sample_mapping.append(file_sample_list.index(sample))
+
+            row_list.append(["Chr", "Locus", "cM", "Mb"] + trimmed_samples)
+            for row in all_rows[1:]:
+                this_mb = str(float(row[0].split(":")[1])/1000000)
+                this_row = [row[1], row[0], row[2], this_mb]
+                genotypes = row[4:-1]
+                for i in range(len(trimmed_samples)):
+                    this_row.append(base_dict[genotypes[sample_mapping[i]]])
+                row_list.append(this_row)
+
+with open(os.path.join(output_dir, "HXBBXH_new.geno"), "w") as out_file:
+    for line in row_list:
+        out_file.write("\t".join(line) + "\n")
author	zsloan	2023-10-24 17:41:51 +0000
committer	zsloan	2023-10-24 17:41:51 +0000
commit	554da75ce6617ee5d91bc82085cfa2857fbc7637 (patch)
tree	48f1ffbc518bcd0e5899826279034806f0232787
parent	79ce392c5a482a0460e8a91344a0e63c9e9a8085 (diff)
download	genenetwork2-554da75ce6617ee5d91bc82085cfa2857fbc7637.tar.gz