scripts/convert_dol_genotypes.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

# This is just to convert the Rqtl2 format genotype files for DOL into a .geno file
# Everything is hard-coded since I doubt this will be re-used and I just wanted to generate the file quickly

# This is to be used on the files generated as described by Karl Broman here - https://kbroman.org/qtl2/pages/prep_do_data.html

import os

geno_dir = "/home/zas1024/gn2-zach/DO_genotypes/"
markers_file = "/home/zas1024/gn2-zach/DO_genotypes/SNP_Map.txt"
gn_geno_path = "/home/zas1024/gn2-zach/DO_genotypes/DOL.geno"

# Iterate through the SNP_Map.txt file to get marker positions
marker_data = {}
with open(markers_file, "r") as markers_fh:
    for i, line in enumerate(markers_fh):
        if i == 0:
            continue
        else:
            line_items = line.split("\t")
            this_marker = {}
            this_marker['chr'] = line_items[2] if line_items[2] != "0" else "M"
            this_marker['pos'] = f'{float(line_items[3])/1000000:.6f}'
            marker_data[line_items[1]] = this_marker

# Iterate through R/qtl2 format genotype files and pull out the samplelist and genotypes for each marker
sample_names = []
for filename in os.listdir(geno_dir):
    if "gm4qtl2_geno" in filename:
        with open(geno_dir + "/" + filename, "r") as rqtl_geno_fh:
            for i, line in enumerate(rqtl_geno_fh):
                line_items = line.split(",")
                if i < 3:
                    continue
                elif not len(sample_names) and i == 3:
                    sample_names_positions = [[item.replace("TLB", "TB").strip(), i] for i, item in enumerate(line_items[1:])]
                    sample_names_positions.sort(key = lambda x: x[0][2:])
                    sample_names = [sample[0] for sample in sample_names_positions]
                elif i > 3:
                    genotypes = ["X" if item.strip() == "-" else item.strip() for item in line_items[1:]]
                    ordered_genotypes = [genotypes[i].strip() for i in [pos[1] for pos in sample_names_positions]]
                    marker_data[line_items[0]]['genotypes'] = ordered_genotypes

# Generate list of marker obs to iterate through when writing to .geno file
marker_list = []
for key, value in marker_data.items():
    if 'genotypes' in value:
        this_marker = {
            'chr': value['chr'],
            'locus': key,
            'pos': value['pos'],
            'genotypes': value['genotypes']
        }
        marker_list.append(this_marker)


def sort_func(e):
    """For ensuring that X/Y chromosomes/mitochondria are sorted to the end correctly"""
    try:
        return float((e['chr']))*1000 + float(e['pos'])
    except:
        if e['chr'] == "X":
            return 20000 + float(e['pos'])
        elif e['chr'] == "Y":
            return 21000 + float(e['pos'])
        elif e['chr'] == "M":
            return 22000 + float(e['pos'])

# Sort markers by chromosome
marker_list.sort(key=sort_func)

# Write lines to .geno file
with open(gn_geno_path, "w") as gn_geno_fh:
    gn_geno_fh.write("\t".join((["Chr", "Locus", "cM", "Mb"] + sample_names)) + "\n")
    for marker in marker_list:
        row_contents = [
            marker['chr'],
            marker['locus'],
            marker['pos'],
            marker['pos']
        ] + marker['genotypes']
        gn_geno_fh.write("\t".join(row_contents) + "\n")