aboutsummaryrefslogtreecommitdiff
path: root/wqflask/utility/gen_geno_ob.py
blob: 44e2722f9a059a2b93d92263d8647e1da487fe6a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from __future__ import absolute_import, division, print_function

class genotype(object):
    """
    Replacement for reaper.Dataset so we can remove qtlreaper use while still generating mapping output figure
    """

    def __init__(self, filename):
        self.group = None
        self.type = "riset"
        self.prgy = []
        self.nprgy = 0
        self.mat = -1
        self.pat = 1
        self.het = 0
        self.unk = "U"
        self.filler = False
        self.mb_exists = False

        #ZS: This is because I'm not sure if some files switch the column that contains Mb/cM positions; might be unnecessary
        self.cm_column = 2
        self.mb_column = 3

        self.chromosomes = []

        self.read_file(filename)

    def __iter__(self):
        return iter(self.chromosomes)

    def __getitem__(self, index):
        return self.chromosomes[index]

    def __len__(self):
        return len(self.chromosomes)

    def read_file(self, filename):

        with open(filename, 'r') as geno_file:
            lines = geno_file.readlines()

            this_chr = "" #ZS: This is so it can track when the chromosome changes as it iterates through markers
            chr_ob = None
            for line in lines:
                if line[0] == "#":
                    continue
                elif line[0] == "@":
                    label = line.split(":")[0][1:]
                    if label == "name":
                        self.group = line.split(":")[1].strip()
                    elif label == "filler":
                        if line.split(":")[1].strip() == "yes":
                            self.filler = True
                    elif label == "type":
                        self.type = line.split(":")[1].strip()
                    elif label == "mat":
                        self.mat = line.split(":")[1].strip()
                    elif label == "pat":
                        self.pat = line.split(":")[1].strip()
                    elif label == "het":
                        self.het = line.split(":")[1].strip()
                    elif label == "unk":
                        self.unk = line.split(":")[1].strip()
                    else:
                        continue
                elif line[:3] == "Chr":
                    header_row = line.split("\t")
                    if header_row[2] == "Mb":
                        self.mb_exists = True
                        self.mb_column = 2
                        self.cm_column = 3
                    elif header_row[3] == "Mb":
                        self.mb_exists = True
                        self.mb_column = 3
                    elif header_row[2] == "cM":
                        self.cm_column = 2

                    if self.mb_exists:
                        self.prgy = header_row[4:]
                    else:
                        self.prgy = header_row[3:]
                    self.nprgy = len(self.prgy)
                else:
                    if line.split("\t")[0] != this_chr:
                        if this_chr != "":
                            self.chromosomes.append(chr_ob)
                        this_chr = line.split("\t")[0]
                        chr_ob = Chr(line.split("\t")[0], self)
                    chr_ob.add_marker(line.split("\t"))

            self.chromosomes.append(chr_ob)

class Chr(object):
    def __init__(self, name, geno_ob):
        self.name = name
        self.loci = []
        self.mb_exists = geno_ob.mb_exists
        self.cm_column = geno_ob.cm_column
        self.mb_column = geno_ob.mb_column
        self.geno_ob = geno_ob

    def __iter__(self):
        return iter(self.loci)

    def __getitem__(self, index):
        return self.loci[index]

    def __len__(self):
        return len(self.loci)

    def add_marker(self, marker_row):
        self.loci.append(Locus(marker_row, self.geno_ob))

class Locus(object):
    def __init__(self, marker_row, geno_ob):
        self.chr = marker_row[0]
        self.name = marker_row[1]
        try:
            self.cM = float(marker_row[geno_ob.cm_column])
        except:
            self.cM = float(marker_row[geno_ob.mb_column]) if geno_ob.mb_exists else 0
        self.Mb = float(marker_row[geno_ob.mb_column]) if geno_ob.mb_exists else None

        geno_table = {
            geno_ob.mat: -1,
            geno_ob.pat: 1,
            geno_ob.het: 0,
            geno_ob.unk: "U"
        }

        self.genotype = []
        if geno_ob.mb_exists:
            start_pos = 4
        else:
            start_pos = 3

        for allele in marker_row[start_pos:]:
            if allele in geno_table.keys():
                self.genotype.append(geno_table[allele])
            else: #ZS: Some genotype appears that isn't specified in the metadata, make it unknown
                self.genotype.append("U")