aboutsummaryrefslogtreecommitdiff
path: root/gn2/utility/genofile_parser.py
blob: 86d9823e2033e03e0c2f18a5fad46ea9c6ea74bd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# CTL analysis for GN2
# Author / Maintainer: Danny Arends <Danny.Arends@gmail.com>

import sys
import os
import glob
import traceback
import gzip


import simplejson as json

from pprint import pformat as pf


class Marker:
    def __init__(self):
        self.name = None
        self.chr = None
        self.cM = None
        self.Mb = None
        self.genotypes = []


class ConvertGenoFile:

    def __init__(self, input_file):
        self.mb_exists = False
        self.cm_exists = False
        self.markers = []

        self.latest_row_pos = None
        self.latest_col_pos = None

        self.latest_row_value = None
        self.latest_col_value = None
        self.input_fh = open(input_file)
        print("!!!!!!!!!!!!!!!!PARSER!!!!!!!!!!!!!!!!!!")
        self.haplotype_notation = {
            '@mat': "1",
            '@pat': "2",
            '@het': "-999",
            '@unk': "-999"
        }
        self.configurations = {}

    def process_rows(self):
        for self.latest_row_pos, row in enumerate(self.input_fh):
            self.latest_row_value = row
            # Take care of headers
            if not row.strip():
                continue
            if row.startswith('#'):
                continue
            if row.startswith('Chr'):
                if 'Mb' in row.split():
                    self.mb_exists = True
                if 'cM' in row.split():
                    self.cm_exists = True
                skip = 2 + self.cm_exists + self.mb_exists
                self.individuals = row.split()[skip:]
                continue
            if row.startswith('@'):
                key, _separater, value = row.partition(':')
                key = key.strip()
                value = value.strip()
                if key in self.haplotype_notation:
                    self.configurations[value] = self.haplotype_notation[key]
                continue
            if not len(self.configurations):
                raise EmptyConfigurations
            yield row

    def process_csv(self):
        for row in self.process_rows():
            row_items = row.split("\t")

            this_marker = Marker()
            this_marker.name = row_items[1]
            this_marker.chr = row_items[0]
            if self.cm_exists and self.mb_exists:
                this_marker.cM = row_items[2]
                this_marker.Mb = row_items[3]
                genotypes = row_items[4:]
            elif self.cm_exists:
                this_marker.cM = row_items[2]
                genotypes = row_items[3:]
            elif self.mb_exists:
                this_marker.Mb = row_items[2]
                genotypes = row_items[3:]
            else:
                genotypes = row_items[2:]
            for item_count, genotype in enumerate(genotypes):
                if genotype.upper().strip() in self.configurations:
                    this_marker.genotypes.append(
                        self.configurations[genotype.upper().strip()])
                else:
                    print("WARNING:", genotype.upper())
                    this_marker.genotypes.append("NA")
            self.markers.append(this_marker.__dict__)