From a13baa5fb98d8167e70df7008d0d07b40a05a6b9 Mon Sep 17 00:00:00 2001 From: DannyArends Date: Tue, 22 Mar 2016 23:37:13 +0100 Subject: Adding the geno file parser from Zach --- wqflask/utility/genofile_parser.py | 96 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 wqflask/utility/genofile_parser.py (limited to 'wqflask/utility/genofile_parser.py') diff --git a/wqflask/utility/genofile_parser.py b/wqflask/utility/genofile_parser.py new file mode 100644 index 00000000..9dd7b08b --- /dev/null +++ b/wqflask/utility/genofile_parser.py @@ -0,0 +1,96 @@ +# CTL analysis for GN2 +# Author / Maintainer: Danny Arends + +from __future__ import print_function, division, absolute_import +import sys +import os +import glob +import traceback +import gzip + +import simplejson as json + +from pprint import pformat as pf + +class Marker(object): + def __init__(self): + self.name = None + self.chr = None + self.cM = None + self.Mb = None + self.genotypes = [] + + +class ConvertGenoFile(object): + + def __init__(self, input_file): + self.mb_exists = False + self.cm_exists = False + self.markers = [] + + self.latest_row_pos = None + self.latest_col_pos = None + + self.latest_row_value = None + self.latest_col_value = None + self.input_fh = open(input_file) + print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + self.haplotype_notation = { + '@mat': "3", + '@pat': "1", + '@het': "2", + '@unk': "NA" + } + self.configurations = {} + + def process_rows(self): + for self.latest_row_pos, row in enumerate(self.input_fh): + self.latest_row_value = row + # Take care of headers + if not row.strip(): + continue + if row.startswith('#'): + continue + if row.startswith('Chr'): + if 'Mb' in row.split(): + self.mb_exists = True + if 'cM' in row.split(): + self.cm_exists = True + continue + if row.startswith('@'): + key, _separater, value = row.partition(':') + key = key.strip() + value = value.strip() + if key in self.haplotype_notation: + self.configurations[value] = self.haplotype_notation[key] + continue + if not len(self.configurations): + raise EmptyConfigurations + yield row + + def process_csv(self): + for row_count, row in enumerate(self.process_rows()): + row_items = row.split("\t") + + this_marker = Marker() + this_marker.name = row_items[1] + this_marker.chr = row_items[0] + if self.cm_exists and self.mb_exists: + this_marker.cM = row_items[2] + this_marker.Mb = row_items[3] + genotypes = row_items[4:] + elif self.cm_exists: + this_marker.cM = row_items[2] + genotypes = row_items[3:] + elif self.mb_exists: + this_marker.Mb = row_items[2] + genotypes = row_items[3:] + else: + genotypes = row_items[2:] + for item_count, genotype in enumerate(genotypes): + if genotype.upper() in self.configurations: + this_marker.genotypes.append(self.configurations[genotype.upper()]) + else: + this_marker.genotypes.append("NA") + self.markers.append(this_marker.__dict__) + -- cgit v1.2.3 From 3d505d997511cd8f7b9f14510059cb2983edc6d4 Mon Sep 17 00:00:00 2001 From: DannyArends Date: Wed, 23 Mar 2016 23:07:16 +0100 Subject: Parsing the names of the individuals, and coding H as -999 --- wqflask/utility/genofile_parser.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'wqflask/utility/genofile_parser.py') diff --git a/wqflask/utility/genofile_parser.py b/wqflask/utility/genofile_parser.py index 9dd7b08b..67b84dc9 100644 --- a/wqflask/utility/genofile_parser.py +++ b/wqflask/utility/genofile_parser.py @@ -8,6 +8,7 @@ import glob import traceback import gzip + import simplejson as json from pprint import pformat as pf @@ -34,12 +35,12 @@ class ConvertGenoFile(object): self.latest_row_value = None self.latest_col_value = None self.input_fh = open(input_file) - print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + print("!!!!!!!!!!!!!!!!PARSER!!!!!!!!!!!!!!!!!!") self.haplotype_notation = { - '@mat': "3", - '@pat': "1", - '@het': "2", - '@unk': "NA" + '@mat': "1", + '@pat': "2", + '@het': "-999", + '@unk': "-999" } self.configurations = {} @@ -56,6 +57,8 @@ class ConvertGenoFile(object): self.mb_exists = True if 'cM' in row.split(): self.cm_exists = True + skip = 2 + self.cm_exists + self.mb_exists + self.individuals = row.split()[skip:] continue if row.startswith('@'): key, _separater, value = row.partition(':') @@ -88,9 +91,10 @@ class ConvertGenoFile(object): else: genotypes = row_items[2:] for item_count, genotype in enumerate(genotypes): - if genotype.upper() in self.configurations: - this_marker.genotypes.append(self.configurations[genotype.upper()]) + if genotype.upper().strip() in self.configurations: + this_marker.genotypes.append(self.configurations[genotype.upper().strip()]) else: + print("WARNING:", genotype.upper()) this_marker.genotypes.append("NA") self.markers.append(this_marker.__dict__) -- cgit v1.2.3