From 1cc86cb4b71fe29b40115813836ca1277c1df859 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 13 Apr 2022 12:03:08 +0300 Subject: Implement test for parsing that fails * Improve tests that ensure parsing fails in case the file has errors * Add strains.csv file * Implement minimum viable functionality that passes the implemented tests --- quality_control/errors.py | 8 +++++-- quality_control/parsing.py | 60 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 5 deletions(-) (limited to 'quality_control') diff --git a/quality_control/errors.py b/quality_control/errors.py index 99f9c97..758a300 100644 --- a/quality_control/errors.py +++ b/quality_control/errors.py @@ -4,10 +4,14 @@ class InvalidCellValue(Exception): """Raised when a function encounters an invalid value""" def __init__(self, *args): - Exception.__init__(self, *args) + super().__init__(self, *args) class InvalidHeaderValue(Exception): """Raised when a header contains values not in the reference file.""" def __init__(self, *args): - Exception.__init__(self, *args) + super().__init__(self, *args) + +class ParseError(Exception): + def __init(self, *args): + super().__init__(*args) diff --git a/quality_control/parsing.py b/quality_control/parsing.py index 52124f9..eda9181 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -1,12 +1,66 @@ +import csv + from enum import Enum import quality_control.average as avg import quality_control.standard_error as se -from quality_control.errors import InvalidCellValue, InvalidHeaderValue +from quality_control.headers import valid_header +from quality_control.errors import ( + ParseError, InvalidCellValue, InvalidHeaderValue) class FileType(Enum): AVERAGE = 1 STANDARD_ERROR = 2 -def parse_file(filepath: str, filetype: FileType): - pass +def parse_strains(filepath): + with open(filepath) as strains_file: + reader = csv.DictReader( + strains_file, + fieldnames=[ + header.strip() for header + in strains_file.readline().split("\t")], + delimiter="\t") + for row in reader: + yield { + key: (value if value != "\\N" else None) + for key, value in row.items() + } + +def __parse_header(line, strains): + return valid_header( + strains, + tuple(header.strip() for header in line.split("\t"))) + +def __parse_average_line(line): + return (line[0],) + tuple(avg.valid_value(field) for field in line[1:]) + +def __parse_standard_error_line(line): + return (line[0],) + tuple(se.valid_value(field) for field in line[1:]) + +LINE_PARSERS = { + FileType.AVERAGE: __parse_average_line, + FileType.STANDARD_ERROR: __parse_standard_error_line +} + +def parse_file(filepath: str, filetype: FileType, strains_filepath: str): + seek_pos = 0 + try: + with open(filepath, encoding="utf-8") as input_file: + for line_number, line in enumerate(input_file): + if line_number == 0: + yield __parse_header( + line, + tuple(strain["Name"] for strain + in parse_strains(strains_filepath))) + seek_pos = seek_pos + len(line) + + yield LINE_PARSERS[filetype]( + tuple(field.strip() for field in line.split("\t"))) + seek_pos = seek_pos + len(line) + except (InvalidCellValue, InvalidHeaderValue) as err: + raise ParseError({ + "filepath": filepath, + "filetype": filetype, + "position": seek_pos, + "line_number": line_number + }) -- cgit v1.2.3