diff options
-rw-r--r-- | quality_control/parsing.py | 67 | ||||
-rw-r--r-- | tests/qc/test_error_collection.py | 30 |
2 files changed, 97 insertions, 0 deletions
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index abed22e..ac53642 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -3,6 +3,8 @@
 import csv
+import io
 from enum import Enum
 from functools import reduce
+from typing import Iterator, Generator
 
 import quality_control.average as avg
 import quality_control.standard_error as se
@@ -76,3 +78,68 @@ def parse_file(filepath: str, filetype: FileType, strains: list):
                 "line_number": line_number,
                 "error": err
             }) from err
+
+def parse_errors(filepath: str, filetype: FileType, strains: list,
+                 seek_pos: int = 0) -> Generator:
+    """
+    Retrieve ALL the parse errors.
+
+    Lazily yield one `dict` per line that fails to parse, with the keys
+    "filepath", "filetype", "position", "line_number", "error" and
+    "message".
+
+    `seek_pos` is the byte offset at which to start reading. When it is zero
+    (0), the first line is checked as the header against `strains`; otherwise
+    every line read is treated as a data line. "position" is the byte offset
+    of the start of the offending line, and "line_number" is counted from the
+    first line read (that is, relative to `seek_pos`).
+    """
+    assert seek_pos >= 0, "The seek position must be at least zero (0)"
+
+    def __error_type(error):
+        """Return a nicer string representation for the error type."""
+        if isinstance(error, DuplicateHeader):
+            return "Duplicated Headers"
+        if isinstance(error, InvalidCellValue):
+            return "Invalid Value"
+        if isinstance(error, InvalidHeaderValue):
+            return "Invalid Strain"
+        return type(error).__name__
+
+    def __errors(filepath, filetype, strains, seek_pos):
+        """Yield a description of each parse error, in file order."""
+        # Seek on the binary stream so that `seek_pos` is an exact byte offset,
+        # and decode with `newline=""` so that line endings are kept and each
+        # line's length in bytes can be computed precisely.
+        with open(filepath, "rb") as binary_file:
+            binary_file.seek(seek_pos)
+            input_file = io.TextIOWrapper(
+                binary_file, encoding="utf-8", newline="")
+            has_header = seek_pos == 0
+            position = seek_pos
+            for line_number, line in enumerate(input_file):
+                # Hand the parsers what universal-newlines mode would have.
+                text = line
+                if line.endswith(("\r\n", "\r")):
+                    text = line.rstrip("\r\n") + "\n"
+                try:
+                    if has_header and line_number == 0:
+                        __parse_header(text, strains)
+                    else:
+                        LINE_PARSERS[filetype](tuple(
+                            field.strip() for field in text.split("\t")))
+                except (DuplicateHeader, InvalidCellValue,
+                        InvalidHeaderValue) as err:
+                    yield {
+                        "filepath": filepath,
+                        "filetype": filetype,
+                        "position": position,
+                        "line_number": line_number,
+                        "error": __error_type(err),
+                        "message": err.args
+                    }
+                # Always advance, even past a bad line, so that every error is
+                # reported exactly once, with its own offset.
+                position = position + len(line.encode("utf-8"))
+
+    return __errors(filepath, filetype, strains, seek_pos)
diff --git a/tests/qc/test_error_collection.py b/tests/qc/test_error_collection.py
new file mode 100644
index 0000000..c45803a
--- /dev/null
+++ b/tests/qc/test_error_collection.py
@@ -0,0 +1,30 @@
+import pytest
+
+from quality_control.parsing import FileType, parse_errors
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "filepath,filetype,seek_pos",
+    (("tests/test_data/average_crlf.tsv", FileType.AVERAGE, 0),
+     ("tests/test_data/average_error_at_end_200MB.tsv", FileType.AVERAGE,
+      205500004 # Skip first 500K lines
+      ),
+     ("tests/test_data/average.tsv", FileType.AVERAGE, 0),
+     ("tests/test_data/standarderror_1_error_at_end.tsv",
+      FileType.STANDARD_ERROR, 0),
+     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR, 0),
+     ("tests/test_data/duplicated_headers_no_data_errors.tsv",
+      FileType.AVERAGE, 0),
+    ))
+def test_parse_errors(filepath, filetype, strains, seek_pos):
+    """
+    Check that only errors are returned, and that certain properties hold for
+    said errors.
+    """
+    for error in parse_errors(filepath, filetype, strains, seek_pos):
+        assert isinstance(error, dict)
+        assert "filepath" in error
+        assert "filetype" in error
+        assert "position" in error
+        assert "error" in error and isinstance(error["error"], str)
+        assert "message" in error