diff options
-rw-r--r-- | quality_control/parsing.py | 18 | ||||
-rw-r--r-- | tests/conftest.py | 9 | ||||
-rw-r--r-- | tests/qc/test_parsing.py | 29 | ||||
-rw-r--r-- | tests/test_data/duplicated_headers_no_data_errors.tsv (renamed from tests/test_data/we_found_no_errors_in_your_file.tsv) | 0 |
4 files changed, 35 insertions, 21 deletions
diff --git a/quality_control/parsing.py b/quality_control/parsing.py index b7b0ff5..8b2715a 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -1,6 +1,6 @@ import csv - from enum import Enum +from functools import reduce import quality_control.average as avg import quality_control.standard_error as se @@ -28,7 +28,7 @@ def parse_strains(filepath): def __parse_header(line, strains): return valid_header( - strains, + set(strains), tuple(header.strip() for header in line.split("\t"))) def __parse_average_line(line): @@ -42,16 +42,20 @@ LINE_PARSERS = { FileType.STANDARD_ERROR: __parse_standard_error_line } -def parse_file(filepath: str, filetype: FileType, strains_filepath: str): +def strain_names(strains): + def __extract_strain_names(acc, strain): + return acc + tuple( + item for item in (strain["Name"], strain["Name2"]) + if (item is not None and item != "")) + return reduce(__extract_strain_names, strains, tuple()) + +def parse_file(filepath: str, filetype: FileType, strains: list): seek_pos = 0 try: with open(filepath, encoding="utf-8") as input_file: for line_number, line in enumerate(input_file): if line_number == 0: - yield __parse_header( - line, - tuple(strain["Name"] for strain - in parse_strains(strains_filepath))) + yield __parse_header(line, strains) seek_pos = seek_pos + len(line) yield LINE_PARSERS[filetype]( diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..0cdba3e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from functools import reduce + +import pytest + +from quality_control.parsing import strain_names, parse_strains + +@pytest.fixture(scope="session") +def strains(): + return strain_names(parse_strains("strains.csv")) diff --git a/tests/qc/test_parsing.py b/tests/qc/test_parsing.py index 14cfbde..be13d9b 100644 --- a/tests/qc/test_parsing.py +++ b/tests/qc/test_parsing.py @@ -11,37 +11,38 @@ from quality_control.parsing import FileType, parse_file ("tests/test_data/average.tsv", FileType.STANDARD_ERROR), ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.AVERAGE), ("tests/test_data/standarderror.tsv", FileType.AVERAGE), - ("tests/test_data/we_found_no_errors_in_your_file.tsv", + ("tests/test_data/duplicated_headers_no_data_errors.tsv", FileType.STANDARD_ERROR),)) -def test_parse_file_fails_with_wrong_filetype_declaration(filepath, filetype): +def test_parse_file_fails_with_wrong_filetype_declaration(filepath, filetype, strains): with pytest.raises(ParseError): - for line in parse_file(filepath, filetype, "strains.csv"): + for line in parse_file(filepath, filetype, strains): pass @pytest.mark.parametrize( "filepath,filetype", (("tests/test_data/average_crlf.tsv", FileType.AVERAGE), ("tests/test_data/average.tsv", FileType.AVERAGE), - ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR), - ("tests/test_data/we_found_no_errors_in_your_file.tsv", FileType.AVERAGE))) -def test_parse_file_passes_with_valid_files(filepath, filetype): - assert False, "Not Implemented" + ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR))) +def test_parse_file_passes_with_valid_files(filepath, filetype, strains): + for line in parse_file(filepath, filetype, strains): + assert bool(line) @pytest.mark.parametrize( "filepath,filetype", (("tests/test_data/average_large.tsv", FileType.AVERAGE), ("tests/test_data/average.tsv", FileType.AVERAGE), - ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR), - ("tests/test_data/we_found_no_errors_in_your_file.tsv", FileType.AVERAGE))) -def test_parse_file_works_with_large_files(filepath, filetype): - assert False, "Not Implemented" + ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR))) +def test_parse_file_works_with_large_files(filepath, filetype, strains): + for line in parse_file(filepath, filetype, strains): + assert bool(line) @pytest.mark.parametrize( "filepath,filetype", (("tests/test_data/average_error_at_end_200MB.tsv", FileType.AVERAGE), - ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.STANDARD_ERROR))) -def test_parse_file_raises_exception_on_error_in_file(filepath, filetype): + ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.STANDARD_ERROR), + ("tests/test_data/duplicated_headers_no_data_errors.tsv", FileType.AVERAGE))) +def test_parse_file_raises_exception_on_error_in_file(filepath, filetype, strains): with pytest.raises(ParseError): - for line in parse_file(filepath, filetype, "strains.csv"): + for line in parse_file(filepath, filetype, strains): pass diff --git a/tests/test_data/we_found_no_errors_in_your_file.tsv b/tests/test_data/duplicated_headers_no_data_errors.tsv index a49ed1b..a49ed1b 100644 --- a/tests/test_data/we_found_no_errors_in_your_file.tsv +++ b/tests/test_data/duplicated_headers_no_data_errors.tsv |