From 8954890bd7410d79ce151196f406b8e1b6985238 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 13 Apr 2022 15:11:17 +0300 Subject: Implement remaining file parsing tests * Implement remaining file parsing tests and some helper functions needed for ensuring the tests work. --- quality_control/parsing.py | 18 ++++++++------ tests/conftest.py | 9 +++++++ tests/qc/test_parsing.py | 29 +++++++++++----------- .../duplicated_headers_no_data_errors.tsv | 2 ++ .../test_data/we_found_no_errors_in_your_file.tsv | 2 -- 5 files changed, 37 insertions(+), 23 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/test_data/duplicated_headers_no_data_errors.tsv delete mode 100644 tests/test_data/we_found_no_errors_in_your_file.tsv diff --git a/quality_control/parsing.py b/quality_control/parsing.py index b7b0ff5..8b2715a 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -1,6 +1,6 @@ import csv - from enum import Enum +from functools import reduce import quality_control.average as avg import quality_control.standard_error as se @@ -28,7 +28,7 @@ def parse_strains(filepath): def __parse_header(line, strains): return valid_header( - strains, + set(strains), tuple(header.strip() for header in line.split("\t"))) def __parse_average_line(line): @@ -42,16 +42,20 @@ LINE_PARSERS = { FileType.STANDARD_ERROR: __parse_standard_error_line } -def parse_file(filepath: str, filetype: FileType, strains_filepath: str): +def strain_names(strains): + def __extract_strain_names(acc, strain): + return acc + tuple( + item for item in (strain["Name"], strain["Name2"]) + if (item is not None and item != "")) + return reduce(__extract_strain_names, strains, tuple()) + +def parse_file(filepath: str, filetype: FileType, strains: list): seek_pos = 0 try: with open(filepath, encoding="utf-8") as input_file: for line_number, line in enumerate(input_file): if line_number == 0: - yield __parse_header( - line, - tuple(strain["Name"] for strain - 
in parse_strains(strains_filepath))) + yield __parse_header(line, strains) seek_pos = seek_pos + len(line) yield LINE_PARSERS[filetype]( diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..0cdba3e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +from functools import reduce + +import pytest + +from quality_control.parsing import strain_names, parse_strains + +@pytest.fixture(scope="session") +def strains(): + return strain_names(parse_strains("strains.csv")) diff --git a/tests/qc/test_parsing.py b/tests/qc/test_parsing.py index 14cfbde..be13d9b 100644 --- a/tests/qc/test_parsing.py +++ b/tests/qc/test_parsing.py @@ -11,37 +11,38 @@ from quality_control.parsing import FileType, parse_file ("tests/test_data/average.tsv", FileType.STANDARD_ERROR), ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.AVERAGE), ("tests/test_data/standarderror.tsv", FileType.AVERAGE), - ("tests/test_data/we_found_no_errors_in_your_file.tsv", + ("tests/test_data/duplicated_headers_no_data_errors.tsv", FileType.STANDARD_ERROR),)) -def test_parse_file_fails_with_wrong_filetype_declaration(filepath, filetype): +def test_parse_file_fails_with_wrong_filetype_declaration(filepath, filetype, strains): with pytest.raises(ParseError): - for line in parse_file(filepath, filetype, "strains.csv"): + for line in parse_file(filepath, filetype, strains): pass @pytest.mark.parametrize( "filepath,filetype", (("tests/test_data/average_crlf.tsv", FileType.AVERAGE), ("tests/test_data/average.tsv", FileType.AVERAGE), - ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR), - ("tests/test_data/we_found_no_errors_in_your_file.tsv", FileType.AVERAGE))) -def test_parse_file_passes_with_valid_files(filepath, filetype): - assert False, "Not Implemented" + ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR))) +def test_parse_file_passes_with_valid_files(filepath, filetype, strains): + for line in parse_file(filepath, filetype, strains): + assert 
bool(line) @pytest.mark.parametrize( "filepath,filetype", (("tests/test_data/average_large.tsv", FileType.AVERAGE), ("tests/test_data/average.tsv", FileType.AVERAGE), - ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR), - ("tests/test_data/we_found_no_errors_in_your_file.tsv", FileType.AVERAGE))) -def test_parse_file_works_with_large_files(filepath, filetype): - assert False, "Not Implemented" + ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR))) +def test_parse_file_works_with_large_files(filepath, filetype, strains): + for line in parse_file(filepath, filetype, strains): + assert bool(line) @pytest.mark.parametrize( "filepath,filetype", (("tests/test_data/average_error_at_end_200MB.tsv", FileType.AVERAGE), - ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.STANDARD_ERROR))) -def test_parse_file_raises_exception_on_error_in_file(filepath, filetype): + ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.STANDARD_ERROR), + ("tests/test_data/duplicated_headers_no_data_errors.tsv", FileType.AVERAGE))) +def test_parse_file_raises_exception_on_error_in_file(filepath, filetype, strains): with pytest.raises(ParseError): - for line in parse_file(filepath, filetype, "strains.csv"): + for line in parse_file(filepath, filetype, strains): pass diff --git a/tests/test_data/duplicated_headers_no_data_errors.tsv b/tests/test_data/duplicated_headers_no_data_errors.tsv new file mode 100644 index 0000000..a49ed1b --- /dev/null +++ b/tests/test_data/duplicated_headers_no_data_errors.tsv @@ -0,0 +1,2 @@ +ProbeSetID BXD95 BXD27 BXD98 BXD99 BXD39 BXD33 BXD45 BXD51 BXD56 BXD76 BXD56 BXD12 BXD83 DBA/2J BXD87 BXD81 BXD86 BXD84 BXD85 BXD40 BXD8 BXD79 BXD77 BXD70 BXD71 BXD73 BXD68 BXD67 BXD66 BXD63 BXD64 BXD65 BXD48 BXD50 BXD55 BXD6 BXD100 BXD11 BXD24 BXD18 BXD21 BXD20 BXD14 BXD16 BXD44 BXD38 BXD42 BXD43 BXD28 BXD25 BXD2 BXD23 BXD29 BXD22 C57BL/6J D2B6F1 BXD19 BXD90 BXD90 BXD1 BXD101 BXD90 BXD62 BXD9 BXD89 BXD96 BXD69 +10608724 6.356 6.532 
6.515 6.563 6.471 6.472 6.632 6.564 6.601 6.584 6.342 6.542 6.639 6.343 6.468 6.367 6.555 6.635 6.443 6.531 6.514 6.582 6.544 6.504 6.387 6.489 6.726 6.556 6.462 6.594 6.604 6.629 6.547 6.648 6.493 6.451 6.672 6.601 6.689 6.731 6.769 6.633 6.425 6.612 6.486 6.452 6.557 6.343 6.345 6.345 6.346 6.339 6.539 6.554 6.481 6.524 6.382 6.491 6.352 6.556 6.636 6.574 6.334 6.744 6.427 6.405 6.493 \ No newline at end of file diff --git a/tests/test_data/we_found_no_errors_in_your_file.tsv b/tests/test_data/we_found_no_errors_in_your_file.tsv deleted file mode 100644 index a49ed1b..0000000 --- a/tests/test_data/we_found_no_errors_in_your_file.tsv +++ /dev/null @@ -1,2 +0,0 @@ -ProbeSetID BXD95 BXD27 BXD98 BXD99 BXD39 BXD33 BXD45 BXD51 BXD56 BXD76 BXD56 BXD12 BXD83 DBA/2J BXD87 BXD81 BXD86 BXD84 BXD85 BXD40 BXD8 BXD79 BXD77 BXD70 BXD71 BXD73 BXD68 BXD67 BXD66 BXD63 BXD64 BXD65 BXD48 BXD50 BXD55 BXD6 BXD100 BXD11 BXD24 BXD18 BXD21 BXD20 BXD14 BXD16 BXD44 BXD38 BXD42 BXD43 BXD28 BXD25 BXD2 BXD23 BXD29 BXD22 C57BL/6J D2B6F1 BXD19 BXD90 BXD90 BXD1 BXD101 BXD90 BXD62 BXD9 BXD89 BXD96 BXD69 -10608724 6.356 6.532 6.515 6.563 6.471 6.472 6.632 6.564 6.601 6.584 6.342 6.542 6.639 6.343 6.468 6.367 6.555 6.635 6.443 6.531 6.514 6.582 6.544 6.504 6.387 6.489 6.726 6.556 6.462 6.594 6.604 6.629 6.547 6.648 6.493 6.451 6.672 6.601 6.689 6.731 6.769 6.633 6.425 6.612 6.486 6.452 6.557 6.343 6.345 6.345 6.346 6.339 6.539 6.554 6.481 6.524 6.382 6.491 6.352 6.556 6.636 6.574 6.334 6.744 6.427 6.405 6.493 \ No newline at end of file -- cgit v1.2.3