diff options
-rw-r--r-- | quality_control/average.py | 7 | ||||
-rw-r--r-- | quality_control/checks.py | 5 | ||||
-rw-r--r-- | quality_control/errors.py | 6 | ||||
-rw-r--r-- | quality_control/headers.py | 24 | ||||
-rw-r--r-- | quality_control/parsing.py | 44 | ||||
-rw-r--r-- | quality_control/standard_error.py | 11 | ||||
-rw-r--r-- | r_qtl/r_qtl2_qc.py | 12 | ||||
-rw-r--r-- | scripts/qc_on_rqtl2_bundle.py | 2 | ||||
-rw-r--r-- | tests/qc/test_cells.py | 20 | ||||
-rw-r--r-- | tests/qc/test_cells_average.py | 2 | ||||
-rw-r--r-- | tests/qc/test_cells_standard_error.py | 2 | ||||
-rw-r--r-- | tests/qc/test_error_collection.py | 9 | ||||
-rw-r--r-- | tests/qc/test_header.py | 25 | ||||
-rw-r--r-- | tests/r_qtl/test_r_qtl2_qc.py | 19 |
14 files changed, 108 insertions, 80 deletions
diff --git a/quality_control/average.py b/quality_control/average.py index ad732d0..bf288de 100644 --- a/quality_control/average.py +++ b/quality_control/average.py @@ -4,12 +4,13 @@ from typing import Union from .utils import cell_error from .errors import InvalidValue -def invalid_value(line_number: int, column_number: int, val: str) -> Union[ - InvalidValue, None]: +def invalid_value( + filename: str, line_number: int, column_number: int, val: str) -> Union[ + InvalidValue, None]: """Return an `InvalidValue` object if `val` is not a valid "averages" value.""" return cell_error( - r"^([0-9]+\.[0-9]{3}|[0-9]+\.?0*)$", val, line=line_number, + r"^([0-9]+\.[0-9]{3}|[0-9]+\.?0*)$", val, filename=filename, line=line_number, column=column_number, value=val, message=( f"Invalid value '{val}'. " "Expected string representing a number with exactly three " diff --git a/quality_control/checks.py b/quality_control/checks.py index 475eb9e..bdfd12b 100644 --- a/quality_control/checks.py +++ b/quality_control/checks.py @@ -52,7 +52,8 @@ def decimal_places_pattern(mini: int, maxi: Optional[int] = None) -> re.Pattern: + r")$" ) -def decimal_points_error(lineno: int, +def decimal_points_error(filename: str,# pylint: disable=[too-many-arguments] + lineno: int, field: str, value: str, mini: int, @@ -61,7 +62,7 @@ def decimal_points_error(lineno: int, Check that 'value' in a decimal number with the appropriate decimal places. """ if not bool(decimal_places_pattern(mini, maxi).match(value)): - return InvalidValue(lineno, field, value, ( + return InvalidValue(filename, lineno, field, value, ( f"Invalid value '{value}'. 
Expected numerical value " + f"with at least {mini} decimal places" + (f" and at most {maxi} decimal places" if maxi is not None else "") diff --git a/quality_control/errors.py b/quality_control/errors.py index fff6c7c..01afa81 100644 --- a/quality_control/errors.py +++ b/quality_control/errors.py @@ -3,10 +3,10 @@ from collections import namedtuple InvalidValue = namedtuple( - "InvalidValue", ("line", "column", "value", "message")) + "InvalidValue", ("filename", "line", "column", "value", "message")) DuplicateHeading = namedtuple( - "DuplicateHeading", ("line", "columns", "heading", "message")) + "DuplicateHeading", ("filename", "line", "columns", "heading", "message")) InconsistentColumns = namedtuple( - "InconsistentColumns", ("line", "header_count", "contents_count", "message")) + "InconsistentColumns", ("filename", "line", "header_count", "contents_count", "message")) diff --git a/quality_control/headers.py b/quality_control/headers.py index f4f4dad..436ea5a 100644 --- a/quality_control/headers.py +++ b/quality_control/headers.py @@ -5,30 +5,34 @@ from typing import Union, Tuple, Sequence from quality_control.errors import InvalidValue, DuplicateHeading -def invalid_header( - line_number: int, headers: Sequence[str]) -> Union[InvalidValue, None]: +def invalid_header(filename: str, + line_number: int, + headers: Sequence[str]) -> Union[InvalidValue, None]: """Return an `InvalidValue` object if the header row has less than 2 items.""" if len(headers) < 2: return InvalidValue( - line_number, 0, "<TAB>".join(headers), + filename, line_number, 0, "<TAB>".join(headers), "The header MUST contain at least 2 columns") return None def invalid_headings( - line_number: int, strains: Sequence[str], + filename: str, line_number: int, strains: Sequence[str], headings: Sequence[str]) -> Union[Tuple[InvalidValue, ...], None]: """Return tuple of `InvalidValue` objects for each error found for every column heading.""" return tuple( - InvalidValue( - line_number, col, header, 
f"'{header}' not a valid strain.") + InvalidValue(filename, + line_number, + col, + header, + f"'{header}' not a valid strain.") for col, header in enumerate(headings, start=2) if header not in strains) -def duplicate_headings( - line_number: int, - headers: Sequence[str]) -> Tuple[DuplicateHeading, ...]: +def duplicate_headings(filename: str, + line_number: int, + headers: Sequence[str]) -> Tuple[DuplicateHeading, ...]: """Return a tuple of `DuplicateHeading` objects for each column heading that is a duplicate of another column heading.""" def __update_columns__(acc, item): @@ -42,7 +46,7 @@ def duplicate_headings( } return tuple( DuplicateHeading( - line_number, columns, heading, ( + filename, line_number, columns, heading, ( f"Heading '{heading}', is repeated in columns " f"{','.join(str(i) for i in columns)}")) for heading, columns in repeated.items()) diff --git a/quality_control/parsing.py b/quality_control/parsing.py index f7a664f..f1d21fc 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -1,6 +1,7 @@ """Module handling the high-level parsing of the files""" import collections from enum import Enum +from pathlib import Path from functools import partial from typing import Tuple, Union, Generator, Callable, Optional @@ -30,44 +31,44 @@ def strain_names(dbconn: mdb.Connection, speciesid: int) -> tuple[str, ...]: lambda item: bool(item.strip() if item is not None else item), (name for names in samplenames for name in names)))) -def header_errors(line_number, fields, strains): +def header_errors(filename, line_number, fields, strains): """Gather all header row errors.""" return ( - (invalid_header(line_number, fields),) + - invalid_headings(line_number, strains, fields[1:]) + - duplicate_headings(line_number, fields)) + (invalid_header(filename, line_number, fields),) + + invalid_headings(filename, line_number, strains, fields[1:]) + + duplicate_headings(filename, line_number, fields)) -def empty_value(line_number, column_number, 
value): +def empty_value(filename, line_number, column_number, value): """Check for empty field values.""" if value == "": - return InvalidValue( - line_number, column_number, value, "Empty value for column") + return InvalidValue(filename, line_number, column_number, value, + "Empty value for column") return None -def average_errors(line_number, fields): +def average_errors(filename, line_number, fields): """Gather all errors for a line in a averages file.""" return ( - (empty_value(line_number, 1, fields[0]),) + + (empty_value(filename, line_number, 1, fields[0]),) + tuple( - avg.invalid_value(line_number, *field) + avg.invalid_value(filename, line_number, *field) for field in enumerate(fields[1:], start=2))) -def se_errors(line_number, fields): +def se_errors(filename, line_number, fields): """Gather all errors for a line in a standard-errors file.""" return ( - (empty_value(line_number, 1, fields[0]),) + + (empty_value(filename, line_number, 1, fields[0]),) + tuple( - se.invalid_value(line_number, *field) + se.invalid_value(filename, line_number, *field) for field in enumerate(fields[1:], start=2))) -def make_column_consistency_checker(header_row): +def make_column_consistency_checker(filename, header_row): """Build function to check for column consistency""" headers = tuple(field.strip() for field in header_row.split("\t")) def __checker__(line_number, contents_row): contents = tuple(field.strip() for field in contents_row.split("\t")) if len(contents) != len(headers): return InconsistentColumns( - line_number, len(headers), len(contents), + filename, line_number, len(headers), len(contents), (f"Header row has {len(headers)} columns while row " f"{line_number} has {len(contents)} columns")) return None @@ -79,8 +80,10 @@ def collect_errors( user_aborted: Callable = lambda: False) -> Generator: """Run checks against file and collect all the errors""" errors:Tuple[Union[InvalidValue, DuplicateHeading], ...] 
= tuple() - def __process_errors__(line_number, line, error_checker_fn, errors = tuple()): + def __process_errors__( + filename, line_number, line, error_checker_fn, errors = tuple()): errs = error_checker_fn( + filename, line_number, tuple(field.strip() for field in line.split("\t"))) if errs is None: @@ -90,6 +93,7 @@ def collect_errors( return errors + (errs,) with open_file(filepath) as input_file: + filename = Path(filepath).name for line_number, line in enumerate(input_file, start=1): if user_aborted(): break @@ -98,9 +102,11 @@ def collect_errors( line = line.decode("utf-8") if line_number == 1: - consistent_columns_checker = make_column_consistency_checker(line) + consistent_columns_checker = make_column_consistency_checker( + filename, line) for error in __process_errors__( - line_number, line, partial(header_errors, strains=strains), + filename, line_number, line, + partial(header_errors, strains=strains), errors): yield error @@ -110,7 +116,7 @@ def collect_errors( yield col_consistency_error for error in __process_errors__( - line_number, line, ( + filename, line_number, line, ( average_errors if filetype == FileType.AVERAGE else se_errors), errors): diff --git a/quality_control/standard_error.py b/quality_control/standard_error.py index 90beb8a..00b1ac6 100644 --- a/quality_control/standard_error.py +++ b/quality_control/standard_error.py @@ -4,17 +4,18 @@ from typing import Union from .utils import cell_error from .errors import InvalidValue -def invalid_value( - line_number: int, column_number: int, val: str) -> Union[ - InvalidValue, None]: +def invalid_value(filename: str, + line_number: int, + column_number: int, + val: str) -> Union[InvalidValue, None]: """ Returns a `quality_control.errors.InvalidValue` object in the case where `val` is not a valid input for standard error files, otherwise, it returns `None`. 
""" return cell_error( - r"^([0-9]+\.[0-9]{6,}|[0-9]+\.?0*)$", val, line=line_number, - column=column_number, value=val, message=( + r"^([0-9]+\.[0-9]{6,}|[0-9]+\.?0*)$", val, filename=filename, + line=line_number, column=column_number, value=val, message=( f"Invalid value '{val}'. " "Expected string representing a number with at least six " "decimal places.")) diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py index 43f7d94..be1eac4 100644 --- a/r_qtl/r_qtl2_qc.py +++ b/r_qtl/r_qtl2_qc.py @@ -63,12 +63,13 @@ def validate_bundle(zfile: ZipFile): "The following files do not exist in the bundle: " + ", ".join(mfile[1] for mfile in missing)) -def make_genocode_checker(genocode: dict) -> Callable[[int, str, str], Optional[InvalidValue]]: +def make_genocode_checker(genocode: dict, filename: str) -> Callable[ + [int, str, str], Optional[InvalidValue]]: """Make a checker from the genotypes in the control data""" def __checker__(lineno: int, field: str, value: str) -> Optional[InvalidValue]: genotypes = tuple(genocode.keys()) if value not in genotypes: - return InvalidValue(lineno, field, value, ( + return InvalidValue(filename, lineno, field, value, ( f"Invalid value '{value}'. 
Expected one of {genotypes}.")) return None return __checker__ @@ -78,14 +79,15 @@ def geno_errors(zfile: ZipFile) -> Iterator[Union[InvalidValue, MissingFile]]: cdata = rqtl2.control_data(zfile) return ( error for error in retrieve_errors( - zfile, "geno", (make_genocode_checker(cdata.get("genotypes", {})),)) + zfile, "geno", (make_genocode_checker(cdata.get("genotypes", {}), "geno"),)) if error is not None) def pheno_errors(zfile: ZipFile) -> Iterator[Union[InvalidValue, MissingFile]]: """Check for and retrieve pheno errors.""" return ( error for error in retrieve_errors( - zfile, "pheno", (partial(decimal_points_error, mini=3),)) + zfile, "pheno", (partial( + decimal_points_error, mini=3, filename="pheno"),)) if error is not None) def phenose_errors(zfile: ZipFile) -> Iterator[Union[InvalidValue, MissingFile]]: @@ -108,7 +110,7 @@ def retrieve_errors(zfile: ZipFile, filetype: str, checkers: tuple[Callable]) -> continue if value is not None: for checker in checkers: - yield checker(lineno, field, value) + yield checker(lineno=lineno, field=field, value=value) except rqe.MissingFileError: fname = cdata.get(filetype) yield MissingFile(filetype, fname, f"Missing '{filetype}' file '{fname}'.") diff --git a/scripts/qc_on_rqtl2_bundle.py b/scripts/qc_on_rqtl2_bundle.py index d77b6c2..027f387 100644 --- a/scripts/qc_on_rqtl2_bundle.py +++ b/scripts/qc_on_rqtl2_bundle.py @@ -116,7 +116,7 @@ def qc_geno_errors(rconn, fqjobid, zfile, logger) -> bool: logger.info("Checking for errors in the 'geno' file…") gerrs = tuple(retrieve_errors_with_progress( rconn, fqjobid, zfile, "geno", - (rqc.make_genocode_checker(cdata.get("genotypes", {})),))) + (rqc.make_genocode_checker(cdata.get("genotypes", {}), "geno"),))) add_to_errors(rconn, fqjobid, "errors-generic", tuple( err for err in gerrs if isinstance(err, rqfe.MissingFile))) add_to_errors(rconn, fqjobid, "errors-geno", tuple( diff --git a/tests/qc/test_cells.py b/tests/qc/test_cells.py index e4a0959..937579f 100644 --- 
a/tests/qc/test_cells.py +++ b/tests/qc/test_cells.py @@ -22,12 +22,12 @@ def test_cell_value_errors_with_invalid_inputs2(num_str): `quality_control.errors.InvalidValue` object which holds the error information. """ - assert avg_invalid_value(0, 0, num_str) == InvalidValue( - 0, 0, num_str, ( + assert avg_invalid_value("test.file", 0, 0, num_str) == InvalidValue( + "test.file", 0, 0, num_str, ( f"Invalid value '{num_str}'. Expected string representing a number " "with exactly three decimal places.")) - assert se_invalid_value(0, 0, num_str) == InvalidValue( - 0, 0, num_str, ( + assert se_invalid_value("test.file", 0, 0, num_str) == InvalidValue( + "test.file", 0, 0, num_str, ( f"Invalid value '{num_str}'. Expected string representing a number " "with at least six decimal places.")) @@ -43,8 +43,8 @@ def test_cell_average_value_errors_if_not_three_decimal_places2(num_str): object with the information about the placement of the invalid value. """ line, col = randint(0, 100), randint(0, 20) - assert avg_invalid_value(line, col, num_str) == InvalidValue( - line, col, num_str, ( + assert avg_invalid_value("test.file", line, col, num_str) == InvalidValue( + "test.file", line, col, num_str, ( f"Invalid value '{num_str}'. Expected string representing a number " "with exactly three decimal places.")) @@ -57,7 +57,7 @@ def test_cell_average_value_pass_if_three_decimal_places(num_str): THEN: `avg_invalid_value` returns `None` """ line, col = randint(0, 100), randint(0, 20) - assert avg_invalid_value(line, col, num_str) is None + assert avg_invalid_value("test.file", line, col, num_str) is None @given(num_str=st.from_regex(r"^[0-9]+\.([0-9]{0,5}$)", fullmatch=True).filter( lambda param: not re.match(r"^[0-9]+\.?0*$", param))) @@ -70,8 +70,8 @@ def test_cell_standard_error_value_errors_if_less_than_six_decimal_places2(num_s object with the information about the placement of the invalid value. 
""" line, col = randint(0, 100), randint(0, 20) - assert se_invalid_value(line, col, num_str) == InvalidValue( - line, col, num_str, ( + assert se_invalid_value("test.file", line, col, num_str) == InvalidValue( + "test.file", line, col, num_str, ( f"Invalid value '{num_str}'. Expected string representing a number " "with at least six decimal places.")) @@ -84,4 +84,4 @@ def test_cell_standard_error_value_pass_if_six_or_more_decimal_places(num_str): THEN: `se_invalid_value` returns `None` """ line, col = randint(0, 100), randint(0, 20) - assert se_invalid_value(line, col, num_str) is None + assert se_invalid_value("test.file", line, col, num_str) is None diff --git a/tests/qc/test_cells_average.py b/tests/qc/test_cells_average.py index 68fd4ec..b6ded31 100644 --- a/tests/qc/test_cells_average.py +++ b/tests/qc/test_cells_average.py @@ -14,4 +14,4 @@ def test_cell_average_value_pass_if_no_decimal_places(num_str): THEN: `avg_invalid_value` returns `None` """ line, col = randint(0, 100), randint(0, 20) - assert avg_invalid_value(line, col, num_str) is None + assert avg_invalid_value("test.file", line, col, num_str) is None diff --git a/tests/qc/test_cells_standard_error.py b/tests/qc/test_cells_standard_error.py index 90c13cf..fa9f1db 100644 --- a/tests/qc/test_cells_standard_error.py +++ b/tests/qc/test_cells_standard_error.py @@ -17,4 +17,4 @@ def test_cell_standard_error_value_errors_if_less_than_six_decimal_places2(num_s THEN: `se_invalid_value` returns a `None`. 
""" line, col = randint(0, 100), randint(0, 20) - assert invalid_value(line, col, num_str) is None + assert invalid_value("test.file", line, col, num_str) is None diff --git a/tests/qc/test_error_collection.py b/tests/qc/test_error_collection.py index 962d2c5..260fabf 100644 --- a/tests/qc/test_error_collection.py +++ b/tests/qc/test_error_collection.py @@ -44,11 +44,14 @@ def test_collect_errors(filepath, filetype, strains, count): "filepath,filetype,expected", (("tests/test_data/average_inconsistent_columns.tsv", FileType.AVERAGE, (InconsistentColumns( - 4, 4, 5, "Header row has 4 columns while row 4 has 5 columns"), + "average_inconsistent_columns.tsv", 4, 4, 5, + "Header row has 4 columns while row 4 has 5 columns"), InconsistentColumns( - 5, 4, 3, "Header row has 4 columns while row 5 has 3 columns"), + "average_inconsistent_columns.tsv", 5, 4, 3, + "Header row has 4 columns while row 5 has 3 columns"), InconsistentColumns( - 6, 4, 7, "Header row has 4 columns while row 6 has 7 columns"))),)) + "average_inconsistent_columns.tsv", 6, 4, 7, + "Header row has 4 columns while row 6 has 7 columns"))),)) def test_collect_inconsistent_column_errors(filepath, filetype, strains, expected): """ Given: A file with inconsistent columns in certain lines diff --git a/tests/qc/test_header.py b/tests/qc/test_header.py index 5e54122..06647a2 100644 --- a/tests/qc/test_header.py +++ b/tests/qc/test_header.py @@ -11,17 +11,22 @@ from quality_control.headers import ( @given(headers=st.lists(st.text(max_size=10), max_size=1)) def test_invalid_header_with_list_of_one_value(headers): """Test `invalid_header` with invalid header row""" - assert invalid_header(0, headers) == InvalidValue( - 0, 0, "<TAB>".join(headers), + assert invalid_header("test.file", 0, headers) == InvalidValue( + "test.file", 0, 0, "<TAB>".join(headers), "The header MUST contain at least 2 columns") @pytest.mark.unit_test @given(headings=st.lists(st.text(min_size=2, max_size=10), min_size=2)) def 
test_invalid_headings_with_invalid_inputs(headings): "Verify that the check for header validity works" - assert invalid_headings(0, ("BXD1", "BXD2", "BXD3"), headings) == tuple( - InvalidValue(0, col, heading, f"'{heading}' not a valid strain.") - for col, heading in enumerate(headings, start=2)) + assert invalid_headings( + "test.file", 0, ("BXD1", "BXD2", "BXD3"), headings) == tuple( + InvalidValue("test.file", + 0, + col, + heading, + f"'{heading}' not a valid strain.") + for col, heading in enumerate(headings, start=2)) @pytest.mark.unit_test @pytest.mark.parametrize( @@ -30,7 +35,7 @@ def test_invalid_headings_with_invalid_inputs(headings): (("Individual", "AStrain", "AnotherStrain", "YetAnotherStrain"))]) def test_invalid_header_with_valid_headers(headers): "Verify that the check for header validity works" - assert invalid_header(0, headers) is None + assert invalid_header("test.file", 0, headers) is None @pytest.mark.unit_test @pytest.mark.parametrize( @@ -40,7 +45,7 @@ def test_invalid_header_with_valid_headers(headers): ("AStrain", "AnotherStrain", "YetAnotherStrain"))]) def test_invalid_headings_with_valid_headings(strains, headings): "Verify that the check for header validity works" - assert invalid_headings(0, strains, headings) == tuple() + assert invalid_headings("test.file", 0, strains, headings) == tuple() @pytest.mark.unit_test @pytest.mark.parametrize( @@ -50,8 +55,8 @@ def test_invalid_headings_with_valid_headings(strains, headings): "AStrain"), {"AStrain": (2, 5)})]) def test_duplicate_headers_with_repeated_column_headings(headers, repeated): """Check that parsing fails if any header is duplicated""" - assert duplicate_headings(0, headers) == tuple( - DuplicateHeading(0, cols, head, ( + assert duplicate_headings("test.file", 0, headers) == tuple( + DuplicateHeading("test.file", 0, cols, head, ( f"Heading '{head}', is repeated in columns " f"{','.join(str(i) for i in cols)}")) for head, cols in repeated.items()) @@ -63,4 +68,4 @@ def 
test_duplicate_headers_with_repeated_column_headings(headers, repeated): (("Individual", "AStrain", "AnotherStrain", "YetAnotherStrain",))]) def test_duplicate_headers_with_unique_column_headings(headers): """Check that parsing fails if any header is duplicated""" - assert duplicate_headings(0, headers) == tuple() + assert duplicate_headings("test.file", 0, headers) == tuple() diff --git a/tests/r_qtl/test_r_qtl2_qc.py b/tests/r_qtl/test_r_qtl2_qc.py index d12172e..05db30e 100644 --- a/tests/r_qtl/test_r_qtl2_qc.py +++ b/tests/r_qtl/test_r_qtl2_qc.py @@ -124,9 +124,12 @@ def test_missing_files(filepath, expected): ("tests/r_qtl/test_files/test_geno.zip", tuple()), ("tests/r_qtl/test_files/geno_with_missing_genotypes.zip", - (InvalidValue(1, "AXR-1", "X", "Invalid value 'X'. Expected one of ('L', 'C')."), - InvalidValue(2, "EC.480C", "Y", "Invalid value 'Y'. Expected one of ('L', 'C')."), - InvalidValue(6, "HH.335C-Col/PhyA", "H", f"Invalid value 'H'. Expected one of ('L', 'C')."))))) + (InvalidValue("geno", 1, "AXR-1", "X", + "Invalid value 'X'. Expected one of ('L', 'C')."), + InvalidValue("geno", 2, "EC.480C", + "Y", "Invalid value 'Y'. Expected one of ('L', 'C')."), + InvalidValue("geno", 6, "HH.335C-Col/PhyA", "H", + f"Invalid value 'H'. Expected one of ('L', 'C')."))))) def test_geno_errors(filepath, expected): """ GIVEN: A R/qtl2 bundle @@ -145,10 +148,12 @@ def test_geno_errors(filepath, expected): ("tests/r_qtl/test_files/pheno_without_errors.zip", tuple()), ("tests/r_qtl/test_files/pheno_with_errors.zip", - (InvalidValue(1, "liver", "61.92", ("Invalid value '61.92'. Expected numerical value " - "with at least 3 decimal places.")), - InvalidValue(2, "spleen", "brrr", ("Invalid value 'brrr'. Expected numerical value " - "with at least 3 decimal places.")))))) + (InvalidValue("pheno", 1, "liver", "61.92", ( + "Invalid value '61.92'. 
Expected numerical value " + "with at least 3 decimal places.")), + InvalidValue("pheno", 2, "spleen", "brrr", ( + "Invalid value 'brrr'. Expected numerical value " + "with at least 3 decimal places.")))))) def test_pheno_errors(filepath, expected): """ GIVEN: A R/qtl2 bundle |