Diffstat (limited to 'quality_control/parsing.py')
-rw-r--r--   quality_control/parsing.py   44
1 file changed, 25 insertions, 19 deletions
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index f7a664f..f1d21fc 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -1,6 +1,7 @@
 """Module handling the high-level parsing of the files"""
 import collections
 from enum import Enum
+from pathlib import Path
 from functools import partial
 from typing import Tuple, Union, Generator, Callable, Optional
 
@@ -30,44 +31,44 @@ def strain_names(dbconn: mdb.Connection, speciesid: int) -> tuple[str, ...]:
         lambda item: bool(item.strip() if item is not None else item),
         (name for names in samplenames for name in names))))
 
-def header_errors(line_number, fields, strains):
+def header_errors(filename, line_number, fields, strains):
     """Gather all header row errors."""
     return (
-        (invalid_header(line_number, fields),) +
-        invalid_headings(line_number, strains, fields[1:]) +
-        duplicate_headings(line_number, fields))
+        (invalid_header(filename, line_number, fields),) +
+        invalid_headings(filename, line_number, strains, fields[1:]) +
+        duplicate_headings(filename, line_number, fields))
 
-def empty_value(line_number, column_number, value):
+def empty_value(filename, line_number, column_number, value):
     """Check for empty field values."""
     if value == "":
-        return InvalidValue(
-            line_number, column_number, value, "Empty value for column")
+        return InvalidValue(filename, line_number, column_number, value,
+                            "Empty value for column")
     return None
 
-def average_errors(line_number, fields):
+def average_errors(filename, line_number, fields):
     """Gather all errors for a line in a averages file."""
     return (
-        (empty_value(line_number, 1, fields[0]),) +
+        (empty_value(filename, line_number, 1, fields[0]),) +
         tuple(
-            avg.invalid_value(line_number, *field)
+            avg.invalid_value(filename, line_number, *field)
             for field in enumerate(fields[1:], start=2)))
 
-def se_errors(line_number, fields):
+def se_errors(filename, line_number, fields):
     """Gather all errors for a line in a standard-errors file."""
     return (
-        (empty_value(line_number, 1, fields[0]),) +
+        (empty_value(filename, line_number, 1, fields[0]),) +
         tuple(
-            se.invalid_value(line_number, *field)
+            se.invalid_value(filename, line_number, *field)
             for field in enumerate(fields[1:], start=2)))
 
-def make_column_consistency_checker(header_row):
+def make_column_consistency_checker(filename, header_row):
     """Build function to check for column consistency"""
     headers = tuple(field.strip() for field in header_row.split("\t"))
     def __checker__(line_number, contents_row):
         contents = tuple(field.strip() for field in contents_row.split("\t"))
         if len(contents) != len(headers):
             return InconsistentColumns(
-                line_number, len(headers), len(contents),
+                filename, line_number, len(headers), len(contents),
                 (f"Header row has {len(headers)} columns while row "
                  f"{line_number} has {len(contents)} columns"))
         return None
@@ -79,8 +80,10 @@ def collect_errors(
         user_aborted: Callable = lambda: False) -> Generator:
     """Run checks against file and collect all the errors"""
     errors:Tuple[Union[InvalidValue, DuplicateHeading], ...] = tuple()
-    def __process_errors__(line_number, line, error_checker_fn, errors = tuple()):
+    def __process_errors__(
+            filename, line_number, line, error_checker_fn, errors = tuple()):
         errs = error_checker_fn(
+            filename,
             line_number,
             tuple(field.strip() for field in line.split("\t")))
         if errs is None:
@@ -90,6 +93,7 @@ def collect_errors(
         return errors + (errs,)
 
     with open_file(filepath) as input_file:
+        filename = Path(filepath).name
        for line_number, line in enumerate(input_file, start=1):
            if user_aborted():
                break
@@ -98,9 +102,11 @@ def collect_errors(
                line = line.decode("utf-8")
 
            if line_number == 1:
-                consistent_columns_checker = make_column_consistency_checker(line)
+                consistent_columns_checker = make_column_consistency_checker(
+                    filename, line)
                for error in __process_errors__(
-                        line_number, line, partial(header_errors, strains=strains),
+                        filename, line_number, line,
+                        partial(header_errors, strains=strains),
                        errors):
                    yield error
 
@@ -110,7 +116,7 @@ def collect_errors(
                    yield col_consistency_error
 
            for error in __process_errors__(
-                    line_number, line, (
+                    filename, line_number, line, (
                        average_errors if filetype == FileType.AVERAGE else se_errors),
                    errors):
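
For orientation, below is a minimal usage sketch of how the updated generator might be consumed after this change. The names collect_errors(), strain_names() and FileType.AVERAGE appear in the hunks above, but the argument order of collect_errors(), its importability as quality_control.parsing, and the helper report_errors() with its dbconn/speciesid/filepath arguments are illustrative assumptions rather than details taken from this diff.

# Hedged sketch -- the argument order of collect_errors() is assumed;
# only the names come from the diff above.
from quality_control.parsing import FileType, collect_errors, strain_names

def report_errors(dbconn, speciesid, filepath):
    """Print every QC error found in an averages file (hypothetical helper)."""
    strains = strain_names(dbconn, speciesid)
    for error in collect_errors(filepath, FileType.AVERAGE, strains):
        # With this change each reported error also carries the file's name
        # (Path(filepath).name) alongside the line and column details.
        print(error)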