diff options
author | Frederick Muriuki Muriithi | 2022-06-28 15:20:54 +0300 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2022-06-28 15:20:54 +0300 |
commit | e68c807e6598a4087d7c83510ba33c81139f5544 (patch) | |
tree | 1c8d52c686e64cf8751f51d85bf8164ff8b9653c /quality_control | |
parent | 3eef6d582245c80e274c9e135028de90788a712b (diff) | |
download | gn-uploader-e68c807e6598a4087d7c83510ba33c81139f5544.tar.gz |
Check for inconsistent columns
The number of columns in each contents line should be equal to the number
of columns in the header line.
Diffstat (limited to 'quality_control')
-rw-r--r-- | quality_control/errors.py | 3 | ||||
-rw-r--r-- | quality_control/parsing.py | 21 |
2 files changed, 23 insertions, 1 deletions
diff --git a/quality_control/errors.py b/quality_control/errors.py index 678fe09..fff6c7c 100644 --- a/quality_control/errors.py +++ b/quality_control/errors.py @@ -7,3 +7,6 @@ InvalidValue = namedtuple( DuplicateHeading = namedtuple( "DuplicateHeading", ("line", "columns", "heading", "message")) + +InconsistentColumns = namedtuple( + "InconsistentColumns", ("line", "header_count", "contents_count", "message")) diff --git a/quality_control/parsing.py b/quality_control/parsing.py index ba22e0c..28a311e 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -7,7 +7,8 @@ from typing import Tuple, Union, Iterable, Generator, Callable, Optional import quality_control.average as avg import quality_control.standard_error as se -from quality_control.errors import InvalidValue, DuplicateHeading +from quality_control.errors import ( + InvalidValue, DuplicateHeading, InconsistentColumns) from quality_control.headers import ( invalid_header, invalid_headings, duplicate_headings) @@ -62,6 +63,19 @@ def se_errors(line_number, fields): se.invalid_value(line_number, *field) for field in enumerate(fields[1:], start=2))) +def make_column_consistency_checker(header_row): + """Build function to check for column consistency""" + headers = tuple(field.strip() for field in header_row.split("\t")) + def __checker__(line_number, contents_row): + contents = tuple(field.strip() for field in contents_row.split("\t")) + if len(contents) != len(headers): + return InconsistentColumns( + line_number, len(headers), len(contents), + (f"Header row has {len(headers)} columns while row " + f"{line_number} has {len(contents)} columns")) + return None + return __checker__ + def collect_errors( filepath: str, filetype: FileType, strains: list, update_progress: Optional[Callable] = None, @@ -94,12 +108,17 @@ def collect_errors( line = line.decode("utf-8") if line_number == 1: + consistent_columns_checker = make_column_consistency_checker(line) for error in __process_errors__( 
line_number, line, partial(header_errors, strains=strains), errors): yield error if line_number != 1: + col_consistency_error = consistent_columns_checker(line_number, line) + if col_consistency_error: + yield col_consistency_error + for error in __process_errors__( line_number, line, ( average_errors if filetype == FileType.AVERAGE |