about | summary | refs | log | tree | commit | diff
path: root/quality_control
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2022-06-28 15:20:54 +0300
committer: Frederick Muriuki Muriithi, 2022-06-28 15:20:54 +0300
commit: e68c807e6598a4087d7c83510ba33c81139f5544 (patch)
tree: 1c8d52c686e64cf8751f51d85bf8164ff8b9653c /quality_control
parent: 3eef6d582245c80e274c9e135028de90788a712b (diff)
download: gn-uploader-e68c807e6598a4087d7c83510ba33c81139f5544.tar.gz
Check for inconsistent columns
The number of columns in each contents line should be equal to the number of columns in the header line.
Diffstat (limited to 'quality_control')
-rw-r--r-- quality_control/errors.py 3
-rw-r--r-- quality_control/parsing.py 21
2 files changed, 23 insertions, 1 deletions
diff --git a/quality_control/errors.py b/quality_control/errors.py
index 678fe09..fff6c7c 100644
--- a/quality_control/errors.py
+++ b/quality_control/errors.py
@@ -7,3 +7,6 @@ InvalidValue = namedtuple(
DuplicateHeading = namedtuple(
"DuplicateHeading", ("line", "columns", "heading", "message"))
+
+InconsistentColumns = namedtuple(
+ "InconsistentColumns", ("line", "header_count", "contents_count", "message"))
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index ba22e0c..28a311e 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -7,7 +7,8 @@ from typing import Tuple, Union, Iterable, Generator, Callable, Optional
import quality_control.average as avg
import quality_control.standard_error as se
-from quality_control.errors import InvalidValue, DuplicateHeading
+from quality_control.errors import (
+ InvalidValue, DuplicateHeading, InconsistentColumns)
from quality_control.headers import (
invalid_header, invalid_headings, duplicate_headings)
@@ -62,6 +63,19 @@ def se_errors(line_number, fields):
se.invalid_value(line_number, *field)
for field in enumerate(fields[1:], start=2)))
+def make_column_consistency_checker(header_row):
+ """Build function to check for column consistency"""
+ headers = tuple(field.strip() for field in header_row.split("\t"))
+ def __checker__(line_number, contents_row):
+ contents = tuple(field.strip() for field in contents_row.split("\t"))
+ if len(contents) != len(headers):
+ return InconsistentColumns(
+ line_number, len(headers), len(contents),
+ (f"Header row has {len(headers)} columns while row "
+ f"{line_number} has {len(contents)} columns"))
+ return None
+ return __checker__
+
def collect_errors(
filepath: str, filetype: FileType, strains: list,
update_progress: Optional[Callable] = None,
@@ -94,12 +108,17 @@ def collect_errors(
line = line.decode("utf-8")
if line_number == 1:
+ consistent_columns_checker = make_column_consistency_checker(line)
for error in __process_errors__(
line_number, line, partial(header_errors, strains=strains),
errors):
yield error
if line_number != 1:
+ col_consistency_error = consistent_columns_checker(line_number, line)
+ if col_consistency_error:
+ yield col_consistency_error
+
for error in __process_errors__(
line_number, line, (
average_errors if filetype == FileType.AVERAGE