about summary refs log tree commit diff
path: root/quality_control
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2022-06-28 15:20:54 +0300
committerFrederick Muriuki Muriithi2022-06-28 15:20:54 +0300
commite68c807e6598a4087d7c83510ba33c81139f5544 (patch)
tree1c8d52c686e64cf8751f51d85bf8164ff8b9653c /quality_control
parent3eef6d582245c80e274c9e135028de90788a712b (diff)
downloadgn-uploader-e68c807e6598a4087d7c83510ba33c81139f5544.tar.gz
Check for inconsistent columns
The number of columns in each contents line should be equal to the number
of columns in the header line.
Diffstat (limited to 'quality_control')
-rw-r--r--quality_control/errors.py3
-rw-r--r--quality_control/parsing.py21
2 files changed, 23 insertions, 1 deletions
diff --git a/quality_control/errors.py b/quality_control/errors.py
index 678fe09..fff6c7c 100644
--- a/quality_control/errors.py
+++ b/quality_control/errors.py
@@ -7,3 +7,6 @@ InvalidValue = namedtuple(
 
 DuplicateHeading = namedtuple(
     "DuplicateHeading", ("line", "columns", "heading", "message"))
+
+InconsistentColumns = namedtuple(
+    "InconsistentColumns", ("line", "header_count", "contents_count", "message"))
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index ba22e0c..28a311e 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -7,7 +7,8 @@ from typing import Tuple, Union, Iterable, Generator, Callable, Optional
 
 import quality_control.average as avg
 import quality_control.standard_error as se
-from quality_control.errors import InvalidValue, DuplicateHeading
+from quality_control.errors import (
+    InvalidValue, DuplicateHeading, InconsistentColumns)
 from quality_control.headers import (
     invalid_header, invalid_headings, duplicate_headings)
 
@@ -62,6 +63,19 @@ def se_errors(line_number, fields):
             se.invalid_value(line_number, *field)
             for field in enumerate(fields[1:], start=2)))
 
+def make_column_consistency_checker(header_row):
+    """Build function to check for column consistency"""
+    headers = tuple(field.strip() for field in header_row.split("\t"))
+    def __checker__(line_number, contents_row):
+        contents = tuple(field.strip() for field in contents_row.split("\t"))
+        if len(contents) != len(headers):
+            return InconsistentColumns(
+                line_number, len(headers), len(contents),
+                (f"Header row has {len(headers)} columns while row "
+                 f"{line_number} has {len(contents)} columns"))
+        return None
+    return __checker__
+
 def collect_errors(
         filepath: str, filetype: FileType, strains: list,
         update_progress: Optional[Callable] = None,
@@ -94,12 +108,17 @@ def collect_errors(
                 line = line.decode("utf-8")
 
             if line_number == 1:
+                consistent_columns_checker = make_column_consistency_checker(line)
                 for error in __process_errors__(
                         line_number, line, partial(header_errors, strains=strains),
                         errors):
                     yield error
 
             if line_number != 1:
+                col_consistency_error = consistent_columns_checker(line_number, line)
+                if col_consistency_error:
+                    yield col_consistency_error
+
                 for error in __process_errors__(
                         line_number, line, (
                             average_errors if filetype == FileType.AVERAGE