From e68c807e6598a4087d7c83510ba33c81139f5544 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 28 Jun 2022 15:20:54 +0300 Subject: Check for inconsistent columns The number of columns in each contents line should be equal to the number of columns in the header line. --- qc_app/parse.py | 13 +++++++++---- qc_app/templates/errors_display.html | 8 ++++++-- quality_control/errors.py | 3 +++ quality_control/parsing.py | 21 ++++++++++++++++++++- scripts/qc.py | 6 ++++-- tests/qc/test_error_collection.py | 20 +++++++++++++++++++- 6 files changed, 61 insertions(+), 10 deletions(-) diff --git a/qc_app/parse.py b/qc_app/parse.py index a017b2c..5d75c37 100644 --- a/qc_app/parse.py +++ b/qc_app/parse.py @@ -6,11 +6,12 @@ from redis import Redis from flask import flash, request, url_for, redirect, Blueprint, render_template from flask import current_app as app -from quality_control.errors import InvalidValue +from quality_control.errors import InvalidValue, DuplicateHeading from . import jobs parsebp = Blueprint("parse", __name__) isinvalidvalue = lambda item: isinstance(item, InvalidValue) +isduplicateheading = lambda item: isinstance(item, DuplicateHeading) @parsebp.route("/parse", methods=["GET"]) def parse(): @@ -73,7 +74,9 @@ def parse_status(job_id: str): if status == "parse-error": return redirect(url_for("parse.fail", job_id=job_id)) - app.jinja_env.globals.update(isinvalidvalue=isinvalidvalue) + app.jinja_env.globals.update( + isinvalidvalue=isinvalidvalue, + isduplicateheading=isduplicateheading) return render_template( "job_progress.html", job_id = job_id, @@ -93,8 +96,10 @@ def results(job_id: str): if job: filename = job["filename"] - errors = jsonpickle.decode(job["errors"]) - app.jinja_env.globals.update(isinvalidvalue=isinvalidvalue) + errors = jsonpickle.decode(job.get("errors", jsonpickle.encode(tuple()))) + app.jinja_env.globals.update( + isinvalidvalue=isinvalidvalue, + isduplicateheading=isduplicateheading) return render_template( 
"parse_results.html", errors=errors, diff --git a/qc_app/templates/errors_display.html b/qc_app/templates/errors_display.html index 0c9a212..39144a9 100644 --- a/qc_app/templates/errors_display.html +++ b/qc_app/templates/errors_display.html @@ -22,15 +22,19 @@ {%if isinvalidvalue(error):%} {{error.column}} - {%else: %} + {%elif isduplicateheading(error): %} {{error.columns}} + {%else: %} + - {%endif %} {%if isinvalidvalue(error):%} Invalid Value - {%else: %} + {%elif isduplicateheading(error): %} Duplicate Header + {%else%} + Inconsistent Columns {%endif %} {{error["message"]}} diff --git a/quality_control/errors.py b/quality_control/errors.py index 678fe09..fff6c7c 100644 --- a/quality_control/errors.py +++ b/quality_control/errors.py @@ -7,3 +7,6 @@ InvalidValue = namedtuple( DuplicateHeading = namedtuple( "DuplicateHeading", ("line", "columns", "heading", "message")) + +InconsistentColumns = namedtuple( + "InconsistentColumns", ("line", "header_count", "contents_count", "message")) diff --git a/quality_control/parsing.py b/quality_control/parsing.py index ba22e0c..28a311e 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -7,7 +7,8 @@ from typing import Tuple, Union, Iterable, Generator, Callable, Optional import quality_control.average as avg import quality_control.standard_error as se -from quality_control.errors import InvalidValue, DuplicateHeading +from quality_control.errors import ( + InvalidValue, DuplicateHeading, InconsistentColumns) from quality_control.headers import ( invalid_header, invalid_headings, duplicate_headings) @@ -62,6 +63,19 @@ def se_errors(line_number, fields): se.invalid_value(line_number, *field) for field in enumerate(fields[1:], start=2))) +def make_column_consistency_checker(header_row): + """Build function to check for column consistency""" + headers = tuple(field.strip() for field in header_row.split("\t")) + def __checker__(line_number, contents_row): + contents = tuple(field.strip() for field in 
contents_row.split("\t")) + if len(contents) != len(headers): + return InconsistentColumns( + line_number, len(headers), len(contents), + (f"Header row has {len(headers)} columns while row " + f"{line_number} has {len(contents)} columns")) + return None + return __checker__ + def collect_errors( filepath: str, filetype: FileType, strains: list, update_progress: Optional[Callable] = None, @@ -94,12 +108,17 @@ def collect_errors( line = line.decode("utf-8") if line_number == 1: + consistent_columns_checker = make_column_consistency_checker(line) for error in __process_errors__( line_number, line, partial(header_errors, strains=strains), errors): yield error if line_number != 1: + col_consistency_error = consistent_columns_checker(line_number, line) + if col_consistency_error: + yield col_consistency_error + for error in __process_errors__( line_number, line, ( average_errors if filetype == FileType.AVERAGE diff --git a/scripts/qc.py b/scripts/qc.py index 9bad55e..de01bb7 100644 --- a/scripts/qc.py +++ b/scripts/qc.py @@ -6,8 +6,8 @@ from typing import Union, Callable import magic -from quality_control.errors import InvalidValue from quality_control.utils import make_progress_calculator +from quality_control.errors import InvalidValue, DuplicateHeading from quality_control.parsing import ( take, FileType, @@ -77,7 +77,9 @@ def print_errors(errors, verbose): for error in errors: cols = ( error.column if isinstance(error, InvalidValue) - else ", ".join(str(col) for col in error.columns)) + else (", ".join(str(col) for col in error.columns) + if isinstance(error, DuplicateHeading) + else "-")) errors_exist = True print(f"{starter}{error.line}\t{cols}\t{error.message}") diff --git a/tests/qc/test_error_collection.py b/tests/qc/test_error_collection.py index fe85bb1..ee1e6c2 100644 --- a/tests/qc/test_error_collection.py +++ b/tests/qc/test_error_collection.py @@ -2,8 +2,9 @@ import pytest -from quality_control.errors import InvalidValue, DuplicateHeading from 
quality_control.parsing import take, FileType, collect_errors +from quality_control.errors import ( + InvalidValue, DuplicateHeading, InconsistentColumns) @pytest.mark.parametrize( "sample,num,expected", @@ -34,3 +35,20 @@ def test_collect_errors(filepath, filetype, strains, count): def __valid_instance(item): return isinstance(item, (InvalidValue, DuplicateHeading)) assert all(__valid_instance(error) for error in results) + +@pytest.mark.parametrize( + "filepath,filetype,expected", + (("tests/test_data/average_inconsistent_columns.tsv", FileType.AVERAGE, + (InconsistentColumns( + 4, 4, 5, "Header row has 4 columns while row 4 has 5 columns"), + InconsistentColumns( + 5, 4, 3, "Header row has 4 columns while row 5 has 3 columns"), + InconsistentColumns( + 6, 4, 7, "Header row has 4 columns while row 6 has 7 columns"))),)) +def test_collect_inconsistent_column_errors(filepath, filetype, strains, expected): + """ + Given: A file with inconsistent columns in certain lines + When: collect_errors is run on the file + Then: All the lines with inconsistent columns are flagged + """ + assert tuple(collect_errors(filepath, filetype, strains)) == expected -- cgit v1.2.3