aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2022-06-28 15:20:54 +0300
committerFrederick Muriuki Muriithi2022-06-28 15:20:54 +0300
commite68c807e6598a4087d7c83510ba33c81139f5544 (patch)
tree1c8d52c686e64cf8751f51d85bf8164ff8b9653c
parent3eef6d582245c80e274c9e135028de90788a712b (diff)
downloadgn-uploader-e68c807e6598a4087d7c83510ba33c81139f5544.tar.gz
Check for inconsistent columns
The number of columns in each contents line should be equal to the number of columns in the header line.
-rw-r--r--qc_app/parse.py13
-rw-r--r--qc_app/templates/errors_display.html8
-rw-r--r--quality_control/errors.py3
-rw-r--r--quality_control/parsing.py21
-rw-r--r--scripts/qc.py6
-rw-r--r--tests/qc/test_error_collection.py20
6 files changed, 61 insertions, 10 deletions
diff --git a/qc_app/parse.py b/qc_app/parse.py
index a017b2c..5d75c37 100644
--- a/qc_app/parse.py
+++ b/qc_app/parse.py
@@ -6,11 +6,12 @@ from redis import Redis
from flask import flash, request, url_for, redirect, Blueprint, render_template
from flask import current_app as app
-from quality_control.errors import InvalidValue
+from quality_control.errors import InvalidValue, DuplicateHeading
from . import jobs
parsebp = Blueprint("parse", __name__)
isinvalidvalue = lambda item: isinstance(item, InvalidValue)
+isduplicateheading = lambda item: isinstance(item, DuplicateHeading)
@parsebp.route("/parse", methods=["GET"])
def parse():
@@ -73,7 +74,9 @@ def parse_status(job_id: str):
if status == "parse-error":
return redirect(url_for("parse.fail", job_id=job_id))
- app.jinja_env.globals.update(isinvalidvalue=isinvalidvalue)
+ app.jinja_env.globals.update(
+ isinvalidvalue=isinvalidvalue,
+ isduplicateheading=isduplicateheading)
return render_template(
"job_progress.html",
job_id = job_id,
@@ -93,8 +96,10 @@ def results(job_id: str):
if job:
filename = job["filename"]
- errors = jsonpickle.decode(job["errors"])
- app.jinja_env.globals.update(isinvalidvalue=isinvalidvalue)
+ errors = jsonpickle.decode(job.get("errors", jsonpickle.encode(tuple())))
+ app.jinja_env.globals.update(
+ isinvalidvalue=isinvalidvalue,
+ isduplicateheading=isduplicateheading)
return render_template(
"parse_results.html",
errors=errors,
diff --git a/qc_app/templates/errors_display.html b/qc_app/templates/errors_display.html
index 0c9a212..39144a9 100644
--- a/qc_app/templates/errors_display.html
+++ b/qc_app/templates/errors_display.html
@@ -22,15 +22,19 @@
<td>
{%if isinvalidvalue(error):%}
{{error.column}}
- {%else: %}
+ {%elif isduplicateheading(error): %}
{{error.columns}}
+ {%else: %}
+ -
{%endif %}
</td>
<td>
{%if isinvalidvalue(error):%}
Invalid Value
- {%else: %}
+ {%elif isduplicateheading(error): %}
Duplicate Header
+ {%else%}
+ Inconsistent Columns
{%endif %}
</td>
<td>{{error["message"]}}</td>
diff --git a/quality_control/errors.py b/quality_control/errors.py
index 678fe09..fff6c7c 100644
--- a/quality_control/errors.py
+++ b/quality_control/errors.py
@@ -7,3 +7,6 @@ InvalidValue = namedtuple(
DuplicateHeading = namedtuple(
"DuplicateHeading", ("line", "columns", "heading", "message"))
+
+InconsistentColumns = namedtuple(
+ "InconsistentColumns", ("line", "header_count", "contents_count", "message"))
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index ba22e0c..28a311e 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -7,7 +7,8 @@ from typing import Tuple, Union, Iterable, Generator, Callable, Optional
import quality_control.average as avg
import quality_control.standard_error as se
-from quality_control.errors import InvalidValue, DuplicateHeading
+from quality_control.errors import (
+ InvalidValue, DuplicateHeading, InconsistentColumns)
from quality_control.headers import (
invalid_header, invalid_headings, duplicate_headings)
@@ -62,6 +63,19 @@ def se_errors(line_number, fields):
se.invalid_value(line_number, *field)
for field in enumerate(fields[1:], start=2)))
+def make_column_consistency_checker(header_row):
+ """Build function to check for column consistency"""
+ headers = tuple(field.strip() for field in header_row.split("\t"))
+ def __checker__(line_number, contents_row):
+ contents = tuple(field.strip() for field in contents_row.split("\t"))
+ if len(contents) != len(headers):
+ return InconsistentColumns(
+ line_number, len(headers), len(contents),
+ (f"Header row has {len(headers)} columns while row "
+ f"{line_number} has {len(contents)} columns"))
+ return None
+ return __checker__
+
def collect_errors(
filepath: str, filetype: FileType, strains: list,
update_progress: Optional[Callable] = None,
@@ -94,12 +108,17 @@ def collect_errors(
line = line.decode("utf-8")
if line_number == 1:
+ consistent_columns_checker = make_column_consistency_checker(line)
for error in __process_errors__(
line_number, line, partial(header_errors, strains=strains),
errors):
yield error
if line_number != 1:
+ col_consistency_error = consistent_columns_checker(line_number, line)
+ if col_consistency_error:
+ yield col_consistency_error
+
for error in __process_errors__(
line_number, line, (
average_errors if filetype == FileType.AVERAGE
diff --git a/scripts/qc.py b/scripts/qc.py
index 9bad55e..de01bb7 100644
--- a/scripts/qc.py
+++ b/scripts/qc.py
@@ -6,8 +6,8 @@ from typing import Union, Callable
import magic
-from quality_control.errors import InvalidValue
from quality_control.utils import make_progress_calculator
+from quality_control.errors import InvalidValue, DuplicateHeading
from quality_control.parsing import (
take,
FileType,
@@ -77,7 +77,9 @@ def print_errors(errors, verbose):
for error in errors:
cols = (
error.column if isinstance(error, InvalidValue)
- else ", ".join(str(col) for col in error.columns))
+ else (", ".join(str(col) for col in error.columns)
+ if isinstance(error, DuplicateHeading)
+ else "-"))
errors_exist = True
print(f"{starter}{error.line}\t{cols}\t{error.message}")
diff --git a/tests/qc/test_error_collection.py b/tests/qc/test_error_collection.py
index fe85bb1..ee1e6c2 100644
--- a/tests/qc/test_error_collection.py
+++ b/tests/qc/test_error_collection.py
@@ -2,8 +2,9 @@
import pytest
-from quality_control.errors import InvalidValue, DuplicateHeading
from quality_control.parsing import take, FileType, collect_errors
+from quality_control.errors import (
+ InvalidValue, DuplicateHeading, InconsistentColumns)
@pytest.mark.parametrize(
"sample,num,expected",
@@ -34,3 +35,20 @@ def test_collect_errors(filepath, filetype, strains, count):
def __valid_instance(item):
return isinstance(item, (InvalidValue, DuplicateHeading))
assert all(__valid_instance(error) for error in results)
+
+@pytest.mark.parametrize(
+ "filepath,filetype,expected",
+ (("tests/test_data/average_inconsistent_columns.tsv", FileType.AVERAGE,
+ (InconsistentColumns(
+ 4, 4, 5, "Header row has 4 columns while row 4 has 5 columns"),
+ InconsistentColumns(
+ 5, 4, 3, "Header row has 4 columns while row 5 has 3 columns"),
+ InconsistentColumns(
+ 6, 4, 7, "Header row has 4 columns while row 6 has 7 columns"))),))
+def test_collect_inconsistent_column_errors(filepath, filetype, strains, expected):
+ """
+ Given: A file with inconsistent columns in certain lines
+ When: collect_errors is run on the file
+ Then: All the lines with inconsistent columns are flagged
+ """
+ assert tuple(collect_errors(filepath, filetype, strains)) == expected