about summary refs log tree commit diff
path: root/quality_control/parsing.py
diff options
context:
space:
mode:
Diffstat (limited to 'quality_control/parsing.py')
-rw-r--r-- quality_control/parsing.py 44
1 files changed, 25 insertions, 19 deletions
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index f7a664f..f1d21fc 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -1,6 +1,7 @@
"""Module handling the high-level parsing of the files"""
import collections
from enum import Enum
+from pathlib import Path
from functools import partial
from typing import Tuple, Union, Generator, Callable, Optional
@@ -30,44 +31,44 @@ def strain_names(dbconn: mdb.Connection, speciesid: int) -> tuple[str, ...]:
lambda item: bool(item.strip() if item is not None else item),
(name for names in samplenames for name in names))))
-def header_errors(line_number, fields, strains):
+def header_errors(filename, line_number, fields, strains):
"""Gather all header row errors."""
return (
- (invalid_header(line_number, fields),) +
- invalid_headings(line_number, strains, fields[1:]) +
- duplicate_headings(line_number, fields))
+ (invalid_header(filename, line_number, fields),) +
+ invalid_headings(filename, line_number, strains, fields[1:]) +
+ duplicate_headings(filename, line_number, fields))
-def empty_value(line_number, column_number, value):
+def empty_value(filename, line_number, column_number, value):
"""Check for empty field values."""
if value == "":
- return InvalidValue(
- line_number, column_number, value, "Empty value for column")
+ return InvalidValue(filename, line_number, column_number, value,
+ "Empty value for column")
return None
-def average_errors(line_number, fields):
+def average_errors(filename, line_number, fields):
"""Gather all errors for a line in a averages file."""
return (
- (empty_value(line_number, 1, fields[0]),) +
+ (empty_value(filename, line_number, 1, fields[0]),) +
tuple(
- avg.invalid_value(line_number, *field)
+ avg.invalid_value(filename, line_number, *field)
for field in enumerate(fields[1:], start=2)))
-def se_errors(line_number, fields):
+def se_errors(filename, line_number, fields):
"""Gather all errors for a line in a standard-errors file."""
return (
- (empty_value(line_number, 1, fields[0]),) +
+ (empty_value(filename, line_number, 1, fields[0]),) +
tuple(
- se.invalid_value(line_number, *field)
+ se.invalid_value(filename, line_number, *field)
for field in enumerate(fields[1:], start=2)))
-def make_column_consistency_checker(header_row):
+def make_column_consistency_checker(filename, header_row):
"""Build function to check for column consistency"""
headers = tuple(field.strip() for field in header_row.split("\t"))
def __checker__(line_number, contents_row):
contents = tuple(field.strip() for field in contents_row.split("\t"))
if len(contents) != len(headers):
return InconsistentColumns(
- line_number, len(headers), len(contents),
+ filename, line_number, len(headers), len(contents),
(f"Header row has {len(headers)} columns while row "
f"{line_number} has {len(contents)} columns"))
return None
@@ -79,8 +80,10 @@ def collect_errors(
user_aborted: Callable = lambda: False) -> Generator:
"""Run checks against file and collect all the errors"""
errors:Tuple[Union[InvalidValue, DuplicateHeading], ...] = tuple()
- def __process_errors__(line_number, line, error_checker_fn, errors = tuple()):
+ def __process_errors__(
+ filename, line_number, line, error_checker_fn, errors = tuple()):
errs = error_checker_fn(
+ filename,
line_number,
tuple(field.strip() for field in line.split("\t")))
if errs is None:
@@ -90,6 +93,7 @@ def collect_errors(
return errors + (errs,)
with open_file(filepath) as input_file:
+ filename = Path(filepath).name
for line_number, line in enumerate(input_file, start=1):
if user_aborted():
break
@@ -98,9 +102,11 @@ def collect_errors(
line = line.decode("utf-8")
if line_number == 1:
- consistent_columns_checker = make_column_consistency_checker(line)
+ consistent_columns_checker = make_column_consistency_checker(
+ filename, line)
for error in __process_errors__(
- line_number, line, partial(header_errors, strains=strains),
+ filename, line_number, line,
+ partial(header_errors, strains=strains),
errors):
yield error
@@ -110,7 +116,7 @@ def collect_errors(
yield col_consistency_error
for error in __process_errors__(
- line_number, line, (
+ filename, line_number, line, (
average_errors if filetype == FileType.AVERAGE
else se_errors),
errors):