aboutsummaryrefslogtreecommitdiff
path: root/quality_control/headers.py
blob: 436ea5a860b173aed40a4312d2e6737fea2caecc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""Validate the headers"""

from functools import reduce
from typing import Union, Tuple, Sequence

from quality_control.errors import InvalidValue, DuplicateHeading

def invalid_header(filename: str,
                   line_number: int,
                   headers: Sequence[str]) -> Union[InvalidValue, None]:
    """Check that the header row has enough columns.

    Returns `None` when `headers` holds 2 or more items. Otherwise returns an
    `InvalidValue` built from `filename`, `line_number`, column `0`, the
    headers joined with the literal text "<TAB>", and a message stating the
    2-column minimum."""
    # Guard clause: a header row with enough columns has nothing to report.
    if len(headers) >= 2:
        return None
    joined_headers = "<TAB>".join(headers)
    return InvalidValue(
        filename, line_number, 0, joined_headers,
        "The header MUST contain at least 2 columns")

def invalid_headings(
        filename: str, line_number: int, strains: Sequence[str],
        headings: Sequence[str]) -> Union[Tuple[InvalidValue, ...], None]:
    """Return a tuple with one `InvalidValue` object for each column heading
    that is not a known strain.

    Every item of `headings` is looked up in `strains`. Columns are numbered
    from 2, i.e. the first item of `headings` is reported as column 2.
    NOTE(review): presumably column 1 is an identifier column that the caller
    strips before calling this function -- confirm against the caller.

    The result is an empty tuple when every heading is a known strain. Despite
    the return annotation, this function never returns `None`."""
    # Build the lookup set once so each membership test is O(1) rather than a
    # linear scan of `strains` for every heading.
    known_strains = frozenset(strains)
    return tuple(
        InvalidValue(filename,
                     line_number,
                     col,
                     header,
                     f"'{header}' not a valid strain.")
        for col, header in enumerate(headings, start=2)
        if header not in known_strains)

def duplicate_headings(filename: str,
                       line_number: int,
                       headers: Sequence[str]) -> Tuple[DuplicateHeading, ...]:
    """Return a tuple of `DuplicateHeading` objects, one for each column heading
    that appears more than once in `headers`.

    Columns are numbered from 1. Each `DuplicateHeading` receives the tuple of
    all the columns in which its heading appears, in ascending order. The
    results follow the order in which each repeated heading first appears in
    `headers`. An empty tuple is returned when no heading is repeated."""
    # Collect the columns of every heading in a single pass. Appending to a
    # per-heading list avoids copying the whole mapping for every header, and
    # dicts keep insertion order, so first-appearance order is preserved.
    columns_by_heading = {}  # type: ignore[var-annotated]
    for column, heading in enumerate(headers, start=1):
        columns_by_heading.setdefault(heading, []).append(column)
    return tuple(
        DuplicateHeading(
            # The columns are handed over as a tuple.
            filename, line_number, tuple(columns), heading, (
                f"Heading '{heading}', is repeated in columns "
                f"{','.join(str(i) for i in columns)}"))
        for heading, columns in columns_by_heading.items()
        if len(columns) > 1)