aboutsummaryrefslogtreecommitdiff
path: root/scripts/qc.py
blob: 7d41d6cb5af85f636599d10b467e683d8a38c2ce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""Implements the command-line interface for the qc application"""
import os
import sys
import argparse
import mimetypes
from typing import Union, Callable

from functional_tools import take

from quality_control.utils import make_progress_calculator
from quality_control.errors import InvalidValue, DuplicateHeading
from quality_control.parsing import FileType, strain_names, collect_errors


def is_file_mime(filepath:str, mimetype:str) -> bool:
    """Tell whether `filepath` looks like `mimetype` (or plain text).

    The type is guessed from the file name via `mimetypes.guess_type`; the
    result is True when the guess is either `text/plain` or the requested
    `mimetype`, and False otherwise.
    """
    guessed, _encoding = mimetypes.guess_type(filepath)
    return guessed == "text/plain" or guessed == mimetype

def cli_argument_parser():
    """Create the parser for the CLI arguments

    The returned `argparse.ArgumentParser` accepts:

    * `filetype`: positional, one of "average" or "standard-error"
    * `filepath`: positional, path to the file to check
    * `-s/--strainsfile`: path to the strains file; defaults to
      `etc/strains.csv` under the parent of this file's directory
    * `-c/--count`: integer number of errors to display (default 20)
    * `-v/--verbose`: flag, off by default
    """
    def _help_literal(text):
        # argparse applies %-formatting to help strings, so a literal '%'
        # inside an interpolated path must be doubled to survive `--help`.
        return text.replace("%", "%%")

    parser = argparse.ArgumentParser(
        prog="qc", description = (
            "Command-Line Interface program for quality control of data files"))
    parser.add_argument(
        "filetype",
        help="The type of file to check",
        choices=("average", "standard-error"))
    parser.add_argument(
        "filepath",
        help=(
            # Each fragment ends with a space: adjacent string literals are
            # concatenated verbatim, with no separator added.
            "The path to the file to be checked. "
            "If an absolute path is not provided, then the file will be "
            "relative to "
            f"'{_help_literal(os.getcwd())}'"))
    default_strains_file = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "etc/strains.csv")
    parser.add_argument(
        "-s", "--strainsfile",
        help=(
            "Path to the file containing allowable strains/samples. "
            f"[default '{_help_literal(default_strains_file)}']"),
        default=default_strains_file)

    parser.add_argument(
        "-c", "--count", type=int,
        help=(
            "Number of errors to display. "
            "A negative number means display all errors."),
        default=20)

    parser.add_argument(
        "-v", "--verbose",
        help="Controls whether to show extra output",
        default=False, action="store_true")
    return parser

def make_progress_indicator(
        verbose: bool, progress_calc_fn: Callable) -> Union[Callable, None]:
    """Build a callback that prints parsing progress, or None when quiet.

    When `verbose` is true, the returned callable takes a line number and the
    line's text, passes both to `progress_calc_fn`, prints the resulting
    `currentline` and `percent` on a single self-overwriting line, and
    returns whatever `progress_calc_fn` produced. When `verbose` is false,
    None is returned instead of a callable.
    """
    def __indicator__(linenumber, linetext):
        progress = progress_calc_fn(linenumber, linetext)
        line, percent = progress.currentline, progress.percent
        # The trailing carriage return keeps the cursor on the same line so
        # the next update overwrites this one.
        print(f"LINE: {line} ({percent:.2f}%)", end="\r")
        return progress

    return __indicator__ if verbose else None

def print_errors(errors, verbose):
    """Write a tab-separated report of `errors` to standard output.

    A header row is printed first, then one row per error holding its line,
    its column(s) and its message. `InvalidValue` errors report their single
    `column`, `DuplicateHeading` errors report their `columns` joined with
    ", ", and any other error reports "-". If iteration yields nothing, a
    "No errors were found!" notice follows the header.

    In verbose mode the header and every row are preceded by a newline.

    The `errors` object is returned as received; if it is a one-shot
    iterator it has been consumed by the time this function returns.
    """
    prefix = "\n" if verbose else ""
    print(f"{prefix}line(s)\tcolumn(s)\terrors")

    found_any = False
    for err in errors:
        if isinstance(err, InvalidValue):
            columns = err.column
        elif isinstance(err, DuplicateHeading):
            columns = ", ".join(str(col) for col in err.columns)
        else:
            columns = "-"
        found_any = True
        print(f"{prefix}{err.line}\t{columns}\t{err.message}")

    if found_any:
        return errors

    print("No errors were found!")
    return errors

def check(filepath, filetype, strains, count, verbose=False):
    """Run the quality-control checks on `filepath` and report the errors.

    A progress callback is built from the file's size in bytes (it is None
    unless `verbose` is set) and handed to `collect_errors` together with
    `filepath`, `filetype` and `strains`. When `count` is positive only the
    first `count` errors are kept via `take`; otherwise every error is
    reported. The result of `print_errors` is returned.
    """
    file_size = os.stat(filepath).st_size
    progress_fn = make_progress_indicator(
        verbose, make_progress_calculator(file_size))

    found = collect_errors(filepath, filetype, strains, progress_fn)
    reported = take(found, count) if count > 0 else found
    return print_errors(reported, verbose)

def main():
    """Entry point function

    Parses the command-line arguments, validates both input files and then
    runs the checks.

    Returns:
        1 if the file to check does not exist,
        2 if the strains file does not exist,
        3 if the file to check is not recognised as tab-separated (or plain
          text),
        4 if the strains file is not recognised as CSV (or plain text),
        otherwise whatever `check` returns.
    """
    argparser = cli_argument_parser()
    args = argparser.parse_args()
    if not os.path.exists(args.filepath):
        print(f"The file '{args.filepath}' does not exist.", file=sys.stderr)
        return 1

    if not os.path.exists(args.strainsfile):
        print(f"The file '{args.strainsfile}' does not exist.", file=sys.stderr)
        return 2

    if not is_file_mime(args.filepath, "text/tab-separated-values"):
        print(
            f"The file '{args.filepath}' MUST be a tab-separated file.",
            file=sys.stderr)
        return 3

    # The strains file is validated against `text/csv`, so the message must
    # ask for a comma-separated file rather than a tab-separated one.
    if not is_file_mime(args.strainsfile, "text/csv"):
        print(
            f"The file '{args.strainsfile}' MUST be a comma-separated file.",
            file=sys.stderr)
        return 4

    if args.verbose:
        print(f"Parsing the strain names from '{args.strainsfile}'")

    strains = strain_names(os.path.realpath(args.strainsfile))

    filepath = os.path.realpath(args.filepath)
    if args.verbose:
        print(f"Checking '{filepath}' for errors")

    return check(
        filepath, (
            FileType.AVERAGE if args.filetype == "average"
            else FileType.STANDARD_ERROR),
        strains, args.count, verbose=args.verbose)

# Run the CLI only when this file is executed as a script, not on import.
# NOTE(review): the value returned by main() is discarded here, so the numeric
# codes it returns on failure are not passed on as the process exit status.
if __name__ == "__main__":
    main()