From fcade690de59249a2789c26e8f668f36f8f4e075 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 2 May 2022 13:04:03 +0300 Subject: Optimise strain names parsing - Use a way faster way of parsing the strains file --- quality_control/parsing.py | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'quality_control') diff --git a/quality_control/parsing.py b/quality_control/parsing.py index 9fe88f1..436c90c 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -16,21 +16,6 @@ class FileType(Enum): AVERAGE = 1 STANDARD_ERROR = 2 -def parse_strains(filepath): - """Parse the strains file""" - with open(filepath, encoding="utf8") as strains_file: - reader = csv.DictReader( - strains_file, - fieldnames=[ - header.strip() for header - in strains_file.readline().split("\t")], - delimiter="\t") - for row in reader: - yield { - key: (value if value != "\\N" else None) - for key, value in row.items() - } - def __parse_header(line, strains): return valid_header( set(strains), @@ -47,13 +32,21 @@ LINE_PARSERS = { FileType.STANDARD_ERROR: __parse_standard_error_line } -def strain_names(strains): - """Retrieve a complete list of the names of the strains""" - def __extract_strain_names(acc, strain): - return acc + tuple( - item for item in (strain["Name"], strain["Name2"]) - if (item is not None and item != "")) - return reduce(__extract_strain_names, strains, tuple()) +def strain_names(filepath): + """Retrieve the strains names from given file""" + strains = set() + with open(filepath, encoding="utf8") as strains_file: + for idx, line in enumerate(strains_file.readlines()): + if idx > 0: + parts = line.split() + for name in (parts[1], parts[2]): + strains.add(name.strip()) + if len(parts) >= 6: + alias = parts[5].strip() + if alias != "" and alias not in ("P", "\\N"): + strains.add(alias) + + return strains def parse_file(filepath: str, filetype: FileType, strains: list): """Parse the given file""" -- cgit v1.2.3