diff options
author | Frederick Muriuki Muriithi | 2022-05-02 13:04:03 +0300 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2022-05-02 13:04:03 +0300 |
commit | fcade690de59249a2789c26e8f668f36f8f4e075 (patch) | |
tree | 73a9f8d40871e7942c4ae034eabf39855b6756ea | |
parent | 5632dcab27058875de99d63cbd263acfa3a9a2d5 (diff) | |
download | gn-uploader-fcade690de59249a2789c26e8f668f36f8f4e075.tar.gz |
Optimise strain names parsing
- Use a much faster method of parsing the strains file
-rw-r--r-- | qc_app/parse.py | 9 | ||||
-rw-r--r-- | quality_control/parsing.py | 37 | ||||
-rw-r--r-- | scripts/qc.py | 5 | ||||
-rw-r--r-- | tests/conftest.py | 4 |
4 files changed, 22 insertions, 33 deletions
diff --git a/qc_app/parse.py b/qc_app/parse.py index 795cc01..baad9a6 100644 --- a/qc_app/parse.py +++ b/qc_app/parse.py @@ -19,8 +19,7 @@ from quality_control.parsing import ( FileType, parse_file, strain_names, - parse_errors, - parse_strains) + parse_errors) parsebp = Blueprint("parse", __name__) @@ -34,8 +33,7 @@ def queued_parse( try: job_meta = jobs.update_meta( dbconn, job_id, status = "in-progress", progress = 0) - parsed = parse_file( - filepath, filetype, strain_names(parse_strains(strainsfile))) + parsed = parse_file(filepath, filetype, strain_names(strainsfile)) for line, curr_size in parsed: job_meta = jobs.update_meta( dbconn, job_id, @@ -174,11 +172,10 @@ def queued_collect_errors( dbconn = sqlite3.connect(dbpath) job_meta = jobs.retrieve_meta(dbconn, job.get_id()) for error in parse_errors( - filepath, filetype, strain_names(parse_strains(strainsfile)), + filepath, filetype, strain_names(strainsfile), seek_pos): count = count + 1 progress = ((error["position"] / job_meta["filesize"]) * 100) - print(f"CURRENT PROGRESS: {progress}") job_meta = jobs.update_meta( dbconn, job_id, message = f"Collected {count} errors", progress = progress) diff --git a/quality_control/parsing.py b/quality_control/parsing.py index 9fe88f1..436c90c 100644 --- a/quality_control/parsing.py +++ b/quality_control/parsing.py @@ -16,21 +16,6 @@ class FileType(Enum): AVERAGE = 1 STANDARD_ERROR = 2 -def parse_strains(filepath): - """Parse the strains file""" - with open(filepath, encoding="utf8") as strains_file: - reader = csv.DictReader( - strains_file, - fieldnames=[ - header.strip() for header - in strains_file.readline().split("\t")], - delimiter="\t") - for row in reader: - yield { - key: (value if value != "\\N" else None) - for key, value in row.items() - } - def __parse_header(line, strains): return valid_header( set(strains), @@ -47,13 +32,21 @@ LINE_PARSERS = { FileType.STANDARD_ERROR: __parse_standard_error_line } -def strain_names(strains): - """Retrieve a complete 
list of the names of the strains""" - def __extract_strain_names(acc, strain): - return acc + tuple( - item for item in (strain["Name"], strain["Name2"]) - if (item is not None and item != "")) - return reduce(__extract_strain_names, strains, tuple()) +def strain_names(filepath): + """Retrieve the strains names from given file""" + strains = set() + with open(filepath, encoding="utf8") as strains_file: + for idx, line in enumerate(strains_file.readlines()): + if idx > 0: + parts = line.split() + for name in (parts[1], parts[2]): + strains.add(name.strip()) + if len(parts) >= 6: + alias = parts[5].strip() + if alias != "" and alias not in ("P", "\\N"): + strains.add(alias) + + return strains def parse_file(filepath: str, filetype: FileType, strains: list): """Parse the given file""" diff --git a/scripts/qc.py b/scripts/qc.py index 09758cb..9937e5b 100644 --- a/scripts/qc.py +++ b/scripts/qc.py @@ -10,8 +10,7 @@ from quality_control.parsing import ( FileType, parse_file, strain_names, - parse_errors, - parse_strains) + parse_errors) def is_file_mime(filepath, mimetype): @@ -93,7 +92,7 @@ def main(): if args.verbose: print(f"Parsing the strain names from '{args.strainsfile}'") - strains = strain_names(parse_strains(os.path.realpath(args.strainsfile))) + strains = strain_names(os.path.realpath(args.strainsfile)) filepath = os.path.realpath(args.filepath) if args.verbose: diff --git a/tests/conftest.py b/tests/conftest.py index f79166d..6ef5374 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,9 +2,9 @@ import pytest -from quality_control.parsing import strain_names, parse_strains +from quality_control.parsing import strain_names @pytest.fixture(scope="session") def strains(): """Parse the strains once every test session""" - return strain_names(parse_strains("strains.csv")) + return strain_names("etc/strains.csv") |