Optimise strain names parsing

- Use a much faster method of parsing the strains file
author: Frederick Muriuki Muriithi 2022-05-02 13:04:03 +0300
committer: Frederick Muriuki Muriithi 2022-05-02 13:04:03 +0300
commit: fcade690de59249a2789c26e8f668f36f8f4e075 (patch)
tree: 73a9f8d40871e7942c4ae034eabf39855b6756ea
parent: 5632dcab27058875de99d63cbd263acfa3a9a2d5 (diff)
download: gn-uploader-fcade690de59249a2789c26e8f668f36f8f4e075.tar.gz
4 files changed, 22 insertions, 33 deletions
diff --git a/qc_app/parse.py b/qc_app/parse.py
index 795cc01..baad9a6 100644
--- a/qc_app/parse.py
+++ b/qc_app/parse.py
@@ -19,8 +19,7 @@ from quality_control.parsing import (
     FileType,
     parse_file,
     strain_names,
-    parse_errors,
-    parse_strains)
+    parse_errors)
 
 parsebp = Blueprint("parse", __name__)
 
@@ -34,8 +33,7 @@ def queued_parse(
         try:
             job_meta = jobs.update_meta(
                 dbconn, job_id, status = "in-progress", progress = 0)
-            parsed = parse_file(
-                filepath, filetype, strain_names(parse_strains(strainsfile)))
+            parsed = parse_file(filepath, filetype, strain_names(strainsfile))
             for line, curr_size in parsed:
                 job_meta = jobs.update_meta(
                     dbconn, job_id,
@@ -174,11 +172,10 @@ def queued_collect_errors(
             dbconn = sqlite3.connect(dbpath)
             job_meta = jobs.retrieve_meta(dbconn, job.get_id())
             for error in parse_errors(
-                    filepath, filetype, strain_names(parse_strains(strainsfile)),
+                    filepath, filetype, strain_names(strainsfile),
                     seek_pos):
                 count = count + 1
                 progress  = ((error["position"] / job_meta["filesize"]) * 100)
-                print(f"CURRENT PROGRESS: {progress}")
                 job_meta = jobs.update_meta(
                     dbconn, job_id, message = f"Collected {count} errors",
                     progress = progress)
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index 9fe88f1..436c90c 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -16,21 +16,6 @@ class FileType(Enum):
     AVERAGE = 1
     STANDARD_ERROR = 2
 
-def parse_strains(filepath):
-    """Parse the strains file"""
-    with open(filepath, encoding="utf8") as strains_file:
-        reader = csv.DictReader(
-            strains_file,
-            fieldnames=[
-                header.strip() for header
-                in strains_file.readline().split("\t")],
-            delimiter="\t")
-        for row in reader:
-            yield {
-                key: (value if value != "\\N" else None)
-                for key, value in row.items()
-            }
-
 def __parse_header(line, strains):
     return valid_header(
         set(strains),
@@ -47,13 +32,21 @@ LINE_PARSERS = {
     FileType.STANDARD_ERROR: __parse_standard_error_line
 }
 
-def strain_names(strains):
-    """Retrieve a complete list of the names of the strains"""
-    def __extract_strain_names(acc, strain):
-        return acc + tuple(
-            item for item in (strain["Name"], strain["Name2"])
-            if (item is not None and item != ""))
-    return reduce(__extract_strain_names, strains, tuple())
+def strain_names(filepath):
+    """Retrieve the strains names from given file"""
+    strains = set()
+    with open(filepath, encoding="utf8") as strains_file:
+        for idx, line in enumerate(strains_file.readlines()):
+            if idx > 0:
+                parts = line.split()
+                for name in (parts[1], parts[2]):
+                    strains.add(name.strip())
+                    if len(parts) >= 6:
+                        alias = parts[5].strip()
+                        if alias != "" and alias not in ("P", "\\N"):
+                            strains.add(alias)
+
+    return strains
 
 def parse_file(filepath: str, filetype: FileType, strains: list):
     """Parse the given file"""
diff --git a/scripts/qc.py b/scripts/qc.py
index 09758cb..9937e5b 100644
--- a/scripts/qc.py
+++ b/scripts/qc.py
@@ -10,8 +10,7 @@ from quality_control.parsing import (
     FileType,
     parse_file,
     strain_names,
-    parse_errors,
-    parse_strains)
+    parse_errors)
 
 
 def is_file_mime(filepath, mimetype):
@@ -93,7 +92,7 @@ def main():
     if args.verbose:
         print(f"Parsing the strain names from '{args.strainsfile}'")
 
-    strains = strain_names(parse_strains(os.path.realpath(args.strainsfile)))
+    strains = strain_names(os.path.realpath(args.strainsfile))
 
     filepath = os.path.realpath(args.filepath)
     if args.verbose:
diff --git a/tests/conftest.py b/tests/conftest.py
index f79166d..6ef5374 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,9 +2,9 @@
 
 import pytest
 
-from quality_control.parsing import strain_names, parse_strains
+from quality_control.parsing import strain_names
 
 @pytest.fixture(scope="session")
 def strains():
     """Parse the strains once every test session"""
-    return strain_names(parse_strains("strains.csv"))
+    return strain_names("etc/strains.csv")
author	Frederick Muriuki Muriithi	2022-05-02 13:04:03 +0300
committer	Frederick Muriuki Muriithi	2022-05-02 13:04:03 +0300
commit	fcade690de59249a2789c26e8f668f36f8f4e075 (patch)
tree	73a9f8d40871e7942c4ae034eabf39855b6756ea
parent	5632dcab27058875de99d63cbd263acfa3a9a2d5 (diff)
download	gn-uploader-fcade690de59249a2789c26e8f668f36f8f4e075.tar.gz