Implement remaining file parsing tests

* Implement the remaining file parsing tests and the helper functions needed to make the tests work.
author: Frederick Muriuki Muriithi 2022-04-13 15:11:17 +0300
committer: Frederick Muriuki Muriithi 2022-04-13 15:11:17 +0300
commit: 8954890bd7410d79ce151196f406b8e1b6985238 (patch)
tree: 4c1a6a44af9a518d13d6b3ba8813895fa4e068b6
parent: 6804b610266d0804dd5c391f5171943429c285cd (diff)
download: gn-uploader-8954890bd7410d79ce151196f406b8e1b6985238.tar.gz
4 files changed, 35 insertions, 21 deletions
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index b7b0ff5..8b2715a 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -1,6 +1,6 @@
 import csv
-
 from enum import Enum
+from functools import reduce
 
 import quality_control.average as avg
 import quality_control.standard_error as se
@@ -28,7 +28,7 @@ def parse_strains(filepath):
 
 def __parse_header(line, strains):
     return valid_header(
-        strains,
+        set(strains),
         tuple(header.strip() for header in line.split("\t")))
 
 def __parse_average_line(line):
@@ -42,16 +42,20 @@ LINE_PARSERS = {
     FileType.STANDARD_ERROR: __parse_standard_error_line
 }
 
-def parse_file(filepath: str, filetype: FileType, strains_filepath: str):
+def strain_names(strains):
+    def __extract_strain_names(acc, strain):
+        return acc + tuple(
+            item for item in (strain["Name"], strain["Name2"])
+            if (item is not None and item != ""))
+    return reduce(__extract_strain_names, strains, tuple())
+
+def parse_file(filepath: str, filetype: FileType, strains: list):
     seek_pos = 0
     try:
         with open(filepath, encoding="utf-8") as input_file:
             for line_number, line in enumerate(input_file):
                 if line_number == 0:
-                    yield __parse_header(
-                        line,
-                        tuple(strain["Name"] for strain
-                              in parse_strains(strains_filepath)))
+                    yield __parse_header(line, strains)
                     seek_pos = seek_pos + len(line)
 
                 yield LINE_PARSERS[filetype](
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..0cdba3e
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,9 @@
+from functools import reduce
+
+import pytest
+
+from quality_control.parsing import strain_names, parse_strains
+
+@pytest.fixture(scope="session")
+def strains():
+    return strain_names(parse_strains("strains.csv"))
diff --git a/tests/qc/test_parsing.py b/tests/qc/test_parsing.py
index 14cfbde..be13d9b 100644
--- a/tests/qc/test_parsing.py
+++ b/tests/qc/test_parsing.py
@@ -11,37 +11,38 @@ from quality_control.parsing import FileType, parse_file
      ("tests/test_data/average.tsv", FileType.STANDARD_ERROR),
      ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.AVERAGE),
      ("tests/test_data/standarderror.tsv", FileType.AVERAGE),
-     ("tests/test_data/we_found_no_errors_in_your_file.tsv",
+     ("tests/test_data/duplicated_headers_no_data_errors.tsv",
       FileType.STANDARD_ERROR),))
-def test_parse_file_fails_with_wrong_filetype_declaration(filepath, filetype):
+def test_parse_file_fails_with_wrong_filetype_declaration(filepath, filetype, strains):
     with pytest.raises(ParseError):
-        for line in parse_file(filepath, filetype, "strains.csv"):
+        for line in parse_file(filepath, filetype, strains):
             pass
 
 @pytest.mark.parametrize(
     "filepath,filetype",
     (("tests/test_data/average_crlf.tsv", FileType.AVERAGE),
      ("tests/test_data/average.tsv", FileType.AVERAGE),
-     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR),
-     ("tests/test_data/we_found_no_errors_in_your_file.tsv", FileType.AVERAGE)))
-def test_parse_file_passes_with_valid_files(filepath, filetype):
-    assert False, "Not Implemented"
+     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR)))
+def test_parse_file_passes_with_valid_files(filepath, filetype, strains):
+    for line in parse_file(filepath, filetype, strains):
+        assert bool(line)
 
 @pytest.mark.parametrize(
     "filepath,filetype",
     (("tests/test_data/average_large.tsv", FileType.AVERAGE),
      ("tests/test_data/average.tsv", FileType.AVERAGE),
-     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR),
-     ("tests/test_data/we_found_no_errors_in_your_file.tsv", FileType.AVERAGE)))
-def test_parse_file_works_with_large_files(filepath, filetype):
-    assert False, "Not Implemented"
+     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR)))
+def test_parse_file_works_with_large_files(filepath, filetype, strains):
+    for line in parse_file(filepath, filetype, strains):
+        assert bool(line)
 
 
 @pytest.mark.parametrize(
     "filepath,filetype",
     (("tests/test_data/average_error_at_end_200MB.tsv", FileType.AVERAGE),
-     ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.STANDARD_ERROR)))
-def test_parse_file_raises_exception_on_error_in_file(filepath, filetype):
+     ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.STANDARD_ERROR),
+     ("tests/test_data/duplicated_headers_no_data_errors.tsv", FileType.AVERAGE)))
+def test_parse_file_raises_exception_on_error_in_file(filepath, filetype, strains):
     with pytest.raises(ParseError):
-        for line in parse_file(filepath, filetype, "strains.csv"):
+        for line in parse_file(filepath, filetype, strains):
             pass
diff --git a/tests/test_data/we_found_no_errors_in_your_file.tsv b/tests/test_data/duplicated_headers_no_data_errors.tsv
index a49ed1b..a49ed1b 100644
--- a/tests/test_data/we_found_no_errors_in_your_file.tsv
+++ b/tests/test_data/duplicated_headers_no_data_errors.tsv
author	Frederick Muriuki Muriithi	2022-04-13 15:11:17 +0300
committer	Frederick Muriuki Muriithi	2022-04-13 15:11:17 +0300
commit	8954890bd7410d79ce151196f406b8e1b6985238 (patch)
tree	4c1a6a44af9a518d13d6b3ba8813895fa4e068b6
parent	6804b610266d0804dd5c391f5171943429c285cd (diff)
download	gn-uploader-8954890bd7410d79ce151196f406b8e1b6985238.tar.gz