From 8954890bd7410d79ce151196f406b8e1b6985238 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Wed, 13 Apr 2022 15:11:17 +0300
Subject: Implement remaining file parsing tests

* Implement the remaining file parsing tests and the helper functions
  needed to make the tests work.
---
 quality_control/parsing.py                         | 18 ++++++++------
 tests/conftest.py                                  |  9 +++++++
 tests/qc/test_parsing.py                           | 29 +++++++++++-----------
 .../duplicated_headers_no_data_errors.tsv          |  2 ++
 .../test_data/we_found_no_errors_in_your_file.tsv  |  2 --
 5 files changed, 37 insertions(+), 23 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_data/duplicated_headers_no_data_errors.tsv
 delete mode 100644 tests/test_data/we_found_no_errors_in_your_file.tsv

diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index b7b0ff5..8b2715a 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -1,6 +1,6 @@
 import csv
-
 from enum import Enum
+from functools import reduce
 
 import quality_control.average as avg
 import quality_control.standard_error as se
@@ -28,7 +28,7 @@ def parse_strains(filepath):
 
 def __parse_header(line, strains):
     return valid_header(
-        strains,
+        set(strains),
         tuple(header.strip() for header in line.split("\t")))
 
 def __parse_average_line(line):
@@ -42,16 +42,20 @@ LINE_PARSERS = {
     FileType.STANDARD_ERROR: __parse_standard_error_line
 }
 
-def parse_file(filepath: str, filetype: FileType, strains_filepath: str):
+def strain_names(strains):
+    def __extract_strain_names(acc, strain):
+        return acc + tuple(
+            item for item in (strain["Name"], strain["Name2"])
+            if (item is not None and item != ""))
+    return reduce(__extract_strain_names, strains, tuple())
+
+def parse_file(filepath: str, filetype: FileType, strains: list):
     seek_pos = 0
     try:
         with open(filepath, encoding="utf-8") as input_file:
             for line_number, line in enumerate(input_file):
                 if line_number == 0:
-                    yield __parse_header(
-                        line,
-                        tuple(strain["Name"] for strain
-                              in parse_strains(strains_filepath)))
+                    yield __parse_header(line, strains)
                     seek_pos = seek_pos + len(line)
 
                 yield LINE_PARSERS[filetype](
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..0cdba3e
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,9 @@
+from functools import reduce
+
+import pytest
+
+from quality_control.parsing import strain_names, parse_strains
+
+@pytest.fixture(scope="session")
+def strains():
+    return strain_names(parse_strains("strains.csv"))
diff --git a/tests/qc/test_parsing.py b/tests/qc/test_parsing.py
index 14cfbde..be13d9b 100644
--- a/tests/qc/test_parsing.py
+++ b/tests/qc/test_parsing.py
@@ -11,37 +11,38 @@ from quality_control.parsing import FileType, parse_file
      ("tests/test_data/average.tsv", FileType.STANDARD_ERROR),
      ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.AVERAGE),
      ("tests/test_data/standarderror.tsv", FileType.AVERAGE),
-     ("tests/test_data/we_found_no_errors_in_your_file.tsv",
+     ("tests/test_data/duplicated_headers_no_data_errors.tsv",
       FileType.STANDARD_ERROR),))
-def test_parse_file_fails_with_wrong_filetype_declaration(filepath, filetype):
+def test_parse_file_fails_with_wrong_filetype_declaration(filepath, filetype, strains):
     with pytest.raises(ParseError):
-        for line in parse_file(filepath, filetype, "strains.csv"):
+        for line in parse_file(filepath, filetype, strains):
             pass
 
 @pytest.mark.parametrize(
     "filepath,filetype",
     (("tests/test_data/average_crlf.tsv", FileType.AVERAGE),
      ("tests/test_data/average.tsv", FileType.AVERAGE),
-     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR),
-     ("tests/test_data/we_found_no_errors_in_your_file.tsv", FileType.AVERAGE)))
-def test_parse_file_passes_with_valid_files(filepath, filetype):
-    assert False, "Not Implemented"
+     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR)))
+def test_parse_file_passes_with_valid_files(filepath, filetype, strains):
+    for line in parse_file(filepath, filetype, strains):
+        assert bool(line)
 
 @pytest.mark.parametrize(
     "filepath,filetype",
     (("tests/test_data/average_large.tsv", FileType.AVERAGE),
      ("tests/test_data/average.tsv", FileType.AVERAGE),
-     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR),
-     ("tests/test_data/we_found_no_errors_in_your_file.tsv", FileType.AVERAGE)))
-def test_parse_file_works_with_large_files(filepath, filetype):
-    assert False, "Not Implemented"
+     ("tests/test_data/standarderror.tsv", FileType.STANDARD_ERROR)))
+def test_parse_file_works_with_large_files(filepath, filetype, strains):
+    for line in parse_file(filepath, filetype, strains):
+        assert bool(line)
 
 
 @pytest.mark.parametrize(
     "filepath,filetype",
     (("tests/test_data/average_error_at_end_200MB.tsv", FileType.AVERAGE),
-     ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.STANDARD_ERROR)))
-def test_parse_file_raises_exception_on_error_in_file(filepath, filetype):
+     ("tests/test_data/standarderror_1_error_at_end.tsv", FileType.STANDARD_ERROR),
+     ("tests/test_data/duplicated_headers_no_data_errors.tsv", FileType.AVERAGE)))
+def test_parse_file_raises_exception_on_error_in_file(filepath, filetype, strains):
     with pytest.raises(ParseError):
-        for line in parse_file(filepath, filetype, "strains.csv"):
+        for line in parse_file(filepath, filetype, strains):
             pass
diff --git a/tests/test_data/duplicated_headers_no_data_errors.tsv b/tests/test_data/duplicated_headers_no_data_errors.tsv
new file mode 100644
index 0000000..a49ed1b
--- /dev/null
+++ b/tests/test_data/duplicated_headers_no_data_errors.tsv
@@ -0,0 +1,2 @@
+ProbeSetID	BXD95	BXD27	BXD98	BXD99	BXD39	BXD33	BXD45	BXD51	BXD56	BXD76	BXD56	BXD12	BXD83	DBA/2J	BXD87	BXD81	BXD86	BXD84	BXD85	BXD40	BXD8	BXD79	BXD77	BXD70	BXD71	BXD73	BXD68	BXD67	BXD66	BXD63	BXD64	BXD65	BXD48	BXD50	BXD55	BXD6	BXD100	BXD11	BXD24	BXD18	BXD21	BXD20	BXD14	BXD16	BXD44	BXD38	BXD42	BXD43	BXD28	BXD25	BXD2	BXD23	BXD29	BXD22	C57BL/6J	D2B6F1	BXD19	BXD90	BXD90	BXD1	BXD101	BXD90	BXD62	BXD9	BXD89	BXD96	BXD69
+10608724	6.356	6.532	6.515	6.563	6.471	6.472	6.632	6.564	6.601	6.584	6.342	6.542	6.639	6.343	6.468	6.367	6.555	6.635	6.443	6.531	6.514	6.582	6.544	6.504	6.387	6.489	6.726	6.556	6.462	6.594	6.604	6.629	6.547	6.648	6.493	6.451	6.672	6.601	6.689	6.731	6.769	6.633	6.425	6.612	6.486	6.452	6.557	6.343	6.345	6.345	6.346	6.339	6.539	6.554	6.481	6.524	6.382	6.491	6.352	6.556	6.636	6.574	6.334	6.744	6.427	6.405	6.493
\ No newline at end of file
diff --git a/tests/test_data/we_found_no_errors_in_your_file.tsv b/tests/test_data/we_found_no_errors_in_your_file.tsv
deleted file mode 100644
index a49ed1b..0000000
--- a/tests/test_data/we_found_no_errors_in_your_file.tsv
+++ /dev/null
@@ -1,2 +0,0 @@
-ProbeSetID	BXD95	BXD27	BXD98	BXD99	BXD39	BXD33	BXD45	BXD51	BXD56	BXD76	BXD56	BXD12	BXD83	DBA/2J	BXD87	BXD81	BXD86	BXD84	BXD85	BXD40	BXD8	BXD79	BXD77	BXD70	BXD71	BXD73	BXD68	BXD67	BXD66	BXD63	BXD64	BXD65	BXD48	BXD50	BXD55	BXD6	BXD100	BXD11	BXD24	BXD18	BXD21	BXD20	BXD14	BXD16	BXD44	BXD38	BXD42	BXD43	BXD28	BXD25	BXD2	BXD23	BXD29	BXD22	C57BL/6J	D2B6F1	BXD19	BXD90	BXD90	BXD1	BXD101	BXD90	BXD62	BXD9	BXD89	BXD96	BXD69
-10608724	6.356	6.532	6.515	6.563	6.471	6.472	6.632	6.564	6.601	6.584	6.342	6.542	6.639	6.343	6.468	6.367	6.555	6.635	6.443	6.531	6.514	6.582	6.544	6.504	6.387	6.489	6.726	6.556	6.462	6.594	6.604	6.629	6.547	6.648	6.493	6.451	6.672	6.601	6.689	6.731	6.769	6.633	6.425	6.612	6.486	6.452	6.557	6.343	6.345	6.345	6.346	6.339	6.539	6.554	6.481	6.524	6.382	6.491	6.352	6.556	6.636	6.574	6.334	6.744	6.427	6.405	6.493
\ No newline at end of file
-- 
cgit v1.2.3