aboutsummaryrefslogtreecommitdiff
path: root/quality_control
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2022-04-13 12:03:08 +0300
committer Frederick Muriuki Muriithi 2022-04-13 12:03:08 +0300
commit 1cc86cb4b71fe29b40115813836ca1277c1df859 (patch)
tree d511c13a53f5e882e29129bbe190afecc6caa0dc /quality_control
parent bb8a92b36f85f6f3b2c6cd9ed6bbebd03ecf127c (diff)
download gn-uploader-1cc86cb4b71fe29b40115813836ca1277c1df859.tar.gz
Implement test for parsing that fails
* Improve tests that ensure parsing fails in case the file has errors
* Add strains.csv file
* Implement minimum viable functionality that passes the implemented tests
Diffstat (limited to 'quality_control')
-rw-r--r-- quality_control/errors.py 8
-rw-r--r-- quality_control/parsing.py 60
2 files changed, 63 insertions, 5 deletions
diff --git a/quality_control/errors.py b/quality_control/errors.py
index 99f9c97..758a300 100644
--- a/quality_control/errors.py
+++ b/quality_control/errors.py
@@ -4,10 +4,14 @@ class InvalidCellValue(Exception):
"""Raised when a function encounters an invalid value"""
def __init__(self, *args):
- Exception.__init__(self, *args)
+ super().__init__(self, *args)
class InvalidHeaderValue(Exception):
    """Raised when a header contains values not in the reference file."""

    def __init__(self, *args):
        # Zero-argument super() is already bound to this instance; passing
        # `self` again would make the exception its own first argument and
        # corrupt `.args` / `str(err)`.
        super().__init__(*args)
+
class ParseError(Exception):
    """Raised when parsing a file fails."""

    def __init__(self, *args):
        # Must be spelled `__init__`: a method named `__init` is
        # name-mangled to `_ParseError__init` and is never invoked.
        super().__init__(*args)
diff --git a/quality_control/parsing.py b/quality_control/parsing.py
index 52124f9..eda9181 100644
--- a/quality_control/parsing.py
+++ b/quality_control/parsing.py
@@ -1,12 +1,66 @@
+import csv
+
from enum import Enum
import quality_control.average as avg
import quality_control.standard_error as se
-from quality_control.errors import InvalidCellValue, InvalidHeaderValue
+from quality_control.headers import valid_header
+from quality_control.errors import (
+ ParseError, InvalidCellValue, InvalidHeaderValue)
class FileType(Enum):
    """Enumerate the kinds of file that `parse_file` can handle."""
    # Selects `__parse_average_line` in `LINE_PARSERS`.
    AVERAGE = 1
    # Selects `__parse_standard_error_line` in `LINE_PARSERS`.
    STANDARD_ERROR = 2
-def parse_file(filepath: str, filetype: FileType):
- pass
def parse_strains(filepath):
    """Lazily parse a tab-separated strains file into one dict per row.

    The first line of the file supplies the column names, each stripped of
    surrounding whitespace.  Every following line is yielded as a dict keyed
    by those names, with the literal two-character marker ``\\N`` replaced
    by ``None``.
    """
    # Decode as UTF-8 explicitly (as `parse_file` does) instead of relying on
    # the platform default, and let the csv module handle line endings.
    with open(filepath, encoding="utf-8", newline="") as strains_file:
        headers = [
            header.strip()
            for header in strains_file.readline().split("\t")]
        reader = csv.DictReader(
            strains_file, fieldnames=headers, delimiter="\t")
        for row in reader:
            yield {
                key: (None if value == "\\N" else value)
                for key, value in row.items()
            }
+
def __parse_header(line, strains):
    """Split a tab-separated header line and validate it against `strains`.

    Returns whatever `valid_header` returns for `strains` and the tuple of
    whitespace-stripped header fields.
    """
    headers = tuple(map(str.strip, line.split("\t")))
    return valid_header(strains, headers)
+
def __parse_average_line(line):
    """Parse one already-split line of an averages file.

    The first field is kept untouched; every remaining field is passed
    through `avg.valid_value`.  The result is returned as a tuple.
    """
    label = line[0]
    values = [avg.valid_value(field) for field in line[1:]]
    return (label, *values)
+
def __parse_standard_error_line(line):
    """Parse one already-split line of a standard-error file.

    The first field is kept untouched; every remaining field is passed
    through `se.valid_value`.  The result is returned as a tuple.
    """
    label = line[0]
    values = [se.valid_value(field) for field in line[1:]]
    return (label, *values)
+
# Maps each `FileType` to the function that parses one data line (already
# split into a tuple of stripped fields) of a file of that type.
LINE_PARSERS = {
    FileType.AVERAGE: __parse_average_line,
    FileType.STANDARD_ERROR: __parse_standard_error_line
}
+
def parse_file(filepath: str, filetype: FileType, strains_filepath: str):
    """Lazily parse the tab-separated file at `filepath`.

    The first line is treated as the header and validated against the
    ``Name`` column of the strains file at `strains_filepath`.  Every other
    line is split on tabs, stripped field by field, and handed to the line
    parser registered for `filetype` in `LINE_PARSERS`.

    Yields:
        The parsed header first, then one parsed record per data line.

    Raises:
        ParseError: if the header or a data line is rejected with
            `InvalidHeaderValue` or `InvalidCellValue`.  Its single argument
            is a dict with the keys ``filepath``, ``filetype``, ``position``
            (characters consumed before the offending line) and
            ``line_number`` (zero-based).  The original exception is
            attached as ``__cause__``.
    """
    seek_pos = 0
    line_number = 0
    try:
        with open(filepath, encoding="utf-8") as input_file:
            for line_number, line in enumerate(input_file):
                if line_number == 0:
                    strain_names = tuple(
                        strain["Name"]
                        for strain in parse_strains(strains_filepath))
                    yield __parse_header(line, strain_names)
                else:
                    # Only data lines go through the value parser; the
                    # header must not be re-validated as a data record.
                    fields = tuple(
                        field.strip() for field in line.split("\t"))
                    yield LINE_PARSERS[filetype](fields)
                # Advance exactly once per line, header included.
                seek_pos += len(line)
    except (InvalidCellValue, InvalidHeaderValue) as err:
        raise ParseError({
            "filepath": filepath,
            "filetype": filetype,
            "position": seek_pos,
            "line_number": line_number
        }) from err