def geno_errors(zfile: ZipFile) -> Iterator[
        tuple[Optional[int], Optional[str], str]]:
    """Check for and retrieve errors in the bundle's 'geno' data.

    Each value read from the 'geno' data (every field except ``id``) is
    checked against the keys listed under "genotypes" in the bundle's
    control data.

    Parameters:
        zfile: The opened R/qtl2 zip bundle to check.

    Yields:
        ``(row_number, field, message)`` for every value that is not one of
        the declared genotypes. ``row_number`` is the 1-based position of
        the row in the sequence produced by ``rqtl2.file_data``.
        If reading the data raises ``MissingFileError``, a single
        ``(None, None, "Missing 'geno' file.")`` is yielded instead.
    """
    cdata = rqtl2.control_data(zfile)
    # `or {}` rather than a `.get` default: the key can be present but null
    # (e.g. an empty `genotypes:` entry in a YAML control file), in which
    # case `.get("genotypes", {})` would hand back `None`.
    genotypes = tuple(cdata.get("genotypes") or {})
    try:
        for lineno, row in enumerate(
                rqtl2.file_data(zfile, "geno", cdata), start=1):
            for field, value in row.items():
                # Skip the identifier column, absent (`None`) values and
                # anything that is a declared genotype.
                if field == "id" or value is None or value in genotypes:
                    continue
                yield (lineno, field, (
                    f"Invalid value '{value}'. Expected one of {genotypes}"))
    except rqe.MissingFileError:
        yield (None, None, "Missing 'geno' file.")
@pytest.mark.unit_test
@pytest.mark.parametrize(
    "filepath,expected",
    (("tests/r_qtl/test_files/empty_control_file_yaml.zip",
      ((None, None, "Missing 'geno' file."),)),
     ("tests/r_qtl/test_files/test_geno.zip",
      tuple()),
     ("tests/r_qtl/test_files/geno_with_missing_genotypes.zip",
      # Plain string literals: nothing is interpolated into these messages.
      ((1, "AXR-1", "Invalid value 'X'. Expected one of ('L', 'C')"),
       (2, "EC.480C", "Invalid value 'Y'. Expected one of ('L', 'C')"),
       (6, "HH.335C-Col/PhyA",
        "Invalid value 'H'. Expected one of ('L', 'C')")))))
def test_geno_errors(filepath, expected):
    """
    GIVEN: An R/qtl2 bundle
    WHEN: We call r_qtl.r_qtl2_qc.geno_errors(..) on it
    THEN: We should get a sequence of all errors present in the file, or an
        empty sequence if no errors exist.
    """
    with ZipFile(Path(filepath).absolute(), "r") as zfile:
        # `geno_errors` is lazy; materialise it before comparing.
        assert tuple(rqc.geno_errors(zfile)) == expected