From b59ddf47a8f968e23aae6985986806ad5d704474 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Tue, 6 Feb 2024 06:01:35 +0300 Subject: Check that pheno values are numerical and at least 3 decimal places --- r_qtl/r_qtl2_qc.py | 24 ++++++++++++++++++- tests/r_qtl/test_files/pheno_with_errors.zip | Bin 0 -> 522 bytes tests/r_qtl/test_files/pheno_without_errors.zip | Bin 0 -> 539 bytes tests/r_qtl/test_r_qtl2_qc.py | 30 +++++++++++++++++++----- 4 files changed, 47 insertions(+), 7 deletions(-) create mode 100644 tests/r_qtl/test_files/pheno_with_errors.zip create mode 100644 tests/r_qtl/test_files/pheno_without_errors.zip diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py index 70a00f9..b45c17a 100644 --- a/r_qtl/r_qtl2_qc.py +++ b/r_qtl/r_qtl2_qc.py @@ -1,4 +1,5 @@ """Quality control checks for R/qtl2 data bundles.""" +import re from zipfile import ZipFile from functools import reduce from typing import Union, Sequence, Iterator, Optional @@ -55,7 +56,8 @@ def validate_bundle(zfile: ZipFile): "The following files do not exist in the bundle: " + ", ".join(missing)) -def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], str]]: +def geno_errors(zfile: ZipFile) -> Iterator[ + tuple[Optional[int], Optional[str], str]]: """Check for and retrieve geno errors.""" cdata = rqtl2.control_data(zfile) genotypes = tuple(cdata.get("genotypes", {}).keys()) @@ -70,3 +72,23 @@ def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], f"Invalid value '{value}'. 
def pheno_errors(zfile: ZipFile) -> Iterator[
        tuple[Optional[int], Optional[str], str]]:
    """Check for and retrieve pheno errors.

    Every row of the bundle's 'pheno' file is inspected, and one error is
    yielded for each value that is not an acceptable number.

    A value is acceptable when it is, with an optional leading sign:
    * a decimal number with at least 3 digits after the decimal point, or
    * a whole number, optionally followed by a decimal point and only zeros
      (e.g. "12", "12.", "12.00").

    The "id" field is never checked, and values that are `None` are skipped
    (presumably these are missing/NA values -- confirm against
    `rqtl2.file_data`).

    Yields:
        `(lineno, field, message)` tuples, where `lineno` is the 1-based
        position of the row as produced by `rqtl2.file_data` and `field` is
        the name of the offending column. If the bundle has no 'pheno' file
        a single `(None, None, "Missing 'pheno' file.")` is yielded instead.
    """
    # Compiled once, outside the loops. The optional sign lets negative
    # phenotype values through; previously they were reported as invalid
    # even though they are perfectly good numerical values.
    # The second alternative already covers all-zero values such as "0",
    # "000" and "0.000", so no separate checks are needed for those.
    valid_value = re.compile(
        r"^[-+]?([0-9]+\.[0-9]{3,}|[0-9]+\.?0*)$")
    cdata = rqtl2.control_data(zfile)
    try:
        for lineno, row in enumerate(
                rqtl2.file_data(zfile, "pheno", cdata), start=1):
            for field, value in row.items():
                if field == "id" or value is None:
                    continue
                if valid_value.match(value) is None:
                    yield (lineno, field, (
                        f"Invalid value '{value}'. Expected numerical value "
                        "with at least 3 decimal places."))
    except rqe.MissingFileError:
        yield (None, None, "Missing 'pheno' file.")
@pytest.mark.unit_test
@pytest.mark.parametrize(
    "filepath,expected",
    (
        # Bundle without a 'pheno' file.
        ("tests/r_qtl/test_files/empty_control_file_yaml.zip",
         ((None, None, "Missing 'pheno' file."),)),
        # Bundle whose 'pheno' file is clean.
        ("tests/r_qtl/test_files/pheno_without_errors.zip",
         ()),
        # Bundle whose 'pheno' file contains invalid values.
        ("tests/r_qtl/test_files/pheno_with_errors.zip",
         ((1, "liver", ("Invalid value '61.92'. Expected numerical value "
                        "with at least 3 decimal places.")),
          (2, "spleen", ("Invalid value 'brrr'. Expected numerical value "
                         "with at least 3 decimal places.")))),
    ))
def test_pheno_errors(filepath, expected):
    """
    GIVEN: The path to a R/qtl2 bundle
    WHEN: the bundle is run through the pheno checks
    THEN: the errors reported should be exactly the expected sequence, which
        is empty when the pheno file has no errors.
    """
    bundle_path = Path(filepath).absolute()
    with ZipFile(bundle_path, "r") as bundle:
        # Drain the generator while the archive is still open.
        found = tuple(rqc.pheno_errors(bundle))
    assert found == expected