about summary refs log tree commit diff
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2024-02-05 15:18:44 +0300
committer Frederick Muriuki Muriithi 2024-02-05 15:43:25 +0300
commit 3f879d120f7628646f383a457206b15037cc57dc (patch)
tree eaecac8299eeabef511f0297ac439ae0cf590a4e
parent 3245182e967f5ac8a296cf47ce3e622c3cb754ed (diff)
download gn-uploader-3f879d120f7628646f383a457206b15037cc57dc.tar.gz
Check that data in geno file is valid
Add a function to ensure the values in the geno files are all listed
in the control data under the "genotypes" key.
-rw-r--r-- r_qtl/r_qtl2.py 4
-rw-r--r-- r_qtl/r_qtl2_qc.py 18
-rw-r--r-- tests/r_qtl/test_files/geno_with_missing_genotypes.zip bin 0 -> 738 bytes
-rw-r--r-- tests/r_qtl/test_r_qtl2_qc.py 27
4 files changed, 43 insertions, 6 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index f8c08d9..e74312f 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -10,7 +10,7 @@ import yaml
 
 from functional_tools import take, chain
 
-from r_qtl.errors import InvalidFormat
+from r_qtl.errors import InvalidFormat, MissingFileError
 
 __FILE_TYPES__ = (
     "geno", "founder_geno", "pheno", "covar", "phenocovar", "gmap", "pmap",
@@ -250,7 +250,7 @@ def file_data(zfile: ZipFile,
                 zfile, member_key, cdata, process_transposed_value):
             yield row
     except KeyError as exc:
-        raise InvalidFormat(*exc.args) from exc
+        raise MissingFileError(*exc.args) from exc
 
 def cross_information(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load cross information where present."""
diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py
index 853c34b..70a00f9 100644
--- a/r_qtl/r_qtl2_qc.py
+++ b/r_qtl/r_qtl2_qc.py
@@ -1,7 +1,7 @@
 """Quality control checks for R/qtl2 data bundles."""
 from zipfile import ZipFile
 from functools import reduce
-from typing import Union, Sequence
+from typing import Union, Sequence, Iterator, Optional
 
 from r_qtl import errors as rqe
 from r_qtl import r_qtl2 as rqtl2
@@ -54,3 +54,19 @@ def validate_bundle(zfile: ZipFile):
         raise rqe.MissingFileError(
                         "The following files do not exist in the bundle: " +
                         ", ".join(missing))
+
+def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], str]]:
+    """Check for and retrieve geno errors."""
+    cdata = rqtl2.control_data(zfile)
+    genotypes = tuple(cdata.get("genotypes", {}).keys())
+    try:
+        for lineno, row in enumerate(
+                rqtl2.file_data(zfile, "geno", cdata), start=1):
+            for field, value in row.items():
+                if field == "id":
+                    continue
+                if value is not None and value not in genotypes:
+                    yield (lineno, field, (
+                        f"Invalid value '{value}'. Expected one of {genotypes}"))
+    except rqe.MissingFileError:
+        yield (None, None, "Missing 'geno' file.")
diff --git a/tests/r_qtl/test_files/geno_with_missing_genotypes.zip b/tests/r_qtl/test_files/geno_with_missing_genotypes.zip
new file mode 100644
index 0000000..b174d4e
--- /dev/null
+++ b/tests/r_qtl/test_files/geno_with_missing_genotypes.zip
Binary files differ
diff --git a/tests/r_qtl/test_r_qtl2_qc.py b/tests/r_qtl/test_r_qtl2_qc.py
index 9b19ae2..bcbcbac 100644
--- a/tests/r_qtl/test_r_qtl2_qc.py
+++ b/tests/r_qtl/test_r_qtl2_qc.py
@@ -5,7 +5,7 @@ import pytest
 from zipfile import ZipFile
 
 from r_qtl import r_qtl2 as rqtl2
-from r_qtl import r_qtl2_qc as qc
+from r_qtl import r_qtl2_qc as rqc
 
 ###### DO NOT COMMIT THIS ######
 from quality_control.debug import __pk__
@@ -47,7 +47,7 @@ def test_bundle_files_list(filepath, expected):
     THEN: verify that ALL files listed in the control file are returned.
     """
     with ZipFile(Path(filepath).absolute(), "r") as zfile:
-        assert qc.bundle_files_list(rqtl2.control_data(zfile)) == expected
+        assert rqc.bundle_files_list(rqtl2.control_data(zfile)) == expected
 
 @pytest.mark.unit_test
 @pytest.mark.parametrize(
@@ -83,4 +83,25 @@ def test_missing_files(filepath, expected):
         exist in the bundle are returned.
     """
     with ZipFile(Path(filepath).absolute(), "r") as zfile:
-        assert qc.missing_files(zfile) == expected
+        assert rqc.missing_files(zfile) == expected
+
+@pytest.mark.unit_test
+@pytest.mark.parametrize(
+    "filepath,expected",
+    (("tests/r_qtl/test_files/empty_control_file_yaml.zip",
+      ((None, None, "Missing 'geno' file."),)),
+     ("tests/r_qtl/test_files/test_geno.zip",
+      tuple()),
+     ("tests/r_qtl/test_files/geno_with_missing_genotypes.zip",
+      ((1, "AXR-1", f"Invalid value 'X'. Expected one of ('L', 'C')"),
+       (2, "EC.480C", f"Invalid value 'Y'. Expected one of ('L', 'C')"),
+       (6, "HH.335C-Col/PhyA", f"Invalid value 'H'. Expected one of ('L', 'C')")))))
+def test_geno_errors(filepath, expected):
+    """
+    GIVEN: A R/qtl2 bundle
+    WHEN: We call r_qtl.r_qtl2_qc.geno_errors(..) on it
+    THEN: We should get a sequence of all errors present in the file, or an
+        empty sequence if no errors exist.
+    """
+    with ZipFile(Path(filepath).absolute(), "r") as zfile:
+        assert tuple(rqc.geno_errors(zfile)) == expected