aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2024-02-06 06:01:35 +0300
committerFrederick Muriuki Muriithi2024-02-06 06:07:46 +0300
commitb59ddf47a8f968e23aae6985986806ad5d704474 (patch)
treef19a26b238e0b85ceee864fc9fb3dd9d0e6fcbf8
parentf98d0325e641dd43e0ac460ef4fa931eb94c054f (diff)
downloadgn-uploader-b59ddf47a8f968e23aae6985986806ad5d704474.tar.gz
Check that pheno values are numerical and have at least 3 decimal places
-rw-r--r--r_qtl/r_qtl2_qc.py24
-rw-r--r--tests/r_qtl/test_files/pheno_with_errors.zipbin0 -> 522 bytes
-rw-r--r--tests/r_qtl/test_files/pheno_without_errors.zipbin0 -> 539 bytes
-rw-r--r--tests/r_qtl/test_r_qtl2_qc.py30
4 files changed, 47 insertions, 7 deletions
diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py
index 70a00f9..b45c17a 100644
--- a/r_qtl/r_qtl2_qc.py
+++ b/r_qtl/r_qtl2_qc.py
@@ -1,4 +1,5 @@
"""Quality control checks for R/qtl2 data bundles."""
+import re
from zipfile import ZipFile
from functools import reduce
from typing import Union, Sequence, Iterator, Optional
@@ -55,7 +56,8 @@ def validate_bundle(zfile: ZipFile):
"The following files do not exist in the bundle: " +
", ".join(missing))
-def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], str]]:
+def geno_errors(zfile: ZipFile) -> Iterator[
+ tuple[Optional[int], Optional[str], str]]:
"""Check for and retrieve geno errors."""
cdata = rqtl2.control_data(zfile)
genotypes = tuple(cdata.get("genotypes", {}).keys())
@@ -70,3 +72,23 @@ def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str],
f"Invalid value '{value}'. Expected one of {genotypes}"))
except rqe.MissingFileError:
yield (None, None, "Missing 'geno' file.")
+
+def pheno_errors(zfile: ZipFile) -> Iterator[
+ tuple[Optional[int], Optional[str], str]]:
+ """Check for and retrieve pheno errors."""
+ cdata = rqtl2.control_data(zfile)
+ try:
+ for lineno, row in enumerate(
+ rqtl2.file_data(zfile, "pheno", cdata), start=1):
+ for field, value in row.items():
+ if field == "id":
+ continue
+ if value is not None and not(
+ re.search(r"^([0-9]+\.[0-9]{3,}|[0-9]+\.?0*)$", value)
+ or re.search(r"^0\.0+$", value)
+ or re.search("^0+$", value)):
+ yield (lineno, field, (
+ f"Invalid value '{value}'. Expected numerical value "
+ "with at least 3 decimal places."))
+ except rqe.MissingFileError:
+ yield (None, None, "Missing 'pheno' file.")
diff --git a/tests/r_qtl/test_files/pheno_with_errors.zip b/tests/r_qtl/test_files/pheno_with_errors.zip
new file mode 100644
index 0000000..321e44d
--- /dev/null
+++ b/tests/r_qtl/test_files/pheno_with_errors.zip
Binary files differ
diff --git a/tests/r_qtl/test_files/pheno_without_errors.zip b/tests/r_qtl/test_files/pheno_without_errors.zip
new file mode 100644
index 0000000..9d9d027
--- /dev/null
+++ b/tests/r_qtl/test_files/pheno_without_errors.zip
Binary files differ
diff --git a/tests/r_qtl/test_r_qtl2_qc.py b/tests/r_qtl/test_r_qtl2_qc.py
index bcbcbac..1c96a86 100644
--- a/tests/r_qtl/test_r_qtl2_qc.py
+++ b/tests/r_qtl/test_r_qtl2_qc.py
@@ -7,10 +7,6 @@ from zipfile import ZipFile
from r_qtl import r_qtl2 as rqtl2
from r_qtl import r_qtl2_qc as rqc
-###### DO NOT COMMIT THIS ######
-from quality_control.debug import __pk__
-###### END: DO NOT COMMIT THIS ######
-
@pytest.mark.unit_test
@pytest.mark.parametrize(
"filepath,expected",
@@ -93,8 +89,8 @@ def test_missing_files(filepath, expected):
("tests/r_qtl/test_files/test_geno.zip",
tuple()),
("tests/r_qtl/test_files/geno_with_missing_genotypes.zip",
- ((1, "AXR-1", f"Invalid value 'X'. Expected one of ('L', 'C')"),
- (2, "EC.480C", f"Invalid value 'Y'. Expected one of ('L', 'C')"),
+ ((1, "AXR-1", "Invalid value 'X'. Expected one of ('L', 'C')"),
+ (2, "EC.480C", "Invalid value 'Y'. Expected one of ('L', 'C')"),
(6, "HH.335C-Col/PhyA", f"Invalid value 'H'. Expected one of ('L', 'C')")))))
def test_geno_errors(filepath, expected):
"""
@@ -105,3 +101,25 @@ def test_geno_errors(filepath, expected):
"""
with ZipFile(Path(filepath).absolute(), "r") as zfile:
assert tuple(rqc.geno_errors(zfile)) == expected
+
+@pytest.mark.unit_test
+@pytest.mark.parametrize(
+ "filepath,expected",
+ (("tests/r_qtl/test_files/empty_control_file_yaml.zip",
+ ((None, None, "Missing 'pheno' file."),)),
+ ("tests/r_qtl/test_files/pheno_without_errors.zip",
+ tuple()),
+ ("tests/r_qtl/test_files/pheno_with_errors.zip",
+ ((1, "liver", ("Invalid value '61.92'. Expected numerical value "
+ "with at least 3 decimal places.")),
+ (2, "spleen", ("Invalid value 'brrr'. Expected numerical value "
+ "with at least 3 decimal places."))))))
+def test_pheno_errors(filepath, expected):
+ """
+ GIVEN: An R/qtl2 bundle
+ WHEN: we check for pheno errors
+ THEN: We should get a sequence of all errors present in the pheno file, or
+ an empty sequence if no errors exist.
+ """
+ with ZipFile(Path(filepath).absolute(), "r") as zfile:
+ assert tuple(rqc.pheno_errors(zfile)) == expected