about summary refs log tree commit diff
path: root/r_qtl
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2024-02-06 06:01:35 +0300
committer Frederick Muriuki Muriithi 2024-02-06 06:07:46 +0300
commit b59ddf47a8f968e23aae6985986806ad5d704474 (patch)
tree f19a26b238e0b85ceee864fc9fb3dd9d0e6fcbf8 /r_qtl
parent f98d0325e641dd43e0ac460ef4fa931eb94c054f (diff)
download gn-uploader-b59ddf47a8f968e23aae6985986806ad5d704474.tar.gz
Check that pheno values are numerical and at least 3 decimal places
Diffstat (limited to 'r_qtl')
-rw-r--r-- r_qtl/r_qtl2_qc.py 24
1 files changed, 23 insertions, 1 deletions
diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py
index 70a00f9..b45c17a 100644
--- a/r_qtl/r_qtl2_qc.py
+++ b/r_qtl/r_qtl2_qc.py
@@ -1,4 +1,5 @@
 """Quality control checks for R/qtl2 data bundles."""
+import re
 from zipfile import ZipFile
 from functools import reduce
 from typing import Union, Sequence, Iterator, Optional
@@ -55,7 +56,8 @@ def validate_bundle(zfile: ZipFile):
                         "The following files do not exist in the bundle: " +
                         ", ".join(missing))
 
-def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], str]]:
+def geno_errors(zfile: ZipFile) -> Iterator[
+        tuple[Optional[int], Optional[str], str]]:
     """Check for and retrieve geno errors."""
     cdata = rqtl2.control_data(zfile)
     genotypes = tuple(cdata.get("genotypes", {}).keys())
@@ -70,3 +72,23 @@ def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str],
                         f"Invalid value '{value}'. Expected one of {genotypes}"))
     except rqe.MissingFileError:
         yield (None, None, "Missing 'geno' file.")
+
+def pheno_errors(zfile: ZipFile) -> Iterator[
+        tuple[Optional[int], Optional[str], str]]:
+    """Check for and retrieve pheno errors."""
+    cdata = rqtl2.control_data(zfile)
+    try:
+        for lineno, row in enumerate(
+                rqtl2.file_data(zfile, "pheno", cdata), start=1):
+            for field, value in row.items():
+                if field == "id":
+                    continue
+                if value is not None and not(
+                        re.search(r"^([0-9]+\.[0-9]{3,}|[0-9]+\.?0*)$", value)
+                        or re.search(r"^0\.0+$", value)
+                        or re.search("^0+$", value)):
+                    yield (lineno, field, (
+                        f"Invalid value '{value}'. Expected numerical value "
+                        "with at least 3 decimal places."))
+    except rqe.MissingFileError:
+        yield (None, None, "Missing 'pheno' file.")