diff options
author | Frederick Muriuki Muriithi | 2024-02-06 06:01:35 +0300 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2024-02-06 06:07:46 +0300 |
commit | b59ddf47a8f968e23aae6985986806ad5d704474 (patch) | |
tree | f19a26b238e0b85ceee864fc9fb3dd9d0e6fcbf8 /r_qtl/r_qtl2_qc.py | |
parent | f98d0325e641dd43e0ac460ef4fa931eb94c054f (diff) | |
download | gn-uploader-b59ddf47a8f968e23aae6985986806ad5d704474.tar.gz |
Check that pheno values are numerical and have at least 3 decimal places
Diffstat (limited to 'r_qtl/r_qtl2_qc.py')
-rw-r--r-- | r_qtl/r_qtl2_qc.py | 24 |
1 files changed, 23 insertions, 1 deletions
diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py index 70a00f9..b45c17a 100644 --- a/r_qtl/r_qtl2_qc.py +++ b/r_qtl/r_qtl2_qc.py @@ -1,4 +1,5 @@ """Quality control checks for R/qtl2 data bundles.""" +import re from zipfile import ZipFile from functools import reduce from typing import Union, Sequence, Iterator, Optional @@ -55,7 +56,8 @@ def validate_bundle(zfile: ZipFile): "The following files do not exist in the bundle: " + ", ".join(missing)) -def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], str]]: +def geno_errors(zfile: ZipFile) -> Iterator[ + tuple[Optional[int], Optional[str], str]]: """Check for and retrieve geno errors.""" cdata = rqtl2.control_data(zfile) genotypes = tuple(cdata.get("genotypes", {}).keys()) @@ -70,3 +72,23 @@ def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], f"Invalid value '{value}'. Expected one of {genotypes}")) except rqe.MissingFileError: yield (None, None, "Missing 'geno' file.") + +def pheno_errors(zfile: ZipFile) -> Iterator[ + tuple[Optional[int], Optional[str], str]]: + """Check for and retrieve pheno errors.""" + cdata = rqtl2.control_data(zfile) + try: + for lineno, row in enumerate( + rqtl2.file_data(zfile, "pheno", cdata), start=1): + for field, value in row.items(): + if field == "id": + continue + if value is not None and not( + re.search(r"^([0-9]+\.[0-9]{3,}|[0-9]+\.?0*)$", value) + or re.search(r"^0\.0+$", value) + or re.search("^0+$", value)): + yield (lineno, field, ( + f"Invalid value '{value}'. Expected numerical value " + "with at least 3 decimal places.")) + except rqe.MissingFileError: + yield (None, None, "Missing 'pheno' file.") |