# Introduced by commit 3f879d1 ("Check that data in geno file is valid") in
# r_qtl/r_qtl2_qc.py.
def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], str]]:
    """Yield one error per invalid value found in the bundle's "geno" file.

    A value is valid when it is one of the keys listed under the "genotypes"
    entry of the bundle's control data. If the control data has no
    "genotypes" entry, the set of valid values is empty, so every non-``None``
    value is reported.

    This is a generator: nothing is read from ``zfile`` until it is iterated.

    Parameters:
        zfile: The R/qtl2 bundle, passed unchanged to
            ``rqtl2.control_data`` and ``rqtl2.file_data``.

    Yields:
        ``(lineno, field, message)`` tuples.

        * For an invalid value, ``lineno`` is the 1-based position of the row
          in the sequence produced by ``rqtl2.file_data`` and ``field`` is the
          key under which the value was found.
          NOTE(review): whether that position equals the physical line number
          in the file depends on how ``rqtl2.file_data`` treats header lines
          -- confirm.
        * If reading the file raises ``rqe.MissingFileError``, a single
          ``(None, None, "Missing 'geno' file.")`` tuple is yielded and
          iteration ends.
    """
    control = rqtl2.control_data(zfile)
    genotypes = tuple(control.get("genotypes", {}).keys())
    try:
        # The reader is created *and* consumed inside the ``try`` so that a
        # missing file is reported whether the error surfaces at call time or
        # only once iteration starts.
        numbered_rows = enumerate(
            rqtl2.file_data(zfile, "geno", control), start=1)
        for row_number, record in numbered_rows:
            for column, value in record.items():
                # The "id" column is never checked against the genotype codes.
                if column == "id":
                    continue
                # ``None`` values are passed over (presumably missing/NA
                # entries -- confirm against ``rqtl2.file_data``).
                if value is None or value in genotypes:
                    continue
                yield (
                    row_number,
                    column,
                    f"Invalid value '{value}'. Expected one of {genotypes}",
                )
    except rqe.MissingFileError:
        yield (None, None, "Missing 'geno' file.")