about | summary | refs | log | tree | commit | diff
path: root/r_qtl
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-02-05 15:18:44 +0300
committer: Frederick Muriuki Muriithi, 2024-02-05 15:43:25 +0300
commit: 3f879d120f7628646f383a457206b15037cc57dc (patch)
tree: eaecac8299eeabef511f0297ac439ae0cf590a4e /r_qtl
parent: 3245182e967f5ac8a296cf47ce3e622c3cb754ed (diff)
download: gn-uploader-3f879d120f7628646f383a457206b15037cc57dc.tar.gz
Check that data in geno file is valid
Add a function to ensure the values in the geno files are all listed in the control data under the "genotypes" key.
Diffstat (limited to 'r_qtl')
-rw-r--r-- r_qtl/r_qtl2.py 4
-rw-r--r-- r_qtl/r_qtl2_qc.py 18
2 files changed, 19 insertions, 3 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index f8c08d9..e74312f 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -10,7 +10,7 @@ import yaml
from functional_tools import take, chain
-from r_qtl.errors import InvalidFormat
+from r_qtl.errors import InvalidFormat, MissingFileError
__FILE_TYPES__ = (
"geno", "founder_geno", "pheno", "covar", "phenocovar", "gmap", "pmap",
@@ -250,7 +250,7 @@ def file_data(zfile: ZipFile,
zfile, member_key, cdata, process_transposed_value):
yield row
except KeyError as exc:
- raise InvalidFormat(*exc.args) from exc
+ raise MissingFileError(*exc.args) from exc
def cross_information(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
"""Load cross information where present."""
diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py
index 853c34b..70a00f9 100644
--- a/r_qtl/r_qtl2_qc.py
+++ b/r_qtl/r_qtl2_qc.py
@@ -1,7 +1,7 @@
"""Quality control checks for R/qtl2 data bundles."""
from zipfile import ZipFile
from functools import reduce
-from typing import Union, Sequence
+from typing import Union, Sequence, Iterator, Optional
from r_qtl import errors as rqe
from r_qtl import r_qtl2 as rqtl2
@@ -54,3 +54,19 @@ def validate_bundle(zfile: ZipFile):
raise rqe.MissingFileError(
"The following files do not exist in the bundle: " +
", ".join(missing))
+
+def geno_errors(zfile: ZipFile) -> Iterator[tuple[Optional[int], Optional[str], str]]:
+ """Check for and retrieve geno errors."""
+ cdata = rqtl2.control_data(zfile)
+ genotypes = tuple(cdata.get("genotypes", {}).keys())
+ try:
+ for lineno, row in enumerate(
+ rqtl2.file_data(zfile, "geno", cdata), start=1):
+ for field, value in row.items():
+ if field == "id":
+ continue
+ if value is not None and value not in genotypes:
+ yield (lineno, field, (
+ f"Invalid value '{value}'. Expected one of {genotypes}"))
+ except rqe.MissingFileError:
+ yield (None, None, "Missing 'geno' file.")