From dd369b846524fed0c08d1b7318fd73478506c3ee Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Fri, 9 Feb 2024 03:10:06 +0300 Subject: Provide the key for each file listed in the control file. --- r_qtl/r_qtl2_qc.py | 32 +++++++------- tests/r_qtl/test_r_qtl2_qc.py | 98 ++++++++++++++++++++++++++++--------------- 2 files changed, 83 insertions(+), 47 deletions(-) diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py index 4b3e184..a41c442 100644 --- a/r_qtl/r_qtl2_qc.py +++ b/r_qtl/r_qtl2_qc.py @@ -2,7 +2,7 @@ import re from zipfile import ZipFile from functools import reduce -from typing import Union, Sequence, Iterator, Optional, Callable +from typing import Union, Iterator, Optional, Callable from r_qtl import errors as rqe from r_qtl import r_qtl2 as rqtl2 @@ -11,14 +11,15 @@ from r_qtl.fileerrors import MissingFile from quality_control.errors import InvalidValue -def bundle_files_list(cdata: dict) -> tuple[str, ...]: +def bundle_files_list(cdata: dict) -> tuple[tuple[str, str], ...]: """Retrieve files listed in control file.""" - def __merge__(alist: tuple[str, ...], member: Union[str, Sequence[str]]) -> tuple[str, ...]: + def __merge__(alist: tuple[tuple[str, str], ...], member: str) -> tuple[ + tuple[str, str], ...]: if isinstance(cdata[member], str): - return alist + (cdata[member],) - return alist + tuple(cdata[member]) + return alist + ((member, str(cdata[member])),) + return alist + tuple((member, str(afile)) for afile in cdata[member]) - fileslist: tuple[str, ...] = reduce( + fileslist: tuple[tuple[str, str], ...] = reduce( __merge__, (key for key in cdata.keys() if key in __FILE_TYPES__), tuple()) @@ -26,30 +27,33 @@ def bundle_files_list(cdata: dict) -> tuple[str, ...]: if "file" in cdata.get("sex", {}): sexfile = cdata["sex"]["file"] fileslist = fileslist + ( - (sexfile,) if isinstance(sexfile, str) else tuple(sexfile)) + (("sex.file", sexfile),) if isinstance(sexfile, str) + else tuple(("sex.file", afile) for afile in sexfile)) if "file" in cdata.get("cross_info", {}): crossinfofile = cdata["cross_info"]["file"] fileslist = fileslist + ( - (crossinfofile,) if isinstance(crossinfofile, str) - else tuple(crossinfofile)) + (("cross_info.file", crossinfofile),) + if isinstance(crossinfofile, str) + else tuple(("cross_info.file", afile) for afile in crossinfofile)) return fileslist -def missing_files(zfile: ZipFile) -> tuple[str, ...]: +def missing_files(zfile: ZipFile) -> tuple[tuple[str, str], ...]: """ Retrieve a list of files listed in the control file that do not exist in the bundle. """ - def __missing_p__(thefile): + def __missing_p__(filedetails: tuple[str, str]): + _cfkey, thefile = filedetails try: zfile.getinfo(thefile) return False except KeyError: return True - return tuple(filter(__missing_p__, - bundle_files_list(rqtl2.control_data(zfile)))) + return tuple(afile for afile in bundle_files_list(rqtl2.control_data(zfile)) + if __missing_p__(afile)) def validate_bundle(zfile: ZipFile): """Ensure the R/qtl2 bundle is valid.""" @@ -57,7 +61,7 @@ def validate_bundle(zfile: ZipFile): if len(missing) > 0: raise rqe.MissingFileError( "The following files do not exist in the bundle: " + - ", ".join(missing)) + ", ".join(mfile[1] for mfile in missing)) def make_genocode_checker(genocode: dict) -> Callable[[int, str, str], Optional[InvalidValue]]: """Make a checker from the genotypes in the control data""" diff --git a/tests/r_qtl/test_r_qtl2_qc.py b/tests/r_qtl/test_r_qtl2_qc.py index 554cfc4..d12172e 100644 --- a/tests/r_qtl/test_r_qtl2_qc.py +++ b/tests/r_qtl/test_r_qtl2_qc.py @@ -18,27 +18,44 @@ from quality_control.errors import InvalidValue ("tests/r_qtl/test_files/empty_control_file_json.zip", tuple()), ("tests/r_qtl/test_files/allfilesmissing_stringmembers.zip", - ("geno.csv", "fgeno.csv", "pheno.csv", "covar.csv", "phenocovar.csv", - "gmap.csv", "pmap.csv", "phenose.csv", "sex.csv", "crossinfo.csv")), + (("geno", "geno.csv"), ("founder_geno", "fgeno.csv"), + ("pheno", "pheno.csv"), ("covar", "covar.csv"), + ("phenocovar", "phenocovar.csv"), ("gmap", "gmap.csv"), + ("pmap", "pmap.csv"), ("phenose", "phenose.csv"), + ("sex.file", "sex.csv"), ("cross_info.file", "crossinfo.csv"))), ("tests/r_qtl/test_files/allfilesmissing_listmembers.zip", - ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv", - "pheno01.csv", "pheno02.csv", "covar01.csv", "covar02.csv", - "phenocovar01.csv", "phenocovar02.csv", "phenocovar03.csv", - "phenocovar04.csv", "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv", - "phenose01.csv", "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv", - "crossinfo01.csv", "crossinfo02.csv")), + (("geno", "geno01.csv"), ("geno", "geno02.csv"), + ("founder_geno", "fgeno01.csv"), ("founder_geno", "fgeno02.csv"), + ("founder_geno", "fgeno03.csv"), ("pheno", "pheno01.csv"), + ("pheno", "pheno02.csv"), ("covar", "covar01.csv"), + ("covar", "covar02.csv"), ("phenocovar", "phenocovar01.csv"), + ("phenocovar", "phenocovar02.csv"), ("phenocovar", "phenocovar03.csv"), + ("phenocovar", "phenocovar04.csv"), ("gmap", "gmap01.csv"), + ("gmap", "gmap02.csv"), ("pmap", "pmap01.csv"), ("pmap", "pmap02.csv"), + ("phenose", "phenose01.csv"), ("phenose", "phenose02.csv"), + ("sex.file", "sex01.csv"), ("sex.file", "sex02.csv"), + ("sex.file", "sex03.csv"), ("cross_info.file", "crossinfo01.csv"), + ("cross_info.file", "crossinfo02.csv"))), ("tests/r_qtl/test_files/allfilesmissing_mixedmembers.zip", - ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv", - "pheno01.csv", "pheno02.csv", "covar.csv", "phenocovar.csv", - "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv", "phenose01.csv", - "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv", - "crossinfo.csv")), + (("geno", "geno01.csv"), ("geno", "geno02.csv"), + ("founder_geno", "fgeno01.csv"), ("founder_geno", "fgeno02.csv"), + ("founder_geno", "fgeno03.csv"), ("pheno", "pheno01.csv"), + ("pheno", "pheno02.csv"), ("covar", "covar.csv"), + ("phenocovar", "phenocovar.csv"), ("gmap", "gmap01.csv"), + ("gmap", "gmap02.csv"), ("pmap", "pmap01.csv"), ("pmap", "pmap02.csv"), + ("phenose", "phenose01.csv"), ("phenose", "phenose02.csv"), + ("sex.file", "sex01.csv"), ("sex.file", "sex02.csv"), + ("sex.file", "sex03.csv"), ("cross_info.file", "crossinfo.csv"))), ("tests/r_qtl/test_files/somefilesmissing_mixedmembers.zip", - ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv", - "pheno01.csv", "pheno02.csv", "covar.csv", "phenocovar.csv", - "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv", "phenose01.csv", - "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv", - "crossinfo.csv")))) + (("geno", "geno01.csv"), ("geno", "geno02.csv"), + ("founder_geno", "fgeno01.csv"), ("founder_geno", "fgeno02.csv"), + ("founder_geno", "fgeno03.csv"), ("pheno", "pheno01.csv"), + ("pheno", "pheno02.csv"), ("covar", "covar.csv"), + ("phenocovar", "phenocovar.csv"), ("gmap", "gmap01.csv"), + ("gmap", "gmap02.csv"), ("pmap", "pmap01.csv"), ("pmap", "pmap02.csv"), + ("phenose", "phenose01.csv"), ("phenose", "phenose02.csv"), + ("sex.file", "sex01.csv"), ("sex.file", "sex02.csv"), + ("sex.file", "sex03.csv"), ("cross_info.file", "crossinfo.csv"))))) def test_bundle_files_list(filepath, expected): """ GIVEN: R/qtl2 bundle with a control file listing files @@ -56,24 +73,39 @@ def test_bundle_files_list(filepath, expected): ("tests/r_qtl/test_files/empty_control_file_json.zip", tuple()), ("tests/r_qtl/test_files/allfilesmissing_stringmembers.zip", - ("geno.csv", "fgeno.csv", "pheno.csv", "covar.csv", "phenocovar.csv", - "gmap.csv", "pmap.csv", "phenose.csv", "sex.csv", "crossinfo.csv")), + (("geno", "geno.csv"), ("founder_geno", "fgeno.csv"), + ("pheno", "pheno.csv"), ("covar", "covar.csv"), + ("phenocovar", "phenocovar.csv"), ("gmap", "gmap.csv"), + ("pmap", "pmap.csv"), ("phenose", "phenose.csv"), + ("sex.file", "sex.csv"), ("cross_info.file", "crossinfo.csv"))), ("tests/r_qtl/test_files/allfilesmissing_listmembers.zip", - ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv", - "pheno01.csv", "pheno02.csv", "covar01.csv", "covar02.csv", - "phenocovar01.csv", "phenocovar02.csv", "phenocovar03.csv", - "phenocovar04.csv", "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv", - "phenose01.csv", "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv", - "crossinfo01.csv", "crossinfo02.csv")), + (("geno", "geno01.csv"), ("geno", "geno02.csv"), + ("founder_geno", "fgeno01.csv"), ("founder_geno", "fgeno02.csv"), + ("founder_geno", "fgeno03.csv"), ("pheno", "pheno01.csv"), + ("pheno", "pheno02.csv"), ("covar", "covar01.csv"), + ("covar", "covar02.csv"), ("phenocovar", "phenocovar01.csv"), + ("phenocovar", "phenocovar02.csv"), ("phenocovar", "phenocovar03.csv"), + ("phenocovar", "phenocovar04.csv"), ("gmap", "gmap01.csv"), + ("gmap", "gmap02.csv"), ("pmap", "pmap01.csv"), ("pmap", "pmap02.csv"), + ("phenose", "phenose01.csv"), ("phenose", "phenose02.csv"), + ("sex.file", "sex01.csv"), ("sex.file", "sex02.csv"), + ("sex.file", "sex03.csv"), ("cross_info.file", "crossinfo01.csv"), + ("cross_info.file", "crossinfo02.csv"))), ("tests/r_qtl/test_files/allfilesmissing_mixedmembers.zip", - ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv", - "pheno01.csv", "pheno02.csv", "covar.csv", "phenocovar.csv", - "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv", "phenose01.csv", - "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv", - "crossinfo.csv")), + (("geno", "geno01.csv"), ("geno", "geno02.csv"), + ("founder_geno", "fgeno01.csv"), ("founder_geno", "fgeno02.csv"), + ("founder_geno", "fgeno03.csv"), ("pheno", "pheno01.csv"), + ("pheno", "pheno02.csv"), ("covar", "covar.csv"), + ("phenocovar", "phenocovar.csv"), ("gmap", "gmap01.csv"), + ("gmap", "gmap02.csv"), ("pmap", "pmap01.csv"), ("pmap", "pmap02.csv"), + ("phenose", "phenose01.csv"), ("phenose", "phenose02.csv"), + ("sex.file", "sex01.csv"), ("sex.file", "sex02.csv"), + ("sex.file", "sex03.csv"), ("cross_info.file", "crossinfo.csv"))), ("tests/r_qtl/test_files/somefilesmissing_mixedmembers.zip", - ("fgeno01.csv", "covar.csv", "gmap01.csv", "gmap02.csv", "pmap01.csv", - "pmap02.csv", "phenose02.csv", "sex03.csv", "crossinfo.csv")))) + (("founder_geno", "fgeno01.csv"), ("covar", "covar.csv"), + ("gmap", "gmap01.csv"), ("gmap", "gmap02.csv"), ("pmap", "pmap01.csv"), + ("pmap", "pmap02.csv"), ("phenose", "phenose02.csv"), + ("sex.file", "sex03.csv"), ("cross_info.file", "crossinfo.csv"))))) def test_missing_files(filepath, expected): """ GIVEN: R/qtl2 bundle with a control file listing files -- cgit v1.2.3