From 635cf832f4717da6e8e7ef273a675a4ceea42ed0 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 5 Feb 2024 06:08:19 +0300
Subject: Retrieve list of all files, and list of missing files

Add a QC function to list all files listed in the control file, and
another to list only the files missing from the bundle.
---
 r_qtl/r_qtl2_qc.py                                 |  62 +++++++++++++++
 .../test_files/allfilesmissing_listmembers.zip     | Bin 0 -> 429 bytes
 .../test_files/allfilesmissing_mixedmembers.zip    | Bin 0 -> 413 bytes
 .../test_files/allfilesmissing_stringmembers.zip   | Bin 0 -> 370 bytes
 .../test_files/somefilesmissing_mixedmembers.zip   | Bin 0 -> 1913 bytes
 tests/r_qtl/test_r_qtl2_qc.py                      |  83 ++++++++++++++++++++
 6 files changed, 145 insertions(+)
 create mode 100644 r_qtl/r_qtl2_qc.py
 create mode 100644 tests/r_qtl/test_files/allfilesmissing_listmembers.zip
 create mode 100644 tests/r_qtl/test_files/allfilesmissing_mixedmembers.zip
 create mode 100644 tests/r_qtl/test_files/allfilesmissing_stringmembers.zip
 create mode 100644 tests/r_qtl/test_files/somefilesmissing_mixedmembers.zip
 create mode 100644 tests/r_qtl/test_r_qtl2_qc.py

diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py
new file mode 100644
index 0000000..f666f40
--- /dev/null
+++ b/r_qtl/r_qtl2_qc.py
@@ -0,0 +1,62 @@
+"""Quality control checks for R/qtl2 data bundles."""
+from zipfile import ZipFile
+from functools import reduce
+from typing import Union, Sequence
+
+from r_qtl import r_qtl2 as rqtl2
+from r_qtl.r_qtl2 import __FILE_TYPES__
+
+def bundle_files_list(zfile, cdata: dict) -> tuple[str, ...]:
+    """
+    Retrieve files listed in control file.
+
+    `zfile` is not read here; the parameter is retained so that existing
+    callers, which pass the bundle as the first argument, keep working.
+    `cdata` is the control data, as a dict.
+
+    The files for the keys in `__FILE_TYPES__` come first, in the order the
+    keys appear in `cdata`, followed by the `sex` file(s) and then the
+    `cross_info` file(s).
+    """
+    def __as_tuple__(
+            files: Union[str, Sequence[str], None]) -> tuple[str, ...]:
+        # A member is either a single filename or a sequence of filenames. A
+        # key with no value is treated as listing no files.
+        if files is None:
+            return tuple()
+        if isinstance(files, str):
+            return (files,)
+        return tuple(files)
+
+    def __merge__(alist: tuple[str, ...], member: str) -> tuple[str, ...]:
+        return alist + __as_tuple__(cdata[member])
+
+    def __section_files__(section: str) -> tuple[str, ...]:
+        # `sex` and `cross_info` only list files when they are mappings with
+        # a "file" key; any other value contributes nothing.
+        value = cdata.get(section)
+        if isinstance(value, dict) and "file" in value:
+            return __as_tuple__(value["file"])
+        return tuple()
+
+    return (
+        reduce(__merge__,
+               (key for key in cdata if key in __FILE_TYPES__),
+               tuple())
+        + __section_files__("sex")
+        + __section_files__("cross_info"))
+
+def missing_files(zfile: ZipFile) -> tuple[str, ...]:
+    """
+    Retrieve the files listed in the control file that do not exist in the
+    bundle.
+    """
+    def __missing_p__(thefile):
+        try:
+            zfile.getinfo(thefile)
+            return False
+        except KeyError:
+            return True
+
+    return tuple(filter(__missing_p__,
+                        bundle_files_list(zfile, rqtl2.control_data(zfile))))
diff --git a/tests/r_qtl/test_files/allfilesmissing_listmembers.zip b/tests/r_qtl/test_files/allfilesmissing_listmembers.zip
new file mode 100644
index 0000000..8cdbe07
Binary files /dev/null and b/tests/r_qtl/test_files/allfilesmissing_listmembers.zip differ
diff --git a/tests/r_qtl/test_files/allfilesmissing_mixedmembers.zip b/tests/r_qtl/test_files/allfilesmissing_mixedmembers.zip
new file mode 100644
index 0000000..9278440
Binary files /dev/null and b/tests/r_qtl/test_files/allfilesmissing_mixedmembers.zip differ
diff --git a/tests/r_qtl/test_files/allfilesmissing_stringmembers.zip b/tests/r_qtl/test_files/allfilesmissing_stringmembers.zip
new file mode 100644
index 0000000..2b356ec
Binary files /dev/null and b/tests/r_qtl/test_files/allfilesmissing_stringmembers.zip differ
diff --git a/tests/r_qtl/test_files/somefilesmissing_mixedmembers.zip b/tests/r_qtl/test_files/somefilesmissing_mixedmembers.zip
new file mode 100644
index 0000000..b9320fc
Binary files /dev/null and b/tests/r_qtl/test_files/somefilesmissing_mixedmembers.zip differ
diff --git a/tests/r_qtl/test_r_qtl2_qc.py b/tests/r_qtl/test_r_qtl2_qc.py
new file mode 100644
index 0000000..5fcccf5
--- /dev/null
+++ b/tests/r_qtl/test_r_qtl2_qc.py
@@ -0,0 +1,83 @@
+"""Test that the QC functions work as expected"""
+from pathlib import Path
+from zipfile import ZipFile
+
+import pytest
+
+from r_qtl import r_qtl2 as rqtl2
+from r_qtl import r_qtl2_qc as qc
+
+@pytest.mark.unit_test
+@pytest.mark.parametrize(
+    "filepath,expected",
+    (("tests/r_qtl/test_files/empty_control_file_yaml.zip",
+      tuple()),
+     ("tests/r_qtl/test_files/empty_control_file_json.zip",
+      tuple()),
+     ("tests/r_qtl/test_files/allfilesmissing_stringmembers.zip",
+      ("geno.csv", "fgeno.csv", "pheno.csv", "covar.csv", "phenocovar.csv",
+       "gmap.csv", "pmap.csv", "phenose.csv", "sex.csv", "crossinfo.csv")),
+     ("tests/r_qtl/test_files/allfilesmissing_listmembers.zip",
+      ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv",
+       "pheno01.csv", "pheno02.csv", "covar01.csv", "covar02.csv",
+       "phenocovar01.csv", "phenocovar02.csv", "phenocovar03.csv",
+       "phenocovar04.csv", "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv",
+       "phenose01.csv", "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv",
+       "crossinfo01.csv", "crossinfo02.csv")),
+     ("tests/r_qtl/test_files/allfilesmissing_mixedmembers.zip",
+      ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv",
+       "pheno01.csv", "pheno02.csv", "covar.csv", "phenocovar.csv",
+       "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv", "phenose01.csv",
+       "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv",
+       "crossinfo.csv")),
+     ("tests/r_qtl/test_files/somefilesmissing_mixedmembers.zip",
+      ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv",
+       "pheno01.csv", "pheno02.csv", "covar.csv", "phenocovar.csv",
+       "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv", "phenose01.csv",
+       "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv",
+       "crossinfo.csv"))))
+def test_bundle_files_list(filepath, expected):
+    """
+    GIVEN: R/qtl2 bundle with a control file listing files
+    WHEN: `bundle_files_list` is called on the bundle
+    THEN: verify that ALL files listed in the control file are returned.
+    """
+    with ZipFile(Path(filepath).absolute(), "r") as zfile:
+        assert qc.bundle_files_list(
+            zfile, rqtl2.control_data(zfile)) == expected
+
+@pytest.mark.unit_test
+@pytest.mark.parametrize(
+    "filepath,expected",
+    (("tests/r_qtl/test_files/empty_control_file_yaml.zip",
+      tuple()),
+     ("tests/r_qtl/test_files/empty_control_file_json.zip",
+      tuple()),
+     ("tests/r_qtl/test_files/allfilesmissing_stringmembers.zip",
+      ("geno.csv", "fgeno.csv", "pheno.csv", "covar.csv", "phenocovar.csv",
+       "gmap.csv", "pmap.csv", "phenose.csv", "sex.csv", "crossinfo.csv")),
+     ("tests/r_qtl/test_files/allfilesmissing_listmembers.zip",
+      ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv",
+       "pheno01.csv", "pheno02.csv", "covar01.csv", "covar02.csv",
+       "phenocovar01.csv", "phenocovar02.csv", "phenocovar03.csv",
+       "phenocovar04.csv", "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv",
+       "phenose01.csv", "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv",
+       "crossinfo01.csv", "crossinfo02.csv")),
+     ("tests/r_qtl/test_files/allfilesmissing_mixedmembers.zip",
+      ("geno01.csv", "geno02.csv", "fgeno01.csv", "fgeno02.csv", "fgeno03.csv",
+       "pheno01.csv", "pheno02.csv", "covar.csv", "phenocovar.csv",
+       "gmap01.csv", "gmap02.csv", "pmap01.csv", "pmap02.csv", "phenose01.csv",
+       "phenose02.csv", "sex01.csv", "sex02.csv", "sex03.csv",
+       "crossinfo.csv")),
+     ("tests/r_qtl/test_files/somefilesmissing_mixedmembers.zip",
+      ("fgeno01.csv", "covar.csv", "gmap01.csv", "gmap02.csv", "pmap01.csv",
+       "pmap02.csv", "phenose02.csv", "sex03.csv", "crossinfo.csv"))))
+def test_missing_files(filepath, expected):
+    """
+    GIVEN: R/qtl2 bundle with a control file listing files
+    WHEN: `missing_files` is called on the bundle
+    THEN: verify that ALL files listed in the control file, that do not actually
+    exist in the bundle are returned.
+    """
+    with ZipFile(Path(filepath).absolute(), "r") as zfile:
+        assert qc.missing_files(zfile) == expected
-- 
cgit v1.2.3