From 95d2b868adebbc7ebbc2435f9184c30c014ec513 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Wed, 3 Jan 2024 16:41:33 +0300 Subject: Parse founder_geno files. Generalise parsing files. * Add tests for parsing "founder_geno" files * Extract common file parsing structure out to more general function * Use generic function to parse "founder_geno" file in test --- r_qtl/r_qtl2.py | 54 +++++++++++++ tests/r_qtl/test_files/test_founder_geno.zip | Bin 0 -> 672 bytes .../test_files/test_founder_geno_transposed.zip | Bin 0 -> 725 bytes tests/r_qtl/test_r_qtl2_geno.py | 87 +++++++++++++++++++-- 4 files changed, 134 insertions(+), 7 deletions(-) create mode 100644 tests/r_qtl/test_files/test_founder_geno.zip create mode 100644 tests/r_qtl/test_files/test_founder_geno_transposed.zip diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index 2256609..4dac24b 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -105,6 +105,35 @@ def with_transposed(zfile: ZipFile, except StopIteration: pass +def make_process_data_geno(cdata) -> tuple[ + Callable[[dict], dict], + Callable[[str, tuple[str, ...], tuple[str, ...]], + tuple[dict, ...]]]: + """Build functions to process genotype data.""" + def replace_genotype_codes(val): + return cdata["genotypes"].get(val, val) + + def replace_na_strings(val): + nastrings = cdata.get("na.strings") + if bool(nastrings): + return (None if val in nastrings else val) + return val + def __non_transposed__(row: dict) -> dict: + return { + key: thread_op(value, replace_genotype_codes, replace_na_strings) + for key,value in row.items() + } + def __transposed__(id_key: str, + ids: tuple[str, ...], + vals: tuple[str, ...]) -> tuple[dict, ...]: + return tuple( + dict(zip( + [id_key, vals[0]], + (thread_op(item, replace_genotype_codes, replace_na_strings) + for item in items))) + for items in zip(ids, vals[1:])) + return (__non_transposed__, __transposed__) + def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: """Load the genotype file, making use of the control data.""" def replace_genotype_codes(val): @@ -176,3 +205,28 @@ def phenotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: for items in zip(ids, vals[1:])) for row in with_transposed(zfile, "pheno", cdata, __merge__): yield row + +def __default_process_value_transposed__( + id_key: str, + ids: tuple[str, ...], + vals: tuple[str, ...]) -> tuple[dict, ...]: + """Default values processor for transposed files.""" + return tuple( + dict(zip([id_key, vals[0]], items)) for items in zip(ids, vals[1:])) + +def file_data(zfile: ZipFile, + member_key: str, + cdata: dict, + process_value: Callable[[dict], dict] = lambda val: val, + process_transposed_value: Callable[ + [str, tuple[str, ...], tuple[str, ...]], + tuple[dict, ...]] = __default_process_value_transposed__) -> Iterator[dict]: + """Load data from files in R/qtl2 zip bundle.""" + if not cdata.get(f"{member_key}_transposed", False): + for row in with_non_transposed(zfile, member_key, cdata, process_value): + yield row + return + + for row in with_transposed( + zfile, member_key, cdata, process_transposed_value): + yield row diff --git a/tests/r_qtl/test_files/test_founder_geno.zip b/tests/r_qtl/test_files/test_founder_geno.zip new file mode 100644 index 0000000..f77626b Binary files /dev/null and b/tests/r_qtl/test_files/test_founder_geno.zip differ diff --git a/tests/r_qtl/test_files/test_founder_geno_transposed.zip b/tests/r_qtl/test_files/test_founder_geno_transposed.zip new file mode 100644 index 0000000..6cc8151 Binary files /dev/null and b/tests/r_qtl/test_files/test_founder_geno_transposed.zip differ diff --git a/tests/r_qtl/test_r_qtl2_geno.py b/tests/r_qtl/test_r_qtl2_geno.py index d323c4b..787d13a 100644 --- a/tests/r_qtl/test_r_qtl2_geno.py +++ b/tests/r_qtl/test_r_qtl2_geno.py @@ -177,11 +177,80 @@ def test_parse_geno_files(relpath, expected): @pytest.mark.unit_test @pytest.mark.parametrize( "relpath,expected", - ( - ("tests/r_qtl/test_files/test_founder_geno.zip", - ()), - ("tests/r_qtl/test_files/test_founder_geno_transposed.zip", - ()))) + (("tests/r_qtl/test_files/test_founder_geno.zip", + (({ + "id": "1", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "2", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "3", + "PVV4": 2, + "AXR-1": 2, + "HH.335C-Col/PhyA": None, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "4", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "5", + "PVV4": None, + "AXR-1": 2, + "HH.335C-Col/PhyA": 2, + "EC.480C": 2, + "EC.66C": 2 + }))), + ("tests/r_qtl/test_files/test_founder_geno_transposed.zip", + (({ + "id": "1", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "2", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "3", + "PVV4": 2, + "AXR-1": 2, + "HH.335C-Col/PhyA": None, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "4", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "5", + "PVV4": None, + "AXR-1": 2, + "HH.335C-Col/PhyA": 2, + "EC.480C": 2, + "EC.66C": 2 + }))))) def test_parse_founder_geno_files(relpath, expected): """Test parsing of founder_geno files from the R/qtl2 bundle. @@ -190,5 +259,9 @@ def test_parse_founder_geno_files(relpath, expected): THEN: ensure that the data we get is as expected """ with ZipFile(Path(relpath).absolute(), "r") as zfile: - assert tuple(rqtl2.founder_genotype_data( - zfile, rqtl2.control_data(zfile))) == expected + cdata = rqtl2.control_data(zfile) + assert tuple(rqtl2.file_data( + zfile, + "founder_geno", + cdata, + *rqtl2.make_process_data_geno(cdata))) == expected -- cgit v1.2.3