From 9b51f59bc4b598c1136525300af5f696bcf66fc0 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Mon, 15 Jan 2024 18:36:06 +0300 Subject: Process `na.strings` even for default cases There was a bug where the `na.strings` were not processed correctly if the user called the `r_qtl.r_qtl2.file_data(...)` function without explicitly providing the `process_*` arguments. This commit fixes that. --- r_qtl/r_qtl2.py | 89 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 36 deletions(-) (limited to 'r_qtl') diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index b688404..13ac355 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -4,7 +4,7 @@ import csv import json from zipfile import ZipFile from functools import reduce, partial -from typing import Iterator, Iterable, Callable +from typing import Iterator, Iterable, Callable, Optional import yaml @@ -28,6 +28,13 @@ def control_data(zfile: ZipFile) -> dict: if files[0].endswith(".json") else yaml.safe_load(zfile.read(files[0]))) +def replace_na_strings(cdata, val): + """Replace values indicated in `na.strings` with `None`.""" + nastrings = cdata.get("na.strings") + if bool(nastrings): + return (None if val in nastrings else val) + return val + def with_non_transposed(zfile: ZipFile, member_key: str, cdata: dict, @@ -46,24 +53,27 @@ def with_non_transposed(zfile: ZipFile, sep = cdata.get("sep", ",") with zfile.open(cdata[member_key]) as innerfile: - wrapped_file = io.TextIOWrapper(innerfile) - firstrow = tuple( - field.strip() for field in - next(filter(not_comment_line, wrapped_file)).strip().split(sep)) - id_key = firstrow[0] - wrapped_file.seek(0) - reader = csv.DictReader(filter(not_comment_line, wrapped_file), - delimiter=sep) - for row in reader: - processed = process_value(row) - yield { - "id": processed[id_key], - **{ - key: value - for key, value in processed.items() - if key != id_key + try: + wrapped_file = io.TextIOWrapper(innerfile) + firstrow = tuple( + field.strip() for field in + next(filter(not_comment_line, wrapped_file)).strip().split(sep)) + id_key = firstrow[0] + wrapped_file.seek(0) + reader = csv.DictReader(filter(not_comment_line, wrapped_file), + delimiter=sep) + for row in reader: + processed = process_value(row) + yield { + "id": processed[id_key], + **{ + key: value + for key, value in processed.items() + if key != id_key + } } - } + except StopIteration as exc: + raise InvalidFormat("The file has no rows!") from exc def __make_organise_by_id__(id_key): """Return a function to use with `reduce` to organise values by some @@ -129,14 +139,10 @@ def make_process_data_geno(cdata) -> tuple[ def replace_genotype_codes(val): return cdata["genotypes"].get(val, val) - def replace_na_strings(val): - nastrings = cdata.get("na.strings") - if bool(nastrings): - return (None if val in nastrings else val) - return val def __non_transposed__(row: dict) -> dict: return { - key: chain(value, replace_genotype_codes, replace_na_strings) + key: chain(value, replace_genotype_codes, + partial(replace_na_strings, cdata)) for key,value in row.items() } def __transposed__(id_key: str, @@ -145,7 +151,7 @@ def make_process_data_geno(cdata) -> tuple[ return tuple( dict(zip( [id_key, vals[0]], - (chain(item, replace_genotype_codes, replace_na_strings) + (chain(item, replace_genotype_codes, partial(replace_na_strings, cdata)) for item in items))) for items in zip(ids, vals[1:])) return (__non_transposed__, __transposed__) @@ -189,22 +195,33 @@ def make_process_data_covar(cdata) -> tuple[ for items in zip(ids, vals[1:])) return (non_transposed, transposed) -def __default_process_value_transposed__( - id_key: str, - ids: tuple[str, ...], - vals: tuple[str, ...]) -> tuple[dict, ...]: - """Default values processor for transposed files.""" - return tuple( - dict(zip([id_key, vals[0]], items)) for items in zip(ids, vals[1:])) - def file_data(zfile: ZipFile, member_key: str, cdata: dict, - process_value: Callable[[dict], dict] = lambda val: val, - process_transposed_value: Callable[ + process_value: Optional[Callable[[dict], dict]] = None, + process_transposed_value: Optional[Callable[ [str, tuple[str, ...], tuple[str, ...]], - tuple[dict, ...]] = __default_process_value_transposed__) -> Iterator[dict]: + tuple[dict, ...]]] = None) -> Iterator[dict]: """Load data from files in R/qtl2 zip bundle.""" + def __default_process_value_non_transposed__(val: dict) -> dict: + return { + key: replace_na_strings(cdata, value) for key,value in val.items() + } + + def __default_process_value_transposed__( + id_key: str, + ids: tuple[str, ...], + vals: tuple[str, ...]) -> tuple[dict, ...]: + """Default values processor for transposed files.""" + return tuple( + dict(zip([id_key, replace_na_strings(cdata, vals[0])], items)) + for items in zip( + ids, (replace_na_strings(cdata, val) for val in vals[1:]))) + + process_value = process_value or __default_process_value_non_transposed__ + process_transposed_value = ( + process_transposed_value or __default_process_value_transposed__) + try: if isinstance(cdata[member_key], list): for row in (line for lines in -- cgit v1.2.3