From 9b51f59bc4b598c1136525300af5f696bcf66fc0 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Mon, 15 Jan 2024 18:36:06 +0300
Subject: Process `na.strings` even for default cases

There was a bug where values listed in `na.strings` were not replaced
with `None` if the user called the `r_qtl.r_qtl2.file_data(...)`
function without explicitly providing the `process_*` arguments.

This commit fixes that.
---
 r_qtl/r_qtl2.py | 89 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 36 deletions(-)

(limited to 'r_qtl')

diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index b688404..13ac355 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -4,7 +4,7 @@ import csv
 import json
 from zipfile import ZipFile
 from functools import reduce, partial
-from typing import Iterator, Iterable, Callable
+from typing import Iterator, Iterable, Callable, Optional
 
 import yaml
 
@@ -28,6 +28,13 @@ def control_data(zfile: ZipFile) -> dict:
             if files[0].endswith(".json")
             else yaml.safe_load(zfile.read(files[0])))
 
+def replace_na_strings(cdata, val):
+    """Replace values indicated in `na.strings` with `None`."""
+    nastrings = cdata.get("na.strings")
+    if bool(nastrings):
+        return (None if val in nastrings else val)
+    return val
+
 def with_non_transposed(zfile: ZipFile,
                         member_key: str,
                         cdata: dict,
@@ -46,24 +53,27 @@ def with_non_transposed(zfile: ZipFile,
 
     sep = cdata.get("sep", ",")
     with zfile.open(cdata[member_key]) as innerfile:
-        wrapped_file = io.TextIOWrapper(innerfile)
-        firstrow = tuple(
-            field.strip() for field in
-            next(filter(not_comment_line, wrapped_file)).strip().split(sep))
-        id_key = firstrow[0]
-        wrapped_file.seek(0)
-        reader = csv.DictReader(filter(not_comment_line, wrapped_file),
-                                delimiter=sep)
-        for row in reader:
-            processed = process_value(row)
-            yield {
-                "id": processed[id_key],
-                **{
-                    key: value
-                    for key, value in processed.items()
-                    if key != id_key
+        try:
+            wrapped_file = io.TextIOWrapper(innerfile)
+            firstrow = tuple(
+                field.strip() for field in
+                next(filter(not_comment_line, wrapped_file)).strip().split(sep))
+            id_key = firstrow[0]
+            wrapped_file.seek(0)
+            reader = csv.DictReader(filter(not_comment_line, wrapped_file),
+                                    delimiter=sep)
+            for row in reader:
+                processed = process_value(row)
+                yield {
+                    "id": processed[id_key],
+                    **{
+                        key: value
+                        for key, value in processed.items()
+                        if key != id_key
+                    }
                 }
-            }
+        except StopIteration as exc:
+            raise InvalidFormat("The file has no rows!") from exc
 
 def __make_organise_by_id__(id_key):
     """Return a function to use with `reduce` to organise values by some
@@ -129,14 +139,10 @@ def make_process_data_geno(cdata) -> tuple[
     def replace_genotype_codes(val):
         return cdata["genotypes"].get(val, val)
 
-    def replace_na_strings(val):
-        nastrings = cdata.get("na.strings")
-        if bool(nastrings):
-            return (None if val in nastrings else val)
-        return val
     def __non_transposed__(row: dict) -> dict:
         return {
-            key: chain(value, replace_genotype_codes, replace_na_strings)
+            key: chain(value, replace_genotype_codes,
+                       partial(replace_na_strings, cdata))
             for key,value in row.items()
         }
     def __transposed__(id_key: str,
@@ -145,7 +151,7 @@ def make_process_data_geno(cdata) -> tuple[
         return tuple(
             dict(zip(
                 [id_key, vals[0]],
-                (chain(item, replace_genotype_codes, replace_na_strings)
+                (chain(item, replace_genotype_codes, partial(replace_na_strings, cdata))
                  for item in items)))
             for items in zip(ids, vals[1:]))
     return (__non_transposed__, __transposed__)
@@ -189,22 +195,33 @@ def make_process_data_covar(cdata) -> tuple[
             for items in zip(ids, vals[1:]))
     return (non_transposed, transposed)
 
-def __default_process_value_transposed__(
-        id_key: str,
-        ids: tuple[str, ...],
-        vals: tuple[str, ...]) -> tuple[dict, ...]:
-    """Default values processor for transposed files."""
-    return tuple(
-        dict(zip([id_key, vals[0]], items)) for items in zip(ids, vals[1:]))
-
 def file_data(zfile: ZipFile,
               member_key: str,
               cdata: dict,
-              process_value: Callable[[dict], dict] = lambda val: val,
-              process_transposed_value: Callable[
+              process_value: Optional[Callable[[dict], dict]] = None,
+              process_transposed_value: Optional[Callable[
                   [str, tuple[str, ...], tuple[str, ...]],
-                  tuple[dict, ...]] = __default_process_value_transposed__) -> Iterator[dict]:
+                  tuple[dict, ...]]] = None) -> Iterator[dict]:
     """Load data from files in R/qtl2 zip bundle."""
+    def __default_process_value_non_transposed__(val: dict) -> dict:
+        return {
+            key: replace_na_strings(cdata, value) for key,value in val.items()
+        }
+
+    def __default_process_value_transposed__(
+            id_key: str,
+            ids: tuple[str, ...],
+            vals: tuple[str, ...]) -> tuple[dict, ...]:
+        """Default values processor for transposed files."""
+        return tuple(
+            dict(zip([id_key, replace_na_strings(cdata, vals[0])], items))
+            for items in zip(
+                    ids, (replace_na_strings(cdata, val) for val in vals[1:])))
+
+    process_value = process_value or __default_process_value_non_transposed__
+    process_transposed_value = (
+        process_transposed_value or __default_process_value_transposed__)
+
     try:
         if isinstance(cdata[member_key], list):
             for row in (line for lines in
-- 
cgit 1.4.1