about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-02-20 05:16:45 +0300
committer: Frederick Muriuki Muriithi, 2024-02-20 05:16:45 +0300
commit: b1483d974d30d162e12557f55e856ec7d79bad2e (patch)
tree: 0711530c5f06e2485edf4e768187e3df79faad55
parent: 8e692b0c372db3999dfc3989f361676c579fb9cd (diff)
download: gn-uploader-b1483d974d30d162e12557f55e856ec7d79bad2e.tar.gz
Read each file separately
Provide the function 'read_file_data' in the 'r_qtl.r_qtl2' module to
read each file in the bundle separately.

The function 'file_data' in the 'r_qtl.r_qtl2' module reads *ALL* the
files of a particular type (e.g. geno files) and returns a single
generator object with the data from *ALL* the files. This does not
lend itself well to error checking.

We needed a way to check for errors, and report them for each and
every file in the bundle, for easier tracking and fixing.
-rw-r--r-- r_qtl/r_qtl2.py 24
1 files changed, 22 insertions, 2 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 93b8c8e..8c17362 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -5,7 +5,7 @@ import json
 from pathlib import Path
 from zipfile import ZipFile
 from functools import reduce, partial
-from typing import Iterator, Iterable, Callable, Optional
+from typing import Union, Iterator, Iterable, Callable, Optional
 
 import yaml
 
@@ -302,7 +302,7 @@ def raw_file_data(zipfilepath: Union[str, Path],
           zfile.open(memberfilename) as innerfile):
         wrappedfile = io.TextIOWrapper(innerfile)
         for  line in wrappedfile:
-            yield line.strip()
+            yield line
 
 def strip_comments(rawdata: Iterator[str], commentchar) -> Iterator[str]:
     """Remove comments from raw text."""
@@ -334,3 +334,23 @@ def read_control_file(zipfilepath: Union[str, Path]) -> dict:
                 if bool(cdata.get(ftype))
             }
         }
+
+
+def read_file_data(
+        zipfilepath: Union[str, Path],
+        memberfilename: str,
+        processfile: Callable[[Iterator[str]], Iterator[str]] = lambda itr: itr,
+        processline: Callable[[str], str] = lambda line: line,
+        processfield: Callable[
+            [Optional[str]], Optional[str]] = lambda val: val) -> Iterator[
+                tuple[Optional[str], ...]]:
+    """Read a single file from the bundle processing each field."""
+    cdata = read_control_file(zipfilepath)
+    return (
+        tuple(processfield(field.strip())
+              for field in processline(row.strip()).split(cdata["sep"]))
+        for row in
+        processfile(
+            strip_comments(
+                raw_file_data(zipfilepath, memberfilename),
+                cdata["comment.char"])))