diff options
author | Frederick Muriuki Muriithi | 2024-02-20 05:16:45 +0300 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2024-02-20 05:16:45 +0300 |
commit | b1483d974d30d162e12557f55e856ec7d79bad2e (patch) | |
tree | 0711530c5f06e2485edf4e768187e3df79faad55 | |
parent | 8e692b0c372db3999dfc3989f361676c579fb9cd (diff) | |
download | gn-uploader-b1483d974d30d162e12557f55e856ec7d79bad2e.tar.gz |
Read each file separately
Provide the function 'read_file_data' in the 'r_qtl.r_qtl2' module to
read each file in the bundle separately.
The function 'file_data' in the 'r_qtl.r_qtl2' module reads *ALL* the
files of a particular type (e.g. geno files) and returns a single
generator object with the data from *ALL* the files. This does not
lend itself well to error checking.
We needed a way to check for errors, and report them for each and
every file in the bundle, for easier tracking and fixing.
-rw-r--r-- | r_qtl/r_qtl2.py | 24 |
1 files changed, 22 insertions, 2 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index 93b8c8e..8c17362 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -5,7 +5,7 @@ import json from pathlib import Path from zipfile import ZipFile from functools import reduce, partial -from typing import Iterator, Iterable, Callable, Optional +from typing import Union, Iterator, Iterable, Callable, Optional import yaml @@ -302,7 +302,7 @@ def raw_file_data(zipfilepath: Union[str, Path], zfile.open(memberfilename) as innerfile): wrappedfile = io.TextIOWrapper(innerfile) for line in wrappedfile: - yield line.strip() + yield line def strip_comments(rawdata: Iterator[str], commentchar) -> Iterator[str]: """Remove comments from raw text.""" @@ -334,3 +334,23 @@ def read_control_file(zipfilepath: Union[str, Path]) -> dict: if bool(cdata.get(ftype)) } } + + +def read_file_data( + zipfilepath: Union[str, Path], + memberfilename: str, + processfile: Callable[[Iterator[str]], Iterator[str]] = lambda itr: itr, + processline: Callable[[str], str] = lambda line: line, + processfield: Callable[ + [Optional[str]], Optional[str]] = lambda val: val) -> Iterator[ + tuple[Optional[str], ...]]: + """Read a single file from the bundle processing each field.""" + cdata = read_control_file(zipfilepath) + return ( + tuple(processfield(field.strip()) + for field in processline(row.strip()).split(cdata["sep"])) + for row in + processfile( + strip_comments( + raw_file_data(zipfilepath, memberfilename), + cdata["comment.char"]))) |