| author    | Frederick Muriuki Muriithi | 2024-01-03 04:52:07 +0300 |
|-----------|----------------------------|---------------------------|
| committer | Frederick Muriuki Muriithi | 2024-01-03 04:52:07 +0300 |
| commit    | 9481d1705f735a1087ced871bcb169d147e44dd0 (patch) | |
| tree      | a4de0e380ca1d379aef2b6917462a2b50b3bd82b /r_qtl | |
| parent    | 645e98ab0bf341bdc4f739e5002c47e08fd6159b (diff) | |
| download  | gn-uploader-9481d1705f735a1087ced871bcb169d147e44dd0.tar.gz | |
Refactor: Extract potentially reusable functions
The processing of transposed files is probably going to be very
similar, so extract some reusable code from the geno-file-specific
function in preparation.
Diffstat (limited to 'r_qtl')
-rw-r--r-- | r_qtl/r_qtl2.py | 30 |
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 4d609fd..16bb652 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -47,6 +47,22 @@ def with_non_transposed(zfile: ZipFile,
         for row in reader:
             yield func(row)
 
+def __make_organise_by_id__(id_key):
+    """Return a function to use with `reduce` to organise values by some
+    identifier."""
+    def __organiser__(acc, item):
+        row = acc.get(item[id_key], {})
+        return {**acc, item[id_key]: {**row, **item}}
+    return __organiser__
+
+def __batch_of_n__(iterable: Iterable, num):
+    """Return a batch of `num` items or less from the `iterable`."""
+    while True:
+        items = take(iterable, num)
+        if len(items) <= 0:
+            break
+        yield items
+
 def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load the genotype file, making use of the control data."""
     def replace_genotype_codes(val):
@@ -78,13 +94,6 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
                      for item in items)))
                 for items in zip(samples, line[1:]))
 
-    def __n_batch__(iterable: Iterable, num):
-        while True:
-            items = take(iterable, num)
-            if len(items) <= 0:
-                break
-            yield items
-
     if cdata.get("geno_transposed", False):
         with zfile.open(cdata["geno"]) as genofile:
             lines = (line.strip().split(cdata.get("sep", ","))
@@ -93,13 +102,10 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
             try:
                 id_line = next(lines)
                 id_key, samples = id_line[0], id_line[1:]
-                def __organise_by_id__(acc, item):
-                    row = acc.get(item[id_key], {})
-                    return {**acc, item[id_key]: {**row, **item}}
                 for _key, row in reduce(# type: ignore[var-annotated]
-                        __organise_by_id__,
+                        __make_organise_by_id__(id_key),
                         (row
-                         for batch in __n_batch__(lines, 300)
+                         for batch in __batch_of_n__(lines, 300)
                          for line in batch
                          for row in __merge__(id_key, samples, line)),
                         {}).items():
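To make the intent of the two extracted helpers concrete, the following is a minimal, self-contained sketch of how `__make_organise_by_id__` and `__batch_of_n__` combine with `functools.reduce` to organise per-sample fragments by identifier. The `take` stand-in (a thin wrapper over `itertools.islice`) and the example `fragments` data are assumptions for illustration only; in the actual module `take` is the project's own helper and the fragments are produced by `__merge__`.

```python
from functools import reduce
from itertools import islice
from typing import Iterable, Iterator

def take(iterable: Iterable, num: int) -> list:
    """Take at most `num` items from `iterable`.

    Stand-in for the project's own `take` helper, assumed here for the sketch.
    """
    return list(islice(iterable, num))

def __make_organise_by_id__(id_key):
    """Return a reducer that merges items sharing the same `id_key` value."""
    def __organiser__(acc, item):
        row = acc.get(item[id_key], {})
        return {**acc, item[id_key]: {**row, **item}}
    return __organiser__

def __batch_of_n__(iterable: Iterable, num: int) -> Iterator[list]:
    """Yield successive batches of at most `num` items from `iterable`."""
    while True:
        items = take(iterable, num)
        if len(items) <= 0:
            break
        yield items

# Hypothetical per-sample fragments, roughly the shape `__merge__` yields for a
# transposed file: each dict carries the identifier plus one marker's value.
fragments = iter([
    {"id": "S1", "rs1": "A"},
    {"id": "S2", "rs1": "B"},
    {"id": "S1", "rs2": "H"},
    {"id": "S2", "rs2": "A"},
])

organised = reduce(
    __make_organise_by_id__("id"),
    (item
     for batch in __batch_of_n__(fragments, 300)
     for item in batch),
    {})
print(organised)
# {'S1': {'id': 'S1', 'rs1': 'A', 'rs2': 'H'},
#  'S2': {'id': 'S2', 'rs1': 'B', 'rs2': 'A'}}
```

The factory closes over `id_key`, which is what lets the reducer live at module level instead of being redefined inside `genotype_data` for every file, so the same helpers can be reused when other transposed files get similar treatment.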