From 9481d1705f735a1087ced871bcb169d147e44dd0 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Wed, 3 Jan 2024 04:52:07 +0300
Subject: Refactor: Extract potentially reusable functions

The processing of transposed files is probably going to be very similar,
thus the need to extract some reusable code from the geno-file-specific
function in preparation.
---
 r_qtl/r_qtl2.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'r_qtl')

diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 4d609fd..16bb652 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -47,6 +47,22 @@ def with_non_transposed(zfile: ZipFile,
         for row in reader:
             yield func(row)
 
+def __make_organise_by_id__(id_key):
+    """Return a function to use with `reduce` to organise values by some
+    identifier."""
+    def __organiser__(acc, item):
+        row = acc.get(item[id_key], {})
+        return {**acc, item[id_key]: {**row, **item}}
+    return __organiser__
+
+def __batch_of_n__(iterable: Iterable, num):
+    """Return a batch of `num` items or less from the `iterable`."""
+    while True:
+        items = take(iterable, num)
+        if len(items) <= 0:
+            break
+        yield items
+
 def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load the genotype file, making use of the control data."""
     def replace_genotype_codes(val):
@@ -78,13 +94,6 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
                                  for item in items)))
                 for items in zip(samples, line[1:]))
 
-    def __n_batch__(iterable: Iterable, num):
-        while True:
-            items = take(iterable, num)
-            if len(items) <= 0:
-                break
-            yield items
-
     if cdata.get("geno_transposed", False):
         with zfile.open(cdata["geno"]) as genofile:
             lines = (line.strip().split(cdata.get("sep", ","))
@@ -93,13 +102,10 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
             try:
                 id_line = next(lines)
                 id_key, samples = id_line[0], id_line[1:]
-                def __organise_by_id__(acc, item):
-                    row = acc.get(item[id_key], {})
-                    return {**acc, item[id_key]: {**row, **item}}
                 for _key, row in reduce(# type: ignore[var-annotated]
-                        __organise_by_id__,
+                        __make_organise_by_id__(id_key),
                         (row
-                         for batch in __n_batch__(lines, 300)
+                         for batch in __batch_of_n__(lines, 300)
                          for line in batch
                          for row in __merge__(id_key, samples, line)),
                         {}).items():
--
cgit v1.2.3