diff options
-rw-r--r-- | README.org | 2 | ||||
-rw-r--r-- | r_qtl/r_qtl2.py | 30 |
2 files changed, 19 insertions, 13 deletions
@@ -72,7 +72,7 @@ Run unit tests with:
 To run the linter over the code base, run:
 
 #+BEGIN_SRC shell
-  pylint *.py tests quality_control qc_app r_qtl scripts
+  pylint setup.py wsgi.py tests quality_control qc_app r_qtl scripts
 #+END_SRC
 
 To check for correct type usage in the application, run:
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 4d609fd..16bb652 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -47,6 +47,22 @@ def with_non_transposed(zfile: ZipFile,
         for row in reader:
             yield func(row)
 
+def __make_organise_by_id__(id_key):
+    """Return a function to use with `reduce` to organise values by some
+    identifier."""
+    def __organiser__(acc, item):
+        row = acc.get(item[id_key], {})
+        return {**acc, item[id_key]: {**row, **item}}
+    return __organiser__
+
+def __batch_of_n__(iterable: Iterable, num):
+    """Return a batch of `num` items or less from the `iterable`."""
+    while True:
+        items = take(iterable, num)
+        if len(items) <= 0:
+            break
+        yield items
+
 def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load the genotype file, making use of the control data."""
     def replace_genotype_codes(val):
@@ -78,13 +94,6 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
                     for item in items)))
                 for items in zip(samples, line[1:]))
 
-    def __n_batch__(iterable: Iterable, num):
-        while True:
-            items = take(iterable, num)
-            if len(items) <= 0:
-                break
-            yield items
-
     if cdata.get("geno_transposed", False):
         with zfile.open(cdata["geno"]) as genofile:
             lines = (line.strip().split(cdata.get("sep", ","))
@@ -93,13 +102,10 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
             try:
                 id_line = next(lines)
                 id_key, samples = id_line[0], id_line[1:]
-                def __organise_by_id__(acc, item):
-                    row = acc.get(item[id_key], {})
-                    return {**acc, item[id_key]: {**row, **item}}
                 for _key, row in reduce(# type: ignore[var-annotated]
-                        __organise_by_id__,
+                        __make_organise_by_id__(id_key),
                         (row
-                         for batch in __n_batch__(lines, 300)
+                         for batch in __batch_of_n__(lines, 300)
                          for line in batch
                          for row in __merge__(id_key, samples, line)),
                         {}).items():