From 2162cec2084f712993180618eb92c5a6dfdc5963 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 28 Dec 2023 13:19:50 +0300 Subject: Rework parsing of transposed geno files. --- r_qtl/r_qtl2.py | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'r_qtl') diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index 47f101e..ec7a954 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -5,11 +5,13 @@ import json import yaml from pathlib import Path from functools import reduce -from typing import Any, List, Union, Iterator from zipfile import ZipFile, ZipInfo, is_zipfile +from typing import Any, List, Union, Iterator, Iterable from r_qtl.errors import InvalidFormat +from quality_control.parsing import take + def thread_op(value, *functions): """Thread the `value` through the sequence of `functions`.""" return reduce(lambda result, func: func(result), functions, value) @@ -30,25 +32,16 @@ def control_data(zfile: ZipFile) -> dict: if files[0].endswith(".json") else yaml.safe_load(zfile.read(files[0]))) -def genotype_metadata(zfile: ZipFile, cdata: dict) -> dict: - """Read Individual ID key and the marker names.""" - line_num = 0 - with zfile.open(cdata["geno"]) as genofile: - for line in filter(lambda line: not line.startswith("#"), - io.TextIOWrapper(genofile)): - line_parts = line.strip().split(cdata.get("sep", ",")) - return { - "individual_id_key": line_parts[0].strip(), - "markers": tuple(marker.strip() for marker in line_parts[1:]) - } - def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: """Load the genotype file, making use of the control data.""" def replace_genotype_codes(val): return cdata["genotypes"].get(val, val) def replace_na_strings(val): - return (None if val in cdata["na.strings"] else val) + nastrings = cdata.get("na.strings") + if bool(nastrings): + return (None if val in nastrings else val) + return val if not cdata.get("geno_transposed", False): with zfile.open(cdata["geno"]) as genofile: @@ -68,13 +61,20 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: def __merge__(key, samples, line): marker = line[0] - return ( + return tuple( dict(zip( [key, marker], (thread_op(item, replace_genotype_codes, replace_na_strings) for item in items))) for items in zip(samples, line[1:])) + def __n_batch__(iterable: Iterable, num): + while True: + items = take(iterable, num) + if len(items) <= 0: + break + yield items + if cdata.get("geno_transposed", False): with zfile.open(cdata["geno"]) as genofile: lines = (line.strip().split(cdata.get("sep", ",")) @@ -82,11 +82,19 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: io.TextIOWrapper(genofile))) id_line = next(lines) id_key, samples = id_line[0], id_line[1:] - for line in lines: - for row in __merge__(id_key, samples, line): - yield row + def __organise_by_id__(acc, item): + row = acc.get(item[id_key], {}) + return {**acc, item[id_key]: {**row, **item}} + for _key, row in reduce(# type: ignore[var-annotated] + __organise_by_id__, + (row + for batch in __n_batch__(lines, 300) + for line in batch + for row in __merge__(id_key, samples, line)), + {}).items(): + yield row -def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> dict: +def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> tuple[dict, ...]: """Read gmap files to get the genome mapping data""" assert map_type in ("genetic-map", "physical-map"), "Invalid map type" map_file = cdata[{ -- cgit v1.2.3