From 2162cec2084f712993180618eb92c5a6dfdc5963 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 28 Dec 2023 13:19:50 +0300 Subject: Rework parsing of transposed geno files. --- mypy.ini | 3 + r_qtl/r_qtl2.py | 46 +++++++++------ tests/r_qtl/test_r_qtl2_geno.py | 125 +++++++++++++++++++++++----------------- 3 files changed, 101 insertions(+), 73 deletions(-) diff --git a/mypy.ini b/mypy.ini index b0c3b7b..08e896e 100644 --- a/mypy.ini +++ b/mypy.ini @@ -25,4 +25,7 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-MySQLdb.*] +ignore_missing_imports = True + +[mypy-yaml.*] ignore_missing_imports = True \ No newline at end of file diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index 47f101e..ec7a954 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -5,11 +5,13 @@ import json import yaml from pathlib import Path from functools import reduce -from typing import Any, List, Union, Iterator from zipfile import ZipFile, ZipInfo, is_zipfile +from typing import Any, List, Union, Iterator, Iterable from r_qtl.errors import InvalidFormat +from quality_control.parsing import take + def thread_op(value, *functions): """Thread the `value` through the sequence of `functions`.""" return reduce(lambda result, func: func(result), functions, value) @@ -30,25 +32,16 @@ def control_data(zfile: ZipFile) -> dict: if files[0].endswith(".json") else yaml.safe_load(zfile.read(files[0]))) -def genotype_metadata(zfile: ZipFile, cdata: dict) -> dict: - """Read Individual ID key and the marker names.""" - line_num = 0 - with zfile.open(cdata["geno"]) as genofile: - for line in filter(lambda line: not line.startswith("#"), - io.TextIOWrapper(genofile)): - line_parts = line.strip().split(cdata.get("sep", ",")) - return { - "individual_id_key": line_parts[0].strip(), - "markers": tuple(marker.strip() for marker in line_parts[1:]) - } - def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: """Load the genotype file, making use of the control data.""" def replace_genotype_codes(val): return cdata["genotypes"].get(val, val) def replace_na_strings(val): - return (None if val in cdata["na.strings"] else val) + nastrings = cdata.get("na.strings") + if bool(nastrings): + return (None if val in nastrings else val) + return val if not cdata.get("geno_transposed", False): with zfile.open(cdata["geno"]) as genofile: @@ -68,13 +61,20 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: def __merge__(key, samples, line): marker = line[0] - return ( + return tuple( dict(zip( [key, marker], (thread_op(item, replace_genotype_codes, replace_na_strings) for item in items))) for items in zip(samples, line[1:])) + def __n_batch__(iterable: Iterable, num): + while True: + items = take(iterable, num) + if len(items) <= 0: + break + yield items + if cdata.get("geno_transposed", False): with zfile.open(cdata["geno"]) as genofile: lines = (line.strip().split(cdata.get("sep", ",")) @@ -82,11 +82,19 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: io.TextIOWrapper(genofile))) id_line = next(lines) id_key, samples = id_line[0], id_line[1:] - for line in lines: - for row in __merge__(id_key, samples, line): - yield row + def __organise_by_id__(acc, item): + row = acc.get(item[id_key], {}) + return {**acc, item[id_key]: {**row, **item}} + for _key, row in reduce(# type: ignore[var-annotated] + __organise_by_id__, + (row + for batch in __n_batch__(lines, 300) + for line in batch + for row in __merge__(id_key, samples, line)), + {}).items(): + yield row -def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> dict: +def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> tuple[dict, ...]: """Read gmap files to get the genome mapping data""" assert map_type in ("genetic-map", "physical-map"), "Invalid map type" map_file = cdata[{ diff --git a/tests/r_qtl/test_r_qtl2_geno.py b/tests/r_qtl/test_r_qtl2_geno.py index 908ef55..7b660a6 100644 --- a/tests/r_qtl/test_r_qtl2_geno.py +++ b/tests/r_qtl/test_r_qtl2_geno.py @@ -92,60 +92,77 @@ from r_qtl import r_qtl2 as rqtl2 "EC.66C": 2 })), ("tests/r_qtl/test_files/test_geno_transposed.zip", - ({"id": "1", "PVV4": 1}, - {"id": "2", "PVV4": 1}, - {"id": "3", "PVV4": 2}, - {"id": "4", "PVV4": 1}, - {"id": "5", "PVV4": 2}, - {"id": "6", "PVV4": 2}, - {"id": "7", "PVV4": 1}, - {"id": "8", "PVV4": 2}, - {"id": "9", "PVV4": None}, - {"id": "10", "PVV4": 2}, - - {"id": "1", "AXR-1": 1}, - {"id": "2", "AXR-1": 1}, - {"id": "3", "AXR-1": 2}, - {"id": "4", "AXR-1": 1}, - {"id": "5", "AXR-1": 2}, - {"id": "6", "AXR-1": 2}, - {"id": "7", "AXR-1": 1}, - {"id": "8", "AXR-1": 2}, - {"id": "9", "AXR-1": 2}, - {"id": "10", "AXR-1": 2}, - - {"id": "1", "HH.335C-Col/PhyA": 1}, - {"id": "2", "HH.335C-Col/PhyA": 1}, - {"id": "3", "HH.335C-Col/PhyA": None}, - {"id": "4", "HH.335C-Col/PhyA": 1}, - {"id": "5", "HH.335C-Col/PhyA": 2}, - {"id": "6", "HH.335C-Col/PhyA": 2}, - {"id": "7", "HH.335C-Col/PhyA": 1}, - {"id": "8", "HH.335C-Col/PhyA": 2}, - {"id": "9", "HH.335C-Col/PhyA": 2}, - {"id": "10", "HH.335C-Col/PhyA": 2}, - - {"id": "1", "EC.480C": 1}, - {"id": "2", "EC.480C": 1}, - {"id": "3", "EC.480C": 1}, - {"id": "4", "EC.480C": 1}, - {"id": "5", "EC.480C": 2}, - {"id": "6", "EC.480C": 2}, - {"id": "7", "EC.480C": 1}, - {"id": "8", "EC.480C": 1}, - {"id": "9", "EC.480C": 2}, - {"id": "10", "EC.480C": 2}, - - {"id": "1","EC.66C": 1}, - {"id": "2", "EC.66C": 1}, - {"id": "3", "EC.66C": 1}, - {"id": "4", "EC.66C": 1}, - {"id": "5", "EC.66C": 2}, - {"id": "6", "EC.66C": 2}, - {"id": "7", "EC.66C": 1}, - {"id": "8", "EC.66C": 1}, - {"id": "9", "EC.66C": 2}, - {"id": "10", "EC.66C": 2})))) + ({ + "id": "1", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "2", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "3", + "PVV4": 2, + "AXR-1": 2, + "HH.335C-Col/PhyA": None, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "4", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + },{ + "id": "5", + "PVV4": 2, + "AXR-1": 2, + "HH.335C-Col/PhyA": 2, + "EC.480C": 2, + "EC.66C": 2 + }, { + "id": "6", + "PVV4": 2, + "AXR-1": 2, + "HH.335C-Col/PhyA": 2, + "EC.480C": 2, + "EC.66C": 2 + }, { + "id": "7", + "PVV4": 1, + "AXR-1": 1, + "HH.335C-Col/PhyA": 1, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "8", + "PVV4": 2, + "AXR-1": 2, + "HH.335C-Col/PhyA": 2, + "EC.480C": 1, + "EC.66C": 1 + }, { + "id": "9", + "PVV4": None, + "AXR-1": 2, + "HH.335C-Col/PhyA": 2, + "EC.480C": 2, + "EC.66C": 2 + }, { + "id": "10", + "PVV4": 2, + "AXR-1": 2, + "HH.335C-Col/PhyA": 2, + "EC.480C": 2, + "EC.66C": 2 + })))) def test_parse_geno_files(relpath,expected): """ GIVEN: Path to a zip file with R/qtl2 data -- cgit v1.2.3