From d4fff5fda2d9fe2b9730a7cffcc8f85b3a8eff17 Mon Sep 17 00:00:00 2001 From: Frederick Muriuki Muriithi Date: Thu, 28 Dec 2023 09:55:26 +0300 Subject: Handle transposed geno files. --- r_qtl/r_qtl2.py | 50 ++++++++++----- tests/r_qtl/test_r_qtl2_geno.py | 134 ++++++++++++++++------------------------ 2 files changed, 88 insertions(+), 96 deletions(-) diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index e019d99..47f101e 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -32,7 +32,6 @@ def control_data(zfile: ZipFile) -> dict: def genotype_metadata(zfile: ZipFile, cdata: dict) -> dict: """Read Individual ID key and the marker names.""" - # TODO: Handle transposed files line_num = 0 with zfile.open(cdata["geno"]) as genofile: for line in filter(lambda line: not line.startswith("#"), @@ -45,25 +44,48 @@ def genotype_metadata(zfile: ZipFile, cdata: dict) -> dict: def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]: """Load the genotype file, making use of the control data.""" - # TODO: Handle transposed files - with zfile.open(cdata["geno"]) as genofile: - reader = csv.DictReader(filter(lambda line: not line.startswith("#"), - io.TextIOWrapper(genofile)), - delimiter=cdata.get("sep", ",")) - if not cdata.get("geno_transposed", False): + def replace_genotype_codes(val): + return cdata["genotypes"].get(val, val) + + def replace_na_strings(val): + return (None if val in cdata["na.strings"] else val) + + if not cdata.get("geno_transposed", False): + with zfile.open(cdata["geno"]) as genofile: + reader = csv.DictReader( + filter(lambda line: not line.startswith("#"), + io.TextIOWrapper(genofile)), + delimiter=cdata.get("sep", ",")) for row in reader: yield { key: thread_op( value, - # replace genotype codes - lambda val: cdata["genotypes"].get(val, val), - # replace N/A strings - lambda val: ( - None if val in cdata["na.strings"] else val)) + replace_genotype_codes, + replace_na_strings) for key,value in row.items() } + def __merge__(key, samples, line): + marker = line[0] + return ( + dict(zip( + [key, marker], + (thread_op(item, replace_genotype_codes, replace_na_strings) + for item in items))) + for items in zip(samples, line[1:])) + + if cdata.get("geno_transposed", False): + with zfile.open(cdata["geno"]) as genofile: + lines = (line.strip().split(cdata.get("sep", ",")) + for line in filter(lambda line: not line.startswith("#"), + io.TextIOWrapper(genofile))) + id_line = next(lines) + id_key, samples = id_line[0], id_line[1:] + for line in lines: + for row in __merge__(id_key, samples, line): + yield row + def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> dict: """Read gmap files to get the genome mapping data""" assert map_type in ("genetic-map", "physical-map"), "Invalid map type" @@ -71,10 +93,6 @@ def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> dict: "genetic-map": "gmap", "physical-map": "pmap" }[map_type]] - # TODO: Might need to check `gmap_transposed` and `pmap_transposed` instead - # of `geno_transposed` -- see - # https://github.com/rqtl/qtl2data/blob/main/ArabMAGIC/arabmagic_tair8.json - # for the *_transposed values transposed_dict = { "genetic-map": "gmap_transposed", "physical-map": "pmap_transposed" diff --git a/tests/r_qtl/test_r_qtl2_geno.py b/tests/r_qtl/test_r_qtl2_geno.py index 5ebb5a9..908ef55 100644 --- a/tests/r_qtl/test_r_qtl2_geno.py +++ b/tests/r_qtl/test_r_qtl2_geno.py @@ -92,86 +92,60 @@ from r_qtl import r_qtl2 as rqtl2 "EC.66C": 2 })), ("tests/r_qtl/test_files/test_geno_transposed.zip", - ({ - "id": "1", - "PVV4": 1, - "AXR-1": 1, - "HH.335C-Col/PhyA": 1, - "EC.480C": 1, - "EC.66C": 1 - }, - { - "id": "2", - "PVV4": 1, - "AXR-1": 1, - "HH.335C-Col/PhyA": 1, - "EC.480C": 1, - "EC.66C": 1 - }, - { - "id": "3", - "PVV4": 2, - "AXR-1": 2, - "HH.335C-Col/PhyA": None, - "EC.480C": 1, - "EC.66C": 1 - }, - { - "id": "4", - "PVV4": 1, - "AXR-1": 1, - "HH.335C-Col/PhyA": 1, - "EC.480C": 1, - "EC.66C": 1 - }, - { - "id": "5", - "PVV4": 2, - "AXR-1": 2, - "HH.335C-Col/PhyA": 2, - "EC.480C": 2, - "EC.66C": 2 - }, - { - "id": "6", - "PVV4": 2, - "AXR-1": 2, - "HH.335C-Col/PhyA": 2, - "EC.480C": 2, - "EC.66C": 2 - }, - { - "id": "7", - "PVV4": 1, - "AXR-1": 1, - "HH.335C-Col/PhyA": 1, - "EC.480C": 1, - "EC.66C": 1 - }, - { - "id": "8", - "PVV4": 2, - "AXR-1": 2, - "HH.335C-Col/PhyA": 2, - "EC.480C": 1, - "EC.66C": 1 - }, - { - "id": "9", - "PVV4": None, - "AXR-1": 2, - "HH.335C-Col/PhyA": 2, - "EC.480C": 2, - "EC.66C": 2 - }, - { - "id": "10", - "PVV4": 2, - "AXR-1": 2, - "HH.335C-Col/PhyA": 2, - "EC.480C": 2, - "EC.66C": 2 - })))) + ({"id": "1", "PVV4": 1}, + {"id": "2", "PVV4": 1}, + {"id": "3", "PVV4": 2}, + {"id": "4", "PVV4": 1}, + {"id": "5", "PVV4": 2}, + {"id": "6", "PVV4": 2}, + {"id": "7", "PVV4": 1}, + {"id": "8", "PVV4": 2}, + {"id": "9", "PVV4": None}, + {"id": "10", "PVV4": 2}, + + {"id": "1", "AXR-1": 1}, + {"id": "2", "AXR-1": 1}, + {"id": "3", "AXR-1": 2}, + {"id": "4", "AXR-1": 1}, + {"id": "5", "AXR-1": 2}, + {"id": "6", "AXR-1": 2}, + {"id": "7", "AXR-1": 1}, + {"id": "8", "AXR-1": 2}, + {"id": "9", "AXR-1": 2}, + {"id": "10", "AXR-1": 2}, + + {"id": "1", "HH.335C-Col/PhyA": 1}, + {"id": "2", "HH.335C-Col/PhyA": 1}, + {"id": "3", "HH.335C-Col/PhyA": None}, + {"id": "4", "HH.335C-Col/PhyA": 1}, + {"id": "5", "HH.335C-Col/PhyA": 2}, + {"id": "6", "HH.335C-Col/PhyA": 2}, + {"id": "7", "HH.335C-Col/PhyA": 1}, + {"id": "8", "HH.335C-Col/PhyA": 2}, + {"id": "9", "HH.335C-Col/PhyA": 2}, + {"id": "10", "HH.335C-Col/PhyA": 2}, + + {"id": "1", "EC.480C": 1}, + {"id": "2", "EC.480C": 1}, + {"id": "3", "EC.480C": 1}, + {"id": "4", "EC.480C": 1}, + {"id": "5", "EC.480C": 2}, + {"id": "6", "EC.480C": 2}, + {"id": "7", "EC.480C": 1}, + {"id": "8", "EC.480C": 1}, + {"id": "9", "EC.480C": 2}, + {"id": "10", "EC.480C": 2}, + + {"id": "1","EC.66C": 1}, + {"id": "2", "EC.66C": 1}, + {"id": "3", "EC.66C": 1}, + {"id": "4", "EC.66C": 1}, + {"id": "5", "EC.66C": 2}, + {"id": "6", "EC.66C": 2}, + {"id": "7", "EC.66C": 1}, + {"id": "8", "EC.66C": 1}, + {"id": "9", "EC.66C": 2}, + {"id": "10", "EC.66C": 2})))) def test_parse_geno_files(relpath,expected): """ GIVEN: Path to a zip file with R/qtl2 data -- cgit v1.2.3