From 2162cec2084f712993180618eb92c5a6dfdc5963 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Thu, 28 Dec 2023 13:19:50 +0300
Subject: Rework parsing of transposed geno files.

---
 mypy.ini                        |   3 +
 r_qtl/r_qtl2.py                 |  46 +++++++++------
 tests/r_qtl/test_r_qtl2_geno.py | 125 +++++++++++++++++++++++-----------------
 3 files changed, 101 insertions(+), 73 deletions(-)

diff --git a/mypy.ini b/mypy.ini
index b0c3b7b..08e896e 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -25,4 +25,7 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 
 [mypy-MySQLdb.*]
+ignore_missing_imports = True
+
+[mypy-yaml.*]
 ignore_missing_imports = True
\ No newline at end of file
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 47f101e..ec7a954 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -5,11 +5,13 @@ import json
 import yaml
 from pathlib import Path
 from functools import reduce
-from typing import Any, List, Union, Iterator
 from zipfile import ZipFile, ZipInfo, is_zipfile
+from typing import Any, List, Union, Iterator, Iterable
 
 from r_qtl.errors import InvalidFormat
 
+from quality_control.parsing import take
+
 def thread_op(value, *functions):
     """Thread the `value` through the sequence of `functions`."""
     return reduce(lambda result, func: func(result), functions, value)
@@ -30,25 +32,16 @@ def control_data(zfile: ZipFile) -> dict:
             if files[0].endswith(".json")
             else yaml.safe_load(zfile.read(files[0])))
 
-def genotype_metadata(zfile: ZipFile, cdata: dict) -> dict:
-    """Read Individual ID key and the marker names."""
-    line_num = 0
-    with zfile.open(cdata["geno"]) as genofile:
-        for line in filter(lambda line: not line.startswith("#"),
-                           io.TextIOWrapper(genofile)):
-            line_parts = line.strip().split(cdata.get("sep", ","))
-            return {
-                "individual_id_key": line_parts[0].strip(),
-                "markers": tuple(marker.strip() for marker in line_parts[1:])
-            }
-
 def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load the genotype file, making use of the control data."""
     def replace_genotype_codes(val):
         return cdata["genotypes"].get(val, val)
 
     def replace_na_strings(val):
-        return (None if val in cdata["na.strings"] else val)
+        # A missing or empty "na.strings" entry leaves every value as-is.
+        if val in (cdata.get("na.strings") or ()):
+            return None
+        return val
 
     if not cdata.get("geno_transposed", False):
         with zfile.open(cdata["geno"]) as genofile:
@@ -68,13 +61,20 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
 
     def __merge__(key, samples, line):
         marker = line[0]
-        return (
+        return tuple(
             dict(zip(
                 [key, marker],
                 (thread_op(item, replace_genotype_codes, replace_na_strings)
                  for item in items)))
             for items in zip(samples, line[1:]))
 
+    def __n_batch__(iterable: Iterable, num):
+        """Yield each non-empty result of `take(iterable, num)` in turn."""
+        # NOTE(review): assumes `take` returns a sized collection that is
+        # empty once `iterable` is exhausted -- confirm against its source.
+        while batch := take(iterable, num):
+            yield batch
+
     if cdata.get("geno_transposed", False):
         with zfile.open(cdata["geno"]) as genofile:
             lines = (line.strip().split(cdata.get("sep", ","))
@@ -82,11 +82,19 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
                                          io.TextIOWrapper(genofile)))
             id_line = next(lines)
             id_key, samples = id_line[0], id_line[1:]
-            for line in lines:
-                for row in __merge__(id_key, samples, line):
-                    yield row
+            def __organise_by_id__(acc, item):
+                # In-place update; copying `acc` per item was quadratic.
+                acc.setdefault(item[id_key], {}).update(item)
+                return acc
+            for row in reduce(# type: ignore[var-annotated]
+                    __organise_by_id__,
+                    (row for batch in __n_batch__(lines, 300)
+                     for line in batch
+                     for row in __merge__(id_key, samples, line)),
+                    {}).values():
+                yield row
 
-def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> dict:
+def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> tuple[dict, ...]:
     """Read gmap files to get the genome mapping data"""
     assert map_type in ("genetic-map", "physical-map"), "Invalid map type"
     map_file = cdata[{
diff --git a/tests/r_qtl/test_r_qtl2_geno.py b/tests/r_qtl/test_r_qtl2_geno.py
index 908ef55..7b660a6 100644
--- a/tests/r_qtl/test_r_qtl2_geno.py
+++ b/tests/r_qtl/test_r_qtl2_geno.py
@@ -92,60 +92,77 @@ from r_qtl import r_qtl2 as rqtl2
              "EC.66C": 2
            })),
         ("tests/r_qtl/test_files/test_geno_transposed.zip",
-         ({"id": "1", "PVV4": 1},
-          {"id": "2", "PVV4": 1},
-          {"id": "3", "PVV4": 2},
-          {"id": "4", "PVV4": 1},
-          {"id": "5", "PVV4": 2},
-          {"id": "6", "PVV4": 2},
-          {"id": "7", "PVV4": 1},
-          {"id": "8", "PVV4": 2},
-          {"id": "9", "PVV4": None},
-          {"id": "10", "PVV4": 2},
-
-          {"id": "1", "AXR-1": 1},
-          {"id": "2", "AXR-1": 1},
-          {"id": "3", "AXR-1": 2},
-          {"id": "4", "AXR-1": 1},
-          {"id": "5", "AXR-1": 2},
-          {"id": "6", "AXR-1": 2},
-          {"id": "7", "AXR-1": 1},
-          {"id": "8", "AXR-1": 2},
-          {"id": "9", "AXR-1": 2},
-          {"id": "10", "AXR-1": 2},
-
-          {"id": "1", "HH.335C-Col/PhyA": 1},
-          {"id": "2", "HH.335C-Col/PhyA": 1},
-          {"id": "3", "HH.335C-Col/PhyA": None},
-          {"id": "4", "HH.335C-Col/PhyA": 1},
-          {"id": "5", "HH.335C-Col/PhyA": 2},
-          {"id": "6", "HH.335C-Col/PhyA": 2},
-          {"id": "7", "HH.335C-Col/PhyA": 1},
-          {"id": "8", "HH.335C-Col/PhyA": 2},
-          {"id": "9", "HH.335C-Col/PhyA": 2},
-          {"id": "10", "HH.335C-Col/PhyA": 2},
-
-          {"id": "1", "EC.480C": 1},
-          {"id": "2", "EC.480C": 1},
-          {"id": "3", "EC.480C": 1},
-          {"id": "4", "EC.480C": 1},
-          {"id": "5", "EC.480C": 2},
-          {"id": "6", "EC.480C": 2},
-          {"id": "7", "EC.480C": 1},
-          {"id": "8", "EC.480C": 1},
-          {"id": "9", "EC.480C": 2},
-          {"id": "10", "EC.480C": 2},
-
-          {"id": "1","EC.66C": 1},
-          {"id": "2", "EC.66C": 1},
-          {"id": "3", "EC.66C": 1},
-          {"id": "4", "EC.66C": 1},
-          {"id": "5", "EC.66C": 2},
-          {"id": "6", "EC.66C": 2},
-          {"id": "7", "EC.66C": 1},
-          {"id": "8", "EC.66C": 1},
-          {"id": "9", "EC.66C": 2},
-          {"id": "10", "EC.66C": 2}))))
+         ({
+             "id": "1",
+             "PVV4": 1,
+             "AXR-1": 1,
+             "HH.335C-Col/PhyA": 1,
+             "EC.480C": 1,
+             "EC.66C": 1
+         }, {
+             "id": "2",
+             "PVV4": 1,
+             "AXR-1": 1,
+             "HH.335C-Col/PhyA": 1,
+             "EC.480C": 1,
+             "EC.66C": 1
+         }, {
+             "id": "3",
+             "PVV4": 2,
+             "AXR-1": 2,
+             "HH.335C-Col/PhyA": None,
+             "EC.480C": 1,
+             "EC.66C": 1
+         }, {
+             "id": "4",
+             "PVV4": 1,
+             "AXR-1": 1,
+             "HH.335C-Col/PhyA": 1,
+             "EC.480C": 1,
+             "EC.66C": 1
+         },{
+             "id": "5",
+             "PVV4": 2,
+             "AXR-1": 2,
+             "HH.335C-Col/PhyA": 2,
+             "EC.480C": 2,
+             "EC.66C": 2
+         }, {
+             "id": "6",
+             "PVV4": 2,
+             "AXR-1": 2,
+             "HH.335C-Col/PhyA": 2,
+             "EC.480C": 2,
+             "EC.66C": 2
+         }, {
+             "id": "7",
+             "PVV4": 1,
+             "AXR-1": 1,
+             "HH.335C-Col/PhyA": 1,
+             "EC.480C": 1,
+             "EC.66C": 1
+         }, {
+             "id": "8",
+             "PVV4": 2,
+             "AXR-1": 2,
+             "HH.335C-Col/PhyA": 2,
+             "EC.480C": 1,
+             "EC.66C": 1
+         }, {
+             "id": "9",
+             "PVV4": None,
+             "AXR-1": 2,
+             "HH.335C-Col/PhyA": 2,
+             "EC.480C": 2,
+             "EC.66C": 2
+         }, {
+             "id": "10",
+             "PVV4": 2,
+             "AXR-1": 2,
+             "HH.335C-Col/PhyA": 2,
+             "EC.480C": 2,
+             "EC.66C": 2
+         }))))
 def test_parse_geno_files(relpath,expected):
     """
     GIVEN: Path to a zip file with R/qtl2 data
-- 
cgit v1.2.3