From d4fff5fda2d9fe2b9730a7cffcc8f85b3a8eff17 Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Thu, 28 Dec 2023 09:55:26 +0300
Subject: Handle transposed geno files.

---
 r_qtl/r_qtl2.py                 |  50 ++++++++++-----
 tests/r_qtl/test_r_qtl2_geno.py | 134 ++++++++++++++++------------------------
 2 files changed, 88 insertions(+), 96 deletions(-)

diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index e019d99..47f101e 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -32,7 +32,6 @@ def control_data(zfile: ZipFile) -> dict:
 
 def genotype_metadata(zfile: ZipFile, cdata: dict) -> dict:
     """Read Individual ID key and the marker names."""
-    # TODO: Handle transposed files
     line_num = 0
     with zfile.open(cdata["geno"]) as genofile:
         for line in filter(lambda line: not line.startswith("#"),
@@ -45,25 +44,48 @@ def genotype_metadata(zfile: ZipFile, cdata: dict) -> dict:
 
 def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load the genotype file, making use of the control data."""
-    # TODO: Handle transposed files
-    with zfile.open(cdata["geno"]) as genofile:
-        reader = csv.DictReader(filter(lambda line: not line.startswith("#"),
-                                       io.TextIOWrapper(genofile)),
-                                delimiter=cdata.get("sep", ","))
-        if not cdata.get("geno_transposed", False):
+    def replace_genotype_codes(val):
+        return cdata["genotypes"].get(val, val)
+
+    def replace_na_strings(val):
+        return (None if val in cdata["na.strings"] else val)
+
+    if not cdata.get("geno_transposed", False):
+        with zfile.open(cdata["geno"]) as genofile:
+            reader = csv.DictReader(
+                filter(lambda line: not line.startswith("#"),
+                       io.TextIOWrapper(genofile)),
+                delimiter=cdata.get("sep", ","))
             for row in reader:
                 yield {
                     key: thread_op(
                         value,
-                        # replace genotype codes
-                        lambda val: cdata["genotypes"].get(val, val),
-                        # replace N/A strings
-                        lambda val: (
-                            None if val in cdata["na.strings"] else val))
+                        replace_genotype_codes,
+                        replace_na_strings)
                     for key,value
                     in row.items()
                 }
 
+    def __merge__(key, samples, line):
+        marker = line[0]
+        return (
+            dict(zip(
+                [key, marker],
+                (thread_op(item, replace_genotype_codes, replace_na_strings)
+                 for item in items)))
+            for items in zip(samples, line[1:]))
+
+    if cdata.get("geno_transposed", False):
+        with zfile.open(cdata["geno"]) as genofile:
+            lines = (line.strip().split(cdata.get("sep", ","))
+                     for line in filter(lambda line: not line.startswith("#"),
+                                         io.TextIOWrapper(genofile)))
+            id_line = next(lines)
+            id_key, samples = id_line[0], id_line[1:]
+            for line in lines:
+                for row in __merge__(id_key, samples, line):
+                    yield row
+
 def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> dict:
     """Read gmap files to get the genome mapping data"""
     assert map_type in ("genetic-map", "physical-map"), "Invalid map type"
@@ -71,10 +93,6 @@ def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> dict:
         "genetic-map": "gmap",
         "physical-map": "pmap"
     }[map_type]]
-    # TODO: Might need to check `gmap_transposed` and `pmap_transposed` instead
-    #  of `geno_transposed` -- see
-    #  https://github.com/rqtl/qtl2data/blob/main/ArabMAGIC/arabmagic_tair8.json
-    #  for the *_transposed values
     transposed_dict = {
         "genetic-map": "gmap_transposed",
         "physical-map": "pmap_transposed"
diff --git a/tests/r_qtl/test_r_qtl2_geno.py b/tests/r_qtl/test_r_qtl2_geno.py
index 5ebb5a9..908ef55 100644
--- a/tests/r_qtl/test_r_qtl2_geno.py
+++ b/tests/r_qtl/test_r_qtl2_geno.py
@@ -92,86 +92,60 @@ from r_qtl import r_qtl2 as rqtl2
              "EC.66C": 2
            })),
         ("tests/r_qtl/test_files/test_geno_transposed.zip",
-         ({
-             "id": "1",
-             "PVV4": 1,
-             "AXR-1": 1,
-             "HH.335C-Col/PhyA": 1,
-             "EC.480C": 1,
-             "EC.66C": 1
-           },
-          {
-             "id": "2",
-             "PVV4": 1,
-             "AXR-1": 1,
-             "HH.335C-Col/PhyA": 1,
-             "EC.480C": 1,
-             "EC.66C": 1
-           },
-          {
-             "id": "3",
-             "PVV4": 2,
-             "AXR-1": 2,
-             "HH.335C-Col/PhyA": None,
-             "EC.480C": 1,
-             "EC.66C": 1
-           },
-          {
-             "id": "4",
-             "PVV4": 1,
-             "AXR-1": 1,
-             "HH.335C-Col/PhyA": 1,
-             "EC.480C": 1,
-             "EC.66C": 1
-           },
-          {
-             "id": "5",
-             "PVV4": 2,
-             "AXR-1": 2,
-             "HH.335C-Col/PhyA": 2,
-             "EC.480C": 2,
-             "EC.66C": 2
-           },
-          {
-             "id": "6",
-             "PVV4": 2,
-             "AXR-1": 2,
-             "HH.335C-Col/PhyA": 2,
-             "EC.480C": 2,
-             "EC.66C": 2
-           },
-          {
-             "id": "7",
-             "PVV4": 1,
-             "AXR-1": 1,
-             "HH.335C-Col/PhyA": 1,
-             "EC.480C": 1,
-             "EC.66C": 1
-           },
-          {
-             "id": "8",
-             "PVV4": 2,
-             "AXR-1": 2,
-             "HH.335C-Col/PhyA": 2,
-             "EC.480C": 1,
-             "EC.66C": 1
-           },
-          {
-             "id": "9",
-             "PVV4": None,
-             "AXR-1": 2,
-             "HH.335C-Col/PhyA": 2,
-             "EC.480C": 2,
-             "EC.66C": 2
-           },
-          {
-             "id": "10",
-             "PVV4": 2,
-             "AXR-1": 2,
-             "HH.335C-Col/PhyA": 2,
-             "EC.480C": 2,
-             "EC.66C": 2
-           }))))
+         ({"id": "1", "PVV4": 1},
+          {"id": "2", "PVV4": 1},
+          {"id": "3", "PVV4": 2},
+          {"id": "4", "PVV4": 1},
+          {"id": "5", "PVV4": 2},
+          {"id": "6", "PVV4": 2},
+          {"id": "7", "PVV4": 1},
+          {"id": "8", "PVV4": 2},
+          {"id": "9", "PVV4": None},
+          {"id": "10", "PVV4": 2},
+
+          {"id": "1", "AXR-1": 1},
+          {"id": "2", "AXR-1": 1},
+          {"id": "3", "AXR-1": 2},
+          {"id": "4", "AXR-1": 1},
+          {"id": "5", "AXR-1": 2},
+          {"id": "6", "AXR-1": 2},
+          {"id": "7", "AXR-1": 1},
+          {"id": "8", "AXR-1": 2},
+          {"id": "9", "AXR-1": 2},
+          {"id": "10", "AXR-1": 2},
+
+          {"id": "1", "HH.335C-Col/PhyA": 1},
+          {"id": "2", "HH.335C-Col/PhyA": 1},
+          {"id": "3", "HH.335C-Col/PhyA": None},
+          {"id": "4", "HH.335C-Col/PhyA": 1},
+          {"id": "5", "HH.335C-Col/PhyA": 2},
+          {"id": "6", "HH.335C-Col/PhyA": 2},
+          {"id": "7", "HH.335C-Col/PhyA": 1},
+          {"id": "8", "HH.335C-Col/PhyA": 2},
+          {"id": "9", "HH.335C-Col/PhyA": 2},
+          {"id": "10", "HH.335C-Col/PhyA": 2},
+
+          {"id": "1", "EC.480C": 1},
+          {"id": "2", "EC.480C": 1},
+          {"id": "3", "EC.480C": 1},
+          {"id": "4", "EC.480C": 1},
+          {"id": "5", "EC.480C": 2},
+          {"id": "6", "EC.480C": 2},
+          {"id": "7", "EC.480C": 1},
+          {"id": "8", "EC.480C": 1},
+          {"id": "9", "EC.480C": 2},
+          {"id": "10", "EC.480C": 2},
+
+          {"id": "1","EC.66C": 1},
+          {"id": "2", "EC.66C": 1},
+          {"id": "3", "EC.66C": 1},
+          {"id": "4", "EC.66C": 1},
+          {"id": "5", "EC.66C": 2},
+          {"id": "6", "EC.66C": 2},
+          {"id": "7", "EC.66C": 1},
+          {"id": "8", "EC.66C": 1},
+          {"id": "9", "EC.66C": 2},
+          {"id": "10", "EC.66C": 2}))))
 def test_parse_geno_files(relpath,expected):
     """
     GIVEN: Path to a zip file with R/qtl2 data
-- 
cgit v1.2.3