about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-01-03 16:41:33 +0300
committer: Frederick Muriuki Muriithi, 2024-01-03 16:41:33 +0300
commit: 95d2b868adebbc7ebbc2435f9184c30c014ec513 (patch)
tree: ed4849c1162d66766e24a9c97c4b03e6a225db22
parent: e713b566a873424709300110a78801fd49a74ea1 (diff)
download: gn-uploader-95d2b868adebbc7ebbc2435f9184c30c014ec513.tar.gz
Parse founder_geno files. Generalise parsing files.
* Add tests for parsing "founder_geno" files
* Extract common file parsing structure out to more general function
* Use generic function to parse "founder_geno" file in test
-rw-r--r-- r_qtl/r_qtl2.py | 54
-rw-r--r-- tests/r_qtl/test_files/test_founder_geno.zip | bin 0 -> 672 bytes
-rw-r--r-- tests/r_qtl/test_files/test_founder_geno_transposed.zip | bin 0 -> 725 bytes
-rw-r--r-- tests/r_qtl/test_r_qtl2_geno.py | 87
4 files changed, 134 insertions, 7 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 2256609..4dac24b 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -105,6 +105,35 @@ def with_transposed(zfile: ZipFile,
         except StopIteration:
             pass
 
+def make_process_data_geno(cdata) -> tuple[
+        Callable[[dict], dict],
+        Callable[[str, tuple[str, ...], tuple[str, ...]],
+                 tuple[dict, ...]]]:
+    """Build functions to process genotype data."""
+    def replace_genotype_codes(val):
+        return cdata["genotypes"].get(val, val)
+
+    def replace_na_strings(val):
+        nastrings = cdata.get("na.strings")
+        if bool(nastrings):
+            return (None if val in nastrings else val)
+        return val
+    def __non_transposed__(row: dict) -> dict:
+        return {
+            key: thread_op(value, replace_genotype_codes, replace_na_strings)
+            for key,value in row.items()
+        }
+    def __transposed__(id_key: str,
+                       ids: tuple[str, ...],
+                       vals: tuple[str, ...]) -> tuple[dict, ...]:
+        return tuple(
+            dict(zip(
+                [id_key, vals[0]],
+                (thread_op(item, replace_genotype_codes, replace_na_strings)
+                 for item in items)))
+            for items in zip(ids, vals[1:]))
+    return (__non_transposed__, __transposed__)
+
 def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load the genotype file, making use of the control data."""
     def replace_genotype_codes(val):
@@ -176,3 +205,28 @@ def phenotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
                      for items in zip(ids, vals[1:]))
     for row in with_transposed(zfile, "pheno", cdata, __merge__):
         yield row
+
+def __default_process_value_transposed__(
+        id_key: str,
+        ids: tuple[str, ...],
+        vals: tuple[str, ...]) -> tuple[dict, ...]:
+    """Default values processor for transposed files."""
+    return tuple(
+        dict(zip([id_key, vals[0]], items)) for items in zip(ids, vals[1:]))
+
+def file_data(zfile: ZipFile,
+              member_key: str,
+              cdata: dict,
+              process_value: Callable[[dict], dict] = lambda val: val,
+              process_transposed_value: Callable[
+                  [str, tuple[str, ...], tuple[str, ...]],
+                  tuple[dict, ...]] = __default_process_value_transposed__) -> Iterator[dict]:
+    """Load data from files in R/qtl2 zip bundle."""
+    if not cdata.get(f"{member_key}_transposed", False):
+        for row in with_non_transposed(zfile, member_key, cdata, process_value):
+            yield row
+        return
+
+    for row in with_transposed(
+            zfile, member_key, cdata, process_transposed_value):
+        yield row
diff --git a/tests/r_qtl/test_files/test_founder_geno.zip b/tests/r_qtl/test_files/test_founder_geno.zip
new file mode 100644
index 0000000..f77626b
--- /dev/null
+++ b/tests/r_qtl/test_files/test_founder_geno.zip
Binary files differ
diff --git a/tests/r_qtl/test_files/test_founder_geno_transposed.zip b/tests/r_qtl/test_files/test_founder_geno_transposed.zip
new file mode 100644
index 0000000..6cc8151
--- /dev/null
+++ b/tests/r_qtl/test_files/test_founder_geno_transposed.zip
Binary files differ
diff --git a/tests/r_qtl/test_r_qtl2_geno.py b/tests/r_qtl/test_r_qtl2_geno.py
index d323c4b..787d13a 100644
--- a/tests/r_qtl/test_r_qtl2_geno.py
+++ b/tests/r_qtl/test_r_qtl2_geno.py
@@ -177,11 +177,80 @@ def test_parse_geno_files(relpath, expected):
 @pytest.mark.unit_test
 @pytest.mark.parametrize(
     "relpath,expected",
-    (
-        ("tests/r_qtl/test_files/test_founder_geno.zip",
-         ()),
-        ("tests/r_qtl/test_files/test_founder_geno_transposed.zip",
-         ())))
+    (("tests/r_qtl/test_files/test_founder_geno.zip",
+      (({
+          "id": "1",
+          "PVV4": 1,
+          "AXR-1": 1,
+          "HH.335C-Col/PhyA": 1,
+          "EC.480C": 1,
+          "EC.66C": 1
+      }, {
+          "id": "2",
+          "PVV4": 1,
+          "AXR-1": 1,
+          "HH.335C-Col/PhyA": 1,
+          "EC.480C": 1,
+          "EC.66C": 1
+      }, {
+          "id": "3",
+          "PVV4": 2,
+          "AXR-1": 2,
+          "HH.335C-Col/PhyA": None,
+          "EC.480C": 1,
+          "EC.66C": 1
+      }, {
+          "id": "4",
+          "PVV4": 1,
+          "AXR-1": 1,
+          "HH.335C-Col/PhyA": 1,
+          "EC.480C": 1,
+          "EC.66C": 1
+      }, {
+          "id": "5",
+          "PVV4": None,
+          "AXR-1": 2,
+          "HH.335C-Col/PhyA": 2,
+          "EC.480C": 2,
+          "EC.66C": 2
+      }))),
+     ("tests/r_qtl/test_files/test_founder_geno_transposed.zip",
+      (({
+          "id": "1",
+          "PVV4": 1,
+          "AXR-1": 1,
+          "HH.335C-Col/PhyA": 1,
+          "EC.480C": 1,
+          "EC.66C": 1
+      }, {
+          "id": "2",
+          "PVV4": 1,
+          "AXR-1": 1,
+          "HH.335C-Col/PhyA": 1,
+          "EC.480C": 1,
+          "EC.66C": 1
+      }, {
+          "id": "3",
+          "PVV4": 2,
+          "AXR-1": 2,
+          "HH.335C-Col/PhyA": None,
+          "EC.480C": 1,
+          "EC.66C": 1
+      }, {
+          "id": "4",
+          "PVV4": 1,
+          "AXR-1": 1,
+          "HH.335C-Col/PhyA": 1,
+          "EC.480C": 1,
+          "EC.66C": 1
+      }, {
+          "id": "5",
+          "PVV4": None,
+          "AXR-1": 2,
+          "HH.335C-Col/PhyA": 2,
+          "EC.480C": 2,
+          "EC.66C": 2
+      })))))
 def test_parse_founder_geno_files(relpath, expected):
     """Test parsing of founder_geno files from the R/qtl2 bundle.
 
@@ -190,5 +259,9 @@ def test_parse_founder_geno_files(relpath, expected):
     THEN: ensure that the data we get is as expected
     """
     with ZipFile(Path(relpath).absolute(), "r") as zfile:
-        assert tuple(rqtl2.founder_genotype_data(
-            zfile, rqtl2.control_data(zfile))) == expected
+        cdata = rqtl2.control_data(zfile)
+        assert tuple(rqtl2.file_data(
+            zfile,
+            "founder_geno",
+            cdata,
+            *rqtl2.make_process_data_geno(cdata))) == expected