about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2024-01-03 16:41:33 +0300
committer: Frederick Muriuki Muriithi, 2024-01-03 16:41:33 +0300
commit: 95d2b868adebbc7ebbc2435f9184c30c014ec513 (patch)
tree: ed4849c1162d66766e24a9c97c4b03e6a225db22
parent: e713b566a873424709300110a78801fd49a74ea1 (diff)
download: gn-uploader-95d2b868adebbc7ebbc2435f9184c30c014ec513.tar.gz
Parse founder_geno files. Generalise parsing files.
* Add tests for parsing "founder_geno" files
* Extract the common file-parsing structure out into a more general function
* Use the generic function to parse the "founder_geno" file in the test
-rw-r--r-- r_qtl/r_qtl2.py | 54
-rw-r--r-- tests/r_qtl/test_files/test_founder_geno.zip | bin 0 -> 672 bytes
-rw-r--r-- tests/r_qtl/test_files/test_founder_geno_transposed.zip | bin 0 -> 725 bytes
-rw-r--r-- tests/r_qtl/test_r_qtl2_geno.py | 87
4 files changed, 134 insertions, 7 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 2256609..4dac24b 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -105,6 +105,35 @@ def with_transposed(zfile: ZipFile,
except StopIteration:
pass
+def make_process_data_geno(cdata) -> tuple[
+ Callable[[dict], dict],
+ Callable[[str, tuple[str, ...], tuple[str, ...]],
+ tuple[dict, ...]]]:
+ """Build functions to process genotype data."""
+ def replace_genotype_codes(val):
+ return cdata["genotypes"].get(val, val)
+
+ def replace_na_strings(val):
+ nastrings = cdata.get("na.strings")
+ if bool(nastrings):
+ return (None if val in nastrings else val)
+ return val
+ def __non_transposed__(row: dict) -> dict:
+ return {
+ key: thread_op(value, replace_genotype_codes, replace_na_strings)
+ for key,value in row.items()
+ }
+ def __transposed__(id_key: str,
+ ids: tuple[str, ...],
+ vals: tuple[str, ...]) -> tuple[dict, ...]:
+ return tuple(
+ dict(zip(
+ [id_key, vals[0]],
+ (thread_op(item, replace_genotype_codes, replace_na_strings)
+ for item in items)))
+ for items in zip(ids, vals[1:]))
+ return (__non_transposed__, __transposed__)
+
def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
"""Load the genotype file, making use of the control data."""
def replace_genotype_codes(val):
@@ -176,3 +205,28 @@ def phenotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
for items in zip(ids, vals[1:]))
for row in with_transposed(zfile, "pheno", cdata, __merge__):
yield row
+
+def __default_process_value_transposed__(
+ id_key: str,
+ ids: tuple[str, ...],
+ vals: tuple[str, ...]) -> tuple[dict, ...]:
+ """Default values processor for transposed files."""
+ return tuple(
+ dict(zip([id_key, vals[0]], items)) for items in zip(ids, vals[1:]))
+
+def file_data(zfile: ZipFile,
+ member_key: str,
+ cdata: dict,
+ process_value: Callable[[dict], dict] = lambda val: val,
+ process_transposed_value: Callable[
+ [str, tuple[str, ...], tuple[str, ...]],
+ tuple[dict, ...]] = __default_process_value_transposed__) -> Iterator[dict]:
+ """Load data from files in R/qtl2 zip bundle."""
+ if not cdata.get(f"{member_key}_transposed", False):
+ for row in with_non_transposed(zfile, member_key, cdata, process_value):
+ yield row
+ return
+
+ for row in with_transposed(
+ zfile, member_key, cdata, process_transposed_value):
+ yield row
diff --git a/tests/r_qtl/test_files/test_founder_geno.zip b/tests/r_qtl/test_files/test_founder_geno.zip
new file mode 100644
index 0000000..f77626b
--- /dev/null
+++ b/tests/r_qtl/test_files/test_founder_geno.zip
Binary files differ
diff --git a/tests/r_qtl/test_files/test_founder_geno_transposed.zip b/tests/r_qtl/test_files/test_founder_geno_transposed.zip
new file mode 100644
index 0000000..6cc8151
--- /dev/null
+++ b/tests/r_qtl/test_files/test_founder_geno_transposed.zip
Binary files differ
diff --git a/tests/r_qtl/test_r_qtl2_geno.py b/tests/r_qtl/test_r_qtl2_geno.py
index d323c4b..787d13a 100644
--- a/tests/r_qtl/test_r_qtl2_geno.py
+++ b/tests/r_qtl/test_r_qtl2_geno.py
@@ -177,11 +177,80 @@ def test_parse_geno_files(relpath, expected):
@pytest.mark.unit_test
@pytest.mark.parametrize(
"relpath,expected",
- (
- ("tests/r_qtl/test_files/test_founder_geno.zip",
- ()),
- ("tests/r_qtl/test_files/test_founder_geno_transposed.zip",
- ())))
+ (("tests/r_qtl/test_files/test_founder_geno.zip",
+ (({
+ "id": "1",
+ "PVV4": 1,
+ "AXR-1": 1,
+ "HH.335C-Col/PhyA": 1,
+ "EC.480C": 1,
+ "EC.66C": 1
+ }, {
+ "id": "2",
+ "PVV4": 1,
+ "AXR-1": 1,
+ "HH.335C-Col/PhyA": 1,
+ "EC.480C": 1,
+ "EC.66C": 1
+ }, {
+ "id": "3",
+ "PVV4": 2,
+ "AXR-1": 2,
+ "HH.335C-Col/PhyA": None,
+ "EC.480C": 1,
+ "EC.66C": 1
+ }, {
+ "id": "4",
+ "PVV4": 1,
+ "AXR-1": 1,
+ "HH.335C-Col/PhyA": 1,
+ "EC.480C": 1,
+ "EC.66C": 1
+ }, {
+ "id": "5",
+ "PVV4": None,
+ "AXR-1": 2,
+ "HH.335C-Col/PhyA": 2,
+ "EC.480C": 2,
+ "EC.66C": 2
+ }))),
+ ("tests/r_qtl/test_files/test_founder_geno_transposed.zip",
+ (({
+ "id": "1",
+ "PVV4": 1,
+ "AXR-1": 1,
+ "HH.335C-Col/PhyA": 1,
+ "EC.480C": 1,
+ "EC.66C": 1
+ }, {
+ "id": "2",
+ "PVV4": 1,
+ "AXR-1": 1,
+ "HH.335C-Col/PhyA": 1,
+ "EC.480C": 1,
+ "EC.66C": 1
+ }, {
+ "id": "3",
+ "PVV4": 2,
+ "AXR-1": 2,
+ "HH.335C-Col/PhyA": None,
+ "EC.480C": 1,
+ "EC.66C": 1
+ }, {
+ "id": "4",
+ "PVV4": 1,
+ "AXR-1": 1,
+ "HH.335C-Col/PhyA": 1,
+ "EC.480C": 1,
+ "EC.66C": 1
+ }, {
+ "id": "5",
+ "PVV4": None,
+ "AXR-1": 2,
+ "HH.335C-Col/PhyA": 2,
+ "EC.480C": 2,
+ "EC.66C": 2
+ })))))
def test_parse_founder_geno_files(relpath, expected):
"""Test parsing of founder_geno files from the R/qtl2 bundle.
@@ -190,5 +259,9 @@ def test_parse_founder_geno_files(relpath, expected):
THEN: ensure that the data we get is as expected
"""
with ZipFile(Path(relpath).absolute(), "r") as zfile:
- assert tuple(rqtl2.founder_genotype_data(
- zfile, rqtl2.control_data(zfile))) == expected
+ cdata = rqtl2.control_data(zfile)
+ assert tuple(rqtl2.file_data(
+ zfile,
+ "founder_geno",
+ cdata,
+ *rqtl2.make_process_data_geno(cdata))) == expected