about | summary | refs | log | tree | commit | diff
path: root/r_qtl
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2023-12-28 13:19:50 +0300
committer Frederick Muriuki Muriithi 2023-12-28 13:20:27 +0300
commit 2162cec2084f712993180618eb92c5a6dfdc5963 (patch)
tree c07f6b18d9cb5dcf31e8731adbde36a75b6553fa /r_qtl
parent d4fff5fda2d9fe2b9730a7cffcc8f85b3a8eff17 (diff)
download gn-uploader-2162cec2084f712993180618eb92c5a6dfdc5963.tar.gz
Rework parsing of transposed geno files.
Diffstat (limited to 'r_qtl')
-rw-r--r-- r_qtl/r_qtl2.py 46
1 files changed, 27 insertions, 19 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 47f101e..ec7a954 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -5,11 +5,13 @@ import json
import yaml
from pathlib import Path
from functools import reduce
-from typing import Any, List, Union, Iterator
from zipfile import ZipFile, ZipInfo, is_zipfile
+from typing import Any, List, Union, Iterator, Iterable
from r_qtl.errors import InvalidFormat
+from quality_control.parsing import take
+
def thread_op(value, *functions):
"""Thread the `value` through the sequence of `functions`."""
return reduce(lambda result, func: func(result), functions, value)
@@ -30,25 +32,16 @@ def control_data(zfile: ZipFile) -> dict:
if files[0].endswith(".json")
else yaml.safe_load(zfile.read(files[0])))
-def genotype_metadata(zfile: ZipFile, cdata: dict) -> dict:
- """Read Individual ID key and the marker names."""
- line_num = 0
- with zfile.open(cdata["geno"]) as genofile:
- for line in filter(lambda line: not line.startswith("#"),
- io.TextIOWrapper(genofile)):
- line_parts = line.strip().split(cdata.get("sep", ","))
- return {
- "individual_id_key": line_parts[0].strip(),
- "markers": tuple(marker.strip() for marker in line_parts[1:])
- }
-
def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
"""Load the genotype file, making use of the control data."""
def replace_genotype_codes(val):
return cdata["genotypes"].get(val, val)
def replace_na_strings(val):
- return (None if val in cdata["na.strings"] else val)
+ nastrings = cdata.get("na.strings")
+ if bool(nastrings):
+ return (None if val in nastrings else val)
+ return val
if not cdata.get("geno_transposed", False):
with zfile.open(cdata["geno"]) as genofile:
@@ -68,13 +61,20 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
def __merge__(key, samples, line):
marker = line[0]
- return (
+ return tuple(
dict(zip(
[key, marker],
(thread_op(item, replace_genotype_codes, replace_na_strings)
for item in items)))
for items in zip(samples, line[1:]))
+ def __n_batch__(iterable: Iterable, num):
+ while True:
+ items = take(iterable, num)
+ if len(items) <= 0:
+ break
+ yield items
+
if cdata.get("geno_transposed", False):
with zfile.open(cdata["geno"]) as genofile:
lines = (line.strip().split(cdata.get("sep", ","))
@@ -82,11 +82,19 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
io.TextIOWrapper(genofile)))
id_line = next(lines)
id_key, samples = id_line[0], id_line[1:]
- for line in lines:
- for row in __merge__(id_key, samples, line):
- yield row
+ def __organise_by_id__(acc, item):
+ row = acc.get(item[id_key], {})
+ return {**acc, item[id_key]: {**row, **item}}
+ for _key, row in reduce(# type: ignore[var-annotated]
+ __organise_by_id__,
+ (row
+ for batch in __n_batch__(lines, 300)
+ for line in batch
+ for row in __merge__(id_key, samples, line)),
+ {}).items():
+ yield row
-def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> dict:
+def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> tuple[dict, ...]:
"""Read gmap files to get the genome mapping data"""
assert map_type in ("genetic-map", "physical-map"), "Invalid map type"
map_file = cdata[{