aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gn3/db/genotypes.py38
-rw-r--r--tests/unit/db/test_genotypes.py101
2 files changed, 136 insertions, 3 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 8710d2e..b5d14a5 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -107,7 +107,7 @@ def parse_genotype_header(line: str, parlist = tuple()):
("prgy", prgy),
("nprgy", len(prgy)))
-def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
+def parse_genotype_marker(line: str, geno_obj: dict, parlist: list):
"""
Parse a data line in a genotype file
@@ -143,3 +143,39 @@ def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
("cM", cM),
("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None),
("genotype", genotype))
+
+def build_genotype_chromosomes(geno_obj, markers):
+ """
+ Build up the chromosomes from the given markers and partially built geno
+ object
+ """
+ mrks = [dict(marker) for marker in markers]
+ chr_names = {marker["chr"] for marker in mrks}
+ return tuple((
+ ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2),
+ ("mb_column", geno_obj["mb_column"]),
+ ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name)))
+ for chr_name in sorted(chr_names))
+
+def parse_genotype_file(filename: str, parlist = tuple()):
+ """
+ Parse the provided genotype file into a usable pytho3 data structure.
+ """
+ with open(filename, "r") as infile:
+ contents = infile.readlines()
+
+ lines = tuple(line for line in contents if
+ ((not line.strip().startswith("#")) and
+ (not line.strip() == "")))
+ labels = parse_genotype_labels(
+ line for line in lines if line.startswith("@"))
+ data_lines = tuple(line for line in lines if not line.startswith("@"))
+ header = parse_genotype_header(data_lines[0], parlist)
+ geno_obj = dict(labels + header)
+ markers = tuple(
+ parse_genotype_marker(line, geno_obj, parlist)
+ for line in data_lines[1:])
+ chromosomes = tuple(
+ dict(chromosome) for chromosome in
+ build_genotype_chromosomes(geno_obj, markers))
+ return {**geno_obj, "chromosomes": chromosomes}
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
index ba90191..a05ce48 100644
--- a/tests/unit/db/test_genotypes.py
+++ b/tests/unit/db/test_genotypes.py
@@ -1,7 +1,11 @@
"""Tests gn3.db.genotypes"""
from unittest import TestCase
from gn3.db.genotypes import (
- parse_genotype_labels, parse_genotype_header, parse_genotype_data_line)
+ parse_genotype_file,
+ parse_genotype_labels,
+ parse_genotype_header,
+ parse_genotype_marker,
+ build_genotype_chromosomes)
class TestGenotypes(TestCase):
"""Tests for functions in `gn3.db.genotypes`."""
@@ -69,5 +73,98 @@ class TestGenotypes(TestCase):
("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]:
with self.subTest(line = line):
self.assertEqual(
- parse_genotype_data_line(line, geno_obj, parlist),
+ parse_genotype_marker(line, geno_obj, parlist),
expected)
+
+ def test_build_genotype_chromosomes(self):
+ """
+ Given `markers` and `geno_obj`, test that `build_genotype_chromosomes`
+ builds a sequence of chromosomes with the given markers ordered
+ according to the `chr` value."""
+ for markers, geno_obj, expected in [
+ [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+ ("Mb", 3.0),
+ ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1))),
+ (("chr", "2"), ("name", "rs31443144"), ("cM", 2.0),
+ ("Mb", 3.0),
+ ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))],
+ {"mat": "B", "pat": "D", "het": "H", "unk": "U",
+ "cm_column": 2, "Mbmap": True, "mb_column": 3},
+ ((("name", "1"), ("mb_exists", True), ("cm_column", 2),
+ ("mb_column", 3),
+ ("loci",
+ ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": 3.0,
+ "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))),
+ (("name", "2"), ("mb_exists", True), ("cm_column", 2),
+ ("mb_column", 3),
+ ("loci",
+ ({"chr": "2", "name": "rs31443144", "cM": 2.0, "Mb": 3.0,
+ "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))))],
+ [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+ ("Mb", None),
+ ("genotype", (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)))],
+ {"mat": "B", "pat": "D", "het": "H", "unk": "U",
+ "cm_column": 2, "Mbmap": False, "mb_column": None},
+ ((("name", "1"), ("mb_exists", False), ("cm_column", 2),
+ ("mb_column", None),
+ ("loci",
+ ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": None,
+ "genotype": (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)},))),)]]:
+ with self.subTest(markers = markers):
+ self.assertEqual(
+ build_genotype_chromosomes(geno_obj, markers),
+ expected)
+
+ def test_parse_genotype_file(self):
+ """Test the parsing of genotype files. """
+ self.assertEqual(
+ parse_genotype_file(
+ "tests/unit/db/data/genotypes/genotype_sample1.geno"),
+ {"group": "BXD",
+ "type": "riset",
+ "mat": "B",
+ "pat": "D",
+ "het": "H",
+ "unk": "U",
+ "Mbmap": True,
+ "cm_column": 2,
+ "mb_column": 3,
+ "prgy": ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"),
+ "nprgy": 6,
+ "chromosomes": (
+ {"name": "1",
+ "mb_exists": True,
+ "cm_column": 2,
+ "mb_column": 3,
+ "loci": (
+ {"chr": "1",
+ "name": "rs31443144",
+ "cM": 2.0,
+ "Mb": 3.0,
+ "genotype": (-1, -1, 1, 1, 1, -1)
+ },
+ {"chr": "1",
+ "name": "rs6269442",
+ "cM": 2.0,
+ "Mb": 3.0,
+ "genotype": (-1, -1, 1, 1, 0, "U")},
+ {"chr": "1",
+ "name": "rs32285189",
+ "cM": 2.0,
+ "Mb": 3.0,
+ "genotype": (-1, "U", 1, 1, 1, -1)})},
+ {"name": "2",
+ "mb_exists": True,
+ "cm_column": 2,
+ "mb_column": 3,
+ "loci": (
+ {"chr": "2",
+ "name": "rs31443144",
+ "cM": 2.0,
+ "Mb": 3.0,
+ "genotype": (-1, -1, 1, 1, 1, -1)},
+ {"chr": "2",
+ "name": "rs6269442",
+ "cM": 2.0,
+ "Mb": 3.0,
+ "genotype": (-1, -1, 1, 1, 0, "U")})})})