diff options
author | Muriithi Frederick Muriuki | 2021-09-01 10:49:52 +0300 |
---|---|---|
committer | Muriithi Frederick Muriuki | 2021-09-01 10:49:52 +0300 |
commit | abfc0410a2385d8c3d6ee1915fc99b708e1d0dbc (patch) | |
tree | d1e523ec38a0cffa361d4b70c6ba9282ae8058eb | |
parent | a1c217cf277feda3815a8435d6c8909f1b5546a1 (diff) | |
download | genenetwork3-abfc0410a2385d8c3d6ee1915fc99b708e1d0dbc.tar.gz |
Built top-level genotype file parsing function
Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi
* gn3/db/genotypes.py: parse genotype files
* tests/unit/db/test_genotypes.py: test parsing is correct
Add the overall genotype files parsing function and tests to check that the
parsing works as expected.
-rw-r--r-- | gn3/db/genotypes.py | 38 | ||||
-rw-r--r-- | tests/unit/db/test_genotypes.py | 101 |
2 files changed, 136 insertions, 3 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index 8710d2e..b5d14a5 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -107,7 +107,7 @@ def parse_genotype_header(line: str, parlist = tuple()): ("prgy", prgy), ("nprgy", len(prgy))) -def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): +def parse_genotype_marker(line: str, geno_obj: dict, parlist: list): """ Parse a data line in a genotype file @@ -143,3 +143,39 @@ def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): ("cM", cM), ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None), ("genotype", genotype)) + +def build_genotype_chromosomes(geno_obj, markers): + """ + Build up the chromosomes from the given markers and partially built geno + object + """ + mrks = [dict(marker) for marker in markers] + chr_names = {marker["chr"] for marker in mrks} + return tuple(( + ("name", chr_name), ("mb_exists", geno_obj["Mbmap"]), ("cm_column", 2), + ("mb_column", geno_obj["mb_column"]), + ("loci", tuple(marker for marker in mrks if marker["chr"] == chr_name))) + for chr_name in sorted(chr_names)) + +def parse_genotype_file(filename: str, parlist = tuple()): + """ + Parse the provided genotype file into a usable python3 data structure. 
+ """ + with open(filename, "r") as infile: + contents = infile.readlines() + + lines = tuple(line for line in contents if + ((not line.strip().startswith("#")) and + (not line.strip() == ""))) + labels = parse_genotype_labels( + line for line in lines if line.startswith("@")) + data_lines = tuple(line for line in lines if not line.startswith("@")) + header = parse_genotype_header(data_lines[0], parlist) + geno_obj = dict(labels + header) + markers = tuple( + parse_genotype_marker(line, geno_obj, parlist) + for line in data_lines[1:]) + chromosomes = tuple( + dict(chromosome) for chromosome in + build_genotype_chromosomes(geno_obj, markers)) + return {**geno_obj, "chromosomes": chromosomes} diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py index ba90191..a05ce48 100644 --- a/tests/unit/db/test_genotypes.py +++ b/tests/unit/db/test_genotypes.py @@ -1,7 +1,11 @@ """Tests gn3.db.genotypes""" from unittest import TestCase from gn3.db.genotypes import ( - parse_genotype_labels, parse_genotype_header, parse_genotype_data_line) + parse_genotype_file, + parse_genotype_labels, + parse_genotype_header, + parse_genotype_marker, + build_genotype_chromosomes) class TestGenotypes(TestCase): """Tests for functions in `gn3.db.genotypes`.""" @@ -69,5 +73,98 @@ class TestGenotypes(TestCase): ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]: with self.subTest(line = line): self.assertEqual( - parse_genotype_data_line(line, geno_obj, parlist), + parse_genotype_marker(line, geno_obj, parlist), expected) + + def test_build_genotype_chromosomes(self): + """ + Given `markers` and `geno_obj`, test that `build_genotype_chromosomes` + builds a sequence of chromosomes with the given markers ordered + according to the `chr` value.""" + for markers, geno_obj, expected in [ + [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 3.0), + ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1))), + (("chr", "2"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", 
3.0), + ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))], + {"mat": "B", "pat": "D", "het": "H", "unk": "U", + "cm_column": 2, "Mbmap": True, "mb_column": 3}, + ((("name", "1"), ("mb_exists", True), ("cm_column", 2), + ("mb_column", 3), + ("loci", + ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": 3.0, + "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))), + (("name", "2"), ("mb_exists", True), ("cm_column", 2), + ("mb_column", 3), + ("loci", + ({"chr": "2", "name": "rs31443144", "cM": 2.0, "Mb": 3.0, + "genotype": (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)},))))], + [[(("chr", "1"), ("name", "rs31443144"), ("cM", 2.0), + ("Mb", None), + ("genotype", (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)))], + {"mat": "B", "pat": "D", "het": "H", "unk": "U", + "cm_column": 2, "Mbmap": False, "mb_column": None}, + ((("name", "1"), ("mb_exists", False), ("cm_column", 2), + ("mb_column", None), + ("loci", + ({"chr": "1", "name": "rs31443144", "cM": 2.0, "Mb": None, + "genotype": (-1, 1, 1, 0, 1, -1, "U", 1, -1, -1)},))),)]]: + with self.subTest(markers = markers): + self.assertEqual( + build_genotype_chromosomes(geno_obj, markers), + expected) + + def test_parse_genotype_file(self): + """Test the parsing of genotype files. 
""" + self.assertEqual( + parse_genotype_file( + "tests/unit/db/data/genotypes/genotype_sample1.geno"), + {"group": "BXD", + "type": "riset", + "mat": "B", + "pat": "D", + "het": "H", + "unk": "U", + "Mbmap": True, + "cm_column": 2, + "mb_column": 3, + "prgy": ("BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9"), + "nprgy": 6, + "chromosomes": ( + {"name": "1", + "mb_exists": True, + "cm_column": 2, + "mb_column": 3, + "loci": ( + {"chr": "1", + "name": "rs31443144", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 1, -1) + }, + {"chr": "1", + "name": "rs6269442", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 0, "U")}, + {"chr": "1", + "name": "rs32285189", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, "U", 1, 1, 1, -1)})}, + {"name": "2", + "mb_exists": True, + "cm_column": 2, + "mb_column": 3, + "loci": ( + {"chr": "2", + "name": "rs31443144", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 1, -1)}, + {"chr": "2", + "name": "rs6269442", + "cM": 2.0, + "Mb": 3.0, + "genotype": (-1, -1, 1, 1, 0, "U")})})}) |