aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMuriithi Frederick Muriuki2021-09-01 09:11:17 +0300
committerMuriithi Frederick Muriuki2021-09-01 09:11:17 +0300
commita1c217cf277feda3815a8435d6c8909f1b5546a1 (patch)
tree04644c259b0d1f9437c783fccc20da7e07ca5ca4
parentb975e0cfd1d0adc5f51e66292d29d4651d3f053f (diff)
downloadgenenetwork3-a1c217cf277feda3815a8435d6c8909f1b5546a1.tar.gz
Parse data lines into markers
Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: parse data lines in file to genetic markers. * tests/unit/db/test_genotypes.py: test that parsing works. Add some tests to check that the parsing of the markers works as expected, and add the code to actually parse the markers.
-rw-r--r--gn3/db/genotypes.py37
-rw-r--r--tests/unit/db/test_genotypes.py38
2 files changed, 74 insertions, 1 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index be0dfc2..8710d2e 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -106,3 +106,40 @@ def parse_genotype_header(line: str, parlist = tuple()):
("mb_column", None if not Mbmap else items.index("Mb")),
("prgy", prgy),
("nprgy", len(prgy)))
+
+def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
+ """
+ Parse a data line in a genotype file
+
+ DESCRIPTION:
+ Reworks
+ https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190
+ """
+ marker_row = [item.strip() for item in line.split("\t")]
+ geno_table = {
+ geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0,
+ geno_obj["unk"]: "U"
+ }
+ start_pos = 4 if geno_obj["Mbmap"] else 3
+ if len(parlist) > 0:
+ start_pos = start_pos + 2
+
+ alleles = marker_row[start_pos:]
+ genotype = tuple(
+ (geno_table[allele] if allele in geno_table.keys() else "U")
+ for allele in alleles)
+ if len(parlist) > 0:
+ genotype = (-1, 1) + genotype
+ try:
+ cM = float(geno_obj["cm_column"])
+ except:
+ if geno_obj["Mbmap"]:
+ cM = float(geno_obj["mb_column"])
+ else:
+ cM = 0
+ return (
+ ("chr", marker_row[0]),
+ ("name", marker_row[1]),
+ ("cM", cM),
+ ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None),
+ ("genotype", genotype))
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
index 4fa8a53..ba90191 100644
--- a/tests/unit/db/test_genotypes.py
+++ b/tests/unit/db/test_genotypes.py
@@ -1,11 +1,13 @@
"""Tests gn3.db.genotypes"""
from unittest import TestCase
-from gn3.db.genotypes import parse_genotype_labels, parse_genotype_header
+from gn3.db.genotypes import (
+ parse_genotype_labels, parse_genotype_header, parse_genotype_data_line)
class TestGenotypes(TestCase):
"""Tests for functions in `gn3.db.genotypes`."""
def test_parse_genotype_labels(self):
+ """Test that the genotype labels are parsed correctly."""
self.assertEqual(
parse_genotype_labels([
"@name: test_group\t", "@filler: test_filler ",
@@ -17,6 +19,7 @@ class TestGenotypes(TestCase):
("het", "test_het"), ("unk", "test_unk")))
def test_parse_genotype_header(self):
+ """Test that the genotype header is parsed correctly."""
for header, expected in [
[("Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\t"
"BXD11\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18\tBXD19"),
@@ -35,3 +38,36 @@ class TestGenotypes(TestCase):
("nprgy", 13))]]:
with self.subTest(header=header):
self.assertEqual(parse_genotype_header(header), expected)
+
+ def test_parse_genotype_data_line(self):
+ """Test parsing of data lines."""
+ for line, geno_obj, parlist, expected in [
+ ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB",
+ {"mat": "test_mat", "pat": "test_pat", "het": "test_het",
+ "unk": "test_unk", "cm_column": 2, "Mbmap": True,
+ "mb_column": 3},
+ tuple(),
+ (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+ ("Mb", 3.0),
+ ("genotype",
+ ("U", "U", "U", "U", "U", "U", "U", "U", "U", "U")))],
+ ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB",
+ {"mat": "test_mat", "pat": "test_pat", "het": "test_het",
+ "unk": "test_unk", "cm_column": 2, "Mbmap": True,
+ "mb_column": 3},
+ ("some", "parlist", "content"),
+ (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+ ("Mb", 3.0),
+ ("genotype",
+ (-1, 1, "U", "U", "U", "U", "U", "U", "U", "U")))],
+ ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tH\tD\tB\tU\tD\tB\tB",
+ {"mat": "B", "pat": "D", "het": "H", "unk": "U",
+ "cm_column": 2, "Mbmap": True, "mb_column": 3},
+ tuple(),
+ (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+ ("Mb", 3.0),
+ ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]:
+ with self.subTest(line = line):
+ self.assertEqual(
+ parse_genotype_data_line(line, geno_obj, parlist),
+ expected)