about summary refs log tree commit diff
diff options
context:
space:
mode:
authorMuriithi Frederick Muriuki2021-09-01 09:11:17 +0300
committerMuriithi Frederick Muriuki2021-09-01 09:11:17 +0300
commita1c217cf277feda3815a8435d6c8909f1b5546a1 (patch)
tree04644c259b0d1f9437c783fccc20da7e07ca5ca4
parentb975e0cfd1d0adc5f51e66292d29d4651d3f053f (diff)
downloadgenenetwork3-a1c217cf277feda3815a8435d6c8909f1b5546a1.tar.gz
Parse data lines into markers
Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* gn3/db/genotypes.py: parse data lines in file to genetic markers.
* tests/unit/db/test_genotypes.py: test that parsing works.

  Add some tests to check that the parsing of the markers works as expected,
  and add the code to actually parse the markers.
-rw-r--r--gn3/db/genotypes.py37
-rw-r--r--tests/unit/db/test_genotypes.py38
2 files changed, 74 insertions, 1 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index be0dfc2..8710d2e 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -106,3 +106,40 @@ def parse_genotype_header(line: str, parlist = tuple()):
         ("mb_column", None if not Mbmap else items.index("Mb")),
         ("prgy", prgy),
         ("nprgy", len(prgy)))
+
+def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
+    """
+    Parse a data line in a genotype file
+
+    DESCRIPTION:
+    Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190
+    """
+    marker_row = [item.strip() for item in line.split("\t")]
+    geno_table = {
+        geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0,
+        geno_obj["unk"]: "U"
+    }
+    start_pos = 4 if geno_obj["Mbmap"] else 3
+    if len(parlist) > 0:
+        start_pos = start_pos + 2
+
+    alleles = marker_row[start_pos:]
+    genotype = tuple(
+        (geno_table[allele] if allele in geno_table.keys() else "U")
+        for allele in alleles)
+    if len(parlist) > 0:
+        genotype = (-1, 1) + genotype
+    try:
+        cM = float(geno_obj["cm_column"])
+    except:
+        if geno_obj["Mbmap"]:
+            cM = float(geno_obj["mb_column"])
+        else:
+            cM = 0
+    return (
+        ("chr", marker_row[0]),
+        ("name", marker_row[1]),
+        ("cM", cM),
+        ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None),
+        ("genotype", genotype))
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
index 4fa8a53..ba90191 100644
--- a/tests/unit/db/test_genotypes.py
+++ b/tests/unit/db/test_genotypes.py
@@ -1,11 +1,13 @@
 """Tests gn3.db.genotypes"""
 from unittest import TestCase
-from gn3.db.genotypes import parse_genotype_labels, parse_genotype_header
+from gn3.db.genotypes import (
+    parse_genotype_labels, parse_genotype_header, parse_genotype_data_line)
 
 class TestGenotypes(TestCase):
     """Tests for functions in `gn3.db.genotypes`."""
 
     def test_parse_genotype_labels(self):
+        """Test that the genotype labels are parsed correctly."""
         self.assertEqual(
             parse_genotype_labels([
                 "@name: test_group\t", "@filler: test_filler    ",
@@ -17,6 +19,7 @@ class TestGenotypes(TestCase):
          ("het", "test_het"), ("unk", "test_unk")))
 
     def test_parse_genotype_header(self):
+        """Test that the genotype header is parsed correctly."""
         for header, expected in [
                 [("Chr\tLocus\tcM\tMb\tBXD1\tBXD2\tBXD5\tBXD6\tBXD8\tBXD9\t"
                   "BXD11\tBXD12\tBXD13\tBXD14\tBXD15\tBXD16\tBXD18\tBXD19"),
@@ -35,3 +38,36 @@ class TestGenotypes(TestCase):
                   ("nprgy", 13))]]:
             with self.subTest(header=header):
                 self.assertEqual(parse_genotype_header(header), expected)
+
+    def test_parse_genotype_data_line(self):
+        """Test parsing of data lines."""
+        for line, geno_obj, parlist, expected in [
+                ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB",
+                 {"mat": "test_mat", "pat": "test_pat", "het": "test_het",
+                  "unk": "test_unk", "cm_column": 2, "Mbmap": True,
+                  "mb_column": 3},
+                 tuple(),
+                 (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+                  ("Mb", 3.0),
+                  ("genotype",
+                   ("U", "U", "U", "U", "U", "U", "U", "U", "U", "U")))],
+                ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tD\tD\tB\tB\tD\tB\tB",
+                 {"mat": "test_mat", "pat": "test_pat", "het": "test_het",
+                  "unk": "test_unk", "cm_column": 2, "Mbmap": True,
+                  "mb_column": 3},
+                 ("some", "parlist", "content"),
+                 (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+                  ("Mb", 3.0),
+                  ("genotype",
+                   (-1, 1, "U", "U", "U", "U", "U", "U", "U", "U")))],
+                ["1\trs31443144\t1.50\t3.010274\tB\tB\tD\tH\tD\tB\tU\tD\tB\tB",
+                 {"mat": "B", "pat": "D", "het": "H", "unk": "U",
+                  "cm_column": 2, "Mbmap": True, "mb_column": 3},
+                 tuple(),
+                 (("chr", "1"), ("name", "rs31443144"), ("cM", 2.0),
+                  ("Mb", 3.0),
+                  ("genotype", (-1, -1, 1, 0, 1, -1, "U", 1, -1, -1)))]]:
+            with self.subTest(line = line):
+                self.assertEqual(
+                    parse_genotype_data_line(line, geno_obj, parlist),
+                    expected)