about summary refs log tree commit diff
diff options
context:
space:
mode:
author Muriithi Frederick Muriuki 2021-09-01 06:18:20 +0300
committer Muriithi Frederick Muriuki 2021-09-01 06:42:51 +0300
commit 221c773daea839ecf0e50c196484bb91e3a6db33 (patch)
tree 50e139dd0bbed18e1771fcfcd3dd0195ec78efe7
parent b5e1d1176f1bf4f7c0b68b27beb15e99418f1650 (diff)
download genenetwork3-221c773daea839ecf0e50c196484bb91e3a6db33.tar.gz
Implement parsing of genotype labels
Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi

* gn3/db/genotypes.py: parse genotype labels
* tests/unit/db/test_genotypes.py: test that genotype labels are parsed
  correctly

  As part of parsing the genotype files into usable python data structures,
  this commit adds a function to parse the label lines (beginning with "@")
  into the appropriate values.
-rw-r--r-- gn3/db/genotypes.py 20
-rw-r--r-- tests/unit/db/test_genotypes.py 17
2 files changed, 37 insertions, 0 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 610ddde..2be3e1a 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -67,3 +67,23 @@ def __load_genotype_samples_from_plink(genotype_filename: str):
     """
     genofile = open(genotype_filename)
     return [line.split(" ")[1] for line in genofile]
+
+def parse_genotype_labels(lines: list):
+    """
+    Parse "@label:value" lines into a tuple of (label, value) pairs.
+
+    Labels and values are whitespace-stripped, "name" is reported as "group"
+    and lines with an unrecognised label are dropped. A line lacking ":"
+    raises ValueError. Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L75-L93
+    """
+    acceptable_labels = ("name", "filler", "type", "mat", "pat", "het", "unk")
+    def __parse_label(line):
+        # Split on the first ":" only, so a value may itself contain colons.
+        label, value = [part.strip() for part in line[1:].split(":", 1)]
+        if label not in acceptable_labels:
+            return None
+        return ("group" if label == "name" else label, value)
+    return tuple(
+        item for item in (__parse_label(line) for line in lines)
+        if item is not None)
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
new file mode 100644
index 0000000..0264764
--- /dev/null
+++ b/tests/unit/db/test_genotypes.py
@@ -0,0 +1,17 @@
+"""Tests gn3.db.genotypes"""
+from unittest import TestCase
+from gn3.db.genotypes import parse_genotype_labels
+
+class TestGenotypes(TestCase):
+    """Unit tests covering the helpers in `gn3.db.genotypes`."""
+
+    def test_parse_genotype_labels(self):
+        """Known labels are kept and stripped; unknown ones are dropped."""
+        label_lines = ["@name: test_group\t", "@filler: test_filler    ",
+                       "@type:test_type", "@mat:test_mat   \t",
+                       "@pat:test_pat ", "@het: test_het ", "@unk: test_unk",
+                       "@other: test_other", "@brrr: test_brrr "]
+        expected = (("group", "test_group"), ("filler", "test_filler"),
+                    ("type", "test_type"), ("mat", "test_mat"),
+                    ("pat", "test_pat"), ("het", "test_het"), ("unk", "test_unk"))
+        self.assertEqual(parse_genotype_labels(label_lines), expected)