diff options
| | | |
|---|---|---|
| author | Muriithi Frederick Muriuki | 2021-09-01 06:18:20 +0300 |
| committer | Muriithi Frederick Muriuki | 2021-09-01 06:42:51 +0300 |
| commit | 221c773daea839ecf0e50c196484bb91e3a6db33 (patch) | |
| tree | 50e139dd0bbed18e1771fcfcd3dd0195ec78efe7 | |
| parent | b5e1d1176f1bf4f7c0b68b27beb15e99418f1650 (diff) | |
| download | genenetwork3-221c773daea839ecf0e50c196484bb91e3a6db33.tar.gz | |
Implement parsing of genotype labels
Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi
* gn3/db/genotypes.py: parse genotype labels
* tests/unit/db/test_genotypes.py: test that genotype labels are parsed
correctly
As part of parsing the genotype files into usable python data structures,
this commit adds a function to parse the label lines (beginning with "@")
into the appropriate values.
| -rw-r--r-- | gn3/db/genotypes.py | 20 |
| -rw-r--r-- | tests/unit/db/test_genotypes.py | 17 |
2 files changed, 37 insertions, 0 deletions
```diff
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 610ddde..2be3e1a 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -67,3 +67,23 @@ def __load_genotype_samples_from_plink(genotype_filename: str):
     """
     genofile = open(genotype_filename)
     return [line.split(" ")[1] for line in genofile]
+
+def parse_genotype_labels(lines: list):
+    """
+    Parse label lines into usable genotype values
+
+    DESCRIPTION:
+    Reworks
+    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L75-L93
+    """
+    acceptable_labels = ["name", "filler", "type", "mat", "pat", "het", "unk"]
+    def __parse_label(line):
+        label, value = [l.strip() for l in line[1:].split(":")]
+        if label not in acceptable_labels:
+            return None
+        if label == "name":
+            return ("group", value)
+        return (label, value)
+    return tuple(
+        item for item in (__parse_label(line) for line in lines)
+        if item is not None)
diff --git a/tests/unit/db/test_genotypes.py b/tests/unit/db/test_genotypes.py
new file mode 100644
index 0000000..0264764
--- /dev/null
+++ b/tests/unit/db/test_genotypes.py
@@ -0,0 +1,17 @@
+"""Tests gn3.db.genotypes"""
+from unittest import TestCase
+from gn3.db.genotypes import parse_genotype_labels
+
+class TestGenotypes(TestCase):
+    """Tests for functions in `gn3.db.genotypes`."""
+
+    def test_parse_genotype_labels(self):
+        self.assertEqual(
+            parse_genotype_labels([
+                "@name: test_group\t", "@filler: test_filler ",
+                "@type:test_type", "@mat:test_mat \t", "@pat:test_pat ",
+                "@het: test_het ", "@unk: test_unk", "@other: test_other",
+                "@brrr: test_brrr "]),
+            (("group", "test_group"), ("filler", "test_filler"),
+             ("type", "test_type"), ("mat", "test_mat"), ("pat", "test_pat"),
+             ("het", "test_het"), ("unk", "test_unk")))
```