diff options
author | Muriithi Frederick Muriuki | 2021-09-01 09:11:17 +0300 |
---|---|---|
committer | Muriithi Frederick Muriuki | 2021-09-01 09:11:17 +0300 |
commit | a1c217cf277feda3815a8435d6c8909f1b5546a1 (patch) | |
tree | 04644c259b0d1f9437c783fccc20da7e07ca5ca4 /gn3/db | |
parent | b975e0cfd1d0adc5f51e66292d29d4651d3f053f (diff) | |
download | genenetwork3-a1c217cf277feda3815a8435d6c8909f1b5546a1.tar.gz |
Parse data lines into markers
Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi
* gn3/db/genotypes.py: parse data lines in file to genetic markers.
* tests/unit/db/test_genotypes.py: test that parsing works.
Add some tests to check that the parsing of the markers works as expected,
and add the code to actually parse the markers.
Diffstat (limited to 'gn3/db')
-rw-r--r-- | gn3/db/genotypes.py | 37 |
1 files changed, 37 insertions, 0 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py index be0dfc2..8710d2e 100644 --- a/gn3/db/genotypes.py +++ b/gn3/db/genotypes.py @@ -106,3 +106,40 @@ def parse_genotype_header(line: str, parlist = tuple()): ("mb_column", None if not Mbmap else items.index("Mb")), ("prgy", prgy), ("nprgy", len(prgy))) + +def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list): + """ + Parse a data line in a genotype file + + DESCRIPTION: + Reworks + https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190 + """ + marker_row = [item.strip() for item in line.split("\t")] + geno_table = { + geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0, + geno_obj["unk"]: "U" + } + start_pos = 4 if geno_obj["Mbmap"] else 3 + if len(parlist) > 0: + start_pos = start_pos + 2 + + alleles = marker_row[start_pos:] + genotype = tuple( + (geno_table[allele] if allele in geno_table.keys() else "U") + for allele in alleles) + if len(parlist) > 0: + genotype = (-1, 1) + genotype + try: + cM = float(geno_obj["cm_column"]) + except: + if geno_obj["Mbmap"]: + cM = float(geno_obj["mb_column"]) + else: + cM = 0 + return ( + ("chr", marker_row[0]), + ("name", marker_row[1]), + ("cM", cM), + ("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None), + ("genotype", genotype)) |