aboutsummaryrefslogtreecommitdiff
path: root/gn3/db
diff options
context:
space:
mode:
authorMuriithi Frederick Muriuki2021-09-01 09:11:17 +0300
committerMuriithi Frederick Muriuki2021-09-01 09:11:17 +0300
commita1c217cf277feda3815a8435d6c8909f1b5546a1 (patch)
tree04644c259b0d1f9437c783fccc20da7e07ca5ca4 /gn3/db
parentb975e0cfd1d0adc5f51e66292d29d4651d3f053f (diff)
downloadgenenetwork3-a1c217cf277feda3815a8435d6c8909f1b5546a1.tar.gz
Parse data lines into markers
Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi * gn3/db/genotypes.py: parse data lines in file to genetic markers. * tests/unit/db/test_genotypes.py: test that parsing works. Add some tests to check that the parsing of the markers works as expected, and add the code to actually parse the markers.
Diffstat (limited to 'gn3/db')
-rw-r--r--gn3/db/genotypes.py37
1 files changed, 37 insertions, 0 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index be0dfc2..8710d2e 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -106,3 +106,40 @@ def parse_genotype_header(line: str, parlist = tuple()):
("mb_column", None if not Mbmap else items.index("Mb")),
("prgy", prgy),
("nprgy", len(prgy)))
+
def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
    """
    Parse a data line in a genotype file into a genetic marker.

    DESCRIPTION:
    Reworks
    https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/gen_geno_ob.py#L143-L190

    PARAMETERS:
    line: a single tab-separated data line; each field is stripped of
        surrounding whitespace before use.
    geno_obj: mapping that provides the allele symbols under the keys
        "mat", "pat", "het" and "unk", the "Mbmap" flag, and the column
        indices "cm_column" and "mb_column".
    parlist: when non-empty, two extra leading columns are skipped and
        `(-1, 1)` is prepended to the genotype.

    RETURNS:
    A tuple of `(key, value)` pairs with the keys "chr", "name", "cM",
    "Mb" and "genotype", in that order.

    RAISES:
    ValueError/IndexError if "Mbmap" is truthy and the Mb field of the
    line is missing or is not a number.
    """
    marker_row = [item.strip() for item in line.split("\t")]
    geno_table = {
        geno_obj["mat"]: -1, geno_obj["pat"]: 1, geno_obj["het"]: 0,
        geno_obj["unk"]: "U"
    }

    # Allele columns start after chr, name, cM (and Mb when present).
    start_pos = 4 if geno_obj["Mbmap"] else 3
    if parlist:
        # NOTE(review): presumably the two skipped columns hold the
        # parental strains -- confirm against the genotype file format.
        start_pos = start_pos + 2

    # Any symbol that is not a known allele is treated as unknown ("U").
    genotype = tuple(
        geno_table.get(allele, "U") for allele in marker_row[start_pos:])
    if parlist:
        genotype = (-1, 1) + genotype

    # "cm_column"/"mb_column" are column *indices*: the actual values have
    # to be read from the marker row at those positions.
    mb_val = (
        float(marker_row[geno_obj["mb_column"]])
        if geno_obj["Mbmap"] else None)
    try:
        cm_val = float(marker_row[geno_obj["cm_column"]])
    except (ValueError, TypeError, IndexError, KeyError):
        # Missing or non-numeric cM: fall back to the Mb position when the
        # file has one, otherwise to zero.
        cm_val = mb_val if geno_obj["Mbmap"] else 0

    return (
        ("chr", marker_row[0]),
        ("name", marker_row[1]),
        ("cM", cm_val),
        ("Mb", mb_val),
        ("genotype", genotype))