aboutsummaryrefslogtreecommitdiff
path: root/gn3/db
diff options
context:
space:
mode:
authorMuriithi Frederick Muriuki2021-09-01 10:49:52 +0300
committerMuriithi Frederick Muriuki2021-09-01 10:49:52 +0300
commitabfc0410a2385d8c3d6ee1915fc99b708e1d0dbc (patch)
treed1e523ec38a0cffa361d4b70c6ba9282ae8058eb /gn3/db
parenta1c217cf277feda3815a8435d6c8909f1b5546a1 (diff)
downloadgenenetwork3-abfc0410a2385d8c3d6ee1915fc99b708e1d0dbc.tar.gz
Built top-level genotype file parsing function
Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi
* gn3/db/genotypes.py: parse genotype files
* tests/unit/db/test_genotypes.py: test that parsing is correct
Add the overall genotype-file parsing function, and tests to check that the parsing works as expected.
Diffstat (limited to 'gn3/db')
-rw-r--r--gn3/db/genotypes.py38
1 files changed, 37 insertions, 1 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py
index 8710d2e..b5d14a5 100644
--- a/gn3/db/genotypes.py
+++ b/gn3/db/genotypes.py
@@ -107,7 +107,7 @@ def parse_genotype_header(line: str, parlist = tuple()):
("prgy", prgy),
("nprgy", len(prgy)))
-def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
+def parse_genotype_marker(line: str, geno_obj: dict, parlist: list):
"""
Parse a data line in a genotype file
@@ -143,3 +143,39 @@ def parse_genotype_data_line(line: str, geno_obj: dict, parlist: list):
("cM", cM),
("Mb", float(geno_obj["mb_column"]) if geno_obj["Mbmap"] else None),
("genotype", genotype))
+
def build_genotype_chromosomes(geno_obj, markers):
    """
    Build up the chromosomes from the given markers and partially built geno
    object.

    `markers` is an iterable of items that `dict()` accepts (e.g. tuples of
    key/value pairs); each resulting dict must have a "chr" key. `geno_obj`
    must provide the "Mbmap" and "mb_column" keys whenever at least one
    marker is given.

    Returns a tuple with one entry per distinct "chr" value, ordered by
    `sorted()` on those values. Each entry is a tuple of key/value pairs:
    "name", "mb_exists", "cm_column" (always 2), "mb_column" and "loci",
    where "loci" is a tuple of the marker dicts for that chromosome in the
    order in which they were given.
    """
    # Group the markers in a single pass rather than re-scanning the whole
    # marker list once per chromosome.
    loci_by_chr = {}
    for marker in markers:
        locus = dict(marker)
        loci_by_chr.setdefault(locus["chr"], []).append(locus)

    return tuple(
        (("name", chr_name),
         ("mb_exists", geno_obj["Mbmap"]),
         ("cm_column", 2),
         ("mb_column", geno_obj["mb_column"]),
         ("loci", tuple(loci_by_chr[chr_name])))
        for chr_name in sorted(loci_by_chr))
+
def parse_genotype_file(filename: str, parlist=tuple()):
    """
    Parse the provided genotype file into a usable python3 data structure.

    Blank lines and lines whose first non-whitespace character is `#` are
    ignored. Lines whose first non-whitespace character is `@` are handed to
    `parse_genotype_labels`. Of the remaining lines, the first is handed to
    `parse_genotype_header` and every following one to
    `parse_genotype_marker`, both together with `parlist`.

    Returns a dict holding the parsed label and header items plus a
    "chromosomes" key, whose value is a tuple of dicts built from the result
    of `build_genotype_chromosomes`.

    Raises `IndexError` if the file contains no header line, i.e. nothing but
    blank lines, comments and labels.
    """
    def _is_label(line):
        # Ignore leading whitespace, exactly as the comment check does, so an
        # indented label line is not mistaken for the header or a marker row.
        return line.lstrip().startswith("@")

    with open(filename, "r") as infile:
        lines = tuple(
            line for line in infile
            if line.strip() and not line.lstrip().startswith("#"))

    # Left-stripping is a no-op for label lines that already start with "@",
    # and gives indented ones the same shape.
    labels = parse_genotype_labels(
        line.lstrip() for line in lines if _is_label(line))
    # Data lines are passed on untouched: stripping could drop empty leading
    # or trailing fields that the line parsers may rely on.
    data_lines = tuple(line for line in lines if not _is_label(line))
    if not data_lines:
        # Same exception type the bare `data_lines[0]` lookup used to raise,
        # but with a message that points at the actual problem.
        raise IndexError(
            f"No header line found in genotype file '{filename}'")

    header = parse_genotype_header(data_lines[0], parlist)
    geno_obj = dict(labels + header)
    markers = tuple(
        parse_genotype_marker(line, geno_obj, parlist)
        for line in data_lines[1:])
    chromosomes = tuple(
        dict(chromosome) for chromosome in
        build_genotype_chromosomes(geno_obj, markers))
    return {**geno_obj, "chromosomes": chromosomes}