# Provenance: gn3/computations/parsers.py, introduced by commit 7b94f98
# ("Add extra procedure for parsing a genotype file", BonfaceKilz, 2021-03-24).
"""Parsers for generating some files in genenetwork"""
import os
from typing import Any, Dict, List, Tuple

# Numeric encoding applied to each (lower-cased) genotype cell.  Any symbol
# that is not listed here is encoded as None, exactly like 'u'.
# NOTE(review): presumably 'b'/'d' are the two parental alleles, 'h' is
# heterozygous and 'u' is unknown -- confirm against the genotype file spec.
_GENOTYPE_CODES = {
    'b': -1,
    'd': 1,
    'h': 0,
    'u': None,
}


def parse_genofile(file_path: str) -> Tuple[List[str],
                                            List[Dict[str, Any]]]:
    """Parse a whitespace-delimited genotype file.

    Line handling, after stripping surrounding whitespace:

    * empty lines and lines starting with ``#`` or ``@`` are skipped;
    * a line starting with ``Chr`` is the header: every column after the
      fourth is taken as a strain name and lower-cased;
    * any other line is a marker row: the first four columns are stored as
      ``chr``, ``locus``, ``cm`` and ``mb`` (kept as strings), and the
      remaining columns are encoded through ``_GENOTYPE_CODES``.

    Args:
        file_path: Path of the genotype file to read.

    Returns:
        A ``(strains, genotypes)`` tuple.  ``strains`` is the list of
        lower-cased strain names from the most recent header line (empty if
        no header was seen).  ``genotypes`` holds one dict per marker row
        with the keys ``chr``, ``locus``, ``cm``, ``mb``, ``values`` (the
        encoded cells in column order) and ``dicvalues`` (strain name ->
        encoded cell, paired positionally with the header seen so far).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        IndexError: If a marker row has fewer than four columns.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Genotype file not found: {file_path}")

    genotypes: List[Dict[str, Any]] = []
    strains: List[str] = []
    with open(file_path, "r") as _genofile:
        for line in _genofile:
            line = line.strip()
            # Blank lines carry no data; without this guard they would
            # reach the column indexing below and raise IndexError.
            if not line or line.startswith(("#", "@")):
                continue
            cells = line.split()
            if line.startswith("Chr"):
                strains = [strain.lower() for strain in cells[4:]]
                continue
            values = [_GENOTYPE_CODES.get(value.lower())
                      for value in cells[4:]]
            genotypes.append({
                "chr": cells[0],
                "locus": cells[1],
                "cm": cells[2],
                "mb": cells[3],
                "values": values,
                "dicvalues": dict(zip(strains, values)),
            })
    return strains, genotypes