about summary refs log tree commit diff
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2024-01-02 09:14:21 +0300
committer Frederick Muriuki Muriithi 2024-01-02 09:14:21 +0300
commit b3b9ba3b5b4e516d6220668155f9b5c57a51eb7d (patch)
tree f76e7fbf8e3fd67fbf6aa57bb31626bc45e252a9
parent 7a2bcc9e86bde0eb9c0d370f83df4684e5522f26 (diff)
download gn-uploader-b3b9ba3b5b4e516d6220668155f9b5c57a51eb7d.tar.gz
Abstract away non-transposed file processing
Since the processing of non-transposed files is mostly similar,
abstract away the common operations into a separate function and use
the function instead of repeating the same pattern of code throughout
the codebase.
-rw-r--r-- r_qtl/r_qtl2.py 53
1 file changed, 29 insertions, 24 deletions
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 22cf62c..4d609fd 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -32,6 +32,21 @@ def control_data(zfile: ZipFile) -> dict:
             if files[0].endswith(".json")
             else yaml.safe_load(zfile.read(files[0])))
 
+def with_non_transposed(zfile: ZipFile,
+                        member_key: str,
+                        cdata: dict,
+                        func: Callable[[dict], dict] = lambda val: val) -> Iterator[dict]:
+    """Abstracts away common file-opening for non-transposed R/qtl2 files."""
+    def not_comment_line(line):
+        return not line.startswith(cdata.get("comment.char", "#"))
+
+    with zfile.open(cdata[member_key]) as innerfile:
+        reader = csv.DictReader(
+            filter(not_comment_line, io.TextIOWrapper(innerfile)),
+            delimiter=cdata.get("sep", ","))
+        for row in reader:
+            yield func(row)
+
 def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load the genotype file, making use of the control data."""
     def replace_genotype_codes(val):
@@ -44,20 +59,15 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
         return val
 
     if not cdata.get("geno_transposed", False):
-        with zfile.open(cdata["geno"]) as genofile:
-            reader = csv.DictReader(
-                filter(lambda line: not line.startswith("#"),
-                       io.TextIOWrapper(genofile)),
-                delimiter=cdata.get("sep", ","))
-            for row in reader:
-                yield {
-                    key: thread_op(
-                        value,
-                        replace_genotype_codes,
-                        replace_na_strings)
-                    for key,value
-                    in row.items()
-                }
+        for line in with_non_transposed(
+                zfile,
+                "geno",
+                cdata,
+                lambda row: {
+                    key: thread_op(value, replace_genotype_codes, replace_na_strings)
+                    for key,value in row.items()
+                }):
+            yield line
 
     def __merge__(key, samples, line):
         marker = line[0]
@@ -100,23 +110,18 @@ def genotype_data(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
 def map_data(zfile: ZipFile, map_type: str, cdata: dict) -> tuple[dict, ...]:
     """Read gmap files to get the genome mapping data"""
     assert map_type in ("genetic-map", "physical-map"), "Invalid map type"
-    map_file = cdata[{
+    map_file_key = {
         "genetic-map": "gmap",
         "physical-map": "pmap"
-    }[map_type]]
+    }[map_type]
     transposed_dict = {
         "genetic-map": "gmap_transposed",
         "physical-map": "pmap_transposed"
     }
     if not cdata.get(transposed_dict[map_type], False):
-        with zfile.open(map_file) as gmapfile:
-            reader = csv.DictReader(
-                filter(lambda line: not line.startswith("#"),
-                       io.TextIOWrapper(gmapfile)),
-                delimiter=cdata.get("sep", ","))
-            return tuple(row for row in reader)
-
-    with zfile.open(map_file) as gmapfile:
+        return tuple(with_non_transposed(zfile, map_file_key, cdata))
+
+    with zfile.open(cdata[map_file_key]) as gmapfile:
         lines = [[field.strip() for field in
                   line.strip().split(cdata.get("sep", ","))]
                  for line in