From aaff8b8ac968bce9821d6fef22b1296247a9df09 Mon Sep 17 00:00:00 2001
From: Arun Isaac
Date: Thu, 24 Feb 2022 13:46:34 +0530
Subject: gn3: Explicitly specify UTF-8 to be the file encoding.

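When the encoding is not specified explicitly, open() falls back to the
locale-dependent system default encoding, so the same file may be decoded or
encoded differently from one machine to another. This is not recommended.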

* gn3/computations/ctl.py (call_ctl_script),
gn3/computations/gemma.py (generate_pheno_txt_file),
gn3/computations/parsers.py (parse_genofile),
gn3/computations/partial_correlations.py (partial_correlations_fast),
gn3/computations/rqtl.py (process_rqtl_output, process_perm_output),
gn3/computations/wgcna.py (dump_wgcna_data, call_wgcna_script),
gn3/fs_helpers.py (jsonfile_to_dict): Explicitly specify UTF-8 to be the file
encoding.
* tests/unit/computations/test_gemma.py
(TestGemma.test_generate_pheno_txt_file),
tests/unit/computations/test_wgcna.py (TestWgcna.test_create_json_file): Test
that open is called with the encoding='utf-8' argument.
---
 gn3/computations/ctl.py                  | 2 +-
 gn3/computations/gemma.py                | 2 +-
 gn3/computations/parsers.py              | 2 +-
 gn3/computations/partial_correlations.py | 2 +-
 gn3/computations/rqtl.py                 | 4 ++--
 gn3/computations/wgcna.py                | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'gn3/computations')

diff --git a/gn3/computations/ctl.py b/gn3/computations/ctl.py
index 238740f..f881410 100644
--- a/gn3/computations/ctl.py
+++ b/gn3/computations/ctl.py
@@ -16,7 +16,7 @@ def call_ctl_script(data):
     cmd = compose_wgcna_cmd("ctl_analysis.R", temp_file_name)
 
     cmd_results = run_cmd(cmd)
-    with open(temp_file_name, "r") as outputfile:
+    with open(temp_file_name, "r", encoding="utf-8") as outputfile:
         if cmd_results["code"] != 0:
             return (cmd_results, None)
         output_file_data = json.load(outputfile)
diff --git a/gn3/computations/gemma.py b/gn3/computations/gemma.py
index 0b22d3c..8036a7b 100644
--- a/gn3/computations/gemma.py
+++ b/gn3/computations/gemma.py
@@ -31,7 +31,7 @@ def generate_pheno_txt_file(trait_filename: str,
     # Early return if this already exists!
     if os.path.isfile(f"{tmpdir}/gn2/{trait_filename}"):
         return f"{tmpdir}/gn2/{trait_filename}"
-    with open(f"{tmpdir}/gn2/{trait_filename}", "w") as _file:
+    with open(f"{tmpdir}/gn2/{trait_filename}", "w", encoding="utf-8") as _file:
         for value in values:
             if value == "x":
                 _file.write("NA\n")
diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py
index 1af35d6..79e3955 100644
--- a/gn3/computations/parsers.py
+++ b/gn3/computations/parsers.py
@@ -15,7 +15,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str],
         'u': None,
     }
     genotypes, samples = [], []
-    with open(file_path, "r") as _genofile:
+    with open(file_path, "r", encoding="utf-8") as _genofile:
         for line in _genofile:
             line = line.strip()
             if line.startswith(("#", "@")):
diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py
index 7110cc5..e826a8b 100644
--- a/gn3/computations/partial_correlations.py
+++ b/gn3/computations/partial_correlations.py
@@ -241,7 +241,7 @@ def partial_correlations_fast(# pylint: disable=[R0913, R0914]
     function in GeneNetwork1.
     """
     assert method in ("spearman", "pearson")
-    with open(database_filename, "r") as dataset_file: # pytest: disable=[W1514]
+    with open(database_filename, "r", encoding="utf-8") as dataset_file: # pytest: disable=[W1514]
         dataset = tuple(dataset_file.readlines())
 
     good_dataset_samples = good_dataset_samples_indexes(
diff --git a/gn3/computations/rqtl.py b/gn3/computations/rqtl.py
index 0433b3f..b3539a9 100644
--- a/gn3/computations/rqtl.py
+++ b/gn3/computations/rqtl.py
@@ -56,7 +56,7 @@ def process_rqtl_output(file_name: str) -> List:
     # Later I should probably redo this using csv.read to avoid the
     # awkwardness with removing quotes with [1:-1]
     with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"),
-                           "output", file_name), "r") as the_file:
+                           "output", file_name), "r", encoding="utf-8") as the_file:
         for line in the_file:
             line_items = line.split(",")
             if line_items[1][1:-1] == "chr" or not line_items:
@@ -88,7 +88,7 @@ def process_perm_output(file_name: str):
     """
     perm_results = []
     with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"),
-                           "output", "PERM_" + file_name), "r") as the_file:
+                           "output", "PERM_" + file_name), "r", encoding="utf-8") as the_file:
         for i, line in enumerate(the_file):
             if i == 0:
                 # Skip header line
diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py
index ab12fe7..de26f48 100644
--- a/gn3/computations/wgcna.py
+++ b/gn3/computations/wgcna.py
@@ -19,7 +19,7 @@ def dump_wgcna_data(request_data: dict):
 
     request_data["TMPDIR"] = TMPDIR
 
-    with open(temp_file_path, "w") as output_file:
+    with open(temp_file_path, "w", encoding="utf-8") as output_file:
         json.dump(request_data, output_file)
 
     return temp_file_path
@@ -75,7 +75,7 @@ def call_wgcna_script(rscript_path: str, request_data: dict):
 
         run_cmd_results = run_cmd(cmd)
 
-        with open(generated_file, "r") as outputfile:
+        with open(generated_file, "r", encoding="utf-8") as outputfile:
 
             if run_cmd_results["code"] != 0:
                 return run_cmd_results
-- 
cgit 1.4.1