about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2025-06-02 14:04:12 -0500
committer Frederick Muriuki Muriithi 2025-06-02 14:07:03 -0500
commit 4876d8d879727de4b36b6a4f238511b788c083cf (patch)
tree 160f653befb20cfe436441edf1f1d573eeffe71b /scripts
parent 4c5b7b7ef118a9ad0b620ae7c7563daa9cd1c33d (diff)
download gn-uploader-4876d8d879727de4b36b6a4f238511b788c083cf.tar.gz
Process NA-strings in the worker script.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/load_phenotypes_to_db.py 22
1 files changed, 15 insertions, 7 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index 9ad3a93..e044b8f 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -30,6 +30,11 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+
+def __replace_na_strings__(line, na_strings):
+    return ((None if value in na_strings else value) for value in line)
+
+
 def save_phenotypes(
         cursor: mysqldb.Connection,
         control_data: dict[str, Any],
@@ -61,7 +66,9 @@ def save_phenotypes(
                                            control_data["comment.char"])
     return create_new_phenotypes(
         cursor,
-        (dict(zip(_headers, line)) for filecontent
+        (dict(zip(_headers,
+                  __replace_na_strings__(line, control_data["na.strings"])))
+         for filecontent
          in (rqtl2.read_csv_file(path) for path in phenofiles)
          for idx, line in enumerate(filecontent)
          if idx != 0))
@@ -125,14 +132,15 @@ def save_numeric_data(
     _filescontents = tuple(
         rqtl2.read_csv_file(path,
                             separator=control_data["sep"],
-                            comment_char=control_data["comment.char"],
-                            na_strings=control_data["na.strings"])
+                            comment_char=control_data["comment.char"])
         for path in phenofiles)
     _dataitems = (
-        __row_to_dataitems__(dict(zip(_headers, line)),
-                             dataidmap,
-                             pheno_name2id,
-                             samples)
+        __row_to_dataitems__(
+            dict(zip(_headers,
+                     __replace_na_strings__(line, control_data["na.strings"]))),
+            dataidmap,
+            pheno_name2id,
+            samples)
         for linenum, line in (enumline for filecontent in _filescontents
                               for enumline in enumerate(filecontent))
         if linenum > 0)