about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2025-06-02 12:29:20 -0500
committer: Frederick Muriuki Muriithi, 2025-06-02 12:32:10 -0500
commit: b61a06334cb69a7b0a0f2b5d45c5ebee2688c47e (patch)
tree: 81e19f5d420e504534daf3bb92cc875b71b4e27c /scripts
parent: f0b0e04bb6aa1744c802d4eb5cd1cb7c84c88b02 (diff)
download: gn-uploader-b61a06334cb69a7b0a0f2b5d45c5ebee2688c47e.tar.gz
Fix bug: Compute data correctly. Replace N/A strings with NoneType.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/load_phenotypes_to_db.py 52
1 files changed, 33 insertions, 19 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index dbfafc7..52042aa 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -75,17 +75,21 @@ def __fetch_next_dataid__(conn: mysqldb.Connection) -> int:
         return int(cursor.fetchone()["CurrentMaxDataId"]) + 1
 
 
-def __row_to_dataitems__(row: dict, samples: dict) -> tuple[dict, ...]:
-    return tuple(
-        {
-            "phenotype_id": row["phenotype_id"],
-            "data_id": dataidmap[row["phenotype_id"]]["data_id"],
-            "sample_name": samplename,
-            "sample_id": samples[samplename]["Id"],
-            "value": value
-        }
-        for samplename, value in row.items()
-        if samplename in samples.keys())
+def __row_to_dataitems__(
+        sample_row: dict,
+        dataidmap: dict,
+        pheno_name2id: dict[str, int],
+        samples: dict
+) -> tuple[dict, ...]:
+    samplename = sample_row["id"]
+
+    return ({
+        "phenotype_id": dataidmap[pheno_name2id[phenoname]]["phenotype_id"],
+        "data_id": dataidmap[pheno_name2id[phenoname]]["data_id"],
+        "sample_name": samplename,
+        "sample_id": samples[samplename]["Id"],
+        "value": phenovalue
+    } for phenoname, phenovalue in sample_row.items() if phenoname != "id")
 
 
 def save_numeric_data(
@@ -118,17 +122,27 @@ def save_numeric_data(
                                            control_data["sep"],
                                            control_data["comment.char"])
 
+    _filescontents = tuple(
+        rqtl2.read_csv_file(path,
+                            separator=control_data["sep"],
+                            comment_char=control_data["comment.char"],
+                            na_strings=control_data["na.strings"])
+        for path in phenofiles)
+    _dataitems = (
+        __row_to_dataitems__(dict(zip(_headers, line)),
+                             dataidmap,
+                             pheno_name2id,
+                             samples)
+        for linenum, line in (enumline for filecontent in _filescontents
+                              for enumline in enumerate(filecontent))
+        if linenum > 0)
+
     return save_phenotypes_data(
         conn,
         table,
-        # BUG: This seems to always be empty for some reason
-        (item for items in
-         (__row_to_dataitems__(dict(zip(_headers, line)), samples)
-          for filecontent
-          in (rqtl2.read_csv_file(path) for path in phenofiles)
-         for idx, line in enumerate(filecontent)
-         if idx != 0)
-         for item in items))
+        (item for items in _dataitems
+         for item in items
+         if item["value"] is not None))
 
 
 save_pheno_data = partial(save_numeric_data,