about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Frederick Muriuki Muriithi, 2025-06-02 12:29:20 -0500
committer: Frederick Muriuki Muriithi, 2025-06-02 12:32:10 -0500
commit: b61a06334cb69a7b0a0f2b5d45c5ebee2688c47e (patch)
tree: 81e19f5d420e504534daf3bb92cc875b71b4e27c
parent: f0b0e04bb6aa1744c802d4eb5cd1cb7c84c88b02 (diff)
download: gn-uploader-b61a06334cb69a7b0a0f2b5d45c5ebee2688c47e.tar.gz
Fix bug: Compute data correctly. Replace N/A strings with NoneType.
-rw-r--r-- scripts/load_phenotypes_to_db.py 52
1 files changed, 33 insertions, 19 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index dbfafc7..52042aa 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -75,17 +75,21 @@ def __fetch_next_dataid__(conn: mysqldb.Connection) -> int:
return int(cursor.fetchone()["CurrentMaxDataId"]) + 1
-def __row_to_dataitems__(row: dict, samples: dict) -> tuple[dict, ...]:
- return tuple(
- {
- "phenotype_id": row["phenotype_id"],
- "data_id": dataidmap[row["phenotype_id"]]["data_id"],
- "sample_name": samplename,
- "sample_id": samples[samplename]["Id"],
- "value": value
- }
- for samplename, value in row.items()
- if samplename in samples.keys())
+def __row_to_dataitems__(
+ sample_row: dict,
+ dataidmap: dict,
+ pheno_name2id: dict[str, int],
+ samples: dict
+) -> tuple[dict, ...]:
+ samplename = sample_row["id"]
+
+ return ({
+ "phenotype_id": dataidmap[pheno_name2id[phenoname]]["phenotype_id"],
+ "data_id": dataidmap[pheno_name2id[phenoname]]["data_id"],
+ "sample_name": samplename,
+ "sample_id": samples[samplename]["Id"],
+ "value": phenovalue
+ } for phenoname, phenovalue in sample_row.items() if phenoname != "id")
def save_numeric_data(
@@ -118,17 +122,27 @@ def save_numeric_data(
control_data["sep"],
control_data["comment.char"])
+ _filescontents = tuple(
+ rqtl2.read_csv_file(path,
+ separator=control_data["sep"],
+ comment_char=control_data["comment.char"],
+ na_strings=control_data["na.strings"])
+ for path in phenofiles)
+ _dataitems = (
+ __row_to_dataitems__(dict(zip(_headers, line)),
+ dataidmap,
+ pheno_name2id,
+ samples)
+ for linenum, line in (enumline for filecontent in _filescontents
+ for enumline in enumerate(filecontent))
+ if linenum > 0)
+
return save_phenotypes_data(
conn,
table,
- # BUG: This seems to always be empty for some reason
- (item for items in
- (__row_to_dataitems__(dict(zip(_headers, line)), samples)
- for filecontent
- in (rqtl2.read_csv_file(path) for path in phenofiles)
- for idx, line in enumerate(filecontent)
- if idx != 0)
- for item in items))
+ (item for items in _dataitems
+ for item in items
+ if item["value"] is not None))
save_pheno_data = partial(save_numeric_data,