diff options
author | Frederick Muriuki Muriithi | 2025-06-02 12:29:20 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2025-06-02 12:32:10 -0500 |
commit | b61a06334cb69a7b0a0f2b5d45c5ebee2688c47e (patch) | |
tree | 81e19f5d420e504534daf3bb92cc875b71b4e27c | |
parent | f0b0e04bb6aa1744c802d4eb5cd1cb7c84c88b02 (diff) | |
download | gn-uploader-b61a06334cb69a7b0a0f2b5d45c5ebee2688c47e.tar.gz |
Fix bug: Compute data correctly. Replace N/A strings with NoneType.
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 52 |
1 files changed, 33 insertions, 19 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index dbfafc7..52042aa 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -75,17 +75,21 @@ def __fetch_next_dataid__(conn: mysqldb.Connection) -> int:
     return int(cursor.fetchone()["CurrentMaxDataId"]) + 1
 
 
-def __row_to_dataitems__(row: dict, samples: dict) -> tuple[dict, ...]:
-    return tuple(
-        {
-            "phenotype_id": row["phenotype_id"],
-            "data_id": dataidmap[row["phenotype_id"]]["data_id"],
-            "sample_name": samplename,
-            "sample_id": samples[samplename]["Id"],
-            "value": value
-        }
-        for samplename, value in row.items()
-        if samplename in samples.keys())
+def __row_to_dataitems__(
+        sample_row: dict,
+        dataidmap: dict,
+        pheno_name2id: dict[str, int],
+        samples: dict
+) -> tuple[dict, ...]:
+    samplename = sample_row["id"]
+
+    return ({
+        "phenotype_id": dataidmap[pheno_name2id[phenoname]]["phenotype_id"],
+        "data_id": dataidmap[pheno_name2id[phenoname]]["data_id"],
+        "sample_name": samplename,
+        "sample_id": samples[samplename]["Id"],
+        "value": phenovalue
+    } for phenoname, phenovalue in sample_row.items() if phenoname != "id")
 
 
 def save_numeric_data(
@@ -118,17 +122,27 @@ def save_numeric_data(
         control_data["sep"],
         control_data["comment.char"])
 
+    _filescontents = tuple(
+        rqtl2.read_csv_file(path,
+                            separator=control_data["sep"],
+                            comment_char=control_data["comment.char"],
+                            na_strings=control_data["na.strings"])
+        for path in phenofiles)
+    _dataitems = (
+        __row_to_dataitems__(dict(zip(_headers, line)),
+                             dataidmap,
+                             pheno_name2id,
+                             samples)
+        for linenum, line in (enumline for filecontent in _filescontents
+                              for enumline in enumerate(filecontent))
+        if linenum > 0)
+
     return save_phenotypes_data(
         conn,
         table,
-        # BUG: This seems to always be empty for some reason
-        (item for items in
-         (__row_to_dataitems__(dict(zip(_headers, line)), samples)
-          for filecontent
-          in (rqtl2.read_csv_file(path) for path in phenofiles)
-          for idx, line in enumerate(filecontent)
-          if idx != 0)
-         for item in items))
+        (item for items in _dataitems
+         for item in items
+         if item["value"] is not None))
 
 
 save_pheno_data = partial(save_numeric_data,