diff options
author | Frederick Muriuki Muriithi | 2025-08-25 10:04:13 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2025-08-26 10:06:10 -0500 |
commit | 819e5b3cf86607aebc97f266760025fcc739ff5a (patch) | |
tree | 8493507bf16bdda4d47f97c380e1d1319c685503 | |
parent | 87186314c2431381390595063487eb6a2718a1e4 (diff) | |
download | gn-uploader-819e5b3cf86607aebc97f266760025fcc739ff5a.tar.gz |
Update script to use newer form of `create_new_phenotypes` function.
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 83 |
1 file changed, 29 insertions, 54 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index c1a7687..3737f2d 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -40,8 +40,10 @@ def __replace_na_strings__(line, na_strings): def save_phenotypes( - cursor: mysqldb.Connection, + conn: mysqldb.Connection, control_data: dict[str, Any], + population_id, + publication_id, filesdir: Path ) -> tuple[dict, ...]: """Read `phenofiles` and save the phenotypes therein.""" @@ -63,7 +65,9 @@ def save_phenotypes( control_data["sep"], control_data["comment.char"]) return create_new_phenotypes( - cursor, + conn, + population_id, + publication_id, (dict(zip(_headers, __replace_na_strings__(line, control_data["na.strings"]))) for filecontent @@ -75,14 +79,6 @@ def save_phenotypes( if idx != 0)) -def __fetch_next_dataid__(conn: mysqldb.Connection) -> int: - """Fetch the next available DataId value from the database.""" - with conn.cursor(cursorclass=DictCursor) as cursor: - cursor.execute( - "SELECT MAX(DataId) AS CurrentMaxDataId FROM PublishXRef") - return int(cursor.fetchone()["CurrentMaxDataId"]) + 1 - - def __row_to_dataitems__( sample_row: dict, dataidmap: dict, @@ -199,34 +195,6 @@ save_phenotypes_n = partial(save_numeric_data, table="NStrain") -def cross_reference_phenotypes_publications_and_data( - conn: mysqldb.Connection, xref_data: tuple[dict, ...] 
-): - """Crossreference the phenotypes, publication and data.""" - with conn.cursor(cursorclass=DictCursor) as cursor: - cursor.execute("SELECT MAX(Id) CurrentMaxId FROM PublishXRef") - _nextid = int(cursor.fetchone()["CurrentMaxId"]) + 1 - _params = tuple({**row, "xref_id": _id} - for _id, row in enumerate(xref_data, start=_nextid)) - cursor.executemany( - ("INSERT INTO PublishXRef(" - "Id, InbredSetId, PhenotypeId, PublicationId, DataId, comments" - ") " - "VALUES (" - "%(xref_id)s, %(population_id)s, %(phenotype_id)s, " - "%(publication_id)s, %(data_id)s, 'Upload of new data.'" - ")"), - _params) - cursor.executemany( - "UPDATE PublishXRef SET mean=" - "(SELECT AVG(value) FROM PublishData WHERE PublishData.Id=PublishXRef.DataId) " - "WHERE PublishXRef.Id=%(xref_id)s AND " - "InbredSetId=%(population_id)s", - _params) - return _params - return tuple() - - def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments,too-many-arguments] authserver, token, @@ -369,25 +337,35 @@ def load_data(conn: mysqldb.Connection, job: dict) -> int:#pylint: disable=[too- with ZipFile(str(bundle), "r") as zfile: _files = rqtl2.extract(zfile, _outdir) logger.info("Saving new phenotypes.") - _phenos = save_phenotypes(conn, _control_data, _outdir) - def __build_phenos_maps__(accumulator, current): - dataid, row = current + _phenos = save_phenotypes(conn, + _control_data, + _population["Id"], + _publication["Id"], + _outdir) + + def __build_phenos_maps__(accumulator, row): return ({ **accumulator[0], row["phenotype_id"]: { "population_id": _population["Id"], "phenotype_id": row["phenotype_id"], - "data_id": dataid, - "publication_id": _publication["Id"], + "data_id": row["data_id"], + "publication_id": row["publication_id"], } }, { **accumulator[1], - row["id"]: row["phenotype_id"] - }) - dataidmap, pheno_name2id = reduce( - __build_phenos_maps__, - enumerate(_phenos, start=__fetch_next_dataid__(conn)), - ({},{})) + row["pre_publication_abbreviation"]: 
row["phenotype_id"] + }, ( + accumulator[2] + ({ + "xref_id": row["xref_id"], + "population_id": row["population_id"], + "phenotype_id": row["phenotype_id"], + "publication_id": row["publication_id"], + "data_id": row["data_id"] + },))) + dataidmap, pheno_name2id, _xrefs = reduce(__build_phenos_maps__, + _phenos, + ({},{}, tuple())) # 3. a. Fetch the strain names and IDS: create name->ID map samples = { row["Name"]: row @@ -402,11 +380,8 @@ def load_data(conn: mysqldb.Connection, job: dict) -> int:#pylint: disable=[too- control_data=_control_data, filesdir=_outdir) logger.info("Saved %s new phenotype data rows.", _num_data_rows) - # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef - logger.info("Cross-referencing new phenotypes to their data and publications.") - _xrefs = cross_reference_phenotypes_publications_and_data( - conn, tuple(dataidmap.values())) - # 5. If standard errors and N exist, save them too + + # 4. If standard errors and N exist, save them too # (use IDs returned in `3. b.` above). if _control_data.get("phenose"): logger.info("Saving new phenotypes standard errors.") |