diff options
author | Frederick Muriuki Muriithi | 2025-05-19 10:19:55 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2025-05-19 10:26:29 -0500 |
commit | 2b03ca5772dd347d7fd8bd4d3a94ccca933861ec (patch) | |
tree | ae77c697858f1bc2597ac03004709970cadb1d59 | |
parent | e6adace2f9302a01b796176b7016feb9fae3d351 (diff) | |
download | gn-uploader-2b03ca5772dd347d7fd8bd4d3a94ccca933861ec.tar.gz |
Pass Connection rather than Cursor: Transaction is maintained.
The idea is that all the data is saved to the database in a single
transaction, rather than in bits that could lead to data
inconsistencies.
As it were, simply passing the connection object, and letting each
function create its own cursor will still allow the transaction to be
maintained and will not necessitate refactoring multiple existing
functions.
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 108 |
1 file changed, 54 insertions, 54 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index d48084e..753494b 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -38,28 +38,31 @@ def save_publications( return _publications -def save_phenotypes(cursor: Cursor, phenofiles): +def save_phenotypes( + cursor: mysqldb.Connection, + control_data: dict[str, Any] +) -> tuple[dict, ...]: """Read `phenofiles` and save the phenotypes therein.""" pass -def save_phenotypes_data(cursor: Cursor, dataidmap, samples, datafiles): +def save_phenotypes_data(conn: mysqldb.Connection, dataidmap, samples, datafiles): """Read the `datafiles` and save the data in the database.""" pass -def save_phenotype_se(cursor: Cursor, dataidmap, samples, sefiles): +def save_phenotype_se(conn: mysqldb.Connection, dataidmap, samples, sefiles): """Read the `sefiles` and save the data in the database.""" pass -def save_phenotype_n(cursor: Cursor, dataidmap, samples, nfiles): +def save_phenotype_n(conn: mysqldb.Connection, dataidmap, samples, nfiles): """Read the `nfiles` and save the data in the database.""" pass def cross_reference_phenotypes_publications_and_data( - cursor: Cursor, xref_data: tuple[dict, ...]): + conn: mysqldb.Connection, xref_data: tuple[dict, ...]): """Crossreference the phenotypes, publication and data.""" pass @@ -67,56 +70,53 @@ def cross_reference_phenotypes_publications_and_data( def load_data(conn, job): """Load the data attached in the given job.""" _job_metadata = json.loads(job["job-metadata"]) - with conn.cursor(cursorclass=DictCursor) as cursor: - # Steps - # 0. Read data from the files: can be multiple files per type - # - # 1. Save all new phenotypes: - # -> return phenotype IDs - _control_data = rqtl.control_data(job["job-metadata"]["bundle-file"]) - logger.info("Saving basic phenotype data.") - - _phenos = save_phenotypes(cursor, _control_data) - _next_data_id = fetch_next_dataid(...) 
- dataidmap = { - row["phenotype_id"]: { - "phenotype_id": row["phenotype_id"], - "data_id": _nextid - } - for _nextid, row in enumerate(_phenos, start=_next_data_id) + # Steps + # 0. Read data from the files: can be multiple files per type + # + # 1. Save all new phenotypes: + # -> return phenotype IDs + _control_data = rqtl.control_data(job["job-metadata"]["bundle-file"]) + logger.info("Saving basic phenotype data.") + + _phenos = save_phenotypes(cursor, _control_data) + _next_data_id = fetch_next_dataid(...) + dataidmap = { + row["phenotype_id"]: { + "phenotype_id": row["phenotype_id"], + "data_id": _nextid } - # 2. Save any new publications (in multi-file bundle): - # -> return publication IDS - publications = publications + save_publications( - cursor, - _control_data.get( - "metadata", {}).get( - "publications"), - _job_metadata.get("publicationid")) - _pubidmap = { - # TODO: Map the pheno ids to the publication ids - } - # 3. a. Fetch the strain names and IDS: create name->ID map - samples = samples_by_species_and_population( - # from uploader.samples.models import samples_by_species_and_population - conn, species["SpeciesId"], population["PopulationId"]) - # b. Save all the data items (DataIds are vibes), return new IDs - data = save_phenotypes_data( - cursor, dataidmap, samples, , _control_data["pheno"]) - # c. If standard errors and N exist, save them too - # (use IDs returned in `b` above). - data_se = save_phenotypes_data( - cursor, dataidmap, samples, , _control_data["phenose"]) - data_n = save_phenotypes_n( - cursor, dataidmap, samples, , _control_data["phenonum"]) - # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef - xrefs = cross_reference_phenotypes_publications_and_data( - cursor, __merge_map_with_publications__(dataidmap)) - # 5. 
If entirely new data, update authorisations (break this down) - update_auth(_user, _species, _population, _dataset, _phenos) - return 0 - - return 1 + for _nextid, row in enumerate(_phenos, start=_next_data_id) + } + # 2. Save any new publications (in multi-file bundle): + # -> return publication IDS + publications = publications + save_publications( + cursor, + _control_data.get( + "metadata", {}).get( + "publications"), + _job_metadata.get("publicationid")) + _pubidmap = { + # TODO: Map the pheno ids to the publication ids + } + # 3. a. Fetch the strain names and IDS: create name->ID map + samples = samples_by_species_and_population( + # from uploader.samples.models import samples_by_species_and_population + conn, species["SpeciesId"], population["PopulationId"]) + # b. Save all the data items (DataIds are vibes), return new IDs + data = save_phenotypes_data( + cursor, dataidmap, samples, , _control_data["pheno"]) + # c. If standard errors and N exist, save them too + # (use IDs returned in `b` above). + data_se = save_phenotypes_data( + cursor, dataidmap, samples, , _control_data["phenose"]) + data_n = save_phenotypes_n( + cursor, dataidmap, samples, , _control_data["phenonum"]) + # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef + xrefs = cross_reference_phenotypes_publications_and_data( + cursor, __merge_map_with_publications__(dataidmap)) + # 5. If entirely new data, update authorisations (break this down) + update_auth(_user, _species, _population, _dataset, _phenos) + return 0 if __name__ == "__main__": |