author     Frederick Muriuki Muriithi  2025-05-19 10:19:55 -0500
committer  Frederick Muriuki Muriithi  2025-05-19 10:26:29 -0500
commit     2b03ca5772dd347d7fd8bd4d3a94ccca933861ec (patch)
tree       ae77c697858f1bc2597ac03004709970cadb1d59
parent     e6adace2f9302a01b796176b7016feb9fae3d351 (diff)
download   gn-uploader-2b03ca5772dd347d7fd8bd4d3a94ccca933861ec.tar.gz
Pass Connection rather than Cursor: Transaction is maintained.
The idea is that all the data is saved to the database in a single transaction, rather than in bits that could lead to data inconsistencies. Simply passing the connection object, and letting each function create its own cursor, still keeps everything within one transaction, and avoids refactoring multiple already-existing functions.
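A minimal sketch of the pattern the message describes, assuming the `mysqldb` module here wraps MySQLdb/mysqlclient; the `save_items` helper and the `items` table are illustrative stand-ins, not code from this repository:

    import MySQLdb as mysqldb
    from MySQLdb.cursors import DictCursor

    def save_items(conn: mysqldb.Connection, items) -> None:
        # Hypothetical helper: it opens its own short-lived cursor on the
        # shared connection, so its writes join the caller's transaction.
        with conn.cursor(cursorclass=DictCursor) as cursor:
            cursor.executemany(
                "INSERT INTO items(name) VALUES (%s)",
                tuple((item["name"],) for item in items))

    def load_all(conn: mysqldb.Connection, items) -> int:
        # The caller owns the transaction. MySQLdb leaves autocommit off by
        # default, so nothing is persisted until the single commit below;
        # any failure rolls back every helper's writes together.
        try:
            save_items(conn, items)
            # ... further helpers, each taking `conn` rather than a cursor ...
            conn.commit()
            return 0
        except Exception:
            conn.rollback()
            raise

Either every helper's writes land together at the one commit, or none of them do.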
-rw-r--r--  scripts/load_phenotypes_to_db.py | 108
1 file changed, 54 insertions(+), 54 deletions(-)
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index d48084e..753494b 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -38,28 +38,31 @@ def save_publications(
     return _publications
 
-def save_phenotypes(cursor: Cursor, phenofiles):
+def save_phenotypes(
+        conn: mysqldb.Connection,
+        control_data: dict[str, Any]
+) -> tuple[dict, ...]:
     """Read `phenofiles` and save the phenotypes therein."""
     pass
 
-def save_phenotypes_data(cursor: Cursor, dataidmap, samples, datafiles):
+def save_phenotypes_data(conn: mysqldb.Connection, dataidmap, samples, datafiles):
     """Read the `datafiles` and save the data in the database."""
     pass
 
-def save_phenotype_se(cursor: Cursor, dataidmap, samples, sefiles):
+def save_phenotype_se(conn: mysqldb.Connection, dataidmap, samples, sefiles):
     """Read the `sefiles` and save the data in the database."""
     pass
 
-def save_phenotype_n(cursor: Cursor, dataidmap, samples, nfiles):
+def save_phenotype_n(conn: mysqldb.Connection, dataidmap, samples, nfiles):
     """Read the `nfiles` and save the data in the database."""
     pass
 
 def cross_reference_phenotypes_publications_and_data(
-        cursor: Cursor, xref_data: tuple[dict, ...]):
+        conn: mysqldb.Connection, xref_data: tuple[dict, ...]):
     """Cross-reference the phenotypes, publications and data."""
     pass
@@ -67,56 +70,53 @@ def cross_reference_phenotypes_publications_and_data(
 def load_data(conn, job):
     """Load the data attached in the given job."""
     _job_metadata = json.loads(job["job-metadata"])
-    with conn.cursor(cursorclass=DictCursor) as cursor:
-        # Steps
-        # 0. Read data from the files: can be multiple files per type
-        #
-        # 1. Save all new phenotypes:
-        #    -> return phenotype IDs
-        _control_data = rqtl.control_data(job["job-metadata"]["bundle-file"])
-        logger.info("Saving basic phenotype data.")
-
-        _phenos = save_phenotypes(cursor, _control_data)
-        _next_data_id = fetch_next_dataid(...)
-        dataidmap = {
-            row["phenotype_id"]: {
-                "phenotype_id": row["phenotype_id"],
-                "data_id": _nextid
-            }
-            for _nextid, row in enumerate(_phenos, start=_next_data_id)
+    # Steps
+    # 0. Read data from the files: can be multiple files per type
+    #
+    # 1. Save all new phenotypes:
+    #    -> return phenotype IDs
+    _control_data = rqtl.control_data(_job_metadata["bundle-file"])
+    logger.info("Saving basic phenotype data.")
+
+    _phenos = save_phenotypes(conn, _control_data)
+    _next_data_id = fetch_next_dataid(...)
+    dataidmap = {
+        row["phenotype_id"]: {
+            "phenotype_id": row["phenotype_id"],
+            "data_id": _nextid
         }
-        # 2. Save any new publications (in multi-file bundle):
-        #    -> return publication IDS
-        publications = publications + save_publications(
-            cursor,
-            _control_data.get(
-                "metadata", {}).get(
-                    "publications"),
-            _job_metadata.get("publicationid"))
-        _pubidmap = {
-            # TODO: Map the pheno ids to the publication ids
-        }
-        # 3. a. Fetch the strain names and IDS: create name->ID map
-        samples = samples_by_species_and_population(
-            # from uploader.samples.models import samples_by_species_and_population
-            conn, species["SpeciesId"], population["PopulationId"])
-        #    b. Save all the data items (DataIds are vibes), return new IDs
-        data = save_phenotypes_data(
-            cursor, dataidmap, samples, , _control_data["pheno"])
-        #    c. If standard errors and N exist, save them too
-        #       (use IDs returned in `b` above).
-        data_se = save_phenotypes_data(
-            cursor, dataidmap, samples, , _control_data["phenose"])
-        data_n = save_phenotypes_n(
-            cursor, dataidmap, samples, , _control_data["phenonum"])
-        # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
-        xrefs = cross_reference_phenotypes_publications_and_data(
-            cursor, __merge_map_with_publications__(dataidmap))
-        # 5. If entirely new data, update authorisations (break this down)
-        update_auth(_user, _species, _population, _dataset, _phenos)
-        return 0
-
-    return 1
+        for _nextid, row in enumerate(_phenos, start=_next_data_id)
+    }
+    # 2. Save any new publications (in multi-file bundle):
+    #    -> return publication IDs
+    publications = save_publications(
+        conn,
+        _control_data.get(
+            "metadata", {}).get(
+                "publications"),
+        _job_metadata.get("publicationid"))
+    _pubidmap = {
+        # TODO: Map the pheno ids to the publication ids
+    }
+    # 3. a. Fetch the strain names and IDs: create name->ID map
+    samples = samples_by_species_and_population(
+        # from uploader.samples.models import samples_by_species_and_population
+        conn, species["SpeciesId"], population["PopulationId"])
+    #    b. Save all the data items (DataIds are vibes), return new IDs
+    data = save_phenotypes_data(
+        conn, dataidmap, samples, _control_data["pheno"])
+    #    c. If standard errors and N exist, save them too
+    #       (use IDs returned in `b` above).
+    data_se = save_phenotype_se(
+        conn, dataidmap, samples, _control_data["phenose"])
+    data_n = save_phenotype_n(
+        conn, dataidmap, samples, _control_data["phenonum"])
+    # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
+    xrefs = cross_reference_phenotypes_publications_and_data(
+        conn, __merge_map_with_publications__(dataidmap))
+    # 5. If entirely new data, update authorisations (break this down)
+    update_auth(_user, _species, _population, _dataset, _phenos)
+    return 0
if __name__ == "__main__":