author | Frederick Muriuki Muriithi | 2025-05-12 13:33:56 -0500
---|---|---
committer | Frederick Muriuki Muriithi | 2025-05-12 13:33:56 -0500
commit | eef0ed319f24946f174e9e2adee71a753df12163 (patch)
tree | 7efaab1ce0345532bdc2037bdb640feb3ef4abde
parent | ff82ac219ba8be0810f4789989a7a67b0e42b877 (diff)
download | gn-uploader-eef0ed319f24946f174e9e2adee71a753df12163.tar.gz
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 108
1 file changed, 107 insertions(+), 1 deletion(-)
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index fa710a6..5c792f0 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -3,6 +3,8 @@ import logging
 import argparse
 from pathlib import Path
 
+from MySQLdb.cursors import Cursor, DictCursor
+
 from gn_libs import jobs, mysqldb, sqlite3
 
 logging.basicConfig(
@@ -10,9 +12,90 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+def save_publications(cursor: Cursor, pubfiles):
+    """Read the `pubfiles` and save the publications therein."""
+    # Check for PubMed IDs, perhaps?
+    pass
+
+
+def save_phenotypes(cursor: Cursor, phenofiles):
+    """Read `phenofiles` and save the phenotypes therein."""
+    pass
+
+
+def save_phenotypes_data(cursor: Cursor, dataidmap, samples, datafiles):
+    """Read the `datafiles` and save the data in the database."""
+    pass
+
+
+def save_phenotype_se(cursor: Cursor, dataidmap, samples, sefiles):
+    """Read the `sefiles` and save the data in the database."""
+    pass
+
+
+def save_phenotype_n(cursor: Cursor, dataidmap, samples, nfiles):
+    """Read the `nfiles` and save the data in the database."""
+    pass
+
+
+def cross_reference_phenotypes_publications_and_data(
+        cursor: Cursor, xref_data: tuple[dict, ...]):
+    """Cross-reference the phenotypes, publications and data."""
+    pass
+
+
 def load_data(conn, job):
     """Load the data attached in the given job."""
-    pass
+    with conn.cursor(cursorclass=DictCursor) as cursor:
+        # Steps
+        # 0. Read data from the files: can be multiple files per type
+        #
+        # 1. Save all new phenotypes:
+        #    -> return phenotype IDs
+        _control_data = rqtl.control_data(job["job-metadata"]["bundle-file"])
+        logger.info("Saving basic phenotype data.")
+        _phenos = save_phenotypes(cursor, _control_data["pheno"])
+        _next_data_id = fetch_next_dataid(...)
+        dataidmap = {
+            row["phenotype_id"]: {
+                "phenotype_id": row["phenotype_id"],
+                "data_id": _nextid
+            }
+            for _nextid, row in enumerate(_phenos, start=_next_data_id)
+        }
+        # 2. Save any new publications (in multi-file bundle):
+        #    -> return publication IDs
+        logger.info("Saving publications.")
+        # -> check whether the publication-id exists?
+        # -> perhaps set up the bundle with the appropriate publications
+        #    -> gn-data -> (phenotypes-se, phenotypes-n)
+        #    -> gn-metadata -> (pheno, geno)
+        publication = save_publications(
+            cursor, _control_data.get("metadata", {}).get("publications"))
+        _pubidmap = {
+            # TODO: Map the pheno ids to the publication ids
+        }
+        # 3. a. Fetch the strain names and IDs: create a name->ID map
+        samples = samples_by_species_and_population(
+            # from uploader.samples.models import samples_by_species_and_population
+            conn, species["SpeciesId"], population["PopulationId"])
+        #    b. Save all the data items with the DataIds from `dataidmap`
+        data = save_phenotypes_data(
+            cursor, dataidmap, samples, _control_data["pheno"])
+        #    c. If standard errors and N exist, save them too
+        #       (use IDs returned in `b` above).
+        data_se = save_phenotype_se(
+            cursor, dataidmap, samples, _control_data["phenose"])
+        data_n = save_phenotype_n(
+            cursor, dataidmap, samples, _control_data["phenonum"])
+        # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
+        xrefs = cross_reference_phenotypes_publications_and_data(
+            cursor, __merge_map_with_publications__(dataidmap))
+        # 5. If entirely new data, update authorisations (break this down)
+        update_auth(_user, _species, _population, _dataset, _phenos)
+        return 0
+
+    return 1
 
 
 if __name__ == "__main__":
@@ -46,6 +129,26 @@ if __name__ == "__main__":
     with (mysqldb.database_connection(args.db_uri) as conn,
           sqlite3.connection(args.jobs_db_path) as jobs_conn):
+        # Why lock the PublishXRef/PublishData/PublishSE/NStrain tables here?
+        # The `DataId` values are sequential but not AUTO_INCREMENT, and
+        # `PublishXRef`.`DataId` cannot be converted to AUTO_INCREMENT, so
+        # the next id must be read with `SELECT MAX(DataId) FROM PublishXRef;`
+        # To check for a table lock, see:
+        # https://oracle-base.com/articles/mysql/mysql-identify-locked-tables
+        # `SHOW OPEN TABLES LIKE 'Publish%';`
+        logger.debug(
+            ("Locking database tables for the connection:"
+             "\n\t- %s\n\t- %s\n\t- %s\n\t- %s\n"),
+            "PublishXRef",
+            "PublishData",
+            "PublishSE",
+            "NStrain")
+        conn.cursor().execute(  # lock the tables to avoid race conditions
+            "LOCK TABLES "
+            "PublishXRef WRITE, "
+            "PublishData WRITE, "
+            "PublishSE WRITE, "
+            "NStrain WRITE")
         try:
             return load_data(conn, jobs.job(jobs_conn, args.job_id))
         except jobs.jobs.JobNotFound as _jne:
@@ -55,6 +158,9 @@ if __name__ == "__main__":
                 exc_info=True,
                 stack_info=True)
 
+        logger.debug("Unlocking all database tables.")
+        conn.cursor().execute("UNLOCK TABLES")
+
     return 1
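The patch calls `fetch_next_dataid(...)` but never defines it. Going by the locking comments above (`DataId` is sequential but not AUTO_INCREMENT, with `SELECT MAX(DataId) FROM PublishXRef;` as the suggested query), a minimal sketch could look like the following; the function name comes from the patch, but the body is an assumption:

```python
from MySQLdb.cursors import DictCursor


def fetch_next_dataid(cursor: DictCursor) -> int:
    """Return the next free `DataId` value.

    A sketch only: assumes the caller already holds the WRITE locks
    taken in the main block, so no other connection can claim the same
    id between this read and the inserts that follow.
    """
    cursor.execute("SELECT MAX(DataId) AS last_id FROM PublishXRef")
    row = cursor.fetchone()
    # MAX() is NULL on an empty table; start from 1 in that case.
    return int(row["last_id"] or 0) + 1
```

Reading the maximum under the same `LOCK TABLES ... WRITE` session is what makes the sequential ids safe; without the lock, two concurrent loads could both read the same maximum.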
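`save_publications` is still a stub carrying the "Check for PubMed IDs, perhaps?" note. One plausible shape, assuming each publication is a dict with a `pubmed_id` key plus the usual bibliographic fields (assumed names, not from the patch) and the GeneNetwork `Publication` table, is to reuse existing rows by PubMed ID and insert the rest:

```python
def save_publications(cursor, publications):
    """Save publications, reusing any that already exist.

    A sketch only: assumes a DictCursor, as used in `load_data`, and
    hypothetical `pubmed_id`/`authors`/`title`/`journal`/`year` keys.
    """
    saved = []
    for pub in publications or ():
        cursor.execute("SELECT Id FROM Publication WHERE PubMed_ID=%s",
                       (pub["pubmed_id"],))
        existing = cursor.fetchone()
        if existing:  # already in the database: reuse its Id
            saved.append({**pub, "publication_id": existing["Id"]})
            continue
        cursor.execute(
            "INSERT INTO Publication(PubMed_ID, Authors, Title, Journal, Year) "
            "VALUES(%s, %s, %s, %s, %s)",
            (pub["pubmed_id"], pub["authors"], pub["title"],
             pub["journal"], pub["year"]))
        saved.append({**pub, "publication_id": cursor.lastrowid})
    return tuple(saved)
```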
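Similarly, `save_phenotypes_data` (step 3b) has to translate sample names into `StrainId` values via the map returned by `samples_by_species_and_population`, then write one `PublishData` row per (data id, strain) pair. A sketch, with the shape of the `datafiles` items assumed rather than taken from the patch:

```python
def save_phenotypes_data(cursor, dataidmap, samples, datafiles):
    """Save the numeric phenotype values into PublishData.

    A sketch only: each item in `datafiles` is assumed to carry its
    "phenotype_id" and a {sample-name: value} mapping under "data";
    `samples` is assumed to map each name to a row with an "Id" key.
    """
    for item in datafiles:
        data_id = dataidmap[item["phenotype_id"]]["data_id"]
        rows = tuple((data_id, samples[name]["Id"], value)
                     for name, value in item["data"].items()
                     if value is not None)  # skip missing ("NA") values
        if rows:
            cursor.executemany(
                "INSERT INTO PublishData(Id, StrainId, value) "
                "VALUES(%s, %s, %s)",
                rows)
```

The same pattern would serve `save_phenotype_se` and `save_phenotype_n`, targeting the `PublishSE` and `NStrain` tables the main block locks.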
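Step 4's `cross_reference_phenotypes_publications_and_data` would then tie everything together in `PublishXRef`. The sketch below assumes `PublishXRef.Id` is, like `DataId`, sequential per population and therefore also depends on the table locks; the keys on the `xref_data` dicts are hypothetical:

```python
def cross_reference_phenotypes_publications_and_data(cursor, xref_data):
    """Link phenotypes, publications and data in PublishXRef.

    A sketch only: assumes a DictCursor and dicts carrying
    "population_id", "phenotype_id", "publication_id" and "data_id".
    """
    for xref in xref_data:
        cursor.execute(
            "SELECT COALESCE(MAX(Id), 0) + 1 AS next_id FROM PublishXRef "
            "WHERE InbredSetId=%s",
            (xref["population_id"],))
        next_id = cursor.fetchone()["next_id"]
        cursor.execute(
            "INSERT INTO PublishXRef(Id, InbredSetId, PhenotypeId, "
            "PublicationId, DataId) VALUES(%s, %s, %s, %s, %s)",
            (next_id, xref["population_id"], xref["phenotype_id"],
             xref["publication_id"], xref["data_id"]))
```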
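Finally, the comment block asks "How do you check for a table lock?". Using the `SHOW OPEN TABLES LIKE 'Publish%';` query it cites, a small debugging helper (hypothetical, not part of the patch) could log the lock state before and after `LOCK TABLES`:

```python
def log_publish_table_locks(conn, logger):
    """Log which Publish* tables are currently in use or name-locked.

    Uses a plain tuple cursor: SHOW OPEN TABLES returns the columns
    (Database, Table, In_use, Name_locked); In_use > 0 means some
    connection holds a lock on, or is still reading, the table.
    """
    with conn.cursor() as cursor:
        cursor.execute("SHOW OPEN TABLES LIKE 'Publish%'")
        for database, table, in_use, name_locked in cursor.fetchall():
            logger.debug("%s.%s: In_use=%s, Name_locked=%s",
                         database, table, in_use, name_locked)
```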