diff options
author | Frederick Muriuki Muriithi | 2025-04-14 12:34:52 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2025-04-14 12:35:25 -0500 |
commit | c58105c64920625b5e01d820169c884dc136d260 (patch) | |
tree | ab6bb2b95a3a39808c277b37b53fe5d33aa18362 /scripts/phenotypes_bulk_edit.py | |
parent | 49f1b2fa06da54d2f839ef6d4ca3061b92883417 (diff) | |
download | gn-uploader-c58105c64920625b5e01d820169c884dc136d260.tar.gz |
Move difference computation to `run()` function.
Diffstat (limited to 'scripts/phenotypes_bulk_edit.py')
-rw-r--r-- | scripts/phenotypes_bulk_edit.py | 97 |
1 file changed, 46 insertions, 51 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py index 1d8124e..bbce60d 100644 --- a/scripts/phenotypes_bulk_edit.py +++ b/scripts/phenotypes_bulk_edit.py @@ -10,6 +10,7 @@ from MySQLdb.cursors import DictCursor from gn_libs import jobs, mysqldb, sqlite3 +from uploader.phenotypes.models import phenotypes_data_by_ids import uploader.publications.pubmed as pmed from uploader.publications.misc import publications_differences from uploader.phenotypes.views import BULK_EDIT_COMMON_FIELDNAMES @@ -87,52 +88,6 @@ def descriptions_differences(file_data, db_data) -> dict[str, str]: return diff -def compute_differences( - conn, - file_contents, - pheno_ids, - pheno_xref_ids, - pubmed_ids -) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]: - """Compute differences between data in DB and edited data.""" - logger.info("Computing differences.") - # 1. Basic Phenotype data differences - # a. Descriptions differences - desc_diff = descriptions_differences( - file_contents, __fetch_phenotypes__(conn, pheno_ids)) - logger.debug("DESCRIPTIONS DIFFERENCES: %s", desc_diff) - - # b. 
Publications differences - db_publications = fetch_phenotype_publications(conn, pheno_xref_ids) - - _pubmed_map = { - (int(row["PubMed_ID"]) if bool(row["PubMed_ID"]) else None): f"{row['phenotype_id']}::{row['xref_id']}" - for row in file_contents - } - _pub_id_map = { - f"{pub['PhenotypeId']}::{pub['xref_id']}": pub["PublicationId"] - for pub in db_publications - } - - new_publications = update_publications( - conn, tuple({ - **pub, "publication_id": _pub_id_map[_pubmed_map[pub["pubmed_id"]]] - } for pub in pmed.fetch_publications(tuple( - pubmed_id for pubmed_id in pubmed_ids - if pubmed_id not in - tuple(row["PubMed_ID"] for row in db_publications))))) - logger.debug("New Publications: %s", new_publications) - pub_diff = publications_differences( - file_contents, db_publications, { - row["PubMed_ID" if "PubMed_ID" in row else "pubmed_id"]: row[ - "PublicationId" if "PublicationId" in row else "publication_id"] - for row in db_publications + new_publications}) - logger.debug("Publications diff: %s", pub_diff) - # 2. Data differences - # data_diff = data_differences(...) - pass - - def update_descriptions(): """Update descriptions in the database""" logger.info("Updating descriptions") @@ -216,11 +171,51 @@ def run(conn, job): check_ids(conn, pheno_xref_ids) check_for_mandatory_fields() # stop running here if any errors are found. - compute_differences(conn, - file_contents, - pheno_ids, - pheno_xref_ids, - pubmed_ids) + + ### Compute differences + logger.info("Computing differences.") + # 1. Basic Phenotype data differences + # a. Descriptions differences + _desc_diff = descriptions_differences( + file_contents, __fetch_phenotypes__(conn, pheno_ids)) + logger.debug("DESCRIPTIONS DIFFERENCES: %s", _desc_diff) + + # b. 
Publications differences + _db_publications = fetch_phenotype_publications(conn, pheno_xref_ids) + logger.debug("DB PUBLICATIONS: %s", _db_publications) + + _pubmed_map = { + (int(row["PubMed_ID"]) if bool(row["PubMed_ID"]) else None): f"{row['phenotype_id']}::{row['xref_id']}" + for row in file_contents + } + _pub_id_map = { + f"{pub['PhenotypeId']}::{pub['xref_id']}": pub["PublicationId"] + for pub in _db_publications + } + + _new_publications = update_publications( + conn, tuple({ + **pub, "publication_id": _pub_id_map[_pubmed_map[pub["pubmed_id"]]] + } for pub in pmed.fetch_publications(tuple( + pubmed_id for pubmed_id in pubmed_ids + if pubmed_id not in + tuple(row["PubMed_ID"] for row in _db_publications))))) + _pub_diff = publications_differences( + file_contents, _db_publications, { + row["PubMed_ID" if "PubMed_ID" in row else "pubmed_id"]: row[ + "PublicationId" if "PublicationId" in row else "publication_id"] + for row in _db_publications + _new_publications}) + logger.debug("Publications diff: %s", _pub_diff) + # 2. Data differences + # _db_pheno_data = phenotypes_data_by_ids(conn, tuple({ + # "population_id": job["metadata"]["population-id"], + # "phenoid": row[0], + # "xref_id": row[1] + # } for row in pheno_xref_ids)) + # logger.debug("Phenotype Data in Database: %s", _db_pheno_data) + + # data_diff = data_differences(...) + ### END: Compute differences update_descriptions() link_publications() update_values() |