diff options
author | Frederick Muriuki Muriithi | 2025-04-09 17:02:28 -0500 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2025-04-09 17:02:28 -0500 |
commit | b1b154b3d7cb146e6d9862ca5df622738e61654d (patch) | |
tree | d7921876f378698e1e82577b4a5ec9f461488133 /scripts | |
parent | d45ea3bbd7185b0867ea6ea0695015ea9c441f7e (diff) | |
download | gn-uploader-b1b154b3d7cb146e6d9862ca5df622738e61654d.tar.gz |
Compute publications differences.
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/phenotypes_bulk_edit.py | 41 |
1 files changed, 37 insertions, 4 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py index 9ff5ffc..395b1bb 100644 --- a/scripts/phenotypes_bulk_edit.py +++ b/scripts/phenotypes_bulk_edit.py @@ -225,24 +225,57 @@ def __save_new_publications__(conn, publications, pubmed_ids) -> dict: return {} +def publications_differences(conn, file_data, db_data, pubmed_ids) -> dict: """Compute differences in the publications.""" logger.info("Computing differences in publications.") + assert len(file_data) == len(db_data), "Publication counts differ!" db_pubmed_ids = reduce(lambda coll, curr: coll.union(set([curr["PubMed_ID"]])), db_data, set([None])) + + pubmedid_to_id_map = { + f"{row['PhenotypeId']}::{row['xref_id']}": row["PublicationId"] for row in db_data + } new_pubmed_ids = tuple(pubmed_ids.difference(db_pubmed_ids)) - new_publications = __fetch_new_pubmed_ids__(new_pubmed_ids) + new_publications = __save_new_publications__( + conn, __fetch_new_pubmed_ids__(new_pubmed_ids), new_pubmed_ids) + new_pubmedid_to_id_map = { + row["PubMed_ID"]: new_publications.get( + row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"]) + for row in file_data + } + return tuple( + item for item in ({ + "PhenotypeId": row["phenotype_id"], + "xref_id": row["xref_id"], + "PublicationId": new_pubmedid_to_id_map.get( + row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"]) + } for row in file_data) + if item["PublicationId"] != pubmedid_to_id_map[f"{item['PhenotypeId']}::{item['xref_id']}"]) -def compute_differences(conn, file_contents, pheno_ids, pheno_xref_ids, pubmed_ids) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]: + +def compute_differences( + conn, + file_contents, + pheno_ids, + pheno_xref_ids, + pubmed_ids +) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]: """Compute differences between data in DB and edited data.""" logger.info("Computing differences.") # 1. Basic Phenotype data differences # a. Descriptions differences - desc_diff = descriptions_differences(file_contents, __fetch_phenotypes__(conn, pheno_ids)) + desc_diff = descriptions_differences( + file_contents, __fetch_phenotypes__(conn, pheno_ids)) logger.debug("DESCRIPTIONS DIFFERENCES: %s", desc_diff) # b. Publications differences - # pub_diff = publications_differences(...) + pub_diff = publications_differences( + conn, + file_contents, + __fetch_publications__(conn, pheno_xref_ids), + pubmed_ids) + logger.debug("Publications diff: %s", pub_diff) # 2. Data differences # data_diff = data_differences(...) pass |