about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2025-04-09 17:02:28 -0500
committer Frederick Muriuki Muriithi 2025-04-09 17:02:28 -0500
commit b1b154b3d7cb146e6d9862ca5df622738e61654d (patch)
tree d7921876f378698e1e82577b4a5ec9f461488133 /scripts
parent d45ea3bbd7185b0867ea6ea0695015ea9c441f7e (diff)
download gn-uploader-b1b154b3d7cb146e6d9862ca5df622738e61654d.tar.gz
Compute publications differences.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/phenotypes_bulk_edit.py 41
1 files changed, 37 insertions, 4 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py
index 9ff5ffc..395b1bb 100644
--- a/scripts/phenotypes_bulk_edit.py
+++ b/scripts/phenotypes_bulk_edit.py
@@ -225,24 +225,57 @@ def __save_new_publications__(conn, publications, pubmed_ids) -> dict:
return {}
+def publications_differences(conn, file_data, db_data, pubmed_ids) -> dict:
"""Compute differences in the publications."""
logger.info("Computing differences in publications.")
+ assert len(file_data) == len(db_data), "Publication counts differ!"
db_pubmed_ids = reduce(lambda coll, curr: coll.union(set([curr["PubMed_ID"]])),
db_data,
set([None]))
+
+ pubmedid_to_id_map = {
+ f"{row['PhenotypeId']}::{row['xref_id']}": row["PublicationId"] for row in db_data
+ }
new_pubmed_ids = tuple(pubmed_ids.difference(db_pubmed_ids))
- new_publications = __fetch_new_pubmed_ids__(new_pubmed_ids)
+ new_publications = __save_new_publications__(
+ conn, __fetch_new_pubmed_ids__(new_pubmed_ids), new_pubmed_ids)
+ new_pubmedid_to_id_map = {
+ row["PubMed_ID"]: new_publications.get(
+ row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"])
+ for row in file_data
+ }
+ return tuple(
+ item for item in ({
+ "PhenotypeId": row["phenotype_id"],
+ "xref_id": row["xref_id"],
+ "PublicationId": new_pubmedid_to_id_map.get(
+ row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"])
+ } for row in file_data)
+ if item["PublicationId"] != pubmedid_to_id_map[f"{item['PhenotypeId']}::{item['xref_id']}"])
-def compute_differences(conn, file_contents, pheno_ids, pheno_xref_ids, pubmed_ids) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]:
+
+def compute_differences(
+ conn,
+ file_contents,
+ pheno_ids,
+ pheno_xref_ids,
+ pubmed_ids
+) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]:
"""Compute differences between data in DB and edited data."""
logger.info("Computing differences.")
# 1. Basic Phenotype data differences
# a. Descriptions differences
- desc_diff = descriptions_differences(file_contents, __fetch_phenotypes__(conn, pheno_ids))
+ desc_diff = descriptions_differences(
+ file_contents, __fetch_phenotypes__(conn, pheno_ids))
logger.debug("DESCRIPTIONS DIFFERENCES: %s", desc_diff)
# b. Publications differences
- # pub_diff = publications_differences(...)
+ pub_diff = publications_differences(
+ conn,
+ file_contents,
+ __fetch_publications__(conn, pheno_xref_ids),
+ pubmed_ids)
+ logger.debug("Publications diff: %s", pub_diff)
# 2. Data differences
# data_diff = data_differences(...)
pass