diff options
Diffstat (limited to 'scripts/phenotypes_bulk_edit.py')
-rw-r--r-- | scripts/phenotypes_bulk_edit.py | 175 |
1 files changed, 80 insertions, 95 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py index 4888924..cee5f4e 100644 --- a/scripts/phenotypes_bulk_edit.py +++ b/scripts/phenotypes_bulk_edit.py @@ -6,14 +6,19 @@ from pathlib import Path from typing import Iterator from functools import reduce -import requests -from lxml import etree from MySQLdb.cursors import DictCursor from gn_libs import jobs, mysqldb, sqlite3 +from uploader.phenotypes.models import phenotypes_data_by_ids +from uploader.phenotypes.misc import phenotypes_data_differences +from uploader.phenotypes.views import BULK_EDIT_COMMON_FIELDNAMES + import uploader.publications.pubmed as pmed -from uploader.publications.models import fetch_phenotype_publications +from uploader.publications.misc import publications_differences +from uploader.publications.models import ( + update_publications, fetch_phenotype_publications) + logging.basicConfig( format="%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: %(message)s") logger = logging.getLogger(__name__) @@ -85,93 +90,6 @@ def descriptions_differences(file_data, db_data) -> dict[str, str]: return diff -def __save_new_publications__(conn, publications, pubmed_ids) -> dict: - if len(publications) > 0: - with conn.cursor(cursorclass=DictCursor) as cursor: - cursor.executemany( - ("INSERT INTO " - "Publication( " - "PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, " - "Month, Year" - ") " - "VALUES(" - "%(pubmed_id)s, %(abstract)s, %(authors)s, %(title)s, " - "%(journal)s, %(volume)s, %(pages)s, %(month)s, %(year)s" - ") " - "ON DUPLICATE KEY UPDATE " - "Abstract=VALUES(Abstract), Authors=VALUES(Authors), " - "Title=VALUES(Title), Journal=VALUES(Journal), " - "Volume=VALUES(Volume), Pages=VALUES(pages), " - "Month=VALUES(Month), Year=VALUES(Year)"), - publications) - - paramstr = ", ".join(["%s"] * len(pubmed_ids)) - cursor.execute( - ("SELECT Id, PubMed_ID FROM Publication " - f"WHERE PubMed_ID IN ({paramstr})"), - pubmed_ids) - return { - row["PubMed_ID"]: row["Id"] for row in cursor.fetchall() - } - return {} - - -def publications_differences(conn, file_data, db_data, pubmed_ids) -> dict: - """Compute differences in the publications.""" - logger.info("Computing differences in publications.") - assert len(file_data) == len(db_data), "Publication counts differ!" - db_pubmed_ids = reduce(lambda coll, curr: coll.union(set([curr["PubMed_ID"]])), - db_data, - set([None])) - - pubmedid_to_id_map = { - f"{row['PhenotypeId']}::{row['xref_id']}": row["PublicationId"] for row in db_data - } - new_pubmed_ids = tuple(pubmed_ids.difference(db_pubmed_ids)) - new_publications = __save_new_publications__( - conn, pmed.fetch_publications(new_pubmed_ids), new_pubmed_ids) - new_pubmedid_to_id_map = { - row["PubMed_ID"]: new_publications.get( - row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"]) - for row in file_data - } - - return tuple( - item for item in ({ - "PhenotypeId": row["phenotype_id"], - "xref_id": row["xref_id"], - "PublicationId": new_pubmedid_to_id_map.get( - row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"]) - } for row in file_data) - if item["PublicationId"] != pubmedid_to_id_map[f"{item['PhenotypeId']}::{item['xref_id']}"]) - - -def compute_differences( - conn, - file_contents, - pheno_ids, - pheno_xref_ids, - pubmed_ids -) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]: - """Compute differences between data in DB and edited data.""" - logger.info("Computing differences.") - # 1. Basic Phenotype data differences - # a. Descriptions differences - desc_diff = descriptions_differences( - file_contents, __fetch_phenotypes__(conn, pheno_ids)) - logger.debug("DESCRIPTIONS DIFFERENCES: %s", desc_diff) - # b. Publications differences - pub_diff = publications_differences( - conn, - file_contents, - fetch_phenotype_publications(conn, pheno_xref_ids), - pubmed_ids) - logger.debug("Publications diff: %s", pub_diff) - # 2. Data differences - # data_diff = data_differences(...) - pass - - def update_descriptions(): """Update descriptions in the database""" logger.info("Updating descriptions") @@ -232,6 +150,12 @@ def read_file(filepath: Path) -> Iterator[str]: ((None if item.strip() == "" else item.strip()) for item in fields))) _pheno, _xref = _dict.pop("UniqueIdentifier").split("::") + _dict = { + key: ((float(val) if bool(val) else val) + if key not in BULK_EDIT_COMMON_FIELDNAMES + else val) + for key, val in _dict.items() + } _dict["phenotype_id"] = int(_pheno.split(":")[1]) _dict["xref_id"] = int(_xref.split(":")[1]) if _dict["PubMed_ID"] is not None: @@ -255,11 +179,68 @@ def run(conn, job): check_ids(conn, pheno_xref_ids) check_for_mandatory_fields() # stop running here if any errors are found. - compute_differences(conn, - file_contents, - pheno_ids, - pheno_xref_ids, - pubmed_ids) + + ### Compute differences + logger.info("Computing differences.") + # 1. Basic Phenotype data differences + # a. Descriptions differences + _desc_diff = descriptions_differences( + file_contents, __fetch_phenotypes__(conn, pheno_ids)) + logger.debug("DESCRIPTIONS DIFFERENCES: %s", _desc_diff) + + # b. Publications differences + _db_publications = fetch_phenotype_publications(conn, pheno_xref_ids) + logger.debug("DB PUBLICATIONS: %s", _db_publications) + + _pubmed_map = { + (int(row["PubMed_ID"]) if bool(row["PubMed_ID"]) else None): f"{row['phenotype_id']}::{row['xref_id']}" + for row in file_contents + } + _pub_id_map = { + f"{pub['PhenotypeId']}::{pub['xref_id']}": pub["PublicationId"] + for pub in _db_publications + } + + _new_publications = update_publications( + conn, tuple({ + **pub, "publication_id": _pub_id_map[_pubmed_map[pub["pubmed_id"]]] + } for pub in pmed.fetch_publications(tuple( + pubmed_id for pubmed_id in pubmed_ids + if pubmed_id not in + tuple(row["PubMed_ID"] for row in _db_publications))))) + _pub_diff = publications_differences( + file_contents, _db_publications, { + row["PubMed_ID" if "PubMed_ID" in row else "pubmed_id"]: row[ + "PublicationId" if "PublicationId" in row else "publication_id"] + for row in _db_publications + _new_publications}) + logger.debug("Publications diff: %s", _pub_diff) + # 2. Data differences + _db_pheno_data = phenotypes_data_by_ids(conn, tuple({ + "population_id": job["metadata"]["population-id"], + "phenoid": row[0], + "xref_id": row[1] + } for row in pheno_xref_ids)) + + data_diff = phenotypes_data_differences( + ({ + "phenotype_id": row["phenotype_id"], + "xref_id": row["xref_id"], + "data": { + key:val for key,val in row.items() + if key not in BULK_EDIT_COMMON_FIELDNAMES + [ + "phenotype_id", "xref_id"] + } + } for row in file_contents), + ({ + **row, + "PhenotypeId": row["Id"], + "data": { + dataitem["StrainName"]: dataitem + for dataitem in row["data"].values() + } + } for row in _db_pheno_data)) + logger.debug("Data differences: %s", data_diff) + ### END: Compute differences update_descriptions() link_publications() update_values() @@ -272,6 +253,10 @@ def main(): logger.setLevel(args.log_level.upper()) logger.debug("Arguments: %s", args) + logging.getLogger("uploader.phenotypes.misc").setLevel(args.log_level.upper()) + logging.getLogger("uploader.phenotypes.models").setLevel(args.log_level.upper()) + logging.getLogger("uploader.publications.models").setLevel(args.log_level.upper()) + with (mysqldb.database_connection(args.db_uri) as conn, sqlite3.connection(args.jobs_db_path) as jobs_conn): return run(conn, jobs.job(jobs_conn, args.job_id)) |