1 file changed, 80 insertions, 95 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py
index 4888924..cee5f4e 100644
--- a/scripts/phenotypes_bulk_edit.py
+++ b/scripts/phenotypes_bulk_edit.py
@@ -6,14 +6,19 @@ from pathlib import Path
 from typing import Iterator
 from functools import reduce
 
-import requests
-from lxml import etree
 from MySQLdb.cursors import DictCursor
 
 from gn_libs import jobs, mysqldb, sqlite3
 
+from uploader.phenotypes.models import phenotypes_data_by_ids
+from uploader.phenotypes.misc import phenotypes_data_differences
+from uploader.phenotypes.views import BULK_EDIT_COMMON_FIELDNAMES
+
 import uploader.publications.pubmed as pmed
-from uploader.publications.models import fetch_phenotype_publications
+from uploader.publications.misc import publications_differences
+from uploader.publications.models import (
+    update_publications, fetch_phenotype_publications)
+
 logging.basicConfig(
     format="%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
@@ -85,93 +90,6 @@ def descriptions_differences(file_data, db_data) -> dict[str, str]:
     return diff
 
 
-def __save_new_publications__(conn, publications, pubmed_ids) -> dict:
-    if len(publications) > 0:
-        with conn.cursor(cursorclass=DictCursor) as cursor:
-            cursor.executemany(
-                ("INSERT INTO "
-                 "Publication( "
-                 "PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, "
-                 "Month, Year"
-                 ") "
-                 "VALUES("
-                 "%(pubmed_id)s, %(abstract)s, %(authors)s, %(title)s, "
-                 "%(journal)s, %(volume)s, %(pages)s, %(month)s, %(year)s"
-                 ") "
-                 "ON DUPLICATE KEY UPDATE "
-                 "Abstract=VALUES(Abstract), Authors=VALUES(Authors), "
-                 "Title=VALUES(Title), Journal=VALUES(Journal), "
-                 "Volume=VALUES(Volume), Pages=VALUES(pages), "
-                 "Month=VALUES(Month), Year=VALUES(Year)"),
-                publications)
-
-            paramstr = ", ".join(["%s"] * len(pubmed_ids))
-            cursor.execute(
-                ("SELECT Id, PubMed_ID FROM Publication "
-                 f"WHERE PubMed_ID IN ({paramstr})"),
-                pubmed_ids)
-            return {
-                row["PubMed_ID"]: row["Id"] for row in cursor.fetchall()
-            }
-        return {}
-
-
-def publications_differences(conn, file_data, db_data, pubmed_ids) -> dict:
-    """Compute differences in the publications."""
-    logger.info("Computing differences in publications.")
-    assert len(file_data) == len(db_data), "Publication counts differ!"
-    db_pubmed_ids = reduce(lambda coll, curr: coll.union(set([curr["PubMed_ID"]])),
-                           db_data,
-                           set([None]))
-
-    pubmedid_to_id_map = {
-        f"{row['PhenotypeId']}::{row['xref_id']}": row["PublicationId"] for row in db_data
-    }
-    new_pubmed_ids = tuple(pubmed_ids.difference(db_pubmed_ids))
-    new_publications = __save_new_publications__(
-        conn, pmed.fetch_publications(new_pubmed_ids), new_pubmed_ids)
-    new_pubmedid_to_id_map = {
-        row["PubMed_ID"]: new_publications.get(
-            row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"])
-        for row in file_data
-    }
-
-    return tuple(
-        item for item in ({
-            "PhenotypeId": row["phenotype_id"],
-            "xref_id": row["xref_id"],
-            "PublicationId": new_pubmedid_to_id_map.get(
-                row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"])
-        } for row in file_data)
-        if item["PublicationId"] != pubmedid_to_id_map[f"{item['PhenotypeId']}::{item['xref_id']}"])
-
-
-def compute_differences(
-        conn,
-        file_contents,
-        pheno_ids,
-        pheno_xref_ids,
-        pubmed_ids
-) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]:
-    """Compute differences between data in DB and edited data."""
-    logger.info("Computing differences.")
-    # 1. Basic Phenotype data differences
-    #    a. Descriptions differences
-    desc_diff = descriptions_differences(
-        file_contents, __fetch_phenotypes__(conn, pheno_ids))
-    logger.debug("DESCRIPTIONS DIFFERENCES: %s", desc_diff)
-    #    b. Publications differences
-    pub_diff = publications_differences(
-        conn,
-        file_contents,
-        fetch_phenotype_publications(conn, pheno_xref_ids),
-        pubmed_ids)
-    logger.debug("Publications diff: %s", pub_diff)
-    # 2. Data differences
-    # data_diff = data_differences(...)
-    pass
-
-
 def update_descriptions():
     """Update descriptions in the database"""
     logger.info("Updating descriptions")
@@ -232,6 +150,12 @@ def read_file(filepath: Path) -> Iterator[str]:
                 ((None if item.strip() == "" else item.strip())
                  for item in fields)))
             _pheno, _xref = _dict.pop("UniqueIdentifier").split("::")
+            _dict = {
+                key: ((float(val) if bool(val) else val)
+                      if key not in BULK_EDIT_COMMON_FIELDNAMES
+                      else val)
+                for key, val in _dict.items()
+            }
             _dict["phenotype_id"] = int(_pheno.split(":")[1])
             _dict["xref_id"] = int(_xref.split(":")[1])
             if _dict["PubMed_ID"] is not None:
@@ -255,11 +179,68 @@ def run(conn, job):
     check_ids(conn, pheno_xref_ids)
     check_for_mandatory_fields()
     # stop running here if any errors are found.
-    compute_differences(conn,
-                        file_contents,
-                        pheno_ids,
-                        pheno_xref_ids,
-                        pubmed_ids)
+
+    ### Compute differences
+    logger.info("Computing differences.")
+    # 1. Basic Phenotype data differences
+    #    a. Descriptions differences
+    _desc_diff = descriptions_differences(
+        file_contents, __fetch_phenotypes__(conn, pheno_ids))
+    logger.debug("DESCRIPTIONS DIFFERENCES: %s", _desc_diff)
+
+    #    b. Publications differences
+    _db_publications = fetch_phenotype_publications(conn, pheno_xref_ids)
+    logger.debug("DB PUBLICATIONS: %s", _db_publications)
+
+    _pubmed_map = {
+        (int(row["PubMed_ID"]) if bool(row["PubMed_ID"]) else None): f"{row['phenotype_id']}::{row['xref_id']}"
+        for row in file_contents
+    }
+    _pub_id_map = {
+        f"{pub['PhenotypeId']}::{pub['xref_id']}": pub["PublicationId"]
+        for pub in _db_publications
+    }
+
+    _new_publications = update_publications(
+        conn, tuple({
+            **pub, "publication_id": _pub_id_map[_pubmed_map[pub["pubmed_id"]]]
+        } for pub in pmed.fetch_publications(tuple(
+            pubmed_id for pubmed_id in pubmed_ids
+            if pubmed_id not in
+            tuple(row["PubMed_ID"] for row in _db_publications)))))
+    _pub_diff = publications_differences(
+        file_contents, _db_publications, {
+            row["PubMed_ID" if "PubMed_ID" in row else "pubmed_id"]: row[
+                "PublicationId" if "PublicationId" in row else "publication_id"]
+            for row in _db_publications + _new_publications})
+    logger.debug("Publications diff: %s", _pub_diff)
+    # 2. Data differences
+    _db_pheno_data = phenotypes_data_by_ids(conn, tuple({
+        "population_id": job["metadata"]["population-id"],
+        "phenoid": row[0],
+        "xref_id": row[1]
+    } for row in pheno_xref_ids))
+
+    data_diff = phenotypes_data_differences(
+        ({
+            "phenotype_id": row["phenotype_id"],
+            "xref_id": row["xref_id"],
+            "data": {
+                key:val for key,val in row.items()
+                if key not in BULK_EDIT_COMMON_FIELDNAMES + [
+                        "phenotype_id", "xref_id"]
+            }
+        } for row in file_contents),
+        ({
+            **row,
+            "PhenotypeId": row["Id"],
+            "data": {
+                dataitem["StrainName"]: dataitem
+                for dataitem in row["data"].values()
+            }
+        } for row in _db_pheno_data))
+    logger.debug("Data differences: %s", data_diff)
+    ### END: Compute differences
     update_descriptions()
     link_publications()
     update_values()
@@ -272,6 +253,10 @@ def main():
     logger.setLevel(args.log_level.upper())
     logger.debug("Arguments: %s", args)
 
+    logging.getLogger("uploader.phenotypes.misc").setLevel(args.log_level.upper())
+    logging.getLogger("uploader.phenotypes.models").setLevel(args.log_level.upper())
+    logging.getLogger("uploader.publications.models").setLevel(args.log_level.upper())
+
     with (mysqldb.database_connection(args.db_uri) as conn,
           sqlite3.connection(args.jobs_db_path) as jobs_conn):
         return run(conn, jobs.job(jobs_conn, args.job_id))