aboutsummaryrefslogtreecommitdiff
path: root/scripts/phenotypes_bulk_edit.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/phenotypes_bulk_edit.py')
-rw-r--r--scripts/phenotypes_bulk_edit.py167
1 file changed, 84 insertions, 83 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py
index b647199..cee5f4e 100644
--- a/scripts/phenotypes_bulk_edit.py
+++ b/scripts/phenotypes_bulk_edit.py
@@ -6,11 +6,19 @@ from pathlib import Path
from typing import Iterator
from functools import reduce
-import requests
from MySQLdb.cursors import DictCursor
from gn_libs import jobs, mysqldb, sqlite3
+from uploader.phenotypes.models import phenotypes_data_by_ids
+from uploader.phenotypes.misc import phenotypes_data_differences
+from uploader.phenotypes.views import BULK_EDIT_COMMON_FIELDNAMES
+
+import uploader.publications.pubmed as pmed
+from uploader.publications.misc import publications_differences
+from uploader.publications.models import (
+ update_publications, fetch_phenotype_publications)
+
logging.basicConfig(
format="%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
@@ -82,83 +90,6 @@ def descriptions_differences(file_data, db_data) -> dict[str, str]:
return diff
-def __fetch_publications__(conn, ids):
- """Fetch publication from database by ID."""
- paramstr = ",".join(["(%s, %s)"] * len(ids))
- query = (
- "SELECT "
- "pxr.PhenotypeId, pxr.Id AS xref_id, pxr.PublicationId, pub.PubMed_ID "
- "FROM PublishXRef AS pxr INNER JOIN Publication AS pub "
- "ON pxr.PublicationId=pub.Id "
- f"WHERE (pxr.PhenotypeId, pxr.Id) IN ({paramstr})")
- with conn.cursor(cursorclass=DictCursor) as cursor:
- cursor.execute(query, tuple(item for row in ids for item in row))
- return tuple(dict(row) for row in cursor.fetchall())
-
-
-def __process_pubmed_publication_data__(text):
- """Process the data from PubMed into usable data."""
- # Process with lxml
- pass
-
-
-def __fetch_new_pubmed_ids__(pubmed_ids):
- """Retrieve data on new publications from NCBI."""
- # See whether we can retrieve multiple publications in one go
- # Parse data and save to DB
- # Return PublicationId(s) for new publication(s).
- logger.info("Fetching publications data for the following PubMed IDs: %s",
- ", ".join(pubmed_ids))
-
- # Should we, perhaps, pass this in from a config variable?
- uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
- try:
- response = request.get(
- uri,
- params={
- "db": "pubmed",
- "retmode": "xml",
- "id": ",".join(str(item) for item in pubmed_ids)
- })
-
- if response.status_code == 200:
- return __process_pubmed_publication_data__(response.text)
-
- logger.error(
- "Could not fetch the new publication from %s (status code: %s)",
- uri,
- response.status_code)
- except requests.exceptions.ConnectionError:
- logger.error("Could not find the domain %s", uri)
-
- return tuple()
-
-
-def publications_differences(file_data, db_data, pubmed_ids):
- """Compute differences in the publications."""
- logger.info("Computing differences in publications.")
- db_pubmed_ids = reduce(lambda coll, curr: coll.union(set([curr["PubMed_ID"]])),
- db_data,
- set([None]))
- new_pubmeds = __fetch_new_pubmed_ids__(tuple(
- pubmed_ids.difference(db_pubmed_ids)))
- pass
-
-
-def compute_differences(conn, file_contents, pheno_ids, pheno_xref_ids, pubmed_ids) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]:
- """Compute differences between data in DB and edited data."""
- logger.info("Computing differences.")
- # 1. Basic Phenotype data differences
- # a. Descriptions differences
- desc_diff = descriptions_differences(file_contents, __fetch_phenotypes__(conn, pheno_ids))
- logger.debug("DESCRIPTIONS DIFFERENCES: %s", desc_diff)
- # b. Publications differences
- # pub_diff = publications_differences(...)
- # 2. Data differences
- # data_diff = data_differences(...)
- pass
-
-
def update_descriptions():
"""Update descriptions in the database"""
logger.info("Updating descriptions")
@@ -219,8 +150,17 @@ def read_file(filepath: Path) -> Iterator[str]:
((None if item.strip() == "" else item.strip())
for item in fields)))
_pheno, _xref = _dict.pop("UniqueIdentifier").split("::")
+ _dict = {
+ key: ((float(val) if bool(val) else val)
+ if key not in BULK_EDIT_COMMON_FIELDNAMES
+ else val)
+ for key, val in _dict.items()
+ }
_dict["phenotype_id"] = int(_pheno.split(":")[1])
_dict["xref_id"] = int(_xref.split(":")[1])
+ if _dict["PubMed_ID"] is not None:
+ _dict["PubMed_ID"] = int(_dict["PubMed_ID"])
+
yield _dict
count = count + 1
@@ -239,11 +179,68 @@ def run(conn, job):
check_ids(conn, pheno_xref_ids)
check_for_mandatory_fields()
# stop running here if any errors are found.
- compute_differences(conn,
- file_contents,
- pheno_ids,
- pheno_xref_ids,
- pubmed_ids)
+
+ ### Compute differences
+ logger.info("Computing differences.")
+ # 1. Basic Phenotype data differences
+ # a. Descriptions differences
+ _desc_diff = descriptions_differences(
+ file_contents, __fetch_phenotypes__(conn, pheno_ids))
+ logger.debug("DESCRIPTIONS DIFFERENCES: %s", _desc_diff)
+
+ # b. Publications differences
+ _db_publications = fetch_phenotype_publications(conn, pheno_xref_ids)
+ logger.debug("DB PUBLICATIONS: %s", _db_publications)
+
+ _pubmed_map = {
+ (int(row["PubMed_ID"]) if bool(row["PubMed_ID"]) else None): f"{row['phenotype_id']}::{row['xref_id']}"
+ for row in file_contents
+ }
+ _pub_id_map = {
+ f"{pub['PhenotypeId']}::{pub['xref_id']}": pub["PublicationId"]
+ for pub in _db_publications
+ }
+
+ _new_publications = update_publications(
+ conn, tuple({
+ **pub, "publication_id": _pub_id_map[_pubmed_map[pub["pubmed_id"]]]
+ } for pub in pmed.fetch_publications(tuple(
+ pubmed_id for pubmed_id in pubmed_ids
+ if pubmed_id not in
+ tuple(row["PubMed_ID"] for row in _db_publications)))))
+ _pub_diff = publications_differences(
+ file_contents, _db_publications, {
+ row["PubMed_ID" if "PubMed_ID" in row else "pubmed_id"]: row[
+ "PublicationId" if "PublicationId" in row else "publication_id"]
+ for row in _db_publications + _new_publications})
+ logger.debug("Publications diff: %s", _pub_diff)
+ # 2. Data differences
+ _db_pheno_data = phenotypes_data_by_ids(conn, tuple({
+ "population_id": job["metadata"]["population-id"],
+ "phenoid": row[0],
+ "xref_id": row[1]
+ } for row in pheno_xref_ids))
+
+ data_diff = phenotypes_data_differences(
+ ({
+ "phenotype_id": row["phenotype_id"],
+ "xref_id": row["xref_id"],
+ "data": {
+ key:val for key,val in row.items()
+ if key not in BULK_EDIT_COMMON_FIELDNAMES + [
+ "phenotype_id", "xref_id"]
+ }
+ } for row in file_contents),
+ ({
+ **row,
+ "PhenotypeId": row["Id"],
+ "data": {
+ dataitem["StrainName"]: dataitem
+ for dataitem in row["data"].values()
+ }
+ } for row in _db_pheno_data))
+ logger.debug("Data differences: %s", data_diff)
+ ### END: Compute differences
update_descriptions()
link_publications()
update_values()
@@ -256,6 +253,10 @@ def main():
logger.setLevel(args.log_level.upper())
logger.debug("Arguments: %s", args)
+ logging.getLogger("uploader.phenotypes.misc").setLevel(args.log_level.upper())
+ logging.getLogger("uploader.phenotypes.models").setLevel(args.log_level.upper())
+ logging.getLogger("uploader.publications.models").setLevel(args.log_level.upper())
+
with (mysqldb.database_connection(args.db_uri) as conn,
sqlite3.connection(args.jobs_db_path) as jobs_conn):
return run(conn, jobs.job(jobs_conn, args.job_id))