diff options
Diffstat (limited to 'uploader/publications')
-rw-r--r-- | uploader/publications/__init__.py | 2 | ||||
-rw-r--r-- | uploader/publications/datatables.py | 52 | ||||
-rw-r--r-- | uploader/publications/misc.py | 25 | ||||
-rw-r--r-- | uploader/publications/models.py | 96 | ||||
-rw-r--r-- | uploader/publications/pubmed.py | 103 | ||||
-rw-r--r-- | uploader/publications/views.py | 107 |
6 files changed, 385 insertions, 0 deletions
diff --git a/uploader/publications/__init__.py b/uploader/publications/__init__.py new file mode 100644 index 0000000..7efcabb --- /dev/null +++ b/uploader/publications/__init__.py @@ -0,0 +1,2 @@ +"""Package for handling publications.""" +from .views import pubbp diff --git a/uploader/publications/datatables.py b/uploader/publications/datatables.py new file mode 100644 index 0000000..e07fafd --- /dev/null +++ b/uploader/publications/datatables.py @@ -0,0 +1,52 @@ +"""Fetch data for datatables.""" +import logging +from typing import Optional + +from MySQLdb.cursors import DictCursor + +from gn_libs.mysqldb import Connection, debug_query + +logger = logging.getLogger(__name__) + +def fetch_publications( + conn: Connection, + search: Optional[str] = None, + offset: int = 0, + limit: int = -1 +) -> tuple[dict, int, int, int]: + """Fetch publications from the database.""" + _query = "SELECT * FROM Publication" + _count_query = "SELECT COUNT(*) FROM Publication" + _params = None + _where_clause = "" + _limit_clause = "" + if search is not None and bool(search): + _where_clause = ("WHERE PubMed_ID LIKE %s " + "OR Authors LIKE %s " + "OR Title LIKE %s") + _params = (f"%{search}%",) * 3 + + if limit > 0: + _limit_clause = f"LIMIT {limit} OFFSET {offset}" + + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute("SELECT COUNT(*) FROM Publication") + _total_rows = int(cursor.fetchone()["COUNT(*)"]) + + cursor.execute(f"{_count_query} {_where_clause}", _params) + debug_query(cursor, logger) + _result = cursor.fetchone() + _total_filtered = int(_result["COUNT(*)"] if bool(_result) else 0) + + cursor.execute(f"{_query} {_where_clause} {_limit_clause}", _params) + debug_query(cursor, logger) + _current_filtered = tuple( + {**dict(row), "index": idx} + for idx, row + in enumerate(cursor.fetchall(), start=offset+1)) + + return ( + _current_filtered, + len(_current_filtered), + _total_filtered, + _total_rows) diff --git a/uploader/publications/misc.py b/uploader/publications/misc.py new file mode 100644 index 0000000..fca6f71 --- /dev/null +++ b/uploader/publications/misc.py @@ -0,0 +1,25 @@ +"""Miscellaneous functions dealing with publications.""" + + +def publications_differences( + filedata: tuple[dict, ...], + dbdata: tuple[dict, ...], + pubmedid2pubidmap: tuple[dict, ...] +) -> tuple[dict, ...]: + """Compute the differences between file data and db data""" + diff = tuple() + for filerow, dbrow in zip( + sorted(filedata, key=lambda item: ( + item["phenotype_id"], item["xref_id"])), + sorted(dbdata, key=lambda item: ( + item["PhenotypeId"], item["xref_id"]))): + if filerow["PubMed_ID"] == dbrow["PubMed_ID"]: + continue + + newpubmed = filerow["PubMed_ID"] + diff = diff + ({ + **dbrow, + "PubMed_ID": newpubmed, + "PublicationId": pubmedid2pubidmap.get(newpubmed)},) + + return diff diff --git a/uploader/publications/models.py b/uploader/publications/models.py new file mode 100644 index 0000000..b199991 --- /dev/null +++ b/uploader/publications/models.py @@ -0,0 +1,96 @@ +"""Module to handle persistence and retrieval of publication to/from MariaDB""" +import logging +from typing import Iterable, Optional + +from MySQLdb.cursors import DictCursor + +from gn_libs.mysqldb import Connection, debug_query + +logger = logging.getLogger(__name__) + + +def fetch_phenotype_publications( + conn: Connection, + ids: tuple[tuple[int, int], ...] +) -> tuple[dict, ...]: + """Fetch publication from database by ID.""" + paramstr = ",".join(["(%s, %s)"] * len(ids)) + query = ( + "SELECT " + "pxr.PhenotypeId, pxr.Id AS xref_id, pxr.PublicationId, pub.PubMed_ID " + "FROM PublishXRef AS pxr INNER JOIN Publication AS pub " + "ON pxr.PublicationId=pub.Id " + f"WHERE (pxr.PhenotypeId, pxr.Id) IN ({paramstr})") + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute(query, tuple(item for row in ids for item in row)) + return tuple(dict(row) for row in cursor.fetchall()) + + +def create_new_publications( + conn: Connection, + publications: tuple[dict, ...] +) -> tuple[dict, ...]: + if len(publications) > 0: + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.executemany( + ("INSERT INTO " + "Publication( " + "PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, " + "Month, Year" + ") " + "VALUES(" + "%(pubmed_id)s, %(abstract)s, %(authors)s, %(title)s, " + "%(journal)s, %(volume)s, %(pages)s, %(month)s, %(year)s" + ") " + "RETURNING *"), + publications) + return tuple({ + **row, "publication_id": row["Id"] + } for row in cursor.fetchall()) + return tuple() + + +def update_publications(conn: Connection , publications: tuple[dict, ...]) -> tuple[dict, ...]: + """Update details for multiple publications""" + if len(publications) > 0: + with conn.cursor(cursorclass=DictCursor) as cursor: + logger.debug("UPDATING PUBLICATIONS: %s", publications) + cursor.executemany( + ("UPDATE Publication SET " + "PubMed_ID=%(pubmed_id)s, Abstract=%(abstract)s, " + "Authors=%(authors)s, Title=%(title)s, Journal=%(journal)s, " + "Volume=%(volume)s, Pages=%(pages)s, Month=%(month)s, " + "Year=%(year)s " + "WHERE Id=%(publication_id)s"), + publications) + debug_query(cursor, logger) + return publications + return tuple() + return tuple() + + +def fetch_publication_by_id(conn: Connection, publication_id: int) -> dict: + """Fetch a specific publication from the database.""" + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute("SELECT * FROM Publication WHERE Id=%s", + (publication_id,)) + _res = cursor.fetchone() + return dict(_res) if _res else {} + + +def fetch_publication_phenotypes( + conn: Connection, publication_id: int) -> Iterable[dict]: + """Fetch all phenotypes linked to this publication.""" + with conn.cursor(cursorclass=DictCursor) as cursor: + cursor.execute( + "SELECT pxr.Id AS xref_id, pxr.PublicationId, phe.* " + "FROM PublishXRef AS pxr INNER JOIN Phenotype AS phe " + "ON pxr.PhenotypeId=phe.Id " + "WHERE pxr.PublicationId=%s", + (publication_id,)) + while True: + row = cursor.fetchone() + if row: + yield row + else: + break diff --git a/uploader/publications/pubmed.py b/uploader/publications/pubmed.py new file mode 100644 index 0000000..ed9b652 --- /dev/null +++ b/uploader/publications/pubmed.py @@ -0,0 +1,103 @@ +"""Module to interact with NCBI's PubMed""" +import logging + +import requests +from lxml import etree + +logger = logging.getLogger(__name__) + + +def __pub_date__(pubdate: etree.Element): + pubyear = pubdate.find("Year") + pubmonth = pubdate.find("Month") + pubday = pubdate.find("Day") + return { + "year": pubyear.text if pubyear is not None else None, + "month": pubmonth.text if pubmonth is not None else None, + "day": pubday.text if pubday is not None else None + } + + +def __journal__(journal: etree.Element) -> dict: + volume = journal.find("JournalIssue/Volume") + issue = journal.find("JournalIssue/Issue") + return { + "volume": volume.text if volume is not None else None, + "issue": issue.text if issue is not None else None, + **__pub_date__(journal.find("JournalIssue/PubDate")), + "journal": journal.find("Title").text + } + +def __author__(author: etree.Element) -> str: + return "%s %s" % ( + author.find("LastName").text, + author.find("Initials").text) + + +def __pages__(pagination: etree.Element) -> str: + start = pagination.find("StartPage") + end = pagination.find("EndPage") + return (start.text + ( + f"-{end.text}" if end is not None else "" + )) if start is not None else "" + + +def __abstract__(article: etree.Element) -> str: + abstract = article.find("Abstract/AbstractText") + return abstract.text if abstract is not None else None + + +def __article__(pubmed_article: etree.Element) -> dict: + article = pubmed_article.find("MedlineCitation/Article") + return { + "pubmed_id": int(pubmed_article.find("MedlineCitation/PMID").text), + "title": article.find("ArticleTitle").text, + **__journal__(article.find("Journal")), + "abstract": __abstract__(article), + "pages": __pages__(article.find("Pagination")), + "authors": ", ".join(__author__(author) + for author in article.findall("AuthorList/Author")) + } + + +def __process_pubmed_publication_data__(text) -> tuple[dict, ...]: + """Process the data from PubMed into usable data.""" + doc = etree.XML(text) + articles = doc.xpath("//PubmedArticle") + logger.debug("Retrieved %s publications from NCBI", len(articles)) + return tuple(__article__(article) for article in articles) + +def fetch_publications(pubmed_ids: tuple[int, ...]) -> tuple[dict, ...]: + """Retrieve data on new publications from NCBI.""" + # See whether we can retrieve multiple publications in one go + # Parse data and save to DB + # Return PublicationId(s) for new publication(s). + if len(pubmed_ids) == 0: + logger.debug("There are no new PubMed IDs to fetch") + return tuple() + + logger.info("Fetching publications data for the following PubMed IDs: %s", + ", ".join((str(pid) for pid in pubmed_ids))) + + # Should we, perhaps, pass this in from a config variable? + uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" + try: + response = requests.get( + uri, + params={ + "db": "pubmed", + "retmode": "xml", + "id": ",".join(str(item) for item in pubmed_ids) + }) + + if response.status_code == 200: + return __process_pubmed_publication_data__(response.text) + + logger.error( + "Could not fetch the new publication from %s (status code: %s)", + uri, + response.status_code) + except requests.exceptions.ConnectionError: + logger.error("Could not find the domain %s", uri) + + return tuple() diff --git a/uploader/publications/views.py b/uploader/publications/views.py new file mode 100644 index 0000000..0608a35 --- /dev/null +++ b/uploader/publications/views.py @@ -0,0 +1,107 @@ +"""Endpoints for publications""" +import json + +from MySQLdb.cursors import DictCursor +from gn_libs.mysqldb import database_connection +from flask import ( + flash, + request, + url_for, + redirect, + Blueprint, + render_template, + current_app as app) + +from uploader.authorisation import require_login + +from .models import ( + fetch_publication_by_id, + create_new_publications, + fetch_publication_phenotypes) + +from .datatables import fetch_publications + +from gn_libs.debug import __pk__ + +pubbp = Blueprint("publications", __name__) + + +@pubbp.route("/", methods=["GET"]) +@require_login +def index(): + """Index page for publications.""" + with database_connection(app.config["SQL_URI"]) as conn: + return render_template("publications/index.html") + + +@pubbp.route("/list", methods=["GET"]) +@require_login +def list_publications(): + # request breakdown: + # https://datatables.net/manual/server-side + _page = int(request.args.get("draw")) + _length = int(request.args.get("length") or '-1') + _start = int(request.args.get("start") or '0') + _search = request.args["search[value]"] + with (database_connection(app.config["SQL_URI"]) as conn, + conn.cursor(cursorclass=DictCursor) as cursor): + _publications, _current_rows, _totalfiltered, _totalrows = fetch_publications( + conn, + _search, + offset=_start, + limit=_length) + + return json.dumps({ + "draw": _page, + "recordsTotal": _totalrows, + "recordsFiltered": _totalfiltered, + "publications": _publications, + "status": "success" + }) + + +@pubbp.route("/view/<int:publication_id>", methods=["GET"]) +@require_login +def view_publication(publication_id: int): + """View more details on a particular publication.""" + with database_connection(app.config["SQL_URI"]) as conn: + return render_template( + "publications/view-publication.html", + publication=fetch_publication_by_id(conn, publication_id), + linked_phenotypes=tuple(fetch_publication_phenotypes( + conn, publication_id))) + + +@pubbp.route("/create", methods=["GET", "POST"]) +@require_login +def create_publication(): + """Create a new publication.""" + if(request.method == "GET"): + return render_template("publications/create-publication.html") + form = request.form + authors = form.get("publication-authors").encode("utf8") + if authors is None or authors == "": + flash("The publication's author(s) MUST be provided!", "alert alert-danger") + return redirect(url_for("publications.create", **request.args)) + + with database_connection(app.config["SQL_URI"]) as conn: + publications = create_new_publications(conn, ({ + "pubmed_id": form.get("pubmed-id") or None, + "abstract": form.get("publication-abstract").encode("utf8") or None, + "authors": authors, + "title": form.get("publication-title").encode("utf8") or None, + "journal": form.get("publication-journal").encode("utf8") or None, + "volume": form.get("publication-volume").encode("utf8") or None, + "pages": form.get("publication-pages").encode("utf8") or None, + "month": (form.get("publication-month") or "").encode("utf8").capitalize() or None, + "year": form.get("publication-year").encode("utf8") or None + },)) + flash("New publication created!", "alert alert-success") + return redirect(url_for( + request.args.get("return_to") or "publications.view_publication", + publication_id=publications[0]["publication_id"], + **request.args)) + + flash("Publication creation failed!", "alert alert-danger") + app.logger.debug("Failed to create the new publication.", exc_info=True) + return redirect(url_for("publications.create_publication")) |