aboutsummaryrefslogtreecommitdiff
path: root/uploader/publications
diff options
context:
space:
mode:
Diffstat (limited to 'uploader/publications')
-rw-r--r--uploader/publications/__init__.py1
-rw-r--r--uploader/publications/misc.py25
-rw-r--r--uploader/publications/models.py73
-rw-r--r--uploader/publications/pubmed.py103
4 files changed, 202 insertions, 0 deletions
diff --git a/uploader/publications/__init__.py b/uploader/publications/__init__.py
new file mode 100644
index 0000000..57c0cbb
--- /dev/null
+++ b/uploader/publications/__init__.py
@@ -0,0 +1 @@
+"""Package for handling publications."""
diff --git a/uploader/publications/misc.py b/uploader/publications/misc.py
new file mode 100644
index 0000000..fca6f71
--- /dev/null
+++ b/uploader/publications/misc.py
@@ -0,0 +1,25 @@
+"""Miscellaneous functions dealing with publications."""
+
+
def publications_differences(
        filedata: tuple[dict, ...],
        dbdata: tuple[dict, ...],
        pubmedid2pubidmap: dict
) -> tuple[dict, ...]:
    """Compute the differences between file data and db data.

    :param filedata: Phenotype/publication rows read from the uploaded file.
    :param dbdata: The corresponding rows fetched from the database.
    :param pubmedid2pubidmap: Maps a PubMed ID to the internal PublicationId.
        (The original annotation said ``tuple[dict, ...]``, but the value is
        used with ``.get(...)`` — it is a mapping.)
    :return: The database rows whose PubMed ID differs from the file's,
        with the file's PubMed ID and its matching PublicationId (or None
        when the PubMed ID is not in the map) substituted in.
    """
    # Align the two sequences on (phenotype id, cross-reference id) so rows
    # can be compared pairwise.  NOTE(review): assumes both sequences hold
    # the same (phenotype, xref) pairs — confirm at the call site.
    sorted_filedata = sorted(
        filedata, key=lambda item: (item["phenotype_id"], item["xref_id"]))
    sorted_dbdata = sorted(
        dbdata, key=lambda item: (item["PhenotypeId"], item["xref_id"]))
    # Build the result in one pass rather than by repeated tuple
    # concatenation, which is quadratic in the number of differences.
    return tuple(
        {
            **dbrow,
            "PubMed_ID": filerow["PubMed_ID"],
            "PublicationId": pubmedid2pubidmap.get(filerow["PubMed_ID"])
        }
        for filerow, dbrow in zip(sorted_filedata, sorted_dbdata)
        if filerow["PubMed_ID"] != dbrow["PubMed_ID"])
diff --git a/uploader/publications/models.py b/uploader/publications/models.py
new file mode 100644
index 0000000..3fc9542
--- /dev/null
+++ b/uploader/publications/models.py
@@ -0,0 +1,73 @@
+"""Module to handle persistence and retrieval of publication to/from MariaDB"""
+import logging
+
+from MySQLdb.cursors import DictCursor
+
+from gn_libs.mysqldb import Connection, debug_query
+
+logger = logging.getLogger(__name__)
+
+
def fetch_phenotype_publications(
        conn: Connection,
        ids: tuple[tuple[int, int], ...]
) -> tuple[dict, ...]:
    """Fetch publications linked to the given phenotype cross-references.

    :param conn: Connection to the MariaDB database.
    :param ids: (PhenotypeId, xref_id) pairs identifying the publications.
    :return: One dict per match, carrying the phenotype/xref identifiers,
        the internal PublicationId and the PubMed_ID.
    """
    # Guard: an empty `ids` would render the query as `… IN ()`, which is a
    # SQL syntax error.
    if not ids:
        return tuple()
    paramstr = ",".join(["(%s, %s)"] * len(ids))
    query = (
        "SELECT "
        "pxr.PhenotypeId, pxr.Id AS xref_id, pxr.PublicationId, pub.PubMed_ID "
        "FROM PublishXRef AS pxr INNER JOIN Publication AS pub "
        "ON pxr.PublicationId=pub.Id "
        f"WHERE (pxr.PhenotypeId, pxr.Id) IN ({paramstr})")
    with conn.cursor(cursorclass=DictCursor) as cursor:
        # Flatten the pairs to match the flat "%s" placeholders.
        cursor.execute(query, tuple(item for row in ids for item in row))
        return tuple(dict(row) for row in cursor.fetchall())
+
+
def create_new_publications(
        conn: Connection,
        publications: tuple[dict, ...]
) -> tuple[dict, ...]:
    """Insert new publications into the database, returning the stored rows.

    A publication that collides on a unique key has its details updated in
    place ("ON DUPLICATE KEY UPDATE") rather than being re-inserted.

    :param conn: Connection to the MariaDB database.
    :param publications: dicts keyed by pubmed_id, abstract, authors, title,
        journal, volume, pages, month and year.
    :return: The rows as stored, each augmented with a "PublicationId" key.
    """
    if len(publications) == 0:
        return tuple()

    query = (
        "INSERT INTO "
        "Publication( "
        "PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, "
        "Month, Year"
        ") "
        "VALUES("
        "%(pubmed_id)s, %(abstract)s, %(authors)s, %(title)s, "
        "%(journal)s, %(volume)s, %(pages)s, %(month)s, %(year)s"
        ") "
        "ON DUPLICATE KEY UPDATE "
        "Abstract=VALUES(Abstract), Authors=VALUES(Authors), "
        "Title=VALUES(Title), Journal=VALUES(Journal), "
        # was VALUES(pages): harmless to MySQL's case-insensitive column
        # names but inconsistent with the surrounding clauses.
        "Volume=VALUES(Volume), Pages=VALUES(Pages), "
        "Month=VALUES(Month), Year=VALUES(Year) "
        "RETURNING *")
    results: tuple[dict, ...] = tuple()
    with conn.cursor(cursorclass=DictCursor) as cursor:
        # `executemany` only exposes the result set of the last statement it
        # runs, which would drop the RETURNING rows for all but the final
        # publication — execute row by row and collect every result instead.
        for publication in publications:
            cursor.execute(query, publication)
            results = results + tuple(
                {**row, "PublicationId": row["Id"]}
                for row in cursor.fetchall())
    return results
+
+
def update_publications(conn: Connection, publications: tuple[dict, ...]) -> tuple[dict, ...]:
    """Update details for multiple publications.

    :param conn: Connection to the MariaDB database.
    :param publications: dicts that each carry a "publication_id" plus the
        publication details to store against it.
    :return: The publications passed in, or an empty tuple when there was
        nothing to update.
    """
    # Guard clause; the original also had a second, unreachable
    # `return tuple()` after this one which has been removed.
    if len(publications) == 0:
        return tuple()

    with conn.cursor(cursorclass=DictCursor) as cursor:
        logger.debug("UPDATING PUBLICATIONS: %s", publications)
        cursor.executemany(
            ("UPDATE Publication SET "
             "PubMed_ID=%(pubmed_id)s, Abstract=%(abstract)s, "
             "Authors=%(authors)s, Title=%(title)s, Journal=%(journal)s, "
             "Volume=%(volume)s, Pages=%(pages)s, Month=%(month)s, "
             "Year=%(year)s "
             "WHERE Id=%(publication_id)s"),
            publications)
        debug_query(cursor, logger)
        return publications
diff --git a/uploader/publications/pubmed.py b/uploader/publications/pubmed.py
new file mode 100644
index 0000000..ed9b652
--- /dev/null
+++ b/uploader/publications/pubmed.py
@@ -0,0 +1,103 @@
+"""Module to interact with NCBI's PubMed"""
+import logging
+
+import requests
+from lxml import etree
+
+logger = logging.getLogger(__name__)
+
+
def __pub_date__(pubdate: etree.Element):
    """Extract the year, month and day from a PubDate element.

    Missing children yield None for the corresponding key.
    """
    def _text(tag):
        # Optional child: return its text when present, else None.
        node = pubdate.find(tag)
        return node.text if node is not None else None

    return {
        "year": _text("Year"),
        "month": _text("Month"),
        "day": _text("Day")
    }
+
+
def __journal__(journal: etree.Element) -> dict:
    """Extract journal details: volume, issue, publication date and title.

    NOTE(review): assumes "JournalIssue/PubDate" and "Title" are always
    present (per the PubMed record layout); missing elements would raise.
    """
    def _opt_text(path):
        node = journal.find(path)
        return node.text if node is not None else None

    return {
        "volume": _opt_text("JournalIssue/Volume"),
        "issue": _opt_text("JournalIssue/Issue"),
        **__pub_date__(journal.find("JournalIssue/PubDate")),
        "journal": journal.find("Title").text
    }
+
def __author__(author: etree.Element) -> str:
    """Format a single Author element into a display string.

    Returns "LastName Initials" for individual authors; group authors
    (which carry a CollectiveName instead of LastName/Initials) return the
    collective name.  The original crashed with an AttributeError on any
    author lacking a LastName or Initials element.
    """
    collective = author.find("CollectiveName")
    if collective is not None:
        return collective.text or ""
    # Join whichever of LastName/Initials are present; for the common case
    # (both present) this yields the same "LastName Initials" as before.
    return " ".join(
        node.text
        for node in (author.find("LastName"), author.find("Initials"))
        if node is not None and node.text)
+
+
def __pages__(pagination: etree.Element) -> str:
    """Render a Pagination element as "start" or "start-end".

    Returns an empty string when there is no StartPage element.
    """
    start = pagination.find("StartPage")
    if start is None:
        return ""
    end = pagination.find("EndPage")
    suffix = f"-{end.text}" if end is not None else ""
    return start.text + suffix
+
+
def __abstract__(article: etree.Element) -> str | None:
    """Return the text of the article's abstract, or None when absent.

    The return annotation was `-> str`, but the function returns None for
    articles without an Abstract/AbstractText element.
    """
    abstract = article.find("Abstract/AbstractText")
    return abstract.text if abstract is not None else None
+
+
def __article__(pubmed_article: etree.Element) -> dict:
    """Flatten a PubmedArticle element into a publication dict."""
    citation = pubmed_article.find("MedlineCitation")
    article = citation.find("Article")
    authors = article.findall("AuthorList/Author")
    return {
        "pubmed_id": int(citation.find("PMID").text),
        "title": article.find("ArticleTitle").text,
        # journal details (volume, issue, date, title) merge in here
        **__journal__(article.find("Journal")),
        "abstract": __abstract__(article),
        "pages": __pages__(article.find("Pagination")),
        "authors": ", ".join(map(__author__, authors))
    }
+
+
def __process_pubmed_publication_data__(text) -> tuple[dict, ...]:
    """Process the data from PubMed into usable data."""
    root = etree.XML(text)
    pubmed_articles = root.xpath("//PubmedArticle")
    logger.debug("Retrieved %s publications from NCBI", len(pubmed_articles))
    return tuple(map(__article__, pubmed_articles))
+
def fetch_publications(pubmed_ids: tuple[int, ...]) -> tuple[dict, ...]:
    """Retrieve data on new publications from NCBI.

    :param pubmed_ids: The PubMed IDs of the publications to fetch.
    :return: One dict per publication retrieved; an empty tuple when there
        is nothing to fetch or the request fails.
    """
    # See whether we can retrieve multiple publications in one go
    # Parse data and save to DB
    # Return PublicationId(s) for new publication(s).
    if len(pubmed_ids) == 0:
        logger.debug("There are no new PubMed IDs to fetch")
        return tuple()

    logger.info("Fetching publications data for the following PubMed IDs: %s",
                ", ".join(str(pid) for pid in pubmed_ids))

    # Should we, perhaps, pass this in from a config variable?
    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    try:
        # Without a timeout, requests.get can block forever if NCBI stops
        # responding, hanging the whole upload.
        response = requests.get(
            uri,
            params={
                "db": "pubmed",
                "retmode": "xml",
                "id": ",".join(str(item) for item in pubmed_ids)
            },
            timeout=30)

        if response.status_code == 200:
            return __process_pubmed_publication_data__(response.text)

        logger.error(
            "Could not fetch the new publication from %s (status code: %s)",
            uri,
            response.status_code)
    except requests.exceptions.Timeout:
        logger.error("Timed out fetching publications from %s", uri)
    except requests.exceptions.ConnectionError:
        logger.error("Could not find the domain %s", uri)

    return tuple()