about summary refs log tree commit diff
path: root/uploader
diff options
context:
space:
mode:
Diffstat (limited to 'uploader')
-rw-r--r--uploader/publications/pubmed.py97
1 files changed, 97 insertions, 0 deletions
diff --git a/uploader/publications/pubmed.py b/uploader/publications/pubmed.py
new file mode 100644
index 0000000..d984d99
--- /dev/null
+++ b/uploader/publications/pubmed.py
@@ -0,0 +1,97 @@
+"""Module to interact with NCBI's PubMed"""
+
+
+def __pub_date__(pubdate: etree.Element):
+    pubyear = pubdate.find("Year")
+    pubmonth = pubdate.find("Month")
+    pubday = pubdate.find("Day")
+    return {
+        "year": pubyear.text if pubyear is not None else None,
+        "month": pubmonth.text if pubmonth is not None else None,
+        "day": pubday.text if pubday is not None else None
+    }
+
+
+def __journal__(journal: etree.Element) -> dict:
+    volume = journal.find("JournalIssue/Volume")
+    issue = journal.find("JournalIssue/Issue")
+    return {
+        "volume": volume.text if volume is not None else None,
+        "issue": issue.text if issue is not None else None,
+        **__pub_date__(journal.find("JournalIssue/PubDate")),
+        "journal": journal.find("Title").text
+    }
+
+def __author__(author: etree.Element) -> str:
+    return "%s %s" % (
+        author.find("LastName").text,
+        author.find("Initials").text)
+
+
+def __pages__(pagination: etree.Element) -> str:
+    start = pagination.find("StartPage")
+    end = pagination.find("EndPage")
+    return (start.text + (
+        f"-{end.text}" if end is not None else ""
+    )) if start is not None else ""
+
+
+def __abstract__(article: etree.Element) -> str:
+    abstract = article.find("Abstract/AbstractText")
+    return abstract.text if abstract is not None else None
+
+
+def __article__(pubmed_article: etree.Element) -> dict:
+    article = pubmed_article.find("MedlineCitation/Article")
+    return {
+        "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
+        "title": article.find("ArticleTitle").text,
+        **__journal__(article.find("Journal")),
+        "abstract": __abstract__(article),
+        "pages": __pages__(article.find("Pagination")),
+        "authors": ", ".join(__author__(author)
+                             for author in article.findall("AuthorList/Author"))
+    }
+
+
+def __process_pubmed_publication_data__(text) -> tuple[dict, ...]:
+    """Process the data from PubMed into usable data."""
+    doc = etree.XML(text)
+    articles = doc.xpath("//PubmedArticle")
+    logger.debug("Retrieved %s publications from NCBI", len(articles))
+    return tuple(__article__(article) for article in articles)
+
+def fetch_publications(pubmed_ids: tuple[int, ...]) -> tuple[dict, ...]:
+    """Retrieve data on new publications from NCBI."""
+    # See whether we can retrieve multiple publications in one go
+    # Parse data and save to DB
+    # Return PublicationId(s) for new publication(s).
+    if len(pubmed_ids) == 0:
+        logger.debug("There are no new PubMed IDs to fetch")
+        return tuple()
+
+    logger.info("Fetching publications data for the following PubMed IDs: %s",
+                ", ".join((str(pid) for pid in pubmed_ids)))
+
+    # Should we, perhaps, pass this in from a config variable?
+    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    try:
+        response = requests.get(
+            uri,
+            params={
+                "db": "pubmed",
+                "retmode": "xml",
+                "id": ",".join(str(item) for item in pubmed_ids)
+            })
+
+        if response.status_code == 200:
+            return __process_pubmed_publication_data__(response.text)
+
+        logger.error(
+            "Could not fetch the new publication from %s (status code: %s)",
+            uri,
+            response.status_code)
+    except requests.exceptions.ConnectionError:
+        logger.error("Could not find the domain %s", uri)
+
+    return tuple()