"""Module to interact with NCBI's PubMed"""
import logging
from typing import Optional

import requests
from lxml import etree

logger = logging.getLogger(__name__)


def _element_text(parent, path: str) -> Optional[str]:
    """Return the full text of the first element matching `path`, or None.

    None is returned when `parent` is None, when nothing matches `path`, or
    when the matched element holds no text at all.  Text nested inside child
    elements (inline markup such as ``<i>`` or ``<sub>``) is included; a plain
    ``.text`` lookup would stop at the first child element.
    """
    if parent is None:
        return None
    element = parent.find(path)
    if element is None:
        return None
    return "".join(element.itertext()) or None


def __pub_date__(pubdate: etree.Element):
    """Extract the publication date parts from a `PubDate` element.

    Returns a dict with the keys "year", "month" and "day".  Each value is the
    text of the matching child element, or None when that child (or `pubdate`
    itself) is missing.
    """
    return {
        "year": _element_text(pubdate, "Year"),
        "month": _element_text(pubdate, "Month"),
        "day": _element_text(pubdate, "Day"),
    }


def __journal__(journal: etree.Element) -> dict:
    """Extract volume, issue, publication date and title from a `Journal`.

    Returns a dict with the keys "volume", "issue", "year", "month", "day" and
    "journal".  Any value whose source element is missing is None.
    """
    pubdate = (
        journal.find("JournalIssue/PubDate") if journal is not None else None)
    return {
        "volume": _element_text(journal, "JournalIssue/Volume"),
        "issue": _element_text(journal, "JournalIssue/Issue"),
        **__pub_date__(pubdate),
        "journal": _element_text(journal, "Title"),
    }


def __author__(author: etree.Element) -> str:
    """Format a single `Author` element as "LastName Initials".

    Authors without a `LastName` fall back to their `CollectiveName`; an
    author with neither yields the empty string.  Missing `Initials` are
    simply left out instead of raising.
    """
    last_name = _element_text(author, "LastName")
    if last_name is None:
        return _element_text(author, "CollectiveName") or ""
    initials = _element_text(author, "Initials")
    return f"{last_name} {initials}" if initials else last_name


def __pages__(pagination: etree.Element) -> str:
    """Format a `Pagination` element as "start-end" (or just "start").

    Returns the empty string when `pagination` is None or has no usable
    `StartPage`.
    """
    start = _element_text(pagination, "StartPage")
    if start is None:
        return ""
    end = _element_text(pagination, "EndPage")
    return f"{start}-{end}" if end is not None else start


def __abstract__(article: etree.Element) -> Optional[str]:
    """Return the abstract text of an `Article` element, or None.

    Every `Abstract/AbstractText` section is included (joined with newlines),
    so abstracts split over several sections are not truncated to the first
    one.  Text inside inline markup is kept.
    """
    if article is None:
        return None
    sections = (
        "".join(section.itertext())
        for section in article.findall("Abstract/AbstractText"))
    return "\n".join(text for text in sections if text) or None


def __article__(pubmed_article: etree.Element) -> dict:
    """Convert a `PubmedArticle` element into a flat dict of publication data.

    The dict has the keys "pubmed_id", "title", "volume", "issue", "year",
    "month", "day", "journal", "abstract", "pages" and "authors".  Optional
    parts of the record that are absent come back as None (or "" for "pages"
    and "authors") rather than raising.
    """
    article = pubmed_article.find("MedlineCitation/Article")
    author_names = (
        __author__(author) for author in article.findall("AuthorList/Author"))
    return {
        "pubmed_id": int(pubmed_article.find("MedlineCitation/PMID").text),
        "title": _element_text(article, "ArticleTitle"),
        **__journal__(article.find("Journal")),
        "abstract": __abstract__(article),
        "pages": __pages__(article.find("Pagination")),
        # Skip authors for which no name could be extracted so the list does
        # not end up with empty entries (", , ").
        "authors": ", ".join(name for name in author_names if name),
    }


def __process_pubmed_publication_data__(text) -> tuple[dict, ...]:
    """Process the data from PubMed into usable data.

    `text` is the XML document (`str` or `bytes`) to parse.  One dict is
    returned for each `PubmedArticle` element found in it.
    """
    doc = etree.XML(text)
    articles = doc.xpath("//PubmedArticle")
    logger.debug("Retrieved %s publications from NCBI", len(articles))
    return tuple(__article__(article) for article in articles)


def fetch_publications(
        pubmed_ids: tuple[int, ...], *, timeout=30) -> tuple[dict, ...]:
    """Retrieve data on new publications from NCBI.

    All the given IDs are requested in a single call.

    Parameters:
        pubmed_ids: The PubMed IDs to fetch.
        timeout: Passed through to `requests.get`; seconds to wait for the
            server (a number or a `(connect, read)` pair).  Pass None to wait
            indefinitely.

    Returns:
        A tuple with one dict per publication retrieved.  The tuple is empty
        when there are no IDs, the server cannot be reached, the request
        times out, or the response status is not 200.
    """
    if not pubmed_ids:
        logger.debug("There are no new PubMed IDs to fetch")
        return tuple()

    logger.info("Fetching publications data for the following PubMed IDs: %s",
                ", ".join((str(pid) for pid in pubmed_ids)))

    # Should we, perhaps, pass this in from a config variable?
    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    try:
        response = requests.get(
            uri,
            params={
                "db": "pubmed",
                "retmode": "xml",
                "id": ",".join(str(item) for item in pubmed_ids)
            },
            # Without a timeout an unresponsive server blocks the caller
            # forever.
            timeout=timeout)
    except requests.exceptions.Timeout:
        logger.error("Timed out fetching publications from %s", uri)
        return tuple()
    except requests.exceptions.ConnectionError:
        logger.error("Could not find the domain %s", uri)
        return tuple()

    if response.status_code == 200:
        # Hand the raw bytes to the parser so it decodes the document itself
        # instead of depending on the charset guessed from the HTTP headers.
        return __process_pubmed_publication_data__(response.content)

    logger.error(
        "Could not fetch the new publication from %s (status code: %s)",
        uri,
        response.status_code)
    return tuple()