"""Module to interact with NCBI's PubMed"""


def __pub_date__(pubdate: etree.Element):
    pubyear = pubdate.find("Year")
    pubmonth = pubdate.find("Month")
    pubday = pubdate.find("Day")
    return {
        "year": pubyear.text if pubyear is not None else None,
        "month": pubmonth.text if pubmonth is not None else None,
        "day": pubday.text if pubday is not None else None
    }


def __journal__(journal: etree.Element) -> dict:
    """Extract the journal title, volume, issue and publication date from a Journal element."""
    volume = journal.find("JournalIssue/Volume")
    issue = journal.find("JournalIssue/Issue")
    return {
        "volume": volume.text if volume is not None else None,
        "issue": issue.text if issue is not None else None,
        **__pub_date__(journal.find("JournalIssue/PubDate")),
        "journal": journal.find("Title").text
    }


def __author__(author: etree.Element) -> str:
    """Format an Author element's name as 'LastName Initials'."""
    return "%s %s" % (
        author.find("LastName").text,
        author.find("Initials").text)


def __pages__(pagination: etree.Element) -> str:
    """Format the page range, e.g. "123-130", from a Pagination element."""
    start = pagination.find("StartPage")
    end = pagination.find("EndPage")
    return (start.text + (
        f"-{end.text}" if end is not None else ""
    )) if start is not None else ""


def __abstract__(article: etree.Element) -> Optional[str]:
    """Extract the abstract's text from an Article element, if present."""
    abstract = article.find("Abstract/AbstractText")
    return abstract.text if abstract is not None else None


def __article__(pubmed_article: etree.Element) -> dict:
    """Flatten a PubmedArticle element into a dict of publication details."""
    article = pubmed_article.find("MedlineCitation/Article")
    return {
        "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
        "title": article.find("ArticleTitle").text,
        **__journal__(article.find("Journal")),
        "abstract": __abstract__(article),
        "pages": __pages__(article.find("Pagination")),
        "authors": ", ".join(__author__(author)
                             for author in article.findall("AuthorList/Author"))
    }
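# Illustrative sketch of the EFetch XML that the helpers above walk. The
# element names mirror the find()/xpath() calls in this module; the values
# and the trimming are assumptions, purely for orientation:
#
#   <PubmedArticleSet>
#     <PubmedArticle>
#       <MedlineCitation>
#         <PMID>12345678</PMID>
#         <Article>
#           <Journal>
#             <JournalIssue>
#               <Volume>12</Volume>
#               <Issue>3</Issue>
#               <PubDate><Year>2024</Year><Month>Jan</Month><Day>15</Day></PubDate>
#             </JournalIssue>
#             <Title>Journal of Placeholder Studies</Title>
#           </Journal>
#           <ArticleTitle>A placeholder title</ArticleTitle>
#           <Pagination><StartPage>1</StartPage><EndPage>10</EndPage></Pagination>
#           <Abstract><AbstractText>Placeholder abstract.</AbstractText></Abstract>
#           <AuthorList>
#             <Author><LastName>Doe</LastName><Initials>J</Initials></Author>
#           </AuthorList>
#         </Article>
#       </MedlineCitation>
#     </PubmedArticle>
#   </PubmedArticleSet>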


def __process_pubmed_publication_data__(text) -> tuple[dict, ...]:
    """Process the data from PubMed into usable data."""
    doc = etree.XML(text)
    articles = doc.xpath("//PubmedArticle")
    logger.debug("Retrieved %s publications from NCBI", len(articles))
    return tuple(__article__(article) for article in articles)


def fetch_publications(pubmed_ids: tuple[int, ...]) -> tuple[dict, ...]:
    """Retrieve data on new publications from NCBI."""
    # Multiple publications are fetched in a single request by passing a
    # comma-separated list of PubMed IDs to EFetch. This function only
    # fetches and parses the publication data; it does not save anything
    # to the database.
    if len(pubmed_ids) == 0:
        logger.debug("There are no new PubMed IDs to fetch")
        return tuple()

    logger.info("Fetching publications data for the following PubMed IDs: %s",
                ", ".join((str(pid) for pid in pubmed_ids)))

    # Should we, perhaps, pass this in from a config variable?
    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    try:
        response = requests.get(
            uri,
            params={
                "db": "pubmed",
                "retmode": "xml",
                "id": ",".join(str(item) for item in pubmed_ids)
            })

        if response.status_code == 200:
            return __process_pubmed_publication_data__(response.text)

        logger.error(
            "Could not fetch the new publication from %s (status code: %s)",
            uri,
            response.status_code)
    except requests.exceptions.ConnectionError as conn_err:
        logger.error("Could not connect to %s: %s", uri, conn_err)

    return tuple()
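

if __name__ == "__main__":
    # Minimal usage sketch, not part of the uploader's normal flow; the
    # PubMed IDs below are placeholders (assumptions), purely to illustrate
    # how fetch_publications() is called and what it returns.
    logging.basicConfig(level=logging.DEBUG)
    for publication in fetch_publications((12345678, 23456789)):
        print(publication["pubmed_id"], publication["title"], sep=": ")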