1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
"""Module to interact with NCBI's PubMed"""
def __pub_date__(pubdate: etree.Element):
    """Extract the publication date parts from a `PubDate` element.

    Returns a dict with the keys "year", "month" and "day". Each value is the
    text of the matching `Year`/`Month`/`Day` child, or None when that child
    is not present.
    """
    def _text_of(tag):
        # A missing child maps to None instead of raising.
        node = pubdate.find(tag)
        if node is None:
            return None
        return node.text

    return {
        key: _text_of(tag)
        for key, tag in (("year", "Year"), ("month", "Month"), ("day", "Day"))
    }
def __journal__(journal: etree.Element) -> dict:
    """Collect journal-level details from a `Journal` element.

    The result holds "volume" and "issue" (None when the corresponding
    `JournalIssue` child is absent), the keys produced by `__pub_date__` for
    `JournalIssue/PubDate`, and "journal", the text of the `Title` child.
    """
    def _optional_text(path):
        node = journal.find(path)
        return None if node is None else node.text

    details = {
        "volume": _optional_text("JournalIssue/Volume"),
        "issue": _optional_text("JournalIssue/Issue"),
    }
    details.update(__pub_date__(journal.find("JournalIssue/PubDate")))
    # `Title` is read unconditionally: a journal without one raises here.
    details["journal"] = journal.find("Title").text
    return details
def __author__(author: etree.Element) -> str:
    """Format a single `Author` element as a display name.

    Returns "<LastName> <Initials>" when both are present, just the last name
    when there are no initials, the `CollectiveName` text when the author has
    no last name (e.g. a group/consortium author), and an empty string when
    none of these are available.
    """
    def _text(tag: str) -> str:
        # Treat both a missing element and an empty element as "".
        node = author.find(tag)
        if node is None:
            return ""
        return node.text or ""

    last_name = _text("LastName")
    if not last_name:
        # No personal name: fall back to the collective (group) name.
        return _text("CollectiveName")

    initials = _text("Initials")
    return f"{last_name} {initials}" if initials else last_name
def __pages__(pagination: etree.Element) -> str:
    """Build a "start-end" page string from a `Pagination` element.

    `pagination` may be None (the caller passes the result of a `find` that
    can miss). Returns "" when there is no pagination element or no usable
    `StartPage`, the start page alone when there is no usable `EndPage`, and
    "<start>-<end>" otherwise.
    """
    if pagination is None:
        return ""

    start = pagination.find("StartPage")
    if start is None or not start.text:
        return ""

    end = pagination.find("EndPage")
    if end is None or not end.text:
        # An empty <EndPage/> must not render as "-None".
        return start.text

    return f"{start.text}-{end.text}"
def __abstract__(article: etree.Element) -> "str | None":
    """Return the abstract text of an `Article` element, or None.

    Every `Abstract/AbstractText` child is read, so an abstract split into
    several sections is returned in full, with the sections joined by a
    single space. Text nested in inline markup (e.g. `<sub>`, `<i>`) is
    included rather than cut off at the first child element.

    Returns None when the article has no abstract text at all.
    """
    sections = (
        "".join(section.itertext())
        for section in article.findall("Abstract/AbstractText"))
    text = " ".join(section for section in sections if section)
    return text or None
def __article__(pubmed_article: etree.Element) -> dict:
    """Flatten one `PubmedArticle` element into a plain dict.

    The dict contains "pubmed_id" and "title", the journal keys returned by
    `__journal__`, then "abstract", "pages" and "authors" (the formatted
    author names joined with ", ").
    """
    article = pubmed_article.find("MedlineCitation/Article")

    record = {
        "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
        "title": article.find("ArticleTitle").text,
    }
    record.update(__journal__(article.find("Journal")))
    record["abstract"] = __abstract__(article)
    record["pages"] = __pages__(article.find("Pagination"))

    author_names = [
        __author__(entry) for entry in article.findall("AuthorList/Author")]
    record["authors"] = ", ".join(author_names)
    return record
def __process_pubmed_publication_data__(text) -> tuple[dict, ...]:
    """Parse a PubMed XML payload into one dict per `PubmedArticle`."""
    document = etree.XML(text)
    found = document.xpath("//PubmedArticle")
    logger.debug("Retrieved %s publications from NCBI", len(found))

    records = []
    for node in found:
        records.append(__article__(node))
    return tuple(records)
def fetch_publications(
        pubmed_ids: tuple[int, ...],
        timeout: float = 30.0
) -> tuple[dict, ...]:
    """Retrieve data on new publications from NCBI.

    All the given IDs are requested from the efetch endpoint in a single
    call and the XML response is handed to
    `__process_pubmed_publication_data__`.

    Parameters:
        pubmed_ids: The PubMed IDs to fetch.
        timeout: Seconds to wait for NCBI before giving up. Without a
            timeout, `requests` can block indefinitely on an unresponsive
            server.

    Returns:
        A tuple with one dict per publication retrieved. An empty tuple is
        returned when there is nothing to fetch, when the request fails or
        times out, or when NCBI answers with a non-200 status; the failure
        is logged rather than raised.
    """
    if not pubmed_ids:
        logger.debug("There are no new PubMed IDs to fetch")
        return tuple()

    id_list = [str(pid) for pid in pubmed_ids]
    logger.info("Fetching publications data for the following PubMed IDs: %s",
                ", ".join(id_list))

    # Should we, perhaps, pass this in from a config variable?
    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    try:
        response = requests.get(
            uri,
            params={
                "db": "pubmed",
                "retmode": "xml",
                "id": ",".join(id_list)
            },
            timeout=timeout)
    except requests.exceptions.Timeout:
        # Listed first: a connect timeout is also a ConnectionError, and the
        # timeout message is the more accurate one for it.
        logger.error("Request to %s timed out after %s seconds", uri, timeout)
        return tuple()
    except requests.exceptions.ConnectionError:
        logger.error("Could not find the domain %s", uri)
        return tuple()

    if response.status_code == 200:
        return __process_pubmed_publication_data__(response.text)

    logger.error(
        "Could not fetch the new publication from %s (status code: %s)",
        uri,
        response.status_code)
    return tuple()
|