""""module contains code to fetch the data only from pubmed
At the moment we are only searching in pubmed db but this
feature can be extended to others e.g pmc
"""
# pylint: disable=C0301
import functools
import json

import requests
from Bio import Entrez


def fetch_pub_details(id_list, db_name, retmode="xml", email="alexanderkabua@gmail.com"):
    """Fetch details of publications based on their IDs.

    Args:
        id_list (list): list of publication IDs (PubMed).
        db_name (str): name of the target database.
        retmode (str, optional): return mode for the request.
        email (str, optional): email address reported to Entrez.

    Returns:
        dict: metadata of the fetched publications, keyed by publication ID.
    """
    Entrez.email = email
    if db_name.lower() == "pubmed":
        handle = Entrez.efetch(db=db_name, retmode=retmode,
                               id=",".join(id_list))
        results = Entrez.read(handle)
        handle.close()
        return extract_pub_metadata(results)
    return {}
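
# A minimal usage sketch of fetch_pub_details (the ID below is a placeholder
# and the call needs network access to the NCBI Entrez API):
#
#   details = fetch_pub_details(["12345678"], db_name="pubmed")
#   for pub_id, meta in details.items():
#       print(pub_id, meta["title"])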


def extract_pub_metadata(papers):
    """Extract metadata from PubMed papers.

    Args:
        papers (dict): dictionary containing PubMed papers.

    Returns:
        dict: extracted metadata for the papers, keyed by publication ID.
    """
    metadata = {}
    for paper in papers["PubmedArticle"]:
        article = paper['MedlineCitation']['Article']
        author_list = article.get('AuthorList', [])
        authors = ", ".join(f'{author.get("ForeName", "")} {author.get("LastName", "")}'
                            for author in author_list)
        abstract = article.get('Abstract', {}).get('AbstractText', '')
        if isinstance(abstract, list):
            abstract = ' '.join(abstract)
        pub_id = str(paper["MedlineCitation"]["PMID"])
        metadata[pub_id] = {
            "pub_id": pub_id,
            "title": article.get('ArticleTitle'),
            "authors": authors,
            "abstract": abstract,
            "journal_title": article['Journal']['Title'],
            "languages": article.get("Language", ""),
            "source": f"https://pubmed.ncbi.nlm.nih.gov/{pub_id}/"
        }
    return metadata
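
# The returned mapping looks roughly like this (values are illustrative):
#
#   {"12345678": {"pub_id": "12345678", "title": "...", "authors": "...",
#                 "abstract": "...", "journal_title": "...",
#                 "languages": ["eng"],
#                 "source": "https://pubmed.ncbi.nlm.nih.gov/12345678/"}}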


def fetch_pubmed_id(query, db_name, max_search_count, ret_mode="xml", email="alexanderkabua@gmail.com"):
    """Fetch the publication IDs matching a given search term in PubMed.

    Args:
        query (str): the search term.
        db_name (str): name of the target database.
        max_search_count (int): maximum number of IDs to return.
        ret_mode (str, optional): return mode for the request.
        email (str, optional): email address reported to Entrez.

    Returns:
        dict: the query and its matching ID list, or None if nothing matched.
    """
    Entrez.email = email
    handle = Entrez.esearch(db=db_name, sort="relevance",
                            retmax=max_search_count, retmode=ret_mode, term=query)
    results = Entrez.read(handle)
    handle.close()
    if results.get("IdList"):
        return {
            "query": query,
            "id_list": results.get("IdList")
        }
    return None
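
# Example (the query string is illustrative; requires network access):
#
#   result = fetch_pubmed_id(query="genetic markers", db_name="pubmed",
#                            max_search_count=5)
#   if result:
#       print(result["id_list"])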


def fetch_all_queries(input_file, max_search_count=1, db_name="pubmed"):
    """Search PubMed for publications listed in a JSON file of query strings.

    Args:
        input_file (str): path to the JSON file with the query strings.
        max_search_count (int): number of IDs/lookups per search.
        db_name (str): target database, "pubmed" by default.

    Returns:
        tuple: (pub_metadata: dict, doc_ids: dict).
    """
    pub_data = []
    doc_ids = {}
    with open(input_file, "r", encoding="utf-8") as file_handler:
        search_dict = json.load(file_handler)
        for (filename, file_obj) in search_dict.items():
            query_ids = fetch_pubmed_id(query=file_obj.get("doc_name"),
                                        db_name=db_name, max_search_count=max_search_count)
            if query_ids:
                for doc_id in query_ids.get("id_list"):
                    doc_ids[doc_id] = filename
                pub_data.append(query_ids)
    all_ids = functools.reduce(lambda lst1, lst2: lst1 + lst2,
                               [data.get("id_list") for data in pub_data], [])
    return (fetch_pub_details(all_ids, db_name), doc_ids)
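
# The input file is assumed to map document names to objects carrying the
# query string under "doc_name", e.g. (contents illustrative):
#
#   {"doc1.pdf": {"doc_name": "gene expression in mice"},
#    "doc2.pdf": {"doc_name": "gut microbiome and host behaviour"}}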


def dump_all_to_file(response, doc_ids, output_file):
    """Map the PubMed metadata to document IDs and dump the result to a JSON file."""
    data = {}
    for (pub_id, pub_meta) in response.items():
        doc_id = doc_ids.get(pub_id)
        data.setdefault(doc_id, []).append(pub_meta)
    with open(output_file, "w", encoding="utf-8") as file_handler:
        json.dump(data, file_handler, indent=4)
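
# The resulting file groups publication metadata by source document, roughly:
#
#   {"doc1.pdf": [{"pub_id": "12345678", "title": "...", ...}]}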


# Lossy method to fetch publication data by calling the E-utilities HTTP API directly.
def fetch_id_lossy_search(query, db_name, max_results):
    """Search PubMed based on the provided query string.

    Args:
        query (str): the search string.
        db_name (str): name of the target database.
        max_results (int): maximum number of IDs to return.

    Returns:
        list: IDs of the matching publications.
    """
    response = requests.get(
        f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db={db_name}&retmode=json&retmax={max_results}&term={query}",
        headers={"content-type": "application/json"},
        timeout=30)
    response.raise_for_status()
    return response.json()["esearchresult"]["idlist"]
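
# Example (the query is illustrative; requires network access):
#
#   id_list = fetch_id_lossy_search("genetic markers", db_name="pubmed",
#                                   max_results=3)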


def search_pubmed_lossy(pubmed_id, db_name):
    """Fetch records based on PubMed IDs.

    Args:
        pubmed_id (list): PubMed IDs to fetch records for.
        db_name (str): name of the target database.

    Returns:
        Records fetched for the given PubMed IDs.
    """
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db_name}&id={",".join(pubmed_id)}&retmode=json'
    # Note: E-utilities JSON support varies by endpoint and database; if the
    # server answers with XML instead, response.json() below will raise.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()
    if db_name.lower() == "pmc":
        return data['pmc-articleset']['article']
    return data["PubmedArticleSet"]["PubmedArticle"]


if __name__ == '__main__':
    (pub_metadata, doc_ids_metadata) = fetch_all_queries(
        input_file="parsed_all_files.json", max_search_count=1)
    dump_all_to_file(pub_metadata, doc_ids_metadata, "output_file.json")