"""Module containing code to fetch data from PubMed only.
At the moment only the PubMed database is searched, but this
feature can be extended to other databases, e.g. PMC.
"""
# pylint: disable=C0301
import functools
import json
import requests
from Bio import Entrez
def fetch_pub_details(id_list, db_name, retmode="xml", email="alexanderkabua@gmail.com"):
    """Fetch details of publications based on their ids.

    Args:
        id_list (list): Publication ids (PubMed) to look up.
        db_name (str): Target Entrez database; only "pubmed"
            (case-insensitive) is handled.
        retmode (str, optional): Return mode forwarded to ``Entrez.efetch``.
        email (str, optional): Contact address assigned to ``Entrez.email``.

    Returns:
        dict: Publication metadata as produced by ``extract_pub_metadata``.
        Empty when ``id_list`` is empty or ``db_name`` is not "pubmed".
    """
    Entrez.email = email
    # Return an empty dict (not a list) so the result always has the same
    # mapping type as the success path, and skip the request entirely when
    # there is nothing to fetch.
    if not id_list or db_name.lower() != "pubmed":
        return {}
    handle = Entrez.efetch(db=db_name, retmode=retmode,
                           id=",".join(str(pub_id) for pub_id in id_list))
    try:
        results = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    return extract_pub_metadata(results)
def extract_pub_metadata(papers):
    """Extract metadata from PubMed papers.

    Args:
        papers (dict): Mapping whose "PubmedArticle" entry is an iterable of
            paper records, each with a "MedlineCitation" holding "PMID" and
            "Article".

    Returns:
        dict: Maps each PMID (as ``str``) to a dict with the keys
        "pub_id", "title", "authors", "abstract", "journal_title",
        "languages" and "source".

    Raises:
        KeyError: If a record lacks "MedlineCitation", "Article", "PMID"
            or the journal title.
    """
    metadata = {}
    for paper in papers["PubmedArticle"]:
        citation = paper["MedlineCitation"]
        article = citation["Article"]
        # A record may have no "AuthorList" at all; treat that as "no
        # authors" instead of failing while iterating over None.
        author_list = article.get("AuthorList") or []
        authors = ",".join(
            f'{author.get("ForeName", "")} {author.get("LastName", "")}'
            for author in author_list)
        abstract = article.get("Abstract", {}).get("AbstractText", "")
        if isinstance(abstract, list):
            # Multi-section abstracts arrive as a list of text parts.
            abstract = " ".join(abstract)
        pub_id = str(citation["PMID"])
        metadata[pub_id] = {
            "pub_id": pub_id,
            "title": article.get("ArticleTitle"),
            "authors": authors,
            "abstract": abstract,
            "journal_title": article["Journal"]["Title"],
            "languages": article.get("Language", ""),
            "source": f"https://pubmed.ncbi.nlm.nih.gov/{pub_id}/"
        }
    return metadata
def fetch_pubmed_id(query, db_name, max_search_count, retmode="xml", email="alexanderkabua@gmail.com"):
    """Fetch the ids matching a search query in an Entrez database.

    Args:
        query (str): Search term passed to ``Entrez.esearch``.
        db_name (str): Entrez database to search.
        max_search_count (int): Forwarded as ``retmax`` (ids per search).
        retmode (str, optional): Return mode forwarded to ``Entrez.esearch``.
        email (str, optional): Contact address assigned to ``Entrez.email``.

    Returns:
        dict | None: ``{"query": query, "id_list": [...]}`` when the parsed
        result has a non-empty "IdList", otherwise ``None``.
    """
    Entrez.email = email
    handle = Entrez.esearch(db=db_name, sort="relevance",
                            retmax=max_search_count, retmode=retmode, term=query)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    id_list = results.get("IdList")
    if not id_list:
        return None
    return {
        "query": query,
        "id_list": id_list
    }
def fetch_all_queries(input_file, max_search_count=1, db_name="pubmed"):
    """Search for publications using the query strings stored in a JSON file.

    The JSON file maps a name to an object; the object's "doc_name" value is
    used as the search query.

    Args:
        input_file (str): Path to the JSON file with the query strings.
        max_search_count (int): Number of ids to look up per search.
        db_name (str): Target database, "pubmed" by default.

    Returns:
        tuple: ``(pub_metadata, doc_ids)`` where ``pub_metadata`` is the
        result of ``fetch_pub_details`` for every id found (an empty dict
        when no query matched anything) and ``doc_ids`` maps each
        publication id to the JSON key whose query returned it.
    """
    with open(input_file, "r", encoding="utf-8") as file_handler:
        search_dict = json.load(file_handler)

    doc_ids = {}
    for filename, file_obj in search_dict.items():
        query_ids = fetch_pubmed_id(query=file_obj.get("doc_name"),
                                    db_name=db_name, max_search_count=max_search_count)
        if not query_ids:
            continue
        for doc_id in query_ids.get("id_list"):
            doc_ids[doc_id] = filename

    # Nothing matched: skip the detail lookup instead of failing on an
    # empty id collection.
    if not doc_ids:
        return ({}, doc_ids)
    # The dict keys are the unique ids in first-seen order, so an id returned
    # by several queries is only requested once.
    return (fetch_pub_details(list(doc_ids), db_name), doc_ids)
def dump_all_to_file(response, doc_ids, output_file):
    """Group publication metadata by document and write the result as JSON.

    Each entry of ``response`` (publication id -> metadata) is appended to
    the list stored under the document that ``doc_ids`` associates with that
    publication id; the grouped mapping is then written to ``output_file``.
    """
    grouped = {}
    for pub_id, pub_meta in response.items():
        # Publications without a mapping are collected under the None key.
        grouped.setdefault(doc_ids.get(pub_id), []).append(pub_meta)

    with open(output_file, "w+", encoding="utf-8") as out_file:
        json.dump(grouped, out_file, indent=4)
# lossy method to fetch pub data
def fetch_id_lossy_search(query, db_name, max_results):
    """Search an Entrez database over HTTP and return the matching ids.

    Args:
        query (str): The search string.
        db_name (str): Database name sent as the ``db`` parameter.
        max_results (int): Sent as ``retmax``.

    Returns:
        The "idlist" entry found under "esearchresult" in the JSON response.

    Raises:
        requests.exceptions.RequestException: On transport failures or a
            non-success HTTP status.
    """
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        # Passing the values as params lets requests URL-encode the query
        # (spaces, '&', '#', ...) instead of splicing it raw into the URL.
        params={
            "db": db_name,
            "retmode": "json",
            "retmax": max_results,
            "term": query,
        },
        headers={"content-type": "application/json"},
        timeout=300)
    response.raise_for_status()
    # The Response object itself is not subscriptable; decode the body first.
    return response.json()["esearchresult"]["idlist"]
def search_pubmed_lossy(pubmed_id, db_name):
    """Fetch records for one or more ids over HTTP.

    Args:
        pubmed_id (str | iterable of str): A single id (or an already
            comma-separated id string), or an iterable of ids.
        db_name (str): Database name sent as the ``db`` parameter.

    Returns:
        The article entry of the decoded JSON response:
        ``data['pmc-articleset']['article']`` when ``db_name`` is "pmc"
        (case-insensitive), otherwise
        ``data["PubmedArticleSet"]["PubmedArticle"]``.

    Raises:
        requests.exceptions.RequestException: On transport failures or a
            non-success HTTP status.
    """
    # Joining a plain string would insert a comma between every character
    # ("123" -> "1,2,3"), so only join real collections of ids.
    ids = pubmed_id if isinstance(pubmed_id, str) else ",".join(pubmed_id)
    # NOTE(review): this asks efetch for retmode=json and decodes the body as
    # JSON -- confirm the service really returns JSON for the databases used.
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db_name}&id={ids}&retmode=json'
    response = requests.get(url, timeout=300)
    response.raise_for_status()
    data = response.json()
    if db_name.lower() == "pmc":
        return data['pmc-articleset']['article']
    return data["PubmedArticleSet"]["PubmedArticle"]
if __name__ == "__main__":
    # Script entry point: run every query from parsed_all_files.json (one id
    # per query) and write the grouped metadata to output_file.json.
    pub_metadata, doc_ids_metadata = fetch_all_queries(
        input_file="parsed_all_files.json",
        max_search_count=1,
    )
    dump_all_to_file(pub_metadata, doc_ids_metadata, "output_file.json")