1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
|
""""module contains code to fetch the data only from pubmed
At the moment we are only searching in pubmed db but this
feature can be extended to others e.g pmc
"""
# pylint: disable=C0301
import functools
import json
import requests
from Bio import Entrez
def fetch_pub_details(id_list, db_name, retmode="xml", email="alexanderkabua@gmail.com"):
    """Fetch details of publications based on their ids.

    Args:
        id_list (list): list of publication IDs (PubMed PMIDs as strings).
        db_name (str): target Entrez database; only "pubmed" is supported.
        retmode (str, optional): Entrez return mode, defaults to "xml".
        email (str, optional): contact email required by NCBI Entrez.

    Returns:
        dict: publication metadata keyed by PMID (see extract_pub_metadata),
        or an empty list when db_name is unsupported or id_list is empty.
    """
    # Guard: an empty id list would otherwise hit efetch with id="" and fail.
    if not id_list or db_name.lower() != "pubmed":
        return []
    Entrez.email = email
    handle = Entrez.efetch(db=db_name, retmode=retmode,
                           id=",".join(id_list))
    try:
        results = Entrez.read(handle)
    finally:
        # Close the handle even if parsing raises.
        handle.close()
    return extract_pub_metadata(results)
def extract_pub_metadata(papers):
    """
    Extract metadata from PubMed papers.

    Args:
        papers (dict): Entrez result dict containing a "PubmedArticle" list.

    Returns:
        dict: mapping of PMID (str) to a metadata dict with keys
        pub_id, title, authors, abstract, journal_title, languages, source.
    """
    metadata = {}
    for paper in papers["PubmedArticle"]:
        article = paper['MedlineCitation']['Article']
        # AuthorList may be missing for some records; default to no authors
        # instead of raising TypeError on iteration.
        author_list = article.get('AuthorList') or []
        authors = ",".join(f'{author.get("ForeName", "")} {author.get("LastName", "")}'
                           for author in author_list)
        abstract = article.get(
            'Abstract', {}).get('AbstractText', '')
        # AbstractText is usually a list of section strings; flatten it.
        if isinstance(abstract, list):
            abstract = ' '.join(abstract)
        pub_id = str(paper["MedlineCitation"]["PMID"])
        metadata[pub_id] = {
            "pub_id": pub_id,
            "title": article.get('ArticleTitle'),
            "authors": authors,
            "abstract": abstract,
            "journal_title": article['Journal']['Title'],
            "languages": article.get("Language", ""),
            "source": f"https://pubmed.ncbi.nlm.nih.gov/{pub_id}/"
        }
    return metadata
def fetch_pubmed_id(query, db_name, max_search_count, retmode="xml", email="alexanderkabua@gmail.com"):
    """Search the given Entrez db for *query* and return matching ids.

    Returns:
        dict: {"query": <query>, "id_list": [...]} when ids were found,
        otherwise None.
    """
    Entrez.email = email
    search_handle = Entrez.esearch(db=db_name, sort="relevance",
                                   retmax=max_search_count, retmode=retmode, term=query)
    search_results = Entrez.read(search_handle)
    search_handle.close()
    id_list = search_results.get("IdList")
    if not id_list:
        return None
    return {
        "query": query,
        "id_list": id_list
    }
def fetch_all_queries(input_file, max_search_count=1, db_name="pubmed"):
    """
    Search pubmed for publications from a json file whose values hold query strings.

    Args:
        input_file (str): path to the json file with the query strings,
            shaped like {filename: {"doc_name": <query string>, ...}, ...}.
        max_search_count (int): number of ids/lookups per search.
        db_name (str): target db, default "pubmed".

    Returns:
        tuple: (pub_metadata keyed by PMID, doc_ids mapping PMID -> filename).

    Raises:
        OSError / json.JSONDecodeError: if the input file is missing or invalid.
    """
    pub_data = []
    doc_ids = {}
    with open(input_file, "r", encoding="utf-8") as file_handler:
        search_dict = json.load(file_handler)
    for (filename, file_obj) in search_dict.items():
        query_ids = fetch_pubmed_id(query=file_obj.get("doc_name"),
                                    db_name=db_name, max_search_count=max_search_count)
        if query_ids:
            for doc_id in query_ids.get("id_list"):
                doc_ids[doc_id] = filename
            pub_data.append(query_ids)
    # Flatten all id lists. The previous functools.reduce version raised
    # TypeError on an empty pub_data (no query matched anything).
    all_ids = [pub_id for data in pub_data for pub_id in data.get("id_list")]
    return (fetch_pub_details(all_ids, db_name), doc_ids)
def dump_all_to_file(response, doc_ids, output_file):
    """
    Map the pubmed metadata to doc_ids and dump the grouping to a json file.

    Args:
        response (dict): publication metadata keyed by PMID
            (the shape produced by extract_pub_metadata).
        doc_ids (dict): mapping of PMID -> document id/filename.
        output_file (str): path of the json file to write.
    """
    data = {}
    for (pub_id, pub_meta) in response.items():
        doc_id = doc_ids.get(pub_id)
        # Group all publications that came from the same source document.
        data.setdefault(doc_id, []).append(pub_meta)
    with open(output_file, "w", encoding="utf-8") as file_handler:
        json.dump(data, file_handler, indent=4)
# lossy method to fetch pub data
def fetch_id_lossy_search(query, db_name, max_results):
    """
    Search NCBI eutils for *query* and return the matching id list.

    Args:
        query (str): the search string.
        db_name (str): target db, e.g. "pubmed".
        max_results (int): maximum number of ids to return.

    Returns:
        list: publication id strings matching the query.

    Raises:
        requests.exceptions.RequestException: on network or HTTP errors.
    """
    # Use params= so the query is URL-encoded (the old f-string interpolated
    # it raw, breaking on spaces), and add a timeout so a hung connection
    # cannot block forever.
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={"db": db_name, "retmode": "json",
                "retmax": max_results, "term": query},
        headers={"content-type": "application/json"},
        timeout=30,
    )
    response.raise_for_status()
    # Bug fix: a Response object is not subscriptable — parse the json body
    # before indexing into it.
    return response.json()["esearchresult"]["idlist"]
def search_pubmed_lossy(pubmed_id, db_name):
    """
    Fetch full records for the given publication ids.

    Args:
        pubmed_id (list): publication ids to fetch.
        db_name (str): target db, "pmc" or "pubmed".

    Returns:
        dict: records fetched for the ids.
    """
    joined_ids = ",".join(pubmed_id)
    fetch_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db_name}&id={joined_ids}&retmode=json'
    response = requests.get(fetch_url)
    response.raise_for_status()
    records = response.json()
    # PMC and PubMed wrap their article lists in different envelopes.
    if db_name.lower() == "pmc":
        return records['pmc-articleset']['article']
    return records["PubmedArticleSet"]["PubmedArticle"]
if __name__ == '__main__':
    # Fetch pubmed metadata for every query in the input file, then write
    # the results grouped by source document.
    pub_metadata, doc_ids_metadata = fetch_all_queries(
        input_file="parsed_all_files.json", max_search_count=1)
    dump_all_to_file(pub_metadata, doc_ids_metadata, "output_file.json")
|