about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
authorAlexander_Kabui2024-03-15 13:34:13 +0300
committerAlexander_Kabui2024-03-15 13:34:13 +0300
commit0c786fccb0c5805f944fa9003bb4a15e23b6024a (patch)
treeebd439a3c685535bc67bfc5dcad770f65d173f13 /scripts
parent7c5ec52301da8087ac78704e7cf1ee3c4a8472f7 (diff)
downloadgenenetwork3-0c786fccb0c5805f944fa9003bb4a15e23b6024a.tar.gz
add pubmed metadata
Diffstat (limited to 'scripts')
-rw-r--r--scripts/pub_med.py180
1 files changed, 180 insertions, 0 deletions
diff --git a/scripts/pub_med.py b/scripts/pub_med.py
new file mode 100644
index 0000000..f3e8861
--- /dev/null
+++ b/scripts/pub_med.py
@@ -0,0 +1,180 @@
+""""module contains code to fetch the data only from pubmed
+At the moment we are only searching in pubmed db but this
+feature can be extended to others e.g pmc
+"""
+
+
+import functools
+import json
+from Bio import Entrez
+
+
+def fetch_pub_details(id_list, db_name, retmode="xml", email="alexanderkabua@gmail.com"):
+    """fetch details of publocation based on their ids
+    Args:
+        id_list(list): List of publications iDs (pubmed)
+        email: (str,optional)
+
+    Returns:
+          list:   details of fetched publications
+
+    """
+    Entrez.email = email
+    if db_name.lower() == "pubmed":
+        handle = Entrez.efetch(db=db_name, retmode="xml",
+                               id=",".join(id_list))
+        results = Entrez.read(handle)
+        handle.close()
+
+        return extract_pub_metadata(results)
+
+
+def extract_pub_metadata(papers):
+    """
+    Extract metadata from PubMed papers.
+
+    Args:
+        papers (dict): Dictionary containing PubMed papers.
+
+    Returns:
+        list: Extracted metadata for the papers.
+    """
+    metadata = {}
+    for paper in papers["PubmedArticle"]:
+
+        article = paper['MedlineCitation']['Article']
+        author_list = article.get('AuthorList')
+
+        authors = ",".join([f'{author.get("ForeName","")} {author.get("LastName", "")}'
+                             for author in author_list])
+        abstract = article.get(
+            'Abstract', {}).get('AbstractText', '')
+        if isinstance(abstract, list):
+            abstract = ' '.join(abstract)
+        pub_id = str(paper["MedlineCitation"]["PMID"])
+        metadata[pub_id] = {
+            "pub_id": str(paper["MedlineCitation"]["PMID"]),
+            "title": article.get('ArticleTitle'),
+            "authors": authors,
+            "abstract": abstract,
+            "journal_title": article['Journal']['Title'],
+            "languages": article.get("Language", ""),
+            "source": f"https://pubmed.ncbi.nlm.nih.gov/{pub_id}/"
+        }
+
+    return metadata
+
+
+def fetch_pubmed_id(query, db_name, max_search_count, ret_mode="xml", email="alexanderkabua@gmail.com"):
+    """method to fetch the id for a given search in pubmed"""
+
+    Entrez.email = email
+    handle = Entrez.esearch(db=db_name, sort="relevance",
+                            retmax=max_search_count, ret_mode="xml", term=query)
+    results = Entrez.read(handle)
+    handle.close()
+    if results.get("IdList"):
+        return {
+            "query": query,
+            "id_list": results.get("IdList")
+        }
+
+
+def fetch_all_queries(input_file, max_search_count=1, db_name="pubmed"):
+    """
+    Search pubmed for publication from json files with values being query string
+    Args:
+        input_file: (str): path to the json file with the query strings
+        max_search_count: no of ids/lookups per each search
+        db_name:  target db default pubmed
+
+
+    returns: (Result<(pub_medata:list,doc_ids:dict),Error)
+
+    """
+    try:
+
+        pub_data = []
+        doc_ids = {}
+        with open(input_file, "r") as file_handler:
+            search_dict = json.load(file_handler)
+
+            for (filename, file_obj) in search_dict.items():
+                query_ids = fetch_pubmed_id(query=file_obj.get("doc_name"),
+                                            db_name=db_name, max_search_count=max_search_count)
+                if query_ids:
+                    for doc_id in query_ids.get("id_list"):
+                        doc_ids[doc_id] = filename
+                    pub_data.append(query_ids)
+
+        return (fetch_pub_details(functools.reduce(
+            lambda lst1, lst2: lst1 + lst2, [data.get("id_list") for data in pub_data]), db_name), doc_ids)
+
+    except Exception as error:
+        raise error
+
+
+def dump_all_to_file(response, doc_ids, output_file):
+    """
+    function to map the pubmed data to doc_ids and dump to a json file
+    """
+
+    data = {}
+
+    for (pub_id, pub_meta) in response.items():
+        doc_id = doc_ids.get(pub_id)
+        if data.get(doc_id):
+            data[doc_id].append(pub_meta)
+        else:
+            data[doc_id] = [pub_meta]
+
+    #
+    with open(output_file, "w+") as file_handler:
+        json.dump(data, file_handler, indent=4)
+
+
+# lossy method to fetch pub  data
+def fetch_id_lossy_search(query, db_name, max_results):
+    """
+    Search PubMed data based on the provided search string.
+
+    Args:
+    - search_string (str): The search string.
+
+    Returns:
+    - dict: Dictionary containing search results and status code.
+    """
+
+    try:
+        response = requests.get(f"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db={db_name}&retmode=json&retmax={max_results}&term={query}",
+                                headers={"content-type": "application/json"}
+                                )
+        return response["esearchresult"]["idlist"]
+
+    except requests.exceptions.RequestException as error:
+        raise error
+
+
+def search_pubmed_lossy(pubmed_id, db_name):
+    """
+    Fetches records based on the PubMed ID.
+
+    Args:
+    - pubmed_id (str): PubMed ID.
+
+    Returns:
+    - dict: Records fetched based on PubMed ID.
+    """
+    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db_name}&id={",".join(pubmed_id)}&retmode=json'
+    response = requests.get(url)
+    response.raise_for_status()
+    data = response.json()
+    if db_name.lower() == "pmc":
+        return data['pmc-articleset']['article']
+    return data["PubmedArticleSet"]["PubmedArticle"]
+
+
+if __name__ == '__main__':
+    (pub_data, doc_ids) = fetch_all_queries(
+        input_file="parsed_all_files.json", max_search_count=1)
+    dump_all_to_file(pub_data, doc_ids, "output_file.json")