| author | Alexander Kabui | 2024-03-18 21:29:46 +0300 |
|---|---|---|
| committer | GitHub | 2024-03-18 21:29:46 +0300 |
| commit | bcce1a3c5796c90a3925736484c4a2d82e6b6149 (patch) | |
| tree | 11227e3612d2cb407e254f44598b65159e1c072c | |
| parent | 7c5ec52301da8087ac78704e7cf1ee3c4a8472f7 (diff) | |
| parent | f9ed41d842b64eee2ad3a6821ecd15084320bff4 (diff) | |
| download | genenetwork3-bcce1a3c5796c90a3925736484c4a2d82e6b6149.tar.gz | |
Merge pull request #154 from genenetwork/feature/pubmed-metadata
pubmed metadata
| -rw-r--r-- | gn3/llms/process.py | 31 |
| -rw-r--r-- | scripts/pub_med.py | 188 |
2 files changed, 217 insertions, 2 deletions
diff --git a/gn3/llms/process.py b/gn3/llms/process.py
index abd307e..549c7e6 100644
--- a/gn3/llms/process.py
+++ b/gn3/llms/process.py
@@ -1,6 +1,7 @@
 """this module contains code for processing response from fahamu client.py"""
+import os
 import string
 import json

@@ -10,6 +11,7 @@ import requests
 from gn3.llms.client import GeneNetworkQAClient
 from gn3.llms.response import DocIDs
+from gn3.settings import TMPDIR

 BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
@@ -67,13 +69,36 @@ def rate_document(task_id, doc_id, rating, auth_token):
         raise RuntimeError(f"An error occurred: {str(error)}") from error


+def load_file(filename):
+    """Open and load a JSON file from TMPDIR."""
+    file_path = os.path.join(TMPDIR, filename)
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"{filename} was not found or is a directory")
+    with open(file_path, "rb") as file_handler:
+        return json.load(file_handler)
+
+
+def fetch_pubmed(references, file_name):
+    """Populate references with PubMed metadata, where available."""
+    try:
+        pubmed = load_file(file_name)
+        for reference in references:
+            if pubmed.get(reference["doc_id"]):
+                reference["pubmed"] = pubmed.get(reference["doc_id"])
+        return references
+    except FileNotFoundError:
+        # No cached PubMed lookup; return the references unchanged.
+        return references
+
+
 def get_gnqa(query, auth_token):
     """entry function for the gn3 api endpoint()"""
     api_client = GeneNetworkQAClient(requests.Session(), api_key=auth_token)
     res, task_id = api_client.ask('?ask=' + quote(query), auth_token)
-    if task_id == 0 :
-        raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}")
+    if task_id == 0:
+        raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}")
     res, success = api_client.get_answer(task_id)
     if success == 1:
         resp_text = filter_response_text(res.text)
@@ -83,6 +108,8 @@ def get_gnqa(query, auth_token):
         context = resp_text['data']['context']
         references = parse_context(
             context, DocIDs().getInfo, format_bibliography_info)
+        references = fetch_pubmed(references, "pubmed.json")
+
         return task_id, answer, references
     else:
         return task_id, "Unfortunately, I have nothing on the query", []
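The new `fetch_pubmed` hook is easiest to see with a small example. The sketch below is illustrative only: the `doc_id` and metadata values are made up, and it assumes a writable `TMPDIR` as configured in `gn3.settings`.

```python
# Minimal sketch of the fetch_pubmed() flow; the doc_id and the metadata
# entry are hypothetical examples, not real records.
import json
import os

from gn3.llms.process import fetch_pubmed
from gn3.settings import TMPDIR

# fetch_pubmed() reads this file from TMPDIR via load_file().
pubmed_lookup = {
    "example-doc-id": [{
        "pub_id": "12345678",
        "title": "An example publication",
        "source": "https://pubmed.ncbi.nlm.nih.gov/12345678/",
    }]
}
with open(os.path.join(TMPDIR, "pubmed.json"), "w", encoding="utf-8") as fh:
    json.dump(pubmed_lookup, fh)

references = [{"doc_id": "example-doc-id"}]
# References whose doc_id appears in pubmed.json gain a "pubmed" key;
# if the file is missing, fetch_pubmed() returns the references unchanged.
print(fetch_pubmed(references, "pubmed.json"))
```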
diff --git a/scripts/pub_med.py b/scripts/pub_med.py
new file mode 100644
index 0000000..82b1730
--- /dev/null
+++ b/scripts/pub_med.py
@@ -0,0 +1,188 @@
+"""Module containing code to fetch publication data from PubMed.
+At the moment we only search the pubmed db, but this feature
+can be extended to others, e.g. pmc.
+"""
+
+# pylint: disable=C0301
+
+import functools
+import json
+import requests
+
+from Bio import Entrez
+
+
+def fetch_pub_details(id_list, db_name, retmode="xml", email="alexanderkabua@gmail.com"):
+    """Fetch details of publications based on their IDs.
+
+    Args:
+        id_list (list): list of publication IDs (pubmed)
+        db_name (str): target database
+        retmode (str, optional): Entrez return mode
+        email (str, optional): email address reported to Entrez
+
+    Returns:
+        dict: metadata of the fetched publications, keyed by PMID
+    """
+    Entrez.email = email
+    if db_name.lower() == "pubmed":
+        handle = Entrez.efetch(db=db_name, retmode=retmode,
+                               id=",".join(id_list))
+        results = Entrez.read(handle)
+        handle.close()
+        return extract_pub_metadata(results)
+    return {}
+
+
+def extract_pub_metadata(papers):
+    """Extract metadata from PubMed papers.
+
+    Args:
+        papers (dict): dictionary containing PubMed papers
+
+    Returns:
+        dict: extracted metadata for the papers, keyed by PMID
+    """
+    metadata = {}
+    for paper in papers["PubmedArticle"]:
+        article = paper['MedlineCitation']['Article']
+        author_list = article.get('AuthorList', [])
+        authors = ", ".join([f'{author.get("ForeName", "")} {author.get("LastName", "")}'
+                             for author in author_list])
+        abstract = article.get('Abstract', {}).get('AbstractText', '')
+        if isinstance(abstract, list):
+            abstract = ' '.join(abstract)
+        pub_id = str(paper["MedlineCitation"]["PMID"])
+        metadata[pub_id] = {
+            "pub_id": pub_id,
+            "title": article.get('ArticleTitle'),
+            "authors": authors,
+            "abstract": abstract,
+            "journal_title": article['Journal']['Title'],
+            "languages": article.get("Language", ""),
+            "source": f"https://pubmed.ncbi.nlm.nih.gov/{pub_id}/"
+        }
+    return metadata
+
+
+def fetch_pubmed_id(query, db_name, max_search_count, retmode="xml", email="alexanderkabua@gmail.com"):
+    """Fetch the IDs for a given search term from pubmed."""
+    Entrez.email = email
+    handle = Entrez.esearch(db=db_name, sort="relevance",
+                            retmax=max_search_count, retmode=retmode, term=query)
+    results = Entrez.read(handle)
+    handle.close()
+    if results.get("IdList"):
+        return {
+            "query": query,
+            "id_list": results.get("IdList")
+        }
+    return None
+
+
+def fetch_all_queries(input_file, max_search_count=1, db_name="pubmed"):
+    """Search pubmed for publications, given a JSON file whose values
+    carry the query strings.
+
+    Args:
+        input_file (str): path to the JSON file with the query strings
+        max_search_count (int): number of IDs/lookups per search
+        db_name (str): target db, default pubmed
+
+    Returns:
+        tuple: (pub_metadata: dict, doc_ids: dict)
+    """
+    pub_data = []
+    doc_ids = {}
+    with open(input_file, "r", encoding="utf-8") as file_handler:
+        search_dict = json.load(file_handler)
+    for (filename, file_obj) in search_dict.items():
+        query_ids = fetch_pubmed_id(query=file_obj.get("doc_name"),
+                                    db_name=db_name,
+                                    max_search_count=max_search_count)
+        if query_ids:
+            for doc_id in query_ids.get("id_list"):
+                doc_ids[doc_id] = filename
+            pub_data.append(query_ids)
+    all_ids = functools.reduce(lambda lst1, lst2: lst1 + lst2,
+                               [data.get("id_list") for data in pub_data])
+    return (fetch_pub_details(all_ids, db_name), doc_ids)
+
+
+def dump_all_to_file(response, doc_ids, output_file):
+    """Map the pubmed metadata to doc_ids and dump to a JSON file."""
+    data = {}
+    for (pub_id, pub_meta) in response.items():
+        doc_id = doc_ids.get(pub_id)
+        if data.get(doc_id):
+            data[doc_id].append(pub_meta)
+        else:
+            data[doc_id] = [pub_meta]
+    with open(output_file, "w+", encoding="utf-8") as file_handler:
+        json.dump(data, file_handler, indent=4)
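`dump_all_to_file` inverts the PMID-to-doc_id map into doc_id-to-[metadata] lists, which is the layout `fetch_pubmed` in `gn3/llms/process.py` expects in `pubmed.json`. A toy run with made-up values:

```python
# Toy illustration of dump_all_to_file(); all values are invented.
from scripts.pub_med import dump_all_to_file

response = {  # PMID -> metadata, as returned by fetch_pub_details()
    "12345678": {"pub_id": "12345678", "title": "An example paper"},
}
doc_ids = {"12345678": "example-doc-id"}  # PMID -> document identifier

dump_all_to_file(response, doc_ids, "output_file.json")
# output_file.json now holds:
# {
#     "example-doc-id": [
#         {"pub_id": "12345678", "title": "An example paper"}
#     ]
# }
```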
+
+# Lossy helpers: query the NCBI E-utilities endpoints over HTTP directly.
+def fetch_id_lossy_search(query, db_name, max_results):
+    """Search PubMed for IDs based on the provided query string (lossy search).
+
+    Args:
+        query (str): the search string
+        db_name (str): target database
+        max_results (int): maximum number of IDs to return
+
+    Returns:
+        list: IDs of the matching publications
+    """
+    response = requests.get(
+        f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+        f"?db={db_name}&retmode=json&retmax={max_results}&term={query}",
+        headers={"content-type": "application/json"})
+    response.raise_for_status()
+    return response.json()["esearchresult"]["idlist"]
+
+
+def search_pubmed_lossy(pubmed_ids, db_name):
+    """Fetch records based on PubMed IDs.
+
+    Args:
+        pubmed_ids (list): PubMed IDs
+        db_name (str): target database
+
+    Returns:
+        dict: records fetched for the given PubMed IDs
+    """
+    url = (f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
+           f'?db={db_name}&id={",".join(pubmed_ids)}&retmode=json')
+    response = requests.get(url)
+    response.raise_for_status()
+    data = response.json()
+    if db_name.lower() == "pmc":
+        return data['pmc-articleset']['article']
+    return data["PubmedArticleSet"]["PubmedArticle"]
+
+
+if __name__ == '__main__':
+    (pub_metadata, doc_ids_metadata) = fetch_all_queries(
+        input_file="parsed_all_files.json", max_search_count=1)
+    dump_all_to_file(pub_metadata, doc_ids_metadata, "output_file.json")
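Putting the pieces together: the script is meant to run offline, and its output then has to be made visible to `fetch_pubmed`. The sketch below is an assumption about deployment; the diff itself does not show how `pubmed.json` reaches `TMPDIR`. It also presumes `parsed_all_files.json` maps document identifiers to objects carrying a `doc_name` query string, the shape `fetch_all_queries` reads.

```python
# Hypothetical end-to-end run; the copy into TMPDIR is an assumption,
# not something this diff performs.
import os
import shutil

from gn3.settings import TMPDIR
from scripts.pub_med import dump_all_to_file, fetch_all_queries

# parsed_all_files.json: {"<doc_id>": {"doc_name": "<query string>"}, ...}
pub_metadata, doc_ids_metadata = fetch_all_queries(
    input_file="parsed_all_files.json", max_search_count=1)
dump_all_to_file(pub_metadata, doc_ids_metadata, "output_file.json")

# Expose the lookup to gn3.llms.process.fetch_pubmed(), which loads
# "pubmed.json" from TMPDIR.
shutil.copy("output_file.json", os.path.join(TMPDIR, "pubmed.json"))
```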