aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Kabui2024-03-18 21:29:46 +0300
committerGitHub2024-03-18 21:29:46 +0300
commitbcce1a3c5796c90a3925736484c4a2d82e6b6149 (patch)
tree11227e3612d2cb407e254f44598b65159e1c072c
parent7c5ec52301da8087ac78704e7cf1ee3c4a8472f7 (diff)
parentf9ed41d842b64eee2ad3a6821ecd15084320bff4 (diff)
downloadgenenetwork3-bcce1a3c5796c90a3925736484c4a2d82e6b6149.tar.gz
Merge pull request #154 from genenetwork/feature/pubmed-metadata
pubmed metadata
-rw-r--r--gn3/llms/process.py31
-rw-r--r--scripts/pub_med.py188
2 files changed, 217 insertions, 2 deletions
diff --git a/gn3/llms/process.py b/gn3/llms/process.py
index abd307e..549c7e6 100644
--- a/gn3/llms/process.py
+++ b/gn3/llms/process.py
@@ -1,6 +1,7 @@
"""this module contains code for processing response from fahamu client.py"""
+import os
import string
import json
@@ -10,6 +11,7 @@ import requests
from gn3.llms.client import GeneNetworkQAClient
from gn3.llms.response import DocIDs
+from gn3.settings import TMPDIR
BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
@@ -67,13 +69,36 @@ def rate_document(task_id, doc_id, rating, auth_token):
raise RuntimeError(f"An error occurred: {str(error)}") from error
def load_file(filename):
    """Open and parse a JSON file stored under TMPDIR.

    Args:
        filename (str): name of the json file, relative to TMPDIR.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: if the path does not exist or is a directory.
    """
    file_path = os.path.join(TMPDIR, filename)
    if not os.path.isfile(file_path):
        # include the offending filename in the message (the original
        # f-string had no placeholder, so the name was never reported)
        raise FileNotFoundError(f"{filename} was not found or is a directory")
    with open(file_path, "rb") as file_handler:
        return json.load(file_handler)
+
def fetch_pubmed(references, file_name):
    """Annotate each reference with pubmed metadata found in *file_name*.

    References whose doc_id has no entry in the file are left untouched;
    a missing file leaves all references unchanged.
    """
    try:
        pubmed_map = load_file(file_name)
    except FileNotFoundError:
        return references
    for reference in references:
        metadata = pubmed_map.get(reference["doc_id"])
        if metadata:
            reference["pubmed"] = metadata
    return references
+
+
def get_gnqa(query, auth_token):
"""entry function for the gn3 api endpoint()"""
api_client = GeneNetworkQAClient(requests.Session(), api_key=auth_token)
res, task_id = api_client.ask('?ask=' + quote(query), auth_token)
- if task_id == 0 :
- raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}")
+ if task_id == 0:
+ raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}")
res, success = api_client.get_answer(task_id)
if success == 1:
resp_text = filter_response_text(res.text)
@@ -83,6 +108,8 @@ def get_gnqa(query, auth_token):
context = resp_text['data']['context']
references = parse_context(
context, DocIDs().getInfo, format_bibliography_info)
+ references = fetch_pubmed(references, "pubmed.json")
+
return task_id, answer, references
else:
return task_id, "Unfortunately, I have nothing on the query", []
diff --git a/scripts/pub_med.py b/scripts/pub_med.py
new file mode 100644
index 0000000..82b1730
--- /dev/null
+++ b/scripts/pub_med.py
@@ -0,0 +1,188 @@
+""""module contains code to fetch the data only from pubmed
+At the moment we are only searching in pubmed db but this
+feature can be extended to others e.g pmc
+"""
+
+
+# pylint: disable=C0301
+
+import functools
+import json
+import requests
+
+from Bio import Entrez
+
+
def fetch_pub_details(id_list, db_name, retmode="xml", email="alexanderkabua@gmail.com"):
    """Fetch details of publications from Entrez based on their ids.

    Args:
        id_list (list): publication ids (pubmed).
        db_name (str): target Entrez database; only "pubmed" is supported.
        retmode (str, optional): Entrez return mode. Defaults to "xml".
        email (str, optional): email address reported to Entrez.

    Returns:
        dict: pubmed id -> publication metadata (see extract_pub_metadata),
        or an empty list when db_name is not "pubmed".
    """
    Entrez.email = email
    if db_name.lower() != "pubmed":
        return []
    handle = Entrez.efetch(db=db_name, retmode=retmode,
                           id=",".join(id_list))
    try:
        results = Entrez.read(handle)
    finally:
        # close the handle even if parsing fails
        handle.close()
    return extract_pub_metadata(results)
+
+
def extract_pub_metadata(papers):
    """
    Extract metadata from PubMed papers.

    Args:
        papers (dict): Entrez result dict containing a "PubmedArticle" list.

    Returns:
        dict: pubmed id -> metadata (pub_id, title, authors, abstract,
        journal_title, languages and source url) for each paper.
    """
    metadata = {}
    for paper in papers["PubmedArticle"]:
        article = paper['MedlineCitation']['Article']
        # AuthorList may be absent entirely; fall back to no authors
        # instead of crashing on a None join
        author_list = article.get('AuthorList') or []
        authors = ",".join([f'{author.get("ForeName","")} {author.get("LastName", "")}'
                            for author in author_list])
        abstract = article.get(
            'Abstract', {}).get('AbstractText', '')
        if isinstance(abstract, list):
            abstract = ' '.join(abstract)
        pub_id = str(paper["MedlineCitation"]["PMID"])
        metadata[pub_id] = {
            "pub_id": pub_id,  # reuse the converted id instead of re-reading it
            "title": article.get('ArticleTitle'),
            "authors": authors,
            "abstract": abstract,
            "journal_title": article['Journal']['Title'],
            "languages": article.get("Language", ""),
            "source": f"https://pubmed.ncbi.nlm.nih.gov/{pub_id}/"
        }

    return metadata
+
+
def fetch_pubmed_id(query, db_name, max_search_count, retmode="xml", email="alexanderkabua@gmail.com"):
    """Search Entrez for publication ids matching a single query string.

    Returns a dict with the query and its id list, or None when the
    search yields no ids.
    """
    Entrez.email = email
    handle = Entrez.esearch(db=db_name, sort="relevance",
                            retmax=max_search_count, retmode=retmode, term=query)
    search_results = Entrez.read(handle)
    handle.close()
    id_list = search_results.get("IdList")
    if not id_list:
        return None
    return {
        "query": query,
        "id_list": id_list
    }
+
+
def fetch_all_queries(input_file, max_search_count=1, db_name="pubmed"):
    """
    Search pubmed for publications described in a JSON file of query strings.

    Args:
        input_file (str): path to a json file mapping a filename to an
            object whose "doc_name" field is the query string.
        max_search_count (int, optional): ids/lookups per search. Defaults to 1.
        db_name (str, optional): target Entrez database. Defaults to "pubmed".

    Returns:
        tuple: (pub_metadata keyed by pubmed id, doc_ids mapping
        pubmed id -> source filename).
    """
    pub_data = []
    doc_ids = {}
    with open(input_file, "r", encoding="utf-8") as file_handler:
        search_dict = json.load(file_handler)

    for (filename, file_obj) in search_dict.items():
        query_ids = fetch_pubmed_id(query=file_obj.get("doc_name"),
                                    db_name=db_name,
                                    max_search_count=max_search_count)
        if query_ids:
            for doc_id in query_ids.get("id_list"):
                doc_ids[doc_id] = filename
            pub_data.append(query_ids)

    # flatten the id lists; the [] initializer keeps reduce from raising
    # TypeError when no query produced any ids
    all_ids = functools.reduce(lambda lst1, lst2: lst1 + lst2,
                               [data.get("id_list") for data in pub_data], [])
    return (fetch_pub_details(all_ids, db_name), doc_ids)
+
+
def dump_all_to_file(response, doc_ids, output_file):
    """
    Group pubmed metadata by its source doc_id and dump the mapping
    to a json file.
    """
    grouped = {}
    for (pub_id, pub_meta) in response.items():
        grouped.setdefault(doc_ids.get(pub_id), []).append(pub_meta)

    with open(output_file, "w+", encoding="utf-8") as file_handler:
        json.dump(grouped, file_handler, indent=4)
+
+
# lossy method to fetch pub data
def fetch_id_lossy_search(query, db_name, max_results):
    """
    Search an NCBI database for ids matching the provided search string.

    Args:
        query (str): the search string.
        db_name (str): target database, e.g. "pubmed".
        max_results (int): maximum number of ids to return.

    Returns:
        list: publication ids matching the query.

    Raises:
        requests.exceptions.RequestException (incl. HTTPError) on failure.
    """
    response = requests.get(f"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db={db_name}&retmode=json&retmax={max_results}&term={query}",
                            headers={"content-type": "application/json"}
                            )
    response.raise_for_status()
    # a requests.Response is not subscriptable -- the original
    # response["esearchresult"] raised TypeError; parse the JSON body first
    return response.json()["esearchresult"]["idlist"]
+
+
def search_pubmed_lossy(pubmed_id, db_name):
    """
    Fetch full records for the given publication ids.

    Args:
        pubmed_id: iterable of id strings (PubMed/PMC).
        db_name (str): target database, "pmc" or "pubmed".

    Returns:
        The article records parsed from the JSON response body.
    """
    joined_ids = ",".join(pubmed_id)
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db_name}&id={joined_ids}&retmode=json'
    response = requests.get(url)
    response.raise_for_status()
    payload = response.json()
    if db_name.lower() == "pmc":
        return payload['pmc-articleset']['article']
    return payload["PubmedArticleSet"]["PubmedArticle"]
+
+
if __name__ == '__main__':
    # fetch metadata for every query in the input file, then persist the
    # doc_id -> pubmed-metadata mapping
    metadata_by_pub_id, pub_to_doc_map = fetch_all_queries(
        input_file="parsed_all_files.json", max_search_count=1)
    dump_all_to_file(metadata_by_pub_id, pub_to_doc_map, "output_file.json")