| author | Alexander Kabui | 2024-03-18 21:29:46 +0300 |
|---|---|---|
| committer | GitHub | 2024-03-18 21:29:46 +0300 |
| commit | bcce1a3c5796c90a3925736484c4a2d82e6b6149 (patch) | |
| tree | 11227e3612d2cb407e254f44598b65159e1c072c | |
| parent | 7c5ec52301da8087ac78704e7cf1ee3c4a8472f7 (diff) | |
| parent | f9ed41d842b64eee2ad3a6821ecd15084320bff4 (diff) | |
| download | genenetwork3-bcce1a3c5796c90a3925736484c4a2d82e6b6149.tar.gz | |
Merge pull request #154 from genenetwork/feature/pubmed-metadata
pubmed metadata
| -rw-r--r-- | gn3/llms/process.py | 31 |
| -rw-r--r-- | scripts/pub_med.py | 188 |
2 files changed, 217 insertions, 2 deletions
diff --git a/gn3/llms/process.py b/gn3/llms/process.py
index abd307e..549c7e6 100644
--- a/gn3/llms/process.py
+++ b/gn3/llms/process.py
@@ -1,6 +1,7 @@
 """this module contains code for processing response from fahamu client.py"""
+import os
 import string
 import json

@@ -10,6 +11,7 @@ import requests
 from gn3.llms.client import GeneNetworkQAClient
 from gn3.llms.response import DocIDs
+from gn3.settings import TMPDIR

 BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
@@ -67,13 +69,36 @@ def rate_document(task_id, doc_id, rating, auth_token):
         raise RuntimeError(f"An error occurred: {str(error)}") from error


+def load_file(filename):
+    """Open and load a JSON file from TMPDIR."""
+    file_path = os.path.join(TMPDIR, filename)
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"{filename} was not found or is a directory")
+    with open(file_path, "rb") as file_handler:
+        return json.load(file_handler)
+
+
+def fetch_pubmed(references, file_name):
+    """Populate references with PubMed metadata, where available."""
+    try:
+        pubmed = load_file(file_name)
+        for reference in references:
+            if pubmed.get(reference["doc_id"]):
+                reference["pubmed"] = pubmed.get(reference["doc_id"])
+        return references
+    except FileNotFoundError:
+        # No cached PubMed lookup; return the references unchanged.
+        return references
+
+
 def get_gnqa(query, auth_token):
     """entry function for the gn3 api endpoint()"""
     api_client = GeneNetworkQAClient(requests.Session(), api_key=auth_token)
     res, task_id = api_client.ask('?ask=' + quote(query), auth_token)
-    if task_id == 0 :
-        raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}")
+    if task_id == 0:
+        raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}")
     res, success = api_client.get_answer(task_id)
     if success == 1:
         resp_text = filter_response_text(res.text)
@@ -83,6 +108,8 @@ def get_gnqa(query, auth_token):
         context = resp_text['data']['context']
         references = parse_context(
             context, DocIDs().getInfo, format_bibliography_info)
+        references = fetch_pubmed(references, "pubmed.json")
+
         return task_id, answer, references
     else:
         return task_id, "Unfortunately, I have nothing on the query", []
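The new `fetch_pubmed` hook is easiest to see with a small example. The sketch below is illustrative only: the `doc_id` and metadata values are made up, and it assumes a writable `TMPDIR` as configured in `gn3.settings`.

```python
# Minimal sketch of the fetch_pubmed() flow; the doc_id and the metadata
# entry are hypothetical examples, not real records.
import json
import os

from gn3.llms.process import fetch_pubmed
from gn3.settings import TMPDIR

# fetch_pubmed() reads this file from TMPDIR via load_file().
pubmed_lookup = {
    "example-doc-id": [{
        "pub_id": "12345678",
        "title": "An example publication",
        "source": "https://pubmed.ncbi.nlm.nih.gov/12345678/",
    }]
}
with open(os.path.join(TMPDIR, "pubmed.json"), "w", encoding="utf-8") as fh:
    json.dump(pubmed_lookup, fh)

references = [{"doc_id": "example-doc-id"}]
# References whose doc_id appears in pubmed.json gain a "pubmed" key;
# if the file is missing, fetch_pubmed() returns the references unchanged.
print(fetch_pubmed(references, "pubmed.json"))
```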
diff --git a/scripts/pub_med.py b/scripts/pub_med.py
new file mode 100644
index 0000000..82b1730
--- /dev/null
+++ b/scripts/pub_med.py
@@ -0,0 +1,188 @@
+"""Module containing code to fetch publication data from PubMed.
+At the moment we only search the pubmed db, but this feature
+can be extended to others, e.g. pmc.
+"""
+
+# pylint: disable=C0301
+
+import functools
+import json
+import requests
+
+from Bio import Entrez
+
+
+def fetch_pub_details(id_list, db_name, retmode="xml", email="alexanderkabua@gmail.com"):
+    """Fetch details of publications based on their IDs.
+
+    Args:
+        id_list (list): list of publication IDs (pubmed)
+        db_name (str): target database
+        retmode (str, optional): Entrez return mode
+        email (str, optional): email address reported to Entrez
+
+    Returns:
+        dict: metadata of the fetched publications, keyed by PMID
+    """
+    Entrez.email = email
+    if db_name.lower() == "pubmed":
+        handle = Entrez.efetch(db=db_name, retmode=retmode,
+                               id=",".join(id_list))
+        results = Entrez.read(handle)
+        handle.close()
+        return extract_pub_metadata(results)
+    return {}
+
+
+def extract_pub_metadata(papers):
+    """Extract metadata from PubMed papers.
+
+    Args:
+        papers (dict): dictionary containing PubMed papers
+
+    Returns:
+        dict: extracted metadata for the papers, keyed by PMID
+    """
+    metadata = {}
+    for paper in papers["PubmedArticle"]:
+        article = paper['MedlineCitation']['Article']
+        author_list = article.get('AuthorList', [])
+        authors = ", ".join([f'{author.get("ForeName", "")} {author.get("LastName", "")}'
+                             for author in author_list])
+        abstract = article.get('Abstract', {}).get('AbstractText', '')
+        if isinstance(abstract, list):
+            abstract = ' '.join(abstract)
+        pub_id = str(paper["MedlineCitation"]["PMID"])
+        metadata[pub_id] = {
+            "pub_id": pub_id,
+            "title": article.get('ArticleTitle'),
+            "authors": authors,
+            "abstract": abstract,
+            "journal_title": article['Journal']['Title'],
+            "languages": article.get("Language", ""),
+            "source": f"https://pubmed.ncbi.nlm.nih.gov/{pub_id}/"
+        }
+    return metadata
+
+
+def fetch_pubmed_id(query, db_name, max_search_count, retmode="xml", email="alexanderkabua@gmail.com"):
+    """Fetch the IDs for a given search term from pubmed."""
+    Entrez.email = email
+    handle = Entrez.esearch(db=db_name, sort="relevance",
+                            retmax=max_search_count, retmode=retmode, term=query)
+    results = Entrez.read(handle)
+    handle.close()
+    if results.get("IdList"):
+        return {
+            "query": query,
+            "id_list": results.get("IdList")
+        }
+    return None
+
+
+def fetch_all_queries(input_file, max_search_count=1, db_name="pubmed"):
+    """Search pubmed for publications, given a JSON file whose values
+    carry the query strings.
+
+    Args:
+        input_file (str): path to the JSON file with the query strings
+        max_search_count (int): number of IDs/lookups per search
+        db_name (str): target db, default pubmed
+
+    Returns:
+        tuple: (pub_metadata: dict, doc_ids: dict)
+    """
+    pub_data = []
+    doc_ids = {}
+    with open(input_file, "r", encoding="utf-8") as file_handler:
+        search_dict = json.load(file_handler)
+    for (filename, file_obj) in search_dict.items():
+        query_ids = fetch_pubmed_id(query=file_obj.get("doc_name"),
+                                    db_name=db_name,
+                                    max_search_count=max_search_count)
+        if query_ids:
+            for doc_id in query_ids.get("id_list"):
+                doc_ids[doc_id] = filename
+            pub_data.append(query_ids)
+    all_ids = functools.reduce(lambda lst1, lst2: lst1 + lst2,
+                               [data.get("id_list") for data in pub_data])
+    return (fetch_pub_details(all_ids, db_name), doc_ids)
+
+
+def dump_all_to_file(response, doc_ids, output_file):
+    """Map the pubmed metadata to doc_ids and dump to a JSON file."""
+    data = {}
+    for (pub_id, pub_meta) in response.items():
+        doc_id = doc_ids.get(pub_id)
+        if data.get(doc_id):
+            data[doc_id].append(pub_meta)
+        else:
+            data[doc_id] = [pub_meta]
+    with open(output_file, "w+", encoding="utf-8") as file_handler:
+        json.dump(data, file_handler, indent=4)
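`dump_all_to_file` inverts the PMID-to-doc_id map into doc_id-to-[metadata] lists, which is the layout `fetch_pubmed` in `gn3/llms/process.py` expects in `pubmed.json`. A toy run with made-up values:

```python
# Toy illustration of dump_all_to_file(); all values are invented.
from scripts.pub_med import dump_all_to_file

response = {  # PMID -> metadata, as returned by fetch_pub_details()
    "12345678": {"pub_id": "12345678", "title": "An example paper"},
}
doc_ids = {"12345678": "example-doc-id"}  # PMID -> document identifier

dump_all_to_file(response, doc_ids, "output_file.json")
# output_file.json now holds:
# {
#     "example-doc-id": [
#         {"pub_id": "12345678", "title": "An example paper"}
#     ]
# }
```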
+
+# Lossy helpers: query the NCBI E-utilities endpoints over HTTP directly.
+def fetch_id_lossy_search(query, db_name, max_results):
+    """Search PubMed for IDs based on the provided query string (lossy search).
+
+    Args:
+        query (str): the search string
+        db_name (str): target database
+        max_results (int): maximum number of IDs to return
+
+    Returns:
+        list: IDs of the matching publications
+    """
+    response = requests.get(
+        f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+        f"?db={db_name}&retmode=json&retmax={max_results}&term={query}",
+        headers={"content-type": "application/json"})
+    response.raise_for_status()
+    return response.json()["esearchresult"]["idlist"]
+
+
+def search_pubmed_lossy(pubmed_ids, db_name):
+    """Fetch records based on PubMed IDs.
+
+    Args:
+        pubmed_ids (list): PubMed IDs
+        db_name (str): target database
+
+    Returns:
+        dict: records fetched for the given PubMed IDs
+    """
+    url = (f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
+           f'?db={db_name}&id={",".join(pubmed_ids)}&retmode=json')
+    response = requests.get(url)
+    response.raise_for_status()
+    data = response.json()
+    if db_name.lower() == "pmc":
+        return data['pmc-articleset']['article']
+    return data["PubmedArticleSet"]["PubmedArticle"]
+
+
+if __name__ == '__main__':
+    (pub_metadata, doc_ids_metadata) = fetch_all_queries(
+        input_file="parsed_all_files.json", max_search_count=1)
+    dump_all_to_file(pub_metadata, doc_ids_metadata, "output_file.json")
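Putting the pieces together: the script is meant to run offline, and its output then has to be made visible to `fetch_pubmed`. The sketch below is an assumption about deployment; the diff itself does not show how `pubmed.json` reaches `TMPDIR`. It also presumes `parsed_all_files.json` maps document identifiers to objects carrying a `doc_name` query string, the shape `fetch_all_queries` reads.

```python
# Hypothetical end-to-end run; the copy into TMPDIR is an assumption,
# not something this diff performs.
import os
import shutil

from gn3.settings import TMPDIR
from scripts.pub_med import dump_all_to_file, fetch_all_queries

# parsed_all_files.json: {"<doc_id>": {"doc_name": "<query string>"}, ...}
pub_metadata, doc_ids_metadata = fetch_all_queries(
    input_file="parsed_all_files.json", max_search_count=1)
dump_all_to_file(pub_metadata, doc_ids_metadata, "output_file.json")

# Expose the lookup to gn3.llms.process.fetch_pubmed(), which loads
# "pubmed.json" from TMPDIR.
shutil.copy("output_file.json", os.path.join(TMPDIR, "pubmed.json"))
```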