diff options
author | Alexander Kabui | 2024-08-30 16:46:37 +0300 |
---|---|---|
committer | GitHub | 2024-08-30 16:46:37 +0300 |
commit | 25345b327f319c49798cef208be950e25f447da6 (patch) | |
tree | 56e34bd27b4066dbdc0adcc64bbb9edf8ebd5042 /gn3/llms | |
parent | f8c87e6fd1b26887c84a390a1a253d2c629942bc (diff) | |
parent | 57b4a4fd5bcb8a2b7f9af856d8f1212c0fbbe0da (diff) | |
download | genenetwork3-25345b327f319c49798cef208be950e25f447da6.tar.gz |
Merge pull request #165 from genenetwork/feature/gnqa-search
Feature/gnqa search
Diffstat (limited to 'gn3/llms')
-rw-r--r-- | gn3/llms/client.py | 57 | ||||
-rw-r--r-- | gn3/llms/errors.py | 7 | ||||
-rw-r--r-- | gn3/llms/process.py | 121 |
3 files changed, 90 insertions, 95 deletions
diff --git a/gn3/llms/client.py b/gn3/llms/client.py index d57bca2..ad6c400 100644 --- a/gn3/llms/client.py +++ b/gn3/llms/client.py @@ -55,6 +55,7 @@ class GeneNetworkQAClient(Session): self.base_url = "https://genenetwork.fahamuai.com/api/tasks" self.answer_url = f"{self.base_url}/answers" self.feedback_url = f"{self.base_url}/feedback" + self.query = "" adapter = TimeoutHTTPAdapter( timeout=timeout, @@ -83,44 +84,44 @@ class GeneNetworkQAClient(Session): """ handler for non 200 response from fahamu api""" return f"Error: Status code -{response.status_code}- Reason::{response.reason}" - def ask(self, ex_url, *args, **kwargs): + def ask(self, ex_url, query, *args, **kwargs): """fahamu ask api interface""" + self.query = query res = self.custom_request('POST', f"{self.base_url}{ex_url}", *args, **kwargs) - if res.status_code != 200: - return f"Error: Status code -{res.status_code}- Reason::{res.reason}", 0 return res, json.loads(res.text) def get_answer(self, taskid, *args, **kwargs): """Fahamu get answer interface""" - try: - query = f"{self.answer_url}?task_id={taskid['task_id']}" - res = self.custom_request('GET', query, *args, **kwargs) - if res.status_code != 200: - return f"Error: Status code -{res.status_code}- Reason::{res.reason}", 0 - return res, 1 - except TimeoutError: - return "Timeout error occured:try to rephrase your query", 0 + query = f"{self.answer_url}?task_id={taskid['task_id']}" + res = self.custom_request('GET', query, *args, **kwargs) + return res, 1 def custom_request(self, method, url, *args, **kwargs): """ make custom request to fahamu api ask and get response""" max_retries = 50 retry_delay = 3 + response_msg = { + 404: "Api endpoint Does not exist", + 500: "Use of Invalid Token/or the Fahamu Api is currently down", + 400: "You sent a bad Fahamu request", + 401: "You do not have authorization to perform the request", + } for _i in range(max_retries): - try: - response = super().request(method, url, *args, **kwargs) - response.raise_for_status() - if response.ok: - if method.lower() == "get" and response.json().get("data") is None: - time.sleep(retry_delay) - continue - return response - else: + response = super().request(method, url, *args, **kwargs) + if response.ok: + if method.lower() == "get" and not response.json().get("data"): + # note this is a dirty trick to check if fahamu has returned the results + # the issue is that the api only returns 500 or 200 satus code + # TODO: fix this on their end time.sleep(retry_delay) - except requests.exceptions.HTTPError as error: - if error.response.status_code == 500: - raise LLMError(error.request, error.response, f"Response Error with:status_code:{error.response.status_code},Reason for error: Use of Invalid Fahamu Token") from error - raise LLMError(error.request, error.response, - f"HTTP error occurred with error status:{error.response.status_code}") from error - except requests.exceptions.RequestException as error: - raise error - raise TimeoutError + continue + return response + else: + raise LLMError(f"Request error with code:\ + {response.status_code} occurred with reason:\ + {response_msg.get(response.status_code,response.reason)}", + self.query) + #time.sleep(retry_delay) + raise LLMError("Timeout error: We couldn't provide a response,Please try\ + to rephrase your question to receive feedback", + self.query) diff --git a/gn3/llms/errors.py b/gn3/llms/errors.py index af3d7b0..a3a47a3 100644 --- a/gn3/llms/errors.py +++ b/gn3/llms/errors.py @@ -35,8 +35,5 @@ class UnprocessableEntity(HTTPError): msg, request=request, response=response) -class LLMError(HTTPError): - """Custom error from making Fahamu APi request """ - def __init__(self, request, response, msg): - super(HTTPError, self).__init__( - msg, request=request, response=response) +class LLMError(Exception): + """custom exception for LLMErrorMIxins""" diff --git a/gn3/llms/process.py b/gn3/llms/process.py index 4725bcb..55c27a0 100644 --- a/gn3/llms/process.py +++ b/gn3/llms/process.py @@ -1,6 +1,7 @@ """this module contains code for processing response from fahamu client.py""" # pylint: disable=C0301 import os +import re import string import json import logging @@ -21,21 +22,12 @@ class DocIDs(): * doc_ids.json: opens doc)ids for gn references * sugar_doc_ids: open doci_ids for diabetes references """ - self.doc_ids = self.load_file("doc_ids.json") - self.sugar_doc_ids = self.load_file("all_files.json") + self.doc_ids = load_file("doc_ids.json", BASEDIR) + self.sugar_doc_ids = load_file("all_files.json", BASEDIR) self.format_doc_ids(self.sugar_doc_ids) - def load_file(self, file_name): - """Method to load and read doc_id files""" - file_path = os.path.join(BASEDIR, file_name) - if os.path.isfile(file_path): - with open(file_path, "rb") as file_handler: - return json.load(file_handler) - else: - raise FileNotFoundError(f"{file_path}-- FIle does not exist\n") - def format_doc_ids(self, docs): - """method to format doc_ids for list items""" + """method to format doc_ids for list items doc_id and doc_name""" for _key, val in docs.items(): if isinstance(val, list): for doc_obj in val: @@ -43,7 +35,14 @@ class DocIDs(): self.doc_ids.update({doc_obj["id"]: doc_name}) def get_info(self, doc_id): - """ interface to make read from doc_ids""" + """ interface to make read from doc_ids + and extract info data else returns + doc_id + Args: + doc_id: str: a search key for doc_ids + Returns: + an object with doc_info if doc_id in doc_ids + """ if doc_id in self.doc_ids.keys(): return self.doc_ids[doc_id] else: @@ -51,7 +50,8 @@ class DocIDs(): def format_bibliography_info(bib_info): - """Function for formatting bibliography info""" + """Utility function for formatting bibliography info + """ if isinstance(bib_info, str): return bib_info.removesuffix('.txt') elif isinstance(bib_info, dict): @@ -59,14 +59,16 @@ def format_bibliography_info(bib_info): return bib_info -def filter_response_text(val): - """helper function for filtering non-printable chars""" - return json.loads(''.join([str(char) - for char in val if char in string.printable])) - - def parse_context(context, get_info_func, format_bib_func): - """function to parse doc_ids content""" + """Function to parse doc_ids content + Args: + context: raw references from fahamu api + get_info_func: function to get doc_ids info + format_bib_func: function to foramt bibliography info + Returns: + an list with each item having (doc_id,bib_info, + combined reference text) + """ results = [] for doc_ids, summary in context.items(): combo_txt = "" @@ -75,13 +77,23 @@ def parse_context(context, get_info_func, format_bib_func): doc_info = get_info_func(doc_ids) bib_info = doc_ids if doc_ids == doc_info else format_bib_func( doc_info) + pattern = r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*' + combo_text = re.sub(pattern, + lambda x: f"<a href='{x[0]}' target=_blank> {x[0]} </a>", + combo_txt) results.append( - {"doc_id": doc_ids, "bibInfo": bib_info, "comboTxt": combo_txt}) + {"doc_id": doc_ids, "bibInfo": bib_info, + "comboTxt": combo_text}) return results def load_file(filename, dir_path): - """function to open and load json file""" + """Utility function to read json file + Args: + filename: file name to read + dir_path: base directory for the file + Returns: json data read to a dict + """ file_path = os.path.join(dir_path, f"{filename}") if not os.path.isfile(file_path): raise FileNotFoundError(f"{filename} was not found or is a directory") @@ -90,8 +102,19 @@ def load_file(filename, dir_path): def fetch_pubmed(references, file_name, data_dir=""): - """method to fetch and populate references with pubmed""" + """ + Fetches PubMed data from a JSON file and populates the\ + references dictionary. + + Args: + references (dict): Dictionary with document IDs as keys\ + and reference data as values. + filename (str): Name of the JSON file containing PubMed data. + data_dir (str): Base directory where the data files are located. + Returns: + dict: Updated references dictionary populated with the PubMed data. + """ try: pubmed = load_file(file_name, os.path.join(data_dir, "gn-meta/lit")) for reference in references: @@ -116,42 +139,16 @@ def get_gnqa(query, auth_token, data_dir=""): answer references: contains doc_name,reference,pub_med_info """ - api_client = GeneNetworkQAClient(api_key=auth_token) - res, task_id = api_client.ask('?ask=' + quote(query), auth_token) - if task_id == 0: - raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}") - res, status = api_client.get_answer(task_id) - if status == 1: - resp_text = filter_response_text(res.text) - if resp_text.get("data") is None: - return task_id, "Please try to rephrase your question to receive feedback", [] - answer = resp_text['data']['answer'] - context = resp_text['data']['context'] - references = parse_context( - context, DocIDs().get_info, format_bibliography_info) - references = fetch_pubmed(references, "pubmed.json", data_dir) - - return task_id, answer, references - else: - return task_id, "We couldn't provide a response,Please try to rephrase your question to receive feedback", [] - - -def fetch_query_results(query, user_id, redis_conn): - """this method fetches prev user query searches""" - result = redis_conn.get(f"LLM:{user_id}-{query}") - if result: - return json.loads(result) - return { - "query": query, - "answer": "Sorry No answer for you", - "references": [], - "task_id": None - } - - -def get_user_queries(user_id, redis_conn): - """methos to fetch all queries for a specific user""" - results = redis_conn.keys(f"LLM:{user_id}*") - return [query for query in - [result.partition("-")[2] for result in results] if query != ""] + res, task_id = api_client.ask('?ask=' + quote(query), query=query) + res, _status = api_client.get_answer(task_id) + resp_text = json.loads(''.join([str(char) + for char in res.text if char in string.printable])) + answer = re.sub(r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*', + lambda x: f"<a href='{x[0]}' target=_blank> {x[0]} </a>", + resp_text["data"]["answer"]) + context = resp_text['data']['context'] + return task_id, answer, fetch_pubmed(parse_context( + context, DocIDs().get_info, + format_bibliography_info), + "pubmed.json", data_dir) |