about summary refs log tree commit diff
path: root/gn3/llms
diff options
context:
space:
mode:
Diffstat (limited to 'gn3/llms')
-rw-r--r-- gn3/llms/client.py 215
-rw-r--r-- gn3/llms/errors.py 31
-rw-r--r-- gn3/llms/process.py 179
-rw-r--r-- gn3/llms/response.py 75
4 files changed, 164 insertions, 336 deletions
diff --git a/gn3/llms/client.py b/gn3/llms/client.py
index 042becd..ad6c400 100644
--- a/gn3/llms/client.py
+++ b/gn3/llms/client.py
@@ -1,72 +1,61 @@
-# pylint: skip-file
+"""Module Contains code for making request to fahamu Api"""
+# pylint: disable=C0301
import json
-import string
-import os
-import datetime
import time
-import requests
-from requests import Session
-from urllib.parse import urljoin
-from requests.packages.urllib3.util.retry import Retry
-from requests import HTTPError
+import requests
from requests import Session
from requests.adapters import HTTPAdapter
-from urllib.request import urlretrieve
-from urllib.parse import quote
-from gn3.llms.errors import UnprocessableEntity
-from gn3.llms.errors import LLMError
+from requests.adapters import Retry
-basedir = os.path.join(os.path.dirname(__file__))
+from gn3.llms.errors import LLMError
class TimeoutHTTPAdapter(HTTPAdapter):
+ """Set a default timeout for HTTP calls """
def __init__(self, timeout, *args, **kwargs):
- """TimeoutHTTPAdapter constructor.
- Args:
- timeout (int): How many seconds to wait for the server to send data before
- giving up.
- """
+ """TimeoutHTTPAdapter constructor."""
self.timeout = timeout
super().__init__(*args, **kwargs)
- def send(self, request, **kwargs):
+ def send(self, *args, **kwargs):
"""Override :obj:`HTTPAdapter` send method to add a default timeout."""
- timeout = kwargs.get("timeout")
- if timeout is None:
- kwargs["timeout"] = self.timeout
-
- return super().send(request, **kwargs)
+ kwargs["timeout"] = (
+ kwargs["timeout"] if kwargs.get("timeout") else self.timeout
+ )
+ return super().send(*args, **kwargs)
class GeneNetworkQAClient(Session):
"""GeneNetworkQA Client
This class provides a client object interface to the GeneNetworkQA API.
- It extends the `requests.Session` class and includes authorization, base URL,
+ It extends the `requests.Session` class and includes authorization,
+ base URL,
request timeouts, and request retries.
Args:
- account (str): Base address subdomain.
api_key (str): API key.
- version (str, optional): API version, defaults to "v3".
timeout (int, optional): Timeout value, defaults to 5.
total_retries (int, optional): Total retries value, defaults to 5.
- backoff_factor (int, optional): Retry backoff factor value, defaults to 30.
+ backoff_factor (int, optional): Retry backoff factor value,
+ defaults to 30.
Usage:
from genenetworkqa import GeneNetworkQAClient
- gnqa = GeneNetworkQAClient(account="account-name", api_key="XXXXXXXXXXXXXXXXXXX...")
+ gnqa = GeneNetworkQAClient(
+ api_key="XXXXXXXXXXXXXXXXXXX...")
"""
- BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
-
- def __init__(self, account, api_key, version="v3", timeout=30, total_retries=5, backoff_factor=30):
+ def __init__(self, api_key, timeout=30,
+ total_retries=5, backoff_factor=30):
super().__init__()
self.headers.update(
{"Authorization": "Bearer " + api_key})
- self.answer_url = f"{self.BASE_URL}/answers"
- self.feedback_url = f"{self.BASE_URL}/feedback"
+ self.base_url = "https://genenetwork.fahamuai.com/api/tasks"
+ self.answer_url = f"{self.base_url}/answers"
+ self.feedback_url = f"{self.base_url}/feedback"
+ self.query = ""
adapter = TimeoutHTTPAdapter(
timeout=timeout,
@@ -80,141 +69,59 @@ class GeneNetworkQAClient(Session):
self.mount("https://", adapter)
self.mount("http://", adapter)
- @staticmethod
- def format_bibliography_info(bib_info):
-
- if isinstance(bib_info, str):
- # Remove '.txt'
- bib_info = bib_info.removesuffix('.txt')
- elif isinstance(bib_info, dict):
- # Format string bibliography information
- bib_info = "{0}.{1}.{2}.{3} ".format(bib_info.get('author', ''),
- bib_info.get('title', ''),
- bib_info.get('year', ''),
- bib_info.get('doi', ''))
- return bib_info
-
- @staticmethod
- def ask_the_documents(extend_url, my_auth):
+ def get_answer_using_task_id(self, extend_url, my_auth):
+ """call this method with task id to fetch response"""
try:
- response = requests.post(
- base_url + extend_url, data={}, headers=my_auth)
+ response = requests.get(
+ self.answer_url + extend_url, data={}, headers=my_auth)
response.raise_for_status()
- except requests.exceptions.RequestException as e:
- # Handle the exception appropriately, e.g., log the error
- raise RuntimeError(f"Error making the request: {e}")
-
- if response.status_code != 200:
- return negative_status_msg(response), 0
-
- task_id = get_task_id_from_result(response)
- response = get_answer_using_task_id(task_id, my_auth)
-
- if response.status_code != 200:
-
- return negative_status_msg(response), 0
-
- return response, 1
+ return response
+ except requests.exceptions.RequestException as error:
+ raise error
@staticmethod
def negative_status_msg(response):
+ """ handler for non 200 response from fahamu api"""
return f"Error: Status code -{response.status_code}- Reason::{response.reason}"
- # return f"Problems\n\tStatus code => {response.status_code}\n\tReason => {response.reason}"
- def ask(self, exUrl, *args, **kwargs):
- askUrl = self.BASE_URL + exUrl
- res = self.custom_request('POST', askUrl, *args, **kwargs)
- if (res.status_code != 200):
- return self.negative_status_msg(res), 0
- task_id = self.getTaskIDFromResult(res)
- return res, task_id
+ def ask(self, ex_url, query, *args, **kwargs):
+ """fahamu ask api interface"""
+ self.query = query
+ res = self.custom_request('POST', f"{self.base_url}{ex_url}", *args, **kwargs)
+ return res, json.loads(res.text)
def get_answer(self, taskid, *args, **kwargs):
- query = self.answer_url + self.extendTaskID(taskid)
+ """Fahamu get answer interface"""
+ query = f"{self.answer_url}?task_id={taskid['task_id']}"
res = self.custom_request('GET', query, *args, **kwargs)
- if (res.status_code != 200):
- return self.negative_status_msg(res), 0
return res, 1
def custom_request(self, method, url, *args, **kwargs):
-
+ """ make custom request to fahamu api ask and get response"""
max_retries = 50
retry_delay = 3
-
- for i in range(max_retries):
- try:
- response = super().request(method, url, *args, **kwargs)
- response.raise_for_status()
-
- except requests.exceptions.HTTPError as error:
- if error.response.status_code ==500:
- raise LLMError(error.request, error.response, f"Response Error,status_code:{error.response.status_code},Reason: Use of Invalid Token")
- elif error.response.status_code ==404:
- raise LLMError(error.request,error.response,f"404 Client Error: Not Found for url: {self.BASE_URL}")
- raise error
-
- except requests.exceptions.RequestException as error:
- raise error
-
-
-
-
+ response_msg = {
+ 404: "Api endpoint Does not exist",
+ 500: "Use of Invalid Token/or the Fahamu Api is currently down",
+ 400: "You sent a bad Fahamu request",
+ 401: "You do not have authorization to perform the request",
+ }
+ for _i in range(max_retries):
+ response = super().request(method, url, *args, **kwargs)
if response.ok:
- if method.lower() == "get" and response.json().get("data") is None:
+ if method.lower() == "get" and not response.json().get("data"):
+ # note this is a dirty trick to check if fahamu has returned the results
+ # the issue is that the api only returns 500 or 200 status code
+ # TODO: fix this on their end
time.sleep(retry_delay)
continue
- else:
- return response
+ return response
else:
- time.sleep(retry_delay)
- return response
-
- @staticmethod
- def get_task_id_from_result(response):
- task_id = json.loads(response.text)
- result = f"?task_id={task_id.get('task_id', '')}"
- return result
-
- @staticmethod
- def get_answer_using_task_id(extend_url, my_auth):
- try:
- response = requests.get(
- answer_url + extend_url, data={}, headers=my_auth)
- response.raise_for_status()
- return response
- except requests.exceptions.RequestException as error:
- # Handle the exception appropriately, e.g., log the error
- raise error
-
- @staticmethod
- def filter_response_text(val):
- """
- Filters out non-printable characters from the input string and parses it as JSON.
-
- Args:
- val (str): Input string to be filtered and parsed.
-
- Returns:
- dict: Parsed JSON object.
- # remove this
- """
- return json.loads(''.join([str(char) for char in val if char in string.printable]))
-
- def getTaskIDFromResult(self, res):
- return json.loads(res.text)
-
- def extendTaskID(self, task_id):
- return '?task_id=' + str(task_id['task_id'])
-
- def get_gnqa(self, query):
- qstr = quote(query)
- res, task_id = api_client.ask('?ask=' + qstr)
- res, success = api_client.get_answer(task_id)
-
- if success == 1:
- resp_text = filter_response_text(res.text)
- answer = resp_text.get('data', {}).get('answer', '')
- context = resp_text.get('data', {}).get('context', '')
- return answer, context
- else:
- return res, "Unfortunately, I have nothing."
+ raise LLMError(f"Request error with code:\
+ {response.status_code} occurred with reason:\
+ {response_msg.get(response.status_code,response.reason)}",
+ self.query)
+ #time.sleep(retry_delay)
+ raise LLMError("Timeout error: We couldn't provide a response,Please try\
+ to rephrase your question to receive feedback",
+ self.query)
diff --git a/gn3/llms/errors.py b/gn3/llms/errors.py
index e9f7c02..a3a47a3 100644
--- a/gn3/llms/errors.py
+++ b/gn3/llms/errors.py
@@ -1,32 +1,11 @@
-
-# pylint: skip-file
+""" Error handlers for Fahamu Api"""
import json
-
from requests import HTTPError
class UnprocessableEntity(HTTPError):
- """An HTTP 422 Unprocessable Entity error occurred.
-
+ """Error for HTTP 422 Unprocessable Entity
https://help.helpjuice.com/en_US/api-v3/api-v3#errors
-
- The request could not be processed, usually due to a missing or invalid parameter.
-
- The response will also include an error object with an explanation of fields that
- are missing or invalid. Here is an example:
-
- .. code-block::
-
- HTTP/1.1 422 Unprocessable Entity
-
-
- {
- "errors": [
- {
- "email": "is not valid."
- }
- ]
- }
"""
def __init__(self, request, response):
@@ -56,7 +35,5 @@ class UnprocessableEntity(HTTPError):
msg, request=request, response=response)
-class LLMError(HTTPError):
- def __init__(self, request, response, msg):
- super(HTTPError, self).__init__(
- msg, request=request, response=response)
+class LLMError(Exception):
+ """custom exception for LLMErrorMIxins"""
diff --git a/gn3/llms/process.py b/gn3/llms/process.py
index e38b73e..55c27a0 100644
--- a/gn3/llms/process.py
+++ b/gn3/llms/process.py
@@ -1,25 +1,57 @@
"""this module contains code for processing response from fahamu client.py"""
+# pylint: disable=C0301
import os
+import re
import string
import json
-
-from urllib.parse import urljoin
-from urllib.parse import quote
import logging
-import requests
+from urllib.parse import quote
from gn3.llms.client import GeneNetworkQAClient
-from gn3.llms.response import DocIDs
BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
-
-
-# pylint: disable=C0301
+BASEDIR = os.path.abspath(os.path.dirname(__file__))
+
+
+class DocIDs():
+ """ Class to parse document ids and names from files"""
+ def __init__(self):
+ """
+ init method for Docids
+ * doc_ids.json: opens doc_ids for gn references
+ * all_files.json: opens doc_ids (sugar_doc_ids) for diabetes references
+ """
+ self.doc_ids = load_file("doc_ids.json", BASEDIR)
+ self.sugar_doc_ids = load_file("all_files.json", BASEDIR)
+ self.format_doc_ids(self.sugar_doc_ids)
+
+ def format_doc_ids(self, docs):
+ """method to format doc_ids for list items doc_id and doc_name"""
+ for _key, val in docs.items():
+ if isinstance(val, list):
+ for doc_obj in val:
+ doc_name = doc_obj["filename"].removesuffix(".pdf").removesuffix(".txt").replace("_", "")
+ self.doc_ids.update({doc_obj["id"]: doc_name})
+
+ def get_info(self, doc_id):
+ """ interface to make read from doc_ids
+ and extract info data else returns
+ doc_id
+ Args:
+ doc_id: str: a search key for doc_ids
+ Returns:
+ an object with doc_info if doc_id in doc_ids
+ """
+ if doc_id in self.doc_ids.keys():
+ return self.doc_ids[doc_id]
+ else:
+ return doc_id
def format_bibliography_info(bib_info):
- """Function for formatting bibliography info"""
+ """Utility function for formatting bibliography info
+ """
if isinstance(bib_info, str):
return bib_info.removesuffix('.txt')
elif isinstance(bib_info, dict):
@@ -27,14 +59,16 @@ def format_bibliography_info(bib_info):
return bib_info
-def filter_response_text(val):
- """helper function for filtering non-printable chars"""
- return json.loads(''.join([str(char)
- for char in val if char in string.printable]))
-
-
def parse_context(context, get_info_func, format_bib_func):
- """function to parse doc_ids content"""
+ """Function to parse doc_ids content
+ Args:
+ context: raw references from fahamu api
+ get_info_func: function to get doc_ids info
+ format_bib_func: function to format bibliography info
+ Returns:
+ a list with each item having (doc_id,bib_info,
+ combined reference text)
+ """
results = []
for doc_ids, summary in context.items():
combo_txt = ""
@@ -43,32 +77,23 @@ def parse_context(context, get_info_func, format_bib_func):
doc_info = get_info_func(doc_ids)
bib_info = doc_ids if doc_ids == doc_info else format_bib_func(
doc_info)
+ pattern = r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*'
+ combo_text = re.sub(pattern,
+ lambda x: f"<a href='{x[0]}' target=_blank> {x[0]} </a>",
+ combo_txt)
results.append(
- {"doc_id": doc_ids, "bibInfo": bib_info, "comboTxt": combo_txt})
+ {"doc_id": doc_ids, "bibInfo": bib_info,
+ "comboTxt": combo_text})
return results
-def rate_document(task_id, doc_id, rating, auth_token):
- """This method is used to provide feedback for a document by making a rating."""
- # todo move this to clients
- try:
- url = urljoin(BASE_URL,
- f"""/feedback?task_id={task_id}&document_id={doc_id}&feedback={rating}""")
- headers = {"Authorization": f"Bearer {auth_token}"}
-
- resp = requests.post(url, headers=headers)
- resp.raise_for_status()
-
- return {"status": "success", **resp.json()}
- except requests.exceptions.HTTPError as http_error:
- raise RuntimeError(f"HTTP Error Occurred:\
- {http_error.response.text} -with status code- {http_error.response.status_code}") from http_error
- except Exception as error:
- raise RuntimeError(f"An error occurred: {str(error)}") from error
-
-
def load_file(filename, dir_path):
- """function to open and load json file"""
+ """Utility function to read json file
+ Args:
+ filename: file name to read
+ dir_path: base directory for the file
+ Returns: json data read to a dict
+ """
file_path = os.path.join(dir_path, f"{filename}")
if not os.path.isfile(file_path):
raise FileNotFoundError(f"{filename} was not found or is a directory")
@@ -77,8 +102,19 @@ def load_file(filename, dir_path):
def fetch_pubmed(references, file_name, data_dir=""):
- """method to fetch and populate references with pubmed"""
-
+ """
+ Fetches PubMed data from a JSON file and populates the\
+ references dictionary.
+
+ Args:
+ references (dict): Dictionary with document IDs as keys\
+ and reference data as values.
+ file_name (str): Name of the JSON file containing PubMed data.
+ data_dir (str): Base directory where the data files are located.
+
+ Returns:
+ dict: Updated references dictionary populated with the PubMed data.
+ """
try:
pubmed = load_file(file_name, os.path.join(data_dir, "gn-meta/lit"))
for reference in references:
@@ -92,44 +128,27 @@ def fetch_pubmed(references, file_name, data_dir=""):
return references
-def get_gnqa(query, auth_token, tmp_dir=""):
- """entry function for the gn3 api endpoint()"""
-
- api_client = GeneNetworkQAClient(requests.Session(), api_key=auth_token)
- res, task_id = api_client.ask('?ask=' + quote(query), auth_token)
- if task_id == 0:
- raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}")
- res, success = api_client.get_answer(task_id)
- if success == 1:
- resp_text = filter_response_text(res.text)
- if resp_text.get("data") is None:
- return task_id, "Please try to rephrase your question to receive feedback", []
- answer = resp_text['data']['answer']
- context = resp_text['data']['context']
- references = parse_context(
- context, DocIDs().getInfo, format_bibliography_info)
- references = fetch_pubmed(references, "pubmed.json", tmp_dir)
-
- return task_id, answer, references
- else:
- return task_id, "Please try to rephrase your question to receive feedback", []
-
-
-def fetch_query_results(query, user_id, redis_conn):
- """this method fetches prev user query searches"""
- result = redis_conn.get(f"LLM:{user_id}-{query}")
- if result:
- return json.loads(result)
- return {
- "query": query,
- "answer": "Sorry No answer for you",
- "references": [],
- "task_id": None
- }
-
-
-def get_user_queries(user_id, redis_conn):
- """methos to fetch all queries for a specific user"""
-
- results = redis_conn.keys(f"LLM:{user_id}*")
- return [query for query in [result.partition("-")[2] for result in results] if query != ""]
+def get_gnqa(query, auth_token, data_dir=""):
+ """entry function for the gn3 api endpoint()
+ ARGS:
+ query: the question to ask, e.g. what is a gene
+ auth_token: token to connect to api_client
+ data_dir: base directory for gn3 data
+ Returns:
+ task_id: fahamu unique identifier for task
+ answer
+ references: contains doc_name,reference,pub_med_info
+ """
+ api_client = GeneNetworkQAClient(api_key=auth_token)
+ res, task_id = api_client.ask('?ask=' + quote(query), query=query)
+ res, _status = api_client.get_answer(task_id)
+ resp_text = json.loads(''.join([str(char)
+ for char in res.text if char in string.printable]))
+ answer = re.sub(r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*',
+ lambda x: f"<a href='{x[0]}' target=_blank> {x[0]} </a>",
+ resp_text["data"]["answer"])
+ context = resp_text['data']['context']
+ return task_id, answer, fetch_pubmed(parse_context(
+ context, DocIDs().get_info,
+ format_bibliography_info),
+ "pubmed.json", data_dir)
diff --git a/gn3/llms/response.py b/gn3/llms/response.py
deleted file mode 100644
index 11cbd94..0000000
--- a/gn3/llms/response.py
+++ /dev/null
@@ -1,75 +0,0 @@
-
-# pylint: skip-file
-import string
-import json
-import os
-
-
-basedir = os.path.abspath(os.path.dirname(__file__))
-
-
-class DocIDs():
- def __init__(self):
- # open doc ids for GN refs
- self.doc_ids = self.loadFile("doc_ids.json")
- # open doc ids for Diabetes references
- self.sugar_doc_ids = self.loadFile("all_files.json")
- # format is not what I prefer, it needs to be rebuilt
- self.formatDocIDs(self.sugar_doc_ids)
-
- def loadFile(self, file_name):
- file_path = os.path.join(basedir, file_name)
- if os.path.isfile(file_path):
- f = open(file_path, "rb")
- result = json.load(f)
- f.close()
- return result
- else:
- raise Exception("\n{0} -- File does not exist\n".format(file_path))
-
- def formatDocIDs(self, values):
- for _key, _val in values.items():
- if isinstance(_val, list):
- for theObject in _val:
- docName = self.formatDocumentName(theObject['filename'])
- docID = theObject['id']
- self.doc_ids.update({docID: docName})
-
- def formatDocumentName(self, val):
- result = val.removesuffix('.pdf')
- result = result.removesuffix('.txt')
- result = result.replace('_', ' ')
- return result
-
-
- def getInfo(self, doc_id):
- if doc_id in self.doc_ids.keys():
- return self.doc_ids[doc_id]
- else:
- return doc_id
-
-class RespContext():
- def __init__(self, context):
- self.cntxt = context
- self.theObj = {}
-
- def parseIntoObject(self, info):
- # check for obj, arr, or val
- for key, val in info.items():
- if isinstance(val, list):
- self.parseIntoObject(val)
- elif isinstance(val, str) or isinstance(val, int):
- self.theObj[key] = val
- self.theObj[key] = self.val
-
-
-def createAccordionFromJson(theContext):
- result = ''
- # loop thru json array
- ndx = 0
- for docID, summaryLst in theContext.items():
- # item is a key with a list
- comboTxt = ''
- for entry in summaryLst:
- comboTxt += '\t' + entry['text']
- return result \ No newline at end of file