| author | Alexander Kabui | 2024-09-06 11:17:18 +0300 |
|---|---|---|
| committer | GitHub | 2024-09-06 11:17:18 +0300 |
| commit | cfeb54b776e95194381d26cff02ea738ad4fd3e0 (patch) | |
| tree | 1a7cf011bbeb61df90963d79237643bc9f8611f5 | |
| parent | 8e28770342b65cff78441670f1841e0130dc9c4b (diff) | |
| parent | 8cb85c8f8c12180702cfc3a257bf9a513ac4da3d (diff) | |
| download | genenetwork3-cfeb54b776e95194381d26cff02ea738ad4fd3e0.tar.gz | |
Merge pull request #188 from genenetwork/chores/merge-gnqa-api
Chores/merge gnqa api
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | gn3/api/llm.py | 244 |
| -rw-r--r-- | gn3/app.py | 4 |
| -rw-r--r-- | gn3/errors.py | 18 |
| -rw-r--r-- | gn3/llms/client.py | 223 |
| -rw-r--r-- | gn3/llms/errors.py | 31 |
| -rw-r--r-- | gn3/llms/process.py | 192 |
| -rw-r--r-- | gn3/llms/response.py | 75 |
| -rw-r--r-- | sql/update/llm_db_tables.sql | 47 |
| -rw-r--r-- | sql/update/llm_db_update.sql | 37 |
| -rw-r--r-- | tests/unit/test_llm.py | 132 |
10 files changed, 474 insertions, 529 deletions
diff --git a/gn3/api/llm.py b/gn3/api/llm.py index 7d860d8..7e60271 100644 --- a/gn3/api/llm.py +++ b/gn3/api/llm.py @@ -1,128 +1,150 @@ -"""API for data used to generate menus""" - -# pylint: skip-file +"""Api endpoints for gnqa""" +import json +from datetime import datetime -from flask import jsonify, request, Blueprint, current_app +from flask import Blueprint +from flask import current_app +from flask import jsonify +from flask import request -from functools import wraps from gn3.llms.process import get_gnqa -from gn3.llms.process import get_user_queries -from gn3.llms.process import fetch_query_results +from gn3.llms.errors import LLMError from gn3.auth.authorisation.oauth2.resource_server import require_oauth from gn3.auth import db -from redis import Redis -import json -import sqlite3 -from datetime import timedelta - -GnQNA = Blueprint("GnQNA", __name__) -def handle_errors(func): - @wraps(func) - def decorated_function(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as error: - return jsonify({"error": str(error)}), 500 - return decorated_function +gnqa = Blueprint("gnqa", __name__) -@GnQNA.route("/gnqna", methods=["POST"]) -def gnqa(): - # todo add auth +@gnqa.route("/search", methods=["PUT"]) +def search(): + """Api endpoint for searching queries in fahamu Api""" query = request.json.get("querygnqa", "") if not query: return jsonify({"error": "querygnqa is missing in the request"}), 400 - - try: - fahamu_token = current_app.config.get("FAHAMU_AUTH_TOKEN") - if fahamu_token is None: - return jsonify({"query": query, "error": "Use of invalid fahamu auth token"}), 500 - task_id, answer, refs = get_gnqa( - query, fahamu_token, current_app.config.get("DATA_DIR")) - response = { - "task_id": task_id, - "query": query, - "answer": answer, - "references": refs - } - with (Redis.from_url(current_app.config["REDIS_URI"], - decode_responses=True) as redis_conn): - # The key will be deleted after 60 seconds - redis_conn.setex(f"LLM:random_user-{query}", timedelta(days=10), json.dumps(response)) - return jsonify({ - **response, - "prev_queries": get_user_queries("random_user", redis_conn) - }) - except Exception as error: - return jsonify({"query": query, "error": f"Request failed-{str(error)}"}), 500 - - -@GnQNA.route("/rating/<task_id>", methods=["POST"]) + fahamu_token = current_app.config.get("FAHAMU_AUTH_TOKEN") + if not fahamu_token: + raise LLMError( + "Request failed: an LLM authorisation token is required ", query) + task_id, answer, refs = get_gnqa( + query, fahamu_token, current_app.config.get("DATA_DIR")) + response = { + "task_id": task_id, + "query": query, + "answer": answer, + "references": refs + } + with (db.connection(current_app.config["LLM_DB_PATH"]) as conn, + require_oauth.acquire("profile user") as token): + cursor = conn.cursor() + cursor.execute("""CREATE TABLE IF NOT EXISTS + history(user_id TEXT NOT NULL, + task_id TEXT NOT NULL, + query TEXT NOT NULL, + results JSONB, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY(task_id)) WITHOUT ROWID""") + cursor.execute( + """INSERT INTO history(user_id, task_id, query, results) + VALUES(?, ?, ?, ?) 
+ """, (str(token.user.user_id), str(task_id["task_id"]), + query, + json.dumps(response)) + ) + return response + + +@gnqa.route("/rating/<task_id>", methods=["POST"]) @require_oauth("profile") -def rating(task_id): - try: - llm_db_path = current_app.config["LLM_DB_PATH"] - with (require_oauth.acquire("profile") as token, - db.connection(llm_db_path) as conn): - - results = request.json - user_id, query, answer, weight = (token.user.user_id, - results.get("query"), - results.get("answer"), - results.get("weight", 0)) - cursor = conn.cursor() - create_table = """CREATE TABLE IF NOT EXISTS Rating( - user_id TEXT NOT NULL, - query TEXT NOT NULL, - answer TEXT NOT NULL, - weight INTEGER NOT NULL DEFAULT 0, - task_id TEXT NOT NULL UNIQUE - )""" - cursor.execute(create_table) - cursor.execute("""INSERT INTO Rating(user_id,query,answer,weight,task_id) - VALUES(?,?,?,?,?) - ON CONFLICT(task_id) DO UPDATE SET - weight=excluded.weight - """, (str(user_id), query, answer, weight, task_id)) +def rate_queries(task_id): + """Api endpoint for rating GNQA query and answer""" + with (require_oauth.acquire("profile") as token, + db.connection(current_app.config["LLM_DB_PATH"]) as conn): + results = request.json + user_id, query, answer, weight = (token.user.user_id, + results.get("query"), + results.get("answer"), + results.get("weight", 0)) + cursor = conn.cursor() + create_table = """CREATE TABLE IF NOT EXISTS Rating( + user_id TEXT NOT NULL, + query TEXT NOT NULL, + answer TEXT NOT NULL, + weight INTEGER NOT NULL DEFAULT 0, + task_id TEXT NOT NULL UNIQUE, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY(task_id))""" + cursor.execute(create_table) + cursor.execute("""INSERT INTO Rating(user_id, query, + answer, weight, task_id) + VALUES(?, ?, ?, ?, ?) + ON CONFLICT(task_id) DO UPDATE SET + weight=excluded.weight + """, (str(user_id), query, answer, weight, task_id)) return { - "message": "You have successfully rated this query:Thank you!!" - }, 200 - except sqlite3.Error as error: - return jsonify({"error": str(error)}), 500 - except Exception as error: - raise error + "message": "You have successfully rated this query. Thank you!" 
+ }, 200 -@GnQNA.route("/history/<query>", methods=["GET"]) +@gnqa.route("/search/records", methods=["GET"]) @require_oauth("profile user") -@handle_errors -def fetch_user_hist(query): - - with (require_oauth.acquire("profile user") as the_token, Redis.from_url(current_app.config["REDIS_URI"], - decode_responses=True) as redis_conn): - return jsonify({ - **fetch_query_results(query, the_token.user.id, redis_conn), - "prev_queries": get_user_queries("random_user", redis_conn) - }) - - -@GnQNA.route("/historys/<query>", methods=["GET"]) -@handle_errors -def fetch_users_hist_records(query): - """method to fetch all users hist:note this is a test functionality to be replaced by fetch_user_hist""" - - with Redis.from_url(current_app.config["REDIS_URI"], decode_responses=True) as redis_conn: - return jsonify({ - **fetch_query_results(query, "random_user", redis_conn), - "prev_queries": get_user_queries("random_user", redis_conn) - }) - - -@GnQNA.route("/get_hist_names", methods=["GET"]) -@handle_errors -def fetch_prev_hist_ids(): - - with (Redis.from_url(current_app.config["REDIS_URI"], decode_responses=True)) as redis_conn: - return jsonify({"prev_queries": get_user_queries("random_user", redis_conn)}) +def get_user_search_records(): + """get all history records for a given user using their + user id + """ + with (require_oauth.acquire("profile user") as token, + db.connection(current_app.config["LLM_DB_PATH"]) as conn): + cursor = conn.cursor() + cursor.execute( + """SELECT task_id, query, created_at from history WHERE user_id=?""", + (str(token.user.user_id),)) + results = [dict(item) for item in cursor.fetchall()] + return jsonify(sorted(results, reverse=True, + key=lambda x: datetime.strptime(x.get("created_at"), + '%Y-%m-%d %H:%M:%S'))) + + +@gnqa.route("/search/record/<task_id>", methods=["GET"]) +@require_oauth("profile user") +def get_user_record_by_task(task_id): + """Get user previous search record by task id """ + with (require_oauth.acquire("profile user") as token, + db.connection(current_app.config["LLM_DB_PATH"]) as conn): + cursor = conn.cursor() + cursor.execute( + """SELECT results from history + Where task_id=? and user_id=?""", + (task_id, + str(token.user.user_id),)) + record = cursor.fetchone() + if record: + return dict(record).get("results") + return {} + + +@gnqa.route("/search/record/<task_id>", methods=["DELETE"]) +@require_oauth("profile user") +def delete_record(task_id): + """Delete user previous seach record by task-id""" + with (require_oauth.acquire("profile user") as token, + db.connection(current_app.config["LLM_DB_PATH"]) as conn): + cursor = conn.cursor() + query = """DELETE FROM history + WHERE task_id=? and user_id=?""" + cursor.execute(query, (task_id, token.user.user_id,)) + return {"msg": f"Successfully Deleted the task {task_id}"} + + +@gnqa.route("/search/records", methods=["DELETE"]) +@require_oauth("profile user") +def delete_records(): + """ Delete a users records using for all given task ids""" + with (require_oauth.acquire("profile user") as token, + db.connection(current_app.config["LLM_DB_PATH"]) as conn): + task_ids = list(request.json.values()) + cursor = conn.cursor() + query = """DELETE FROM history + WHERE task_id IN ({}) + and user_id=?""".format(",".join("?" 
* len(task_ids))) + cursor.execute(query, (*task_ids, str(token.user.user_id),)) + return jsonify({}) @@ -25,7 +25,7 @@ from gn3.api.menu import menu from gn3.api.search import search from gn3.api.metadata import metadata from gn3.api.sampledata import sampledata -from gn3.api.llm import GnQNA +from gn3.api.llm import gnqa from gn3.auth import oauth2 from gn3.case_attributes import caseattr @@ -78,7 +78,7 @@ def create_app(config: Union[Dict, str, None] = None) -> Flask: app.register_blueprint(sampledata, url_prefix="/api/sampledata") app.register_blueprint(oauth2, url_prefix="/api/oauth2") app.register_blueprint(caseattr, url_prefix="/api/case-attribute") - app.register_blueprint(GnQNA, url_prefix="/api/llm") + app.register_blueprint(gnqa, url_prefix="/api/llm") register_error_handlers(app) return app diff --git a/gn3/errors.py b/gn3/errors.py index f618bab..ec7a554 100644 --- a/gn3/errors.py +++ b/gn3/errors.py @@ -17,7 +17,7 @@ from flask import Flask, jsonify, Response, current_app from gn3.oauth2 import errors as oautherrors from gn3.auth.authorisation.errors import AuthorisationError - +from gn3.llms.errors import LLMError def add_trace(exc: Exception, jsonmsg: dict) -> dict: """Add the traceback to the error handling object.""" @@ -118,6 +118,21 @@ def handle_local_authorisation_errors(exc: oautherrors.AuthorisationError): })), 400 +def handle_llm_error(exc: Exception) -> Response: + """ Handle llm erros if not handled anywhere else. """ + current_app.logger.error(exc) + resp = jsonify({ + "query": exc.args[1], + "error_type": type(exc).__name__, + "error": ( + exc.args[0] if bool(exc.args) else "Fahamu gnqa error occurred" + ), + "trace": traceback.format_exc() + }) + resp.status_code = 500 + return resp + + def register_error_handlers(app: Flask): """Register application-level error handlers.""" app.register_error_handler(NotFound, page_not_found) @@ -127,6 +142,7 @@ def register_error_handlers(app: Flask): app.register_error_handler(AuthorisationError, handle_authorisation_error) app.register_error_handler(RemoteDisconnected, internal_server_error) app.register_error_handler(URLError, url_server_error) + app.register_error_handler(LLMError, handle_llm_error) for exc in ( EndPointInternalError, EndPointNotFound, diff --git a/gn3/llms/client.py b/gn3/llms/client.py index 042becd..54a7a17 100644 --- a/gn3/llms/client.py +++ b/gn3/llms/client.py @@ -1,72 +1,59 @@ -# pylint: skip-file +"""Module Contains code for making request to fahamu Api""" +# pylint: disable=C0301 import json -import string -import os -import datetime import time -import requests - -from requests import Session -from urllib.parse import urljoin -from requests.packages.urllib3.util.retry import Retry -from requests import HTTPError from requests import Session from requests.adapters import HTTPAdapter -from urllib.request import urlretrieve -from urllib.parse import quote -from gn3.llms.errors import UnprocessableEntity -from gn3.llms.errors import LLMError +from requests.adapters import Retry -basedir = os.path.join(os.path.dirname(__file__)) +from gn3.llms.errors import LLMError class TimeoutHTTPAdapter(HTTPAdapter): + """Set a default timeout for HTTP calls """ def __init__(self, timeout, *args, **kwargs): - """TimeoutHTTPAdapter constructor. - Args: - timeout (int): How many seconds to wait for the server to send data before - giving up. 
- """ + """TimeoutHTTPAdapter constructor.""" self.timeout = timeout super().__init__(*args, **kwargs) - def send(self, request, **kwargs): + def send(self, *args, **kwargs): """Override :obj:`HTTPAdapter` send method to add a default timeout.""" - timeout = kwargs.get("timeout") - if timeout is None: - kwargs["timeout"] = self.timeout - - return super().send(request, **kwargs) + kwargs["timeout"] = ( + kwargs["timeout"] if kwargs.get("timeout") else self.timeout + ) + return super().send(*args, **kwargs) class GeneNetworkQAClient(Session): """GeneNetworkQA Client This class provides a client object interface to the GeneNetworkQA API. - It extends the `requests.Session` class and includes authorization, base URL, + It extends the `requests.Session` class and includes authorization, + base URL, request timeouts, and request retries. Args: - account (str): Base address subdomain. api_key (str): API key. - version (str, optional): API version, defaults to "v3". timeout (int, optional): Timeout value, defaults to 5. total_retries (int, optional): Total retries value, defaults to 5. - backoff_factor (int, optional): Retry backoff factor value, defaults to 30. + backoff_factor (int, optional): Retry backoff factor value, + defaults to 30. Usage: from genenetworkqa import GeneNetworkQAClient - gnqa = GeneNetworkQAClient(account="account-name", api_key="XXXXXXXXXXXXXXXXXXX...") + gnqa = GeneNetworkQAClient(account="account-name", + api_key="XXXXXXXXXXXXXXXXXXX...") """ - BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks' - - def __init__(self, account, api_key, version="v3", timeout=30, total_retries=5, backoff_factor=30): + def __init__(self, api_key, timeout=30, + total_retries=5, backoff_factor=2): super().__init__() self.headers.update( {"Authorization": "Bearer " + api_key}) - self.answer_url = f"{self.BASE_URL}/answers" - self.feedback_url = f"{self.BASE_URL}/feedback" + self.base_url = "https://genenetwork.fahamuai.com/api/tasks" + self.answer_url = f"{self.base_url}/answers" + self.feedback_url = f"{self.base_url}/feedback" + self.query = "" adapter = TimeoutHTTPAdapter( timeout=timeout, @@ -81,140 +68,54 @@ class GeneNetworkQAClient(Session): self.mount("http://", adapter) @staticmethod - def format_bibliography_info(bib_info): - - if isinstance(bib_info, str): - # Remove '.txt' - bib_info = bib_info.removesuffix('.txt') - elif isinstance(bib_info, dict): - # Format string bibliography information - bib_info = "{0}.{1}.{2}.{3} ".format(bib_info.get('author', ''), - bib_info.get('title', ''), - bib_info.get('year', ''), - bib_info.get('doi', '')) - return bib_info - - @staticmethod - def ask_the_documents(extend_url, my_auth): - try: - response = requests.post( - base_url + extend_url, data={}, headers=my_auth) - response.raise_for_status() - except requests.exceptions.RequestException as e: - # Handle the exception appropriately, e.g., log the error - raise RuntimeError(f"Error making the request: {e}") - - if response.status_code != 200: - return negative_status_msg(response), 0 - - task_id = get_task_id_from_result(response) - response = get_answer_using_task_id(task_id, my_auth) - - if response.status_code != 200: - - return negative_status_msg(response), 0 - - return response, 1 - - @staticmethod def negative_status_msg(response): + """ handler for non 200 response from fahamu api""" return f"Error: Status code -{response.status_code}- Reason::{response.reason}" - # return f"Problems\n\tStatus code => {response.status_code}\n\tReason => {response.reason}" - - def ask(self, exUrl, 
*args, **kwargs): - askUrl = self.BASE_URL + exUrl - res = self.custom_request('POST', askUrl, *args, **kwargs) - if (res.status_code != 200): - return self.negative_status_msg(res), 0 - task_id = self.getTaskIDFromResult(res) - return res, task_id - - def get_answer(self, taskid, *args, **kwargs): - query = self.answer_url + self.extendTaskID(taskid) + + def ask(self, ex_url, query, *args, **kwargs): + """fahamu ask api interface""" + self.query = query + res = self.custom_request('POST', f"{self.base_url}{ex_url}", *args, **kwargs) + return res, json.loads(res.text) + + def get_answer(self, task_obj, *args, **kwargs): + """Fahamu get answer interface""" + query = f"{self.answer_url}?task_id={task_obj['task_id']}" res = self.custom_request('GET', query, *args, **kwargs) - if (res.status_code != 200): - return self.negative_status_msg(res), 0 return res, 1 def custom_request(self, method, url, *args, **kwargs): - + """ + Make a custom request to the Fahamu API to ask and get a response. + This is a custom method, which is the current default for fetching items, + as it overrides the adapter provided above. + This function was created to debug the slow response rate of Fahamu and + provide custom a response. + """ max_retries = 50 retry_delay = 3 - - for i in range(max_retries): - try: - response = super().request(method, url, *args, **kwargs) - response.raise_for_status() - - except requests.exceptions.HTTPError as error: - if error.response.status_code ==500: - raise LLMError(error.request, error.response, f"Response Error,status_code:{error.response.status_code},Reason: Use of Invalid Token") - elif error.response.status_code ==404: - raise LLMError(error.request,error.response,f"404 Client Error: Not Found for url: {self.BASE_URL}") - raise error - - except requests.exceptions.RequestException as error: - raise error - - - - + response_msg = { + 404: "Api endpoint Does not exist", + 500: "Use of Invalid Token/or the Fahamu Api is currently down", + 400: "You sent a bad Fahamu request", + 401: "You do not have authorization to perform the request", + } + for _i in range(max_retries): + response = super().request(method, url, *args, **kwargs) if response.ok: - if method.lower() == "get" and response.json().get("data") is None: + if method.lower() == "get" and not response.json().get("data"): + # note this is a dirty trick to check if fahamu has returned the results + # the issue is that the api only returns 500 or 200 satus code + # TODO: fix this on their end time.sleep(retry_delay) continue - else: - return response + return response else: - time.sleep(retry_delay) - return response - - @staticmethod - def get_task_id_from_result(response): - task_id = json.loads(response.text) - result = f"?task_id={task_id.get('task_id', '')}" - return result - - @staticmethod - def get_answer_using_task_id(extend_url, my_auth): - try: - response = requests.get( - answer_url + extend_url, data={}, headers=my_auth) - response.raise_for_status() - return response - except requests.exceptions.RequestException as error: - # Handle the exception appropriately, e.g., log the error - raise error - - @staticmethod - def filter_response_text(val): - """ - Filters out non-printable characters from the input string and parses it as JSON. - - Args: - val (str): Input string to be filtered and parsed. - - Returns: - dict: Parsed JSON object. 
- # remove this - """ - return json.loads(''.join([str(char) for char in val if char in string.printable])) - - def getTaskIDFromResult(self, res): - return json.loads(res.text) - - def extendTaskID(self, task_id): - return '?task_id=' + str(task_id['task_id']) - - def get_gnqa(self, query): - qstr = quote(query) - res, task_id = api_client.ask('?ask=' + qstr) - res, success = api_client.get_answer(task_id) - - if success == 1: - resp_text = filter_response_text(res.text) - answer = resp_text.get('data', {}).get('answer', '') - context = resp_text.get('data', {}).get('context', '') - return answer, context - else: - return res, "Unfortunately, I have nothing." + raise LLMError(f"Request error with code:\ + {response.status_code} occurred with reason:\ + {response_msg.get(response.status_code,response.reason)}", + self.query) + #time.sleep(retry_delay) + raise LLMError("Timeout error: We couldn't provide a response,Please try\ + to rephrase your question to receive feedback", + self.query) diff --git a/gn3/llms/errors.py b/gn3/llms/errors.py index e9f7c02..a3a47a3 100644 --- a/gn3/llms/errors.py +++ b/gn3/llms/errors.py @@ -1,32 +1,11 @@ - -# pylint: skip-file +""" Error handlers for Fahamu Api""" import json - from requests import HTTPError class UnprocessableEntity(HTTPError): - """An HTTP 422 Unprocessable Entity error occurred. - + """Error for HTTP 422 Unprocessable Entity https://help.helpjuice.com/en_US/api-v3/api-v3#errors - - The request could not be processed, usually due to a missing or invalid parameter. - - The response will also include an error object with an explanation of fields that - are missing or invalid. Here is an example: - - .. code-block:: - - HTTP/1.1 422 Unprocessable Entity - - - { - "errors": [ - { - "email": "is not valid." 
- } - ] - } """ def __init__(self, request, response): @@ -56,7 +35,5 @@ class UnprocessableEntity(HTTPError): msg, request=request, response=response) -class LLMError(HTTPError): - def __init__(self, request, response, msg): - super(HTTPError, self).__init__( - msg, request=request, response=response) +class LLMError(Exception): + """custom exception for LLMErrorMIxins""" diff --git a/gn3/llms/process.py b/gn3/llms/process.py index e38b73e..b8e47e7 100644 --- a/gn3/llms/process.py +++ b/gn3/llms/process.py @@ -1,25 +1,54 @@ """this module contains code for processing response from fahamu client.py""" +# pylint: disable=C0301 import os +import re import string import json - -from urllib.parse import urljoin -from urllib.parse import quote import logging -import requests +from urllib.parse import quote from gn3.llms.client import GeneNetworkQAClient -from gn3.llms.response import DocIDs BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks' - - -# pylint: disable=C0301 - +BASEDIR = os.path.abspath(os.path.dirname(__file__)) + + +class DocIDs(): + """ Class Method to Parse document id and names from files""" + def __init__(self): + """ + init method for Docids + * doc_ids.json: open doc_ids for gn references + * sugar_doc_ids: open doc_ids for diabetes references + """ + self.doc_ids = load_file("doc_ids.json", BASEDIR) + sugar_doc_ids = load_file("all_files.json", BASEDIR) + self.format_doc_ids(sugar_doc_ids) + + def format_doc_ids(self, docs): + """method to format doc_ids for list items doc_id and doc_name""" + for _key, val in docs.items(): + if isinstance(val, list): + for doc_obj in val: + doc_name = doc_obj["filename"].removesuffix(".pdf").removesuffix(".txt").replace("_", "") + self.doc_ids.update({doc_obj["id"]: doc_name}) + + def get_info(self, doc_id): + """ interface to make read from doc_ids + and extract info data else returns + doc_id + Args: + doc_id: str: a search key for doc_ids + Returns: + an object if doc id exists else + raises a KeyError + """ + return self.doc_ids[doc_id] def format_bibliography_info(bib_info): - """Function for formatting bibliography info""" + """Utility function for formatting bibliography info + """ if isinstance(bib_info, str): return bib_info.removesuffix('.txt') elif isinstance(bib_info, dict): @@ -27,58 +56,62 @@ def format_bibliography_info(bib_info): return bib_info -def filter_response_text(val): - """helper function for filtering non-printable chars""" - return json.loads(''.join([str(char) - for char in val if char in string.printable])) - - def parse_context(context, get_info_func, format_bib_func): - """function to parse doc_ids content""" + """Function to parse doc_ids content + Args: + context: raw references from fahamu api + get_info_func: function to get doc_ids info + format_bib_func: function to foramt bibliography info + Returns: + an list with each item having (doc_id,bib_info, + combined reference text) + """ results = [] for doc_ids, summary in context.items(): combo_txt = "" for entry in summary: combo_txt += "\t" + entry["text"] - doc_info = get_info_func(doc_ids) - bib_info = doc_ids if doc_ids == doc_info else format_bib_func( - doc_info) + try: + doc_info = get_info_func(doc_ids) + bib_info = format_bib_func(doc_info) + except KeyError: + bib_info = doc_ids + pattern = r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*' + combo_text = re.sub(pattern, + lambda x: f"<a href='{x[0]}' target=_blank> {x[0]} </a>", + combo_txt) results.append( - {"doc_id": doc_ids, "bibInfo": bib_info, "comboTxt": combo_txt}) + {"doc_id": 
doc_ids, "bibInfo": bib_info, + "comboTxt": combo_text}) return results -def rate_document(task_id, doc_id, rating, auth_token): - """This method is used to provide feedback for a document by making a rating.""" - # todo move this to clients - try: - url = urljoin(BASE_URL, - f"""/feedback?task_id={task_id}&document_id={doc_id}&feedback={rating}""") - headers = {"Authorization": f"Bearer {auth_token}"} - - resp = requests.post(url, headers=headers) - resp.raise_for_status() - - return {"status": "success", **resp.json()} - except requests.exceptions.HTTPError as http_error: - raise RuntimeError(f"HTTP Error Occurred:\ - {http_error.response.text} -with status code- {http_error.response.status_code}") from http_error - except Exception as error: - raise RuntimeError(f"An error occurred: {str(error)}") from error - - def load_file(filename, dir_path): - """function to open and load json file""" - file_path = os.path.join(dir_path, f"{filename}") - if not os.path.isfile(file_path): - raise FileNotFoundError(f"{filename} was not found or is a directory") - with open(file_path, "rb") as file_handler: + """Utility function to read json file + Args: + filename: file name to read + dir_path: base directory for the file + Returns: json data read to a dict + """ + with open(os.path.join(dir_path, f"{filename}"), + "rb") as file_handler: return json.load(file_handler) def fetch_pubmed(references, file_name, data_dir=""): - """method to fetch and populate references with pubmed""" - + """ + Fetches PubMed data from a JSON file and populates the\ + references dictionary. + + Args: + references (dict): Dictionary with document IDs as keys\ + and reference data as values. + filename (str): Name of the JSON file containing PubMed data. + data_dir (str): Base directory where the data files are located. + + Returns: + dict: Updated references dictionary populated with the PubMed data. 
+ """ try: pubmed = load_file(file_name, os.path.join(data_dir, "gn-meta/lit")) for reference in references: @@ -92,44 +125,27 @@ def fetch_pubmed(references, file_name, data_dir=""): return references -def get_gnqa(query, auth_token, tmp_dir=""): - """entry function for the gn3 api endpoint()""" - - api_client = GeneNetworkQAClient(requests.Session(), api_key=auth_token) - res, task_id = api_client.ask('?ask=' + quote(query), auth_token) - if task_id == 0: - raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}") - res, success = api_client.get_answer(task_id) - if success == 1: - resp_text = filter_response_text(res.text) - if resp_text.get("data") is None: - return task_id, "Please try to rephrase your question to receive feedback", [] - answer = resp_text['data']['answer'] - context = resp_text['data']['context'] - references = parse_context( - context, DocIDs().getInfo, format_bibliography_info) - references = fetch_pubmed(references, "pubmed.json", tmp_dir) - - return task_id, answer, references - else: - return task_id, "Please try to rephrase your question to receive feedback", [] - - -def fetch_query_results(query, user_id, redis_conn): - """this method fetches prev user query searches""" - result = redis_conn.get(f"LLM:{user_id}-{query}") - if result: - return json.loads(result) - return { - "query": query, - "answer": "Sorry No answer for you", - "references": [], - "task_id": None - } - - -def get_user_queries(user_id, redis_conn): - """methos to fetch all queries for a specific user""" - - results = redis_conn.keys(f"LLM:{user_id}*") - return [query for query in [result.partition("-")[2] for result in results] if query != ""] +def get_gnqa(query, auth_token, data_dir=""): + """entry function for the gn3 api endpoint() + ARGS: + query: what is a gene + auth_token: token to connect to api_client + data_dir: base datirectory for gn3 data + Returns: + task_id: fahamu unique identifier for task + answer + references: contains doc_name,reference,pub_med_info + """ + api_client = GeneNetworkQAClient(api_key=auth_token) + res, task_id = api_client.ask('?ask=' + quote(query), query=query) + res, _status = api_client.get_answer(task_id) + resp_text = json.loads(''.join([str(char) + for char in res.text if char in string.printable])) + answer = re.sub(r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*', + lambda x: f"<a href='{x[0]}' target=_blank> {x[0]} </a>", + resp_text["data"]["answer"]) + context = resp_text['data']['context'] + return task_id, answer, fetch_pubmed(parse_context( + context, DocIDs().get_info, + format_bibliography_info), + "pubmed.json", data_dir) diff --git a/gn3/llms/response.py b/gn3/llms/response.py deleted file mode 100644 index 11cbd94..0000000 --- a/gn3/llms/response.py +++ /dev/null @@ -1,75 +0,0 @@ - -# pylint: skip-file -import string -import json -import os - - -basedir = os.path.abspath(os.path.dirname(__file__)) - - -class DocIDs(): - def __init__(self): - # open doc ids for GN refs - self.doc_ids = self.loadFile("doc_ids.json") - # open doc ids for Diabetes references - self.sugar_doc_ids = self.loadFile("all_files.json") - # format is not what I prefer, it needs to be rebuilt - self.formatDocIDs(self.sugar_doc_ids) - - def loadFile(self, file_name): - file_path = os.path.join(basedir, file_name) - if os.path.isfile(file_path): - f = open(file_path, "rb") - result = json.load(f) - f.close() - return result - else: - raise Exception("\n{0} -- File does not exist\n".format(file_path)) - - def formatDocIDs(self, values): - for _key, _val in 
values.items(): - if isinstance(_val, list): - for theObject in _val: - docName = self.formatDocumentName(theObject['filename']) - docID = theObject['id'] - self.doc_ids.update({docID: docName}) - - def formatDocumentName(self, val): - result = val.removesuffix('.pdf') - result = result.removesuffix('.txt') - result = result.replace('_', ' ') - return result - - - def getInfo(self, doc_id): - if doc_id in self.doc_ids.keys(): - return self.doc_ids[doc_id] - else: - return doc_id - -class RespContext(): - def __init__(self, context): - self.cntxt = context - self.theObj = {} - - def parseIntoObject(self, info): - # check for obj, arr, or val - for key, val in info.items(): - if isinstance(val, list): - self.parseIntoObject(val) - elif isinstance(val, str) or isinstance(val, int): - self.theObj[key] = val - self.theObj[key] = self.val - - -def createAccordionFromJson(theContext): - result = '' - # loop thru json array - ndx = 0 - for docID, summaryLst in theContext.items(): - # item is a key with a list - comboTxt = '' - for entry in summaryLst: - comboTxt += '\t' + entry['text'] - return result
\ No newline at end of file diff --git a/sql/update/llm_db_tables.sql b/sql/update/llm_db_tables.sql new file mode 100644 index 0000000..b501832 --- /dev/null +++ b/sql/update/llm_db_tables.sql @@ -0,0 +1,47 @@ +-- llm_db_update.sql --- + +-- Copyright (C) 2024 Alexander kabui <alexanderkabua@gmail.com> + +-- Author: Alexander Kabui <alexanderkabua@gmail.com> + +-- This program is free software; you can redistribute it and/or +-- modify it under the terms of the GNU General Public License +-- as published by the Free Software Foundation; either version 3 +-- of the License, or (at your option) any later version. + +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU General Public License for more details. + +-- You should have received a copy of the GNU General Public License +-- along with this program. If not, see <http://www.gnu.org/licenses/>. + +-- Sql file to create the tables for history rating and adding indexing for the history table +-- this targets setting up a new db +-- and adding timestamp column the Rating table + + +CREATE TABLE IF NOT EXISTS history ( + user_id TEXT NOT NULL, + task_id TEXT NOT NULL, + query TEXT NOT NULL, + results JSONB, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (task_id) +) WITHOUT ROWID; + + +CREATE INDEX IF NOT EXISTS idx_tbl_history_cols_task_id_user_id +ON history (task_id, user_id); + + + +CREATE TABLE IF NOT EXISTS Rating( + user_id TEXT NOT NULL, + query TEXT NOT NULL, + answer TEXT NOT NULL, + weight INTEGER NOT NULL DEFAULT 0, + task_id TEXT NOT NULL UNIQUE, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (task_id)); diff --git a/sql/update/llm_db_update.sql b/sql/update/llm_db_update.sql new file mode 100644 index 0000000..7f1a9f9 --- /dev/null +++ b/sql/update/llm_db_update.sql @@ -0,0 +1,37 @@ +-- llm_db_update.sql --- + +-- Copyright (C) 2024 Alexander kabui <alexanderkabua@gmail.com> + +-- Author: Alexander Kabui <alexanderkabua@gmail.com> + +-- This program is free software; you can redistribute it and/or +-- modify it under the terms of the GNU General Public License +-- as published by the Free Software Foundation; either version 3 +-- of the License, or (at your option) any later version. + +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU General Public License for more details. + +-- You should have received a copy of the GNU General Public License +-- along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +-- Sql file to create the history table, adding indexing for the history table +-- and adding timestamp column the Rating table + + +CREATE TABLE IF NOT EXISTS history ( + user_id TEXT NOT NULL, + task_id TEXT NOT NULL, + query TEXT NOT NULL, + results JSONB, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (task_id) +) WITHOUT ROWID; + + +CREATE INDEX IF NOT EXISTS idx_tbl_history_cols_task_id_user_id +ON history (task_id, user_id); + +ALTER TABLE Rating ADD COLUMN created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP; diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py index 7b8a970..8fbaba6 100644 --- a/tests/unit/test_llm.py +++ b/tests/unit/test_llm.py @@ -1,10 +1,9 @@ -# pylint: disable=unused-argument -"""Test cases for procedures defined in llms module""" -from dataclasses import dataclass +"""Test cases for procedures defined in llms """ +# pylint: disable=C0301 import pytest -from gn3.llms.process import get_gnqa +from gn3.llms.process import fetch_pubmed from gn3.llms.process import parse_context - +from gn3.llms.process import format_bibliography_info @pytest.mark.unit_test @@ -36,67 +35,72 @@ def test_parse_context(): assert parsed_result == expected_result - -@dataclass(frozen=True) -class MockResponse: - """mock a response object""" - text: str - - def __getattr__(self, name: str): - return self.__dict__[f"_{name}"] - - -class MockGeneNetworkQAClient: - """mock the GeneNetworkQAClient class""" - - def __init__(self, session, api_key): - pass - - def ask(self, query, auth_token): - """mock method for ask query""" - # Simulate the ask method - return MockResponse("Mock response"), "F400995EAFE104EA72A5927CE10C73B7" - - def get_answer(self, task_id): - """mock get_answer method""" - return MockResponse("Mock answer"), 1 - - -def mock_filter_response_text(text): - """ method to simulate the filterResponseText method""" - return {"data": {"answer": "Mock answer for what is a gene", "context": {}}} - - -def mock_parse_context(context, get_info_func, format_bib_func): - """method to simulate the parse context method""" - return [] - - @pytest.mark.unit_test -def test_get_gnqa(monkeypatch): - """test for process.get_gnqa functoin""" - monkeypatch.setattr( - "gn3.llms.process.GeneNetworkQAClient", - MockGeneNetworkQAClient - ) +def test_format_bib_info(): + """Test for formatting bibliography info """ + mock_fahamu_bib_info = [ + { + "author": "J.m", + "firstName": "john", + "title": "Genes and aging", + "year": 2013, + "doi": "https://Articles.com/12231" + }, + "2019-Roy-Evaluation of Sirtuin-3 probe quality and co-expressed genes", + "2015 - Differential regional and cellular distribution of TFF3 peptide in the human brain.txt"] + expected_result = [ + "J.m.Genes and aging.2013.https://Articles.com/12231 ", + "2019-Roy-Evaluation of Sirtuin-3 probe quality and co-expressed genes", + "2015 - Differential regional and cellular distribution of TFF3 peptide in the human brain" + ] - monkeypatch.setattr( - 'gn3.llms.process.filter_response_text', - mock_filter_response_text - ) - monkeypatch.setattr( - 'gn3.llms.process.parse_context', - mock_parse_context - ) + assert all((format_bibliography_info(data) == expected + for data, expected + in zip(mock_fahamu_bib_info, expected_result))) - query = "What is a gene" - auth_token = "test_token" - result = get_gnqa(query, auth_token) - expected_result = ( - "F400995EAFE104EA72A5927CE10C73B7", - 'Mock answer for what is a gene', - [] - ) +@pytest.mark.unit_test +def test_fetching_pubmed_info(monkeypatch): + """Test for 
fetching and populating pubmed data with pubmed info""" + def mock_load_file(_filename, _dir_path): + return { + "12121": { + "Abstract": "items1", + "Author": "A1" + } + } + # patch the module with the mocked function + + monkeypatch.setattr("gn3.llms.process.load_file", mock_load_file) + expected_results = [ + { + "title": "Genes", + "year": "2014", + "doi": "https/article/genes/12121", + "doc_id": "12121", + "pubmed": { + "Abstract": "items1", + "Author": "A1" + } + }, + { + "title": "Aging", + "year": "2014", + "doc_id": "12122" + } + ] - assert result == expected_result + data = [{ + "title": "Genes", + "year": "2014", + "doi": "https/article/genes/12121", + "doc_id": "12121", + }, + { + "title": "Aging", + "year": "2014", + "doc_id": "12122" + }] + + assert (fetch_pubmed(data, "/pubmed.json", "data/") + == expected_results) |
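The new `/search` and `/rating` endpoints in `gn3/api/llm.py` persist GNQA results and ratings in SQLite, creating the tables on demand and using an `ON CONFLICT` upsert so a repeat rating only updates the weight. Below is a minimal, self-contained sketch of that storage pattern using the stdlib `sqlite3` module directly rather than the `gn3.auth.db` connection helper; the table definitions follow the diff, while the user id, task id and response payload are invented sample values. The batch delete at the end mirrors the placeholder-expansion trick used by the `DELETE /search/records` handler.

```python
import json
import sqlite3

conn = sqlite3.connect(":memory:")   # the real code opens LLM_DB_PATH
cursor = conn.cursor()

# history keeps one row per GNQA task, keyed by the Fahamu task id
cursor.execute("""CREATE TABLE IF NOT EXISTS history(
                      user_id TEXT NOT NULL,
                      task_id TEXT NOT NULL,
                      query TEXT NOT NULL,
                      results JSONB,
                      created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                      PRIMARY KEY(task_id)) WITHOUT ROWID""")

# Rating is also keyed by task_id so re-rating a task updates the old row
cursor.execute("""CREATE TABLE IF NOT EXISTS Rating(
                      user_id TEXT NOT NULL,
                      query TEXT NOT NULL,
                      answer TEXT NOT NULL,
                      weight INTEGER NOT NULL DEFAULT 0,
                      task_id TEXT NOT NULL UNIQUE,
                      created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                      PRIMARY KEY(task_id))""")

response = {"task_id": "TASK-1", "query": "what is a gene",
            "answer": "a unit of heredity", "references": []}
cursor.execute("INSERT INTO history(user_id, task_id, query, results) "
               "VALUES(?, ?, ?, ?)",
               ("user-1", "TASK-1", "what is a gene", json.dumps(response)))

# Upsert: a second rating for the same task only changes the weight
cursor.execute("""INSERT INTO Rating(user_id, query, answer, weight, task_id)
                  VALUES(?, ?, ?, ?, ?)
                  ON CONFLICT(task_id) DO UPDATE SET weight=excluded.weight""",
               ("user-1", "what is a gene", "a unit of heredity", 1, "TASK-1"))

# Batch delete, expanding one "?" placeholder per task id
task_ids = ["TASK-1", "TASK-2"]
placeholders = ",".join("?" * len(task_ids))
cursor.execute(f"DELETE FROM history WHERE task_id IN ({placeholders}) "
               "AND user_id=?",
               (*task_ids, "user-1"))
conn.commit()
```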
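`/search/records` returns the stored history ordered newest-first; because `created_at` comes back as a `CURRENT_TIMESTAMP` string, the endpoint sorts in Python with `datetime.strptime`. A small illustration with made-up rows:

```python
from datetime import datetime

rows = [
    {"task_id": "TASK-1", "query": "what is a gene",
     "created_at": "2024-09-06 08:15:00"},
    {"task_id": "TASK-2", "query": "what is ageing",
     "created_at": "2024-09-06 09:30:00"},
]
newest_first = sorted(rows, reverse=True,
                      key=lambda row: datetime.strptime(row["created_at"],
                                                        "%Y-%m-%d %H:%M:%S"))
print([row["task_id"] for row in newest_first])   # ['TASK-2', 'TASK-1']
```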
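`gn3/errors.py` now registers a handler so that any uncaught `LLMError` is returned as a JSON 500 response that echoes the offending query. Here is a stripped-down sketch against a throwaway Flask app rather than the real `create_app()` factory; the payload keys follow the diff, and the route used to trigger the error is purely illustrative.

```python
import traceback

from flask import Flask, jsonify


class LLMError(Exception):
    """Custom exception carrying (message, query) for GNQA failures."""


def handle_llm_error(exc):
    # Mirrors the shape of the error object built in gn3/errors.py
    resp = jsonify({
        "query": exc.args[1] if len(exc.args) > 1 else "",
        "error_type": type(exc).__name__,
        "error": exc.args[0] if exc.args else "Fahamu gnqa error occurred",
        "trace": traceback.format_exc(),
    })
    resp.status_code = 500
    return resp


app = Flask(__name__)
app.register_error_handler(LLMError, handle_llm_error)


@app.route("/search")
def search():
    raise LLMError("an LLM authorisation token is required", "what is a gene")
```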
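The rewritten `gn3/llms/client.py` builds its session from two pieces: an `HTTPAdapter` subclass that injects a default timeout, and urllib3's `Retry` for transient failures. The sketch below reproduces that wiring; the exact `Retry` parameters are not visible in the diff hunk, so the `status_forcelist` shown here is an assumption, and the API token is a placeholder.

```python
from requests import Session
from requests.adapters import HTTPAdapter, Retry


class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that falls back to a default timeout when none is given."""

    def __init__(self, timeout, *args, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, *args, **kwargs):
        # Apply the default timeout unless the caller supplied one
        kwargs["timeout"] = kwargs.get("timeout") or self.timeout
        return super().send(*args, **kwargs)


def build_session(api_key, timeout=30, total_retries=5, backoff_factor=2):
    session = Session()
    session.headers.update({"Authorization": f"Bearer {api_key}"})
    adapter = TimeoutHTTPAdapter(
        timeout=timeout,
        max_retries=Retry(total=total_retries,
                          backoff_factor=backoff_factor,
                          # assumed retry statuses, not taken from the diff
                          status_forcelist=[429, 500, 502, 503, 504]))
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session


session = build_session("FAHAMU_TOKEN_PLACEHOLDER")
# e.g. session.post("https://genenetwork.fahamuai.com/api/tasks?ask=what+is+a+gene")
```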
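Because the Fahamu answers endpoint keeps returning 200 with an empty `data` field until the task finishes, `GeneNetworkQAClient.custom_request` polls in a loop and converts failures into an `LLMError` that carries the original query. A simplified, self-contained version of that loop follows; the `poll_for_answer` name, the URL handling and the retry counts are illustrative, not the actual method.

```python
import time


class LLMError(Exception):
    """Custom exception raised for Fahamu/GNQA request failures."""


def poll_for_answer(session, url, query, max_retries=50, retry_delay=3):
    # Human-readable reasons for the status codes the client distinguishes
    response_msg = {
        404: "Api endpoint Does not exist",
        500: "Use of Invalid Token/or the Fahamu Api is currently down",
        400: "You sent a bad Fahamu request",
        401: "You do not have authorization to perform the request",
    }
    for _ in range(max_retries):
        response = session.get(url)
        if not response.ok:
            raise LLMError(
                f"Request error with code: {response.status_code} occurred "
                f"with reason: "
                f"{response_msg.get(response.status_code, response.reason)}",
                query)
        if response.json().get("data"):
            return response.json()
        time.sleep(retry_delay)   # task not ready yet; poll again
    raise LLMError("Timeout error: We couldn't provide a response, please try "
                   "to rephrase your question to receive feedback", query)
```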
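`gn3/llms/process.py` now assembles references by walking the Fahamu context, concatenating the text snippets per document and rewriting bare URLs as anchor tags with the regex from the diff. A standalone sketch with an invented doc-id table and payload (the real code resolves ids through the `DocIDs` class and `doc_ids.json`):

```python
import re

URL_PATTERN = r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*'

# invented lookup table; the real one is loaded from doc_ids.json/all_files.json
DOC_IDS = {"doc-1": "2019-Roy-Evaluation of Sirtuin-3 probe quality"}


def parse_context(context):
    """Turn {doc_id: [snippets]} into reference entries with linkified text."""
    results = []
    for doc_id, snippets in context.items():
        combo_txt = "".join("\t" + entry["text"] for entry in snippets)
        combo_txt = re.sub(
            URL_PATTERN,
            lambda m: f"<a href='{m[0]}' target=_blank> {m[0]} </a>",
            combo_txt)
        results.append({"doc_id": doc_id,
                        "bibInfo": DOC_IDS.get(doc_id, doc_id),
                        "comboTxt": combo_txt})
    return results


print(parse_context({"doc-1": [{"text": "see www.genenetwork.org for data"}],
                     "doc-2": [{"text": "no link here"}]}))
```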