From 3fa31b50af2861382fbe2c76406f5a04c3fefc93 Mon Sep 17 00:00:00 2001 From: SoloDShelby Date: Fri, 19 Jul 2024 14:41:40 +0300 Subject: Evaluation code for paper 1 --- gnqa/paper1_eval/src/apis/process.py | 152 +++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 gnqa/paper1_eval/src/apis/process.py (limited to 'gnqa/paper1_eval/src/apis/process.py') diff --git a/gnqa/paper1_eval/src/apis/process.py b/gnqa/paper1_eval/src/apis/process.py new file mode 100644 index 0000000..37f2d73 --- /dev/null +++ b/gnqa/paper1_eval/src/apis/process.py @@ -0,0 +1,152 @@ +"""this module contains code for processing response from fahamu client.py""" +import os +import string +import json + +from urllib.parse import urljoin +from urllib.parse import quote +import logging +import requests + +from apis.gnqaclient import GeneNetworkQAClient +from apis.resp import DocIDs + + +BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks' + + +# pylint: disable=C0301 + + +def format_bibliography_info(bib_info): + """Function for formatting bibliography info""" + if isinstance(bib_info, str): + return bib_info.removesuffix('.txt') + elif isinstance(bib_info, dict): + return f"{bib_info['author']}.{bib_info['title']}.{bib_info['year']}.{bib_info['doi']} " + return bib_info + + +def filter_response_text(val): + """helper function for filtering non-printable chars""" + return json.loads(''.join([str(char) + for char in val if char in string.printable])) + + +def parse_context(context, get_info_func, format_bib_func): + """function to parse doc_ids content""" + results = [] + for doc_ids, summary in context.items(): + combo_txt = "" + for entry in summary: + combo_txt += "\t" + entry["text"] + doc_info = get_info_func(doc_ids) + bib_info = doc_ids if doc_ids == doc_info else format_bib_func( + doc_info) + results.append( + {"doc_id": doc_ids, "bibInfo": bib_info, "comboTxt": combo_txt}) + return results + + +def rate_document(task_id, doc_id, rating, auth_token): + """This method is used to provide feedback for a document by making a rating.""" + # todo move this to clients + try: + url = urljoin(BASE_URL, + f"""/feedback?task_id={task_id}&document_id={doc_id}&feedback={rating}""") + headers = {"Authorization": f"Bearer {auth_token}"} + + resp = requests.post(url, headers=headers) + resp.raise_for_status() + + return {"status": "success", **resp.json()} + except requests.exceptions.HTTPError as http_error: + raise RuntimeError(f"HTTP Error Occurred:\ + {http_error.response.text} -with status code- {http_error.response.status_code}") from http_error + except Exception as error: + raise RuntimeError(f"An error occurred: {str(error)}") from error + + +def load_file(filename, dir_path): + """function to open and load json file""" + file_path = os.path.join(dir_path, f"{filename}") + if not os.path.isfile(file_path): + raise FileNotFoundError(f"{filename} was not found or is a directory") + with open(file_path, "rb") as file_handler: + return json.load(file_handler) + + +def fetch_pubmed(references, file_name, data_dir=""): + """method to fetch and populate references with pubmed""" + + try: + pubmed = load_file(file_name, os.path.join(data_dir, "gn-meta/lit")) + for reference in references: + if pubmed.get(reference["doc_id"]): + reference["pubmed"] = pubmed.get(reference["doc_id"]) + return references + + except FileNotFoundError: + logging.error("failed to find pubmed_path for %s/%s", + data_dir, file_name) + return references + + +def get_gnqa(query, auth_token, tmp_dir=""): + """entry function for the gn3 api endpoint()""" + + api_client = GeneNetworkQAClient(requests.Session(), api_key=auth_token) + res, task_id = api_client.ask('?ask=' + quote(query), auth_token) + if task_id == 0: + raise RuntimeError(f"Error connecting to Fahamu Api: {str(res)}") + res, success = api_client.get_answer(task_id) + if success == 1: + resp_text = filter_response_text(res.text) + if resp_text.get("data") is None: + return task_id, "Please try to rephrase your question to receive feedback", [] + answer = resp_text['data']['answer'] + context = resp_text['data']['context'] + references = parse_context( + context, DocIDs().getInfo, format_bibliography_info) + #references = fetch_pubmed(references, "pubmed.json", tmp_dir) + + return task_id, answer, references + else: + return task_id, "Please try to rephrase your question to receive feedback", [] + +def get_response_from_taskid(auth_token, task_id): + api_client = GeneNetworkQAClient(requests.Session(), api_key=auth_token) + res, success = api_client.answer(task_id) + if success == 1: + resp_text = filter_response_text(res.text) + if resp_text.get("data") is None: + return task_id, "Please try to rephrase your question to receive feedback", [] + answer = resp_text['data']['answer'] + context = resp_text['data']['context'] + references = parse_context( + context, DocIDs().getInfo, format_bibliography_info) + #references = fetch_pubmed(references, "pubmed.json", tmp_dir) + + return task_id, answer, references + else: + return task_id, "Please try to rephrase your question to receive feedback", [] + + +def fetch_query_results(query, user_id, redis_conn): + """this method fetches prev user query searches""" + result = redis_conn.get(f"LLM:{user_id}-{query}") + if result: + return json.loads(result) + return { + "query": query, + "answer": "Sorry No answer for you", + "references": [], + "task_id": None + } + + +def get_user_queries(user_id, redis_conn): + """methods to fetch all queries for a specific user""" + + results = redis_conn.keys(f"LLM:{user_id}*") + return [query for query in [result.partition("-")[2] for result in results] if query != ""] -- cgit v1.2.3