Diffstat (limited to 'gnqa/src/study2')
-rw-r--r--  gnqa/src/study2/__init__.py            |   0
-rw-r--r--  gnqa/src/study2/create_dataset.py      |  20
-rw-r--r--  gnqa/src/study2/document_operations.py | 147
-rw-r--r--  gnqa/src/study2/parse_r2r_result.py    |  63
-rw-r--r--  gnqa/src/study2/parsejson.py           |  63
-rw-r--r--  gnqa/src/study2/retrieve_context.py    | 171
-rw-r--r--  gnqa/src/study2/run_questions.py       |  38
7 files changed, 502 insertions, 0 deletions
diff --git a/gnqa/src/study2/__init__.py b/gnqa/src/study2/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/gnqa/src/study2/__init__.py
diff --git a/gnqa/src/study2/create_dataset.py b/gnqa/src/study2/create_dataset.py
new file mode 100644
index 0000000..2d62f18
--- /dev/null
+++ b/gnqa/src/study2/create_dataset.py
@@ -0,0 +1,20 @@
+#!/usr/bin/python3
+
+import sys
+
+from study2.document_operations import DocOps
+
+'''
+*******************************************************************************
+Commands
+*******************************************************************************
+'''
+
+try:
+    read_file = str(sys.argv[1])
+    out_file = str(sys.argv[2])
+except IndexError:
+    sys.exit('Example use "python create_dataset.py '
+             '../data/lists/human_list_cs_gn.json '
+             '../data/dataset/human_cs_gn.json"')
+
+# read the list of per-question files and combine them into one dataset
+doc_list = DocOps.read_json_document(read_file)
+DocOps.combine_responses(doc_list, out_file)
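Note: create_dataset.py expects its first argument to name a JSON array of paths to per-question dataset files, each holding the question, answer, and contexts fields that combine_responses reads. A minimal sketch of building such a list file; the paths below are hypothetical:

    import json

    # hypothetical intermediate files produced by parse_r2r_result.py
    doc_list = [
        "../data/dataset/human/intermediate_files/human_cs_diabetes_1",
        "../data/dataset/human/intermediate_files/human_cs_diabetes_2"
    ]
    with open("../data/lists/human_list_cs_diabetes.json", "w") as fh:
        json.dump(doc_list, fh, indent=2)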
diff --git a/gnqa/src/study2/document_operations.py b/gnqa/src/study2/document_operations.py
new file mode 100644
index 0000000..3112d91
--- /dev/null
+++ b/gnqa/src/study2/document_operations.py
@@ -0,0 +1,147 @@
+import os
+import json
+
+
+class DocOps:
+    # verbosity flag used by the (quiet, loud)[flag] print idiom below
+    verbose = 0
+
+    # maps keys found in an R2R response onto fields of the RAGAS dataset;
+    # "append" == 1 collects repeated values into a list
+    values_key = {
+        "text": {"name": "contexts", "append": 1},
+        "associatedQuery": {"name": "question", "append": 0},
+        "id": {"name": "id", "append": 1},
+        "title": {"name": "titles", "append": 1},
+        "document_id": {"name": "document_id", "append": 1},
+        "extraction_id": {"name": "extraction_id", "append": 1},
+        "content": {"name": "answer", "append": 0}
+    }
+
+    def __init__(self):
+        self._type = 'QuestionList'
+
+    @staticmethod
+    def reset_responses():
+        return {
+            'question': [],
+            'answer': [],
+            'contexts': []
+        }
+
+    @staticmethod
+    def writeDatasetFile(responses, outp_file):
+        print(outp_file)
+        output = json.dumps(responses, indent=2)
+        if os.path.exists(outp_file):
+            with open(outp_file, "a") as the_data:
+                the_data.write('\n\n' + output)
+        else:
+            with open(outp_file, "w") as the_data:
+                the_data.write(output)
+
+    @staticmethod
+    def get_r2r_ragas_out_dict():
+        return {"titles": [],
+                "extraction_id": [],
+                "document_id": [],
+                "id": [],
+                "contexts": [],
+                "answer": "",
+                "question": ""}
+
+    @staticmethod
+    def read_json_document(file_name):
+        with open(file_name, "r") as result_file:
+            return json.load(result_file)
+
+    @staticmethod
+    def combine_responses(doc_lst, out_filename):
+        ragas_output = DocOps.reset_responses()
+        for doc in doc_lst:
+            the_doc = DocOps.read_json_document(doc)
+            ragas_output['question'].append(the_doc['question'])
+            ragas_output['answer'].append(the_doc['answer'])
+            ragas_output['contexts'].append(the_doc['contexts'])
+        DocOps.writeDatasetFile(ragas_output, out_filename)
+
+    @staticmethod
+    def extract_response(obj, values_key, thedict):
+        # walk a nested dict/list and pull out the keys named in values_key
+        if isinstance(obj, dict):
+            for key, val in obj.items():
+                if key in values_key:
+                    if values_key[key]["append"]:
+                        thedict[values_key[key]["name"]].append(
+                            val.replace("\n", " ").strip())
+                    else:
+                        thedict[values_key[key]["name"]] = \
+                            val.replace("\n", " ").strip()
+                    print(("", "Key -> {0}\tValue -> {1}".format(key, val))[DocOps.verbose])
+                else:
+                    if len(obj.items()) == 1:
+                        print(key, " --> ", val)
+                    DocOps.extract_response(val, values_key, thedict)
+        elif isinstance(obj, list):
+            for item in obj:
+                DocOps.extract_response(item, values_key, thedict)
+
+
+class QuestionList:
+    _verbose = 0
+    _doc = ''
+    _fname = ''
+    _question_list = {
+        "domainexpert": {
+            "gn": [],
+            "aging": [],
+            "diabetes": []
+        },
+        "citizenscientist": {
+            "gn": [],
+            "aging": [],
+            "diabetes": []
+        }
+    }
+
+    def __init__(self, the_file, verbose=0):
+        print('QuestionList has been initialized {0}, verbosity is {1}'.format(
+            the_file, verbose))
+        self._fname = the_file
+        self._verbose = verbose
+        self.read_document()
+        self.parse_document()
+
+    def read_document(self):
+        self._doc = DocOps.read_json_document(self._fname)
+
+    def parse_document(self):
+        print(('', '\nParse question list')[self._verbose])
+        for item in self._doc:
+            level = item['level']
+            domain = item['domain']
+            query_lst = item['query']
+            self._question_list[level][domain] = query_lst
+
+    def print_list(self, the_lst):
+        ndx = 1
+        for item in the_lst:
+            print('\t[{0}] {1}'.format(ndx, item))
+            ndx += 1
+
+    def _print(self):
+        print(json.dumps(self._question_list, indent=2))
+
+    def get(self, level, domain):
+        return self._question_list[level][domain]
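Note: a minimal usage sketch for QuestionList, assuming a question file shaped the way parse_document expects (a list of objects with level, domain, and query keys); the file path is hypothetical:

    from study2.document_operations import QuestionList

    # e.g. [{"level": "domainexpert", "domain": "aging", "query": ["..."]}]
    qlst = QuestionList("../data/questions/human/de/aging.json", verbose=1)
    for question in qlst.get("domainexpert", "aging"):
        print(question)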
diff --git a/gnqa/src/study2/parse_r2r_result.py b/gnqa/src/study2/parse_r2r_result.py
new file mode 100644
index 0000000..5cba6d3
--- /dev/null
+++ b/gnqa/src/study2/parse_r2r_result.py
@@ -0,0 +1,63 @@
+import json
+
+from study2.document_operations import DocOps
+
+verbose = 0
+# hard-coded input and output paths; adjust to your local checkout
+read_file = '/home/shebes/Coding/gn-ai/gnqa/paper2_eval/data/responses/human/cs_diabetes_responses.json'
+out_file = '../data/dataset/human/intermediate_files/human_cs_diabetes_'
+
+# maps keys found in an R2R response onto fields of the RAGAS dataset;
+# "append" == 1 collects repeated values into a list
+values_key = {
+    "text": {"name": "contexts", "append": 1},
+    "associatedQuery": {"name": "question", "append": 0},
+    "id": {"name": "id", "append": 1},
+    "title": {"name": "titles", "append": 1},
+    "document_id": {"name": "document_id", "append": 1},
+    "extraction_id": {"name": "extraction_id", "append": 1},
+    "content": {"name": "answer", "append": 0}
+}
+
+
+def get_ragas_out_dict():
+    return {"titles": [],
+            "extraction_id": [],
+            "document_id": [],
+            "id": [],
+            "contexts": [],
+            "answer": "",
+            "question": ""}
+
+
+def extract_response(obj, values_key, thedict):
+    # walk a nested dict/list and pull out the keys named in values_key
+    if isinstance(obj, dict):
+        for key, val in obj.items():
+            if key in values_key:
+                if values_key[key]["append"]:
+                    thedict[values_key[key]["name"]].append(
+                        val.replace("\n", " ").strip())
+                else:
+                    thedict[values_key[key]["name"]] = \
+                        val.replace("\n", " ").strip()
+                print(("", "Key -> {0}\tValue -> {1}".format(key, val))[verbose])
+            else:
+                if len(obj.items()) == 1:
+                    print(key, " --> ", val)
+                extract_response(val, values_key, thedict)
+    elif isinstance(obj, list):
+        for item in obj:
+            extract_response(item, values_key, thedict)
+
+
+# read_file holds one R2R response per question, keyed by question number
+with open(read_file, "r") as r_file:
+    result_file = json.load(r_file)
+
+print('There are {0} keys in the result file'.format(len(result_file)))
+for key in result_file.keys():
+    eval_dataset_dict = get_ragas_out_dict()
+    extract_response(result_file[key], values_key, eval_dataset_dict)
+    DocOps.writeDatasetFile(eval_dataset_dict, '{0}{1}'.format(out_file, key))
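Note: extract_response walks an arbitrarily nested R2R result and copies any key named in values_key into the flat RAGAS dict, appending when "append" is 1 and overwriting otherwise. A toy illustration; the nesting here is invented for the example, not the real R2R schema:

    toy = {"results": [
        {"associatedQuery": "What is GeneNetwork?",
         "completion": {"content": "A web resource for systems genetics."},
         "chunks": [{"text": "GeneNetwork hosts BXD data.", "title": "paper1"}]}
    ]}
    flat = get_ragas_out_dict()
    extract_response(toy, values_key, flat)
    # flat["question"] == "What is GeneNetwork?"
    # flat["contexts"] == ["GeneNetwork hosts BXD data."]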
diff --git a/gnqa/src/study2/parsejson.py b/gnqa/src/study2/parsejson.py
new file mode 100644
index 0000000..b49a898
--- /dev/null
+++ b/gnqa/src/study2/parsejson.py
@@ -0,0 +1,63 @@
+import json
+import sys
+
+
+def iterate_json(obj, thedict):
+    # walk a nested dict/list and pull out text, answer, and question values
+    if isinstance(obj, dict):
+        for key, val in obj.items():
+            if key == "text":
+                thedict["contexts"].append(val.replace("\n", " ").strip())
+            elif key == "answer":
+                thedict["answer"] = val.replace("\n", " ").strip()
+            elif key == "question":
+                thedict["question"] = val.replace("\n", " ").strip()
+            else:
+                if len(obj.items()) == 1:
+                    print(key, " --> ", val)
+                iterate_json(val, thedict)
+    elif isinstance(obj, list):
+        for item in obj:
+            iterate_json(item, thedict)
+
+
+def create_dataset_from_files(tag, file_name, rag_out):
+    for the_file in file_name[tag]:
+        ragas_output = {
+            "contexts": [],
+            "answer": "",
+            "question": ""}
+        with open("./data/" + the_file, "r") as r_file:
+            data_file = json.load(r_file)
+        iterate_json(data_file, ragas_output)
+        rag_out["answer"].append(ragas_output["answer"])
+        rag_out["question"].append(ragas_output["question"])
+        rag_out["contexts"].append(ragas_output["contexts"])
+
+
+def create_resultset_from_file(file_name):
+    ragas_output = {
+        "contexts": [],
+        "answer": "",
+        "question": ""}
+    with open("./data/" + file_name, "r") as r_file:
+        data_file = json.load(r_file)
+    iterate_json(data_file, ragas_output)
+    return ragas_output
+
+
+file_list_tag = str(sys.argv[1])
+read_file = str(sys.argv[2])  # e.g. doc_list.json
+outp_file = str(sys.argv[3])
+
+rag_out = {
+    "question": [],
+    "answer": [],
+    "contexts": []
+}
+
+# read_file should be a json file mapping tags to lists of input files
+with open(read_file, "r") as r_file:
+    file_lst = json.load(r_file)
+
+create_dataset_from_files(file_list_tag, file_lst, rag_out)
+
+with open(outp_file, "a") as the_data:
+    the_data.write(",\n")
+    the_data.write(json.dumps(rag_out, indent=2))
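Note: parsejson.py resolves every file it reads against ./data/, and its first argument selects one tag out of the list file. A sketch of preparing a matching doc_list.json; the tag and file names are hypothetical:

    import json

    # hypothetical tags mapping to response files, all resolved under ./data/
    file_lst = {
        "cs_aging": ["resp_aging_01.json", "resp_aging_02.json"],
        "cs_diabetes": ["resp_diabetes_01.json"]
    }
    with open("doc_list.json", "w") as fh:
        json.dump(file_lst, fh, indent=2)

    # then: python3 parsejson.py cs_aging doc_list.json out_dataset.json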
diff --git a/gnqa/src/study2/retrieve_context.py b/gnqa/src/study2/retrieve_context.py
new file mode 100644
index 0000000..fca90dd
--- /dev/null
+++ b/gnqa/src/study2/retrieve_context.py
@@ -0,0 +1,171 @@
+import os
+import sys
+import json
+import time
+import configparser
+
+from apis.process import get_gnqa, get_response_from_taskid
+
+config = configparser.ConfigParser()
+config.read('_config.cfg')
+
+
+def simplifyContext(refs):
+    '''
+    refs is a list of items containing doc_id, bibInfo, and comboTxt;
+    we only need comboTxt
+    '''
+    result = []
+    for item in refs:
+        combo_text = item['comboTxt']
+        combo_text = combo_text.replace('\n', '')
+        combo_text = combo_text.replace('\t', '')
+        result.append(combo_text)
+    return result
+
+
+def writeDatasetFile(responses, outp_file):
+    print(outp_file)
+    output = json.dumps(responses, indent=2)
+    if os.path.exists(outp_file):
+        with open(outp_file, "a") as the_data:
+            the_data.write(output)
+    else:
+        with open(outp_file, "w") as the_data:
+            the_data.write(output)
+
+
+def reset_responses():
+    return {
+        'question': [],
+        'answer': [],
+        'contexts': [],
+        'task_id': []
+    }
+
+
+def parse_document(jsonfile):
+    print('Parse document')
+    for item in jsonfile:
+        level = item['level']
+        domain = item['domain']
+        query_lst = item['query']
+        create_datasets(query_lst, domain, level)
+
+
+def create_datasets(query_list, domain, level):
+    print('Creating dataset')
+    responses = reset_responses()
+    ndx = 0
+    for query in query_list:
+        print(query)
+        task_id, answer, refs = get_gnqa(
+            query, config['key.api']['fahamuai'], config['DEFAULT']['DATA_DIR'])
+        responses['question'].append(query)
+        responses['answer'].append(answer)
+        responses['task_id'].append(task_id)
+        responses['contexts'].append(simplifyContext(refs))
+        ndx += 1
+        time.sleep(10)  # sleep a bit so as not to overtask the api
+        if ndx % 5 == 0:
+            # flush every five responses to a numbered file
+            print('Will print to file number {0}'.format(int(ndx/5)))
+            outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(
+                config['out.response.dataset']['human_dir'],
+                level, domain, str(int(ndx/5)))
+            writeDatasetFile(responses, outp_file)
+            responses = reset_responses()
+    if len(responses['question']) > 0:
+        outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(
+            config['out.response.dataset']['human_dir'],
+            level, domain, str(int(ndx/5)+1))
+        writeDatasetFile(responses, outp_file)
+
+
+def parse_responses(jsonfile):
+    print('Parsing human responses')
+    de_dict_general = {"level": "domainexpert", "domain": "general", "query": [], "task_id": []}
+    de_dict_aging = {"level": "domainexpert", "domain": "aging", "query": [], "task_id": []}
+    de_dict_diabetes = {"level": "domainexpert", "domain": "diabetes", "query": [], "task_id": []}
+    cs_dict_general = {"level": "citizenscientist", "domain": "general", "query": [], "task_id": []}
+    cs_dict_aging = {"level": "citizenscientist", "domain": "aging", "query": [], "task_id": []}
+    cs_dict_diabetes = {"level": "citizenscientist", "domain": "diabetes", "query": [], "task_id": []}
+    for _, val in jsonfile.items():
+        ndx = 0
+        lvl = val.get("level")
+        for qry in val.get("query"):
+            ans = val.get("answer")[ndx] if "answer" in val else ""
+            tpc = val.get("topic")[ndx]
+            tpc = "general" if tpc == 0 else "aging" if tpc == 1 else "diabetes"
+            tskd = val.get("task_id")[ndx]
+            if lvl == 'cs' and tpc == 'general':
+                addToDataList(cs_dict_general, qry, ans, tskd)
+            elif lvl == 'cs' and tpc == 'aging':
+                addToDataList(cs_dict_aging, qry, ans, tskd)
+            elif lvl == 'cs' and tpc == 'diabetes':
+                addToDataList(cs_dict_diabetes, qry, ans, tskd)
+            elif lvl == 'de' and tpc == 'general':
+                addToDataList(de_dict_general, qry, ans, tskd)
+            elif lvl == 'de' and tpc == 'aging':
+                addToDataList(de_dict_aging, qry, ans, tskd)
+            elif lvl == 'de' and tpc == 'diabetes':
+                addToDataList(de_dict_diabetes, qry, ans, tskd)
+            else:
+                print('Somehow there is a query without a topic or expertise level')
+            ndx += 1
+    create_datasets_from_taskid(de_dict_general)
+    create_datasets_from_taskid(de_dict_aging)
+    create_datasets_from_taskid(de_dict_diabetes)
+    create_datasets_from_taskid(cs_dict_general)
+    create_datasets_from_taskid(cs_dict_aging)
+    create_datasets_from_taskid(cs_dict_diabetes)
+
+
+def addToDataList(data_lst, qry, ans, tskd):
+    data_lst["query"].append(qry)
+    data_lst["task_id"].append(tskd)
+    if "answer" not in data_lst:
+        data_lst["answer"] = []
+    data_lst["answer"].append(ans)
+
+
+def create_datasets_from_taskid(info_dict):
+    print('Creating dataset of questions from {0} in the topic of {1}'.format(
+        info_dict["level"], info_dict["domain"]))
+    responses = reset_responses()
+    ndx = 0
+    query_list = info_dict["query"]
+    answers = info_dict.get("answer", [])
+    for task_id in info_dict["task_id"]:
+        _, an_answer, refs = get_response_from_taskid(
+            config['key.api']['fahamuai'], task_id)
+        responses['question'].append(query_list[ndx])
+        # prefer the stored human answer when there is one, otherwise
+        # fall back to the answer returned by the api
+        if ndx < len(answers) and answers[ndx] != "":
+            responses['answer'].append(answers[ndx])
+        else:
+            responses['answer'].append(an_answer)
+        responses['task_id'].append(task_id)
+        responses['contexts'].append(simplifyContext(refs))
+        ndx += 1
+        time.sleep(10)  # sleep a bit so as not to overtask the api
+        if ndx % 5 == 0:
+            outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(
+                config['out.response.dataset']['human_dir'],
+                info_dict["level"], info_dict["domain"], str(int(ndx/5)))
+            writeDatasetFile(responses, outp_file)
+            responses = reset_responses()
+    if len(responses['question']) > 0:
+        outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(
+            config['out.response.dataset']['human_dir'],
+            info_dict["level"], info_dict["domain"], str(int(ndx/5)+1))
+        writeDatasetFile(responses, outp_file)
+
+
+try:
+    read_file = str(sys.argv[1])
+    file_type = str(sys.argv[2])
+except IndexError:
+    sys.exit('Example use "python3 retrieve_context.py data/queries/qlist.json human/gpt4o"')
+
+print('Read input file')
+with open(read_file, "r") as r_file:
+    file_lst = json.load(r_file)
+if file_type == "gpt4o":
+    parse_document(file_lst)
+else:
+    parse_responses(file_lst)
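Note: retrieve_context.py reads three settings through configparser. A sketch of a matching _config.cfg; the section and option names come straight from the code, the values are placeholders:

    [DEFAULT]
    DATA_DIR = ./data

    [key.api]
    fahamuai = YOUR_API_KEY_HERE

    [out.response.dataset]
    human_dir = ../data/dataset/human/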
diff --git a/gnqa/src/study2/run_questions.py b/gnqa/src/study2/run_questions.py
new file mode 100644
index 0000000..07aee5f
--- /dev/null
+++ b/gnqa/src/study2/run_questions.py
@@ -0,0 +1,38 @@
+import sys
+
+from r2r import R2RClient
+from study2.document_operations import DocOps, QuestionList
+
+'''
+*******************************************************************************
+Variables
+*******************************************************************************
+'''
+rag_response = {}
+client = R2RClient("http://localhost:8000")
+health_resp = client.health()
+
+'''
+*******************************************************************************
+Commands
+*******************************************************************************
+'''
+
+print("The R2R client's health status is {0}".format(health_resp))
+
+try:
+    read_file = str(sys.argv[1])
+    out_file = str(sys.argv[2])
+except IndexError:
+    sys.exit('Example use "python run_questions.py '
+             '../data/questions/human/de/aging.json '
+             '../data/responses/human/de/aging_resp.json"')
+
+qLst = QuestionList(read_file, 1)  # second parameter turns on verbose output
+ndx = 1
+# note: the level and domain are hard-coded here and should match the
+# question file passed on the command line
+for question in qLst.get("domainexpert", "aging"):
+    print('Getting response for the following question --> {0}'.format(question))
+    rag_response[str(ndx)] = client.rag(question)
+    ndx += 1
+
+DocOps.writeDatasetFile(rag_response, out_file)
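Note: taken together, the scripts chain into a pipeline: run the questions against R2R, flatten each response into a RAGAS-shaped file, then combine those files into one dataset. The commands below repeat each script's own usage strings; parse_r2r_result.py takes no arguments because its paths are hard-coded at the top of the file:

    python run_questions.py ../data/questions/human/de/aging.json ../data/responses/human/de/aging_resp.json
    python parse_r2r_result.py
    python create_dataset.py ../data/lists/human_list_cs_gn.json ../data/dataset/human_cs_gn.json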