Diffstat (limited to 'gnqa/src/study2')
-rw-r--r--  gnqa/src/study2/__init__.py               0
-rw-r--r--  gnqa/src/study2/create_dataset.py        20
-rw-r--r--  gnqa/src/study2/document_operations.py  147
-rw-r--r--  gnqa/src/study2/parse_r2r_result.py      63
-rw-r--r--  gnqa/src/study2/parsejson.py             63
-rw-r--r--  gnqa/src/study2/retrieve_context.py     171
-rw-r--r--  gnqa/src/study2/run_questions.py         38
7 files changed, 502 insertions, 0 deletions
diff --git a/gnqa/src/study2/__init__.py b/gnqa/src/study2/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/gnqa/src/study2/__init__.py
diff --git a/gnqa/src/study2/create_dataset.py b/gnqa/src/study2/create_dataset.py
new file mode 100644
index 0000000..2d62f18
--- /dev/null
+++ b/gnqa/src/study2/create_dataset.py
@@ -0,0 +1,20 @@
+#!/usr/bin/python3
+
+import sys
+from study2.document_operations import DocOps
+
+
+'''
+*******************************************************************************
+Commands
+*******************************************************************************
+'''
+
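+# read_file is expected to be a JSON list of paths to per-question response
+# files, each holding question/answer/contexts keys (see
+# DocOps.combine_responses); out_file receives the combined dataset.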
+try:
+    read_file = sys.argv[1]
+    out_file = sys.argv[2]
+except IndexError:
+    exit('Example use "python create_dataset.py ../data/lists/human_list_cs_gn.json ../data/dataset/human_cs_gn.json"')
+
+doc_list = DocOps.read_json_document(read_file)
+DocOps.combine_responses(doc_list, out_file)
\ No newline at end of file
diff --git a/gnqa/src/study2/document_operations.py b/gnqa/src/study2/document_operations.py
new file mode 100644
index 0000000..3112d91
--- /dev/null
+++ b/gnqa/src/study2/document_operations.py
@@ -0,0 +1,147 @@
+import os
+#import sys
+import json
+#import time
+#import configparser
+'''
+from r2r import ( R2R,
+ Document,
+ GenerationConfig,
+ R2RClient )
+'''
+
+class DocOps:
+    _type = ''
+    verbose = 0  # set to 1 for per-key debug output in extract_response
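+    # Maps keys found in raw R2R search results to RAGAS dataset fields.
+    # "append": 1 collects repeated values into a list; "append": 0 keeps a
+    # single scalar (one question and one answer per record).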
+ values_key = {
+ "text" : {"name": "contexts", "append": 1},
+ "associatedQuery": {"name": "question", "append": 0},
+ "id": {"name": "id", "append": 1},
+ "title": {"name": "titles", "append": 1},
+ "document_id": {"name": "document_id", "append": 1},
+ "extraction_id": {"name": "extraction_id", "append": 1},
+ "content": {"name": "answer", "append": 0}
+ }
+
+ def __init__(self):
+ self._type = 'QuestionList'
+
+    @staticmethod
+    def reset_responses():
+ return {
+ 'question': [],
+ 'answer': [],
+ 'contexts': []
+ #,
+ #'task_id': []
+ }
+
+    @staticmethod
+    def writeDatasetFile(responses, outp_file):
+ print(outp_file)
+ output = json.dumps(responses, indent=2)
+ if os.path.exists(outp_file):
+ with open(outp_file, "a") as the_data:
+ the_data.write('\n\n' + output)
+ else:
+ with open(outp_file, "a") as the_data:
+ the_data.write(output)
+
+    @staticmethod
+    def get_r2r_ragas_out_dict():
+ return { "titles": [],
+ "extraction_id": [],
+ "document_id": [],
+ "id": [],
+ "contexts": [],
+ "answer": "",
+ "question": ""}
+
+    @staticmethod
+    def read_json_document(file_name):
+ with open(file_name, "r") as result_file:
+ return json.load(result_file)
+
+    @staticmethod
+    def combine_responses(doc_lst, out_filename):
+ ragas_output = DocOps.reset_responses()
+
+ for doc in doc_lst:
+ the_doc = DocOps.read_json_document(doc)
+ ragas_output['question'].append(
+ the_doc['question'])
+ ragas_output['answer'].append(
+ the_doc['answer'])
+ ragas_output['contexts'].append(
+ the_doc['contexts'])
+ DocOps.writeDatasetFile(
+ ragas_output, out_filename)
+
+
+    @staticmethod
+    def extract_response(obj, values_key, thedict):
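+        '''
+        Recursively walk a nested R2R response (dicts and lists), copying
+        each value whose key appears in values_key into thedict under its
+        mapped field name.
+        '''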
+ if isinstance(obj, dict):
+ for key, val in obj.items():
+ if (key in values_key.keys()):
+ if (values_key[key]["append"]):
+ thedict[values_key[key]["name"]].append(val.replace("\n", " ").strip())
+ else:
+ thedict[values_key[key]["name"]] = val.replace("\n", " ").strip()
+ print(("", "Key -> {0}\tValue -> {1}".format(key,val)) [DocOps.verbose])
+ else:
+ if (len(obj.items()) == 1 ):
+ print(key, " --> ", val)
+ DocOps.extract_response(val, values_key, thedict)
+ elif isinstance(obj, list):
+ for item in obj:
+ DocOps.extract_response(item, values_key, thedict)
+
+class QuestionList:
+ _verbose = 0
+ _doc = ''
+ _fname = ''
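+    # Questions are bucketed by expertise level (domain expert vs. citizen
+    # scientist) and by topic domain; "gn" appears to refer to GeneNetwork.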
+ _question_list = {
+ "domainexpert": {
+ "gn": [],
+ "aging": [],
+ "diabetes": []
+ },
+ "citizenscientist": {
+ "gn": [],
+ "aging": [],
+ "diabetes": []
+ }
+ }
+
+ def __init__(self, the_file, verbose=0):
+ print('QuestionList has been initialized {0}, verbosity is {1}'.format(the_file, verbose))
+ self._fname = the_file
+ self._verbose = verbose
+ self.read_document()
+ self.parse_document()
+ #self._print()
+
+
+ def read_document(self):
+ self._doc = DocOps.read_json_document(
+ self._fname)
+
+
+
+ def parse_document(self):
+ print(('', '\nParse question list') [self._verbose] )
+ for item in self._doc:
+ level = item['level']
+ domain = item['domain']
+ query_lst = item['query']
+ self._question_list[level][domain] = query_lst
+ #print(('', 'Level --> {0} \tDomain --> {1}\n{2}'.format(level, domain, self.print_list(query_lst))) [self._verbose])
+ #create_datasets(query_lst, domain, level)
+
+
+    def print_list(self, the_lst):
+        for ndx, item in enumerate(the_lst, start=1):
+            print('\t[{0}] {1}'.format(ndx, item))
+
+ def _print(self):
+ print(json.dumps(self._question_list, indent=2))
+
+ def get(self, level, domain):
+ return self._question_list[level][domain]
+
+
diff --git a/gnqa/src/study2/parse_r2r_result.py b/gnqa/src/study2/parse_r2r_result.py
new file mode 100644
index 0000000..5cba6d3
--- /dev/null
+++ b/gnqa/src/study2/parse_r2r_result.py
@@ -0,0 +1,63 @@
+import json
+from study2.document_operations import DocOps
+
+verbose = 0
+#read_file = '/home/shebes/Coding/gn-ai/gnqa/paper2_eval/data/testresp2.json'
+read_file = '/home/shebes/Coding/gn-ai/gnqa/paper2_eval/data/responses/human/cs_diabetes_responses.json'
+out_file = '../data/dataset/human/intermediate_files/human_cs_diabetes_'
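+# out_file is a filename prefix; the loop at the bottom writes one
+# intermediate file per top-level key of the responses file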
+
+values_key = {
+ "text" : {"name": "contexts", "append": 1},
+ "associatedQuery": {"name": "question", "append": 0},
+ "id": {"name": "id", "append": 1},
+ "title": {"name": "titles", "append": 1},
+ "document_id": {"name": "document_id", "append": 1},
+ "extraction_id": {"name": "extraction_id", "append": 1},
+ "content": {"name": "answer", "append": 0}
+}
+
+def get_ragas_out_dict():
+ return { "titles": [],
+ "extraction_id": [],
+ "document_id": [],
+ "id": [],
+ "contexts": [],
+ "answer": "",
+ "question": ""}
+
+def extract_response(obj, values_key, thedict):
+ if isinstance(obj, dict):
+ for key, val in obj.items():
+ if (key in values_key.keys()):
+ if (values_key[key]["append"]):
+ thedict[values_key[key]["name"]].append(val.replace("\n", " ").strip())
+ else:
+ thedict[values_key[key]["name"]] = val.replace("\n", " ").strip()
+ print(("", "Key -> {0}\tValue -> {1}".format(key,val)) [verbose])
+ else:
+ if (len(obj.items()) == 1 ):
+ print(key, " --> ", val)
+ extract_response(val, values_key, thedict)
+ elif isinstance(obj, list):
+ for item in obj:
+ extract_response(item, values_key, thedict)
+
+# read_file holds the raw R2R responses, keyed by question number
+with open(read_file, "r") as r_file:
+ result_file = json.load(r_file)
+
+print('There are {0} keys in the result file'.format(len(result_file)))
+for key in result_file.keys():
+ eval_dataset_dict = get_ragas_out_dict()
+ extract_response(result_file[key], values_key, eval_dataset_dict)
+    DocOps.writeDatasetFile(eval_dataset_dict, '{0}{1}'.format(out_file, key))
\ No newline at end of file
diff --git a/gnqa/src/study2/parsejson.py b/gnqa/src/study2/parsejson.py
new file mode 100644
index 0000000..b49a898
--- /dev/null
+++ b/gnqa/src/study2/parsejson.py
@@ -0,0 +1,63 @@
+import json
+import sys
+
+
+def iterate_json(obj, thedict):
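+    '''
+    Recursively walk nested JSON, harvesting every "text" value into the
+    contexts list and the "answer"/"question" strings into scalar fields.
+    '''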
+ if isinstance(obj, dict):
+ for key, val in obj.items():
+ if (key == "text"):
+ thedict["contexts"].append(val.replace("\n", " ").strip())
+ elif (key == "answer"):
+ thedict["answer"] = val.replace("\n", " ").strip()
+ elif (key == "question"):
+ thedict["question"] = val.replace("\n", " ").strip()
+ else:
+ if (len(obj.items()) == 1 ):
+ print(key, " --> ", val)
+ iterate_json(val, thedict)
+ elif isinstance(obj, list):
+ for item in obj:
+ iterate_json(item, thedict)
+
+def create_dataset_from_files(tag, file_name, rag_out):
+ for the_file in file_name[tag]:
+ ragas_output = {
+ "contexts": [],
+ "answer": "",
+ "question": ""}
+ #print(the_file)
+ with open("./data/"+the_file, "r") as r_file:
+ data_file = json.load(r_file)
+ iterate_json(data_file, ragas_output)
+ rag_out["answer"].append(ragas_output["answer"])
+ rag_out["question"].append(ragas_output["question"])
+ rag_out["contexts"].append(ragas_output["contexts"])
+
+def create_resultset_from_file(file_name):
+    with open("./data/"+file_name, "r") as r_file:
+        data_file = json.load(r_file)
+    ragas_output = {"contexts": [], "answer": "", "question": ""}
+    iterate_json(data_file, ragas_output)
+    return ragas_output
+
+
+try:
+    file_list_tag = sys.argv[1]
+    read_file = sys.argv[2]   # e.g. doc_list.json
+    outp_file = sys.argv[3]
+except IndexError:
+    exit('Usage: python parsejson.py <file_list_tag> <file_list.json> <output_file>')
+
+rag_out = {
+ "question": [],
+ "answer": [],
+ "contexts": []
+}
+
+# this should be a json file with a list of input files and an output file
+with open(read_file, "r") as r_file:
+ file_lst = json.load(r_file)
+
+create_dataset_from_files(file_list_tag, file_lst, rag_out)
+
+with open(outp_file, "a") as the_data:
+ #json.dump(ragas_output, the_data)
+ the_data.write(",\n")
+ the_data.write(json.dumps(rag_out, indent=2))
diff --git a/gnqa/src/study2/retrieve_context.py b/gnqa/src/study2/retrieve_context.py
new file mode 100644
index 0000000..fca90dd
--- /dev/null
+++ b/gnqa/src/study2/retrieve_context.py
@@ -0,0 +1,171 @@
+import os
+import sys
+import json
+import time
+import configparser
+from apis.process import get_gnqa, get_response_from_taskid
+
+
+config = configparser.ConfigParser()
+config.read('_config.cfg')
+
+'''
+the refs object is a list of items containing doc_id, bibInfo, and comboTxt
+We only need comboTxt
+'''
+def simplifyContext(refs):
+ result = []
+ for item in refs:
+ combo_text = item['comboTxt']
+ combo_text = combo_text.replace('\n','')
+ combo_text = combo_text.replace('\t','')
+ result.append(combo_text)
+ return result
+
+def writeDatasetFile(responses, outp_file):
+ print(outp_file)
+ output = json.dumps(responses, indent=2)
+ if os.path.exists(outp_file):
+ with open(outp_file, "a") as the_data:
+ the_data.write('' + output)
+ else:
+ with open(outp_file, "a") as the_data:
+ the_data.write(output)
+
+
+def reset_responses():
+ return {
+ 'question': [],
+ 'answer': [],
+ 'contexts': [],
+ 'task_id': []
+ }
+
+def parse_document(jsonfile):
+ print('Parse document')
+ for item in jsonfile:
+ level = item['level']
+ domain = item['domain']
+ query_lst = item['query']
+ create_datasets(query_lst, domain, level)
+
+def create_datasets(query_list, domain, level):
+ print('Creating dataset')
+ responses = reset_responses()
+ ndx = 0
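+    # Responses are flushed in batches of five, each batch to its own
+    # numbered file, presumably so partial results survive an interrupted run.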
+ for query in query_list:
+ print(query)
+ task_id, answer, refs = get_gnqa(query, config['key.api']['fahamuai'], config['DEFAULT']['DATA_DIR'])
+ responses['question'].append(query)
+ responses['answer'].append(answer)
+ responses['task_id'].append(task_id)
+ responses['contexts'].append(simplifyContext(refs))
+ ndx+=1
+ time.sleep(10) # sleep a bit to not overtask the api
+ if ndx % 5 == 0:
+ print('Will print to file number {0}'.format(int(ndx/5)))
+ outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(config['out.response.dataset']['human_dir'],level,domain,str(int(ndx/5)))
+ writeDatasetFile(responses, outp_file)
+ responses = reset_responses()
+ if len(responses['question']) > 0:
+ outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(config['out.response.dataset']['human_dir'],level,domain,str(int(ndx/5)+1))
+ writeDatasetFile(responses, outp_file)
+
+def parse_responses(jsonfile):
+ print('Parsing human responses')
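+    # Each entry is expected to carry parallel lists "query", "topic"
+    # (0=general, 1=aging, 2=diabetes), "task_id", and optionally "answer",
+    # plus a "level" of 'cs' or 'de'.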
+ de_dict_general = {"level": "domainexpert", "domain": "general", "query": [], "task_id": []}
+ de_dict_aging = {"level": "domainexpert", "domain": "aging", "query": [], "task_id": []}
+ de_dict_diabetes = {"level": "domainexpert", "domain": "diabetes", "query": [], "task_id": []}
+ cs_dict_general = {"level": "citizenscientist", "domain": "general", "query": [], "task_id": []}
+ cs_dict_aging = {"level": "citizenscientist", "domain": "aging", "query": [], "task_id": []}
+ cs_dict_diabetes = {"level": "citizenscientist", "domain": "diabetes", "query": [], "task_id": []}
+ j = 0
+ for _, val in jsonfile.items():
+ ndx = 0
+ lvl = val.get("level")
+ for qry in val.get("query"):
+ ans = val.get("answer")[ndx] if "answer" in val else ""
+ tpc = val.get("topic")[ndx]
+ tpc = "general" if tpc==0 else "aging" if tpc==1 else "diabetes"
+ tskd = val.get("task_id")[ndx]
+ if lvl == 'cs' and tpc == 'general':
+ addToDataList(cs_dict_general, qry, ans, tskd)
+ elif lvl == 'cs' and tpc == 'aging':
+ addToDataList(cs_dict_aging, qry, ans, tskd)
+ elif lvl == 'cs' and tpc == 'diabetes':
+ addToDataList(cs_dict_diabetes, qry, ans, tskd)
+ elif lvl == 'de' and tpc == 'general':
+ addToDataList(de_dict_general, qry, ans, tskd)
+ elif lvl == 'de' and tpc == 'aging':
+ addToDataList(de_dict_aging, qry, ans, tskd)
+ elif lvl == 'de' and tpc == 'diabetes':
+ addToDataList(de_dict_diabetes, qry, ans, tskd)
+ else:
+ print('Somehow there is a query without a topic or expertise level')
+ ndx+=1
+ j+=1
+ create_datasets_from_taskid(de_dict_general)
+ create_datasets_from_taskid(de_dict_aging)
+ create_datasets_from_taskid(de_dict_diabetes)
+ create_datasets_from_taskid(cs_dict_general)
+ create_datasets_from_taskid(cs_dict_aging)
+ create_datasets_from_taskid(cs_dict_diabetes)
+
+def addToDataList(data_lst, qry, ans, tskd):
+ data_lst["query"].append(qry)
+ data_lst["task_id"].append(tskd)
+ if "answer" not in data_lst.keys():
+ data_lst["answer"] = []
+ data_lst["answer"].append(ans)
+
+
+def create_datasets_from_taskid(info_dict):
+ print('Creating dataset of questions from {0} in the topic of {1}'.format(info_dict["level"], info_dict["domain"]))
+ responses = reset_responses()
+ ndx = 0
+ query_list = info_dict["query"]
+ if "answer" in info_dict:
+ answers = info_dict["answer"]
+ else:
+ info_dict["answer"] = []
+ answers = []
+
+ for task_id in info_dict["task_id"]:
+ _, an_answer, refs = get_response_from_taskid(config['key.api']['fahamuai'], task_id)
+ responses['question'].append(query_list[ndx])
+        if ndx < len(answers) and answers[ndx] != "":
+            responses['answer'].append(answers[ndx])
+        else:
+            responses['answer'].append(an_answer)
+ responses['task_id'].append(task_id)
+ responses['contexts'].append(simplifyContext(refs))
+ ndx+=1
+ time.sleep(10) # sleep a bit to not overtask the api
+ if ndx % 5 == 0:
+ #print('Will print to file number {0}'.format(int(ndx/5)))
+ outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(config['out.response.dataset']['human_dir'],info_dict["level"],info_dict["domain"],str(int(ndx/5)))
+ writeDatasetFile(responses, outp_file)
+ responses = reset_responses()
+ if len(responses['question']) > 0:
+ #print('Will print to file number {0}'.format(int((ndx/5)+1)))
+ #print(responses)
+ outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(config['out.response.dataset']['human_dir'],info_dict["level"],info_dict["domain"],str(int(ndx/5)+1))
+ writeDatasetFile(responses, outp_file)
+
+try:
+    read_file = sys.argv[1]
+    file_type = sys.argv[2]
+except IndexError:
+    exit('Example use "python3 retrieve_context.py data/queries/qlist.json human/gpt4o"')
+
+
+print('Read input file')
+with open(read_file, "r") as r_file:
+ file_lst = json.load(r_file)
+if file_type == "gpt4o":
+ parse_document(file_lst)
+else:
+    parse_responses(file_lst)
\ No newline at end of file
diff --git a/gnqa/src/study2/run_questions.py b/gnqa/src/study2/run_questions.py
new file mode 100644
index 0000000..07aee5f
--- /dev/null
+++ b/gnqa/src/study2/run_questions.py
@@ -0,0 +1,38 @@
+import json
+import sys
+
+from r2r import R2RClient
+from study2.document_operations import DocOps, QuestionList
+
+'''
+*******************************************************************************
+Variables
+*******************************************************************************
+'''
+rag_response = {}
+client = R2RClient("http://localhost:8000")
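+# assumes an R2R server is already listening locally; adjust the URL for
+# other deployments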
+health_resp = client.health()
+
+'''
+*******************************************************************************
+Commands
+*******************************************************************************
+'''
+
+print("The R2R client's health status is {0}".format(health_resp))
+
+try:
+    read_file = sys.argv[1]
+    out_file = sys.argv[2]
+except IndexError:
+    exit('Example use "python run_questions.py ../data/questions/human/de/aging.json ../data/responses/human/de/aging_resp.json"')
+
+qLst = QuestionList(read_file, 1) # second parameter is for verbose output
+ndx = 1
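+# NOTE: the expertise level and domain are hardcoded below; change them to
+# match the question file passed on the command line.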
+for question in qLst.get("domainexpert","aging"):
+ print('Getting response for the following question --> {0}'.format(question))
+ rag_response[str(ndx)] = client.rag(question)
+ ndx += 1
+
+DocOps.writeDatasetFile(rag_response, out_file)
\ No newline at end of file