import os
import sys
import json
import time
import configparser
from apis.process import get_gnqa, get_response_from_taskid
config = configparser.ConfigParser()
config.read('_config.cfg')
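
# _config.cfg is expected to provide the sections and keys read below
# (layout inferred from the lookups in this script; values are illustrative):
#   [DEFAULT]
#   DATA_DIR = data/
#   [key.api]
#   fahamuai = <API token>
#   [out.response.dataset]
#   gpt4o_dir = data/dataset/gpt4o/
#   human_dir = data/dataset/human/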

def simplifyContext(refs):
    '''
    Each item in refs carries doc_id, bibInfo, and comboTxt; only comboTxt
    is needed, with newlines and tabs stripped out.
    '''
    result = []
    for item in refs:
        combo_text = item['comboTxt'].replace('\n', '').replace('\t', '')
        result.append(combo_text)
    return result

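# An illustrative refs item as consumed by simplifyContext (field names from
# the docstring above; values are hypothetical):
#   {"doc_id": "...", "bibInfo": {...}, "comboTxt": "full passage text ..."}
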
def writeDatasetFile(responses, outp_file):
    print(outp_file)
    output = json.dumps(responses, indent=2)
    # append mode keeps earlier batches; note that a file written more than
    # once will hold several concatenated JSON objects
    with open(outp_file, "a") as the_data:
        the_data.write(output)

def reset_responses():
    # parallel lists: index i across the four lists describes one response
    return {
        'question': [],
        'answer': [],
        'contexts': [],
        'task_id': []
    }

def parse_document(jsonfile):
    print('Parse document')
    for item in jsonfile:
        level = item['level']
        domain = item['domain']
        query_lst = item['query']
        create_datasets(query_lst, domain, level)

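# Expected shape of the gpt4o query file (keys taken from parse_document;
# values are illustrative):
# [
#   {"level": "domainexpert", "domain": "aging", "query": ["question 1", "question 2"]},
#   ...
# ]
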
def create_datasets(query_list, domain, level):
    print('Creating dataset')
    responses = reset_responses()
    ndx = 0
    for query in query_list:
        print(query)
        task_id, answer, refs = get_gnqa(
            query, config['key.api']['fahamuai'], config['DEFAULT']['DATA_DIR'])
        responses['question'].append(query)
        responses['answer'].append(answer)
        responses['task_id'].append(task_id)
        responses['contexts'].append(simplifyContext(refs))
        ndx += 1
        time.sleep(10)  # sleep a bit so as not to overtask the API
        # flush every five responses to its own numbered file
        if ndx % 5 == 0:
            print('Will print to file number {0}'.format(int(ndx/5)))
            outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(
                config['out.response.dataset']['gpt4o_dir'], level, domain, str(int(ndx/5)))
            writeDatasetFile(responses, outp_file)
            responses = reset_responses()
    # write whatever is left over from an incomplete batch of five
    if len(responses['question']) > 0:
        outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(
            config['out.response.dataset']['gpt4o_dir'], level, domain, str(int(ndx/5)+1))
        writeDatasetFile(responses, outp_file)

def parse_responses(jsonfile):
    print('Parsing human responses')
    de_dict_general = {"level": "domainexpert", "domain": "general", "query": [], "task_id": []}
    de_dict_aging = {"level": "domainexpert", "domain": "aging", "query": [], "task_id": []}
    de_dict_diabetes = {"level": "domainexpert", "domain": "diabetes", "query": [], "task_id": []}
    cs_dict_general = {"level": "citizenscientist", "domain": "general", "query": [], "task_id": []}
    cs_dict_aging = {"level": "citizenscientist", "domain": "aging", "query": [], "task_id": []}
    cs_dict_diabetes = {"level": "citizenscientist", "domain": "diabetes", "query": [], "task_id": []}
    for _, val in jsonfile.items():
        lvl = val.get("level")
        for ndx, qry in enumerate(val.get("query")):
            ans = val.get("answer")[ndx] if "answer" in val else ""
            # topic arrives as an integer code: 0 = general, 1 = aging, anything else = diabetes
            tpc = val.get("topic")[ndx]
            tpc = "general" if tpc == 0 else "aging" if tpc == 1 else "diabetes"
            tskd = val.get("task_id")[ndx]
            if lvl == 'cs' and tpc == 'general':
                addToDataList(cs_dict_general, qry, ans, tskd)
            elif lvl == 'cs' and tpc == 'aging':
                addToDataList(cs_dict_aging, qry, ans, tskd)
            elif lvl == 'cs' and tpc == 'diabetes':
                addToDataList(cs_dict_diabetes, qry, ans, tskd)
            elif lvl == 'de' and tpc == 'general':
                addToDataList(de_dict_general, qry, ans, tskd)
            elif lvl == 'de' and tpc == 'aging':
                addToDataList(de_dict_aging, qry, ans, tskd)
            elif lvl == 'de' and tpc == 'diabetes':
                addToDataList(de_dict_diabetes, qry, ans, tskd)
            else:
                print('Somehow there is a query without a topic or expertise level')
    create_datasets_from_taskid(de_dict_general)
    create_datasets_from_taskid(de_dict_aging)
    create_datasets_from_taskid(de_dict_diabetes)
    create_datasets_from_taskid(cs_dict_general)
    create_datasets_from_taskid(cs_dict_aging)
    create_datasets_from_taskid(cs_dict_diabetes)

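# Expected shape of the human-responses file (keys taken from parse_responses;
# topic is the integer code described above and values are illustrative):
# {
#   "entry1": {"level": "cs", "query": ["..."], "answer": ["..."],
#              "topic": [0], "task_id": ["..."]},
#   ...
# }
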
def addToDataList(data_lst, qry, ans, tskd):
    data_lst["query"].append(qry)
    data_lst["task_id"].append(tskd)
    # create the answer list lazily the first time an answer is recorded
    if "answer" not in data_lst:
        data_lst["answer"] = []
    data_lst["answer"].append(ans)

def create_datasets_from_taskid(info_dict):
    print('Creating dataset of questions from {0} in the topic of {1}'.format(
        info_dict["level"], info_dict["domain"]))
    responses = reset_responses()
    ndx = 0
    query_list = info_dict["query"]
    answers = info_dict.get("answer", [])
    for task_id in info_dict["task_id"]:
        _, an_answer, refs = get_response_from_taskid(config['key.api']['fahamuai'], task_id)
        responses['question'].append(query_list[ndx])
        # prefer the stored human answer; fall back to the API answer when none was recorded
        if ndx >= len(answers) or answers[ndx] == "":
            responses['answer'].append(an_answer)
        else:
            responses['answer'].append(answers[ndx])
        responses['task_id'].append(task_id)
        responses['contexts'].append(simplifyContext(refs))
        ndx += 1
        time.sleep(10)  # sleep a bit so as not to overtask the API
        # flush every five responses to its own numbered file
        if ndx % 5 == 0:
            outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(
                config['out.response.dataset']['human_dir'], info_dict["level"],
                info_dict["domain"], str(int(ndx/5)))
            writeDatasetFile(responses, outp_file)
            responses = reset_responses()
    # write whatever is left over from an incomplete batch of five
    if len(responses['question']) > 0:
        outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(
            config['out.response.dataset']['human_dir'], info_dict["level"],
            info_dict["domain"], str(int(ndx/5)+1))
        writeDatasetFile(responses, outp_file)


if __name__ == '__main__':
    try:
        read_file = str(sys.argv[1])
        file_type = str(sys.argv[2])
    except IndexError:
        sys.exit('Example use "python3 retrieve_context.py data/queries/qlist.json human/gpt4o"')
    print('Read input file')
    with open(read_file, "r") as r_file:
        file_lst = json.load(r_file)
    if file_type == "gpt4o":
        parse_document(file_lst)
    else:
        parse_responses(file_lst)
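
# Example invocations (paths illustrative; the second argument picks the parser,
# where anything other than "gpt4o" is treated as human responses):
#   python3 retrieve_context.py data/queries/qlist.json gpt4o
#   python3 retrieve_context.py data/queries/human_responses.json human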