'''
Retrieve GNQA answers and their reference contexts for a list of queries,
and write them out as JSON dataset files.

Usage: python3 retrieve_context.py <input.json> <gpt4o|human>
'''
import sys
import json
import time
import configparser

from apis.process import get_gnqa, get_response_from_taskid

config = configparser.ConfigParser()
config.read('_config.cfg')


def simplifyContext(refs):
    '''
    The refs object is a list of items containing doc_id, bibInfo, and
    comboTxt; we only need comboTxt.
    '''
    result = []
    for item in refs:
        combo_text = item['comboTxt']
        combo_text = combo_text.replace('\n', '').replace('\t', '')
        result.append(combo_text)
    return result


def writeDatasetFile(responses, outp_file):
    '''
    Append the responses dict to outp_file as pretty-printed JSON.
    '''
    print(outp_file)
    output = json.dumps(responses, indent=2)
    with open(outp_file, "a") as the_data:
        the_data.write(output)


def reset_responses():
    return {
        'question': [],
        'answer': [],
        'contexts': [],
        'task_id': []
    }


def parse_document(jsonfile):
    print('Parse document')
    for item in jsonfile:
        level = item['level']
        domain = item['domain']
        query_lst = item['query']
        create_datasets(query_lst, domain, level)


def create_datasets(query_list, domain, level):
    print('Creating dataset')
    responses = reset_responses()
    ndx = 0
    for query in query_list:
        print(query)
        task_id, answer, refs = get_gnqa(
            query, config['key.api']['fahamuai'], config['DEFAULT']['DATA_DIR'])
        responses['question'].append(query)
        responses['answer'].append(answer)
        responses['task_id'].append(task_id)
        responses['contexts'].append(simplifyContext(refs))
        ndx += 1
        time.sleep(10)  # sleep a bit so we do not overtask the API
        if ndx % 5 == 0:  # flush a batch of five responses to disk
            print('Will print to file number {0}'.format(ndx // 5))
            outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(
                config['out.response.dataset']['gpt4o_dir'], level, domain, ndx // 5)
            writeDatasetFile(responses, outp_file)
            responses = reset_responses()
    if len(responses['question']) > 0:  # flush any leftover responses
        outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(
            config['out.response.dataset']['gpt4o_dir'], level, domain, ndx // 5 + 1)
        writeDatasetFile(responses, outp_file)


def parse_responses(jsonfile):
    print('Parsing human responses')
    de_dict_general = {"level": "domainexpert", "domain": "general", "query": [], "task_id": []}
    de_dict_aging = {"level": "domainexpert", "domain": "aging", "query": [], "task_id": []}
    de_dict_diabetes = {"level": "domainexpert", "domain": "diabetes", "query": [], "task_id": []}
    cs_dict_general = {"level": "citizenscientist", "domain": "general", "query": [], "task_id": []}
    cs_dict_aging = {"level": "citizenscientist", "domain": "aging", "query": [], "task_id": []}
    cs_dict_diabetes = {"level": "citizenscientist", "domain": "diabetes", "query": [], "task_id": []}
    # Route each (expertise level, topic) pair to its collection dict.
    buckets = {
        ('de', 'general'): de_dict_general,
        ('de', 'aging'): de_dict_aging,
        ('de', 'diabetes'): de_dict_diabetes,
        ('cs', 'general'): cs_dict_general,
        ('cs', 'aging'): cs_dict_aging,
        ('cs', 'diabetes'): cs_dict_diabetes,
    }
    for _, val in jsonfile.items():
        lvl = val.get("level")
        for ndx, qry in enumerate(val.get("query")):
            ans = val.get("answer")[ndx] if "answer" in val else ""
            tpc = val.get("topic")[ndx]
            tpc = "general" if tpc == 0 else "aging" if tpc == 1 else "diabetes"
            tskd = val.get("task_id")[ndx]
            bucket = buckets.get((lvl, tpc))
            if bucket is not None:
                addToDataList(bucket, qry, ans, tskd)
            else:
                print('Somehow there is a query without a topic or expertise level')
    create_datasets_from_taskid(de_dict_general)
    create_datasets_from_taskid(de_dict_aging)
    create_datasets_from_taskid(de_dict_diabetes)
    create_datasets_from_taskid(cs_dict_general)
    create_datasets_from_taskid(cs_dict_aging)
    create_datasets_from_taskid(cs_dict_diabetes)
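
# For reference, a minimal sketch of the human-responses JSON that
# parse_responses consumes, inferred from the accessors above: entries keyed
# by an arbitrary respondent id, each holding "level" ("cs" or "de") plus
# parallel lists "query", "topic" (0 = general, 1 = aging, anything else =
# diabetes), "task_id", and optionally "answer". The ids and values below
# are illustrative assumptions, not real data.
#
# {
#   "respondent_1": {
#     "level": "cs",
#     "query": ["What is ageing?"],
#     "topic": [1],
#     "task_id": ["some-task-uuid"],
#     "answer": [""]
#   }
# }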


def addToDataList(data_lst, qry, ans, tskd):
    data_lst["query"].append(qry)
    data_lst["task_id"].append(tskd)
    if "answer" not in data_lst:
        data_lst["answer"] = []
    data_lst["answer"].append(ans)


def create_datasets_from_taskid(info_dict):
    print('Creating dataset of questions from {0} in the topic of {1}'.format(
        info_dict["level"], info_dict["domain"]))
    responses = reset_responses()
    ndx = 0
    query_list = info_dict["query"]
    answers = info_dict.get("answer", [])
    for task_id in info_dict["task_id"]:
        _, an_answer, refs = get_response_from_taskid(
            config['key.api']['fahamuai'], task_id)
        responses['question'].append(query_list[ndx])
        # Prefer the human-provided answer when one exists; otherwise fall
        # back to the answer retrieved from the API. Guarding on the length
        # avoids an IndexError when no answers were supplied.
        if ndx < len(answers) and answers[ndx] != "":
            responses['answer'].append(answers[ndx])
        else:
            responses['answer'].append(an_answer)
        responses['task_id'].append(task_id)
        responses['contexts'].append(simplifyContext(refs))
        ndx += 1
        time.sleep(10)  # sleep a bit so we do not overtask the API
        if ndx % 5 == 0:  # flush a batch of five responses to disk
            outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(
                config['out.response.dataset']['human_dir'],
                info_dict["level"], info_dict["domain"], ndx // 5)
            writeDatasetFile(responses, outp_file)
            responses = reset_responses()
    if len(responses['question']) > 0:  # flush any leftover responses
        outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(
            config['out.response.dataset']['human_dir'],
            info_dict["level"], info_dict["domain"], ndx // 5 + 1)
        writeDatasetFile(responses, outp_file)


if __name__ == '__main__':
    try:
        read_file = sys.argv[1]
        file_type = sys.argv[2]
    except IndexError:
        sys.exit('Example use: "python3 retrieve_context.py '
                 'data/queries/qlist.json gpt4o", where the second argument '
                 'is either "gpt4o" or "human"')
    print('Read input file')
    with open(read_file, "r") as r_file:
        file_lst = json.load(r_file)
    if file_type == "gpt4o":
        parse_document(file_lst)
    else:
        parse_responses(file_lst)
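
# For reference, a minimal sketch of the _config.cfg this script reads,
# listing only the sections and keys referenced above. The values are
# placeholder assumptions; substitute your own API key and directories.
#
# [DEFAULT]
# DATA_DIR = data/
#
# [key.api]
# fahamuai = <your-api-key>
#
# [out.response.dataset]
# gpt4o_dir = data/dataset/gpt4o/
# human_dir = data/dataset/human/
#
# Likewise, the gpt4o-style query file handed to parse_document is assumed
# (from the keys it reads) to be a list of objects such as:
#
# [
#   {"level": "de", "domain": "aging", "query": ["What is ageing?"]}
# ]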