path: root/gnqa/src/study2
Diffstat (limited to 'gnqa/src/study2')
-rw-r--r--  gnqa/src/study2/__init__.py               0
-rw-r--r--  gnqa/src/study2/create_dataset.py        22
-rw-r--r--  gnqa/src/study2/document_operations.py  145
-rw-r--r--  gnqa/src/study2/parse_r2r_result.py      53
-rw-r--r--  gnqa/src/study2/parsejson.py             69
-rw-r--r--  gnqa/src/study2/retrieve_context.py     165
-rw-r--r--  gnqa/src/study2/run_questions.py         37
7 files changed, 491 insertions, 0 deletions
diff --git a/gnqa/src/study2/__init__.py b/gnqa/src/study2/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/gnqa/src/study2/__init__.py
diff --git a/gnqa/src/study2/create_dataset.py b/gnqa/src/study2/create_dataset.py
new file mode 100644
index 00000000..2d62f18b
--- /dev/null
+++ b/gnqa/src/study2/create_dataset.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python3
+
+import sys
+from study2.document_operations import DocOps
+
+
+'''
+*******************************************************************************
+Commands
+*******************************************************************************
+'''
+
+try:
+    read_file = str(sys.argv[1])
+    out_file  = str(sys.argv[2])
+except IndexError:
+    sys.exit('Example use "python create_dataset.py ../data/lists/human_list_cs_gn.json ../data/dataset/human_cs_gn.json"')
+
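+# read_file should hold a JSON array of response-file paths; each entry is
+# loaded and merged into a single ragas dataset (see DocOps.combine_responses)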
+doc_list = DocOps.read_json_document(read_file)
+DocOps.combine_responses(doc_list, out_file)
\ No newline at end of file
diff --git a/gnqa/src/study2/document_operations.py b/gnqa/src/study2/document_operations.py
new file mode 100644
index 00000000..3112d915
--- /dev/null
+++ b/gnqa/src/study2/document_operations.py
@@ -0,0 +1,145 @@
+import os
+import json
+
+class DocOps:
+    _type = ''
+    verbose = 0  # toggles the key/value trace in extract_response
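+    # values_key maps keys found in an R2R retrieval result to fields of the
+    # ragas output dict; "append" selects list-append (1) or scalar assign (0)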
+    values_key = {
+        "text" :           {"name": "contexts",      "append": 1},
+        "associatedQuery": {"name": "question",      "append": 0},
+        "id":              {"name": "id",            "append": 1},
+        "title":           {"name": "titles",        "append": 1},
+        "document_id":     {"name": "document_id",   "append": 1},
+        "extraction_id":   {"name": "extraction_id", "append": 1},
+        "content":         {"name": "answer",        "append": 0}
+    }
+
+    def __init__(self):
+        self._type = 'DocOps'
+
+    @staticmethod
+    def reset_responses():
+        return {
+            'question': [],
+            'answer':   [],
+            'contexts': []
+        }
+
+    @staticmethod
+    def writeDatasetFile(responses, outp_file):
+        print(outp_file)
+        output = json.dumps(responses, indent=2)
+        if os.path.exists(outp_file):
+            with open(outp_file, "a") as the_data:
+                the_data.write('\n\n' + output)
+        else:
+            with open(outp_file, "a") as the_data:
+                the_data.write(output)
+
+    @staticmethod
+    def get_r2r_ragas_out_dict():
+        return {"titles":        [],
+                "extraction_id": [],
+                "document_id":   [],
+                "id":            [],
+                "contexts":      [],
+                "answer":        "",
+                "question":      ""}
+
+    @staticmethod
+    def read_json_document(file_name):
+        with open(file_name, "r") as result_file:
+            return json.load(result_file)
+    
+    @staticmethod
+    def combine_responses(doc_lst, out_filename):
+        ragas_output = DocOps.reset_responses()
+
+        for doc in doc_lst:
+            the_doc = DocOps.read_json_document(doc)
+            ragas_output['question'].append(
+                the_doc['question'])
+            ragas_output['answer'].append(
+                the_doc['answer'])
+            ragas_output['contexts'].append(
+                the_doc['contexts'])
+        DocOps.writeDatasetFile(
+            ragas_output, out_filename)
+
+
+    @staticmethod
+    def extract_response(obj, values_key, thedict):
+        if isinstance(obj, dict):
+            for key, val in obj.items():
+                if (key in values_key.keys()):
+                    if (values_key[key]["append"]):
+                        thedict[values_key[key]["name"]].append(val.replace("\n", " ").strip())
+                    else:
+                        thedict[values_key[key]["name"]] = val.replace("\n", " ").strip()
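+                    # tuple indexing: prints the trace only when DocOps.verbose is 1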
+                    print(("", "Key -> {0}\tValue -> {1}".format(key,val)) [DocOps.verbose])
+                else:
+                    if (len(obj.items()) == 1 ):
+                        print(key, " --> ", val)
+                DocOps.extract_response(val, values_key, thedict)
+        elif isinstance(obj, list):
+            for item in obj:
+                DocOps.extract_response(item, values_key, thedict)
+
+class QuestionList:
+    _verbose = 0
+    _doc = ''
+    _fname = ''
+    _question_list = {
+        "domainexpert": { 
+            "gn":  [],
+            "aging":    [],
+            "diabetes": []
+        },
+        "citizenscientist": { 
+            "gn":  [],
+            "aging":    [],
+            "diabetes": []
+        }
+    }
+
+    def __init__(self, the_file, verbose=0):
+        print('QuestionList has been initialized {0}, verbosity is {1}'.format(the_file, verbose))
+        self._fname = the_file
+        self._verbose = verbose
+        self.read_document()
+        self.parse_document()
+        #self._print()
+
+
+    def read_document(self):
+        self._doc = DocOps.read_json_document(
+            self._fname)
+
+
+
+    def parse_document(self):
+        print(('', '\nParse question list') [self._verbose] )
+        for item in self._doc:
+            level     = item['level']
+            domain    = item['domain']
+            query_lst = item['query']
+            self._question_list[level][domain] = query_lst
+
+
+    def print_list(self, the_lst):
+        for ndx, item in enumerate(the_lst, start=1):
+            print('\t[{0}] {1}'.format(ndx, item))
+    
+    def _print(self):
+        print(json.dumps(self._question_list, indent=2))
+
+    def get(self, level, domain):
+        return self._question_list[level][domain]
+    
+
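+# Illustrative usage (paths follow run_questions.py):
+#   qlst = QuestionList('../data/questions/human/de/aging.json', verbose=1)
+#   qlst.get('domainexpert', 'aging')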
diff --git a/gnqa/src/study2/parse_r2r_result.py b/gnqa/src/study2/parse_r2r_result.py
new file mode 100644
index 00000000..5cba6d3e
--- /dev/null
+++ b/gnqa/src/study2/parse_r2r_result.py
@@ -0,0 +1,53 @@
+import json
+from study2.document_operations import DocOps
+
+verbose = 0
+read_file = '/home/shebes/Coding/gn-ai/gnqa/paper2_eval/data/responses/human/cs_diabetes_responses.json'
+out_file = '../data/dataset/human/intermediate_files/human_cs_diabetes_'
+
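+# mirrors DocOps.values_key so this script can also run standalone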
+values_key = {
+    "text" :           {"name": "contexts",      "append": 1},
+    "associatedQuery": {"name": "question",      "append": 0},
+    "id":              {"name": "id",            "append": 1},
+    "title":           {"name": "titles",        "append": 1},
+    "document_id":     {"name": "document_id",   "append": 1},
+    "extraction_id":   {"name": "extraction_id", "append": 1},
+    "content":         {"name": "answer",        "append": 0}
+}
+
+def get_ragas_out_dict():
+    return { "titles":        [],
+             "extraction_id": [],
+             "document_id":   [],
+             "id":            [],
+             "contexts":      [],
+             "answer":        "",
+             "question":      ""}
+
+def extract_response(obj, values_key, thedict):
+    if isinstance(obj, dict):
+        for key, val in obj.items():
+            if (key in values_key.keys()):
+                if (values_key[key]["append"]):
+                    thedict[values_key[key]["name"]].append(val.replace("\n", " ").strip())
+                else:
+                    thedict[values_key[key]["name"]] = val.replace("\n", " ").strip()
+                print(("", "Key -> {0}\tValue -> {1}".format(key,val)) [verbose])
+            else:
+                if (len(obj.items()) == 1 ):
+                    print(key, " --> ", val)
+            extract_response(val, values_key, thedict)
+    elif isinstance(obj, list):
+        for item in obj:
+            extract_response(item, values_key, thedict)
+
+# read_file holds the raw R2R responses, keyed by question number
+with open(read_file, "r") as r_file:
+    result_file = json.load(r_file)
+
+print('There are {0} keys in the result file'.format(len(result_file)))
+for key in result_file.keys():
+    eval_dataset_dict = get_ragas_out_dict()
+    extract_response(result_file[key], values_key, eval_dataset_dict)
+    DocOps.writeDatasetFile(eval_dataset_dict, '{0}{1}'.format(out_file, key))
diff --git a/gnqa/src/study2/parsejson.py b/gnqa/src/study2/parsejson.py
new file mode 100644
index 00000000..b49a898a
--- /dev/null
+++ b/gnqa/src/study2/parsejson.py
@@ -0,0 +1,69 @@
+import json
+import sys
+
+
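+# Walks arbitrarily nested JSON, appending every "text" value to
+# thedict["contexts"] and keeping the last "answer"/"question" seen.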
+def iterate_json(obj, thedict):
+    if isinstance(obj, dict):
+        for key, val in obj.items():
+            if (key == "text"):
+                thedict["contexts"].append(val.replace("\n", " ").strip())
+            elif (key == "answer"):
+                thedict["answer"] = val.replace("\n", " ").strip()
+            elif (key == "question"):
+                thedict["question"] = val.replace("\n", " ").strip()
+            else:
+                if (len(obj.items()) == 1 ):
+                    print(key, " --> ", val)
+            iterate_json(val, thedict)
+    elif isinstance(obj, list):
+        for item in obj:
+            iterate_json(item, thedict)
+
+def create_dataset_from_files(tag, file_name, rag_out):
+    for the_file in file_name[tag]:
+        ragas_output = {
+            "contexts": [],
+            "answer": "",
+            "question": ""}
+        with open("./data/"+the_file, "r") as r_file:
+            data_file = json.load(r_file)
+        iterate_json(data_file, ragas_output)
+        rag_out["answer"].append(ragas_output["answer"])
+        rag_out["question"].append(ragas_output["question"])
+        rag_out["contexts"].append(ragas_output["contexts"])
+
+def create_resultset_from_file(file_name):
+    ragas_output = {
+        "contexts": [],
+        "answer": "",
+        "question": ""}
+    with open("./data/"+file_name, "r") as r_file:
+        data_file = json.load(r_file)
+    iterate_json(data_file, ragas_output)
+    return ragas_output
+
+
+try:
+    file_list_tag = str(sys.argv[1])
+    read_file = str(sys.argv[2])  # e.g. doc_list.json
+    outp_file = str(sys.argv[3])
+except IndexError:
+    sys.exit('Usage: python parsejson.py <file_list_tag> <file_list.json> <output_file>')
+
+rag_out = {
+    "question": [],
+    "answer": [],
+    "contexts": []
+}
+
+# read_file is a JSON object mapping tags to lists of input files
+with open(read_file, "r") as r_file:
+    file_lst = json.load(r_file)
+
+create_dataset_from_files(file_list_tag, file_lst, rag_out)
+
+with open(outp_file, "a") as the_data:
+    the_data.write(",\n")
+    the_data.write(json.dumps(rag_out, indent=2))
diff --git a/gnqa/src/study2/retrieve_context.py b/gnqa/src/study2/retrieve_context.py
new file mode 100644
index 00000000..fca90dd9
--- /dev/null
+++ b/gnqa/src/study2/retrieve_context.py
@@ -0,0 +1,165 @@
+import sys
+import json
+import time
+import configparser
+from apis.process import get_gnqa, get_response_from_taskid
+
+
+config = configparser.ConfigParser()
+config.read('_config.cfg')
+
+'''
+The refs object is a list of items, each containing doc_id, bibInfo and
+comboTxt; only comboTxt is needed here.
+'''
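+# illustrative: [{'doc_id': 'd1', 'bibInfo': [...], 'comboTxt': 'a\nb'}] -> ['ab']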
+def simplifyContext(refs):
+    result = []
+    for item in refs:
+        combo_text = item['comboTxt']
+        combo_text = combo_text.replace('\n','')
+        combo_text = combo_text.replace('\t','')
+        result.append(combo_text)
+    return result
+
+def writeDatasetFile(responses, outp_file):
+  print(outp_file)
+  output = json.dumps(responses, indent=2)
+  with open(outp_file, "a") as the_data:
+    the_data.write(output)
+
+
+def reset_responses():
+  return {
+    'question': [],
+    'answer':   [],
+    'contexts':  [],
+    'task_id': []
+  }
+
+def parse_document(jsonfile):
+  print('Parse document')
+  for item in jsonfile:
+    level     = item['level']
+    domain    = item['domain']
+    query_lst = item['query']
+    create_datasets(query_lst, domain, level)
+
+def create_datasets(query_list, domain, level):
+  print('Creating dataset')
+  responses = reset_responses()
+  ndx = 0
+  for query in query_list:
+    print(query)
+    task_id, answer, refs = get_gnqa(query, config['key.api']['fahamuai'], config['DEFAULT']['DATA_DIR'])
+    responses['question'].append(query)
+    responses['answer'].append(answer)
+    responses['task_id'].append(task_id)
+    responses['contexts'].append(simplifyContext(refs))
+    ndx+=1
+    time.sleep(10) # sleep a bit to not overtask the api
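+    # flush a partial dataset to disk after every fifth query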
+    if ndx % 5 == 0:
+      print('Will print to file number {0}'.format(int(ndx/5)))
+      outp_file  = '{0}dataset_{1}_{2}_{3}.json'.format(config['out.response.dataset']['human_dir'],level,domain,str(int(ndx/5)))
+      writeDatasetFile(responses, outp_file)
+      responses = reset_responses()
+  if len(responses['question']) > 0:
+    outp_file  = '{0}dataset_{1}_{2}_{3}.json'.format(config['out.response.dataset']['human_dir'],level,domain,str(int(ndx/5)+1))
+    writeDatasetFile(responses, outp_file)
+
+def parse_responses(jsonfile):
+  print('Parsing human responses')
+  de_dict_general  = {"level": "domainexpert",     "domain": "general",  "query": [], "task_id": []}
+  de_dict_aging    = {"level": "domainexpert",     "domain": "aging",    "query": [], "task_id": []}
+  de_dict_diabetes = {"level": "domainexpert",     "domain": "diabetes", "query": [], "task_id": []}
+  cs_dict_general  = {"level": "citizenscientist", "domain": "general",  "query": [], "task_id": []}
+  cs_dict_aging    = {"level": "citizenscientist", "domain": "aging",    "query": [], "task_id": []}
+  cs_dict_diabetes = {"level": "citizenscientist", "domain": "diabetes", "query": [], "task_id": []}
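+  # level codes: 'cs' = citizenscientist, 'de' = domainexpert;
+  # topic codes: 0 = general, 1 = aging, 2 = diabetes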
+  for _, val in jsonfile.items():
+    ndx = 0
+    lvl = val.get("level")
+    for qry in val.get("query"):
+      ans = val.get("answer")[ndx] if "answer" in val else ""
+      tpc  = val.get("topic")[ndx]
+      tpc = "general" if tpc==0 else "aging" if tpc==1 else "diabetes"
+      tskd = val.get("task_id")[ndx]
+      if   lvl == 'cs' and tpc == 'general':
+        addToDataList(cs_dict_general, qry, ans, tskd)
+      elif lvl == 'cs' and tpc == 'aging':
+        addToDataList(cs_dict_aging, qry, ans, tskd)
+      elif lvl == 'cs' and tpc == 'diabetes':
+        addToDataList(cs_dict_diabetes, qry, ans, tskd)
+      elif lvl == 'de' and tpc == 'general':
+        addToDataList(de_dict_general, qry, ans, tskd)
+      elif lvl == 'de' and tpc == 'aging':
+        addToDataList(de_dict_aging, qry, ans, tskd)
+      elif lvl == 'de' and tpc == 'diabetes':
+        addToDataList(de_dict_diabetes, qry, ans, tskd)
+      else:
+        print('Somehow there is a query without a topic or expertise level')
+      ndx+=1
+  create_datasets_from_taskid(de_dict_general)
+  create_datasets_from_taskid(de_dict_aging)
+  create_datasets_from_taskid(de_dict_diabetes)
+  create_datasets_from_taskid(cs_dict_general)
+  create_datasets_from_taskid(cs_dict_aging)
+  create_datasets_from_taskid(cs_dict_diabetes)
+
+def addToDataList(data_lst, qry, ans, tskd):
+  data_lst["query"].append(qry)
+  data_lst["task_id"].append(tskd)
+  if "answer" not in data_lst.keys():
+    data_lst["answer"] = []
+  data_lst["answer"].append(ans)
+
+
+def create_datasets_from_taskid(info_dict):#task_list, query_list, answers, domain, level):
+  print('Creating dataset of questions from {0} in the topic of {1}'.format(info_dict["level"], info_dict["domain"]))
+  responses = reset_responses()
+  ndx = 0
+  query_list = info_dict["query"]
+  if "answer" in info_dict:
+    answers    = info_dict["answer"]
+  else:
+    info_dict["answer"] = []
+    answers = []
+
+  for task_id in info_dict["task_id"]:
+    _, an_answer, refs = get_response_from_taskid(config['key.api']['fahamuai'], task_id)
+    responses['question'].append(query_list[ndx])
+    if ndx >= len(answers) or answers[ndx] == "":
+      responses['answer'].append(an_answer)
+    else:
+      responses['answer'].append(answers[ndx])
+    responses['task_id'].append(task_id)
+    responses['contexts'].append(simplifyContext(refs))
+    ndx+=1
+    time.sleep(10) # sleep a bit to not overtask the api
+    if ndx % 5 == 0:
+      #print('Will print to file number {0}'.format(int(ndx/5)))
+      outp_file  = '{0}dataset_{1}_{2}_{3}_two.json'.format(config['out.response.dataset']['human_dir'],info_dict["level"],info_dict["domain"],str(int(ndx/5)))
+      writeDatasetFile(responses, outp_file)
+      responses = reset_responses()
+  if len(responses['question']) > 0:
+    #print('Will print to file number {0}'.format(int((ndx/5)+1)))
+    #print(responses)
+    outp_file  = '{0}dataset_{1}_{2}_{3}_two.json'.format(config['out.response.dataset']['human_dir'],info_dict["level"],info_dict["domain"],str(int(ndx/5)+1))
+    writeDatasetFile(responses, outp_file)
+
+try:
+  read_file = str(sys.argv[1])
+  file_type = str(sys.argv[2])
+except IndexError:
+  sys.exit('Example use "python3 retrieve_context.py data/queries/qlist.json human/gpt4o"')
+
+
+print('Read input file')
+with open(read_file, "r") as r_file:
+  file_lst = json.load(r_file)
+if file_type == "gpt4o":
+  parse_document(file_lst)
+else:
+  parse_responses(file_lst)
diff --git a/gnqa/src/study2/run_questions.py b/gnqa/src/study2/run_questions.py
new file mode 100644
index 00000000..07aee5f0
--- /dev/null
+++ b/gnqa/src/study2/run_questions.py
@@ -0,0 +1,37 @@
+import sys
+
+from r2r import R2RClient
+from study2.document_operations import DocOps, QuestionList
+
+'''
+*******************************************************************************
+Variables
+*******************************************************************************
+'''
+rag_response = {}
+client       = R2RClient("http://localhost:8000")
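+# assumes an R2R server is listening locally on port 8000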
+health_resp  = client.health()
+
+'''
+*******************************************************************************
+Commands
+*******************************************************************************
+'''
+
+print("The R2R client's health status is {0}".format(health_resp))
+
+try:
+    read_file = str(sys.argv[1])
+    out_file  = str(sys.argv[2])
+except IndexError:
+    sys.exit('Example use "python run_questions.py ../data/questions/human/de/aging.json ../data/responses/human/de/aging_resp.json"')
+
+qLst = QuestionList(read_file, 1) # second parameter is for verbose output
+ndx = 1
+for question in qLst.get("domainexpert","aging"):
+    print('Getting response for the following question --> {0}'.format(question))
+    rag_response[str(ndx)] = client.rag(question)
+    ndx += 1
+
+DocOps.writeDatasetFile(rag_response, out_file)