5 files changed, 411 insertions, 0 deletions
diff --git a/gnqa/src/study1/__init__.py b/gnqa/src/study1/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/gnqa/src/study1/__init__.py
diff --git a/gnqa/src/study1/parsejson.py b/gnqa/src/study1/parsejson.py
new file mode 100644
index 00000000..b49a898a
--- /dev/null
+++ b/gnqa/src/study1/parsejson.py
@@ -0,0 +1,63 @@
+import json
+import sys
+
+
+def iterate_json(obj, thedict):
+    """Recursively collect cleaned "text", "answer" and "question" values into thedict."""
+    if isinstance(obj, list):
+        for element in obj:
+            iterate_json(element, thedict)
+        return
+    if not isinstance(obj, dict):
+        return
+    for name, value in obj.items():
+        if name == "text":
+            thedict["contexts"].append(value.replace("\n", " ").strip())
+        elif name in ("answer", "question"):
+            thedict[name] = value.replace("\n", " ").strip()
+        elif len(obj) == 1:
+            print(name, " --> ", value)
+        iterate_json(value, thedict)
+
+def create_dataset_from_files(tag, file_name, rag_out):
+    """Append the parsed fields of every file listed under file_name[tag] to rag_out.
+
+    File names are resolved relative to ./data/; rag_out is modified in place.
+    """
+    for entry in file_name[tag]:
+        with open("./data/" + entry, "r") as handle:
+            parsed = json.load(handle)
+        # one question/answer/contexts row per input file
+        row = {"contexts": [], "answer": "", "question": ""}
+        iterate_json(parsed, row)
+        for field in ("answer", "question", "contexts"):
+            rag_out[field].append(row[field])
+
+def create_resultset_from_file(file_name):
+    """Parse ./data/<file_name> and return its contexts/answer/question dict."""
+    ragas_output = {"contexts": [], "answer": "", "question": ""}
+    with open("./data/" + file_name, "r") as r_file:  # was the undefined `the_file`
+        iterate_json(json.load(r_file), ragas_output)
+    return ragas_output
+file_list_tag = str(sys.argv[1])  # key in the file-list JSON selecting which files to parse
+read_file = str(sys.argv[2]) # e.g. doc_list.json
+outp_file = str(sys.argv[3])  # output path; opened in append mode below
+
+rag_out = {
+    "question": [],
+    "answer": [],
+    "contexts": []
+}
+
+cntxt_lst = []  # NOTE(review): not referenced anywhere in the visible script
+
+# this should be a json file with a list of input files and an output file
+with open(read_file, "r") as r_file:
+    file_lst = json.load(r_file)
+
+create_dataset_from_files(file_list_tag, file_lst, rag_out)
+
+with open(outp_file, "a") as the_data:
+    # a ",\n" separator is written before each appended JSON document
+    the_data.write(",\n")
+    the_data.write(json.dumps(rag_out, indent=2))
diff --git a/gnqa/src/study1/parsejson_ratings.py b/gnqa/src/study1/parsejson_ratings.py
new file mode 100644
index 00000000..bd20417e
--- /dev/null
+++ b/gnqa/src/study1/parsejson_ratings.py
@@ -0,0 +1,106 @@
+import json
+import sys
+
+"""
+This file converts the json report from GNQA into a list of individual users
+and their interactions with the system. At the moment we are getting their
+questions, the system's answers, and their ratings of the overall system response.
+Unfortunately the context is not saved with the answer.
+"""
+# report_data, ratings_dict
+def reorg_json_report(obj, resp_lst, ratings):
+  """Fold a GNQA report (one record dict, or a list of them) into resp_lst.
+
+  A dict is one interaction: its query/answer/weight/task_id values are appended
+  to ratings, which update_ratings files under the record's user_id. A list is
+  walked item by item with fresh ratings; query_dict/taskquery_dict are updated.
+  """
+  if isinstance(obj, dict):
+    user_id = ''
+    for key, val in obj.items():
+      if key == "user_id":
+        user_id = val
+        # isKeyInList returns a (flag, item) tuple; the old `== 0` test on it was always False
+        if isKeyInList(resp_lst, val)[0]:
+          print("\nKey {0} is already present".format(val))
+      elif key in ("query", "answer", "weight", "task_id"):
+        ratings[key].append(val)
+    print('The ratings before being pushed to user_responses ->  {0}'.format(ratings))
+    query = ratings["query"][0]
+    qcount = query_dict.get(query, 0)
+    query_dict[query] = qcount + 1
+    update_ratings(resp_lst, user_id, ratings)  # sole place a user entry is created
+    if qcount == 0:
+      taskquery_dict.setdefault(ratings["task_id"][0], query)
+  elif isinstance(obj, list):
+    for item in obj:
+      reorg_json_report(item, resp_lst, reset_ratings())
+
+
+
+def create_resultset_from_file(resp_lst, file_name, output):
+  """Load the JSON report at file_name and fold it into resp_lst via reorg_json_report."""
+  with open(file_name, "r") as report:
+    reorg_json_report(json.load(report), resp_lst, output)
+
+def isKeyInList(the_lst, the_key):
+  """Return (1, item) for the last item of the_lst containing the_key, else (0, {})."""
+  matches = [entry for entry in the_lst if the_key in entry]
+  if matches:
+    # several items may hold the key; the last one wins
+    return 1, matches[-1]
+  # no match: zero flag and an empty placeholder dict
+  return 0, {}
+
+def update_ratings(ratings, user_id, input_dict):
+  """File input_dict under user_id in ratings, merging into an existing entry."""
+  found, entry = isKeyInList(ratings, user_id)
+  if not found:
+    ratings.append({user_id: input_dict})
+    return
+  stored = entry[user_id]
+  for field, value in input_dict.items():
+    # list values contribute only their first element
+    stored[field].append(value[0] if isinstance(value, list) else value)
+      
+def reset_ratings():
+  """Return a fresh ratings dict with an empty list for each tracked field.
+
+  A new dict and new lists are built on every call, so results never alias.
+  """
+  fields = ("task_id", "weight", "answer", "query")
+  return {field: [] for field in fields}
+
+user_responses = []  # one {user_id: ratings} dict per user, filled by update_ratings
+ratings_out = {
+  "task_id": [],
+  "weight": [],
+  "answer": [],
+  "query": []
+}
+
+query_dict = {}      # query text -> number of times it was seen
+taskquery_dict = {}  # task_id -> query, recorded the first time a query is seen
+
+try:
+  read_file = str(sys.argv[1]) # e.g. doc_list.json
+  outp_file = str(sys.argv[2])
+except IndexError:  # only a missing argument should trigger the usage message
+  sys.exit('Example use "python3 parsejson_ratings.py data/ratings/2024_06_25-gnqa_responses.json data/ratings/[date]-out.json"')
+
+#print('The input file is {0}, the output file is {1}'.format(read_file, outp_file))
+
+create_resultset_from_file(user_responses, read_file, ratings_out)
+print('The number of users is {0}'.format(len(user_responses)))
+#print(json.dumps(ratings_out, indent=2))
+with open(outp_file, "a") as the_data:
+    the_data.write(",\n")
+    the_data.write(json.dumps(user_responses, indent=2))
+
+print("Greetings shabes!")
+print('There are {0} unique queries.'.format(len(taskquery_dict)))
+print(json.dumps(taskquery_dict, indent=2))
+#get number of users
+# get number of questions asked per user
+# get average ratings
+
diff --git a/gnqa/src/study1/ragas_fahamuRAG.py b/gnqa/src/study1/ragas_fahamuRAG.py
new file mode 100644
index 00000000..8955a668
--- /dev/null
+++ b/gnqa/src/study1/ragas_fahamuRAG.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+import os
+import sys
+import json
+import time
+import configparser
+#import pandas as pd
+
+#from pandas import DataFrame as df
+#from langchain_together import Together
+#from langchain_together.embeddings import TogetherEmbeddings
+#from ragas.metrics import (faithfulness, answer_relevancy, context_relevancy, context_utilization, context_recall)
+# using ragas==0.1.9
+from ragas.metrics import (faithfulness, answer_relevancy, context_relevancy, context_utilization)
+from ragas import evaluate
+from datasets import Dataset#, load_dataset
+
+def evaluateDataset(num_evaluations, dataset, output_file):
+  """Evaluate dataset num_evaluations times, appending each result to output_file."""
+  for _ in range(num_evaluations):
+    metric_set = [faithfulness, answer_relevancy, context_relevancy, context_utilization]
+    results = evaluate(dataset, metrics=metric_set)
+    print(results)
+    with open(output_file, "a") as sink:
+      sink.write(",\n")
+      sink.write(json.dumps(results, indent=2))
+    time.sleep(20)  # pause after every run, including the last
+
+
+
+config = configparser.ConfigParser()
+#config.read('/home/shebes/Coding/gn-ai/gnqa/paper1_eval/src/_config.cfg')
+config.read('/code/paper1_eval/src/_config.cfg')  # absolute path; configparser skips files it cannot read
+
+os.environ["OPENAI_API_KEY"] = config['key.api']['openai2']  # presumably picked up by the evaluation backend; confirm
+together_key = config['key.api']['togetherai']  # only referenced by the commented-out Together setup below
+
+#embeddings = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval")
+#embeddings = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-32k-retrieval")
+
+#together_completion = Together(
+    #model="NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT",
+    #model="togethercomputer/Llama-2-7B-32K-Instruct",
+    #model="meta-llama/Llama-3-70b-chat-hf",
+#    model="google/gemma-7b-it",
+#    temperature=0.8,
+#    max_tokens=4000,
+#    top_k=1,
+#    together_api_key=together_key
+#)
+
+read_file = str(sys.argv[1])  # JSON file holding the dataset columns
+outp_file = str(sys.argv[2])  # results are appended to this file
+num_evals = int(sys.argv[3])  # number of evaluation passes
+
+print(read_file)
+print(outp_file)
+
+with open(read_file, "r") as r_file:
+    data = json.load(r_file)
+
+dataset = Dataset.from_dict(data)
+print(dataset)
+evaluateDataset(num_evals, dataset, outp_file)
+"""
+results = evaluate(
+    dataset,
+    metrics=[faithfulness,answer_relevancy,context_relevancy,context_utilization],
+    llm=together_completion,
+    embeddings=embeddings)
+"""
diff --git a/gnqa/src/study1/retrieve_context.py b/gnqa/src/study1/retrieve_context.py
new file mode 100644
index 00000000..58b9d472
--- /dev/null
+++ b/gnqa/src/study1/retrieve_context.py
@@ -0,0 +1,171 @@
+import os
+import sys
+import json
+import time
+import configparser
+import apis.process as gnqa
+from apis.process import get_gnqa, get_response_from_taskid
+
+
+config = configparser.ConfigParser()
+config.read('_config.cfg')  # relative path: looked up in the current working directory
+
+'''
+the refs object is a list of items containing doc_id, bibInfo, and comboTxt
+We only need comboTxt
+'''
+def simplifyContext(refs):
+    """Collect the comboTxt of each reference, stripped of newlines and tabs.
+
+    Returns a list with one cleaned string per item in refs, in order.
+    """
+    # only the comboTxt field of each item is read
+    return [entry['comboTxt'].replace('\n', '').replace('\t', '')
+            for entry in refs]
+
+def writeDatasetFile(responses, outp_file):
+  """Append responses to outp_file as indented JSON, creating the file if needed.
+
+  The path is printed first; new output lands right after any existing content.
+  """
+  print(outp_file)
+  payload = json.dumps(responses, indent=2)
+  with open(outp_file, "a") as sink:
+    sink.write(payload)
+
+
+def reset_responses():
+  """Return a new, empty response batch.
+
+  Each of question, answer, contexts and task_id maps to its own fresh list.
+  """
+  columns = ('question', 'answer', 'contexts', 'task_id')
+  return {column: [] for column in columns}
+
+def parse_document(jsonfile):
+  """Build datasets for every query group (level, domain, query list) in jsonfile."""
+  print('Parse document')
+  for group in jsonfile:
+    # fields are looked up in the same order as before: level, domain, query
+    level, domain, queries = group['level'], group['domain'], group['query']
+    create_datasets(queries, domain, level)
+
+def create_datasets(query_list, domain, level):
+  """Ask GNQA each query and write the responses to gpt4o_dir in files of five."""
+  print('Creating dataset')
+  name_template = '{0}dataset_{1}_{2}_{3}.json'
+  batch = reset_responses()
+  for count, query in enumerate(query_list, start=1):
+    print(query)
+    task_id, answer, refs = get_gnqa(query, config['key.api']['fahamuai'], config['DEFAULT']['DATA_DIR'])
+    batch['question'].append(query)
+    batch['answer'].append(answer)
+    batch['task_id'].append(task_id)
+    batch['contexts'].append(simplifyContext(refs))
+    time.sleep(10) # sleep a bit to not overtask the api
+    if count % 5 == 0:  # a full batch of five goes to the next numbered file
+      print('Will print to file number {0}'.format(count // 5))
+      outp_file = name_template.format(config['out.response.dataset']['gpt4o_dir'], level, domain, str(count // 5))
+      writeDatasetFile(batch, outp_file)
+      batch = reset_responses()
+  if batch['question']:  # leftover partial batch
+    outp_file = name_template.format(config['out.response.dataset']['gpt4o_dir'], level, domain, str(count // 5 + 1))
+    writeDatasetFile(batch, outp_file)
+
+def parse_responses(jsonfile):
+  """Group the human-study queries by level and topic, then build their datasets.
+
+  jsonfile maps an arbitrary key to one record. Each record is read for:
+    level   -- 'cs' (citizen scientist) or 'de' (domain expert)
+    query   -- list of questions
+    topic   -- per-question code: 0 is general, 1 is aging, anything else diabetes
+    task_id -- per-question task id
+    answer  -- optional per-question answers; '' is used when the key is absent
+  Queries whose level is neither 'cs' nor 'de' are reported and skipped.
+  """
+  print('Parsing human responses')
+
+  levels = (("de", "domainexpert"), ("cs", "citizenscientist"))
+  topics = ("general", "aging", "diabetes")
+  # one bucket per (level, topic); insertion order is the processing order below
+  buckets = {
+    (code, topic): {"level": label, "domain": topic, "query": [], "task_id": []}
+    for code, label in levels
+    for topic in topics
+  }
+
+  # distribute every query into its bucket
+  for record in jsonfile.values():
+    lvl = record.get("level")
+    for ndx, qry in enumerate(record.get("query")):
+      ans = record.get("answer")[ndx] if "answer" in record else ""
+      tpc = record.get("topic")[ndx]
+      tpc = "general" if tpc == 0 else "aging" if tpc == 1 else "diabetes"
+      tskd = record.get("task_id")[ndx]
+      bucket = buckets.get((lvl, tpc))
+      if bucket is None:
+        print('Somehow there is a query without a topic or expertise level')
+      else:
+        addToDataList(bucket, qry, ans, tskd)
+
+  # fetch contexts and write files, one bucket at a time
+  for bucket in buckets.values():
+    create_datasets_from_taskid(bucket)
+
+def addToDataList(data_lst, qry, ans, tskd):
+  """Append one query with its task id and answer to the bucket dict data_lst."""
+  data_lst["query"].append(qry)
+  data_lst["task_id"].append(tskd)
+  # the answer list is created on first use
+  data_lst.setdefault("answer", []).append(ans)
+
+
+def create_datasets_from_taskid(info_dict):#task_list, query_list, answers, domain, level):
+  """Fetch the stored response for each task_id and write datasets in files of five.
+
+  info_dict holds level, domain, query, task_id and optionally answer. A recorded
+  answer is kept; when it is empty or missing, the answer returned for the task
+  id is used instead. Files go to the configured human_dir.
+  """
+  print('Creating dataset of questions from {0} in the topic of {1}'.format(info_dict["level"], info_dict["domain"]))
+  name_template = '{0}dataset_{1}_{2}_{3}_two.json'
+  query_list = info_dict["query"]
+  # setdefault keeps the original side effect of adding an empty answer list
+  answers = info_dict.setdefault("answer", [])
+  responses = reset_responses()
+  ndx = 0
+  for task_id in info_dict["task_id"]:
+    _, an_answer, refs = get_response_from_taskid(config['key.api']['fahamuai'], task_id)
+    # the old answers[ndx] raised IndexError whenever no answers were recorded
+    recorded = answers[ndx] if ndx < len(answers) else ""
+    responses['question'].append(query_list[ndx])
+    responses['answer'].append(an_answer if recorded == "" else recorded)
+    responses['task_id'].append(task_id)
+    responses['contexts'].append(simplifyContext(refs))
+    ndx += 1
+    time.sleep(10) # sleep a bit to not overtask the api
+    if ndx % 5 == 0:
+      outp_file = name_template.format(config['out.response.dataset']['human_dir'], info_dict["level"], info_dict["domain"], str(ndx // 5))
+      writeDatasetFile(responses, outp_file)
+      responses = reset_responses()
+  # leftover partial batch goes to the next file number
+  if responses['question']:
+    outp_file = name_template.format(config['out.response.dataset']['human_dir'], info_dict["level"], info_dict["domain"], str(ndx // 5 + 1))
+    writeDatasetFile(responses, outp_file)
+
+try:
+  # both positional arguments are required
+  read_file = str(sys.argv[1])
+  file_type = str(sys.argv[2])
+
+except IndexError:  # a bare except would also swallow KeyboardInterrupt/SystemExit
+  sys.exit('Example use "python3 retrieve_context.py data/queries/qlist.json human/gpt4o"')
+
+
+print('Read input file')
+with open(read_file, "r") as r_file:
+  file_lst = json.load(r_file)
+if file_type == "gpt4o":
+  parse_document(file_lst)
+else:
+  parse_responses(file_lst)
\ No newline at end of file