# gnqa/paper2_eval/src/parse_r2r_result.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

import json
import sys

# JSON result file this script loads.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
read_file = '/data/code/gn-ai/gnqa/paper2_eval/data/rag_out_1.json'

def _normalize_text(val):
    """Return *val* with newlines turned into spaces and ends stripped.

    Returns None when *val* is not a string (e.g. a JSON null or a nested
    container), so callers can skip it instead of raising AttributeError.
    """
    if isinstance(val, str):
        return val.replace("\n", " ").strip()
    return None


def iterate_json(obj, thedict):
    """Recursively walk a decoded JSON value and collect fields into *thedict*.

    Dicts and lists are traversed depth-first; any other value is ignored.
    For every dict entry the key decides what is recorded:

    * ``"text"``            -> normalized string appended to ``thedict["contexts"]``
    * ``"title"``           -> normalized string appended to ``thedict["titles"]``
    * ``"associatedQuery"`` -> normalized string stored in ``thedict["question"]``
    * ``"metadata"``        -> the raw value stored in ``thedict["answer"]``
    * ``"id"``, ``"document_id"`` -> only printed

    Any other key is printed only when its dict has exactly one entry.
    Every value is then recursed into, whatever its key was.

    Args:
        obj: Decoded JSON value (dict, list, or scalar).
        thedict: Accumulator mutated in place. Must already contain a
            ``"contexts"`` list; ``"titles"`` is created if missing.

    Returns:
        None. Results are written into *thedict*; debug lines go to stdout.
    """
    if isinstance(obj, dict):
        for key, val in obj.items():
            if key == "text":
                cleaned = _normalize_text(val)
                # Skip non-string values rather than crashing on .replace().
                if cleaned is not None:
                    thedict["contexts"].append(cleaned)
                print("Key -> {0}\tValue -> {1}".format(key,val))
            elif key == "metadata":
                # NOTE(review): the whole metadata value is stored as the
                # "answer", and a later "metadata" overwrites an earlier one.
                # Confirm this is the intended source of the answer.
                thedict["answer"] = val
                print("Key -> {0}\tValue -> {1}".format(key,val))
            elif key == "associatedQuery":
                cleaned = _normalize_text(val)
                if cleaned is not None:
                    thedict["question"] = cleaned
                print("Key -> {0}\tValue -> {1}".format(key,val))
            elif key == "title":
                cleaned = _normalize_text(val)
                # Previously titles were only printed, leaving the "titles"
                # list of the accumulator permanently empty.
                if cleaned is not None:
                    thedict.setdefault("titles", []).append(cleaned)
                print("Key -> {0}\tValue -> {1}".format(key,val))
            elif key in ("id", "document_id"):
                print("Key -> {0}\tValue -> {1}".format(key,val))
            elif len(obj) == 1:
                print(key, " --> ", val)
            # Descend into the value no matter which branch handled the key.
            iterate_json(val, thedict)
    elif isinstance(obj, list):
        for item in obj:
            iterate_json(item, thedict)

# TODO: this should be a json file with a list of input files and an output file
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default.
with open(read_file, "r", encoding="utf-8") as r_file:
    result_file = json.load(r_file)

# Accumulator that iterate_json fills in place while walking the results.
ragas_output = {
    "contexts": [],
    "titles": [],
    "answer": "",
    "question": ""}
vector_search_results = result_file["vector_search_results"]
iterate_json(vector_search_results, ragas_output)

# Emit the collected fields as pretty-printed JSON on stdout.
print(json.dumps(ragas_output, indent=2))