# gnqa/src/study1/parsejson.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63

import json
import sys


def iterate_json(obj, thedict):
    """Recursively walk parsed JSON and collect text/answer/question strings.

    Every string found under a "text" key is appended to
    ``thedict["contexts"]``; a string under "answer" or "question" overwrites
    ``thedict["answer"]`` / ``thedict["question"]``.  Newlines are replaced
    by spaces and surrounding whitespace is stripped before storing.

    Args:
        obj: Any value produced by ``json.load`` (dict, list, str, number,
            bool or None).  Values that are neither dict nor list are
            ignored.
        thedict: Accumulator with a list under "contexts"; mutated in place.

    Returns:
        None.
    """
    if isinstance(obj, dict):
        for key, val in obj.items():
            if key in ("text", "answer", "question"):
                if isinstance(val, str):
                    cleaned = val.replace("\n", " ").strip()
                    if key == "text":
                        thedict["contexts"].append(cleaned)
                    else:
                        thedict[key] = cleaned
                    # A string has no nested structure left to visit.
                    continue
                # Non-string value (null, number, nested object/list): there
                # is nothing to clean here, but nested containers may still
                # hold strings of interest, so fall through to the recursion
                # instead of crashing on ``val.replace``.
            elif len(obj) == 1:
                # Diagnostic output for single-entry objects with an
                # unrecognised key.
                print(key, " --> ", val)
            iterate_json(val, thedict)
    elif isinstance(obj, list):
        for item in obj:
            iterate_json(item, thedict)

def create_dataset_from_files(tag, file_name, rag_out, data_dir="./data/"):
    """Append one answer/question/contexts record per input file to rag_out.

    Each file listed under ``file_name[tag]`` is parsed as JSON, walked with
    ``iterate_json`` into a fresh per-file accumulator, and the collected
    values are appended to the matching lists in *rag_out*.

    Args:
        tag: Key into *file_name* selecting which list of files to read.
        file_name: Mapping from tag to an iterable of JSON file names,
            resolved against *data_dir*.
        rag_out: Dict holding lists under "answer", "question" and
            "contexts"; mutated in place.
        data_dir: Directory containing the input files.  Defaults to
            "./data/", the location that was previously hard-coded.

    Raises:
        KeyError: If *tag* is not present in *file_name*.
        OSError: If an input file cannot be opened.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    # Local import so this function does not depend on the file's import block.
    import os

    for the_file in file_name[tag]:
        # Fresh accumulator per file so records never leak between files.
        ragas_output = {"contexts": [], "answer": "", "question": ""}
        path = os.path.join(data_dir, the_file)
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(path, "r", encoding="utf-8") as r_file:
            data_file = json.load(r_file)
        iterate_json(data_file, ragas_output)
        for column in ("answer", "question", "contexts"):
            rag_out[column].append(ragas_output[column])

def create_resultset_from_file(file_name, data_dir="./data/"):
    """Parse a single JSON file and return its collected record.

    The previous body referenced ``the_file`` and ``ragas_output``, neither
    of which exists in this scope, so every call raised NameError.  It now
    reads the file named by *file_name* and returns a fresh accumulator.

    Args:
        file_name: Name of the JSON file, resolved against *data_dir*.
        data_dir: Directory containing the file.  Defaults to "./data/",
            the prefix the original body used.

    Returns:
        A dict with keys "contexts" (list), "answer" and "question", as
        filled in by ``iterate_json``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Local import so this function does not depend on the file's import block.
    import os

    ragas_output = {"contexts": [], "answer": "", "question": ""}
    # JSON text is UTF-8; do not rely on the platform default encoding.
    with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as r_file:
        data_file = json.load(r_file)
    iterate_json(data_file, ragas_output)
    return ragas_output


# Positional command-line arguments:
#   1. key selecting which list of input files to take from the file-list JSON
#   2. path of the file-list JSON (e.g. doc_list.json)
#   3. path of the output file the collected dataset is appended to
file_list_tag = str(sys.argv[1])
read_file = str(sys.argv[2])
outp_file = str(sys.argv[3])

# Not referenced by any code visible in this file.
cntxt_lst = []

# Column-oriented accumulator: create_dataset_from_files appends one entry
# per input file to each of these lists.
rag_out = {column: [] for column in ("question", "answer", "contexts")}

# read_file must contain JSON; it is indexed by file_list_tag and the result
# is iterated as a list of input file names.
with open(read_file, "r") as r_file:
    file_lst = json.loads(r_file.read())

create_dataset_from_files(file_list_tag, file_lst, rag_out)

# The file is opened in append mode and each run writes a ",\n" separator
# followed by the pretty-printed dataset.
with open(outp_file, "a") as the_data:
    the_data.write(",\n" + json.dumps(rag_out, indent=2))