# gnqa/paper2_eval/src/document_operations.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84

import os
import sys
import json
import time
import configparser

from r2r import R2R, Document, GenerationConfig, R2RClient

class DocOps:
    """Namespace for dataset-file output helpers."""

    # Class-level default; each instance overwrites it in ``__init__``.
    _type = ''

    def __init__(self):
        self._type = 'QuestionList'

    @staticmethod
    def writeDatasetFile(responses, outp_file):
        """Append *responses* to *outp_file* as indented JSON text.

        The target path is echoed to stdout before writing. The file is
        created when it does not exist; when it does, the new JSON text is
        appended directly after the existing content with no separator.

        Declared as a static method so it can be called both as
        ``DocOps.writeDatasetFile(...)`` and on an instance.

        Args:
            responses: Any JSON-serialisable object.
            outp_file: Path of the file to append to.

        Raises:
            TypeError: If *responses* is not JSON-serialisable.
            OSError: If the file cannot be opened for writing.
        """
        print(outp_file)
        output = json.dumps(responses, indent=2)
        # Append mode already creates a missing file, so no existence check
        # is needed to pick between "create" and "extend".
        with open(outp_file, "a", encoding="utf-8") as the_data:
            the_data.write(output)

class QuestionList:
    """Questions read from a JSON file, grouped by level and then domain.

    The file is parsed as a JSON array of objects, each of which must carry
    the keys ``level``, ``domain`` and ``query``. The value of ``query`` is
    stored under ``_question_list[level][domain]``.
    """

    _verbose = 0
    _doc = ''
    _fname = ''
    # Template of the known levels and domains. It is never mutated: every
    # instance takes its own copy in ``__init__`` so that questions loaded
    # by one instance do not leak into another.
    _question_list = {
        "domainexpert": {
            "gn":  [],
            "aging":    [],
            "diabetes": []
        },
        "citizenscientist": {
            "gn":  [],
            "aging":    [],
            "diabetes": []
        }
    }

    def __init__(self, the_file, verbose=0):
        """Load and parse the question file at *the_file*.

        Args:
            the_file: Path to the JSON question file.
            verbose: Verbosity flag; any truthy value enables the progress
                message printed by ``parse_document``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If an entry lacks a required key or names a level
                that is not present in the template.
        """
        print('QuestionList has been initialized {0}, verbosity is {1}'.format(the_file, verbose))
        self._fname = the_file
        self._verbose = verbose
        # Per-instance copy of the template (inner lists included), which
        # shadows the shared class attribute.
        self._question_list = {
            level: {domain: list(queries) for domain, queries in domains.items()}
            for level, domains in type(self)._question_list.items()
        }
        self.read_document()
        self.parse_document()

    def read_document(self):
        """Read the JSON file named by ``_fname`` into ``_doc``."""
        # JSON text is UTF-8 by specification; do not rely on the locale.
        with open(self._fname, "r", encoding="utf-8") as r_file:
            self._doc = json.load(r_file)

    @staticmethod
    def reset_responses():
        """Return a fresh, empty response record.

        Returns:
            A new dict with the keys ``question``, ``answer``, ``contexts``
            and ``task_id``, each mapped to a new empty list.
        """
        return {
            'question': [],
            'answer':   [],
            'contexts':  [],
            'task_id': []
        }

    def parse_document(self):
        """Distribute the entries of ``_doc`` into ``_question_list``.

        Raises:
            KeyError: If an entry lacks ``level``, ``domain`` or ``query``,
                or its level is not one of the template's levels.
        """
        # A blank line is still printed when not verbose, matching the
        # historical output; any truthy verbosity (not only 1) is accepted.
        print('\nParse question list' if self._verbose else '')
        for item in self._doc:
            level     = item['level']
            domain    = item['domain']
            query_lst = item['query']
            self._question_list[level][domain] = query_lst

    def print_list(self, the_lst):
        """Print each item of *the_lst* on its own line, numbered from 1."""
        for ndx, item in enumerate(the_lst, start=1):
            print('\t[{0}] {1}'.format(ndx, item))

    def _print(self):
        """Print the whole question table as indented JSON."""
        print(json.dumps(self._question_list, indent=2))

    def get(self, level, domain):
        """Return the value stored for *level* and *domain*.

        Raises:
            KeyError: If *level* or *domain* is unknown.
        """
        return self._question_list[level][domain]