# Source: root/gnqa/src/ingest_my_data.py
# (blob 2e696844ee80135bb49d9c24e006d967be98297d)
from os import listdir
from os.path import isfile, join
import sys
import time
import datetime
import configparser
from r2r import R2RClient, R2RException

# Read the ingestion settings (PDF root directory and per-topic sub-paths).
cfg = configparser.ConfigParser()
cfg.read('_config.cfg')

# Connect to the R2R server on localhost.
# NOTE(review): create_sample presumably ingests R2R's bundled sample
# document -- confirm against the R2R SDK.
client = R2RClient("http://localhost:7272/")
client.documents.create_sample(hi_res=True)

data_dir = cfg['DEFAULT']['PDF_DIR']

# One directory per genetics topic; each configured sub-path is appended
# verbatim to the PDF root, so the separator must come from the config.
dir_paths = [
    data_dir + cfg['genetics'][topic]
    for topic in ('diabetes', 'aging')
]

print(f"The defined directory paths {dir_paths}")

def ingest_files(client, sleep_time, doc_list):
    """Ingest every document in ``doc_list`` through ``client``, one at a time.

    Args:
        client: Object exposing ``documents.create(file_path=...)``
            (an ``R2RClient`` in this script).
        sleep_time: Seconds to pause after each attempt, successful or not.
        doc_list: Iterable of file paths to ingest.

    Returns:
        list: The responses of the successful ``create`` calls, in the order
        the documents were processed. Documents whose ingestion raised
        ``R2RException`` are reported on stdout and left out.
    """
    responses = []
    for the_doc in doc_list:
        try:
            resp = client.documents.create(file_path=the_doc)
        except R2RException as err:
            # Best effort: report the failure and carry on with the next file.
            print("Problem ingesting {0}\n\t{1}".format(the_doc, err))
        else:
            print("Ingested {0} with the following response {1}".format(the_doc, resp))
            # Only successful responses are collected, so a failure can never
            # re-record the response of the previous document.
            responses.append(resp)
        # brace against pinging API too quickly
        time.sleep(sleep_time)
    return responses

def create_file_list(the_dir):
    """Return the paths of the regular files directly inside ``the_dir``.

    Sub-directories are skipped and not descended into. Each returned path is
    ``the_dir`` joined with the entry name, in the order ``os.listdir``
    yields the entries.

    Args:
        the_dir: Path of the directory to scan.

    Returns:
        list[str]: One joined path per regular file found.
    """
    candidates = (join(the_dir, entry) for entry in listdir(the_dir))
    return [path for path in candidates if isfile(path)]

def ingest_all_files(client, dir_list, sleep_time=3):
    """Ingest the files of every directory in ``dir_list`` and time the run.

    Prints the start timestamp before ingesting and a one-line summary with
    the elapsed time afterwards.

    Args:
        client: Client object handed through to ``ingest_files``.
        dir_list: Iterable of directory paths; the regular files directly
            inside each one are ingested.
        sleep_time: Seconds to pause after each document (default 3), so the
            API is not pinged too quickly.

    Returns:
        list: One entry per directory, each being the value returned by
        ``ingest_files`` for that directory.
    """
    result = []
    begin_ingesting = datetime.datetime.now()
    print(begin_ingesting)

    for the_dir in dir_list:
        the_list = create_file_list(the_dir)
        result.append(ingest_files(client, sleep_time, the_list))

    end_ingesting = datetime.datetime.now()

    # show how long the whole ingestion run took
    print("Processed {0} directories in {1}".format(
        len(result), end_ingesting - begin_ingesting))
    return result


# Kick off ingestion of every configured directory when the file is executed.
ingest_all_files(client=client, dir_list=dir_paths)