1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
from os import listdir
from os.path import isfile, join
import sys
import time
import datetime
import configparser
from r2r import R2RClient, R2RException
# Load the script settings from the local config file.
cfg = configparser.ConfigParser()
cfg.read('_config.cfg')

# Connect to the R2R server on localhost and request its sample document.
client = R2RClient("http://localhost:7272/")
client.documents.create_sample(hi_res=True)

# Each topic directory is the configured PDF root with the topic's
# sub-path appended by plain string concatenation.
data_dir = cfg['DEFAULT']['PDF_DIR']
dir_paths = [data_dir + cfg['genetics'][topic] for topic in ('diabetes', 'aging')]
print("The defined directory paths {0}".format(dir_paths))
def ingest_files(client, sleep_time, doc_list):
    """Upload each document in *doc_list* through the given client.

    Args:
        client: Object exposing ``documents.create(file_path=...)``
            (an ``R2RClient`` in this script).
        sleep_time: Seconds to pause after every attempt so the API is
            not called in rapid succession.
        doc_list: Iterable of file paths to ingest.

    Returns:
        A list with the response of every successful
        ``documents.create`` call, in processing order. Documents that
        raise ``R2RException`` are reported on stdout and skipped.
    """
    responses = []
    for the_doc in doc_list:
        try:
            resp = client.documents.create(file_path=the_doc)
        except R2RException as exc:
            # Best effort: report the failure and carry on with the rest.
            print("Problem ingesting {0}\n\t{1}".format(the_doc, exc))
        else:
            print("Ingested {0} with the following response {1}".format(the_doc, resp))
            responses.append(resp)
        # Brace against pinging the API too quickly, whether or not the
        # attempt succeeded.
        time.sleep(sleep_time)
    return responses
def create_file_list(the_dir):
    """Return the paths of the regular files directly inside *the_dir*.

    Sub-directories and other non-file entries are skipped; the listing
    is not recursive. Entries are sorted by name so repeated runs process
    the files in the same order.

    Args:
        the_dir: Path of the directory to list.

    Returns:
        A list of ``join(the_dir, name)`` strings, one per regular file.

    Raises:
        OSError: Propagated from ``os.listdir`` when *the_dir* cannot be
            listed (for example, it does not exist).
    """
    candidates = (join(the_dir, name) for name in sorted(listdir(the_dir)))
    return [path for path in candidates if isfile(path)]
def ingest_all_files(client, dir_list, sleep_time=3):
    """Ingest every file found in each directory of *dir_list*.

    Prints the start timestamp, ingests the directories one after the
    other, then prints how long the whole run took.

    Args:
        client: Client object handed through to ``ingest_files``.
        dir_list: Iterable of directory paths whose files are ingested.
        sleep_time: Seconds to pause after each document; defaults to the
            3 seconds this script previously hard-coded.

    Returns:
        A list with one entry per directory, each being the list returned
        by ``ingest_files`` for that directory.
    """
    results = []
    begin_ingesting = datetime.datetime.now()
    print(begin_ingesting)
    for the_dir in dir_list:
        the_list = create_file_list(the_dir)
        results.append(ingest_files(client, sleep_time, the_list))
    end_ingesting = datetime.datetime.now()
    # Report the elapsed wall-clock time for the whole run.
    print("Finished ingesting {0} directories in {1}".format(
        len(results), end_ingesting - begin_ingesting))
    return results
# Script entry point: ingest every file from the configured directories.
ingest_all_files(client, dir_paths)
|