1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
from os import listdir
from os.path import isfile, join
import time
import datetime
import configparser
from r2r import R2RClient
# Script body: load the PDF directory settings from _config.cfg, create the
# R2R client, and print the directories that are configured for ingestion.
cfg = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means the config was not loaded.
if not cfg.read('_config.cfg'):
    raise FileNotFoundError("Could not read configuration file '_config.cfg'")
client = R2RClient("http://localhost:7272")
#print("The status of the client is {0}".format(client.health()))
# should be read from a configuration file
main_file_dir = 'full path'
# cfg['PDF_DIR'] would return a whole *section*, which cannot be concatenated
# with a string.  Read PDF_DIR as an option instead: options declared in
# [DEFAULT] are visible through every section, so this lookup works whether
# PDF_DIR lives in [DEFAULT] or in [genetics].
# NOTE(review): the layout of _config.cfg is not visible here -- confirm where
# PDF_DIR is declared.
data_dir = cfg['genetics']['PDF_DIR']
# Plain concatenation is kept from the original.
# NOTE(review): presumably PDF_DIR ends with a path separator -- confirm.
file_paths = [
    data_dir + cfg['genetics']['diabetes'],
    data_dir + cfg['genetics']['aging'],
]
print("The defined file paths {0}".format(file_paths))

# The ingestion loop below is DISABLED: it is wrapped in a string literal and
# is never executed.  It is kept properly indented so that it can be re-enabled
# by removing the enclosing triple double quotes.
"""
file_paths = [
    main_file_dir+'pt02/b/',
    # NOTE(review): 'pt03/c/' is listed twice; this first entry may have been
    # meant to be 'pt02/c/' -- confirm before re-enabling.
    main_file_dir+'pt03/c/',
    main_file_dir+'pt03/a/',
    main_file_dir+'pt03/b/',
    main_file_dir+'pt03/c/',
    main_file_dir+'pt04/a/',
    main_file_dir+'pt04/b/',
    main_file_dir+'pt04/c/',
    main_file_dir+'pt05/a/',
    main_file_dir+'pt05/b/',
    main_file_dir+'pt05/c/',
    main_file_dir+'pt06/a/',
    main_file_dir+'pt06/b/',
    main_file_dir+'pt06/c/',
    main_file_dir+'pt07/a/',
    main_file_dir+'pt07/b/',
    main_file_dir+'pt07/c/',
    main_file_dir+'pt08/a/',
    main_file_dir+'pt08/b/',
    main_file_dir+'pt08/c/',
    main_file_dir+'pt09/a/',
    main_file_dir+'pt09/b/',
    main_file_dir+'pt09/c/',
    main_file_dir+'pt10/a/',
    main_file_dir+'pt10/b/',
    main_file_dir+'pt10/c/',
    main_file_dir+'pt11/a/',
    main_file_dir+'pt11/b/',
    main_file_dir+'pt11/c/',
    main_file_dir+'pt12/a/',
    main_file_dir+'pt12/b/',
    main_file_dir+'pt12/c/',
    main_file_dir+'pt13/a/',
    main_file_dir+'pt13/b/',
    main_file_dir+'pt13/c/',
    main_file_dir+'pt14/a/',
    main_file_dir+'pt14/b/',
    main_file_dir+'pt14/c/',
    main_file_dir+'pt14/d/'
]
ndx = 0
for file_list in file_paths:
    # every regular file directly inside the directory (no recursion)
    the_pdfs = [join(file_list, f) for f in listdir(file_list) if isfile(join(file_list, f))]
    '''
    # subroutine to test list content
    for the_pdf in the_pdfs:
        print('{0} -> {1}'.format(ndx, the_pdf))
        ndx += 1
    '''
    print(datetime.datetime.now())
    begin_ingesting = datetime.datetime.now()
    sleeptime = 30
    try:
        ingest_response = client.ingest_files(
            file_paths=the_pdfs
        )
    except Exception as err:
        # report why the batch failed instead of silently swallowing it
        ingest_response = "Nothing ingested from {0}: {1}".format(file_list, err)
        # nothing was sent, so only a short pause is needed
        sleeptime = 1
    end_ingesting = datetime.datetime.now()
    # show results of ingesting documents
    print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx, file_list, (end_ingesting-begin_ingesting), ingest_response))
    # brace against pinging API too quickly
    time.sleep(sleeptime)
    ndx += 1
"""
|