from os import listdir
from os.path import isfile, join
import time
import datetime
import configparser
from r2r import R2RClient
# Load the ingestion settings from '_config.cfg' (resolved against the current
# working directory).
cfg = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means nothing was loaded, so fail
# here with a clear message instead of a confusing KeyError further down.
if not cfg.read('_config.cfg'):
    raise FileNotFoundError("Could not read configuration file '_config.cfg'")
# R2R client pointed at http://localhost:7272.
client = R2RClient("http://localhost:7272")
#print("The status of the client is {0}".format(client.health()))
# should be read from a configuration file
main_file_dir = 'full path'
# Indexing the parser with a single key (cfg['PDF_DIR']) is a *section* lookup
# and yields a SectionProxy, which cannot be concatenated with a string.
# The base directory has to be read as an option from a section instead.
# NOTE(review): assumes PDF_DIR is defined in the [DEFAULT] section of
# _config.cfg -- confirm against the actual configuration file.
data_dir = cfg['DEFAULT']['PDF_DIR']
# Directories to process: base directory + per-topic sub-path taken from the
# [genetics] section.
file_paths = [
    data_dir + cfg['genetics']['diabetes'],
    data_dir + cfg['genetics']['aging'],
]
print("The defined file paths {0}".format(file_paths))
# DISABLED CODE: everything between the triple double quotes below is a bare
# string expression, so none of it is executed when this script runs.
#
# The string holds an earlier version of the ingestion script:
#   * a hard-coded `file_paths` list of directories built from `main_file_dir`;
#   * a loop that, for each directory, collects the regular files in it
#     (listdir + isfile), passes them to `client.ingest_files`, prints the
#     elapsed time together with the response, and then sleeps before the next
#     directory (30 seconds normally, 1 second when the ingest call raised);
#   * a nested '''...''' snippet that was itself switched off and only printed
#     the collected file names for debugging.
#
# NOTE(review): the code inside the string has no indentation as it stands and
# uses a bare `except:`; both need attention before it is re-enabled.
"""
file_paths = [
main_file_dir+'pt02/b/',
main_file_dir+'pt03/c/',
main_file_dir+'pt03/a/',
main_file_dir+'pt03/b/',
main_file_dir+'pt03/c/',
main_file_dir+'pt04/a/',
main_file_dir+'pt04/b/',
main_file_dir+'pt04/c/',
main_file_dir+'pt05/a/',
main_file_dir+'pt05/b/',
main_file_dir+'pt05/c/',
main_file_dir+'pt06/a/',
main_file_dir+'pt06/b/',
main_file_dir+'pt06/c/',
main_file_dir+'pt07/a/',
main_file_dir+'pt07/b/',
main_file_dir+'pt07/c/',
main_file_dir+'pt08/a/',
main_file_dir+'pt08/b/',
main_file_dir+'pt08/c/',
main_file_dir+'pt09/a/',
main_file_dir+'pt09/b/',
main_file_dir+'pt09/c/',
main_file_dir+'pt10/a/',
main_file_dir+'pt10/b/',
main_file_dir+'pt10/c/',
main_file_dir+'pt11/a/',
main_file_dir+'pt11/b/',
main_file_dir+'pt11/c/',
main_file_dir+'pt12/a/',
main_file_dir+'pt12/b/',
main_file_dir+'pt12/c/',
main_file_dir+'pt13/a/',
main_file_dir+'pt13/b/',
main_file_dir+'pt13/c/',
main_file_dir+'pt14/a/',
main_file_dir+'pt14/b/',
main_file_dir+'pt14/c/',
main_file_dir+'pt14/d/'
]
ndx = 0
for file_list in file_paths:
the_pdfs = [ join(file_list, f) for f in listdir(file_list) if isfile(join(file_list, f))]
'''
# subroutine to test list content
for the_pdf in the_pdfs:
print('{0} -> {1}'.format(ndx, the_pdf))
ndx += 1
'''
print(datetime.datetime.now())
begin_ingesting = datetime.datetime.now()
sleeptime = 30
try:
ingest_response = client.ingest_files(
file_paths=the_pdfs
)
except:
ingest_response = "Nothing ingested from {0}".format(file_list)
sleeptime = 1
end_ingesting = datetime.datetime.now()
# show results of ingesting documents
print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx, file_list, (end_ingesting-begin_ingesting), ingest_response))
# brace against pinging API too quickly
time.sleep(sleeptime)
ndx += 1
"""