"""Build the list of PDF directories to ingest into an R2R server.

The script reads directory locations from ``_config.cfg``, creates an
``R2RClient`` pointed at ``http://localhost:7272``, assembles the list of
directories to ingest and prints it.  The ingestion loop itself is kept at
the bottom of the file inside a triple-quoted string, i.e. it is currently
disabled and never executes.
"""
from os import listdir
from os.path import isfile, join
import time
import datetime
import configparser

from r2r import R2RClient

CONFIG_FILE = '_config.cfg'


def _read_config(path):
    """Parse the INI file at *path* and return the ``ConfigParser``.

    Raises:
        FileNotFoundError: if the file could not be read.
    """
    parser = configparser.ConfigParser()
    # ConfigParser.read() silently skips unreadable files and returns the
    # list of files it actually parsed; an empty list means nothing was read.
    if not parser.read(path):
        raise FileNotFoundError(
            "Configuration file {0!r} could not be read".format(path))
    return parser


def _pdf_base_dir(parser):
    """Return the ``PDF_DIR`` setting of *parser* as a plain string.

    Indexing a ``ConfigParser`` with ``parser['PDF_DIR']`` yields a section
    proxy rather than a string, so it cannot be concatenated with the
    sub-directory names.  The value is therefore read as an *option*.

    Raises:
        KeyError: if no ``PDF_DIR`` option is available.
    """
    # NOTE(review): the layout of _config.cfg is not visible from this file.
    # This assumes PDF_DIR is an option in the [DEFAULT] section -- confirm
    # against the real config and adjust the section name if necessary.
    base_dir = parser.get(parser.default_section, 'PDF_DIR', fallback=None)
    if base_dir is None:
        raise KeyError(
            "Expected a 'PDF_DIR' option in the [{0}] section of {1!r}".format(
                parser.default_section, CONFIG_FILE))
    return base_dir


cfg = _read_config(CONFIG_FILE)

client = R2RClient("http://localhost:7272")
#print("The status of the client is {0}".format(client.health()))

# should be read from a configuration file
main_file_dir = 'full path'

data_dir = _pdf_base_dir(cfg)

# Plain concatenation is kept on purpose: the configured base directory is
# presumably expected to end with a path separator (as the hard-coded paths
# in the disabled block below do) -- TODO confirm.
file_paths = [
    data_dir + cfg['genetics']['diabetes'],
    data_dir + cfg['genetics']['aging'],
]

print("The defined file paths {0}".format(file_paths))

# ---------------------------------------------------------------------------
# Disabled ingestion loop.  It lives inside a bare string so it never runs.
# NOTE(review): before re-enabling, narrow the bare ``except:`` to the
# exception types the client actually raises; as written it would also
# swallow KeyboardInterrupt/SystemExit.
# ---------------------------------------------------------------------------
"""
file_paths = [
    main_file_dir+'pt02/b/',
    main_file_dir+'pt03/c/',
    main_file_dir+'pt03/a/',
    main_file_dir+'pt03/b/',
    main_file_dir+'pt03/c/',
    main_file_dir+'pt04/a/',
    main_file_dir+'pt04/b/',
    main_file_dir+'pt04/c/',
    main_file_dir+'pt05/a/',
    main_file_dir+'pt05/b/',
    main_file_dir+'pt05/c/',
    main_file_dir+'pt06/a/',
    main_file_dir+'pt06/b/',
    main_file_dir+'pt06/c/',
    main_file_dir+'pt07/a/',
    main_file_dir+'pt07/b/',
    main_file_dir+'pt07/c/',
    main_file_dir+'pt08/a/',
    main_file_dir+'pt08/b/',
    main_file_dir+'pt08/c/',
    main_file_dir+'pt09/a/',
    main_file_dir+'pt09/b/',
    main_file_dir+'pt09/c/',
    main_file_dir+'pt10/a/',
    main_file_dir+'pt10/b/',
    main_file_dir+'pt10/c/',
    main_file_dir+'pt11/a/',
    main_file_dir+'pt11/b/',
    main_file_dir+'pt11/c/',
    main_file_dir+'pt12/a/',
    main_file_dir+'pt12/b/',
    main_file_dir+'pt12/c/',
    main_file_dir+'pt13/a/',
    main_file_dir+'pt13/b/',
    main_file_dir+'pt13/c/',
    main_file_dir+'pt14/a/',
    main_file_dir+'pt14/b/',
    main_file_dir+'pt14/c/',
    main_file_dir+'pt14/d/'
]

ndx = 0
for file_list in file_paths:
    the_pdfs = [ join(file_list, f) for f in listdir(file_list) if isfile(join(file_list, f))]
    '''
    # subroutine to test list content
    for the_pdf in the_pdfs:
        print('{0} -> {1}'.format(ndx, the_pdf))
        ndx += 1
    '''
    print(datetime.datetime.now())
    begin_ingesting = datetime.datetime.now()
    sleeptime = 30
    try:
        ingest_response = client.ingest_files(
            file_paths=the_pdfs
        )
    except:
        ingest_response = "Nothing ingested from {0}".format(file_list)
        sleeptime = 1
    end_ingesting = datetime.datetime.now()
    # show results of ingesting documents
    print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx, file_list, (end_ingesting-begin_ingesting), ingest_response))
    # brace against pinging API too quickly
    time.sleep(sleeptime)
    ndx += 1
"""