aboutsummaryrefslogtreecommitdiff
path: root/R2R/ingesting/ingest_my_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'R2R/ingesting/ingest_my_data.py')
-rwxr-xr-x R2R/ingesting/ingest_my_data.py 99
1 files changed, 99 insertions, 0 deletions
diff --git a/R2R/ingesting/ingest_my_data.py b/R2R/ingesting/ingest_my_data.py
new file mode 100755
index 00000000..cb0fa551
--- /dev/null
+++ b/R2R/ingesting/ingest_my_data.py
@@ -0,0 +1,99 @@
+from os import listdir
+from os.path import isfile, join
+import time
+import datetime
+import configparser
+from r2r import R2RClient
+
+# Load ingestion settings from '_config.cfg', resolved relative to the current
+# working directory.
+cfg = configparser.ConfigParser()
+# ConfigParser.read() silently skips unreadable files and returns the list of
+# files it actually parsed, so an empty result means the config was not loaded.
+if not cfg.read('_config.cfg'):
+    raise FileNotFoundError("Could not read configuration file '_config.cfg'")
+
+# Client for the R2R service; the address is hard-coded to a local instance.
+client = R2RClient("http://localhost:7272")
+
+#print("The status of the client is {0}".format(client.health()))
+
+# should be read from a configuration file
+main_file_dir = 'full path'
+
+# cfg['PDF_DIR'] would return a whole section (SectionProxy), which cannot be
+# concatenated with a string. Read PDF_DIR as an option instead; looking it up
+# through the 'genetics' section also picks up a value defined in [DEFAULT].
+# NOTE(review): assumes PDF_DIR is an option in [genetics] or [DEFAULT] and
+# that its value ends with a path separator -- confirm against _config.cfg.
+genetics_cfg = cfg['genetics']
+data_dir = genetics_cfg['PDF_DIR']
+
+# Directories whose files are to be ingested, one per configured topic.
+file_paths = [
+    data_dir + genetics_cfg['diabetes'],
+    data_dir + genetics_cfg['aging'],
+]
+
+print("The defined file paths {0}".format(file_paths))
+
+# NOTE(review): everything from the opening triple-quote below to the closing
+# triple-quote on the last line is a single bare string literal. The
+# alternative `file_paths` list built from `main_file_dir` and the whole
+# ingestion loop are therefore disabled: `client.ingest_files` is never called.
+# NOTE(review): if the loop is re-enabled, the bare `except:` also catches
+# KeyboardInterrupt/SystemExit and discards the error details -- consider
+# `except Exception as exc:` and including `exc` in the message.
+"""
+file_paths = [
+ main_file_dir+'pt02/b/',
+ main_file_dir+'pt03/c/',
+ main_file_dir+'pt03/a/',
+ main_file_dir+'pt03/b/',
+ main_file_dir+'pt03/c/',
+ main_file_dir+'pt04/a/',
+ main_file_dir+'pt04/b/',
+ main_file_dir+'pt04/c/',
+ main_file_dir+'pt05/a/',
+ main_file_dir+'pt05/b/',
+ main_file_dir+'pt05/c/',
+ main_file_dir+'pt06/a/',
+ main_file_dir+'pt06/b/',
+ main_file_dir+'pt06/c/',
+ main_file_dir+'pt07/a/',
+ main_file_dir+'pt07/b/',
+ main_file_dir+'pt07/c/',
+ main_file_dir+'pt08/a/',
+ main_file_dir+'pt08/b/',
+ main_file_dir+'pt08/c/',
+ main_file_dir+'pt09/a/',
+ main_file_dir+'pt09/b/',
+ main_file_dir+'pt09/c/',
+ main_file_dir+'pt10/a/',
+ main_file_dir+'pt10/b/',
+ main_file_dir+'pt10/c/',
+ main_file_dir+'pt11/a/',
+ main_file_dir+'pt11/b/',
+ main_file_dir+'pt11/c/',
+ main_file_dir+'pt12/a/',
+ main_file_dir+'pt12/b/',
+ main_file_dir+'pt12/c/',
+ main_file_dir+'pt13/a/',
+ main_file_dir+'pt13/b/',
+ main_file_dir+'pt13/c/',
+ main_file_dir+'pt14/a/',
+ main_file_dir+'pt14/b/',
+ main_file_dir+'pt14/c/',
+ main_file_dir+'pt14/d/'
+ ]
+
+
+ndx = 0
+for file_list in file_paths:
+ the_pdfs = [ join(file_list, f) for f in listdir(file_list) if isfile(join(file_list, f))]
+ '''
+ # subroutine to test list content
+ for the_pdf in the_pdfs:
+ print('{0} -> {1}'.format(ndx, the_pdf))
+ ndx += 1
+ '''
+ print(datetime.datetime.now())
+ begin_ingesting = datetime.datetime.now()
+ sleeptime = 30
+ try:
+ ingest_response = client.ingest_files(
+ file_paths=the_pdfs
+ )
+ except:
+ ingest_response = "Nothing ingested from {0}".format(file_list)
+ sleeptime = 1
+
+ end_ingesting = datetime.datetime.now()
+
+ # show results of ingesting documents
+ print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx, file_list, (end_ingesting-begin_ingesting), ingest_response))
+
+ # brace against pinging API too quickly
+ time.sleep(sleeptime)
+ ndx += 1
+""" \ No newline at end of file