about summary refs log tree commit diff
path: root/R2R/ingesting/ingest_my_data.py
diff options
context:
space:
mode:
author S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree ee3dc5af3b6313e921cd920906356f5d4febc4ed /R2R/ingesting/ingest_my_data.py
parent cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download gn-ai-master.tar.gz
two versions of R2R are here HEAD master
Diffstat (limited to 'R2R/ingesting/ingest_my_data.py')
-rwxr-xr-x R2R/ingesting/ingest_my_data.py 99
1 files changed, 99 insertions, 0 deletions
diff --git a/R2R/ingesting/ingest_my_data.py b/R2R/ingesting/ingest_my_data.py
new file mode 100755
index 00000000..cb0fa551
--- /dev/null
+++ b/R2R/ingesting/ingest_my_data.py
@@ -0,0 +1,99 @@
+from os import listdir
+from os.path import isfile, join
+import time
+import datetime
+import configparser
+from r2r import R2RClient
+
+cfg = configparser.ConfigParser()
+cfg.read('_config.cfg')  # relative path (resolved against the CWD); read() skips files it cannot open
+
+client = R2RClient("http://localhost:7272")  # hard-coded local R2R endpoint
+
+#print("The status of the client is {0}".format(client.health()))
+
+# Placeholder that should be read from a configuration file; in the visible code it is only used by the disabled block below.
+main_file_dir = 'full path'
+
+data_dir = cfg['PDF_DIR']  # NOTE(review): this indexes a *section* named PDF_DIR (yields a section proxy, not a str) -- confirm intent
+# NOTE(review): with a section proxy on the left, the `+` concatenations below should raise TypeError -- TODO confirm the _config.cfg layout.
+file_paths = [
+    data_dir+cfg['genetics']['diabetes'],  # plain concatenation: no path separator is inserted
+    data_dir+cfg['genetics']['aging']
+]
+# Show the paths that were built from the configuration.
+print("The defined file paths {0}".format(file_paths))
+# Disabled ingestion loop: it sits inside a bare triple-quoted string, so it is evaluated as a constant and never executed.
+"""
+file_paths = [
+              main_file_dir+'pt02/b/',
+              main_file_dir+'pt03/c/',
+              main_file_dir+'pt03/a/',
+              main_file_dir+'pt03/b/',
+              main_file_dir+'pt03/c/',
+              main_file_dir+'pt04/a/',
+              main_file_dir+'pt04/b/',
+              main_file_dir+'pt04/c/',
+              main_file_dir+'pt05/a/',
+              main_file_dir+'pt05/b/',
+              main_file_dir+'pt05/c/',
+              main_file_dir+'pt06/a/',
+              main_file_dir+'pt06/b/',
+              main_file_dir+'pt06/c/',
+              main_file_dir+'pt07/a/',
+              main_file_dir+'pt07/b/',
+              main_file_dir+'pt07/c/',
+              main_file_dir+'pt08/a/',
+              main_file_dir+'pt08/b/',
+              main_file_dir+'pt08/c/',
+              main_file_dir+'pt09/a/',
+              main_file_dir+'pt09/b/',
+              main_file_dir+'pt09/c/',
+              main_file_dir+'pt10/a/',
+              main_file_dir+'pt10/b/',
+              main_file_dir+'pt10/c/',
+              main_file_dir+'pt11/a/',
+              main_file_dir+'pt11/b/',
+              main_file_dir+'pt11/c/',
+              main_file_dir+'pt12/a/',
+              main_file_dir+'pt12/b/',
+              main_file_dir+'pt12/c/',
+              main_file_dir+'pt13/a/',
+              main_file_dir+'pt13/b/',
+              main_file_dir+'pt13/c/',
+              main_file_dir+'pt14/a/',
+              main_file_dir+'pt14/b/',
+              main_file_dir+'pt14/c/',
+              main_file_dir+'pt14/d/'
+            ]
+
+
+ndx = 0
+for file_list in file_paths:
+    the_pdfs = [ join(file_list, f) for f in listdir(file_list) if isfile(join(file_list, f))]
+    '''
+    # subroutine to test list content
+    for the_pdf in the_pdfs:
+        print('{0} -> {1}'.format(ndx, the_pdf))
+        ndx += 1
+    '''
+    print(datetime.datetime.now())
+    begin_ingesting = datetime.datetime.now()
+    sleeptime = 30
+    try:
+        ingest_response = client.ingest_files(
+        file_paths=the_pdfs
+        )
+    except:
+        ingest_response = "Nothing ingested from {0}".format(file_list)
+        sleeptime = 1 
+
+    end_ingesting = datetime.datetime.now()
+
+    # show results of ingesting documents
+    print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx,      file_list, (end_ingesting-begin_ingesting), ingest_response))
+
+    # brace against pinging API too quickly
+    time.sleep(sleeptime)
+    ndx += 1
+"""
\ No newline at end of file