about summary refs log tree commit diff
path: root/gnqa/src/ingest_my_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'gnqa/src/ingest_my_data.py')
-rwxr-xr-x gnqa/src/ingest_my_data.py 72
1 files changed, 72 insertions, 0 deletions
diff --git a/gnqa/src/ingest_my_data.py b/gnqa/src/ingest_my_data.py
new file mode 100755
index 00000000..2e696844
--- /dev/null
+++ b/gnqa/src/ingest_my_data.py
@@ -0,0 +1,72 @@
+from os import listdir
+from os.path import isfile, join
+import sys
+import time
+import datetime
+import configparser
+from r2r import R2RClient, R2RException
+
# Read the ingestion settings. ConfigParser.read() silently skips files it
# cannot open and returns the list of files it did parse, so an empty result
# means '_config.cfg' could not be read from the current working directory.
cfg = configparser.ConfigParser()
if not cfg.read('_config.cfg'):
    raise FileNotFoundError("Could not read configuration file '_config.cfg'")

# Client for the R2R service expected on the local machine.
client = R2RClient("http://localhost:7272/")
# NOTE(review): presumably registers R2R's bundled sample document before the
# real ingestion starts -- confirm against the r2r SDK documentation.
client.documents.create_sample(hi_res=True)

# Base directory that holds the PDF collections.
data_dir = cfg['DEFAULT']['PDF_DIR']

# Sub-directories are appended by plain string concatenation, so PDF_DIR is
# assumed to end with a path separator -- TODO confirm against _config.cfg.
dir_paths = [
    data_dir + cfg['genetics']['diabetes'],
    data_dir + cfg['genetics']['aging'],
]

print("The defined directory paths {0}".format(dir_paths))
+
def ingest_files(client, sleep_time, doc_list):
    """Upload every document in *doc_list* through the R2R client.

    Args:
        client: Object exposing ``documents.create(file_path=...)``.
        sleep_time: Seconds to pause after each upload attempt.
        doc_list: Iterable of file paths to ingest.

    Returns:
        The responses of the documents that were ingested successfully, in
        input order. Documents that raised ``R2RException`` are reported on
        stdout and skipped; any other exception propagates to the caller.
    """
    responses = []
    for the_doc in doc_list:
        try:
            resp = client.documents.create(file_path=the_doc)
        except R2RException as exc:
            print("Problem ingesting {0}\n\t{1}".format(the_doc, exc))
        else:
            print("Ingested {0} with the following response {1}".format(the_doc, resp))
            responses.append(resp)
        # brace against pinging API too quickly
        time.sleep(sleep_time)
    return responses
+
def create_file_list(the_dir):
    """Return the paths of the regular files located directly in *the_dir*.

    Sub-directories are skipped and not descended into. Each returned path is
    *the_dir* joined with the entry name, in the order ``listdir`` yields them.
    """
    candidates = (join(the_dir, name) for name in listdir(the_dir))
    return [path for path in candidates if isfile(path)]
+
def ingest_all_files(client, dir_list):
    """Ingest every regular file found in each directory of *dir_list*.

    Args:
        client: R2R client handed through to ``ingest_files``.
        dir_list: Iterable of directory paths to scan (non-recursively).

    Returns:
        A list with one entry per directory, each entry being the list that
        ``ingest_files`` returned for that directory.
    """
    result = []
    sleep_time = 3  # seconds to wait between uploads
    begin_ingesting = datetime.datetime.now()
    print(begin_ingesting)

    for the_dir in dir_list:
        the_list = create_file_list(the_dir)
        result.append(ingest_files(client, sleep_time, the_list))

    end_ingesting = datetime.datetime.now()
    # Report how long the whole run took.
    print("Finished ingesting {0} directories in {1}".format(
        len(result), end_ingesting - begin_ingesting))
    return result
+
+
# Script entry point: ingest every configured directory.
ingest_all_files(client, dir_paths)