diff options
Diffstat (limited to 'gnqa/src/ingest_my_data.py')
-rwxr-xr-x | gnqa/src/ingest_my_data.py | 72 |
1 files changed, 72 insertions, 0 deletions
"""Ingest every file found in the configured data directories into R2R.

Reads ``_config.cfg`` from the current working directory, builds the list of
directories from ``[DEFAULT] PDF_DIR`` plus the ``diabetes`` and ``aging``
entries of the ``[genetics]`` section, and uploads each regular file in those
directories through ``client.documents.create``.

NOTE(review): all of the setup and the ingestion run at module level, so
importing this module contacts the R2R server at http://localhost:7272/.
"""
from os import listdir
from os.path import isfile, join
import sys
import time
import datetime
import configparser
from r2r import R2RClient, R2RException

cfg = configparser.ConfigParser()
cfg.read('_config.cfg')

client = R2RClient("http://localhost:7272/")
client.documents.create_sample(hi_res=True)

data_dir = cfg['DEFAULT']['PDF_DIR']

# Plain concatenation: the configured values are expected to supply their own
# path separators (presumably PDF_DIR ends with one; verify against the config).
dir_paths = [
    data_dir+cfg['genetics']['diabetes'],
    data_dir+cfg['genetics']['aging']
]

print("The defined directory paths {0}".format(dir_paths))


def ingest_files(client, sleep_time, doc_list):
    """Upload each document in ``doc_list`` and collect the responses.

    Args:
        client: object exposing ``documents.create(file_path=...)``.
        sleep_time: seconds to wait after every upload attempt.
        doc_list: iterable of file paths to upload.

    Returns:
        A list with the response of every upload that succeeded, in input
        order. Documents that raised ``R2RException`` are reported on stdout
        and skipped.
    """
    responses = []
    for the_doc in doc_list:
        try:
            resp = client.documents.create(file_path=the_doc)
        except R2RException as exc:
            # Bind the exception directly; sys.exception() needs Python 3.11+.
            print("Problem ingesting {0}\n\t{1}".format(the_doc, exc))
        else:
            print("Ingested {0} with the following response {1}".format(the_doc,resp))
            responses.append(resp)
        # brace against pinging API too quickly
        time.sleep(sleep_time)
    return responses


def create_file_list(the_dir):
    """Return the joined paths of the regular files directly inside ``the_dir``.

    Sub-directories are skipped and not descended into. The order is whatever
    ``os.listdir`` yields.
    """
    return [join(the_dir, f) for f in listdir(the_dir) if isfile(join(the_dir, f))]


def ingest_all_files(client, dir_list):
    """Ingest the files of every directory in ``dir_list``.

    Prints the start time, uploads each directory's files via
    ``ingest_files`` with a 3 second pause after every document, then prints
    the elapsed time.

    Args:
        client: object exposing ``documents.create(file_path=...)``.
        dir_list: iterable of directory paths.

    Returns:
        A list with one entry per directory, each being the list returned by
        ``ingest_files`` for that directory.
    """
    result = []
    sleep_time = 3
    begin_ingesting = datetime.datetime.now()
    print(begin_ingesting)

    for the_dir in dir_list:
        the_list = create_file_list(the_dir)
        result.append(ingest_files(client, sleep_time, the_list))

    end_ingesting = datetime.datetime.now()
    print("Ingestion took {0}".format(end_ingesting - begin_ingesting))
    return result


ingest_all_files(client, dir_paths)
\ No newline at end of file |