diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /R2R/ingesting/ingest_my_data.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to 'R2R/ingesting/ingest_my_data.py')
-rwxr-xr-x | R2R/ingesting/ingest_my_data.py | 99 |
1 files changed, 99 insertions, 0 deletions
"""Resolve the PDF directories named in ``_config.cfg`` for ingestion into R2R.

The script reads ``_config.cfg`` from the current working directory, connects
an ``R2RClient`` to ``http://localhost:7272``, builds the list of directories
to ingest from the ``[genetics]`` section and prints that list.

The ingestion loop itself is currently disabled: it lives inside the
triple-quoted string at the bottom of the file and is never executed.
"""
from os import listdir
from os.path import isfile, join
import time
import datetime
import configparser
from r2r import R2RClient

CONFIG_FILE = '_config.cfg'


def _base_pdf_dir(config):
    """Return the base PDF directory from *config* as a plain string.

    ``config['PDF_DIR']`` is a section proxy, not a string, so it cannot be
    concatenated with a sub-path. The directory is therefore looked up as an
    option value instead.

    NOTE(review): the layout of ``_config.cfg`` is not visible from this
    script. Two layouts are accepted; confirm which one the real file uses:
      * a ``[PDF_DIR]`` section with a ``path`` option, or
      * a ``PDF_DIR`` option in the ``[DEFAULT]`` section.

    Raises:
        KeyError: if neither layout provides a non-empty value.
    """
    candidates = (
        ('PDF_DIR', 'path'),
        (config.default_section, 'PDF_DIR'),
    )
    for section, option in candidates:
        value = config.get(section, option, fallback=None)
        if value:
            return value
    raise KeyError(
        "No PDF directory configured in {0}: expected option 'path' in "
        "section [PDF_DIR] or option 'PDF_DIR' in [DEFAULT]".format(CONFIG_FILE)
    )


cfg = configparser.ConfigParser()
# ConfigParser.read() skips unreadable files and returns the list of files it
# actually parsed, so an empty result means the config was not found.
if not cfg.read(CONFIG_FILE):
    raise FileNotFoundError(
        "Configuration file {0!r} not found in the working directory".format(CONFIG_FILE)
    )

client = R2RClient("http://localhost:7272")

#print("The status of the client is {0}".format(client.health()))

# should be read from a configuration file
# (only referenced by the disabled legacy list at the bottom of the file)
main_file_dir = 'full path'

data_dir = _base_pdf_dir(cfg)

# Plain concatenation is kept from the original script, so data_dir is
# expected to end with a path separator, as in the legacy list below.
file_paths = [
    data_dir + cfg['genetics']['diabetes'],
    data_dir + cfg['genetics']['aging'],
]

print("The defined file paths {0}".format(file_paths))

# NOTE(review): everything from here to the end of the file is a bare string
# literal, i.e. disabled code. It holds the legacy hard-coded directory list
# AND the ingestion loop, so nothing is ingested when this script runs. If the
# loop is re-enabled, replace the bare ``except:`` with ``except Exception``
# so that KeyboardInterrupt/SystemExit are not swallowed.
"""
file_paths = [
    main_file_dir+'pt02/b/',
    main_file_dir+'pt03/c/',
    main_file_dir+'pt03/a/',
    main_file_dir+'pt03/b/',
    main_file_dir+'pt03/c/',
    main_file_dir+'pt04/a/',
    main_file_dir+'pt04/b/',
    main_file_dir+'pt04/c/',
    main_file_dir+'pt05/a/',
    main_file_dir+'pt05/b/',
    main_file_dir+'pt05/c/',
    main_file_dir+'pt06/a/',
    main_file_dir+'pt06/b/',
    main_file_dir+'pt06/c/',
    main_file_dir+'pt07/a/',
    main_file_dir+'pt07/b/',
    main_file_dir+'pt07/c/',
    main_file_dir+'pt08/a/',
    main_file_dir+'pt08/b/',
    main_file_dir+'pt08/c/',
    main_file_dir+'pt09/a/',
    main_file_dir+'pt09/b/',
    main_file_dir+'pt09/c/',
    main_file_dir+'pt10/a/',
    main_file_dir+'pt10/b/',
    main_file_dir+'pt10/c/',
    main_file_dir+'pt11/a/',
    main_file_dir+'pt11/b/',
    main_file_dir+'pt11/c/',
    main_file_dir+'pt12/a/',
    main_file_dir+'pt12/b/',
    main_file_dir+'pt12/c/',
    main_file_dir+'pt13/a/',
    main_file_dir+'pt13/b/',
    main_file_dir+'pt13/c/',
    main_file_dir+'pt14/a/',
    main_file_dir+'pt14/b/',
    main_file_dir+'pt14/c/',
    main_file_dir+'pt14/d/'
    ]


ndx = 0
for file_list in file_paths:
    the_pdfs = [ join(file_list, f) for f in listdir(file_list) if isfile(join(file_list, f))]
    '''
    # subroutine to test list content
    for the_pdf in the_pdfs:
        print('{0} -> {1}'.format(ndx, the_pdf))
        ndx += 1
    '''
    print(datetime.datetime.now())
    begin_ingesting = datetime.datetime.now()
    sleeptime = 30
    try:
        ingest_response = client.ingest_files(
            file_paths=the_pdfs
        )
    except:
        ingest_response = "Nothing ingested from {0}".format(file_list)
        sleeptime = 1

    end_ingesting = datetime.datetime.now()

    # show results of ingesting documents
    print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx, file_list, (end_ingesting-begin_ingesting), ingest_response))

    # brace against pinging API too quickly
    time.sleep(sleeptime)
    ndx += 1
"""
\ No newline at end of file |