aboutsummaryrefslogtreecommitdiff
path: root/R2R/ingesting/ingest_my_data.py
blob: cb0fa55155b4165bf3fad8b961844f9576b9d013 (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from os import listdir
from os.path import isfile, join
import time
import datetime
import configparser
from r2r import R2RClient

# Load the settings from `_config.cfg` (resolved against the current working
# directory).  Layout this script expects -- NOTE(review): confirm against the
# real config file:
#
#   [DEFAULT]                 ; PDF_DIR may also be placed in [genetics]
#   PDF_DIR = <base directory, ending with a path separator>
#
#   [genetics]
#   diabetes = <sub-directory holding the diabetes PDFs>
#   aging = <sub-directory holding the aging PDFs>
cfg = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list therefore means "no config read".
# Fail here with a clear message rather than with a KeyError further down.
if not cfg.read('_config.cfg'):
    raise FileNotFoundError(
        "Could not read configuration file '_config.cfg'"
    )

client = R2RClient("http://localhost:7272")

#print("The status of the client is {0}".format(client.health()))

# should be read from a configuration file
# (only referenced by the disabled ingestion routine further down)
main_file_dir = 'full path'

# `cfg['PDF_DIR']` would return a whole section proxy (or raise KeyError when
# no such section exists), and a section proxy cannot be concatenated with a
# string.  PDF_DIR is therefore read as an *option* through the [genetics]
# section: mapping access falls back to [DEFAULT], so the option may be defined
# in either place, and a missing option still raises KeyError.
genetics_cfg = cfg['genetics']
data_dir = genetics_cfg['PDF_DIR']

# Plain concatenation is kept on purpose: PDF_DIR is expected to end with a
# path separator (or the sub-directory entries to start with one).
file_paths = [
    data_dir + genetics_cfg['diabetes'],
    data_dir + genetics_cfg['aging'],
]

print("The defined file paths {0}".format(file_paths))

# NOTE: everything between the triple double quotes below is a bare string
# literal, not live code -- Python evaluates it and throws it away, so none of
# it runs.  It preserves a disabled batch-ingestion routine:
#   * `file_paths` is rebuilt as a list of sub-directories of `main_file_dir`;
#   * for each directory, every regular file in it is collected and handed to
#     `client.ingest_files(file_paths=...)` in one call;
#   * the elapsed time and the response (or a "Nothing ingested" message when
#     the call raised) are printed;
#   * the loop sleeps 30 s after a successful call and 1 s after a failed one
#     so the API is not hit too quickly.
# NOTE(review): the bare `except:` inside also traps KeyboardInterrupt and
# SystemExit; narrow it to `except Exception:` if this is ever re-enabled.
"""
file_paths = [
              main_file_dir+'pt02/b/',
              main_file_dir+'pt03/c/',
              main_file_dir+'pt03/a/',
              main_file_dir+'pt03/b/',
              main_file_dir+'pt03/c/',
              main_file_dir+'pt04/a/',
              main_file_dir+'pt04/b/',
              main_file_dir+'pt04/c/',
              main_file_dir+'pt05/a/',
              main_file_dir+'pt05/b/',
              main_file_dir+'pt05/c/',
              main_file_dir+'pt06/a/',
              main_file_dir+'pt06/b/',
              main_file_dir+'pt06/c/',
              main_file_dir+'pt07/a/',
              main_file_dir+'pt07/b/',
              main_file_dir+'pt07/c/',
              main_file_dir+'pt08/a/',
              main_file_dir+'pt08/b/',
              main_file_dir+'pt08/c/',
              main_file_dir+'pt09/a/',
              main_file_dir+'pt09/b/',
              main_file_dir+'pt09/c/',
              main_file_dir+'pt10/a/',
              main_file_dir+'pt10/b/',
              main_file_dir+'pt10/c/',
              main_file_dir+'pt11/a/',
              main_file_dir+'pt11/b/',
              main_file_dir+'pt11/c/',
              main_file_dir+'pt12/a/',
              main_file_dir+'pt12/b/',
              main_file_dir+'pt12/c/',
              main_file_dir+'pt13/a/',
              main_file_dir+'pt13/b/',
              main_file_dir+'pt13/c/',
              main_file_dir+'pt14/a/',
              main_file_dir+'pt14/b/',
              main_file_dir+'pt14/c/',
              main_file_dir+'pt14/d/'
            ]


ndx = 0
for file_list in file_paths:
    the_pdfs = [ join(file_list, f) for f in listdir(file_list) if isfile(join(file_list, f))]
    '''
    # subroutine to test list content
    for the_pdf in the_pdfs:
        print('{0} -> {1}'.format(ndx, the_pdf))
        ndx += 1
    '''
    print(datetime.datetime.now())
    begin_ingesting = datetime.datetime.now()
    sleeptime = 30
    try:
        ingest_response = client.ingest_files(
        file_paths=the_pdfs
        )
    except:
        ingest_response = "Nothing ingested from {0}".format(file_list)
        sleeptime = 1 

    end_ingesting = datetime.datetime.now()

    # show results of ingesting documents
    print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx,      file_list, (end_ingesting-begin_ingesting), ingest_response))

    # brace against pinging API too quickly
    time.sleep(sleeptime)
    ndx += 1
"""