aboutsummaryrefslogtreecommitdiff
path: root/load-rdf.scm
blob: aaf1b002d9101dc07452bde25cbbcc27378fc0db (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#! /usr/bin/env guile
!#

(use-modules (rnrs io ports)
             (srfi srfi-1)
             (srfi srfi-26)
             (srfi srfi-71)
             (srfi srfi-171)
             (ice-9 ftw)
             (ice-9 match)
             (ice-9 popen)
             (hashing md5)
             ((web client) #:select (http-head open-socket-for-uri))
             (web request)
             (web response)
             (web uri))

;; URI of the RDF graph handled by this script: main passes it to
;; delete-graph and to bulk-load-data.
(define %graph-uri
  "http://genenetwork.org")

(define (call-with-pipe proc mode program . args)
  "Run PROGRAM with ARGS in a subprocess connected through a pipe
opened in MODE, and call PROC with the port for that pipe. The pipe is
closed when PROC exits, whether normally or non-locally. Return the
value returned by PROC. Raise an error if the subprocess does not
report a zero exit value."
  (define pipe-port #f)
  (define (open!)
    (set! pipe-port (apply open-pipe* mode program args)))
  (define (close-and-check!)
    ;; A false exit value is treated as a failure, just like a
    ;; non-zero one.
    (let ((exit-value (status:exit-val (close-pipe pipe-port))))
      (if (not (and exit-value
                    (zero? exit-value)))
          (error "Invocation of program failed" (cons program args)))))
  (dynamic-wind open!
                (lambda () (proc pipe-port))
                close-and-check!))


(define (delete-graph port password graph)
  "Delete GRAPH from virtuoso, connecting to virtuoso on PORT and
authenticating as the dba user with PASSWORD. The statements are fed
to an isql subprocess on its standard input."
  ;; SQL is used here because deleting through SPARQL is too slow.
  ;; Free-text index data, if any, is not deleted by this. See
  ;; http://vos.openlinksw.com/owiki/wiki/VOS/VirtTipsAndTricksGuideDeleteLargeGraphs
  (let ((script (format #f
                        "SET DSN=localhost:~a;
SET PWD=~s;
DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');"
                        port
                        password
                        graph)))
    (call-with-pipe (cut display script <>)
                    OPEN_WRITE
                    "isql")))

(define (empty-load-queue port password)
  "Empty the virtuoso load queue by deleting every row of
DB.DBA.load_list. Connect through isql to localhost on PORT,
authenticating with PASSWORD."
  (let ((script (format #f
                        "SET DSN=localhost:~a;
SET PWD=~s;
DELETE FROM DB.DBA.load_list;"
                        port
                        password)))
    (call-with-pipe (cut display script <>)
                    OPEN_WRITE
                    "isql")))

(define (bulk-load-data port password graph)
  "Bulk load data into virtuoso. Register the '*.ttl' files under
/var/lib/data for loading into GRAPH, run the RDF loader and issue a
CHECKPOINT. Connect through isql to localhost on PORT, authenticating
with PASSWORD."
  (let ((script (format #f
                        "SET DSN=localhost:~a;
SET PWD=~s;
ld_dir('/var/lib/data', '*.ttl', '~a');
rdf_loader_run();
CHECKPOINT;
"
                        port
                        password
                        graph)))
    (call-with-pipe (cut display script <>)
                    OPEN_WRITE
                    "isql")))

(define (index-data port password)
  "Index all text data for quicker search. Add a free-text rule named
'All' and then run DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ. Connect through
isql to localhost on PORT, authenticating with PASSWORD."
  (let ((script (format #f
                        "SET DSN=localhost:~a;
SET PWD=~s;
DB.DBA.RDF_OBJ_FT_RULE_ADD (null, null, 'All');
DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ();
quit;
"
                        port
                        password)))
    (call-with-pipe (cut display script <>)
                    OPEN_WRITE
                    "isql")))

(define (time-thunk thunk)
  "Run THUNK and return the real time it took, in seconds, as an
inexact number. The value returned by THUNK is discarded."
  ;; `current-time' only counts whole seconds, so anything quicker
  ;; than a second would be reported as 0. The internal real-time
  ;; counter ticks internal-time-units-per-second times per second.
  (let ((start (get-internal-real-time)))
    (thunk)
    (exact->inexact
     (/ (- (get-internal-real-time) start)
        internal-time-units-per-second))))

;; Entry point, applied to the complete command line (program name
;; first). With exactly one argument, read the connection settings
;; from that file, then delete the existing graph, empty the load
;; queue, bulk load the data and index it, reporting how long each
;; step took. With any other number of arguments, print a usage
;; message to the error port and exit with a failure status.
(define main
  (match-lambda*
    ((_ connection-settings-file)
     ;; The settings file is read as a single datum, from which the
     ;; 'virtuoso-port and 'virtuoso-password entries are looked up
     ;; with assq-ref.
     (let* ((connection-settings
             (call-with-input-file connection-settings-file
               read))
            (virtuoso-port
             (assq-ref connection-settings 'virtuoso-port))
            (virtuoso-password
             (assq-ref connection-settings 'virtuoso-password)))
       ;; Delete existing data. We do not rely on the implicit
       ;; deletion in the PUT method of the SPARQL 1.1 Graph Store
       ;; HTTP Protocol because that is too slow.
       (format (current-output-port)
               "Existing virtuoso data deleted in ~a seconds~%"
               (time-thunk
                (cut delete-graph
                     virtuoso-port
                     virtuoso-password
                     %graph-uri)))
       ;; Delete the load queue
       (format (current-output-port)
               "Existing DB.LOAD queue deleted in ~a seconds~%"
               (time-thunk
                (cut empty-load-queue
                     virtuoso-port
                     virtuoso-password)))
       ;; Bulk load data
       (format (current-output-port)
               "Existing virtuoso data uploaded in ~a seconds~%"
               (time-thunk
                (cut bulk-load-data
                     virtuoso-port
                     virtuoso-password
                     %graph-uri)))
       ;; Index the data. The message needs a ~a directive to consume
       ;; the elapsed time passed to format, and a ~% so the line is
       ;; terminated like the ones above.
       (format (current-output-port)
               "Data indexed in ~a seconds~%"
               (time-thunk
                (cut index-data
                     virtuoso-port
                     virtuoso-password)))))
    ((arg0 _ ...)
     (format (current-error-port) "Usage: ~a CONNECTION-SETTINGS-FILE~%" arg0)
     (exit #f))))

;; Run main on the command line of this process; its first element is
;; the program name, which main uses in the usage message.
(apply main (command-line))