aboutsummaryrefslogtreecommitdiff
path: root/load-rdf.scm
blob: fcb4f4dcc00103687b3791f753094cc28c949fd8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#! /usr/bin/env guile
!#

(use-modules (rnrs io ports)
             (srfi srfi-1)
             (srfi srfi-26)
             (srfi srfi-71)
             (srfi srfi-171)
             (ice-9 match)
             (ice-9 popen)
             (hashing md5)
             ((web client) #:select (http-head open-socket-for-uri))
             (web request)
             (web response)
             (web uri))

(define %graph-uri
  "http://genenetwork.org")

(define (random-cnonce len)
  "Return a random hexadecimal string LEN characters long."
  (list->string (map (lambda _
                       (let ((i (random 16)))
                         (integer->char
                          (if (< i 10)
                              (+ i (char->integer #\0))
                              (+ (- i 10)
                                 (char->integer #\a))))))
                     (iota len))))

(define (md5-digest-authorization method uri username password)
  "Return a digest authorization header to access URI with METHOD,
USERNAME and PASSWORD."
  (let* ((response response-body (http-head uri))
         (challenge (or (find (match-lambda
                                (('digest . challenge)
                                 (let ((algorithm (assq-ref challenge 'algorithm)))
                                   (or (not algorithm)
                                       (string=? algorithm "MD5")))))
                              (assq-ref (response-headers response)
                                        'www-authenticate))
                        (error "No MD5 authentication challenges advertised")))
         (realm (assq-ref challenge 'realm))
         (nonce (assq-ref challenge 'nonce))
         (qop (assq-ref challenge 'qop))
         ;; Hard-code the request counter (nc) to "1".
         (nc "1")
         (cnonce (random-cnonce 8)))
    `(digest (username . ,username)
             (realm . ,realm)
             (nonce . ,nonce)
             (uri . ,(uri-path uri))
             (algorithm . "MD5")
             (qop . ,qop)
             (nc . ,nc)
             (cnonce . ,cnonce)
             (response . ,(md5->string
                           (md5 (string->bytevector
                                 (string-join
                                  (list (md5->string
                                         (md5 (string->bytevector (string-join
                                                                   (list username realm password)
                                                                   ":")
                                                                  (make-transcoder (utf-8-codec)))))
                                        nonce
                                        nc
                                        cnonce
                                        qop
                                        (md5->string
                                         (md5 (string->bytevector (string-join (list (symbol->string method)
                                                                                     (uri-path uri))
                                                                               ":")
                                                                  (make-transcoder (utf-8-codec))))))
                                  ":")
                                 (make-transcoder (utf-8-codec))))))
             (opaque . ,(assq-ref challenge 'opaque)))))

(define* (http-upload-file method uri file #:key username password)
  "Upload FILE to URI using METHOD. USERNAME and PASSWORD, if
provided, are the username and password to authenticate with."
  (let ((authorization-headers
         (if username
             `((authorization . ,(md5-digest-authorization method uri username password)))
             '())))
    (call-with-port (open-socket-for-uri uri)
      (lambda (port)
        (let ((request (write-request (build-request
                                       uri
                                       #:method method
                                       #:headers `((connection close)
                                                   (content-length . ,(stat:size (stat file)))
                                                   ,@authorization-headers)
                                       #:port port)
                                      port)))
          (when file
            (call-with-input-file file
              (lambda (in)
                (port-transduce (tmap (cut write-request-body request <>))
                                (const #t)
                                get-bytevector-some
                                in)
                (force-output (request-port request)))))
          (read-response port))))))

(define (call-with-pipe proc mode program . args)
  "Execute PROGRAM ARGS ... in a subprocess with a pipe of MODE to
it. Call PROC with a port to that pipe. Close the pipe once PROC
exits, even if it exits non-locally. Return the value returned by
PROC."
  (let ((port #f))
    (dynamic-wind (lambda () (set! port (apply open-pipe* mode program args)))
                  (cut proc port)
                  (lambda ()
                    (let ((return-value (status:exit-val (close-pipe port))))
                      (unless (and return-value
                                   (zero? return-value))
                        (error "Invocation of program failed" (cons program args))))))))

(define* (put-graph sparql-endpoint username password rdf-file graph #:optional retry?)
  "Load RDF-FILE into GRAPH at SPARQL-ENDPOINT, a SPARQL 1.1 Graph
Store HTTP Protocol endpoint, authenticating with USERNAME and
PASSWORD. The PUT method is used, and therefore, any existing data in
the graph is deleted."
  (let ((response (http-upload-file 'PUT
                                    (build-uri (uri-scheme sparql-endpoint)
                                               #:host (uri-host sparql-endpoint)
                                               #:port (uri-port sparql-endpoint)
                                               #:path (uri-path sparql-endpoint)
                                               #:query (string-append "graph-uri=" (uri-encode graph)))
                                    rdf-file
                                    #:username username
                                    #:password password)))
    ;; Fail if response code is not 2xx.
    (unless (= (quotient (response-code response)
                         100)
               2)
      ;; When uploading a large RDF file into a clean virtuoso, it
      ;; fails for the first time, and succeeds only the second
      ;; time. It is unclear why. So, perform this ugly hack of trying
      ;; twice.
      (if retry?
          (put-graph sparql-endpoint username password rdf-file graph #f)
          (error "Putting graph failed" response)))))

(define (delete-graph port password graph)
  "Delete GRAPH from virtuoso connecting to virtuoso on PORT
authenticating as the dba user with PASSWORD."
  ;; We do this with SQL because doing it with SPARQL is too
  ;; slow. Note that this does not delete free-text index data, if
  ;; any. See
  ;; http://vos.openlinksw.com/owiki/wiki/VOS/VirtTipsAndTricksGuideDeleteLargeGraphs
  (call-with-pipe
   (lambda (out)
     (format out
             "SET DSN=localhost:~a;
SET PWD=~s;
DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');"
             port
             password
             graph))
   OPEN_WRITE
   "isql"))

(define (time-thunk thunk)
  "Run THUNK and return the time taken in seconds."
  (let ((start-time (current-time)))
    (thunk)
    (- (current-time) start-time)))

(define main
  (match-lambda*
    ((_ connection-settings-file rdf-file)
     (let ((connection-settings
            (call-with-input-file connection-settings-file
              read)))
       ;; Delete existing data. We do not rely on the implicit
       ;; deletion in the PUT method of the SPARQL 1.1 Graph Store
       ;; HTTP Protocol because that is too slow.
       (format (current-output-port)
               "Existing virtuoso data deleted in ~a seconds~%"
               (time-thunk
                (cut delete-graph
                     (assq-ref connection-settings 'virtuoso-port)
                     (assq-ref connection-settings 'virtuoso-password)
                     %graph-uri)))
       ;; Load data into virtuoso.
       (format (current-output-port)
               "~a loaded into virtuoso in ~a seconds~%"
               rdf-file
               (time-thunk
                (cut put-graph
                     (build-uri
                      (assq-ref connection-settings 'sparql-scheme)
                      #:host (assq-ref connection-settings 'sparql-host)
                      #:port (assq-ref connection-settings 'sparql-port)
                      #:path "/sparql-graph-crud-auth")
                     (assq-ref connection-settings 'virtuoso-username)
                     (assq-ref connection-settings 'virtuoso-password)
                     rdf-file
                     %graph-uri
                     #t)))))
    ((arg0 _ ...)
     (format (current-error-port) "Usage: ~a CONNECTION-SETTINGS-FILE RDF-FILE~%" arg0)
     (exit #f))))

(apply main (command-line))