diff options
author | Arun Isaac | 2022-05-04 18:50:19 +0530 |
---|---|---|
committer | Arun Isaac | 2022-05-04 18:52:37 +0530 |
commit | baa9b54f1eb232ed891beaf8a24c3d0d058fbd31 (patch) | |
tree | a573b00dcee88b43a26c455a853b72b0cac37909 | |
parent | 4684e4a5036a9b438560f1aa1767e5fee91c0ca3 (diff) | |
download | gn-transform-databases-baa9b54f1eb232ed891beaf8a24c3d0d058fbd31.tar.gz |
Add script to load RDF into virtuoso.
* load-rdf.scm: New file.
* manifest.scm: Import virtuoso-ose from (gnu packages databases). Add
virtuoso-ose to manifest.
-rwxr-xr-x | load-rdf.scm | 193 | ||||
-rw-r--r-- | manifest.scm | 4 |
2 files changed, 196 insertions, 1 deletions
diff --git a/load-rdf.scm b/load-rdf.scm new file mode 100755 index 0000000..f6e0f62 --- /dev/null +++ b/load-rdf.scm @@ -0,0 +1,193 @@ +#! /usr/bin/env guile +!# + +(use-modules (rnrs io ports) + (srfi srfi-1) + (srfi srfi-26) + (srfi srfi-71) + (srfi srfi-171) + (ice-9 match) + (ice-9 popen) + (hashing md5) + ((web client) #:select (http-head open-socket-for-uri)) + (web request) + (web response) + (web uri)) + +(define %graph-uri + "http://genenetwork.org") + +(define (random-cnonce len) + "Return a random hexadecimal string LEN characters long." + (list->string (map (lambda _ + (let ((i (random 16))) + (integer->char + (if (< i 10) + (+ i (char->integer #\0)) + (+ (- i 10) + (char->integer #\a)))))) + (iota len)))) + +(define (md5-digest-authorization method uri username password) + "Return a digest authorization header to access URI with METHOD, +USERNAME and PASSWORD." + (let* ((response response-body (http-head uri)) + (challenge (or (find (match-lambda + (('digest . challenge) + (let ((algorithm (assq-ref challenge 'algorithm))) + (or (not algorithm) + (string=? algorithm "MD5"))))) + (assq-ref (response-headers response) + 'www-authenticate)) + (error "No MD5 authentication challenges advertised"))) + (realm (assq-ref challenge 'realm)) + (nonce (assq-ref challenge 'nonce)) + (qop (assq-ref challenge 'qop)) + ;; Hard-code the request counter (nc) to "1". + (nc "1") + (cnonce (random-cnonce 8))) + `(digest (username . ,username) + (realm . ,realm) + (nonce . ,nonce) + (uri . ,(uri-path uri)) + (algorithm . "MD5") + (qop . ,qop) + (nc . ,nc) + (cnonce . ,cnonce) + (response . ,(md5->string + (md5 (string->bytevector + (string-join + (list (md5->string + (md5 (string->bytevector (string-join + (list username realm password) + ":") + (make-transcoder (utf-8-codec))))) + nonce + nc + cnonce + qop + (md5->string + (md5 (string->bytevector (string-join (list (symbol->string method) + (uri-path uri)) + ":") + (make-transcoder (utf-8-codec)))))) + ":") + (make-transcoder (utf-8-codec)))))) + (opaque . ,(assq-ref challenge 'opaque))))) + +(define* (http-upload-file method uri file #:key username password) + "Upload FILE to URI using METHOD. USERNAME and PASSWORD, if +provided, are the username and password to authenticate with." + (let ((authorization-headers + (if username + `((authorization . ,(md5-digest-authorization method uri username password))) + '()))) + (call-with-port (open-socket-for-uri uri) + (lambda (port) + (let ((request (write-request (build-request + uri + #:method method + #:headers `((connection close) + (content-length . ,(stat:size (stat file))) + ,@authorization-headers) + #:port port) + port))) + (when file + (call-with-input-file file + (lambda (in) + (port-transduce (tmap (cut write-request-body request <>)) + (const #t) + get-bytevector-some + in) + (force-output (request-port request))))) + (read-response port)))))) + +(define (call-with-pipe proc mode program . args) + "Execute PROGRAM ARGS ... in a subprocess with a pipe of MODE to +it. Call PROC with a port to that pipe. Close the pipe once PROC +exits, even if it exits non-locally. Return the value returned by +PROC." + (let ((port #f)) + (dynamic-wind (lambda () (set! port (apply open-pipe* mode program args))) + (cut proc port) + (lambda () + (let ((return-value (status:exit-val (close-pipe port)))) + (unless (and return-value + (zero? return-value)) + (error "Invocation of program failed" (cons program args)))))))) + +(define (put-graph sparql-endpoint username password rdf-file graph) + "Load RDF-FILE into GRAPH at SPARQL-ENDPOINT, a SPARQL 1.1 Graph +Store HTTP Protocol endpoint, authenticating with USERNAME and +PASSWORD. The PUT method is used, and therefore, any existing data in +the graph is deleted." + (let ((response (http-upload-file 'PUT + (build-uri (uri-scheme sparql-endpoint) + #:host (uri-host sparql-endpoint) + #:port (uri-port sparql-endpoint) + #:path (uri-path sparql-endpoint) + #:query (string-append "graph-uri=" (uri-encode graph))) + rdf-file + #:username username + #:password password))) + ;; Fail if response code is not 2xx. + (unless (= (quotient (response-code response) + 100) + 2) + (error "Putting graph failed" response)))) + +(define (delete-graph port graph) + "Delete GRAPH from virtuoso connecting to virtuoso on PORT." + ;; We do this with SQL because doing it with SPARQL is too + ;; slow. Note that this does not delete free-text index data, if + ;; any. See + ;; http://vos.openlinksw.com/owiki/wiki/VOS/VirtTipsAndTricksGuideDeleteLargeGraphs + (call-with-pipe + (lambda (port) + (format port + "DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');" + graph)) + OPEN_WRITE + "isql" "-S" (number->string port))) + +(define (time-thunk thunk) + "Run THUNK and return the time taken in seconds." + (let ((start-time (current-time))) + (thunk) + (- (current-time) start-time))) + +(define main + (match-lambda* + ((_ connection-settings-file rdf-file) + (let ((connection-settings + (call-with-input-file connection-settings-file + read))) + ;; Delete existing data. We do not rely on the implicit + ;; deletion in the PUT method of the SPARQL 1.1 Graph Store + ;; HTTP Protocol because that is too slow. + (format (current-output-port) + "Existing virtuoso data deleted in ~a seconds~%" + (time-thunk + (cut delete-graph + (assq-ref connection-settings 'virtuoso-port) + %graph-uri))) + ;; Load data into virtuoso. + (format (current-output-port) + "~a loaded into virtuoso in ~a seconds~%" + rdf-file + (time-thunk + (cut put-graph + (build-uri + (assq-ref connection-settings 'sparql-scheme) + #:host (assq-ref connection-settings 'sparql-host) + #:port (assq-ref connection-settings 'sparql-port) + #:path "/sparql-graph-crud-auth") + (assq-ref connection-settings 'virtuoso-username) + (assq-ref connection-settings 'virtuoso-password) + rdf-file + %graph-uri))))) + ((arg0 _ ...) + (format (current-error-port) "Usage: ~a CONNECTION-SETTINGS-FILE RDF-FILE~%" arg0) + (exit #f)))) + +(apply main (command-line)) diff --git a/manifest.scm b/manifest.scm index 9e7c150..c8cd15a 100644 --- a/manifest.scm +++ b/manifest.scm @@ -6,6 +6,7 @@ (use-modules (gnu packages autotools) ((gnu packages bioinformatics) #:prefix guix:) + ((gnu packages databases) #:select (virtuoso-ose)) (gnu packages graphviz) (gnu packages guile) ((gnu packages guile-xyz) #:select (guile-sparql) #:prefix guix:) @@ -71,4 +72,5 @@ (list guile-3.0 guile-dbi guile-dbd-mysql ;; We abuse (ccwl graphviz) as a library to visualize the database ;; schema. Hence we need ccwl and guile-libyaml. - ccwl graphviz guile-libyaml guile-sparql run64)) + ccwl graphviz guile-hashing guile-libyaml guile-sparql + run64 virtuoso-ose)) |