aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArun Isaac2022-05-04 18:50:19 +0530
committerArun Isaac2022-05-04 18:52:37 +0530
commitbaa9b54f1eb232ed891beaf8a24c3d0d058fbd31 (patch)
treea573b00dcee88b43a26c455a853b72b0cac37909
parent4684e4a5036a9b438560f1aa1767e5fee91c0ca3 (diff)
downloadgn-transform-databases-baa9b54f1eb232ed891beaf8a24c3d0d058fbd31.tar.gz
Add script to load RDF into virtuoso.
* load-rdf.scm: New file. * manifest.scm: Import virtuoso-ose from (gnu packages databases). Add virtuoso-ose to manifest.
-rwxr-xr-xload-rdf.scm193
-rw-r--r--manifest.scm4
2 files changed, 196 insertions, 1 deletions
diff --git a/load-rdf.scm b/load-rdf.scm
new file mode 100755
index 0000000..f6e0f62
--- /dev/null
+++ b/load-rdf.scm
@@ -0,0 +1,193 @@
+#! /usr/bin/env guile
+!#
+
+(use-modules (rnrs io ports)
+ (srfi srfi-1)
+ (srfi srfi-26)
+ (srfi srfi-71)
+ (srfi srfi-171)
+ (ice-9 match)
+ (ice-9 popen)
+ (hashing md5)
+ ((web client) #:select (http-head open-socket-for-uri))
+ (web request)
+ (web response)
+ (web uri))
+
+(define %graph-uri
+ "http://genenetwork.org")
+
+(define (random-cnonce len)
+ "Return a random hexadecimal string LEN characters long."
+ (list->string (map (lambda _
+ (let ((i (random 16)))
+ (integer->char
+ (if (< i 10)
+ (+ i (char->integer #\0))
+ (+ (- i 10)
+ (char->integer #\a))))))
+ (iota len))))
+
+(define (md5-digest-authorization method uri username password)
+ "Return a digest authorization header to access URI with METHOD,
+USERNAME and PASSWORD."
+ (let* ((response response-body (http-head uri))
+ (challenge (or (find (match-lambda
+ (('digest . challenge)
+ (let ((algorithm (assq-ref challenge 'algorithm)))
+ (or (not algorithm)
+ (string=? algorithm "MD5")))))
+ (assq-ref (response-headers response)
+ 'www-authenticate))
+ (error "No MD5 authentication challenges advertised")))
+ (realm (assq-ref challenge 'realm))
+ (nonce (assq-ref challenge 'nonce))
+ (qop (assq-ref challenge 'qop))
+ ;; Hard-code the request counter (nc) to "1".
+ (nc "1")
+ (cnonce (random-cnonce 8)))
+ `(digest (username . ,username)
+ (realm . ,realm)
+ (nonce . ,nonce)
+ (uri . ,(uri-path uri))
+ (algorithm . "MD5")
+ (qop . ,qop)
+ (nc . ,nc)
+ (cnonce . ,cnonce)
+ (response . ,(md5->string
+ (md5 (string->bytevector
+ (string-join
+ (list (md5->string
+ (md5 (string->bytevector (string-join
+ (list username realm password)
+ ":")
+ (make-transcoder (utf-8-codec)))))
+ nonce
+ nc
+ cnonce
+ qop
+ (md5->string
+ (md5 (string->bytevector (string-join (list (symbol->string method)
+ (uri-path uri))
+ ":")
+ (make-transcoder (utf-8-codec))))))
+ ":")
+ (make-transcoder (utf-8-codec))))))
+ (opaque . ,(assq-ref challenge 'opaque)))))
+
+(define* (http-upload-file method uri file #:key username password)
+ "Upload FILE to URI using METHOD. USERNAME and PASSWORD, if
+provided, are the username and password to authenticate with."
+ (let ((authorization-headers
+ (if username
+ `((authorization . ,(md5-digest-authorization method uri username password)))
+ '())))
+ (call-with-port (open-socket-for-uri uri)
+ (lambda (port)
+ (let ((request (write-request (build-request
+ uri
+ #:method method
+ #:headers `((connection close)
+ (content-length . ,(stat:size (stat file)))
+ ,@authorization-headers)
+ #:port port)
+ port)))
+ (when file
+ (call-with-input-file file
+ (lambda (in)
+ (port-transduce (tmap (cut write-request-body request <>))
+ (const #t)
+ get-bytevector-some
+ in)
+ (force-output (request-port request)))))
+ (read-response port))))))
+
+(define (call-with-pipe proc mode program . args)
+ "Execute PROGRAM ARGS ... in a subprocess with a pipe of MODE to
+it. Call PROC with a port to that pipe. Close the pipe once PROC
+exits, even if it exits non-locally. Return the value returned by
+PROC."
+ (let ((port #f))
+ (dynamic-wind (lambda () (set! port (apply open-pipe* mode program args)))
+ (cut proc port)
+ (lambda ()
+ (let ((return-value (status:exit-val (close-pipe port))))
+ (unless (and return-value
+ (zero? return-value))
+ (error "Invocation of program failed" (cons program args))))))))
+
+(define (put-graph sparql-endpoint username password rdf-file graph)
+ "Load RDF-FILE into GRAPH at SPARQL-ENDPOINT, a SPARQL 1.1 Graph
+Store HTTP Protocol endpoint, authenticating with USERNAME and
+PASSWORD. The PUT method is used, and therefore, any existing data in
+the graph is deleted."
+ (let ((response (http-upload-file 'PUT
+ (build-uri (uri-scheme sparql-endpoint)
+ #:host (uri-host sparql-endpoint)
+ #:port (uri-port sparql-endpoint)
+ #:path (uri-path sparql-endpoint)
+ #:query (string-append "graph-uri=" (uri-encode graph)))
+ rdf-file
+ #:username username
+ #:password password)))
+ ;; Fail if response code is not 2xx.
+ (unless (= (quotient (response-code response)
+ 100)
+ 2)
+ (error "Putting graph failed" response))))
+
+(define (delete-graph port graph)
+ "Delete GRAPH from virtuoso connecting to virtuoso on PORT."
+ ;; We do this with SQL because doing it with SPARQL is too
+ ;; slow. Note that this does not delete free-text index data, if
+ ;; any. See
+ ;; http://vos.openlinksw.com/owiki/wiki/VOS/VirtTipsAndTricksGuideDeleteLargeGraphs
+ (call-with-pipe
+ (lambda (port)
+ (format port
+ "DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');"
+ graph))
+ OPEN_WRITE
+ "isql" "-S" (number->string port)))
+
+(define (time-thunk thunk)
+ "Run THUNK and return the time taken in seconds."
+ (let ((start-time (current-time)))
+ (thunk)
+ (- (current-time) start-time)))
+
+(define main
+ (match-lambda*
+ ((_ connection-settings-file rdf-file)
+ (let ((connection-settings
+ (call-with-input-file connection-settings-file
+ read)))
+ ;; Delete existing data. We do not rely on the implicit
+ ;; deletion in the PUT method of the SPARQL 1.1 Graph Store
+ ;; HTTP Protocol because that is too slow.
+ (format (current-output-port)
+ "Existing virtuoso data deleted in ~a seconds~%"
+ (time-thunk
+ (cut delete-graph
+ (assq-ref connection-settings 'virtuoso-port)
+ %graph-uri)))
+ ;; Load data into virtuoso.
+ (format (current-output-port)
+ "~a loaded into virtuoso in ~a seconds~%"
+ rdf-file
+ (time-thunk
+ (cut put-graph
+ (build-uri
+ (assq-ref connection-settings 'sparql-scheme)
+ #:host (assq-ref connection-settings 'sparql-host)
+ #:port (assq-ref connection-settings 'sparql-port)
+ #:path "/sparql-graph-crud-auth")
+ (assq-ref connection-settings 'virtuoso-username)
+ (assq-ref connection-settings 'virtuoso-password)
+ rdf-file
+ %graph-uri)))))
+ ((arg0 _ ...)
+ (format (current-error-port) "Usage: ~a CONNECTION-SETTINGS-FILE RDF-FILE~%" arg0)
+ (exit #f))))
+
+(apply main (command-line))
diff --git a/manifest.scm b/manifest.scm
index 9e7c150..c8cd15a 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -6,6 +6,7 @@
(use-modules (gnu packages autotools)
((gnu packages bioinformatics) #:prefix guix:)
+ ((gnu packages databases) #:select (virtuoso-ose))
(gnu packages graphviz)
(gnu packages guile)
((gnu packages guile-xyz) #:select (guile-sparql) #:prefix guix:)
@@ -71,4 +72,5 @@
(list guile-3.0 guile-dbi guile-dbd-mysql
;; We abuse (ccwl graphviz) as a library to visualize the database
;; schema. Hence we need ccwl and guile-libyaml.
- ccwl graphviz guile-libyaml guile-sparql run64))
+ ccwl graphviz guile-hashing guile-libyaml guile-sparql
+ run64 virtuoso-ose))