#! /usr/bin/env guile
!#
(use-modules (rnrs io ports)
(srfi srfi-1)
(srfi srfi-26)
(srfi srfi-71)
(srfi srfi-171)
(ice-9 ftw)
(ice-9 match)
(ice-9 popen)
(hashing md5)
((web client) #:select (http-head open-socket-for-uri))
(web request)
(web response)
(web uri))
;; URI of the RDF graph that this script passes to delete-graph and
;; bulk-load-data.
(define %graph-uri
"http://rdf.genenetwork.org/v1")
(define (call-with-pipe proc mode program . args)
  "Run PROGRAM with ARGS in a subprocess connected through a pipe
opened in MODE, and apply PROC to the pipe's port. The pipe is closed
when PROC finishes, including when it leaves through a non-local
exit. An error is raised if the subprocess does not exit with status
zero. Return whatever PROC returns."
  (let ((pipe #f))
    ;; Entry guard: spawn the subprocess and remember its pipe.
    (define (open-pipe!)
      (set! pipe (apply open-pipe* mode program args)))
    ;; Exit guard: close the pipe and check how the subprocess ended.
    ;; status:exit-val is #f when there is no normal exit status,
    ;; which is treated as a failure as well.
    (define (close-pipe-and-check!)
      (let ((exit-code (status:exit-val (close-pipe pipe))))
        (when (or (not exit-code)
                  (not (zero? exit-code)))
          (error "Invocation of program failed" (cons program args)))))
    (dynamic-wind
      open-pipe!
      (lambda () (proc pipe))
      close-pipe-and-check!)))
(define (delete-graph port password graph)
  "Remove GRAPH from the virtuoso instance reachable on localhost
PORT, authenticating as the dba user with PASSWORD. The work is done
by piping a short script to the isql program."
  ;; The graph is removed with a plain SQL DELETE on rdf_quad because
  ;; doing it with SPARQL is too slow. Note that free-text index data,
  ;; if any, is not deleted. See
  ;; http://vos.openlinksw.com/owiki/wiki/VOS/VirtTipsAndTricksGuideDeleteLargeGraphs
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');"))
    (call-with-pipe
     ;; Write the filled-in script to isql's standard input.
     (cut format <> script port password graph)
     OPEN_WRITE
     "isql")))
(define (empty-load-queue port password)
  "Delete every row of the DB.DBA.load_list table on the virtuoso
instance reachable on localhost PORT, authenticating with PASSWORD.
The statement is piped to the isql program."
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
DELETE FROM DB.DBA.load_list;"))
    (call-with-pipe
     ;; Write the filled-in script to isql's standard input.
     (cut format <> script port password)
     OPEN_WRITE
     "isql")))
(define (bulk-load-data port password graph)
  "Bulk load data into GRAPH on the virtuoso instance reachable on
localhost PORT, authenticating with PASSWORD. The script piped to
isql calls ld_dir for the '*.ttl' files under /var/lib/data, then
rdf_loader_run, and finally issues a CHECKPOINT."
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
ld_dir('/var/lib/data', '*.ttl', '~a');
rdf_loader_run();
CHECKPOINT;
"))
    (call-with-pipe
     ;; Write the filled-in script to isql's standard input.
     (cut format <> script port password graph)
     OPEN_WRITE
     "isql")))
(define (set-global-namespaces port password)
  "Declare the namespace prefixes used by this project on the virtuoso
instance reachable on localhost PORT, authenticating with PASSWORD.
One DB.DBA.XML_SET_NS_DECL call per prefix is piped to isql."
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
DB.DBA.XML_SET_NS_DECL ('dcat', 'http://www.w3.org/ns/dcat#', 2);
DB.DBA.XML_SET_NS_DECL ('dct', 'http://purl.org/dc/terms/', 2);
DB.DBA.XML_SET_NS_DECL ('fabio', 'http://purl.org/spar/fabio/', 2);
DB.DBA.XML_SET_NS_DECL ('genbank', 'https://bioregistry.io/reference/genbank:', 2);
DB.DBA.XML_SET_NS_DECL ('gene', 'http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=', 2);
DB.DBA.XML_SET_NS_DECL ('generif', 'http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=', 2);
DB.DBA.XML_SET_NS_DECL ('geoSeries', 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=', 2);
DB.DBA.XML_SET_NS_DECL ('gn', 'http://rdf.genenetwork.org/v1/id/', 2);
DB.DBA.XML_SET_NS_DECL ('gnc', 'http://rdf.genenetwork.org/v1/category/', 2);
DB.DBA.XML_SET_NS_DECL ('gnt', 'http://rdf.genenetwork.org/v1/term/', 2);
DB.DBA.XML_SET_NS_DECL ('ncbiTaxon', 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=', 2);
DB.DBA.XML_SET_NS_DECL ('prism', 'http://prismstandard.org/namespaces/basic/2.0/', 2);
DB.DBA.XML_SET_NS_DECL ('probeset', 'http://rdf.genenetwork.org/v1/probeset/', 2);
DB.DBA.XML_SET_NS_DECL ('pubmed', 'http://rdf.ncbi.nlm.nih.gov/pubmed/', 2);
DB.DBA.XML_SET_NS_DECL ('qb', 'http://purl.org/linked-data/cube#', 2);
DB.DBA.XML_SET_NS_DECL ('sdmx-measure', 'http://purl.org/linked-data/sdmx/2009/measure#', 2);
DB.DBA.XML_SET_NS_DECL ('taxon', 'http://purl.uniprot.org/taxonomy/', 2);
DB.DBA.XML_SET_NS_DECL ('transcript', 'https://portals.broadinstitute.org/gpp/public/trans/details?transName=', 2);
DB.DBA.XML_SET_NS_DECL ('v', 'http://www.w3.org/2006/vcard/ns#', 2);
DB.DBA.XML_SET_NS_DECL ('xkos', 'http://rdf-vocabulary.ddialliance.org/xkos#', 2);
DB.DBA.XML_SET_NS_DECL ('schema', 'https://schema.org/', 2);
DB.DBA.XML_SET_NS_DECL ('foaf', 'http://xmlns.com/foaf/0.1/#term_', 2);
DB.DBA.XML_SET_NS_DECL ('gnd', 'https://cd.genenetwork.org/lmdb/v1/data/traits/', 2);
DB.DBA.XML_SET_NS_DECL ('gn-files', 'http://files.genenetwork.org/current/', 2);
"))
    (call-with-pipe
     ;; Write the filled-in script to isql's standard input.
     (cut format <> script port password)
     OPEN_WRITE
     "isql")))
(define (index-data port password)
  "Index all text data for quicker search on the virtuoso instance
reachable on localhost PORT, authenticating with PASSWORD. The script
piped to isql calls DB.DBA.RDF_OBJ_FT_RULE_ADD and
DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ, then quits."
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
DB.DBA.RDF_OBJ_FT_RULE_ADD (null, null, 'All');
DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ();
quit;
"))
    (call-with-pipe
     ;; Write the filled-in script to isql's standard input.
     (cut format <> script port password)
     OPEN_WRITE
     "isql")))
(define (time-thunk thunk)
  "Run THUNK and return the real time it took, in seconds, as an
inexact number. The value returned by THUNK is discarded."
  ;; (current-time) only has whole-second resolution, so any step
  ;; shorter than a second would be reported as 0. Guile's internal
  ;; real-time counter is finer grained; dividing by
  ;; internal-time-units-per-second converts its ticks to seconds.
  (let ((start (get-internal-real-time)))
    (thunk)
    (exact->inexact
     (/ (- (get-internal-real-time) start)
        internal-time-units-per-second))))
(define main
  ;; Entry point, applied to the complete command line (program name
  ;; first). With exactly one argument, CONNECTION-SETTINGS-FILE, read
  ;; an association list from that file and use its 'virtuoso-port and
  ;; 'virtuoso-password entries to delete the graph %graph-uri, set
  ;; the global namespaces, empty the load queue, bulk load the data
  ;; and index it, printing the time each step took. With any other
  ;; number of arguments, print a usage message to the error port and
  ;; exit with a failure status.
  (match-lambda*
    ((_ connection-settings-file)
     (let* ((connection-settings
             (call-with-input-file connection-settings-file
               read))
            ;; Look the settings up once instead of before every step.
            (virtuoso-port
             (assq-ref connection-settings 'virtuoso-port))
            (virtuoso-password
             (assq-ref connection-settings 'virtuoso-password)))
       ;; Delete existing data. We do not rely on the implicit
       ;; deletion in the PUT method of the SPARQL 1.1 Graph Store
       ;; HTTP Protocol because that is too slow.
       (format (current-output-port)
               "Existing virtuoso data deleted in ~a seconds~%"
               (time-thunk
                (cut delete-graph
                     virtuoso-port
                     virtuoso-password
                     %graph-uri)))
       ;; Update global namespaces
       (format (current-output-port)
               "Global namespaces set in ~a seconds~%"
               (time-thunk
                (cut set-global-namespaces
                     virtuoso-port
                     virtuoso-password)))
       ;; Delete the load queue
       (format (current-output-port)
               "Existing DB.LOAD queue deleted in ~a seconds~%"
               (time-thunk
                (cut empty-load-queue
                     virtuoso-port
                     virtuoso-password)))
       ;; Bulk load data
       (format (current-output-port)
               "Existing virtuoso data uploaded in ~a seconds~%"
               (time-thunk
                (cut bulk-load-data
                     virtuoso-port
                     virtuoso-password
                     %graph-uri)))
       ;; Index the data. The message needs a ~a directive to consume
       ;; the elapsed time (an argument without a matching directive
       ;; is never printed) and a ~% so the line is terminated like
       ;; the ones above.
       (format (current-output-port)
               "Data indexed in ~a seconds~%"
               (time-thunk
                (cut index-data
                     virtuoso-port
                     virtuoso-password)))))
    ((arg0 _ ...)
     (format (current-error-port) "Usage: ~a CONNECTION-SETTINGS-FILE~%" arg0)
     (exit #f))))
;; Run main with the command line (program name followed by its
;; arguments) as soon as the script is loaded.
(apply main (command-line))