#! /usr/bin/env guile
!#

(use-modules (rnrs io ports)
             (srfi srfi-1)
             (srfi srfi-26)
             (srfi srfi-71)
             (srfi srfi-171)
             (ice-9 ftw)
             (ice-9 match)
             (ice-9 popen)
             (hashing md5)
             ((web client) #:select (http-head open-socket-for-uri))
             (web request)
             (web response)
             (web uri))

;; URI of the RDF graph that this script deletes and then reloads.
(define %graph-uri
  "http://rdf.genenetwork.org/v1")

(define (call-with-pipe proc mode program . args)
  "Execute PROGRAM ARGS ... in a subprocess with a pipe of MODE to
it. Call PROC with a port to that pipe. Close the pipe once PROC
exits, even if it exits non-locally. Return the value returned by
PROC. Raise an error if the subprocess does not report an exit status
of zero."
  (let ((port #f))
    (dynamic-wind
      (lambda ()
        (set! port (apply open-pipe* mode program args)))
      ;; cut builds a thunk, so PORT is looked up only when the thunk
      ;; runs, that is, after the guard above has opened the pipe.
      (cut proc port)
      (lambda ()
        ;; status:exit-val yields #f when there is no exit value to
        ;; report; that is treated as a failure too.
        (let ((return-value (status:exit-val (close-pipe port))))
          (unless (and return-value
                       (zero? return-value))
            (error "Invocation of program failed" (cons program args))))))))

(define (run-isql port password sql . args)
  "Run the isql program and write to its standard input the connection
preamble for the virtuoso server on localhost PORT, authenticating
with PASSWORD, followed by SQL. SQL is a format string; its
directives are filled in from ARGS. Raise an error if isql fails."
  (call-with-pipe
   (lambda (out)
     ;; PASSWORD is written with ~s, i.e. as a quoted string.
     (apply format out
            (string-append "SET DSN=localhost:~a; SET PWD=~s; " sql)
            port password args))
   OPEN_WRITE
   "isql"))

(define (delete-graph port password graph)
  "Delete GRAPH from virtuoso connecting to virtuoso on PORT
authenticating as the dba user with PASSWORD."
  ;; We do this with SQL because doing it with SPARQL is too
  ;; slow. Note that this does not delete free-text index data, if
  ;; any. See
  ;; http://vos.openlinksw.com/owiki/wiki/VOS/VirtTipsAndTricksGuideDeleteLargeGraphs
  (run-isql port password
            "DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');"
            graph))

(define (empty-load-queue port password)
  "Empty the bulk load queue, i.e. delete every row of
DB.DBA.load_list, connecting to virtuoso on PORT and authenticating
with PASSWORD."
  (run-isql port password
            "DELETE FROM DB.DBA.load_list;"))

(define (bulk-load-data port password graph)
  "Bulk load the '*.ttl' files in /var/lib/data into GRAPH, connecting
to virtuoso on PORT and authenticating with PASSWORD. Run the loader
and issue a CHECKPOINT afterwards."
  (run-isql port password
            (string-append
             "ld_dir('/var/lib/data', '*.ttl', '~a'); "
             "rdf_loader_run(); "
             "CHECKPOINT; ")
            graph))

(define (set-global-namespaces port password)
  "Declare the global namespace prefixes, connecting to virtuoso on
PORT and authenticating with PASSWORD."
  (run-isql
   port password
   (string-append
    "DB.DBA.XML_SET_NS_DECL ('dcat', 'http://www.w3.org/ns/dcat#', 2); "
    "DB.DBA.XML_SET_NS_DECL ('dct', 'http://purl.org/dc/terms/', 2); "
    "DB.DBA.XML_SET_NS_DECL ('fabio', 'http://purl.org/spar/fabio/', 2); "
    "DB.DBA.XML_SET_NS_DECL ('genbank', 'https://bioregistry.io/reference/genbank:', 2); "
    "DB.DBA.XML_SET_NS_DECL ('gene', 'http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=', 2); "
    "DB.DBA.XML_SET_NS_DECL ('generif', 'http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=', 2); "
    "DB.DBA.XML_SET_NS_DECL ('geoSeries', 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=', 2); "
    "DB.DBA.XML_SET_NS_DECL ('gn', 'http://rdf.genenetwork.org/v1/id/', 2); "
    "DB.DBA.XML_SET_NS_DECL ('gnc', 'http://rdf.genenetwork.org/v1/category/', 2); "
    "DB.DBA.XML_SET_NS_DECL ('gnt', 'http://rdf.genenetwork.org/v1/term/', 2); "
    "DB.DBA.XML_SET_NS_DECL ('ncbiTaxon', 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=', 2); "
    "DB.DBA.XML_SET_NS_DECL ('prism', 'http://prismstandard.org/namespaces/basic/2.0/', 2); "
    ;; The original literal contains a line break at this point; it is
    ;; kept so that the text sent to isql is unchanged.
    "DB.DBA.XML_SET_NS_DECL ('probeset', 'http://rdf.genenetwork.org/v1/probeset/', 2); \n"
    "DB.DBA.XML_SET_NS_DECL ('pubmed', 'http://rdf.ncbi.nlm.nih.gov/pubmed/', 2); "
    "DB.DBA.XML_SET_NS_DECL ('qb', 'http://purl.org/linked-data/cube#', 2); "
    "DB.DBA.XML_SET_NS_DECL ('sdmx-measure', 'http://purl.org/linked-data/sdmx/2009/measure#', 2); "
    "DB.DBA.XML_SET_NS_DECL ('taxon', 'http://purl.uniprot.org/taxonomy/', 2); "
    "DB.DBA.XML_SET_NS_DECL ('transcript', 'https://portals.broadinstitute.org/gpp/public/trans/details?transName=', 2); "
    "DB.DBA.XML_SET_NS_DECL ('v', 'http://www.w3.org/2006/vcard/ns#', 2); "
    "DB.DBA.XML_SET_NS_DECL ('xkos', 'http://rdf-vocabulary.ddialliance.org/xkos#', 2); "
    "DB.DBA.XML_SET_NS_DECL ('schema', 'https://schema.org/', 2); "
    "DB.DBA.XML_SET_NS_DECL ('foaf', 'http://xmlns.com/foaf/0.1/#term_', 2); ")))

(define (index-data port password)
  "Index all text data for quicker search, connecting to virtuoso on
PORT and authenticating with PASSWORD."
  (run-isql port password
            (string-append
             "DB.DBA.RDF_OBJ_FT_RULE_ADD (null, null, 'All'); "
             "DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ(); "
             "quit; ")))

(define (time-thunk thunk)
  "Run THUNK and return the time taken in seconds, measured as the
difference between two readings of current-time. The value returned
by THUNK is discarded."
  (let ((start-time (current-time)))
    (thunk)
    (- (current-time) start-time)))

;; Entry point. Called with the full command line: the program name
;; followed by exactly one argument, the path of a file from which an
;; association list is read. The keys virtuoso-port and
;; virtuoso-password are looked up in it. With any other number of
;; arguments, print a usage message to the error port and exit with a
;; failure status.
(define main
  (match-lambda*
    ((_ connection-settings-file)
     (let* ((connection-settings
             (call-with-input-file connection-settings-file read))
            (port (assq-ref connection-settings 'virtuoso-port))
            (password (assq-ref connection-settings 'virtuoso-password)))
       ;; Delete existing data. We do not rely on the implicit
       ;; deletion in the PUT method of the SPARQL 1.1 Graph Store
       ;; HTTP Protocol because that is too slow.
       (format (current-output-port)
               "Existing virtuoso data deleted in ~a seconds~%"
               (time-thunk (cut delete-graph port password %graph-uri)))
       ;; Update global namespaces
       (format (current-output-port)
               "Global namespaces set in ~a seconds~%"
               (time-thunk (cut set-global-namespaces port password)))
       ;; Delete the load queue
       (format (current-output-port)
               "Existing DB.LOAD queue deleted in ~a seconds~%"
               (time-thunk (cut empty-load-queue port password)))
       ;; Bulk load data
       (format (current-output-port)
               "Existing virtuoso data uploaded in ~a seconds~%"
               (time-thunk (cut bulk-load-data port password %graph-uri)))
       ;; Index the data. The message needs a ~a directive to consume
       ;; the elapsed time and a ~% to end the line, like the messages
       ;; above; the previous format string had neither.
       (format (current-output-port)
               "Data indexed in ~a seconds~%"
               (time-thunk (cut index-data port password)))))
    ((arg0 _ ...)
     (format (current-error-port)
             "Usage: ~a CONNECTION-SETTINGS-FILE~%"
             arg0)
     (exit #f))))

(apply main (command-line))