#! /usr/bin/env guile
!#

(use-modules (rnrs io ports)
             (srfi srfi-1)
             (srfi srfi-26)
             (srfi srfi-71)
             (srfi srfi-171)
             (ice-9 ftw)
             (ice-9 match)
             (ice-9 popen)
             (hashing md5)
             ((web client) #:select (http-head open-socket-for-uri))
             (web request)
             (web response)
             (web uri))

;; URI of the RDF graph this script manages: `main' passes it to
;; `delete-graph' and to `bulk-load-data'.
(define %graph-uri
  "http://rdf.genenetwork.org/v1")

(define (call-with-pipe proc mode program . args)
  "Run PROGRAM with ARGS in a subprocess connected through a pipe
opened in MODE, and apply PROC to the port of that pipe. The pipe is
closed whenever control leaves PROC, including non-local exits. An
error is raised unless the subprocess exited with status zero.
Otherwise the value of PROC is returned."
  (define pipe-port #f)
  ;; Entry guard: spawn the subprocess and remember its port.
  (define (open-subprocess!)
    (set! pipe-port (apply open-pipe* mode program args)))
  ;; Exit guard: reap the subprocess and insist on a clean exit. The
  ;; exit value is #f when the status carries no exit code.
  (define (close-and-check!)
    (let ((exit-code (status:exit-val (close-pipe pipe-port))))
      (when (or (not exit-code)
                (not (zero? exit-code)))
        (error "Invocation of program failed" (cons program args)))))
  (dynamic-wind open-subprocess!
                (lambda () (proc pipe-port))
                close-and-check!))


(define (delete-graph port password graph)
  "Remove GRAPH from virtuoso. The statements are piped to isql, which
connects to virtuoso on PORT and authenticates as the dba user with
PASSWORD."
  ;; The deletion is issued as SQL against rdf_quad because doing it
  ;; with SPARQL is too slow. Free-text index data, if any, is not
  ;; removed by this. See
  ;; http://vos.openlinksw.com/owiki/wiki/VOS/VirtTipsAndTricksGuideDeleteLargeGraphs
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
DELETE FROM rdf_quad WHERE g = iri_to_id ('~a');"))
    (call-with-pipe (cut format <> script port password graph)
                    OPEN_WRITE
                    "isql")))

(define (empty-load-queue port password)
  "Empty the virtuoso load queue by deleting every row of
DB.DBA.load_list. The statements are piped to isql, which connects to
virtuoso on PORT using PASSWORD."
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
DELETE FROM DB.DBA.load_list;"))
    (call-with-pipe (lambda (isql-in)
                      (format isql-in script port password))
                    OPEN_WRITE
                    "isql")))

(define (bulk-load-data port password graph)
  "Bulk load data into virtuoso. Through isql (connecting on PORT with
PASSWORD), call ld_dir for the '*.ttl' files of /var/lib/data with
GRAPH as its last argument, then rdf_loader_run, then CHECKPOINT."
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
ld_dir('/var/lib/data', '*.ttl', '~a');
rdf_loader_run();
CHECKPOINT;
"))
    (call-with-pipe (cut format <> script port password graph)
                    OPEN_WRITE
                    "isql")))

(define (set-global-namespaces port password)
  "Set the global namespaces. For each known prefix, a
DB.DBA.XML_SET_NS_DECL call binding the prefix to its URI is piped to
isql, which connects to virtuoso on PORT using PASSWORD."
  ;; Prefix/URI pairs, in the order in which they are declared.
  (define namespaces
    '(("dcat" . "http://www.w3.org/ns/dcat#")
      ("dct" . "http://purl.org/dc/terms/")
      ("fabio" . "http://purl.org/spar/fabio/")
      ("genbank" . "https://bioregistry.io/reference/genbank:")
      ("gene" . "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=")
      ("generif" . "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=")
      ("geoSeries" . "http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=")
      ("gn" . "http://rdf.genenetwork.org/v1/id/")
      ("gnc" . "http://rdf.genenetwork.org/v1/category/")
      ("gnt" . "http://rdf.genenetwork.org/v1/term/")
      ("ncbiTaxon" . "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=")
      ("prism" . "http://prismstandard.org/namespaces/basic/2.0/")
      ("probeset" . "http://rdf.genenetwork.org/v1/probeset/")
      ("pubmed" . "http://rdf.ncbi.nlm.nih.gov/pubmed/")
      ("qb" . "http://purl.org/linked-data/cube#")
      ("sdmx-measure" . "http://purl.org/linked-data/sdmx/2009/measure#")
      ("taxon" . "http://purl.uniprot.org/taxonomy/")
      ("transcript" . "https://portals.broadinstitute.org/gpp/public/trans/details?transName=")
      ("v" . "http://www.w3.org/2006/vcard/ns#")
      ("xkos" . "http://rdf-vocabulary.ddialliance.org/xkos#")
      ("schema" . "https://schema.org/")
      ("foaf" . "http://xmlns.com/foaf/0.1/#term_")
      ("gnd" . "https://cd.genenetwork.org/lmdb/v1/data/traits/")
      ("gn-files" . "http://files.genenetwork.org/current/")))
  ;; Render one newline-terminated declaration statement per pair and
  ;; join them, so that the whole script is sent with a single write.
  (let ((declarations
         (string-concatenate
          (map (match-lambda
                 ((prefix . uri)
                  (format #f
                          "DB.DBA.XML_SET_NS_DECL ('~a', '~a', 2);\n"
                          prefix
                          uri)))
               namespaces))))
    (call-with-pipe
     (lambda (isql-in)
       (format isql-in
               "SET DSN=localhost:~a;\nSET PWD=~s;\n~a"
               port
               password
               declarations))
     OPEN_WRITE
     "isql")))

(define (index-data port password)
  "Index all text data for quicker search. Through isql (connecting
to virtuoso on PORT with PASSWORD), call DB.DBA.RDF_OBJ_FT_RULE_ADD
and DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ, then quit."
  (let ((script "SET DSN=localhost:~a;
SET PWD=~s;
DB.DBA.RDF_OBJ_FT_RULE_ADD (null, null, 'All');
DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ();
quit;
"))
    (call-with-pipe (lambda (isql-in)
                      (format isql-in script port password))
                    OPEN_WRITE
                    "isql")))

(define (time-thunk thunk)
  "Call THUNK with no arguments, discard whatever it returns, and
return the time it took in seconds, computed as the difference between
two readings of `current-time'."
  (define began (current-time))
  (thunk)
  (- (current-time) began))

;; Entry point. Takes the program name followed by exactly one
;; argument, CONNECTION-SETTINGS-FILE, from which a single datum is
;; read. That datum is used as an association list providing the
;; `virtuoso-port' and `virtuoso-password' keys. The steps below then
;; run in order, each one reporting how long it took on the current
;; output port. With any other number of arguments, a usage message is
;; printed on the current error port and the script exits with a
;; failure status.
(define main
  (match-lambda*
    ((_ connection-settings-file)
     (let* ((connection-settings
             (call-with-input-file connection-settings-file
               read))
            ;; Look the connection details up once; every step needs
            ;; them.
            (port (assq-ref connection-settings 'virtuoso-port))
            (password (assq-ref connection-settings 'virtuoso-password)))
       ;; Delete existing data. We do not rely on the implicit
       ;; deletion in the PUT method of the SPARQL 1.1 Graph Store
       ;; HTTP Protocol because that is too slow.
       (format (current-output-port)
               "Existing virtuoso data deleted in ~a seconds~%"
               (time-thunk
                (cut delete-graph port password %graph-uri)))
       ;; Update global namespaces
       (format (current-output-port)
               "Global namespaces set in ~a seconds~%"
               (time-thunk
                (cut set-global-namespaces port password)))
       ;; Delete the load queue
       (format (current-output-port)
               "Existing DB.LOAD queue deleted in ~a seconds~%"
               (time-thunk
                (cut empty-load-queue port password)))
       ;; Bulk load data
       (format (current-output-port)
               "Existing virtuoso data uploaded in ~a seconds~%"
               (time-thunk
                (cut bulk-load-data port password %graph-uri)))
       ;; Index the data. The template consumes the elapsed time and
       ;; ends with a newline, like the reports above; previously the
       ;; time was passed to a template without any directive.
       (format (current-output-port)
               "Data indexed in ~a seconds~%"
               (time-thunk
                (cut index-data port password)))))
    ((arg0 _ ...)
     (format (current-error-port) "Usage: ~a CONNECTION-SETTINGS-FILE~%" arg0)
     (exit #f))))

;; Run the script: hand the command line to `main', which treats the
;; first element as the program name.
(apply main (command-line))