about summary refs log tree commit diff
path: root/dump.scm
diff options
context:
space:
mode:
Diffstat (limited to 'dump.scm')
-rwxr-xr-xdump.scm57
1 files changed, 55 insertions, 2 deletions
diff --git a/dump.scm b/dump.scm
index 295bf1e..e26cdd7 100755
--- a/dump.scm
+++ b/dump.scm
@@ -5,12 +5,15 @@
              (rnrs io ports)
              (srfi srfi-1)
              (srfi srfi-26)
+             (srfi srfi-171)
              (ice-9 match)
+             (ice-9 regex)
              (ice-9 string-fun)
              (dump sql)
              (dump table)
              (dump triples)
-             (dump utils))
+             (dump utils)
+             (zlib))
 
 
 ;;; GeneNetwork database connection parameters and dump path
@@ -768,6 +771,55 @@ is a <table> object."
     (set gn:species (field Species Name))))
 
 
+;; Import GeneRIF
+
+(define decode-html-entities
+  (cut regexp-substitute/global
+       #f
+       ;; We tolerate the absence of the trailing semicolon.
+       "&#([[:digit:]]+);{0,1}"
+       <>
+       'pre
+       (compose string integer->char string->number (cut match:substring <> 1))
+       'post))
+
+(define (import-generif generif-data-file)
+  ;; TODO: Link to gene objects, not merely literal Gene IDs.
+  (triple 'gn:geneId 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneId 'rdfs:range 'rdfs:Literal)
+  ;; TODO: Link to gn:publication objects, not merely literal PubMed
+  ;; IDs.
+  (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:range 'rdfs:Literal)
+  (triple 'gn:geneRIFText 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneRIFText 'rdfs:range 'rdfs:Literal)
+
+  (call-with-gzip-input-port (open-input-file generif-data-file)
+    (lambda (port)
+      ;; Read and discard header.
+      (get-line port)
+      ;; Dump other lines.
+      (port-transduce
+       (compose (tenumerate)
+                (tmap (match-lambda
+                        ;; Is there a better way to identify GeneRIF
+                        ;; entries instead of merely enumerating them?
+                        ((i . line)
+                         (match (string-split line #\tab)
+                           ((_ gene-id pubmed-id _ text)
+                            (scm->triples
+                             `((rdf:type . gn:geneRIF)
+                               (gn:geneId . ,(string->number gene-id))
+                               (gn:pubMedId . ,(string->number pubmed-id))
+                               ;; Some text has HTML entities. Decode
+                               ;; them.
+                               (gn:geneRIFText . ,(decode-html-entities text)))
+                             (string->identifier "geneRIF" (number->string i)))))))))
+       (const #t)
+       get-line
+       port))))
+
+
 ;; Main function
 
 (call-with-genenetwork-database
@@ -793,4 +845,5 @@ is a <table> object."
        (dump-info-files db)
        (dump-schema db)
        (dump-case-attributes db)
-       (dump-groups db)))))
+       (dump-groups db)
+       (import-generif (assq-ref %connection-settings 'generif-data-file))))))