diff options
Diffstat (limited to 'dump.scm')
-rwxr-xr-x | dump.scm | 57 |
1 files changed, 55 insertions, 2 deletions
@@ -5,12 +5,15 @@ (rnrs io ports) (srfi srfi-1) (srfi srfi-26) + (srfi srfi-171) (ice-9 match) + (ice-9 regex) (ice-9 string-fun) (dump sql) (dump table) (dump triples) - (dump utils)) + (dump utils) + (zlib)) ;;; GeneNetwork database connection parameters and dump path @@ -768,6 +771,55 @@ is a <table> object." (set gn:species (field Species Name)))) +;; Import GeneRIF + +(define decode-html-entities + (cut regexp-substitute/global + #f + ;; We tolerate the absence of the trailing semicolon. + "&#([[:digit:]]+);{0,1}" + <> + 'pre + (compose string integer->char string->number (cut match:substring <> 1)) + 'post)) + +(define (import-generif generif-data-file) + ;; TODO: Link to gene objects, not merely literal Gene IDs. + (triple 'gn:geneId 'rdfs:domain 'gn:geneRIF) + (triple 'gn:geneId 'rdfs:range 'rdfs:Literal) + ;; TODO: Link to gn:publication objects, not merely literal PubMed + ;; IDs. + (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:domain 'gn:geneRIF) + (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:range 'rdfs:Literal) + (triple 'gn:geneRIFText 'rdfs:domain 'gn:geneRIF) + (triple 'gn:geneRIFText 'rdfs:range 'rdfs:Literal) + + (call-with-gzip-input-port (open-input-file generif-data-file) + (lambda (port) + ;; Read and discard header. + (get-line port) + ;; Dump other lines. + (port-transduce + (compose (tenumerate) + (tmap (match-lambda + ;; Is there a better way to identify GeneRIF + ;; entries instead of merely enumerating them? + ((i . line) + (match (string-split line #\tab) + ((_ gene-id pubmed-id _ text) + (scm->triples + `((rdf:type . gn:geneRIF) + (gn:geneId . ,(string->number gene-id)) + (gn:pubMedId . ,(string->number pubmed-id)) + ;; Some text has HTML entities. Decode + ;; them. + (gn:geneRIFText . ,(decode-html-entities text))) + (string->identifier "geneRIF" (number->string i))))))))) + (const #t) + get-line + port)))) + + ;; Main function (call-with-genenetwork-database @@ -793,4 +845,5 @@ is a <table> object." (dump-info-files db) (dump-schema db) (dump-case-attributes db) - (dump-groups db))))) + (dump-groups db) + (import-generif (assq-ref %connection-settings 'generif-data-file)))))) |