aboutsummaryrefslogtreecommitdiff
path: root/import-generif.scm
diff options
context:
space:
mode:
Diffstat (limited to 'import-generif.scm')
-rwxr-xr-x import-generif.scm 74
1 files changed, 0 insertions, 74 deletions
diff --git a/import-generif.scm b/import-generif.scm
deleted file mode 100755
index 69a5f78..0000000
--- a/import-generif.scm
+++ /dev/null
@@ -1,74 +0,0 @@
-#! /usr/bin/env guile
-!#
-
-;; This script imports GeneRIF data downloaded from
-;; https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz into RDF.
-
-(use-modules (rnrs io ports)
- (srfi srfi-26)
- (srfi srfi-171)
- (ice-9 match)
- (ice-9 regex)
- (dump triples)
- (dump utils)
- (zlib))
-
-;; Return a copy of STR in which every decimal numeric character
-;; reference (for example "&#38;") is replaced by the character with
-;; that code point. All other text is copied through unchanged.
-(define (decode-html-entities str)
-  (regexp-substitute/global
-   ;; #f: return the result as a string instead of writing it to a port.
-   #f
-   ;; The trailing semicolon is optional.
-   "&#([[:digit:]]+);{0,1}"
-   str
-   'pre
-   ;; Submatch 1 holds the decimal digits of the reference.
-   (lambda (m)
-     (string (integer->char (string->number (match:substring m 1)))))
-   'post))
-;; Convert the GeneRIF data in GENERIF-DATA-FILE into RDF, written to
-;; DUMP-DIRECTORY/generif.ttl.
-;;
-;; GENERIF-DATA-FILE is read through a gzip input port. Its first line
-;; is discarded as a header; every following line is split on tabs and
-;; is expected to have exactly five fields, of which the second (gene
-;; ID), third (PubMed IDs) and fifth (GeneRIF text) are used.
-(define (main generif-data-file dump-directory)
-  (with-output-to-file (string-append dump-directory "/generif.ttl")
-    (lambda ()
-      (prefix "rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
-      (prefix "rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
-      (prefix "gn:" "<http://genenetwork.org/>")
-      (newline)
-      ;; TODO: Link to gene objects, not merely literal Gene IDs.
-      (triple 'gn:geneId 'rdfs:domain 'gn:geneRIF)
-      (triple 'gn:geneId 'rdfs:range 'rdfs:Literal)
-      ;; TODO: Link to gn:publication objects, not merely literal
-      ;; PubMed IDs.
-      (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:domain 'gn:geneRIF)
-      (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:range 'rdfs:Literal)
-      (triple 'gn:geneRIFText 'rdfs:domain 'gn:geneRIF)
-      (triple 'gn:geneRIFText 'rdfs:range 'rdfs:Literal)
-
-      (call-with-gzip-input-port (open-input-file generif-data-file)
-        (lambda (port)
-          ;; Read and discard header.
-          (get-line port)
-          ;; Dump other lines.
-          (port-transduce
-           (compose (tenumerate)
-                    (tmap (match-lambda
-                            ;; Is there a better way to identify
-                            ;; GeneRIF entries instead of merely
-                            ;; enumerating them?
-                            ((i . line)
-                             (match (string-split line #\tab)
-                               ((_ gene-id pubmed-id _ text)
-                                (scm->triples
-                                 `((rdf:type . gn:geneRIF)
-                                   (gn:geneId . ,(string->number gene-id))
-                                   ;; Use the predicate declared in the
-                                   ;; vocabulary above. NCBI documents
-                                   ;; this column as a comma-separated
-                                   ;; list of PubMed IDs, so emit one
-                                   ;; entry per ID.
-                                   ;; NOTE(review): assumes scm->triples
-                                   ;; emits one triple per alist entry,
-                                   ;; repeated predicates included --
-                                   ;; confirm against (dump triples).
-                                   ,@(map (lambda (id)
-                                            (cons 'gn:geneRIFEvidencedByPubMedId
-                                                  (string->number (string-trim-both id))))
-                                          (string-split pubmed-id #\,))
-                                   ;; Some text has HTML
-                                   ;; entities. Decode them.
-                                   (gn:geneRIFText . ,(decode-html-entities text)))
-                                 (string->identifier "geneRIF" (number->string i))))
-                               ;; Any other field count: report the
-                               ;; record on stderr and carry on, so one
-                               ;; bad line does not abort the import.
-                               (fields
-                                (format (current-error-port)
-                                        "Skipping malformed GeneRIF record ~a (~a fields)~%"
-                                        i (length fields))))))))
-           ;; The reducer ignores its arguments; the transducer is run
-           ;; only for the triples it prints.
-           (const #t)
-           get-line
-           port))))))
-
-;; Script entry point. With exactly two arguments after the program
-;; name, pass them to main; with any other argument count, print a
-;; usage message on the error port and exit with a failure status.
-(let* ((args (command-line))
-       (program (car args)))
-  (if (= (length args) 3)
-      (apply main (cdr args))
-      (begin
-        (format (current-error-port) "Usage: ~a GENERIF-DATA-FILE DUMP-DIRECTORY~%" program)
-        (exit #f))))