aboutsummaryrefslogtreecommitdiff
path: root/dump.scm
diff options
context:
space:
mode:
author Arun Isaac 2022-11-04 15:26:31 +0530
committer Arun Isaac 2022-11-04 15:26:31 +0530
commit a73fca62199c40808e5bc33093baae08438c3f11 (patch)
tree 58594c1f0734e628095e442748c2a81ffdc1d145 /dump.scm
parent 3ca59162963fa3977659b9e6c987c70df38028ea (diff)
download gn-transform-databases-a73fca62199c40808e5bc33093baae08438c3f11.tar.gz
Unite importing GeneRIF with dumping SQL data.
* README.md: Document generif-data-file parameter in connection settings.
* dump.scm: Import (srfi srfi-171), (ice-9 regex) and (zlib).
(decode-html-entities, import-generif): New functions.
(main): Call import-generif.
* import-generif.scm: Delete file.
Diffstat (limited to 'dump.scm')
-rwxr-xr-x dump.scm 57
1 files changed, 55 insertions, 2 deletions
diff --git a/dump.scm b/dump.scm
index 295bf1e..e26cdd7 100755
--- a/dump.scm
+++ b/dump.scm
@@ -5,12 +5,15 @@
(rnrs io ports)
(srfi srfi-1)
(srfi srfi-26)
+ (srfi srfi-171)
(ice-9 match)
+ (ice-9 regex)
(ice-9 string-fun)
(dump sql)
(dump table)
(dump triples)
- (dump utils))
+ (dump utils)
+ (zlib))
;;; GeneNetwork database connection parameters and dump path
@@ -768,6 +771,55 @@ is a <table> object."
(set gn:species (field Species Name))))
+;; Import GeneRIF
+
+(define (decode-html-entities str)
+  "Return STR with decimal numeric HTML character references such as
+\"&#38;\" replaced by the characters they denote.  A reference whose
+number is not a valid Unicode scalar value is left in STR unchanged."
+  ;; integer->char raises an error for surrogates and for values above
+  ;; #x10FFFF; guard against them so that a single malformed reference
+  ;; does not abort the caller.
+  (define (valid-code-point? n)
+    (or (< n #xD800)
+        (<= #xE000 n #x10FFFF)))
+  (regexp-substitute/global
+   #f
+   ;; We tolerate the absence of the trailing semicolon.
+   "&#([[:digit:]]+);{0,1}"
+   str
+   'pre
+   (lambda (m)
+     (let ((code-point (string->number (match:substring m 1))))
+       (if (valid-code-point? code-point)
+           (string (integer->char code-point))
+           ;; Not decodable: emit the matched text as is.
+           (match:substring m))))
+   'post))
+
+(define (import-generif generif-data-file)
+  "Dump the GeneRIF entries in GENERIF-DATA-FILE as triples.
+GENERIF-DATA-FILE is a gzip-compressed file whose first line is a header
+and whose remaining lines each hold five tab-separated fields.  The
+second, third and fifth fields are used as the gene ID, the PubMed ID
+and the GeneRIF text respectively; the other two are ignored."
+  ;; TODO: Link to gene objects, not merely literal Gene IDs.
+  (triple 'gn:geneId 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneId 'rdfs:range 'rdfs:Literal)
+  ;; TODO: Link to gn:publication objects, not merely literal PubMed
+  ;; IDs.
+  (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:range 'rdfs:Literal)
+  (triple 'gn:geneRIFText 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneRIFText 'rdfs:range 'rdfs:Literal)
+
+  (call-with-gzip-input-port (open-input-file generif-data-file)
+    (lambda (port)
+      ;; Read and discard header.
+      (get-line port)
+      ;; Dump other lines.  The transduction is run only for its side
+      ;; effects, hence the constant reducer.
+      (port-transduce
+       (compose (tenumerate)
+                (tmap (match-lambda
+                        ;; Is there a better way to identify GeneRIF
+                        ;; entries instead of merely enumerating them?
+                        ((i . line)
+                         ;; A line that does not split into exactly
+                         ;; five fields raises a match error.
+                         (match (string-split line #\tab)
+                           ((_ gene-id pubmed-id _ text)
+                            (scm->triples
+                             `((rdf:type . gn:geneRIF)
+                               (gn:geneId . ,(string->number gene-id))
+                               ;; Use the predicate whose domain and
+                               ;; range are declared above.
+                               (gn:geneRIFEvidencedByPubMedId
+                                . ,(string->number pubmed-id))
+                               ;; Some text has HTML entities. Decode
+                               ;; them.
+                               (gn:geneRIFText . ,(decode-html-entities text)))
+                             (string->identifier "geneRIF" (number->string i)))))))))
+       (const #t)
+       get-line
+       port))))
+
+
;; Main function
(call-with-genenetwork-database
@@ -793,4 +845,5 @@ is a <table> object."
(dump-info-files db)
(dump-schema db)
(dump-case-attributes db)
- (dump-groups db)))))
+ (dump-groups db)
+ (import-generif (assq-ref %connection-settings 'generif-data-file))))))