From a73fca62199c40808e5bc33093baae08438c3f11 Mon Sep 17 00:00:00 2001
From: Arun Isaac
Date: Fri, 4 Nov 2022 15:26:31 +0530
Subject: Unite importing GeneRIF with dumping SQL data.

* README.md: Document generif-data-file parameter in connection
settings.
* dump.scm: Import (srfi srfi-171), (ice-9 regex) and (zlib).
(decode-html-entities, import-generif): New functions.
(main): Call import-generif.
* import-generif.scm: Delete file.
---
Review notes.  This text sits below the three-dash line, so it is
commentary only: it is not part of the commit message or of any hunk.

Purpose: fold the standalone import-generif.scm script into dump.scm.
decode-html-entities and the import body move over as
(import-generif GENERIF-DATA-FILE), which main now calls last, after
dump-groups, with the `generif-data-file' value looked up in
%connection-settings.  README.md documents the new settings key.

decode-html-entities: takes a string and replaces every decimal
numeric character reference (`&#' followed by digits, trailing `;'
optional) with the character of that code point.  The pattern matches
decimal digits only, so hexadecimal references and named entities are
passed through unchanged.

import-generif: emits six rdfs:domain / rdfs:range schema triples,
opens GENERIF-DATA-FILE through call-with-gzip-input-port, discards
the first line as a header, then splits every further line on TAB,
binds the 2nd, 3rd and 5th of five fields, and emits one gn:geneRIF
resource per line.  The resource identifier is built from "geneRIF"
and the running index supplied by tenumerate.

Visible differences from the deleted script: the script wrapped its
work in with-output-to-file on DUMP-DIRECTORY/generif.ttl and emitted
its own rdf:, rdfs: and gn: prefix lines; import-generif does neither.
NOTE(review): presumably the part of main outside these hunks already
redirects output and writes the prefixes -- confirm.

NOTE(review): the schema triples declare
gn:geneRIFEvidencedByPubMedId, but each entry is written with
gn:pubMedId.  The mismatch is carried over verbatim from the deleted
script; confirm which predicate is intended.

NOTE(review): the inner `match' has a single clause that requires
exactly five tab-separated fields.  It looks like any other line would
raise a match failure and abort the dump -- confirm the input
guarantees this shape.

NOTE(review): gene-id and pubmed-id go through string->number, which
yields #f for text that is not a single number.  TODO confirm against
the NCBI format description that the PubMed column never carries a
list of IDs.

NOTE(review): assq-ref yields #f for a missing key, so a settings file
without generif-data-file would hand #f to open-input-file.  Confirm
that failure mode is acceptable for existing deployments.

NOTE(review): this rendering appears to have lost text that was inside
angle brackets (the README placeholders show as "", the hunk-header
fragment reads `is a object.', the prefix URIs in the deleted script
are empty).  Compare against the original commit before applying.

 README.md          |  3 ++-
 dump.scm           | 57 +++++++++++++++++++++++++++++++++++++++--
 import-generif.scm | 74 ------------------------------------------------------
 3 files changed, 57 insertions(+), 77 deletions(-)
 delete mode 100755 import-generif.scm

diff --git a/README.md b/README.md
index b9084fa..c4276e6 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,8 @@ shown below. Take care to replace the placeholders within angle
 brackets with the appropriate values.
 
 ``` scheme
-((sql-username . "")
+((generif-data-file . "/path/to/generifs_basic.gz")
+ (sql-username . "")
  (sql-password . "")
  (sql-database . "")
  (sql-host . "")
diff --git a/dump.scm b/dump.scm
index 295bf1e..e26cdd7 100755
--- a/dump.scm
+++ b/dump.scm
@@ -5,12 +5,15 @@
              (rnrs io ports)
              (srfi srfi-1)
              (srfi srfi-26)
+             (srfi srfi-171)
              (ice-9 match)
+             (ice-9 regex)
              (ice-9 string-fun)
              (dump sql)
              (dump table)
              (dump triples)
-             (dump utils))
+             (dump utils)
+             (zlib))
 
 
 ;;; GeneNetwork database connection parameters and dump path
@@ -767,6 +770,55 @@ is a object."
    (set gn:binomialName (field InbredSet fullName))
    (set gn:species (field Species Name))))
 
+
+;; Import GeneRIF
+
+(define decode-html-entities
+  (cut regexp-substitute/global
+       #f
+       ;; We tolerate the absence of the trailing semicolon.
+       "&#([[:digit:]]+);{0,1}"
+       <>
+       'pre
+       (compose string integer->char string->number (cut match:substring <> 1))
+       'post))
+
+(define (import-generif generif-data-file)
+  ;; TODO: Link to gene objects, not merely literal Gene IDs.
+  (triple 'gn:geneId 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneId 'rdfs:range 'rdfs:Literal)
+  ;; TODO: Link to gn:publication objects, not merely literal PubMed
+  ;; IDs.
+  (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:range 'rdfs:Literal)
+  (triple 'gn:geneRIFText 'rdfs:domain 'gn:geneRIF)
+  (triple 'gn:geneRIFText 'rdfs:range 'rdfs:Literal)
+
+  (call-with-gzip-input-port (open-input-file generif-data-file)
+    (lambda (port)
+      ;; Read and discard header.
+      (get-line port)
+      ;; Dump other lines.
+      (port-transduce
+       (compose (tenumerate)
+                (tmap (match-lambda
+                        ;; Is there a better way to identify GeneRIF
+                        ;; entries instead of merely enumerating them?
+                        ((i . line)
+                         (match (string-split line #\tab)
+                           ((_ gene-id pubmed-id _ text)
+                            (scm->triples
+                             `((rdf:type . gn:geneRIF)
+                               (gn:geneId . ,(string->number gene-id))
+                               (gn:pubMedId . ,(string->number pubmed-id))
+                               ;; Some text has HTML entities. Decode
+                               ;; them.
+                               (gn:geneRIFText . ,(decode-html-entities text)))
+                             (string->identifier "geneRIF" (number->string i)))))))))
+       (const #t)
+       get-line
+       port))))
+
 
 ;; Main function
 
@@ -793,4 +845,5 @@ is a object."
          (dump-info-files db)
          (dump-schema db)
          (dump-case-attributes db)
-         (dump-groups db)))))
+         (dump-groups db)
+         (import-generif (assq-ref %connection-settings 'generif-data-file))))))
diff --git a/import-generif.scm b/import-generif.scm
deleted file mode 100755
index 69a5f78..0000000
--- a/import-generif.scm
+++ /dev/null
@@ -1,74 +0,0 @@
-#! /usr/bin/env guile
-!#
-
-;; This script imports GeneRIF data downloaded from
-;; https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz into RDF.
-
-(use-modules (rnrs io ports)
-             (srfi srfi-26)
-             (srfi srfi-171)
-             (ice-9 match)
-             (ice-9 regex)
-             (dump triples)
-             (dump utils)
-             (zlib))
-
-(define decode-html-entities
-  (cut regexp-substitute/global
-       #f
-       ;; We tolerate the absence of the trailing semicolon.
-       "&#([[:digit:]]+);{0,1}"
-       <>
-       'pre
-       (compose string integer->char string->number (cut match:substring <> 1))
-       'post))
-
-(define (main generif-data-file dump-directory)
-  (with-output-to-file (string-append dump-directory "/generif.ttl")
-    (lambda ()
-      (prefix "rdf:" "")
-      (prefix "rdfs:" "")
-      (prefix "gn:" "")
-      (newline)
-      ;; TODO: Link to gene objects, not merely literal Gene IDs.
-      (triple 'gn:geneId 'rdfs:domain 'gn:geneRIF)
-      (triple 'gn:geneId 'rdfs:range 'rdfs:Literal)
-      ;; TODO: Link to gn:publication objects, not merely literal
-      ;; PubMed IDs.
-      (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:domain 'gn:geneRIF)
-      (triple 'gn:geneRIFEvidencedByPubMedId 'rdfs:range 'rdfs:Literal)
-      (triple 'gn:geneRIFText 'rdfs:domain 'gn:geneRIF)
-      (triple 'gn:geneRIFText 'rdfs:range 'rdfs:Literal)
-
-      (call-with-gzip-input-port (open-input-file generif-data-file)
-        (lambda (port)
-          ;; Read and discard header.
-          (get-line port)
-          ;; Dump other lines.
-          (port-transduce
-           (compose (tenumerate)
-                    (tmap (match-lambda
-                            ;; Is there a better way to identify
-                            ;; GeneRIF entries instead of merely
-                            ;; enumerating them?
-                            ((i . line)
-                             (match (string-split line #\tab)
-                               ((_ gene-id pubmed-id _ text)
-                                (scm->triples
-                                 `((rdf:type . gn:geneRIF)
-                                   (gn:geneId . ,(string->number gene-id))
-                                   (gn:pubMedId . ,(string->number pubmed-id))
-                                   ;; Some text has HTML
-                                   ;; entities. Decode them.
-                                   (gn:geneRIFText . ,(decode-html-entities text)))
-                                 (string->identifier "geneRIF" (number->string i)))))))))
-           (const #t)
-           get-line
-           port))))))
-
-(match (command-line)
-  ((arg0 generif-data-file dump-directory)
-   (main generif-data-file dump-directory))
-  ((arg0 _ ...)
-   (format (current-error-port) "Usage: ~a GENERIF-DATA-FILE DUMP-DIRECTORY~%" arg0)
-   (exit #f)))
-- 
cgit v1.2.3