#! /usr/bin/env guile
!#

;;; Transform the GeneRIF / GeneRIF_BASIC tables into RDF (Turtle).
;;;
;;; Four transformers are defined and then run by the driver at the
;;; bottom of the file:
;;;
;;;   genewiki-symbols       one labelled node per GeneRIF_BASIC symbol
;;;   generif-symbols        one labelled node per GeneRIF symbol that
;;;                          is absent from GeneRIF_BASIC
;;;   gn-genewiki-entries    wiki entries stored in GeneRIF
;;;   ncbi-genewiki-entries  wiki entries stored in GeneRIF_BASIC
;;;
;;; Usage: SCRIPT --settings FILE [--output FILE] [--documentation FILE]

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (rnrs bytevectors)
             (ice-9 format)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms)
             (transform uuid))

(define (split-pmids pmid)
  "Return the non-empty, space-separated tokens of PMID as a list of
strings.  Return the empty list when PMID is not a string.  Empty
tokens (produced by repeated, leading or trailing spaces) are dropped
so that callers never emit a bare `pubmed:' reference."
  (if (string? pmid)
      (remove string-null? (string-split pmid #\space))
      '()))

;; One node per distinct (binary-compared) symbol in GeneRIF_BASIC.  The
;; identifier is the symbol with every character outside [A-Za-z0-9:]
;; replaced by "_"; the untouched symbol is kept as the rdfs:label.
(define-transformer genewiki-symbols
  (tables (GeneRIF_BASIC)
          "GROUP BY BINARY symbol")
  (triples
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF_BASIC symbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdfs:label
         (field GeneRIF_BASIC symbol))))

;; Some symbols exist in the GeneRIF table that don't exist in the
;; GeneRIF_BASIC table; emit labelled nodes for those as well.
(define-transformer generif-symbols
  (tables (GeneRIF)
          "WHERE symbol NOT IN (SELECT symbol from GeneRIF_BASIC) GROUP BY BINARY symbol")
  (triples
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF symbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdfs:label
         (field GeneRIF symbol))))

;; Wiki entries stored in GeneRIF (displayed rows with a non-NULL
;; comment), joined with their species and categories.
(define-transformer gn-genewiki-entries
  (tables (GeneRIF
           (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
          "WHERE GeneRIF.display > 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol")
  ;; NOTE(review): the schema declares gnt:geneSymbol, but the triples
  ;; below emit gnt:symbol -- confirm which predicate is intended.
  (schema-triples
   (gnc:GeneWikiEntry a rdfs:Class)
   (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnt:initial a owl:ObjectProperty)
   (gnt:initial rdfs:domain gnc:GeneWikiEntry)
   (gnt:initial skos:definition "Optional user or project code or your initials")
   (gnt:reason a owl:ObjectProperty)
   (gnt:reason rdfs:domain gnc:GeneWikiEntry)
   (gnt:reason skos:definition "The reason why this resource was modified")
   (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
   (gnt:geneSymbol rdfs:domain gnc:GNWikiEntry))
  ;; We want to avoid manually generating a unique identifier for each
  ;; comment.  As such we use a blank node (that has the comment) as
  ;; the subject of the triples produced by matching the
  ;; predicateObjectList production of the Turtle grammar.
  (triples
      (format #f "[ rdfs:comment '''~a'''@en] "
              (field GeneRIF comment))
    ;; The object of rdf:type is a symbol holding a hand-built
    ;; predicateObjectList: the class name followed by every other
    ;; predicate/object pair of the entry.
    (set rdf:type
         (let* ((create-time
                 (field ("CAST(createtime AS CHAR)" EntryCreateTime)))
                (pmid (field GeneRIF PubMed_ID PMID))
                (web-url (field GeneRIF weburl))
                (species
                 (string->identifier
                  ""
                  (remap-species-identifiers (field Species Fullname))
                  #:separator ""
                  #:proc string-capitalize-first))
                (version-id (field GeneRIF versionId))
                (identifier (field GeneRIF Id))
                (initial (sanitize-rdf-string (field GeneRIF initial)))
                (reason (field GeneRIF reason))
                (email (sanitize-rdf-string (field GeneRIF email)))
                (category
                 (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR ';')"
                         GeneCategory)))
                (pmids (split-pmids pmid)))
           (string->symbol
            (string-append
             (format #f "gnc:GNWikiEntry ;\n")
             ;; A string result is treated as "no usable species
             ;; identifier" and the predicate is skipped.
             (if (string? species)
                 ""
                 (format #f "\tgnt:belongsToSpecies ~a ;\n" species))
             ;; NOTE(review): XSD spells this datatype xsd:dateTime;
             ;; the existing spelling is kept because consumers may
             ;; match on it -- confirm before changing.
             (format #f "\tdct:created ~s^^xsd:datetime ;\n" create-time)
             (if (null? pmids)
                 "\tdct:references rdf:nil ;\n"
                 (format #f "\tdct:references ( ~{pubmed:~a ~}) ;\n" pmids))
             (if (string-blank? email)
                 ""
                 (format #f "\tfoaf:mbox <~a> ;\n" email))
             (format #f "\tdct:identifier \"~s\"^^xsd:integer ;\n" identifier)
             (if (and (string? web-url)
                      (not (string-null? web-url)))
                 (format #f "\tfoaf:homepage <~a> ;\n" web-url)
                 "")
             (format #f "\tdct:hasVersion \"~s\"^^xsd:integer ;\n" version-id)
             (if (or (null? initial)
                     (string-blank? initial))
                 ""
                 (format #f "\tgnt:initial ~s ;\n" initial))
             (if (string-blank? reason)
                 ""
                 (format #f "\tgnt:reason ~s ;\n" reason))
             (if (string-blank? category)
                 "\tgnt:belongsToCategory rdf:nil ;\n"
                 (format #f "\tgnt:belongsToCategory ( ~{~s ~}) ;\n"
                         (string-split category #\;)))
             ;; We have this symbol at the very end of this transform
             ;; because we have a strong guarantee that it will be a
             ;; non-null value hence always terminating this triple
             ;; properly with a "."
             (format #f "\tgnt:symbol ~a"
                     (string->identifier
                      "symbol"
                      (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                                (field GeneRIF symbol)
                                                'pre "_" 'post)
                      #:proc (lambda (x) x)))))))))

;; Wiki entries stored in GeneRIF_BASIC (rows with a non-blank comment
;; and symbol).  The subject is the symbol node; each entry is attached
;; to it through rdfs:comment as a blank node.
(define-transformer ncbi-genewiki-entries
  (tables (GeneRIF_BASIC
           (left-join Species "USING (SpeciesId)"))
          "WHERE GeneRIF_BASIC.comment IS NOT NULL AND TRIM(GeneRIF_BASIC.comment) != '' AND TRIM(GeneRIF_BASIC.symbol) != '' GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID")
  ;; NOTE(review): the schema declares gnt:hasVersionId, but the
  ;; triples below emit dct:hasVersion -- confirm which is intended.
  (schema-triples
   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")
   (gnt:hasVersionId a owl:ObjectProperty)
   (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry)
   (gnt:hasVersionId skos:definition "The VersionId of this resource"))
  (triples
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF_BASIC symbol GeneRIFSymbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdfs:comment
         (let ([ncbi-comment (sanitize-rdf-string (field GeneRIF_BASIC comment))]
               [species-name
                (string->identifier
                 ""
                 (remap-species-identifiers
                  (field Species Fullname SpeciesFullName))
                 #:separator ""
                 #:proc string-capitalize-first)]
               [taxonomic-id (field GeneRIF_BASIC TaxID TaxonomicId)]
               [create-time (field GeneRIF_BASIC createtime EntryCreateTime)]
               [pmid (field GeneRIF_BASIC PubMed_ID PMID)]
               [gene-id (field GeneRIF_BASIC GeneId)]
               [version-id (field GeneRIF_BASIC VersionId)])
           (string->symbol
            (string-append
             "[ "
             (format #f "rdf:type gnc:NCBIWikiEntry ; ")
             (format #f "rdfs:comment ~s^^xsd:string ; " ncbi-comment)
             ;; Species is LEFT JOINed and may be missing.  Apply the
             ;; same guard gn-genewiki-entries uses (a string result is
             ;; skipped) so that no predicate is written without an
             ;; object.
             (if (string? species-name)
                 ""
                 (format #f "gnt:belongsToSpecies ~a ; " species-name))
             (if (eq? #f taxonomic-id)
                 ""
                 (format #f "skos:notation taxon:~a ; " taxonomic-id))
             (format #f "gnt:hasGeneId generif:~a ; " gene-id)
             (format #f "dct:hasVersion '~a'^^xsd:int ; " version-id)
             ;; An empty PMID list makes the ~{ ~} directive expand to
             ;; the empty string.
             (format #f "~{dct:references pubmed:~a ; ~}" (split-pmids pmid))
             ;; A string create-time is skipped; any other value is
             ;; formatted through time-unix->string.
             ;; NOTE(review): XSD spells this datatype xsd:dateTime --
             ;; confirm before changing.
             (if (string? create-time)
                 ""
                 (format #f "dct:created ~s^^xsd:datetime ; "
                         (time-unix->string create-time "~5")))
             " ]"))))))

;; Command-line driver: read the connection settings, then run every
;; transformer, writing RDF and documentation to the requested paths.
(let* ((option-spec
        '((settings (single-char #\s) (value #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (begin
          ;; Without --settings there is nothing to read; fail with a
          ;; usage message rather than a type error from
          ;; call-with-input-file.
          (unless settings
            (format (current-error-port)
                    "Usage: ~a --settings FILE [--output FILE] [--documentation FILE]~%"
                    (car (command-line)))
            (exit 1))
          (call-with-input-file settings read))))
  (with-documentation
   (name "GeneRIF Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every namespace IRI below is the empty string in
   ;; this copy of the file -- confirm the intended IRIs.
   (prefixes
    '(("rdf:" "")
      ("rdfs:" "")
      ("skos:" "")
      ("xkos:" "")
      ("gn:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("dct:" "")
      ("foaf:" "")
      ("pubmed:" "")
      ("taxon:" "")
      ("generif:" "")
      ("xsd:" "")
      ("owl:" "")))
   (inputs
    (list genewiki-symbols
          generif-symbols
          gn-genewiki-entries
          ncbi-genewiki-entries))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))