#! /usr/bin/env guile
!#

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (rnrs bytevectors)
             (ice-9 format)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms)
             (transform uuid))

(define (fix-email-id email)
  "Return EMAIL with every space character removed."
  (string-delete #\space email))

(define (investigator-attributes->id first-name last-name email)
  "Build an \"investigator\" identifier (via string->identifier) from
FIRST-NAME, LAST-NAME and EMAIL.  The three parts are joined with
underscores; spaces are stripped from EMAIL first."
  ;; There is just one record corresponding to "Evan Williams" which
  ;; does not have an email ID. To accommodate that record, we
  ;; construct the investigator ID from not just the email ID, but
  ;; also the first and the last names. It would be preferable to just
  ;; find Evan Williams' email ID and insert it into the database.
  (string->identifier "investigator"
                      (string-join
                       (list first-name last-name (fix-email-id email))
                       "_")))

;; Emit one rdfs:label triple per symbol found in GeneRIF_BASIC (rows
;; are grouped with "GROUP BY BINARY symbol").  The subject is a
;; "symbol" identifier in which every character outside [A-Za-z0-9:]
;; has been replaced by "_"; the label keeps the raw symbol text.
(define-transformer genewiki-symbols
  (tables (GeneRIF_BASIC)
          "GROUP BY BINARY symbol")
  (triples
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF_BASIC symbol)
                                 'pre "_" 'post)
       ;; Identity: the sanitised symbol is passed through unchanged.
       ;; NOTE(review): presumably this overrides a default #:proc of
       ;; string->identifier -- confirm against (transform triples).
       #:proc (lambda (x) x))
    (set rdfs:label (field GeneRIF_BASIC symbol))))

;; Some symbols exist in the GeneRIF table that don't exist in the
;; GeneRIF_BASIC table.
;; Emit one rdfs:label triple per GeneRIF symbol that is absent from
;; GeneRIF_BASIC.  Subject construction mirrors the GeneRIF_BASIC
;; case: characters outside [A-Za-z0-9:] become "_".
(define-transformer generif-symbols
  (tables (GeneRIF)
          "WHERE symbol NOT IN (SELECT symbol from GeneRIF_BASIC) GROUP BY BINARY symbol")
  (triples
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF symbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdfs:label (field GeneRIF symbol))))

;; GeneRIF entries entered through GeneNetwork.  Only displayed rows
;; (display > 0) at VersionId 0 with a non-NULL comment are selected.
;; Each row becomes a blank node ("[ ... ]") attached to the gene
;; symbol through rdfs:comment; the blank node text is assembled by
;; hand and converted to a symbol with string->symbol.
(define-transformer gn-genewiki-entries
  (tables (GeneRIF
           (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
           (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol")
  (schema-triples
   (gnc:GeneWikiEntry a rdfs:Class)
   (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnt:initial a owl:ObjectProperty)
   (gnt:initial rdfs:domain gnc:GeneWikiEntry)
   (gnt:initial skos:definition "Optional user or project code or your initials")
   (gnt:reason a owl:ObjectProperty)
   (gnt:reason rdfs:domain gnc:GeneWikiEntry)
   (gnt:reason skos:definition "The reason why this resource was modified")
   (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
   (gnt:geneSymbol rdfs:domain gnc:GNWikiEntry))
  (triples
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF symbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdfs:comment
         (let* ((generif-comment
                 (sanitize-rdf-string (field GeneRIF comment)))
                (create-time (field GeneRIF createtime EntryCreateTime))
                (pmid (field GeneRIF PubMed_ID PMID))
                (web-url (field GeneRIF weburl))
                (species
                 (string->identifier
                  ""
                  (remap-species-identifiers (field Species Fullname))
                  #:separator ""
                  #:proc string-capitalize-first))
                (version-id (field GeneRIF versionId))
                (identifier (field GeneRIF Id))
                (initial (field GeneRIF initial))
                (reason (field GeneRIF reason))
                ;; Category names arrive as one "$$"-separated string;
                ;; split it, de-duplicate, and drop #f / empty entries.
                (categories
                 (remove (lambda (x)
                           (or (eq? x #f)
                               (and (string? x)
                                    (string-null? x))))
                         (remove-duplicates
                          (string-split-substring
                           (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
                                   GeneCategory))
                           "$$")))))
           (string->symbol
            (string-append
             "[ "
             (format #f "rdf:type gnc:GNWikiEntry ; ")
             ;; The species triple is emitted only when SPECIES is not
             ;; a string.
             ;; NOTE(review): presumably string->identifier yields a
             ;; symbol on success -- confirm.
             (if (string? species)
                 ""
                 (format #f "gnt:belongsToSpecies ~a ; " species))
             (format #f "rdfs:comment ~s^^xsd:string ; " generif-comment)
             ;; A string-valued create time is skipped; anything else
             ;; is formatted through time-unix->string.  The XSD
             ;; datatype name is case-sensitive: "dateTime".
             (if (string? create-time)
                 ""
                 (format #f "dct:created ~s^^xsd:dateTime ; "
                         (time-unix->string create-time "~5")))
             ;; PMID may hold several space-separated ids; emit one
             ;; dct:references per id.
             (if (and (string? pmid)
                      (not (string-null? pmid)))
                 (format #f "~{dct:references pubmed:~a ; ~}"
                         (string-split pmid #\space))
                 "")
             ;; Attribute a creator only when both the entry's email
             ;; and the joined Investigators email are non-empty.
             (if (and (not (string-null?
                            (string-trim-both (field GeneRIF email))))
                      (not (string-null? (field Investigators Email))))
                 (format #f "dct:creator ~a ; "
                         (investigator-attributes->id
                          (field Investigators FirstName)
                          (field Investigators LastName)
                          (field Investigators Email)))
                 "")
             (format #f "dct:identifier ~s ; " identifier)
             ;; ~a (not ~s) so that a string-valued version id is not
             ;; wrapped in a second pair of quotes; numbers print the
             ;; same either way.
             (format #f "dct:hasVersion \"~a\"^^xsd:int ; " version-id)
             (if (string-blank? reason)
                 ""
                 (format #f "gnt:reason ~s ; " reason))
             ;; INITIAL is a column value, never a list, so a `null?'
             ;; test can never succeed; skip missing or blank initials
             ;; the same way blank reasons are skipped.
             (if (or (not (string? initial))
                     (string-blank? initial))
                 ""
                 (format #f "gnt:initial ~s ; " initial))
             (if (not (null? categories))
                 (format #f "~{gnt:belongsToCategory ~s ; ~}" categories)
                 "")
             (if (and (string? web-url)
                      (not (string-null? web-url)))
                 (format #f "foaf:homepage ~s ; " web-url)
                 "")
             " ] "))))))

;; GeneRIF entries obtained from NCBI (GeneRIF_BASIC).  Rows with a
;; NULL/blank comment or a blank symbol are excluded.  As with the
;; GeneNetwork entries, each row becomes a hand-assembled blank node
;; attached to the gene symbol through rdfs:comment.
(define-transformer ncbi-genewiki-entries
  (tables (GeneRIF_BASIC
           (left-join Species "USING (SpeciesId)"))
          "WHERE GeneRIF_BASIC.comment IS NOT NULL AND TRIM(GeneRIF_BASIC.comment) != '' AND TRIM(GeneRIF_BASIC.symbol) != '' GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID")
  (schema-triples
   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")
   (gnt:hasVersionId a owl:ObjectProperty)
   (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry)
   (gnt:hasVersionId skos:definition "The VersionId of this resource"))
  (triples
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF_BASIC symbol GeneRIFSymbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdfs:comment
         (let ([ncbi-comment
                (sanitize-rdf-string (field GeneRIF_BASIC comment))]
               [species-name
                (string->identifier
                 ""
                 (remap-species-identifiers
                  (field Species Fullname SpeciesFullName))
                 #:separator ""
                 #:proc string-capitalize-first)]
               [taxonomic-id (field GeneRIF_BASIC TaxID TaxonomicId)]
               [create-time (field GeneRIF_BASIC createtime EntryCreateTime)]
               [pmid (field GeneRIF_BASIC PubMed_ID PMID)]
               [gene-id (field GeneRIF_BASIC GeneId)]
               [version-id (field GeneRIF_BASIC VersionId)])
           (string->symbol
            (string-append
             "[ "
             (format #f "rdf:type gnc:NCBIWikiEntry ; ")
             (format #f "rdfs:comment ~s^^xsd:string ; " ncbi-comment)
             (format #f "gnt:belongsToSpecies ~a ; " species-name)
             ;; The taxon notation is omitted when TaxID is #f.
             (if (eq? #f taxonomic-id)
                 ""
                 (format #f "skos:notation taxon:~a ; " taxonomic-id))
             (format #f "gnt:hasGeneId generif:~a ; " gene-id)
             (format #f "dct:hasVersion '~a'^^xsd:int ; " version-id)
             ;; One dct:references per space-separated PubMed id.
             (if (and (string? pmid)
                      (not (string-null? pmid)))
                 (format #f "~{dct:references pubmed:~a ; ~}"
                         (string-split pmid #\space))
                 "")
             ;; The XSD datatype name is case-sensitive: "dateTime".
             (if (string? create-time)
                 ""
                 (format #f "dct:created ~s^^xsd:dateTime ; "
                         (time-unix->string create-time "~5")))
             " ]"))))))

;; Script entry point.
;;
;; Command-line options (each takes a value):
;;   -s, --settings       file from which the connection settings are
;;                        read as a single datum with `read'
;;   -o, --output         passed on as the #:rdf output
;;   -d, --documentation  passed on as the #:documentation output
(let* ((option-spec
        '((settings (single-char #\s) (value #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings read)))
  (with-documentation
   (name "GeneRIF Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix maps to an empty string here; the
   ;; namespace IRIs look like they were lost -- confirm and restore
   ;; before relying on the generated prefix declarations.
   (prefixes
    '(("rdf:" "")
      ("rdfs:" "")
      ("skos:" "")
      ("xkos:" "")
      ("gn:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("dct:" "")
      ("foaf:" "")
      ("pubmed:" "")
      ("taxon:" "")
      ("generif:" "")
      ("xsd:" "")
      ("owl:" "")))
   (inputs
    (list genewiki-symbols
          generif-symbols
          gn-genewiki-entries
          ncbi-genewiki-entries))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))