#! /usr/bin/env guile
!#

;;; Transform GeneRIF / GeneWiki records from the SQL database into RDF.
;;;
;;; Usage: <script> --settings FILE [--output FILE] [--documentation FILE]
;;;
;;; The settings file is read with `read' and handed to the transformer
;;; as the database connection settings.

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (rnrs bytevectors)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms)
             (transform uuid))


(define (remap-species-identifiers str)
  "Map the species name STR, as stored in the database, to the name used
when building the species identifier.  Names without an explicit
mapping are returned unchanged.  Ideally the database itself would be
corrected and this procedure removed."
  (match str
    ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"]
    ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"]
    ;; NOTE(review): this maps one species name onto a different
    ;; species (rhesus macaque -> pig-tailed macaque).  Left untouched
    ;; because identifiers elsewhere may depend on it; confirm that it
    ;; is intentional.
    ["Macaca mulatta" "Macaca nemestrina"]
    ["Bat (Glossophaga soricina)" "Glossophaga soricina"]
    [str str]))

(define (fix-email-id email)
  "Return EMAIL with every space character removed."
  (string-delete #\space email))

(define (investigator-attributes->id first-name last-name email)
  "Build the investigator identifier from FIRST-NAME, LAST-NAME and
EMAIL (with spaces stripped from EMAIL), joined by underscores."
  ;; There is just one record corresponding to "Evan Williams" which
  ;; does not have an email ID. To accommodate that record, we
  ;; construct the investigator ID from not just the email ID, but
  ;; also the first and the last names. It would be preferable to just
  ;; find Evan Williams' email ID and insert it into the database.
  (string->identifier "investigator"
                      (string-join
                       (list first-name last-name (fix-email-id email))
                       "_")))


;; One subject per GeneId in GeneRIF_BASIC, carrying the distinct
;; symbols, species names and NCBI taxon IDs aggregated for that gene.
;; Each GROUP_CONCAT result is split on commas into a multiset.
(define-transformer genewiki-symbols
  (tables (GeneRIF_BASIC
           (left-join Species "USING (SpeciesId)"))
          "GROUP BY GeneId ORDER BY BINARY symbol")
  (schema-triples
   (gnt:belongsToSpecies rdfs:domain gnc:strain)
   (gnt:belongsToSpecies rdfs:domain gnc:NCBIWikiEntry)
   (gnt:symbol rdfs:domain gnc:NCBIWikiEntry))
  (triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
    (multiset gnt:symbol
              (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol))
                            #\,))
    (multiset gnt:belongsToSpecies
              (string-split
               (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species))
               #\,))
    (multiset dct:relation
              (map (cut ontology 'ncbiTaxon: <>)
                   (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
                                 #\,)))))

;; GeneRIF entries entered through GeneNetwork.  Only displayed
;; (display > 0), current (VersionId = 0) entries with a non-NULL
;; comment are selected.  The subject is a version-3 UUID derived from
;; the species name, comment, symbol and creation time.
(define-transformer gn-genewiki-entries
  (tables (GeneRIF
           (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
           (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, GeneRIF.createtime")
  (schema-triples
   (gnc:GeneWikiEntry a rdfs:Class)
   (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
   (gnt:belongsToCategory rdfs:domain gnc:GNWikiEntry)
   (gnt:belongsToSpecies rdfs:domain gnc:GNWikiEntry)
   (gnt:symbol rdfs:domain gnc:GNWikiEntry))
  (triples
      (string->identifier
       "generif"
       (make-version-3-uuid
        (u8-list->bytevector
         ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
         '(107 167 184 17 157 173 17 209 128 180 0 192 79 212 48 200))
        ;; NOTE(review): this uses `FullName' while gnt:belongsToSpecies
        ;; below uses `Fullname'.  Both spellings are kept as found
        ;; because changing either could alter generated identifiers;
        ;; confirm which one the Species table actually defines.
        (format #f "~a~a~a~a"
                (field Species FullName)
                (field GeneRIF comment)
                (field GeneRIF symbol)
                (field GeneRIF createtime))
        ""))
    (set rdf:type 'gnc:GNWikiEntry)
    (set rdfs:label (sanitize-rdf-string (field GeneRIF comment)))
    (set gnt:symbol (field GeneRIF symbol))
    ;; Categories are concatenated with a '$$' separator in SQL and
    ;; split on the same separator here.
    (multiset gnt:belongsToCategory
              (remove-duplicates
               (string-split-substring
                (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')" GeneCategory))
                "$$")))
    (set gnt:belongsToSpecies
         (string->identifier
          ""
          (remap-species-identifiers (field Species Fullname))
          #:separator ""
          #:proc string-capitalize-first))
    (multiset dct:references
              (map (lambda (x)
                     ;; A NULL PubMed_ID arrives as "" (see IFNULL) and
                     ;; splitting also yields "" for repeated spaces.
                     ;; Do not turn those into a bare `pubmed:' term;
                     ;; use "" as elsewhere in this file for "no value"
                     ;; (same guard as in ncbi-genewiki-entries).
                     (if (string-null? x)
                         ""
                         (ontology 'pubmed: x)))
                   (string-split
                    (let ((pmid (field ("IFNULL(GeneRIF.PubMed_ID, '')" PubMed_ID))))
                      ;; The driver may hand back a single ID as a number.
                      (if (number? pmid)
                          (number->string pmid)
                          pmid))
                    #\space)))
    (set dct:created
         (let ((createtime (field GeneRIF createtime)))
           ;; A string-valued createtime is treated as "no usable
           ;; timestamp"; otherwise it is formatted with the "~5"
           ;; directive (presumably ISO-8601 -- confirm against
           ;; time-unix->string).
           (if (string? createtime)
               ""
               ;; XSD datatype names are case-sensitive: the datatype
               ;; is xsd:dateTime, not xsd:datetime.
               (annotate-field (time-unix->string createtime "~5")
                               '^^xsd:dateTime))))
    (set dct:creator
         ;; Only link a creator when the entry has a non-blank email and
         ;; the joined investigator row has a non-empty email as well.
         (if (and (not (string-null? (string-trim-both (field GeneRIF email))))
                  (not (string-null? (field Investigators Email))))
             (investigator-attributes->id (field Investigators FirstName)
                                          (field Investigators LastName)
                                          (field Investigators Email))
             ""))
    (set foaf:homepage (field GeneRIF weburl))))

;; GeneRIF entries obtained from NCBI (GeneRIF_BASIC).  The subject is
;; a version-3 UUID derived from the gene ID, version ID, comment,
;; symbol and creation time.
(define-transformer ncbi-genewiki-entries
  (tables (GeneRIF_BASIC)
          "GROUP BY GeneId, comment, createtime")
  (schema-triples
   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI"))
  (triples
      (string->identifier
       "generif"
       (make-version-3-uuid
        (u8-list->bytevector
         ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
         '(107 167 184 17 157 173 17 209 128 180 0 192 79 212 48 200))
        (format #f "~a~a~a~a~a"
                (field GeneRIF_BASIC GeneId)
                (field GeneRIF_BASIC VersionId)
                (field GeneRIF_BASIC comment)
                (field GeneRIF_BASIC symbol)
                (field GeneRIF_BASIC createtime))
        ""))
    (set rdf:type 'gnc:NCBIWikiEntry)
    (set rdfs:label
         (annotate-field (field GeneRIF_BASIC comment) '^^xsd:string))
    (set gnt:symbol (field GeneRIF_BASIC symbol))
    (multiset dct:references
              (map (lambda (el)
                     ;; Empty tokens must not become a bare `pubmed:' term.
                     (if (string-null? el)
                         ""
                         (ontology 'pubmed: el)))
                   (string-split (field ("GROUP_CONCAT(PubMed_ID)" pmids))
                                 #\,)))
    (set gnt:hasVersion (format #f "~a" (field GeneRIF_BASIC VersionId)))
    (set dct:created
         (let ((createtime (field GeneRIF_BASIC createtime)))
           ;; Same convention as in gn-genewiki-entries: a string-valued
           ;; createtime yields "", anything else is formatted.
           (if (string? createtime)
               ""
               ;; XSD datatype names are case-sensitive: xsd:dateTime.
               (annotate-field (time-unix->string createtime "~5")
                               '^^xsd:dateTime))))))


;; Command-line driver: parse options, read the connection settings and
;; run the selected transformers.
(let* ((option-spec
        ;; --settings is mandatory: its value is opened as a file below,
        ;; so let getopt-long report a missing option instead of
        ;; failing later on #f.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings read)))
  (with-documentation
   (name "GeneRIF Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix IRI below is an empty string, which
   ;; looks like the namespace IRIs were stripped at some point.  They
   ;; are left untouched here; restore them from the project's
   ;; canonical prefix list.
   (prefixes
    '(("rdf:" "")
      ("rdfs:" "")
      ("gn:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("dct:" "")
      ("foaf:" "")
      ("pubmed:" "")
      ("ncbiTaxon:" "")
      ("generif:" "")
      ("xsd:" "")
      ("owl:" "")))
   (inputs
    (list
     ;; genewiki-symbols
     gn-genewiki-entries
     ;; ncbi-genewiki-entries
     ))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))