#! /usr/bin/env guile
!#

;;; Dump GeneRIF / GeneWiki data from the SQL database as RDF (Turtle).
;;;
;;; Command-line arguments, as read below:
;;;   1. path to a file containing the connection settings as a single
;;;      readable Scheme datum
;;;   2. dump directory; the output file name is appended to it verbatim

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 match)
             (ice-9 regex)
             (dump strings)
             (dump sql)
             (dump triples)
             (dump special-forms))

;; Connection settings: the first datum `read' from the file named by the
;; first command-line argument.
(define %connection-settings
  (call-with-input-file (list-ref (command-line) 1)
    read))

;; Output directory, taken from the second command-line argument.  It is
;; joined to the file name with plain `string-append' below, so it has to
;; end with a directory separator.
(define %dump-directory
  (list-ref (command-line) 2))

;; One subject per GeneRIF_BASIC.GeneId, carrying the distinct symbols,
;; species names and taxonomy ids grouped under that gene id.  Each
;; GROUP_CONCAT result is split on "," to produce one object per value.
(define-dump dump-genewiki-symbols
  (tables (GeneRIF_BASIC
           (left-join Species "USING (SpeciesId)"))
          "GROUP BY GeneId ORDER BY BINARY symbol")
  (schema-triples
   (gn:symbol rdfs:domain gn:geneWikiEntry)
   (gn:wikiEntryOfSpecies rdfs:range gn:species)
   ;; Spelled exactly like the predicate emitted in the triples below
   ;; (gn:taxId); RDF names are case-sensitive.
   (gn:taxId rdfs:domain gn:geneWikiEntry))
  (triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
    (multiset gn:symbol
              (string-split
               (field ("GROUP_CONCAT(DISTINCT symbol)" symbol))
               #\,))
    (multiset gn:wikiEntryOfSpecies
              (string-split
               (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species))
               #\,))
    (multiset gn:taxId
              (map (cut ontology 'ncbiTaxon: <>)
                   (string-split
                    (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
                    #\,)))))

;; One subject per GeneRIF.symbol for displayed entries with VersionId 0.
;; The subject is a generif: node built from the GeneRIF_BASIC gene id,
;; except when that id is 0, in which case a gn:anonSymbol_ node is built
;; from the GeneRIF symbol instead.
(define-dump dump-gn-genewiki-entries
  (tables (GeneRIF
           (left-join GeneRIF_BASIC "USING (symbol)")
           (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 GROUP BY GeneRIF.symbol")
  (schema-triples
   (gn:geneWikiEntry a rdfs:Class)
   (gn:geneWikiEntry a owl:Class)
   (gn:geneWikiEntry rdfs:comment "Represents GeneRIF Entries")
   (gn:geneCategory rdfs:domain gn:geneWikiEntry)
   (gn:geneWikiEntryOfGn rdfs:domain gn:geneWikiEntry)
   ;; NOTE(review): gn:geneWikiEntry is declared as a class above and is
   ;; also used as a predicate inside the blank nodes below -- confirm
   ;; that this double use is intended.
   (gn:geneWikiEntry rdfs:domain gn:geneWikiEntry))
  (triples
      (let ([geneid (field GeneRIF_BASIC GeneId)])
        ;; `eqv?' rather than `eq?': `eq?' on numbers is unspecified.
        (if (eqv? geneid 0)
            (ontology 'gn:anonSymbol_ (field GeneRIF symbol))
            (ontology 'generif: geneid)))
    ;; The type is left empty when there is no GeneRIF_BASIC gene id.
    (set rdf:type
         (if (string-null?
              (field ("IFNULL(GeneRIF_BASIC.GeneId, '')" geneWikiEntryP)))
             ""
             'gn:geneWikiEntry))
    (set gn:wikiEntryOfSpecies (field Species SpeciesName))
    ;; This only dumps symbols not present in the GeneRIF_BASIC table
    (set gn:symbol
         (let ([geneid (field GeneRIF_BASIC GeneId)])
           (if (eqv? geneid 0)
               (field GeneRIF symbol)
               "")))
    ;; All wiki entries of a symbol arrive as one string: entries are
    ;; separated by ";;;;;" and the six fields of each entry by "::::".
    ;; Every entry becomes one blank node.
    (multiset gn:geneWikiEntryOfGn
              (let* ([entries
                      (sanitize-rdf-string
                       (field ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, CAST(CONVERT(BINARY CONVERT(GeneRIF.comment USING latin1) USING utf8) AS VARCHAR(15000)), GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')" wikientry)))]
                     [comments (string-split-substring entries ";;;;;")])
                (map (match-lambda
                       ((genecategory pmid email text createtime weburl)
                        (blank-node
                         (set gn:geneCategory genecategory)
                         ;; PubMed ids are space-separated within one entry.
                         (multiset dct:source
                                   (map (lambda (el)
                                          (if (string-null? el)
                                              ""
                                              (ontology 'pubmed: el)))
                                        (string-split pmid #\space)))
                         ;; Keep only the part of the e-mail before "@".
                         (set dct:creator
                              (regexp-substitute/global #f "@.*$" email
                                                        'pre "" 'post))
                         (set gn:geneWikiEntry
                              (annotate-field text '^^xsd:string))
                         ;; The XSD datatype name is xsd:dateTime.
                         (set dct:created
                              (annotate-field createtime '^^xsd:dateTime))
                         (set foaf:homepage weburl))))
                     (map (cut string-split-substring <> "::::")
                          comments))))))

;; One blank node per (GeneId, comment, createtime) group of GeneRIF_BASIC,
;; attached to the gene's generif: subject.
(define-dump dump-ncbi-genewiki-entries
  (tables (GeneRIF_BASIC)
          "GROUP BY GeneId, comment, createtime")
  (schema-triples
   ;; Spelled exactly like the predicate emitted in the triples below
   ;; (gn:geneWikiEntryOfNCBI); RDF names are case-sensitive.
   (gn:geneWikiEntryOfNCBI rdfs:domain gn:geneWikiEntry))
  (triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
    (set gn:geneWikiEntryOfNCBI
         (blank-node
          (set gn:geneWikiEntry
               (annotate-field (field GeneRIF_BASIC comment)
                               '^^xsd:string))
          (multiset dct:source
                    (map (lambda (el)
                           (if (string-null? el)
                               ""
                               (ontology 'pubmed: el)))
                         (string-split
                          (field ("GROUP_CONCAT(PubMed_ID)" pmids))
                          #\,)))
          ;; The XSD datatype name is xsd:dateTime.
          (set dct:created
               (annotate-field
                (time-unix->string (field GeneRIF_BASIC createtime) "~5")
                '^^xsd:dateTime))))))

;; Entry point: connect to the database and write every dump, preceded by
;; the prefix declarations, to "dump-generif.ttl" in the dump directory.
(call-with-target-database
 %connection-settings
 (lambda (db)
   (with-output-to-file (string-append %dump-directory "dump-generif.ttl")
     (lambda ()
       ;; NOTE(review): every prefix below is declared with an empty
       ;; string as its second argument; presumably the namespace IRIs
       ;; belong there -- confirm before relying on the output.
       (prefix "rdf:" "")
       (prefix "rdfs:" "")
       (prefix "foaf:" "")
       (prefix "gn:" "")
       (prefix "dct:" "")
       (prefix "pubmed:" "")
       (prefix "up:" "")
       (prefix "ncbiTaxon:" "")
       (prefix "generif:" "")
       (prefix "xsd:" "")
       (prefix "owl:" "")
       (prefix "phenotype:" "")
       (prefix "molecularTrait:" "")
       (prefix "nuccore:" "")
       (prefix "omim:" "")
       (prefix "pubchem:" "")
       (prefix "uniprot:" "")
       (prefix "hgnc:" "")
       (prefix "homologene:" "")
       (prefix "chebi:" "")
       (prefix "kegg:" "")
       (newline)
       (dump-genewiki-symbols db)
       (dump-gn-genewiki-entries db)
       (dump-ncbi-genewiki-entries db))
     #:encoding "utf8")))