#! /usr/bin/env guile
!#

;;; Gene metadata transform.
;;;
;;; Declares two transformers over the GeneList and GeneList_rn33
;;; tables and hands them to `with-documentation', which is given the
;;; database connection settings, the RDF prefix table and the output
;;; destinations parsed from the command line.
;;;
;;; Command-line options:
;;;   -s, --settings FILE       file whose first datum is the connection
;;;                             settings (read with `read'); required
;;;   -o, --output FILE         passed on as the #:rdf output
;;;   -d, --documentation FILE  passed on as the #:documentation output

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))


;; Transformer for the GeneList table, left-joined with Species on
;; SpeciesId.  Rows are grouped per (GeneSymbol, GeneId, chromosome,
;; txStart, txEnd); the multi-valued columns are collected with
;; GROUP_CONCAT and split back into individual values below.
(define-transformer genelist
  (tables (GeneList
           (left-join Species "USING (SpeciesId)"))
          "GROUP BY BINARY GeneSymbol, GeneId, chromosome, txStart, txEnd")
  (schema-triples
   (gnt:gene rdfs:domain gnc:GeneSymbol)
   (gnt:belongsToSpecies rdfs:domain gnc:GeneSymbol)
   (gnc:GeneSymbol a rdfs:Class)
   (gnc:GeneSymbol rdfs:label "Gene Symbol")
   ;; The predicates below are emitted as gnt:* in the `triples'
   ;; section, so their domain and comment are declared on the same
   ;; gnt:* names, and the annotation uses the RDFS property
   ;; rdfs:comment.
   (gnt:transcript rdfs:domain gnc:GeneSymbol)
   (gnt:transcript a owl:ObjectProperty)
   (gnt:transcript rdfs:comment "The gene transcript of this resource")
   (gnt:hasKgID rdfs:domain gnc:GeneSymbol)
   (gnt:hasKgID a owl:ObjectProperty)
   (gnt:hasKgID rdfs:comment "The kgID of this resource")
   (gnt:hasUnigenID rdfs:domain gnc:GeneSymbol)
   (gnt:hasUnigenID a owl:ObjectProperty)
   (gnt:hasUnigenID rdfs:comment "The UnigenID of this resource")
   (gnt:hasProteinID rdfs:domain gnc:GeneSymbol)
   (gnt:hasProteinID a owl:ObjectProperty)
   (gnt:hasProteinID rdfs:comment "The ProteinID of this resource")
   (gnt:hasAlignID rdfs:domain gnc:GeneSymbol)
   (gnt:hasAlignID a owl:ObjectProperty)
   (gnt:hasAlignID rdfs:comment "The AlignID of this resource")
   (gnt:TxEnd rdfs:range xsd:double)
   (gnt:TxStart rdfs:range xsd:double)
   ;; NOTE(review): gnt:hasTargetSeq is not emitted by this transformer
   ;; and gnt:hasRgdID (emitted below) has no schema triples here --
   ;; confirm whether that is intentional.
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
      ;; Subject: the gene symbol with every character outside
      ;; [A-Za-z0-9:] replaced by "_", passed to `string->identifier'
      ;; together with the "gene" prefix.
      (string->identifier
       "gene"
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field GeneList GeneSymbol)
        'pre "_" 'post))
    (set rdf:type 'gnc:GeneSymbol)
    (set rdfs:label (field GeneList GeneSymbol))
    (set dct:description
         (sanitize-rdf-string
          (field GeneList GeneDescription)))
    (set gnt:gene
         (ontology 'gene: (field GeneList GeneId)))
    (set gnt:chromosome (field GeneList Chromosome))
    (set gnt:TxStart
         (annotate-field (field GeneList TxStart) '^^xsd:double))
    (set gnt:TxEnd
         (annotate-field (field GeneList TxEnd) '^^xsd:double))
    (set gnt:Strand
         (string-trim-both (field GeneList Strand)))
    ;; Each multiset below splits a comma-separated GROUP_CONCAT value
    ;; and trims surrounding whitespace from every element.
    (multiset gnt:belongsToSpecies
              (map (lambda (species)
                     (string->identifier
                      ""
                      (remap-species-identifiers
                       (string-trim-both species))
                      #:separator ""
                      #:proc string-capitalize-first))
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT Species.Name )"
                             SpeciesName)))
                    #\,)))
    (multiset gnt:transcript
              (map (lambda (transcript)
                     (ontology 'transcript:
                               (string-trim-both transcript)))
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
                    #\,)))
    (multiset gnt:hasKgID
              (map string-trim-both
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
                    #\,)))
    (multiset gnt:hasUnigenID
              (map string-trim-both
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT UnigenID )" UnigenID)))
                    #\,)))
    (multiset gnt:hasProteinID
              (map string-trim-both
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT ProteinID )" ProteinID)))
                    #\,)))
    (multiset gnt:hasAlignID
              (map string-trim-both
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT AlignID )" AlignID)))
                    #\,)))
    (multiset gnt:hasRgdID
              (map string-trim-both
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT RGD_ID )" RgdID)))
                    #\,)))))

;; Transformer for the GeneList_rn33 table.  Unlike `genelist' there is
;; no join with Species: every row is assigned the fixed species
;; gn:Rattus_norvegicus, and only transcripts and kgIDs are collected
;; as multi-valued properties.
(define-transformer genelist-rn33
  (tables (GeneList_rn33)
          "GROUP BY BINARY GeneSymbol, chromosome, txStart, txEnd")
  (triples
      ;; Same subject construction as in `genelist'.
      (string->identifier
       "gene"
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field GeneList_rn33 geneSymbol)
        'pre "_" 'post))
    (set rdf:type 'gnc:GeneSymbol)
    (set rdfs:label (field GeneList_rn33 geneSymbol))
    (set gnt:chromosome (field GeneList_rn33 chromosome))
    (set gnt:TxStart
         (annotate-field (field GeneList_rn33 txStart) '^^xsd:double))
    (set gnt:TxEnd
         (annotate-field (field GeneList_rn33 txEnd) '^^xsd:double))
    (set gnt:Strand
         (string-trim-both (field GeneList_rn33 strand)))
    (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
    (multiset gnt:transcript
              (map (lambda (transcript)
                     (ontology 'transcript:
                               (string-trim-both transcript)))
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
                    #\,)))
    (multiset gnt:hasKgID
              (map string-trim-both
                   (string-split
                    (sanitize-rdf-string
                     (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
                    #\,)))))


;; Entry point: parse the command line, load the connection settings
;; and run both transformers.
(let* ((option-spec
        ;; --settings is marked required so that a missing option is
        ;; reported by getopt-long instead of surfacing as a wrong-type
        ;; error from `call-with-input-file' on #f.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       ;; The settings file is parsed with `read', so only its first
       ;; datum is used.
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "Gene Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix IRI below is the empty string in this
   ;; copy of the file -- presumably the "<...>" IRIs were lost; confirm
   ;; against the canonical source before running.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("gene:" "")
      ("sdmx-measure:" "")
      ("transcript:" "")
      ("skos:" "")))
   (inputs
    (list genelist-rn33
          genelist))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))