#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
(transform strings)
(transform sql)
(transform triples)
(transform special-forms))
;; Transformer for the GeneList table (left-joined with Species on
;; SpeciesId).  Rows are grouped by gene symbol, gene id and genomic
;; position; the multi-valued columns are gathered with GROUP_CONCAT and
;; split back into individual values on "," below.
(define-transformer genelist
  (tables (GeneList
           (left-join Species "USING (SpeciesId)"))
          "GROUP BY BINARY GeneSymbol, GeneId, chromosome, txStart, txEnd")
  ;; Schema for the terms emitted below.  The domain and comment
  ;; statements are declared on the same gnt: property names that the
  ;; triples section emits and that are typed owl:ObjectProperty here, and
  ;; they use the RDFS term rdfs:comment.
  ;; NOTE(review): gnt:hasRgdID is emitted below but has no schema entry,
  ;; and gnt:hasTargetSeq is declared here but not emitted by this
  ;; transformer -- confirm whether that is intended.
  (schema-triples
   (gnt:gene rdfs:domain gnc:GeneSymbol)
   (gnt:belongsToSpecies rdfs:domain gnc:GeneSymbol)
   (gnc:GeneSymbol a rdfs:Class)
   (gnc:GeneSymbol rdfs:label "Gene Symbol")
   (gnt:transcript rdfs:domain gnc:GeneSymbol)
   (gnt:transcript a owl:ObjectProperty)
   (gnt:transcript rdfs:comment "The gene transcript of this resource")
   (gnt:hasKgID rdfs:domain gnc:GeneSymbol)
   (gnt:hasKgID a owl:ObjectProperty)
   (gnt:hasKgID rdfs:comment "The kgID of this resource")
   (gnt:hasUnigenID rdfs:domain gnc:GeneSymbol)
   (gnt:hasUnigenID a owl:ObjectProperty)
   (gnt:hasUnigenID rdfs:comment "The UnigenID of this resource")
   (gnt:hasProteinID rdfs:domain gnc:GeneSymbol)
   (gnt:hasProteinID a owl:ObjectProperty)
   (gnt:hasProteinID rdfs:comment "The ProteinID of this resource")
   (gnt:hasAlignID rdfs:domain gnc:GeneSymbol)
   (gnt:hasAlignID a owl:ObjectProperty)
   (gnt:hasAlignID rdfs:comment "The AlignID of this resource")
   (gnt:TxEnd rdfs:range xsd:double)
   (gnt:TxStart rdfs:range xsd:double)
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
      ;; Subject: built from "gene" and the gene symbol, with every
      ;; character outside [A-Za-z0-9:] replaced by "_".
      (string->identifier
       "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                        (field GeneList GeneSymbol)
                                        'pre "_" 'post))
    (set rdf:type 'gnc:GeneSymbol)
    (set rdfs:label (field GeneList GeneSymbol))
    (set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
    (set gnt:gene (ontology 'gene: (field GeneList GeneId)))
    (set gnt:chromosome (field GeneList Chromosome))
    ;; Transcription start/end are annotated as xsd:double.
    (set gnt:TxStart (annotate-field
                      (field GeneList TxStart)
                      '^^xsd:double))
    (set gnt:TxEnd (annotate-field
                    (field GeneList TxEnd)
                    '^^xsd:double))
    (set gnt:Strand (string-trim-both (field GeneList Strand)))
    ;; One gnt:belongsToSpecies value per distinct species name in the
    ;; group; each name is trimmed, remapped and turned into an identifier.
    (multiset
     gnt:belongsToSpecies
     (map
      (lambda (species)
        (string->identifier
         ""
         (remap-species-identifiers
          (string-trim-both species))
         #:separator ""
         #:proc string-capitalize-first))
      (string-split
       (sanitize-rdf-string
        (field ("GROUP_CONCAT( DISTINCT Species.Name )" SpeciesName)))
       #\,)))
    ;; One gnt:transcript value per distinct NM_ID, placed under the
    ;; transcript: prefix.
    (multiset
     gnt:transcript
     (map
      (lambda (transcript)
        (ontology 'transcript:
                  (string-trim-both transcript)))
      (string-split
       (sanitize-rdf-string
        (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
       #\,)))
    ;; The remaining identifier columns are emitted as trimmed strings,
    ;; one value per distinct entry in the group.
    (multiset
     gnt:hasKgID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
           #\,)))
    (multiset
     gnt:hasUnigenID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT UnigenID )" UnigenID)))
           #\,)))
    (multiset
     gnt:hasProteinID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT ProteinID )" ProteinID)))
           #\,)))
    (multiset
     gnt:hasAlignID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT AlignID )" AlignID)))
           #\,)))
    (multiset
     gnt:hasRgdID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT RGD_ID )" RgdID)))
           #\,)))))
;; Transformer for the GeneList_rn33 table.  Rows are grouped by gene
;; symbol and genomic position, and every resulting gene is attached to
;; gn:Rattus_norvegicus.
(define-transformer genelist-rn33
  (tables (GeneList_rn33)
          "GROUP BY BINARY GeneSymbol, chromosome, txStart, txEnd")
  (triples
      ;; Subject: built from "gene" and the gene symbol, after replacing
      ;; each character outside [A-Za-z0-9:] with "_".
      (string->identifier
       "gene"
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field GeneList_rn33 geneSymbol)
        'pre "_" 'post))
    (set rdf:type 'gnc:GeneSymbol)
    (set rdfs:label (field GeneList_rn33 geneSymbol))
    (set gnt:chromosome (field GeneList_rn33 chromosome))
    ;; Both transcription coordinates are annotated as xsd:double.
    (set gnt:TxStart
         (annotate-field (field GeneList_rn33 txStart)
                         '^^xsd:double))
    (set gnt:TxEnd
         (annotate-field (field GeneList_rn33 txEnd)
                         '^^xsd:double))
    (set gnt:Strand (string-trim-both (field GeneList_rn33 strand)))
    (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
    ;; The grouped NM_ID values are split on "," and each trimmed entry is
    ;; placed under the transcript: prefix.
    (multiset
     gnt:transcript
     (map (lambda (nm-id)
            (ontology 'transcript: (string-trim-both nm-id)))
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
           #\,)))
    ;; The grouped kgID values are split on "," and emitted as trimmed
    ;; strings.
    (multiset
     gnt:hasKgID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
           #\,)))))
;; Script entry point.  Parses the command line, reads the connection
;; settings as a single Scheme datum from the --settings file, and runs
;; both gene-list transformers through `with-documentation'.
;;
;; Options:
;;   -s, --settings FILE       file holding the connection settings (required)
;;   -o, --output FILE         passed on as the #:rdf output
;;   -d, --documentation FILE  passed on as the #:documentation output
(let* ((option-spec
        ;; --settings is marked required: its value is handed straight to
        ;; `call-with-input-file', which would otherwise fail with an
        ;; opaque wrong-type error on the #f default.  getopt-long now
        ;; reports the missing option itself.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "Gene Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix below maps to an empty string; presumably
   ;; each entry is meant to carry its namespace IRI -- confirm against
   ;; the other transform scripts before relying on the generated output.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("gene:" "")
      ("sdmx-measure:" "")
      ("transcript:" "")
      ("skos:" "")))
   (inputs
    (list genelist-rn33
          genelist))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))