#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
(transform strings)
(transform sql)
(transform triples)
(transform special-forms)
(web uri))
;; Transformer for the GeneList table (left-joined with Species on
;; SpeciesId).  One subject is produced per gene symbol; the subject
;; identifier is the trimmed symbol with every character outside
;; [A-Za-z0-9:] replaced by "_".
;;
;; Each external-resource link below is emitted through `dct:references'
;; using a formatted symbol of the shape
;;     <URL> .
;;     <URL> a gnc:someLink
;; i.e. the value closes the current statement and opens a second one
;; typing the URL.  When the guard fails the value is "".
;; NOTE(review): presumably `set' skips blank values -- confirm in
;; (transform special-forms).
(define-transformer genelist
  (tables (GeneList
           (left-join Species "USING (SpeciesId)"))
          "GROUP BY BINARY GeneSymbol, GeneId, chromosome, txStart, txEnd")
  (schema-triples
   (gnt:gene rdfs:domain gnc:GeneSymbol)
   (gnt:belongsToSpecies rdfs:domain gnc:GeneSymbol)
   (gnc:GeneSymbol a rdfs:Class)
   (gnc:GeneSymbol rdfs:label "Gene Symbol")
   ;; The property emitted in `triples' is gnt:transcript, so the domain
   ;; and comment are attached to that same IRI (they were on gnc:).
   (gnt:transcript rdfs:domain gnc:GeneSymbol)
   (gnt:transcript a owl:ObjectProperty)
   (gnt:transcript rdfs:comment "The gene transcript of this resource")
   ;; Resource link types.  `rdfs:Class' is a class, not a predicate, so
   ;; the link lines use `rdf:type' (as the PANTHER entry already did),
   ;; and the RDFS property is `rdfs:comment' (singular).
   ;; NOTE(review): since instances are typed with these IRIs,
   ;; `rdfs:subClassOf gnc:ResourceLink' may be the intended modelling.
   (gnc:ebiGwasLink rdf:type gnc:ResourceLink)
   (gnc:ebiGwasLink rdfs:label "EBI GWAS")
   (gnc:ebiGwasLink rdfs:comment "EBI GWAS")
   (gnc:biogpsLink rdf:type gnc:ResourceLink)
   ;; NOTE(review): two distinct labels existed for BioGPS in the
   ;; original schema; both are kept so the graph is unchanged.
   (gnc:biogpsLink rdfs:label "BioGPS Resource Link")
   (gnc:biogpsLink rdfs:label "BioGPS")
   (gnc:biogpsLink rdfs:comment "Expression across many tissues and cell types")
   (gnc:proteinAtlasLink rdf:type gnc:ResourceLink)
   (gnc:proteinAtlasLink rdfs:label "Protein Atlas")
   (gnc:proteinAtlasLink rdfs:comment "Human Protein Atlas")
   (gnc:genemaniaLink rdf:type gnc:ResourceLink)
   (gnc:genemaniaLink rdfs:label "GeneMANIA")
   (gnc:genemaniaLink rdfs:comment "GeneMANIA")
   (gnc:gemmaLink rdf:type gnc:ResourceLink)
   (gnc:gemmaLink rdfs:label "Gemma")
   (gnc:gemmaLink rdfs:comment "Meta-analysis of gene expression data")
   (gnc:gtexLink rdf:type gnc:ResourceLink)
   (gnc:gtexLink rdfs:label "GTEx Portal")
   (gnc:gtexLink rdfs:comment "GTEx Portal")
   (gnc:abaLink rdf:type gnc:ResourceLink)
   (gnc:abaLink rdfs:label "ABA")
   (gnc:abaLink rdfs:comment "Allen Brain Atlas")
   (gnc:pantherLink rdf:type gnc:ResourceLink)
   (gnc:pantherLink rdfs:label "PANTHER")
   (gnc:pantherLink rdfs:comment "Gene and protein data resources from Celera-ABI")
   (gnc:stringLink rdf:type gnc:ResourceLink)
   (gnc:stringLink rdfs:label "STRING")
   (gnc:stringLink rdfs:comment "Protein interactions: known and inferred")
   (gnc:rgdLink rdf:type gnc:ResourceLink)
   (gnc:rgdLink rdfs:label "Rat Genome DB")
   (gnc:rgdLink rdfs:comment "Rat Genome DB")
   ;; Identifier properties: described under the gnt: IRIs that the
   ;; `multiset' forms below actually emit.
   (gnt:hasKgID rdfs:domain gnc:GeneSymbol)
   (gnt:hasKgID a owl:ObjectProperty)
   (gnt:hasKgID rdfs:comment "The kgID of this resource")
   (gnt:hasUnigenID rdfs:domain gnc:GeneSymbol)
   (gnt:hasUnigenID a owl:ObjectProperty)
   (gnt:hasUnigenID rdfs:comment "The UnigenID of this resource")
   (gnt:hasProteinID rdfs:domain gnc:GeneSymbol)
   (gnt:hasProteinID a owl:ObjectProperty)
   (gnt:hasProteinID rdfs:comment "The ProteinID of this resource")
   (gnt:hasAlignID rdfs:domain gnc:GeneSymbol)
   (gnt:hasAlignID a owl:ObjectProperty)
   (gnt:hasAlignID rdfs:comment "The AlignID of this resource")
   (gnt:TxEnd rdfs:range xsd:double)
   (gnt:TxStart rdfs:range xsd:double)
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
      (string->identifier
       "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                        (string-trim-both
                                         (field GeneList GeneSymbol))
                                        'pre "_" 'post))
    (set rdf:type 'gnc:GeneSymbol)
    (set rdfs:label (field GeneList GeneSymbol))
    (set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
    (set gnt:gene (ontology 'gene: (field GeneList GeneId)))
    ;; EBI GWAS: keyed on the gene symbol.
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList GeneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.ebi.ac.uk/gwas/search?query="
                        (uri-encode symbol)
                        "a gnc:ebiGwasLink"))
               "")))
    ;; Allen Brain Atlas: human and mouse only.  Mouse is searched by
    ;; symbol, human by GeneID.  The search term is argument 1 and is
    ;; appended to the base URL; previously the format string never
    ;; consumed it, so every link was the bare search URL.
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList GeneSymbol)))
               (geneId (field GeneList GeneID))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (member species '("human" "mouse")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://mouse.brain-map.org/search/show?search_type=gene&search_term="
                        (if (string=? species "mouse")
                            (uri-encode symbol)
                            geneId)
                        "a gnc:abaLink"))
               "")))
    ;; Rat Genome DB: emitted for mouse and human rows, with the
    ;; capitalised species name passed as the `species' query parameter.
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList GeneSymbol)))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (member species '("mouse" "human")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                        "https://rgd.mcw.edu/rgdweb/elasticResults.html?term="
                        (uri-encode symbol)
                        "&category=Gene&species="
                        (string-capitalize species)
                        "a gnc:rgdLink"))
               "")))
    ;; BioGPS: keyed on GeneID for mouse, rat and human.
    (set dct:references
         (let ((geneId (field GeneList GeneID))
               (species (field Species Name)))
           (if (and (not (string-blank? geneId))
                    (not (string-blank? species))
                    (member species '("mouse" "rat" "human")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                        "http://biogps.org/?org="
                        species
                        "#goto=genereport&id="
                        geneId
                        "a gnc:biogpsLink"))
               "")))
    ;; Gemma: keyed on GeneID, any species.
    (set dct:references
         (let ((geneId (field GeneList GeneID)))
           (if (not (string-blank? geneId))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid="
                        geneId
                        "a gnc:gemmaLink"))
               "")))
    ;; GeneMANIA: URL path is <base>/<species-slug>/<GeneID>, where the
    ;; slug comes from Species.FullName via lower-case-and-replace-spaces.
    (set dct:references
         (let ((geneId (field GeneList GeneID))
               (species (lower-case-and-replace-spaces
                         (field Species FullName))))
           (if (and (not (string-blank? geneId))
                    (not (string-blank? species))
                    (member species '("mus-musculus"
                                      "rattus-norvegicus"
                                      "homo-sapiens")))
               (string->symbol
                (format #f
                        "<~0@*~a/~1@*~a/~2@*~a> .~%<~0@*~a/~1@*~a/~2@*~a> ~3@*~a"
                        "https://genemania.org/search"
                        species
                        (uri-encode
                         (string-trim-both geneId))
                        "a gnc:genemaniaLink"))
               "")))
    ;; PANTHER: typed as gnc:pantherLink so that it matches the class
    ;; declared in `schema-triples' (IRIs are case-sensitive).
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList GeneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                        (uri-encode symbol)
                        "a gnc:pantherLink"))
               "")))
    ;; STRING: keyed on the gene symbol.
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList GeneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                        (uri-encode symbol)
                        "a gnc:stringLink"))
               "")))
    ;; GTEx Portal: keyed on the gene symbol.
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList GeneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.gtexportal.org/home/gene/"
                        (uri-encode symbol)
                        "a gnc:gtexLink"))
               "")))
    ;; Human Protein Atlas: keyed on the gene symbol.
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList GeneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.proteinatlas.org/search/"
                        (uri-encode symbol)
                        "a gnc:proteinAtlasLink"))
               "")))
    (set gnt:chromosome (field GeneList Chromosome))
    (set gnt:TxStart (annotate-field
                      (field GeneList TxStart)
                      '^^xsd:double))
    (set gnt:TxEnd (annotate-field
                    (field GeneList TxEnd)
                    '^^xsd:double))
    (set gnt:Strand (string-trim-both (field GeneList Strand)))
    ;; The remaining values are aggregated per group with GROUP_CONCAT
    ;; and split on "," into one object per element.
    (multiset
     gnt:belongsToSpecies
     (map
      (lambda (species)
        (string->identifier
         ""
         (remap-species-identifiers
          (string-trim-both species))
         #:separator ""
         #:proc string-capitalize-first))
      (string-split
       (sanitize-rdf-string
        (field ("GROUP_CONCAT( DISTINCT Species.Name )" SpeciesName)))
       #\,)))
    (multiset
     gnt:transcript
     (map
      (lambda (transcript)
        (ontology 'transcript:
                  (string-trim-both transcript)))
      (string-split
       (sanitize-rdf-string
        (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
       #\,)))
    (multiset
     gnt:hasKgID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
           #\,)))
    (multiset
     gnt:hasUnigenID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT UnigenID )" UnigenID)))
           #\,)))
    (multiset
     gnt:hasProteinID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT ProteinID )" ProteinID)))
           #\,)))
    (multiset
     gnt:hasAlignID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT AlignID )" AlignID)))
           #\,)))
    (multiset
     gnt:hasRgdID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT RGD_ID )" RgdID)))
           #\,)))))
;; Transformer for the GeneList_rn33 table.  Subjects use the same
;; "gene" identifier scheme as the main gene list (trimmed symbol with
;; characters outside [A-Za-z0-9:] replaced by "_"), and every row is
;; assigned to gn:Rattus_norvegicus.
;;
;; Each external-resource link is emitted through `dct:references' as a
;; formatted symbol of the shape
;;     <URL> .
;;     <URL> a gnc:someLink
;; and as "" when the trimmed symbol is blank.
;; NOTE(review): presumably `set' skips blank values -- confirm in
;; (transform special-forms).
(define-transformer genelist-rn33
  (tables (GeneList_rn33)
          "GROUP BY BINARY GeneSymbol, chromosome, txStart, txEnd")
  (triples
      (string->identifier
       "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                        (string-trim-both
                                         (field GeneList_rn33 geneSymbol))
                                        'pre "_" 'post))
    ;; PANTHER: the symbol is trimmed before the blank check and
    ;; URI-encoded like the other links; the type is gnc:pantherLink
    ;; (lower-case "p"), since IRIs are case-sensitive.
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                        (uri-encode symbol)
                        "a gnc:pantherLink"))
               "")))
    ;; EBI GWAS: the symbol is now URI-encoded so that reserved
    ;; characters cannot break the generated IRI.
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.ebi.ac.uk/gwas/search?query="
                        (uri-encode symbol)
                        "a gnc:ebiGwasLink"))
               "")))
    ;; STRING
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                        (uri-encode symbol)
                        "a gnc:stringLink"))
               "")))
    ;; GTEx Portal
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.gtexportal.org/home/gene/"
                        (uri-encode symbol)
                        "a gnc:gtexLink"))
               "")))
    ;; Human Protein Atlas
    (set dct:references
         (let ((symbol (string-trim-both
                        (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.proteinatlas.org/search/"
                        (uri-encode symbol)
                        "a gnc:proteinAtlasLink"))
               "")))
    (set rdf:type 'gnc:GeneSymbol)
    (set rdfs:label (string-trim-both
                     (field GeneList_rn33 geneSymbol)))
    (set gnt:chromosome (field GeneList_rn33 chromosome))
    (set gnt:TxStart (annotate-field
                      (field GeneList_rn33 txStart)
                      '^^xsd:double))
    (set gnt:TxEnd (annotate-field
                    (field GeneList_rn33 txEnd)
                    '^^xsd:double))
    (set gnt:Strand (string-trim-both (field GeneList_rn33 strand)))
    (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
    ;; Aggregated columns: GROUP_CONCAT output is split on "," into one
    ;; object per element.
    (multiset
     gnt:transcript
     (map
      (lambda (transcript)
        (ontology 'transcript:
                  (string-trim-both transcript)))
      (string-split
       (sanitize-rdf-string
        (field ("GROUP_CONCAT( DISTINCT NM_ID )" NMID)))
       #\,)))
    (multiset
     gnt:hasKgID
     (map string-trim-both
          (string-split
           (sanitize-rdf-string
            (field ("GROUP_CONCAT( DISTINCT kgID )" kgID)))
           #\,)))))
;; Script entry point.  Parses the command line, reads the connection
;; settings datum from the file given with --settings/-s, and hands the
;; two transformers to `with-documentation' together with the
;; --output/-o and --documentation/-d values.
(let* ((option-spec
        '((settings (single-char #\s) (value #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (begin
          ;; Without this guard a missing --settings reaches
          ;; `call-with-input-file' as #f and fails with an opaque
          ;; wrong-type-argument error.
          (unless settings
            (format (current-error-port)
                    "~a: missing required option --settings FILE~%"
                    (car (command-line)))
            (exit 1))
          ;; The settings file is expected to hold a single readable
          ;; Scheme datum; only the first datum is read.
          (call-with-input-file settings
            read))))
  (with-documentation
   (name "Gene Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix below maps to the empty string.  This
   ;; looks like the namespace IRIs were lost (e.g. "<...>" values
   ;; stripped by a markup filter) -- restore them from the project's
   ;; canonical prefix list before relying on the generated output.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("gene:" "")
      ("sdmx-measure:" "")
      ("transcript:" "")
      ("skos:" "")))
   ;; genelist-rn33 is listed before genelist, as in the original.
   (inputs
    (list genelist-rn33
          genelist))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))