#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 format)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
(transform strings)
(transform sql)
(transform triples)
(transform special-forms)
(web uri))
;; Transformer over the GeneList table, left-joined with Species on
;; SpeciesId.  It declares schema triples for the gene vocabulary and
;; describes each gene as a gnc:Gene with its symbol, description,
;; coordinates, identifiers and a set of links to external resources.
;;
;; How the external links are emitted: every `dct:references' value is a
;; symbol whose text is produced by `format' as
;;
;;   <URL> .
;;   <URL> a gnc:someLink
;;
;; i.e. the object of dct:references followed by a second statement typing
;; that URL.  The `~N@*' directives jump to absolute argument N so the URL
;; pieces can be printed twice.  (Presumably the serializer writes the
;; symbol verbatim and appends the final terminator -- confirm in the
;; transform library.)  When the guard fails the value is "".
(define-transformer genelist
  (tables (GeneList
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnt:gene rdfs:domain gnc:GeneSymbol)
   (gnt:belongsToSpecies rdfs:domain gnc:GeneSymbol)
   (gnc:Gene a rdfs:Class)
   (gnc:Gene rdfs:label "Gene")
   (gnt:hasGeneId a owl:ObjectProperty)
   (gnt:hasGeneId rdfs:domain gnc:NCBIWikiEntry)
   (gnt:hasGeneId skos:definition "The GeneId of this resource")
   ;; The predicates emitted in `triples' below live in the gnt: namespace,
   ;; so their domain/comment statements use gnt: as well.
   (gnt:transcript rdfs:domain gnc:GeneSymbol)
   (gnt:transcript a owl:ObjectProperty)
   (gnt:transcript rdfs:comment "The gene transcript of this resource")
   ;; NOTE(review): rdfs:Class is a class, not a property; in the link
   ;; declarations below rdfs:subClassOf may have been intended.  Left
   ;; unchanged -- confirm against consumers before altering.
   (gnc:ebiGwasLink rdfs:Class gnc:ResourceLink)
   (gnc:ebiGwasLink rdfs:label "EBI GWAS")
   (gnc:ebiGwasLink rdfs:comment "EBI GWAS")
   (gnc:proteinAtlasLink rdfs:Class gnc:ResourceLink)
   (gnc:proteinAtlasLink rdfs:label "Protein Atlas")
   (gnc:proteinAtlasLink rdfs:comment "Human Protein Atlas")
   (gnc:genemaniaLink rdfs:Class gnc:ResourceLink)
   (gnc:genemaniaLink rdfs:label "GeneMANIA")
   (gnc:genemaniaLink rdfs:comment "GeneMANIA")
   (gnc:gemmaLink rdfs:Class gnc:ResourceLink)
   (gnc:gemmaLink rdfs:label "Gemma")
   (gnc:gemmaLink rdfs:comment "Meta-analysis of gene expression data")
   (gnc:biogpsLink rdfs:Class gnc:ResourceLink)
   (gnc:biogpsLink rdfs:label "BioGPS")
   (gnc:biogpsLink rdfs:comment "Expression across many tissues and cell types")
   (gnc:abaLink rdfs:Class gnc:ResourceLink)
   (gnc:abaLink rdfs:label "ABA")
   (gnc:abaLink rdfs:comment "Allen Brain Atlas")
   (gnc:pantherLink rdfs:Class gnc:ResourceLink)
   (gnc:pantherLink rdfs:label "PANTHER")
   (gnc:pantherLink rdfs:comment "Gene and protein data resources from Celera-ABI")
   (gnc:stringLink rdfs:Class gnc:ResourceLink)
   (gnc:stringLink rdfs:label "STRING")
   (gnc:stringLink rdfs:comment "Protein interactions: known and inferred")
   (gnc:gtexLink rdfs:Class gnc:ResourceLink)
   (gnc:gtexLink rdfs:label "GTEx Portal")
   (gnc:gtexLink rdfs:comment "GTEx Portal")
   (gnc:rgdLink rdfs:Class gnc:ResourceLink)
   (gnc:rgdLink rdfs:label "Rat Genome DB")
   (gnc:rgdLink rdfs:comment "Rat Genome DB")
   (gnt:hasKgID rdfs:domain gnc:GeneSymbol)
   (gnt:hasKgID a owl:ObjectProperty)
   (gnt:hasKgID rdfs:comment "The kgID of this resource")
   (gnt:hasUnigenID rdfs:domain gnc:GeneSymbol)
   (gnt:hasUnigenID a owl:ObjectProperty)
   (gnt:hasUnigenID rdfs:comment "The UnigenID of this resource")
   (gnt:hasProteinID rdfs:domain gnc:GeneSymbol)
   (gnt:hasProteinID a owl:ObjectProperty)
   (gnt:hasProteinID rdfs:comment "The ProteinID of this resource")
   (gnt:hasAlignID rdfs:domain gnc:GeneSymbol)
   (gnt:hasAlignID a owl:ObjectProperty)
   (gnt:hasAlignID rdfs:comment "The AlignID of this resource")
   (gnt:TxEnd rdfs:range xsd:double)
   (gnt:TxStart rdfs:range xsd:double)
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
   ;; Subject: GeneSymbol, GeneID and AlignID joined with '_' in SQL, trimmed,
   ;; with every character outside [A-Za-z0-9:] replaced by '_'.  The identity
   ;; #:proc is passed so string->identifier applies no transformation of its
   ;; own through that hook.
   (string->identifier
    "gene" (regexp-substitute/global
            #f "[^A-Za-z0-9:]"
            (string-trim-both
             (field ("CONCAT_WS('_', GeneSymbol, GeneID, AlignID)" GENE_UID)))
            'pre "_" 'post)
    #:proc (lambda (x) x))
   (set rdf:type 'gnc:Gene)
   (set gnt:geneSymbol (field GeneList GeneSymbol))
   (set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
   ;; NOTE(review): this column is spelled GeneId here and GeneID everywhere
   ;; else in this transformer -- confirm both resolve to the same column.
   (set gnt:hasGeneId (ontology 'gene: (field GeneList GeneId)))
   ;; EBI GWAS: searched by gene symbol.
   (set dct:references
        (let ((symbol (field GeneList GeneSymbol)))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "https://www.ebi.ac.uk/gwas/search?query="
                       (uri-encode
                        (string-trim-both symbol))
                       "a gnc:ebiGwasLink"))
              "")))
   ;; Allen Brain Atlas: human and mouse only.  The search term is the
   ;; URI-encoded symbol for mouse and the GeneID otherwise.  The term is
   ;; argument 1 of the format string so that it becomes part of the URL;
   ;; previously it was passed but never referenced, which produced a link
   ;; ending in "search_term=" for every gene.
   (set dct:references
        (let ((symbol (field GeneList GeneSymbol))
              (geneId (field GeneList GeneID))
              (species (field Species Name)))
          (if (and (not (string-blank? symbol))
                   (not (string-blank? species))
                   (or (string=? species "human")
                       (string=? species "mouse")))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "http://mouse.brain-map.org/search/show?search_type=gene&search_term="
                       (if (string=? species "mouse")
                           (uri-encode
                            (string-trim-both symbol))
                           geneId)
                       "a gnc:abaLink"))
              "")))
   ;; Rat Genome DB: emitted for mouse and human; the species name is
   ;; capitalized for the `species' query parameter.
   (set dct:references
        (let ((symbol (field GeneList GeneSymbol))
              (species (field Species Name)))
          (if (and (not (string-blank? symbol))
                   (not (string-blank? species))
                   (or
                    (string=? species "mouse")
                    (string=? species "human")))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                       "https://rgd.mcw.edu/rgdweb/elasticResults.html?term="
                       (uri-encode
                        (string-trim-both symbol))
                       "&category=Gene&species="
                       (string-capitalize species)
                       "a gnc:rgdLink"))
              "")))
   ;; BioGPS: mouse, rat and human, addressed by species name and GeneID.
   (set dct:references
        (let ((geneId (field GeneList GeneID))
              (species (field Species Name)))
          (if (and (not (string-blank? geneId))
                   (not (string-blank? species))
                   (or
                    (string=? species "mouse")
                    (string=? species "rat")
                    (string=? species "human")))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                       "http://biogps.org/?org="
                       species
                       "#goto=genereport&id="
                       geneId
                       "a gnc:biogpsLink"))
              "")))
   ;; Gemma: addressed by GeneID.
   (set dct:references
        (let ((geneId (field GeneList GeneID)))
          (if (not (string-blank? geneId))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid="
                       geneId
                       "a gnc:gemmaLink"))
              "")))
   ;; GeneMANIA: the URL path is <species>/<symbol>, where species is the
   ;; result of lower-case-and-replace-spaces on Species.FullName and must
   ;; be one of the three values tested below.
   (set dct:references
        (let ((symbol (field GeneList GeneSymbol))
              (species (lower-case-and-replace-spaces
                        (field Species FullName))))
          (if (and (not (string-blank? symbol))
                   (not (string-blank? species))
                   (or
                    (string=? species "mus-musculus")
                    (string=? species "rattus-norvegicus")
                    (string=? species "homo-sapiens")))
              (string->symbol
               (format #f
                       "<~0@*~a/~1@*~a/~2@*~a> .~%<~0@*~a/~1@*~a/~2@*~a> ~3@*~a"
                       "https://genemania.org/search"
                       species
                       (uri-encode
                        (string-trim-both symbol))
                       "a gnc:genemaniaLink"))
              "")))
   ;; PANTHER: searched by gene symbol.
   (set dct:references
        (let ((symbol (field GeneList GeneSymbol)))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                       (uri-encode
                        (string-trim-both symbol))
                       "a gnc:pantherLink"))
              "")))
   ;; STRING: searched by gene symbol.
   (set dct:references
        (let ((symbol (field GeneList GeneSymbol)))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                       (uri-encode
                        (string-trim-both symbol))
                       "a gnc:stringLink"))
              "")))
   ;; GTEx Portal: addressed by gene symbol.
   (set dct:references
        (let ((symbol (field GeneList GeneSymbol)))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "https://www.gtexportal.org/home/gene/"
                       (uri-encode
                        (string-trim-both symbol))
                       "a gnc:gtexLink"))
              "")))
   ;; Human Protein Atlas: searched by gene symbol.
   (set dct:references
        (let ((symbol (field GeneList GeneSymbol)))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "http://www.proteinatlas.org/search/"
                       (uri-encode
                        (string-trim-both symbol))
                       "a gnc:proteinAtlasLink"))
              "")))
   (set gnt:chromosome (field GeneList Chromosome))
   ;; Transcription start/end are annotated as xsd:double, matching the
   ;; rdfs:range statements in the schema triples above.
   (set gnt:TxStart (annotate-field
                     (field GeneList TxStart)
                     '^^xsd:double))
   (set gnt:TxEnd (annotate-field
                   (field GeneList TxEnd)
                   '^^xsd:double))
   (set gnt:Strand (string-trim-both (field GeneList Strand)))
   ;; Species identifier: the trimmed Species.Name passed through
   ;; remap-species-identifiers, with an empty prefix and separator and
   ;; string-capitalize-first as the #:proc.
   (set
    gnt:belongsToSpecies
    (string->identifier
     ""
     (remap-species-identifiers
      (string-trim-both (field Species Name)))
     #:separator ""
     #:proc string-capitalize-first))
   (set
    gnt:transcript
    (ontology 'transcript:
              (string-trim-both (field GeneList NM_ID))))
   (set gnt:hasKgID (string-trim-both (field GeneList kgID)))
   (set gnt:hasUnigenID (string-trim-both (field GeneList UnigenID)))
   (set gnt:hasProteinID (string-trim-both (field GeneList ProteinID)))
   (set gnt:hasAlignID (string-trim-both (field GeneList AlignID)))
   ;; NULL RGD_ID values are turned into '' in SQL.
   (set gnt:hasRgdID
        (field ("IFNULL(RGD_ID, '')" RGD_ID)))))
;; Transformer over the GeneList_rn33 table.  Each gene is described as a
;; gnc:Gene belonging to gn:Rattus_norvegicus, with its symbol, coordinates,
;; transcript, kgID and links to external resources.
;;
;; Every `dct:references' value is a symbol whose text is produced by
;; `format' as
;;
;;   <URL> .
;;   <URL> a gnc:someLink
;;
;; The `~N@*' directives jump to absolute argument N so the URL pieces can
;; be printed twice.  When the symbol is blank the value is "".
(define-transformer genelist-rn33
  (tables (GeneList_rn33))
  (triples
   ;; Subject: the row id under the "gene_rn33" prefix; numeric ids are
   ;; converted to strings first.
   (let ([gene-uid (field GeneList_rn33 id GENE_UID)])
     (string->identifier
      "gene_rn33"
      (if (number? gene-uid)
          (number->string
           gene-uid)
          gene-uid)))
   (set rdf:type 'gnc:Gene)
   (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
   (set gnt:geneSymbol (string-trim-both (field GeneList_rn33 geneSymbol)))
   (set gnt:chromosome (field GeneList_rn33 chromosome))
   (set gnt:TxStart (annotate-field
                     (field GeneList_rn33 txStart)
                     '^^xsd:double))
   (set gnt:TxEnd (annotate-field
                   (field GeneList_rn33 txEnd)
                   '^^xsd:double))
   (set gnt:Strand (string-trim-both (field GeneList_rn33 strand)))
   (set
    gnt:transcript
    (ontology
     'transcript:
     (string-trim-both (field GeneList_rn33 NM_ID))))
   (set
    gnt:hasKgID
    (string-trim-both (field GeneList_rn33 kgID)))
   ;; PANTHER: the class is spelled gnc:pantherLink, as declared in the
   ;; genelist schema triples (it was gnc:PantherLink here), and the symbol
   ;; is URI-encoded like in the other links of this transformer.
   (set dct:references
        (let ((symbol (field GeneList_rn33 geneSymbol)))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                       (uri-encode
                        (string-trim-both symbol))
                       "a gnc:pantherLink"))
              "")))
   ;; EBI GWAS: the symbol is now URI-encoded before being placed in the
   ;; query string.  It is already trimmed by the `let' binding.
   (set dct:references
        (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "https://www.ebi.ac.uk/gwas/search?query="
                       (uri-encode symbol)
                       "a gnc:ebiGwasLink"))
              "")))
   ;; STRING: searched by gene symbol.
   (set dct:references
        (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                       (uri-encode symbol)
                       "a gnc:stringLink"))
              "")))
   ;; GTEx Portal: addressed by gene symbol.
   (set dct:references
        (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "https://www.gtexportal.org/home/gene/"
                       (uri-encode symbol)
                       "a gnc:gtexLink"))
              "")))
   ;; Human Protein Atlas: searched by gene symbol.
   (set dct:references
        (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
          (if (not (string-blank? symbol))
              (string->symbol
               (format #f
                       "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                       "http://www.proteinatlas.org/search/"
                       (uri-encode symbol)
                       "a gnc:proteinAtlasLink"))
              "")))))
;; Script entry point.  Parses the command line, reads the connection
;; settings and hands both transformers to `with-documentation'.
;;
;; Options:
;;   -s, --settings FILE       file whose first datum (as returned by `read')
;;                             is used as the connection settings; required
;;   -o, --output FILE         passed to `outputs' as #:rdf
;;   -d, --documentation FILE  passed to `outputs' as #:documentation
(let* ((option-spec
        ;; --settings is marked required so that getopt-long reports a
        ;; missing option itself; otherwise #f would reach
        ;; call-with-input-file below and fail with an opaque type error.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       ;; These two default to #f when the option is not given.
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "Gene Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("gene:" "")
      ("sdmx-measure:" "")
      ("transcript:" "")
      ("skos:" "")))
   (inputs
    (list genelist-rn33
          genelist))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))