#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 format)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
(transform strings)
(transform sql)
(transform triples)
(transform special-forms)
(web uri))
;; Transformer for the GeneList table (left-joined with Species on
;; SpeciesId).  For every row it emits a gnc:gene resource carrying the
;; gene symbol, description, genomic coordinates, assorted identifiers
;; and a set of dct:references links to external gene resources.
;;
;; Each dct:references value is produced by FORMAT as the text
;;   <URL> .
;;   <URL> a gnc:SOME_link
;; and turned into a symbol; an empty string is returned when the
;; inputs needed to build the link are blank.
;; NOTE(review): presumably the serializer skips empty-string objects
;; and appends the final statement terminator -- confirm against the
;; (transform triples) implementation.
(define-transformer genelist
  (tables (GeneList
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:gene_symbol a rdfs:Class)
   (gnc:gene_symbol rdfs:label "A gene symbol")
   (gnt:gene rdfs:domain gnc:gene_symbol)
   (gnt:belongs_to_species rdfs:domain gnc:gene_symbol)
   (gnc:gene a rdfs:Class)
   (gnc:gene rdfs:label "Gene")
   (gnt:has_gene_id a owl:ObjectProperty)
   (gnt:has_gene_id rdfs:domain gnc:ncbi_wiki_entry)
   (gnt:has_gene_id skos:definition "The GeneId of this resource")
   ;; The transcript property is emitted below as gnt:transcript, so
   ;; every schema statement about it uses the gnt: prefix as well.
   (gnt:transcript rdfs:domain gnc:gene_symbol)
   (gnt:transcript a owl:ObjectProperty)
   (gnt:transcript rdfs:comment "The gene transcript of this resource")
   ;; External resource link classes.  RDFS spells the relevant
   ;; properties rdfs:subClassOf and rdfs:comment.
   (gnc:ebi_gwas_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:ebi_gwas_link rdfs:label "EBI GWAS")
   (gnc:ebi_gwas_link rdfs:comment "EBI GWAS")
   (gnc:protein_atlas_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:protein_atlas_link rdfs:label "Protein Atlas")
   (gnc:protein_atlas_link rdfs:comment "Human Protein Atlas")
   (gnc:genemania_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:genemania_link rdfs:label "GeneMANIA")
   (gnc:genemania_link rdfs:comment "GeneMANIA")
   (gnc:gemma_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:gemma_link rdfs:label "Gemma")
   (gnc:gemma_link rdfs:comment "Meta-analysis of gene expression data")
   (gnc:biogps_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:biogps_link rdfs:label "BioGPS")
   (gnc:biogps_link rdfs:comment "Expression across many tissues and cell types")
   (gnc:aba_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:aba_link rdfs:label "ABA")
   (gnc:aba_link rdfs:comment "Allen Brain Atlas")
   (gnc:panther_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:panther_link rdfs:label "PANTHER")
   (gnc:panther_link rdfs:comment "Gene and protein data resources from Celera-ABI")
   ;; STRING gets its own class instead of re-declaring
   ;; gnc:panther_link with a second label.
   (gnc:string_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:string_link rdfs:label "STRING")
   (gnc:string_link rdfs:comment "Protein interactions: known and inferred")
   (gnc:gtex_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:gtex_link rdfs:label "GTEx Portal")
   (gnc:gtex_link rdfs:comment "GTEx Portal")
   (gnc:rgd_link rdfs:subClassOf gnc:ResourceLink)
   (gnc:rgd_link rdfs:label "Rat Genome DB")
   (gnc:rgd_link rdfs:comment "Rat Genome DB")
   (gnc:has_kg_id rdfs:domain gnc:gene_symbol)
   (gnc:has_kg_id a owl:ObjectProperty)
   (gnc:has_kg_id rdfs:comment "The kgID of this resource")
   (gnc:has_unigen_id rdfs:domain gnc:gene_symbol)
   (gnc:has_unigen_id a owl:ObjectProperty)
   (gnc:has_unigen_id rdfs:comment "The UnigenID of this resource")
   ;; has_protein_id / has_align_id are emitted below under gnt:, so
   ;; they are declared under gnt: here.
   (gnt:has_protein_id rdfs:domain gnc:gene_symbol)
   (gnt:has_protein_id a owl:ObjectProperty)
   (gnt:has_protein_id rdfs:comment "The ProteinID of this resource")
   (gnt:has_align_id rdfs:domain gnc:gene_symbol)
   (gnt:has_align_id a owl:ObjectProperty)
   (gnt:has_align_id rdfs:comment "The AlignID of this resource")
   (gnt:tx_end rdfs:range xsd:double)
   (gnt:tx_start rdfs:range xsd:double)
   (gnt:has_target_seq rdfs:domain gnc:probeset))
  (triples
      ;; Subject identifier: "gene" joined by "_" with the normalized
      ;; concatenation of GeneSymbol, GeneID and AlignID.
      (string->identifier
       "gene" (normalize-string-field
               (string-trim-both
                (field ("CONCAT_WS('_', GeneSymbol, GeneID, AlignID)" GENE_UID))))
       #:separator "_")
    (set rdf:type 'gnc:gene)
    (set gnt:gene_symbol (field GeneList GeneSymbol))
    (set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
    (set gnt:has_gene_id (ontology 'gene: (field GeneList GeneId)))
    ;; EBI GWAS: query by gene symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.ebi.ac.uk/gwas/search?query="
                        (uri-encode (string-trim-both symbol))
                        "a gnc:ebi_gwas_link"))
               "")))
    ;; Allen Brain Atlas: only for human and mouse.  Mouse rows search
    ;; by gene symbol, human rows by GeneID.  The search term is now
    ;; argument 1 of the format string so it is actually appended to
    ;; the URL (it used to be passed as an unused third argument).
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (geneId (field GeneList GeneID))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "human")
                        (string=? species "mouse")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://mouse.brain-map.org/search/show?search_type=gene&search_term="
                        (if (string=? species "mouse")
                            (uri-encode (string-trim-both symbol))
                            geneId)
                        "a gnc:aba_link"))
               "")))
    ;; Rat Genome DB: only for mouse and human; the species name is
    ;; capitalized for the "species" query parameter.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "mouse")
                        (string=? species "human")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                        "https://rgd.mcw.edu/rgdweb/elasticResults.html?term="
                        (uri-encode (string-trim-both symbol))
                        "&category=Gene&species="
                        (string-capitalize species)
                        "a gnc:rgd_link"))
               "")))
    ;; BioGPS: keyed on GeneID, for mouse, rat and human.
    (set dct:references
         (let ((geneId (field GeneList GeneID))
               (species (field Species Name)))
           (if (and (not (string-blank? geneId))
                    (not (string-blank? species))
                    (or (string=? species "mouse")
                        (string=? species "rat")
                        (string=? species "human")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                        "http://biogps.org/?org="
                        species
                        "#goto=genereport&id="
                        geneId
                        "a gnc:biogps_link"))
               "")))
    ;; Gemma: keyed on GeneID.
    (set dct:references
         (let ((geneId (field GeneList GeneID)))
           (if (not (string-blank? geneId))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid="
                        geneId
                        "a gnc:gemma_link"))
               "")))
    ;; GeneMANIA: URL path is <species full name, lower-cased with
    ;; spaces replaced>/<symbol>; limited to mouse, rat and human.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (species (lower-case-and-replace-spaces
                         (field Species FullName))))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "mus-musculus")
                        (string=? species "rattus-norvegicus")
                        (string=? species "homo-sapiens")))
               (string->symbol
                (format #f
                        "<~0@*~a/~1@*~a/~2@*~a> .~%<~0@*~a/~1@*~a/~2@*~a> ~3@*~a"
                        "https://genemania.org/search"
                        species
                        (uri-encode (string-trim-both symbol))
                        "a gnc:genemania_link"))
               "")))
    ;; PANTHER: query by gene symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                        (uri-encode (string-trim-both symbol))
                        "a gnc:panther_link"))
               "")))
    ;; STRING: query by gene symbol; typed as gnc:string_link rather
    ;; than as a PANTHER link.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                        (uri-encode (string-trim-both symbol))
                        "a gnc:string_link"))
               "")))
    ;; GTEx Portal: gene page by symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.gtexportal.org/home/gene/"
                        (uri-encode (string-trim-both symbol))
                        "a gnc:gtex_link"))
               "")))
    ;; Human Protein Atlas: search by symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.proteinatlas.org/search/"
                        (uri-encode (string-trim-both symbol))
                        "a gnc:protein_atlas_link"))
               "")))
    (set gnt:chromosome (field GeneList Chromosome))
    ;; Transcription start/end are annotated as xsd:double literals.
    (set gnt:tx_start (annotate-field
                       (field GeneList TxStart)
                       '^^xsd:double))
    (set gnt:tx_end (annotate-field
                     (field GeneList TxEnd)
                     '^^xsd:double))
    (set gnt:strand (string-trim-both (field GeneList Strand)))
    ;; Column spelled FullName, matching the GeneMANIA link above.
    (set gnt:belongs_to_species
         (string->identifier
          "" (remap-species-identifiers (field Species FullName))))
    (set gnt:transcript
         (ontology 'transcript:
                   (string-trim-both (field GeneList NM_ID))))
    (set gnc:has_kg_id (string-trim-both (field GeneList kgID)))
    (set gnc:has_unigen_id (string-trim-both (field GeneList UnigenID)))
    (set gnt:has_protein_id (string-trim-both (field GeneList ProteinID)))
    (set gnt:has_align_id (string-trim-both (field GeneList AlignID)))
    ;; NULL RGD ids are coerced to the empty string in SQL.
    (set gnt:has_rgd_id
         (field ("IFNULL(RGD_ID, '')" RGD_ID)))))
;; Transformer for the GeneList_rn33 table.  Every row becomes a
;; gnc:gene resource that belongs to gn:Rattus_norvegicus, with its
;; symbol, coordinates, transcript, kgID and dct:references links to
;; external resources.
;;
;; Each dct:references value is produced by FORMAT as the text
;;   <URL> .
;;   <URL> a gnc:SOME_link
;; and turned into a symbol; an empty string is returned when the gene
;; symbol is blank.  Gene symbols are always URI-encoded before being
;; appended to a URL.
(define-transformer genelist-rn33
  (tables (GeneList_rn33))
  (triples
      ;; Subject identifier: "gene_rn33" joined by "_" with the row id.
      ;; The id may come back as a number, so it is stringified first.
      (let ([gene-uid (field GeneList_rn33 id GENE_UID)])
        (string->identifier
         "gene_rn33"
         (if (number? gene-uid)
             (number->string gene-uid)
             gene-uid)
         #:separator "_"))
    (set rdf:type 'gnc:gene)
    (set gnt:belongs_to_species 'gn:Rattus_norvegicus)
    (set gnt:gene_symbol (string-trim-both (field GeneList_rn33 geneSymbol)))
    (set gnt:chromosome (field GeneList_rn33 chromosome))
    ;; Transcription start/end are annotated as xsd:double literals.
    (set gnt:tx_start (annotate-field
                       (field GeneList_rn33 txStart)
                       '^^xsd:double))
    (set gnt:tx_end (annotate-field
                     (field GeneList_rn33 txEnd)
                     '^^xsd:double))
    (set gnt:strand (string-trim-both (field GeneList_rn33 strand)))
    (set gnt:transcript
         (ontology 'transcript:
                   (string-trim-both (field GeneList_rn33 NM_ID))))
    (set gnc:has_kg_id
         (string-trim-both (field GeneList_rn33 kgID)))
    ;; PANTHER: typed as gnc:panther_link (the class name the genelist
    ;; schema declares) and the symbol is URI-encoded like the other
    ;; links.
    (set dct:references
         (let ((symbol (field GeneList_rn33 geneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                        (uri-encode (string-trim-both symbol))
                        "a gnc:panther_link"))
               "")))
    ;; EBI GWAS: query by gene symbol (already trimmed in the binding).
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.ebi.ac.uk/gwas/search?query="
                        (uri-encode symbol)
                        "a gnc:ebi_gwas_link"))
               "")))
    ;; STRING: typed as gnc:string_link rather than as a PANTHER link.
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                        (uri-encode symbol)
                        "a gnc:string_link"))
               "")))
    ;; GTEx Portal: gene page by symbol.
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.gtexportal.org/home/gene/"
                        (uri-encode symbol)
                        "a gnc:gtex_link"))
               "")))
    ;; Human Protein Atlas: search by symbol.
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.proteinatlas.org/search/"
                        (uri-encode symbol)
                        "a gnc:protein_atlas_link"))
               "")))))
;; Script entry point.  Parses the command line, reads the connection
;; settings with READ from the file given by --settings/-s, and runs
;; the genelist-rn33 and genelist transformers through
;; WITH-DOCUMENTATION, handing it the --output/-o value as #:rdf and
;; the --documentation/-d value as #:documentation.
;;
;; --settings is marked required: its value is passed unconditionally
;; to CALL-WITH-INPUT-FILE, so without it the script used to fail with
;; a wrong-type error on #f instead of a usable option error.
(let* ((option-spec
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       ;; Both of these are #f when the option is not supplied.
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       ;; The settings file is expected to hold a single readable datum.
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "Gene Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix maps to an empty string here --
   ;; confirm the namespace IRIs were not lost.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("gene:" "")
      ("sdmx-measure:" "")
      ("transcript:" "")
      ("skos:" "")))
   (inputs
    (list genelist-rn33
          genelist))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))