#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
(transform strings)
(transform sql)
(transform triples)
(transform special-forms))
;; Map a probe strand marker to the phrase appended to a location label.
;; Anything other than the strings "+" and "-" (including non-strings)
;; yields the empty string, so callers never hit a type error in string=?.
(define (probeset-strand-suffix strand)
  (cond ((not (string? strand)) "")
        ((string=? "+" strand) " on the plus strand")
        ((string=? "-" strand) " on the minus strand")
        (else "")))

;; Build the human readable location label from chromosome CHR, megabase
;; position MB and the strand marker STRAND.
(define (probeset-location chr mb strand)
  (match (list chr mb)
    ;; Chromosome "Un": no location is reported.
    (("Un" _) "Not available")
    ;; Empty position: only the chromosome (if any) can be reported.
    ((_ "")
     (if (string-blank? chr)
         "Not available"
         (format #f "Chr ~a @ Unknown position ~a"
                 chr (probeset-strand-suffix strand))))
    (_
     (format #f "Chr ~a @ ~a~a"
             chr mb (probeset-strand-suffix strand)))))

;; Build the object of a dct:references statement for an external resource.
;; URL-PARTS are displayed and concatenated into the URL.  The resulting
;; symbol spells two fragments: "<URL> ." followed, on a new line, by
;; "<URL> a CLASS", i.e. the link itself and a statement typing the link.
;; NOTE(review): this relies on the serializer writing symbols verbatim
;; and appending the final statement terminator -- confirm against
;; (transform triples).
(define (probeset-resource-link class . url-parts)
  (let ((url (string-concatenate
              (map (lambda (part) (format #f "~a" part))
                   url-parts))))
    (string->symbol
     (format #f "<~a> .~%<~a> a ~a" url url class))))

;; Transformer for the ProbeSet table: declares the joined tables, the
;; schema-level triples describing the link classes and gnt: properties,
;; and the per-row triples built from the selected fields.
(define-transformer probeset
  (tables (ProbeSet
           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")
           (left-join GeneList "ON GeneList.GeneID = ProbeSet.GeneId")
           ;; Join the rat gene list on its own symbol column; the join
           ;; previously compared GeneList.geneSymbol, leaving
           ;; GeneList_rn33 unconstrained.
           ;; NOTE(review): assumes GeneList_rn33 has a geneSymbol column
           ;; like GeneList -- confirm against the database schema.
           (left-join GeneList_rn33 "ON GeneList_rn33.geneSymbol = ProbeSet.Symbol")
           (left-join Species "ON GeneChip.SpeciesId = Species.Id")))
  (schema-triples
   ;; External resource link classes.
   ;; NOTE(review): most of these use rdfs:Class as a predicate while
   ;; gnc:pantherLink uses rdf:type; rdfs:subClassOf may be what is
   ;; intended.  Left unchanged pending confirmation with consumers.
   (gnc:pantherLink rdf:type gnc:ResourceLink)
   (gnc:pantherLink rdfs:label "PANTHER")
   (gnc:pantherLink rdfs:comment "Gene and protein data resources from Celera-ABI")
   (gnc:NCBIGeneLink rdfs:Class gnc:ResourceLink)
   (gnc:NCBIGeneLink rdfs:label "Gene")
   (gnc:NCBIGeneLink rdfs:comment "Info from NCBI Entrez Gene")
   (gnc:omimLink rdfs:Class gnc:ResourceLink)
   (gnc:omimLink rdfs:label "OMIM")
   (gnc:omimLink rdfs:comment "Summary from Online Mendelian Inheritance in Man")
   (gnc:homologeneLink rdfs:Class gnc:ResourceLink)
   (gnc:homologeneLink rdfs:label "HomoloGene")
   (gnc:homologeneLink rdfs:comment "Find similar genes in other species")
   (gnc:uniprotLink rdfs:Class gnc:ResourceLink)
   (gnc:uniprotLink rdfs:label "UniProt")
   (gnc:uniprotLink rdfs:comment "UniProt")
   (gnc:stringLink rdfs:Class gnc:ResourceLink)
   (gnc:stringLink rdfs:label "STRING")
   (gnc:stringLink rdfs:comment "Protein interactions: known and inferred")
   (gnc:gtexLink rdfs:Class gnc:ResourceLink)
   (gnc:gtexLink rdfs:label "GTEx Portal")
   (gnc:gtexLink rdfs:comment "GTEx Portal")
   (gnc:ebiGwasLink rdfs:Class gnc:ResourceLink)
   (gnc:ebiGwasLink rdfs:label "EBI GWAS")
   (gnc:ebiGwasLink rdfs:comment "EBI GWAS")
   (gnc:genemaniaLink rdfs:Class gnc:ResourceLink)
   (gnc:genemaniaLink rdfs:label "GeneMANIA")
   (gnc:genemaniaLink rdfs:comment "GeneMANIA")
   (gnc:gemmaLink rdfs:Class gnc:ResourceLink)
   (gnc:gemmaLink rdfs:label "Gemma")
   (gnc:gemmaLink rdfs:comment "Meta-analysis of gene expression data")
   (gnc:biogpsLink rdfs:Class gnc:ResourceLink)
   (gnc:biogpsLink rdfs:label "BioGPS")
   (gnc:biogpsLink rdfs:comment "Expression across many tissues and cell types")
   (gnc:abaLink rdfs:Class gnc:ResourceLink)
   (gnc:abaLink rdfs:label "ABA")
   (gnc:abaLink rdfs:comment "Allen Brain Atlas")
   (gnc:ucsRefSeqLink rdfs:Class gnc:ResourceLink)
   (gnc:ucsRefSeqLink rdfs:label "Info from UCSC Genome Browser")
   (gnc:ucsRefSeqLink rdfs:comment "UCSC")
   (gnc:proteinAtlasLink rdfs:Class gnc:ResourceLink)
   (gnc:proteinAtlasLink rdfs:label "Protein Atlas")
   (gnc:proteinAtlasLink rdfs:comment "Human Protein Atlas")
   ;; Properties attached to each probeset; the domain class is spelled
   ;; gnc:Probeset everywhere, matching the rdf:type set on each row.
   (gnt:hasChip a owl:ObjectProperty)
   (gnt:hasChip rdfs:domain gnc:Probeset)
   (gnt:hasTargetId a owl:ObjectProperty)
   (gnt:hasTargetId rdfs:domain gnc:Probeset)
   (gnt:symbol rdfs:domain gnc:Probeset)
   (gnt:location rdfs:domain gnc:Probeset)
   (gnt:location a owl:ObjectProperty)
   ;; NOTE(review): the row triples below emit gnt:strandProbe, not
   ;; gnt:strandPosition -- confirm which name is intended.
   (gnt:strandPosition rdfs:domain gnc:Probeset)
   (gnt:strandPosition a owl:ObjectProperty)
   (gnt:targetsRegion a owl:ObjectProperty)
   (gnt:targetsRegion rdfs:domain gnc:Probeset)
   (gnt:chr rdfs:domain gnc:Probeset)
   (gnt:mb rdfs:domain gnc:Probeset)
   (gnt:hasSpecificity a owl:ObjectProperty)
   (gnt:hasSpecificity rdfs:domain gnc:Probeset)
   (gnt:hasBlatScore a owl:ObjectProperty)
   (gnt:hasBlatScore rdfs:domain gnc:Probeset)
   (gnt:hasBlatMbStart a owl:ObjectProperty)
   (gnt:hasBlatMbStart rdfs:domain gnc:Probeset)
   (gnt:hasBlatMbEnd a owl:ObjectProperty)
   (gnt:hasBlatMbEnd rdfs:domain gnc:Probeset)
   (gnt:hasBlatSeq a owl:ObjectProperty)
   (gnt:hasBlatSeq rdfs:domain gnc:Probeset)
   (gnt:hasTargetSeq a owl:ObjectProperty)
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
   ;; Subject: the trimmed probeset name with every character outside
   ;; [A-Za-z0-9:] replaced by "_"; the numeric Id is used when the name
   ;; is empty.
   (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))"
                     ProbeSetIdName)))
         (probeset-id (field ProbeSet Id)))
     (string->identifier
      "probeset"
      (if (string-null? id)
          (number->string probeset-id)
          (regexp-substitute/global
           #f "[^A-Za-z0-9:]"
           id
           'pre "_" 'post))))
   (set rdf:type 'gnc:Probeset)
   (set rdfs:label (field ProbeSet Name))
   ;; Aliases are stored CRLF-separated; present them as a "; " list.
   (set skos:altLabel
        (replace-substrings
         (field ProbeSet alias)
         '(("\r\n" . "; "))))
   (set gnt:hasChip
        (string->identifier
         "platform"
         (field ("IFNULL(GeneChip.Name, '')" GeneChipName))))
   (set gnt:hasTargetId
        (field ("NULLIF(TRIM(ProbeSet.TargetId), '')"
                TargetId)))
   (set gnt:symbol (field ProbeSet Symbol))
   (set dct:description (sanitize-rdf-string (field ProbeSet description)))
   (set gnt:targetsRegion
        (sanitize-rdf-string
         (field ("NULLIF(TRIM(ProbeSet.Probe_set_target_region), '')"
                 Probe_set_target_region))))
   (set gnt:chr (field ProbeSet Chr))
   (set gnt:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
   (set gnt:location
        (probeset-location (field ProbeSet Chr)
                           (field ProbeSet Mb)
                           (field ProbeSet Strand_Probe)))
   ;; NCBI Gene Link
   (set dct:references
        (let ((gene-id (field ProbeSet GeneId)))
          (if (string-blank? gene-id)
              ""
              (probeset-resource-link
               "gnc:NCBIGeneLink"
               "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids="
               gene-id))))
   ;; OMIM Link
   (set dct:references
        (let ((omim (field ProbeSet OMIM)))
          (if (string-blank? omim)
              ""
              (probeset-resource-link
               "gnc:omimLink"
               "http://www.ncbi.nlm.nih.gov/omim/"
               omim))))
   ;; Homologene Link
   (set dct:references
        (let ((homologene (field ProbeSet HomoloGeneID)))
          (if (string-blank? homologene)
              ""
              (probeset-resource-link
               "gnc:homologeneLink"
               "http://www.ncbi.nlm.nih.gov/homologene/?term="
               homologene))))
   ;; UniProt Link
   (set dct:references
        (let ((uniprot (field ProbeSet UniProtID)))
          (if (string-blank? uniprot)
              ""
              (probeset-resource-link
               "gnc:uniprotLink"
               "https://www.uniprot.org/uniprot/"
               uniprot))))
   ;; STRING Link
   (set dct:references
        (let ((symbol (field ProbeSet Symbol)))
          (if (string-blank? symbol)
              ""
              (probeset-resource-link
               "gnc:stringLink"
               "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
               symbol))))
   ;; GTEX link
   (set dct:references
        (let ((symbol (field ProbeSet Symbol)))
          (if (string-blank? symbol)
              ""
              (probeset-resource-link
               "gnc:gtexLink"
               "https://www.gtexportal.org/home/gene/"
               (string-trim-both symbol)))))
   ;; EBI GWAS Link
   (set dct:references
        (let ((symbol (field ProbeSet Symbol)))
          (if (string-blank? symbol)
              ""
              (probeset-resource-link
               "gnc:ebiGwasLink"
               "https://www.ebi.ac.uk/gwas/search?query="
               (string-trim-both symbol)))))
   ;; Protein Atlas Link
   (set dct:references
        (let ((symbol (field ProbeSet Symbol)))
          (if (string-blank? symbol)
              ""
              (probeset-resource-link
               "gnc:proteinAtlasLink"
               "http://www.proteinatlas.org/search/"
               (string-trim-both symbol)))))
   ;; UCSC RefSeq Link: only emitted for mouse and rat rows that have a
   ;; symbol, a RefSeq transcript id, a chromosome and numeric
   ;; transcript bounds.  Mouse rows read GeneList and use db "mm10";
   ;; other rows read GeneList_rn33 and use db "rn7".
   (set dct:references
        (let* ((symbol (field ProbeSet Symbol))
               (species (field Species Name))
               (db (if (string=? species "mouse")
                       "mm10" "rn7"))
               (transcriptId (field ProbeSet RefSeq_TranscriptId))
               (transcriptStart
                (if (string=? species "mouse")
                    (field ("(GeneList.txStart * 1000000)" TranscriptStartMm10))
                    (field ("(GeneList_rn33.txStart * 1000000)" TranscriptStartRn7))))
               (chromosome
                (if (string=? species "mouse")
                    (field GeneList Chromosome)
                    (field GeneList_rn33 Chromosome)))
               (transcriptEnd
                (if (string=? species "mouse")
                    (field ("(GeneList.txEnd * 1000000)" TranscriptEndMm10))
                    (field ("(GeneList_rn33.txEnd * 1000000)" TranscriptEndRn7)))))
          (if (and (not (string-blank? symbol))
                   (not (string-blank? transcriptId))
                   (number? transcriptStart)
                   (number? transcriptEnd)
                   (not (string-blank? chromosome))
                   (or (string=? species "mouse")
                       (string=? species "rat")))
              ;; The type statement previously reused the chromosome
              ;; argument instead of the class.
              (probeset-resource-link
               "gnc:ucsRefSeqLink"
               "http://genome.cse.ucsc.edu/cgi-bin/hgTracks?db=" db
               "&hgg_gene=" transcriptId
               "&hgg_chrom=chr" chromosome
               "&hgg_start=" transcriptStart
               "&hgg_end=" transcriptEnd)
              "")))
   ;; PANTHER link (class name matches the gnc:pantherLink schema entry)
   (set dct:references
        (let ((symbol (field ProbeSet Symbol)))
          (if (string-blank? symbol)
              ""
              (probeset-resource-link
               "gnc:pantherLink"
               "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
               (string-trim-both symbol)))))
   ;; GeneMANIA Link: keyed on the gene id, restricted to three species.
   (set dct:references
        (let ((gene-id (field ProbeSet GeneId))
              (species (lower-case-and-replace-spaces
                        (field Species FullName))))
          (if (and (not (string-blank? gene-id))
                   (not (string-blank? species))
                   (or
                    (string=? species "mus-musculus")
                    (string=? species "rattus-norvegicus")
                    (string=? species "homo-sapiens")))
              (probeset-resource-link
               "gnc:genemaniaLink"
               "https://genemania.org/search"
               "/" species
               "/" (string-trim-both gene-id))
              "")))
   ;; ABA Link: the search term is the symbol for mouse and the gene id
   ;; otherwise.  The term is now part of the URL and the link is typed
   ;; gnc:abaLink; previously the term was emitted where the type
   ;; statement belongs.
   (set dct:references
        (let ((symbol (field ProbeSet Symbol))
              (geneId (field ProbeSet GeneId))
              (species (field Species Name)))
          (if (and (not (string-blank? symbol))
                   (not (string-blank? species))
                   (or (string=? species "human")
                       (string=? species "mouse")))
              (probeset-resource-link
               "gnc:abaLink"
               "http://mouse.brain-map.org/search/show?search_type=gene&search_term="
               (if (string=? species "mouse")
                   (string-trim-both symbol)
                   geneId))
              "")))
   ;; Gemma Link
   (set dct:references
        (let ((geneId (field ProbeSet GeneId)))
          (if (string-blank? geneId)
              ""
              (probeset-resource-link
               "gnc:gemmaLink"
               "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid="
               geneId))))
   ;; BioGPS Link: typed gnc:biogpsLink.  The earlier format string
   ;; referenced an argument that was never supplied, and tested for
   ;; "humans" where the ABA link tests for "human".
   ;; NOTE(review): confirm "human" is the Species.Name value in use.
   (set dct:references
        (let ((geneId (field ProbeSet GeneId))
              (species (field Species Name)))
          (if (and (not (string-blank? geneId))
                   (not (string-blank? species))
                   (or
                    (string=? species "mouse")
                    (string=? species "rat")
                    (string=? species "human")))
              (probeset-resource-link
               "gnc:biogpsLink"
               "http://biogps.org/?org="
               species
               "#goto=genereport&id="
               geneId)
              "")))
   (set gnt:strandProbe
        (field ProbeSet Strand_Probe))
   (set gnt:hasSpecificity
        (field ("IFNULL(ProbeSet.Probe_set_specificity, '')"
                Probe_set_specificity)))
   (set gnt:hasBlatScore
        (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')"
                Probe_set_BLAT_score)))
   (set gnt:hasBlatMbStart
        (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')"
                                Probe_set_Blat_Mb_start))
                        '^^xsd:double))
   (set gnt:hasBlatMbEnd
        (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')"
                                Probe_set_Blat_Mb_end))
                        '^^xsd:double))
   (set gnt:hasBlatSeq (sanitize-rdf-string (field ProbeSet BlatSeq)))
   (set gnt:hasTargetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))))
;; Script entry point: parse the command line, read the connection
;; settings and run the probeset transformer through with-documentation.
;;
;; Options:
;;   -s, --settings FILE       file whose first datum is read as the
;;                             connection settings (required)
;;   -o, --output FILE         passed on as the #:rdf output
;;   -d, --documentation FILE  passed on as the #:documentation output
(let* ((option-spec
        ;; --settings is marked required so that a missing option is
        ;; reported by getopt-long by name, rather than surfacing later
        ;; as a wrong-type error from call-with-input-file.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       ;; Only the first datum of the settings file is read.
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "ProbeSet Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix maps to an empty string here -- confirm
   ;; whether the namespace IRIs were lost or are filled in elsewhere.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("sdmx-measure:" "")
      ("skos:" "")))
   (inputs
    (list probeset))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))