#! /usr/bin/env guile
!#
(use-modules (srfi srfi-1)
(srfi srfi-26)
(ice-9 format)
(ice-9 getopt-long)
(ice-9 match)
(ice-9 regex)
(transform strings)
(transform sql)
(transform triples)
(transform special-forms)
(web uri))
;; Declares the `probeset' transformer: the SQL tables/joins it reads
;; (ProbeSet left-joined with GeneChip and Species, restricted to rows
;; with a non-NULL ProbeSet.Name), the schema-level triples, and the
;; per-row triples built from the selected fields.
(define-transformer probeset
  (tables (ProbeSet
           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")
           (left-join Species "ON GeneChip.SpeciesId = Species.Id"))
          "WHERE ProbeSet.Name IS NOT NULL")
  (schema-triples
   (gnc:omimLink rdfs:Class gnc:ResourceLink)
   (gnc:omimLink rdfs:label "OMIM")
   ;; The RDFS vocabulary term is rdfs:comment (singular).
   (gnc:omimLink rdfs:comment "Summary from Online Mendelian Inheritance in Man")
   (gnc:homologeneLink rdfs:Class gnc:ResourceLink)
   (gnc:homologeneLink rdfs:label "HomoloGene")
   (gnc:homologeneLink rdfs:comment "Find similar genes in other species")
   ;; NOTE(review): the schema describes gnc:uniprot, but the row triples
   ;; below emit gnt:uniprot -- confirm which prefix is intended.
   (gnc:uniprot a owl:ObjectProperty)
   (gnc:uniprot rdfs:label "UniProt")
   (gnc:uniprot rdfs:comment "UniProt resource")
   (gnt:hasChip a owl:ObjectProperty)
   (gnt:hasChip rdfs:domain gnc:Probeset)
   (gnt:hasTargetId a owl:ObjectProperty)
   (gnt:hasTargetId rdfs:domain gnc:Probeset)
   (gnt:geneSymbol rdfs:domain gnc:Probeset)
   ;; Domain spelled gnc:Probeset to match the rdf:type assigned to each
   ;; row and every other rdfs:domain in this list.
   (gnt:location rdfs:domain gnc:Probeset)
   (gnt:location a owl:ObjectProperty)
   ;; NOTE(review): the row triples below emit gnt:strandProbe, not
   ;; gnt:strandPosition -- confirm which predicate name is intended.
   (gnt:strandPosition rdfs:domain gnc:Probeset)
   (gnt:strandPosition a owl:ObjectProperty)
   (gnt:targetsRegion a owl:ObjectProperty)
   (gnt:targetsRegion rdfs:domain gnc:Probeset)
   (gnt:chr rdfs:domain gnc:Probeset)
   (gnt:mb rdfs:domain gnc:Probeset)
   (gnt:hasSpecificity a owl:ObjectProperty)
   (gnt:hasSpecificity rdfs:domain gnc:Probeset)
   (gnt:hasBlatScore a owl:ObjectProperty)
   (gnt:hasBlatScore rdfs:domain gnc:Probeset)
   (gnt:hasBlatMbStart a owl:ObjectProperty)
   (gnt:hasBlatMbStart rdfs:domain gnc:Probeset)
   (gnt:hasBlatMbEnd a owl:ObjectProperty)
   (gnt:hasBlatMbEnd rdfs:domain gnc:Probeset)
   (gnt:hasBlatSeq a owl:ObjectProperty)
   (gnt:hasBlatSeq rdfs:domain gnc:Probeset)
   (gnt:hasTargetSeq a owl:ObjectProperty)
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
      ;; Subject: the trimmed ProbeSet.Name with every character outside
      ;; [A-Za-z0-9:] replaced by "_"; when the trimmed name is empty the
      ;; numeric ProbeSet.Id is used instead.
      (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))"
                        ProbeSetIdName)))
            (probeset-id (field ProbeSet Id)))
        (string->identifier
         "probeset"
         (if (string-null? id)
             (number->string probeset-id)
             (regexp-substitute/global
              #f "[^A-Za-z0-9:]"
              id
              'pre "_" 'post))))
    (set rdf:type 'gnc:Probeset)
    (set rdfs:label (field ProbeSet Name))
    ;; CRLF-separated aliases are flattened into a "; "-separated string.
    (set skos:altLabel
         (replace-substrings
          (field ProbeSet alias)
          '(("\r\n" . "; "))))
    (set gnt:hasChip
         (string->identifier
          "platform"
          (field ("IFNULL(GeneChip.Name, '')" GeneChipName))))
    (set gnt:hasTargetId
         (field ("NULLIF(TRIM(ProbeSet.TargetId), '')"
                 TargetId)))
    ;; One gnt:geneSymbol value per comma-separated symbol.
    (multiset gnt:geneSymbol
              (map string-trim (string-split
                                (field ProbeSet Symbol)
                                #\,)))
    (set dct:description (sanitize-rdf-string (field ProbeSet description)))
    (set gnt:targetsRegion
         (sanitize-rdf-string
          (field ("NULLIF(TRIM(ProbeSet.Probe_set_target_region), '')"
                  Probe_set_target_region))))
    (set gnt:chr (field ProbeSet Chr))
    (set gnt:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
    ;; Human-readable location string built from Chr, Mb and the strand.
    (set gnt:location
         (let* ((mb (field ProbeSet Mb))
                (chr (field ProbeSet Chr))
                (strand-probe (field ProbeSet Strand_Probe))
                (location (list chr mb))
                ;; Strand phrase, or #f when the strand is not "+" / "-".
                ;; The string? test comes first so string=? is never
                ;; applied to a non-string value.
                (strand-text
                 (cond ((not (string? strand-probe)) #f)
                       ((string=? "+" strand-probe) "on the plus strand")
                       ((string=? "-" strand-probe) "on the minus strand")
                       (else #f))))
           (match location
             (("Un" mb)
              (format #f "Not available"))
             ((chr "")
              (if (string-blank? chr)
                  (format #f "Not available")
                  ;; ~:[~;~a~] consumes the flag, then prints the phrase
                  ;; only when the flag is true.
                  (format #f "Chr ~a @ Unknown position ~a~:[~;~a~]"
                          chr mb
                          strand-text (or strand-text ""))))
             (_
              (format #f "Chr ~a @ ~a Mb ~:[~;~a~]"
                      chr mb
                      strand-text (or strand-text ""))))))
    (set gnt:hasGeneId
         (ontology 'gene:
                   (string-trim-both (field ProbeSet GeneId))))
    ;; OMIM link.  The format string produces "<URL> .", a newline, then
    ;; "<URL> a gnc:omimLink", i.e. the object text carries an extra
    ;; statement typing the link (relies on how `set' serialises symbols
    ;; -- not visible here).
    (set dct:references
         (let ((omim (field ProbeSet OMIM)))
           (if (not (string-blank? omim))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.ncbi.nlm.nih.gov/omim/"
                        (uri-encode omim)
                        "a gnc:omimLink"))
               "")))
    ;; HomoloGene link, built the same way as the OMIM link.
    (set dct:references
         (let ((homologene (field ProbeSet HomoloGeneID)))
           (if (not (string-blank? homologene))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.ncbi.nlm.nih.gov/homologene/?term="
                        (uri-encode homologene)
                        "a gnc:homologeneLink"))
               "")))
    (set gnt:uniprot
         (ontology 'uniprot: (field ProbeSet UniProtID)))
    (set gnt:strandProbe
         (field ProbeSet Strand_Probe))
    (set gnt:hasSpecificity
         (field ("IFNULL(ProbeSet.Probe_set_specificity, '')"
                 Probe_set_specificity)))
    (set gnt:hasBlatScore
         (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')"
                 Probe_set_BLAT_score)))
    (set gnt:hasBlatMbStart
         (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')"
                                 Probe_set_Blat_Mb_start))
                         '^^xsd:double))
    (set gnt:hasBlatMbEnd
         (annotate-field (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')"
                                 Probe_set_Blat_Mb_end))
                         '^^xsd:double))
    (set gnt:hasBlatSeq (sanitize-rdf-string (field ProbeSet BlatSeq)))
    (set gnt:hasTargetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))))
;; Script entry point.  Parses the command line, reads the connection
;; settings from the file given with --settings/-s, and runs the
;; `probeset' transformer through `with-documentation'.
;;
;; Options:
;;   -s, --settings FILE       file whose first datum is `read' as the
;;                             connection settings (required)
;;   -o, --output FILE         passed on as the #:rdf output
;;   -d, --documentation FILE  passed on as the #:documentation output
(let* ((option-spec
        ;; --settings is marked required so that a missing option is
        ;; reported by getopt-long instead of #f reaching
        ;; call-with-input-file below.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "ProbeSet Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix below maps to an empty string;
   ;; presumably namespace IRIs are expected here -- confirm.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gene:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("uniprot:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("sdmx-measure:" "")
      ("skos:" "")))
   (inputs
    (list probeset))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))