#! /usr/bin/env guile
!#

;;; Transform the ProbeSet table (joined with GeneChip and Species)
;;; into RDF triples.
;;;
;;; Command-line options (parsed with getopt-long):
;;;   -s, --settings FILE        file whose first datum is `read' and used
;;;                              as the connection settings (required)
;;;   -o, --output FILE          handed to `with-documentation' as #:rdf
;;;   -d, --documentation FILE   handed to `with-documentation' as
;;;                              #:documentation

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 format)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms)
             (web uri))

(define (strand-suffix strand)
  "Return the human-readable strand description for STRAND: \"on the
plus strand\" for \"+\", \"on the minus strand\" for \"-\", and the
empty string for anything else.  STRAND may be any object; `equal?' is
used instead of `string=?' so that a non-string value cannot raise a
wrong-type error."
  (cond ((equal? strand "+") "on the plus strand")
        ((equal? strand "-") "on the minus strand")
        (else "")))

(define (probeset-location chr mb strand)
  "Return a display string locating a probeset on chromosome CHR at
position MB, optionally followed by a description of STRAND.

Return \"Not available\" when CHR is \"Un\", or when MB is the empty
string and CHR is blank.  When MB is the empty string but CHR is not
blank, the position is reported as unknown."
  (let* ((suffix (strand-suffix strand))
         ;; Drives the ~:[ conditional: the suffix is only printed for
         ;; a recognised strand.
         (strand-known? (not (string-null? suffix))))
    (match (list chr mb)
      (("Un" _) "Not available")
      ((_ "")
       (if (string-blank? chr)
           "Not available"
           (format #f "Chr ~a @ Unknown position ~a~:[~;~a~]"
                   chr mb strand-known? suffix)))
      (_
       (format #f "Chr ~a @ ~a Mb ~:[~;~a~]"
               chr mb strand-known? suffix)))))

(define (reference-link base-uri id type-statement)
  "Return the object for a dct:references triple pointing at an external
resource, or the empty string when ID is blank.

The result is a symbol whose text is the IRI <BASE-URI + uri-encoded ID>,
followed by \" .\", a newline, the same IRI again and TYPE-STATEMENT.
NOTE(review): this looks intended to terminate the current statement
and start a second one typing the link -- confirm against how `set'
serialises symbols."
  (if (string-blank? id)
      ""
      (string->symbol
       (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
               base-uri
               (uri-encode id)
               type-statement))))

(define-transformer probeset
  (tables (ProbeSet
           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")
           (left-join Species "ON GeneChip.SpeciesId = Species.Id"))
          "WHERE ProbeSet.Name IS NOT NULL")
  (schema-triples
   ;; NOTE(review): `rdfs:Class' is used as a predicate in the two
   ;; *Link declarations below; `rdfs:subClassOf' or `a' may have been
   ;; intended -- confirm before changing.
   (gnc:omimLink rdfs:Class gnc:ResourceLink)
   (gnc:omimLink rdfs:label "OMIM")
   (gnc:omimLink rdfs:comment "Summary from Online Mendelian Inheritance in Man")
   (gnc:homologeneLink rdfs:Class gnc:ResourceLink)
   (gnc:homologeneLink rdfs:label "HomoloGene")
   (gnc:homologeneLink rdfs:comment "Find similar genes in other species")
   (gnc:uniprot a owl:ObjectProperty)
   (gnc:uniprot rdfs:label "UniProt")
   (gnc:uniprot rdfs:comment "UniProt resource")
   (gnt:hasChip a owl:ObjectProperty)
   (gnt:hasChip rdfs:domain gnc:Probeset)
   (gnt:hasTargetId a owl:ObjectProperty)
   (gnt:hasTargetId rdfs:domain gnc:Probeset)
   (gnt:geneSymbol rdfs:domain gnc:Probeset)
   ;; The domain is spelled gnc:Probeset to agree with the rdf:type
   ;; emitted for every subject and with the other declarations.
   (gnt:location rdfs:domain gnc:Probeset)
   (gnt:location a owl:ObjectProperty)
   ;; NOTE(review): the triples below emit gnt:strandProbe, not
   ;; gnt:strandPosition -- confirm which name is intended.
   (gnt:strandPosition rdfs:domain gnc:Probeset)
   (gnt:strandPosition a owl:ObjectProperty)
   (gnt:targetsRegion a owl:ObjectProperty)
   (gnt:targetsRegion rdfs:domain gnc:Probeset)
   (gnt:chr rdfs:domain gnc:Probeset)
   (gnt:mb rdfs:domain gnc:Probeset)
   (gnt:hasSpecificity a owl:ObjectProperty)
   (gnt:hasSpecificity rdfs:domain gnc:Probeset)
   (gnt:hasBlatScore a owl:ObjectProperty)
   (gnt:hasBlatScore rdfs:domain gnc:Probeset)
   (gnt:hasBlatMbStart a owl:ObjectProperty)
   (gnt:hasBlatMbStart rdfs:domain gnc:Probeset)
   (gnt:hasBlatMbEnd a owl:ObjectProperty)
   (gnt:hasBlatMbEnd rdfs:domain gnc:Probeset)
   (gnt:hasBlatSeq a owl:ObjectProperty)
   (gnt:hasBlatSeq rdfs:domain gnc:Probeset)
   (gnt:hasTargetSeq a owl:ObjectProperty)
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
      ;; Subject: the trimmed ProbeSet.Name with every character
      ;; outside [A-Za-z0-9:] replaced by "_".  When the trimmed name
      ;; is empty, the numeric ProbeSet.Id is used instead.
      (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))"
                        ProbeSetIdName)))
            (probeset-id (field ProbeSet Id)))
        (string->identifier
         "probeset"
         (if (string-null? id)
             (number->string probeset-id)
             (regexp-substitute/global #f "[^A-Za-z0-9:]" id
                                       'pre "_" 'post))))
    (set rdf:type 'gnc:Probeset)
    (set rdfs:label (field ProbeSet Name))
    ;; Aliases are stored one per CRLF-separated line; join them
    ;; with "; ".
    (set skos:altLabel
         (replace-substrings (field ProbeSet alias)
                             '(("\r\n" . "; "))))
    (set gnt:hasChip
         (string->identifier
          "platform"
          (field ("IFNULL(GeneChip.Name, '')" GeneChipName))))
    (set gnt:hasTargetId
         (field ("NULLIF(TRIM(ProbeSet.TargetId), '')" TargetId)))
    ;; One gnt:geneSymbol value per comma-separated symbol.
    (multiset gnt:geneSymbol
              (map string-trim
                   (string-split (field ProbeSet Symbol) #\,)))
    (set dct:description
         (sanitize-rdf-string (field ProbeSet description)))
    (set gnt:targetsRegion
         (sanitize-rdf-string
          (field ("NULLIF(TRIM(ProbeSet.Probe_set_target_region), '')"
                  Probe_set_target_region))))
    (set gnt:chr (field ProbeSet Chr))
    (set gnt:mb
         (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb))
                         '^^xsd:double))
    (set gnt:location
         (probeset-location (field ProbeSet Chr)
                            (field ProbeSet Mb)
                            (field ProbeSet Strand_Probe)))
    (set gnt:hasGeneId
         (ontology 'gene:
                   (string-trim-both (field ProbeSet GeneId))))
    ;; OMIM Link
    (set dct:references
         (reference-link "http://www.ncbi.nlm.nih.gov/omim/"
                         (field ProbeSet OMIM)
                         "a gnc:omimLink"))
    ;; Homologene Link
    (set dct:references
         (reference-link "http://www.ncbi.nlm.nih.gov/homologene/?term="
                         (field ProbeSet HomoloGeneID)
                         "a gnc:homologeneLink"))
    (set gnt:uniprot
         (ontology 'uniprot: (field ProbeSet UniProtID)))
    (set gnt:strandProbe (field ProbeSet Strand_Probe))
    (set gnt:hasSpecificity
         (field ("IFNULL(ProbeSet.Probe_set_specificity, '')"
                 Probe_set_specificity)))
    (set gnt:hasBlatScore
         (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')"
                 Probe_set_BLAT_score)))
    (set gnt:hasBlatMbStart
         (annotate-field
          (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')"
                  Probe_set_Blat_Mb_start))
          '^^xsd:double))
    (set gnt:hasBlatMbEnd
         (annotate-field
          (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')"
                  Probe_set_Blat_Mb_end))
          '^^xsd:double))
    (set gnt:hasBlatSeq
         (sanitize-rdf-string (field ProbeSet BlatSeq)))
    (set gnt:hasTargetSeq
         (sanitize-rdf-string (field ProbeSet TargetSeq)))))

;; Entry point: parse the command line, load the connection settings and
;; run the probeset transformer.
(let* ((option-spec
        ;; --settings is required: its value is opened as a file just
        ;; below, so a missing option would otherwise surface as an
        ;; obscure error about opening #f.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       ;; The settings file holds a single readable datum.
       (%connection-settings
        (call-with-input-file settings read)))
  (with-documentation
   (name "ProbeSet Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix is mapped to an empty string here;
   ;; presumably each should carry its namespace IRI -- confirm.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gene:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("uniprot:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("sdmx-measure:" "")
      ("skos:" "")))
   (inputs
    (list probeset))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))