#! /usr/bin/env guile
!#

;;; ProbeSet metadata transformer.
;;;
;;; Declares the `probeset' transformer over the ProbeSet table (joined
;;; with GeneChip, GeneList, GeneList_rn33 and Species) and then runs it
;;; through `with-documentation', using connection settings, an RDF
;;; output path and a documentation path taken from the command line.
;;;
;;; Usage: probeset.scm --settings FILE --output FILE --documentation FILE

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))

;; Every resource-link block below uses positional `format' directives
;; (~N@*~a jumps to argument N and prints it) to print the same URL
;; twice:
;;
;;   <URL> .
;;   <URL> a gnc:someLink
;;
;; The indices must therefore stay within the supplied arguments, and
;; the final directive must point at the "a gnc:...Link" argument.
(define-transformer probeset
  (tables (ProbeSet
           (left-join GeneChip "ON GeneChip.Id = ProbeSet.ChipId")
           (left-join GeneList "ON GeneList.GeneID = ProbeSet.GeneId")
           ;; The join condition must reference the joined table itself;
           ;; the previous condition compared GeneList to ProbeSet and
           ;; therefore did not constrain GeneList_rn33 at all.
           ;; NOTE(review): assumes GeneList_rn33 has a geneSymbol
           ;; column like GeneList -- confirm against the schema.
           (left-join GeneList_rn33 "ON GeneList_rn33.geneSymbol = ProbeSet.Symbol")
           (left-join Species "ON GeneChip.SpeciesId = Species.Id")))
  (schema-triples
   ;; Resource link classes.  Each one is typed as a gnc:ResourceLink
   ;; and carries a short label plus a descriptive comment.
   (gnc:pantherLink rdf:type gnc:ResourceLink)
   (gnc:pantherLink rdfs:label "PANTHER")
   (gnc:pantherLink rdfs:comment "Gene and protein data resources from Celera-ABI")
   (gnc:NCBIGeneLink rdf:type gnc:ResourceLink)
   (gnc:NCBIGeneLink rdfs:label "Gene")
   (gnc:NCBIGeneLink rdfs:comment "Info from NCBI Entrez Gene")
   (gnc:omimLink rdf:type gnc:ResourceLink)
   (gnc:omimLink rdfs:label "OMIM")
   (gnc:omimLink rdfs:comment "Summary from Online Mendelian Inheritance in Man")
   (gnc:homologeneLink rdf:type gnc:ResourceLink)
   (gnc:homologeneLink rdfs:label "HomoloGene")
   (gnc:homologeneLink rdfs:comment "Find similar genes in other species")
   (gnc:uniprotLink rdf:type gnc:ResourceLink)
   (gnc:uniprotLink rdfs:label "UniProt")
   (gnc:uniprotLink rdfs:comment "UniProt")
   (gnc:stringLink rdf:type gnc:ResourceLink)
   (gnc:stringLink rdfs:label "STRING")
   (gnc:stringLink rdfs:comment "Protein interactions: known and inferred")
   (gnc:gtexLink rdf:type gnc:ResourceLink)
   (gnc:gtexLink rdfs:label "GTEx Portal")
   (gnc:gtexLink rdfs:comment "GTEx Portal")
   (gnc:ebiGwasLink rdf:type gnc:ResourceLink)
   (gnc:ebiGwasLink rdfs:label "EBI GWAS")
   (gnc:ebiGwasLink rdfs:comment "EBI GWAS")
   (gnc:genemaniaLink rdf:type gnc:ResourceLink)
   (gnc:genemaniaLink rdfs:label "GeneMANIA")
   (gnc:genemaniaLink rdfs:comment "GeneMANIA")
   (gnc:gemmaLink rdf:type gnc:ResourceLink)
   (gnc:gemmaLink rdfs:label "Gemma")
   (gnc:gemmaLink rdfs:comment "Meta-analysis of gene expression data")
   (gnc:biogpsLink rdf:type gnc:ResourceLink)
   (gnc:biogpsLink rdfs:label "BioGPS")
   (gnc:biogpsLink rdfs:comment "Expression across many tissues and cell types")
   (gnc:abaLink rdf:type gnc:ResourceLink)
   (gnc:abaLink rdfs:label "ABA")
   (gnc:abaLink rdfs:comment "Allen Brain Atlas")
   (gnc:ucsRefSeqLink rdf:type gnc:ResourceLink)
   (gnc:ucsRefSeqLink rdfs:label "Info from UCSC Genome Browser")
   (gnc:ucsRefSeqLink rdfs:comment "UCSC")
   (gnc:proteinAtlasLink rdf:type gnc:ResourceLink)
   (gnc:proteinAtlasLink rdfs:label "Protein Atlas")
   (gnc:proteinAtlasLink rdfs:comment "Human Protein Atlas")
   ;; Predicates attached to each probeset.  The domain is spelled
   ;; gnc:Probeset throughout, matching the rdf:type emitted below.
   (gnt:hasChip a owl:ObjectProperty)
   (gnt:hasChip rdfs:domain gnc:Probeset)
   (gnt:hasTargetId a owl:ObjectProperty)
   (gnt:hasTargetId rdfs:domain gnc:Probeset)
   (gnt:symbol rdfs:domain gnc:Probeset)
   (gnt:location rdfs:domain gnc:Probeset)
   (gnt:location a owl:ObjectProperty)
   ;; NOTE(review): the triples below emit gnt:strandProbe, not
   ;; gnt:strandPosition -- confirm which name is intended.
   (gnt:strandPosition rdfs:domain gnc:Probeset)
   (gnt:strandPosition a owl:ObjectProperty)
   (gnt:targetsRegion a owl:ObjectProperty)
   (gnt:targetsRegion rdfs:domain gnc:Probeset)
   (gnt:chr rdfs:domain gnc:Probeset)
   (gnt:mb rdfs:domain gnc:Probeset)
   (gnt:hasSpecificity a owl:ObjectProperty)
   (gnt:hasSpecificity rdfs:domain gnc:Probeset)
   (gnt:hasBlatScore a owl:ObjectProperty)
   (gnt:hasBlatScore rdfs:domain gnc:Probeset)
   (gnt:hasBlatMbStart a owl:ObjectProperty)
   (gnt:hasBlatMbStart rdfs:domain gnc:Probeset)
   (gnt:hasBlatMbEnd a owl:ObjectProperty)
   (gnt:hasBlatMbEnd rdfs:domain gnc:Probeset)
   (gnt:hasBlatSeq a owl:ObjectProperty)
   (gnt:hasBlatSeq rdfs:domain gnc:Probeset)
   (gnt:hasTargetSeq a owl:ObjectProperty)
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
      ;; Subject identifier: the trimmed probeset name with every
      ;; character outside [A-Za-z0-9:] replaced by "_", or the numeric
      ;; row id when the name is blank.
      (let ((id (field ("IF(NULLIF(TRIM(ProbeSet.Name), '') IS NULL, '', TRIM(ProbeSet.Name))"
                        ProbeSetIdName)))
            (probeset-id (field ProbeSet Id)))
        (string->identifier
         "probeset"
         (if (string-null? id)
             (number->string probeset-id)
             (regexp-substitute/global
              #f "[^A-Za-z0-9:]" id
              'pre "_" 'post))))
    (set rdf:type 'gnc:Probeset)
    (set rdfs:label (field ProbeSet Name))
    (set skos:altLabel
         (replace-substrings
          (field ProbeSet alias)
          '(("\r\n" . "; "))))
    (set gnt:hasChip
         (string->identifier
          "platform"
          (field ("IFNULL(GeneChip.Name, '')" GeneChipName))))
    (set gnt:hasTargetId
         (field ("NULLIF(TRIM(ProbeSet.TargetId), '')"
                 TargetId)))
    (set gnt:symbol (field ProbeSet Symbol))
    (set dct:description (sanitize-rdf-string (field ProbeSet description)))
    (set gnt:targetsRegion
         (sanitize-rdf-string
          (field ("NULLIF(TRIM(ProbeSet.Probe_set_target_region), '')"
                  Probe_set_target_region))))
    (set gnt:chr (field ProbeSet Chr))
    (set gnt:mb (annotate-field (field ("IFNULL(ProbeSet.Mb, '')" Mb)) '^^xsd:double))
    ;; Human-readable location, e.g. "Chr 1 @ 12.3 on the plus strand".
    (set gnt:location
         (let* ((mb (field ProbeSet Mb))
                (chr (field ProbeSet Chr))
                (strand-probe (field ProbeSet Strand_Probe))
                (location (list chr mb))
                ;; Flag consumed by the ~:[~;~a~] directive: the strand
                ;; suffix is printed only when this is true.
                (strand-known?
                 (and (string? strand-probe)
                      (or (string=? "+" strand-probe)
                          (string=? "-" strand-probe))))
                ;; `equal?' rather than `string=?': this expression is
                ;; evaluated even when STRAND-PROBE is not a string, and
                ;; `string=?' would raise in that case.
                (strand-text
                 (cond
                  ((equal? "+" strand-probe) " on the plus strand")
                  ((equal? "-" strand-probe) " on the minus strand")
                  (else ""))))
           (match location
             (("Un" mb)
              (format #f "Not available"))
             ((chr "")
              (if (string-blank? chr)
                  (format #f "Not available")
                  (format #f "Chr ~a @ Unknown position ~a~:[~;~a~]"
                          chr mb strand-known? strand-text)))
             (_
              (format #f "Chr ~a @ ~a~:[~;~a~]"
                      chr mb strand-known? strand-text)))))
    ;; NCBI Gene Link
    (set dct:references
         (let ((geneId (field ProbeSet GeneId)))
           (if (not (string-blank? geneId))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids="
                        geneId
                        "a gnc:NCBIGeneLink"))
               "")))
    ;; OMIM Link
    (set dct:references
         (let ((omim (field ProbeSet OMIM)))
           (if (not (string-blank? omim))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.ncbi.nlm.nih.gov/omim/"
                        omim
                        "a gnc:omimLink"))
               "")))
    ;; Homologene Link
    (set dct:references
         (let ((homologene (field ProbeSet HomoloGeneID)))
           (if (not (string-blank? homologene))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.ncbi.nlm.nih.gov/homologene/?term="
                        homologene
                        "a gnc:homologeneLink"))
               "")))
    ;; UniProt Link
    (set dct:references
         (let ((uniprot (field ProbeSet UniProtID)))
           (if (not (string-blank? uniprot))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.uniprot.org/uniprot/"
                        uniprot
                        "a gnc:uniprotLink"))
               "")))
    ;; STRING Link
    (set dct:references
         (let ((symbol (field ProbeSet Symbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                        symbol
                        "a gnc:stringLink"))
               "")))
    ;; GTEx Link
    (set dct:references
         (let ((symbol (field ProbeSet Symbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.gtexportal.org/home/gene/"
                        (string-trim-both symbol)
                        "a gnc:gtexLink"))
               "")))
    ;; EBI GWAS Link
    (set dct:references
         (let ((symbol (field ProbeSet Symbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.ebi.ac.uk/gwas/search?query="
                        (string-trim-both symbol)
                        "a gnc:ebiGwasLink"))
               "")))
    ;; Protein Atlas Link
    (set dct:references
         (let ((symbol (field ProbeSet Symbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.proteinatlas.org/search/"
                        (string-trim-both symbol)
                        "a gnc:proteinAtlasLink"))
               "")))
    ;; UCSC Genome Browser Link (mouse -> mm10, anything else -> rn7;
    ;; only emitted when the species is mouse or rat).
    (set dct:references
         (let* ((symbol (field ProbeSet Symbol))
                (species (field Species Name))
                (db (if (string=? species "mouse")
                        "mm10"
                        "rn7"))
                (transcriptId (field ProbeSet RefSeq_TranscriptId))
                ;; NOTE(review): kgId is bound but not used in the URL;
                ;; kept so the set of selected columns is unchanged.
                (kgId (field GeneList_rn33 kgId))
                (transcriptStart
                 (if (string=? species "mouse")
                     (field ("(GeneList.txStart * 1000000)" TranscriptStartMm10))
                     (field ("(GeneList_rn33.txStart * 1000000)" TranscriptStartRn7))))
                (chromosome
                 (if (string=? species "mouse")
                     (field GeneList Chromosome)
                     (field GeneList_rn33 Chromosome)))
                (transcriptEnd
                 (if (string=? species "mouse")
                     (field ("(GeneList.txEnd * 1000000)" TranscriptEndMm10))
                     (field ("(GeneList_rn33.txEnd * 1000000)" TranscriptEndRn7))))
                (url (format #f "http://genome.cse.ucsc.edu/cgi-bin/hgTracks?db=~a&hgg_gene=" db)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? transcriptId))
                    (number? transcriptStart)
                    (number? transcriptEnd)
                    (not (string-blank? chromosome))
                    (or (string=? species "mouse")
                        (string=? species "rat")))
               (string->symbol
                ;; The final directive selects argument 5 (the type);
                ;; it previously selected argument 2 (the chromosome).
                (format #f "<~0@*~a~1@*~a&hgg_chrom=chr~2@*~a&hgg_start=~3@*~a&hgg_end=~4@*~a> .~%<~0@*~a~1@*~a&hgg_chrom=chr~2@*~a&hgg_start=~3@*~a&hgg_end=~4@*~a> ~5@*~a"
                        url
                        transcriptId
                        chromosome
                        transcriptStart
                        transcriptEnd
                        "a gnc:ucsRefSeqLink"))
               "")))
    ;; PANTHER Link
    (set dct:references
         (let ((symbol (field ProbeSet Symbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                        (string-trim-both symbol)
                        ;; Spelled as declared in schema-triples.
                        "a gnc:pantherLink"))
               "")))
    ;; GeneMANIA Link
    (set dct:references
         (let ((symbol (field ProbeSet GeneId))
               (species (lower-case-and-replace-spaces
                         (field Species FullName))))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "mus-musculus")
                        (string=? species "rattus-norvegicus")
                        (string=? species "homo-sapiens")))
               (string->symbol
                (format #f "<~0@*~a/~1@*~a/~2@*~a> .~%<~0@*~a/~1@*~a/~2@*~a> ~3@*~a"
                        "https://genemania.org/search"
                        species
                        (string-trim-both symbol)
                        "a gnc:genemaniaLink"))
               "")))
    ;; ABA Link: the search term is the trimmed symbol for mouse and
    ;; the gene id otherwise.
    (set dct:references
         (let ((symbol (field ProbeSet Symbol))
               (geneId (field ProbeSet GeneId))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "human")
                        (string=? species "mouse")))
               (string->symbol
                ;; Argument 1 (the search term) is appended to the URL
                ;; and argument 2 is the type, as in the other links.
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://mouse.brain-map.org/search/show?search_type=gene&search_term="
                        (if (string=? species "mouse")
                            (string-trim-both symbol)
                            geneId)
                        "a gnc:abaLink"))
               "")))
    ;; Gemma Link
    (set dct:references
         (let ((geneId (field ProbeSet GeneId)))
           (if (not (string-blank? geneId))
               (string->symbol
                (format #f "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid="
                        geneId
                        "a gnc:gemmaLink"))
               "")))
    ;; BioGPS Link
    (set dct:references
         (let ((geneId (field ProbeSet GeneId))
               (species (field Species Name)))
           (if (and (not (string-blank? geneId))
                    (not (string-blank? species))
                    (or (string=? species "mouse")
                        (string=? species "rat")
                        ;; "human", as in the ABA block above.
                        (string=? species "human")))
               (string->symbol
                ;; Five arguments are supplied (indices 0-4); the old
                ;; string also referenced a non-existent argument 5.
                (format #f "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                        "http://biogps.org/?org="
                        species
                        "#goto=genereport&id="
                        geneId
                        "a gnc:biogpsLink"))
               "")))
    (set gnt:strandProbe (field ProbeSet Strand_Probe))
    (set gnt:hasSpecificity
         (field ("IFNULL(ProbeSet.Probe_set_specificity, '')"
                 Probe_set_specificity)))
    (set gnt:hasBlatScore
         (field ("IFNULL(ProbeSet.Probe_set_BLAT_score, '')"
                 Probe_set_BLAT_score)))
    (set gnt:hasBlatMbStart
         (annotate-field
          (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_start, '')"
                  Probe_set_Blat_Mb_start))
          '^^xsd:double))
    (set gnt:hasBlatMbEnd
         (annotate-field
          (field ("IFNULL(ProbeSet.Probe_set_Blat_Mb_end, '')"
                  Probe_set_Blat_Mb_end))
          '^^xsd:double))
    (set gnt:hasBlatSeq (sanitize-rdf-string (field ProbeSet BlatSeq)))
    (set gnt:hasTargetSeq (sanitize-rdf-string (field ProbeSet TargetSeq)))))


;; Entry point: parse -s/--settings, -o/--output and -d/--documentation,
;; read the connection settings as a single datum from the settings
;; file, and run the transformer.
(let* ((option-spec
        '((settings (single-char #\s) (value #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       ;; Fail with an explicit message instead of handing #f to
       ;; `call-with-input-file' when --settings is omitted.
       (%connection-settings
        (if settings
            (call-with-input-file settings read)
            (error "Missing required option: --settings FILE"))))
  (with-documentation
   (name "ProbeSet Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix maps to an empty string here; the
   ;; namespace URIs look like they were lost -- confirm and restore.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("sdmx-measure:" "")
      ("skos:" "")))
   (inputs
    (list probeset))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))