#! /usr/bin/env guile
!#

;;; Transform the GeneList and GeneList_rn33 SQL tables into RDF.
;;;
;;; Usage: SCRIPT --settings FILE --output FILE --documentation FILE
;;;
;;; The settings file is read with `read' and handed to `with-documentation'
;;; as the connection settings.

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 format)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms)
             (web uri))

(define (resource-link class . url-parts)
  "Build the object of a dct:references statement that links to an external
resource and also types that resource.

CLASS is the class of the link (for example the symbol gnc:gtex_link).
URL-PARTS are displayed and concatenated, in order, to form the URL.

The result is a symbol whose name is '<URL> .' followed by a newline and
'<URL> a CLASS'.  A symbol is returned, as the inline code this helper
replaces did."
  (let ((url (format #f "~{~a~}" url-parts)))
    (string->symbol (format #f "<~a> .~%<~a> a ~a" url url class))))

;; Genes from the GeneList table, joined with their species.
(define-transformer genelist
  (tables (GeneList
           (left-join Species "USING (SpeciesId)")))
  ;; NOTE(review): `rdfs:comments' and `rdfs:Class' used as predicates are not
  ;; terms/usages defined by RDF Schema (the standard ones are `rdfs:comment'
  ;; and `rdfs:subClassOf').  They are kept unchanged because consumers of the
  ;; generated RDF may query for them -- confirm before renaming.
  (schema-triples
   (gnc:gene_symbol a rdfs:Class)
   (gnc:gene_symbol rdfs:label "A gene symbol")
   (gnt:gene rdfs:domain gnc:gene_symbol)
   (gnt:belongs_to_species rdfs:domain gnc:gene_symbol)
   (gnc:gene a rdfs:Class)
   (gnc:gene rdfs:label "Gene")
   (gnt:has_gene_id a owl:ObjectProperty)
   (gnt:has_gene_id rdfs:domain gnc:ncbi_wiki_entry)
   (gnt:has_gene_id skos:definition "The GeneId of this resource")
   ;; Declared under gnt: to match the predicate used in the triples below.
   (gnt:transcript rdfs:domain gnc:gene_symbol)
   (gnt:transcript a owl:ObjectProperty)
   (gnt:transcript rdfs:comments "The gene transcript of this resource")
   (gnc:ebi_gwas_link rdfs:Class gnc:ResourceLink)
   (gnc:ebi_gwas_link rdfs:label "EBI GWAS")
   (gnc:ebi_gwas_link rdfs:comments "EBI GWAS")
   (gnc:protein_atlas_link rdfs:Class gnc:ResourceLink)
   (gnc:protein_atlas_link rdfs:label "Protein Atlas")
   (gnc:protein_atlas_link rdfs:comments "Human Protein Atlas")
   (gnc:genemania_link rdfs:Class gnc:ResourceLink)
   (gnc:genemania_link rdfs:label "GeneMANIA")
   (gnc:genemania_link rdfs:comments "GeneMANIA")
   (gnc:gemma_link rdfs:Class gnc:ResourceLink)
   (gnc:gemma_link rdfs:label "Gemma")
   (gnc:gemma_link rdfs:comments "Meta-analysis of gene expression data")
   (gnc:biogps_link rdfs:Class gnc:ResourceLink)
   (gnc:biogps_link rdfs:label "BioGPS")
   (gnc:biogps_link rdfs:comments "Expression across many tissues and cell types")
   (gnc:aba_link rdfs:Class gnc:ResourceLink)
   (gnc:aba_link rdfs:label "ABA")
   (gnc:aba_link rdfs:comments "Allen Brain Atlas")
   (gnc:panther_link rdfs:Class gnc:ResourceLink)
   (gnc:panther_link rdfs:label "PANTHER")
   (gnc:panther_link rdfs:comments "Gene and protein data resources from Celera-ABI")
   ;; STRING has its own class; it used to be declared a second time as
   ;; gnc:panther_link, giving that class two conflicting labels.
   (gnc:string_link rdfs:Class gnc:ResourceLink)
   (gnc:string_link rdfs:label "STRING")
   (gnc:string_link rdfs:comments "Protein interactions: known and inferred")
   (gnc:gtex_link rdfs:Class gnc:ResourceLink)
   (gnc:gtex_link rdfs:label "GTEx Portal")
   (gnc:gtex_link rdfs:comments "GTEx Portal")
   (gnc:rgd_link rdfs:Class gnc:ResourceLink)
   (gnc:rgd_link rdfs:label "Rat Genome DB")
   (gnc:rgd_link rdfs:comments "Rat Genome DB")
   (gnc:has_kg_id rdfs:domain gnc:gene_symbol)
   (gnc:has_kg_id a owl:ObjectProperty)
   (gnc:has_kg_id rdfs:comments "The kgID of this resource")
   (gnc:has_unigen_id rdfs:domain gnc:gene_symbol)
   (gnc:has_unigen_id a owl:ObjectProperty)
   (gnc:has_unigen_id rdfs:comments "The UnigenID of this resource")
   ;; has_protein_id / has_align_id: gnt: matches the triples below.
   (gnt:has_protein_id rdfs:domain gnc:gene_symbol)
   (gnt:has_protein_id a owl:ObjectProperty)
   (gnt:has_protein_id rdfs:comments "The ProteinID of this resource")
   (gnt:has_align_id rdfs:domain gnc:gene_symbol)
   (gnt:has_align_id a owl:ObjectProperty)
   (gnt:has_align_id rdfs:comments "The AlignID of this resource")
   (gnt:tx_end rdfs:range xsd:double)
   (gnt:tx_start rdfs:range xsd:double)
   (gnt:has_target_seq rdfs:domain gnc:probeset))
  (triples
      ;; Subject: built from symbol, gene id and align id joined with "_".
      (string->identifier
       "gene"
       (normalize-string-field
        (string-trim-both
         (field ("CONCAT_WS('_', GeneSymbol, GeneID, AlignID)" GENE_UID))))
       #:separator "_")
    (set rdf:type 'gnc:gene)
    (set gnt:gene_symbol (field GeneList GeneSymbol))
    (set dct:description
         (sanitize-rdf-string (field GeneList GeneDescription)))
    (set gnt:has_gene_id
         (ontology 'gene: (field GeneList GeneID)))
    ;; External resource links.  Each yields "" when its inputs are blank or
    ;; the species is not one of those listed in its guard.
    ;; EBI GWAS
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (resource-link 'gnc:ebi_gwas_link
                              "https://www.ebi.ac.uk/gwas/search?query="
                              (uri-encode (string-trim-both symbol)))
               "")))
    ;; Allen Brain Atlas: search by symbol for mouse, by gene id otherwise.
    ;; The search term was previously passed to `format' but never consumed
    ;; by the format string, so the URL ended at "search_term=".
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (geneId (field GeneList GeneID))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "human")
                        (string=? species "mouse")))
               (resource-link
                'gnc:aba_link
                "http://mouse.brain-map.org/search/show?search_type=gene&search_term="
                (if (string=? species "mouse")
                    (uri-encode (string-trim-both symbol))
                    geneId))
               "")))
    ;; Rat Genome DB
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "mouse")
                        (string=? species "human")))
               (resource-link
                'gnc:rgd_link
                "https://rgd.mcw.edu/rgdweb/elasticResults.html?term="
                (uri-encode (string-trim-both symbol))
                "&category=Gene&species="
                (string-capitalize species))
               "")))
    ;; BioGPS
    (set dct:references
         (let ((geneId (field GeneList GeneID))
               (species (field Species Name)))
           (if (and (not (string-blank? geneId))
                    (not (string-blank? species))
                    (or (string=? species "mouse")
                        (string=? species "rat")
                        (string=? species "human")))
               (resource-link 'gnc:biogps_link
                              "http://biogps.org/?org="
                              species
                              "#goto=genereport&id="
                              geneId)
               "")))
    ;; Gemma
    (set dct:references
         (let ((geneId (field GeneList GeneID)))
           (if (not (string-blank? geneId))
               (resource-link
                'gnc:gemma_link
                "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid="
                geneId)
               "")))
    ;; GeneMANIA: path is /search/<species-slug>/<symbol>.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (species (lower-case-and-replace-spaces
                         (field Species FullName))))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "mus-musculus")
                        (string=? species "rattus-norvegicus")
                        (string=? species "homo-sapiens")))
               (resource-link 'gnc:genemania_link
                              "https://genemania.org/search"
                              "/" species
                              "/" (uri-encode (string-trim-both symbol)))
               "")))
    ;; PANTHER
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (resource-link
                'gnc:panther_link
                "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                (uri-encode (string-trim-both symbol)))
               "")))
    ;; STRING
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (resource-link
                'gnc:string_link
                "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                (uri-encode (string-trim-both symbol)))
               "")))
    ;; GTEx Portal
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (resource-link 'gnc:gtex_link
                              "https://www.gtexportal.org/home/gene/"
                              (uri-encode (string-trim-both symbol)))
               "")))
    ;; Human Protein Atlas
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (resource-link 'gnc:protein_atlas_link
                              "http://www.proteinatlas.org/search/"
                              (uri-encode (string-trim-both symbol)))
               "")))
    (set gnt:chromosome (field GeneList Chromosome))
    (set gnt:tx_start
         (annotate-field (field GeneList TxStart) '^^xsd:double))
    (set gnt:tx_end
         (annotate-field (field GeneList TxEnd) '^^xsd:double))
    (set gnt:strand (string-trim-both (field GeneList Strand)))
    (set gnt:belongs_to_species
         (string->identifier
          "" (remap-species-identifiers (field Species FullName))))
    (set gnt:transcript
         (ontology 'transcript: (string-trim-both (field GeneList NM_ID))))
    (set gnc:has_kg_id (string-trim-both (field GeneList kgID)))
    (set gnc:has_unigen_id (string-trim-both (field GeneList UnigenID)))
    (set gnt:has_protein_id (string-trim-both (field GeneList ProteinID)))
    (set gnt:has_align_id (string-trim-both (field GeneList AlignID)))
    (set gnt:has_rgd_id (field ("IFNULL(RGD_ID, '')" RGD_ID)))))

;; Genes from the GeneList_rn33 table; every row is attributed to
;; gn:Rattus_norvegicus.
(define-transformer genelist-rn33
  (tables (GeneList_rn33))
  (triples
      ;; Subject: the row id, converted to a string when it is a number.
      (let ([gene-uid (field GeneList_rn33 id GENE_UID)])
        (string->identifier
         "gene_rn33"
         (if (number? gene-uid)
             (number->string gene-uid)
             gene-uid)
         #:separator "_"))
    (set rdf:type 'gnc:gene)
    (set gnt:belongs_to_species 'gn:Rattus_norvegicus)
    (set gnt:gene_symbol
         (string-trim-both (field GeneList_rn33 geneSymbol)))
    (set gnt:chromosome (field GeneList_rn33 chromosome))
    (set gnt:tx_start
         (annotate-field (field GeneList_rn33 txStart) '^^xsd:double))
    (set gnt:tx_end
         (annotate-field (field GeneList_rn33 txEnd) '^^xsd:double))
    (set gnt:strand (string-trim-both (field GeneList_rn33 strand)))
    (set gnt:transcript
         (ontology 'transcript:
                   (string-trim-both (field GeneList_rn33 NM_ID))))
    (set gnc:has_kg_id (string-trim-both (field GeneList_rn33 kgID)))
    ;; PANTHER.  Uses gnc:panther_link (the class declared in the genelist
    ;; schema) and URI-encodes the symbol like the other links.
    (set dct:references
         (let ((symbol (field GeneList_rn33 geneSymbol)))
           (if (not (string-blank? symbol))
               (resource-link
                'gnc:panther_link
                "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                (uri-encode (string-trim-both symbol)))
               "")))
    ;; EBI GWAS (symbol is now URI-encoded).
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (resource-link 'gnc:ebi_gwas_link
                              "https://www.ebi.ac.uk/gwas/search?query="
                              (uri-encode symbol))
               "")))
    ;; STRING
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (resource-link
                'gnc:string_link
                "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                (uri-encode symbol))
               "")))
    ;; GTEx Portal
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (resource-link 'gnc:gtex_link
                              "https://www.gtexportal.org/home/gene/"
                              (uri-encode symbol))
               "")))
    ;; Human Protein Atlas
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (resource-link 'gnc:protein_atlas_link
                              "http://www.proteinatlas.org/search/"
                              (uri-encode symbol))
               "")))))

;; Entry point: parse the command line, read the connection settings and run
;; both transformers.
(let* ((option-spec
        ;; --settings is required: its value is opened as a file below, and
        ;; the #f default would otherwise fail inside `call-with-input-file'.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings read)))
  (with-documentation
   (name "Gene Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; NOTE(review): every prefix below maps to an empty IRI string -- confirm
   ;; this is intended and not the result of the IRIs having been stripped.
   (prefixes
    '(("gn:" "")
      ("probeset:" "")
      ("gnc:" "")
      ("gnt:" "")
      ("rdf:" "")
      ("rdfs:" "")
      ("dct:" "")
      ("owl:" "")
      ("xsd:" "")
      ("qb:" "")
      ("gene:" "")
      ("sdmx-measure:" "")
      ("transcript:" "")
      ("skos:" "")))
   (inputs
    (list genelist-rn33
          genelist))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))