#! /usr/bin/env guile
!#

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (ice-9 format)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms)
             (web uri))


;; Transformer for the GeneList table, left-joined with Species on
;; SpeciesId.
;;
;; `schema-triples' lists the vocabulary statements that accompany the
;; data; `triples' gives the subject expression followed by one `set'
;; form per predicate.
;;
;; The external resource links (dct:references) are built with `format'
;; and turned into a symbol.  In the format strings `~N@*' jumps to
;; argument N and `~a' prints it, so each string prints the full URL
;; twice: once as `<URL> .' and once as `<URL> a gnc:<someLink>'.
;; NOTE(review): this presumably relies on the triple writer emitting
;; symbols verbatim, so that the second statement types the URL --
;; confirm against (transform triples).
(define-transformer genelist
  (tables (GeneList
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:GeneSymbol a rdfs:Class)
   (gnc:GeneSymbol rdfs:label "A gene symbol")
   (gnt:gene rdfs:domain gnc:GeneSymbol)
   (gnt:belongsToSpecies rdfs:domain gnc:GeneSymbol)
   (gnc:Gene a rdfs:Class)
   (gnc:Gene rdfs:label "Gene")
   (gnt:hasGeneId a owl:ObjectProperty)
   (gnt:hasGeneId rdfs:domain gnc:NCBIWikiEntry)
   (gnt:hasGeneId skos:definition "The GeneId of this resource")
   ;; Property statements use the gnt: prefix throughout so that the
   ;; domain/comment triples describe the same term that is declared as
   ;; an owl:ObjectProperty and that the `set' forms below emit.
   (gnt:transcript rdfs:domain gnc:GeneSymbol)
   (gnt:transcript a owl:ObjectProperty)
   (gnt:transcript rdfs:comment "The gene transcript of this resource")
   ;; NOTE(review): rdfs:Class is a class, not a property; these
   ;; statements probably want `rdfs:subClassOf' (or `a') as the
   ;; predicate.  Left unchanged pending confirmation of the intent.
   (gnc:ebiGwasLink rdfs:Class gnc:ResourceLink)
   (gnc:ebiGwasLink rdfs:label "EBI GWAS")
   (gnc:ebiGwasLink rdfs:comment "EBI GWAS")
   (gnc:proteinAtlasLink rdfs:Class gnc:ResourceLink)
   (gnc:proteinAtlasLink rdfs:label "Protein Atlas")
   (gnc:proteinAtlasLink rdfs:comment "Human Protein Atlas")
   (gnc:genemaniaLink rdfs:Class gnc:ResourceLink)
   (gnc:genemaniaLink rdfs:label "GeneMANIA")
   (gnc:genemaniaLink rdfs:comment "GeneMANIA")
   (gnc:gemmaLink rdfs:Class gnc:ResourceLink)
   (gnc:gemmaLink rdfs:label "Gemma")
   (gnc:gemmaLink rdfs:comment "Meta-analysis of gene expression data")
   (gnc:biogpsLink rdfs:Class gnc:ResourceLink)
   (gnc:biogpsLink rdfs:label "BioGPS")
   (gnc:biogpsLink rdfs:comment "Expression across many tissues and cell types")
   (gnc:abaLink rdfs:Class gnc:ResourceLink)
   (gnc:abaLink rdfs:label "ABA")
   (gnc:abaLink rdfs:comment "Allen Brain Atlas")
   (gnc:pantherLink rdfs:Class gnc:ResourceLink)
   (gnc:pantherLink rdfs:label "PANTHER")
   (gnc:pantherLink rdfs:comment "Gene and protein data resources from Celera-ABI")
   (gnc:stringLink rdfs:Class gnc:ResourceLink)
   (gnc:stringLink rdfs:label "STRING")
   (gnc:stringLink rdfs:comment "Protein interactions: known and inferred")
   (gnc:gtexLink rdfs:Class gnc:ResourceLink)
   (gnc:gtexLink rdfs:label "GTEx Portal")
   (gnc:gtexLink rdfs:comment "GTEx Portal")
   (gnc:rgdLink rdfs:Class gnc:ResourceLink)
   (gnc:rgdLink rdfs:label "Rat Genome DB")
   (gnc:rgdLink rdfs:comment "Rat Genome DB")
   (gnt:hasKgID rdfs:domain gnc:GeneSymbol)
   (gnt:hasKgID a owl:ObjectProperty)
   (gnt:hasKgID rdfs:comment "The kgID of this resource")
   (gnt:hasUnigenID rdfs:domain gnc:GeneSymbol)
   (gnt:hasUnigenID a owl:ObjectProperty)
   (gnt:hasUnigenID rdfs:comment "The UnigenID of this resource")
   (gnt:hasProteinID rdfs:domain gnc:GeneSymbol)
   (gnt:hasProteinID a owl:ObjectProperty)
   (gnt:hasProteinID rdfs:comment "The ProteinID of this resource")
   (gnt:hasAlignID rdfs:domain gnc:GeneSymbol)
   (gnt:hasAlignID a owl:ObjectProperty)
   (gnt:hasAlignID rdfs:comment "The AlignID of this resource")
   (gnt:TxEnd rdfs:range xsd:double)
   (gnt:TxStart rdfs:range xsd:double)
   (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
  (triples
      ;; Subject: "gene" identifier built from GeneSymbol, GeneID and
      ;; AlignID joined with '_'; every character outside
      ;; [A-Za-z0-9:] is replaced by '_'.  The identity #:proc passes
      ;; the cleaned string through unchanged.
      (string->identifier
       "gene" (regexp-substitute/global
               #f "[^A-Za-z0-9:]"
               (string-trim-both
                (field ("CONCAT_WS('_', GeneSymbol, GeneID, AlignID)" GENE_UID)))
               'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdf:type 'gnc:Gene)
    (set gnt:geneSymbol (field GeneList GeneSymbol))
    (set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
    (set gnt:hasGeneId (ontology 'gene: (field GeneList GeneId)))
    ;; EBI GWAS: any row with a non-blank symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.ebi.ac.uk/gwas/search?query="
                        (uri-encode
                         (string-trim-both symbol))
                        "a gnc:ebiGwasLink"))
               "")))
    ;; Allen Brain Atlas: human and mouse only.  Mouse rows search by
    ;; (URI-encoded) symbol, human rows by GeneID.  The search term is
    ;; argument 1 and must be part of the URL; argument 2 is the type
    ;; statement.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (geneId (field GeneList GeneID))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or (string=? species "human")
                        (string=? species "mouse")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://mouse.brain-map.org/search/show?search_type=gene&search_term="
                        (if (string=? species "mouse")
                            (uri-encode
                             (string-trim-both symbol))
                            geneId)
                        "a gnc:abaLink"))
               "")))
    ;; Rat Genome DB: mouse, rat and human.  The species name is
    ;; capitalized for the `species=' query parameter.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (species (field Species Name)))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or
                     (string=? species "mouse")
                     (string=? species "rat")
                     (string=? species "human")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                        "https://rgd.mcw.edu/rgdweb/elasticResults.html?term="
                        (uri-encode
                         (string-trim-both symbol))
                        "&category=Gene&species="
                        (string-capitalize species)
                        "a gnc:rgdLink"))
               "")))
    ;; BioGPS: mouse, rat and human rows with a non-blank GeneID.
    (set dct:references
         (let ((geneId (field GeneList GeneID))
               (species (field Species Name)))
           (if (and (not (string-blank? geneId))
                    (not (string-blank? species))
                    (or
                     (string=? species "mouse")
                     (string=? species "rat")
                     (string=? species "human")))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a~2@*~a~3@*~a> .~%<~0@*~a~1@*~a~2@*~a~3@*~a> ~4@*~a"
                        "http://biogps.org/?org="
                        species
                        "#goto=genereport&id="
                        geneId
                        "a gnc:biogpsLink"))
               "")))
    ;; Gemma: any row with a non-blank GeneID.
    (set dct:references
         (let ((geneId (field GeneList GeneID)))
           (if (not (string-blank? geneId))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.chibi.ubc.ca/Gemma/gene/showGene.html?ncbiid="
                        geneId
                        "a gnc:gemmaLink"))
               "")))
    ;; GeneMANIA: keyed on the species' FullName after
    ;; `lower-case-and-replace-spaces'; only the three listed
    ;; organisms get a link.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol))
               (species (lower-case-and-replace-spaces
                         (field Species FullName))))
           (if (and (not (string-blank? symbol))
                    (not (string-blank? species))
                    (or
                     (string=? species "mus-musculus")
                     (string=? species "rattus-norvegicus")
                     (string=? species "homo-sapiens")))
               (string->symbol
                (format #f "<~0@*~a/~1@*~a/~2@*~a> .~%<~0@*~a/~1@*~a/~2@*~a> ~3@*~a"
                        "https://genemania.org/search"
                        species
                        (uri-encode
                         (string-trim-both symbol))
                        "a gnc:genemaniaLink"))
               "")))
    ;; PANTHER: any row with a non-blank symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                        (uri-encode
                         (string-trim-both symbol))
                        "a gnc:pantherLink"))
               "")))
    ;; STRING: any row with a non-blank symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                        (uri-encode
                         (string-trim-both symbol))
                        "a gnc:stringLink"))
               "")))
    ;; GTEx Portal: any row with a non-blank symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.gtexportal.org/home/gene/"
                        (uri-encode
                         (string-trim-both symbol))
                        "a gnc:gtexLink"))
               "")))
    ;; Human Protein Atlas: any row with a non-blank symbol.
    (set dct:references
         (let ((symbol (field GeneList GeneSymbol)))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.proteinatlas.org/search/"
                        (uri-encode
                         (string-trim-both symbol))
                        "a gnc:proteinAtlasLink"))
               "")))
    (set gnt:chromosome (field GeneList Chromosome))
    (set gnt:TxStart (annotate-field
                      (field GeneList TxStart)
                      '^^xsd:double))
    (set gnt:TxEnd (annotate-field
                    (field GeneList TxEnd)
                    '^^xsd:double))
    (set gnt:Strand (string-trim-both (field GeneList Strand)))
    ;; Species identifier: remapped species name, first letter
    ;; capitalized, with an empty prefix and separator.
    (set
     gnt:belongsToSpecies
     (string->identifier
      ""
      (remap-species-identifiers
       (string-trim-both (field Species Name)))
      #:separator ""
      #:proc string-capitalize-first))
    (set
     gnt:transcript
     (ontology 'transcript:
               (string-trim-both (field GeneList NM_ID))))
    (set gnt:hasKgID (string-trim-both (field GeneList kgID)))
    (set gnt:hasUnigenID (string-trim-both (field GeneList UnigenID)))
    (set gnt:hasProteinID (string-trim-both (field GeneList ProteinID)))
    (set gnt:hasAlignID (string-trim-both (field GeneList AlignID)))
    ;; A NULL RGD_ID is selected as the empty string.
    (set gnt:hasRgdID
         (field ("IFNULL(RGD_ID, '')" RGD_ID)))))

;; Transformer for the GeneList_rn33 table.  Every resource is typed
;; gnc:Gene and assigned to gn:Rattus_norvegicus.
;;
;; The external resource links (dct:references) are built with `format'
;; and turned into a symbol.  In the format strings `~N@*' jumps to
;; argument N and `~a' prints it, so each string prints the full URL
;; twice: once as `<URL> .' and once as `<URL> a gnc:<someLink>'.
;; NOTE(review): this presumably relies on the triple writer emitting
;; symbols verbatim -- confirm against (transform triples).
(define-transformer genelist-rn33
  (tables (GeneList_rn33))
  (triples
      ;; Subject: "gene_rn33" identifier built from the row id, which
      ;; is converted to a string first when it arrives as a number.
      (let ([gene-uid (field GeneList_rn33 id GENE_UID)])
        (string->identifier
         "gene_rn33"
         (if (number? gene-uid)
             (number->string
              gene-uid)
             gene-uid)))
    (set rdf:type 'gnc:Gene)
    (set gnt:belongsToSpecies 'gn:Rattus_norvegicus)
    (set gnt:geneSymbol (string-trim-both (field GeneList_rn33 geneSymbol)))
    (set gnt:chromosome (field GeneList_rn33 chromosome))
    (set gnt:TxStart (annotate-field
                      (field GeneList_rn33 txStart)
                      '^^xsd:double))
    (set gnt:TxEnd (annotate-field
                    (field GeneList_rn33 txEnd)
                    '^^xsd:double))
    (set gnt:Strand (string-trim-both (field GeneList_rn33 strand)))
    (set
     gnt:transcript
     (ontology
      'transcript:
      (string-trim-both (field GeneList_rn33 NM_ID))))
    (set
     gnt:hasKgID
     (string-trim-both (field GeneList_rn33 kgID)))
    ;; PANTHER.  The class is spelled gnc:pantherLink (lower-case p),
    ;; matching the other link classes in this file, and the symbol is
    ;; URI-encoded like every other link so that reserved characters
    ;; cannot break the IRI.
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.pantherdb.org/genes/geneList.do?searchType=basic&fieldName=all&organism=all&listType=1&fieldValue="
                        (uri-encode symbol)
                        "a gnc:pantherLink"))
               "")))
    ;; EBI GWAS (symbol URI-encoded, see above).
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.ebi.ac.uk/gwas/search?query="
                        (uri-encode symbol)
                        "a gnc:ebiGwasLink"))
               "")))
    ;; STRING.
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://string-db.org/newstring_cgi/show_network_section.pl?identifier="
                        (uri-encode symbol)
                        "a gnc:stringLink"))
               "")))
    ;; GTEx Portal.
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "https://www.gtexportal.org/home/gene/"
                        (uri-encode symbol)
                        "a gnc:gtexLink"))
               "")))
    ;; Human Protein Atlas.
    (set dct:references
         (let ((symbol (string-trim-both (field GeneList_rn33 geneSymbol))))
           (if (not (string-blank? symbol))
               (string->symbol
                (format #f
                        "<~0@*~a~1@*~a> .~%<~0@*~a~1@*~a> ~2@*~a"
                        "http://www.proteinatlas.org/search/"
                        (uri-encode symbol)
                        "a gnc:proteinAtlasLink"))
               "")))))



;; Script entry point.
;;
;; Command-line options:
;;   -s, --settings FILE        file whose first datum is `read' and used
;;                              as the connection settings (required)
;;   -o, --output FILE          passed on as the #:rdf output
;;   -d, --documentation FILE   passed on as the #:documentation output
;;
;; The two transformers defined in this file are handed to
;; `with-documentation' together with the prefix table.
(let* ((option-spec
        ;; --settings is marked required because its value is passed
        ;; straight to `call-with-input-file'; without it the script
        ;; would fail on #f with an unhelpful wrong-type error instead
        ;; of getopt-long's "option must be specified" message.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       ;; Only the first datum of the settings file is read.
       (%connection-settings
        (call-with-input-file settings
          read)))
  (with-documentation
   (name "Gene Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   (prefixes
    '(("gn:" "<http://genenetwork.org/id/>")
      ("probeset:" "<http://genenetwork.org/probeset/>")
      ("gnc:" "<http://genenetwork.org/category/>")
      ("gnt:" "<http://genenetwork.org/term/>")
      ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
      ("dct:" "<http://purl.org/dc/terms/>")
      ("owl:" "<http://www.w3.org/2002/07/owl#>")
      ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
      ("qb:" "<http://purl.org/linked-data/cube#>")
      ("gene:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
      ("sdmx-measure:" "<http://purl.org/linked-data/sdmx/2009/measure#>")
      ("transcript:" "<https://portals.broadinstitute.org/gpp/public/trans/details?transName=>")
      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")))
   (inputs
    (list genelist-rn33
          genelist))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))