#! /usr/bin/env guile
!#

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (rnrs bytevectors)
             (ice-9 format)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))



;; Return a copy of EMAIL with every space character (#\space) taken
;; out; all other characters are kept in their original order.
(define (fix-email-id email)
  (string-filter (lambda (ch)
                   (not (char=? ch #\space)))
                 email))

;; Build the "investigator" identifier for a single investigator
;; record.
;;
;; The email ID on its own would be the natural key, but exactly one
;; record ("Evan Williams") has no email ID.  To cover that record the
;; key is made of the first name, the last name and the email ID (with
;; spaces removed by fix-email-id), joined with "_".  The better fix
;; would be to look up Evan Williams' email ID and store it in the
;; database.
(define (investigator-attributes->id first-name last-name email)
  (let* ((cleaned-email (fix-email-id email))
         (key-parts (list first-name last-name cleaned-email))
         (key (string-join key-parts "_")))
    (string->identifier "investigator" key)))



;; Transformer over the GeneRIF_BASIC table.  Rows are grouped with
;; "GROUP BY BINARY symbol" (BINARY makes MySQL compare the symbols
;; byte-wise, i.e. case-sensitively), and each resulting row produces
;; a subject identifier derived from the symbol together with an
;; rdfs:label carrying the symbol as stored.
(define-transformer genewiki-symbols
  (tables (GeneRIF_BASIC)
          "GROUP BY BINARY symbol")
  (triples
      ;; Subject: string->identifier is called with the prefix
      ;; "symbol" and the symbol in which every character outside
      ;; [A-Za-z0-9:] has been replaced by "_".
      ;; NOTE(review): the identity #:proc presumably overrides a
      ;; default transformation in string->identifier -- confirm
      ;; against (transform strings).
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF_BASIC symbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    ;; The label keeps the unmodified symbol text.
    (set rdfs:label
         (field GeneRIF_BASIC symbol))))

;; Some symbols exist in the GeneRIF table that don't exist in the
;; GeneRIF_BASIC table.
;; Transformer over the GeneRIF table, restricted to symbols that do
;; not occur in GeneRIF_BASIC.  Rows are grouped with
;; "GROUP BY BINARY symbol"; each resulting row produces a subject
;; identifier derived from the symbol and an rdfs:label carrying the
;; symbol as stored.
;;
;; NOTE(review): in SQL, "x NOT IN (subquery)" matches no rows at all
;; when the subquery returns a NULL -- confirm GeneRIF_BASIC.symbol is
;; never NULL.
(define-transformer generif-symbols
  (tables (GeneRIF)
          "WHERE symbol NOT IN (SELECT symbol from GeneRIF_BASIC) GROUP BY BINARY symbol")
  (triples
      ;; Subject: string->identifier is called with the prefix
      ;; "symbol" and the symbol in which every character outside
      ;; [A-Za-z0-9:] has been replaced by "_".
      ;; NOTE(review): the identity #:proc presumably overrides a
      ;; default transformation in string->identifier -- confirm
      ;; against (transform strings).
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF symbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    ;; The label keeps the unmodified symbol text.
    (set rdfs:label
         (field GeneRIF symbol))))

;; Transformer for GeneRIF entries that were entered from GeneNetwork.
;;
;; Source rows come from GeneRIF, left-joined with Species,
;; GeneRIFXRef, GeneCategory and Investigators.  Only rows with
;; display > 0, VersionId = 0 and a non-NULL comment are selected, and
;; they are grouped by comment and (byte-wise) symbol.
;;
;; For every row the subject is the "symbol" identifier of the gene
;; symbol and the object of rdfs:comment is a bracketed
;; "[ predicate object ; ... ]" description of the entry, assembled as
;; a string below.
(define-transformer gn-genewiki-entries
  (tables (GeneRIF
           (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
           (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol")
  (schema-triples
   (gnc:GeneWikiEntry a rdfs:Class)
   (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
   (gnt:geneSymbol rdfs:domain gnc:GNWikiEntry))
  (triples
      ;; Subject: the symbol with every character outside
      ;; [A-Za-z0-9:] replaced by "_", passed to string->identifier
      ;; with the prefix "symbol" and an identity #:proc.
      (string->identifier
       "symbol"
       (regexp-substitute/global
        #f "[^A-Za-z0-9:]"
        (field GeneRIF symbol)
        'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdfs:comment
         (let* ([generif-comment (sanitize-rdf-string (field GeneRIF comment))]
                [create-time (field GeneRIF createtime EntryCreateTime)]
                [pmid (field GeneRIF PubMed_ID PMID)]
                [web-url (field GeneRIF weburl)]
                [species (string->identifier
                          ""
                          (remap-species-identifiers (field Species Fullname))
                          #:separator ""
                          #:proc string-capitalize-first)]
                ;; All category names of the grouped rows, concatenated
                ;; by MySQL with "$$" and split again here; #f and
                ;; empty-string entries are dropped.
                [categories
                 (remove (lambda (x)
                           (or (eq? x #f)
                               (and (string? x)
                                    (string-null? x))))
                         (remove-duplicates
                          (string-split-substring
                           (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
                                   GeneCategory))
                           "$$")))])
           ;; The assembled text is turned into a symbol.
           ;; NOTE(review): presumably so that the triple writer emits
           ;; it verbatim instead of as a quoted literal -- confirm
           ;; against (transform triples).
           (string->symbol
            (string-append
             "[ "
             (format #f "rdf:type gnc:GNWikiEntry ; ")
             ;; A string-valued `species' is treated as "no species"
             ;; and omitted.
             (if (string? species)
                 ""
                 (format #f "gnt:belongsToSpecies ~a ; "
                         species))
             (format #f "rdfs:comment ~s^^xsd:string ; "
                     generif-comment)
             ;; A string-valued create-time is treated as "no
             ;; timestamp"; any other value is formatted with
             ;; time-unix->string.  XSD datatype names are
             ;; case-sensitive, so the datatype is xsd:dateTime.
             (if (string? create-time)
                 ""
                 (format #f "dct:created ~s^^xsd:dateTime ; "
                         (time-unix->string
                          create-time "~5")))
             ;; PubMed_ID may hold several space-separated IDs; one
             ;; dct:references is emitted per ID.
             (if (and (string? pmid) (not (string-null? pmid)))
                 (format #f
                         "~{dct:references pubmed:~a ; ~}"
                         (string-split pmid #\space))
                 "")
             ;; A creator is only emitted when the entry has a
             ;; non-blank email and the join found an investigator
             ;; with a non-empty email.
             (if (and (not (string-null?
                            (string-trim-both (field GeneRIF email))))
                      (not (string-null? (field Investigators Email))))
                 (format #f "dct:creator ~a ; "
                         (investigator-attributes->id
                          (field Investigators FirstName)
                          (field Investigators LastName)
                          (field Investigators Email)))
                 "")
             (if (not (null? categories))
                 (format #f
                         "~{gnt:belongsToCategory ~s ; ~}"
                         categories)
                 "")
             (if (and (string? web-url) (not (string-null? web-url)))
                 (format #f "foaf:homepage ~s ; "
                         web-url)
                 "")
             " ] "))))))

;; Transformer for GeneRIF entries obtained from NCBI.
;;
;; Source rows come from GeneRIF_BASIC left-joined with Species on
;; SpeciesId.  Rows with a NULL or blank comment, or a blank symbol,
;; are excluded; the rest are grouped by comment, createtime,
;; VersionId, SpeciesId and TaxID.
;;
;; For every row the subject is the "symbol" identifier of the gene
;; symbol and the object of rdfs:comment is a bracketed
;; "[ predicate object ; ... ]" description of the entry, assembled as
;; a string below.
(define-transformer ncbi-genewiki-entries
  (tables (GeneRIF_BASIC
           (left-join Species "USING (SpeciesId)"))
          "WHERE GeneRIF_BASIC.comment IS NOT NULL AND TRIM(GeneRIF_BASIC.comment) != '' AND TRIM(GeneRIF_BASIC.symbol) != '' GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID")
  (schema-triples
   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")
   (gnt:hasVersionId a owl:ObjectProperty)
   (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry)
   (gnt:hasVersionId skos:definition "The VersionId of this resource"))
  (triples
      ;; Subject: the symbol with every character outside
      ;; [A-Za-z0-9:] replaced by "_", passed to string->identifier
      ;; with the prefix "symbol" and an identity #:proc.
      (string->identifier
       "symbol"
       (regexp-substitute/global #f "[^A-Za-z0-9:]"
                                 (field GeneRIF_BASIC symbol GeneRIFSymbol)
                                 'pre "_" 'post)
       #:proc (lambda (x) x))
    (set rdfs:comment
         (let ([ncbi-comment (sanitize-rdf-string (field GeneRIF_BASIC comment))]
               [species-name
                (string->identifier
                 ""
                 (remap-species-identifiers (field Species Fullname SpeciesFullName))
                 #:separator ""
                 #:proc string-capitalize-first)]
               [taxonomic-id (field GeneRIF_BASIC TaxID TaxonomicId)]
               [create-time (field GeneRIF_BASIC createtime EntryCreateTime)]
               [pmid (field GeneRIF_BASIC PubMed_ID PMID)]
               [gene-id (field GeneRIF_BASIC GeneId)]
               [version-id (field GeneRIF_BASIC VersionId)])
           ;; The assembled text is turned into a symbol.
           ;; NOTE(review): presumably so that the triple writer emits
           ;; it verbatim instead of as a quoted literal -- confirm
           ;; against (transform triples).
           (string->symbol
            (string-append
             "[ "
             (format #f "rdf:type gnc:NCBIWikiEntry ; ")
             (format #f "rdfs:comment ~s^^xsd:string ; "
                     ncbi-comment)
             ;; Unlike the taxon below, the species is emitted
             ;; unconditionally.
             (format #f "gnt:belongsToSpecies ~a ; "
                     species-name)
             ;; The taxon notation is skipped when TaxID is #f.
             (if (eq? #f taxonomic-id)
                 ""
                 (format #f "skos:notation taxon:~a ; "
                         taxonomic-id))
             (format #f "gnt:hasGeneId generif:~a ; "
                     gene-id)
             (format #f "gnt:hasVersionId '~a'^^xsd:integer ; "
                     version-id)
             ;; PubMed_ID may hold several space-separated IDs; one
             ;; dct:references is emitted per ID.
             (if (and (string? pmid) (not (string-null? pmid)))
                 (format #f
                         "~{dct:references pubmed:~a ; ~}"
                         (string-split pmid #\space))
                 "")
             ;; A string-valued create-time is treated as "no
             ;; timestamp"; any other value is formatted with
             ;; time-unix->string.  XSD datatype names are
             ;; case-sensitive, so the datatype is xsd:dateTime.
             (if (string? create-time)
                 ""
                 (format #f "dct:created ~s^^xsd:dateTime ; "
                         (time-unix->string
                          create-time "~5")))
             " ]"))))))



;; Script entry point: parse the command line, read the connection
;; settings and run the four GeneRIF transformers through
;; with-documentation.
;;
;; Options:
;;   -s, --settings FILE       required; the first datum read from
;;                             FILE is used as the connection settings
;;   -o, --output FILE         handed to with-documentation as #:rdf
;;   -d, --documentation FILE  handed to with-documentation as
;;                             #:documentation
(let* ((option-spec
        ;; --settings is marked required so that getopt-long reports a
        ;; missing option itself; otherwise `settings' would be #f and
        ;; call-with-input-file below would fail with an unrelated
        ;; type error.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings
          read)))

  (with-documentation
   (name "GeneRIF Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   (prefixes
    '(("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
      ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>")
      ("gn:" "<http://genenetwork.org/id/>")
      ("gnc:" "<http://genenetwork.org/category/>")
      ("gnt:" "<http://genenetwork.org/term/>")
      ("dct:" "<http://purl.org/dc/terms/>")
      ("foaf:" "<http://xmlns.com/foaf/0.1/>")
      ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
      ("taxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>")
      ("generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
      ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
      ("owl:" "<http://www.w3.org/2002/07/owl#>")))
   (inputs
    (list
     genewiki-symbols
     generif-symbols
     gn-genewiki-entries
     ncbi-genewiki-entries))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))