#! /usr/bin/env guile
!#

(use-modules (srfi srfi-1)
             (srfi srfi-26)
             (rnrs bytevectors)
             (ice-9 format)
             (ice-9 getopt-long)
             (ice-9 match)
             (ice-9 regex)
             (transform strings)
             (transform sql)
             (transform triples)
             (transform special-forms))



;; Transformer for the GeneRIF rows that were entered through GeneNetwork.
;;
;; Source rows: GeneRIF left-joined with Species, GeneRIFXRef and
;; GeneCategory.  Only rows with display > 0 and a non-NULL comment are
;; selected, grouped by (Id, versionId, symbol) so that all category
;; names of one entry arrive in a single GROUP_CONCAT column.
;;
;; Each row is keyed by the subject "gn:wiki-<Id>-<versionId>".
(define-transformer gn-genewiki-entries
  (tables (GeneRIF
           (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
           (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
          "WHERE GeneRIF.display > 0 AND GeneRIF.comment IS NOT NULL
GROUP BY GeneRIF.Id, GeneRIF.versionId, GeneRIF.symbol")
  (schema-triples
   (gnc:GeneWikiEntry a rdfs:Class)
   (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   ;; NOTE(review): gnt:initial and gnt:reason are declared as
   ;; owl:ObjectProperty, yet the triples below give them plain string
   ;; values -- confirm whether owl:DatatypeProperty was intended.
   (gnt:initial a owl:ObjectProperty)
   (gnt:initial rdfs:domain gnc:GeneWikiEntry)
   (gnt:initial skos:definition "Optional user or project code or your initials")
   (gnt:reason a owl:ObjectProperty)
   (gnt:reason rdfs:domain gnc:GeneWikiEntry)
   (gnt:reason skos:definition "The reason why this resource was modified")
   (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
   ;; NOTE(review): the schema names gnt:geneSymbol while the triples
   ;; below emit gnt:symbol -- confirm which predicate is canonical.
   (gnt:geneSymbol rdfs:domain gnc:GNWikiEntry))
  (triples
      (format
       #f "gn:wiki-~a-~a"
       (field GeneRIF Id)
       (field GeneRIF versionId))
    ;; The comment becomes a single-quoted, English-tagged literal, so
    ;; embedded single quotes are backslash-escaped.
    (set rdfs:label (string->symbol
                     (format #f "'~a'@en"
                             (replace-substrings
                              (sanitize-rdf-string
                               (field GeneRIF comment))
                              '(("'" . "\\'"))))))
    (set rdf:type 'gnc:GNWikiEntry)
    (set gnt:symbol (field GeneRIF symbol))
    (set gnt:belongsToSpecies (string->identifier
                               ""
                               (remap-species-identifiers (field Species Fullname))
                               #:separator ""
                               #:proc string-capitalize-first))
    ;; xsd:dateTime is case-sensitive and its lexical form needs the 'T'
    ;; separator, so the timestamp is formatted in SQL as
    ;; YYYY-MM-DDTHH:MM:SS rather than cast to MySQL's default
    ;; "YYYY-MM-DD HH:MM:SS" text.
    (set dct:created
         (string->symbol
          (format #f "~s^^xsd:dateTime"
                  (field
                   ("DATE_FORMAT(createtime, '%Y-%m-%dT%T')" EntryCreateTime)))))
    ;; PubMed_ID holds space-separated ids; blank fragments map to "".
    (multiset dct:references
              (map (lambda (pmid)
                     (match pmid
                       ((? string-blank?) "")
                       (p (string->symbol
                           (format #f "pubmed:~a" (string-trim-both p))))))
                   (string-split (field GeneRIF PubMed_ID PMID)
                                 #\space)))
    ;; Non-blank e-mail / URL values are wrapped in angle brackets and
    ;; emitted as symbols; blank ones become the empty string.
    (set foaf:mbox
         (match (sanitize-rdf-string (field GeneRIF email))
           ((? string-blank?) "")
           (mbox (string->symbol
                  (format #f "<~a>" mbox)))))
    (set dct:identifier (annotate-field (format #f "~s" (field GeneRIF Id))
                                        '^^xsd:integer))
    (set foaf:homepage
         (match (sanitize-rdf-string (field GeneRIF weburl))
           ((? string-blank?) "")
           (homepage (string->symbol
                      (format #f "<~a>" homepage)))))
    (set dct:hasVersion (annotate-field (format #f "~s" (field GeneRIF versionId))
                                        '^^xsd:integer))
    (set gnt:initial (sanitize-rdf-string (field GeneRIF initial)))
    (set gnt:reason (field GeneRIF reason))
    ;; Category names were concatenated with ';' in SQL; split them back.
    (multiset gnt:belongsToCategory
              (string-split
               (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR ';')"
                       GeneCategory))
               #\;))))

;; Transformer for the GeneRIF rows obtained from NCBI (table
;; GeneRIF_BASIC left-joined with Species on SpeciesId).
;;
;; Each row is keyed by the subject
;; "gn:rif-<GeneId>-<PubMed_ID>-<createtime>-<VersionId>".
;;
;; All predicates of an entry are packed into the object of a single
;; rdf:type statement: the object symbol starts with "gnc:NCBIWikiEntry"
;; and continues with " ;\n\t<predicate> <object>" lines.
;; NOTE(review): presumably this is done so the whole entry is emitted by
;; one `set' form -- confirm against the `triples' implementation.
(define-transformer ncbi-genewiki-entries
  (tables (GeneRIF_BASIC
           (left-join Species "USING (SpeciesId)")))
  (schema-triples
   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI"))
  (triples
      (format
       #f "gn:rif-~a-~a-~a-~a"
       (field GeneRIF_BASIC GeneId)
       (field GeneRIF_BASIC PubMed_ID)
       (field
        ("DATE_FORMAT(createtime, '%Y-%m-%dT%T')" CreateTime))
       (field GeneRIF_BASIC VersionId))
    (set rdf:type
         (let* (;; Single-quoted, English-tagged literal: backslashes are
                ;; escaped first so the escapes added afterwards are not
                ;; themselves doubled.
                (comment (format #f "'~a'@en"
                                 (replace-substrings
                                  (sanitize-rdf-string
                                   (field GeneRIF_BASIC comment))
                                  '(("\\" . "\\\\")
                                    ("\n" . "\\n")
                                    ("\r" . "\\r")
                                    ("'" . "\\'")))))
                ;; xsd:dateTime is case-sensitive and requires the 'T'
                ;; separator, hence DATE_FORMAT instead of a plain CAST.
                (create-time (format #f "~s^^xsd:dateTime"
                                     (field
                                      ("DATE_FORMAT(createtime, '%Y-%m-%dT%T')" EntryCreateTime))))
                (symbol (field GeneRIF_BASIC symbol))
                (species (string->identifier
                          ""
                          (remap-species-identifiers (field Species Fullname))
                          #:separator ""
                          #:proc string-capitalize-first))
                (gene-id (field GeneRIF_BASIC GeneId))
                (taxon-id (field GeneRIF_BASIC TaxID TaxonomicId))
                (pmid (field GeneRIF_BASIC PubMed_ID))
                (version-id (field GeneRIF_BASIC versionId)))
           (string->symbol
            (string-append
             (format #f "gnc:NCBIWikiEntry ;\n")
             (format #f "\trdfs:label ~a ;\n" comment)
             (format #f "\tgnt:belongsToSpecies ~a ;\n" species)
             (format #f "\tgnt:symbol ~s ;\n" symbol)
             (format #f "\tgnt:hasGeneId generif:~a ;\n" gene-id)
             ;; skos:notation is only emitted when the taxon id is a number.
             (match taxon-id
               ((? number?)
                (format #f "\tskos:notation taxon:~a ;\n" taxon-id))
               (_ ""))
             (format #f "\tdct:hasVersion \"~a\"^^xsd:integer ;\n" version-id)
             (format #f "\tdct:references pubmed:~a ;\n" pmid)
             ;; Last predicate: no trailing " ;" is appended here.
             (format #f "\tdct:created ~a" create-time)))))))



;; Script entry point: parse the command line, read the connection
;; settings and run both GeneRIF transformers.
;;
;; Options:
;;   -s, --settings FILE       file whose first datum is `read' and used as
;;                             the connection settings (required)
;;   -o, --output FILE         passed on as the #:rdf output
;;   -d, --documentation FILE  passed on as the #:documentation output
(let* ((option-spec
        ;; `settings' is required: without it `call-with-input-file' would
        ;; be handed #f and fail with an unhelpful wrong-type error, so let
        ;; getopt-long report the missing option instead.
        '((settings (single-char #\s) (value #t) (required? #t))
          (output (single-char #\o) (value #t))
          (documentation (single-char #\d) (value #t))))
       (options (getopt-long (command-line) option-spec))
       (settings (option-ref options 'settings #f))
       (output (option-ref options 'output #f))
       (documentation (option-ref options 'documentation #f))
       (%connection-settings
        (call-with-input-file settings
          read)))

  (with-documentation
   (name "GeneRIF Metadata")
   (connection %connection-settings)
   (table-metadata? #f)
   ;; Every prefix used by the two transformers above is declared here.
   (prefixes
    '(("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
      ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>")
      ("gn:" "<http://genenetwork.org/id/>")
      ("gnc:" "<http://genenetwork.org/category/>")
      ("gnt:" "<http://genenetwork.org/term/>")
      ("dct:" "<http://purl.org/dc/terms/>")
      ("foaf:" "<http://xmlns.com/foaf/0.1/>")
      ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
      ("taxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>")
      ("generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
      ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
      ("owl:" "<http://www.w3.org/2002/07/owl#>")))
   (inputs
    (list
     gn-genewiki-entries
     ncbi-genewiki-entries))
   (outputs
    `(#:documentation ,documentation
      #:rdf ,output))))