about summary refs log tree commit diff
path: root/examples
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-08-30 18:05:46 +0300
committer Munyoki Kilyungi 2023-08-30 18:16:10 +0300
commit b212b91f7f0454d64c86b85693d37783a42d5bc3 (patch)
tree b043930a7cff3e88973b94989df54f7139478b03 /examples
parent f3ede362e1d7d00022a6f9f74d7ca304014f07fe (diff)
download gn-transform-databases-b212b91f7f0454d64c86b85693d37783a42d5bc3.tar.gz
Remodel how GeneRIF metadata is transformed
* examples/generif.scm: Import (ice-9 format).
(genewiki-symbols): Transform symbols and their names only.  This way
there's no need to transform the symbol names again in the other
transformers, thereby preventing duplication.
(gn-genewiki-entries): Use format strings to create the comment blank-node.
(ncbi-genewiki-entries): Ditto.

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'examples')
-rwxr-xr-x examples/generif.scm 254
1 files changed, 135 insertions, 119 deletions
diff --git a/examples/generif.scm b/examples/generif.scm
index a5930ad..89e7fce 100755
--- a/examples/generif.scm
+++ b/examples/generif.scm
@@ -4,6 +4,7 @@
 (use-modules (srfi srfi-1)
              (srfi srfi-26)
              (rnrs bytevectors)
+             (ice-9 format)
              (ice-9 getopt-long)
              (ice-9 match)
              (ice-9 regex)
@@ -42,23 +43,17 @@
 
 
 (define-transformer genewiki-symbols
-  (tables (GeneRIF_BASIC
-           (left-join Species "USING (SpeciesId)"))
-          "GROUP BY GeneId ORDER BY BINARY symbol")
-  (schema-triples
-   (gnt:symbol rdfs:domain gnc:NCBIWikiEntry))
-  (triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
-    (multiset gnt:symbol (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol))
-                                       #\,))
-    (multiset xkos:classifiedUnder
-              (string-split
-               (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species))
-               #\,))
-    (multiset dct:relation
-              (map
-               (cut ontology 'ncbiTaxon: <>)
-               (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
-                             #\,)))))
+  (tables (GeneRIF_BASIC)
+          "GROUP BY BINARY symbol")
+  (triples
+      (string->identifier
+       "symbol"
+       (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                 (field GeneRIF_BASIC symbol)
+                                 'pre "_" 'post)
+       #:proc (lambda (x) x))
+    (set rdfs:label
+         (field GeneRIF_BASIC symbol))))
 
 (define-transformer gn-genewiki-entries
   (tables (GeneRIF
@@ -66,115 +61,136 @@
            (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
            (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
            (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
-          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, GeneRIF.createtime")
+          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, BINARY GeneRIF.symbol")
   (schema-triples
    (gnc:GeneWikiEntry a rdfs:Class)
    (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
    (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
    (gnt:symbol rdfs:domain gnc:GNWikiEntry))
   (triples
-    (string->identifier
-     "generif"
-     (make-version-3-uuid
-      (u8-list->bytevector
-       ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
-       '(107 167 184 17 157 173 17 209 128 180
-             0 192 79 212 48 200))
-      (format #f "~a~a~a~a"
-              (field Species FullName)
-              (field GeneRIF comment)
-              (field GeneRIF symbol)
-              (field GeneRIF createtime))
-      ""))
-    (set rdf:type 'gnc:GNWikiEntry)
-    (set rdfs:label (sanitize-rdf-string (field GeneRIF comment)))
-    (set gnt:symbol (field GeneRIF symbol))
-    (multiset gnt:belongsToCategory
-              (remove-duplicates
-               (string-split-substring
-                (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
-                        GeneCategory))
-                "$$")))
-    (set xkos:classifiedUnder
-         (string->identifier
-          ""
-          (remap-species-identifiers (field Species Fullname))
-          #:separator ""
-          #:proc string-capitalize-first))
-    (multiset dct:references
-              (map (lambda (x)
-                     (ontology 'pubmed: x))
-                   (string-split
-                    (let ((pmid (field
-                                 ("IFNULL(GeneRIF.PubMed_ID, '')"
-                                  PubMed_ID))))
-                      (if (number? pmid)
-                          (number->string pmid)
-                          pmid))
-                    #\space)))
-    (set dct:created
-         (let ((createtime (field GeneRIF createtime)))
-           (if (string? createtime)
-               ""
-               (annotate-field
-                (time-unix->string
-                 createtime "~5")
-                '^^xsd:datetime))))
-    (set dct:creator
-         (if (and (not (string-null?
-                        (string-trim-both (field GeneRIF email))))
-                  (not (string-null? (field Investigators Email))))
-             (investigator-attributes->id
-              (field Investigators FirstName)
-              (field Investigators LastName)
-              (field Investigators Email))
-             ""))
-    (set foaf:homepage (field GeneRIF weburl))))
+      (string->identifier
+       "symbol"
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field GeneRIF symbol)
+        'pre "_" 'post)
+       #:proc (lambda (x) x))
+    (set rdfs:comment
+         (let* ([generif-comment (sanitize-rdf-string (field GeneRIF comment))]
+                [create-time (field GeneRIF createtime EntryCreateTime)]
+                [pmid (field GeneRIF PubMed_ID PMID)]
+                [web-url (field GeneRIF weburl)]
+                [species (string->identifier
+                          ""
+                          (remap-species-identifiers (field Species Fullname))
+                          #:separator ""
+                          #:proc string-capitalize-first)]
+                [categories
+                 (remove (lambda (x)
+                           (or (eq? x #f)
+                               (and (string? x)
+                                    (string-null? x))))
+                         (remove-duplicates
+                          (string-split-substring
+                           (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
+                                   GeneCategory))
+                           "$$")))])
+           (string->symbol
+            (string-append
+             "[ "
+             (format #f "rdf:type gnc:GNWikiEntry ; ")
+             (if (string? species)
+                 ""
+                 (format #f "xkos:classifiedUnder ~a ; "
+                         species))
+             (format #f "rdfs:comment ~s^^xsd:string ; "
+                     generif-comment)
+             (if (string? create-time)
+                 ""
+                 (format #f "dct:created ~s^^xsd:datetime ; "
+                         (time-unix->string
+                          create-time "~5")))
+             (if (and (string? pmid) (not (string-null? pmid)))
+                 (format #f
+                         "~{dct:references pubmed:~a ; ~}"
+                         (string-split pmid #\space))
+                 "")
+             (if (and (not (string-null?
+                            (string-trim-both (field GeneRIF email))))
+                      (not (string-null? (field Investigators Email))))
+                 (format #f "dct:creator ~a ; "
+                         (investigator-attributes->id
+                          (field Investigators FirstName)
+                          (field Investigators LastName)
+                          (field Investigators Email)))
+                 "")
+             (if (not (null? categories))
+                 (format #f
+                         "~{gnt:belongsToCategory ~s ; ~}"
+                         categories)
+                 "")
+             (if (and (string? web-url) (not (string-null? web-url)))
+                 (format #f "foaf:homepage ~s ; "
+                         web-url)
+                 "")
+             " ] "))))))
 
 (define-transformer ncbi-genewiki-entries
-        (tables (GeneRIF_BASIC)
-                "GROUP BY GeneId, comment, createtime")
-        (schema-triples
-         (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
-         (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI"))
-        (triples
-            (string->identifier
-             "generif"
-             (make-version-3-uuid
-              (u8-list->bytevector
-               ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
-               '(107 167 184 17 157 173 17 209 128 180
-                     0 192 79 212 48 200))
-              (format #f "~a~a~a~a~a"
-                      (field GeneRIF_BASIC GeneId)
-                      (field GeneRIF_BASIC VersionId)
-                      (field GeneRIF_BASIC comment)
-                      (field GeneRIF_BASIC symbol)
-                      (field GeneRIF_BASIC createtime))
-              ""))
-          (set rdf:type 'gnc:NCBIWikiEntry)
-          (set rdfs:label
-               (annotate-field (field GeneRIF_BASIC comment)
-                               '^^xsd:string))
-          (set gnt:symbol (field GeneRIF_BASIC symbol))
-          (multiset dct:references
-                    (map
-                     (lambda (el)
-                       (if (string-null? el)
-                           ""
-                           (ontology 'pubmed: el)))
-                     (string-split (field ("GROUP_CONCAT(PubMed_ID)" pmids))
-                                   #\,)))
-          (set gnt:hasVersion
-               (format #f "~a" (field GeneRIF_BASIC VersionId)))
-          (set dct:created
-               (let ((createtime (field GeneRIF_BASIC createtime)))
-                 (if (string? createtime)
-                     ""
-                     (annotate-field
-                      (time-unix->string
-                       createtime "~5")
-                      '^^xsd:datetime))))))
+  (tables (GeneRIF_BASIC
+           (left-join Species "USING (SpeciesId)"))
+          "WHERE GeneRIF_BASIC.comment IS NOT NULL GROUP BY GeneRIF_BASIC.comment, GeneRIF_BASIC.createtime, GeneRIF_BASIC.VersionId, GeneRIF_BASIC.SpeciesId, GeneRIF_BASIC.TaxID")
+  (schema-triples
+   (gnc:NCBIWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
+   (gnc:NCBIWikiEntry rdfs:comment "Represents GeneRIF Entries obtained from NCBI")
+   (gnt:hasGeneId a owl:ObjectProperty)
+   (gnt:hasGeneId rdfs:domain gnc:NCBIWikiEntry)
+   (gnt:hasGeneId skos:definition "The GeneId of this this resource")
+   (gnt:hasVersionId a owl:ObjectProperty)
+   (gnt:hasVersionId rdfs:domain gnc:NCBIWikiEntry)
+   (gnt:hasVersionId skos:definition "The VersionId of this this resource"))
+  (triples
+      (string->identifier
+       "symbol"
+       (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                 (field GeneRIF_BASIC symbol)
+                                 'pre "_" 'post)
+       #:proc (lambda (x) x))
+    (set rdfs:comment
+         (let* ([ncbi-comment (field GeneRIF_BASIC comment)]
+                [species
+                 (string->identifier
+                  ""
+                  (remap-species-identifiers (field Species Fullname))
+                  #:separator ""
+                  #:proc string-capitalize-first)]
+                [taxonomic-id (field GeneRIF_BASIC TaxID)]
+                [create-time (field GeneRIF_BASIC createtime EntryCreateTime)]
+                [pmid (field GeneRIF_BASIC PubMed_ID PMID)])
+           (string->symbol
+            (string-append
+             "[ "
+             (format #f "rdf:type gnc:NCBIWikiEntry ; ")
+             (format #f "xkos:classifiedUnder ~a ; "
+                     species)
+             (if (eq? #f taxonomic-id)
+                 ""
+                 (format #f "skos:notation taxon:~a ; "
+                         (field GeneRIF_BASIC TaxID)))
+             (format #f "gnt:hasGeneId generif:~a ; "
+                     (field GeneRIF_BASIC GeneId))
+             (format #f "gnt:hasVersionId '~a'^^xsd:integer ; "
+                     (field GeneRIF_BASIC VersionId))
+             (if (and (string? pmid) (not (string-null? pmid)))
+                 (format #f
+                         "~{dct:references pubmed:~a ; ~}"
+                         (string-split pmid #\space))
+                 "")
+             (if (string? create-time)
+                 ""
+                 (format #f "dct:created ~s^^xsd:datetime ; "
+                         (time-unix->string
+                          create-time "~5")))
+             " ]"))))))
 
 
 
@@ -205,7 +221,7 @@
       ("dct:" "<http://purl.org/dc/terms/>")
       ("foaf:" "<http://xmlns.com/foaf/0.1/>")
       ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")
-      ("ncbiTaxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>")
+      ("taxon:" "<https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=>")
       ("generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
       ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
       ("owl:" "<http://www.w3.org/2002/07/owl#>")))