about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x examples/generif.scm 173
1 files changed, 111 insertions, 62 deletions
diff --git a/examples/generif.scm b/examples/generif.scm
index 170cf0c..e960104 100755
--- a/examples/generif.scm
+++ b/examples/generif.scm
@@ -3,13 +3,41 @@
 
 (use-modules (srfi srfi-1)
              (srfi srfi-26)
+             (rnrs bytevectors)
              (ice-9 getopt-long)
              (ice-9 match)
              (ice-9 regex)
              (transform strings)
              (transform sql)
              (transform triples)
-             (transform special-forms))
+             (transform special-forms)
+             (transform uuid))
+
+
+
+(define (remap-species-identifiers str)
+  "Map species label STR to its binomial name (stopgap for bad DB rows)."
+  ;; NOTE(review): "Macaca mulatta" maps to another species -- confirm intent.
+  (or (assoc-ref '(("Fly (Drosophila melanogaster dm6)" . "Drosophila melanogaster")
+                   ("Oryzias latipes (Japanese medaka)" . "Oryzias latipes")
+                   ("Macaca mulatta" . "Macaca nemestrina")
+                   ("Bat (Glossophaga soricina)" . "Glossophaga soricina"))
+                 str)
+      str))
+
+(define (fix-email-id email)  ; Return EMAIL with every space removed.
+  (string-join (string-split email #\space) ""))
+
+(define (investigator-attributes->id first-name last-name email)
+  ;; The ID is built from the first name, the last name and the
+  ;; space-stripped email ID rather than from the email ID alone: one
+  ;; record ("Evan Williams") has no email ID, so the names are needed
+  ;; to accommodate it.  It would be preferable to find that email ID
+  ;; and insert it into the database instead.
+  (let* ((clean-email (fix-email-id email))
+         (parts (list first-name last-name clean-email))
+         (key (string-join parts "_")))
+    (string->identifier "investigator" key)))
 
 
 
@@ -23,79 +51,30 @@
    (gnt:taxid rdfs:domain gn-term:geneWikiEntry))
   (triples (ontology 'generif: (field GeneRIF_BASIC GeneId))
     (multiset gnt:symbol (string-split (field ("GROUP_CONCAT(DISTINCT symbol)" symbol))
-                                      #\,))
+                                       #\,))
     (multiset gnt:wikiEntryOfSpecies
               (string-split
                (field ("GROUP_CONCAT(DISTINCT Species.SpeciesName)" species))
                #\,))
     (multiset gnt:taxId (map (cut ontology 'ncbiTaxon: <>)
-                            (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
-                                          #\,)))))
+                             (string-split (field ("GROUP_CONCAT(DISTINCT TaxID)" taxId))
+                                           #\,)))))
 
 (define-transformer gn-genewiki-entries
   (tables (GeneRIF
-           (left-join GeneRIF_BASIC "USING (symbol)")
            (left-join Species "ON Species.SpeciesId = GeneRIF.SpeciesId")
            (left-join GeneRIFXRef "ON GeneRIFXRef.GeneRIFId = GeneRIF.Id")
-           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id"))
-          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 GROUP BY GeneRIF.symbol")
+           (left-join GeneCategory "ON GeneRIFXRef.GeneCategoryId = GeneCategory.Id")
+           (left-join Investigators "ON Investigators.Email = GeneRIF.email"))
+          "WHERE GeneRIF.display > 0 AND GeneRIF.VersionId = 0 AND GeneRIF.comment IS NOT NULL GROUP BY GeneRIF.comment, GeneRIF.createtime")
   (schema-triples
-   (gnt:geneWikiEntry a rdfs:Class)
-   (gnt:geneWikiEntry a owl:Class)
-   (gnt:geneWikiEntry rdfs:comment "Represents GeneRIF Entries")
-   (gnt:geneCategory rdfs:domain gn:geneWikiEntry)
-   (gnt:geneWikiEntryOfGn rdfs:domain gn:geneWikiEntry)
-   (gnt:geneWikiEntry rdfs:domain gn:geneWikiEntry))
+   (gnc:GeneWikiEntry a rdfs:Class)
+   (gnc:GNWikiEntry rdfs:subClassOf gnc:GeneWikiEntry)
+   (gnc:GNWikiEntry rdfs:comment "Represents GeneRIF Entries entered from GeneNetwork")
+   (gnt:belongsToCategory rdfs:domain gnc:GNWikiEntry)
+   (gnt:belongsToSpecies rdfs:domain gnc:GNWikiEntry)
+   (gnt:symbol rdfs:domain gnc:GNWikiEntry))
   (triples
-      (let ([geneid (field GeneRIF_BASIC GeneId)])
-        (if (eq? geneid 0)
-            (ontology 'gnt:anonSymbol_
-                      (field GeneRIF symbol))
-            (ontology 'generif:
-                      geneid)))
-    (set rdf:type
-         (if (string-null? (field ("IFNULL(GeneRIF_BASIC.GeneId, '')" geneWikiEntryP)))
-             ""
-             'gn:geneWikiEntry))
-    (set gnt:wikiEntryOfSpecies
-         (string->binomial-name (field Species FullName)))
-    ;; This only transforms symbols not present in the GeneRIF_BASIC table
-    (set gnt:symbol (let ([geneid (field GeneRIF_BASIC GeneId)])
-                     (if (eq? geneid 0)
-                         (field GeneRIF symbol)
-                         "")))
-    (multiset gnt:geneWikiEntryOfGn
-              (let* ([entries
-                      (sanitize-rdf-string
-                       (field
-                        ("GROUP_CONCAT(DISTINCT CONCAT_WS('::::', IFNULL(GeneCategory.Name, ''), IFNULL(GeneRIF.PubMed_ID, ''), GeneRIF.email, CAST(CONVERT(BINARY CONVERT(GeneRIF.comment USING latin1) USING utf8) AS VARCHAR(15000)), GeneRIF.createtime, IFNULL(weburl, '')) SEPARATOR';;;;;')"
-                         wikientry)))]
-                     [comments (string-split-substring entries ";;;;;")])
-                (map
-                 (match-lambda
-                   ((genecategory pmid email text createtime weburl)
-                    (blank-node
-                     (set gnt:geneCategory genecategory)
-                     (multiset dct:source
-                               (map (lambda (el) (if (string-null? el)
-                                                     ""
-                                                     (ontology 'pubmed: el)))
-                                    (string-split pmid #\space)))
-                     (set dct:creator (regexp-substitute/global #f "@.*$"
-                                                                email
-                                                                'pre
-                                                                ""
-                                                                'post))
-                     (set gnt:geneWikiEntry
-                          (annotate-field text '^^xsd:string))
-                     (set dct:created (annotate-field
-                                       createtime
-                                       '^^xsd:datetime))
-                     (set foaf:homepage weburl))))
-                 (map
-                  (cut string-split-substring <> "::::")
-                  comments))))))
-
 (define-transformer ncbi-genewiki-entries
   (tables (GeneRIF_BASIC)
           "GROUP BY GeneId, comment, createtime")
@@ -116,6 +95,76 @@
           (set dct:created (annotate-field (time-unix->string
                                             (field GeneRIF_BASIC createtime) "~5")
                                            '^^xsd:datetime))))))
+      (string->identifier
+       "generif"
+       (make-version-3-uuid
+         (u8-list->bytevector
+          ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
+          '(107 167 184 17 157 173 17 209 128 180
+                0 192 79 212 48 200))
+         (format #f "~a~a~a~a"
+                 (field Species FullName)
+                 (field GeneRIF comment)
+                 (field GeneRIF symbol)
+                 (field GeneRIF createtime))
+         ""))
+    (string->identifier
+     "generif"
+     (make-version-3-uuid
+      (u8-list->bytevector
+       ;; URL namespace UUID is 6ba7b811-9dad-11d1-80b4-00c04fd430c8
+       '(107 167 184 17 157 173 17 209 128 180
+             0 192 79 212 48 200))
+      (format #f "~a~a~a~a"
+              (field Species FullName)
+              (field GeneRIF comment)
+              (field GeneRIF symbol)
+              (field GeneRIF createtime))
+      ""))
+    (set rdf:type 'gnc:GNWikiEntry)
+    (set rdfs:label (sanitize-rdf-string (field GeneRIF comment)))
+    (set gnt:symbol (field GeneRIF symbol))
+    (multiset gnt:belongsToCategory
+              (remove-duplicates
+               (string-split-substring
+                (field ("GROUP_CONCAT(DISTINCT GeneCategory.Name SEPARATOR '$$')"
+                        GeneCategory))
+                "$$")))
+    (set gnt:belongsToSpecies
+         (string->identifier
+          ""
+          (remap-species-identifiers (field Species Fullname))
+          #:separator ""
+          #:proc string-capitalize-first))
+    (multiset dct:references
+              (map (lambda (x)
+                     (ontology 'pubmed: x))
+                   (string-split
+                    (let ((pmid (field
+                                 ("IFNULL(GeneRIF.PubMed_ID, '')"
+                                  PubMed_ID))))
+                      (if (number? pmid)
+                          (number->string pmid)
+                          pmid))
+                    #\space)))
+    (set dct:created
+         (let ((createtime (field GeneRIF createtime)))
+           (if (string? createtime)
+               ""
+               (annotate-field
+                (time-unix->string
+                 createtime "~5")
+                '^^xsd:datetime))))
+    (set dct:creator
+         (if (and (not (string-null?
+                        (string-trim-both (field GeneRIF email))))
+                  (not (string-null? (field Investigators Email))))
+             (investigator-attributes->id
+              (field Investigators FirstName)
+              (field Investigators LastName)
+              (field Investigators Email))
+             ""))
+    (set foaf:homepage (field GeneRIF weburl))))