about summary refs log tree commit diff
path: root/examples/dataset-metadata.scm
diff options
context:
space:
mode:
Diffstat (limited to 'examples/dataset-metadata.scm')
-rwxr-xr-x examples/dataset-metadata.scm 113
1 files changed, 43 insertions, 70 deletions
diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm
index 591b18e..c40c42c 100755
--- a/examples/dataset-metadata.scm
+++ b/examples/dataset-metadata.scm
@@ -27,7 +27,8 @@
   (string->identifier "investigator"
                       (string-join
                        (list first-name last-name (fix-email-id email))
-                       "_")))
+                       "_")
+                      #:separator "_"))
 
 (define-transformer investigators
   ;; There are a few duplicate entries. We group by email to
@@ -36,7 +37,7 @@
           "GROUP BY Email")
   (triples (investigator-attributes->id (field Investigators FirstName)
                                         (field Investigators LastName)
-                                        (field Investigators Email))
+                                        "")
     (set rdf:type 'foaf:Person)
     (set foaf:name (string-append (field Investigators FirstName) " "
                                   (field Investigators LastName)))
@@ -64,7 +65,7 @@
    (gnt:has_go_tree_value a owl:ObjectProperty)
    (gnt:has_go_tree_value skos:definition "This resource the following GO tree value")
    (gnt:has_go_tree_value rdfs:domain gnc:gene_chip))
-  (triples (string->identifier "platform" (field GeneChip Name))
+  (triples (string->identifier "platform" (field GeneChip Name) #:separator "_")
     (set rdf:type 'gnc:gene_chip)
     (set rdfs:label (field GeneChip GeneChipName))
     (set skos:prefLabel (field GeneChip Name))
@@ -72,9 +73,7 @@
                                Title)))
     (set gnt:has_go_tree_value (field GeneChip Go_tree_value))
     (set xkos:classifiedUnder
-         (string->identifier "" (remap-species-identifiers (field Species Fullname))
-                             #:separator ""
-                             #:proc string-capitalize-first))
+         (string->identifier "" (remap-species-identifiers (field Species Fullname)) #:separator ""))
     (set gnt:has_geo_series_id
          (ontology 'geoSeries:
                    (string-trim-both (field GeneChip GeoPlatform))))))
@@ -162,10 +161,15 @@
    (gnt:has_acknowledgement rdfs:label "Acknowledgement")
    (gnt:has_acknowledgement a owl:ObjectProperty)
    (gnt:has_acknowledgement skos:definition "People to acknowledge"))
-  (triples (string->identifier
-            "" (regexp-substitute/global #f "[^A-Za-z0-9:]"
-                                         (field InfoFiles InfoPageName)
-                                         'pre "_" 'post))
+  (triples
+      (string->identifier
+       "" (let ((info-page-name (field InfoFiles InfoPageName))
+                (info-title (field InfoFiles Title)))
+            (format #f "~a"
+                    (if (and (string? info-page-name)
+                             (string=? (string-downcase (string-trim-both info-page-name))
+                                       "none"))
+                        info-title info-page-name))))
     (set rdf:type 'dcat:Dataset)
     (set xkos:classifiedUnder
          (let ([dataset-type
@@ -176,30 +180,21 @@
                (string->symbol
                 dataset-type)
                "")))
-    (set rdfs:label (regexp-substitute/global
-                     #f "^[Nn]one$"
-                     (field InfoFiles InfoPageName)
-                     ""))
+    (set rdfs:label (normalize-string-field (field InfoFiles InfoPageName)))
     (set skos:prefLabel
-         (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
-                 DatasetFullName)))
+         (normalize-string-field
+          (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
+                  DatasetFullName))))
     (set skos:altLabel (field Datasets DatasetName DatasetGroup))
-    (set dct:title
-         (regexp-substitute/global
-          #f "^[Nn]one$"
-          (or
-           (regexp-substitute/global
-            #f "^Unpublished$" (field Datasets PublicationTitle) "")
-           (field InfoFiles InfoFileTitle)
-           "")
-          ""))
+    (set dct:title (normalize-string-field (field Datasets PublicationTitle)))
     (set dct:created
-         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
-                 createTimeGenoFreeze)))
+         (normalize-string-field
+          (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
+                  createTimeGenoFreeze))))
     (set dcat:contactPoint
          (investigator-attributes->id (field Investigators FirstName)
                                       (field Investigators LastName)
-                                      (field Investigators Email)))
+                                      ""))
     (set foaf:Organization
          (field Organizations OrganizationName))
     (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId)))
@@ -209,14 +204,16 @@
          (string->identifier
           "set"
           (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))"
-                  InbredSetName))))
+                  InbredSetName))
+          #:separator "_"))
     (set gnt:has_tissue (string->identifier "tissue"
-                                           (field Tissue Short_Name)))
+                                            (field Tissue Short_Name)
+                                            #:separator "_"))
     (set gnt:uses_normalization
-         (string->identifier "avgMethod"
-                             ;; If AvgMethodName is NULL, assume N/A.
-                             (if (string-blank? (field AvgMethod Name AvgMethodName))
-                                 "N/A" (field AvgMethod Name AvgMethodName))))
+         (let ((avg-method (normalize-string-field (field AvgMethod Name AvgMethodName))))
+           (if (not (string-blank? avg-method))
+               (string->identifier "avg_method" avg-method #:separator "_")
+               "")))
     (set gnt:has_summary
          (let* ((summary-link
                  (format
@@ -375,7 +372,8 @@
                "" (string->symbol acknowledgment-link))))
     (set gnt:uses_platform
          (string->identifier "platform"
-                             (field GeneChip Name GeneChip)))
+                             (field GeneChip Name GeneChip)
+                             #:separator "_"))
     (set gnt:has_geo_series_id
          (let ((s
                 (string-match "GSE[0-9]*"
@@ -391,11 +389,7 @@
            (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
           "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
   (triples
-      (string->identifier
-       ""
-       (regexp-substitute/global #f "[^A-Za-z0-9:]"
-                                 (field PublishFreeze Name)
-                                 'pre "_" 'post))
+      (string->identifier "" (field PublishFreeze Name))
     (set rdf:type 'dcat:Dataset)
     (set xkos:classifiedUnder 'gnc:phenotype)
     (set dct:title (field PublishFreeze FullName))
@@ -407,8 +401,7 @@
     (set gnt:belongs_to_group
          (string->identifier
           "set" (field InbredSet Name InbredSetName)
-          #:separator "_"
-          #:proc (lambda (x) x)))))
+          #:separator "_"))))
 
 (define-transformer genofreeze
   (tables (GenoFreeze
@@ -416,15 +409,7 @@
            (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
           "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
   (triples
-      (string->identifier
-       ""
-       (regexp-substitute/global
-        #f "[^A-Za-z0-9:]"
-        (regexp-substitute/global
-         #f "[^A-Za-z0-9:]"
-         (field GenoFreeze Name)
-         'pre "_" 'post)
-        'pre "_" 'post))
+      (string->identifier "" (field GenoFreeze Name))
     (set rdf:type 'dcat:Dataset)
     (set xkos:classifiedUnder 'gnc:genotype)
     (set rdfs:label (field GenoFreeze Name))
@@ -454,19 +439,14 @@
    (gnt:uses_data_scale a owl:ObjectProperty)
    (gnt:uses_data_scale skos:definition "Thi data scale this resource uses"))
   (triples
-      (string->identifier
-       ""
-       (regexp-substitute/global
-        #f "[^A-Za-z0-9:]"
-        (field ProbeSetFreeze Name)
-        'pre "_" 'post))
+      (string->identifier "" (field ProbeSetFreeze Name))
     (set rdf:type 'dcat:Dataset)
     (set xkos:classifiedUnder 'gnc:probeset)
     (set gnt:uses_normalization
-         (string->identifier "avgMethod"
-                             ;; If AvgMethodName is NULL, assume N/A.
-                             (if (string-blank? (field AvgMethod Name AvgMethodName))
-                                 "N/A" (field AvgMethod Name AvgMethodName))))
+         (let ((avg-method (field AvgMethod Name AvgMethodName)))
+           (if (string-blank? avg-method)
+               #f
+               avg-method)))
     (set dct:title (field ProbeSetFreeze FullName))
     (set rdfs:label (field ProbeSetFreeze ShortName))
     (set skos:prefLabel (field ProbeSetFreeze Name))
@@ -475,15 +455,8 @@
                       (field ProbeSetFreeze CreateTime)
                       '^^xsd:datetime))
     (set gnt:uses_data_scale (field ProbeSetFreeze DataScale))
-    (set gnt:has_tissue
-         (string->identifier
-          "tissue"
-          (field Tissue Short_Name)))
-    (set gnt:belongs_to_group
-         (string->identifier
-          "set" (field InbredSet Name InbredSetName)
-          #:separator ""
-          #:proc string-capitalize-first))))
+    (set gnt:has_tissue (string->identifier "tissue" (field Tissue Short_Name) #:separator "_"))
+    (set gnt:belongs_to_group (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_"))))