From 5fec5c4d6d1c07251b06348a00bb040978b5e9ac Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Tue, 23 Dec 2025 20:06:23 +0300 Subject: Update schema for several transforms. Signed-off-by: Munyoki Kilyungi --- examples/dataset-metadata.scm | 113 ++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 70 deletions(-) (limited to 'examples/dataset-metadata.scm') diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm index 591b18e..c40c42c 100755 --- a/examples/dataset-metadata.scm +++ b/examples/dataset-metadata.scm @@ -27,7 +27,8 @@ (string->identifier "investigator" (string-join (list first-name last-name (fix-email-id email)) - "_"))) + "_") + #:separator "_")) (define-transformer investigators ;; There are a few duplicate entries. We group by email to @@ -36,7 +37,7 @@ "GROUP BY Email") (triples (investigator-attributes->id (field Investigators FirstName) (field Investigators LastName) - (field Investigators Email)) + "") (set rdf:type 'foaf:Person) (set foaf:name (string-append (field Investigators FirstName) " " (field Investigators LastName))) @@ -64,7 +65,7 @@ (gnt:has_go_tree_value a owl:ObjectProperty) (gnt:has_go_tree_value skos:definition "This resource the following GO tree value") (gnt:has_go_tree_value rdfs:domain gnc:gene_chip)) - (triples (string->identifier "platform" (field GeneChip Name)) + (triples (string->identifier "platform" (field GeneChip Name) #:separator "_") (set rdf:type 'gnc:gene_chip) (set rdfs:label (field GeneChip GeneChipName)) (set skos:prefLabel (field GeneChip Name)) @@ -72,9 +73,7 @@ Title))) (set gnt:has_go_tree_value (field GeneChip Go_tree_value)) (set xkos:classifiedUnder - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) + (string->identifier "" (remap-species-identifiers (field Species Fullname)) #:separator "")) (set gnt:has_geo_series_id (ontology 'geoSeries: (string-trim-both (field GeneChip GeoPlatform)))))) @@ -162,10 +161,15 @@ (gnt:has_acknowledgement rdfs:label "Acknowledgement") (gnt:has_acknowledgement a owl:ObjectProperty) (gnt:has_acknowledgement skos:definition "People to acknowledge")) - (triples (string->identifier - "" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post)) + (triples + (string->identifier + "" (let ((info-page-name (field InfoFiles InfoPageName)) + (info-title (field InfoFiles Title))) + (format #f "~a" + (if (and (string? info-page-name) + (string=? (string-downcase (string-trim-both info-page-name)) + "none")) + info-title info-page-name)))) (set rdf:type 'dcat:Dataset) (set xkos:classifiedUnder (let ([dataset-type @@ -176,30 +180,21 @@ (string->symbol dataset-type) ""))) - (set rdfs:label (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles InfoPageName) - "")) + (set rdfs:label (normalize-string-field (field InfoFiles InfoPageName))) (set skos:prefLabel - (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" - DatasetFullName))) + (normalize-string-field + (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" + DatasetFullName)))) (set skos:altLabel (field Datasets DatasetName DatasetGroup)) - (set dct:title - (regexp-substitute/global - #f "^[Nn]one$" - (or - (regexp-substitute/global - #f "^Unpublished$" (field Datasets PublicationTitle) "") - (field InfoFiles InfoFileTitle) - "") - "")) + (set dct:title (normalize-string-field (field Datasets PublicationTitle))) (set dct:created - (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" - createTimeGenoFreeze))) + (normalize-string-field + (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" + createTimeGenoFreeze)))) (set dcat:contactPoint (investigator-attributes->id (field Investigators FirstName) (field Investigators LastName) - (field Investigators Email))) + "")) (set foaf:Organization (field Organizations OrganizationName)) (set dct:identifier (format #f "GN~a" (field InfoFiles GN_AccesionId))) @@ -209,14 +204,16 @@ (string->identifier "set" (field ("IFNULL(InbredSet.Name, IFNULL(PublishInbredSet.Name, GenoInbredSet.Name))" - InbredSetName)))) + InbredSetName)) + #:separator "_")) (set gnt:has_tissue (string->identifier "tissue" - (field Tissue Short_Name))) + (field Tissue Short_Name) + #:separator "_")) (set gnt:uses_normalization - (string->identifier "avgMethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? (field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) + (let ((avg-method (normalize-string-field (field AvgMethod Name AvgMethodName)))) + (if (not (string-blank? avg-method)) + (string->identifier "avg_method" avg-method #:separator "_") + ""))) (set gnt:has_summary (let* ((summary-link (format @@ -375,7 +372,8 @@ "" (string->symbol acknowledgment-link)))) (set gnt:uses_platform (string->identifier "platform" - (field GeneChip Name GeneChip))) + (field GeneChip Name GeneChip) + #:separator "_")) (set gnt:has_geo_series_id (let ((s (string-match "GSE[0-9]*" @@ -391,11 +389,7 @@ (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") (triples - (string->identifier - "" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field PublishFreeze Name) - 'pre "_" 'post)) + (string->identifier "" (field PublishFreeze Name)) (set rdf:type 'dcat:Dataset) (set xkos:classifiedUnder 'gnc:phenotype) (set dct:title (field PublishFreeze FullName)) @@ -407,8 +401,7 @@ (set gnt:belongs_to_group (string->identifier "set" (field InbredSet Name InbredSetName) - #:separator "_" - #:proc (lambda (x) x))))) + #:separator "_")))) (define-transformer genofreeze (tables (GenoFreeze @@ -416,15 +409,7 @@ (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field GenoFreeze Name) - 'pre "_" 'post) - 'pre "_" 'post)) + (string->identifier "" (field GenoFreeze Name)) (set rdf:type 'dcat:Dataset) (set xkos:classifiedUnder 'gnc:genotype) (set rdfs:label (field GenoFreeze Name)) @@ -454,19 +439,14 @@ (gnt:uses_data_scale a owl:ObjectProperty) (gnt:uses_data_scale skos:definition "Thi data scale this resource uses")) (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ProbeSetFreeze Name) - 'pre "_" 'post)) + (string->identifier "" (field ProbeSetFreeze Name)) (set rdf:type 'dcat:Dataset) (set xkos:classifiedUnder 'gnc:probeset) (set gnt:uses_normalization - (string->identifier "avgMethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? (field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) + (let ((avg-method (field AvgMethod Name AvgMethodName))) + (if (string-blank? avg-method) + #f + avg-method))) (set dct:title (field ProbeSetFreeze FullName)) (set rdfs:label (field ProbeSetFreeze ShortName)) (set skos:prefLabel (field ProbeSetFreeze Name)) @@ -475,15 +455,8 @@ (field ProbeSetFreeze CreateTime) '^^xsd:datetime)) (set gnt:uses_data_scale (field ProbeSetFreeze DataScale)) - (set gnt:has_tissue - (string->identifier - "tissue" - (field Tissue Short_Name))) - (set gnt:belongs_to_group - (string->identifier - "set" (field InbredSet Name InbredSetName) - #:separator "" - #:proc string-capitalize-first)))) + (set gnt:has_tissue (string->identifier "tissue" (field Tissue Short_Name) #:separator "_")) + (set gnt:belongs_to_group (string->identifier "set" (field InbredSet Name InbredSetName) #:separator "_")))) -- cgit 1.4.1