From 1ea6e2dd7655788e198dc13695c829287132498f Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Fri, 15 Dec 2023 21:38:24 +0300 Subject: Preserve gene symbol case when used as an identifier. Genes with varying casing (e.g., Shh, SHH) result in `string->identifier` capitalizing the first letter by default. This creates inconsistencies in gene symbols, leading to different predicates and objects for the same entity, introducing errors. Signed-off-by: Munyoki Kilyungi --- examples/genelist.scm | 10 ++++++---- examples/probeset.scm | 7 ++++++- examples/strains.scm | 8 +++++--- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/examples/genelist.scm b/examples/genelist.scm index fbd39c1..b19b30f 100755 --- a/examples/genelist.scm +++ b/examples/genelist.scm @@ -78,10 +78,12 @@ (gnt:hasTargetSeq rdfs:domain gnc:Probeset)) (triples (string->identifier - "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (string-trim-both - (field GeneList GeneSymbol)) - 'pre "_" 'post)) + "gene" (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (string-trim-both + (field GeneList GeneSymbol)) + 'pre "_" 'post) + #:proc (lambda (x) x)) (set rdf:type 'gnc:GeneSymbol) (set rdfs:label (field GeneList GeneSymbol)) (set dct:description (sanitize-rdf-string (field GeneList GeneDescription))) diff --git a/examples/probeset.scm b/examples/probeset.scm index 24c09c7..92c6a2a 100755 --- a/examples/probeset.scm +++ b/examples/probeset.scm @@ -79,7 +79,12 @@ (set gnt:hasTargetId (field ("NULLIF(TRIM(ProbeSet.TargetId), '')" TargetId))) - (set gnt:symbol (field ProbeSet Symbol)) + (set gnt:symbol + (string->identifier + "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field ProbeSet Symbol) + 'pre "_" 'post) + #:proc (lambda (x) x))) (set dct:description (sanitize-rdf-string (field ProbeSet description))) (set gnt:targetsRegion (sanitize-rdf-string diff --git a/examples/strains.scm b/examples/strains.scm index 4e62b49..b4e2a56 100755 --- a/examples/strains.scm +++ 
b/examples/strains.scm @@ -88,9 +88,11 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used. (set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias)))) (set gnt:symbol (string->identifier - "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field Strain Symbol) - 'pre "_" 'post))))) + "gene" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field Strain Symbol) + 'pre "_" 'post) + #:proc (lambda (x) x))))) (define-transformer mapping-method (tables (MappingMethod)) -- cgit v1.2.3