diff options
author | Munyoki Kilyungi | 2023-12-15 21:38:24 +0300 |
---|---|---|
committer | Munyoki Kilyungi | 2023-12-15 21:45:03 +0300 |
commit | 1ea6e2dd7655788e198dc13695c829287132498f (patch) | |
tree | 3ef484f9bd0010a40e2781826f11e9d673e887b6 | |
parent | 4a62e17816928e271ba982038ac36fcaf72783d2 (diff) | |
download | gn-transform-databases-1ea6e2dd7655788e198dc13695c829287132498f.tar.gz |
Preserve gene symbol case when used as an identifier.
By default, `string->identifier` capitalizes the first letter of its
input, so gene symbols that differ only in case (e.g., Shh, SHH) are
not preserved as written. This creates inconsistencies in gene symbols,
leading to different predicates and objects for the same entity and
introducing errors.
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x | examples/genelist.scm | 10 | ||||
-rwxr-xr-x | examples/probeset.scm | 7 | ||||
-rwxr-xr-x | examples/strains.scm | 8 |
3 files changed, 17 insertions, 8 deletions
diff --git a/examples/genelist.scm b/examples/genelist.scm index fbd39c1..b19b30f 100755 --- a/examples/genelist.scm +++ b/examples/genelist.scm @@ -78,10 +78,12 @@ (gnt:hasTargetSeq rdfs:domain gnc:Probeset)) (triples (string->identifier - "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (string-trim-both - (field GeneList GeneSymbol)) - 'pre "_" 'post)) + "gene" (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (string-trim-both + (field GeneList GeneSymbol)) + 'pre "_" 'post) + #:proc (lambda (x) x)) (set rdf:type 'gnc:GeneSymbol) (set rdfs:label (field GeneList GeneSymbol)) (set dct:description (sanitize-rdf-string (field GeneList GeneDescription))) diff --git a/examples/probeset.scm b/examples/probeset.scm index 24c09c7..92c6a2a 100755 --- a/examples/probeset.scm +++ b/examples/probeset.scm @@ -79,7 +79,12 @@ (set gnt:hasTargetId (field ("NULLIF(TRIM(ProbeSet.TargetId), '')" TargetId))) - (set gnt:symbol (field ProbeSet Symbol)) + (set gnt:symbol + (string->identifier + "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field ProbeSet Symbol) + 'pre "_" 'post) + #:proc (lambda (x) x))) (set dct:description (sanitize-rdf-string (field ProbeSet description))) (set gnt:targetsRegion (sanitize-rdf-string diff --git a/examples/strains.scm b/examples/strains.scm index 4e62b49..b4e2a56 100755 --- a/examples/strains.scm +++ b/examples/strains.scm @@ -88,9 +88,11 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used. (set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias)))) (set gnt:symbol (string->identifier - "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field Strain Symbol) - 'pre "_" 'post))))) + "gene" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field Strain Symbol) + 'pre "_" 'post) + #:proc (lambda (x) x))))) (define-transformer mapping-method (tables (MappingMethod)) |