aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMunyoki Kilyungi2023-12-15 21:38:24 +0300
committerMunyoki Kilyungi2023-12-15 21:45:03 +0300
commit1ea6e2dd7655788e198dc13695c829287132498f (patch)
tree3ef484f9bd0010a40e2781826f11e9d673e887b6
parent4a62e17816928e271ba982038ac36fcaf72783d2 (diff)
downloadgn-transform-databases-1ea6e2dd7655788e198dc13695c829287132498f.tar.gz
Preserve gene symbol case when used as an identifier.
Gene symbols can differ only in case (e.g., Shh, SHH), but `string->identifier` capitalizes the first letter by default. This creates inconsistencies in gene symbols, leading to different predicates and objects for the same entity and introducing errors. Pass an identity procedure via `#:proc` so the symbol's original case is preserved. Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-xexamples/genelist.scm10
-rwxr-xr-xexamples/probeset.scm7
-rwxr-xr-xexamples/strains.scm8
3 files changed, 17 insertions, 8 deletions
diff --git a/examples/genelist.scm b/examples/genelist.scm
index fbd39c1..b19b30f 100755
--- a/examples/genelist.scm
+++ b/examples/genelist.scm
@@ -78,10 +78,12 @@
(gnt:hasTargetSeq rdfs:domain gnc:Probeset))
(triples
(string->identifier
- "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
- (string-trim-both
- (field GeneList GeneSymbol))
- 'pre "_" 'post))
+ "gene" (regexp-substitute/global
+ #f "[^A-Za-z0-9:]"
+ (string-trim-both
+ (field GeneList GeneSymbol))
+ 'pre "_" 'post)
+ #:proc (lambda (x) x))
(set rdf:type 'gnc:GeneSymbol)
(set rdfs:label (field GeneList GeneSymbol))
(set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
diff --git a/examples/probeset.scm b/examples/probeset.scm
index 24c09c7..92c6a2a 100755
--- a/examples/probeset.scm
+++ b/examples/probeset.scm
@@ -79,7 +79,12 @@
(set gnt:hasTargetId
(field ("NULLIF(TRIM(ProbeSet.TargetId), '')"
TargetId)))
- (set gnt:symbol (field ProbeSet Symbol))
+ (set gnt:symbol
+ (string->identifier
+ "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field ProbeSet Symbol)
+ 'pre "_" 'post)
+ #:proc (lambda (x) x)))
(set dct:description (sanitize-rdf-string (field ProbeSet description)))
(set gnt:targetsRegion
(sanitize-rdf-string
diff --git a/examples/strains.scm b/examples/strains.scm
index 4e62b49..b4e2a56 100755
--- a/examples/strains.scm
+++ b/examples/strains.scm
@@ -88,9 +88,11 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used.
(set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias))))
(set gnt:symbol
(string->identifier
- "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
- (field Strain Symbol)
- 'pre "_" 'post)))))
+ "gene"
+ (regexp-substitute/global #f "[^A-Za-z0-9:]"
+ (field Strain Symbol)
+ 'pre "_" 'post)
+ #:proc (lambda (x) x)))))
(define-transformer mapping-method
(tables (MappingMethod))