about summary refs log tree commit diff
path: root/examples
diff options
context:
space:
mode:
author: Munyoki Kilyungi, 2023-12-15 21:38:24 +0300
committer: Munyoki Kilyungi, 2023-12-15 21:45:03 +0300
commit: 1ea6e2dd7655788e198dc13695c829287132498f (patch)
tree: 3ef484f9bd0010a40e2781826f11e9d673e887b6 /examples
parent: 4a62e17816928e271ba982038ac36fcaf72783d2 (diff)
download: gn-transform-databases-1ea6e2dd7655788e198dc13695c829287132498f.tar.gz
Preserve gene symbol case when used as an identifier.
By default, `string->identifier` capitalizes the first letter of its
input.  For gene symbols whose casing varies (e.g., Shh, SHH), this
creates inconsistencies in gene symbols, leading to different
predicates and objects for the same entity, introducing errors.  Pass
an identity procedure as `#:proc` so the symbol's case is left as-is.

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'examples')
-rwxr-xr-x  examples/genelist.scm  10
-rwxr-xr-x  examples/probeset.scm  7
-rwxr-xr-x  examples/strains.scm  8
3 files changed, 17 insertions, 8 deletions
diff --git a/examples/genelist.scm b/examples/genelist.scm
index fbd39c1..b19b30f 100755
--- a/examples/genelist.scm
+++ b/examples/genelist.scm
@@ -78,10 +78,12 @@
    (gnt:hasTargetSeq rdfs:domain gnc:Probeset))
   (triples
       (string->identifier
-       "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
-                                        (string-trim-both
-                                         (field GeneList GeneSymbol))
-                                        'pre "_" 'post))
+       "gene" (regexp-substitute/global
+               #f "[^A-Za-z0-9:]"
+               (string-trim-both
+                (field GeneList GeneSymbol))
+               'pre "_" 'post)
+       #:proc (lambda (x) x))
     (set rdf:type 'gnc:GeneSymbol)
     (set rdfs:label (field GeneList GeneSymbol))
     (set dct:description (sanitize-rdf-string (field GeneList GeneDescription)))
diff --git a/examples/probeset.scm b/examples/probeset.scm
index 24c09c7..92c6a2a 100755
--- a/examples/probeset.scm
+++ b/examples/probeset.scm
@@ -79,7 +79,12 @@
     (set gnt:hasTargetId
          (field ("NULLIF(TRIM(ProbeSet.TargetId), '')"
                  TargetId)))
-    (set gnt:symbol (field ProbeSet Symbol))
+    (set gnt:symbol
+         (string->identifier
+          "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                           (field ProbeSet Symbol)
+                                           'pre "_" 'post)
+          #:proc (lambda (x) x)))
     (set dct:description (sanitize-rdf-string (field ProbeSet description)))
     (set gnt:targetsRegion
          (sanitize-rdf-string
diff --git a/examples/strains.scm b/examples/strains.scm
index 4e62b49..b4e2a56 100755
--- a/examples/strains.scm
+++ b/examples/strains.scm
@@ -88,9 +88,11 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used.
     (set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias))))
     (set gnt:symbol
          (string->identifier
-          "gene" (regexp-substitute/global #f "[^A-Za-z0-9:]"
-                                           (field Strain Symbol)
-                                           'pre "_" 'post)))))
+          "gene"
+          (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                    (field Strain Symbol)
+                                    'pre "_" 'post)
+          #:proc (lambda (x) x)))))
 
 (define-transformer mapping-method
   (tables (MappingMethod))