about summary refs log tree commit diff
diff options
context:
space:
mode:
author Arun Isaac 2021-12-02 15:14:13 +0530
committer Arun Isaac 2021-12-02 15:36:40 +0530
commit 08ed5b9c842197e77a9dfb0072afc611fd24fcc0 (patch)
tree 26255ea2c94460589cb7a9c09f4d137025ae512d
parent be97de713219788f118edc92abbbd66e7caa76be (diff)
download gn-transform-databases-08ed5b9c842197e77a9dfb0072afc611fd24fcc0.tar.gz
Abstract out string->identifier.
Building a turtle identifier from a string after removing illegal
characters and prefixing is an extremely common operation. Abstract
it. Also, mandate identifier prefixes. It is better to play it safe.

* dump.scm (string->identifier): New function.
(binomial-name->species-id, dump-strain, mapping-method-name->id,
inbred-set-name->id, phenotype-id->id, tissue-short-name->id,
investigator-email->id, avg-method-name->id, gene-chip-name->id): Use
string->identifier.
-rwxr-xr-x dump.scm 59
1 files changed, 26 insertions, 33 deletions
diff --git a/dump.scm b/dump.scm
index cab926c..018933b 100755
--- a/dump.scm
+++ b/dump.scm
@@ -82,6 +82,17 @@
         str
         substrings))
 
+(define (string->identifier prefix str)
+  "Convert STR to a turtle identifier after replacing illegal
+characters with an underscore and prefixing with gn:PREFIX."
+  (string->symbol
+   (string-append "gn:" prefix
+                  (string-map (lambda (c)
+                                (case c
+                                  ((#\/ #\< #\> #\+ #\( #\) #\space #\@) #\_)
+                                  (else c)))
+                              (string-downcase str)))))
+
 (define (snake->lower-camel str)
   (let ((char-list (string->list str)))
     (call-with-output-string
@@ -119,9 +130,8 @@
 (define (triple subject predicate object)
   (format #t "~a ~a ~s .~%" subject predicate object))
 
-(define (binomial-name->species-id binomial-name)
-  (string->symbol
-   (string-append "gn:" (string-replace-substring binomial-name " " "_"))))
+(define binomial-name->species-id
+  (cut string->identifier "species" <>))
 
 (define (dump-species db)
   (sql-for-each (lambda (alist)
@@ -150,14 +160,7 @@
                       (_ . name2)
                       (_ . symbol)
                       (_ . alias))
-                     (let ((id
-                            ;; TODO: Ensure this identifier does not collide.
-                            (string-append "gn:"
-                                           (string-map (lambda (c)
-                                                         (case c
-                                                           ((#\/ #\< #\> #\+ #\( #\) #\space) #\_)
-                                                           (else c)))
-                                                       name))))
+                     (let ((id (string->identifier "strain" name)))
                        (triple id 'rdf:type 'gn:strain)
                        ;; The species this is a strain of
                        (triple id 'gn:strainOfSpecies
@@ -177,8 +180,8 @@
                 "SELECT Species.FullName, Strain.Name, Strain.Name2, Strain.Symbol, Strain.Alias FROM Strain JOIN Species ON Strain.SpeciesId = Species.SpeciesId"))
 
 ;; TODO: This function is unused. Remove if not required.
-(define (mapping-method-name->id name)
-  (string->symbol (string-append "gn:mappingMethod" name)))
+(define mapping-method-name->id
+  (cut string->identifier "mappingMethod" <>))
 
 ;; TODO: This function is unused. Remove if not required.
 (define (dump-mapping-method db)
@@ -189,8 +192,8 @@
                 db
                 "SELECT Name FROM MappingMethod"))
 
-(define (inbred-set-name->id name)
-  (string->symbol (string-append "gn:inbredSet" name)))
+(define inbred-set-name->id
+  (cut string->identifier "inbredSet" <>))
 
 (define (dump-inbred-set db)
   (sql-for-each (lambda (alist)
@@ -211,7 +214,7 @@ FROM InbredSet
 INNER JOIN Species USING (SpeciesId)"))
 
 (define (phenotype-id->id id)
-  (string->symbol (string-append "gn:phenotype" (number->string id))))
+  (string->identifier "phenotype" (number->string id)))
 
 (define (dump-phenotype db)
   (sql-for-each (lambda (alist)
@@ -267,8 +270,8 @@ Lab_code, Submitter, Owner, Authorized_Users FROM Phenotype"))
 FROM PublishXRef
 INNER JOIN InbredSet USING (InbredSetId)"))
 
-(define (tissue-short-name->id short-name)
-  (string->symbol (string-append "gn:tissue" short-name)))
+(define tissue-short-name->id
+  (cut string->identifier "tissue" <>))
 
 (define (dump-tissue db)
   ;; The Name and TissueName fields seem to be identical. BIRN_lex_ID
@@ -290,10 +293,7 @@ INNER JOIN InbredSet USING (InbredSetId)"))
   (string-replace-substring email " " ""))
 
 (define (investigator-email->id email)
-  (string->symbol
-   (string-append "gn:investigator"
-                  (string-replace-substring
-                   (fix-email-id email) "@" "_"))))
+  (string->identifier "investigator" (fix-email-id email)))
 
 (define (dump-investigators db)
   (sql-for-each (lambda (alist)
@@ -326,13 +326,8 @@ INNER JOIN InbredSet USING (InbredSetId)"))
 WHERE Email != ''
 GROUP BY Email"))
 
-(define (avg-method-name->id name)
-  (string->symbol
-   (string-append "gn:avgmethod"
-                  (string-replace-substring
-                   (string-replace-substring
-                    (string-downcase name) " " "_")
-                   "/" "_"))))
+(define avg-method-name->id
+  (cut string->identifier "avgmethod" <>))
 
 (define (dump-avg-method db)
   (sql-for-each (match-lambda
@@ -348,10 +343,8 @@ GROUP BY Email"))
                 ;; "N/A". Deduplicate.
                 "SELECT DISTINCT Name FROM AvgMethod"))
 
-(define (gene-chip-name->id name)
-  (string->symbol
-   (string-replace-substring (string-append "gn:platform" name)
-                             " " "_")))
+(define gene-chip-name->id
+  (cut string->identifier "platform" <>))
 
 (define (dump-gene-chip db)
   (sql-for-each (match-lambda