about summary refs log tree commit diff
path: root/transform/strings.scm
blob: 75443997d465198e1439140e400c831f1605a083 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
(define-module (transform strings)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-19)
  #:use-module (ice-9 match)
  #:use-module (ice-9 string-fun)
  #:use-module (ice-9 textual-ports)
  #:export (string-blank?
            time-unix->string
            string-blank?
            string-split-substring
            delete-substrings
            replace-substrings
            remove-duplicates
            sanitize-rdf-string
            snake->lower-camel
            lower-case-and-replace-spaces
            string-capitalize-first
            normalize-string-field
            fix-email-id
            investigator-attributes->id))

;; One email ID in the Investigators table has spaces in it. This
;; function fixes that.
(define (fix-email-id email)
  (string-delete #\space email))

(define (investigator-attributes->id first-name last-name email)
  ;; There is just one record corresponding to "Evan Williams" which
  ;; does not have an email ID. To accommodate that record, we
  ;; construct the investigator ID from not just the email ID, but
  ;; also the first and the last names. It would be preferable to just
  ;; find Evan Williams' email ID and insert it into the database.
  (string->identifier "investigator"
                      (string-join
                       (list first-name last-name (fix-email-id email))
                       "_")
                      #:separator "_"))

(define (lower-case-and-replace-spaces str)
  (string-map
   (lambda (c)
     (if (char=? c #\space)
         #\-                         ; replace space with hyphen
         c))                         ; convert character to lower case
   (string-downcase str)))

(define (time-unix->string seconds . maybe-format)
  "Given an integer saying the number of seconds since the Unix
epoch (1970-01-01 00:00:00), SECONDS, format it as a human readable
date and time-string, possible using the MAYBE-FORMAT."
  (letrec ([time-unix->time-utc
            (lambda (seconds)
              (add-duration
               (date->time-utc (make-date 0 0 0 0 1 1 1970 0))
               (make-time time-duration 0 seconds)))])
    (apply date->string
           (time-utc->date (time-unix->time-utc seconds))
           maybe-format)))

(define (string-blank? str)
  "Return non-#f if STR consists only of whitespace characters."
  (string-every char-set:whitespace str))

(define (string-split-substring str substr)
  "Split the string @var{str} into a list of substrings delimited by the
substring @var{substr}."

  (define substrlen (string-length substr))
  (define strlen (string-length str))

  (define (loop index start)
    (cond
     ((>= start strlen) (list ""))
     ((not index) (list (substring str start)))
     (else
      (cons (substring str start index)
            (let ((new-start (+ index substrlen)))
              (loop (string-contains str substr new-start)
                    new-start))))))

  (cond
   ((string-contains str substr) => (lambda (idx) (loop idx 0)))
   (else (list str))))

(define (delete-substrings str . substrings)
  "Delete SUBSTRINGS, a list of strings, from STR."
  (fold (lambda (substring result)
          (string-replace-substring result substring ""))
        str
        substrings))

(define (replace-substrings str replacement-alist)
  "Replace substrings in STR according to REPLACEMENT-ALIST, an
association list mapping substrings to their replacements."
  (fold (match-lambda*
          (((substring . replacement) str)
           (string-replace-substring str substring replacement)))
        str
        replacement-alist))

(define (sanitize-rdf-string str)
  (replace-substrings
   (string-trim-both str)
   '(("\r" . "\\r")
     ("\n" . "\\n")
     ("\"" . "'")
     ("\v" . "")
     (\x81„" . "/")
     ("’" . "’")
     ("â€" . "‒")
     ("\x9d" . "")
     ("\xad" . "")
     ("\x28" . "")
     ("\x29" . "")
     ("\xa0" . " ")
     ("\x02" . "")
     ("\x01" . ""))))

(define (snake->lower-camel str)
  (let ((char-list (string->list str)))
    (call-with-output-string
      (lambda (port)
        (put-char port (char-downcase (string-ref str 0)))
        (map (lambda (char previous-char)
               (unless (char=? char #\_)
                 (put-char port (if (char=? previous-char #\_)
                                    (char-upcase char)
                                    char))))
             (drop char-list 1)
             char-list)))))

(define (string-capitalize-first string)
  (string-titlecase
   (string-downcase string) 0 1))

(define (remove-duplicates lst)
  (let loop ((lst lst) (result '()))
    (cond
      ((null? lst) (reverse result))
      ((memq (car lst) result) (loop (cdr lst) result))
      (else (loop (cdr lst) (cons (car lst) result))))))

(define (normalize-string-field field)
  (let ((field (string-trim-both field)))
    (match field
      ((?  string? field)
       (if (or (string-blank? field)
               (string=? (string-downcase field) "none"))
           ""
           field))
      (_ ""))))