1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
|
import itertools
import os
import re
from string import Template
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
from tokenizers import Encoding, Tokenizer
dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "visualizer-styles.css")
with open(css_filename) as f:
css = f.read()
class Annotation:
start: int
end: int
label: int
def __init__(self, start: int, end: int, label: str):
self.start = start
self.end = end
self.label = label
AnnotationList = List[Annotation]
PartialIntList = List[Optional[int]]
class CharStateKey(NamedTuple):
token_ix: Optional[int]
anno_ix: Optional[int]
class CharState:
char_ix: Optional[int]
def __init__(self, char_ix):
self.char_ix = char_ix
self.anno_ix: Optional[int] = None
self.tokens: List[int] = []
@property
def token_ix(self):
return self.tokens[0] if len(self.tokens) > 0 else None
@property
def is_multitoken(self):
"""
BPE tokenizers can output more than one token for a char
"""
return len(self.tokens) > 1
def partition_key(self) -> CharStateKey:
return CharStateKey(
token_ix=self.token_ix,
anno_ix=self.anno_ix,
)
class Aligned:
pass
class EncodingVisualizer:
"""
Build an EncodingVisualizer
Args:
tokenizer (:class:`~tokenizers.Tokenizer`):
A tokenizer instance
default_to_notebook (:obj:`bool`):
Whether to render html output in a notebook by default
annotation_converter (:obj:`Callable`, `optional`):
An optional (lambda) function that takes an annotation in any format and returns
an Annotation object
"""
unk_token_regex = re.compile("(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
def __init__(
self,
tokenizer: Tokenizer,
default_to_notebook: bool = True,
annotation_converter: Optional[Callable[[Any], Annotation]] = None,
):
if default_to_notebook:
try:
from IPython.core.display import HTML, display
except ImportError:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?
You can also pass `default_to_notebook=False` to get back raw HTML
"""
)
self.tokenizer = tokenizer
self.default_to_notebook = default_to_notebook
self.annotation_coverter = annotation_converter
pass
def __call__(
self,
text: str,
annotations: AnnotationList = [],
default_to_notebook: Optional[bool] = None,
) -> Optional[str]:
"""
Build a visualization of the given text
Args:
text (:obj:`str`):
The text to tokenize
annotations (:obj:`List[Annotation]`, `optional`):
An optional list of annotations of the text. The can either be an annotation class
or anything else if you instantiated the visualizer with a converter function
default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
If True, will render the html in a notebook. Otherwise returns an html string.
Returns:
The HTML string if default_to_notebook is False, otherwise (default) returns None and
renders the HTML in the notebook
"""
final_default_to_notebook = self.default_to_notebook
if default_to_notebook is not None:
final_default_to_notebook = default_to_notebook
if final_default_to_notebook:
try:
from IPython.core.display import HTML, display
except ImportError:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?"""
)
if self.annotation_coverter is not None:
annotations = list(map(self.annotation_coverter, annotations))
encoding = self.tokenizer.encode(text)
html = EncodingVisualizer.__make_html(text, encoding, annotations)
if final_default_to_notebook:
display(HTML(html))
else:
return html
@staticmethod
def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
"""
Generates a color palette for all the labels in a given set of annotations
Args:
annotations (:obj:`Annotation`):
A list of annotations
Returns:
:obj:`dict`: A dictionary mapping labels to colors in HSL format
"""
if len(annotations) == 0:
return {}
labels = set(map(lambda x: x.label, annotations))
num_labels = len(labels)
h_step = int(255 / num_labels)
if h_step < 20:
h_step = 20
s = 32
l = 64 # noqa: E741
h = 10
colors = {}
for label in sorted(labels): # sort so we always get the same colors for a given set of labels
colors[label] = f"hsl({h},{s}%,{l}%"
h += h_step
return colors
@staticmethod
def consecutive_chars_to_html(
consecutive_chars_list: List[CharState],
text: str,
encoding: Encoding,
):
"""
Converts a list of "consecutive chars" into a single HTML element.
Chars are consecutive if they fall under the same word, token and annotation.
The CharState class is a named tuple with a "partition_key" method that makes it easy to
compare if two chars are consecutive.
Args:
consecutive_chars_list (:obj:`List[CharState]`):
A list of CharStates that have been grouped together
text (:obj:`str`):
The original text being processed
encoding (:class:`~tokenizers.Encoding`):
The encoding returned from the tokenizer
Returns:
:obj:`str`: The HTML span for a set of consecutive chars
"""
first = consecutive_chars_list[0]
if first.char_ix is None:
# its a special token
stoken = encoding.tokens[first.token_ix]
# special tokens are represented as empty spans. We use the data attribute and css
# magic to display it
return f'<span class="special-token" data-stoken={stoken}></span>'
# We're not in a special token so this group has a start and end.
last = consecutive_chars_list[-1]
start = first.char_ix
end = last.char_ix + 1
span_text = text[start:end]
css_classes = [] # What css classes will we apply on the resulting span
data_items = {} # What data attributes will we apply on the result span
if first.token_ix is not None:
# We can either be in a token or not (e.g. in white space)
css_classes.append("token")
if first.is_multitoken:
css_classes.append("multi-token")
if first.token_ix % 2:
# We use this to color alternating tokens.
# A token might be split by an annotation that ends in the middle of it, so this
# lets us visually indicate a consecutive token despite its possible splitting in
# the html markup
css_classes.append("odd-token")
else:
# Like above, but a different color so we can see the tokens alternate
css_classes.append("even-token")
if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
# This is a special token that is in the text. probably UNK
css_classes.append("special-token")
# TODO is this the right name for the data attribute ?
data_items["stok"] = encoding.tokens[first.token_ix]
else:
# In this case we are looking at a group/single char that is not tokenized.
# e.g. white space
css_classes.append("non-token")
css = f'''class="{' '.join(css_classes)}"'''
data = ""
for key, val in data_items.items():
data += f' data-{key}="{val}"'
return f"<span {css} {data} >{span_text}</span>"
@staticmethod
def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
current_consecutive_chars = [char_states[0]]
prev_anno_ix = char_states[0].anno_ix
spans = []
label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
cur_anno_ix = char_states[0].anno_ix
if cur_anno_ix is not None:
# If we started in an annotation make a span for it
anno = annotations[cur_anno_ix]
label = anno.label
color = label_colors_dict[label]
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
for cs in char_states[1:]:
cur_anno_ix = cs.anno_ix
if cur_anno_ix != prev_anno_ix:
# If we've transitioned in or out of an annotation
spans.append(
# Create a span from the current consecutive characters
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
current_consecutive_chars = [cs]
if prev_anno_ix is not None:
# if we transitioned out of an annotation close it's span
spans.append("</span>")
if cur_anno_ix is not None:
# If we entered a new annotation make a span for it
anno = annotations[cur_anno_ix]
label = anno.label
color = label_colors_dict[label]
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
prev_anno_ix = cur_anno_ix
if cs.partition_key() == current_consecutive_chars[0].partition_key():
# If the current charchter is in the same "group" as the previous one
current_consecutive_chars.append(cs)
else:
# Otherwise we make a span for the previous group
spans.append(
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
# An reset the consecutive_char_list to form a new group
current_consecutive_chars = [cs]
# All that's left is to fill out the final span
# TODO I think there is an edge case here where an annotation's span might not close
spans.append(
EncodingVisualizer.consecutive_chars_to_html(
current_consecutive_chars,
text=text,
encoding=encoding,
)
)
res = HTMLBody(spans) # Send the list of spans to the body of our html
return res
@staticmethod
def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
"""
Args:
text (:obj:`str`):
The raw text we want to align to
annotations (:obj:`AnnotationList`):
A (possibly empty) list of annotations
Returns:
A list of length len(text) whose entry at index i is None if there is no annotation on
charachter i or k, the index of the annotation that covers index i where k is with
respect to the list of annotations
"""
annotation_map = [None] * len(text)
for anno_ix, a in enumerate(annotations):
for i in range(a.start, a.end):
annotation_map[i] = anno_ix
return annotation_map
@staticmethod
def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
"""
For each character in the original text, we emit a tuple representing it's "state":
* which token_ix it corresponds to
* which word_ix it corresponds to
* which annotation_ix it corresponds to
Args:
text (:obj:`str`):
The raw text we want to align to
annotations (:obj:`List[Annotation]`):
A (possibly empty) list of annotations
encoding: (:class:`~tokenizers.Encoding`):
The encoding returned from the tokenizer
Returns:
:obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
it's state is
"""
annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
# Todo make this a dataclass or named tuple
char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
for token_ix, token in enumerate(encoding.tokens):
offsets = encoding.token_to_chars(token_ix)
if offsets is not None:
start, end = offsets
for i in range(start, end):
char_states[i].tokens.append(token_ix)
for char_ix, anno_ix in enumerate(annotation_map):
char_states[char_ix].anno_ix = anno_ix
return char_states
def HTMLBody(children: List[str], css_styles=css) -> str:
"""
Generates the full html with css from a list of html spans
Args:
children (:obj:`List[str]`):
A list of strings, assumed to be html elements
css_styles (:obj:`str`, `optional`):
Optional alternative implementation of the css
Returns:
:obj:`str`: An HTML string with style markup
"""
children_text = "".join(children)
return f"""
<html>
<head>
<style>
{css_styles}
</style>
</head>
<body>
<div class="tokenized-text" dir=auto>
{children_text}
</div>
</body>
</html>
"""
|