1 files changed, 403 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py
new file mode 100644
index 00000000..c988a648
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py
@@ -0,0 +1,403 @@
+import itertools
+import os
+import re
+from string import Template
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
+
+from tokenizers import Encoding, Tokenizer
+
+
+dirname = os.path.dirname(__file__)
+css_filename = os.path.join(dirname, "visualizer-styles.css")
+with open(css_filename) as f:
+    css = f.read()
+
+
+class Annotation:
+    start: int
+    end: int
+    label: int
+
+    def __init__(self, start: int, end: int, label: str):
+        self.start = start
+        self.end = end
+        self.label = label
+
+
+AnnotationList = List[Annotation]
+PartialIntList = List[Optional[int]]
+
+
+class CharStateKey(NamedTuple):
+    token_ix: Optional[int]
+    anno_ix: Optional[int]
+
+
+class CharState:
+    char_ix: Optional[int]
+
+    def __init__(self, char_ix):
+        self.char_ix = char_ix
+
+        self.anno_ix: Optional[int] = None
+        self.tokens: List[int] = []
+
+    @property
+    def token_ix(self):
+        return self.tokens[0] if len(self.tokens) > 0 else None
+
+    @property
+    def is_multitoken(self):
+        """
+        BPE tokenizers can output more than one token for a char
+        """
+        return len(self.tokens) > 1
+
+    def partition_key(self) -> CharStateKey:
+        return CharStateKey(
+            token_ix=self.token_ix,
+            anno_ix=self.anno_ix,
+        )
+
+
+class Aligned:
+    pass
+
+
+class EncodingVisualizer:
+    """
+    Build an EncodingVisualizer
+
+    Args:
+
+         tokenizer (:class:`~tokenizers.Tokenizer`):
+            A tokenizer instance
+
+         default_to_notebook (:obj:`bool`):
+            Whether to render html output in a notebook by default
+
+         annotation_converter (:obj:`Callable`, `optional`):
+            An optional (lambda) function that takes an annotation in any format and returns
+            an Annotation object
+    """
+
+    unk_token_regex = re.compile("(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        default_to_notebook: bool = True,
+        annotation_converter: Optional[Callable[[Any], Annotation]] = None,
+    ):
+        if default_to_notebook:
+            try:
+                from IPython.core.display import HTML, display
+            except ImportError:
+                raise Exception(
+                    """We couldn't import IPython utils for html display.
+                        Are you running in a notebook?
+                        You can also pass `default_to_notebook=False` to get back raw HTML
+                    """
+                )
+
+        self.tokenizer = tokenizer
+        self.default_to_notebook = default_to_notebook
+        self.annotation_coverter = annotation_converter
+        pass
+
+    def __call__(
+        self,
+        text: str,
+        annotations: AnnotationList = [],
+        default_to_notebook: Optional[bool] = None,
+    ) -> Optional[str]:
+        """
+        Build a visualization of the given text
+
+        Args:
+            text (:obj:`str`):
+                The text to tokenize
+
+            annotations (:obj:`List[Annotation]`, `optional`):
+                An optional list of annotations of the text. The can either be an annotation class
+                or anything else if you instantiated the visualizer with a converter function
+
+            default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
+                If True, will render the html in a notebook. Otherwise returns an html string.
+
+        Returns:
+            The HTML string if default_to_notebook is False, otherwise (default) returns None and
+            renders the HTML in the notebook
+
+        """
+        final_default_to_notebook = self.default_to_notebook
+        if default_to_notebook is not None:
+            final_default_to_notebook = default_to_notebook
+        if final_default_to_notebook:
+            try:
+                from IPython.core.display import HTML, display
+            except ImportError:
+                raise Exception(
+                    """We couldn't import IPython utils for html display.
+                    Are you running in a notebook?"""
+                )
+        if self.annotation_coverter is not None:
+            annotations = list(map(self.annotation_coverter, annotations))
+        encoding = self.tokenizer.encode(text)
+        html = EncodingVisualizer.__make_html(text, encoding, annotations)
+        if final_default_to_notebook:
+            display(HTML(html))
+        else:
+            return html
+
+    @staticmethod
+    def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
+        """
+        Generates a color palette for all the labels in a given set of annotations
+
+        Args:
+          annotations (:obj:`Annotation`):
+            A list of annotations
+
+        Returns:
+            :obj:`dict`: A dictionary mapping labels to colors in HSL format
+        """
+        if len(annotations) == 0:
+            return {}
+        labels = set(map(lambda x: x.label, annotations))
+        num_labels = len(labels)
+        h_step = int(255 / num_labels)
+        if h_step < 20:
+            h_step = 20
+        s = 32
+        l = 64  # noqa: E741
+        h = 10
+        colors = {}
+
+        for label in sorted(labels):  # sort so we always get the same colors for a given set of labels
+            colors[label] = f"hsl({h},{s}%,{l}%"
+            h += h_step
+        return colors
+
+    @staticmethod
+    def consecutive_chars_to_html(
+        consecutive_chars_list: List[CharState],
+        text: str,
+        encoding: Encoding,
+    ):
+        """
+        Converts a list of "consecutive chars" into a single HTML element.
+        Chars are consecutive if they fall under the same word, token and annotation.
+        The CharState class is a named tuple with a "partition_key" method that makes it easy to
+        compare if two chars are consecutive.
+
+        Args:
+            consecutive_chars_list (:obj:`List[CharState]`):
+                A list of CharStates that have been grouped together
+
+            text (:obj:`str`):
+                The original text being processed
+
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding returned from the tokenizer
+
+        Returns:
+            :obj:`str`: The HTML span for a set of consecutive chars
+        """
+        first = consecutive_chars_list[0]
+        if first.char_ix is None:
+            # its a special token
+            stoken = encoding.tokens[first.token_ix]
+            # special tokens are represented as empty spans. We use the data attribute and css
+            # magic to display it
+            return f'<span class="special-token" data-stoken={stoken}></span>'
+        # We're not in a special token so this group has a start and end.
+        last = consecutive_chars_list[-1]
+        start = first.char_ix
+        end = last.char_ix + 1
+        span_text = text[start:end]
+        css_classes = []  # What css classes will we apply on the resulting span
+        data_items = {}  # What data attributes will we apply on the result span
+        if first.token_ix is not None:
+            # We can either be in a token or not (e.g. in white space)
+            css_classes.append("token")
+            if first.is_multitoken:
+                css_classes.append("multi-token")
+            if first.token_ix % 2:
+                # We use this to color alternating tokens.
+                # A token might be split by an annotation that ends in the middle of it, so this
+                # lets us visually indicate a consecutive token despite its possible splitting in
+                # the html markup
+                css_classes.append("odd-token")
+            else:
+                # Like above, but a different color so we can see the tokens alternate
+                css_classes.append("even-token")
+            if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
+                # This is a special token that is in the text. probably UNK
+                css_classes.append("special-token")
+                # TODO is this the right name for the data attribute ?
+                data_items["stok"] = encoding.tokens[first.token_ix]
+        else:
+            # In this case we are looking at a group/single char that is not tokenized.
+            # e.g. white space
+            css_classes.append("non-token")
+        css = f'''class="{' '.join(css_classes)}"'''
+        data = ""
+        for key, val in data_items.items():
+            data += f' data-{key}="{val}"'
+        return f"<span {css} {data} >{span_text}</span>"
+
+    @staticmethod
+    def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
+        char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
+        current_consecutive_chars = [char_states[0]]
+        prev_anno_ix = char_states[0].anno_ix
+        spans = []
+        label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
+        cur_anno_ix = char_states[0].anno_ix
+        if cur_anno_ix is not None:
+            # If we started in an  annotation make a span for it
+            anno = annotations[cur_anno_ix]
+            label = anno.label
+            color = label_colors_dict[label]
+            spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+
+        for cs in char_states[1:]:
+            cur_anno_ix = cs.anno_ix
+            if cur_anno_ix != prev_anno_ix:
+                # If we've transitioned in or out of an annotation
+                spans.append(
+                    # Create a span from the current consecutive characters
+                    EncodingVisualizer.consecutive_chars_to_html(
+                        current_consecutive_chars,
+                        text=text,
+                        encoding=encoding,
+                    )
+                )
+                current_consecutive_chars = [cs]
+
+                if prev_anno_ix is not None:
+                    # if we transitioned out of an annotation close it's span
+                    spans.append("</span>")
+                if cur_anno_ix is not None:
+                    # If we entered a new annotation make a span for it
+                    anno = annotations[cur_anno_ix]
+                    label = anno.label
+                    color = label_colors_dict[label]
+                    spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+            prev_anno_ix = cur_anno_ix
+
+            if cs.partition_key() == current_consecutive_chars[0].partition_key():
+                # If the current charchter is in the same "group" as the previous one
+                current_consecutive_chars.append(cs)
+            else:
+                # Otherwise we make a span for the previous group
+                spans.append(
+                    EncodingVisualizer.consecutive_chars_to_html(
+                        current_consecutive_chars,
+                        text=text,
+                        encoding=encoding,
+                    )
+                )
+                # An reset the consecutive_char_list to form a new group
+                current_consecutive_chars = [cs]
+        # All that's left is to fill out the final span
+        # TODO I think there is an edge case here where an annotation's span might not close
+        spans.append(
+            EncodingVisualizer.consecutive_chars_to_html(
+                current_consecutive_chars,
+                text=text,
+                encoding=encoding,
+            )
+        )
+        res = HTMLBody(spans)  # Send the list of spans to the body of our html
+        return res
+
+    @staticmethod
+    def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
+        """
+        Args:
+            text (:obj:`str`):
+                The raw text we want to align to
+
+            annotations (:obj:`AnnotationList`):
+                A (possibly empty) list of annotations
+
+        Returns:
+            A list of  length len(text) whose entry at index i is None if there is no annotation on
+            charachter i or k, the index of the annotation that covers index i where k is with
+            respect to the list of annotations
+        """
+        annotation_map = [None] * len(text)
+        for anno_ix, a in enumerate(annotations):
+            for i in range(a.start, a.end):
+                annotation_map[i] = anno_ix
+        return annotation_map
+
+    @staticmethod
+    def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
+        """
+        For each character in the original text, we emit a tuple representing it's "state":
+
+            * which token_ix it corresponds to
+            * which word_ix it corresponds to
+            * which annotation_ix it corresponds to
+
+        Args:
+            text (:obj:`str`):
+                The raw text we want to align to
+
+            annotations (:obj:`List[Annotation]`):
+                A (possibly empty) list of annotations
+
+            encoding: (:class:`~tokenizers.Encoding`):
+                The encoding returned from the tokenizer
+
+        Returns:
+            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
+            it's state is
+        """
+        annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
+        # Todo make this a dataclass or named tuple
+        char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
+        for token_ix, token in enumerate(encoding.tokens):
+            offsets = encoding.token_to_chars(token_ix)
+            if offsets is not None:
+                start, end = offsets
+                for i in range(start, end):
+                    char_states[i].tokens.append(token_ix)
+        for char_ix, anno_ix in enumerate(annotation_map):
+            char_states[char_ix].anno_ix = anno_ix
+
+        return char_states
+
+
+def HTMLBody(children: List[str], css_styles=css) -> str:
+    """
+    Generates the full html with css from a list of html spans
+
+    Args:
+        children (:obj:`List[str]`):
+            A list of strings, assumed to be html elements
+
+        css_styles (:obj:`str`, `optional`):
+            Optional alternative implementation of the css
+
+    Returns:
+        :obj:`str`: An HTML string with style markup
+    """
+    children_text = "".join(children)
+    return f"""
+    <html>
+        <head>
+            <style>
+                {css_styles}
+            </style>
+        </head>
+        <body>
+            <div class="tokenized-text" dir=auto>
+            {children_text}
+            </div>
+        </body>
+    </html>
+    """