author     S. Solomon Darnell  2025-03-28 21:52:21 -0500
committer  S. Solomon Darnell  2025-03-28 21:52:21 -0500
commit     4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree       ee3dc5af3b6313e921cd920906356f5d4febc4ed  /.venv/lib/python3.12/site-packages/tokenizers
parent     cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download   gn-ai-master.tar.gz
two versions of R2R are here (HEAD, master)
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers')
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/__init__.py                                100
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/__init__.pyi                              1200
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py                        14
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi                      271
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py                  6
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py          418
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py          151
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py          122
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py          150
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py       103
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py   196
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/models/__init__.py                           8
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi                        591
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py                     29
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi                   595
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py                  15
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi                607
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py                       9
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi                    342
-rwxr-xr-x  .venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so  bin 0 -> 11826456 bytes
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py                            1
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css                170
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py                        403
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py                         8
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi                      156
25 files changed, 5665 insertions(+), 0 deletions(-)
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py
new file mode 100644
index 00000000..efd57429
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py
@@ -0,0 +1,100 @@
+from enum import Enum
+from typing import List, Tuple, Union
+
+
+Offsets = Tuple[int, int]
+
+TextInputSequence = str
+"""A :obj:`str` that represents an input sequence """
+
+PreTokenizedInputSequence = Union[List[str], Tuple[str]]
+"""A pre-tokenized input sequence. Can be one of:
+
+    - A :obj:`List` of :obj:`str`
+    - A :obj:`Tuple` of :obj:`str`
+"""
+
+TextEncodeInput = Union[
+    TextInputSequence,
+    Tuple[TextInputSequence, TextInputSequence],
+    List[TextInputSequence],
+]
+"""Represents a textual input for encoding. Can be either:
+
+    - A single sequence: :data:`~tokenizers.TextInputSequence`
+    - A pair of sequences:
+
+      - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
+      - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
+"""
+
+PreTokenizedEncodeInput = Union[
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+    List[PreTokenizedInputSequence],
+]
+"""Represents a pre-tokenized input for encoding. Can be either:
+
+    - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
+    - A pair of sequences:
+
+      - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
+      - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
+"""
+
+InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+"""Represents all the possible types of input sequences for encoding. Can be:
+
+    - When ``is_pretokenized=False``: :data:`~TextInputSequence`
+    - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
+"""
+
+EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
+"""Represents all the possible types of input for encoding. Can be:
+
+    - When ``is_pretokenized=False``: :data:`~TextEncodeInput`
+    - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
+"""
+
+
+class OffsetReferential(Enum):
+    ORIGINAL = "original"
+    NORMALIZED = "normalized"
+
+
+class OffsetType(Enum):
+    BYTE = "byte"
+    CHAR = "char"
+
+
+class SplitDelimiterBehavior(Enum):
+    REMOVED = "removed"
+    ISOLATED = "isolated"
+    MERGED_WITH_PREVIOUS = "merged_with_previous"
+    MERGED_WITH_NEXT = "merged_with_next"
+    CONTIGUOUS = "contiguous"
+
+
+from .tokenizers import (
+    AddedToken,
+    Encoding,
+    NormalizedString,
+    PreTokenizedString,
+    Regex,
+    Token,
+    Tokenizer,
+    decoders,
+    models,
+    normalizers,
+    pre_tokenizers,
+    processors,
+    trainers,
+    __version__,
+)
+from .implementations import (
+    BertWordPieceTokenizer,
+    ByteLevelBPETokenizer,
+    CharBPETokenizer,
+    SentencePieceBPETokenizer,
+    SentencePieceUnigramTokenizer,
+)
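The aliases above only name the accepted input shapes; they carry no runtime behaviour of their own. As a rough sketch of how those shapes map onto `Tokenizer.encode` (the `tokenizer.json` path is a placeholder for any serialized tokenizer):

    from tokenizers import Tokenizer

    tok = Tokenizer.from_file("tokenizer.json")  # placeholder path

    # TextInputSequence: a plain str
    tok.encode("A single sequence")

    # TextEncodeInput: a pair of sequences
    tok.encode("A sequence", "And its pair")

    # PreTokenizedInputSequence: a list (or tuple) of str, flagged as pre-tokenized
    tok.encode(["A", "pre", "tokenized", "sequence"], is_pretokenized=True)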
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi
new file mode 100644
index 00000000..5dbc665d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi
@@ -0,0 +1,1200 @@
+# Generated content DO NOT EDIT
+class AddedToken:
+    """
+    Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
+    It can have special options that define the way it should behave.
+
+    Args:
+        content (:obj:`str`): The content of the token
+
+        single_word (:obj:`bool`, defaults to :obj:`False`):
+            Defines whether this token should only match single words. If :obj:`True`, this
+            token will never match inside of a word. For example the token ``ing`` would match
+            on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
+            The notion of "`inside of a word`" is defined by the word boundaries pattern in
+            regular expressions (ie. the token should start and end with word boundaries).
+
+        lstrip (:obj:`bool`, defaults to :obj:`False`):
+            Defines whether this token should strip all potential whitespaces on its left side.
+            If :obj:`True`, this token will greedily match any whitespace on its left. For
+            example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
+            ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
+
+        rstrip (:obj:`bool`, defaults to :obj:`False`):
+            Defines whether this token should strip all potential whitespaces on its right
+            side. If :obj:`True`, this token will greedily match any whitespace on its right.
+            It works just like :obj:`lstrip` but on the right.
+
+        normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
+            Defines whether this token should match against the normalized version of the input
+            text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
+            lowercasing the text, the token could be extracted from the input ``"I saw a lion
+            Yesterday"``.
+        special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
+            Defines whether this token should be skipped when decoding.
+
+    """
+    def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
+        pass
+
+    @property
+    def content(self):
+        """
+        Get the content of this :obj:`AddedToken`
+        """
+        pass
+
+    @property
+    def lstrip(self):
+        """
+        Get the value of the :obj:`lstrip` option
+        """
+        pass
+
+    @property
+    def normalized(self):
+        """
+        Get the value of the :obj:`normalized` option
+        """
+        pass
+
+    @property
+    def rstrip(self):
+        """
+        Get the value of the :obj:`rstrip` option
+        """
+        pass
+
+    @property
+    def single_word(self):
+        """
+        Get the value of the :obj:`single_word` option
+        """
+        pass
+
+    @property
+    def special(self):
+        """
+        Get the value of the :obj:`special` option
+        """
+        pass
+
+class Encoding:
+    """
+    The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
+    """
+    @property
+    def attention_mask(self):
+        """
+        The attention mask
+
+        This indicates to the LM which tokens should be attended to, and which should not.
+        This is especially important when batching sequences, where we need to apply
+        padding.
+
+        Returns:
+           :obj:`List[int]`: The attention mask
+        """
+        pass
+
+    def char_to_token(self, char_pos, sequence_index=0):
+        """
+        Get the token that contains the char at the given position in the input sequence.
+
+        Args:
+            char_pos (:obj:`int`):
+                The position of a char in the input string
+            sequence_index (:obj:`int`, defaults to :obj:`0`):
+                The index of the sequence that contains the target char
+
+        Returns:
+            :obj:`int`: The index of the token that contains this char in the encoded sequence
+        """
+        pass
+
+    def char_to_word(self, char_pos, sequence_index=0):
+        """
+        Get the word that contains the char at the given position in the input sequence.
+
+        Args:
+            char_pos (:obj:`int`):
+                The position of a char in the input string
+            sequence_index (:obj:`int`, defaults to :obj:`0`):
+                The index of the sequence that contains the target char
+
+        Returns:
+            :obj:`int`: The index of the word that contains this char in the input sequence
+        """
+        pass
+
+    @property
+    def ids(self):
+        """
+        The generated IDs
+
+        The IDs are the main input to a Language Model. They are the token indices,
+        the numerical representations that a LM understands.
+
+        Returns:
+            :obj:`List[int]`: The list of IDs
+        """
+        pass
+
+    @staticmethod
+    def merge(encodings, growing_offsets=True):
+        """
+        Merge the list of encodings into one final :class:`~tokenizers.Encoding`
+
+        Args:
+            encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
+                The list of encodings that should be merged in one
+
+            growing_offsets (:obj:`bool`, defaults to :obj:`True`):
+                Whether the offsets should accumulate while merging
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The resulting Encoding
+        """
+        pass
+
+    @property
+    def n_sequences(self):
+        """
+        The number of sequences represented
+
+        Returns:
+            :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
+        """
+        pass
+
+    @property
+    def offsets(self):
+        """
+        The offsets associated to each token
+
+        These offsets let you slice the input string, and thus retrieve the original
+        part that led to producing the corresponding token.
+
+        Returns:
+            A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
+        """
+        pass
+
+    @property
+    def overflowing(self):
+        """
+        A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
+
+        When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
+        the output into as many pieces as required to match the specified maximum length.
+        This field lets you retrieve all the subsequent pieces.
+
+        When you use pairs of sequences, the overflowing pieces will contain enough
+        variations to cover all the possible combinations, while respecting the provided
+        maximum length.
+        """
+        pass
+
+    def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
+        """
+        Pad the :class:`~tokenizers.Encoding` at the given length
+
+        Args:
+            length (:obj:`int`):
+                The desired length
+
+            direction (:obj:`str`, defaults to :obj:`right`):
+                The expected padding direction. Can be either :obj:`right` or :obj:`left`
+
+            pad_id (:obj:`int`, defaults to :obj:`0`):
+                The ID corresponding to the padding token
+
+            pad_type_id (:obj:`int`, defaults to :obj:`0`):
+                The type ID corresponding to the padding token
+
+            pad_token (:obj:`str`, defaults to `[PAD]`):
+                The pad token to use
+        """
+        pass
+
+    @property
+    def sequence_ids(self):
+        """
+        The generated sequence indices.
+
+        They represent the index of the input sequence associated to each token.
+        The sequence id can be None if the token is not related to any input sequence,
+        like for example with special tokens.
+
+        Returns:
+            A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence indices.
+        """
+        pass
+
+    def set_sequence_id(self, sequence_id):
+        """
+        Set the given sequence index
+
+        Set the given sequence index for the whole range of tokens contained in this
+        :class:`~tokenizers.Encoding`.
+        """
+        pass
+
+    @property
+    def special_tokens_mask(self):
+        """
+        The special token mask
+
+        This indicates which tokens are special tokens, and which are not.
+
+        Returns:
+            :obj:`List[int]`: The special tokens mask
+        """
+        pass
+
+    def token_to_chars(self, token_index):
+        """
+        Get the offsets of the token at the given index.
+
+        The returned offsets are related to the input sequence that contains the
+        token.  In order to determine in which input sequence it belongs, you
+        must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
+
+        Args:
+            token_index (:obj:`int`):
+                The index of a token in the encoded sequence.
+
+        Returns:
+            :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
+        """
+        pass
+
+    def token_to_sequence(self, token_index):
+        """
+        Get the index of the sequence represented by the given token.
+
+        In the general use case, this method returns :obj:`0` for a single sequence or
+        the first sequence of a pair, and :obj:`1` for the second sequence of a pair
+
+        Args:
+            token_index (:obj:`int`):
+                The index of a token in the encoded sequence.
+
+        Returns:
+            :obj:`int`: The sequence id of the given token
+        """
+        pass
+
+    def token_to_word(self, token_index):
+        """
+        Get the index of the word that contains the token in one of the input sequences.
+
+        The returned word index is related to the input sequence that contains
+        the token.  In order to determine in which input sequence it belongs, you
+        must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
+
+        Args:
+            token_index (:obj:`int`):
+                The index of a token in the encoded sequence.
+
+        Returns:
+            :obj:`int`: The index of the word in the relevant input sequence.
+        """
+        pass
+
+    @property
+    def tokens(self):
+        """
+        The generated tokens
+
+        They are the string representation of the IDs.
+
+        Returns:
+            :obj:`List[str]`: The list of tokens
+        """
+        pass
+
+    def truncate(self, max_length, stride=0, direction="right"):
+        """
+        Truncate the :class:`~tokenizers.Encoding` at the given length
+
+        If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating
+        this information is lost. It will be considered as representing a single sequence.
+
+        Args:
+            max_length (:obj:`int`):
+                The desired length
+
+            stride (:obj:`int`, defaults to :obj:`0`):
+                The length of previous content to be included in each overflowing piece
+
+            direction (:obj:`str`, defaults to :obj:`right`):
+                Truncate direction
+        """
+        pass
+
+    @property
+    def type_ids(self):
+        """
+        The generated type IDs
+
+        Generally used for tasks like sequence classification or question answering,
+        these tokens let the LM know which input sequence corresponds to each token.
+
+        Returns:
+            :obj:`List[int]`: The list of type ids
+        """
+        pass
+
+    @property
+    def word_ids(self):
+        """
+        The generated word indices.
+
+        They represent the index of the word associated to each token.
+        When the input is pre-tokenized, they correspond to the ID of the given input label,
+        otherwise they correspond to the word indices as defined by the
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
+
+        For special tokens and such (any token that was generated from something that was
+        not part of the input), the output is :obj:`None`
+
+        Returns:
+            A :obj:`List` of :obj:`Optional[int]`: A list of optional word indices.
+        """
+        pass
+
+    def word_to_chars(self, word_index, sequence_index=0):
+        """
+        Get the offsets of the word at the given index in one of the input sequences.
+
+        Args:
+            word_index (:obj:`int`):
+                The index of a word in one of the input sequences.
+            sequence_index (:obj:`int`, defaults to :obj:`0`):
+                The index of the sequence that contains the target word
+
+        Returns:
+            :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
+        """
+        pass
+
+    def word_to_tokens(self, word_index, sequence_index=0):
+        """
+        Get the encoded tokens corresponding to the word at the given index
+        in one of the input sequences.
+
+        Args:
+            word_index (:obj:`int`):
+                The index of a word in one of the input sequences.
+            sequence_index (:obj:`int`, defaults to :obj:`0`):
+                The index of the sequence that contains the target word
+
+        Returns:
+            :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
+        """
+        pass
+
+    @property
+    def words(self):
+        """
+        The generated word indices.
+
+        .. warning::
+            This is deprecated and will be removed in a future version.
+            Please use :obj:`~tokenizers.Encoding.word_ids` instead.
+
+        They represent the index of the word associated to each token.
+        When the input is pre-tokenized, they correspond to the ID of the given input label,
+        otherwise they correspond to the word indices as defined by the
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
+
+        For special tokens and such (any token that was generated from something that was
+        not part of the input), the output is :obj:`None`
+
+        Returns:
+            A :obj:`List` of :obj:`Optional[int]`: A list of optional word indices.
+        """
+        pass
+
+class NormalizedString:
+    """
+    NormalizedString
+
+    A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
+    While making all the requested modifications, it keeps track of the alignment information
+    between the two versions of the string.
+
+    Args:
+        sequence: str:
+            The string sequence used to initialize this NormalizedString
+    """
+    def append(self, s):
+        """
+        Append the given sequence to the string
+        """
+        pass
+
+    def clear(self):
+        """
+        Clears the string
+        """
+        pass
+
+    def filter(self, func):
+        """
+        Filter each character of the string using the given func
+        """
+        pass
+
+    def for_each(self, func):
+        """
+        Calls the given function for each character of the string
+        """
+        pass
+
+    def lowercase(self):
+        """
+        Lowercase the string
+        """
+        pass
+
+    def lstrip(self):
+        """
+        Strip the left of the string
+        """
+        pass
+
+    def map(self, func):
+        """
+        Calls the given function for each character of the string
+
+        Replaces each character of the string using the returned value. Each
+        returned value **must** be a str of length 1 (ie a character).
+        """
+        pass
+
+    def nfc(self):
+        """
+        Runs the NFC normalization
+        """
+        pass
+
+    def nfd(self):
+        """
+        Runs the NFD normalization
+        """
+        pass
+
+    def nfkc(self):
+        """
+        Runs the NFKC normalization
+        """
+        pass
+
+    def nfkd(self):
+        """
+        Runs the NFKD normalization
+        """
+        pass
+
+    @property
+    def normalized(self):
+        """
+        The normalized part of the string
+        """
+        pass
+
+    def prepend(self, s):
+        """
+        Prepend the given sequence to the string
+        """
+        pass
+
+    def replace(self, pattern, content):
+        """
+        Replace the content of the given pattern with the provided content
+
+        Args:
+            pattern: Pattern:
+                A pattern used to match the string. Usually a string or a Regex
+
+            content: str:
+                The content to be used as replacement
+        """
+        pass
+
+    def rstrip(self):
+        """
+        Strip the right of the string
+        """
+        pass
+
+    def slice(self, range):
+        """
+        Slice the string using the given range
+        """
+        pass
+
+    def split(self, pattern, behavior):
+        """
+        Split the NormalizedString using the given pattern and the specified behavior
+
+        Args:
+            pattern: Pattern:
+                A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
+
+            behavior: SplitDelimiterBehavior:
+                The behavior to use when splitting.
+                Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+                "contiguous"
+
+        Returns:
+            A list of NormalizedString, representing each split
+        """
+        pass
+
+    def strip(self):
+        """
+        Strip both ends of the string
+        """
+        pass
+
+    def uppercase(self):
+        """
+        Uppercase the string
+        """
+        pass
+
+class PreTokenizedString:
+    """
+    PreTokenizedString
+
+    Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
+    underlying string, while keeping track of the alignment information (offsets).
+
+    The PreTokenizedString manages what we call `splits`. Each split represents a substring
+    which is a subpart of the original string, with the relevant offsets and tokens.
+
+    When calling one of the methods used to modify the PreTokenizedString (namely one of
+    `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
+    tokens will get modified.
+
+    Args:
+        sequence: str:
+            The string sequence used to initialize this PreTokenizedString
+    """
+    def __init__(self, sequence):
+        pass
+
+    def get_splits(self, offset_referential="original", offset_type="char"):
+        """
+        Get the splits currently managed by the PreTokenizedString
+
+        Args:
+            offset_referential: :obj:`str`
+                Whether the returned splits should have offsets expressed relative
+                to the original string, or the normalized one. choices: "original", "normalized".
+
+            offset_type: :obj:`str`
+                Whether the returned splits should have offsets expressed in bytes or chars.
+                When slicing a str, we usually want to use chars, which is the default value.
+                Now in some cases it might be interesting to get these offsets expressed in bytes,
+                so it is possible to change this here.
+                choices: "char", "bytes"
+
+        Returns:
+            A list of splits
+        """
+        pass
+
+    def normalize(self, func):
+        """
+        Normalize each split of the `PreTokenizedString` using the given `func`
+
+        Args:
+            func: Callable[[NormalizedString], None]:
+                The function used to normalize each underlying split. This function
+                does not need to return anything; just calling the methods on the provided
+                NormalizedString allows its modification.
+        """
+        pass
+
+    def split(self, func):
+        """
+        Split the PreTokenizedString using the given `func`
+
+        Args:
+            func: Callable[[index, NormalizedString], List[NormalizedString]]:
+                The function used to split each underlying split.
+                It is expected to return a list of `NormalizedString`, that represent the new
+                splits. If the given `NormalizedString` does not need any splitting, we can
+                just return it directly.
+                In order for the offsets to be tracked accurately, any returned `NormalizedString`
+                should come from calling either `.split` or `.slice` on the received one.
+        """
+        pass
+
+    def to_encoding(self, type_id=0, word_idx=None):
+        """
+        Return an Encoding generated from this PreTokenizedString
+
+        Args:
+            type_id: int = 0:
+                The type_id to be used on the generated Encoding.
+
+            word_idx: Optional[int] = None:
+                An optional word index to be used for each token of this Encoding. If provided,
+                all the word indices in the generated Encoding will use this value, instead
+                of the one automatically tracked during pre-tokenization.
+
+        Returns:
+            An Encoding
+        """
+        pass
+
+    def tokenize(self, func):
+        """
+        Tokenize each split of the `PreTokenizedString` using the given `func`
+
+        Args:
+            func: Callable[[str], List[Token]]:
+                The function used to tokenize each underlying split. This function must return
+                a list of Token generated from the input str.
+        """
+        pass
+
+class Regex:
+    """
+    Instantiate a new Regex with the given pattern
+    """
+    def __init__(self, pattern):
+        pass
+
+class Token:
+    pass
+
+class Tokenizer:
+    """
+    A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
+    and outputs an :class:`~tokenizers.Encoding`.
+
+    Args:
+        model (:class:`~tokenizers.models.Model`):
+            The core algorithm that this :obj:`Tokenizer` should be using.
+
+    """
+    def __init__(self, model):
+        pass
+
+    def add_special_tokens(self, tokens):
+        """
+        Add the given special tokens to the Tokenizer.
+
+        If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
+        them. If they don't exist, the Tokenizer creates them, giving them a new id.
+
+        These special tokens will never be processed by the model (ie won't be split into
+        multiple tokens), and they can be removed from the output when decoding.
+
+        Args:
+            tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
+                The list of special tokens we want to add to the vocabulary. Each token can either
+                be a string or an instance of :class:`~tokenizers.AddedToken` for more
+                customization.
+
+        Returns:
+            :obj:`int`: The number of tokens that were created in the vocabulary
+        """
+        pass
+
+    def add_tokens(self, tokens):
+        """
+        Add the given tokens to the vocabulary
+
+        The given tokens are added only if they don't already exist in the vocabulary.
+        Each token is then attributed a new id.
+
+        Args:
+            tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
+                The list of tokens we want to add to the vocabulary. Each token can be either a
+                string or an instance of :class:`~tokenizers.AddedToken` for more customization.
+
+        Returns:
+            :obj:`int`: The number of tokens that were created in the vocabulary
+        """
+        pass
+
+    def decode(self, ids, skip_special_tokens=True):
+        """
+        Decode the given list of ids back to a string
+
+        This is used to decode anything coming back from a Language Model
+
+        Args:
+            ids (A :obj:`List/Tuple` of :obj:`int`):
+                The list of ids that we want to decode
+
+            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether the special tokens should be removed from the decoded string
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+    def decode_batch(self, sequences, skip_special_tokens=True):
+        """
+        Decode a batch of ids back to their corresponding string
+
+        Args:
+            sequences (:obj:`List` of :obj:`List[int]`):
+                The batch of sequences we want to decode
+
+            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether the special tokens should be removed from the decoded strings
+
+        Returns:
+            :obj:`List[str]`: A list of decoded strings
+        """
+        pass
+
+    @property
+    def decoder(self):
+        """
+        The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
+        """
+        pass
+
+    def enable_padding(
+        self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
+    ):
+        """
+        Enable the padding
+
+        Args:
+            direction (:obj:`str`, `optional`, defaults to :obj:`right`):
+                The direction in which to pad. Can be either ``right`` or ``left``
+
+            pad_to_multiple_of (:obj:`int`, `optional`):
+                If specified, the padding length should always snap to the next multiple of the
+                given value. For example if we were going to pad with a length of 250 but
+                ``pad_to_multiple_of=8`` then we will pad to 256.
+
+            pad_id (:obj:`int`, defaults to 0):
+                The id to be used when padding
+
+            pad_type_id (:obj:`int`, defaults to 0):
+                The type id to be used when padding
+
+            pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
+                The pad token to be used when padding
+
+            length (:obj:`int`, `optional`):
+                If specified, the length at which to pad. If not specified we pad using the size of
+                the longest sequence in a batch.
+        """
+        pass
+
+    def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
+        """
+        Enable truncation
+
+        Args:
+            max_length (:obj:`int`):
+                The max length at which to truncate
+
+            stride (:obj:`int`, `optional`):
+                The length of the previous first sequence to be included in the overflowing
+                sequence
+
+            strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
+                The strategy to use for truncation. Can be one of ``longest_first``, ``only_first`` or
+                ``only_second``.
+
+            direction (:obj:`str`, defaults to :obj:`right`):
+                Truncate direction
+        """
+        pass
+
+    def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
+        """
+        Encode the given sequence and pair. This method can process raw text sequences
+        as well as already pre-tokenized sequences.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode("A single sequence")
+                encode("A sequence", "And its pair")
+                encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
+                encode(
+                    [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
+                    is_pretokenized=True
+                )
+
+        Args:
+            sequence (:obj:`~tokenizers.InputSequence`):
+                The main input sequence we want to encode. This sequence can be either raw
+                text or pre-tokenized, according to the ``is_pretokenized`` argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
+
+            pair (:obj:`~tokenizers.InputSequence`, `optional`):
+                An optional input sequence. The expected format is the same as for ``sequence``.
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The encoded result
+
+        """
+        pass
+
+    def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Encode the given batch of inputs. This method accepts both raw text sequences
+        as well as already pre-tokenized sequences.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode_batch([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
+    @property
+    def encode_special_tokens(self):
+        """
+        Modifies whether the tokenizer uses the special tokens
+        during encoding.
+
+        Args:
+            value (:obj:`bool`):
+                Whether to use the special tokens or not
+
+        """
+        pass
+
+    @staticmethod
+    def from_buffer(buffer):
+        """
+        Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
+
+        Args:
+            buffer (:obj:`bytes`):
+                A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
+
+        Returns:
+            :class:`~tokenizers.Tokenizer`: The new tokenizer
+        """
+        pass
+
+    @staticmethod
+    def from_file(path):
+        """
+        Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
+
+        Args:
+            path (:obj:`str`):
+                A path to a local JSON file representing a previously serialized
+                :class:`~tokenizers.Tokenizer`
+
+        Returns:
+            :class:`~tokenizers.Tokenizer`: The new tokenizer
+        """
+        pass
+
+    @staticmethod
+    def from_pretrained(identifier, revision="main", auth_token=None):
+        """
+        Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
+        Hugging Face Hub.
+
+        Args:
+            identifier (:obj:`str`):
+                The identifier of a Model on the Hugging Face Hub, that contains
+                a tokenizer.json file
+            revision (:obj:`str`, defaults to `main`):
+                A branch or commit id
+            auth_token (:obj:`str`, `optional`, defaults to `None`):
+                An optional auth token used to access private repositories on the
+                Hugging Face Hub
+
+        Returns:
+            :class:`~tokenizers.Tokenizer`: The new tokenizer
+        """
+        pass
+
+    @staticmethod
+    def from_str(json):
+        """
+        Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
+
+        Args:
+            json (:obj:`str`):
+                A valid JSON string representing a previously serialized
+                :class:`~tokenizers.Tokenizer`
+
+        Returns:
+            :class:`~tokenizers.Tokenizer`: The new tokenizer
+        """
+        pass
+
+    def get_added_tokens_decoder(self):
+        """
+        Get the added tokens, keyed by their id
+
+        Returns:
+            :obj:`Dict[int, AddedToken]`: The added tokens
+        """
+        pass
+
+    def get_vocab(self, with_added_tokens=True):
+        """
+        Get the underlying vocabulary
+
+        Args:
+            with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to include the added tokens
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary
+        """
+        pass
+
+    def get_vocab_size(self, with_added_tokens=True):
+        """
+        Get the size of the underlying vocabulary
+
+        Args:
+            with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to include the added tokens
+
+        Returns:
+            :obj:`int`: The size of the vocabulary
+        """
+        pass
+
+    def id_to_token(self, id):
+        """
+        Convert the given id to its corresponding token if it exists
+
+        Args:
+            id (:obj:`int`):
+                The id to convert
+
+        Returns:
+            :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
+        """
+        pass
+
+    @property
+    def model(self):
+        """
+        The :class:`~tokenizers.models.Model` in use by the Tokenizer
+        """
+        pass
+
+    def no_padding(self):
+        """
+        Disable padding
+        """
+        pass
+
+    def no_truncation(self):
+        """
+        Disable truncation
+        """
+        pass
+
+    @property
+    def normalizer(self):
+        """
+        The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
+        """
+        pass
+
+    def num_special_tokens_to_add(self, is_pair):
+        """
+        Return the number of special tokens that would be added for single/pair sentences.
+        :param is_pair: Boolean indicating whether the input is a pair of sequences or a single one
+        :return: The number of special tokens that would be added
+        """
+        pass
+
+    @property
+    def padding(self):
+        """
+        Get the current padding parameters
+
+        `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
+
+        Returns:
+            (:obj:`dict`, `optional`):
+                A dict with the current padding parameters if padding is enabled
+        """
+        pass
+
+    def post_process(self, encoding, pair=None, add_special_tokens=True):
+        """
+        Apply all the post-processing steps to the given encodings.
+
+        The various steps are:
+
+            1. Truncate according to the set truncation params (provided with
+               :meth:`~tokenizers.Tokenizer.enable_truncation`)
+            2. Apply the :class:`~tokenizers.processors.PostProcessor`
+            3. Pad according to the set padding params (provided with
+               :meth:`~tokenizers.Tokenizer.enable_padding`)
+
+        Args:
+            encoding (:class:`~tokenizers.Encoding`):
+                The :class:`~tokenizers.Encoding` corresponding to the main sequence.
+
+            pair (:class:`~tokenizers.Encoding`, `optional`):
+                An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
+
+            add_special_tokens (:obj:`bool`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The final post-processed encoding
+        """
+        pass
+
+    @property
+    def post_processor(self):
+        """
+        The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
+        """
+        pass
+
+    @property
+    def pre_tokenizer(self):
+        """
+        The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
+        """
+        pass
+
+    def save(self, path, pretty=True):
+        """
+        Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
+
+        Args:
+            path (:obj:`str`):
+                A path to a file in which to save the serialized tokenizer.
+
+            pretty (:obj:`bool`, defaults to :obj:`True`):
+                Whether the JSON file should be pretty formatted.
+        """
+        pass
+
+    def to_str(self, pretty=False):
+        """
+        Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
+
+        Args:
+            pretty (:obj:`bool`, defaults to :obj:`False`):
+                Whether the JSON string should be pretty formatted.
+
+        Returns:
+            :obj:`str`: A string representing the serialized Tokenizer
+        """
+        pass
+
+    def token_to_id(self, token):
+        """
+        Convert the given token to its corresponding id if it exists
+
+        Args:
+            token (:obj:`str`):
+                The token to convert
+
+        Returns:
+            :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
+        """
+        pass
+
+    def train(self, files, trainer=None):
+        """
+        Train the Tokenizer using the given files.
+
+        Reads the files line by line, while keeping all the whitespace, even new lines.
+        If you want to train from data stored in memory, you can check
+        :meth:`~tokenizers.Tokenizer.train_from_iterator`
+
+        Args:
+            files (:obj:`List[str]`):
+                A list of paths to the files that we should use for training
+
+            trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
+                An optional trainer that should be used to train our Model
+        """
+        pass
+
+    def train_from_iterator(self, iterator, trainer=None, length=None):
+        """
+        Train the Tokenizer using the provided iterator.
+
+        You can provide anything that is a Python Iterator
+
+            * A list of sequences :obj:`List[str]`
+            * A generator that yields :obj:`str` or :obj:`List[str]`
+            * A Numpy array of strings
+            * ...
+
+        Args:
+            iterator (:obj:`Iterator`):
+                Any iterator over strings or list of strings
+
+            trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
+                An optional trainer that should be used to train our Model
+
+            length (:obj:`int`, `optional`):
+                The total number of sequences in the iterator. This is used to
+                provide meaningful progress tracking
+        """
+        pass
+
+    @property
+    def truncation(self):
+        """
+        Get the currently set truncation parameters
+
+        `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
+
+        Returns:
+            (:obj:`dict`, `optional`):
+                A dict with the current truncation parameters if truncation is enabled
+        """
+        pass
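The stubs above describe the full `Tokenizer` surface. A minimal sketch tying the pieces together, assuming network access to the Hugging Face Hub and using "bert-base-uncased" only as an example of a repo that ships a tokenizer.json:

    from tokenizers import AddedToken, Tokenizer

    tok = Tokenizer.from_pretrained("bert-base-uncased")  # example identifier

    # Register an extra token and configure truncation/padding.
    tok.add_tokens([AddedToken("[CUSTOM]", single_word=True)])
    tok.enable_truncation(max_length=128)
    tok.enable_padding(pad_token="[PAD]", pad_id=tok.token_to_id("[PAD]"))

    # encode_batch accepts single sequences and pairs, as in the docstring example.
    encodings = tok.encode_batch([
        "A single sequence",
        ("A tuple with a sequence", "And its pair"),
    ])
    first = encodings[0]
    print(first.tokens, first.ids, first.attention_mask)
    print(tok.decode(first.ids, skip_special_tokens=True))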
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py
new file mode 100644
index 00000000..a717379c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py
@@ -0,0 +1,14 @@
+from .. import decoders
+
+
+Decoder = decoders.Decoder
+ByteLevel = decoders.ByteLevel
+Replace = decoders.Replace
+WordPiece = decoders.WordPiece
+ByteFallback = decoders.ByteFallback
+Fuse = decoders.Fuse
+Strip = decoders.Strip
+Metaspace = decoders.Metaspace
+BPEDecoder = decoders.BPEDecoder
+CTC = decoders.CTC
+Sequence = decoders.Sequence
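This module simply re-exports the Rust-backed decoder classes under a stable Python path, so both spellings below refer to the same class (a small sketch; the property assignment at the end is an assumption about typical usage, not shown in this file):

    import tokenizers
    from tokenizers.decoders import ByteLevel

    assert ByteLevel is tokenizers.decoders.ByteLevel
    # Typically attached to a Tokenizer instance, e.g. tok.decoder = ByteLevel()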
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi
new file mode 100644
index 00000000..b967fbd1
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi
@@ -0,0 +1,271 @@
+# Generated content DO NOT EDIT
+class Decoder:
+    """
+    Base class for all decoders
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a Decoder will return an instance of this class when instantiated.
+    """
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class BPEDecoder(Decoder):
+    """
+    BPEDecoder Decoder
+
+    Args:
+        suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
+            The suffix that was used to characterize an end-of-word. This suffix will
+            be replaced by whitespaces during the decoding
+    """
+    def __init__(self, suffix="</w>"):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class ByteFallback(Decoder):
+    """
+    ByteFallback Decoder
+    ByteFallback is a simple trick which converts tokens looking like `<0x61>`
+    to pure bytes, and attempts to make them into a string. If the tokens
+    cannot be decoded you will get � instead for each inconvertible byte token
+
+    """
+    def __init__(self):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class ByteLevel(Decoder):
+    """
+    ByteLevel Decoder
+
+    This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
+    :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
+    """
+    def __init__(self):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class CTC(Decoder):
+    """
+    CTC Decoder
+
+    Args:
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
+            The pad token used by CTC to delimit a new token.
+        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
+            The word delimiter token. It will be replaced by a <space>
+        cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to cleanup some tokenization artifacts.
+            Mainly spaces before punctuation, and some abbreviated English forms.
+    """
+    def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class Fuse(Decoder):
+    """
+    Fuse Decoder
+    Fuse simply fuses every token into a single string.
+    This is the last step of decoding; this decoder exists only if
+    there is a need to add other decoders *after* the fusion
+    """
+    def __init__(self):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class Metaspace(Decoder):
+    """
+    Metaspace Decoder
+
+    Args:
+        replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
+            The replacement character. Must be exactly one character. By default we
+            use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
+
+        prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
+            Whether to add a space to the first word if there isn't already one. This
+            lets us treat `hello` exactly like `say hello`.
+            Choices: "always", "never", "first". First means the space is only added on the first
+            token (relevant when special tokens are used or other pre_tokenizers are used).
+    """
+    def __init__(self, replacement="▁", prepend_scheme="always", split=True):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class Replace(Decoder):
+    """
+    Replace Decoder
+
+    This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
+    :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
+    """
+    def __init__(self, pattern, content):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class Sequence(Decoder):
+    """
+    Sequence Decoder
+
+    Args:
+        decoders (:obj:`List[Decoder]`):
+            The decoders that need to be chained
+    """
+    def __init__(self, decoders):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class Strip(Decoder):
+    """
+    Strip Decoder
+    Strips n characters from the left of each token, or n characters from the right
+    """
+    def __init__(self, content, left=0, right=0):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
+
+class WordPiece(Decoder):
+    """
+    WordPiece Decoder
+
+    Args:
+        prefix (:obj:`str`, `optional`, defaults to :obj:`##`):
+            The prefix to use for subwords that are not a beginning-of-word
+
+        cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
+            and some abbreviated English forms.
+    """
+    def __init__(self, prefix="##", cleanup=True):
+        pass
+
+    def decode(self, tokens):
+        """
+        Decode the given list of tokens to a final string
+
+        Args:
+            tokens (:obj:`List[str]`):
+                The list of tokens to decode
+
+        Returns:
+            :obj:`str`: The decoded string
+        """
+        pass
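The decoder stubs above are easiest to read next to a concrete call. A small sketch (the token list and the chained decoders are illustrative, not a recommended pipeline):

    from tokenizers import decoders

    # Re-attach "##"-prefixed WordPiece pieces and clean up spaces before punctuation.
    wp = decoders.WordPiece(prefix="##", cleanup=True)
    print(wp.decode(["Hello", ",", "y", "##ou", "!"]))  # roughly "Hello, you!"

    # Decoders can be chained; Sequence applies them in order.
    chain = decoders.Sequence([decoders.Replace("_", " "), decoders.Fuse()])
    print(chain.decode(["Hello_world", "!"]))  # replacement applied, then tokens fused into one string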
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py
new file mode 100644
index 00000000..7e775892
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py
@@ -0,0 +1,6 @@
+from .base_tokenizer import BaseTokenizer
+from .bert_wordpiece import BertWordPieceTokenizer
+from .byte_level_bpe import ByteLevelBPETokenizer
+from .char_level_bpe import CharBPETokenizer
+from .sentencepiece_bpe import SentencePieceBPETokenizer
+from .sentencepiece_unigram import SentencePieceUnigramTokenizer
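The module above only re-exports the ready-made implementations. A short sketch of training one of them from scratch (the corpus file name and hyperparameters are placeholders):

    from tokenizers import ByteLevelBPETokenizer

    bpe = ByteLevelBPETokenizer()
    bpe.train(files=["corpus.txt"], vocab_size=30000, min_frequency=2)  # placeholder corpus

    enc = bpe.encode("Training a byte-level BPE from scratch.")
    print(enc.tokens)

    bpe.save("byte-level-bpe.tokenizer.json", pretty=True)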
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py
new file mode 100644
index 00000000..4528dceb
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py
@@ -0,0 +1,418 @@
+from typing import Dict, List, Optional, Tuple, Union
+
+from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer
+from tokenizers.decoders import Decoder
+from tokenizers.models import Model
+from tokenizers.normalizers import Normalizer
+from tokenizers.pre_tokenizers import PreTokenizer
+from tokenizers.processors import PostProcessor
+
+
+Offsets = Tuple[int, int]
+
+
+class BaseTokenizer:
+    def __init__(self, tokenizer: Tokenizer, parameters=None):
+        self._tokenizer = tokenizer
+        self._parameters = parameters if parameters is not None else {}
+
+    def __repr__(self):
+        return "Tokenizer(vocabulary_size={}, {})".format(
+            self._tokenizer.get_vocab_size(),
+            ", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
+        )
+
+    def num_special_tokens_to_add(self, is_pair: bool) -> int:
+        """
+        Return the number of special tokens that would be added for single/pair sentences.
+        :param is_pair: Boolean indicating whether the input is a pair of sequences or a single one
+        :return: The number of special tokens that would be added
+        """
+        return self._tokenizer.num_special_tokens_to_add(is_pair)
+
+    def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
+        """Returns the vocabulary
+
+        Args:
+            with_added_tokens: boolean:
+                Whether to include the added tokens in the vocabulary
+
+        Returns:
+            The vocabulary
+        """
+        return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
+
+    def get_added_tokens_decoder(self) -> Dict[int, AddedToken]:
+        """Returns the added reverse vocabulary
+
+        Returns:
+            The added vocabulary mapping ints to AddedTokens
+        """
+        return self._tokenizer.get_added_tokens_decoder()
+
+    def get_vocab_size(self, with_added_tokens: bool = True) -> int:
+        """Return the size of vocabulary, with or without added tokens.
+
+        Args:
+            with_added_tokens: (`optional`) bool:
+                Whether to count in added special tokens or not
+
+        Returns:
+            Size of vocabulary
+        """
+        return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
+
+    def enable_padding(
+        self,
+        direction: Optional[str] = "right",
+        pad_to_multiple_of: Optional[int] = None,
+        pad_id: Optional[int] = 0,
+        pad_type_id: Optional[int] = 0,
+        pad_token: Optional[str] = "[PAD]",
+        length: Optional[int] = None,
+    ):
+        """Change the padding strategy
+
+        Args:
+            direction: (`optional`) str:
+                Can be one of: `right` or `left`
+
+            pad_to_multiple_of: (`optional`) unsigned int:
+                If specified, the padding length should always snap to the next multiple of
+                the given value. For example, if we were going to pad to a length of 250 but
+                `pad_to_multiple_of=8`, then we would pad to 256.
+
+            pad_id: (`optional`) unsigned int:
+                The index to be used when padding
+
+            pad_type_id: (`optional`) unsigned int:
+                The type index to be used when padding
+
+            pad_token: (`optional`) str:
+                The pad token to be used when padding
+
+            length: (`optional`) unsigned int:
+                If specified, the length at which to pad. If not specified,
+                we pad using the size of the longest sequence in the batch
+        """
+        return self._tokenizer.enable_padding(
+            direction=direction,
+            pad_to_multiple_of=pad_to_multiple_of,
+            pad_id=pad_id,
+            pad_type_id=pad_type_id,
+            pad_token=pad_token,
+            length=length,
+        )
+
+    def no_padding(self):
+        """Disable padding"""
+        return self._tokenizer.no_padding()
+
+    @property
+    def padding(self) -> Optional[dict]:
+        """Get the current padding parameters
+
+        Returns:
+            None if padding is disabled, a dict with the currently set parameters
+            if the padding is enabled.
+        """
+        return self._tokenizer.padding
+
+    def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
+        """Change the truncation options
+
+        Args:
+            max_length: unsigned int:
+                The maximum length at which to truncate
+
+            stride: (`optional`) unsigned int:
+                The number of tokens to carry over from the previous sequence
+                into each overflowing sequence (the overlap between chunks)
+
+            strategy: (`optional`) str:
+                Can be one of `longest_first`, `only_first` or `only_second`
+        """
+        return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
+
+    def no_truncation(self):
+        """Disable truncation"""
+        return self._tokenizer.no_truncation()
+
+    @property
+    def truncation(self) -> Optional[dict]:
+        """Get the current truncation parameters
+
+        Returns:
+            None if truncation is disabled, a dict with the current truncation parameters if
+            truncation is enabled
+        """
+        return self._tokenizer.truncation
+
+    def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
+        """Add the given tokens to the vocabulary
+
+        Args:
+            tokens: List[Union[str, AddedToken]]:
+                A list of tokens to add to the vocabulary. Each token can either be
+                a string, or an instance of AddedToken
+
+        Returns:
+            The number of tokens that were added to the vocabulary
+        """
+        return self._tokenizer.add_tokens(tokens)
+
+    def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
+        """Add the given special tokens to the vocabulary, and treat them as special tokens.
+
+        The special tokens will never be processed by the model, and will be
+        removed while decoding.
+
+        Args:
+            special_tokens: List[Union[str, AddedToken]]:
+                A list of special tokens to add to the vocabulary. Each token can either be
+                a string, or an instance of AddedToken
+
+        Returns:
+            The number of tokens that were added to the vocabulary
+        """
+        return self._tokenizer.add_special_tokens(special_tokens)
+
+    def normalize(self, sequence: str) -> str:
+        """Normalize the given sequence
+
+        Args:
+            sequence: str:
+                The sequence to normalize
+
+        Returns:
+            The normalized string
+        """
+        return self._tokenizer.normalize(sequence)
+
+    def encode(
+        self,
+        sequence: InputSequence,
+        pair: Optional[InputSequence] = None,
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> Encoding:
+        """Encode the given sequence and pair. This method can process raw text sequences as well
+        as already pre-tokenized sequences.
+
+        Args:
+            sequence: InputSequence:
+                The sequence we want to encode. This sequence can be either raw text or
+                pre-tokenized, according to the `is_pretokenized` argument:
+
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                    `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.
+
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding.
+
+        Returns:
+            An Encoding
+        """
+        if sequence is None:
+            raise ValueError("encode: `sequence` can't be `None`")
+
+        return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
+
+    def encode_batch(
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> List[Encoding]:
+        """Encode the given inputs. This method accepts raw text sequences as well as already
+        pre-tokenized sequences.
+
+        Args:
+            inputs: List[EncodeInput]:
+                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
+                expected to be of the following form:
+                    `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+
+                Each `InputSequence` can either be raw text or pre-tokenized,
+                according to the `is_pretokenized` argument:
+
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                    `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.
+
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding.
+
+        Returns:
+            A list of Encoding
+        """
+
+        if inputs is None:
+            raise ValueError("encode_batch: `inputs` can't be `None`")
+
+        return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
+
+    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
+        """Decode the given list of ids to a string sequence
+
+        Args:
+            ids: List[unsigned int]:
+                A list of ids to be decoded
+
+            skip_special_tokens: (`optional`) boolean:
+                Whether to remove all the special tokens from the output string
+
+        Returns:
+            The decoded string
+        """
+        if ids is None:
+            raise ValueError("None input is not valid. Should be a list of integers.")
+
+        return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
+
+    def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> List[str]:
+        """Decode the list of sequences to a list of string sequences
+
+        Args:
+            sequences: List[List[unsigned int]]:
+                A list of sequence of ids to be decoded
+
+            skip_special_tokens: (`optional`) boolean:
+                Whether to remove all the special tokens from the output strings
+
+        Returns:
+            A list of decoded strings
+        """
+        if sequences is None:
+            raise ValueError("None input is not valid. Should be list of list of integers.")
+
+        return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
+
+    def token_to_id(self, token: str) -> Optional[int]:
+        """Convert the given token to its corresponding id
+
+        Args:
+            token: str:
+                The token to convert
+
+        Returns:
+            The corresponding id if it exists, None otherwise
+        """
+        return self._tokenizer.token_to_id(token)
+
+    def id_to_token(self, id: int) -> Optional[str]:
+        """Convert the given token id to its corresponding string
+
+        Args:
+            id: int:
+                The token id to convert
+
+        Returns:
+            The corresponding string if it exists, None otherwise
+        """
+        return self._tokenizer.id_to_token(id)
+
+    def save_model(self, directory: str, prefix: Optional[str] = None):
+        """Save the current model to the given directory
+
+        Args:
+            directory: str:
+                A path to the destination directory
+
+            prefix: (Optional) str:
+                An optional prefix, used to prefix each file name
+        """
+        return self._tokenizer.model.save(directory, prefix=prefix)
+
+    def save(self, path: str, pretty: bool = True):
+        """Save the current Tokenizer at the given path
+
+        Args:
+            path: str:
+                A path to the destination Tokenizer file
+
+            pretty: bool:
+                Whether the saved JSON file should be prettified
+        """
+        return self._tokenizer.save(path, pretty)
+
+    def to_str(self, pretty: bool = False):
+        """Get a serialized JSON version of the Tokenizer as a str
+
+        Args:
+            pretty: bool:
+                Whether the JSON string should be prettified
+
+        Returns:
+            str
+        """
+        return self._tokenizer.to_str(pretty)
+
+    def post_process(
+        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
+    ) -> Encoding:
+        """Apply all the post-processing steps to the given encodings.
+
+        The various steps are:
+            1. Truncate according to global params (provided to `enable_truncation`)
+            2. Apply the PostProcessor
+            3. Pad according to global params. (provided to `enable_padding`)
+
+        Args:
+            encoding: Encoding:
+                The main Encoding to post process
+
+            pair: Optional[Encoding]:
+                An optional pair Encoding
+
+            add_special_tokens: bool:
+                Whether to add special tokens
+
+        Returns:
+            The resulting Encoding
+        """
+        return self._tokenizer.post_process(encoding, pair, add_special_tokens)
+
+    @property
+    def model(self) -> Model:
+        return self._tokenizer.model
+
+    @model.setter
+    def model(self, model: Model):
+        self._tokenizer.model = model
+
+    @property
+    def normalizer(self) -> Normalizer:
+        return self._tokenizer.normalizer
+
+    @normalizer.setter
+    def normalizer(self, normalizer: Normalizer):
+        self._tokenizer.normalizer = normalizer
+
+    @property
+    def pre_tokenizer(self) -> PreTokenizer:
+        return self._tokenizer.pre_tokenizer
+
+    @pre_tokenizer.setter
+    def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
+        self._tokenizer.pre_tokenizer = pre_tokenizer
+
+    @property
+    def post_processor(self) -> PostProcessor:
+        return self._tokenizer.post_processor
+
+    @post_processor.setter
+    def post_processor(self, post_processor: PostProcessor):
+        self._tokenizer.post_processor = post_processor
+
+    @property
+    def decoder(self) -> Decoder:
+        return self._tokenizer.decoder
+
+    @decoder.setter
+    def decoder(self, decoder: Decoder):
+        self._tokenizer.decoder = decoder
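+
+# A minimal usage sketch (illustrative only): wrap a bare `Tokenizer` in
+# `BaseTokenizer` and exercise padding, truncation and batch encode/decode.
+# The tiny word-level vocabulary below is made up for the example.
+if __name__ == "__main__":
+    from tokenizers.models import WordLevel
+    from tokenizers.pre_tokenizers import Whitespace
+
+    vocab = {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3}
+    wrapped = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
+    wrapped.pre_tokenizer = Whitespace()
+
+    base = BaseTokenizer(wrapped, {"model": "WordLevel"})
+    base.enable_padding(pad_id=1, pad_token="[PAD]")
+    base.enable_truncation(max_length=4)
+
+    encodings = base.encode_batch(["hello world", "hello"])
+    print([e.ids for e in encodings])  # both sequences padded to the same length
+    print(base.decode_batch([e.ids for e in encodings]))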
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py
new file mode 100644
index 00000000..1f34e3ca
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py
@@ -0,0 +1,151 @@
+from typing import Dict, Iterator, List, Optional, Union
+
+from tokenizers import AddedToken, Tokenizer, decoders, trainers
+from tokenizers.models import WordPiece
+from tokenizers.normalizers import BertNormalizer
+from tokenizers.pre_tokenizers import BertPreTokenizer
+from tokenizers.processors import BertProcessing
+
+from .base_tokenizer import BaseTokenizer
+
+
+class BertWordPieceTokenizer(BaseTokenizer):
+    """Bert WordPiece Tokenizer"""
+
+    def __init__(
+        self,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
+        unk_token: Union[str, AddedToken] = "[UNK]",
+        sep_token: Union[str, AddedToken] = "[SEP]",
+        cls_token: Union[str, AddedToken] = "[CLS]",
+        pad_token: Union[str, AddedToken] = "[PAD]",
+        mask_token: Union[str, AddedToken] = "[MASK]",
+        clean_text: bool = True,
+        handle_chinese_chars: bool = True,
+        strip_accents: Optional[bool] = None,
+        lowercase: bool = True,
+        wordpieces_prefix: str = "##",
+    ):
+        if vocab is not None:
+            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
+        else:
+            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
+
+        # Let the tokenizer know about special tokens if they are part of the vocab
+        if tokenizer.token_to_id(str(unk_token)) is not None:
+            tokenizer.add_special_tokens([str(unk_token)])
+        if tokenizer.token_to_id(str(sep_token)) is not None:
+            tokenizer.add_special_tokens([str(sep_token)])
+        if tokenizer.token_to_id(str(cls_token)) is not None:
+            tokenizer.add_special_tokens([str(cls_token)])
+        if tokenizer.token_to_id(str(pad_token)) is not None:
+            tokenizer.add_special_tokens([str(pad_token)])
+        if tokenizer.token_to_id(str(mask_token)) is not None:
+            tokenizer.add_special_tokens([str(mask_token)])
+
+        tokenizer.normalizer = BertNormalizer(
+            clean_text=clean_text,
+            handle_chinese_chars=handle_chinese_chars,
+            strip_accents=strip_accents,
+            lowercase=lowercase,
+        )
+        tokenizer.pre_tokenizer = BertPreTokenizer()
+
+        if vocab is not None:
+            sep_token_id = tokenizer.token_to_id(str(sep_token))
+            if sep_token_id is None:
+                raise TypeError("sep_token not found in the vocabulary")
+            cls_token_id = tokenizer.token_to_id(str(cls_token))
+            if cls_token_id is None:
+                raise TypeError("cls_token not found in the vocabulary")
+
+            tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
+        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
+
+        parameters = {
+            "model": "BertWordPiece",
+            "unk_token": unk_token,
+            "sep_token": sep_token,
+            "cls_token": cls_token,
+            "pad_token": pad_token,
+            "mask_token": mask_token,
+            "clean_text": clean_text,
+            "handle_chinese_chars": handle_chinese_chars,
+            "strip_accents": strip_accents,
+            "lowercase": lowercase,
+            "wordpieces_prefix": wordpieces_prefix,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    @staticmethod
+    def from_file(vocab: str, **kwargs):
+        vocab = WordPiece.read_file(vocab)
+        return BertWordPieceTokenizer(vocab, **kwargs)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        special_tokens: List[Union[str, AddedToken]] = [
+            "[PAD]",
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "[MASK]",
+        ],
+        show_progress: bool = True,
+        wordpieces_prefix: str = "##",
+    ):
+        """Train the model using the given files"""
+
+        trainer = trainers.WordPieceTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            continuing_subword_prefix=wordpieces_prefix,
+        )
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        special_tokens: List[Union[str, AddedToken]] = [
+            "[PAD]",
+            "[UNK]",
+            "[CLS]",
+            "[SEP]",
+            "[MASK]",
+        ],
+        show_progress: bool = True,
+        wordpieces_prefix: str = "##",
+        length: Optional[int] = None,
+    ):
+        """Train the model using the given iterator"""
+
+        trainer = trainers.WordPieceTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            continuing_subword_prefix=wordpieces_prefix,
+        )
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
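+
+# A minimal usage sketch (run with `python -m tokenizers.implementations.bert_wordpiece`;
+# the corpus path "corpus.txt" is illustrative only): train a WordPiece vocabulary
+# from scratch, encode a sentence, and save the resulting vocab.
+if __name__ == "__main__":
+    tokenizer = BertWordPieceTokenizer(lowercase=True)
+    tokenizer.train(files="corpus.txt", vocab_size=30000, min_frequency=2)
+    encoding = tokenizer.encode("How are you?")
+    print(encoding.tokens, encoding.ids)
+    tokenizer.save_model(".")  # writes the trained vocab.txt into the current directory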
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py
new file mode 100644
index 00000000..c7e3dbc4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py
@@ -0,0 +1,122 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
+from tokenizers.models import BPE
+from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
+
+from .base_tokenizer import BaseTokenizer
+
+
+class ByteLevelBPETokenizer(BaseTokenizer):
+    """ByteLevelBPETokenizer
+
+    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
+    """
+
+    def __init__(
+        self,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
+        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        add_prefix_space: bool = False,
+        lowercase: bool = False,
+        dropout: Optional[float] = None,
+        unicode_normalizer: Optional[str] = None,
+        continuing_subword_prefix: Optional[str] = None,
+        end_of_word_suffix: Optional[str] = None,
+        trim_offsets: bool = False,
+    ):
+        if vocab is not None and merges is not None:
+            tokenizer = Tokenizer(
+                BPE(
+                    vocab,
+                    merges,
+                    dropout=dropout,
+                    continuing_subword_prefix=continuing_subword_prefix or "",
+                    end_of_word_suffix=end_of_word_suffix or "",
+                )
+            )
+        else:
+            tokenizer = Tokenizer(BPE())
+
+        # Check for Unicode normalization first (before everything else)
+        normalizers = []
+
+        if unicode_normalizer:
+            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
+
+        if lowercase:
+            normalizers += [Lowercase()]
+
+        # Create the normalizer structure
+        if len(normalizers) > 0:
+            if len(normalizers) > 1:
+                tokenizer.normalizer = Sequence(normalizers)
+            else:
+                tokenizer.normalizer = normalizers[0]
+
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
+
+        parameters = {
+            "model": "ByteLevelBPE",
+            "add_prefix_space": add_prefix_space,
+            "lowercase": lowercase,
+            "dropout": dropout,
+            "unicode_normalizer": unicode_normalizer,
+            "continuing_subword_prefix": continuing_subword_prefix,
+            "end_of_word_suffix": end_of_word_suffix,
+            "trim_offsets": trim_offsets,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    @staticmethod
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+        return ByteLevelBPETokenizer(vocab, merges, **kwargs)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+    ):
+        """Train the model using the given files"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+        )
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+        length: Optional[int] = None,
+    ):
+        """Train the model using the given iterator"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+        )
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
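+
+# A minimal usage sketch (run with `python -m tokenizers.implementations.byte_level_bpe`;
+# the in-memory corpus is made up for illustration): train a byte-level BPE from an
+# iterator and round-trip a sentence.
+if __name__ == "__main__":
+    corpus = ["Byte-level BPE needs no unknown token.", "It works on raw bytes."]
+    tokenizer = ByteLevelBPETokenizer()
+    tokenizer.train_from_iterator(corpus, vocab_size=1000, min_frequency=1, length=len(corpus))
+    encoding = tokenizer.encode(corpus[0])
+    print(encoding.tokens)
+    print(tokenizer.decode(encoding.ids))  # should reproduce the original text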
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py
new file mode 100644
index 00000000..29ca5977
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py
@@ -0,0 +1,150 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
+from ..models import BPE
+from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
+from .base_tokenizer import BaseTokenizer
+
+
+class CharBPETokenizer(BaseTokenizer):
+    """Original BPE Tokenizer
+
+    Represents the BPE algorithm, as introduced by Rico Sennrich
+    (https://arxiv.org/abs/1508.07909)
+
+    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
+    Sennrich subword-nmt implementation by the following options, which you can deactivate:
+        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
+            * removing any control characters and replacing all whitespace with a classic space.
+            * handling Chinese characters by putting spaces around them.
+            * stripping all accents.
+        - splitting on punctuation in addition to whitespace (deactivate it with
+          `split_on_whitespace_only=True`)
+    """
+
+    def __init__(
+        self,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
+        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        unk_token: Union[str, AddedToken] = "<unk>",
+        suffix: str = "</w>",
+        dropout: Optional[float] = None,
+        lowercase: bool = False,
+        unicode_normalizer: Optional[str] = None,
+        bert_normalizer: bool = True,
+        split_on_whitespace_only: bool = False,
+    ):
+        if vocab is not None and merges is not None:
+            tokenizer = Tokenizer(
+                BPE(
+                    vocab,
+                    merges,
+                    dropout=dropout,
+                    unk_token=str(unk_token),
+                    end_of_word_suffix=suffix,
+                )
+            )
+        else:
+            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))
+
+        if tokenizer.token_to_id(str(unk_token)) is not None:
+            tokenizer.add_special_tokens([str(unk_token)])
+
+        # Check for Unicode normalization first (before everything else)
+        normalizers = []
+
+        if unicode_normalizer:
+            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
+
+        if bert_normalizer:
+            normalizers += [BertNormalizer(lowercase=False)]
+
+        if lowercase:
+            normalizers += [Lowercase()]
+
+        # Create the normalizer structure
+        if len(normalizers) > 0:
+            if len(normalizers) > 1:
+                tokenizer.normalizer = Sequence(normalizers)
+            else:
+                tokenizer.normalizer = normalizers[0]
+
+        if split_on_whitespace_only:
+            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
+        else:
+            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+
+        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
+
+        parameters = {
+            "model": "BPE",
+            "unk_token": unk_token,
+            "suffix": suffix,
+            "dropout": dropout,
+            "lowercase": lowercase,
+            "unicode_normalizer": unicode_normalizer,
+            "bert_normalizer": bert_normalizer,
+            "split_on_whitespace_only": split_on_whitespace_only,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    @staticmethod
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+        return CharBPETokenizer(vocab, merges, **kwargs)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        suffix: Optional[str] = "</w>",
+        show_progress: bool = True,
+    ):
+        """Train the model using the given files"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            end_of_word_suffix=suffix,
+            show_progress=show_progress,
+        )
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        suffix: Optional[str] = "</w>",
+        show_progress: bool = True,
+        length: Optional[int] = None,
+    ):
+        """Train the model using the given iterator"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            end_of_word_suffix=suffix,
+            show_progress=show_progress,
+        )
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
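+
+# A minimal usage sketch (run with `python -m tokenizers.implementations.char_level_bpe`;
+# the corpus path "corpus.txt" is illustrative only): the classic character-level BPE
+# with an explicit end-of-word suffix.
+if __name__ == "__main__":
+    tokenizer = CharBPETokenizer(lowercase=True)
+    tokenizer.train(files="corpus.txt", vocab_size=10000, special_tokens=["<unk>"])
+    encoding = tokenizer.encode("Subword units end with a suffix.")
+    print(encoding.tokens)  # word-final pieces carry the "</w>" suffix
+    print(tokenizer.decode(encoding.ids))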
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py
new file mode 100644
index 00000000..cd550b41
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py
@@ -0,0 +1,103 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC
+
+from .base_tokenizer import BaseTokenizer
+
+
+class SentencePieceBPETokenizer(BaseTokenizer):
+    """SentencePiece BPE Tokenizer
+
+    Represents the BPE algorithm, with the pretokenization used by SentencePiece
+    """
+
+    def __init__(
+        self,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
+        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        unk_token: Union[str, AddedToken] = "<unk>",
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
+        dropout: Optional[float] = None,
+        fuse_unk: Optional[bool] = False,
+    ):
+        if vocab is not None and merges is not None:
+            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
+        else:
+            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
+
+        if tokenizer.token_to_id(str(unk_token)) is not None:
+            tokenizer.add_special_tokens([str(unk_token)])
+
+        tokenizer.normalizer = NFKC()
+        prepend_scheme = "always" if add_prefix_space else "never"
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+        parameters = {
+            "model": "SentencePieceBPE",
+            "unk_token": unk_token,
+            "replacement": replacement,
+            "add_prefix_space": add_prefix_space,
+            "dropout": dropout,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    @staticmethod
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+        return SentencePieceBPETokenizer(vocab, merges, **kwargs)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        show_progress: bool = True,
+    ):
+        """Train the model using the given files"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress,
+        )
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        show_progress: bool = True,
+        length: Optional[int] = None,
+    ):
+        """Train the model using the given iterator"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress,
+        )
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
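+
+# A minimal usage sketch (run with `python -m tokenizers.implementations.sentencepiece_bpe`;
+# the corpus path "corpus.txt" is illustrative only): BPE with SentencePiece-style
+# Metaspace pre-tokenization.
+if __name__ == "__main__":
+    tokenizer = SentencePieceBPETokenizer()
+    tokenizer.train(files="corpus.txt", vocab_size=5000, special_tokens=["<unk>"])
+    encoding = tokenizer.encode("SentencePiece style BPE")
+    print(encoding.tokens)  # pieces at word starts carry the "▁" marker
+    print(tokenizer.decode(encoding.ids))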
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py
new file mode 100644
index 00000000..1237e85e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py
@@ -0,0 +1,196 @@
+import json
+import os
+from typing import Iterator, List, Optional, Union, Tuple
+
+from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
+from tokenizers.models import Unigram
+
+from .base_tokenizer import BaseTokenizer
+
+
+class SentencePieceUnigramTokenizer(BaseTokenizer):
+    """SentencePiece Unigram Tokenizer
+
+    Represents the Unigram algorithm, with the pretokenization used by SentencePiece
+    """
+
+    def __init__(
+        self,
+        vocab: Optional[List[Tuple[str, float]]] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
+    ):
+        if vocab is not None:
+            # Let Unigram(..) fail if only one of them is None
+            tokenizer = Tokenizer(Unigram(vocab))
+        else:
+            tokenizer = Tokenizer(Unigram())
+
+        tokenizer.normalizer = normalizers.Sequence(
+            [normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
+        )
+        prepend_scheme = "always" if add_prefix_space else "never"
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+        parameters = {
+            "model": "SentencePieceUnigram",
+            "replacement": replacement,
+            "add_prefix_space": add_prefix_space,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
+        initial_alphabet: Optional[List[str]] = None,
+        unk_token: Optional[str] = None,
+    ):
+        """
+        Train the model using the given files
+
+        Args:
+            files (:obj:`List[str]`):
+                A list of path to the files that we should use for training
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+                A list of special tokens the model should know of.
+            initial_alphabet (:obj:`List[str]`, `optional`):
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """
+
+        if special_tokens is None:
+            special_tokens = []
+
+        if initial_alphabet is None:
+            initial_alphabet = []
+
+        trainer = trainers.UnigramTrainer(
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            initial_alphabet=initial_alphabet,
+            unk_token=unk_token,
+        )
+
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+        special_tokens: Optional[List[Union[str, AddedToken]]] = None,
+        initial_alphabet: Optional[List[str]] = None,
+        unk_token: Optional[str] = None,
+        length: Optional[int] = None,
+    ):
+        """
+        Train the model using the given iterator
+
+        Args:
+            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
+                Any iterator over strings or list of strings
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+                A list of special tokens the model should know of.
+            initial_alphabet (:obj:`List[str]`, `optional`):
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+            length (:obj:`int`, `optional`):
+                The total number of sequences in the iterator. This is used to
+                provide meaningful progress tracking
+        """
+
+        if special_tokens is None:
+            special_tokens = []
+
+        if initial_alphabet is None:
+            initial_alphabet = []
+
+        trainer = trainers.UnigramTrainer(
+            vocab_size=vocab_size,
+            special_tokens=special_tokens,
+            show_progress=show_progress,
+            initial_alphabet=initial_alphabet,
+            unk_token=unk_token,
+        )
+
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
+
+    @staticmethod
+    def from_spm(filename: str):
+        try:
+            import sys
+
+            sys.path.append(".")
+
+            import sentencepiece_model_pb2 as model
+        except Exception:
+            raise Exception(
+                "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
+            )
+
+        m = model.ModelProto()
+        m.ParseFromString(open(filename, "rb").read())
+
+        precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
+        vocab = [(piece.piece, piece.score) for piece in m.pieces]
+        unk_id = m.trainer_spec.unk_id
+        model_type = m.trainer_spec.model_type
+        byte_fallback = m.trainer_spec.byte_fallback
+        if model_type != 1:
+            raise Exception(
+                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
+            )
+
+        replacement = "▁"
+        add_prefix_space = True
+
+        tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
+
+        if precompiled_charsmap:
+            tokenizer.normalizer = normalizers.Sequence(
+                [
+                    normalizers.Precompiled(precompiled_charsmap),
+                    normalizers.Replace(Regex(" {2,}"), " "),
+                ]
+            )
+        else:
+            tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
+        prepend_scheme = "always" if add_prefix_space else "never"
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+        parameters = {
+            "model": "SentencePieceUnigram",
+        }
+
+        obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
+        BaseTokenizer.__init__(obj, tokenizer, parameters)
+        return obj
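+
+# A minimal usage sketch (run with `python -m tokenizers.implementations.sentencepiece_unigram`;
+# the corpus path "corpus.txt" is illustrative only): train a Unigram model, listing the
+# unknown token among the special tokens so that it receives an id.
+if __name__ == "__main__":
+    tokenizer = SentencePieceUnigramTokenizer()
+    tokenizer.train(
+        files="corpus.txt",
+        vocab_size=8000,
+        special_tokens=["<unk>"],
+        unk_token="<unk>",
+    )
+    encoding = tokenizer.encode("Unigram keeps the most likely segmentation")
+    print(encoding.tokens)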
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py
new file mode 100644
index 00000000..68ac211a
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py
@@ -0,0 +1,8 @@
+# Generated content DO NOT EDIT
+from .. import models
+
+Model = models.Model
+BPE = models.BPE
+Unigram = models.Unigram
+WordLevel = models.WordLevel
+WordPiece = models.WordPiece
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi
new file mode 100644
index 00000000..955b9a16
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi
@@ -0,0 +1,591 @@
+# Generated content DO NOT EDIT
+class Model:
+    """
+    Base class for all models
+
+    The model represents the actual tokenization algorithm. This is the part that
+    will contain and manage the learned vocabulary.
+
+    This class cannot be constructed directly. Please use one of the concrete models.
+    """
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
+
+    def id_to_token(self, id):
+        """
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
+        """
+        pass
+
+    def save(self, folder, prefix):
+        """
+        Save the current model
+
+        Save the current model in the given folder, using the given prefix for the various
+        files that will get created.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
+        """
+        pass
+
+    def token_to_id(self, token):
+        """
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
+        """
+        pass
+
+    def tokenize(self, sequence):
+        """
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+        """
+        pass
+
+class BPE(Model):
+    """
+    An implementation of the BPE (Byte-Pair Encoding) algorithm
+
+    Args:
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`
+
+        merges (:obj:`List[Tuple[str, str]]`, `optional`):
+            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
+
+        cache_capacity (:obj:`int`, `optional`):
+            The number of words that the BPE cache can contain. The cache speeds up
+            the process by keeping the result of the merge operations for a number
+            of words.
+
+        dropout (:obj:`float`, `optional`):
+            A float between 0 and 1 that represents the BPE dropout to use.
+
+        unk_token (:obj:`str`, `optional`):
+            The unknown token to be used by the model.
+
+        continuing_subword_prefix (:obj:`str`, `optional`):
+            The prefix to attach to subword units that don't represent a beginning of word.
+
+        end_of_word_suffix (:obj:`str`, `optional`):
+            The suffix to attach to subword units that represent an end of word.
+
+        fuse_unk (:obj:`bool`, `optional`):
+            Whether to fuse any subsequent unknown tokens into a single one
+
+        byte_fallback (:obj:`bool`, `optional`):
+            Whether to use spm byte-fallback trick (defaults to False)
+
+        ignore_merges (:obj:`bool`, `optional`):
+            Whether or not to match tokens with the vocab before using merges.
+    """
+    def __init__(
+        self,
+        vocab=None,
+        merges=None,
+        cache_capacity=None,
+        dropout=None,
+        unk_token=None,
+        continuing_subword_prefix=None,
+        end_of_word_suffix=None,
+        fuse_unk=None,
+        byte_fallback=False,
+        ignore_merges=False,
+    ):
+        pass
+
+    @staticmethod
+    def from_file(vocab, merges, **kwargs):
+        """
+        Instantiate a BPE model from the given files.
+
+        This method is roughly equivalent to doing::
+
+           vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+           bpe = BPE(vocab, merges)
+
+        If you don't need to keep the :obj:`vocab, merges` values lying around,
+        this method is more optimized than manually calling
+        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+            merges (:obj:`str`):
+                The path to a :obj:`merges.txt` file
+
+        Returns:
+            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
+        """
+        pass
+
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
+
+    def id_to_token(self, id):
+        """
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
+        """
+        pass
+
+    @staticmethod
+    def read_file(vocab, merges):
+        """
+        Read a :obj:`vocab.json` and a :obj:`merges.txt` files
+
+        This method provides a way to read and parse the content of these files,
+        returning the relevant data structures. If you want to instantiate some BPE models
+        from memory, this method gives you the expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+            merges (:obj:`str`):
+                The path to a :obj:`merges.txt` file
+
+        Returns:
+            A :obj:`Tuple` with the vocab and the merges:
+                The vocabulary and merges loaded into memory
+        """
+        pass
+
+    def save(self, folder, prefix):
+        """
+        Save the current model
+
+        Save the current model in the given folder, using the given prefix for the various
+        files that will get created.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
+        """
+        pass
+
+    def token_to_id(self, token):
+        """
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
+        """
+        pass
+
+    def tokenize(self, sequence):
+        """
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+        """
+        pass
+
+class Unigram(Model):
+    """
+    An implementation of the Unigram algorithm
+
+    Args:
+        vocab (:obj:`List[Tuple[str, float]]`, `optional`):
+            A list of vocabulary items and their relative score [("am", -0.2442),...]
+    """
+    def __init__(self, vocab, unk_id, byte_fallback):
+        pass
+
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
+
+    def id_to_token(self, id):
+        """
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
+        """
+        pass
+
+    def save(self, folder, prefix):
+        """
+        Save the current model
+
+        Save the current model in the given folder, using the given prefix for the various
+        files that will get created.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
+        """
+        pass
+
+    def token_to_id(self, token):
+        """
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
+        """
+        pass
+
+    def tokenize(self, sequence):
+        """
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+        """
+        pass
+
+class WordLevel(Model):
+    """
+    An implementation of the WordLevel algorithm
+
+    The simplest tokenizer model, based on mapping tokens to their corresponding id.
+
+    Args:
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`
+
+        unk_token (:obj:`str`, `optional`):
+            The unknown token to be used by the model.
+    """
+    def __init__(self, vocab, unk_token):
+        pass
+
+    @staticmethod
+    def from_file(vocab, unk_token):
+        """
+        Instantiate a WordLevel model from the given file
+
+        This method is roughly equivalent to doing::
+
+            vocab = WordLevel.read_file(vocab_filename)
+            wordlevel = WordLevel(vocab)
+
+        If you don't need to keep the :obj:`vocab` values lying around, this method is
+        more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
+        initialize a :class:`~tokenizers.models.WordLevel`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+        Returns:
+            :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
+        """
+        pass
+
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
+
+    def id_to_token(self, id):
+        """
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
+        """
+        pass
+
+    @staticmethod
+    def read_file(vocab):
+        """
+        Read a :obj:`vocab.json`
+
+        This method provides a way to read and parse the content of a vocabulary file,
+        returning the relevant data structures. If you want to instantiate some WordLevel models
+        from memory, this method gives you the expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.json` file
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
+        """
+        pass
+
+    def save(self, folder, prefix):
+        """
+        Save the current model
+
+        Save the current model in the given folder, using the given prefix for the various
+        files that will get created.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
+        """
+        pass
+
+    def token_to_id(self, token):
+        """
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
+        """
+        pass
+
+    def tokenize(self, sequence):
+        """
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+        """
+        pass
+
+class WordPiece(Model):
+    """
+    An implementation of the WordPiece algorithm
+
+    Args:
+        vocab (:obj:`Dict[str, int]`, `optional`):
+            A dictionary of string keys and their ids :obj:`{"am": 0,...}`
+
+        unk_token (:obj:`str`, `optional`):
+            The unknown token to be used by the model.
+
+        max_input_chars_per_word (:obj:`int`, `optional`):
+            The maximum number of characters to authorize in a single word.
+    """
+    def __init__(self, vocab, unk_token, max_input_chars_per_word):
+        pass
+
+    @staticmethod
+    def from_file(vocab, **kwargs):
+        """
+        Instantiate a WordPiece model from the given file
+
+        This method is roughly equivalent to doing::
+
+            vocab = WordPiece.read_file(vocab_filename)
+            wordpiece = WordPiece(vocab)
+
+        If you don't need to keep the :obj:`vocab` values lying around, this method is
+        more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
+        initialize a :class:`~tokenizers.models.WordPiece`
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.txt` file
+
+        Returns:
+            :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
+        """
+        pass
+
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
+
+    def id_to_token(self, id):
+        """
+        Get the token associated to an ID
+
+        Args:
+            id (:obj:`int`):
+                An ID to convert to a token
+
+        Returns:
+            :obj:`str`: The token associated to the ID
+        """
+        pass
+
+    @staticmethod
+    def read_file(vocab):
+        """
+        Read a :obj:`vocab.txt` file
+
+        This method provides a way to read and parse the content of a standard `vocab.txt`
+        file as used by the WordPiece Model, returning the relevant data structures. If you
+        want to instantiate some WordPiece models from memory, this method gives you the
+        expected input from the standard files.
+
+        Args:
+            vocab (:obj:`str`):
+                The path to a :obj:`vocab.txt` file
+
+        Returns:
+            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
+        """
+        pass
+
+    def save(self, folder, prefix):
+        """
+        Save the current model
+
+        Save the current model in the given folder, using the given prefix for the various
+        files that will get created.
+        Any file with the same name that already exists in this folder will be overwritten.
+
+        Args:
+            folder (:obj:`str`):
+                The path to the target folder in which to save the various files
+
+            prefix (:obj:`str`, `optional`):
+                An optional prefix, used to prefix each file name
+
+        Returns:
+            :obj:`List[str]`: The list of saved files
+        """
+        pass
+
+    def token_to_id(self, token):
+        """
+        Get the ID associated to a token
+
+        Args:
+            token (:obj:`str`):
+                A token to convert to an ID
+
+        Returns:
+            :obj:`int`: The ID associated to the token
+        """
+        pass
+
+    def tokenize(self, sequence):
+        """
+        Tokenize a sequence
+
+        Args:
+            sequence (:obj:`str`):
+                A sequence to tokenize
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+        """
+        pass
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py
new file mode 100644
index 00000000..15a16f1e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py
@@ -0,0 +1,29 @@
+from .. import normalizers
+
+
+Normalizer = normalizers.Normalizer
+BertNormalizer = normalizers.BertNormalizer
+NFD = normalizers.NFD
+NFKD = normalizers.NFKD
+NFC = normalizers.NFC
+NFKC = normalizers.NFKC
+Sequence = normalizers.Sequence
+Lowercase = normalizers.Lowercase
+Prepend = normalizers.Prepend
+Strip = normalizers.Strip
+StripAccents = normalizers.StripAccents
+Nmt = normalizers.Nmt
+Precompiled = normalizers.Precompiled
+Replace = normalizers.Replace
+
+
+NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
+
+
+def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
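+    """Map a unicode normal form name ("nfc", "nfd", "nfkc", "nfkd") to a Normalizer.
+
+    A minimal usage sketch (the input/output shown are illustrative)::
+
+        nfkc = unicode_normalizer_from_str("nfkc")
+        nfkc.normalize_str("ℌello")  # -> "Hello"
+    """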
+    if normalizer not in NORMALIZERS:
+        raise ValueError(
+            "{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
+        )
+
+    return NORMALIZERS[normalizer]()
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi
new file mode 100644
index 00000000..507d4473
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi
@@ -0,0 +1,595 @@
+# Generated content DO NOT EDIT
+class Normalizer:
+    """
+    Base class for all normalizers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    Normalizer will return an instance of this class when instantiated.
+    """
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class BertNormalizer(Normalizer):
+    """
+    BertNormalizer
+
+    Takes care of normalizing raw text before giving it to a Bert model.
+    This includes cleaning the text, handling accents, Chinese characters and lowercasing
+
+    Args:
+        clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to clean the text by removing any control characters
+            and replacing all whitespace characters with the classic one.
+
+        handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to handle Chinese characters by putting spaces around them.
+
+        strip_accents (:obj:`bool`, `optional`):
+            Whether to strip all accents. If this option is not specified (i.e. :obj:`None`),
+            it will be determined by the value of `lowercase` (as in the original Bert).
+
+        lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase.
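+
+    Example (a minimal sketch; the output shown is what these options are expected to
+    produce)::
+
+        normalizer = BertNormalizer(lowercase=True, strip_accents=True)
+        normalizer.normalize_str("Héllo Wörld")  # -> "hello world"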
+    """
+    def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class Lowercase(Normalizer):
+    """
+    Lowercase Normalizer
+    """
+    def __init__(self):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class NFC(Normalizer):
+    """
+    NFC Unicode Normalizer
+    """
+    def __init__(self):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class NFD(Normalizer):
+    """
+    NFD Unicode Normalizer
+    """
+    def __init__(self):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class NFKC(Normalizer):
+    """
+    NFKC Unicode Normalizer
+    """
+    def __init__(self):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class NFKD(Normalizer):
+    """
+    NFKD Unicode Normalizer
+    """
+    def __init__(self):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class Nmt(Normalizer):
+    """
+    Nmt normalizer
+    """
+    def __init__(self):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class Precompiled(Normalizer):
+    """
+    Precompiled normalizer
+    Do not use it manually; it exists for compatibility with SentencePiece.
+    """
+    def __init__(self, precompiled_charsmap):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class Prepend(Normalizer):
+    """
+    Prepend normalizer
+    """
+    def __init__(self, prepend):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class Replace(Normalizer):
+    """
+    Replace normalizer
+    """
+    def __init__(self, pattern, content):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class Sequence(Normalizer):
+    """
+    Allows concatenating multiple other Normalizers as a Sequence.
+    All the normalizers run in sequence, in the given order.
+
+    Args:
+        normalizers (:obj:`List[Normalizer]`):
+            A list of Normalizer to be run as a sequence
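+
+    Example (a minimal sketch)::
+
+        normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
+        normalizer.normalize_str("Héllo")  # -> "hello"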
+    """
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class Strip(Normalizer):
+    """
+    Strip normalizer
+    """
+    def __init__(self, left=True, right=True):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
+
+class StripAccents(Normalizer):
+    """
+    StripAccents normalizer
+    """
+    def __init__(self):
+        pass
+
+    def normalize(self, normalized):
+        """
+        Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.NormalizedString` to
+        keep track of the alignment information. If you just want to see the result
+        of the normalization on a raw string, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+        Args:
+            normalized (:class:`~tokenizers.NormalizedString`):
+                The normalized string on which to apply this
+                :class:`~tokenizers.normalizers.Normalizer`
+        """
+        pass
+
+    def normalize_str(self, sequence):
+        """
+        Normalize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+        information. If you need to get/convert offsets, you can use
+        :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to normalize
+
+        Returns:
+            :obj:`str`: A string after normalization
+        """
+        pass
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py
new file mode 100644
index 00000000..48277f0d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py
@@ -0,0 +1,15 @@
+# Generated content DO NOT EDIT
+from .. import pre_tokenizers
+
+PreTokenizer = pre_tokenizers.PreTokenizer
+BertPreTokenizer = pre_tokenizers.BertPreTokenizer
+ByteLevel = pre_tokenizers.ByteLevel
+CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
+Digits = pre_tokenizers.Digits
+Metaspace = pre_tokenizers.Metaspace
+Punctuation = pre_tokenizers.Punctuation
+Sequence = pre_tokenizers.Sequence
+Split = pre_tokenizers.Split
+UnicodeScripts = pre_tokenizers.UnicodeScripts
+Whitespace = pre_tokenizers.Whitespace
+WhitespaceSplit = pre_tokenizers.WhitespaceSplit
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi
new file mode 100644
index 00000000..d81d3802
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi
@@ -0,0 +1,607 @@
+# Generated content DO NOT EDIT
+class PreTokenizer:
+    """
+    Base class for all pre-tokenizers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    PreTokenizer will return an instance of this class when instantiated.
+    """
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class BertPreTokenizer(PreTokenizer):
+    """
+    BertPreTokenizer
+
+    This pre-tokenizer splits tokens on spaces, and also on punctuation.
+    Each occurrence of a punctuation character will be treated separately.
+    """
+    def __init__(self):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class ByteLevel(PreTokenizer):
+    """
+    ByteLevel PreTokenizer
+
+    This pre-tokenizer takes care of replacing all bytes of the given string
+    with a corresponding representation, as well as splitting into words.
+
+    Args:
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to add a space to the first word if there isn't already one. This
+            lets us treat `hello` exactly like `say hello`.
+        use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Set this to :obj:`False` to prevent this `pre_tokenizer` from using
+            the GPT2 specific regexp for splitting on whitespace.
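+
+    Example (a minimal sketch; the "Ġ" characters come from the byte-level mapping of
+    the space byte)::
+
+        ByteLevel(add_prefix_space=False).pre_tokenize_str("Hello world")
+        # -> [("Hello", (0, 5)), ("Ġworld", (5, 11))]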
+    """
+    def __init__(self, add_prefix_space=True, use_regex=True):
+        pass
+
+    @staticmethod
+    def alphabet():
+        """
+        Returns the alphabet used by this PreTokenizer.
+
+        Since the ByteLevel works as its name suggests, at the byte level, it
+        encodes each byte value to a unique visible character. This means that there is a
+        total of 256 different characters composing this alphabet.
+
+        Returns:
+            :obj:`List[str]`: A list of characters that compose the alphabet
+        """
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class CharDelimiterSplit(PreTokenizer):
+    """
+    This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
+
+    Args:
+        delimiter (:obj:`str`):
+            The delimiter char that will be used to split input
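+
+    Example (a minimal sketch)::
+
+        CharDelimiterSplit("|").pre_tokenize_str("a|b|c")
+        # -> [("a", (0, 1)), ("b", (2, 3)), ("c", (4, 5))]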
+    """
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class Digits(PreTokenizer):
+    """
+    This pre-tokenizer simply splits on digits, putting them in separate tokens
+
+    Args:
+        individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If set to True, digits will each be separated as follows::
+
+                "Call 123 please" -> "Call ", "1", "2", "3", " please"
+
+            If set to False, digits will be grouped as follows::
+
+                "Call 123 please" -> "Call ", "123", " please"
+    """
+    def __init__(self, individual_digits=False):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class Metaspace(PreTokenizer):
+    """
+    Metaspace pre-tokenizer
+
+    This pre-tokenizer replaces any whitespace by the provided replacement character.
+    It then tries to split on these spaces.
+
+    Args:
+        replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
+            The replacement character. Must be exactly one character. By default we
+            use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
+
+        prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
+            Whether to add a space to the first word if there isn't already one. This
+            lets us treat `hello` exactly like `say hello`.
+            Choices: "always", "never", "first". "first" means the space is only added to the
+            first token (relevant when special tokens or other pre-tokenizers are used).
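+
+    Example (a minimal sketch; the offsets shown are the expected ones for this input)::
+
+        Metaspace(replacement="▁").pre_tokenize_str("Hello world")
+        # -> [("▁Hello", (0, 5)), ("▁world", (5, 11))]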
+
+    """
+    def __init__(self, replacement="_", prepend_scheme="always", split=True):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class Punctuation(PreTokenizer):
+    """
+    This pre-tokenizer simply splits on punctuation as individual characters.
+
+    Args:
+        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+            The behavior to use when splitting.
+            Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
+            "contiguous"
+    """
+    def __init__(self, behavior="isolated"):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class Sequence(PreTokenizer):
+    """
+    This pre-tokenizer composes other pre-tokenizers and applies them in sequence.
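+
+    Example (a minimal sketch)::
+
+        pre_tok = Sequence([Whitespace(), Digits(individual_digits=True)])
+        pre_tok.pre_tokenize_str("abc123")
+        # -> [("abc", (0, 3)), ("1", (3, 4)), ("2", (4, 5)), ("3", (5, 6))]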
+    """
+    def __init__(self, pretokenizers):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class Split(PreTokenizer):
+    """
+    Split PreTokenizer
+
+    This versatile pre-tokenizer splits using the provided pattern and
+    according to the provided behavior. The pattern can be inverted by
+    making use of the invert flag.
+
+    Args:
+        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
+            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
+
+        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+            The behavior to use when splitting.
+            Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+            "contiguous"
+
+        invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to invert the pattern.
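+
+    Example (a minimal sketch)::
+
+        Split(" ", "removed").pre_tokenize_str("a b c")
+        # -> [("a", (0, 1)), ("b", (2, 3)), ("c", (4, 5))]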
+    """
+    def __init__(self, pattern, behavior, invert=False):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class UnicodeScripts(PreTokenizer):
+    """
+    This pre-tokenizer splits on characters that belong to different language families.
+    It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
+    In practice, Hiragana and Katakana are fused with Han, and 0x30FC is treated as Han too.
+    This mimics the SentencePiece Unigram implementation.
+    """
+    def __init__(self):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class Whitespace(PreTokenizer):
+    """
+    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
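+
+    Example (a minimal sketch)::
+
+        Whitespace().pre_tokenize_str("Hello, world!")
+        # -> [("Hello", (0, 5)), (",", (5, 6)), ("world", (7, 12)), ("!", (12, 13))]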
+    """
+    def __init__(self):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
+
+class WhitespaceSplit(PreTokenizer):
+    """
+    This pre-tokenizer simply splits on whitespace. Works like `.split()`
+    """
+    def __init__(self):
+        pass
+
+    def pre_tokenize(self, pretok):
+        """
+        Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+        keep track of the pre-tokenization, and leverage the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+        the pre-tokenization of a raw string, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+        Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+                The pre-tokenized string on which to apply this
+                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+        """
+        pass
+
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given string
+
+        This method provides a way to visualize the effect of a
+        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+        alignment, nor does it provide all the capabilities of the
+        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+        Args:
+            sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuple with the pre-tokenized parts and their offsets
+        """
+        pass
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py
new file mode 100644
index 00000000..06d12403
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py
@@ -0,0 +1,9 @@
+# Generated content DO NOT EDIT
+from .. import processors
+
+PostProcessor = processors.PostProcessor
+BertProcessing = processors.BertProcessing
+ByteLevel = processors.ByteLevel
+RobertaProcessing = processors.RobertaProcessing
+Sequence = processors.Sequence
+TemplateProcessing = processors.TemplateProcessing
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi
new file mode 100644
index 00000000..5136d02b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi
@@ -0,0 +1,342 @@
+# Generated content DO NOT EDIT
+class PostProcessor:
+    """
+    Base class for all post-processors
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a PostProcessor will return an instance of this class when instantiated.
+    """
+    def num_special_tokens_to_add(self, is_pair):
+        """
+        Return the number of special tokens that would be added for single/pair sentences.
+
+        Args:
+            is_pair (:obj:`bool`):
+                Whether the input would be a pair of sequences
+
+        Returns:
+            :obj:`int`: The number of tokens to add
+        """
+        pass
+
+    def process(self, encoding, pair=None, add_special_tokens=True):
+        """
+        Post-process the given encodings, generating the final one
+
+        Args:
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding for the first sequence
+
+            pair (:class:`~tokenizers.Encoding`, `optional`):
+                The encoding for the pair sequence
+
+            add_special_tokens (:obj:`bool`):
+                Whether to add the special tokens
+
+        Return:
+            :class:`~tokenizers.Encoding`: The final encoding
+        """
+        pass
+
+class BertProcessing(PostProcessor):
+    """
+    This post-processor takes care of adding the special tokens needed by
+    a Bert model:
+
+        - a SEP token
+        - a CLS token
+
+    Args:
+        sep (:obj:`Tuple[str, int]`):
+            A tuple with the string representation of the SEP token, and its id
+
+        cls (:obj:`Tuple[str, int]`):
+            A tuple with the string representation of the CLS token, and its id
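+
+    Example (a minimal sketch; 101/102 are the usual bert-base-uncased ids and are only
+    an assumption here)::
+
+        processor = BertProcessing(("[SEP]", 102), ("[CLS]", 101))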
+    """
+    def __init__(self, sep, cls):
+        pass
+
+    def num_special_tokens_to_add(self, is_pair):
+        """
+        Return the number of special tokens that would be added for single/pair sentences.
+
+        Args:
+            is_pair (:obj:`bool`):
+                Whether the input would be a pair of sequences
+
+        Returns:
+            :obj:`int`: The number of tokens to add
+        """
+        pass
+
+    def process(self, encoding, pair=None, add_special_tokens=True):
+        """
+        Post-process the given encodings, generating the final one
+
+        Args:
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding for the first sequence
+
+            pair (:class:`~tokenizers.Encoding`, `optional`):
+                The encoding for the pair sequence
+
+            add_special_tokens (:obj:`bool`):
+                Whether to add the special tokens
+
+        Return:
+            :class:`~tokenizers.Encoding`: The final encoding
+        """
+        pass
+
+class ByteLevel(PostProcessor):
+    """
+    This post-processor takes care of trimming the offsets.
+
+    By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
+    want the offsets to include these whitespaces, then this PostProcessor must be used.
+
+    Args:
+        trim_offsets (:obj:`bool`):
+            Whether to trim the whitespaces from the produced offsets.
+    """
+    def __init__(self, trim_offsets=True):
+        pass
+
+    def num_special_tokens_to_add(self, is_pair):
+        """
+        Return the number of special tokens that would be added for single/pair sentences.
+
+        Args:
+            is_pair (:obj:`bool`):
+                Whether the input would be a pair of sequences
+
+        Returns:
+            :obj:`int`: The number of tokens to add
+        """
+        pass
+
+    def process(self, encoding, pair=None, add_special_tokens=True):
+        """
+        Post-process the given encodings, generating the final one
+
+        Args:
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding for the first sequence
+
+            pair (:class:`~tokenizers.Encoding`, `optional`):
+                The encoding for the pair sequence
+
+            add_special_tokens (:obj:`bool`):
+                Whether to add the special tokens
+
+        Return:
+            :class:`~tokenizers.Encoding`: The final encoding
+        """
+        pass
+
+class RobertaProcessing(PostProcessor):
+    """
+    This post-processor takes care of adding the special tokens needed by
+    a Roberta model:
+
+        - a SEP token
+        - a CLS token
+
+    It also takes care of trimming the offsets.
+    By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
+    want the offsets to include these whitespaces, then this PostProcessor should be initialized
+    with :obj:`trim_offsets=True`
+
+    Args:
+        sep (:obj:`Tuple[str, int]`):
+            A tuple with the string representation of the SEP token, and its id
+
+        cls (:obj:`Tuple[str, int]`):
+            A tuple with the string representation of the CLS token, and its id
+
+        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to trim the whitespaces from the produced offsets.
+
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether the add_prefix_space option was enabled during pre-tokenization. This
+            is relevant because it defines the way the offsets are trimmed out.
+    """
+    def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
+        pass
+
+    def num_special_tokens_to_add(self, is_pair):
+        """
+        Return the number of special tokens that would be added for single/pair sentences.
+
+        Args:
+            is_pair (:obj:`bool`):
+                Whether the input would be a pair of sequences
+
+        Returns:
+            :obj:`int`: The number of tokens to add
+        """
+        pass
+
+    def process(self, encoding, pair=None, add_special_tokens=True):
+        """
+        Post-process the given encodings, generating the final one
+
+        Args:
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding for the first sequence
+
+            pair (:class:`~tokenizers.Encoding`, `optional`):
+                The encoding for the pair sequence
+
+            add_special_tokens (:obj:`bool`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The final encoding
+        """
+        pass
+
+class Sequence(PostProcessor):
+    """
+    Sequence Processor
+
+    Args:
+        processors (:obj:`List[PostProcessor]`):
+            The processors that need to be chained
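+
+    Example (an illustrative sketch; it assumes ``tokenizer`` is an existing
+    :class:`~tokenizers.Tokenizer` and that the special token ids match its vocabulary)::
+
+        from tokenizers.processors import ByteLevel, Sequence, TemplateProcessing
+
+        tokenizer.post_processor = Sequence([
+            ByteLevel(trim_offsets=True),
+            TemplateProcessing(
+                single="<s> $A </s>",
+                pair="<s> $A </s> $B:1 </s>:1",
+                special_tokens=[("<s>", 0), ("</s>", 2)],
+            ),
+        ])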
+    """
+    def __init__(self, processors):
+        pass
+
+    def num_special_tokens_to_add(self, is_pair):
+        """
+        Return the number of special tokens that would be added for single/pair sentences.
+
+        Args:
+            is_pair (:obj:`bool`):
+                Whether the input would be a pair of sequences
+
+        Returns:
+            :obj:`int`: The number of tokens to add
+        """
+        pass
+
+    def process(self, encoding, pair=None, add_special_tokens=True):
+        """
+        Post-process the given encodings, generating the final one
+
+        Args:
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding for the first sequence
+
+            pair (:class:`~tokenizers.Encoding`, `optional`):
+                The encoding for the pair sequence
+
+            add_special_tokens (:obj:`bool`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The final encoding
+        """
+        pass
+
+class TemplateProcessing(PostProcessor):
+    """
+    Provides a way to specify templates in order to add the special tokens to each
+    input sequence as relevant.
+
+    Let's take the :obj:`BERT` tokenizer as an example. It uses two special tokens to
+    delimit each sequence: :obj:`[CLS]` is always used at the beginning of the first
+    sequence, and :obj:`[SEP]` is added at the end of both the first and the pair
+    sequences. The final result looks like this:
+
+        - Single sequence: :obj:`[CLS] Hello there [SEP]`
+        - Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`
+
+    With the type ids as follows::
+
+        [CLS]   ...   [SEP]   ...   [SEP]
+          0      0      0      1      1
+
+    You can achieve such behavior using a TemplateProcessing::
+
+        TemplateProcessing(
+            single="[CLS] $0 [SEP]",
+            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
+            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+        )
+
+    In this example, each input sequence is identified using a ``$`` construct. This identifier
+    lets us specify each input sequence, and the type_id to use. When nothing is specified,
+    it uses the default values. Here are the different ways to specify it:
+
+        - Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B``
+        - Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ...
+        - Specifying both: ``$A:0``, ``$B:1``, ...
+
+    The same construct is used for special tokens: ``<identifier>(:<type_id>)?``.
+
+    **Warning**: You must ensure that you are giving the correct tokens/ids as these
+    will be added to the Encoding without any further check. If the given ids correspond
+    to something totally different in a `Tokenizer` using this `PostProcessor`, it
+    might lead to unexpected results.
+
+    Args:
+        single (:obj:`Template`):
+            The template used for single sequences
+
+        pair (:obj:`Template`):
+            The template used when both sequences are specified
+
+        special_tokens (:obj:`Tokens`):
+            The list of special tokens used in each sequence
+
+    Types:
+
+        Template (:obj:`str` or :obj:`List`):
+            - If a :obj:`str` is provided, the whitespace is used as delimiter between tokens
+            - If a :obj:`List[str]` is provided, a list of tokens
+
+        Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`):
+            - A :obj:`Tuple` with both a token and its associated ID, in any order
+            - A :obj:`dict` with the following keys:
+                - "id": :obj:`str` => The special token id, as specified in the Template
+                - "ids": :obj:`List[int]` => The associated IDs
+                - "tokens": :obj:`List[str]` => The associated tokens
+
+             The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
+             the same length.
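+
+    Example (an illustrative sketch of wiring the template above into a tokenizer; it
+    assumes ``tokenizer`` is an existing :class:`~tokenizers.Tokenizer` whose vocabulary
+    maps ``[CLS]`` to 1 and ``[SEP]`` to 0)::
+
+        from tokenizers.processors import TemplateProcessing
+
+        tokenizer.post_processor = TemplateProcessing(
+            single="[CLS] $A [SEP]",
+            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
+            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+        )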
+    """
+    def __init__(self, single, pair, special_tokens):
+        pass
+
+    def num_special_tokens_to_add(self, is_pair):
+        """
+        Return the number of special tokens that would be added for single/pair sentences.
+
+        Args:
+            is_pair (:obj:`bool`):
+                Whether the input would be a pair of sequences
+
+        Returns:
+            :obj:`int`: The number of tokens to add
+        """
+        pass
+
+    def process(self, encoding, pair=None, add_special_tokens=True):
+        """
+        Post-process the given encodings, generating the final one
+
+        Args:
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding for the first sequence
+
+            pair (:class:`~tokenizers.Encoding`, `optional`):
+                The encoding for the pair sequence
+
+            add_special_tokens (:obj:`bool`):
+                Whether to add the special tokens
+
+        Returns:
+            :class:`~tokenizers.Encoding`: The final encoding
+        """
+        pass
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so
new file mode 100755
index 00000000..563e2885
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so
Binary files differdiff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py
new file mode 100644
index 00000000..f941e2ed
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py
@@ -0,0 +1 @@
+from .visualizer import Annotation, EncodingVisualizer
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css
new file mode 100644
index 00000000..f54fde45
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css
@@ -0,0 +1,170 @@
+.tokenized-text {
+    width:100%;
+    padding:2rem;
+    max-height: 400px;
+    overflow-y: auto;
+    box-sizing:border-box;
+    line-height:4rem; /* Lots of space between lines */
+    font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
+    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
+    background-color: rgba(0,0,0,0.01);
+    letter-spacing:2px; /* Give some extra separation between chars */
+}
+.non-token{
+    /* White space and other things the tokenizer ignores*/
+    white-space: pre;
+    letter-spacing:4px;
+    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious */
+    border-bottom:1px solid #A0A0A0;
+    line-height: 1rem;
+    height: calc(100% - 2px);
+}
+
+.token {
+    white-space: pre;
+    position:relative;
+    color:black;
+    letter-spacing:2px;
+}
+
+.annotation{
+    white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */
+    border-radius:4px;
+    position:relative;
+    width:fit-content;
+}
+.annotation:before {
+    /*The before holds the text and the after holds the background*/
+    z-index:1000; /* Make sure this is above the background */
+    content:attr(data-label); /* The annotations label is on a data attribute */
+    color:white;
+    position:absolute;
+    font-size:1rem;
+    text-align:center;
+    font-weight:bold;
+
+    top:1.75rem;
+    line-height:0;
+    left:0;
+    width:100%;
+    padding:0.5rem 0;
+    /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/
+    overflow: hidden;
+    white-space: nowrap;
+    text-overflow:ellipsis;
+}
+
+.annotation:after {
+    content:attr(data-label); /* The content defines the width of the annotation*/
+    position:absolute;
+    font-size:0.75rem;
+    text-align:center;
+    font-weight:bold;
+    text-overflow:ellipsis;
+    top:1.75rem;
+    line-height:0;
+    overflow: hidden;
+    white-space: nowrap;
+
+    left:0;
+    width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
+
+    padding:0.5rem 0;
+    /* Nasty hack below:
+    We set the annotations color in code because we don't know the colors at css time.
+    But you can't pass a color as a data attribute to get it into the pseudo element (this thing)
+    So to get around that, annotations have the color set on them with a style attribute and then we
+    can get the color with currentColor.
+    Annotations wrap tokens and tokens set the color back to black
+     */
+    background-color: currentColor;
+}
+.annotation:hover::after, .annotation:hover::before{
+    /* When the user hovers over an annotation expand the label to display in full
+     */
+    min-width: fit-content;
+}
+
+.annotation:hover{
+    /* Emphasize the annotation start end with a border on hover*/
+    border-color: currentColor;
+    border: 2px solid;
+}
+.special-token:not(:empty){
+    /*
+    A non-empty special token is like UNK (as opposed to CLS, which has no representation in the text)
+     */
+    position:relative;
+}
+.special-token:empty::before{
+    /* Special tokens that don't have text are displayed as pseudo elements so we don't select them with the mouse */
+    content:attr(data-stok);
+    background:#202020;
+    font-size:0.75rem;
+    color:white;
+    margin: 0 0.25rem;
+    padding: 0.25rem;
+    border-radius:4px
+}
+
+.special-token:not(:empty):before {
+    /* Special tokens that have text (UNK) are displayed above the actual text*/
+    content:attr(data-stok);
+    position:absolute;
+    bottom:1.75rem;
+    min-width:100%;
+    width:100%;
+    height:1rem;
+    line-height:1rem;
+    font-size:1rem;
+    text-align:center;
+    color:white;
+    font-weight:bold;
+    background:#202020;
+    border-radius:10%;
+}
+/*
+We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations
+instead we apply even and odd class at generation time and color them that way
+ */
+.even-token{
+    background:#DCDCDC;
+    border: 1px solid #DCDCDC;
+}
+.odd-token{
+    background:#A0A0A0;
+    border: 1px solid #A0A0A0;
+}
+.even-token.multi-token,.odd-token.multi-token{
+    background:  repeating-linear-gradient(
+    45deg,
+    transparent,
+    transparent 1px,
+    #ccc 1px,
+    #ccc 1px
+    ),
+    /* on "bottom" */
+    linear-gradient(
+    to bottom,
+    #FFB6C1,
+    #999
+    );
+}
+
+.multi-token:hover::after {
+    content:"This char has more than 1 token"; /* The content defines the width of the annotation*/
+    color:white;
+    background-color: black;
+    position:absolute;
+    font-size:0.75rem;
+    text-align:center;
+    font-weight:bold;
+    text-overflow:ellipsis;
+    top:1.75rem;
+    line-height:0;
+    overflow: hidden;
+    white-space: nowrap;
+    left:0;
+    width:fit-content; /* Only as wide as the tooltip text itself */
+    padding:0.5rem 0;
+}
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py
new file mode 100644
index 00000000..c988a648
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py
@@ -0,0 +1,403 @@
+import itertools
+import os
+import re
+from string import Template
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
+
+from tokenizers import Encoding, Tokenizer
+
+
+dirname = os.path.dirname(__file__)
+css_filename = os.path.join(dirname, "visualizer-styles.css")
+with open(css_filename) as f:
+    css = f.read()
+
+
+class Annotation:
+    start: int
+    end: int
+    label: str
+
+    def __init__(self, start: int, end: int, label: str):
+        self.start = start
+        self.end = end
+        self.label = label
+
+
+AnnotationList = List[Annotation]
+PartialIntList = List[Optional[int]]
+
+
+class CharStateKey(NamedTuple):
+    token_ix: Optional[int]
+    anno_ix: Optional[int]
+
+
+class CharState:
+    char_ix: Optional[int]
+
+    def __init__(self, char_ix):
+        self.char_ix = char_ix
+
+        self.anno_ix: Optional[int] = None
+        self.tokens: List[int] = []
+
+    @property
+    def token_ix(self):
+        return self.tokens[0] if len(self.tokens) > 0 else None
+
+    @property
+    def is_multitoken(self):
+        """
+        BPE tokenizers can output more than one token for a char
+        """
+        return len(self.tokens) > 1
+
+    def partition_key(self) -> CharStateKey:
+        return CharStateKey(
+            token_ix=self.token_ix,
+            anno_ix=self.anno_ix,
+        )
+
+
+class Aligned:
+    pass
+
+
+class EncodingVisualizer:
+    """
+    Build an EncodingVisualizer
+
+    Args:
+
+         tokenizer (:class:`~tokenizers.Tokenizer`):
+            A tokenizer instance
+
+         default_to_notebook (:obj:`bool`):
+            Whether to render html output in a notebook by default
+
+         annotation_converter (:obj:`Callable`, `optional`):
+            An optional (lambda) function that takes an annotation in any format and returns
+            an Annotation object
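+
+    Example (an illustrative sketch; ``tokenizer.json`` is a placeholder path to any
+    serialized tokenizer)::
+
+        from tokenizers import Tokenizer
+        from tokenizers.tools import Annotation, EncodingVisualizer
+
+        tokenizer = Tokenizer.from_file("tokenizer.json")
+        visualizer = EncodingVisualizer(tokenizer, default_to_notebook=False)
+        annotations = [Annotation(start=0, end=5, label="greeting")]
+        html = visualizer("Hello world, how are you?", annotations=annotations)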
+    """
+
+    unk_token_regex = re.compile(r"(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        default_to_notebook: bool = True,
+        annotation_converter: Optional[Callable[[Any], Annotation]] = None,
+    ):
+        if default_to_notebook:
+            try:
+                from IPython.core.display import HTML, display
+            except ImportError:
+                raise Exception(
+                    """We couldn't import IPython utils for html display.
+                        Are you running in a notebook?
+                        You can also pass `default_to_notebook=False` to get back raw HTML
+                    """
+                )
+
+        self.tokenizer = tokenizer
+        self.default_to_notebook = default_to_notebook
+        self.annotation_converter = annotation_converter
+
+    def __call__(
+        self,
+        text: str,
+        annotations: AnnotationList = [],
+        default_to_notebook: Optional[bool] = None,
+    ) -> Optional[str]:
+        """
+        Build a visualization of the given text
+
+        Args:
+            text (:obj:`str`):
+                The text to tokenize
+
+            annotations (:obj:`List[Annotation]`, `optional`):
+                An optional list of annotations of the text. They can either be Annotation
+                objects, or anything else if you instantiated the visualizer with a converter
+                function.
+
+            default_to_notebook (:obj:`bool`, `optional`):
+                If True, will render the html in a notebook. Otherwise returns an html string.
+                Defaults to the value given at construction time.
+
+        Returns:
+            The HTML string if default_to_notebook is False, otherwise (default) returns None and
+            renders the HTML in the notebook
+
+        """
+        final_default_to_notebook = self.default_to_notebook
+        if default_to_notebook is not None:
+            final_default_to_notebook = default_to_notebook
+        if final_default_to_notebook:
+            try:
+                from IPython.core.display import HTML, display
+            except ImportError:
+                raise Exception(
+                    """We couldn't import IPython utils for html display.
+                    Are you running in a notebook?"""
+                )
+        if self.annotation_converter is not None:
+            annotations = list(map(self.annotation_converter, annotations))
+        encoding = self.tokenizer.encode(text)
+        html = EncodingVisualizer.__make_html(text, encoding, annotations)
+        if final_default_to_notebook:
+            display(HTML(html))
+        else:
+            return html
+
+    @staticmethod
+    def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
+        """
+        Generates a color palette for all the labels in a given set of annotations
+
+        Args:
+            annotations (:obj:`List[Annotation]`):
+                A list of annotations
+
+        Returns:
+            :obj:`dict`: A dictionary mapping labels to colors in HSL format
+        """
+        if len(annotations) == 0:
+            return {}
+        labels = set(map(lambda x: x.label, annotations))
+        num_labels = len(labels)
+        h_step = int(255 / num_labels)
+        if h_step < 20:
+            h_step = 20
+        s = 32
+        l = 64  # noqa: E741
+        h = 10
+        colors = {}
+
+        for label in sorted(labels):  # sort so we always get the same colors for a given set of labels
+            colors[label] = f"hsl({h},{s}%,{l}%)"
+            h += h_step
+        return colors
+
+    @staticmethod
+    def consecutive_chars_to_html(
+        consecutive_chars_list: List[CharState],
+        text: str,
+        encoding: Encoding,
+    ):
+        """
+        Converts a list of "consecutive chars" into a single HTML element.
+        Chars are consecutive if they fall under the same word, token and annotation.
+        The CharState class has a "partition_key" method that makes it easy to
+        check whether two chars are consecutive.
+
+        Args:
+            consecutive_chars_list (:obj:`List[CharState]`):
+                A list of CharStates that have been grouped together
+
+            text (:obj:`str`):
+                The original text being processed
+
+            encoding (:class:`~tokenizers.Encoding`):
+                The encoding returned from the tokenizer
+
+        Returns:
+            :obj:`str`: The HTML span for a set of consecutive chars
+        """
+        first = consecutive_chars_list[0]
+        if first.char_ix is None:
+            # it's a special token
+            stoken = encoding.tokens[first.token_ix]
+            # special tokens are represented as empty spans. We use the data attribute and css
+            # magic to display it (the stylesheet reads the label from the `data-stok` attribute)
+            return f'<span class="special-token" data-stok="{stoken}"></span>'
+        # We're not in a special token so this group has a start and end.
+        last = consecutive_chars_list[-1]
+        start = first.char_ix
+        end = last.char_ix + 1
+        span_text = text[start:end]
+        css_classes = []  # What css classes will we apply on the resulting span
+        data_items = {}  # What data attributes will we apply on the result span
+        if first.token_ix is not None:
+            # We can either be in a token or not (e.g. in white space)
+            css_classes.append("token")
+            if first.is_multitoken:
+                css_classes.append("multi-token")
+            if first.token_ix % 2:
+                # We use this to color alternating tokens.
+                # A token might be split by an annotation that ends in the middle of it, so this
+                # lets us visually indicate a consecutive token despite its possible splitting in
+                # the html markup
+                css_classes.append("odd-token")
+            else:
+                # Like above, but a different color so we can see the tokens alternate
+                css_classes.append("even-token")
+            if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
+                # This is a special token that is in the text. probably UNK
+                css_classes.append("special-token")
+                # TODO is this the right name for the data attribute ?
+                data_items["stok"] = encoding.tokens[first.token_ix]
+        else:
+            # In this case we are looking at a group/single char that is not tokenized.
+            # e.g. white space
+            css_classes.append("non-token")
+        css = f'''class="{' '.join(css_classes)}"'''
+        data = ""
+        for key, val in data_items.items():
+            data += f' data-{key}="{val}"'
+        return f"<span {css} {data} >{span_text}</span>"
+
+    @staticmethod
+    def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
+        char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
+        current_consecutive_chars = [char_states[0]]
+        prev_anno_ix = char_states[0].anno_ix
+        spans = []
+        label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
+        cur_anno_ix = char_states[0].anno_ix
+        if cur_anno_ix is not None:
+            # If we started in an annotation, make a span for it
+            anno = annotations[cur_anno_ix]
+            label = anno.label
+            color = label_colors_dict[label]
+            spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+
+        for cs in char_states[1:]:
+            cur_anno_ix = cs.anno_ix
+            if cur_anno_ix != prev_anno_ix:
+                # If we've transitioned in or out of an annotation
+                spans.append(
+                    # Create a span from the current consecutive characters
+                    EncodingVisualizer.consecutive_chars_to_html(
+                        current_consecutive_chars,
+                        text=text,
+                        encoding=encoding,
+                    )
+                )
+                current_consecutive_chars = [cs]
+
+                if prev_anno_ix is not None:
+                    # if we transitioned out of an annotation, close its span
+                    spans.append("</span>")
+                if cur_anno_ix is not None:
+                    # If we entered a new annotation make a span for it
+                    anno = annotations[cur_anno_ix]
+                    label = anno.label
+                    color = label_colors_dict[label]
+                    spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+            prev_anno_ix = cur_anno_ix
+
+            if cs.partition_key() == current_consecutive_chars[0].partition_key():
+                # If the current character is in the same "group" as the previous one
+                current_consecutive_chars.append(cs)
+            else:
+                # Otherwise we make a span for the previous group
+                spans.append(
+                    EncodingVisualizer.consecutive_chars_to_html(
+                        current_consecutive_chars,
+                        text=text,
+                        encoding=encoding,
+                    )
+                )
+                # And reset the consecutive_char_list to form a new group
+                current_consecutive_chars = [cs]
+        # All that's left is to fill out the final span
+        # TODO I think there is an edge case here where an annotation's span might not close
+        spans.append(
+            EncodingVisualizer.consecutive_chars_to_html(
+                current_consecutive_chars,
+                text=text,
+                encoding=encoding,
+            )
+        )
+        res = HTMLBody(spans)  # Send the list of spans to the body of our html
+        return res
+
+    @staticmethod
+    def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
+        """
+        Args:
+            text (:obj:`str`):
+                The raw text we want to align to
+
+            annotations (:obj:`AnnotationList`):
+                A (possibly empty) list of annotations
+
+        Returns:
+            A list of length ``len(text)`` whose entry at index ``i`` is ``None`` if no
+            annotation covers character ``i``, or ``k``, where ``k`` is the index (into the
+            annotations list) of the annotation that covers character ``i``.
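+
+            For example (illustrative), with ``text = "hello world"`` and a single
+            annotation covering ``"world"`` (``start=6, end=11``), the returned map is::
+
+                [None, None, None, None, None, None, 0, 0, 0, 0, 0]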
+        """
+        annotation_map = [None] * len(text)
+        for anno_ix, a in enumerate(annotations):
+            for i in range(a.start, a.end):
+                annotation_map[i] = anno_ix
+        return annotation_map
+
+    @staticmethod
+    def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
+        """
+        For each character in the original text, we build a CharState describing its "state":
+
+            * which token_ix (or token_ixs, when a char maps to several tokens) it corresponds to
+            * which annotation_ix it corresponds to
+
+        Args:
+            text (:obj:`str`):
+                The raw text we want to align to
+
+            annotations (:obj:`List[Annotation]`):
+                A (possibly empty) list of annotations
+
+            encoding: (:class:`~tokenizers.Encoding`):
+                The encoding returned from the tokenizer
+
+        Returns:
+            :obj:`List[CharState]`: A list of CharStates, indicating the state of each char
+            in the text
+        """
+        annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
+        # Todo make this a dataclass or named tuple
+        char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
+        for token_ix, token in enumerate(encoding.tokens):
+            offsets = encoding.token_to_chars(token_ix)
+            if offsets is not None:
+                start, end = offsets
+                for i in range(start, end):
+                    char_states[i].tokens.append(token_ix)
+        for char_ix, anno_ix in enumerate(annotation_map):
+            char_states[char_ix].anno_ix = anno_ix
+
+        return char_states
+
+
+def HTMLBody(children: List[str], css_styles=css) -> str:
+    """
+    Generates the full html with css from a list of html spans
+
+    Args:
+        children (:obj:`List[str]`):
+            A list of strings, assumed to be html elements
+
+        css_styles (:obj:`str`, `optional`):
+            Optional alternative implementation of the css
+
+    Returns:
+        :obj:`str`: An HTML string with style markup
+    """
+    children_text = "".join(children)
+    return f"""
+    <html>
+        <head>
+            <style>
+                {css_styles}
+            </style>
+        </head>
+        <body>
+            <div class="tokenized-text" dir=auto>
+            {children_text}
+            </div>
+        </body>
+    </html>
+    """
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py
new file mode 100644
index 00000000..22f94c50
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py
@@ -0,0 +1,8 @@
+# Generated content DO NOT EDIT
+from .. import trainers
+
+Trainer = trainers.Trainer
+BpeTrainer = trainers.BpeTrainer
+UnigramTrainer = trainers.UnigramTrainer
+WordLevelTrainer = trainers.WordLevelTrainer
+WordPieceTrainer = trainers.WordPieceTrainer
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi
new file mode 100644
index 00000000..d6c52571
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi
@@ -0,0 +1,156 @@
+# Generated content DO NOT EDIT
+class Trainer:
+    """
+    Base class for all trainers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    Trainer will return an instance of this class when instantiated.
+    """
+
+class BpeTrainer(Trainer):
+    """
+    Trainer capable of training a BPE model
+
+    Args:
+        vocab_size (:obj:`int`, `optional`):
+            The size of the final vocabulary, including all tokens and alphabet.
+
+        min_frequency (:obj:`int`, `optional`):
+            The minimum frequency a pair should have in order to be merged.
+
+        show_progress (:obj:`bool`, `optional`):
+            Whether to show progress bars while training.
+
+        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+            A list of special tokens the model should know of.
+
+        limit_alphabet (:obj:`int`, `optional`):
+            The maximum different characters to keep in the alphabet.
+
+        initial_alphabet (:obj:`List[str]`, `optional`):
+            A list of characters to include in the initial alphabet, even
+            if not seen in the training dataset.
+            If the strings contain more than one character, only the first one
+            is kept.
+
+        continuing_subword_prefix (:obj:`str`, `optional`):
+            A prefix to be used for every subword that is not a beginning-of-word.
+
+        end_of_word_suffix (:obj:`str`, `optional`):
+            A suffix to be used for every subword that is an end-of-word.
+
+        max_token_length (:obj:`int`, `optional`):
+            Prevents creating tokens longer than the specified size.
+            This can help keep highly repetitive tokens, like `======` in
+            Wikipedia dumps, from polluting your vocabulary.
+
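+    Example (an illustrative sketch; ``files`` is a placeholder list of paths to
+    plain-text training files)::
+
+        from tokenizers import Tokenizer
+        from tokenizers.models import BPE
+        from tokenizers.pre_tokenizers import Whitespace
+        from tokenizers.trainers import BpeTrainer
+
+        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+        tokenizer.pre_tokenizer = Whitespace()
+        trainer = BpeTrainer(vocab_size=30000, special_tokens=["[UNK]"])
+        tokenizer.train(files, trainer=trainer)
+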
+    """
+
+class UnigramTrainer(Trainer):
+    """
+    Trainer capable of training a Unigram model
+
+    Args:
+        vocab_size (:obj:`int`):
+            The size of the final vocabulary, including all tokens and alphabet.
+
+        show_progress (:obj:`bool`):
+            Whether to show progress bars while training.
+
+        special_tokens (:obj:`List[Union[str, AddedToken]]`):
+            A list of special tokens the model should know of.
+
+        initial_alphabet (:obj:`List[str]`):
+            A list of characters to include in the initial alphabet, even
+            if not seen in the training dataset.
+            If the strings contain more than one character, only the first one
+            is kept.
+
+        shrinking_factor (:obj:`float`):
+            The shrinking factor used at each step of the training to prune the
+            vocabulary.
+
+        unk_token (:obj:`str`):
+            The token used for out-of-vocabulary tokens.
+
+        max_piece_length (:obj:`int`):
+            The maximum length of a given token.
+
+        n_sub_iterations (:obj:`int`):
+            The number of iterations of the EM algorithm to perform before
+            pruning the vocabulary.
+    """
+    def __init__(
+        self,
+        vocab_size=8000,
+        show_progress=True,
+        special_tokens=[],
+        shrinking_factor=0.75,
+        unk_token=None,
+        max_piece_length=16,
+        n_sub_iterations=2,
+    ):
+        pass
+
+class WordLevelTrainer(Trainer):
+    """
+    Trainer capable of training a WordLevel model
+
+    Args:
+        vocab_size (:obj:`int`, `optional`):
+            The size of the final vocabulary, including all tokens and alphabet.
+
+        min_frequency (:obj:`int`, `optional`):
+            The minimum frequency a pair should have in order to be merged.
+
+        show_progress (:obj:`bool`, `optional`):
+            Whether to show progress bars while training.
+
+        special_tokens (:obj:`List[Union[str, AddedToken]]`):
+            A list of special tokens the model should know of.
+    """
+
+class WordPieceTrainer(Trainer):
+    """
+    Trainer capable of training a WordPiece model
+
+    Args:
+        vocab_size (:obj:`int`, `optional`):
+            The size of the final vocabulary, including all tokens and alphabet.
+
+        min_frequency (:obj:`int`, `optional`):
+            The minimum frequency a pair should have in order to be merged.
+
+        show_progress (:obj:`bool`, `optional`):
+            Whether to show progress bars while training.
+
+        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+            A list of special tokens the model should know of.
+
+        limit_alphabet (:obj:`int`, `optional`):
+            The maximum different characters to keep in the alphabet.
+
+        initial_alphabet (:obj:`List[str]`, `optional`):
+            A list of characters to include in the initial alphabet, even
+            if not seen in the training dataset.
+            If the strings contain more than one character, only the first one
+            is kept.
+
+        continuing_subword_prefix (:obj:`str`, `optional`):
+            A prefix to be used for every subword that is not a beginning-of-word.
+
+        end_of_word_suffix (:obj:`str`, `optional`):
+            A suffix to be used for every subword that is an end-of-word.
+    """
+    def __init__(
+        self,
+        vocab_size=30000,
+        min_frequency=0,
+        show_progress=True,
+        special_tokens=[],
+        limit_alphabet=None,
+        initial_alphabet=[],
+        continuing_subword_prefix="##",
+        end_of_word_suffix=None,
+    ):
+        pass