path: root/.venv/lib/python3.12/site-packages/tokenizers
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers')
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/__init__.py  100
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/__init__.pyi  1200
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py  14
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi  271
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py  6
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py  418
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py  151
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py  122
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py  150
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py  103
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py  196
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/models/__init__.py  8
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi  591
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py  29
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi  595
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py  15
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi  607
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py  9
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi  342
-rwxr-xr-x  .venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so  bin 0 -> 11826456 bytes
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py  1
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css  170
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py  403
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py  8
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi  156
25 files changed, 5665 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py
new file mode 100644
index 00000000..efd57429
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py
@@ -0,0 +1,100 @@
+from enum import Enum
+from typing import List, Tuple, Union
+
+
+Offsets = Tuple[int, int]
+
+TextInputSequence = str
+"""A :obj:`str` that represents an input sequence """
+
+PreTokenizedInputSequence = Union[List[str], Tuple[str]]
+"""A pre-tokenized input sequence. Can be one of:
+
+ - A :obj:`List` of :obj:`str`
+ - A :obj:`Tuple` of :obj:`str`
+"""
+
+TextEncodeInput = Union[
+ TextInputSequence,
+ Tuple[TextInputSequence, TextInputSequence],
+ List[TextInputSequence],
+]
+"""Represents a textual input for encoding. Can be either:
+
+ - A single sequence: :data:`~tokenizers.TextInputSequence`
+ - A pair of sequences:
+
+ - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
+ - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
+"""
+
+PreTokenizedEncodeInput = Union[
+ PreTokenizedInputSequence,
+ Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+ List[PreTokenizedInputSequence],
+]
+"""Represents a pre-tokenized input for encoding. Can be either:
+
+ - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
+ - A pair of sequences:
+
+ - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
+ - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
+"""
+
+InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+"""Represents all the possible types of input sequences for encoding. Can be:
+
+ - When ``is_pretokenized=False``: :data:`~TextInputSequence`
+ - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
+"""
+
+EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
+"""Represents all the possible types of input for encoding. Can be:
+
+ - When ``is_pretokenized=False``: :data:`~TextEncodeInput`
+ - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
+"""
+
+
+class OffsetReferential(Enum):
+ ORIGINAL = "original"
+ NORMALIZED = "normalized"
+
+
+class OffsetType(Enum):
+ BYTE = "byte"
+ CHAR = "char"
+
+
+class SplitDelimiterBehavior(Enum):
+ REMOVED = "removed"
+ ISOLATED = "isolated"
+ MERGED_WITH_PREVIOUS = "merged_with_previous"
+ MERGED_WITH_NEXT = "merged_with_next"
+ CONTIGUOUS = "contiguous"
+
+
+from .tokenizers import (
+ AddedToken,
+ Encoding,
+ NormalizedString,
+ PreTokenizedString,
+ Regex,
+ Token,
+ Tokenizer,
+ decoders,
+ models,
+ normalizers,
+ pre_tokenizers,
+ processors,
+ trainers,
+ __version__,
+)
+from .implementations import (
+ BertWordPieceTokenizer,
+ ByteLevelBPETokenizer,
+ CharBPETokenizer,
+ SentencePieceBPETokenizer,
+ SentencePieceUnigramTokenizer,
+)
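
As a quick illustration of how the aliases and enums above map onto actual calls, here is a minimal sketch; the tiny in-memory corpus and the vocabulary size are illustrative assumptions::

    from tokenizers import Tokenizer, models, pre_tokenizers, trainers

    # Train a tiny BPE tokenizer fully in memory (corpus and sizes are made up).
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=200, special_tokens=["[UNK]"])
    tokenizer.train_from_iterator(["hello world", "hello tokenizers"], trainer=trainer)

    # TextInputSequence / TextEncodeInput: a raw string, optionally with a pair.
    enc = tokenizer.encode("hello world")
    enc_pair = tokenizer.encode("hello world", "hello tokenizers")

    # PreTokenizedInputSequence: a list of words, flagged with is_pretokenized=True.
    enc_pre = tokenizer.encode(["hello", "world"], is_pretokenized=True)
    print(enc.tokens, enc_pair.type_ids, enc_pre.ids)
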
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi
new file mode 100644
index 00000000..5dbc665d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi
@@ -0,0 +1,1200 @@
+# Generated content DO NOT EDIT
+class AddedToken:
+ """
+ Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
+ It can have special options that define the way it should behave.
+
+ Args:
+ content (:obj:`str`): The content of the token
+
+ single_word (:obj:`bool`, defaults to :obj:`False`):
+ Defines whether this token should only match single words. If :obj:`True`, this
+ token will never match inside of a word. For example, the token ``ing`` would match
+ on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
+ The notion of "`inside of a word`" is defined by the word boundaries pattern in
+ regular expressions (i.e. the token should start and end with word boundaries).
+
+ lstrip (:obj:`bool`, defaults to :obj:`False`):
+ Defines whether this token should strip all potential whitespaces on its left side.
+ If :obj:`True`, this token will greedily match any whitespace on its left. For
+ example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
+ ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
+
+ rstrip (:obj:`bool`, defaults to :obj:`False`):
+ Defines whether this token should strip all potential whitespaces on its right
+ side. If :obj:`True`, this token will greedily match any whitespace on its right.
+ It works just like :obj:`lstrip` but on the right.
+
+ normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
+ Defines whether this token should match against the normalized version of the input
+ text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
+ lowercasing the text, the token could be extracted from the input ``"I saw a lion
+ Yesterday"``.
+ special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
+ Defines whether this token should be skipped when decoding.
+
+ """
+ def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
+ pass
+
+ @property
+ def content(self):
+ """
+ Get the content of this :obj:`AddedToken`
+ """
+ pass
+
+ @property
+ def lstrip(self):
+ """
+ Get the value of the :obj:`lstrip` option
+ """
+ pass
+
+ @property
+ def normalized(self):
+ """
+ Get the value of the :obj:`normalized` option
+ """
+ pass
+
+ @property
+ def rstrip(self):
+ """
+ Get the value of the :obj:`rstrip` option
+ """
+ pass
+
+ @property
+ def single_word(self):
+ """
+ Get the value of the :obj:`single_word` option
+ """
+ pass
+
+ @property
+ def special(self):
+ """
+ Get the value of the :obj:`special` option
+ """
+ pass
+
+class Encoding:
+ """
+ The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
+ """
+ @property
+ def attention_mask(self):
+ """
+ The attention mask
+
+ This indicates to the LM which tokens should be attended to, and which should not.
+ This is especially important when batching sequences, where we need to apply
+ padding.
+
+ Returns:
+ :obj:`List[int]`: The attention mask
+ """
+ pass
+
+ def char_to_token(self, char_pos, sequence_index=0):
+ """
+ Get the token that contains the char at the given position in the input sequence.
+
+ Args:
+ char_pos (:obj:`int`):
+ The position of a char in the input string
+ sequence_index (:obj:`int`, defaults to :obj:`0`):
+ The index of the sequence that contains the target char
+
+ Returns:
+ :obj:`int`: The index of the token that contains this char in the encoded sequence
+ """
+ pass
+
+ def char_to_word(self, char_pos, sequence_index=0):
+ """
+ Get the word that contains the char at the given position in the input sequence.
+
+ Args:
+ char_pos (:obj:`int`):
+ The position of a char in the input string
+ sequence_index (:obj:`int`, defaults to :obj:`0`):
+ The index of the sequence that contains the target char
+
+ Returns:
+ :obj:`int`: The index of the word that contains this char in the input sequence
+ """
+ pass
+
+ @property
+ def ids(self):
+ """
+ The generated IDs
+
+ The IDs are the main input to a Language Model. They are the token indices,
+ the numerical representations that a LM understands.
+
+ Returns:
+ :obj:`List[int]`: The list of IDs
+ """
+ pass
+
+ @staticmethod
+ def merge(encodings, growing_offsets=True):
+ """
+ Merge the list of encodings into one final :class:`~tokenizers.Encoding`
+
+ Args:
+ encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
+ The list of encodings that should be merged in one
+
+ growing_offsets (:obj:`bool`, defaults to :obj:`True`):
+ Whether the offsets should accumulate while merging
+
+ Returns:
+ :class:`~tokenizers.Encoding`: The resulting Encoding
+ """
+ pass
+
+ @property
+ def n_sequences(self):
+ """
+ The number of sequences represented
+
+ Returns:
+ :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
+ """
+ pass
+
+ @property
+ def offsets(self):
+ """
+ The offsets associated to each token
+
+ These offsets let you slice the input string, and thus retrieve the original
+ part that led to producing the corresponding token.
+
+ Returns:
+ A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
+ """
+ pass
+
+ @property
+ def overflowing(self):
+ """
+ A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
+
+ When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
+ the output into as many pieces as required to match the specified maximum length.
+ This field lets you retrieve all the subsequent pieces.
+
+ When you use pairs of sequences, the overflowing pieces will contain enough
+ variations to cover all the possible combinations, while respecting the provided
+ maximum length.
+ """
+ pass
+
+ def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
+ """
+ Pad the :class:`~tokenizers.Encoding` at the given length
+
+ Args:
+ length (:obj:`int`):
+ The desired length
+
+ direction (:obj:`str`, defaults to :obj:`right`):
+ The expected padding direction. Can be either :obj:`right` or :obj:`left`
+
+ pad_id (:obj:`int`, defaults to :obj:`0`):
+ The ID corresponding to the padding token
+
+ pad_type_id (:obj:`int`, defaults to :obj:`0`):
+ The type ID corresponding to the padding token
+
+ pad_token (:obj:`str`, defaults to `[PAD]`):
+ The pad token to use
+ """
+ pass
+
+ @property
+ def sequence_ids(self):
+ """
+ The generated sequence indices.
+
+ They represent the index of the input sequence associated to each token.
+ The sequence id can be None if the token is not related to any input sequence,
+ like for example with special tokens.
+
+ Returns:
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
+ """
+ pass
+
+ def set_sequence_id(self, sequence_id):
+ """
+ Set the given sequence index
+
+ Set the given sequence index for the whole range of tokens contained in this
+ :class:`~tokenizers.Encoding`.
+ """
+ pass
+
+ @property
+ def special_tokens_mask(self):
+ """
+ The special token mask
+
+ This indicates which tokens are special tokens, and which are not.
+
+ Returns:
+ :obj:`List[int]`: The special tokens mask
+ """
+ pass
+
+ def token_to_chars(self, token_index):
+ """
+ Get the offsets of the token at the given index.
+
+ The returned offsets are related to the input sequence that contains the
+ token. In order to determine in which input sequence it belongs, you
+ must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
+
+ Args:
+ token_index (:obj:`int`):
+ The index of a token in the encoded sequence.
+
+ Returns:
+ :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
+ """
+ pass
+
+ def token_to_sequence(self, token_index):
+ """
+ Get the index of the sequence represented by the given token.
+
+ In the general use case, this method returns :obj:`0` for a single sequence or
+ the first sequence of a pair, and :obj:`1` for the second sequence of a pair
+
+ Args:
+ token_index (:obj:`int`):
+ The index of a token in the encoded sequence.
+
+ Returns:
+ :obj:`int`: The sequence id of the given token
+ """
+ pass
+
+ def token_to_word(self, token_index):
+ """
+ Get the index of the word that contains the token in one of the input sequences.
+
+ The returned word index is related to the input sequence that contains
+ the token. In order to determine in which input sequence it belongs, you
+ must call :meth:`~tokenizers.Encoding.token_to_sequence()`.
+
+ Args:
+ token_index (:obj:`int`):
+ The index of a token in the encoded sequence.
+
+ Returns:
+ :obj:`int`: The index of the word in the relevant input sequence.
+ """
+ pass
+
+ @property
+ def tokens(self):
+ """
+ The generated tokens
+
+ They are the string representation of the IDs.
+
+ Returns:
+ :obj:`List[str]`: The list of tokens
+ """
+ pass
+
+ def truncate(self, max_length, stride=0, direction="right"):
+ """
+ Truncate the :class:`~tokenizers.Encoding` at the given length
+
+ If this :class:`~tokenizers.Encoding` represents multiple sequences, this information
+ is lost when truncating: the result will be considered as representing a single sequence.
+
+ Args:
+ max_length (:obj:`int`):
+ The desired length
+
+ stride (:obj:`int`, defaults to :obj:`0`):
+ The length of previous content to be included in each overflowing piece
+
+ direction (:obj:`str`, defaults to :obj:`right`):
+ Truncate direction
+ """
+ pass
+
+ @property
+ def type_ids(self):
+ """
+ The generated type IDs
+
+ Generally used for tasks like sequence classification or question answering,
+ these tokens let the LM know which input sequence corresponds to each token.
+
+ Returns:
+ :obj:`List[int]`: The list of type ids
+ """
+ pass
+
+ @property
+ def word_ids(self):
+ """
+ The generated word indices.
+
+ They represent the index of the word associated to each token.
+ When the input is pre-tokenized, they correspond to the ID of the given input label,
+ otherwise they correspond to the word indices as defined by the
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
+
+ For special tokens and such (any token that was generated from something that was
+ not part of the input), the output is :obj:`None`
+
+ Returns:
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
+ """
+ pass
+
+ def word_to_chars(self, word_index, sequence_index=0):
+ """
+ Get the offsets of the word at the given index in one of the input sequences.
+
+ Args:
+ word_index (:obj:`int`):
+ The index of a word in one of the input sequences.
+ sequence_index (:obj:`int`, defaults to :obj:`0`):
+ The index of the sequence that contains the target word
+
+ Returns:
+ :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
+ """
+ pass
+
+ def word_to_tokens(self, word_index, sequence_index=0):
+ """
+ Get the encoded tokens corresponding to the word at the given index
+ in one of the input sequences.
+
+ Args:
+ word_index (:obj:`int`):
+ The index of a word in one of the input sequences.
+ sequence_index (:obj:`int`, defaults to :obj:`0`):
+ The index of the sequence that contains the target word
+
+ Returns:
+ :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
+ """
+ pass
+
+ @property
+ def words(self):
+ """
+ The generated word indices.
+
+ .. warning::
+ This is deprecated and will be removed in a future version.
+ Please use :obj:`~tokenizers.Encoding.word_ids` instead.
+
+ They represent the index of the word associated to each token.
+ When the input is pre-tokenized, they correspond to the ID of the given input label,
+ otherwise they correspond to the word indices as defined by the
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
+
+ For special tokens and such (any token that was generated from something that was
+ not part of the input), the output is :obj:`None`
+
+ Returns:
+ A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
+ """
+ pass
+
+class NormalizedString:
+ """
+ NormalizedString
+
+ A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
+ While making all the requested modifications, it keeps track of the alignment information
+ between the two versions of the string.
+
+ Args:
+ sequence: str:
+ The string sequence used to initialize this NormalizedString
+ """
+ def append(self, s):
+ """
+ Append the given sequence to the string
+ """
+ pass
+
+ def clear(self):
+ """
+ Clears the string
+ """
+ pass
+
+ def filter(self, func):
+ """
+ Filter each character of the string using the given func
+ """
+ pass
+
+ def for_each(self, func):
+ """
+ Calls the given function for each character of the string
+ """
+ pass
+
+ def lowercase(self):
+ """
+ Lowercase the string
+ """
+ pass
+
+ def lstrip(self):
+ """
+ Strip the left of the string
+ """
+ pass
+
+ def map(self, func):
+ """
+ Calls the given function for each character of the string
+
+ Replaces each character of the string using the returned value. Each
+ returned value **must** be a str of length 1 (ie a character).
+ """
+ pass
+
+ def nfc(self):
+ """
+ Runs the NFC normalization
+ """
+ pass
+
+ def nfd(self):
+ """
+ Runs the NFD normalization
+ """
+ pass
+
+ def nfkc(self):
+ """
+ Runs the NFKC normalization
+ """
+ pass
+
+ def nfkd(self):
+ """
+ Runs the NFKD normalization
+ """
+ pass
+
+ @property
+ def normalized(self):
+ """
+ The normalized part of the string
+ """
+ pass
+
+ def prepend(self, s):
+ """
+ Prepend the given sequence to the string
+ """
+ pass
+
+ def replace(self, pattern, content):
+ """
+ Replace the content of the given pattern with the provided content
+
+ Args:
+ pattern: Pattern:
+ A pattern used to match the string. Usually a string or a Regex
+
+ content: str:
+ The content to be used as replacement
+ """
+ pass
+
+ def rstrip(self):
+ """
+ Strip the right of the string
+ """
+ pass
+
+ def slice(self, range):
+ """
+ Slice the string using the given range
+ """
+ pass
+
+ def split(self, pattern, behavior):
+ """
+ Split the NormalizedString using the given pattern and the specified behavior
+
+ Args:
+ pattern: Pattern:
+ A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
+
+ behavior: SplitDelimiterBehavior:
+ The behavior to use when splitting.
+ Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+ "contiguous"
+
+ Returns:
+ A list of NormalizedString, representing each split
+ """
+ pass
+
+ def strip(self):
+ """
+ Strip both ends of the string
+ """
+ pass
+
+ def uppercase(self):
+ """
+ Uppercase the string
+ """
+ pass
+
+class PreTokenizedString:
+ """
+ PreTokenizedString
+
+ Wrapper over a string that provides a way to normalize, pre-tokenize, and tokenize the
+ underlying string, while keeping track of the alignment information (offsets).
+
+ The PreTokenizedString manages what we call `splits`. Each split represents a substring
+ which is a subpart of the original string, with the relevant offsets and tokens.
+
+ When calling one of the methods used to modify the PreTokenizedString (namely one of
+ `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
+ tokens will get modified.
+
+ Args:
+ sequence: str:
+ The string sequence used to initialize this PreTokenizedString
+ """
+ def __init__(self, sequence):
+ pass
+
+ def get_splits(self, offset_referential="original", offset_type="char"):
+ """
+ Get the splits currently managed by the PreTokenizedString
+
+ Args:
+ offset_referential: :obj:`str`
+ Whether the returned splits should have offsets expressed relative
+ to the original string, or the normalized one. choices: "original", "normalized".
+
+ offset_type: :obj:`str`
+ Whether the returned splits should have offsets expressed in bytes or chars.
+ When slicing an str, we usually want to use chars, which is the default value.
+ Now in some cases it might be interesting to get these offsets expressed in bytes,
+ so it is possible to change this here.
+ choices: "char", "bytes"
+
+ Returns
+ A list of splits
+ """
+ pass
+
+ def normalize(self, func):
+ """
+ Normalize each split of the `PreTokenizedString` using the given `func`
+
+ Args:
+ func: Callable[[NormalizedString], None]:
+ The function used to normalize each underlying split. This function
+ does not need to return anything, just calling the methods on the provided
+ NormalizedString allows its modification.
+ """
+ pass
+
+ def split(self, func):
+ """
+ Split the PreTokenizedString using the given `func`
+
+ Args:
+ func: Callable[[index, NormalizedString], List[NormalizedString]]:
+ The function used to split each underlying split.
+ It is expected to return a list of `NormalizedString`, that represent the new
+ splits. If the given `NormalizedString` does not need any splitting, we can
+ just return it directly.
+ In order for the offsets to be tracked accurately, any returned `NormalizedString`
+ should come from calling either `.split` or `.slice` on the received one.
+ """
+ pass
+
+ def to_encoding(self, type_id=0, word_idx=None):
+ """
+ Return an Encoding generated from this PreTokenizedString
+
+ Args:
+ type_id: int = 0:
+ The type_id to be used on the generated Encoding.
+
+ word_idx: Optional[int] = None:
+ An optional word index to be used for each token of this Encoding. If provided,
+ all the word indices in the generated Encoding will use this value, instead
+ of the one automatically tracked during pre-tokenization.
+
+ Returns:
+ An Encoding
+ """
+ pass
+
+ def tokenize(self, func):
+ """
+ Tokenize each split of the `PreTokenizedString` using the given `func`
+
+ Args:
+ func: Callable[[str], List[Token]]:
+ The function used to tokenize each underlying split. This function must return
+ a list of Token generated from the input str.
+ """
+ pass
+
+class Regex:
+ """
+ Instantiate a new Regex with the given pattern
+ """
+ def __init__(self, pattern):
+ pass
+
+class Token:
+ pass
+
+class Tokenizer:
+ """
+ A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
+ and outputs an :class:`~tokenizers.Encoding`.
+
+ Args:
+ model (:class:`~tokenizers.models.Model`):
+ The core algorithm that this :obj:`Tokenizer` should be using.
+
+ """
+ def __init__(self, model):
+ pass
+
+ def add_special_tokens(self, tokens):
+ """
+ Add the given special tokens to the Tokenizer.
+
+ If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
+ them. If they don't exist, the Tokenizer creates them, giving them a new id.
+
+ These special tokens will never be processed by the model (i.e. won't be split into
+ multiple tokens), and they can be removed from the output when decoding.
+
+ Args:
+ tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
+ The list of special tokens we want to add to the vocabulary. Each token can either
+ be a string or an instance of :class:`~tokenizers.AddedToken` for more
+ customization.
+
+ Returns:
+ :obj:`int`: The number of tokens that were created in the vocabulary
+ """
+ pass
+
+ def add_tokens(self, tokens):
+ """
+ Add the given tokens to the vocabulary
+
+ The given tokens are added only if they don't already exist in the vocabulary.
+ Each token then gets a newly attributed id.
+
+ Args:
+ tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
+ The list of tokens we want to add to the vocabulary. Each token can be either a
+ string or an instance of :class:`~tokenizers.AddedToken` for more customization.
+
+ Returns:
+ :obj:`int`: The number of tokens that were created in the vocabulary
+ """
+ pass
+
+ def decode(self, ids, skip_special_tokens=True):
+ """
+ Decode the given list of ids back to a string
+
+ This is used to decode anything coming back from a Language Model
+
+ Args:
+ ids (A :obj:`List/Tuple` of :obj:`int`):
+ The list of ids that we want to decode
+
+ skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+ Whether the special tokens should be removed from the decoded string
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+ def decode_batch(self, sequences, skip_special_tokens=True):
+ """
+ Decode a batch of ids back to their corresponding string
+
+ Args:
+ sequences (:obj:`List` of :obj:`List[int]`):
+ The batch of sequences we want to decode
+
+ skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+ Whether the special tokens should be removed from the decoded strings
+
+ Returns:
+ :obj:`List[str]`: A list of decoded strings
+ """
+ pass
+
+ @property
+ def decoder(self):
+ """
+ The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
+ """
+ pass
+
+ def enable_padding(
+ self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
+ ):
+ """
+ Enable the padding
+
+ Args:
+ direction (:obj:`str`, `optional`, defaults to :obj:`right`):
+ The direction in which to pad. Can be either ``right`` or ``left``
+
+ pad_to_multiple_of (:obj:`int`, `optional`):
+ If specified, the padding length should always snap to the next multiple of the
+ given value. For example, if we were going to pad with a length of 250 but
+ ``pad_to_multiple_of=8`` then we will pad to 256.
+
+ pad_id (:obj:`int`, defaults to 0):
+ The id to be used when padding
+
+ pad_type_id (:obj:`int`, defaults to 0):
+ The type id to be used when padding
+
+ pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
+ The pad token to be used when padding
+
+ length (:obj:`int`, `optional`):
+ If specified, the length at which to pad. If not specified we pad using the size of
+ the longest sequence in a batch.
+ """
+ pass
+
+ def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
+ """
+ Enable truncation
+
+ Args:
+ max_length (:obj:`int`):
+ The max length at which to truncate
+
+ stride (:obj:`int`, `optional`):
+ The length of the previous first sequence to be included in the overflowing
+ sequence
+
+ strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
+ The strategy to use for truncation. Can be one of ``longest_first``, ``only_first`` or
+ ``only_second``.
+
+ direction (:obj:`str`, defaults to :obj:`right`):
+ Truncate direction
+ """
+ pass
+
+ def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
+ """
+ Encode the given sequence and pair. This method can process raw text sequences
+ as well as already pre-tokenized sequences.
+
+ Example:
+ Here are some examples of the inputs that are accepted::
+
+ encode("A single sequence")`
+ encode("A sequence", "And its pair")`
+ encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)`
+ encode(
+ [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
+ is_pretokenized=True
+ )
+
+ Args:
+ sequence (:obj:`~tokenizers.InputSequence`):
+ The main input sequence we want to encode. This sequence can be either raw
+ text or pre-tokenized, according to the ``is_pretokenized`` argument:
+
+ - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
+ - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
+
+ pair (:obj:`~tokenizers.InputSequence`, `optional`):
+ An optional input sequence. The expected format is the same as for ``sequence``.
+
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+ Whether the input is already pre-tokenized
+
+ add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+ Whether to add the special tokens
+
+ Returns:
+ :class:`~tokenizers.Encoding`: The encoded result
+
+ """
+ pass
+
+ def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
+ """
+ Encode the given batch of inputs. This method accepts raw text sequences
+ as well as already pre-tokenized sequences.
+
+ Example:
+ Here are some examples of the inputs that are accepted::
+
+ encode_batch([
+ "A single sequence",
+ ("A tuple with a sequence", "And its pair"),
+ [ "A", "pre", "tokenized", "sequence" ],
+ ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+ ])
+
+ Args:
+ input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+ A list of single sequences or pair sequences to encode. Each sequence
+ can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+ argument:
+
+ - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+ - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+ Whether the input is already pre-tokenized
+
+ add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+ Whether to add the special tokens
+
+ Returns:
+ A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+ """
+ pass
+
+ @property
+ def encode_special_tokens(self):
+ """
+ Modifies the tokenizer in order to use or not use the special tokens
+ during encoding.
+
+ Args:
+ value (:obj:`bool`):
+ Whether to use the special tokens or not
+
+ """
+ pass
+
+ @staticmethod
+ def from_buffer(buffer):
+ """
+ Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
+
+ Args:
+ buffer (:obj:`bytes`):
+ A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
+
+ Returns:
+ :class:`~tokenizers.Tokenizer`: The new tokenizer
+ """
+ pass
+
+ @staticmethod
+ def from_file(path):
+ """
+ Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
+
+ Args:
+ path (:obj:`str`):
+ A path to a local JSON file representing a previously serialized
+ :class:`~tokenizers.Tokenizer`
+
+ Returns:
+ :class:`~tokenizers.Tokenizer`: The new tokenizer
+ """
+ pass
+
+ @staticmethod
+ def from_pretrained(identifier, revision="main", auth_token=None):
+ """
+ Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
+ Hugging Face Hub.
+
+ Args:
+ identifier (:obj:`str`):
+ The identifier of a Model on the Hugging Face Hub, that contains
+ a tokenizer.json file
+ revision (:obj:`str`, defaults to `main`):
+ A branch or commit id
+ auth_token (:obj:`str`, `optional`, defaults to `None`):
+ An optional auth token used to access private repositories on the
+ Hugging Face Hub
+
+ Returns:
+ :class:`~tokenizers.Tokenizer`: The new tokenizer
+ """
+ pass
+
+ @staticmethod
+ def from_str(json):
+ """
+ Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
+
+ Args:
+ json (:obj:`str`):
+ A valid JSON string representing a previously serialized
+ :class:`~tokenizers.Tokenizer`
+
+ Returns:
+ :class:`~tokenizers.Tokenizer`: The new tokenizer
+ """
+ pass
+
+ def get_added_tokens_decoder(self):
+ """
+ Get the added tokens decoder, mapping ids to their :class:`~tokenizers.AddedToken`
+
+ Returns:
+ :obj:`Dict[int, AddedToken]`: The mapping from id to :class:`~tokenizers.AddedToken`
+ """
+ pass
+
+ def get_vocab(self, with_added_tokens=True):
+ """
+ Get the underlying vocabulary
+
+ Args:
+ with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
+ Whether to include the added tokens
+
+ Returns:
+ :obj:`Dict[str, int]`: The vocabulary
+ """
+ pass
+
+ def get_vocab_size(self, with_added_tokens=True):
+ """
+ Get the size of the underlying vocabulary
+
+ Args:
+ with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
+ Whether to include the added tokens
+
+ Returns:
+ :obj:`int`: The size of the vocabulary
+ """
+ pass
+
+ def id_to_token(self, id):
+ """
+ Convert the given id to its corresponding token if it exists
+
+ Args:
+ id (:obj:`int`):
+ The id to convert
+
+ Returns:
+ :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
+ """
+ pass
+
+ @property
+ def model(self):
+ """
+ The :class:`~tokenizers.models.Model` in use by the Tokenizer
+ """
+ pass
+
+ def no_padding(self):
+ """
+ Disable padding
+ """
+ pass
+
+ def no_truncation(self):
+ """
+ Disable truncation
+ """
+ pass
+
+ @property
+ def normalizer(self):
+ """
+ The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
+ """
+ pass
+
+ def num_special_tokens_to_add(self, is_pair):
+ """
+ Return the number of special tokens that would be added for single/pair sentences.
+ :param is_pair: Boolean indicating if the input would be a single sentence or a pair
+ :return:
+ """
+ pass
+
+ @property
+ def padding(self):
+ """
+ Get the current padding parameters
+
+ `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
+
+ Returns:
+ (:obj:`dict`, `optional`):
+ A dict with the current padding parameters if padding is enabled
+ """
+ pass
+
+ def post_process(self, encoding, pair=None, add_special_tokens=True):
+ """
+ Apply all the post-processing steps to the given encodings.
+
+ The various steps are:
+
+ 1. Truncate according to the set truncation params (provided with
+ :meth:`~tokenizers.Tokenizer.enable_truncation`)
+ 2. Apply the :class:`~tokenizers.processors.PostProcessor`
+ 3. Pad according to the set padding params (provided with
+ :meth:`~tokenizers.Tokenizer.enable_padding`)
+
+ Args:
+ encoding (:class:`~tokenizers.Encoding`):
+ The :class:`~tokenizers.Encoding` corresponding to the main sequence.
+
+ pair (:class:`~tokenizers.Encoding`, `optional`):
+ An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
+
+ add_special_tokens (:obj:`bool`):
+ Whether to add the special tokens
+
+ Returns:
+ :class:`~tokenizers.Encoding`: The final post-processed encoding
+ """
+ pass
+
+ @property
+ def post_processor(self):
+ """
+ The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
+ """
+ pass
+
+ @property
+ def pre_tokenizer(self):
+ """
+ The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
+ """
+ pass
+
+ def save(self, path, pretty=True):
+ """
+ Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
+
+ Args:
+ path (:obj:`str`):
+ A path to a file in which to save the serialized tokenizer.
+
+ pretty (:obj:`bool`, defaults to :obj:`True`):
+ Whether the JSON file should be pretty formatted.
+ """
+ pass
+
+ def to_str(self, pretty=False):
+ """
+ Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
+
+ Args:
+ pretty (:obj:`bool`, defaults to :obj:`False`):
+ Whether the JSON string should be pretty formatted.
+
+ Returns:
+ :obj:`str`: A string representing the serialized Tokenizer
+ """
+ pass
+
+ def token_to_id(self, token):
+ """
+ Convert the given token to its corresponding id if it exists
+
+ Args:
+ token (:obj:`str`):
+ The token to convert
+
+ Returns:
+ :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
+ """
+ pass
+
+ def train(self, files, trainer=None):
+ """
+ Train the Tokenizer using the given files.
+
+ Reads the files line by line, while keeping all the whitespace, even new lines.
+ If you want to train from data stored in memory, you can check
+ :meth:`~tokenizers.Tokenizer.train_from_iterator`
+
+ Args:
+ files (:obj:`List[str]`):
+ A list of paths to the files that we should use for training
+
+ trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
+ An optional trainer that should be used to train our Model
+ """
+ pass
+
+ def train_from_iterator(self, iterator, trainer=None, length=None):
+ """
+ Train the Tokenizer using the provided iterator.
+
+ You can provide anything that is a Python Iterator
+
+ * A list of sequences :obj:`List[str]`
+ * A generator that yields :obj:`str` or :obj:`List[str]`
+ * A Numpy array of strings
+ * ...
+
+ Args:
+ iterator (:obj:`Iterator`):
+ Any iterator over strings or list of strings
+
+ trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
+ An optional trainer that should be used to train our Model
+
+ length (:obj:`int`, `optional`):
+ The total number of sequences in the iterator. This is used to
+ provide meaningful progress tracking
+ """
+ pass
+
+ @property
+ def truncation(self):
+ """
+ Get the currently set truncation parameters
+
+ `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
+
+ Returns:
+ (:obj:`dict`, `optional`):
+ A dict with the current truncation parameters if truncation is enabled
+ """
+ pass
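
A usage sketch of the :class:`~tokenizers.Tokenizer` API documented above; it assumes network access to the Hugging Face Hub and that the ``bert-base-uncased`` repository ships a tokenizer.json::

    from tokenizers import Tokenizer

    tok = Tokenizer.from_pretrained("bert-base-uncased")
    tok.enable_truncation(max_length=16)
    tok.enable_padding(pad_id=0, pad_token="[PAD]")

    encodings = tok.encode_batch([
        "A single sequence",
        ("A question", "and the context it refers to"),
    ])
    for enc in encodings:
        print(enc.tokens)
        print(enc.attention_mask)
        print(enc.offsets[:3])       # (start, end) character spans into the input
        print(enc.char_to_token(2))  # index of the token covering the char at position 2

    print(tok.decode(encodings[0].ids, skip_special_tokens=True))
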
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py
new file mode 100644
index 00000000..a717379c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py
@@ -0,0 +1,14 @@
+from .. import decoders
+
+
+Decoder = decoders.Decoder
+ByteLevel = decoders.ByteLevel
+Replace = decoders.Replace
+WordPiece = decoders.WordPiece
+ByteFallback = decoders.ByteFallback
+Fuse = decoders.Fuse
+Strip = decoders.Strip
+Metaspace = decoders.Metaspace
+BPEDecoder = decoders.BPEDecoder
+CTC = decoders.CTC
+Sequence = decoders.Sequence
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi
new file mode 100644
index 00000000..b967fbd1
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi
@@ -0,0 +1,271 @@
+# Generated content DO NOT EDIT
+class Decoder:
+ """
+ Base class for all decoders
+
+ This class is not supposed to be instantiated directly. Instead, any implementation of
+ a Decoder will return an instance of this class when instantiated.
+ """
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class BPEDecoder(Decoder):
+ """
+ BPEDecoder Decoder
+
+ Args:
+ suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
+ The suffix that was used to characterize an end-of-word. This suffix will
+ be replaced by whitespaces during the decoding
+ """
+ def __init__(self, suffix="</w>"):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class ByteFallback(Decoder):
+ """
+ ByteFallback Decoder
+ ByteFallback is a simple trick which converts tokens looking like `<0x61>`
+ to pure bytes, and attempts to make them into a string. If the tokens
+ cannot be decoded you will get � instead for each inconvertible byte token
+
+ """
+ def __init__(self):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class ByteLevel(Decoder):
+ """
+ ByteLevel Decoder
+
+ This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
+ """
+ def __init__(self):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class CTC(Decoder):
+ """
+ CTC Decoder
+
+ Args:
+ pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`):
+ The pad token used by CTC to delimit a new token.
+ word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`):
+ The word delimiter token. It will be replaced by a <space>
+ cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to cleanup some tokenization artifacts.
+ Mainly spaces before punctuation, and some abbreviated English forms.
+ """
+ def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class Fuse(Decoder):
+ """
+ Fuse Decoder
+ Fuse simply fuses every token into a single string.
+ This is the last step of decoding; this decoder exists only if
+ there is a need to add other decoders *after* the fusion
+ """
+ def __init__(self):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class Metaspace(Decoder):
+ """
+ Metaspace Decoder
+
+ Args:
+ replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
+ The replacement character. Must be exactly one character. By default we
+ use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
+
+ prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
+ Whether to add a space to the first word if there isn't already one. This
+ lets us treat `hello` exactly like `say hello`.
+ Choices: "always", "never", "first". First means the space is only added on the first
+ token (relevant when special tokens are used or other pre_tokenizers are used).
+ """
+ def __init__(self, replacement="▁", prepend_scheme="always", split=True):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class Replace(Decoder):
+ """
+ Replace Decoder
+
+ This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
+ """
+ def __init__(self, pattern, content):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class Sequence(Decoder):
+ """
+ Sequence Decoder
+
+ Args:
+ decoders (:obj:`List[Decoder]`):
+ The decoders that need to be chained
+ """
+ def __init__(self, decoders):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class Strip(Decoder):
+ """
+ Strip Decoder
+ Strips n left characters of each token, or n right characters of each token
+ """
+ def __init__(self, content, left=0, right=0):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
+
+class WordPiece(Decoder):
+ """
+ WordPiece Decoder
+
+ Args:
+ prefix (:obj:`str`, `optional`, defaults to :obj:`##`):
+ The prefix to use for subwords that are not a beginning-of-word
+
+ cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
+ and some abbreviated English forms.
+ """
+ def __init__(self, prefix="##", cleanup=True):
+ pass
+
+ def decode(self, tokens):
+ """
+ Decode the given list of tokens to a final string
+
+ Args:
+ tokens (:obj:`List[str]`):
+ The list of tokens to decode
+
+ Returns:
+ :obj:`str`: The decoded string
+ """
+ pass
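
Every decoder above exposes the same ``decode(tokens)`` method, and decoders can be chained with :class:`~tokenizers.decoders.Sequence`. A short sketch, with made-up token lists::

    from tokenizers import decoders

    wp = decoders.WordPiece(prefix="##", cleanup=True)
    print(wp.decode(["un", "##believ", "##able"]))  # -> "unbelievable"

    # Chain ByteFallback and Fuse: byte tokens are turned back into text, then fused.
    chained = decoders.Sequence([decoders.ByteFallback(), decoders.Fuse()])
    print(chained.decode(["<0x68>", "<0x69>", "!"]))  # -> "hi!"
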
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py
new file mode 100644
index 00000000..7e775892
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py
@@ -0,0 +1,6 @@
+from .base_tokenizer import BaseTokenizer
+from .bert_wordpiece import BertWordPieceTokenizer
+from .byte_level_bpe import ByteLevelBPETokenizer
+from .char_level_bpe import CharBPETokenizer
+from .sentencepiece_bpe import SentencePieceBPETokenizer
+from .sentencepiece_unigram import SentencePieceUnigramTokenizer
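
Any of the ready-made implementations re-exported above can be trained directly from an iterator. A minimal sketch using :class:`~tokenizers.ByteLevelBPETokenizer`; the corpus and vocabulary size are illustrative assumptions::

    from tokenizers import ByteLevelBPETokenizer

    corpus = ["low lower lowest", "new newer newest"]
    bbpe = ByteLevelBPETokenizer()
    bbpe.train_from_iterator(corpus, vocab_size=300, min_frequency=1, special_tokens=["<pad>"])

    enc = bbpe.encode("a newer tokenizer")
    print(enc.tokens)
    print(bbpe.decode(enc.ids))
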
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py
new file mode 100644
index 00000000..4528dceb
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py
@@ -0,0 +1,418 @@
+from typing import Dict, List, Optional, Tuple, Union
+
+from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer
+from tokenizers.decoders import Decoder
+from tokenizers.models import Model
+from tokenizers.normalizers import Normalizer
+from tokenizers.pre_tokenizers import PreTokenizer
+from tokenizers.processors import PostProcessor
+
+
+Offsets = Tuple[int, int]
+
+
+class BaseTokenizer:
+ def __init__(self, tokenizer: Tokenizer, parameters=None):
+ self._tokenizer = tokenizer
+ self._parameters = parameters if parameters is not None else {}
+
+ def __repr__(self):
+ return "Tokenizer(vocabulary_size={}, {})".format(
+ self._tokenizer.get_vocab_size(),
+ ", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
+ )
+
+ def num_special_tokens_to_add(self, is_pair: bool) -> int:
+ """
+ Return the number of special tokens that would be added for single/pair sentences.
+ :param is_pair: Boolean indicating if the input would be a single sentence or a pair
+ :return:
+ """
+ return self._tokenizer.num_special_tokens_to_add(is_pair)
+
+ def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
+ """Returns the vocabulary
+
+ Args:
+ with_added_tokens: boolean:
+ Whether to include the added tokens in the vocabulary
+
+ Returns:
+ The vocabulary
+ """
+ return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
+
+ def get_added_tokens_decoder(self) -> Dict[int, AddedToken]:
+ """Returns the added reverse vocabulary
+
+ Returns:
+ The added vocabulary mapping ints to AddedTokens
+ """
+ return self._tokenizer.get_added_tokens_decoder()
+
+ def get_vocab_size(self, with_added_tokens: bool = True) -> int:
+ """Return the size of vocabulary, with or without added tokens.
+
+ Args:
+ with_added_tokens: (`optional`) bool:
+ Whether to count in added special tokens or not
+
+ Returns:
+ Size of vocabulary
+ """
+ return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
+
+ def enable_padding(
+ self,
+ direction: Optional[str] = "right",
+ pad_to_multiple_of: Optional[int] = None,
+ pad_id: Optional[int] = 0,
+ pad_type_id: Optional[int] = 0,
+ pad_token: Optional[str] = "[PAD]",
+ length: Optional[int] = None,
+ ):
+ """Change the padding strategy
+
+ Args:
+ direction: (`optional`) str:
+ Can be one of: `right` or `left`
+
+ pad_to_multiple_of: (`optional`) unsigned int:
+ If specified, the padding length should always snap to the next multiple of
+ the given value. For example if we were going to pad with a length of 250 but
+ `pad_to_multiple_of=8` then we will pad to 256.
+
+ pad_id: (`optional`) unsigned int:
+ The id to be used when padding
+
+ pad_type_id: (`optional`) unsigned int:
+ The type id to be used when padding
+
+ pad_token: (`optional`) str:
+ The pad token to be used when padding
+
+ length: (`optional`) unsigned int:
+ If specified, the length at which to pad. If not specified
+ we pad using the size of the longest sequence in a batch
+ """
+ return self._tokenizer.enable_padding(
+ direction=direction,
+ pad_to_multiple_of=pad_to_multiple_of,
+ pad_id=pad_id,
+ pad_type_id=pad_type_id,
+ pad_token=pad_token,
+ length=length,
+ )
+
+ def no_padding(self):
+ """Disable padding"""
+ return self._tokenizer.no_padding()
+
+ @property
+ def padding(self) -> Optional[dict]:
+ """Get the current padding parameters
+
+ Returns:
+ None if padding is disabled, a dict with the currently set parameters
+ if the padding is enabled.
+ """
+ return self._tokenizer.padding
+
+ def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
+ """Change the truncation options
+
+ Args:
+ max_length: unsigned int:
+ The maximum length at which to truncate
+
+ stride: (`optional`) unsigned int:
+ The length of the previous first sequence to be included
+ in the overflowing sequence
+
+ strategy: (`optional`) str:
+ Can be one of `longest_first`, `only_first` or `only_second`
+ """
+ return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
+
+ def no_truncation(self):
+ """Disable truncation"""
+ return self._tokenizer.no_truncation()
+
+ @property
+ def truncation(self) -> Optional[dict]:
+ """Get the current truncation parameters
+
+ Returns:
+ None if truncation is disabled, a dict with the current truncation parameters if
+ truncation is enabled
+ """
+ return self._tokenizer.truncation
+
+ def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
+ """Add the given tokens to the vocabulary
+
+ Args:
+ tokens: List[Union[str, AddedToken]]:
+ A list of tokens to add to the vocabulary. Each token can either be
+ a string, or an instance of AddedToken
+
+ Returns:
+ The number of tokens that were added to the vocabulary
+ """
+ return self._tokenizer.add_tokens(tokens)
+
+ def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
+ """Add the given special tokens to the vocabulary, and treat them as special tokens.
+
+ The special tokens will never be processed by the model, and will be
+ removed while decoding.
+
+ Args:
+ special_tokens: List[Union[str, AddedToken]]:
+ A list of special tokens to add to the vocabulary. Each token can either be
+ a string, or an instance of AddedToken
+
+ Returns:
+ The number of tokens that were added to the vocabulary
+ """
+ return self._tokenizer.add_special_tokens(special_tokens)
+
+ def normalize(self, sequence: str) -> str:
+ """Normalize the given sequence
+
+ Args:
+ sequence: str:
+ The sequence to normalize
+
+ Returns:
+ The normalized string
+ """
+ return self._tokenizer.normalize(sequence)
+
+ def encode(
+ self,
+ sequence: InputSequence,
+ pair: Optional[InputSequence] = None,
+ is_pretokenized: bool = False,
+ add_special_tokens: bool = True,
+ ) -> Encoding:
+ """Encode the given sequence and pair. This method can process raw text sequences as well
+ as already pre-tokenized sequences.
+
+ Args:
+ sequence: InputSequence:
+ The sequence we want to encode. This sequence can be either raw text or
+ pre-tokenized, according to the `is_pretokenized` argument:
+
+ - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+ - If `is_pretokenized=True`: `InputSequence` is expected to be
+ `Union[List[str], Tuple[str]]`
+
+ is_pretokenized: bool:
+ Whether the input is already pre-tokenized.
+
+ add_special_tokens: bool:
+ Whether to add the special tokens while encoding.
+
+ Returns:
+ An Encoding
+ """
+ if sequence is None:
+ raise ValueError("encode: `sequence` can't be `None`")
+
+ return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)
+
+ def encode_batch(
+ self,
+ inputs: List[EncodeInput],
+ is_pretokenized: bool = False,
+ add_special_tokens: bool = True,
+ ) -> List[Encoding]:
+ """Encode the given inputs. This method accept both raw text sequences as well as already
+ pre-tokenized sequences.
+
+ Args:
+ inputs: List[EncodeInput]:
+ A list of single sequences or pair sequences to encode. Each `EncodeInput` is
+ expected to be of the following form:
+ `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+
+ Each `InputSequence` can either be raw text or pre-tokenized,
+ according to the `is_pretokenized` argument:
+
+ - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+ - If `is_pretokenized=True`: `InputSequence` is expected to be
+ `Union[List[str], Tuple[str]]`
+
+ is_pretokenized: bool:
+ Whether the input is already pre-tokenized.
+
+ add_special_tokens: bool:
+ Whether to add the special tokens while encoding.
+
+ Returns:
+ A list of Encoding
+ """
+
+ if inputs is None:
+ raise ValueError("encode_batch: `inputs` can't be `None`")
+
+ return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
+
+ def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
+ """Decode the given list of ids to a string sequence
+
+ Args:
+ ids: List[unsigned int]:
+ A list of ids to be decoded
+
+ skip_special_tokens: (`optional`) boolean:
+ Whether to remove all the special tokens from the output string
+
+ Returns:
+ The decoded string
+ """
+ if ids is None:
+ raise ValueError("None input is not valid. Should be a list of integers.")
+
+ return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
+
+ def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> List[str]:
+ """Decode the list of sequences to a list of string sequences
+
+ Args:
+ sequences: List[List[unsigned int]]:
+ A list of sequence of ids to be decoded
+
+ skip_special_tokens: (`optional`) boolean:
+ Whether to remove all the special tokens from the output strings
+
+ Returns:
+ A list of decoded strings
+ """
+ if sequences is None:
+ raise ValueError("None input is not valid. Should be list of list of integers.")
+
+ return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
+
+ def token_to_id(self, token: str) -> Optional[int]:
+ """Convert the given token to its corresponding id
+
+ Args:
+ token: str:
+ The token to convert
+
+ Returns:
+ The corresponding id if it exists, None otherwise
+ """
+ return self._tokenizer.token_to_id(token)
+
+ def id_to_token(self, id: int) -> Optional[str]:
+ """Convert the given token id to its corresponding string
+
+ Args:
+ id: int:
+ The token id to convert
+
+ Returns:
+ The corresponding string if it exists, None otherwise
+ """
+ return self._tokenizer.id_to_token(id)
+
+ def save_model(self, directory: str, prefix: Optional[str] = None):
+ """Save the current model to the given directory
+
+ Args:
+ directory: str:
+ A path to the destination directory
+
+ prefix: (Optional) str:
+ An optional prefix, used to prefix each file name
+ """
+ return self._tokenizer.model.save(directory, prefix=prefix)
+
+ def save(self, path: str, pretty: bool = True):
+ """Save the current Tokenizer at the given path
+
+ Args:
+ path: str:
+ A path to the destination Tokenizer file
+ """
+ return self._tokenizer.save(path, pretty)
+
+ def to_str(self, pretty: bool = False):
+ """Get a serialized JSON version of the Tokenizer as a str
+
+ Args:
+ pretty: bool:
+ Whether the JSON string should be prettified
+
+ Returns:
+ str
+ """
+ return self._tokenizer.to_str(pretty)
+
+ def post_process(
+ self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
+ ) -> Encoding:
+ """Apply all the post-processing steps to the given encodings.
+
+ The various steps are:
+ 1. Truncate according to global params (provided to `enable_truncation`)
+ 2. Apply the PostProcessor
+ 3. Pad according to global params. (provided to `enable_padding`)
+
+ Args:
+ encoding: Encoding:
+ The main Encoding to post process
+
+ pair: Optional[Encoding]:
+ An optional pair Encoding
+
+ add_special_tokens: bool:
+ Whether to add special tokens
+
+ Returns:
+ The resulting Encoding
+ """
+ return self._tokenizer.post_process(encoding, pair, add_special_tokens)
+
+ @property
+ def model(self) -> Model:
+ return self._tokenizer.model
+
+ @model.setter
+ def model(self, model: Model):
+ self._tokenizer.model = model
+
+ @property
+ def normalizer(self) -> Normalizer:
+ return self._tokenizer.normalizer
+
+ @normalizer.setter
+ def normalizer(self, normalizer: Normalizer):
+ self._tokenizer.normalizer = normalizer
+
+ @property
+ def pre_tokenizer(self) -> PreTokenizer:
+ return self._tokenizer.pre_tokenizer
+
+ @pre_tokenizer.setter
+ def pre_tokenizer(self, pre_tokenizer: PreTokenizer):
+ self._tokenizer.pre_tokenizer = pre_tokenizer
+
+ @property
+ def post_processor(self) -> PostProcessor:
+ return self._tokenizer.post_processor
+
+ @post_processor.setter
+ def post_processor(self, post_processor: PostProcessor):
+ self._tokenizer.post_processor = post_processor
+
+ @property
+ def decoder(self) -> Decoder:
+ return self._tokenizer.decoder
+
+ @decoder.setter
+ def decoder(self, decoder: Decoder):
+ self._tokenizer.decoder = decoder
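+
+
+# A minimal usage sketch, assuming BaseTokenizer.__init__ takes the wrapped Tokenizer and a
+# parameters dict (the call the implementation subclasses make via super().__init__()): the
+# properties above let callers swap individual pipeline components after construction.
+if __name__ == "__main__":
+    from tokenizers import Tokenizer, models, normalizers
+
+    wrapped = BaseTokenizer(Tokenizer(models.BPE()), {"model": "BPE"})
+    wrapped.normalizer = normalizers.Lowercase()  # goes through the `normalizer` setter above
+    print(wrapped.token_to_id("hello"))           # None until a vocabulary is trained or loaded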
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py
new file mode 100644
index 00000000..1f34e3ca
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py
@@ -0,0 +1,151 @@
+from typing import Dict, Iterator, List, Optional, Union
+
+from tokenizers import AddedToken, Tokenizer, decoders, trainers
+from tokenizers.models import WordPiece
+from tokenizers.normalizers import BertNormalizer
+from tokenizers.pre_tokenizers import BertPreTokenizer
+from tokenizers.processors import BertProcessing
+
+from .base_tokenizer import BaseTokenizer
+
+
+class BertWordPieceTokenizer(BaseTokenizer):
+ """Bert WordPiece Tokenizer"""
+
+ def __init__(
+ self,
+ vocab: Optional[Union[str, Dict[str, int]]] = None,
+ unk_token: Union[str, AddedToken] = "[UNK]",
+ sep_token: Union[str, AddedToken] = "[SEP]",
+ cls_token: Union[str, AddedToken] = "[CLS]",
+ pad_token: Union[str, AddedToken] = "[PAD]",
+ mask_token: Union[str, AddedToken] = "[MASK]",
+ clean_text: bool = True,
+ handle_chinese_chars: bool = True,
+ strip_accents: Optional[bool] = None,
+ lowercase: bool = True,
+ wordpieces_prefix: str = "##",
+ ):
+ if vocab is not None:
+ tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
+ else:
+ tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
+
+ # Let the tokenizer know about special tokens if they are part of the vocab
+ if tokenizer.token_to_id(str(unk_token)) is not None:
+ tokenizer.add_special_tokens([str(unk_token)])
+ if tokenizer.token_to_id(str(sep_token)) is not None:
+ tokenizer.add_special_tokens([str(sep_token)])
+ if tokenizer.token_to_id(str(cls_token)) is not None:
+ tokenizer.add_special_tokens([str(cls_token)])
+ if tokenizer.token_to_id(str(pad_token)) is not None:
+ tokenizer.add_special_tokens([str(pad_token)])
+ if tokenizer.token_to_id(str(mask_token)) is not None:
+ tokenizer.add_special_tokens([str(mask_token)])
+
+ tokenizer.normalizer = BertNormalizer(
+ clean_text=clean_text,
+ handle_chinese_chars=handle_chinese_chars,
+ strip_accents=strip_accents,
+ lowercase=lowercase,
+ )
+ tokenizer.pre_tokenizer = BertPreTokenizer()
+
+ if vocab is not None:
+ sep_token_id = tokenizer.token_to_id(str(sep_token))
+ if sep_token_id is None:
+ raise TypeError("sep_token not found in the vocabulary")
+ cls_token_id = tokenizer.token_to_id(str(cls_token))
+ if cls_token_id is None:
+ raise TypeError("cls_token not found in the vocabulary")
+
+ tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
+ tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
+
+ parameters = {
+ "model": "BertWordPiece",
+ "unk_token": unk_token,
+ "sep_token": sep_token,
+ "cls_token": cls_token,
+ "pad_token": pad_token,
+ "mask_token": mask_token,
+ "clean_text": clean_text,
+ "handle_chinese_chars": handle_chinese_chars,
+ "strip_accents": strip_accents,
+ "lowercase": lowercase,
+ "wordpieces_prefix": wordpieces_prefix,
+ }
+
+ super().__init__(tokenizer, parameters)
+
+ @staticmethod
+ def from_file(vocab: str, **kwargs):
+ vocab = WordPiece.read_file(vocab)
+ return BertWordPieceTokenizer(vocab, **kwargs)
+
+ def train(
+ self,
+ files: Union[str, List[str]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ limit_alphabet: int = 1000,
+ initial_alphabet: List[str] = [],
+ special_tokens: List[Union[str, AddedToken]] = [
+ "[PAD]",
+ "[UNK]",
+ "[CLS]",
+ "[SEP]",
+ "[MASK]",
+ ],
+ show_progress: bool = True,
+ wordpieces_prefix: str = "##",
+ ):
+ """Train the model using the given files"""
+
+ trainer = trainers.WordPieceTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ limit_alphabet=limit_alphabet,
+ initial_alphabet=initial_alphabet,
+ special_tokens=special_tokens,
+ show_progress=show_progress,
+ continuing_subword_prefix=wordpieces_prefix,
+ )
+ if isinstance(files, str):
+ files = [files]
+ self._tokenizer.train(files, trainer=trainer)
+
+ def train_from_iterator(
+ self,
+ iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ limit_alphabet: int = 1000,
+ initial_alphabet: List[str] = [],
+ special_tokens: List[Union[str, AddedToken]] = [
+ "[PAD]",
+ "[UNK]",
+ "[CLS]",
+ "[SEP]",
+ "[MASK]",
+ ],
+ show_progress: bool = True,
+ wordpieces_prefix: str = "##",
+ length: Optional[int] = None,
+ ):
+ """Train the model using the given iterator"""
+
+ trainer = trainers.WordPieceTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ limit_alphabet=limit_alphabet,
+ initial_alphabet=initial_alphabet,
+ special_tokens=special_tokens,
+ show_progress=show_progress,
+ continuing_subword_prefix=wordpieces_prefix,
+ )
+ self._tokenizer.train_from_iterator(
+ iterator,
+ trainer=trainer,
+ length=length,
+ )
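+
+
+# A minimal usage sketch, assuming a plain-text training corpus at the placeholder path
+# "corpus.txt": train a WordPiece vocabulary, then encode a sentence. Note that the
+# BertProcessing post-processor is only attached when a vocab is passed to __init__, so
+# special tokens are not inserted around a freshly trained instance's output.
+if __name__ == "__main__":
+    tokenizer = BertWordPieceTokenizer(lowercase=True)
+    tokenizer.train(files="corpus.txt", vocab_size=30000)  # "corpus.txt" is a placeholder path
+    encoding = tokenizer.encode("Hello, how are you?")
+    print(encoding.tokens, encoding.ids)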
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py
new file mode 100644
index 00000000..c7e3dbc4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py
@@ -0,0 +1,122 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
+from tokenizers.models import BPE
+from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
+
+from .base_tokenizer import BaseTokenizer
+
+
+class ByteLevelBPETokenizer(BaseTokenizer):
+ """ByteLevelBPETokenizer
+
+ Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
+ """
+
+ def __init__(
+ self,
+ vocab: Optional[Union[str, Dict[str, int]]] = None,
+ merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+ add_prefix_space: bool = False,
+ lowercase: bool = False,
+ dropout: Optional[float] = None,
+ unicode_normalizer: Optional[str] = None,
+ continuing_subword_prefix: Optional[str] = None,
+ end_of_word_suffix: Optional[str] = None,
+ trim_offsets: bool = False,
+ ):
+ if vocab is not None and merges is not None:
+ tokenizer = Tokenizer(
+ BPE(
+ vocab,
+ merges,
+ dropout=dropout,
+ continuing_subword_prefix=continuing_subword_prefix or "",
+ end_of_word_suffix=end_of_word_suffix or "",
+ )
+ )
+ else:
+ tokenizer = Tokenizer(BPE())
+
+ # Check for Unicode normalization first (before everything else)
+ normalizers = []
+
+ if unicode_normalizer:
+ normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
+
+ if lowercase:
+ normalizers += [Lowercase()]
+
+ # Create the normalizer structure
+ if len(normalizers) > 0:
+ if len(normalizers) > 1:
+ tokenizer.normalizer = Sequence(normalizers)
+ else:
+ tokenizer.normalizer = normalizers[0]
+
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+ tokenizer.decoder = decoders.ByteLevel()
+ tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
+
+ parameters = {
+ "model": "ByteLevelBPE",
+ "add_prefix_space": add_prefix_space,
+ "lowercase": lowercase,
+ "dropout": dropout,
+ "unicode_normalizer": unicode_normalizer,
+ "continuing_subword_prefix": continuing_subword_prefix,
+ "end_of_word_suffix": end_of_word_suffix,
+ "trim_offsets": trim_offsets,
+ }
+
+ super().__init__(tokenizer, parameters)
+
+ @staticmethod
+ def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+ vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+ return ByteLevelBPETokenizer(vocab, merges, **kwargs)
+
+ def train(
+ self,
+ files: Union[str, List[str]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ show_progress: bool = True,
+ special_tokens: List[Union[str, AddedToken]] = [],
+ ):
+ """Train the model using the given files"""
+
+ trainer = trainers.BpeTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ show_progress=show_progress,
+ special_tokens=special_tokens,
+ initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+ )
+ if isinstance(files, str):
+ files = [files]
+ self._tokenizer.train(files, trainer=trainer)
+
+ def train_from_iterator(
+ self,
+ iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ show_progress: bool = True,
+ special_tokens: List[Union[str, AddedToken]] = [],
+ length: Optional[int] = None,
+ ):
+ """Train the model using the given iterator"""
+
+ trainer = trainers.BpeTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ show_progress=show_progress,
+ special_tokens=special_tokens,
+ initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+ )
+ self._tokenizer.train_from_iterator(
+ iterator,
+ trainer=trainer,
+ length=length,
+ )
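+
+
+# A minimal sketch of `train_from_iterator`: any iterator over strings (or batches of strings)
+# works, and `length` is only used for progress reporting. The tiny in-memory corpus below
+# stands in for a real dataset.
+if __name__ == "__main__":
+    corpus = ["low lower lowest", "new newer newest", "byte level bpe"]
+    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
+    tokenizer.train_from_iterator(iter(corpus), vocab_size=500, length=len(corpus))
+    print(tokenizer.encode("lower newer").tokens)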
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py
new file mode 100644
index 00000000..29ca5977
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py
@@ -0,0 +1,150 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
+from ..models import BPE
+from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
+from .base_tokenizer import BaseTokenizer
+
+
+class CharBPETokenizer(BaseTokenizer):
+ """Original BPE Tokenizer
+
+ Represents the BPE algorithm, as introduced by Rico Sennrich
+ (https://arxiv.org/abs/1508.07909)
+
+ The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
+ Sennrich subword-nmt implementation by the following options, which you can deactivate:
+ - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
+ * removing any control characters and replacing all whitespaces with the classic one.
+ * handling Chinese chars by putting spaces around them.
+ * stripping all accents.
+ - splitting on punctuation in addition to whitespaces (deactivate it with
+ `split_on_whitespace_only=True`)
+ """
+
+ def __init__(
+ self,
+ vocab: Optional[Union[str, Dict[str, int]]] = None,
+ merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+ unk_token: Union[str, AddedToken] = "<unk>",
+ suffix: str = "</w>",
+ dropout: Optional[float] = None,
+ lowercase: bool = False,
+ unicode_normalizer: Optional[str] = None,
+ bert_normalizer: bool = True,
+ split_on_whitespace_only: bool = False,
+ ):
+ if vocab is not None and merges is not None:
+ tokenizer = Tokenizer(
+ BPE(
+ vocab,
+ merges,
+ dropout=dropout,
+ unk_token=str(unk_token),
+ end_of_word_suffix=suffix,
+ )
+ )
+ else:
+ tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))
+
+ if tokenizer.token_to_id(str(unk_token)) is not None:
+ tokenizer.add_special_tokens([str(unk_token)])
+
+ # Check for Unicode normalization first (before everything else)
+ normalizers = []
+
+ if unicode_normalizer:
+ normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
+
+ if bert_normalizer:
+ normalizers += [BertNormalizer(lowercase=False)]
+
+ if lowercase:
+ normalizers += [Lowercase()]
+
+ # Create the normalizer structure
+ if len(normalizers) > 0:
+ if len(normalizers) > 1:
+ tokenizer.normalizer = Sequence(normalizers)
+ else:
+ tokenizer.normalizer = normalizers[0]
+
+ if split_on_whitespace_only:
+ tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
+ else:
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+
+ tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
+
+ parameters = {
+ "model": "BPE",
+ "unk_token": unk_token,
+ "suffix": suffix,
+ "dropout": dropout,
+ "lowercase": lowercase,
+ "unicode_normalizer": unicode_normalizer,
+ "bert_normalizer": bert_normalizer,
+ "split_on_whitespace_only": split_on_whitespace_only,
+ }
+
+ super().__init__(tokenizer, parameters)
+
+ @staticmethod
+ def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+ vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+ return CharBPETokenizer(vocab, merges, **kwargs)
+
+ def train(
+ self,
+ files: Union[str, List[str]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+ limit_alphabet: int = 1000,
+ initial_alphabet: List[str] = [],
+ suffix: Optional[str] = "</w>",
+ show_progress: bool = True,
+ ):
+ """Train the model using the given files"""
+
+ trainer = trainers.BpeTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ special_tokens=special_tokens,
+ limit_alphabet=limit_alphabet,
+ initial_alphabet=initial_alphabet,
+ end_of_word_suffix=suffix,
+ show_progress=show_progress,
+ )
+ if isinstance(files, str):
+ files = [files]
+ self._tokenizer.train(files, trainer=trainer)
+
+ def train_from_iterator(
+ self,
+ iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+ limit_alphabet: int = 1000,
+ initial_alphabet: List[str] = [],
+ suffix: Optional[str] = "</w>",
+ show_progress: bool = True,
+ length: Optional[int] = None,
+ ):
+ """Train the model using the given iterator"""
+
+ trainer = trainers.BpeTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ special_tokens=special_tokens,
+ limit_alphabet=limit_alphabet,
+ initial_alphabet=initial_alphabet,
+ end_of_word_suffix=suffix,
+ show_progress=show_progress,
+ )
+ self._tokenizer.train_from_iterator(
+ iterator,
+ trainer=trainer,
+ length=length,
+ )
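+
+
+# A minimal round-trip sketch: train on a tiny in-memory corpus, then decode the ids produced
+# by `encode`. The "</w>" end-of-word suffix is handled by the BPEDecoder configured above, so
+# the decoded text should roughly match the (lowercased) input.
+if __name__ == "__main__":
+    corpus = ["the quick brown fox", "jumps over the lazy dog"]
+    tokenizer = CharBPETokenizer(lowercase=True)
+    tokenizer.train_from_iterator(iter(corpus), vocab_size=200)
+    ids = tokenizer.encode("the lazy fox").ids
+    print(tokenizer.decode(ids))  # expected, roughly: "the lazy fox"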
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py
new file mode 100644
index 00000000..cd550b41
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py
@@ -0,0 +1,103 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC
+
+from .base_tokenizer import BaseTokenizer
+
+
+class SentencePieceBPETokenizer(BaseTokenizer):
+ """SentencePiece BPE Tokenizer
+
+ Represents the BPE algorithm, with the pretokenization used by SentencePiece
+ """
+
+ def __init__(
+ self,
+ vocab: Optional[Union[str, Dict[str, int]]] = None,
+ merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+ unk_token: Union[str, AddedToken] = "<unk>",
+ replacement: str = "▁",
+ add_prefix_space: bool = True,
+ dropout: Optional[float] = None,
+ fuse_unk: Optional[bool] = False,
+ ):
+ if vocab is not None and merges is not None:
+ tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
+ else:
+ tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
+
+ if tokenizer.token_to_id(str(unk_token)) is not None:
+ tokenizer.add_special_tokens([str(unk_token)])
+
+ tokenizer.normalizer = NFKC()
+ prepend_scheme = "always" if add_prefix_space else "never"
+ tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+ tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+ parameters = {
+ "model": "SentencePieceBPE",
+ "unk_token": unk_token,
+ "replacement": replacement,
+ "add_prefix_space": add_prefix_space,
+ "dropout": dropout,
+ }
+
+ super().__init__(tokenizer, parameters)
+
+ @staticmethod
+ def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+ vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+ return SentencePieceBPETokenizer(vocab, merges, **kwargs)
+
+ def train(
+ self,
+ files: Union[str, List[str]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+ limit_alphabet: int = 1000,
+ initial_alphabet: List[str] = [],
+ show_progress: bool = True,
+ ):
+ """Train the model using the given files"""
+
+ trainer = trainers.BpeTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ special_tokens=special_tokens,
+ limit_alphabet=limit_alphabet,
+ initial_alphabet=initial_alphabet,
+ show_progress=show_progress,
+ )
+ if isinstance(files, str):
+ files = [files]
+ self._tokenizer.train(files, trainer=trainer)
+
+ def train_from_iterator(
+ self,
+ iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+ limit_alphabet: int = 1000,
+ initial_alphabet: List[str] = [],
+ show_progress: bool = True,
+ length: Optional[int] = None,
+ ):
+ """Train the model using the given iterator"""
+
+ trainer = trainers.BpeTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ special_tokens=special_tokens,
+ limit_alphabet=limit_alphabet,
+ initial_alphabet=initial_alphabet,
+ show_progress=show_progress,
+ )
+ self._tokenizer.train_from_iterator(
+ iterator,
+ trainer=trainer,
+ length=length,
+ )
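+
+
+# A minimal loading sketch, assuming a previously saved vocabulary and merges at the
+# placeholder paths "vocab.json" and "merges.txt" (e.g. produced by `save_model`). The
+# Metaspace pre-tokenizer replaces spaces with the "▁" replacement character before the
+# BPE model runs.
+if __name__ == "__main__":
+    tokenizer = SentencePieceBPETokenizer.from_file("vocab.json", "merges.txt")
+    print(tokenizer.encode("Hello world").tokens)  # e.g. ['▁Hello', '▁world'] for a suitable vocab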
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py
new file mode 100644
index 00000000..1237e85e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py
@@ -0,0 +1,196 @@
+import json
+import os
+from typing import Iterator, List, Optional, Union, Tuple
+
+from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
+from tokenizers.models import Unigram
+
+from .base_tokenizer import BaseTokenizer
+
+
+class SentencePieceUnigramTokenizer(BaseTokenizer):
+ """SentencePiece Unigram Tokenizer
+
+ Represents the Unigram algorithm, with the pretokenization used by SentencePiece
+ """
+
+ def __init__(
+ self,
+ vocab: Optional[List[Tuple[str, float]]] = None,
+ replacement: str = "▁",
+ add_prefix_space: bool = True,
+ ):
+ if vocab is not None:
+ # Let Unigram(..) fail if only one of them is None
+ tokenizer = Tokenizer(Unigram(vocab))
+ else:
+ tokenizer = Tokenizer(Unigram())
+
+ tokenizer.normalizer = normalizers.Sequence(
+ [normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
+ )
+ prepend_scheme = "always" if add_prefix_space else "never"
+ tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+ tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+ parameters = {
+ "model": "SentencePieceUnigram",
+ "replacement": replacement,
+ "add_prefix_space": add_prefix_space,
+ }
+
+ super().__init__(tokenizer, parameters)
+
+ def train(
+ self,
+ files: Union[str, List[str]],
+ vocab_size: int = 8000,
+ show_progress: bool = True,
+ special_tokens: Optional[List[Union[str, AddedToken]]] = None,
+ initial_alphabet: Optional[List[str]] = None,
+ unk_token: Optional[str] = None,
+ ):
+ """
+ Train the model using the given files
+
+ Args:
+ files (:obj:`List[str]`):
+ A list of path to the files that we should use for training
+ vocab_size (:obj:`int`):
+ The size of the final vocabulary, including all tokens and alphabet.
+ show_progress (:obj:`bool`):
+ Whether to show progress bars while training.
+ special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+ A list of special tokens the model should know of.
+ initial_alphabet (:obj:`List[str]`, `optional`):
+ A list of characters to include in the initial alphabet, even
+ if not seen in the training dataset.
+ If the strings contain more than one character, only the first one
+ is kept.
+ unk_token (:obj:`str`, `optional`):
+ The unknown token to be used by the model.
+ """
+
+ if special_tokens is None:
+ special_tokens = []
+
+ if initial_alphabet is None:
+ initial_alphabet = []
+
+ trainer = trainers.UnigramTrainer(
+ vocab_size=vocab_size,
+ special_tokens=special_tokens,
+ show_progress=show_progress,
+ initial_alphabet=initial_alphabet,
+ unk_token=unk_token,
+ )
+
+ if isinstance(files, str):
+ files = [files]
+ self._tokenizer.train(files, trainer=trainer)
+
+ def train_from_iterator(
+ self,
+ iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+ vocab_size: int = 8000,
+ show_progress: bool = True,
+ special_tokens: Optional[List[Union[str, AddedToken]]] = None,
+ initial_alphabet: Optional[List[str]] = None,
+ unk_token: Optional[str] = None,
+ length: Optional[int] = None,
+ ):
+ """
+ Train the model using the given iterator
+
+ Args:
+ iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
+ Any iterator over strings or list of strings
+ vocab_size (:obj:`int`):
+ The size of the final vocabulary, including all tokens and alphabet.
+ show_progress (:obj:`bool`):
+ Whether to show progress bars while training.
+ special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+ A list of special tokens the model should know of.
+ initial_alphabet (:obj:`List[str]`, `optional`):
+ A list of characters to include in the initial alphabet, even
+ if not seen in the training dataset.
+ If the strings contain more than one character, only the first one
+ is kept.
+ unk_token (:obj:`str`, `optional`):
+ The unknown token to be used by the model.
+ length (:obj:`int`, `optional`):
+ The total number of sequences in the iterator. This is used to
+ provide meaningful progress tracking
+ """
+
+ if special_tokens is None:
+ special_tokens = []
+
+ if initial_alphabet is None:
+ initial_alphabet = []
+
+ trainer = trainers.UnigramTrainer(
+ vocab_size=vocab_size,
+ special_tokens=special_tokens,
+ show_progress=show_progress,
+ initial_alphabet=initial_alphabet,
+ unk_token=unk_token,
+ )
+
+ self._tokenizer.train_from_iterator(
+ iterator,
+ trainer=trainer,
+ length=length,
+ )
+
+ @staticmethod
+ def from_spm(filename: str):
+ try:
+ import sys
+
+ sys.path.append(".")
+
+ import sentencepiece_model_pb2 as model
+ except Exception:
+ raise Exception(
+ "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
+ )
+
+ m = model.ModelProto()
+ m.ParseFromString(open(filename, "rb").read())
+
+ precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
+ vocab = [(piece.piece, piece.score) for piece in m.pieces]
+ unk_id = m.trainer_spec.unk_id
+ model_type = m.trainer_spec.model_type
+ byte_fallback = m.trainer_spec.byte_fallback
+ if model_type != 1:
+ raise Exception(
+ "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
+ )
+
+ replacement = "▁"
+ add_prefix_space = True
+
+ tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
+
+ if precompiled_charsmap:
+ tokenizer.normalizer = normalizers.Sequence(
+ [
+ normalizers.Precompiled(precompiled_charsmap),
+ normalizers.Replace(Regex(" {2,}"), " "),
+ ]
+ )
+ else:
+ tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
+ prepend_scheme = "always" if add_prefix_space else "never"
+ tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+ tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+ parameters = {
+ "model": "SentencePieceUnigram",
+ }
+
+ obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
+ BaseTokenizer.__init__(obj, tokenizer, parameters)
+ return obj
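+
+
+# A minimal sketch for `from_spm`, assuming an existing SentencePiece Unigram model at the
+# placeholder path "spm.model" and the `sentencepiece_model_pb2` module available on
+# sys.path, as described in the error message above.
+if __name__ == "__main__":
+    tokenizer = SentencePieceUnigramTokenizer.from_spm("spm.model")
+    print(tokenizer.encode("This is a test").tokens)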
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py
new file mode 100644
index 00000000..68ac211a
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py
@@ -0,0 +1,8 @@
+# Generated content DO NOT EDIT
+from .. import models
+
+Model = models.Model
+BPE = models.BPE
+Unigram = models.Unigram
+WordLevel = models.WordLevel
+WordPiece = models.WordPiece
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi
new file mode 100644
index 00000000..955b9a16
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi
@@ -0,0 +1,591 @@
+# Generated content DO NOT EDIT
+class Model:
+ """
+ Base class for all models
+
+ The model represents the actual tokenization algorithm. This is the part that
+ will contain and manage the learned vocabulary.
+
+ This class cannot be constructed directly. Please use one of the concrete models.
+ """
+ def get_trainer(self):
+ """
+ Get the associated :class:`~tokenizers.trainers.Trainer`
+
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+ :class:`~tokenizers.models.Model`.
+
+ Returns:
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+ """
+ pass
+
+ def id_to_token(self, id):
+ """
+ Get the token associated to an ID
+
+ Args:
+ id (:obj:`int`):
+ An ID to convert to a token
+
+ Returns:
+ :obj:`str`: The token associated to the ID
+ """
+ pass
+
+ def save(self, folder, prefix):
+ """
+ Save the current model
+
+ Save the current model in the given folder, using the given prefix for the various
+ files that will get created.
+ Any file with the same name that already exists in this folder will be overwritten.
+
+ Args:
+ folder (:obj:`str`):
+ The path to the target folder in which to save the various files
+
+ prefix (:obj:`str`, `optional`):
+ An optional prefix, used to prefix each file name
+
+ Returns:
+ :obj:`List[str]`: The list of saved files
+ """
+ pass
+
+ def token_to_id(self, token):
+ """
+ Get the ID associated to a token
+
+ Args:
+ token (:obj:`str`):
+ A token to convert to an ID
+
+ Returns:
+ :obj:`int`: The ID associated to the token
+ """
+ pass
+
+ def tokenize(self, sequence):
+ """
+ Tokenize a sequence
+
+ Args:
+ sequence (:obj:`str`):
+ A sequence to tokenize
+
+ Returns:
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+ """
+ pass
+
+class BPE(Model):
+ """
+ An implementation of the BPE (Byte-Pair Encoding) algorithm
+
+ Args:
+ vocab (:obj:`Dict[str, int]`, `optional`):
+ A dictionary of string keys and their ids :obj:`{"am": 0,...}`
+
+ merges (:obj:`List[Tuple[str, str]]`, `optional`):
+ A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
+
+ cache_capacity (:obj:`int`, `optional`):
+ The number of words that the BPE cache can contain. The cache speeds up the
+ process by keeping the result of the merge operations for a number of words.
+
+ dropout (:obj:`float`, `optional`):
+ A float between 0 and 1 that represents the BPE dropout to use.
+
+ unk_token (:obj:`str`, `optional`):
+ The unknown token to be used by the model.
+
+ continuing_subword_prefix (:obj:`str`, `optional`):
+ The prefix to attach to subword units that don't represent a beginning of word.
+
+ end_of_word_suffix (:obj:`str`, `optional`):
+ The suffix to attach to subword units that represent an end of word.
+
+ fuse_unk (:obj:`bool`, `optional`):
+ Whether to fuse any subsequent unknown tokens into a single one
+
+ byte_fallback (:obj:`bool`, `optional`):
+ Whether to use spm byte-fallback trick (defaults to False)
+
+ ignore_merges (:obj:`bool`, `optional`):
+ Whether or not to match tokens with the vocab before using merges.
+ """
+ def __init__(
+ self,
+ vocab=None,
+ merges=None,
+ cache_capacity=None,
+ dropout=None,
+ unk_token=None,
+ continuing_subword_prefix=None,
+ end_of_word_suffix=None,
+ fuse_unk=None,
+ byte_fallback=False,
+ ignore_merges=False,
+ ):
+ pass
+
+ @staticmethod
+ def from_file(vocab, merges, **kwargs):
+ """
+ Instantiate a BPE model from the given files.
+
+ This method is roughly equivalent to doing::
+
+ vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+ bpe = BPE(vocab, merges)
+
+ If you don't need to keep the :obj:`vocab, merges` values lying around,
+ this method is more optimized than manually calling
+ :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
+
+ Args:
+ vocab (:obj:`str`):
+ The path to a :obj:`vocab.json` file
+
+ merges (:obj:`str`):
+ The path to a :obj:`merges.txt` file
+
+ Returns:
+ :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
+ """
+ pass
+
+ def get_trainer(self):
+ """
+ Get the associated :class:`~tokenizers.trainers.Trainer`
+
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+ :class:`~tokenizers.models.Model`.
+
+ Returns:
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+ """
+ pass
+
+ def id_to_token(self, id):
+ """
+ Get the token associated to an ID
+
+ Args:
+ id (:obj:`int`):
+ An ID to convert to a token
+
+ Returns:
+ :obj:`str`: The token associated to the ID
+ """
+ pass
+
+ @staticmethod
+ def read_file(vocab, merges):
+ """
+ Read a :obj:`vocab.json` and a :obj:`merges.txt` files
+
+ This method provides a way to read and parse the content of these files,
+ returning the relevant data structures. If you want to instantiate some BPE models
+ from memory, this method gives you the expected input from the standard files.
+
+ Args:
+ vocab (:obj:`str`):
+ The path to a :obj:`vocab.json` file
+
+ merges (:obj:`str`):
+ The path to a :obj:`merges.txt` file
+
+ Returns:
+ A :obj:`Tuple` with the vocab and the merges:
+ The vocabulary and merges loaded into memory
+ """
+ pass
+
+ def save(self, folder, prefix):
+ """
+ Save the current model
+
+ Save the current model in the given folder, using the given prefix for the various
+ files that will get created.
+ Any file with the same name that already exists in this folder will be overwritten.
+
+ Args:
+ folder (:obj:`str`):
+ The path to the target folder in which to save the various files
+
+ prefix (:obj:`str`, `optional`):
+ An optional prefix, used to prefix each file name
+
+ Returns:
+ :obj:`List[str]`: The list of saved files
+ """
+ pass
+
+ def token_to_id(self, token):
+ """
+ Get the ID associated to a token
+
+ Args:
+ token (:obj:`str`):
+ A token to convert to an ID
+
+ Returns:
+ :obj:`int`: The ID associated to the token
+ """
+ pass
+
+ def tokenize(self, sequence):
+ """
+ Tokenize a sequence
+
+ Args:
+ sequence (:obj:`str`):
+ A sequence to tokenize
+
+ Returns:
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+ """
+ pass
+
+class Unigram(Model):
+ """
+ An implementation of the Unigram algorithm
+
+ Args:
+ vocab (:obj:`List[Tuple[str, float]]`, `optional`):
+ A list of vocabulary items and their relative score [("am", -0.2442),...]
+ """
+ def __init__(self, vocab, unk_id, byte_fallback):
+ pass
+
+ def get_trainer(self):
+ """
+ Get the associated :class:`~tokenizers.trainers.Trainer`
+
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+ :class:`~tokenizers.models.Model`.
+
+ Returns:
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+ """
+ pass
+
+ def id_to_token(self, id):
+ """
+ Get the token associated to an ID
+
+ Args:
+ id (:obj:`int`):
+ An ID to convert to a token
+
+ Returns:
+ :obj:`str`: The token associated to the ID
+ """
+ pass
+
+ def save(self, folder, prefix):
+ """
+ Save the current model
+
+ Save the current model in the given folder, using the given prefix for the various
+ files that will get created.
+ Any file with the same name that already exists in this folder will be overwritten.
+
+ Args:
+ folder (:obj:`str`):
+ The path to the target folder in which to save the various files
+
+ prefix (:obj:`str`, `optional`):
+ An optional prefix, used to prefix each file name
+
+ Returns:
+ :obj:`List[str]`: The list of saved files
+ """
+ pass
+
+ def token_to_id(self, token):
+ """
+ Get the ID associated to a token
+
+ Args:
+ token (:obj:`str`):
+ A token to convert to an ID
+
+ Returns:
+ :obj:`int`: The ID associated to the token
+ """
+ pass
+
+ def tokenize(self, sequence):
+ """
+ Tokenize a sequence
+
+ Args:
+ sequence (:obj:`str`):
+ A sequence to tokenize
+
+ Returns:
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+ """
+ pass
+
+class WordLevel(Model):
+ """
+ An implementation of the WordLevel algorithm
+
+ The simplest tokenizer model, based on mapping tokens to their corresponding id.
+
+ Args:
+ vocab (:obj:`Dict[str, int]`, `optional`):
+ A dictionary of string keys and their ids :obj:`{"am": 0,...}`
+
+ unk_token (:obj:`str`, `optional`):
+ The unknown token to be used by the model.
+ """
+ def __init__(self, vocab, unk_token):
+ pass
+
+ @staticmethod
+ def from_file(vocab, unk_token):
+ """
+ Instantiate a WordLevel model from the given file
+
+ This method is roughly equivalent to doing::
+
+ vocab = WordLevel.read_file(vocab_filename)
+ wordlevel = WordLevel(vocab)
+
+ If you don't need to keep the :obj:`vocab` values lying around, this method is
+ more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
+ initialize a :class:`~tokenizers.models.WordLevel`
+
+ Args:
+ vocab (:obj:`str`):
+ The path to a :obj:`vocab.json` file
+
+ Returns:
+ :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
+ """
+ pass
+
+ def get_trainer(self):
+ """
+ Get the associated :class:`~tokenizers.trainers.Trainer`
+
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+ :class:`~tokenizers.models.Model`.
+
+ Returns:
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+ """
+ pass
+
+ def id_to_token(self, id):
+ """
+ Get the token associated to an ID
+
+ Args:
+ id (:obj:`int`):
+ An ID to convert to a token
+
+ Returns:
+ :obj:`str`: The token associated to the ID
+ """
+ pass
+
+ @staticmethod
+ def read_file(vocab):
+ """
+ Read a :obj:`vocab.json`
+
+ This method provides a way to read and parse the content of a vocabulary file,
+ returning the relevant data structures. If you want to instantiate some WordLevel models
+ from memory, this method gives you the expected input from the standard files.
+
+ Args:
+ vocab (:obj:`str`):
+ The path to a :obj:`vocab.json` file
+
+ Returns:
+ :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
+ """
+ pass
+
+ def save(self, folder, prefix):
+ """
+ Save the current model
+
+ Save the current model in the given folder, using the given prefix for the various
+ files that will get created.
+ Any file with the same name that already exists in this folder will be overwritten.
+
+ Args:
+ folder (:obj:`str`):
+ The path to the target folder in which to save the various files
+
+ prefix (:obj:`str`, `optional`):
+ An optional prefix, used to prefix each file name
+
+ Returns:
+ :obj:`List[str]`: The list of saved files
+ """
+ pass
+
+ def token_to_id(self, token):
+ """
+ Get the ID associated to a token
+
+ Args:
+ token (:obj:`str`):
+ A token to convert to an ID
+
+ Returns:
+ :obj:`int`: The ID associated to the token
+ """
+ pass
+
+ def tokenize(self, sequence):
+ """
+ Tokenize a sequence
+
+ Args:
+ sequence (:obj:`str`):
+ A sequence to tokenize
+
+ Returns:
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+ """
+ pass
+
+class WordPiece(Model):
+ """
+ An implementation of the WordPiece algorithm
+
+ Args:
+ vocab (:obj:`Dict[str, int]`, `optional`):
+ A dictionary of string keys and their ids :obj:`{"am": 0,...}`
+
+ unk_token (:obj:`str`, `optional`):
+ The unknown token to be used by the model.
+
+ max_input_chars_per_word (:obj:`int`, `optional`):
+ The maximum number of characters to authorize in a single word.
+ """
+ def __init__(self, vocab, unk_token, max_input_chars_per_word):
+ pass
+
+ @staticmethod
+ def from_file(vocab, **kwargs):
+ """
+ Instantiate a WordPiece model from the given file
+
+ This method is roughly equivalent to doing::
+
+ vocab = WordPiece.read_file(vocab_filename)
+ wordpiece = WordPiece(vocab)
+
+ If you don't need to keep the :obj:`vocab` values lying around, this method is
+ more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
+ initialize a :class:`~tokenizers.models.WordPiece`
+
+ Args:
+ vocab (:obj:`str`):
+ The path to a :obj:`vocab.txt` file
+
+ Returns:
+ :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
+ """
+ pass
+
+ def get_trainer(self):
+ """
+ Get the associated :class:`~tokenizers.trainers.Trainer`
+
+ Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+ :class:`~tokenizers.models.Model`.
+
+ Returns:
+ :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+ """
+ pass
+
+ def id_to_token(self, id):
+ """
+ Get the token associated to an ID
+
+ Args:
+ id (:obj:`int`):
+ An ID to convert to a token
+
+ Returns:
+ :obj:`str`: The token associated to the ID
+ """
+ pass
+
+ @staticmethod
+ def read_file(vocab):
+ """
+ Read a :obj:`vocab.txt` file
+
+ This method provides a way to read and parse the content of a standard `vocab.txt`
+ file as used by the WordPiece Model, returning the relevant data structures. If you
+ want to instantiate some WordPiece models from memory, this method gives you the
+ expected input from the standard files.
+
+ Args:
+ vocab (:obj:`str`):
+ The path to a :obj:`vocab.txt` file
+
+ Returns:
+ :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
+ """
+ pass
+
+ def save(self, folder, prefix):
+ """
+ Save the current model
+
+ Save the current model in the given folder, using the given prefix for the various
+ files that will get created.
+ Any file with the same name that already exists in this folder will be overwritten.
+
+ Args:
+ folder (:obj:`str`):
+ The path to the target folder in which to save the various files
+
+ prefix (:obj:`str`, `optional`):
+ An optional prefix, used to prefix each file name
+
+ Returns:
+ :obj:`List[str]`: The list of saved files
+ """
+ pass
+
+ def token_to_id(self, token):
+ """
+ Get the ID associated to a token
+
+ Args:
+ token (:obj:`str`):
+ A token to convert to an ID
+
+ Returns:
+ :obj:`int`: The ID associated to the token
+ """
+ pass
+
+ def tokenize(self, sequence):
+ """
+ Tokenize a sequence
+
+ Args:
+ sequence (:obj:`str`):
+ A sequence to tokenize
+
+ Returns:
+ A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
+ """
+ pass
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py
new file mode 100644
index 00000000..15a16f1e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py
@@ -0,0 +1,29 @@
+from .. import normalizers
+
+
+Normalizer = normalizers.Normalizer
+BertNormalizer = normalizers.BertNormalizer
+NFD = normalizers.NFD
+NFKD = normalizers.NFKD
+NFC = normalizers.NFC
+NFKC = normalizers.NFKC
+Sequence = normalizers.Sequence
+Lowercase = normalizers.Lowercase
+Prepend = normalizers.Prepend
+Strip = normalizers.Strip
+StripAccents = normalizers.StripAccents
+Nmt = normalizers.Nmt
+Precompiled = normalizers.Precompiled
+Replace = normalizers.Replace
+
+
+NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
+
+
+def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
+ if normalizer not in NORMALIZERS:
+ raise ValueError(
+ "{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
+ )
+
+ return NORMALIZERS[normalizer]()
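+
+
+# A small sketch of `unicode_normalizer_from_str`: look up a Unicode normalizer by name and
+# apply it to a raw string with `normalize_str`.
+if __name__ == "__main__":
+    nfkc = unicode_normalizer_from_str("nfkc")
+    print(nfkc.normalize_str("ﬁle"))  # the "ﬁ" ligature is decomposed, giving "file"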
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi
new file mode 100644
index 00000000..507d4473
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi
@@ -0,0 +1,595 @@
+# Generated content DO NOT EDIT
+class Normalizer:
+ """
+ Base class for all normalizers
+
+ This class is not supposed to be instantiated directly. Instead, any implementation of a
+ Normalizer will return an instance of this class when instantiated.
+ """
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class BertNormalizer(Normalizer):
+ """
+ BertNormalizer
+
+ Takes care of normalizing raw text before giving it to a Bert model.
+ This includes cleaning the text, handling accents and Chinese characters, and lowercasing
+
+ Args:
+ clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to clean the text, by removing any control characters
+ and replacing all whitespaces by the classic one.
+
+ handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to handle chinese chars by putting spaces around them.
+
+ strip_accents (:obj:`bool`, `optional`):
+ Whether to strip all accents. If this option is not specified (i.e. is None),
+ then it will be determined by the value for `lowercase` (as in the original Bert).
+
+ lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to lowercase.
+ """
+ def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Lowercase(Normalizer):
+ """
+ Lowercase Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class NFC(Normalizer):
+ """
+ NFC Unicode Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class NFD(Normalizer):
+ """
+ NFD Unicode Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class NFKC(Normalizer):
+ """
+ NFKC Unicode Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class NFKD(Normalizer):
+ """
+ NFKD Unicode Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Nmt(Normalizer):
+ """
+ Nmt normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Precompiled(Normalizer):
+ """
+ Precompiled normalizer
+ Don't use this manually; it is used for compatibility with SentencePiece.
+ """
+ def __init__(self, precompiled_charsmap):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Prepend(Normalizer):
+ """
+ Prepend normalizer
+ """
+ def __init__(self, prepend):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Replace(Normalizer):
+ """
+ Replace normalizer
+ """
+ def __init__(self, pattern, content):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Sequence(Normalizer):
+ """
+ Allows concatenating multiple other Normalizers as a Sequence.
+ All the normalizers run in sequence in the given order
+
+ Args:
+ normalizers (:obj:`List[Normalizer]`):
+ A list of Normalizer to be run as a sequence
+ """
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Strip(Normalizer):
+ """
+ Strip normalizer
+ """
+ def __init__(self, left=True, right=True):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class StripAccents(Normalizer):
+ """
+ StripAccents normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
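For a quick sanity check of the normalizers declared above, they can be composed and previewed directly. A minimal sketch, assuming the compiled `tokenizers` package is importable; `NFD` and `Lowercase` are other members of `tokenizers.normalizers` that do not appear in this hunk:

    from tokenizers.normalizers import NFD, Lowercase, Sequence, Strip, StripAccents

    # The normalizers run in the given order: decompose, drop accents, lowercase, trim.
    normalizer = Sequence([NFD(), StripAccents(), Lowercase(), Strip()])

    # normalize_str() is the alignment-free preview described in the docstrings above.
    print(normalizer.normalize_str("  Héllo There  "))  # expected: "hello there"

`Strip(left=True, right=True)` trims both ends by default; pass `left=False` or `right=False` to trim only one side.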
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py
new file mode 100644
index 00000000..48277f0d
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py
@@ -0,0 +1,15 @@
+# Generated content DO NOT EDIT
+from .. import pre_tokenizers
+
+PreTokenizer = pre_tokenizers.PreTokenizer
+BertPreTokenizer = pre_tokenizers.BertPreTokenizer
+ByteLevel = pre_tokenizers.ByteLevel
+CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
+Digits = pre_tokenizers.Digits
+Metaspace = pre_tokenizers.Metaspace
+Punctuation = pre_tokenizers.Punctuation
+Sequence = pre_tokenizers.Sequence
+Split = pre_tokenizers.Split
+UnicodeScripts = pre_tokenizers.UnicodeScripts
+Whitespace = pre_tokenizers.Whitespace
+WhitespaceSplit = pre_tokenizers.WhitespaceSplit
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi
new file mode 100644
index 00000000..d81d3802
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi
@@ -0,0 +1,607 @@
+# Generated content DO NOT EDIT
+class PreTokenizer:
+ """
+ Base class for all pre-tokenizers
+
+ This class is not supposed to be instantiated directly. Instead, any implementation of a
+ PreTokenizer will return an instance of this class when instantiated.
+ """
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
+class BertPreTokenizer(PreTokenizer):
+ """
+ BertPreTokenizer
+
+ This pre-tokenizer splits tokens on spaces, and also on punctuation.
+    Each occurrence of a punctuation character will be treated separately.
+ """
+ def __init__(self):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
+class ByteLevel(PreTokenizer):
+ """
+ ByteLevel PreTokenizer
+
+ This pre-tokenizer takes care of replacing all bytes of the given string
+ with a corresponding representation, as well as splitting into words.
+
+ Args:
+ add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to add a space to the first word if there isn't already one. This
+ lets us treat `hello` exactly like `say hello`.
+ use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Set this to :obj:`False` to prevent this `pre_tokenizer` from using
+            the GPT-2 specific regexp for splitting on whitespace.
+ """
+ def __init__(self, add_prefix_space=True, use_regex=True):
+ pass
+
+ @staticmethod
+ def alphabet():
+ """
+ Returns the alphabet used by this PreTokenizer.
+
+ Since the ByteLevel works as its name suggests, at the byte level, it
+ encodes each byte value to a unique visible character. This means that there is a
+ total of 256 different characters composing this alphabet.
+
+ Returns:
+ :obj:`List[str]`: A list of characters that compose the alphabet
+ """
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
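To see what the ByteLevel pre-tokenizer actually produces, a small sketch (the printed tokens and offsets are indicative; the 'Ġ' symbol is the byte-level encoding of a space):

    from tokenizers.pre_tokenizers import ByteLevel

    pre = ByteLevel(add_prefix_space=True)
    # Words are split and every byte is mapped to a visible character.
    print(pre.pre_tokenize_str("Hello world"))
    # roughly: [('ĠHello', (0, 5)), ('Ġworld', (5, 11))]

    # The fixed alphabet has one visible character per byte value.
    print(len(ByteLevel.alphabet()))  # 256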
+class CharDelimiterSplit(PreTokenizer):
+ """
+ This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
+
+ Args:
+        delimiter (:obj:`str`):
+            The delimiter char that will be used to split the input
+ """
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
+class Digits(PreTokenizer):
+ """
+    This pre-tokenizer simply splits on digits, emitting them as separate tokens
+
+ Args:
+ individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ If set to True, digits will each be separated as follows::
+
+ "Call 123 please" -> "Call ", "1", "2", "3", " please"
+
+            If set to False, digits will be grouped as follows::
+
+ "Call 123 please" -> "Call ", "123", " please"
+ """
+ def __init__(self, individual_digits=False):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
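The `individual_digits` flag is easiest to understand by running both variants on the docstring's own example (a sketch; offsets omitted for readability):

    from tokenizers.pre_tokenizers import Digits

    text = "Call 123 please"
    grouped = Digits(individual_digits=False)
    separate = Digits(individual_digits=True)

    print([tok for tok, _ in grouped.pre_tokenize_str(text)])
    # roughly: ['Call ', '123', ' please']
    print([tok for tok, _ in separate.pre_tokenize_str(text)])
    # roughly: ['Call ', '1', '2', '3', ' please']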
+class Metaspace(PreTokenizer):
+ """
+ Metaspace pre-tokenizer
+
+ This pre-tokenizer replaces any whitespace by the provided replacement character.
+ It then tries to split on these spaces.
+
+ Args:
+ replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
+ The replacement character. Must be exactly one character. By default we
+ use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
+
+ prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
+            Whether to prepend a space to the first word if there isn't already one. This
+            lets us treat `hello` exactly like `say hello`.
+            Choices: "always", "never", "first". "first" means the space is only prepended to the
+            first token (relevant when special tokens or other pre-tokenizers are used).
+
+ """
+    def __init__(self, replacement="▁", prepend_scheme="always", split=True):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
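A sketch of Metaspace in isolation; the replacement symbol is the SentencePiece ▁ (U+2581) mentioned in the docstring, and the exact offsets are indicative:

    from tokenizers.pre_tokenizers import Metaspace

    pre = Metaspace(replacement="▁", prepend_scheme="always")
    # Whitespace is replaced by ▁ and the string is split on it.
    print(pre.pre_tokenize_str("Hey my friend"))
    # roughly: [('▁Hey', (0, 3)), ('▁my', (3, 6)), ('▁friend', (6, 13))]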
+class Punctuation(PreTokenizer):
+ """
+ This pre-tokenizer simply splits on punctuation as individual characters.
+
+ Args:
+ behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+ The behavior to use when splitting.
+ Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
+ "contiguous"
+ """
+ def __init__(self, behavior="isolated"):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
+class Sequence(PreTokenizer):
+ """
+ This pre-tokenizer composes other pre_tokenizers and applies them in sequence
+ """
+ def __init__(self, pretokenizers):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
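Composition mirrors the normalizers' Sequence; a sketch chaining two pre-tokenizers from this same module (`Whitespace` and `Digits` are declared further down in this file):

    from tokenizers import pre_tokenizers
    from tokenizers.pre_tokenizers import Digits, Whitespace

    pre = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
    # First split on \w+|[^\w\s]+, then isolate each digit.
    print([tok for tok, _ in pre.pre_tokenize_str("version 2.0")])
    # roughly: ['version', '2', '.', '0']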
+class Split(PreTokenizer):
+ """
+ Split PreTokenizer
+
+ This versatile pre-tokenizer splits using the provided pattern and
+ according to the provided behavior. The pattern can be inverted by
+ making use of the invert flag.
+
+ Args:
+ pattern (:obj:`str` or :class:`~tokenizers.Regex`):
+            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
+
+ behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+ The behavior to use when splitting.
+ Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+ "contiguous"
+
+ invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether to invert the pattern.
+ """
+ def __init__(self, pattern, behavior, invert=False):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
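A sketch of Split driven by a `tokenizers.Regex` pattern; both the pattern and the expected output are illustrative:

    from tokenizers import Regex
    from tokenizers.pre_tokenizers import Split

    # Split on runs of whitespace and drop the separators from the output.
    pre = Split(pattern=Regex(r"\s+"), behavior="removed")
    print([tok for tok, _ in pre.pre_tokenize_str("one  two\tthree")])
    # roughly: ['one', 'two', 'three']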
+class UnicodeScripts(PreTokenizer):
+ """
+    This pre-tokenizer splits on characters that belong to different language families.
+    It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
+    In practice, Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
+    This mimics the SentencePiece Unigram implementation.
+ """
+ def __init__(self):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
+class Whitespace(PreTokenizer):
+ """
+ This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
+ """
+ def __init__(self):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
+
+class WhitespaceSplit(PreTokenizer):
+ """
+    This pre-tokenizer simply splits on whitespace. Works like `.split()`
+ """
+ def __init__(self):
+ pass
+
+ def pre_tokenize(self, pretok):
+ """
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
+ keep track of the pre-tokenization, and leverage the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
+ the pre-tokenization of a raw string, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+
+ Args:
+            pretok (:class:`~tokenizers.PreTokenizedString`):
+ The pre-tokenized string on which to apply this
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
+ """
+ pass
+
+ def pre_tokenize_str(self, sequence):
+ """
+ Pre tokenize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
+ alignment, nor does it provide all the capabilities of the
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
+
+ Args:
+ sequence (:obj:`str`):
+                A string to pre-tokenize
+
+        Returns:
+            :obj:`List[Tuple[str, Offsets]]`:
+                A list of tuples with the pre-tokenized parts and their offsets
+ """
+ pass
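To make the difference between the last two classes concrete, a short comparison sketch (outputs indicative):

    from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit

    text = "Hello, world!"
    # Whitespace uses \w+|[^\w\s]+, so punctuation becomes its own token.
    print([tok for tok, _ in Whitespace().pre_tokenize_str(text)])
    # roughly: ['Hello', ',', 'world', '!']

    # WhitespaceSplit behaves like str.split(): punctuation stays attached.
    print([tok for tok, _ in WhitespaceSplit().pre_tokenize_str(text)])
    # roughly: ['Hello,', 'world!']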
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py
new file mode 100644
index 00000000..06d12403
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py
@@ -0,0 +1,9 @@
+# Generated content DO NOT EDIT
+from .. import processors
+
+PostProcessor = processors.PostProcessor
+BertProcessing = processors.BertProcessing
+ByteLevel = processors.ByteLevel
+RobertaProcessing = processors.RobertaProcessing
+Sequence = processors.Sequence
+TemplateProcessing = processors.TemplateProcessing
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi
new file mode 100644
index 00000000..5136d02b
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi
@@ -0,0 +1,342 @@
+# Generated content DO NOT EDIT
+class PostProcessor:
+ """
+ Base class for all post-processors
+
+ This class is not supposed to be instantiated directly. Instead, any implementation of
+ a PostProcessor will return an instance of this class when instantiated.
+ """
+ def num_special_tokens_to_add(self, is_pair):
+ """
+ Return the number of special tokens that would be added for single/pair sentences.
+
+ Args:
+ is_pair (:obj:`bool`):
+ Whether the input would be a pair of sequences
+
+ Returns:
+ :obj:`int`: The number of tokens to add
+ """
+ pass
+
+ def process(self, encoding, pair=None, add_special_tokens=True):
+ """
+ Post-process the given encodings, generating the final one
+
+ Args:
+ encoding (:class:`~tokenizers.Encoding`):
+ The encoding for the first sequence
+
+ pair (:class:`~tokenizers.Encoding`, `optional`):
+ The encoding for the pair sequence
+
+ add_special_tokens (:obj:`bool`):
+ Whether to add the special tokens
+
+ Return:
+ :class:`~tokenizers.Encoding`: The final encoding
+ """
+ pass
+
+class BertProcessing(PostProcessor):
+ """
+ This post-processor takes care of adding the special tokens needed by
+ a Bert model:
+
+ - a SEP token
+ - a CLS token
+
+ Args:
+ sep (:obj:`Tuple[str, int]`):
+ A tuple with the string representation of the SEP token, and its id
+
+ cls (:obj:`Tuple[str, int]`):
+ A tuple with the string representation of the CLS token, and its id
+ """
+ def __init__(self, sep, cls):
+ pass
+
+ def num_special_tokens_to_add(self, is_pair):
+ """
+ Return the number of special tokens that would be added for single/pair sentences.
+
+ Args:
+ is_pair (:obj:`bool`):
+ Whether the input would be a pair of sequences
+
+ Returns:
+ :obj:`int`: The number of tokens to add
+ """
+ pass
+
+ def process(self, encoding, pair=None, add_special_tokens=True):
+ """
+ Post-process the given encodings, generating the final one
+
+ Args:
+ encoding (:class:`~tokenizers.Encoding`):
+ The encoding for the first sequence
+
+ pair (:class:`~tokenizers.Encoding`, `optional`):
+ The encoding for the pair sequence
+
+ add_special_tokens (:obj:`bool`):
+ Whether to add the special tokens
+
+ Return:
+ :class:`~tokenizers.Encoding`: The final encoding
+ """
+ pass
+
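A sketch of BertProcessing on its own; the token ids 101/102 are the usual bert-base-uncased ids for [CLS]/[SEP] and are assumptions here, not values taken from this file:

    from tokenizers.processors import BertProcessing

    post = BertProcessing(sep=("[SEP]", 102), cls=("[CLS]", 101))

    # [CLS] ... [SEP] for a single sequence, [CLS] ... [SEP] ... [SEP] for a pair.
    print(post.num_special_tokens_to_add(False))  # 2
    print(post.num_special_tokens_to_add(True))   # 3

In a full pipeline this object is typically assigned to `tokenizer.post_processor` so that `Tokenizer.encode` applies it automatically.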
+class ByteLevel(PostProcessor):
+ """
+ This post-processor takes care of trimming the offsets.
+
+ By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
+ want the offsets to include these whitespaces, then this PostProcessor must be used.
+
+ Args:
+ trim_offsets (:obj:`bool`):
+ Whether to trim the whitespaces from the produced offsets.
+ """
+ def __init__(self, trim_offsets=True):
+ pass
+
+ def num_special_tokens_to_add(self, is_pair):
+ """
+ Return the number of special tokens that would be added for single/pair sentences.
+
+ Args:
+ is_pair (:obj:`bool`):
+ Whether the input would be a pair of sequences
+
+ Returns:
+ :obj:`int`: The number of tokens to add
+ """
+ pass
+
+ def process(self, encoding, pair=None, add_special_tokens=True):
+ """
+ Post-process the given encodings, generating the final one
+
+ Args:
+ encoding (:class:`~tokenizers.Encoding`):
+ The encoding for the first sequence
+
+ pair (:class:`~tokenizers.Encoding`, `optional`):
+ The encoding for the pair sequence
+
+ add_special_tokens (:obj:`bool`):
+ Whether to add the special tokens
+
+ Return:
+ :class:`~tokenizers.Encoding`: The final encoding
+ """
+ pass
+
+class RobertaProcessing(PostProcessor):
+ """
+ This post-processor takes care of adding the special tokens needed by
+ a Roberta model:
+
+ - a SEP token
+ - a CLS token
+
+ It also takes care of trimming the offsets.
+ By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
+ want the offsets to include these whitespaces, then this PostProcessor should be initialized
+ with :obj:`trim_offsets=True`
+
+ Args:
+ sep (:obj:`Tuple[str, int]`):
+ A tuple with the string representation of the SEP token, and its id
+
+ cls (:obj:`Tuple[str, int]`):
+ A tuple with the string representation of the CLS token, and its id
+
+ trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to trim the whitespaces from the produced offsets.
+
+ add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether the add_prefix_space option was enabled during pre-tokenization. This
+ is relevant because it defines the way the offsets are trimmed out.
+ """
+ def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
+ pass
+
+ def num_special_tokens_to_add(self, is_pair):
+ """
+ Return the number of special tokens that would be added for single/pair sentences.
+
+ Args:
+ is_pair (:obj:`bool`):
+ Whether the input would be a pair of sequences
+
+ Returns:
+ :obj:`int`: The number of tokens to add
+ """
+ pass
+
+ def process(self, encoding, pair=None, add_special_tokens=True):
+ """
+ Post-process the given encodings, generating the final one
+
+ Args:
+ encoding (:class:`~tokenizers.Encoding`):
+ The encoding for the first sequence
+
+ pair (:class:`~tokenizers.Encoding`, `optional`):
+ The encoding for the pair sequence
+
+ add_special_tokens (:obj:`bool`):
+ Whether to add the special tokens
+
+ Return:
+ :class:`~tokenizers.Encoding`: The final encoding
+ """
+ pass
+
+class Sequence(PostProcessor):
+ """
+ Sequence Processor
+
+ Args:
+        processors (:obj:`List[PostProcessor]`):
+ The processors that need to be chained
+ """
+ def __init__(self, processors):
+ pass
+
+ def num_special_tokens_to_add(self, is_pair):
+ """
+ Return the number of special tokens that would be added for single/pair sentences.
+
+ Args:
+ is_pair (:obj:`bool`):
+ Whether the input would be a pair of sequences
+
+ Returns:
+ :obj:`int`: The number of tokens to add
+ """
+ pass
+
+ def process(self, encoding, pair=None, add_special_tokens=True):
+ """
+ Post-process the given encodings, generating the final one
+
+ Args:
+ encoding (:class:`~tokenizers.Encoding`):
+ The encoding for the first sequence
+
+ pair (:class:`~tokenizers.Encoding`, `optional`):
+ The encoding for the pair sequence
+
+ add_special_tokens (:obj:`bool`):
+ Whether to add the special tokens
+
+ Return:
+ :class:`~tokenizers.Encoding`: The final encoding
+ """
+ pass
+
+class TemplateProcessing(PostProcessor):
+ """
+ Provides a way to specify templates in order to add the special tokens to each
+ input sequence as relevant.
+
+ Let's take :obj:`BERT` tokenizer as an example. It uses two special tokens, used to
+    delimit each sequence. :obj:`[CLS]` is always used at the beginning of the first
+ sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair
+ sequences. The final result looks like this:
+
+ - Single sequence: :obj:`[CLS] Hello there [SEP]`
+ - Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]`
+
+ With the type ids as following::
+
+ [CLS] ... [SEP] ... [SEP]
+ 0 0 0 1 1
+
+ You can achieve such behavior using a TemplateProcessing::
+
+ TemplateProcessing(
+ single="[CLS] $0 [SEP]",
+ pair="[CLS] $A [SEP] $B:1 [SEP]:1",
+ special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+ )
+
+ In this example, each input sequence is identified using a ``$`` construct. This identifier
+ lets us specify each input sequence, and the type_id to use. When nothing is specified,
+ it uses the default values. Here are the different ways to specify it:
+
+ - Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B``
+ - Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ...
+ - Specifying both: ``$A:0``, ``$B:1``, ...
+
+ The same construct is used for special tokens: ``<identifier>(:<type_id>)?``.
+
+ **Warning**: You must ensure that you are giving the correct tokens/ids as these
+ will be added to the Encoding without any further check. If the given ids correspond
+ to something totally different in a `Tokenizer` using this `PostProcessor`, it
+ might lead to unexpected results.
+
+ Args:
+ single (:obj:`Template`):
+ The template used for single sequences
+
+ pair (:obj:`Template`):
+ The template used when both sequences are specified
+
+ special_tokens (:obj:`Tokens`):
+            The list of special tokens used in each sequence
+
+ Types:
+
+ Template (:obj:`str` or :obj:`List`):
+ - If a :obj:`str` is provided, the whitespace is used as delimiter between tokens
+ - If a :obj:`List[str]` is provided, a list of tokens
+
+ Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`):
+ - A :obj:`Tuple` with both a token and its associated ID, in any order
+ - A :obj:`dict` with the following keys:
+ - "id": :obj:`str` => The special token id, as specified in the Template
+ - "ids": :obj:`List[int]` => The associated IDs
+ - "tokens": :obj:`List[str]` => The associated tokens
+
+ The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
+ the same length.
+ """
+ def __init__(self, single, pair, special_tokens):
+ pass
+
+ def num_special_tokens_to_add(self, is_pair):
+ """
+ Return the number of special tokens that would be added for single/pair sentences.
+
+ Args:
+ is_pair (:obj:`bool`):
+ Whether the input would be a pair of sequences
+
+ Returns:
+ :obj:`int`: The number of tokens to add
+ """
+ pass
+
+ def process(self, encoding, pair=None, add_special_tokens=True):
+ """
+ Post-process the given encodings, generating the final one
+
+ Args:
+ encoding (:class:`~tokenizers.Encoding`):
+ The encoding for the first sequence
+
+ pair (:class:`~tokenizers.Encoding`, `optional`):
+ The encoding for the pair sequence
+
+ add_special_tokens (:obj:`bool`):
+ Whether to add the special tokens
+
+ Return:
+ :class:`~tokenizers.Encoding`: The final encoding
+ """
+ pass
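The docstring's BERT-style template, written out as a runnable sketch (the ids 1 and 0 simply mirror the example above; a real tokenizer would use its own vocabulary ids):

    from tokenizers.processors import TemplateProcessing

    post = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )

    # Two special tokens for a single sequence, three for a pair.
    print(post.num_special_tokens_to_add(False))  # 2
    print(post.num_special_tokens_to_add(True))   # 3

As with the other post-processors, assigning it to `tokenizer.post_processor` makes `Tokenizer.encode` add the special tokens and type ids shown in the docstring.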
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so
new file mode 100755
index 00000000..563e2885
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so
Binary files differ
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py
new file mode 100644
index 00000000..f941e2ed
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py
@@ -0,0 +1 @@
+from .visualizer import Annotation, EncodingVisualizer
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css
new file mode 100644
index 00000000..f54fde45
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css
@@ -0,0 +1,170 @@
+.tokenized-text {
+ width:100%;
+ padding:2rem;
+ max-height: 400px;
+ overflow-y: auto;
+ box-sizing:border-box;
+ line-height:4rem; /* Lots of space between lines */
+ font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace;
+ box-shadow: 2px 2px 2px rgba(0,0,0,0.2);
+ background-color: rgba(0,0,0,0.01);
+ letter-spacing:2px; /* Give some extra separation between chars */
+}
+.non-token{
+ /* White space and other things the tokenizer ignores*/
+ white-space: pre;
+ letter-spacing:4px;
+    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more obvious */
+ border-bottom:1px solid #A0A0A0;
+ line-height: 1rem;
+ height: calc(100% - 2px);
+}
+
+.token {
+ white-space: pre;
+ position:relative;
+ color:black;
+ letter-spacing:2px;
+}
+
+.annotation{
+ white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */
+ border-radius:4px;
+ position:relative;
+ width:fit-content;
+}
+.annotation:before {
+ /*The before holds the text and the after holds the background*/
+ z-index:1000; /* Make sure this is above the background */
+ content:attr(data-label); /* The annotations label is on a data attribute */
+ color:white;
+ position:absolute;
+ font-size:1rem;
+ text-align:center;
+ font-weight:bold;
+
+ top:1.75rem;
+ line-height:0;
+ left:0;
+ width:100%;
+ padding:0.5rem 0;
+ /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/
+ overflow: hidden;
+ white-space: nowrap;
+ text-overflow:ellipsis;
+}
+
+.annotation:after {
+ content:attr(data-label); /* The content defines the width of the annotation*/
+ position:absolute;
+ font-size:0.75rem;
+ text-align:center;
+ font-weight:bold;
+ text-overflow:ellipsis;
+ top:1.75rem;
+ line-height:0;
+ overflow: hidden;
+ white-space: nowrap;
+
+ left:0;
+ width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
+
+ padding:0.5rem 0;
+    /* Nasty hack below:
+ We set the annotations color in code because we don't know the colors at css time.
+ But you can't pass a color as a data attribute to get it into the pseudo element (this thing)
+ So to get around that, annotations have the color set on them with a style attribute and then we
+ can get the color with currentColor.
+ Annotations wrap tokens and tokens set the color back to black
+ */
+ background-color: currentColor;
+}
+.annotation:hover::after, .annotation:hover::before{
+ /* When the user hovers over an annotation expand the label to display in full
+ */
+ min-width: fit-content;
+}
+
+.annotation:hover{
+ /* Emphasize the annotation start end with a border on hover*/
+ border-color: currentColor;
+ border: 2px solid;
+}
+.special-token:not(:empty){
+ /*
+    A non-empty special token is like UNK (as opposed to CLS, which has no representation in the text)
+ */
+ position:relative;
+}
+.special-token:empty::before{
+    /* Special tokens that don't have text are displayed as pseudo elements so we don't select them with the mouse*/
+ content:attr(data-stok);
+ background:#202020;
+ font-size:0.75rem;
+ color:white;
+ margin: 0 0.25rem;
+ padding: 0.25rem;
+ border-radius:4px
+}
+
+.special-token:not(:empty):before {
+ /* Special tokens that have text (UNK) are displayed above the actual text*/
+ content:attr(data-stok);
+ position:absolute;
+ bottom:1.75rem;
+ min-width:100%;
+ width:100%;
+ height:1rem;
+ line-height:1rem;
+ font-size:1rem;
+ text-align:center;
+ color:white;
+ font-weight:bold;
+ background:#202020;
+ border-radius:10%;
+}
+/*
+We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations
+instead we apply even and odd class at generation time and color them that way
+ */
+.even-token{
+ background:#DCDCDC ;
+ border: 1px solid #DCDCDC;
+}
+.odd-token{
+ background:#A0A0A0;
+ border: 1px solid #A0A0A0;
+}
+.even-token.multi-token,.odd-token.multi-token{
+ background: repeating-linear-gradient(
+ 45deg,
+ transparent,
+ transparent 1px,
+ #ccc 1px,
+ #ccc 1px
+ ),
+ /* on "bottom" */
+ linear-gradient(
+ to bottom,
+ #FFB6C1,
+ #999
+ );
+}
+
+.multi-token:hover::after {
+ content:"This char has more than 1 token"; /* The content defines the width of the annotation*/
+ color:white;
+ background-color: black;
+ position:absolute;
+ font-size:0.75rem;
+ text-align:center;
+ font-weight:bold;
+ text-overflow:ellipsis;
+ top:1.75rem;
+ line-height:0;
+ overflow: hidden;
+ white-space: nowrap;
+ left:0;
+ width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/
+ padding:0.5rem 0;
+}
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py
new file mode 100644
index 00000000..c988a648
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py
@@ -0,0 +1,403 @@
+import itertools
+import os
+import re
+from string import Template
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
+
+from tokenizers import Encoding, Tokenizer
+
+
+dirname = os.path.dirname(__file__)
+css_filename = os.path.join(dirname, "visualizer-styles.css")
+with open(css_filename) as f:
+ css = f.read()
+
+
+class Annotation:
+ start: int
+ end: int
+    label: str
+
+ def __init__(self, start: int, end: int, label: str):
+ self.start = start
+ self.end = end
+ self.label = label
+
+
+AnnotationList = List[Annotation]
+PartialIntList = List[Optional[int]]
+
+
+class CharStateKey(NamedTuple):
+ token_ix: Optional[int]
+ anno_ix: Optional[int]
+
+
+class CharState:
+ char_ix: Optional[int]
+
+ def __init__(self, char_ix):
+ self.char_ix = char_ix
+
+ self.anno_ix: Optional[int] = None
+ self.tokens: List[int] = []
+
+ @property
+ def token_ix(self):
+ return self.tokens[0] if len(self.tokens) > 0 else None
+
+ @property
+ def is_multitoken(self):
+ """
+ BPE tokenizers can output more than one token for a char
+ """
+ return len(self.tokens) > 1
+
+ def partition_key(self) -> CharStateKey:
+ return CharStateKey(
+ token_ix=self.token_ix,
+ anno_ix=self.anno_ix,
+ )
+
+
+class Aligned:
+ pass
+
+
+class EncodingVisualizer:
+ """
+ Build an EncodingVisualizer
+
+ Args:
+
+ tokenizer (:class:`~tokenizers.Tokenizer`):
+ A tokenizer instance
+
+ default_to_notebook (:obj:`bool`):
+ Whether to render html output in a notebook by default
+
+ annotation_converter (:obj:`Callable`, `optional`):
+ An optional (lambda) function that takes an annotation in any format and returns
+ an Annotation object
+ """
+
+    unk_token_regex = re.compile(r"(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE)  # raw string so \b is a word boundary, not a backspace
+
+ def __init__(
+ self,
+ tokenizer: Tokenizer,
+ default_to_notebook: bool = True,
+ annotation_converter: Optional[Callable[[Any], Annotation]] = None,
+ ):
+ if default_to_notebook:
+ try:
+ from IPython.core.display import HTML, display
+ except ImportError:
+ raise Exception(
+ """We couldn't import IPython utils for html display.
+ Are you running in a notebook?
+ You can also pass `default_to_notebook=False` to get back raw HTML
+ """
+ )
+
+ self.tokenizer = tokenizer
+ self.default_to_notebook = default_to_notebook
+        self.annotation_converter = annotation_converter
+
+ def __call__(
+ self,
+ text: str,
+ annotations: AnnotationList = [],
+ default_to_notebook: Optional[bool] = None,
+ ) -> Optional[str]:
+ """
+ Build a visualization of the given text
+
+ Args:
+ text (:obj:`str`):
+ The text to tokenize
+
+ annotations (:obj:`List[Annotation]`, `optional`):
+                An optional list of annotations of the text. They can either be an Annotation object
+ or anything else if you instantiated the visualizer with a converter function
+
+            default_to_notebook (:obj:`bool`, `optional`, defaults to the value set on the visualizer):
+ If True, will render the html in a notebook. Otherwise returns an html string.
+
+ Returns:
+ The HTML string if default_to_notebook is False, otherwise (default) returns None and
+ renders the HTML in the notebook
+
+ """
+ final_default_to_notebook = self.default_to_notebook
+ if default_to_notebook is not None:
+ final_default_to_notebook = default_to_notebook
+ if final_default_to_notebook:
+ try:
+ from IPython.core.display import HTML, display
+ except ImportError:
+ raise Exception(
+ """We couldn't import IPython utils for html display.
+ Are you running in a notebook?"""
+ )
+        if self.annotation_converter is not None:
+            annotations = list(map(self.annotation_converter, annotations))
+ encoding = self.tokenizer.encode(text)
+ html = EncodingVisualizer.__make_html(text, encoding, annotations)
+ if final_default_to_notebook:
+ display(HTML(html))
+ else:
+ return html
+
+ @staticmethod
+ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
+ """
+ Generates a color palette for all the labels in a given set of annotations
+
+ Args:
+ annotations (:obj:`Annotation`):
+ A list of annotations
+
+ Returns:
+ :obj:`dict`: A dictionary mapping labels to colors in HSL format
+ """
+ if len(annotations) == 0:
+ return {}
+ labels = set(map(lambda x: x.label, annotations))
+ num_labels = len(labels)
+ h_step = int(255 / num_labels)
+ if h_step < 20:
+ h_step = 20
+ s = 32
+ l = 64 # noqa: E741
+ h = 10
+ colors = {}
+
+ for label in sorted(labels): # sort so we always get the same colors for a given set of labels
+            colors[label] = f"hsl({h},{s}%,{l}%)"  # closing parenthesis so the CSS color value is valid
+ h += h_step
+ return colors
+
+ @staticmethod
+ def consecutive_chars_to_html(
+ consecutive_chars_list: List[CharState],
+ text: str,
+ encoding: Encoding,
+ ):
+ """
+ Converts a list of "consecutive chars" into a single HTML element.
+ Chars are consecutive if they fall under the same word, token and annotation.
+        The CharState class has a "partition_key" method that makes it easy to
+        compare if two chars are consecutive.
+
+ Args:
+ consecutive_chars_list (:obj:`List[CharState]`):
+ A list of CharStates that have been grouped together
+
+ text (:obj:`str`):
+ The original text being processed
+
+ encoding (:class:`~tokenizers.Encoding`):
+ The encoding returned from the tokenizer
+
+ Returns:
+ :obj:`str`: The HTML span for a set of consecutive chars
+ """
+ first = consecutive_chars_list[0]
+ if first.char_ix is None:
+            # it's a special token
+            stoken = encoding.tokens[first.token_ix]
+            # special tokens are represented as empty spans. We use the data attribute and css
+            # magic to display it (the stylesheet reads the label from `data-stok`)
+            return f'<span class="special-token" data-stok="{stoken}"></span>'
+ # We're not in a special token so this group has a start and end.
+ last = consecutive_chars_list[-1]
+ start = first.char_ix
+ end = last.char_ix + 1
+ span_text = text[start:end]
+ css_classes = [] # What css classes will we apply on the resulting span
+ data_items = {} # What data attributes will we apply on the result span
+ if first.token_ix is not None:
+ # We can either be in a token or not (e.g. in white space)
+ css_classes.append("token")
+ if first.is_multitoken:
+ css_classes.append("multi-token")
+ if first.token_ix % 2:
+ # We use this to color alternating tokens.
+ # A token might be split by an annotation that ends in the middle of it, so this
+ # lets us visually indicate a consecutive token despite its possible splitting in
+ # the html markup
+ css_classes.append("odd-token")
+ else:
+ # Like above, but a different color so we can see the tokens alternate
+ css_classes.append("even-token")
+ if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
+ # This is a special token that is in the text. probably UNK
+ css_classes.append("special-token")
+ # TODO is this the right name for the data attribute ?
+ data_items["stok"] = encoding.tokens[first.token_ix]
+ else:
+ # In this case we are looking at a group/single char that is not tokenized.
+ # e.g. white space
+ css_classes.append("non-token")
+ css = f'''class="{' '.join(css_classes)}"'''
+ data = ""
+ for key, val in data_items.items():
+ data += f' data-{key}="{val}"'
+ return f"<span {css} {data} >{span_text}</span>"
+
+ @staticmethod
+ def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str:
+ char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations)
+ current_consecutive_chars = [char_states[0]]
+ prev_anno_ix = char_states[0].anno_ix
+ spans = []
+ label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations)
+ cur_anno_ix = char_states[0].anno_ix
+ if cur_anno_ix is not None:
+ # If we started in an annotation make a span for it
+ anno = annotations[cur_anno_ix]
+ label = anno.label
+ color = label_colors_dict[label]
+ spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+
+ for cs in char_states[1:]:
+ cur_anno_ix = cs.anno_ix
+ if cur_anno_ix != prev_anno_ix:
+ # If we've transitioned in or out of an annotation
+ spans.append(
+ # Create a span from the current consecutive characters
+ EncodingVisualizer.consecutive_chars_to_html(
+ current_consecutive_chars,
+ text=text,
+ encoding=encoding,
+ )
+ )
+ current_consecutive_chars = [cs]
+
+ if prev_anno_ix is not None:
+                    # if we transitioned out of an annotation, close its span
+ spans.append("</span>")
+ if cur_anno_ix is not None:
+ # If we entered a new annotation make a span for it
+ anno = annotations[cur_anno_ix]
+ label = anno.label
+ color = label_colors_dict[label]
+ spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
+ prev_anno_ix = cur_anno_ix
+
+ if cs.partition_key() == current_consecutive_chars[0].partition_key():
+                # If the current character is in the same "group" as the previous one
+ current_consecutive_chars.append(cs)
+ else:
+ # Otherwise we make a span for the previous group
+ spans.append(
+ EncodingVisualizer.consecutive_chars_to_html(
+ current_consecutive_chars,
+ text=text,
+ encoding=encoding,
+ )
+ )
+                # And reset current_consecutive_chars to form a new group
+ current_consecutive_chars = [cs]
+ # All that's left is to fill out the final span
+ # TODO I think there is an edge case here where an annotation's span might not close
+ spans.append(
+ EncodingVisualizer.consecutive_chars_to_html(
+ current_consecutive_chars,
+ text=text,
+ encoding=encoding,
+ )
+ )
+ res = HTMLBody(spans) # Send the list of spans to the body of our html
+ return res
+
+ @staticmethod
+ def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList:
+ """
+ Args:
+ text (:obj:`str`):
+ The raw text we want to align to
+
+ annotations (:obj:`AnnotationList`):
+ A (possibly empty) list of annotations
+
+ Returns:
+            A list of length len(text) whose entry at index i is None if there is no annotation on
+            character i, or k, the index (within the list of annotations) of the annotation that
+            covers character i
+ """
+ annotation_map = [None] * len(text)
+ for anno_ix, a in enumerate(annotations):
+ for i in range(a.start, a.end):
+ annotation_map[i] = anno_ix
+ return annotation_map
+
+ @staticmethod
+ def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
+ """
+        For each character in the original text, we emit a tuple representing its "state":
+
+ * which token_ix it corresponds to
+ * which word_ix it corresponds to
+ * which annotation_ix it corresponds to
+
+ Args:
+ text (:obj:`str`):
+ The raw text we want to align to
+
+ annotations (:obj:`List[Annotation]`):
+ A (possibly empty) list of annotations
+
+ encoding: (:class:`~tokenizers.Encoding`):
+ The encoding returned from the tokenizer
+
+ Returns:
+ :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
+            its state is
+ """
+ annotation_map = EncodingVisualizer.__make_anno_map(text, annotations)
+ # Todo make this a dataclass or named tuple
+ char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))]
+ for token_ix, token in enumerate(encoding.tokens):
+ offsets = encoding.token_to_chars(token_ix)
+ if offsets is not None:
+ start, end = offsets
+ for i in range(start, end):
+ char_states[i].tokens.append(token_ix)
+ for char_ix, anno_ix in enumerate(annotation_map):
+ char_states[char_ix].anno_ix = anno_ix
+
+ return char_states
+
+
+def HTMLBody(children: List[str], css_styles=css) -> str:
+ """
+ Generates the full html with css from a list of html spans
+
+ Args:
+ children (:obj:`List[str]`):
+ A list of strings, assumed to be html elements
+
+ css_styles (:obj:`str`, `optional`):
+ Optional alternative implementation of the css
+
+ Returns:
+ :obj:`str`: An HTML string with style markup
+ """
+ children_text = "".join(children)
+ return f"""
+ <html>
+ <head>
+ <style>
+ {css_styles}
+ </style>
+ </head>
+ <body>
+ <div class="tokenized-text" dir=auto>
+ {children_text}
+ </div>
+ </body>
+ </html>
+ """
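A sketch of how the visualizer above is typically driven. `Tokenizer.from_pretrained` downloads "bert-base-uncased" from the Hugging Face Hub, so the model name, the sample text and the annotation span are all assumptions for illustration:

    from tokenizers import Tokenizer
    from tokenizers.tools import Annotation, EncodingVisualizer

    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    visualizer = EncodingVisualizer(tokenizer, default_to_notebook=False)

    annotations = [Annotation(start=0, end=5, label="greeting")]
    html = visualizer("Hello there, my name is Anthony", annotations=annotations)

    # Open the result in a browser to inspect the alternating token colors.
    with open("tokens.html", "w") as f:
        f.write(html)

Passing `default_to_notebook=True` (the constructor default) renders the same HTML inline via IPython instead of returning it.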
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py
new file mode 100644
index 00000000..22f94c50
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py
@@ -0,0 +1,8 @@
+# Generated content DO NOT EDIT
+from .. import trainers
+
+Trainer = trainers.Trainer
+BpeTrainer = trainers.BpeTrainer
+UnigramTrainer = trainers.UnigramTrainer
+WordLevelTrainer = trainers.WordLevelTrainer
+WordPieceTrainer = trainers.WordPieceTrainer
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi
new file mode 100644
index 00000000..d6c52571
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi
@@ -0,0 +1,156 @@
+# Generated content DO NOT EDIT
+class Trainer:
+ """
+ Base class for all trainers
+
+ This class is not supposed to be instantiated directly. Instead, any implementation of a
+ Trainer will return an instance of this class when instantiated.
+ """
+
+class BpeTrainer(Trainer):
+ """
+ Trainer capable of training a BPE model
+
+ Args:
+ vocab_size (:obj:`int`, `optional`):
+ The size of the final vocabulary, including all tokens and alphabet.
+
+ min_frequency (:obj:`int`, `optional`):
+ The minimum frequency a pair should have in order to be merged.
+
+ show_progress (:obj:`bool`, `optional`):
+ Whether to show progress bars while training.
+
+ special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+ A list of special tokens the model should know of.
+
+ limit_alphabet (:obj:`int`, `optional`):
+ The maximum different characters to keep in the alphabet.
+
+ initial_alphabet (:obj:`List[str]`, `optional`):
+ A list of characters to include in the initial alphabet, even
+ if not seen in the training dataset.
+ If the strings contain more than one character, only the first one
+ is kept.
+
+ continuing_subword_prefix (:obj:`str`, `optional`):
+ A prefix to be used for every subword that is not a beginning-of-word.
+
+ end_of_word_suffix (:obj:`str`, `optional`):
+            A suffix to be used for every subword that is an end-of-word.
+
+ max_token_length (:obj:`int`, `optional`):
+ Prevents creating tokens longer than the specified size.
+            This helps avoid polluting your vocabulary with
+            highly repetitive tokens like `======` for Wikipedia.
+
+ """
+
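A minimal end-to-end training sketch with BpeTrainer; the corpus, vocabulary size and special tokens are made up for illustration:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import BpeTrainer

    corpus = ["low lower lowest", "new newer newest", "wide wider widest"]

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = BpeTrainer(vocab_size=200, min_frequency=1, special_tokens=["[UNK]", "[PAD]"])
    tokenizer.train_from_iterator(corpus, trainer=trainer)

    print(tokenizer.encode("lowest newest").tokens)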
+class UnigramTrainer(Trainer):
+ """
+ Trainer capable of training a Unigram model
+
+ Args:
+ vocab_size (:obj:`int`):
+ The size of the final vocabulary, including all tokens and alphabet.
+
+ show_progress (:obj:`bool`):
+ Whether to show progress bars while training.
+
+ special_tokens (:obj:`List[Union[str, AddedToken]]`):
+ A list of special tokens the model should know of.
+
+ initial_alphabet (:obj:`List[str]`):
+ A list of characters to include in the initial alphabet, even
+ if not seen in the training dataset.
+ If the strings contain more than one character, only the first one
+ is kept.
+
+ shrinking_factor (:obj:`float`):
+ The shrinking factor used at each step of the training to prune the
+ vocabulary.
+
+ unk_token (:obj:`str`):
+ The token used for out-of-vocabulary tokens.
+
+ max_piece_length (:obj:`int`):
+ The maximum length of a given token.
+
+ n_sub_iterations (:obj:`int`):
+ The number of iterations of the EM algorithm to perform before
+ pruning the vocabulary.
+ """
+ def __init__(
+ self,
+ vocab_size=8000,
+ show_progress=True,
+ special_tokens=[],
+ shrinking_factor=0.75,
+ unk_token=None,
+ max_piece_length=16,
+ n_sub_iterations=2,
+ ):
+ pass
+
+class WordLevelTrainer(Trainer):
+ """
+    Trainer capable of training a WordLevel model
+
+ Args:
+ vocab_size (:obj:`int`, `optional`):
+ The size of the final vocabulary, including all tokens and alphabet.
+
+ min_frequency (:obj:`int`, `optional`):
+ The minimum frequency a pair should have in order to be merged.
+
+ show_progress (:obj:`bool`, `optional`):
+ Whether to show progress bars while training.
+
+ special_tokens (:obj:`List[Union[str, AddedToken]]`):
+ A list of special tokens the model should know of.
+ """
+
+class WordPieceTrainer(Trainer):
+ """
+ Trainer capable of training a WordPiece model
+
+ Args:
+ vocab_size (:obj:`int`, `optional`):
+ The size of the final vocabulary, including all tokens and alphabet.
+
+ min_frequency (:obj:`int`, `optional`):
+ The minimum frequency a pair should have in order to be merged.
+
+ show_progress (:obj:`bool`, `optional`):
+ Whether to show progress bars while training.
+
+ special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+ A list of special tokens the model should know of.
+
+ limit_alphabet (:obj:`int`, `optional`):
+ The maximum different characters to keep in the alphabet.
+
+ initial_alphabet (:obj:`List[str]`, `optional`):
+ A list of characters to include in the initial alphabet, even
+ if not seen in the training dataset.
+ If the strings contain more than one character, only the first one
+ is kept.
+
+ continuing_subword_prefix (:obj:`str`, `optional`):
+ A prefix to be used for every subword that is not a beginning-of-word.
+
+ end_of_word_suffix (:obj:`str`, `optional`):
+            A suffix to be used for every subword that is an end-of-word.
+ """
+ def __init__(
+ self,
+ vocab_size=30000,
+ min_frequency=0,
+ show_progress=True,
+ special_tokens=[],
+ limit_alphabet=None,
+ initial_alphabet=[],
+ continuing_subword_prefix="##",
+ end_of_word_suffix=None,
+ ):
+ pass
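The WordPiece variant follows the same pattern as the BPE sketch above; the main visible difference is the `##` continuation prefix (all concrete values are again illustrative):

    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.trainers import WordPieceTrainer

    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = BertPreTokenizer()

    trainer = WordPieceTrainer(
        vocab_size=1000,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        continuing_subword_prefix="##",
    )
    tokenizer.train_from_iterator(["a tiny corpus", "another tiny corpus"], trainer=trainer)

    # Subwords other than the first piece of a word carry the ## prefix.
    print(tokenizer.encode("tiny corpus").tokens)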