Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers')
25 files changed, 5665 insertions, 0 deletions
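The diff below vendors the Hugging Face `tokenizers` package (the compiled Rust core plus its Python wrappers and type stubs). For orientation before the raw diff, here is a minimal usage sketch assembled only from APIs that appear in these files — essentially the byte-level BPE pipeline that implementations/byte_level_bpe.py wires up. It is a sketch, not part of the diff: the training file corpus.txt and the output path tokenizer.json are hypothetical placeholders.

from tokenizers import Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE

# Core pipeline: model + pre-tokenizer + decoder (mirrors ByteLevelBPETokenizer below)
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()

# Train from local text files; "corpus.txt" is a hypothetical placeholder path
trainer = trainers.BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
tokenizer.train(["corpus.txt"], trainer=trainer)

# Encode / decode round trip using the Encoding properties documented in __init__.pyi
encoding = tokenizer.encode("Hello, how are you?")
print(encoding.tokens)                 # string pieces
print(encoding.ids)                    # integer ids fed to a model
print(tokenizer.decode(encoding.ids))  # back to a string

# The whole pipeline serializes to a single JSON file and reloads from it
tokenizer.save("tokenizer.json")
tokenizer = Tokenizer.from_file("tokenizer.json")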
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py new file mode 100644 index 00000000..efd57429 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py @@ -0,0 +1,100 @@ +from enum import Enum +from typing import List, Tuple, Union + + +Offsets = Tuple[int, int] + +TextInputSequence = str +"""A :obj:`str` that represents an input sequence """ + +PreTokenizedInputSequence = Union[List[str], Tuple[str]] +"""A pre-tokenized input sequence. Can be one of: + + - A :obj:`List` of :obj:`str` + - A :obj:`Tuple` of :obj:`str` +""" + +TextEncodeInput = Union[ + TextInputSequence, + Tuple[TextInputSequence, TextInputSequence], + List[TextInputSequence], +] +"""Represents a textual input for encoding. Can be either: + + - A single sequence: :data:`~tokenizers.TextInputSequence` + - A pair of sequences: + + - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence` + - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2 +""" + +PreTokenizedEncodeInput = Union[ + PreTokenizedInputSequence, + Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence], + List[PreTokenizedInputSequence], +] +"""Represents a pre-tokenized input for encoding. Can be either: + + - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence` + - A pair of sequences: + + - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence` + - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2 +""" + +InputSequence = Union[TextInputSequence, PreTokenizedInputSequence] +"""Represents all the possible types of input sequences for encoding. Can be: + + - When ``is_pretokenized=False``: :data:`~TextInputSequence` + - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence` +""" + +EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput] +"""Represents all the possible types of input for encoding. Can be: + + - When ``is_pretokenized=False``: :data:`~TextEncodeInput` + - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput` +""" + + +class OffsetReferential(Enum): + ORIGINAL = "original" + NORMALIZED = "normalized" + + +class OffsetType(Enum): + BYTE = "byte" + CHAR = "char" + + +class SplitDelimiterBehavior(Enum): + REMOVED = "removed" + ISOLATED = "isolated" + MERGED_WITH_PREVIOUS = "merged_with_previous" + MERGED_WITH_NEXT = "merged_with_next" + CONTIGUOUS = "contiguous" + + +from .tokenizers import ( + AddedToken, + Encoding, + NormalizedString, + PreTokenizedString, + Regex, + Token, + Tokenizer, + decoders, + models, + normalizers, + pre_tokenizers, + processors, + trainers, + __version__, +) +from .implementations import ( + BertWordPieceTokenizer, + ByteLevelBPETokenizer, + CharBPETokenizer, + SentencePieceBPETokenizer, + SentencePieceUnigramTokenizer, +) diff --git a/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi new file mode 100644 index 00000000..5dbc665d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/__init__.pyi @@ -0,0 +1,1200 @@ +# Generated content DO NOT EDIT +class AddedToken: + """ + Represents a token that can be be added to a :class:`~tokenizers.Tokenizer`. + It can have special options that defines the way it should behave. + + Args: + content (:obj:`str`): The content of the token + + single_word (:obj:`bool`, defaults to :obj:`False`): + Defines whether this token should only match single words. 
If :obj:`True`, this + token will never match inside of a word. For example the token ``ing`` would match + on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`. + The notion of "`inside of a word`" is defined by the word boundaries pattern in + regular expressions (ie. the token should start and end with word boundaries). + + lstrip (:obj:`bool`, defaults to :obj:`False`): + Defines whether this token should strip all potential whitespaces on its left side. + If :obj:`True`, this token will greedily match any whitespace on its left. For + example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text + ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left). + + rstrip (:obj:`bool`, defaults to :obj:`False`): + Defines whether this token should strip all potential whitespaces on its right + side. If :obj:`True`, this token will greedily match any whitespace on its right. + It works just like :obj:`lstrip` but on the right. + + normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`): + Defines whether this token should match against the normalized version of the input + text. For example, with the added token ``"yesterday"``, and a normalizer in charge of + lowercasing the text, the token could be extract from the input ``"I saw a lion + Yesterday"``. + special (:obj:`bool`, defaults to :obj:`False` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`): + Defines whether this token should be skipped when decoding. + + """ + def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False): + pass + + @property + def content(self): + """ + Get the content of this :obj:`AddedToken` + """ + pass + + @property + def lstrip(self): + """ + Get the value of the :obj:`lstrip` option + """ + pass + + @property + def normalized(self): + """ + Get the value of the :obj:`normalized` option + """ + pass + + @property + def rstrip(self): + """ + Get the value of the :obj:`rstrip` option + """ + pass + + @property + def single_word(self): + """ + Get the value of the :obj:`single_word` option + """ + pass + + @property + def special(self): + """ + Get the value of the :obj:`special` option + """ + pass + +class Encoding: + """ + The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`. + """ + @property + def attention_mask(self): + """ + The attention mask + + This indicates to the LM which tokens should be attended to, and which should not. + This is especially important when batching sequences, where we need to applying + padding. + + Returns: + :obj:`List[int]`: The attention mask + """ + pass + + def char_to_token(self, char_pos, sequence_index=0): + """ + Get the token that contains the char at the given position in the input sequence. + + Args: + char_pos (:obj:`int`): + The position of a char in the input string + sequence_index (:obj:`int`, defaults to :obj:`0`): + The index of the sequence that contains the target char + + Returns: + :obj:`int`: The index of the token that contains this char in the encoded sequence + """ + pass + + def char_to_word(self, char_pos, sequence_index=0): + """ + Get the word that contains the char at the given position in the input sequence. 
+ + Args: + char_pos (:obj:`int`): + The position of a char in the input string + sequence_index (:obj:`int`, defaults to :obj:`0`): + The index of the sequence that contains the target char + + Returns: + :obj:`int`: The index of the word that contains this char in the input sequence + """ + pass + + @property + def ids(self): + """ + The generated IDs + + The IDs are the main input to a Language Model. They are the token indices, + the numerical representations that a LM understands. + + Returns: + :obj:`List[int]`: The list of IDs + """ + pass + + @staticmethod + def merge(encodings, growing_offsets=True): + """ + Merge the list of encodings into one final :class:`~tokenizers.Encoding` + + Args: + encodings (A :obj:`List` of :class:`~tokenizers.Encoding`): + The list of encodings that should be merged in one + + growing_offsets (:obj:`bool`, defaults to :obj:`True`): + Whether the offsets should accumulate while merging + + Returns: + :class:`~tokenizers.Encoding`: The resulting Encoding + """ + pass + + @property + def n_sequences(self): + """ + The number of sequences represented + + Returns: + :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding` + """ + pass + + @property + def offsets(self): + """ + The offsets associated to each token + + These offsets let's you slice the input string, and thus retrieve the original + part that led to producing the corresponding token. + + Returns: + A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets + """ + pass + + @property + def overflowing(self): + """ + A :obj:`List` of overflowing :class:`~tokenizers.Encoding` + + When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting + the output into as many pieces as required to match the specified maximum length. + This field lets you retrieve all the subsequent pieces. + + When you use pairs of sequences, the overflowing pieces will contain enough + variations to cover all the possible combinations, while respecting the provided + maximum length. + """ + pass + + def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"): + """ + Pad the :class:`~tokenizers.Encoding` at the given length + + Args: + length (:obj:`int`): + The desired length + + direction: (:obj:`str`, defaults to :obj:`right`): + The expected padding direction. Can be either :obj:`right` or :obj:`left` + + pad_id (:obj:`int`, defaults to :obj:`0`): + The ID corresponding to the padding token + + pad_type_id (:obj:`int`, defaults to :obj:`0`): + The type ID corresponding to the padding token + + pad_token (:obj:`str`, defaults to `[PAD]`): + The pad token to use + """ + pass + + @property + def sequence_ids(self): + """ + The generated sequence indices. + + They represent the index of the input sequence associated to each token. + The sequence id can be None if the token is not related to any input sequence, + like for example with special tokens. + + Returns: + A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index. + """ + pass + + def set_sequence_id(self, sequence_id): + """ + Set the given sequence index + + Set the given sequence index for the whole range of tokens contained in this + :class:`~tokenizers.Encoding`. + """ + pass + + @property + def special_tokens_mask(self): + """ + The special token mask + + This indicates which tokens are special tokens, and which are not. 
+ + Returns: + :obj:`List[int]`: The special tokens mask + """ + pass + + def token_to_chars(self, token_index): + """ + Get the offsets of the token at the given index. + + The returned offsets are related to the input sequence that contains the + token. In order to determine in which input sequence it belongs, you + must call :meth:`~tokenizers.Encoding.token_to_sequence()`. + + Args: + token_index (:obj:`int`): + The index of a token in the encoded sequence. + + Returns: + :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` + """ + pass + + def token_to_sequence(self, token_index): + """ + Get the index of the sequence represented by the given token. + + In the general use case, this method returns :obj:`0` for a single sequence or + the first sequence of a pair, and :obj:`1` for the second sequence of a pair + + Args: + token_index (:obj:`int`): + The index of a token in the encoded sequence. + + Returns: + :obj:`int`: The sequence id of the given token + """ + pass + + def token_to_word(self, token_index): + """ + Get the index of the word that contains the token in one of the input sequences. + + The returned word index is related to the input sequence that contains + the token. In order to determine in which input sequence it belongs, you + must call :meth:`~tokenizers.Encoding.token_to_sequence()`. + + Args: + token_index (:obj:`int`): + The index of a token in the encoded sequence. + + Returns: + :obj:`int`: The index of the word in the relevant input sequence. + """ + pass + + @property + def tokens(self): + """ + The generated tokens + + They are the string representation of the IDs. + + Returns: + :obj:`List[str]`: The list of tokens + """ + pass + + def truncate(self, max_length, stride=0, direction="right"): + """ + Truncate the :class:`~tokenizers.Encoding` at the given length + + If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating + this information is lost. It will be considered as representing a single sequence. + + Args: + max_length (:obj:`int`): + The desired length + + stride (:obj:`int`, defaults to :obj:`0`): + The length of previous content to be included in each overflowing piece + + direction (:obj:`str`, defaults to :obj:`right`): + Truncate direction + """ + pass + + @property + def type_ids(self): + """ + The generated type IDs + + Generally used for tasks like sequence classification or question answering, + these tokens let the LM know which input sequence corresponds to each tokens. + + Returns: + :obj:`List[int]`: The list of type ids + """ + pass + + @property + def word_ids(self): + """ + The generated word indices. + + They represent the index of the word associated to each token. + When the input is pre-tokenized, they correspond to the ID of the given input label, + otherwise they correspond to the words indices as defined by the + :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used. + + For special tokens and such (any token that was generated from something that was + not part of the input), the output is :obj:`None` + + Returns: + A :obj:`List` of :obj:`Optional[int]`: A list of optional word index. + """ + pass + + def word_to_chars(self, word_index, sequence_index=0): + """ + Get the offsets of the word at the given index in one of the input sequences. + + Args: + word_index (:obj:`int`): + The index of a word in one of the input sequences. 
+ sequence_index (:obj:`int`, defaults to :obj:`0`): + The index of the sequence that contains the target word + + Returns: + :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` + """ + pass + + def word_to_tokens(self, word_index, sequence_index=0): + """ + Get the encoded tokens corresponding to the word at the given index + in one of the input sequences. + + Args: + word_index (:obj:`int`): + The index of a word in one of the input sequences. + sequence_index (:obj:`int`, defaults to :obj:`0`): + The index of the sequence that contains the target word + + Returns: + :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` + """ + pass + + @property + def words(self): + """ + The generated word indices. + + .. warning:: + This is deprecated and will be removed in a future version. + Please use :obj:`~tokenizers.Encoding.word_ids` instead. + + They represent the index of the word associated to each token. + When the input is pre-tokenized, they correspond to the ID of the given input label, + otherwise they correspond to the words indices as defined by the + :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used. + + For special tokens and such (any token that was generated from something that was + not part of the input), the output is :obj:`None` + + Returns: + A :obj:`List` of :obj:`Optional[int]`: A list of optional word index. + """ + pass + +class NormalizedString: + """ + NormalizedString + + A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one. + While making all the requested modifications, it keeps track of the alignment information + between the two versions of the string. + + Args: + sequence: str: + The string sequence used to initialize this NormalizedString + """ + def append(self, s): + """ + Append the given sequence to the string + """ + pass + + def clear(self): + """ + Clears the string + """ + pass + + def filter(self, func): + """ + Filter each character of the string using the given func + """ + pass + + def for_each(self, func): + """ + Calls the given function for each character of the string + """ + pass + + def lowercase(self): + """ + Lowercase the string + """ + pass + + def lstrip(self): + """ + Strip the left of the string + """ + pass + + def map(self, func): + """ + Calls the given function for each character of the string + + Replaces each character of the string using the returned value. Each + returned value **must** be a str of length 1 (ie a character). + """ + pass + + def nfc(self): + """ + Runs the NFC normalization + """ + pass + + def nfd(self): + """ + Runs the NFD normalization + """ + pass + + def nfkc(self): + """ + Runs the NFKC normalization + """ + pass + + def nfkd(self): + """ + Runs the NFKD normalization + """ + pass + + @property + def normalized(self): + """ + The normalized part of the string + """ + pass + + def prepend(self, s): + """ + Prepend the given sequence to the string + """ + pass + + def replace(self, pattern, content): + """ + Replace the content of the given pattern with the provided content + + Args: + pattern: Pattern: + A pattern used to match the string. 
Usually a string or a Regex + + content: str: + The content to be used as replacement + """ + pass + + def rstrip(self): + """ + Strip the right of the string + """ + pass + + def slice(self, range): + """ + Slice the string using the given range + """ + pass + + def split(self, pattern, behavior): + """ + Split the NormalizedString using the given pattern and the specified behavior + + Args: + pattern: Pattern: + A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex` + + behavior: SplitDelimiterBehavior: + The behavior to use when splitting. + Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", + "contiguous" + + Returns: + A list of NormalizedString, representing each split + """ + pass + + def strip(self): + """ + Strip both ends of the string + """ + pass + + def uppercase(self): + """ + Uppercase the string + """ + pass + +class PreTokenizedString: + """ + PreTokenizedString + + Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the + underlying string, while keeping track of the alignment information (offsets). + + The PreTokenizedString manages what we call `splits`. Each split represents a substring + which is a subpart of the original string, with the relevant offsets and tokens. + + When calling one of the methods used to modify the PreTokenizedString (namely one of + `split`, `normalize` or `tokenize), only the `splits` that don't have any associated + tokens will get modified. + + Args: + sequence: str: + The string sequence used to initialize this PreTokenizedString + """ + def __init__(self, sequence): + pass + + def get_splits(self, offset_referential="original", offset_type="char"): + """ + Get the splits currently managed by the PreTokenizedString + + Args: + offset_referential: :obj:`str` + Whether the returned splits should have offsets expressed relative + to the original string, or the normalized one. choices: "original", "normalized". + + offset_type: :obj:`str` + Whether the returned splits should have offsets expressed in bytes or chars. + When slicing an str, we usually want to use chars, which is the default value. + Now in some cases it might be interesting to get these offsets expressed in bytes, + so it is possible to change this here. + choices: "char", "bytes" + + Returns + A list of splits + """ + pass + + def normalize(self, func): + """ + Normalize each split of the `PreTokenizedString` using the given `func` + + Args: + func: Callable[[NormalizedString], None]: + The function used to normalize each underlying split. This function + does not need to return anything, just calling the methods on the provided + NormalizedString allow its modification. + """ + pass + + def split(self, func): + """ + Split the PreTokenizedString using the given `func` + + Args: + func: Callable[[index, NormalizedString], List[NormalizedString]]: + The function used to split each underlying split. + It is expected to return a list of `NormalizedString`, that represent the new + splits. If the given `NormalizedString` does not need any splitting, we can + just return it directly. + In order for the offsets to be tracked accurately, any returned `NormalizedString` + should come from calling either `.split` or `.slice` on the received one. + """ + pass + + def to_encoding(self, type_id=0, word_idx=None): + """ + Return an Encoding generated from this PreTokenizedString + + Args: + type_id: int = 0: + The type_id to be used on the generated Encoding. 
+ + word_idx: Optional[int] = None: + An optional word index to be used for each token of this Encoding. If provided, + all the word indices in the generated Encoding will use this value, instead + of the one automatically tracked during pre-tokenization. + + Returns: + An Encoding + """ + pass + + def tokenize(self, func): + """ + Tokenize each split of the `PreTokenizedString` using the given `func` + + Args: + func: Callable[[str], List[Token]]: + The function used to tokenize each underlying split. This function must return + a list of Token generated from the input str. + """ + pass + +class Regex: + """ + Instantiate a new Regex with the given pattern + """ + def __init__(self, pattern): + pass + +class Token: + pass + +class Tokenizer: + """ + A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input + and outputs an :class:`~tokenizers.Encoding`. + + Args: + model (:class:`~tokenizers.models.Model`): + The core algorithm that this :obj:`Tokenizer` should be using. + + """ + def __init__(self, model): + pass + + def add_special_tokens(self, tokens): + """ + Add the given special tokens to the Tokenizer. + + If these tokens are already part of the vocabulary, it just let the Tokenizer know about + them. If they don't exist, the Tokenizer creates them, giving them a new id. + + These special tokens will never be processed by the model (ie won't be split into + multiple tokens), and they can be removed from the output when decoding. + + Args: + tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`): + The list of special tokens we want to add to the vocabulary. Each token can either + be a string or an instance of :class:`~tokenizers.AddedToken` for more + customization. + + Returns: + :obj:`int`: The number of tokens that were created in the vocabulary + """ + pass + + def add_tokens(self, tokens): + """ + Add the given tokens to the vocabulary + + The given tokens are added only if they don't already exist in the vocabulary. + Each token then gets a new attributed id. + + Args: + tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`): + The list of tokens we want to add to the vocabulary. Each token can be either a + string or an instance of :class:`~tokenizers.AddedToken` for more customization. 
+ + Returns: + :obj:`int`: The number of tokens that were created in the vocabulary + """ + pass + + def decode(self, ids, skip_special_tokens=True): + """ + Decode the given list of ids back to a string + + This is used to decode anything coming back from a Language Model + + Args: + ids (A :obj:`List/Tuple` of :obj:`int`): + The list of ids that we want to decode + + skip_special_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether the special tokens should be removed from the decoded string + + Returns: + :obj:`str`: The decoded string + """ + pass + + def decode_batch(self, sequences, skip_special_tokens=True): + """ + Decode a batch of ids back to their corresponding string + + Args: + sequences (:obj:`List` of :obj:`List[int]`): + The batch of sequences we want to decode + + skip_special_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether the special tokens should be removed from the decoded strings + + Returns: + :obj:`List[str]`: A list of decoded strings + """ + pass + + @property + def decoder(self): + """ + The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer + """ + pass + + def enable_padding( + self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None + ): + """ + Enable the padding + + Args: + direction (:obj:`str`, `optional`, defaults to :obj:`right`): + The direction in which to pad. Can be either ``right`` or ``left`` + + pad_to_multiple_of (:obj:`int`, `optional`): + If specified, the padding length should always snap to the next multiple of the + given value. For example if we were going to pad witha length of 250 but + ``pad_to_multiple_of=8`` then we will pad to 256. + + pad_id (:obj:`int`, defaults to 0): + The id to be used when padding + + pad_type_id (:obj:`int`, defaults to 0): + The type id to be used when padding + + pad_token (:obj:`str`, defaults to :obj:`[PAD]`): + The pad token to be used when padding + + length (:obj:`int`, `optional`): + If specified, the length at which to pad. If not specified we pad using the size of + the longest sequence in a batch. + """ + pass + + def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"): + """ + Enable truncation + + Args: + max_length (:obj:`int`): + The max length at which to truncate + + stride (:obj:`int`, `optional`): + The length of the previous first sequence to be included in the overflowing + sequence + + strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`): + The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or + ``only_second``. + + direction (:obj:`str`, defaults to :obj:`right`): + Truncate direction + """ + pass + + def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True): + """ + Encode the given sequence and pair. This method can process raw text sequences + as well as already pre-tokenized sequences. + + Example: + Here are some examples of the inputs that are accepted:: + + encode("A single sequence")` + encode("A sequence", "And its pair")` + encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)` + encode( + [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ], + is_pretokenized=True + ) + + Args: + sequence (:obj:`~tokenizers.InputSequence`): + The main input sequence we want to encode. 
This sequence can be either raw + text or pre-tokenized, according to the ``is_pretokenized`` argument: + + - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence` + - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence` + + pair (:obj:`~tokenizers.InputSequence`, `optional`): + An optional input sequence. The expected format is the same that for ``sequence``. + + is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Whether the input is already pre-tokenized + + add_special_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether to add the special tokens + + Returns: + :class:`~tokenizers.Encoding`: The encoded result + + """ + pass + + def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True): + """ + Encode the given batch of inputs. This method accept both raw text sequences + as well as already pre-tokenized sequences. + + Example: + Here are some examples of the inputs that are accepted:: + + encode_batch([ + "A single sequence", + ("A tuple with a sequence", "And its pair"), + [ "A", "pre", "tokenized", "sequence" ], + ([ "A", "pre", "tokenized", "sequence" ], "And its pair") + ]) + + Args: + input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`): + A list of single sequences or pair sequences to encode. Each sequence + can be either raw text or pre-tokenized, according to the ``is_pretokenized`` + argument: + + - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput` + - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput` + + is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Whether the input is already pre-tokenized + + add_special_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether to add the special tokens + + Returns: + A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch + + """ + pass + + @property + def encode_special_tokens(self): + """ + Modifies the tokenizer in order to use or not the special tokens + during encoding. + + Args: + value (:obj:`bool`): + Whether to use the special tokens or not + + """ + pass + + @staticmethod + def from_buffer(buffer): + """ + Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer. + + Args: + buffer (:obj:`bytes`): + A buffer containing a previously serialized :class:`~tokenizers.Tokenizer` + + Returns: + :class:`~tokenizers.Tokenizer`: The new tokenizer + """ + pass + + @staticmethod + def from_file(path): + """ + Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path. + + Args: + path (:obj:`str`): + A path to a local JSON file representing a previously serialized + :class:`~tokenizers.Tokenizer` + + Returns: + :class:`~tokenizers.Tokenizer`: The new tokenizer + """ + pass + + @staticmethod + def from_pretrained(identifier, revision="main", auth_token=None): + """ + Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the + Hugging Face Hub. + + Args: + identifier (:obj:`str`): + The identifier of a Model on the Hugging Face Hub, that contains + a tokenizer.json file + revision (:obj:`str`, defaults to `main`): + A branch or commit id + auth_token (:obj:`str`, `optional`, defaults to `None`): + An optional auth token used to access private repositories on the + Hugging Face Hub + + Returns: + :class:`~tokenizers.Tokenizer`: The new tokenizer + """ + pass + + @staticmethod + def from_str(json): + """ + Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string. 
+ + Args: + json (:obj:`str`): + A valid JSON string representing a previously serialized + :class:`~tokenizers.Tokenizer` + + Returns: + :class:`~tokenizers.Tokenizer`: The new tokenizer + """ + pass + + def get_added_tokens_decoder(self): + """ + Get the underlying vocabulary + + Returns: + :obj:`Dict[int, AddedToken]`: The vocabulary + """ + pass + + def get_vocab(self, with_added_tokens=True): + """ + Get the underlying vocabulary + + Args: + with_added_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether to include the added tokens + + Returns: + :obj:`Dict[str, int]`: The vocabulary + """ + pass + + def get_vocab_size(self, with_added_tokens=True): + """ + Get the size of the underlying vocabulary + + Args: + with_added_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether to include the added tokens + + Returns: + :obj:`int`: The size of the vocabulary + """ + pass + + def id_to_token(self, id): + """ + Convert the given id to its corresponding token if it exists + + Args: + id (:obj:`int`): + The id to convert + + Returns: + :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary + """ + pass + + @property + def model(self): + """ + The :class:`~tokenizers.models.Model` in use by the Tokenizer + """ + pass + + def no_padding(self): + """ + Disable padding + """ + pass + + def no_truncation(self): + """ + Disable truncation + """ + pass + + @property + def normalizer(self): + """ + The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer + """ + pass + + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + :param is_pair: Boolean indicating if the input would be a single sentence or a pair + :return: + """ + pass + + @property + def padding(self): + """ + Get the current padding parameters + + `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead` + + Returns: + (:obj:`dict`, `optional`): + A dict with the current padding parameters if padding is enabled + """ + pass + + def post_process(self, encoding, pair=None, add_special_tokens=True): + """ + Apply all the post-processing steps to the given encodings. + + The various steps are: + + 1. Truncate according to the set truncation params (provided with + :meth:`~tokenizers.Tokenizer.enable_truncation`) + 2. Apply the :class:`~tokenizers.processors.PostProcessor` + 3. Pad according to the set padding params (provided with + :meth:`~tokenizers.Tokenizer.enable_padding`) + + Args: + encoding (:class:`~tokenizers.Encoding`): + The :class:`~tokenizers.Encoding` corresponding to the main sequence. + + pair (:class:`~tokenizers.Encoding`, `optional`): + An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence. + + add_special_tokens (:obj:`bool`): + Whether to add the special tokens + + Returns: + :class:`~tokenizers.Encoding`: The final post-processed encoding + """ + pass + + @property + def post_processor(self): + """ + The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer + """ + pass + + @property + def pre_tokenizer(self): + """ + The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer + """ + pass + + def save(self, path, pretty=True): + """ + Save the :class:`~tokenizers.Tokenizer` to the file at the given path. + + Args: + path (:obj:`str`): + A path to a file in which to save the serialized tokenizer. 
+ + pretty (:obj:`bool`, defaults to :obj:`True`): + Whether the JSON file should be pretty formatted. + """ + pass + + def to_str(self, pretty=False): + """ + Gets a serialized string representing this :class:`~tokenizers.Tokenizer`. + + Args: + pretty (:obj:`bool`, defaults to :obj:`False`): + Whether the JSON string should be pretty formatted. + + Returns: + :obj:`str`: A string representing the serialized Tokenizer + """ + pass + + def token_to_id(self, token): + """ + Convert the given token to its corresponding id if it exists + + Args: + token (:obj:`str`): + The token to convert + + Returns: + :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary + """ + pass + + def train(self, files, trainer=None): + """ + Train the Tokenizer using the given files. + + Reads the files line by line, while keeping all the whitespace, even new lines. + If you want to train from data store in-memory, you can check + :meth:`~tokenizers.Tokenizer.train_from_iterator` + + Args: + files (:obj:`List[str]`): + A list of path to the files that we should use for training + + trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`): + An optional trainer that should be used to train our Model + """ + pass + + def train_from_iterator(self, iterator, trainer=None, length=None): + """ + Train the Tokenizer using the provided iterator. + + You can provide anything that is a Python Iterator + + * A list of sequences :obj:`List[str]` + * A generator that yields :obj:`str` or :obj:`List[str]` + * A Numpy array of strings + * ... + + Args: + iterator (:obj:`Iterator`): + Any iterator over strings or list of strings + + trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`): + An optional trainer that should be used to train our Model + + length (:obj:`int`, `optional`): + The total number of sequences in the iterator. This is used to + provide meaningful progress tracking + """ + pass + + @property + def truncation(self): + """ + Get the currently set truncation parameters + + `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead` + + Returns: + (:obj:`dict`, `optional`): + A dict with the current truncation parameters if truncation is enabled + """ + pass diff --git a/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py new file mode 100644 index 00000000..a717379c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.py @@ -0,0 +1,14 @@ +from .. import decoders + + +Decoder = decoders.Decoder +ByteLevel = decoders.ByteLevel +Replace = decoders.Replace +WordPiece = decoders.WordPiece +ByteFallback = decoders.ByteFallback +Fuse = decoders.Fuse +Strip = decoders.Strip +Metaspace = decoders.Metaspace +BPEDecoder = decoders.BPEDecoder +CTC = decoders.CTC +Sequence = decoders.Sequence diff --git a/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi new file mode 100644 index 00000000..b967fbd1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/decoders/__init__.pyi @@ -0,0 +1,271 @@ +# Generated content DO NOT EDIT +class Decoder: + """ + Base class for all decoders + + This class is not supposed to be instantiated directly. Instead, any implementation of + a Decoder will return an instance of this class when instantiated. 
+ """ + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class BPEDecoder(Decoder): + """ + BPEDecoder Decoder + + Args: + suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`): + The suffix that was used to caracterize an end-of-word. This suffix will + be replaced by whitespaces during the decoding + """ + def __init__(self, suffix="</w>"): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class ByteFallback(Decoder): + """ + ByteFallback Decoder + ByteFallback is a simple trick which converts tokens looking like `<0x61>` + to pure bytes, and attempts to make them into a string. If the tokens + cannot be decoded you will get � instead for each inconvertable byte token + + """ + def __init__(self): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class ByteLevel(Decoder): + """ + ByteLevel Decoder + + This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel` + :class:`~tokenizers.pre_tokenizers.PreTokenizer`. + """ + def __init__(self): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class CTC(Decoder): + """ + CTC Decoder + + Args: + pad_token (:obj:`str`, `optional`, defaults to :obj:`<pad>`): + The pad token used by CTC to delimit a new token. + word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`|`): + The word delimiter token. It will be replaced by a <space> + cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to cleanup some tokenization artifacts. + Mainly spaces before punctuation, and some abbreviated english forms. + """ + def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class Fuse(Decoder): + """ + Fuse Decoder + Fuse simply fuses every token into a single string. + This is the last step of decoding, this decoder exists only if + there is need to add other decoders *after* the fusion + """ + def __init__(self): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class Metaspace(Decoder): + """ + Metaspace Decoder + + Args: + replacement (:obj:`str`, `optional`, defaults to :obj:`▁`): + The replacement character. Must be exactly one character. By default we + use the `▁` (U+2581) meta symbol (Same as in SentencePiece). + + prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`): + Whether to add a space to the first word if there isn't already one. This + lets us treat `hello` exactly like `say hello`. + Choices: "always", "never", "first". 
First means the space is only added on the first + token (relevant when special tokens are used or other pre_tokenizer are used). + """ + def __init__(self, replacement="▁", prepend_scheme="always", split=True): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class Replace(Decoder): + """ + Replace Decoder + + This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace` + :class:`~tokenizers.pre_tokenizers.PreTokenizer`. + """ + def __init__(self, pattern, content): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class Sequence(Decoder): + """ + Sequence Decoder + + Args: + decoders (:obj:`List[Decoder]`) + The decoders that need to be chained + """ + def __init__(self, decoders): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class Strip(Decoder): + """ + Strip normalizer + Strips n left characters of each token, or n right characters of each token + """ + def __init__(self, content, left=0, right=0): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass + +class WordPiece(Decoder): + """ + WordPiece Decoder + + Args: + prefix (:obj:`str`, `optional`, defaults to :obj:`##`): + The prefix to use for subwords that are not a beginning-of-word + + cleanup (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, + and some abbreviated english forms. 
+ """ + def __init__(self, prefix="##", cleanup=True): + pass + + def decode(self, tokens): + """ + Decode the given list of tokens to a final string + + Args: + tokens (:obj:`List[str]`): + The list of tokens to decode + + Returns: + :obj:`str`: The decoded string + """ + pass diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py new file mode 100644 index 00000000..7e775892 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/__init__.py @@ -0,0 +1,6 @@ +from .base_tokenizer import BaseTokenizer +from .bert_wordpiece import BertWordPieceTokenizer +from .byte_level_bpe import ByteLevelBPETokenizer +from .char_level_bpe import CharBPETokenizer +from .sentencepiece_bpe import SentencePieceBPETokenizer +from .sentencepiece_unigram import SentencePieceUnigramTokenizer diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py new file mode 100644 index 00000000..4528dceb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/base_tokenizer.py @@ -0,0 +1,418 @@ +from typing import Dict, List, Optional, Tuple, Union + +from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer +from tokenizers.decoders import Decoder +from tokenizers.models import Model +from tokenizers.normalizers import Normalizer +from tokenizers.pre_tokenizers import PreTokenizer +from tokenizers.processors import PostProcessor + + +Offsets = Tuple[int, int] + + +class BaseTokenizer: + def __init__(self, tokenizer: Tokenizer, parameters=None): + self._tokenizer = tokenizer + self._parameters = parameters if parameters is not None else {} + + def __repr__(self): + return "Tokenizer(vocabulary_size={}, {})".format( + self._tokenizer.get_vocab_size(), + ", ".join(k + "=" + str(v) for k, v in self._parameters.items()), + ) + + def num_special_tokens_to_add(self, is_pair: bool) -> int: + """ + Return the number of special tokens that would be added for single/pair sentences. + :param is_pair: Boolean indicating if the input would be a single sentence or a pair + :return: + """ + return self._tokenizer.num_special_tokens_to_add(is_pair) + + def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]: + """Returns the vocabulary + + Args: + with_added_tokens: boolean: + Whether to include the added tokens in the vocabulary + + Returns: + The vocabulary + """ + return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens) + + def get_added_tokens_decoder(self) -> Dict[int, AddedToken]: + """Returns the added reverse vocabulary + + Returns: + The added vocabulary mapping ints to AddedTokens + """ + return self._tokenizer.get_added_tokens_decoder() + + def get_vocab_size(self, with_added_tokens: bool = True) -> int: + """Return the size of vocabulary, with or without added tokens. 
+ + Args: + with_added_tokens: (`optional`) bool: + Whether to count in added special tokens or not + + Returns: + Size of vocabulary + """ + return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens) + + def enable_padding( + self, + direction: Optional[str] = "right", + pad_to_multiple_of: Optional[int] = None, + pad_id: Optional[int] = 0, + pad_type_id: Optional[int] = 0, + pad_token: Optional[str] = "[PAD]", + length: Optional[int] = None, + ): + """Change the padding strategy + + Args: + direction: (`optional`) str: + Can be one of: `right` or `left` + + pad_to_multiple_of: (`optional`) unsigned int: + If specified, the padding length should always snap to the next multiple of + the given value. For example if we were going to pad with a length of 250 but + `pad_to_multiple_of=8` then we will pad to 256. + + pad_id: (`optional`) unsigned int: + The indice to be used when padding + + pad_type_id: (`optional`) unsigned int: + The type indice to be used when padding + + pad_token: (`optional`) str: + The pad token to be used when padding + + length: (`optional`) unsigned int: + If specified, the length at which to pad. If not specified + we pad using the size of the longest sequence in a batch + """ + return self._tokenizer.enable_padding( + direction=direction, + pad_to_multiple_of=pad_to_multiple_of, + pad_id=pad_id, + pad_type_id=pad_type_id, + pad_token=pad_token, + length=length, + ) + + def no_padding(self): + """Disable padding""" + return self._tokenizer.no_padding() + + @property + def padding(self) -> Optional[dict]: + """Get the current padding parameters + + Returns: + None if padding is disabled, a dict with the currently set parameters + if the padding is enabled. + """ + return self._tokenizer.padding + + def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"): + """Change the truncation options + + Args: + max_length: unsigned int: + The maximum length at which to truncate + + stride: (`optional`) unsigned int: + The length of the previous first sequence to be included + in the overflowing sequence + + strategy: (`optional`) str: + Can be one of `longest_first`, `only_first` or `only_second` + """ + return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy) + + def no_truncation(self): + """Disable truncation""" + return self._tokenizer.no_truncation() + + @property + def truncation(self) -> Optional[dict]: + """Get the current truncation parameters + + Returns: + None if truncation is disabled, a dict with the current truncation parameters if + truncation is enabled + """ + return self._tokenizer.truncation + + def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int: + """Add the given tokens to the vocabulary + + Args: + tokens: List[Union[str, AddedToken]]: + A list of tokens to add to the vocabulary. Each token can either be + a string, or an instance of AddedToken + + Returns: + The number of tokens that were added to the vocabulary + """ + return self._tokenizer.add_tokens(tokens) + + def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int: + """Add the given special tokens to the vocabulary, and treat them as special tokens. + + The special tokens will never be processed by the model, and will be + removed while decoding. + + Args: + tokens: List[Union[str, AddedToken]]: + A list of special tokens to add to the vocabulary. 
Each token can either be + a string, or an instance of AddedToken + + Returns: + The number of tokens that were added to the vocabulary + """ + return self._tokenizer.add_special_tokens(special_tokens) + + def normalize(self, sequence: str) -> str: + """Normalize the given sequence + + Args: + sequence: str: + The sequence to normalize + + Returns: + The normalized string + """ + return self._tokenizer.normalize(sequence) + + def encode( + self, + sequence: InputSequence, + pair: Optional[InputSequence] = None, + is_pretokenized: bool = False, + add_special_tokens: bool = True, + ) -> Encoding: + """Encode the given sequence and pair. This method can process raw text sequences as well + as already pre-tokenized sequences. + + Args: + sequence: InputSequence: + The sequence we want to encode. This sequence can be either raw text or + pre-tokenized, according to the `is_pretokenized` argument: + + - If `is_pretokenized=False`: `InputSequence` is expected to be `str` + - If `is_pretokenized=True`: `InputSequence` is expected to be + `Union[List[str], Tuple[str]]` + + is_pretokenized: bool: + Whether the input is already pre-tokenized. + + add_special_tokens: bool: + Whether to add the special tokens while encoding. + + Returns: + An Encoding + """ + if sequence is None: + raise ValueError("encode: `sequence` can't be `None`") + + return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens) + + def encode_batch( + self, + inputs: List[EncodeInput], + is_pretokenized: bool = False, + add_special_tokens: bool = True, + ) -> List[Encoding]: + """Encode the given inputs. This method accept both raw text sequences as well as already + pre-tokenized sequences. + + Args: + inputs: List[EncodeInput]: + A list of single sequences or pair sequences to encode. Each `EncodeInput` is + expected to be of the following form: + `Union[InputSequence, Tuple[InputSequence, InputSequence]]` + + Each `InputSequence` can either be raw text or pre-tokenized, + according to the `is_pretokenized` argument: + + - If `is_pretokenized=False`: `InputSequence` is expected to be `str` + - If `is_pretokenized=True`: `InputSequence` is expected to be + `Union[List[str], Tuple[str]]` + + is_pretokenized: bool: + Whether the input is already pre-tokenized. + + add_special_tokens: bool: + Whether to add the special tokens while encoding. + + Returns: + A list of Encoding + """ + + if inputs is None: + raise ValueError("encode_batch: `inputs` can't be `None`") + + return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens) + + def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str: + """Decode the given list of ids to a string sequence + + Args: + ids: List[unsigned int]: + A list of ids to be decoded + + skip_special_tokens: (`optional`) boolean: + Whether to remove all the special tokens from the output string + + Returns: + The decoded string + """ + if ids is None: + raise ValueError("None input is not valid. 
Should be a list of integers.") + + return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens) + + def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> str: + """Decode the list of sequences to a list of string sequences + + Args: + sequences: List[List[unsigned int]]: + A list of sequence of ids to be decoded + + skip_special_tokens: (`optional`) boolean: + Whether to remove all the special tokens from the output strings + + Returns: + A list of decoded strings + """ + if sequences is None: + raise ValueError("None input is not valid. Should be list of list of integers.") + + return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens) + + def token_to_id(self, token: str) -> Optional[int]: + """Convert the given token to its corresponding id + + Args: + token: str: + The token to convert + + Returns: + The corresponding id if it exists, None otherwise + """ + return self._tokenizer.token_to_id(token) + + def id_to_token(self, id: int) -> Optional[str]: + """Convert the given token id to its corresponding string + + Args: + token: id: + The token id to convert + + Returns: + The corresponding string if it exists, None otherwise + """ + return self._tokenizer.id_to_token(id) + + def save_model(self, directory: str, prefix: Optional[str] = None): + """Save the current model to the given directory + + Args: + directory: str: + A path to the destination directory + + prefix: (Optional) str: + An optional prefix, used to prefix each file name + """ + return self._tokenizer.model.save(directory, prefix=prefix) + + def save(self, path: str, pretty: bool = True): + """Save the current Tokenizer at the given path + + Args: + path: str: + A path to the destination Tokenizer file + """ + return self._tokenizer.save(path, pretty) + + def to_str(self, pretty: bool = False): + """Get a serialized JSON version of the Tokenizer as a str + + Args: + pretty: bool: + Whether the JSON string should be prettified + + Returns: + str + """ + return self._tokenizer.to_str(pretty) + + def post_process( + self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True + ) -> Encoding: + """Apply all the post-processing steps to the given encodings. + + The various steps are: + 1. Truncate according to global params (provided to `enable_truncation`) + 2. Apply the PostProcessor + 3. Pad according to global params. 
(provided to `enable_padding`) + + Args: + encoding: Encoding: + The main Encoding to post process + + pair: Optional[Encoding]: + An optional pair Encoding + + add_special_tokens: bool: + Whether to add special tokens + + Returns: + The resulting Encoding + """ + return self._tokenizer.post_process(encoding, pair, add_special_tokens) + + @property + def model(self) -> Model: + return self._tokenizer.model + + @model.setter + def model(self, model: Model): + self._tokenizer.model = model + + @property + def normalizer(self) -> Normalizer: + return self._tokenizer.normalizer + + @normalizer.setter + def normalizer(self, normalizer: Normalizer): + self._tokenizer.normalizer = normalizer + + @property + def pre_tokenizer(self) -> PreTokenizer: + return self._tokenizer.pre_tokenizer + + @pre_tokenizer.setter + def pre_tokenizer(self, pre_tokenizer: PreTokenizer): + self._tokenizer.pre_tokenizer = pre_tokenizer + + @property + def post_processor(self) -> PostProcessor: + return self._tokenizer.post_processor + + @post_processor.setter + def post_processor(self, post_processor: PostProcessor): + self._tokenizer.post_processor = post_processor + + @property + def decoder(self) -> Decoder: + return self._tokenizer.decoder + + @decoder.setter + def decoder(self, decoder: Decoder): + self._tokenizer.decoder = decoder diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py new file mode 100644 index 00000000..1f34e3ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/bert_wordpiece.py @@ -0,0 +1,151 @@ +from typing import Dict, Iterator, List, Optional, Union + +from tokenizers import AddedToken, Tokenizer, decoders, trainers +from tokenizers.models import WordPiece +from tokenizers.normalizers import BertNormalizer +from tokenizers.pre_tokenizers import BertPreTokenizer +from tokenizers.processors import BertProcessing + +from .base_tokenizer import BaseTokenizer + + +class BertWordPieceTokenizer(BaseTokenizer): + """Bert WordPiece Tokenizer""" + + def __init__( + self, + vocab: Optional[Union[str, Dict[str, int]]] = None, + unk_token: Union[str, AddedToken] = "[UNK]", + sep_token: Union[str, AddedToken] = "[SEP]", + cls_token: Union[str, AddedToken] = "[CLS]", + pad_token: Union[str, AddedToken] = "[PAD]", + mask_token: Union[str, AddedToken] = "[MASK]", + clean_text: bool = True, + handle_chinese_chars: bool = True, + strip_accents: Optional[bool] = None, + lowercase: bool = True, + wordpieces_prefix: str = "##", + ): + if vocab is not None: + tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token))) + else: + tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token))) + + # Let the tokenizer know about special tokens if they are part of the vocab + if tokenizer.token_to_id(str(unk_token)) is not None: + tokenizer.add_special_tokens([str(unk_token)]) + if tokenizer.token_to_id(str(sep_token)) is not None: + tokenizer.add_special_tokens([str(sep_token)]) + if tokenizer.token_to_id(str(cls_token)) is not None: + tokenizer.add_special_tokens([str(cls_token)]) + if tokenizer.token_to_id(str(pad_token)) is not None: + tokenizer.add_special_tokens([str(pad_token)]) + if tokenizer.token_to_id(str(mask_token)) is not None: + tokenizer.add_special_tokens([str(mask_token)]) + + tokenizer.normalizer = BertNormalizer( + clean_text=clean_text, + handle_chinese_chars=handle_chinese_chars, + strip_accents=strip_accents, + lowercase=lowercase, + ) + 
tokenizer.pre_tokenizer = BertPreTokenizer() + + if vocab is not None: + sep_token_id = tokenizer.token_to_id(str(sep_token)) + if sep_token_id is None: + raise TypeError("sep_token not found in the vocabulary") + cls_token_id = tokenizer.token_to_id(str(cls_token)) + if cls_token_id is None: + raise TypeError("cls_token not found in the vocabulary") + + tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id)) + tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix) + + parameters = { + "model": "BertWordPiece", + "unk_token": unk_token, + "sep_token": sep_token, + "cls_token": cls_token, + "pad_token": pad_token, + "mask_token": mask_token, + "clean_text": clean_text, + "handle_chinese_chars": handle_chinese_chars, + "strip_accents": strip_accents, + "lowercase": lowercase, + "wordpieces_prefix": wordpieces_prefix, + } + + super().__init__(tokenizer, parameters) + + @staticmethod + def from_file(vocab: str, **kwargs): + vocab = WordPiece.read_file(vocab) + return BertWordPieceTokenizer(vocab, **kwargs) + + def train( + self, + files: Union[str, List[str]], + vocab_size: int = 30000, + min_frequency: int = 2, + limit_alphabet: int = 1000, + initial_alphabet: List[str] = [], + special_tokens: List[Union[str, AddedToken]] = [ + "[PAD]", + "[UNK]", + "[CLS]", + "[SEP]", + "[MASK]", + ], + show_progress: bool = True, + wordpieces_prefix: str = "##", + ): + """Train the model using the given files""" + + trainer = trainers.WordPieceTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + limit_alphabet=limit_alphabet, + initial_alphabet=initial_alphabet, + special_tokens=special_tokens, + show_progress=show_progress, + continuing_subword_prefix=wordpieces_prefix, + ) + if isinstance(files, str): + files = [files] + self._tokenizer.train(files, trainer=trainer) + + def train_from_iterator( + self, + iterator: Union[Iterator[str], Iterator[Iterator[str]]], + vocab_size: int = 30000, + min_frequency: int = 2, + limit_alphabet: int = 1000, + initial_alphabet: List[str] = [], + special_tokens: List[Union[str, AddedToken]] = [ + "[PAD]", + "[UNK]", + "[CLS]", + "[SEP]", + "[MASK]", + ], + show_progress: bool = True, + wordpieces_prefix: str = "##", + length: Optional[int] = None, + ): + """Train the model using the given iterator""" + + trainer = trainers.WordPieceTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + limit_alphabet=limit_alphabet, + initial_alphabet=initial_alphabet, + special_tokens=special_tokens, + show_progress=show_progress, + continuing_subword_prefix=wordpieces_prefix, + ) + self._tokenizer.train_from_iterator( + iterator, + trainer=trainer, + length=length, + ) diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py new file mode 100644 index 00000000..c7e3dbc4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py @@ -0,0 +1,122 @@ +from typing import Dict, Iterator, List, Optional, Tuple, Union + +from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers +from tokenizers.models import BPE +from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str + +from .base_tokenizer import BaseTokenizer + + +class ByteLevelBPETokenizer(BaseTokenizer): + """ByteLevelBPETokenizer + + Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model + """ + + def __init__( + 
self, + vocab: Optional[Union[str, Dict[str, int]]] = None, + merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None, + add_prefix_space: bool = False, + lowercase: bool = False, + dropout: Optional[float] = None, + unicode_normalizer: Optional[str] = None, + continuing_subword_prefix: Optional[str] = None, + end_of_word_suffix: Optional[str] = None, + trim_offsets: bool = False, + ): + if vocab is not None and merges is not None: + tokenizer = Tokenizer( + BPE( + vocab, + merges, + dropout=dropout, + continuing_subword_prefix=continuing_subword_prefix or "", + end_of_word_suffix=end_of_word_suffix or "", + ) + ) + else: + tokenizer = Tokenizer(BPE()) + + # Check for Unicode normalization first (before everything else) + normalizers = [] + + if unicode_normalizer: + normalizers += [unicode_normalizer_from_str(unicode_normalizer)] + + if lowercase: + normalizers += [Lowercase()] + + # Create the normalizer structure + if len(normalizers) > 0: + if len(normalizers) > 1: + tokenizer.normalizer = Sequence(normalizers) + else: + tokenizer.normalizer = normalizers[0] + + tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets) + + parameters = { + "model": "ByteLevelBPE", + "add_prefix_space": add_prefix_space, + "lowercase": lowercase, + "dropout": dropout, + "unicode_normalizer": unicode_normalizer, + "continuing_subword_prefix": continuing_subword_prefix, + "end_of_word_suffix": end_of_word_suffix, + "trim_offsets": trim_offsets, + } + + super().__init__(tokenizer, parameters) + + @staticmethod + def from_file(vocab_filename: str, merges_filename: str, **kwargs): + vocab, merges = BPE.read_file(vocab_filename, merges_filename) + return ByteLevelBPETokenizer(vocab, merges, **kwargs) + + def train( + self, + files: Union[str, List[str]], + vocab_size: int = 30000, + min_frequency: int = 2, + show_progress: bool = True, + special_tokens: List[Union[str, AddedToken]] = [], + ): + """Train the model using the given files""" + + trainer = trainers.BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + show_progress=show_progress, + special_tokens=special_tokens, + initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), + ) + if isinstance(files, str): + files = [files] + self._tokenizer.train(files, trainer=trainer) + + def train_from_iterator( + self, + iterator: Union[Iterator[str], Iterator[Iterator[str]]], + vocab_size: int = 30000, + min_frequency: int = 2, + show_progress: bool = True, + special_tokens: List[Union[str, AddedToken]] = [], + length: Optional[int] = None, + ): + """Train the model using the given iterator""" + + trainer = trainers.BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + show_progress=show_progress, + special_tokens=special_tokens, + initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), + ) + self._tokenizer.train_from_iterator( + iterator, + trainer=trainer, + length=length, + ) diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py new file mode 100644 index 00000000..29ca5977 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py @@ -0,0 +1,150 @@ +from typing import Dict, Iterator, List, Optional, Tuple, Union + +from .. 
import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers +from ..models import BPE +from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str +from .base_tokenizer import BaseTokenizer + + +class CharBPETokenizer(BaseTokenizer): + """Original BPE Tokenizer + + Represents the BPE algorithm, as introduced by Rico Sennrich + (https://arxiv.org/abs/1508.07909) + + The defaults settings corresponds to OpenAI GPT BPE tokenizers and differs from the original + Sennrich subword-nmt implementation by the following options that you can deactivate: + - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by: + * removing any control characters and replacing all whitespaces by the classic one. + * handle chinese chars by putting spaces around them. + * strip all accents. + - spitting on punctuation in addition to whitespaces (deactivate it with + `split_on_whitespace_only=True`) + """ + + def __init__( + self, + vocab: Optional[Union[str, Dict[str, int]]] = None, + merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None, + unk_token: Union[str, AddedToken] = "<unk>", + suffix: str = "</w>", + dropout: Optional[float] = None, + lowercase: bool = False, + unicode_normalizer: Optional[str] = None, + bert_normalizer: bool = True, + split_on_whitespace_only: bool = False, + ): + if vocab is not None and merges is not None: + tokenizer = Tokenizer( + BPE( + vocab, + merges, + dropout=dropout, + unk_token=str(unk_token), + end_of_word_suffix=suffix, + ) + ) + else: + tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix)) + + if tokenizer.token_to_id(str(unk_token)) is not None: + tokenizer.add_special_tokens([str(unk_token)]) + + # Check for Unicode normalization first (before everything else) + normalizers = [] + + if unicode_normalizer: + normalizers += [unicode_normalizer_from_str(unicode_normalizer)] + + if bert_normalizer: + normalizers += [BertNormalizer(lowercase=False)] + + if lowercase: + normalizers += [Lowercase()] + + # Create the normalizer structure + if len(normalizers) > 0: + if len(normalizers) > 1: + tokenizer.normalizer = Sequence(normalizers) + else: + tokenizer.normalizer = normalizers[0] + + if split_on_whitespace_only: + tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit() + else: + tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() + + tokenizer.decoder = decoders.BPEDecoder(suffix=suffix) + + parameters = { + "model": "BPE", + "unk_token": unk_token, + "suffix": suffix, + "dropout": dropout, + "lowercase": lowercase, + "unicode_normalizer": unicode_normalizer, + "bert_normalizer": bert_normalizer, + "split_on_whitespace_only": split_on_whitespace_only, + } + + super().__init__(tokenizer, parameters) + + @staticmethod + def from_file(vocab_filename: str, merges_filename: str, **kwargs): + vocab, merges = BPE.read_file(vocab_filename, merges_filename) + return CharBPETokenizer(vocab, merges, **kwargs) + + def train( + self, + files: Union[str, List[str]], + vocab_size: int = 30000, + min_frequency: int = 2, + special_tokens: List[Union[str, AddedToken]] = ["<unk>"], + limit_alphabet: int = 1000, + initial_alphabet: List[str] = [], + suffix: Optional[str] = "</w>", + show_progress: bool = True, + ): + """Train the model using the given files""" + + trainer = trainers.BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + special_tokens=special_tokens, + limit_alphabet=limit_alphabet, + initial_alphabet=initial_alphabet, + 
end_of_word_suffix=suffix, + show_progress=show_progress, + ) + if isinstance(files, str): + files = [files] + self._tokenizer.train(files, trainer=trainer) + + def train_from_iterator( + self, + iterator: Union[Iterator[str], Iterator[Iterator[str]]], + vocab_size: int = 30000, + min_frequency: int = 2, + special_tokens: List[Union[str, AddedToken]] = ["<unk>"], + limit_alphabet: int = 1000, + initial_alphabet: List[str] = [], + suffix: Optional[str] = "</w>", + show_progress: bool = True, + length: Optional[int] = None, + ): + """Train the model using the given iterator""" + + trainer = trainers.BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + special_tokens=special_tokens, + limit_alphabet=limit_alphabet, + initial_alphabet=initial_alphabet, + end_of_word_suffix=suffix, + show_progress=show_progress, + ) + self._tokenizer.train_from_iterator( + iterator, + trainer=trainer, + length=length, + ) diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py new file mode 100644 index 00000000..cd550b41 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py @@ -0,0 +1,103 @@ +from typing import Dict, Iterator, List, Optional, Tuple, Union + +from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers +from tokenizers.models import BPE +from tokenizers.normalizers import NFKC + +from .base_tokenizer import BaseTokenizer + + +class SentencePieceBPETokenizer(BaseTokenizer): + """SentencePiece BPE Tokenizer + + Represents the BPE algorithm, with the pretokenization used by SentencePiece + """ + + def __init__( + self, + vocab: Optional[Union[str, Dict[str, int]]] = None, + merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None, + unk_token: Union[str, AddedToken] = "<unk>", + replacement: str = "▁", + add_prefix_space: bool = True, + dropout: Optional[float] = None, + fuse_unk: Optional[bool] = False, + ): + if vocab is not None and merges is not None: + tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk)) + else: + tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk)) + + if tokenizer.token_to_id(str(unk_token)) is not None: + tokenizer.add_special_tokens([str(unk_token)]) + + tokenizer.normalizer = NFKC() + prepend_scheme = "always" if add_prefix_space else "never" + tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + + parameters = { + "model": "SentencePieceBPE", + "unk_token": unk_token, + "replacement": replacement, + "add_prefix_space": add_prefix_space, + "dropout": dropout, + } + + super().__init__(tokenizer, parameters) + + @staticmethod + def from_file(vocab_filename: str, merges_filename: str, **kwargs): + vocab, merges = BPE.read_file(vocab_filename, merges_filename) + return SentencePieceBPETokenizer(vocab, merges, **kwargs) + + def train( + self, + files: Union[str, List[str]], + vocab_size: int = 30000, + min_frequency: int = 2, + special_tokens: List[Union[str, AddedToken]] = ["<unk>"], + limit_alphabet: int = 1000, + initial_alphabet: List[str] = [], + show_progress: bool = True, + ): + """Train the model using the given files""" + + trainer = trainers.BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + 
special_tokens=special_tokens, + limit_alphabet=limit_alphabet, + initial_alphabet=initial_alphabet, + show_progress=show_progress, + ) + if isinstance(files, str): + files = [files] + self._tokenizer.train(files, trainer=trainer) + + def train_from_iterator( + self, + iterator: Union[Iterator[str], Iterator[Iterator[str]]], + vocab_size: int = 30000, + min_frequency: int = 2, + special_tokens: List[Union[str, AddedToken]] = ["<unk>"], + limit_alphabet: int = 1000, + initial_alphabet: List[str] = [], + show_progress: bool = True, + length: Optional[int] = None, + ): + """Train the model using the given iterator""" + + trainer = trainers.BpeTrainer( + vocab_size=vocab_size, + min_frequency=min_frequency, + special_tokens=special_tokens, + limit_alphabet=limit_alphabet, + initial_alphabet=initial_alphabet, + show_progress=show_progress, + ) + self._tokenizer.train_from_iterator( + iterator, + trainer=trainer, + length=length, + ) diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py new file mode 100644 index 00000000..1237e85e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_unigram.py @@ -0,0 +1,196 @@ +import json +import os +from typing import Iterator, List, Optional, Union, Tuple + +from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers +from tokenizers.models import Unigram + +from .base_tokenizer import BaseTokenizer + + +class SentencePieceUnigramTokenizer(BaseTokenizer): + """SentencePiece Unigram Tokenizer + + Represents the Unigram algorithm, with the pretokenization used by SentencePiece + """ + + def __init__( + self, + vocab: Optional[List[Tuple[str, float]]] = None, + replacement: str = "▁", + add_prefix_space: bool = True, + ): + if vocab is not None: + # Let Unigram(..) fail if only one of them is None + tokenizer = Tokenizer(Unigram(vocab)) + else: + tokenizer = Tokenizer(Unigram()) + + tokenizer.normalizer = normalizers.Sequence( + [normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")] + ) + prepend_scheme = "always" if add_prefix_space else "never" + tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + + parameters = { + "model": "SentencePieceUnigram", + "replacement": replacement, + "add_prefix_space": add_prefix_space, + } + + super().__init__(tokenizer, parameters) + + def train( + self, + files: Union[str, List[str]], + vocab_size: int = 8000, + show_progress: bool = True, + special_tokens: Optional[List[Union[str, AddedToken]]] = None, + initial_alphabet: Optional[List[str]] = None, + unk_token: Optional[str] = None, + ): + """ + Train the model using the given files + + Args: + files (:obj:`List[str]`): + A list of path to the files that we should use for training + vocab_size (:obj:`int`): + The size of the final vocabulary, including all tokens and alphabet. + show_progress (:obj:`bool`): + Whether to show progress bars while training. + special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`): + A list of special tokens the model should know of. + initial_alphabet (:obj:`List[str]`, `optional`): + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. 
+ If the strings contain more than one character, only the first one + is kept. + unk_token (:obj:`str`, `optional`): + The unknown token to be used by the model. + """ + + if special_tokens is None: + special_tokens = [] + + if initial_alphabet is None: + initial_alphabet = [] + + trainer = trainers.UnigramTrainer( + vocab_size=vocab_size, + special_tokens=special_tokens, + show_progress=show_progress, + initial_alphabet=initial_alphabet, + unk_token=unk_token, + ) + + if isinstance(files, str): + files = [files] + self._tokenizer.train(files, trainer=trainer) + + def train_from_iterator( + self, + iterator: Union[Iterator[str], Iterator[Iterator[str]]], + vocab_size: int = 8000, + show_progress: bool = True, + special_tokens: Optional[List[Union[str, AddedToken]]] = None, + initial_alphabet: Optional[List[str]] = None, + unk_token: Optional[str] = None, + length: Optional[int] = None, + ): + """ + Train the model using the given iterator + + Args: + iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`): + Any iterator over strings or list of strings + vocab_size (:obj:`int`): + The size of the final vocabulary, including all tokens and alphabet. + show_progress (:obj:`bool`): + Whether to show progress bars while training. + special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`): + A list of special tokens the model should know of. + initial_alphabet (:obj:`List[str]`, `optional`): + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + unk_token (:obj:`str`, `optional`): + The unknown token to be used by the model. + length (:obj:`int`, `optional`): + The total number of sequences in the iterator. This is used to + provide meaningful progress tracking + """ + + if special_tokens is None: + special_tokens = [] + + if initial_alphabet is None: + initial_alphabet = [] + + trainer = trainers.UnigramTrainer( + vocab_size=vocab_size, + special_tokens=special_tokens, + show_progress=show_progress, + initial_alphabet=initial_alphabet, + unk_token=unk_token, + ) + + self._tokenizer.train_from_iterator( + iterator, + trainer=trainer, + length=length, + ) + + @staticmethod + def from_spm(filename: str): + try: + import sys + + sys.path.append(".") + + import sentencepiece_model_pb2 as model + except Exception: + raise Exception( + "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required." 
+ ) + + m = model.ModelProto() + m.ParseFromString(open(filename, "rb").read()) + + precompiled_charsmap = m.normalizer_spec.precompiled_charsmap + vocab = [(piece.piece, piece.score) for piece in m.pieces] + unk_id = m.trainer_spec.unk_id + model_type = m.trainer_spec.model_type + byte_fallback = m.trainer_spec.byte_fallback + if model_type != 1: + raise Exception( + "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" + ) + + replacement = "▁" + add_prefix_space = True + + tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback)) + + if precompiled_charsmap: + tokenizer.normalizer = normalizers.Sequence( + [ + normalizers.Precompiled(precompiled_charsmap), + normalizers.Replace(Regex(" {2,}"), " "), + ] + ) + else: + tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")]) + prepend_scheme = "always" if add_prefix_space else "never" + tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + + parameters = { + "model": "SentencePieceUnigram", + } + + obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters) + BaseTokenizer.__init__(obj, tokenizer, parameters) + return obj diff --git a/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py new file mode 100644 index 00000000..68ac211a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.py @@ -0,0 +1,8 @@ +# Generated content DO NOT EDIT +from .. import models + +Model = models.Model +BPE = models.BPE +Unigram = models.Unigram +WordLevel = models.WordLevel +WordPiece = models.WordPiece diff --git a/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi new file mode 100644 index 00000000..955b9a16 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi @@ -0,0 +1,591 @@ +# Generated content DO NOT EDIT +class Model: + """ + Base class for all models + + The model represents the actual tokenization algorithm. This is the part that + will contain and manage the learned vocabulary. + + This class cannot be constructed directly. Please use one of the concrete models. + """ + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass + + def id_to_token(self, id): + """ + Get the token associated to an ID + + Args: + id (:obj:`int`): + An ID to convert to a token + + Returns: + :obj:`str`: The token associated to the ID + """ + pass + + def save(self, folder, prefix): + """ + Save the current model + + Save the current model in the given folder, using the given prefix for the various + files that will get created. + Any file with the same name that already exists in this folder will be overwritten. 
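+
+        A minimal, illustrative sketch, assuming an existing
+        :class:`~tokenizers.Tokenizer` instance (the folder and prefix below
+        are example values only)::
+
+            saved_files = tokenizer.model.save("./my-model", prefix="my-tokenizer")
+            # saved_files is the list of file paths written into ./my-model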
+ + Args: + folder (:obj:`str`): + The path to the target folder in which to save the various files + + prefix (:obj:`str`, `optional`): + An optional prefix, used to prefix each file name + + Returns: + :obj:`List[str]`: The list of saved files + """ + pass + + def token_to_id(self, tokens): + """ + Get the ID associated to a token + + Args: + token (:obj:`str`): + A token to convert to an ID + + Returns: + :obj:`int`: The ID associated to the token + """ + pass + + def tokenize(self, sequence): + """ + Tokenize a sequence + + Args: + sequence (:obj:`str`): + A sequence to tokenize + + Returns: + A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens + """ + pass + +class BPE(Model): + """ + An implementation of the BPE (Byte-Pair Encoding) algorithm + + Args: + vocab (:obj:`Dict[str, int]`, `optional`): + A dictionnary of string keys and their ids :obj:`{"am": 0,...}` + + merges (:obj:`List[Tuple[str, str]]`, `optional`): + A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]` + + cache_capacity (:obj:`int`, `optional`): + The number of words that the BPE cache can contain. The cache allows + to speed-up the process by keeping the result of the merge operations + for a number of words. + + dropout (:obj:`float`, `optional`): + A float between 0 and 1 that represents the BPE dropout to use. + + unk_token (:obj:`str`, `optional`): + The unknown token to be used by the model. + + continuing_subword_prefix (:obj:`str`, `optional`): + The prefix to attach to subword units that don't represent a beginning of word. + + end_of_word_suffix (:obj:`str`, `optional`): + The suffix to attach to subword units that represent an end of word. + + fuse_unk (:obj:`bool`, `optional`): + Whether to fuse any subsequent unknown tokens into a single one + + byte_fallback (:obj:`bool`, `optional`): + Whether to use spm byte-fallback trick (defaults to False) + + ignore_merges (:obj:`bool`, `optional`): + Whether or not to match tokens with the vocab before using merges. + """ + def __init__( + self, + vocab=None, + merges=None, + cache_capacity=None, + dropout=None, + unk_token=None, + continuing_subword_prefix=None, + end_of_word_suffix=None, + fuse_unk=None, + byte_fallback=False, + ignore_merges=False, + ): + pass + + @staticmethod + def from_file(cls, vocab, merge, **kwargs): + """ + Instantiate a BPE model from the given files. + + This method is roughly equivalent to doing:: + + vocab, merges = BPE.read_file(vocab_filename, merges_filename) + bpe = BPE(vocab, merges) + + If you don't need to keep the :obj:`vocab, merges` values lying around, + this method is more optimized than manually calling + :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE` + + Args: + vocab (:obj:`str`): + The path to a :obj:`vocab.json` file + + merges (:obj:`str`): + The path to a :obj:`merges.txt` file + + Returns: + :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files + """ + pass + + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. 
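+
+        Illustrative sketch::
+
+            from tokenizers.models import BPE
+            trainer = BPE().get_trainer()  # a BpeTrainer matching this model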
+ + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass + + def id_to_token(self, id): + """ + Get the token associated to an ID + + Args: + id (:obj:`int`): + An ID to convert to a token + + Returns: + :obj:`str`: The token associated to the ID + """ + pass + + @staticmethod + def read_file(self, vocab, merges): + """ + Read a :obj:`vocab.json` and a :obj:`merges.txt` files + + This method provides a way to read and parse the content of these files, + returning the relevant data structures. If you want to instantiate some BPE models + from memory, this method gives you the expected input from the standard files. + + Args: + vocab (:obj:`str`): + The path to a :obj:`vocab.json` file + + merges (:obj:`str`): + The path to a :obj:`merges.txt` file + + Returns: + A :obj:`Tuple` with the vocab and the merges: + The vocabulary and merges loaded into memory + """ + pass + + def save(self, folder, prefix): + """ + Save the current model + + Save the current model in the given folder, using the given prefix for the various + files that will get created. + Any file with the same name that already exists in this folder will be overwritten. + + Args: + folder (:obj:`str`): + The path to the target folder in which to save the various files + + prefix (:obj:`str`, `optional`): + An optional prefix, used to prefix each file name + + Returns: + :obj:`List[str]`: The list of saved files + """ + pass + + def token_to_id(self, tokens): + """ + Get the ID associated to a token + + Args: + token (:obj:`str`): + A token to convert to an ID + + Returns: + :obj:`int`: The ID associated to the token + """ + pass + + def tokenize(self, sequence): + """ + Tokenize a sequence + + Args: + sequence (:obj:`str`): + A sequence to tokenize + + Returns: + A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens + """ + pass + +class Unigram(Model): + """ + An implementation of the Unigram algorithm + + Args: + vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`): + A list of vocabulary items and their relative score [("am", -0.2442),...] + """ + def __init__(self, vocab, unk_id, byte_fallback): + pass + + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass + + def id_to_token(self, id): + """ + Get the token associated to an ID + + Args: + id (:obj:`int`): + An ID to convert to a token + + Returns: + :obj:`str`: The token associated to the ID + """ + pass + + def save(self, folder, prefix): + """ + Save the current model + + Save the current model in the given folder, using the given prefix for the various + files that will get created. + Any file with the same name that already exists in this folder will be overwritten. 
+ + Args: + folder (:obj:`str`): + The path to the target folder in which to save the various files + + prefix (:obj:`str`, `optional`): + An optional prefix, used to prefix each file name + + Returns: + :obj:`List[str]`: The list of saved files + """ + pass + + def token_to_id(self, tokens): + """ + Get the ID associated to a token + + Args: + token (:obj:`str`): + A token to convert to an ID + + Returns: + :obj:`int`: The ID associated to the token + """ + pass + + def tokenize(self, sequence): + """ + Tokenize a sequence + + Args: + sequence (:obj:`str`): + A sequence to tokenize + + Returns: + A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens + """ + pass + +class WordLevel(Model): + """ + An implementation of the WordLevel algorithm + + Most simple tokenizer model based on mapping tokens to their corresponding id. + + Args: + vocab (:obj:`str`, `optional`): + A dictionnary of string keys and their ids :obj:`{"am": 0,...}` + + unk_token (:obj:`str`, `optional`): + The unknown token to be used by the model. + """ + def __init__(self, vocab, unk_token): + pass + + @staticmethod + def from_file(vocab, unk_token): + """ + Instantiate a WordLevel model from the given file + + This method is roughly equivalent to doing:: + + vocab = WordLevel.read_file(vocab_filename) + wordlevel = WordLevel(vocab) + + If you don't need to keep the :obj:`vocab` values lying around, this method is + more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to + initialize a :class:`~tokenizers.models.WordLevel` + + Args: + vocab (:obj:`str`): + The path to a :obj:`vocab.json` file + + Returns: + :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file + """ + pass + + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass + + def id_to_token(self, id): + """ + Get the token associated to an ID + + Args: + id (:obj:`int`): + An ID to convert to a token + + Returns: + :obj:`str`: The token associated to the ID + """ + pass + + @staticmethod + def read_file(vocab): + """ + Read a :obj:`vocab.json` + + This method provides a way to read and parse the content of a vocabulary file, + returning the relevant data structures. If you want to instantiate some WordLevel models + from memory, this method gives you the expected input from the standard files. + + Args: + vocab (:obj:`str`): + The path to a :obj:`vocab.json` file + + Returns: + :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` + """ + pass + + def save(self, folder, prefix): + """ + Save the current model + + Save the current model in the given folder, using the given prefix for the various + files that will get created. + Any file with the same name that already exists in this folder will be overwritten. 
+ + Args: + folder (:obj:`str`): + The path to the target folder in which to save the various files + + prefix (:obj:`str`, `optional`): + An optional prefix, used to prefix each file name + + Returns: + :obj:`List[str]`: The list of saved files + """ + pass + + def token_to_id(self, tokens): + """ + Get the ID associated to a token + + Args: + token (:obj:`str`): + A token to convert to an ID + + Returns: + :obj:`int`: The ID associated to the token + """ + pass + + def tokenize(self, sequence): + """ + Tokenize a sequence + + Args: + sequence (:obj:`str`): + A sequence to tokenize + + Returns: + A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens + """ + pass + +class WordPiece(Model): + """ + An implementation of the WordPiece algorithm + + Args: + vocab (:obj:`Dict[str, int]`, `optional`): + A dictionnary of string keys and their ids :obj:`{"am": 0,...}` + + unk_token (:obj:`str`, `optional`): + The unknown token to be used by the model. + + max_input_chars_per_word (:obj:`int`, `optional`): + The maximum number of characters to authorize in a single word. + """ + def __init__(self, vocab, unk_token, max_input_chars_per_word): + pass + + @staticmethod + def from_file(vocab, **kwargs): + """ + Instantiate a WordPiece model from the given file + + This method is roughly equivalent to doing:: + + vocab = WordPiece.read_file(vocab_filename) + wordpiece = WordPiece(vocab) + + If you don't need to keep the :obj:`vocab` values lying around, this method is + more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to + initialize a :class:`~tokenizers.models.WordPiece` + + Args: + vocab (:obj:`str`): + The path to a :obj:`vocab.txt` file + + Returns: + :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file + """ + pass + + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass + + def id_to_token(self, id): + """ + Get the token associated to an ID + + Args: + id (:obj:`int`): + An ID to convert to a token + + Returns: + :obj:`str`: The token associated to the ID + """ + pass + + @staticmethod + def read_file(vocab): + """ + Read a :obj:`vocab.txt` file + + This method provides a way to read and parse the content of a standard `vocab.txt` + file as used by the WordPiece Model, returning the relevant data structures. If you + want to instantiate some WordPiece models from memory, this method gives you the + expected input from the standard files. + + Args: + vocab (:obj:`str`): + The path to a :obj:`vocab.txt` file + + Returns: + :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` + """ + pass + + def save(self, folder, prefix): + """ + Save the current model + + Save the current model in the given folder, using the given prefix for the various + files that will get created. + Any file with the same name that already exists in this folder will be overwritten. 
+ + Args: + folder (:obj:`str`): + The path to the target folder in which to save the various files + + prefix (:obj:`str`, `optional`): + An optional prefix, used to prefix each file name + + Returns: + :obj:`List[str]`: The list of saved files + """ + pass + + def token_to_id(self, tokens): + """ + Get the ID associated to a token + + Args: + token (:obj:`str`): + A token to convert to an ID + + Returns: + :obj:`int`: The ID associated to the token + """ + pass + + def tokenize(self, sequence): + """ + Tokenize a sequence + + Args: + sequence (:obj:`str`): + A sequence to tokenize + + Returns: + A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens + """ + pass diff --git a/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py new file mode 100644 index 00000000..15a16f1e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py @@ -0,0 +1,29 @@ +from .. import normalizers + + +Normalizer = normalizers.Normalizer +BertNormalizer = normalizers.BertNormalizer +NFD = normalizers.NFD +NFKD = normalizers.NFKD +NFC = normalizers.NFC +NFKC = normalizers.NFKC +Sequence = normalizers.Sequence +Lowercase = normalizers.Lowercase +Prepend = normalizers.Prepend +Strip = normalizers.Strip +StripAccents = normalizers.StripAccents +Nmt = normalizers.Nmt +Precompiled = normalizers.Precompiled +Replace = normalizers.Replace + + +NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD} + + +def unicode_normalizer_from_str(normalizer: str) -> Normalizer: + if normalizer not in NORMALIZERS: + raise ValueError( + "{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys()) + ) + + return NORMALIZERS[normalizer]() diff --git a/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi new file mode 100644 index 00000000..507d4473 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi @@ -0,0 +1,595 @@ +# Generated content DO NOT EDIT +class Normalizer: + """ + Base class for all normalizers + + This class is not supposed to be instantiated directly. Instead, any implementation of a + Normalizer will return an instance of this class when instantiated. + """ + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class BertNormalizer(Normalizer): + """ + BertNormalizer + + Takes care of normalizing raw text before giving it to a Bert model. 
+ This includes cleaning the text, handling accents, chinese chars and lowercasing + + Args: + clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to clean the text, by removing any control characters + and replacing all whitespaces by the classic one. + + handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to handle chinese chars by putting spaces around them. + + strip_accents (:obj:`bool`, `optional`): + Whether to strip all accents. If this option is not specified (ie == None), + then it will be determined by the value for `lowercase` (as in the original Bert). + + lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase. + """ + def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class Lowercase(Normalizer): + """ + Lowercase Normalizer + """ + def __init__(self): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class NFC(Normalizer): + """ + NFC Unicode Normalizer + """ + def __init__(self): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. 
If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class NFD(Normalizer): + """ + NFD Unicode Normalizer + """ + def __init__(self): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class NFKC(Normalizer): + """ + NFKC Unicode Normalizer + """ + def __init__(self): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class NFKD(Normalizer): + """ + NFKD Unicode Normalizer + """ + def __init__(self): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. 
If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class Nmt(Normalizer): + """ + Nmt normalizer + """ + def __init__(self): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class Precompiled(Normalizer): + """ + Precompiled normalizer + Don't use manually it is used for compatiblity for SentencePiece. + """ + def __init__(self, precompiled_charsmap): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class Prepend(Normalizer): + """ + Prepend normalizer + """ + def __init__(self, prepend): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. 
If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class Replace(Normalizer): + """ + Replace normalizer + """ + def __init__(self, pattern, content): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class Sequence(Normalizer): + """ + Allows concatenating multiple other Normalizer as a Sequence. + All the normalizers run in sequence in the given order + + Args: + normalizers (:obj:`List[Normalizer]`): + A list of Normalizer to be run as a sequence + """ + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class Strip(Normalizer): + """ + Strip normalizer + """ + def __init__(self, left=True, right=True): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. 
If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass + +class StripAccents(Normalizer): + """ + StripAccents normalizer + """ + def __init__(self): + pass + + def normalize(self, normalized): + """ + Normalize a :class:`~tokenizers.NormalizedString` in-place + + This method allows to modify a :class:`~tokenizers.NormalizedString` to + keep track of the alignment information. If you just want to see the result + of the normalization on a raw string, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize_str` + + Args: + normalized (:class:`~tokenizers.NormalizedString`): + The normalized string on which to apply this + :class:`~tokenizers.normalizers.Normalizer` + """ + pass + + def normalize_str(self, sequence): + """ + Normalize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment + information. If you need to get/convert offsets, you can use + :meth:`~tokenizers.normalizers.Normalizer.normalize` + + Args: + sequence (:obj:`str`): + A string to normalize + + Returns: + :obj:`str`: A string after normalization + """ + pass diff --git a/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py new file mode 100644 index 00000000..48277f0d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.py @@ -0,0 +1,15 @@ +# Generated content DO NOT EDIT +from .. import pre_tokenizers + +PreTokenizer = pre_tokenizers.PreTokenizer +BertPreTokenizer = pre_tokenizers.BertPreTokenizer +ByteLevel = pre_tokenizers.ByteLevel +CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit +Digits = pre_tokenizers.Digits +Metaspace = pre_tokenizers.Metaspace +Punctuation = pre_tokenizers.Punctuation +Sequence = pre_tokenizers.Sequence +Split = pre_tokenizers.Split +UnicodeScripts = pre_tokenizers.UnicodeScripts +Whitespace = pre_tokenizers.Whitespace +WhitespaceSplit = pre_tokenizers.WhitespaceSplit diff --git a/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi new file mode 100644 index 00000000..d81d3802 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/pre_tokenizers/__init__.pyi @@ -0,0 +1,607 @@ +# Generated content DO NOT EDIT +class PreTokenizer: + """ + Base class for all pre-tokenizers + + This class is not supposed to be instantiated directly. Instead, any implementation of a + PreTokenizer will return an instance of this class when instantiated. 
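+
+    The effect of any pre-tokenizer can be inspected with
+    :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
+    (illustrative output)::
+
+        from tokenizers.pre_tokenizers import Whitespace
+        Whitespace().pre_tokenize_str("Hello world!")
+        # [('Hello', (0, 5)), ('world', (6, 11)), ('!', (11, 12))]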
+ """ + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class BertPreTokenizer(PreTokenizer): + """ + BertPreTokenizer + + This pre-tokenizer splits tokens on spaces, and also on punctuation. + Each occurence of a punctuation character will be treated separately. + """ + def __init__(self): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class ByteLevel(PreTokenizer): + """ + ByteLevel PreTokenizer + + This pre-tokenizer takes care of replacing all bytes of the given string + with a corresponding representation, as well as splitting into words. + + Args: + add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to add a space to the first word if there isn't already one. This + lets us treat `hello` exactly like `say hello`. + use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`): + Set this to :obj:`False` to prevent this `pre_tokenizer` from using + the GPT2 specific regexp for spliting on whitespace. + """ + def __init__(self, add_prefix_space=True, use_regex=True): + pass + + @staticmethod + def alphabet(): + """ + Returns the alphabet used by this PreTokenizer. 
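+
+        Illustrative check::
+
+            assert len(ByteLevel.alphabet()) == 256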
+ + Since the ByteLevel works as its name suggests, at the byte level, it + encodes each byte value to a unique visible character. This means that there is a + total of 256 different characters composing this alphabet. + + Returns: + :obj:`List[str]`: A list of characters that compose the alphabet + """ + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class CharDelimiterSplit(PreTokenizer): + """ + This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)` + + Args: + delimiter: str: + The delimiter char that will be used to split input + """ + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. 
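A hedged sketch of the ByteLevel pre-tokenizer and its 256-character alphabet mentioned above; the 'Ġ' stand-in for a space and the exact offsets depend on the library's byte-to-character mapping:

    from tokenizers import pre_tokenizers

    byte_level = pre_tokenizers.ByteLevel(add_prefix_space=True)
    print(len(pre_tokenizers.ByteLevel.alphabet()))  # 256, one visible character per byte value
    print(byte_level.pre_tokenize_str("Hello there"))
    # roughly: [('ĠHello', (0, 5)), ('Ġthere', (5, 11))]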
If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class Digits(PreTokenizer): + """ + This pre-tokenizer simply splits using the digits in separate tokens + + Args: + individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, digits will each be separated as follows:: + + "Call 123 please" -> "Call ", "1", "2", "3", " please" + + If set to False, digits will grouped as follows:: + + "Call 123 please" -> "Call ", "123", " please" + """ + def __init__(self, individual_digits=False): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class Metaspace(PreTokenizer): + """ + Metaspace pre-tokenizer + + This pre-tokenizer replaces any whitespace by the provided replacement character. + It then tries to split on these spaces. + + Args: + replacement (:obj:`str`, `optional`, defaults to :obj:`▁`): + The replacement character. Must be exactly one character. By default we + use the `▁` (U+2581) meta symbol (Same as in SentencePiece). + + prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`): + Whether to add a space to the first word if there isn't already one. This + lets us treat `hello` exactly like `say hello`. + Choices: "always", "never", "first". First means the space is only added on the first + token (relevant when special tokens are used or other pre_tokenizer are used). + + """ + def __init__(self, replacement="_", prepend_scheme="always", split=True): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. 
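The Digits behaviour documented above, shown end to end; the offsets are indicative only:

    from tokenizers import pre_tokenizers

    print(pre_tokenizers.Digits(individual_digits=True).pre_tokenize_str("Call 123 please"))
    # roughly: [('Call ', (0, 5)), ('1', (5, 6)), ('2', (6, 7)), ('3', (7, 8)), (' please', (8, 15))]
    print(pre_tokenizers.Digits(individual_digits=False).pre_tokenize_str("Call 123 please"))
    # roughly: [('Call ', (0, 5)), ('123', (5, 8)), (' please', (8, 15))]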
If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class Punctuation(PreTokenizer): + """ + This pre-tokenizer simply splits on punctuation as individual characters. + + Args: + behavior (:class:`~tokenizers.SplitDelimiterBehavior`): + The behavior to use when splitting. + Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next", + "contiguous" + """ + def __init__(self, behavior="isolated"): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class Sequence(PreTokenizer): + """ + This pre-tokenizer composes other pre_tokenizers and applies them in sequence + """ + def __init__(self, pretokenizers): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. 
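A short sketch of the Punctuation pre-tokenizer with its default "isolated" behavior; the output shown is indicative:

    from tokenizers import pre_tokenizers

    print(pre_tokenizers.Punctuation(behavior="isolated").pre_tokenize_str("Hi, there."))
    # roughly: [('Hi', (0, 2)), (',', (2, 3)), (' there', (3, 9)), ('.', (9, 10))]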
If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class Split(PreTokenizer): + """ + Split PreTokenizer + + This versatile pre-tokenizer splits using the provided pattern and + according to the provided behavior. The pattern can be inverted by + making use of the invert flag. + + Args: + pattern (:obj:`str` or :class:`~tokenizers.Regex`): + A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex` + + behavior (:class:`~tokenizers.SplitDelimiterBehavior`): + The behavior to use when splitting. + Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", + "contiguous" + + invert (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to invert the pattern. + """ + def __init__(self, pattern, behavior, invert=False): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class UnicodeScripts(PreTokenizer): + """ + This pre-tokenizer splits on characters that belong to different language family + It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt + Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too. + This mimicks SentencePiece Unigram implementation. 
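Combining the Split and Sequence pre-tokenizers described above; this assumes the string form of the behavior values is accepted, as the default arguments elsewhere in this stub suggest:

    from tokenizers import Regex, pre_tokenizers

    splitter = pre_tokenizers.Sequence([
        pre_tokenizers.Split(Regex(r"\s+"), behavior="removed"),  # drop runs of whitespace
        pre_tokenizers.Digits(individual_digits=False),           # then keep digit groups together
    ])
    print(splitter.pre_tokenize_str("Call 123 please"))
    # roughly: [('Call', (0, 4)), ('123', (5, 8)), ('please', (9, 15))]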
+ """ + def __init__(self): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class Whitespace(PreTokenizer): + """ + This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` + """ + def __init__(self): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass + +class WhitespaceSplit(PreTokenizer): + """ + This pre-tokenizer simply splits on the whitespace. Works like `.split()` + """ + def __init__(self): + pass + + def pre_tokenize(self, pretok): + """ + Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place + + This method allows to modify a :class:`~tokenizers.PreTokenizedString` to + keep track of the pre-tokenization, and leverage the capabilities of the + :class:`~tokenizers.PreTokenizedString`. 
If you just want to see the result of + the pre-tokenization of a raw string, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str` + + Args: + pretok (:class:`~tokenizers.PreTokenizedString): + The pre-tokenized string on which to apply this + :class:`~tokenizers.pre_tokenizers.PreTokenizer` + """ + pass + + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given string + + This method provides a way to visualize the effect of a + :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the + alignment, nor does it provide all the capabilities of the + :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use + :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize` + + Args: + sequence (:obj:`str`): + A string to pre-tokeize + + Returns: + :obj:`List[Tuple[str, Offsets]]`: + A list of tuple with the pre-tokenized parts and their offsets + """ + pass diff --git a/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py new file mode 100644 index 00000000..06d12403 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.py @@ -0,0 +1,9 @@ +# Generated content DO NOT EDIT +from .. import processors + +PostProcessor = processors.PostProcessor +BertProcessing = processors.BertProcessing +ByteLevel = processors.ByteLevel +RobertaProcessing = processors.RobertaProcessing +Sequence = processors.Sequence +TemplateProcessing = processors.TemplateProcessing diff --git a/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi new file mode 100644 index 00000000..5136d02b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/processors/__init__.pyi @@ -0,0 +1,342 @@ +# Generated content DO NOT EDIT +class PostProcessor: + """ + Base class for all post-processors + + This class is not supposed to be instantiated directly. Instead, any implementation of + a PostProcessor will return an instance of this class when instantiated. + """ + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + + Args: + is_pair (:obj:`bool`): + Whether the input would be a pair of sequences + + Returns: + :obj:`int`: The number of tokens to add + """ + pass + + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one + + Args: + encoding (:class:`~tokenizers.Encoding`): + The encoding for the first sequence + + pair (:class:`~tokenizers.Encoding`, `optional`): + The encoding for the pair sequence + + add_special_tokens (:obj:`bool`): + Whether to add the special tokens + + Return: + :class:`~tokenizers.Encoding`: The final encoding + """ + pass + +class BertProcessing(PostProcessor): + """ + This post-processor takes care of adding the special tokens needed by + a Bert model: + + - a SEP token + - a CLS token + + Args: + sep (:obj:`Tuple[str, int]`): + A tuple with the string representation of the SEP token, and its id + + cls (:obj:`Tuple[str, int]`): + A tuple with the string representation of the CLS token, and its id + """ + def __init__(self, sep, cls): + pass + + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. 
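A hedged sketch of BertProcessing; the ids 101 and 102 are placeholders and must match the vocabulary actually in use:

    from tokenizers import processors

    post = processors.BertProcessing(("[SEP]", 102), ("[CLS]", 101))  # sep first, then cls
    print(post.num_special_tokens_to_add(False))  # 2: [CLS] ... [SEP]
    print(post.num_special_tokens_to_add(True))   # 3: [CLS] ... [SEP] ... [SEP]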
+ + Args: + is_pair (:obj:`bool`): + Whether the input would be a pair of sequences + + Returns: + :obj:`int`: The number of tokens to add + """ + pass + + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one + + Args: + encoding (:class:`~tokenizers.Encoding`): + The encoding for the first sequence + + pair (:class:`~tokenizers.Encoding`, `optional`): + The encoding for the pair sequence + + add_special_tokens (:obj:`bool`): + Whether to add the special tokens + + Return: + :class:`~tokenizers.Encoding`: The final encoding + """ + pass + +class ByteLevel(PostProcessor): + """ + This post-processor takes care of trimming the offsets. + + By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't + want the offsets to include these whitespaces, then this PostProcessor must be used. + + Args: + trim_offsets (:obj:`bool`): + Whether to trim the whitespaces from the produced offsets. + """ + def __init__(self, trim_offsets=True): + pass + + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + + Args: + is_pair (:obj:`bool`): + Whether the input would be a pair of sequences + + Returns: + :obj:`int`: The number of tokens to add + """ + pass + + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one + + Args: + encoding (:class:`~tokenizers.Encoding`): + The encoding for the first sequence + + pair (:class:`~tokenizers.Encoding`, `optional`): + The encoding for the pair sequence + + add_special_tokens (:obj:`bool`): + Whether to add the special tokens + + Return: + :class:`~tokenizers.Encoding`: The final encoding + """ + pass + +class RobertaProcessing(PostProcessor): + """ + This post-processor takes care of adding the special tokens needed by + a Roberta model: + + - a SEP token + - a CLS token + + It also takes care of trimming the offsets. + By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't + want the offsets to include these whitespaces, then this PostProcessor should be initialized + with :obj:`trim_offsets=True` + + Args: + sep (:obj:`Tuple[str, int]`): + A tuple with the string representation of the SEP token, and its id + + cls (:obj:`Tuple[str, int]`): + A tuple with the string representation of the CLS token, and its id + + trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to trim the whitespaces from the produced offsets. + + add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether the add_prefix_space option was enabled during pre-tokenization. This + is relevant because it defines the way the offsets are trimmed out. + """ + def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True): + pass + + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. 
+ + Args: + is_pair (:obj:`bool`): + Whether the input would be a pair of sequences + + Returns: + :obj:`int`: The number of tokens to add + """ + pass + + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one + + Args: + encoding (:class:`~tokenizers.Encoding`): + The encoding for the first sequence + + pair (:class:`~tokenizers.Encoding`, `optional`): + The encoding for the pair sequence + + add_special_tokens (:obj:`bool`): + Whether to add the special tokens + + Return: + :class:`~tokenizers.Encoding`: The final encoding + """ + pass + +class Sequence(PostProcessor): + """ + Sequence Processor + + Args: + processors (:obj:`List[PostProcessor]`) + The processors that need to be chained + """ + def __init__(self, processors): + pass + + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + + Args: + is_pair (:obj:`bool`): + Whether the input would be a pair of sequences + + Returns: + :obj:`int`: The number of tokens to add + """ + pass + + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one + + Args: + encoding (:class:`~tokenizers.Encoding`): + The encoding for the first sequence + + pair (:class:`~tokenizers.Encoding`, `optional`): + The encoding for the pair sequence + + add_special_tokens (:obj:`bool`): + Whether to add the special tokens + + Return: + :class:`~tokenizers.Encoding`: The final encoding + """ + pass + +class TemplateProcessing(PostProcessor): + """ + Provides a way to specify templates in order to add the special tokens to each + input sequence as relevant. + + Let's take :obj:`BERT` tokenizer as an example. It uses two special tokens, used to + delimitate each sequence. :obj:`[CLS]` is always used at the beginning of the first + sequence, and :obj:`[SEP]` is added at the end of both the first, and the pair + sequences. The final result looks like this: + + - Single sequence: :obj:`[CLS] Hello there [SEP]` + - Pair sequences: :obj:`[CLS] My name is Anthony [SEP] What is my name? [SEP]` + + With the type ids as following:: + + [CLS] ... [SEP] ... [SEP] + 0 0 0 1 1 + + You can achieve such behavior using a TemplateProcessing:: + + TemplateProcessing( + single="[CLS] $0 [SEP]", + pair="[CLS] $A [SEP] $B:1 [SEP]:1", + special_tokens=[("[CLS]", 1), ("[SEP]", 0)], + ) + + In this example, each input sequence is identified using a ``$`` construct. This identifier + lets us specify each input sequence, and the type_id to use. When nothing is specified, + it uses the default values. Here are the different ways to specify it: + + - Specifying the sequence, with default ``type_id == 0``: ``$A`` or ``$B`` + - Specifying the `type_id` with default ``sequence == A``: ``$0``, ``$1``, ``$2``, ... + - Specifying both: ``$A:0``, ``$B:1``, ... + + The same construct is used for special tokens: ``<identifier>(:<type_id>)?``. + + **Warning**: You must ensure that you are giving the correct tokens/ids as these + will be added to the Encoding without any further check. If the given ids correspond + to something totally different in a `Tokenizer` using this `PostProcessor`, it + might lead to unexpected results. 
+ + Args: + single (:obj:`Template`): + The template used for single sequences + + pair (:obj:`Template`): + The template used when both sequences are specified + + special_tokens (:obj:`Tokens`): + The list of special tokens used in each sequences + + Types: + + Template (:obj:`str` or :obj:`List`): + - If a :obj:`str` is provided, the whitespace is used as delimiter between tokens + - If a :obj:`List[str]` is provided, a list of tokens + + Tokens (:obj:`List[Union[Tuple[int, str], Tuple[str, int], dict]]`): + - A :obj:`Tuple` with both a token and its associated ID, in any order + - A :obj:`dict` with the following keys: + - "id": :obj:`str` => The special token id, as specified in the Template + - "ids": :obj:`List[int]` => The associated IDs + - "tokens": :obj:`List[str]` => The associated tokens + + The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have + the same length. + """ + def __init__(self, single, pair, special_tokens): + pass + + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + + Args: + is_pair (:obj:`bool`): + Whether the input would be a pair of sequences + + Returns: + :obj:`int`: The number of tokens to add + """ + pass + + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one + + Args: + encoding (:class:`~tokenizers.Encoding`): + The encoding for the first sequence + + pair (:class:`~tokenizers.Encoding`, `optional`): + The encoding for the pair sequence + + add_special_tokens (:obj:`bool`): + Whether to add the special tokens + + Return: + :class:`~tokenizers.Encoding`: The final encoding + """ + pass diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so Binary files differnew file mode 100755 index 00000000..563e2885 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py new file mode 100644 index 00000000..f941e2ed --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/__init__.py @@ -0,0 +1 @@ +from .visualizer import Annotation, EncodingVisualizer diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css new file mode 100644 index 00000000..f54fde45 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer-styles.css @@ -0,0 +1,170 @@ +.tokenized-text { + width:100%; + padding:2rem; + max-height: 400px; + overflow-y: auto; + box-sizing:border-box; + line-height:4rem; /* Lots of space between lines */ + font-family: "Roboto Light", "Ubuntu Light", "Ubuntu", monospace; + box-shadow: 2px 2px 2px rgba(0,0,0,0.2); + background-color: rgba(0,0,0,0.01); + letter-spacing:2px; /* Give some extra separation between chars */ +} +.non-token{ + /* White space and other things the tokenizer ignores*/ + white-space: pre; + letter-spacing:4px; + border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/ + border-bottom:1px solid #A0A0A0; + line-height: 1rem; + height: calc(100% - 2px); +} + +.token { + white-space: pre; + position:relative; + color:black; + letter-spacing:2px; +} + 
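A sketch of wiring the BERT-style TemplateProcessing described above onto a tokenizer; "tokenizer.json" is a hypothetical file and the ids must match its vocabulary:

    from tokenizers import Tokenizer, processors

    tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical serialized tokenizer
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )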
+.annotation{ + white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */ + border-radius:4px; + position:relative; + width:fit-content; +} +.annotation:before { + /*The before holds the text and the after holds the background*/ + z-index:1000; /* Make sure this is above the background */ + content:attr(data-label); /* The annotations label is on a data attribute */ + color:white; + position:absolute; + font-size:1rem; + text-align:center; + font-weight:bold; + + top:1.75rem; + line-height:0; + left:0; + width:100%; + padding:0.5rem 0; + /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/ + overflow: hidden; + white-space: nowrap; + text-overflow:ellipsis; +} + +.annotation:after { + content:attr(data-label); /* The content defines the width of the annotation*/ + position:absolute; + font-size:0.75rem; + text-align:center; + font-weight:bold; + text-overflow:ellipsis; + top:1.75rem; + line-height:0; + overflow: hidden; + white-space: nowrap; + + left:0; + width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/ + + padding:0.5rem 0; + /* Nast hack below: + We set the annotations color in code because we don't know the colors at css time. + But you can't pass a color as a data attribute to get it into the pseudo element (this thing) + So to get around that, annotations have the color set on them with a style attribute and then we + can get the color with currentColor. + Annotations wrap tokens and tokens set the color back to black + */ + background-color: currentColor; +} +.annotation:hover::after, .annotation:hover::before{ + /* When the user hovers over an annotation expand the label to display in full + */ + min-width: fit-content; +} + +.annotation:hover{ + /* Emphasize the annotation start end with a border on hover*/ + border-color: currentColor; + border: 2px solid; +} +.special-token:not(:empty){ + /* + A none empty special token is like UNK (as opposed to CLS which has no representation in the text ) + */ + position:relative; +} +.special-token:empty::before{ + /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/ + content:attr(data-stok); + background:#202020; + font-size:0.75rem; + color:white; + margin: 0 0.25rem; + padding: 0.25rem; + border-radius:4px +} + +.special-token:not(:empty):before { + /* Special tokens that have text (UNK) are displayed above the actual text*/ + content:attr(data-stok); + position:absolute; + bottom:1.75rem; + min-width:100%; + width:100%; + height:1rem; + line-height:1rem; + font-size:1rem; + text-align:center; + color:white; + font-weight:bold; + background:#202020; + border-radius:10%; +} +/* +We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations +instead we apply even and odd class at generation time and color them that way + */ +.even-token{ + background:#DCDCDC ; + border: 1px solid #DCDCDC; +} +.odd-token{ + background:#A0A0A0; + border: 1px solid #A0A0A0; +} +.even-token.multi-token,.odd-token.multi-token{ + background: repeating-linear-gradient( + 45deg, + transparent, + transparent 1px, + #ccc 1px, + #ccc 1px + ), + /* on "bottom" */ + linear-gradient( + to bottom, + #FFB6C1, + #999 + ); +} + +.multi-token:hover::after { + content:"This char has more than 1 token"; /* The content defines the width of the annotation*/ + color:white; + background-color: black; + position:absolute; 
+ font-size:0.75rem; + text-align:center; + font-weight:bold; + text-overflow:ellipsis; + top:1.75rem; + line-height:0; + overflow: hidden; + white-space: nowrap; + left:0; + width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/ + padding:0.5rem 0; +} diff --git a/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py new file mode 100644 index 00000000..c988a648 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py @@ -0,0 +1,403 @@ +import itertools +import os +import re +from string import Template +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple + +from tokenizers import Encoding, Tokenizer + + +dirname = os.path.dirname(__file__) +css_filename = os.path.join(dirname, "visualizer-styles.css") +with open(css_filename) as f: + css = f.read() + + +class Annotation: + start: int + end: int + label: int + + def __init__(self, start: int, end: int, label: str): + self.start = start + self.end = end + self.label = label + + +AnnotationList = List[Annotation] +PartialIntList = List[Optional[int]] + + +class CharStateKey(NamedTuple): + token_ix: Optional[int] + anno_ix: Optional[int] + + +class CharState: + char_ix: Optional[int] + + def __init__(self, char_ix): + self.char_ix = char_ix + + self.anno_ix: Optional[int] = None + self.tokens: List[int] = [] + + @property + def token_ix(self): + return self.tokens[0] if len(self.tokens) > 0 else None + + @property + def is_multitoken(self): + """ + BPE tokenizers can output more than one token for a char + """ + return len(self.tokens) > 1 + + def partition_key(self) -> CharStateKey: + return CharStateKey( + token_ix=self.token_ix, + anno_ix=self.anno_ix, + ) + + +class Aligned: + pass + + +class EncodingVisualizer: + """ + Build an EncodingVisualizer + + Args: + + tokenizer (:class:`~tokenizers.Tokenizer`): + A tokenizer instance + + default_to_notebook (:obj:`bool`): + Whether to render html output in a notebook by default + + annotation_converter (:obj:`Callable`, `optional`): + An optional (lambda) function that takes an annotation in any format and returns + an Annotation object + """ + + unk_token_regex = re.compile("(.{1}\b)?(unk|oov)(\b.{1})?", flags=re.IGNORECASE) + + def __init__( + self, + tokenizer: Tokenizer, + default_to_notebook: bool = True, + annotation_converter: Optional[Callable[[Any], Annotation]] = None, + ): + if default_to_notebook: + try: + from IPython.core.display import HTML, display + except ImportError: + raise Exception( + """We couldn't import IPython utils for html display. + Are you running in a notebook? + You can also pass `default_to_notebook=False` to get back raw HTML + """ + ) + + self.tokenizer = tokenizer + self.default_to_notebook = default_to_notebook + self.annotation_coverter = annotation_converter + pass + + def __call__( + self, + text: str, + annotations: AnnotationList = [], + default_to_notebook: Optional[bool] = None, + ) -> Optional[str]: + """ + Build a visualization of the given text + + Args: + text (:obj:`str`): + The text to tokenize + + annotations (:obj:`List[Annotation]`, `optional`): + An optional list of annotations of the text. The can either be an annotation class + or anything else if you instantiated the visualizer with a converter function + + default_to_notebook (:obj:`bool`, `optional`, defaults to `False`): + If True, will render the html in a notebook. Otherwise returns an html string. 
+ + Returns: + The HTML string if default_to_notebook is False, otherwise (default) returns None and + renders the HTML in the notebook + + """ + final_default_to_notebook = self.default_to_notebook + if default_to_notebook is not None: + final_default_to_notebook = default_to_notebook + if final_default_to_notebook: + try: + from IPython.core.display import HTML, display + except ImportError: + raise Exception( + """We couldn't import IPython utils for html display. + Are you running in a notebook?""" + ) + if self.annotation_coverter is not None: + annotations = list(map(self.annotation_coverter, annotations)) + encoding = self.tokenizer.encode(text) + html = EncodingVisualizer.__make_html(text, encoding, annotations) + if final_default_to_notebook: + display(HTML(html)) + else: + return html + + @staticmethod + def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]: + """ + Generates a color palette for all the labels in a given set of annotations + + Args: + annotations (:obj:`Annotation`): + A list of annotations + + Returns: + :obj:`dict`: A dictionary mapping labels to colors in HSL format + """ + if len(annotations) == 0: + return {} + labels = set(map(lambda x: x.label, annotations)) + num_labels = len(labels) + h_step = int(255 / num_labels) + if h_step < 20: + h_step = 20 + s = 32 + l = 64 # noqa: E741 + h = 10 + colors = {} + + for label in sorted(labels): # sort so we always get the same colors for a given set of labels + colors[label] = f"hsl({h},{s}%,{l}%" + h += h_step + return colors + + @staticmethod + def consecutive_chars_to_html( + consecutive_chars_list: List[CharState], + text: str, + encoding: Encoding, + ): + """ + Converts a list of "consecutive chars" into a single HTML element. + Chars are consecutive if they fall under the same word, token and annotation. + The CharState class is a named tuple with a "partition_key" method that makes it easy to + compare if two chars are consecutive. + + Args: + consecutive_chars_list (:obj:`List[CharState]`): + A list of CharStates that have been grouped together + + text (:obj:`str`): + The original text being processed + + encoding (:class:`~tokenizers.Encoding`): + The encoding returned from the tokenizer + + Returns: + :obj:`str`: The HTML span for a set of consecutive chars + """ + first = consecutive_chars_list[0] + if first.char_ix is None: + # its a special token + stoken = encoding.tokens[first.token_ix] + # special tokens are represented as empty spans. We use the data attribute and css + # magic to display it + return f'<span class="special-token" data-stoken={stoken}></span>' + # We're not in a special token so this group has a start and end. + last = consecutive_chars_list[-1] + start = first.char_ix + end = last.char_ix + 1 + span_text = text[start:end] + css_classes = [] # What css classes will we apply on the resulting span + data_items = {} # What data attributes will we apply on the result span + if first.token_ix is not None: + # We can either be in a token or not (e.g. in white space) + css_classes.append("token") + if first.is_multitoken: + css_classes.append("multi-token") + if first.token_ix % 2: + # We use this to color alternating tokens. 
+ # A token might be split by an annotation that ends in the middle of it, so this + # lets us visually indicate a consecutive token despite its possible splitting in + # the html markup + css_classes.append("odd-token") + else: + # Like above, but a different color so we can see the tokens alternate + css_classes.append("even-token") + if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None: + # This is a special token that is in the text. probably UNK + css_classes.append("special-token") + # TODO is this the right name for the data attribute ? + data_items["stok"] = encoding.tokens[first.token_ix] + else: + # In this case we are looking at a group/single char that is not tokenized. + # e.g. white space + css_classes.append("non-token") + css = f'''class="{' '.join(css_classes)}"''' + data = "" + for key, val in data_items.items(): + data += f' data-{key}="{val}"' + return f"<span {css} {data} >{span_text}</span>" + + @staticmethod + def __make_html(text: str, encoding: Encoding, annotations: AnnotationList) -> str: + char_states = EncodingVisualizer.__make_char_states(text, encoding, annotations) + current_consecutive_chars = [char_states[0]] + prev_anno_ix = char_states[0].anno_ix + spans = [] + label_colors_dict = EncodingVisualizer.calculate_label_colors(annotations) + cur_anno_ix = char_states[0].anno_ix + if cur_anno_ix is not None: + # If we started in an annotation make a span for it + anno = annotations[cur_anno_ix] + label = anno.label + color = label_colors_dict[label] + spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">') + + for cs in char_states[1:]: + cur_anno_ix = cs.anno_ix + if cur_anno_ix != prev_anno_ix: + # If we've transitioned in or out of an annotation + spans.append( + # Create a span from the current consecutive characters + EncodingVisualizer.consecutive_chars_to_html( + current_consecutive_chars, + text=text, + encoding=encoding, + ) + ) + current_consecutive_chars = [cs] + + if prev_anno_ix is not None: + # if we transitioned out of an annotation close it's span + spans.append("</span>") + if cur_anno_ix is not None: + # If we entered a new annotation make a span for it + anno = annotations[cur_anno_ix] + label = anno.label + color = label_colors_dict[label] + spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">') + prev_anno_ix = cur_anno_ix + + if cs.partition_key() == current_consecutive_chars[0].partition_key(): + # If the current charchter is in the same "group" as the previous one + current_consecutive_chars.append(cs) + else: + # Otherwise we make a span for the previous group + spans.append( + EncodingVisualizer.consecutive_chars_to_html( + current_consecutive_chars, + text=text, + encoding=encoding, + ) + ) + # An reset the consecutive_char_list to form a new group + current_consecutive_chars = [cs] + # All that's left is to fill out the final span + # TODO I think there is an edge case here where an annotation's span might not close + spans.append( + EncodingVisualizer.consecutive_chars_to_html( + current_consecutive_chars, + text=text, + encoding=encoding, + ) + ) + res = HTMLBody(spans) # Send the list of spans to the body of our html + return res + + @staticmethod + def __make_anno_map(text: str, annotations: AnnotationList) -> PartialIntList: + """ + Args: + text (:obj:`str`): + The raw text we want to align to + + annotations (:obj:`AnnotationList`): + A (possibly empty) list of annotations + + Returns: + A list of length len(text) whose entry at 
index i is None if there is no annotation on + charachter i or k, the index of the annotation that covers index i where k is with + respect to the list of annotations + """ + annotation_map = [None] * len(text) + for anno_ix, a in enumerate(annotations): + for i in range(a.start, a.end): + annotation_map[i] = anno_ix + return annotation_map + + @staticmethod + def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]: + """ + For each character in the original text, we emit a tuple representing it's "state": + + * which token_ix it corresponds to + * which word_ix it corresponds to + * which annotation_ix it corresponds to + + Args: + text (:obj:`str`): + The raw text we want to align to + + annotations (:obj:`List[Annotation]`): + A (possibly empty) list of annotations + + encoding: (:class:`~tokenizers.Encoding`): + The encoding returned from the tokenizer + + Returns: + :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what + it's state is + """ + annotation_map = EncodingVisualizer.__make_anno_map(text, annotations) + # Todo make this a dataclass or named tuple + char_states: List[CharState] = [CharState(char_ix) for char_ix in range(len(text))] + for token_ix, token in enumerate(encoding.tokens): + offsets = encoding.token_to_chars(token_ix) + if offsets is not None: + start, end = offsets + for i in range(start, end): + char_states[i].tokens.append(token_ix) + for char_ix, anno_ix in enumerate(annotation_map): + char_states[char_ix].anno_ix = anno_ix + + return char_states + + +def HTMLBody(children: List[str], css_styles=css) -> str: + """ + Generates the full html with css from a list of html spans + + Args: + children (:obj:`List[str]`): + A list of strings, assumed to be html elements + + css_styles (:obj:`str`, `optional`): + Optional alternative implementation of the css + + Returns: + :obj:`str`: An HTML string with style markup + """ + children_text = "".join(children) + return f""" + <html> + <head> + <style> + {css_styles} + </style> + </head> + <body> + <div class="tokenized-text" dir=auto> + {children_text} + </div> + </body> + </html> + """ diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py new file mode 100644 index 00000000..22f94c50 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py @@ -0,0 +1,8 @@ +# Generated content DO NOT EDIT +from .. import trainers + +Trainer = trainers.Trainer +BpeTrainer = trainers.BpeTrainer +UnigramTrainer = trainers.UnigramTrainer +WordLevelTrainer = trainers.WordLevelTrainer +WordPieceTrainer = trainers.WordPieceTrainer diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi new file mode 100644 index 00000000..d6c52571 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi @@ -0,0 +1,156 @@ +# Generated content DO NOT EDIT +class Trainer: + """ + Base class for all trainers + + This class is not supposed to be instantiated directly. Instead, any implementation of a + Trainer will return an instance of this class when instantiated. + """ + +class BpeTrainer(Trainer): + """ + Trainer capable of training a BPE model + + Args: + vocab_size (:obj:`int`, `optional`): + The size of the final vocabulary, including all tokens and alphabet. 
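Looking back at the EncodingVisualizer defined in visualizer.py above, a hedged usage sketch; "tokenizer.json" is hypothetical and the annotation simply marks the name in the sample sentence:

    from tokenizers import Tokenizer
    from tokenizers.tools import Annotation, EncodingVisualizer

    tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical serialized tokenizer
    visualizer = EncodingVisualizer(tokenizer, default_to_notebook=False)
    html = visualizer("My name is Anthony",
                      annotations=[Annotation(start=11, end=18, label="name")])
    print(html[:80])  # raw HTML string because default_to_notebook=False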
+ + min_frequency (:obj:`int`, `optional`): + The minimum frequency a pair should have in order to be merged. + + show_progress (:obj:`bool`, `optional`): + Whether to show progress bars while training. + + special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`): + A list of special tokens the model should know of. + + limit_alphabet (:obj:`int`, `optional`): + The maximum different characters to keep in the alphabet. + + initial_alphabet (:obj:`List[str]`, `optional`): + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + + continuing_subword_prefix (:obj:`str`, `optional`): + A prefix to be used for every subword that is not a beginning-of-word. + + end_of_word_suffix (:obj:`str`, `optional`): + A suffix to be used for every subword that is a end-of-word. + + max_token_length (:obj:`int`, `optional`): + Prevents creating tokens longer than the specified size. + This can help with reducing polluting your vocabulary with + highly repetitive tokens like `======` for wikipedia + + """ + +class UnigramTrainer(Trainer): + """ + Trainer capable of training a Unigram model + + Args: + vocab_size (:obj:`int`): + The size of the final vocabulary, including all tokens and alphabet. + + show_progress (:obj:`bool`): + Whether to show progress bars while training. + + special_tokens (:obj:`List[Union[str, AddedToken]]`): + A list of special tokens the model should know of. + + initial_alphabet (:obj:`List[str]`): + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + + shrinking_factor (:obj:`float`): + The shrinking factor used at each step of the training to prune the + vocabulary. + + unk_token (:obj:`str`): + The token used for out-of-vocabulary tokens. + + max_piece_length (:obj:`int`): + The maximum length of a given token. + + n_sub_iterations (:obj:`int`): + The number of iterations of the EM algorithm to perform before + pruning the vocabulary. + """ + def __init__( + self, + vocab_size=8000, + show_progress=True, + special_tokens=[], + shrinking_factor=0.75, + unk_token=None, + max_piece_length=16, + n_sub_iterations=2, + ): + pass + +class WordLevelTrainer(Trainer): + """ + Trainer capable of training a WorldLevel model + + Args: + vocab_size (:obj:`int`, `optional`): + The size of the final vocabulary, including all tokens and alphabet. + + min_frequency (:obj:`int`, `optional`): + The minimum frequency a pair should have in order to be merged. + + show_progress (:obj:`bool`, `optional`): + Whether to show progress bars while training. + + special_tokens (:obj:`List[Union[str, AddedToken]]`): + A list of special tokens the model should know of. + """ + +class WordPieceTrainer(Trainer): + """ + Trainer capable of training a WordPiece model + + Args: + vocab_size (:obj:`int`, `optional`): + The size of the final vocabulary, including all tokens and alphabet. + + min_frequency (:obj:`int`, `optional`): + The minimum frequency a pair should have in order to be merged. + + show_progress (:obj:`bool`, `optional`): + Whether to show progress bars while training. + + special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`): + A list of special tokens the model should know of. + + limit_alphabet (:obj:`int`, `optional`): + The maximum different characters to keep in the alphabet. 
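A sketch of driving the BpeTrainer described above; "corpus.txt" is a hypothetical training file and the hyperparameters are illustrative only:

    from tokenizers import Tokenizer, models, pre_tokenizers, trainers

    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=1000, min_frequency=2,
                                  special_tokens=["[UNK]", "[PAD]"])
    tokenizer.train(["corpus.txt"], trainer=trainer)  # hypothetical corpus file
    tokenizer.save("bpe-tokenizer.json")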
+ + initial_alphabet (:obj:`List[str]`, `optional`): + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + + continuing_subword_prefix (:obj:`str`, `optional`): + A prefix to be used for every subword that is not a beginning-of-word. + + end_of_word_suffix (:obj:`str`, `optional`): + A suffix to be used for every subword that is a end-of-word. + """ + def __init__( + self, + vocab_size=30000, + min_frequency=0, + show_progress=True, + special_tokens=[], + limit_alphabet=None, + initial_alphabet=[], + continuing_subword_prefix="##", + end_of_word_suffix=None, + ): + pass |
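And the WordPiece counterpart, relying on the defaults from the signature above (for example the "##" continuing-subword prefix); again, "corpus.txt" and the chosen special tokens are illustrative:

    from tokenizers import Tokenizer, models, pre_tokenizers, trainers

    tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    trainer = trainers.WordPieceTrainer(
        vocab_size=30000,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )
    tokenizer.train(["corpus.txt"], trainer=trainer)  # hypothetical corpus file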