diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/__init__.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/tokenizers/__init__.py | 100 |
1 files changed, 100 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py new file mode 100644 index 00000000..efd57429 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py @@ -0,0 +1,100 @@ +from enum import Enum +from typing import List, Tuple, Union + + +Offsets = Tuple[int, int] + +TextInputSequence = str +"""A :obj:`str` that represents an input sequence """ + +PreTokenizedInputSequence = Union[List[str], Tuple[str]] +"""A pre-tokenized input sequence. Can be one of: + + - A :obj:`List` of :obj:`str` + - A :obj:`Tuple` of :obj:`str` +""" + +TextEncodeInput = Union[ + TextInputSequence, + Tuple[TextInputSequence, TextInputSequence], + List[TextInputSequence], +] +"""Represents a textual input for encoding. Can be either: + + - A single sequence: :data:`~tokenizers.TextInputSequence` + - A pair of sequences: + + - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence` + - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2 +""" + +PreTokenizedEncodeInput = Union[ + PreTokenizedInputSequence, + Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence], + List[PreTokenizedInputSequence], +] +"""Represents a pre-tokenized input for encoding. Can be either: + + - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence` + - A pair of sequences: + + - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence` + - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2 +""" + +InputSequence = Union[TextInputSequence, PreTokenizedInputSequence] +"""Represents all the possible types of input sequences for encoding. Can be: + + - When ``is_pretokenized=False``: :data:`~TextInputSequence` + - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence` +""" + +EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput] +"""Represents all the possible types of input for encoding. Can be: + + - When ``is_pretokenized=False``: :data:`~TextEncodeInput` + - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput` +""" + + +class OffsetReferential(Enum): + ORIGINAL = "original" + NORMALIZED = "normalized" + + +class OffsetType(Enum): + BYTE = "byte" + CHAR = "char" + + +class SplitDelimiterBehavior(Enum): + REMOVED = "removed" + ISOLATED = "isolated" + MERGED_WITH_PREVIOUS = "merged_with_previous" + MERGED_WITH_NEXT = "merged_with_next" + CONTIGUOUS = "contiguous" + + +from .tokenizers import ( + AddedToken, + Encoding, + NormalizedString, + PreTokenizedString, + Regex, + Token, + Tokenizer, + decoders, + models, + normalizers, + pre_tokenizers, + processors, + trainers, + __version__, +) +from .implementations import ( + BertWordPieceTokenizer, + ByteLevelBPETokenizer, + CharBPETokenizer, + SentencePieceBPETokenizer, + SentencePieceUnigramTokenizer, +) |