Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/__init__.py')
-rw-r--r--  .venv/lib/python3.12/site-packages/tokenizers/__init__.py  100
1 file changed, 100 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py
new file mode 100644
index 00000000..efd57429
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/__init__.py
@@ -0,0 +1,100 @@
+from enum import Enum
+from typing import List, Tuple, Union
+
+
+Offsets = Tuple[int, int]
+
+TextInputSequence = str
+"""A :obj:`str` that represents an input sequence """
+
+PreTokenizedInputSequence = Union[List[str], Tuple[str]]
+"""A pre-tokenized input sequence. Can be one of:
+
+ - A :obj:`List` of :obj:`str`
+ - A :obj:`Tuple` of :obj:`str`
+"""
+
+TextEncodeInput = Union[
+ TextInputSequence,
+ Tuple[TextInputSequence, TextInputSequence],
+ List[TextInputSequence],
+]
+"""Represents a textual input for encoding. Can be either:
+
+ - A single sequence: :data:`~tokenizers.TextInputSequence`
+ - A pair of sequences:
+
+ - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
+ - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
+"""
+
+PreTokenizedEncodeInput = Union[
+ PreTokenizedInputSequence,
+ Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+ List[PreTokenizedInputSequence],
+]
+"""Represents a pre-tokenized input for encoding. Can be either:
+
+ - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
+ - A pair of sequences:
+
+ - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
+ - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
+"""
+
+InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+"""Represents all the possible types of input sequences for encoding. Can be:
+
+    - When ``is_pretokenized=False``: :data:`~tokenizers.TextInputSequence`
+    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedInputSequence`
+"""
+
+EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
+"""Represents all the possible types of input for encoding. Can be:
+
+    - When ``is_pretokenized=False``: :data:`~tokenizers.TextEncodeInput`
+    - When ``is_pretokenized=True``: :data:`~tokenizers.PreTokenizedEncodeInput`
+"""
+
+
+class OffsetReferential(Enum):
+ ORIGINAL = "original"
+ NORMALIZED = "normalized"
+
+
+class OffsetType(Enum):
+ BYTE = "byte"
+ CHAR = "char"
+
+
+class SplitDelimiterBehavior(Enum):
+ REMOVED = "removed"
+ ISOLATED = "isolated"
+ MERGED_WITH_PREVIOUS = "merged_with_previous"
+ MERGED_WITH_NEXT = "merged_with_next"
+ CONTIGUOUS = "contiguous"
+
+
+from .tokenizers import (
+ AddedToken,
+ Encoding,
+ NormalizedString,
+ PreTokenizedString,
+ Regex,
+ Token,
+ Tokenizer,
+ decoders,
+ models,
+ normalizers,
+ pre_tokenizers,
+ processors,
+ trainers,
+ __version__,
+)
+from .implementations import (
+ BertWordPieceTokenizer,
+ ByteLevelBPETokenizer,
+ CharBPETokenizer,
+ SentencePieceBPETokenizer,
+ SentencePieceUnigramTokenizer,
+)