about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/tokenizers/trainers
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/trainers')
-rw-r--r--.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py8
-rw-r--r--.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi156
2 files changed, 164 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py
new file mode 100644
index 00000000..22f94c50
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.py
@@ -0,0 +1,8 @@
# Generated content DO NOT EDIT
# Re-export the trainer classes from the parent package's ``trainers``
# module so they are importable as ``tokenizers.trainers.<Name>``.
from .. import trainers

# Base class first, then the concrete trainer implementations.
Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer
UnigramTrainer = trainers.UnigramTrainer
WordLevelTrainer = trainers.WordLevelTrainer
WordPieceTrainer = trainers.WordPieceTrainer
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi
new file mode 100644
index 00000000..d6c52571
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi
@@ -0,0 +1,156 @@
+# Generated content DO NOT EDIT
class Trainer:
    """
    Common base type shared by every trainer.

    Do not instantiate this class directly: each concrete trainer
    implementation returns an instance of this class when constructed.
    """
+
class BpeTrainer(Trainer):
    """
    Trains a BPE model.

    Args:
        vocab_size (:obj:`int`, `optional`):
            Target size of the final vocabulary, all tokens and the alphabet
            included.

        min_frequency (:obj:`int`, `optional`):
            Number of times a pair must occur before it is eligible to be
            merged.

        show_progress (:obj:`bool`, `optional`):
            Display progress bars during training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
            Special tokens the model should be made aware of.

        limit_alphabet (:obj:`int`, `optional`):
            Upper bound on the number of distinct characters kept in the
            alphabet.

        initial_alphabet (:obj:`List[str]`, `optional`):
            Characters forced into the initial alphabet even when they never
            appear in the training dataset. For strings longer than one
            character, only the first character is kept.

        continuing_subword_prefix (:obj:`str`, `optional`):
            Prefix attached to every subword that does not begin a word.

        end_of_word_suffix (:obj:`str`, `optional`):
            Suffix attached to every subword that ends a word.

        max_token_length (:obj:`int`, `optional`):
            Upper bound on the length of created tokens. This helps keep the
            vocabulary from being polluted with highly repetitive tokens like
            `======` for wikipedia

    """
+
class UnigramTrainer(Trainer):
    """
    Trainer capable of training a Unigram model

    Args:
        vocab_size (:obj:`int`):
            The size of the final vocabulary, including all tokens and alphabet.

        show_progress (:obj:`bool`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`):
            A list of special tokens the model should know of.

        initial_alphabet (:obj:`List[str]`):
            A list of characters to include in the initial alphabet, even
            if not seen in the training dataset.
            If the strings contain more than one character, only the first one
            is kept.

        shrinking_factor (:obj:`float`):
            The shrinking factor used at each step of the training to prune the
            vocabulary.

        unk_token (:obj:`str`):
            The token used for out-of-vocabulary tokens.

        max_piece_length (:obj:`int`):
            The maximum length of a given token.

        n_sub_iterations (:obj:`int`):
            The number of iterations of the EM algorithm to perform before
            pruning the vocabulary.
    """
    # NOTE(review): the docstring above documents an ``initial_alphabet``
    # argument that is missing from this stub signature — confirm against
    # the runtime constructor before relying on either.
    # This is a generated type-stub signature (the body is ``pass`` and never
    # executes), so the mutable ``[]`` default is harmless here; presumably
    # it mirrors the runtime default — verify against the implementation.
    def __init__(
        self,
        vocab_size=8000,
        show_progress=True,
        special_tokens=[],
        shrinking_factor=0.75,
        unk_token=None,
        max_piece_length=16,
        n_sub_iterations=2,
    ):
        pass
+
class WordLevelTrainer(Trainer):
    """
    Trainer capable of training a WordLevel model

    Args:
        vocab_size (:obj:`int`, `optional`):
            The size of the final vocabulary, including all tokens and alphabet.

        min_frequency (:obj:`int`, `optional`):
            The minimum frequency a pair should have in order to be merged.

        show_progress (:obj:`bool`, `optional`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
            A list of special tokens the model should know of.
    """
+
class WordPieceTrainer(Trainer):
    """
    Trainer capable of training a WordPiece model

    Args:
        vocab_size (:obj:`int`, `optional`):
            The size of the final vocabulary, including all tokens and alphabet.

        min_frequency (:obj:`int`, `optional`):
            The minimum frequency a pair should have in order to be merged.

        show_progress (:obj:`bool`, `optional`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
            A list of special tokens the model should know of.

        limit_alphabet (:obj:`int`, `optional`):
            The maximum different characters to keep in the alphabet.

        initial_alphabet (:obj:`List[str]`, `optional`):
            A list of characters to include in the initial alphabet, even
            if not seen in the training dataset.
            If the strings contain more than one character, only the first one
            is kept.

        continuing_subword_prefix (:obj:`str`, `optional`):
            A prefix to be used for every subword that is not a beginning-of-word.

        end_of_word_suffix (:obj:`str`, `optional`):
            A suffix to be used for every subword that is a end-of-word.
    """
    # Generated type-stub signature: the body is ``pass`` and never runs, so
    # the mutable ``[]`` defaults are harmless. The default values shown here
    # presumably mirror the runtime constructor's defaults (e.g. the "##"
    # continuing-subword prefix) — verify against the implementation.
    def __init__(
        self,
        vocab_size=30000,
        min_frequency=0,
        show_progress=True,
        special_tokens=[],
        limit_alphabet=None,
        initial_alphabet=[],
        continuing_subword_prefix="##",
        end_of_word_suffix=None,
    ):
        pass