diff options
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi')
-rw-r--r-- | .venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi new file mode 100644 index 00000000..d6c52571 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/tokenizers/trainers/__init__.pyi @@ -0,0 +1,156 @@ +# Generated content DO NOT EDIT +class Trainer: + """ + Base class for all trainers + + This class is not supposed to be instantiated directly. Instead, any implementation of a + Trainer will return an instance of this class when instantiated. + """ + +class BpeTrainer(Trainer): + """ + Trainer capable of training a BPE model + + Args: + vocab_size (:obj:`int`, `optional`): + The size of the final vocabulary, including all tokens and alphabet. + + min_frequency (:obj:`int`, `optional`): + The minimum frequency a pair should have in order to be merged. + + show_progress (:obj:`bool`, `optional`): + Whether to show progress bars while training. + + special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`): + A list of special tokens the model should know of. + + limit_alphabet (:obj:`int`, `optional`): + The maximum number of different characters to keep in the alphabet. + + initial_alphabet (:obj:`List[str]`, `optional`): + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + + continuing_subword_prefix (:obj:`str`, `optional`): + A prefix to be used for every subword that is not a beginning-of-word. + + end_of_word_suffix (:obj:`str`, `optional`): + A suffix to be used for every subword that is an end-of-word. + + max_token_length (:obj:`int`, `optional`): + Prevents creating tokens longer than the specified size. 
+ This can help reduce the pollution of your vocabulary with + highly repetitive tokens like `======` for Wikipedia + + """ + +class UnigramTrainer(Trainer): + """ + Trainer capable of training a Unigram model + + Args: + vocab_size (:obj:`int`): + The size of the final vocabulary, including all tokens and alphabet. + + show_progress (:obj:`bool`): + Whether to show progress bars while training. + + special_tokens (:obj:`List[Union[str, AddedToken]]`): + A list of special tokens the model should know of. + + initial_alphabet (:obj:`List[str]`): + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + + shrinking_factor (:obj:`float`): + The shrinking factor used at each step of the training to prune the + vocabulary. + + unk_token (:obj:`str`): + The token used for out-of-vocabulary tokens. + + max_piece_length (:obj:`int`): + The maximum length of a given token. + + n_sub_iterations (:obj:`int`): + The number of iterations of the EM algorithm to perform before + pruning the vocabulary. + """ + def __init__( + self, + vocab_size=8000, + show_progress=True, + special_tokens=[], + shrinking_factor=0.75, + unk_token=None, + max_piece_length=16, + n_sub_iterations=2, + ): + pass + +class WordLevelTrainer(Trainer): + """ + Trainer capable of training a WordLevel model + + Args: + vocab_size (:obj:`int`, `optional`): + The size of the final vocabulary, including all tokens and alphabet. + + min_frequency (:obj:`int`, `optional`): + The minimum frequency a pair should have in order to be merged. + + show_progress (:obj:`bool`, `optional`): + Whether to show progress bars while training. + + special_tokens (:obj:`List[Union[str, AddedToken]]`): + A list of special tokens the model should know of. 
+ """ + +class WordPieceTrainer(Trainer): + """ + Trainer capable of training a WordPiece model + + Args: + vocab_size (:obj:`int`, `optional`): + The size of the final vocabulary, including all tokens and alphabet. + + min_frequency (:obj:`int`, `optional`): + The minimum frequency a pair should have in order to be merged. + + show_progress (:obj:`bool`, `optional`): + Whether to show progress bars while training. + + special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`): + A list of special tokens the model should know of. + + limit_alphabet (:obj:`int`, `optional`): + The maximum number of different characters to keep in the alphabet. + + initial_alphabet (:obj:`List[str]`, `optional`): + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + + continuing_subword_prefix (:obj:`str`, `optional`): + A prefix to be used for every subword that is not a beginning-of-word. + + end_of_word_suffix (:obj:`str`, `optional`): + A suffix to be used for every subword that is an end-of-word. + """ + def __init__( + self, + vocab_size=30000, + min_frequency=0, + show_progress=True, + special_tokens=[], + limit_alphabet=None, + initial_alphabet=[], + continuing_subword_prefix="##", + end_of_word_suffix=None, + ): + pass |