aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two versions of R2R are hereHEADmaster
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi')
-rw-r--r--.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi591
1 files changed, 591 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi
new file mode 100644
index 00000000..955b9a16
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/models/__init__.pyi
@@ -0,0 +1,591 @@
+# Generated content DO NOT EDIT
class Model:
    """
    Base class for all models

    The model represents the actual tokenization algorithm. This is the part that
    will contain and manage the learned vocabulary.

    This class cannot be constructed directly. Please use one of the concrete models.
    """

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    # Fixed: `prefix` is documented as optional, so the stub now carries a
    # default of None instead of being a required positional argument.
    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    # Fixed: parameter was named `tokens` but the docstring (and the documented
    # tokenizers API) uses the singular `token`.
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
+
class BPE(Model):
    """
    An implementation of the BPE (Byte-Pair Encoding) algorithm

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        merges (:obj:`List[Tuple[str, str]]`, `optional`):
            A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`

        cache_capacity (:obj:`int`, `optional`):
            The number of words that the BPE cache can contain. The cache allows
            to speed-up the process by keeping the result of the merge operations
            for a number of words.

        dropout (:obj:`float`, `optional`):
            A float between 0 and 1 that represents the BPE dropout to use.

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

        continuing_subword_prefix (:obj:`str`, `optional`):
            The prefix to attach to subword units that don't represent a beginning of word.

        end_of_word_suffix (:obj:`str`, `optional`):
            The suffix to attach to subword units that represent an end of word.

        fuse_unk (:obj:`bool`, `optional`):
            Whether to fuse any subsequent unknown tokens into a single one

        byte_fallback (:obj:`bool`, `optional`):
            Whether to use spm byte-fallback trick (defaults to False)

        ignore_merges (:obj:`bool`, `optional`):
            Whether or not to match tokens with the vocab before using merges.
    """

    def __init__(
        self,
        vocab=None,
        merges=None,
        cache_capacity=None,
        dropout=None,
        unk_token=None,
        continuing_subword_prefix=None,
        end_of_word_suffix=None,
        fuse_unk=None,
        byte_fallback=False,
        ignore_merges=False,
    ):
        pass

    # Fixed: this staticmethod carried a bogus `cls` first parameter and named
    # the second file `merge`; both contradicted the docstring and would make
    # the documented call `BPE.from_file(vocab, merges)` mis-bind arguments.
    @staticmethod
    def from_file(vocab, merges, **kwargs):
        """
        Instantiate a BPE model from the given files.

        This method is roughly equivalent to doing::

            vocab, merges = BPE.read_file(vocab_filename, merges_filename)
            bpe = BPE(vocab, merges)

        If you don't need to keep the :obj:`vocab, merges` values lying around,
        this method is more optimized than manually calling
        :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            merges (:obj:`str`):
                The path to a :obj:`merges.txt` file

        Returns:
            :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
        """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    # Fixed: a staticmethod takes no `self`; the stray parameter shifted the
    # documented positional arguments by one.
    @staticmethod
    def read_file(vocab, merges):
        """
        Read a :obj:`vocab.json` and a :obj:`merges.txt` files

        This method provides a way to read and parse the content of these files,
        returning the relevant data structures. If you want to instantiate some BPE models
        from memory, this method gives you the expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            merges (:obj:`str`):
                The path to a :obj:`merges.txt` file

        Returns:
            A :obj:`Tuple` with the vocab and the merges:
                The vocabulary and merges loaded into memory
        """
        pass

    # Fixed: `prefix` is documented as optional; give it a None default.
    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    # Fixed: parameter renamed `tokens` -> `token` to match the docstring.
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
+
class Unigram(Model):
    """
    An implementation of the Unigram algorithm

    Args:
        vocab (:obj:`List[Tuple[str, float]]`, `optional`):
            A list of vocabulary items and their relative score [("am", -0.2442),...]

        unk_id (:obj:`int`, `optional`):
            The vocabulary index of the unknown token.

        byte_fallback (:obj:`bool`, `optional`):
            Whether to use spm byte-fallback trick (defaults to False)
    """

    # Fixed: the docstring declares `vocab` optional (it even carried a
    # duplicated ``optional`` marker) but the parameters had no defaults;
    # all three now default to the usual "unset" values.
    def __init__(self, vocab=None, unk_id=None, byte_fallback=False):
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    # Fixed: `prefix` is documented as optional; give it a None default.
    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    # Fixed: parameter renamed `tokens` -> `token` to match the docstring.
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
+
class WordLevel(Model):
    """
    An implementation of the WordLevel algorithm

    Most simple tokenizer model based on mapping tokens to their corresponding id.

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.
    """

    # Fixed: both parameters are documented as optional but had no defaults.
    # (The docstring also wrongly typed `vocab` as :obj:`str` while describing
    # a dict of string keys and ids — corrected to Dict[str, int] above.)
    def __init__(self, vocab=None, unk_token=None):
        pass

    @staticmethod
    def from_file(vocab, unk_token):
        """
        Instantiate a WordLevel model from the given file

        This method is roughly equivalent to doing::

            vocab = WordLevel.read_file(vocab_filename)
            wordlevel = WordLevel(vocab)

        If you don't need to keep the :obj:`vocab` values lying around, this method is
        more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
        initialize a :class:`~tokenizers.models.WordLevel`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

            unk_token (:obj:`str`):
                The unknown token to be used by the model.

        Returns:
            :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
        """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    @staticmethod
    def read_file(vocab):
        """
        Read a :obj:`vocab.json`

        This method provides a way to read and parse the content of a vocabulary file,
        returning the relevant data structures. If you want to instantiate some WordLevel models
        from memory, this method gives you the expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.json` file

        Returns:
            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass

    # Fixed: `prefix` is documented as optional; give it a None default.
    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    # Fixed: parameter renamed `tokens` -> `token` to match the docstring.
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass
+
class WordPiece(Model):
    """
    An implementation of the WordPiece algorithm

    Args:
        vocab (:obj:`Dict[str, int]`, `optional`):
            A dictionary of string keys and their ids :obj:`{"am": 0,...}`

        unk_token (:obj:`str`, `optional`):
            The unknown token to be used by the model.

        max_input_chars_per_word (:obj:`int`, `optional`):
            The maximum number of characters to authorize in a single word.
    """

    # Fixed: all three parameters are documented as optional but had no defaults.
    def __init__(self, vocab=None, unk_token=None, max_input_chars_per_word=None):
        pass

    @staticmethod
    def from_file(vocab, **kwargs):
        """
        Instantiate a WordPiece model from the given file

        This method is roughly equivalent to doing::

            vocab = WordPiece.read_file(vocab_filename)
            wordpiece = WordPiece(vocab)

        If you don't need to keep the :obj:`vocab` values lying around, this method is
        more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
        initialize a :class:`~tokenizers.models.WordPiece`

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.txt` file

        Returns:
            :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
        """
        pass

    def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`

        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
        :class:`~tokenizers.models.Model`.

        Returns:
            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
        """
        pass

    def id_to_token(self, id):
        """
        Get the token associated to an ID

        Args:
            id (:obj:`int`):
                An ID to convert to a token

        Returns:
            :obj:`str`: The token associated to the ID
        """
        pass

    @staticmethod
    def read_file(vocab):
        """
        Read a :obj:`vocab.txt` file

        This method provides a way to read and parse the content of a standard `vocab.txt`
        file as used by the WordPiece Model, returning the relevant data structures. If you
        want to instantiate some WordPiece models from memory, this method gives you the
        expected input from the standard files.

        Args:
            vocab (:obj:`str`):
                The path to a :obj:`vocab.txt` file

        Returns:
            :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
        """
        pass

    # Fixed: `prefix` is documented as optional; give it a None default.
    def save(self, folder, prefix=None):
        """
        Save the current model

        Save the current model in the given folder, using the given prefix for the various
        files that will get created.
        Any file with the same name that already exists in this folder will be overwritten.

        Args:
            folder (:obj:`str`):
                The path to the target folder in which to save the various files

            prefix (:obj:`str`, `optional`):
                An optional prefix, used to prefix each file name

        Returns:
            :obj:`List[str]`: The list of saved files
        """
        pass

    # Fixed: parameter renamed `tokens` -> `token` to match the docstring.
    def token_to_id(self, token):
        """
        Get the ID associated to a token

        Args:
            token (:obj:`str`):
                A token to convert to an ID

        Returns:
            :obj:`int`: The ID associated to the token
        """
        pass

    def tokenize(self, sequence):
        """
        Tokenize a sequence

        Args:
            sequence (:obj:`str`):
                A sequence to tokenize

        Returns:
            A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
        """
        pass