Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py')
 .venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py | 122 ++++++++++++++++++++
 1 file changed, 122 insertions(+), 0 deletions(-)
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py
new file mode 100644
index 00000000..c7e3dbc4
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.py
@@ -0,0 +1,122 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
+from tokenizers.models import BPE
+from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
+
+from .base_tokenizer import BaseTokenizer
+
+
+class ByteLevelBPETokenizer(BaseTokenizer):
+ """ByteLevelBPETokenizer
+
+ Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
+ """
+
+ def __init__(
+ self,
+ vocab: Optional[Union[str, Dict[str, int]]] = None,
+        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
+        add_prefix_space: bool = False,
+        lowercase: bool = False,
+        dropout: Optional[float] = None,
+        unicode_normalizer: Optional[str] = None,
+        continuing_subword_prefix: Optional[str] = None,
+        end_of_word_suffix: Optional[str] = None,
+        trim_offsets: bool = False,
+    ):
+        if vocab is not None and merges is not None:
+            tokenizer = Tokenizer(
+                BPE(
+                    vocab,
+                    merges,
+                    dropout=dropout,
+                    continuing_subword_prefix=continuing_subword_prefix or "",
+                    end_of_word_suffix=end_of_word_suffix or "",
+                )
+            )
+        else:
+            tokenizer = Tokenizer(BPE())
+
+        # Check for Unicode normalization first (before everything else)
+        normalizers = []
+
+        if unicode_normalizer:
+            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
+
+        if lowercase:
+            normalizers += [Lowercase()]
+
+        # Create the normalizer structure
+        if len(normalizers) > 0:
+            if len(normalizers) > 1:
+                tokenizer.normalizer = Sequence(normalizers)
+            else:
+                tokenizer.normalizer = normalizers[0]
+
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)
+
+        parameters = {
+            "model": "ByteLevelBPE",
+            "add_prefix_space": add_prefix_space,
+            "lowercase": lowercase,
+            "dropout": dropout,
+            "unicode_normalizer": unicode_normalizer,
+            "continuing_subword_prefix": continuing_subword_prefix,
+            "end_of_word_suffix": end_of_word_suffix,
+            "trim_offsets": trim_offsets,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    @staticmethod
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+        return ByteLevelBPETokenizer(vocab, merges, **kwargs)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+    ):
+        """Train the model using the given files"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+        )
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        show_progress: bool = True,
+        special_tokens: List[Union[str, AddedToken]] = [],
+        length: Optional[int] = None,
+    ):
+        """Train the model using the given iterator"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            show_progress=show_progress,
+            special_tokens=special_tokens,
+            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+        )
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )
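
For context, a minimal usage sketch of the ByteLevelBPETokenizer class added above. This is an illustration, not part of the change itself: the corpus path, output directory, vocab/merges filenames, and special-token strings are hypothetical placeholders.

# Minimal usage sketch (hypothetical paths and special-token names).
from tokenizers import ByteLevelBPETokenizer

# Train a byte-level BPE vocabulary from scratch on a plain-text corpus.
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
tokenizer.train(
    files=["corpus.txt"],  # hypothetical corpus file
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
# Alternatively, train from an in-memory iterable of strings:
# tokenizer.train_from_iterator(lines, vocab_size=30000, length=len(lines))

# Persist the learned vocabulary and merges (writes vocab.json and merges.txt),
# then reload them later through the from_file helper defined in this file.
tokenizer.save_model(".")
reloaded = ByteLevelBPETokenizer.from_file("vocab.json", "merges.txt")

encoding = reloaded.encode("Hello, world!")
print(encoding.tokens)  # byte-level subword strings
print(encoding.ids)     # corresponding vocabulary ids

Setting add_prefix_space=True gives the first word of a sequence the same space-prefixed byte representation as words in the middle of a sentence; the constructor's default of False matches GPT-2's own behavior.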