aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py')
-rw-r--r--.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py150
1 files changed, 150 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py
new file mode 100644
index 00000000..29ca5977
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.py
@@ -0,0 +1,150 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
+from ..models import BPE
+from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
+from .base_tokenizer import BaseTokenizer
+
+
+class CharBPETokenizer(BaseTokenizer):
+ """Original BPE Tokenizer
+
+ Represents the BPE algorithm, as introduced by Rico Sennrich
+ (https://arxiv.org/abs/1508.07909)
+
+ The defaults settings corresponds to OpenAI GPT BPE tokenizers and differs from the original
+ Sennrich subword-nmt implementation by the following options that you can deactivate:
+ - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
+ * removing any control characters and replacing all whitespaces by the classic one.
+ * handle chinese chars by putting spaces around them.
+ * strip all accents.
+ - spitting on punctuation in addition to whitespaces (deactivate it with
+ `split_on_whitespace_only=True`)
+ """
+
+ def __init__(
+ self,
+ vocab: Optional[Union[str, Dict[str, int]]] = None,
+ merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+ unk_token: Union[str, AddedToken] = "<unk>",
+ suffix: str = "</w>",
+ dropout: Optional[float] = None,
+ lowercase: bool = False,
+ unicode_normalizer: Optional[str] = None,
+ bert_normalizer: bool = True,
+ split_on_whitespace_only: bool = False,
+ ):
+ if vocab is not None and merges is not None:
+ tokenizer = Tokenizer(
+ BPE(
+ vocab,
+ merges,
+ dropout=dropout,
+ unk_token=str(unk_token),
+ end_of_word_suffix=suffix,
+ )
+ )
+ else:
+ tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))
+
+ if tokenizer.token_to_id(str(unk_token)) is not None:
+ tokenizer.add_special_tokens([str(unk_token)])
+
+ # Check for Unicode normalization first (before everything else)
+ normalizers = []
+
+ if unicode_normalizer:
+ normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
+
+ if bert_normalizer:
+ normalizers += [BertNormalizer(lowercase=False)]
+
+ if lowercase:
+ normalizers += [Lowercase()]
+
+ # Create the normalizer structure
+ if len(normalizers) > 0:
+ if len(normalizers) > 1:
+ tokenizer.normalizer = Sequence(normalizers)
+ else:
+ tokenizer.normalizer = normalizers[0]
+
+ if split_on_whitespace_only:
+ tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
+ else:
+ tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+
+ tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
+
+ parameters = {
+ "model": "BPE",
+ "unk_token": unk_token,
+ "suffix": suffix,
+ "dropout": dropout,
+ "lowercase": lowercase,
+ "unicode_normalizer": unicode_normalizer,
+ "bert_normalizer": bert_normalizer,
+ "split_on_whitespace_only": split_on_whitespace_only,
+ }
+
+ super().__init__(tokenizer, parameters)
+
+ @staticmethod
+ def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+ vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+ return CharBPETokenizer(vocab, merges, **kwargs)
+
+ def train(
+ self,
+ files: Union[str, List[str]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+ limit_alphabet: int = 1000,
+ initial_alphabet: List[str] = [],
+ suffix: Optional[str] = "</w>",
+ show_progress: bool = True,
+ ):
+ """Train the model using the given files"""
+
+ trainer = trainers.BpeTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ special_tokens=special_tokens,
+ limit_alphabet=limit_alphabet,
+ initial_alphabet=initial_alphabet,
+ end_of_word_suffix=suffix,
+ show_progress=show_progress,
+ )
+ if isinstance(files, str):
+ files = [files]
+ self._tokenizer.train(files, trainer=trainer)
+
+ def train_from_iterator(
+ self,
+ iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+ vocab_size: int = 30000,
+ min_frequency: int = 2,
+ special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+ limit_alphabet: int = 1000,
+ initial_alphabet: List[str] = [],
+ suffix: Optional[str] = "</w>",
+ show_progress: bool = True,
+ length: Optional[int] = None,
+ ):
+ """Train the model using the given iterator"""
+
+ trainer = trainers.BpeTrainer(
+ vocab_size=vocab_size,
+ min_frequency=min_frequency,
+ special_tokens=special_tokens,
+ limit_alphabet=limit_alphabet,
+ initial_alphabet=initial_alphabet,
+ end_of_word_suffix=suffix,
+ show_progress=show_progress,
+ )
+ self._tokenizer.train_from_iterator(
+ iterator,
+ trainer=trainer,
+ length=length,
+ )