about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz
two versions of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py')
-rw-r--r--.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py103
1 files changed, 103 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py
new file mode 100644
index 00000000..cd550b41
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/implementations/sentencepiece_bpe.py
@@ -0,0 +1,103 @@
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC
+
+from .base_tokenizer import BaseTokenizer
+
+
+class SentencePieceBPETokenizer(BaseTokenizer):
+    """SentencePiece BPE Tokenizer
+
+    Represents the BPE algorithm, with the pretokenization used by SentencePiece
+    """
+
+    def __init__(
+        self,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
+        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        unk_token: Union[str, AddedToken] = "<unk>",
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
+        dropout: Optional[float] = None,
+        fuse_unk: Optional[bool] = False,
+    ):
+        if vocab is not None and merges is not None:
+            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
+        else:
+            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
+
+        if tokenizer.token_to_id(str(unk_token)) is not None:
+            tokenizer.add_special_tokens([str(unk_token)])
+
+        tokenizer.normalizer = NFKC()
+        prepend_scheme = "always" if add_prefix_space else "never"
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+        parameters = {
+            "model": "SentencePieceBPE",
+            "unk_token": unk_token,
+            "replacement": replacement,
+            "add_prefix_space": add_prefix_space,
+            "dropout": dropout,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    @staticmethod
+    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
+        return SentencePieceBPETokenizer(vocab, merges, **kwargs)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        show_progress: bool = True,
+    ):
+        """Train the model using the given files"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress,
+        )
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=trainer)
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 30000,
+        min_frequency: int = 2,
+        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
+        limit_alphabet: int = 1000,
+        initial_alphabet: List[str] = [],
+        show_progress: bool = True,
+        length: Optional[int] = None,
+    ):
+        """Train the model using the given iterator"""
+
+        trainer = trainers.BpeTrainer(
+            vocab_size=vocab_size,
+            min_frequency=min_frequency,
+            special_tokens=special_tokens,
+            limit_alphabet=limit_alphabet,
+            initial_alphabet=initial_alphabet,
+            show_progress=show_progress,
+        )
+        self._tokenizer.train_from_iterator(
+            iterator,
+            trainer=trainer,
+            length=length,
+        )