Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info')
4 files changed, 259 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/INSTALLER b/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/INSTALLER
new file mode 100644
index 00000000..a1b589e3
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/METADATA b/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/METADATA
new file mode 100644
index 00000000..92ed843e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/METADATA
@@ -0,0 +1,209 @@
+Metadata-Version: 2.3
+Name: tokenizers
+Version: 0.19.0
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Dist: huggingface-hub >=0.16.4, <1.0
+Requires-Dist: pytest ; extra == 'testing'
+Requires-Dist: requests ; extra == 'testing'
+Requires-Dist: numpy ; extra == 'testing'
+Requires-Dist: datasets ; extra == 'testing'
+Requires-Dist: black ==22.3 ; extra == 'testing'
+Requires-Dist: ruff ; extra == 'testing'
+Requires-Dist: sphinx ; extra == 'docs'
+Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
+Requires-Dist: setuptools-rust ; extra == 'docs'
+Requires-Dist: tokenizers[testing] ; extra == 'dev'
+Provides-Extra: testing
+Provides-Extra: docs
+Provides-Extra: dev
+Keywords: NLP,tokenizer,BPE,transformer,deep learning
+Author: Anthony MOI <m.anthony.moi@gmail.com>
+Author-email: Nicolas Patry <patry.nicolas@protonmail.com>, Anthony Moi <anthony@huggingface.co>
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+Project-URL: Homepage, https://github.com/huggingface/tokenizers
+Project-URL: Source, https://github.com/huggingface/tokenizers

<p align="center">
  <br>
  <img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/>
  <br>
</p>
<p align="center">
  <a href="https://badge.fury.io/py/tokenizers">
    <img alt="Build" src="https://badge.fury.io/py/tokenizers.svg">
  </a>
  <a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE">
    <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
  </a>
</p>
<br>

# Tokenizers

Provides an implementation of today's most used tokenizers, with a focus on performance and
versatility.

This package provides bindings over the [Rust](https://github.com/huggingface/tokenizers/tree/master/tokenizers) implementation.
If you are interested in the high-level design, you can check it out there.

Otherwise, let's dive in!

## Main features:

 - Train new vocabularies and tokenize using 4 pre-made tokenizers (Bert WordPiece and the 3
   most common BPE versions).
 - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
   less than 20 seconds to tokenize a GB of text on a server's CPU.
 - Easy to use, but also extremely versatile.
 - Designed for research and production.
 - Normalization comes with alignments tracking. It's always possible to get the part of the
   original sentence that corresponds to a given token (see the short sketch at the end of this README).
 - Does all the pre-processing: truncate, pad, and add the special tokens your model needs.

### Installation

#### With pip:

```bash
pip install tokenizers
```

#### From sources:

To use this method, you need to have Rust installed:

```bash
# Install with:
curl https://sh.rustup.rs -sSf | sh -s -- -y
export PATH="$HOME/.cargo/bin:$PATH"
```

Once Rust is installed, you can compile by doing the following:

```bash
git clone https://github.com/huggingface/tokenizers
cd tokenizers/bindings/python

# Create a virtual env (you can use your own as well)
python -m venv .env
source .env/bin/activate

# Install `tokenizers` in the current virtual env
pip install -e .
```

### Load a pretrained tokenizer from the Hub

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-cased")
```

### Using the provided Tokenizers

We provide some pre-built tokenizers to cover the most common cases. You can easily load one of
these using some `vocab.json` and `merges.txt` files:

```python
from tokenizers import CharBPETokenizer

# Initialize a tokenizer
vocab = "./path/to/vocab.json"
merges = "./path/to/merges.txt"
tokenizer = CharBPETokenizer(vocab, merges)

# And then encode:
encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded.ids)
print(encoded.tokens)
```

And you can train them just as simply:

```python
from tokenizers import CharBPETokenizer

# Initialize a tokenizer
tokenizer = CharBPETokenizer()

# Then train it!
tokenizer.train([ "./path/to/files/1.txt", "./path/to/files/2.txt" ])

# Now, let's use it:
encoded = tokenizer.encode("I can feel the magic, can you?")

# And finally save it somewhere
tokenizer.save("./path/to/directory/my-bpe.tokenizer.json")
```

#### Provided Tokenizers

 - `CharBPETokenizer`: The original BPE
 - `ByteLevelBPETokenizer`: The byte level version of the BPE
 - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
 - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece

All of these can be used and trained as explained above!
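For instance, training and using a `BertWordPieceTokenizer` follows the same pattern. The snippet below is a minimal sketch and is not taken from the original README; the file paths, `vocab_size`, and `lowercase` settings are placeholders to adjust for your own data:

```python
from tokenizers import BertWordPieceTokenizer

# Initialize a WordPiece tokenizer (lowercasing, like the original BERT)
tokenizer = BertWordPieceTokenizer(lowercase=True)

# Train it on plain-text files (placeholder paths)
tokenizer.train(["./path/to/files/1.txt", "./path/to/files/2.txt"], vocab_size=30000)

# Encode a sentence and inspect the resulting word pieces
encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded.tokens)

# Save it as a single JSON file
tokenizer.save("./path/to/directory/bert-wordpiece.tokenizer.json")
```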
### Build your own

Whenever these provided tokenizers don't give you enough freedom, you can build your own tokenizer
by putting all the different parts you need together.
You can check how we implemented the [provided tokenizers](https://github.com/huggingface/tokenizers/tree/master/bindings/python/py_src/tokenizers/implementations) and adapt them easily to your own needs.

#### Building a byte-level BPE

Here is an example showing how to build your own byte-level BPE by putting all the different pieces
together, and then saving it to a single file:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

# And then train
trainer = trainers.BpeTrainer(
    vocab_size=20000,
    min_frequency=2,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)
tokenizer.train([
    "./path/to/dataset/1.txt",
    "./path/to/dataset/2.txt",
    "./path/to/dataset/3.txt"
], trainer=trainer)

# And save it
tokenizer.save("byte-level-bpe.tokenizer.json", pretty=True)
```

Now, using this tokenizer is as simple as:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("byte-level-bpe.tokenizer.json")

encoded = tokenizer.encode("I can feel the magic, can you?")
```
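The same `Tokenizer` object also exposes the alignment tracking and pre-processing mentioned in the main features list. This is an illustrative sketch, not part of the original README: it reuses the tokenizer file saved above, and the pad token, pad id, and `max_length` are placeholder values:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("byte-level-bpe.tokenizer.json")

# Truncation and padding are handled by the tokenizer itself
tokenizer.enable_truncation(max_length=128)
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")  # pads to the longest sequence in a batch

sentence = "I can feel the magic, can you?"
encoded = tokenizer.encode(sentence)

# Offsets map every token back to its span in the original sentence
for token, (start, end) in zip(encoded.tokens, encoded.offsets):
    print(token, "->", sentence[start:end])
```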
diff --git a/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/RECORD b/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/RECORD
new file mode 100644
index 00000000..01874586
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/RECORD
@@ -0,0 +1,45 @@
+tokenizers-0.19.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+tokenizers-0.19.0.dist-info/METADATA,sha256=rDKbm3b4RBHOQOy32zupeoU1BeQDweHwV1XKr10Zzng,6719
+tokenizers-0.19.0.dist-info/RECORD,,
+tokenizers-0.19.0.dist-info/WHEEL,sha256=mmu2E0mKgwj7RU_goBCp9zGID39rD2A9DEmV3gS8Cpw,129
+tokenizers/__init__.py,sha256=ZE5ZagUvobBScrHBQdEobhx4wqM0bsq9F9aLYkBNjYQ,2615
+tokenizers/__init__.pyi,sha256=YBIWZCSN4Rs_-yKdEwhVv77bgHRE36hX9iwFrWGMJ8E,38536
+tokenizers/__pycache__/__init__.cpython-312.pyc,,
+tokenizers/decoders/__init__.py,sha256=lGp32h8qerE0F48gyZL8wGmeQVlmjVpeIsRb1SM9kf4,335
+tokenizers/decoders/__init__.pyi,sha256=xsReo7OFRCiQ4bBZY9ogYb1iLJ5DTgI5elNB-Uggocs,7244
+tokenizers/decoders/__pycache__/__init__.cpython-312.pyc,,
+tokenizers/implementations/__init__.py,sha256=VzAsplaIo7rl4AFO8Miu7ig7MfZjvonwVblZw01zR6M,310
+tokenizers/implementations/__pycache__/__init__.cpython-312.pyc,,
+tokenizers/implementations/__pycache__/base_tokenizer.cpython-312.pyc,,
+tokenizers/implementations/__pycache__/bert_wordpiece.cpython-312.pyc,,
+tokenizers/implementations/__pycache__/byte_level_bpe.cpython-312.pyc,,
+tokenizers/implementations/__pycache__/char_level_bpe.cpython-312.pyc,,
+tokenizers/implementations/__pycache__/sentencepiece_bpe.cpython-312.pyc,,
+tokenizers/implementations/__pycache__/sentencepiece_unigram.cpython-312.pyc,,
+tokenizers/implementations/base_tokenizer.py,sha256=2TFZhLupaJiMDYGJuUNmxYJv-cnR8bDHmbMzaYpFROs,14206
+tokenizers/implementations/bert_wordpiece.py,sha256=sKCum0FKPYdSgJFJN8LDerVBoTDRSqyqSdrcm-lvQqI,5520
+tokenizers/implementations/byte_level_bpe.py,sha256=OA_jyy3EQmYTa6hnf-EKwLOFuyroqFYOJz25ysM2BUk,4289
+tokenizers/implementations/char_level_bpe.py,sha256=Q2ZEAW0xMQHF7YCUtmplwaxbU-J0P2NK4PJGMxUb-_c,5466
+tokenizers/implementations/sentencepiece_bpe.py,sha256=LwrofoohnUfME2lK2lQYoyQIhP84RP0CIlHRaj0hyNs,3738
+tokenizers/implementations/sentencepiece_unigram.py,sha256=SYiVXL8ZtqLXKpuqwnwmrfxgGotu8yAkOu7dLztEXIo,7580
+tokenizers/models/__init__.py,sha256=eJZ4HTAQZpxnKILNylWaTFqxXy-Ba6OKswWN47feeV8,176
+tokenizers/models/__init__.pyi,sha256=wH4M-ZZprw3UQ98fxWrF3MpivuNVY3s3pv4pGY0A_kE,16932
+tokenizers/models/__pycache__/__init__.cpython-312.pyc,,
+tokenizers/normalizers/__init__.py,sha256=hKOwnqWM-IlcVv7HDWT9SYhlczevuCNDQJY05ZFxkzk,808
+tokenizers/normalizers/__init__.pyi,sha256=5SGm-u896MZht6TXMS9sWv1lCATnwNqbC2Udl5aP4dg,19597
+tokenizers/normalizers/__pycache__/__init__.cpython-312.pyc,,
+tokenizers/pre_tokenizers/__init__.py,sha256=wd6KYQA_RsGSQK-HeG9opTRhv4ttSRkyno2dk6az-PM,557
+tokenizers/pre_tokenizers/__init__.pyi,sha256=IhF7dZt9_9_WM2ESKwEIvN59uW_YzS2PzmWBUScysWU,23258
+tokenizers/pre_tokenizers/__pycache__/__init__.cpython-312.pyc,,
+tokenizers/processors/__init__.py,sha256=xM2DEKwKtHIumHsszM8AMkq-AlaqvBZFXWgLU8SNhOY,307
+tokenizers/processors/__init__.pyi,sha256=hx767ZY8SHhxb_hiXPRxm-f_KcoR4XDx7vfK2c0lR-Q,11357
+tokenizers/processors/__pycache__/__init__.cpython-312.pyc,,
+tokenizers/tokenizers.cpython-312-x86_64-linux-gnu.so,sha256=NblTQIwRa0H6rewQpIymX1xTTdxYcgPX3RGI7rXvklk,11826456
+tokenizers/tools/__init__.py,sha256=xG8caB9OHC8cbB01S5vYV14HZxhO6eWbLehsb70ppio,55
+tokenizers/tools/__pycache__/__init__.cpython-312.pyc,,
+tokenizers/tools/__pycache__/visualizer.cpython-312.pyc,,
+tokenizers/tools/visualizer-styles.css,sha256=zAydq1oGWD8QEll4-eyL8Llw0B1sty_hpIE3tYxL02k,4850
+tokenizers/tools/visualizer.py,sha256=gi-E2NCP7FuG6ujpQOdalSTXUlaV85V6NI-ZPPTvA_4,14625
+tokenizers/trainers/__init__.py,sha256=UTu22AGcp76IvpW45xLRbJWET04NxPW6NfCb2YYz0EM,248
+tokenizers/trainers/__init__.pyi,sha256=3TwFKts4me7zQfVRcSTmtXYiP4XwcRjfAYtwqoZVtoQ,5382
+tokenizers/trainers/__pycache__/__init__.cpython-312.pyc,,
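Each RECORD line above is `path,sha256=<digest>,size`, where the digest is the URL-safe base64 encoding of the file's SHA-256 hash with trailing `=` padding stripped, as in the standard wheel RECORD format. The following is a minimal, illustrative sketch (not part of the package) for checking one entry; the site-packages path is an assumption about this environment:

```python
import base64
import hashlib
from pathlib import Path

# Assumed site-packages root; adjust to your environment
site_packages = Path(".venv/lib/python3.12/site-packages")

# Entry taken from the RECORD listing above
record_path = "tokenizers/tools/visualizer-styles.css"
expected = "sha256=zAydq1oGWD8QEll4-eyL8Llw0B1sty_hpIE3tYxL02k"

# Recompute the URL-safe base64 SHA-256 digest, without '=' padding
data = (site_packages / record_path).read_bytes()
digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()

print("matches RECORD:", f"sha256={digest}" == expected)
```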
diff --git a/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/WHEEL b/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/WHEEL
new file mode 100644
index 00000000..d4c2118c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers-0.19.0.dist-info/WHEEL
@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: maturin (1.5.1)
+Root-Is-Purelib: false
+Tag: cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64