author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download | gn-ai-master.tar.gz
Diffstat (limited to '.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info')
6 files changed, 222 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/INSTALLER b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/INSTALLER
new file mode 100644
index 00000000..a1b589e3
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/LICENSE b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/LICENSE
new file mode 100644
index 00000000..83ed1036
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 OpenAI, Shantanu Jain
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/METADATA b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/METADATA
new file mode 100644
index 00000000..96a905e6
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/METADATA
@@ -0,0 +1,170 @@
+Metadata-Version: 2.1
+Name: tiktoken
+Version: 0.8.0
+Summary: tiktoken is a fast BPE tokeniser for use with OpenAI's models
+Author: Shantanu Jain
+Author-email: shantanu@openai.com
+License: MIT License
+
+ Copyright (c) 2022 OpenAI, Shantanu Jain
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+Project-URL: homepage, https://github.com/openai/tiktoken
+Project-URL: repository, https://github.com/openai/tiktoken
+Project-URL: changelog, https://github.com/openai/tiktoken/blob/main/CHANGELOG.md
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: regex>=2022.1.18
+Requires-Dist: requests>=2.26.0
+Provides-Extra: blobfile
+Requires-Dist: blobfile>=2; extra == "blobfile"
+
+# ⏳ tiktoken
+
+tiktoken is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with
+OpenAI's models.
+
+```python
+import tiktoken
+enc = tiktoken.get_encoding("o200k_base")
+assert enc.decode(enc.encode("hello world")) == "hello world"
+
+# To get the tokeniser corresponding to a specific model in the OpenAI API:
+enc = tiktoken.encoding_for_model("gpt-4o")
+```
+
+The open source version of `tiktoken` can be installed from PyPI:
+```
+pip install tiktoken
+```
+
+The tokeniser API is documented in `tiktoken/core.py`.
+
+Example code using `tiktoken` can be found in the
+[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
+
+
+## Performance
+
+`tiktoken` is between 3-6x faster than a comparable open source tokeniser:
+
+
+
+Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
+`tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.
+
+
+## Getting help
+
+Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
+
+If you work at OpenAI, make sure to check the internal documentation or feel free to contact
+@shantanu.
+
+## What is BPE anyway?
+
+Language models don't see text like you and I, instead they see a sequence of numbers (known as tokens).
+Byte pair encoding (BPE) is a way of converting text into tokens. It has a couple desirable
+properties:
+1) It's reversible and lossless, so you can convert tokens back into the original text
+2) It works on arbitrary text, even text that is not in the tokeniser's training data
+3) It compresses the text: the token sequence is shorter than the bytes corresponding to the
+   original text. On average, in practice, each token corresponds to about 4 bytes.
+4) It attempts to let the model see common subwords. For instance, "ing" is a common subword in
+   English, so BPE encodings will often split "encoding" into tokens like "encod" and "ing"
+   (instead of e.g. "enc" and "oding"). Because the model will then see the "ing" token again and
+   again in different contexts, it helps models generalise and better understand grammar.
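The four properties listed above can be checked directly with the public calls already shown in this README (`get_encoding`, `encode`, `decode`). The snippet below is a minimal illustrative sketch, not part of the packaged METADATA file; the sample text and the choice of `cl100k_base` are arbitrary.

```python
import tiktoken

# Any public encoding works; cl100k_base is also used in the examples that follow.
enc = tiktoken.get_encoding("cl100k_base")
text = "Byte pair encoding works on arbitrary text."
tokens = enc.encode(text)

# 1) Reversible and lossless: decoding the tokens recovers the exact input.
assert enc.decode(tokens) == text

# 3) Compression: typical English text yields fewer tokens than UTF-8 bytes.
assert len(tokens) < len(text.encode("utf-8"))

# 4) Common subwords: decode each token individually to see the subword pieces.
print([enc.decode([t]) for t in tokens])
```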
+
+`tiktoken` contains an educational submodule that is friendlier if you want to learn more about
+the details of BPE, including code that helps visualise the BPE procedure:
+```python
+from tiktoken._educational import *
+
+# Train a BPE tokeniser on a small amount of text
+enc = train_simple_encoding()
+
+# Visualise how the GPT-4 encoder encodes text
+enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
+enc.encode("hello world aaaaaaaaaaaa")
+```
+
+
+## Extending tiktoken
+
+You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
+
+
+**Create your `Encoding` object exactly the way you want and simply pass it around.**
+
+```python
+cl100k_base = tiktoken.get_encoding("cl100k_base")
+
+# In production, load the arguments directly instead of accessing private attributes
+# See openai_public.py for examples of arguments for specific encodings
+enc = tiktoken.Encoding(
+    # If you're changing the set of special tokens, make sure to use a different name
+    # It should be clear from the name what behaviour to expect.
+    name="cl100k_im",
+    pat_str=cl100k_base._pat_str,
+    mergeable_ranks=cl100k_base._mergeable_ranks,
+    special_tokens={
+        **cl100k_base._special_tokens,
+        "<|im_start|>": 100264,
+        "<|im_end|>": 100265,
+    }
+)
+```
+
+**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
+
+This is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer
+option 1.
+
+To do this, you'll need to create a namespace package under `tiktoken_ext`.
+
+Layout your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
+```
+my_tiktoken_extension
+├── tiktoken_ext
+│   └── my_encodings.py
+└── setup.py
+```
+
+`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
+This is a dictionary from an encoding name to a function that takes no arguments and returns
+arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
+`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
+
+Your `setup.py` should look something like this:
+```python
+from setuptools import setup, find_namespace_packages
+
+setup(
+    name="my_tiktoken_extension",
+    packages=find_namespace_packages(include=['tiktoken_ext*']),
+    install_requires=["tiktoken"],
+    ...
+)
+```
+
+Then simply `pip install ./my_tiktoken_extension` and you should be able to use your
+custom encodings! Make sure **not** to use an editable install.
+
diff --git a/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/RECORD b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/RECORD
new file mode 100644
index 00000000..741177bc
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/RECORD
@@ -0,0 +1,22 @@
+tiktoken-0.8.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
+tiktoken-0.8.0.dist-info/LICENSE,sha256=QYy0mbQ2Eo1lPXmUEzOlQ3t74uqSE9zC8E0V1dLFHYY,1078
+tiktoken-0.8.0.dist-info/METADATA,sha256=G22RttOE0IfhdQfm3Pk8TJ0dMyd2iI3l5QbctJnIW7w,6628
+tiktoken-0.8.0.dist-info/RECORD,,
+tiktoken-0.8.0.dist-info/WHEEL,sha256=7B4nnId14TToQHuAKpxbDLCJbNciqBsV-mvXE2hVLJc,151
+tiktoken-0.8.0.dist-info/top_level.txt,sha256=54G5MceQnuD7EXvp7jzGxDDapA1iOwsh77jhCN9WKkc,22
+tiktoken/__init__.py,sha256=AYmbR4RBsDfEtKozDMVL4Uc-53jEQKOJE0poPhff_Ec,345
+tiktoken/__pycache__/__init__.cpython-312.pyc,,
+tiktoken/__pycache__/_educational.cpython-312.pyc,,
+tiktoken/__pycache__/core.cpython-312.pyc,,
+tiktoken/__pycache__/load.cpython-312.pyc,,
+tiktoken/__pycache__/model.cpython-312.pyc,,
+tiktoken/__pycache__/registry.cpython-312.pyc,,
+tiktoken/_educational.py,sha256=TUFOp8Q91WjrTvGKhCNEyrhtva82UlenXfhPy9zS7VQ,8229
+tiktoken/_tiktoken.cpython-312-x86_64-linux-gnu.so,sha256=eAaknWTo8KzK8xN0WuMsEVee9rtNyY6o5NfKJVMFTWo,3427456
+tiktoken/core.py,sha256=6rZvV6hDe_y6QP5RU-Mn-Awx7QMXqesjM7_Dq8uB01Y,15959
+tiktoken/load.py,sha256=kb0HIT25tDYGE_iM7BVdTYO3g-G5QdFUjKbplGI32-Y,5305
+tiktoken/model.py,sha256=TBiFm7qHmnMA0Q86t1PgZojTigG9_JDzzCcew598PBI,3705
+tiktoken/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tiktoken/registry.py,sha256=7fktZbJ1Kcm8sVyWgEfIy-ZxfUvcXupLUNXKPfSGwQU,3256
+tiktoken_ext/__pycache__/openai_public.cpython-312.pyc,,
+tiktoken_ext/openai_public.py,sha256=lGdukqVfxy0fkLYQgspecfBxa4DRYH7pACd3r0QcRa8,4624
diff --git a/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/WHEEL b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/WHEEL
new file mode 100644
index 00000000..3e811828
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/WHEEL
@@ -0,0 +1,6 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.1.0)
+Root-Is-Purelib: false
+Tag: cp312-cp312-manylinux_2_17_x86_64
+Tag: cp312-cp312-manylinux2014_x86_64
+
diff --git a/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/top_level.txt b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/top_level.txt
new file mode 100644
index 00000000..859880ea
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tiktoken-0.8.0.dist-info/top_level.txt
@@ -0,0 +1,2 @@
+tiktoken
+tiktoken_ext
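The METADATA above shows the project layout and `setup.py` for a `tiktoken_ext` plugin, but not the plugin module itself. The sketch below is a hypothetical `my_encodings.py`, written only from the description in that README: it assumes the constructor returns keyword arguments for `tiktoken.Encoding`, and the encoding name, token IDs, and function name are illustrative. The real registered encodings live in `tiktoken_ext/openai_public.py`, listed in the RECORD above.

```python
# tiktoken_ext/my_encodings.py -- hypothetical plugin module (names are illustrative).
import tiktoken


def cl100k_im():
    # Reuse the public cl100k_base tables and add two chat special tokens,
    # mirroring the Encoding example in the METADATA README above.
    base = tiktoken.get_encoding("cl100k_base")
    return {
        "name": "cl100k_im",
        "pat_str": base._pat_str,
        "mergeable_ranks": base._mergeable_ranks,
        "special_tokens": {
            **base._special_tokens,
            "<|im_start|>": 100264,
            "<|im_end|>": 100265,
        },
    }


# Mapping from encoding name to a zero-argument constructor returning the
# arguments used to build that Encoding; tiktoken.get_encoding discovers it
# through the tiktoken_ext namespace package.
ENCODING_CONSTRUCTORS = {"cl100k_im": cl100k_im}
```

After a regular (non-editable) `pip install ./my_tiktoken_extension`, as the README above notes, `tiktoken.get_encoding("cl100k_im")` should then resolve to this encoding.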