path: root/.venv/lib/python3.12/site-packages/tiktoken/model.py
Diffstat (limited to '.venv/lib/python3.12/site-packages/tiktoken/model.py')
-rw-r--r--  .venv/lib/python3.12/site-packages/tiktoken/model.py | 105
1 file changed, 105 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tiktoken/model.py b/.venv/lib/python3.12/site-packages/tiktoken/model.py
new file mode 100644
index 00000000..681b9131
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tiktoken/model.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+from .core import Encoding
+from .registry import get_encoding
+
+# TODO: these will likely be replaced by an API endpoint
+MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
+ "o1-": "o200k_base",
+ # chat
+ "chatgpt-4o-": "o200k_base",
+ "gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13
+ "gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
+ "gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
+ "gpt-35-turbo-": "cl100k_base", # Azure deployment name
+ # fine-tuned
+ "ft:gpt-4": "cl100k_base",
+ "ft:gpt-3.5-turbo": "cl100k_base",
+ "ft:davinci-002": "cl100k_base",
+ "ft:babbage-002": "cl100k_base",
+}
+
+MODEL_TO_ENCODING: dict[str, str] = {
+ # chat
+ "gpt-4o": "o200k_base",
+ "gpt-4": "cl100k_base",
+ "gpt-3.5-turbo": "cl100k_base",
+ "gpt-3.5": "cl100k_base", # Common shorthand
+ "gpt-35-turbo": "cl100k_base", # Azure deployment name
+ # base
+ "davinci-002": "cl100k_base",
+ "babbage-002": "cl100k_base",
+ # embeddings
+ "text-embedding-ada-002": "cl100k_base",
+ "text-embedding-3-small": "cl100k_base",
+ "text-embedding-3-large": "cl100k_base",
+ # DEPRECATED MODELS
+ # text (DEPRECATED)
+ "text-davinci-003": "p50k_base",
+ "text-davinci-002": "p50k_base",
+ "text-davinci-001": "r50k_base",
+ "text-curie-001": "r50k_base",
+ "text-babbage-001": "r50k_base",
+ "text-ada-001": "r50k_base",
+ "davinci": "r50k_base",
+ "curie": "r50k_base",
+ "babbage": "r50k_base",
+ "ada": "r50k_base",
+ # code (DEPRECATED)
+ "code-davinci-002": "p50k_base",
+ "code-davinci-001": "p50k_base",
+ "code-cushman-002": "p50k_base",
+ "code-cushman-001": "p50k_base",
+ "davinci-codex": "p50k_base",
+ "cushman-codex": "p50k_base",
+ # edit (DEPRECATED)
+ "text-davinci-edit-001": "p50k_edit",
+ "code-davinci-edit-001": "p50k_edit",
+ # old embeddings (DEPRECATED)
+ "text-similarity-davinci-001": "r50k_base",
+ "text-similarity-curie-001": "r50k_base",
+ "text-similarity-babbage-001": "r50k_base",
+ "text-similarity-ada-001": "r50k_base",
+ "text-search-davinci-doc-001": "r50k_base",
+ "text-search-curie-doc-001": "r50k_base",
+ "text-search-babbage-doc-001": "r50k_base",
+ "text-search-ada-doc-001": "r50k_base",
+ "code-search-babbage-code-001": "r50k_base",
+ "code-search-ada-code-001": "r50k_base",
+ # open source
+ "gpt2": "gpt2",
+ "gpt-2": "gpt2", # Maintains consistency with gpt-4
+}
+
+
+def encoding_name_for_model(model_name: str) -> str:
+ """Returns the name of the encoding used by a model.
+
+ Raises a KeyError if the model name is not recognised.
+ """
+ encoding_name = None
+ if model_name in MODEL_TO_ENCODING:
+ encoding_name = MODEL_TO_ENCODING[model_name]
+ else:
+ # Check if the model matches a known prefix
+ # Prefix matching avoids needing library updates for every model version release
+ # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
+ for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
+ if model_name.startswith(model_prefix):
+ return model_encoding_name
+
+ if encoding_name is None:
+ raise KeyError(
+ f"Could not automatically map {model_name} to a tokeniser. "
+ "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
+ ) from None
+
+ return encoding_name
+
+
+def encoding_for_model(model_name: str) -> Encoding:
+ """Returns the encoding used by a model.
+
+ Raises a KeyError if the model name is not recognised.
+ """
+ return get_encoding(encoding_name_for_model(model_name))
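
For reference, a minimal usage sketch of the two helpers this file introduces (illustrative, not part of the diff): the unknown model name below is hypothetical, the fallback encoding is just an example, and the encoding_for_model call assumes the BPE data for the resolved encoding can be loaded by get_encoding (downloaded or read from cache).

from tiktoken import get_encoding
from tiktoken.model import encoding_for_model, encoding_name_for_model

# Exact match against MODEL_TO_ENCODING.
print(encoding_name_for_model("gpt-4"))              # -> "cl100k_base"

# Prefix match against MODEL_PREFIX_TO_ENCODING (covers dated snapshots).
print(encoding_name_for_model("gpt-4o-2024-05-13"))  # -> "o200k_base"

# Unrecognised names raise KeyError; fall back to an explicit encoding.
try:
    enc = encoding_for_model("some-unknown-model")   # hypothetical name
except KeyError:
    enc = get_encoding("cl100k_base")

print(enc.encode("hello world"))  # a short list of integer token ids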