# Provenance: added by commit 4a52a71956a8d46fcb7294ac71734504bb09bcc2
# ("two version of R2R are here") at
# .venv/lib/python3.12/site-packages/tiktoken/model.py
from __future__ import annotations

from .core import Encoding
from .registry import get_encoding

# Maps a model-name *prefix* to the name of the encoding it uses.
# Lookup picks the longest matching prefix (see `encoding_name_for_model`),
# so a more specific entry such as "ft:gpt-4o" always wins over "ft:gpt-4"
# regardless of where it sits in this table.
# TODO: these will likely be replaced by an API endpoint
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    "o1-": "o200k_base",
    # chat
    "chatgpt-4o-": "o200k_base",
    "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g., gpt-3.5-turbo-0301, -0401, etc.
    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
    # fine-tuned
    "ft:gpt-4o": "o200k_base",  # must not fall through to "ft:gpt-4"
    "ft:gpt-4": "cl100k_base",
    "ft:gpt-3.5-turbo": "cl100k_base",
    "ft:davinci-002": "cl100k_base",
    "ft:babbage-002": "cl100k_base",
}

# Maps an exact model name to the name of the encoding it uses.
MODEL_TO_ENCODING: dict[str, str] = {
    # reasoning
    "o1": "o200k_base",
    # chat
    "gpt-4o": "o200k_base",
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
    "gpt-3.5": "cl100k_base",  # Common shorthand
    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
    # base
    "davinci-002": "cl100k_base",
    "babbage-002": "cl100k_base",
    # embeddings
    "text-embedding-ada-002": "cl100k_base",
    "text-embedding-3-small": "cl100k_base",
    "text-embedding-3-large": "cl100k_base",
    # DEPRECATED MODELS
    # text (DEPRECATED)
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-001": "r50k_base",
    "text-curie-001": "r50k_base",
    "text-babbage-001": "r50k_base",
    "text-ada-001": "r50k_base",
    "davinci": "r50k_base",
    "curie": "r50k_base",
    "babbage": "r50k_base",
    "ada": "r50k_base",
    # code (DEPRECATED)
    "code-davinci-002": "p50k_base",
    "code-davinci-001": "p50k_base",
    "code-cushman-002": "p50k_base",
    "code-cushman-001": "p50k_base",
    "davinci-codex": "p50k_base",
    "cushman-codex": "p50k_base",
    # edit (DEPRECATED)
    "text-davinci-edit-001": "p50k_edit",
    "code-davinci-edit-001": "p50k_edit",
    # old embeddings (DEPRECATED)
    "text-similarity-davinci-001": "r50k_base",
    "text-similarity-curie-001": "r50k_base",
    "text-similarity-babbage-001": "r50k_base",
    "text-similarity-ada-001": "r50k_base",
    "text-search-davinci-doc-001": "r50k_base",
    "text-search-curie-doc-001": "r50k_base",
    "text-search-babbage-doc-001": "r50k_base",
    "text-search-ada-doc-001": "r50k_base",
    "code-search-babbage-code-001": "r50k_base",
    "code-search-ada-code-001": "r50k_base",
    # open source
    "gpt2": "gpt2",
    "gpt-2": "gpt2",  # Maintains consistency with gpt-4
}


def encoding_name_for_model(model_name: str) -> str:
    """Return the name of the encoding used by a model.

    An exact match in ``MODEL_TO_ENCODING`` takes precedence. Otherwise the
    longest prefix in ``MODEL_PREFIX_TO_ENCODING`` that ``model_name`` starts
    with decides the encoding.

    Args:
        model_name: Model identifier, e.g. ``"gpt-4o"`` or
            ``"gpt-3.5-turbo-0301"``.

    Returns:
        The encoding name, e.g. ``"cl100k_base"``.

    Raises:
        KeyError: If the model name is not recognised.
    """
    exact_match = MODEL_TO_ENCODING.get(model_name)
    if exact_match is not None:
        return exact_match

    # Prefix matching avoids needing library updates for every model version
    # release. Note that this can match on non-existent models
    # (e.g., gpt-3.5-turbo-FAKE).
    matching_prefixes = [
        prefix for prefix in MODEL_PREFIX_TO_ENCODING if model_name.startswith(prefix)
    ]
    if matching_prefixes:
        # Prefer the most specific (longest) prefix so that overlapping
        # entries resolve deterministically, independent of table order.
        return MODEL_PREFIX_TO_ENCODING[max(matching_prefixes, key=len)]

    raise KeyError(
        f"Could not automatically map {model_name} to a tokeniser. "
        "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
    ) from None


def encoding_for_model(model_name: str) -> Encoding:
    """Return the encoding used by a model.

    Resolves the encoding name with ``encoding_name_for_model`` and passes it
    to ``get_encoding``.

    Args:
        model_name: Model identifier, e.g. ``"gpt-4o"``.

    Returns:
        Whatever ``get_encoding`` returns for the resolved encoding name.

    Raises:
        KeyError: If the model name is not recognised.
    """
    return get_encoding(encoding_name_for_model(model_name))