| author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
|---|---|---|
| committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
| commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
| tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/tiktoken_ext/openai_public.py | |
| parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
Diffstat (limited to '.venv/lib/python3.12/site-packages/tiktoken_ext/openai_public.py')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | .venv/lib/python3.12/site-packages/tiktoken_ext/openai_public.py | 130 |

1 file changed, 130 insertions, 0 deletions
```diff
diff --git a/.venv/lib/python3.12/site-packages/tiktoken_ext/openai_public.py b/.venv/lib/python3.12/site-packages/tiktoken_ext/openai_public.py
new file mode 100644
index 00000000..f2599f16
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tiktoken_ext/openai_public.py
@@ -0,0 +1,130 @@
+from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
+
+ENDOFTEXT = "<|endoftext|>"
+FIM_PREFIX = "<|fim_prefix|>"
+FIM_MIDDLE = "<|fim_middle|>"
+FIM_SUFFIX = "<|fim_suffix|>"
+ENDOFPROMPT = "<|endofprompt|>"
+
+# The pattern in the original GPT-2 release is:
+# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+# This is equivalent, but executes faster:
+r50k_pat_str = (
+    r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
+)
+
+
+def gpt2():
+    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
+        vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
+        encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
+        vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
+        encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
+    )
+    return {
+        "name": "gpt2",
+        "explicit_n_vocab": 50257,
+        "pat_str": r50k_pat_str,
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": {ENDOFTEXT: 50256},
+    }
+
+
+def r50k_base():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
+        expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
+    )
+    return {
+        "name": "r50k_base",
+        "explicit_n_vocab": 50257,
+        "pat_str": r50k_pat_str,
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": {ENDOFTEXT: 50256},
+    }
+
+
+def p50k_base():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
+        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
+    )
+    return {
+        "name": "p50k_base",
+        "explicit_n_vocab": 50281,
+        "pat_str": r50k_pat_str,
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": {ENDOFTEXT: 50256},
+    }
+
+
+def p50k_edit():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
+        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
+    )
+    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
+    return {
+        "name": "p50k_edit",
+        "pat_str": r50k_pat_str,
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+
+def cl100k_base():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
+        expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
+    )
+    special_tokens = {
+        ENDOFTEXT: 100257,
+        FIM_PREFIX: 100258,
+        FIM_MIDDLE: 100259,
+        FIM_SUFFIX: 100260,
+        ENDOFPROMPT: 100276,
+    }
+    return {
+        "name": "cl100k_base",
+        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s""",
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+
+def o200k_base():
+    mergeable_ranks = load_tiktoken_bpe(
+        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
+        expected_hash="446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d",
+    )
+    special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}
+    # This regex could be made more efficient. If I was the one working on this encoding, I would
+    # have done a few other things differently too, e.g. I think you can allocate tokens more
+    # efficiently across languages.
+    pat_str = "|".join(
+        [
+            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+            r"""\p{N}{1,3}""",
+            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
+            r"""\s*[\r\n]+""",
+            r"""\s+(?!\S)""",
+            r"""\s+""",
+        ]
+    )
+    return {
+        "name": "o200k_base",
+        "pat_str": pat_str,
+        "mergeable_ranks": mergeable_ranks,
+        "special_tokens": special_tokens,
+    }
+
+
+ENCODING_CONSTRUCTORS = {
+    "gpt2": gpt2,
+    "r50k_base": r50k_base,
+    "p50k_base": p50k_base,
+    "p50k_edit": p50k_edit,
+    "cl100k_base": cl100k_base,
+    "o200k_base": o200k_base,
+}
```
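For context on how this module is consumed: tiktoken discovers it through the `tiktoken_ext` namespace package and lazily calls the matching constructor from `ENCODING_CONSTRUCTORS` the first time an encoding name is requested; on first use it also downloads the vocabulary file from the blob-storage URL, checks it against `expected_hash`, and caches it locally. A minimal usage sketch (assuming the `tiktoken` package is importable):

```python
# Minimal usage sketch: get_encoding("cl100k_base") looks the name up in
# the ENCODING_CONSTRUCTORS dicts exposed by tiktoken_ext modules, calls
# cl100k_base() above, and builds an Encoding from the returned dict.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
tokens = enc.encode("hello world")
assert enc.decode(tokens) == "hello world"

# Special tokens are rejected by default; opt in explicitly per call:
ids = enc.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})
assert ids == [100257]  # the ENDOFTEXT rank in cl100k_base's special_tokens
```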
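The `pat_str` regexes matter because they pre-segment text before byte-pair merging: BPE merges run inside each chunk and never across chunk boundaries, which is what keeps contractions and leading spaces tokenized consistently. The patterns use `\p{...}` classes and possessive quantifiers (`++`), so in Python they need the third-party `regex` module rather than stdlib `re` (tiktoken itself evaluates them in its Rust core). An illustrative sketch of how `r50k_pat_str` splits text:

```python
# Illustrative only: shows the chunks r50k_pat_str produces before BPE runs.
# Requires the third-party `regex` module; stdlib `re` does not support
# \p{L}/\p{N} classes or possessive quantifiers such as ++.
import regex

r50k_pat_str = (
    r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s"""
)
print(regex.findall(r50k_pat_str, "I'm testing GPT-2's tokenizer!"))
# ['I', "'m", ' testing', ' GPT', '-', '2', "'s", ' tokenizer', '!']
```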
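Because `tiktoken_ext` is a namespace package, the same mechanism is open to third parties: shipping a module under `tiktoken_ext` that exposes its own `ENCODING_CONSTRUCTORS` dict registers extra encodings with tiktoken. A hypothetical sketch mirroring this file's structure (the name `my_encoding` and the vocabulary URL are placeholders, not real assets):

```python
# Hypothetical plugin: tiktoken_ext/my_encodings.py in your own package
# (tiktoken_ext must remain a namespace package, i.e. no __init__.py).
# After installation, tiktoken.get_encoding("my_encoding") can find this.
from tiktoken.load import load_tiktoken_bpe

def my_encoding():
    # The URL is a placeholder; expected_hash is optional but recommended.
    mergeable_ranks = load_tiktoken_bpe("https://example.com/my_encoding.tiktoken")
    return {
        "name": "my_encoding",
        # Reusing the r50k split pattern from openai_public.py for simplicity.
        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s""",
        "mergeable_ranks": mergeable_ranks,
        # Place special tokens just past the last mergeable rank.
        "special_tokens": {"<|endoftext|>": len(mergeable_ranks)},
    }

ENCODING_CONSTRUCTORS = {"my_encoding": my_encoding}
```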