# Spaces: xu-song / tokenizer-arena (Running) -- File size: 1,028 Bytes

"""
https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

https://github.com/openai/tiktoken

Vocabulary path: https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py

"""

import json
import tiktoken


# Look up the encoding tiktoken associates with the gpt-3.5-turbo model.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Encode a mixed ASCII/Chinese sample, then recover the byte string behind
# each resulting token id, and show both (one per line).
encoding = tokenizer.encode("a bcjik今天天气颗粒剂范大将军发卡卡萨")
decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
print(encoding, decoding_bytes, sep="\n")

# Per-token inspection, left disabled:
# for token in encoding:
#     token_str = tokenizer.decode([token])
#     print(token, token_str, json.dumps(token_str))


# Exercise the three byte-level decoding entry points on token id 10.
# The return values are intentionally discarded.
tokenizer.decode_tokens_bytes([10])
tokenizer.decode_single_token_bytes(10)
tokenizer.decode_bytes([10])

# Dump the tokenizer's vocabulary to vocab.jsonl, one JSON object per line:
#   {"id": <token id>, "token": <JSON-encoded decoded text, or "null">}
# The context manager guarantees the file is flushed and closed even if the
# loop fails part-way through; a bare open() would leave the handle dangling.
with open("vocab.jsonl", "w", encoding="utf-8") as f_out:
    # 100255  (NOTE(review): unexplained in the original; presumably a
    # token-id bound for this encoding -- confirm.)
    for i in range(tokenizer.n_vocab):
        # Byte-level alternatives to decode(): decode_bytes,
        # decode_single_token_bytes.
        try:
            token_str = tokenizer.decode([i])
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user interrupt or interpreter shutdown.
            raise
        except BaseException:
            # Best effort: ids that cannot be decoded are recorded as null.
            # NOTE(review): deliberately broader than `Exception` -- some
            # tiktoken builds may report an unknown id through a Rust panic,
            # which surfaces as a BaseException subclass; confirm before
            # narrowing this handler.
            token_str = None
        # NOTE(review): "token" is JSON-encoded twice (inner dumps, then the
        # outer record). Preserved so the output format stays unchanged.
        f_out.write(json.dumps({"id": i, "token": json.dumps(token_str)}) + "\n")