planRun / some_base_method /tiktoken_count_tokens.py
sanbo
update sth. at 2025-01-04 18:31:02
68515e1
raw
history blame
1.97 kB
"""
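o200k_base: used by the GPT-4o family of models (e.g. gpt-4o, gpt-4o-mini).
cl100k_base: used by gpt-4, gpt-3.5-turbo, text-embedding-ada-002 and similar models.
r50k_base (or gpt2): used by GPT-3 models such as davinci.
p50k_base: used by Codex models, text-davinci-002, text-davinci-003 and similar models.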
"""
import tiktoken
def num_tokens_from_string(string: str, encoding_name='cl100k_base') -> int:
    """Return how many tokens ``string`` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``;
            defaults to ``'cl100k_base'``.

    Returns:
        The length of the token sequence produced by the encoding's
        ``encode`` method.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    token_ids = tokenizer.encode(string)
    return len(token_ids)
def compare_encodings(string: str, encodings: list):
    """Print the token count of ``string`` under each named encoding.

    One line is printed per encoding, showing the encoding name, the number
    of tokens ``string`` encodes to, and the UTF-8 byte size of ``string``.

    Args:
        string: Text to tokenize.
        encodings: Encoding names accepted by ``tiktoken.get_encoding``.

    Returns:
        None. Results are written to standard output.
    """
    # The byte size belongs to the text, not to the tokenizer, so it is
    # computed once here. Using len(enc.encode_ordinary(string)) for this
    # field would only report a second token count under a "Byte Size" label.
    byte_size = len(string.encode("utf-8"))
    for enc_name in encodings:
        enc = tiktoken.get_encoding(enc_name)
        token_count = len(enc.encode(string))
        print(f"Encoding: {enc_name}, Token Count: {token_count}, Byte Size: {byte_size}")
def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Count the tokens used by a list of chat messages.

    The encoding is looked up with ``tiktoken.encoding_for_model``; when that
    raises ``KeyError`` a warning is printed and ``cl100k_base`` is used.
    Each message costs a fixed 3 tokens plus the encoded length of every
    value it holds, and a ``"name"`` key costs 1 additional token. Nothing
    is added beyond the per-message totals.

    Args:
        messages: Iterable of mappings whose values are passed to the
            encoding's ``encode`` method.
        model: Model name used to select the encoding.

    Returns:
        The summed token count as an int.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        tokenizer = tiktoken.get_encoding("cl100k_base")

    per_message_overhead = 3
    per_name_overhead = 1

    total = 0
    for msg in messages:
        total += per_message_overhead
        for field, text in msg.items():
            field_cost = len(tokenizer.encode(text))
            if field == "name":
                # A "name" entry carries one token on top of its own text.
                field_cost += per_name_overhead
            total += field_cost
    return total
if __name__ == '__main__':
    # Demo 1: token counts for short strings with the default encoding.
    # The original author's notes list the printed values as 6 and 8.
    for sample in ('tiktoken is great!', '大模型是什么?'):
        print(num_tokens_from_string(sample))

    # Demo 2: the same text measured under several encodings.
    demo_encodings = ["cl100k_base", "p50k_base", "r50k_base"]
    compare_encodings("这是一个示例文本", demo_encodings)

    # Demo 3: token count for a short chat transcript.
    # The original author's note lists the printed value as 28.
    messages = [
        dict(role="system", content="You are a helpful assistant."),
        dict(role="user", content="Hello!"),
        dict(role="assistant", content="Hello! How can I help you?"),
    ]
    print(num_tokens_from_messages(messages))