Spaces:

yhavinga
/

dutch-tokenizer-arena

Running

App Files Files Community

dutch-tokenizer-arena / vocab /gpt_nexo_20b /test_tokenizer.py

xu-song

update

751936e over 1 year ago

raw

history blame

1.94 kB

	"""
	最简单的tokenizer
	"""

	import json
	from tokenizers import Tokenizer

	tokenizer = Tokenizer.from_file("20B_tokenizer.json")

	print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
	print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False))

	vocab = tokenizer.get_vocab()


	def test_single_token():
	"""
	单个字符的编码（一个字符可能会编码成多个id）
	"""
	for word in "发大厦三分赛中国解决方法黑白侗鸩，。！？；":
	encoding = tokenizer.encode(word)
	for token_id in encoding.ids:
	decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �，对应 "\ufffd"
	token = tokenizer.id_to_token(token_id)
	print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))


	def test_long_token():
	"""


	"""
	words = [
	"//----------------------------------------------------------------", # 代码里有
	"--------------------------",
	"-------------------------",
	"-----------------------",
	]
	for word in words:
	encoding = tokenizer.encode(word)
	for token_id in encoding.ids:
	decode_str = tokenizer.decode([token_id]) #
	token = tokenizer.id_to_token(token_id)
	print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))


	def test_encode():
	text = "中国解决方法黑白侗鸩，。！？；一个人去哪里一个"
	encoding = tokenizer.encode(text)
	for token_id in encoding.ids:
	decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �，对应 "\ufffd"
	token = tokenizer.id_to_token(token_id)
	print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))


	test_single_token()
	# test_long_token()
	# test_encode()