Spaces:

xu-song
/

tokenizer-arena

Running

App Files Files Community

tokenizer-arena / playground_examples.py

xu-song

update

6ef6bf4 about 2 months ago

raw

history blame contribute delete

4.16 kB

	"""

	## characters

	- alphanumeric characters
	- numeric characters
	- special characters: A special character is a character that is not an alphabetic or numeric character.
	- ASCII control characters
	- punctuation marks
	- accent marks
	- mathematical symbols
	- whitespace:
	- https://en.wikipedia.org/wiki/Whitespace_character
	- https://emptycharacter.com/


	https://www.computerhope.com/jargon/s/specchar.htm
	"""
	import random
	from datasets import load_dataset

	# Sample text mixing English, Spanish, Chinese and Japanese.
	# NOTE(review): presumably the initial content of the playground's input
	# field — confirm against the UI code that imports it.
	# The backslash right after the opening quotes keeps the string from
	# starting with a newline.
	default_user_input = """\
	Replace this text in the input field to see how tokenization works.
	Buenos días!
	华为发布Mate60手机。
	ラグビーワールドカップ2023フランス"""
	# Tokenizer identifiers in "org/name" form.
	# NOTE(review): presumably the pair preselected for comparison — confirm
	# against the caller.
	# default_tokenizer_name_1 = "Meta/llama3"
	default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
	default_tokenizer_name_2 = "openai/gpt-4o"



	def get_sample_input():
	    """Return the default sample text for each language code.

	    Seeds the global ``random`` generator with 10 and loads the ``train``
	    split of ``eson/cc100-samples`` for every language, printing each
	    dataset. The loaded datasets are not used to fill in the samples: the
	    returned mapping is the hard-coded one below, in which only ``"en"``
	    has text.

	    Returns:
	        dict: language code -> sample text (empty string when no sample
	        is defined).
	    """
	    default_inputs = {
	        "en": "Replace this text in the input field to see how tokenization works.",
	        "zh-Hans": "",
	        "es": "",
	        "de": "",
	    }
	    random.seed(10)  # For reproducibility
	    # `lang_code` (not `lang`) so the loop variable does not reuse the name
	    # of the module-level `lang` setting.
	    for lang_code in default_inputs:
	        dataset = load_dataset("eson/cc100-samples", lang_code, split="train")
	        print(dataset)
	    return default_inputs


	# Playground examples keyed by language code. Each row is
	# [input text, tokenizer 1, tokenizer 2]; for the "en" rows the text before
	# the first ":" is the example's category label.
	examples = {
	    "en": [
	        ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],
	        # The label after each whitespace run states how many characters the
	        # run contains (2 spaces, 8 spaces, 1 tab, 2 tabs, 1 newline).
	        # chatglm has blank_n tokens; bert drops the spaces.
	        ["whitespace:  2spaces        8spaces\t1tab\t\t2tab\n1newline", "huggyllama/llama-7b", "google-bert/bert-base-cased"],
	        # More punctuation to try:
	        # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
	        # llama's vocabulary is rather small.
	        ["punctuation: ,.:/?+=\"，。！？；【】〔〕〖〗", "google/gemma-7b", "huggyllama/llama-7b"],
	        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan-inc/Baichuan-7B", "huggyllama/llama-7b"],
	        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <\|system\|> <\|user\|> <\|assistant\|> <\|endoftext\|>", "", ""],
	    ],
	    "zh": [
	        # Whitespace test (2 spaces, then 8 spaces); chatglm has blank_n tokens.
	        ["空格测试：  2个空格        8个空格", "llama", "chatglm2_6b"],
	        # Punctuation test.
	        ["标点测试：，。！？；", "baichuan_7b", "llama"],
	        # Symbol test.
	        ["符号测试：🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
	        # Number test.
	        ["数字测试：(10086 + 98) = 100184", "baichuan_7b", "llama"],
	        # Simplified vs. traditional Chinese.
	        ["中文简体：宽带，繁体：樂來", "baichuan_7b", "llama"],
	    ],
	}



	# Tokenizer pairs worth comparing side by side. Every entry has the same
	# shape, (tokenizer1, tokenizer2, text, comment), so consumers can unpack
	# four values from each one; unused fields are empty strings.
	more_examples = [
	    # BERT family
	    ("google-bert/bert-base-cased", "google-bert/bert-base-uncased", "", ""),  # clue VS kplug, bert VS clue
	    ("bert-base-cased", "clue", "", "增加了[]()"),
	    ("roberta-chinese-clue", "kplug", "", ""),

	    # LLaMA family (sentencepiece-based)
	    ("baichuan", "baichuan2", "baichuan2支持多空格，多个换行\n\n\n，do not add dummy prefix as Baichuan1", ""),
	    ("llama", "baichuan2", "baichuan2支持多空格，多个换行\n\n", ""),
	    ("llama", "chinese-llama-2-7b", "", ""),
	    ("llama", "llama3", "扩充词典", ""),
	    ("chinese-llama-lora-7b", "chinese-llama-2-7b", "", ""),

	    # GLM family (sentencepiece-based)
	    ("glm", "chatglm1", "", ""),
	    ("chatglm1", "chatglm2", "", ""),

	    # GPT-2 family
	    ("gpt2", "moss", "", ""),
	    ("", "", "", ""),

	    # OpenAI family (tiktoken)
	    ("qwen", "gpt_35_turbo", "", ""),
	]

	# Language code used to select rows from `examples`.
	lang = "en"

	# Category label of every example for `lang`: the text that precedes the
	# first ":" of the example string (e.g. "number", "whitespace").
	example_types = [row[0].partition(":")[0] for row in examples[lang]]


	def example_fn(example_idx):
	    """Return the example row at ``example_idx`` for the module-level ``lang``.

	    Args:
	        example_idx: Index into ``examples[lang]``.

	    Returns:
	        The stored row, i.e. ``[text, tokenizer 1, tokenizer 2]``.
	    """
	    rows_for_lang = examples[lang]
	    return rows_for_lang[example_idx]


	def get_more_example(entries=None):
	    """Print one tokenizer-arena comparison URL per example entry.

	    Args:
	        entries: Optional iterable of ``(tokenizer1, tokenizer2, text)`` or
	            ``(tokenizer1, tokenizer2, text, comment)`` tuples. Defaults to
	            the module-level ``more_examples``.

	    Returns:
	        None. Each URL is written to stdout on its own line.
	    """
	    import urllib.parse

	    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
	    if entries is None:
	        entries = more_examples
	    for entry in entries:
	        # Only the first three fields end up in the URL. Slicing accepts
	        # entries with or without the trailing comment, where a fixed
	        # four-name unpack raises ValueError on the shorter ones.
	        tokenizer1, tokenizer2, text = entry[:3]
	        full_url = f'{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}'
	        print(full_url)


	if __name__ == "__main__":
	    # Run as a script: print the comparison URLs.
	    get_more_example()