# Spaces: Running / Running  (page-status banner captured with this file; not part of the program)
# Sample inputs grouped by language code ("en", "zh").
# Each row is [text, name, name]; the two names look like tokenizer identifiers
# (presumably the pair to compare on that text — confirm against the caller).
# NOTE(review): "en" rows use "baichuan" while "zh" rows use "baichuan_7b";
# confirm which identifier is valid.
# NOTE(review): the labels "2spaces"/"8spaces" suggest runs of several spaces,
# but the literals here contain single spaces — verify against upstream.
# NOTE(review): 10086 + 98 is 10184, not 100184; the sample text is kept
# verbatim because it is only tokenizer input.
examples = {
    "en": [
        ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm has blank_n
        # Punctuation pool for reference:
        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
        ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
        ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
        ["digits: (10086 + 98) = 100184", "baichuan", "llama"],
    ],
    "zh": [
        ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],  # chatglm has blank_n
        ["标点测试:,。!?;", "baichuan_7b", "llama"],
        ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
    ],
}
# Extra comparison presets as (tokenizer1, tokenizer2, text) tuples; this is
# the order in which get_more_example() unpacks them.
more_examples = [
    # bert VS clue
    # BERT family
    ("bert_base_cased", "bert_base_uncased", ""),  # clue VS kplug, bert VS clue
    # LLaMA family (sentencepiece-based)
    ("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
    ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
    ("llama", "chinese_llama2", ""),
    ("chinese_llama", "chinese_llama2", ""),
    # GLM family (sentencepiece-based)
    ("glm", "chatglm1", ""),
    ("chatglm1", "chatglm2", ""),
    # GPT-2 family
    ("gpt2", "moss", ""),
    ("", "", ""),  # empty entry: produces a URL with blank parameters
    # OpenAI family (tiktoken)
    ("qwen", "gpt_35_turbo", ""),
]
def example_fn(example_idx):
    """Return the English example row at position ``example_idx``.

    The row is looked up in ``examples["en"]``; an out-of-range index raises
    ``IndexError`` from the underlying list.
    """
    return examples["en"][example_idx]
def get_more_example():
    """Print one comparison URL for every entry of ``more_examples``.

    Each URL targets the tokenizer-arena page and passes the two tokenizer
    names and the sample text as query parameters. Nothing is returned.
    """
    import urllib.parse

    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
    for tokenizer1, tokenizer2, text in more_examples:
        # Percent-encode every value, not just the text, so a name with
        # reserved characters cannot corrupt the query string. Plain
        # identifiers are left unchanged by quote().
        query = "&".join(
            f"{key}={urllib.parse.quote(value)}"
            for key, value in (
                ("tokenizer1", tokenizer1),
                ("tokenizer2", tokenizer2),
                ("text", text),
            )
        )
        print(f"{url_prefix}?{query}")
# When executed as a script, print the comparison URLs.
if __name__ == "__main__":
    get_more_example()