Spaces:

xu-song
/

tokenizer-arena

Running

File size: 2,365 Bytes

# Example inputs grouped by language code ("en", "zh").
# Each row is a three-item list: [text, name, name]; the two names are
# presumably the tokenizers to compare on that text — TODO confirm against the caller.
examples = {
    "en": [
        ["spaces:  2spaces        8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm has blank_n
        # Reference list of further punctuation characters (not used in any example row):
        # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
        ["punctuation: ,.:/?+=\"，。！？；【】〔〕〖〗", "baichuan", "llama"],
        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
        ["number: (10086 + 98) = 100184", "baichuan", "llama"]
    ]
    ,
    "zh": [
        ["空格测试：  2个空格        8个空格", "llama", "chatglm2_6b"],  # chatglm has blank_n
        ["标点测试：，。！？；", "baichuan_7b", "llama"],
        ["符号测试：🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
        ["数字测试：(10086 + 98) = 100184", "baichuan_7b", "llama"],
        ["中文简体：宽带，繁体：樂來", "baichuan_7b", "llama"],
    ]

}

# Extra comparison presets as (tokenizer1, tokenizer2, text) triples.
# get_more_example() unpacks each triple in this order to build a URL.
more_examples = [
    # bert VS clue
    # bert family
    ("bert_base_cased", "bert_base_uncased", ""),  # clue VS kplug, bert VS clue

    # llama family (based on sentencepiece)
    ("baichuan", "baichuan2", "baichuan2支持多空格   ，多个换行\n\n\n，do not add dummy prefix as Baichuan1"),
    ("llama", "baichuan2", "baichuan2支持多空格   ，多个换行\n\n"),
    ("llama", "chinese_llama2", ""),
    ("chinese_llama", "chinese_llama2", ""),

    # glm family (based on sentencepiece)
    ("glm", "chatglm1", ""),
    ("chatglm1", "chatglm2", ""),

    # gpt2 family
    ("gpt2", "moss", ""),
    # NOTE(review): empty placeholder row — it yields a URL whose three
    # query parameters are all blank; confirm this is intended.
    ("", "", ""),

    # openai family (tiktoken)
    ("qwen", "gpt_35_turbo", ""),


]


def example_fn(example_idx):
    """Look up one English example row by position.

    Args:
        example_idx: Index into the ``"en"`` list of the module-level
            ``examples`` mapping.

    Returns:
        The row stored at that position (the same list object held in
        ``examples``, not a copy).

    Raises:
        IndexError: If ``example_idx`` is outside the ``"en"`` list.
    """
    english_rows = examples["en"]
    return english_rows[example_idx]


def get_more_example():
    """Print one tokenizer-arena URL for each entry in ``more_examples``.

    Every ``(tokenizer1, tokenizer2, text)`` triple becomes a query string
    appended to the Space URL, and the resulting link is written to stdout.

    All three values are percent-encoded via ``urllib.parse.urlencode`` so
    that reserved characters such as ``&``, ``=``, ``#`` or ``/`` cannot
    corrupt the query string. Spaces are encoded as ``%20`` (not ``+``).

    Returns:
        None. The URLs are only printed.
    """
    # Local import: the module has no top-level import section.
    import urllib.parse

    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
    for tokenizer1, tokenizer2, text in more_examples:
        # quote_via=quote keeps "%20" for spaces; urlencode's default
        # (quote_plus) would emit "+" instead.
        query = urllib.parse.urlencode(
            {"tokenizer1": tokenizer1, "tokenizer2": tokenizer2, "text": text},
            quote_via=urllib.parse.quote,
        )
        print(f"{url_prefix}?{query}")


# When this file is executed as a script (not imported), print the example URLs.
if __name__ == "__main__":
    get_more_example()