add more tokenizers
- vocab/__init__.py +1 -0
- vocab/qwen_1_8b_chat/__init__.py +25 -0
vocab/__init__.py
CHANGED
@@ -102,6 +102,7 @@ all_tokenizers = [
     # "goat",

     # tiktoken series
+    "qwen_1_8b_chat",
     "qwen_7b_chat",
     "qwen_72b_chat",

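For context on how an entry in all_tokenizers is consumed: a minimal sketch of a name-keyed loader, assuming each entry names a package under vocab/ whose __init__.py exposes a module-level `tokenizer` object. The importlib lookup below is illustrative only, not the repo's actual loading code.

# Minimal sketch, assuming each name in all_tokenizers maps to a package
# vocab/<name>/ whose __init__.py defines a module-level `tokenizer`.
# The repo's real loader may differ; this is illustrative only.
import importlib

def load_tokenizer(name):
    """Import vocab.<name> and return its module-level tokenizer object."""
    module = importlib.import_module(f"vocab.{name}")
    return module.tokenizer

# Example: qwen_tok = load_tokenizer("qwen_1_8b_chat")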
vocab/qwen_1_8b_chat/__init__.py
ADDED
@@ -0,0 +1,25 @@
+"""
+Requires torch and tiktoken.
+Requires transformers 4.31.0 or later.
+
+https://huggingface.co/tangger/Qwen-7B-Chat  (the official Qwen model was temporarily taken down; this is a backup)
+
+https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
+"""
+
+import os
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-1_8B-Chat", trust_remote_code=True)
+
+tokenizer.comments = ""
+
+
+def test():
+    encoding = tokenizer.encode("测试华为手机10086        8个空格")
+    for token_id in encoding:
+        token = tokenizer.convert_ids_to_tokens([token_id])[0].decode("utf-8")
+        print(token_id, ":", token)
+
+if __name__ == "__main__":
+    test()