add more tokenizers
- vocab/__init__.py +1 -0
- vocab/qwen_1_8b_chat/__init__.py +25 -0
vocab/__init__.py
CHANGED
@@ -102,6 +102,7 @@ all_tokenizers = [
     # "goat",

     # tiktoken series
+    "qwen_1_8b_chat",
     "qwen_7b_chat",
     "qwen_72b_chat",

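For context on how an entry in all_tokenizers is consumed: a minimal sketch of a name-keyed loader, assuming each entry names a package under vocab/ whose __init__.py exposes a module-level `tokenizer` object. The importlib lookup below is illustrative only, not the repo's actual loading code.

# Minimal sketch, assuming each name in all_tokenizers maps to a package
# vocab/<name>/ whose __init__.py defines a module-level `tokenizer`.
# The repo's real loader may differ; this is illustrative only.
import importlib

def load_tokenizer(name):
    """Import vocab.<name> and return its module-level tokenizer object."""
    module = importlib.import_module(f"vocab.{name}")
    return module.tokenizer

# Example: qwen_tok = load_tokenizer("qwen_1_8b_chat")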
vocab/qwen_1_8b_chat/__init__.py
ADDED
@@ -0,0 +1,25 @@
+"""
+Requires torch and tiktoken.
+Requires transformers 4.31.0 or later.
+
+https://huggingface.co/tangger/Qwen-7B-Chat  (the official Qwen model was temporarily taken down; this is a backup)
+
+https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
+"""
+
+import os
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-1_8B-Chat", trust_remote_code=True)
+
+tokenizer.comments = ""
+
+
+def test():
+    encoding = tokenizer.encode("测试华为手机10086        8个空格")
+    for token_id in encoding:
+        token = tokenizer.convert_ids_to_tokens([token_id])[0].decode("utf-8")
+        print(token_id, ":", token)
+
+if __name__ == "__main__":
+    test()