|
""" |
|
依赖 torch tiktoken |
|
依赖 transformer 4.31.0 及以上, |
|
|
|
https://huggingface.co/tangger/Qwen-7B-Chat Qwen官方模型临时下架了,这个是备份 |
|
|
|
https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md |
|
""" |
|
|
|
import os

from transformers import AutoTokenizer

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Qwen-7B-Chat")

# trust_remote_code=True is needed because Qwen ships a custom (tiktoken-based) tokenizer.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
|
|
|
tokenizer.comments = ("Based on the GPT-4 vocabulary: removed 100 multi-digit number tokens, "
                      "added 10,000 Chinese word tokens, and optimized special_token splitting.")
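# Note (based on the comment above and the tokenization note linked in the docstring):
# removing the multi-digit merges means numbers are expected to tokenize digit by digit;
# test() and the check_digit_splitting() sketch below illustrate this.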
|
def test():
    # The sample mixes Chinese text, a multi-digit number, and a run of 8 spaces, so the
    # printout shows how each piece is split; digits are expected to come out one per token.
    encoding = tokenizer.encode("测试华为手机10086        8个空格")
    for token_id in encoding:
        # The Qwen tokenizer returns tokens as raw bytes; decode them for display.
        # errors="replace" guards against tokens that are not complete UTF-8 sequences.
        token = tokenizer.convert_ids_to_tokens([token_id])[0].decode("utf-8", errors="replace")
        print(token_id, ":", token)
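

# Illustrative sketch, not part of the original script: a quick check of the vocabulary
# changes described above, using only the encode/decode calls already exercised in test().
# Assumption: with the multi-digit merges removed, a number such as "10086" encodes to one
# token per digit, and decoding those ids reproduces the plain-ASCII input.
def check_digit_splitting(text="10086"):
    token_ids = tokenizer.encode(text)
    print(text, "->", token_ids)  # expected: one id per digit if the digit merges were removed
    print("round trip:", tokenizer.decode(token_ids))  # expected to reproduce the input exactly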
|
|
|
if __name__ == "__main__": |
|
test() |
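    check_digit_splitting()  # run the illustrative sketch defined above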