"""

special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
                             what?      what?                    bos      eos


[MASK]  for short blank filling       - 150000
[sMASK] for sentence filling          -
[gMASK] for left-to-right generation  - 150001


text.replace("\t", "<|tab|>")
text.replace(" " * i, f"<|blank_{i}|>")
text.replace("\n", "<n>")


"bos_token": "<sop>",   startofpiece
"eop_token": "<eop>",
"eos_token": "</s>",

## Confirmed

130005 = <eop>

## Source code:

- https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L32

"""
import os
from transformers import AutoTokenizer

os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
# tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)


def encode_text(text):
    """
    Encode the text, decode it back, and print the tokens, ids, and the decoded text.
    """
    tokens = tokenizer.tokenize(text)
    token_id = tokenizer.encode(text=text, add_special_tokens=False)
    decoded_text = tokenizer.decode(token_id)
    print("tokens: ", tokens, ";\tid: ", token_id, ";\ttext: ", decoded_text)


def test_space():
    # a bare space " " encodes to an empty result; the last item is a ChatGLM chat-style prompt
    for text in ["  ", "\t", "你是谁", "你是\n谁", "你是 谁", "你是  谁", "'[Round 0]\n问:你是谁\n答:我是一个名为 ChatGLM-6B 的人工智能助手,是基于清华大学 KEG 实验室和智谱 AI 公司于 2023 年共同训练的语言模型开发的。我的任务是针对用户的问题和要求提供适当的答复和支持。\n[Round 1]\n问:你会干什么\n答:"]:
        encode_text(text)

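
# test_space() above notes that a bare space encodes to nothing. The replace() lines in
# the module docstring suggest whitespace is first rewritten into explicit tokens; the
# function below is only a minimal standalone sketch of that idea (the max run length of
# 80 is an assumption), not the tokenizer's actual code path.
def preprocess_whitespace(text, max_blank_len=80):
    text = text.replace("\t", "<|tab|>")
    # replace longer space runs first so they are not split into several short blanks
    for i in range(max_blank_len, 1, -1):
        text = text.replace(" " * i, f"<|blank_{i}|>")
    return text.replace("\n", "<n>")
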

def test_case():
    for text in ["Good morning", "good morning", "good  morning", "goog morningabc"]:
        encode_text(text)

def export():
    with open("chatglm.vocab", "w", encoding="utf-8") as f_out:
        vocab_size = len(tokenizer.sp_tokenizer.text_tokenizer.proto.pieces)
        for i in range(vocab_size):
            f_out.write(tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[i].piece + "\n")


# export()
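
# Companion sketch to export(): read the exported vocab back in and print the pieces in
# a small window around a given sp index, handy for eyeballing what sits near an id.
# Assumes export() has already been run so that chatglm.vocab exists on disk.
def peek_vocab(center_idx, window=3, path="chatglm.vocab"):
    with open(path, encoding="utf-8") as f_in:
        pieces = [line.rstrip("\n") for line in f_in]
    start = max(0, center_idx - window)
    stop = min(len(pieces), center_idx + window + 1)
    for i in range(start, stop):
        print(i, pieces[i])


# peek_vocab(0)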


def test_tokens():
    tokens = [43435]
    tokens = [    53,   6945,      5,      8,     42,      4,  64286,     12,  74874,
              4,  67342,     12,  74874, 130328, 130247, 130233, 130227,     35,
          65806,  68241,  75890,  14132,   5388,    340,     11,     21,    222,
              6,  76693,  66877,  63852,      6,  66430,  68747, 102501,  63823,
              4,     52,   6945,      5,      9,     42,      4,  64286,     12,
          65450,  83400,  64213,  66846,      4,  67342,     12, 130001, 130004,
          74747,  83400,  66115,  90478,  70597,  63826,  68076,      6,  63873,
          68684,  64113, 120922,  73129,  63823,  65056,  63829,  63948,  64124,
          79727,  64447,     12,      4,      4,      9,      7,      5,  64716,
          93067,  95119,  64560,     12,  66524,  63827,  70682,  63944,  89160,
          63826,  71304,      6,  79553,  67155,  63826,  68668,  63843,  91351,
          96846,  63823,      4,      4,     10,      7,      5,  95472,  74107,
          66625,  64285,     12,  64442,  67201,  69609,  63824,  81548,  63824,
          70870,  63826,  66800,      6,  94824,  63959,  65195,  65515,  63824,
          64392,  69584,  63824,  81198,  63914,  63835,  63823,      4,      4,
             13,      7,      5,  66544,  69656,     12,  66533,  63891,  63948,
          66544,  69726,      6,  63906,  86089,  63824,  88419,  63824,  69765,
          63853,  64369, 102753,  64736,  63823,      4,      4,     16,      7,
              5,  65073,  63827,  72151,  64020,  67491,  66469,  63853,  68168,
             12,  65289,  95128,  63826,  68819,      6, 118679,  66115,  64174,
          66625,  63823,      4,      4,     15,      7,      5,  86790,     12,
          70666,  89266,  63878,  66544,  69656,      6,  67623,  73129,  63823,
              4,      4,     21,      7,  71210,  79856,  63912,  63831,  66625,
          69204,  64659,     12,  66312,  63922,  64984,  67427,  63824,  63959,
          65419,  63853,  64384,  63835,  63823,      4,      4,  63976, 106490,
          65921,  64542,  73129,      6,  63852,  80917,  65207,  64678,  63853,
          66625,  64427,      6,  89385,  64124,  79727,  64447,  63823, 130005]
    # print(tokenizer.decode(tokens))
    start_idx = 0  # token ids in chatglm start from 0
    # start_idx = 20000  # default vocab: the first 20000 entries are image tokens
    for i, token in enumerate(tokens):
        # print(i, token, tokenizer.decode([token - start_idx]))
        # print(tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[token - start_idx].piece, end=" ")
        print(i, token, tokenizer.sp_tokenizer.text_tokenizer.proto.pieces[token - start_idx].piece)


test_tokens()

# tokenizer.sp_tokenizer.text_tokenizer.convert_token_to_id(x) + tokenizer.sp_tokenizer.num_image_tokens
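
# Sketch of the id arithmetic hinted at in the comment above: a text piece maps to its
# sentencepiece id plus num_image_tokens, and proto.pieces maps the id back to the piece.
# The argument is not validated; pass a piece that actually appears in chatglm.vocab.
def roundtrip_piece(piece):
    sp = tokenizer.sp_tokenizer
    chatglm_id = sp.text_tokenizer.convert_token_to_id(piece) + sp.num_image_tokens
    back = sp.text_tokenizer.proto.pieces[chatglm_id - sp.num_image_tokens].piece
    print(piece, "->", chatglm_id, "->", back)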

# test_case()
# test_space()



