update
- app.py +76 -119
- style.css +32 -0
- util.py +94 -0
- utils/zh_util.py +9 -4
- vocab/{alpaca_7b → Intern_gpt}/README.md +0 -0
- vocab/README.md +3 -1
- vocab/__init__.py +40 -11
- vocab/{bert_en → _alpaca_7b}/README.md +0 -0
- vocab/{goat → _goat}/README.md +0 -0
- tokenizer.py → vocab/_goat/__init__.py +0 -0
- vocab/baichuan_7b/demo.py +3 -0
- vocab/bert_base_cased/README.md +0 -0
- vocab/bert_base_cased/__init__.py +3 -0
- vocab/{bert_chinese → bert_base_chinese}/README.md +0 -0
- vocab/{bert_chinese → bert_base_chinese}/__init__.py +0 -0
- vocab/{bert_chinese → bert_base_chinese}/test.py +0 -0
- vocab/{bert_chinese → bert_base_chinese}/test_zh_coding_len.py +0 -0
- vocab/{bert_chinese → bert_base_chinese}/tokenizer/config.json +0 -0
- vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer.json +0 -0
- vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer_config.json +0 -0
- vocab/{bert_chinese → bert_base_chinese}/tokenizer/vocab.txt +0 -0
- vocab/{bert_chinese → bert_base_chinese}/vocab.txt +0 -0
- vocab/bert_base_uncased/__init__.py +3 -0
- vocab/chatglm2_6b/__init__.py +2 -0
- vocab/gpt_35_turbo/__init__.py +17 -1
- vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py +1 -1
- vocab/gpt_neox_chinese_v1/to_v2/test2.py +1 -1
- vocab/gpt_nexo_20b/__init__.py +1 -0
- vocab/internlm_chat_7b/README.md +0 -0
- vocab/internlm_chat_7b/__init__.py +6 -0
- vocab/kplug/__init__.py +5 -0
- vocab/llama/__init__.py +1 -1
- vocab/llama2/__init__.py +0 -0
- vocab/moss/test_tokenizer.py +3 -4
app.py
CHANGED
@@ -3,6 +3,12 @@
 # time: 2022/8/23 16:06
 
 """
+## TODO:
+1. token数,放到 label里
+2. http get方式获取参数,
+3. 自启动
+4.
+
 
 plots
 
@@ -19,21 +25,11 @@ table
 [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
 """
 
-
-import pandas as pd
+
 import gradio as gr
 
-from vocab import all_tokenizers
-
-# 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673
-# 隐藏legend:
-css = """
-.space-show {white-space: pre-wrap;}
-.cell-wrap {white-space: pre-wrap;}
-.category-legend {display: none !important}
-.statistics textarea {min-width: min(50px,100%) !important; font-size: 20px !important; font-weight: 600 !important; text-align: center !important; border: none !important;}
-.statistics label {text-align: center !important;}
-"""
+from vocab import all_tokenizers
+from util import *
 
 example_text = """Replace this text in the input field to see how tokenization works
 华为智能音箱发布:华为Sound X"""
@@ -42,81 +38,18 @@ example_text = """Replace this text in the input field to see how tokenization works
 examples = [
     # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm 有blank_n,
     ["标点测试:,。!?;", "baichuan_7b", "llama"],
-    ["
-    ["
+    ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
+    ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
 ]
 
 
+def example_fn(example_idx):
+    return examples[example_idx]
 
-def tokenize(text, tokenizer_type, color_num=5):
-    """
-    TODO: cache tokenizer
-    """
-    print(text, tokenizer_type)
-    pos_tokens = []
-    tokenizer = load_tokener(tokenizer_type)
-    encoding = tokenizer.encode(text)
-
-    table = []
-
-    for idx, token_id in enumerate(encoding):
-        decode_text = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-        pos_tokens.extend([(decode_text, str(idx % color_num))])
-
-        # token "Byte": # 这是 utf-8编码吧?
-        token = tokenizer.convert_ids_to_tokens([token_id])[0]
-        if isinstance(token, bytes):
-            try:
-                token_str = token.decode("utf-8")
-            except:
-                token_str = token.decode("utf-8", errors="ignore")
-                print("decode_error", token, token_str)
-
-            token_bytes = token
-            json_dumps = json.dumps(token_str)
-        elif isinstance(token, str):
-            token_str = token
-            token_bytes = bytes(token_str, "utf-8")
-            json_dumps = json.dumps(token_str)
-        else:
-            return
-
-
-        # ⭐
-        table.append(
-            {"TokenID": token_id,
-             "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
-             "Text": decode_text,  #
-             # "Bytes": token_bytes,  # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
-             "Bytes": str(token_bytes),
-             # "Unicode": json_dumps  # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
-             }
-        )
-
-    table_df = pd.DataFrame(table)
-    print(table)
-    # print(table_df)
-
-    return pos_tokens, table_df, len(encoding)
-
-
-def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
-    pos_tokens_1, table_df_1, token_size_1 = tokenize(text, tokenizer_type_1)
-    pos_tokens_2, table_df_2, token_size_2 = tokenize(text, tokenizer_type_2)
-    return pos_tokens_1, table_df_1, token_size_1, pos_tokens_2, table_df_2, token_size_2
 
 
-
-    tokenizer = load_tokener(tokenizer_type)
-    return tokenizer.vocab_size
-
-def test_coding():
-    bytes1 = b'\xe4\xb8\xad'
-    print(bytes1)  # b'\xe4\xb8\xad'
-
-
-with gr.Blocks(css=css) as demo:
+with gr.Blocks(css="style.css") as demo:
     gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
     # links: https://www.coderstool.com/utf8-encoding-decoding
     # 功能:输入文本,进行分词
@@ -125,16 +58,29 @@ with gr.Blocks(css=css) as demo:
     #
     # Byte: 表示分词
 
+    with gr.Row():
+        gr.Markdown("## Input Text")
+        dropdown_examples = gr.Dropdown(
+            ["Example1", "Example2", "Example3"],
+            value="Examples",
+            type="index",
+            show_label=False,
+            container=False,
+            scale=0,
+            elem_classes="example-style"
+        )
 
-    gr.Markdown("## Input Text")
     user_input = gr.Textbox(
         value=example_text,
         label="Input Text",
         lines=5,
         show_label=False,
     )  # placeholder="Enter sentence here..."
+    # gr.Examples(
+    #     examples,
+    #     None,
+    # )
 
-    # submitBtn = gr.Button("生成回复", variant="primary")
 
     gr.Markdown("## Tokenization")
 
@@ -156,18 +102,24 @@ with gr.Blocks(css=css) as demo:
                     lines=1,
                     elem_classes="statistics"
                 )
-
-
+                stats_zh_token_size_1 = gr.TextArea(
+                    # value="1252/1455",
+                    label="ZH char/word",
                     lines=1,
                     elem_classes="statistics"
                 )
-
-                    label="
+                stats_overlap_token_size_1 = gr.TextArea(
+                    label="Overlap Tokens",
                     lines=1,
                     elem_classes="statistics"
                 )
+                # stats_3 = gr.TextArea(
+                #     label="Compress Rate",
+                #     lines=1,
+                #     elem_classes="statistics"
+                # )
         # https://www.onlinewebfonts.com/icon/418591
-        gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False)
+        gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False)  # height=10,
        with gr.Column(scale=6):
            with gr.Group():
                tokenizer_type_2 = gr.Dropdown(
@@ -182,19 +134,23 @@ with gr.Blocks(css=css) as demo:
                     lines=1,
                     elem_classes="statistics"
                 )
-
-
+                stats_zh_token_size_2 = gr.TextArea(  # 中文单子数,
+                    # value="12/45",
+                    label="ZH char/word",
                     lines=1,
                     elem_classes="statistics"
                 )
-                stats_6 = gr.TextArea(
-
+                # stats_6 = gr.TextArea(
+                #     label="Compress Rate",
+                #     lines=1,
+                #     elem_classes="statistics"
+                # )
+                stats_overlap_token_size_2 = gr.TextArea(
+                    label="Overlap Tokens",
                     lines=1,
                     elem_classes="statistics"
                 )
 
-
-
     # TODO: 图 表 压缩率
     with gr.Row():
         with gr.Column():
@@ -212,41 +168,42 @@ with gr.Blocks(css=css) as demo:
 
     with gr.Row():
         output_table_1 = gr.Dataframe(
-
-
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
             # elem_classes="space-show",  # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
         )
         output_table_2 = gr.Dataframe(
-
-
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
         )
 
-    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
-
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
+                            [output_text_1, output_table_1])
+    # 下面两个好像可以合并
+    tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
+    tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
+                            [stats_overlap_token_size_1, stats_overlap_token_size_2])
 
     user_input.change(tokenize_pair,
                       [user_input, tokenizer_type_1, tokenizer_type_2],
-                      [output_text_1, output_table_1,
-
-    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
-
-
-
-
-
-
-
-
+                      [output_text_1, output_table_1, output_text_2, output_table_2])
+
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
+                            [output_text_2, output_table_2])
+    tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
+    tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
+                            [stats_overlap_token_size_1, stats_overlap_token_size_2])
+
+    dropdown_examples.change(
+        example_fn,
+        dropdown_examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2]
     )
 
-    #
-
+    # start up 初始化
+    gr.update(lines=2, visible=True, value="Short story: ")
 
-    # examples=[
-    #     ["What a beautiful morning for a walk!"],
-    #     ["It was the best of times, it was the worst of times."],
-    #     ["多个空格 It ss was the best of times, it was the worst of times."],
-    # ]
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.queue(max_size=20).launch()
+    # demo.launch()
style.css
ADDED
@@ -0,0 +1,32 @@
+
+/* 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673 */
+.space-show {
+    white-space: pre-wrap;
+}
+
+.cell-wrap {
+    white-space: pre-wrap;
+}
+
+/* 隐藏legend */
+.category-legend {
+    display: none !important;
+}
+
+.statistics textarea {
+    min-width: min(50px, 100%) !important;
+    font-size: 20px !important;
+    font-weight: 600 !important;
+    text-align: center !important;
+    border: none !important;
+}
+
+.statistics label {
+    text-align: center !important;
+}
+
+/* align-self: flex-end; */
+.example-style {
+    max-width: 150px;
+    align-self: self-end;
+}
util.py
ADDED
@@ -0,0 +1,94 @@
+
+
+import gradio as gr
+import json
+import pandas as pd
+from vocab import load_tokener
+from utils.zh_util import iter_vocab
+
+
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    """
+    TODO: cache tokenizer
+    """
+    print(f"入参:tokenize, {text}, {tokenizer_type}")
+    pos_tokens = []
+    tokenizer = load_tokener(tokenizer_type)
+    encoding = tokenizer.encode(text)
+
+    table = []
+
+    for idx, token_id in enumerate(encoding):
+        decode_text = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
+
+        # token "Byte": # 这是 utf-8编码吧?
+        token = tokenizer.convert_ids_to_tokens([token_id])[0]
+        if isinstance(token, bytes):
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", tokenizer_type, token, token_str)
+
+            token_bytes = token
+            json_dumps = json.dumps(token_str)
+        elif isinstance(token, str):
+            token_str = token
+            token_bytes = bytes(token_str, "utf-8")
+            json_dumps = json.dumps(token_str)
+        else:
+            return
+
+        # ⭐
+        table.append(
+            {"TokenID": token_id,
+             "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
+             "Text": decode_text,  #
+             # "Bytes": token_bytes,  # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
+             "Bytes": str(token_bytes),
+             # "Unicode": json_dumps  # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
+             }
+        )
+
+    table_df = pd.DataFrame(table)
+    print(f"Tokenization[{tokenizer_type}]: {table}")
+    # print(table_df)
+
+    return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
+
+
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
+
+
+def basic_count(tokenizer_type):
+    tokenizer = load_tokener(tokenizer_type)
+    stats = iter_vocab(tokenizer, tokenizer_type)
+    return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
+
+
+def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
+    tokenizer1 = load_tokener(tokenizer_type_1)
+    tokenizer2 = load_tokener(tokenizer_type_2)
+    vocab1 = tokenizer1.get_vocab()
+    vocab2 = tokenizer2.get_vocab()
+    overlap_tokens = vocab1.keys() & vocab2.keys()
+    overlap_token_size = len(overlap_tokens)
+    print(f"OverlapTokens: {tokenizer_type_1}, {tokenizer_type_2} {list(overlap_tokens)[:10]}")
+    return overlap_token_size, overlap_token_size
+
+
+
+
+def test_coding():
+    bytes1 = b'\xe4\xb8\xad'
+    print(bytes1)  # b'\xe4\xb8\xad'
+
+
+if __name__ == "__main__":
+    print(basic_count("internlm_chat_7b"))
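`get_overlap_token_size` reduces to a set intersection over the two `get_vocab()` dicts (token string to id). The same computation with two stock Hugging Face tokenizers, as a sketch; it assumes `transformers` is installed and the checkpoints are only examples:

from transformers import AutoTokenizer

tok_a = AutoTokenizer.from_pretrained("bert-base-cased")
tok_b = AutoTokenizer.from_pretrained("bert-base-uncased")

# keys() & keys() keeps only the surface forms present in both vocabularies
overlap = tok_a.get_vocab().keys() & tok_b.get_vocab().keys()
print(len(overlap), sorted(overlap)[:10])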
utils/zh_util.py
CHANGED
@@ -37,8 +37,12 @@ def get_coding_length(tokenizer, vocab, filter=None):
 def has_zh_char(text):
     return any(ch in zh_punc for ch in text)
 
+cache = {}
+
+def iter_vocab(tokenizer, name="", from_cache=True):
+    if from_cache and name in cache:
+        return cache[name]
 
-def iter_vocab(tokenizer, name=""):
     f_out = open(name + "_vocab.zh.jsonl", "w", encoding="utf-8")
     zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
     all_single_zh_tokens = set()
@@ -72,16 +76,17 @@ def iter_vocab(tokenizer, name=""):
     # TODO: 繁体字,简体字
     zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
 
-
+    result = {
         "name": name,
         "impl": str(tokenizer.__class__),
         "vocab_size": tokenizer.vocab_size,
-        "中文汉字数":
+        "中文汉字数": zh_token_count,
         "中文标点数": zh_symbol_count,
         "中文汉字编码长度均值": mean_length,
         "中文汉字编码长度分布": json.dumps(dist_length),
-
     }
+    cache[name] = result
+    return result
 
 
 if __name__ == "__main__":
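The `from_cache` change above is a plain module-level memo keyed by tokenizer name, so repeated `basic_count` calls from the Gradio callbacks skip re-scanning the vocabulary. The pattern in isolation, as a sketch with a stand-in for the real scan:

cache = {}

def expensive_scan(name):
    # stand-in for iter_vocab's full pass over the vocabulary
    return {"name": name, "vocab_size": 0}

def iter_stats(name, from_cache=True):
    if from_cache and name in cache:
        return cache[name]           # hit: skip the scan
    result = expensive_scan(name)    # miss: compute once
    cache[name] = result
    return result

print(iter_stats("llama"))  # computes
print(iter_stats("llama"))  # served from the cache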
vocab/{alpaca_7b → Intern_gpt}/README.md
RENAMED
File without changes
vocab/README.md
CHANGED
@@ -11,9 +11,9 @@ gpt-neox词典
 ## decode
 
 bert词典有个特殊字符 #
-gpt词典有个特殊字符 G
 
 gpt-neox词典呢?
+- _开头表示空格或句首
 
 
 ## 关于分词粒度
@@ -80,6 +80,8 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
 - 功能符号: `<|endoftext|>` 表示换行。tab? 空格?
 - 很多数字独立编码,几乎上千个。
 
+- 类似的还有:moss
+
 ## 空格、tab、换行
 
 
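The new `_` note refers to SentencePiece's `▁` (U+2581) word-boundary marker; GPT-2-style byte-level BPE marks the same thing with `Ġ` (`\u0120`). A quick way to see both, as a sketch that assumes `transformers` and the two example checkpoints are available:

from transformers import AutoTokenizer

bbpe = AutoTokenizer.from_pretrained("gpt2")
print(bbpe.tokenize("Hello world"))  # ['Hello', 'Ġworld'] (Ġ marks a leading space)

sp = AutoTokenizer.from_pretrained("xlm-roberta-base")
print(sp.tokenize("Hello world"))    # ['▁Hello', '▁world'] (▁ marks word starts)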
vocab/__init__.py
CHANGED
@@ -1,16 +1,39 @@
 import importlib
 from enum import Enum, auto
 
-
-
-
-
+"""Interface:
+tokenizer.encode
+tokenizer.decode
+tokenizer.convert_ids_to_tokens
 
 tokenizer.parent = ""
+tokenizer.vocab_size
+tokenizer.get_vocab()  # gpt-neox-20b, llama
 tokenizer.type = TokenizerType.ByteBPE.name
 tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
+  - bert
+    - 特征
+    - 示例:
+  - gpt2
+    - 特征:
+  - sentencepiece:
+    - 特征:.sp_model 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,词典字符有 ▁,
+    - 示例:llama,baichuan
+  - tiktoken
+  - icetk
+  - hf_tokenizer
+    - 特征:.model 是 tokenizer.models.BPE 类型,词典有 Ġ "\u0120" 开头,有 merge.txt
+    - 示例:gpt_neox_20b, moss
+  - gpt3.5 gpt4
+    - 特征:tiktoken
 tokenizer.comments = "split all numbers into individual digits, " \
                      "and fallback to bytes to decompose unknown UTF-8 characters"
+
+tokenizer.all_special_tokens  # baichuan
+tokenizer.special_tokens_set  # gpt3.5_turbo
+tokenizer.special_tokens_map
+
+tokenizer.dependency [sentencepiece, tiktoken, icetk]
 """
 
 Animal = Enum('Animal', 'ANT BEE CAT DOG')
@@ -21,9 +44,13 @@ uniq_tokenizers = [
 
 all_tokenizers = [
     "gpt_35_turbo",
+    "gpt4",
     "gpt2",
     "gpt2_chinese",
-    "
+    "bert_base_cased",
+    "bert_base_uncased",
+    "bert_base_chinese",
+    "kplug",
     "moss",
     #
     # ######
@@ -31,7 +58,7 @@ all_tokenizers = [
     # "prompt_clue",
     #
     # #### bloom 系列
-
+    "bloom",
     # "bloomz_6b4_zh",
     # "belle_7b_2m",  # 模型和词典都基于bloom
     #
@@ -41,19 +68,21 @@ all_tokenizers = [
     # ##### glm系列
     # "glm_chinese",
     "chatglm_6b",
+    "chatglm2-6b",
     #
     # #### llama alpaca系列
-    "llama",  #
+    "llama",  # '中文单字': 700, '中文多字': 0
     "chinese_llama_lora_7b",  #
     # "chinese_alpaca_lora_7b",  # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
     # "belle_llama_ext_7b",
     # "alpaca_7b",
     "baichuan_7b",
-    "qwen"
+    "qwen",
+    "internlm_chat_7b",
+    "goat",
 ]
 
 
-
 class TokenizerType(Enum):
     """
     - https://huggingface.co/docs/transformers/tokenizer_summary
@@ -105,10 +134,10 @@ class TokenizerImpl(Enum):
     BertTokenizer = auto()  #
 
 
-
 def load_tokener(model_name):
     tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
     return tokenizer
 
+
 if __name__ == "__main__":
-    pass
+    pass
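`load_tokener` relies on every name in `all_tokenizers` matching a sub-package of `vocab` that exposes a module-level `tokenizer`. The dynamic-import pattern in isolation, as a sketch (the function name here is hypothetical; the repo's own helper is `load_tokener`):

import importlib

def load_tokenizer(model_name, package="vocab"):
    # e.g. vocab/llama/__init__.py must define a module-level `tokenizer`
    module = importlib.import_module("." + model_name, package)
    return module.tokenizer

# tok = load_tokenizer("llama")
# print(tok.vocab_size)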
vocab/{bert_en → _alpaca_7b}/README.md
RENAMED
File without changes
vocab/{goat → _goat}/README.md
RENAMED
File without changes
tokenizer.py → vocab/_goat/__init__.py
RENAMED
File without changes
vocab/baichuan_7b/demo.py
ADDED
@@ -0,0 +1,3 @@
+
+from vocab.baichuan_7b import tokenizer
+
vocab/bert_base_cased/README.md
ADDED
File without changes
vocab/bert_base_cased/__init__.py
ADDED
@@ -0,0 +1,3 @@
+
+from transformers import BertTokenizer
+tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
vocab/{bert_chinese → bert_base_chinese}/README.md
RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/__init__.py
RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/test.py
RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/test_zh_coding_len.py
RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/config.json
RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer.json
RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/tokenizer_config.json
RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/tokenizer/vocab.txt
RENAMED
File without changes
vocab/{bert_chinese → bert_base_chinese}/vocab.txt
RENAMED
File without changes
vocab/bert_base_uncased/__init__.py
ADDED
@@ -0,0 +1,3 @@
+
+from transformers import BertTokenizer
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab/chatglm2_6b/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -4,10 +4,10 @@ import tiktoken
 from tiktoken import Encoding
 
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
-
 tokenizer.vocab_size = tokenizer.n_vocab
 
 
+
 def decode(self, tokens, errors="replace"):
 # def decode(self, tokens: list[int], errors: str = "replace") -> str:
     try:
@@ -19,8 +19,24 @@ def decode(self, tokens, errors="replace"):
 def convert_ids_to_tokens(self, tokens):
     return tokenizer.decode_tokens_bytes(tokens)
 
+def get_vocab(self):
+    """Returns vocab as a dict"""
+    vocab = {}
+    for i in range(self.vocab_size):
+        try:
+            token_byte = self.convert_ids_to_tokens([i])[0]
+            token_str = token_byte.decode("utf-8")
+            vocab[token_str] = i
+        except KeyError:
+            print("gpt_35_turbo decode KeyError", i)
+        except UnicodeDecodeError:
+            print("gpt_35_turbo decode UnicodeDecodeError", i, str(token_byte))
+    # vocab.update(self.added_tokens_encoder)
+    return vocab
+
 
 Encoding.decode = decode
 Encoding.convert_ids_to_tokens = convert_ids_to_tokens
+Encoding.get_vocab = get_vocab
 
 
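The `get_vocab` patch works because tiktoken already exposes the id-to-bytes direction; building the reverse map only has to tolerate ids that are unassigned or not valid UTF-8. The same idea against tiktoken directly, as a sketch that assumes `tiktoken` is installed:

import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

vocab = {}
for i in range(enc.n_vocab):
    try:
        token_bytes = enc.decode_single_token_bytes(i)  # KeyError for unassigned ids
        vocab[token_bytes.decode("utf-8")] = i          # skip non-UTF-8 byte sequences
    except (KeyError, UnicodeDecodeError):
        pass

print(len(vocab))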
vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py
CHANGED
@@ -47,7 +47,7 @@ def append_token(word_list, base_tokenizer, output_tokenizer_path, unused_ids=None):
     data, base_tokenizer = base_tokenizer
     vocab = data["model"]["vocab"]
     merges = data["model"]["merges"]
-    vocab_size = base_tokenizer.
+    vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
 
     for word in word_list:
         encoding = base_tokenizer.encode(word)
vocab/gpt_neox_chinese_v1/to_v2/test2.py
CHANGED
@@ -21,7 +21,7 @@ def append_token(word_list, base_tokenizer, unused_ids=None):
     data, base_tokenizer = base_tokenizer
     vocab = data["model"]["vocab"]
     merges = data["model"]["merges"]
-    vocab_size = base_tokenizer.
+    vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
 
     for word in word_list:
         encoding = base_tokenizer.encode(word)
vocab/gpt_nexo_20b/__init__.py
CHANGED
@@ -21,3 +21,4 @@ tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
 # tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
 
 
+
vocab/internlm_chat_7b/README.md
ADDED
File without changes
vocab/internlm_chat_7b/__init__.py
ADDED
@@ -0,0 +1,6 @@
+"""
+https://huggingface.co/internlm/internlm-chat-7b
+"""
+
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
vocab/kplug/__init__.py
CHANGED
@@ -0,0 +1,5 @@
+
+from transformers import BertTokenizer
+
+tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-encoder")
+print(tokenizer)
vocab/llama/__init__.py
CHANGED
@@ -20,4 +20,4 @@ tokenizer.parent = ""
 tokenizer.type = TokenizerType.ByteBPE.name
 tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
 tokenizer.comments = "split all numbers into individual digits, " \
-                     "and fallback to bytes to decompose unknown UTF-8 characters"
+                     "and fallback to bytes to decompose unknown UTF-8 characters"
vocab/llama2/__init__.py
ADDED
File without changes
vocab/moss/test_tokenizer.py
CHANGED
@@ -3,6 +3,8 @@
 vocab size: 106029
 
 中文汉字数:54230, 中文标点数: 549
+
+moss很奇怪,
 """
 
 import json
@@ -21,15 +23,12 @@ for token in tokens:
     print(token, tokenizer.decode([token]))
 
 
-def id2token(ids):
-    return tokenizer.convert_ids_to_tokens(ids)
-
 def test_token():
     for word in "中国解决方法黑白侗,。!?;":
         encoding = tokenizer.encode(word)
         for token_id in encoding:
             decode_str = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-            token =
+            token = tokenizer.convert_ids_to_tokens([token_id])
             print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
 
 
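The fix above replaces the half-written `token =` line with `convert_ids_to_tokens`, which returns raw vocabulary entries rather than the cleaned-up text that `decode` produces; the two differ exactly on word-piece and byte-level markers. A small illustration with a stock BERT tokenizer, as a sketch (any WordPiece checkpoint behaves the same way):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
ids = tok.encode("I have a new GPU!", add_special_tokens=False)

print(tok.convert_ids_to_tokens(ids))  # ['i', 'have', 'a', 'new', 'gp', '##u', '!']
print(tok.decode(ids))                 # 'i have a new gpu!'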