update
- .gitignore +2 -1
- README.md +8 -1
- app.py +140 -43
- app_v1.py +196 -0
- images/VS.svg +7 -0
- tokenizer.py +0 -0
- vocab/__init__.py +13 -2
- vocab/baichuan_7b/__init__.py +3 -0
- vocab/{chatglm → chatglm_6b}/README.md +0 -0
- vocab/{chatglm → chatglm_6b}/__init__.py +0 -0
- vocab/{chatglm → chatglm_6b}/chatglm.vocab +0 -0
- vocab/{chatglm → chatglm_6b}/test_chatglm.py +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/config.json +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/ice_text.model +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/tokenization_chatglm.py +0 -0
- vocab/{chatglm → chatglm_6b}/tokenizer/tokenizer_config.json +0 -0
- vocab/gpt_35_turbo/__init__.py +4 -0
- vocab/gpt_35_turbo/test2.py +4 -0
- vocab/{bert_kplug → kplug}/README.md +0 -0
- vocab/kplug/__init__.py +0 -0
- vocab/{bert_kplug → kplug}/bpe_oov.py +0 -0
- vocab/{bert_kplug → kplug}/bpe_oov2.py +0 -0
- vocab/{bert_kplug → kplug}/jd_vocab.py +0 -0
- vocab/{bert_kplug → kplug}/langconv.py +0 -0
- vocab/{bert_kplug → kplug}/test_langconv.py +0 -0
- vocab/{bert_kplug → kplug}/vocab.jd.txt +0 -0
- vocab/{bert_kplug → kplug}/vocab.jd.txt.v2 +0 -0
- vocab/{bert_kplug → kplug}/zh_wiki.py +0 -0
.gitignore
CHANGED
@@ -13,4 +13,5 @@ dist/
 downloads/
 eggs/
 .eggs/
-.idea/
+.idea/
+gradio_cached_examples
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: Tokenizer Arena
 emoji: ⚡
 colorFrom: red
 colorTo: gray
@@ -10,3 +10,10 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+## ss
+
+
+## ss
+
app.py
CHANGED
@@ -9,7 +9,10 @@ plots
 table
 
 ## related demo
-http://text-processing.com/demo/tokenize/
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
 
 ## 可视化
 
@@ -28,15 +31,28 @@ css = """
 .space-show {white-space: pre-wrap;}
 .cell-wrap {white-space: pre-wrap;}
 .category-legend {display: none !important}
+.statistics textarea {min-width: min(50px,100%) !important; font-size: 20px !important; font-weight: 600 !important; text-align: center !important; border: none !important;}
+.statistics label {text-align: center !important;}
 """
 
-example_text = """
-
-空格测试: 2个空格        8个空格
-数字测试:(10086 + 98) = 100184"""
+example_text = """Replace this text in the input field to see how tokenization works
+华为智能音箱发布:华为Sound X"""
 
+# llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
+examples = [
+    ["空格测试: 2个空格        8个空格", "llama", "chatglm_6b"],  # chatglm 有blank_n,
+    ["标点测试:,。!?;", "baichuan_7b", "llama"],
+    ["符号测试:🦙", "baichuan_7b", "llama"],
+    ["中文测试:🦙", "baichuan_7b", "llama"],
+    ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
+]
 
-def tokenize(text, tokenizer_type):
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    """
+    TODO: cache tokenizer
+    """
     print(text, tokenizer_type)
     pos_tokens = []
     tokenizer = load_tokener(tokenizer_type)
@@ -46,12 +62,17 @@ def tokenize(text, tokenizer_type):
 
     for idx, token_id in enumerate(encoding):
         decode_text = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-        pos_tokens.extend([(decode_text, str(idx %
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
 
         # token  "Byte":  # 这是 utf-8编码吧?
         token = tokenizer.convert_ids_to_tokens([token_id])[0]
         if isinstance(token, bytes):
-            token_str = token.decode("utf-8")
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", token, token_str)
+
             token_bytes = token
             json_dumps = json.dumps(token_str)
         elif isinstance(token, str):
@@ -61,9 +82,11 @@ def tokenize(text, tokenizer_type):
         else:
             return
 
+
+
         table.append(
             {"TokenID": token_id,
-             "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
+             "⭐Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
              "Text": decode_text,  #
              # "Bytes": token_bytes,  # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
              "Bytes": str(token_bytes),
@@ -73,74 +96,148 @@ def tokenize(text, tokenizer_type):
 
     table_df = pd.DataFrame(table)
     print(table)
-    print(table_df)
+    # print(table_df)
+
+    return pos_tokens, table_df, len(encoding)
+
 
-    return pos_tokens, table_df
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1, token_size_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2, token_size_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, token_size_1, pos_tokens_2, table_df_2, token_size_2
 
 
+def get_vocab_size(tokenizer_type):
+    tokenizer = load_tokener(tokenizer_type)
+    return tokenizer.vocab_size
+
 def test_coding():
     bytes1 = b'\xe4\xb8\xad'
     print(bytes1)  # b'\xe4\xb8\xad'
 
 
 with gr.Blocks(css=css) as demo:
-    gr.HTML("""<h1 align="center">Tokenizer Arena
+    gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
     # links: https://www.coderstool.com/utf8-encoding-decoding
+    # 功能:输入文本,进行分词
+    # 分词器:常见的分词器有集中,
+    # 背景:方便分词、看词粒度、对比
     #
+    # Byte: 表示分词
 
 
+    gr.Markdown("## Input Text")
     user_input = gr.Textbox(
         value=example_text,
-
+        label="Input Text",
+        lines=5,
+        show_label=False,
     )  # placeholder="Enter sentence here..."
 
     # submitBtn = gr.Button("生成回复", variant="primary")
 
+    gr.Markdown("## Tokenization")
+
+    with gr.Row():
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_type_1 = gr.Dropdown(
+                    all_tokenizers,
+                    value="llama",
+                    label="Tokenizer 1",
+                )
+            with gr.Group():
+                """
+                <div class="stat"><div class="stat-value">69</div><div class="stat-label">Characters</div></div>
+                """
+                with gr.Row():
+                    stats_vocab_size_1 = gr.TextArea(
+                        label="VocabSize",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+                    stats_token_size_1 = gr.TextArea(
+                        label="Tokens",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+                    stats_3 = gr.TextArea(
+                        label="Compress Rate",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+        # https://www.onlinewebfonts.com/icon/418591
+        gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False)  # height=10,
+        with gr.Column(scale=6):
+            with gr.Group():
+                tokenizer_type_2 = gr.Dropdown(
+                    all_tokenizers,
+                    value="baichuan_7b",
+                    label="Tokenizer 2",
+                )
+            with gr.Group():
+                with gr.Row():
+                    stats_vocab_size_2 = gr.TextArea(
+                        label="VocabSize",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+                    stats_token_size_2 = gr.TextArea(
+                        label="Tokens",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+                    stats_6 = gr.TextArea(
+                        label="Compress Rate",
+                        lines=1,
+                        elem_classes="statistics"
+                    )
+
+
+
     # TODO: 图 表 压缩率
-    # llama chatglm gpt_nexo_20b baichuan baichuan_7b
     with gr.Row():
         with gr.Column():
-            tokenizer_type_1 = gr.Dropdown(
-                all_tokenizers, value="llama", label="tokenizer"
-            )
-            token_counter_1 = None  # 计数器
             output_text_1 = gr.Highlightedtext(
-                label="
+                label="Tokens 1",
                 show_legend=True,
                 elem_classes="space-show"
             )
-
-            output_table_1 = gr.Dataframe(
-                headers=["TokenID", "Byte", "Text"],
-                datatype=["str", "str", "str"],
-                #elem_classes="space-show",  # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
-            )
-
         with gr.Column():
-            tokenizer_type_2 = gr.Dropdown(
-                all_tokenizers, value="baichuan_7b", label="tokenizer"
-            )
-            token_counter_2 = None  # 计数器
             output_text_2 = gr.Highlightedtext(
-                label="
+                label="Tokens 2",
                 show_legend=True,
                 elem_classes="space-show"
             )
 
-
-
-
-
+    with gr.Row():
+        output_table_1 = gr.Dataframe(
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
+            # elem_classes="space-show",  # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
+        )
+        output_table_2 = gr.Dataframe(
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
+        )
+
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
+    tokenizer_type_1.change(get_vocab_size, [tokenizer_type_1], [stats_vocab_size_1])
+
+    user_input.change(tokenize_pair,
+                      [user_input, tokenizer_type_1, tokenizer_type_2],
+                      [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2])
 
-
-
-                      [output_text_1, output_table_1])
-    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1])
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2, stats_token_size_2])
+    tokenizer_type_2.change(get_vocab_size, [tokenizer_type_2], [stats_vocab_size_2])
 
-
-
-
-
+    gr.Examples(
+        examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2],
+        [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2],
+        tokenize_pair,
+        cache_examples=True,
+    )
 
     # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
     #                show_progress=True)
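Note on the question left in the `tokenize()` comment above ("why do some llama tokens show up as `<0xE7>`?"): llama's SentencePiece vocabulary includes byte-fallback entries, one per raw byte, so a character outside the learned pieces is emitted as several `<0xNN>` tokens and only decodes cleanly once the bytes are re-joined. A small standalone sketch (not part of this commit) illustrating that with the same `b'\xe4\xb8\xad'` bytes used in `test_coding()`:

```python
# Sketch, not part of the commit: SentencePiece byte-fallback as used by llama.
char = "中"
utf8_bytes = char.encode("utf-8")
print(utf8_bytes)  # b'\xe4\xb8\xad' -- the same bytes as in test_coding()

# Byte-fallback represents each raw byte as its own vocabulary entry,
# conventionally named <0xNN>, e.g. <0xE4> <0xB8> <0xAD> for "中".
byte_tokens = ["<0x%02X>" % b for b in utf8_bytes]
print(byte_tokens)  # ['<0xE4>', '<0xB8>', '<0xAD>']

# A single byte of a multi-byte character is not valid UTF-8 on its own,
# which is why tokenizer.decode([token_id]) can yield "\ufffd" (�) per token.
print(bytes([utf8_bytes[0]]).decode("utf-8", errors="replace"))  # '�'

# Only the concatenation of all three bytes decodes back to the character.
print(utf8_bytes.decode("utf-8"))  # 中
```

This is also why the table keeps both the Token and Bytes columns: the per-token decoded text can be a replacement character even though the underlying bytes are meaningful.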
app_v1.py
ADDED
@@ -0,0 +1,196 @@
+# coding=utf-8
+# author: xusong
+# time: 2022/8/23 16:06
+
+"""
+
+plots
+
+table
+
+## related demo
+- [](http://text-processing.com/demo/tokenize/)
+- [gpt-tokenizer](https://gpt-tokenizer.dev/)
+- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/)
+- [](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
+
+## 可视化
+
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
+"""
+
+import json
+import pandas as pd
+import gradio as gr
+
+from vocab import all_tokenizers, load_tokener
+
+# 显示空格:https://blog.csdn.net/liuxiao723846/article/details/118994673
+# 隐藏legend:
+css = """
+.space-show {white-space: pre-wrap;}
+.cell-wrap {white-space: pre-wrap;}
+.category-legend {display: none !important}
+"""
+
+example_text = """Replace this text in the input field to see how tokenization works
+中文测试:华为智能音箱发布:华为Sound X。維基百科由非營利組織──維基媒體基金會負責維持
+数字测试:(10086 + 98) = 100184"""
+
+# llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
+examples = [
+    # ["空格测试: 2个空格        8个空格", "llama", "chatglm_6b"],  # chatglm 有blank_n,
+    ["标点测试:,。!?;", "baichuan_7b", "llama"],
+    ["标点测试:🦙", "baichuan_7b", "llama"],
+]
+
+
+def tokenize(text, tokenizer_type, color_num=5):
+    print(text, tokenizer_type)
+    pos_tokens = []
+    tokenizer = load_tokener(tokenizer_type)
+    encoding = tokenizer.encode(text)
+
+    table = []
+
+    for idx, token_id in enumerate(encoding):
+        decode_text = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
+        pos_tokens.extend([(decode_text, str(idx % color_num))])
+
+        # token  "Byte":  # 这是 utf-8编码吧?
+        token = tokenizer.convert_ids_to_tokens([token_id])[0]
+        if isinstance(token, bytes):
+            try:
+                token_str = token.decode("utf-8")
+            except:
+                token_str = token.decode("utf-8", errors="ignore")
+                print("decode_error", token, token_str)
+
+            token_bytes = token
+            json_dumps = json.dumps(token_str)
+        elif isinstance(token, str):
+            token_str = token
+            token_bytes = bytes(token_str, "utf-8")
+            json_dumps = json.dumps(token_str)
+        else:
+            return
+
+        table.append(
+            {"TokenID": token_id,
+             "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
+             "Text": decode_text,  #
+             # "Bytes": token_bytes,  # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
+             "Bytes": str(token_bytes),
+             # "Unicode": json_dumps  # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
+             }
+        )
+
+    table_df = pd.DataFrame(table)
+    print(table)
+    # print(table_df)
+
+    return pos_tokens, table_df
+
+
+def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
+    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
+    pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
+    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
+
+
+def test_coding():
+    bytes1 = b'\xe4\xb8\xad'
+    print(bytes1)  # b'\xe4\xb8\xad'
+
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML("""<h1 align="center">The Tokenizer Arena</h1>""")
+    # links: https://www.coderstool.com/utf8-encoding-decoding
+    #
+
+
+
+    gr.Markdown("## Input Text")
+    user_input = gr.Textbox(
+        value=example_text,
+        label="Input Text",
+        lines=5
+    )  # placeholder="Enter sentence here..."
+
+    # submitBtn = gr.Button("生成回复", variant="primary")
+
+    gr.Markdown("## Tokenization")
+
+    # with gr.Row():
+
+
+
+    # TODO: 图 表 压缩率
+    with gr.Row():
+        with gr.Column():
+            tokenizer_type_1 = gr.Dropdown(
+                all_tokenizers,
+                value="llama",
+                label="Tokenizer 1",
+            )
+            token_counter_1 = None  # 计数器
+            output_text_1 = gr.Highlightedtext(
+                label="Tokens 1",
+                show_legend=True,
+                elem_classes="space-show"
+            )
+
+        with gr.Column():
+            tokenizer_type_2 = gr.Dropdown(
+                all_tokenizers,
+                value="baichuan_7b",
+                label="Tokenizer 2"
+            )
+            token_counter_2 = None  # 计数器
+            output_text_2 = gr.Highlightedtext(
+                label="Tokens 2",
+                show_legend=True,
+                elem_classes="space-show"
+            )
+
+    with gr.Row():
+        output_table_1 = gr.Dataframe(
+            headers=["TokenID", "Byte", "Text"],
+            datatype=["str", "str", "str"],
+            # elem_classes="space-show",  # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
+        )
+        output_table_2 = gr.Dataframe(
+            headers=["TokenID", "Token", "Text"],
+            datatype=["str", "str", "str"],
+        )
+
+    user_input.change(tokenize,
+                      [user_input, tokenizer_type_1],
+                      [output_text_1, output_table_1])
+    tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1])
+
+    user_input.change(tokenize,
+                      [user_input, tokenizer_type_2],
+                      [output_text_2, output_table_2])
+
+    tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2])
+
+    gr.Examples(
+        examples,
+        [user_input, tokenizer_type_1, tokenizer_type_2],
+        [output_text_1, output_table_1, output_text_2, output_table_2],
+        tokenize_pair,
+        cache_examples=True,
+    )
+
+    # submitBtn.click(tokenize, [user_input, tokenizer_type], outputs,
+    #                show_progress=True)
+
+    # examples=[
+    #     ["What a beautiful morning for a walk!"],
+    #     ["It was the best of times, it was the worst of times."],
+    #     ["多个空格 It ss was the best of times, it was the worst of times."],
+    # ]
+
+if __name__ == "__main__":
+    demo.launch()
images/VS.svg
ADDED
tokenizer.py
ADDED
File without changes
vocab/__init__.py
CHANGED
@@ -1,7 +1,18 @@
-import transformers
 import importlib
 from enum import Enum, auto
 
+
+"""
+Interface:
+ -
+
+tokenizer.parent = ""
+tokenizer.type = TokenizerType.ByteBPE.name
+tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
+tokenizer.comments = "split all numbers into individual digits, " \
+                     "and fallback to bytes to decompose unknown UTF-8 characters"
+"""
+
 Animal = Enum('Animal', 'ANT BEE CAT DOG')
 
 uniq_tokenizers = [
@@ -29,7 +40,7 @@ all_tokenizers = [
     #
     # ##### glm系列
     # "glm_chinese",
-    "
+    "chatglm_6b",
     #
     # #### llama alpaca系列
     "llama",  # '中文单字': 700, '中文多字': 0
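app.py depends on `load_tokener(tokenizer_type)` from this package, and the loader itself is not shown in the commit. Given the `import importlib` kept above and the per-tokenizer subpackages (e.g. vocab/baichuan_7b exposing a module-level `tokenizer`), it presumably looks something like the following; this is an assumption about its shape, not the repository's actual code:

```python
# Hypothetical sketch of load_tokener; the real implementation is not in this diff.
# Assumes every subpackage under vocab/ (e.g. vocab/baichuan_7b) defines a
# module-level `tokenizer`, as the baichuan_7b/__init__.py hunk below does.
import importlib


def load_tokener(tokenizer_name: str):
    """Import vocab.<tokenizer_name> and return its module-level tokenizer."""
    module = importlib.import_module("vocab." + tokenizer_name)
    return module.tokenizer


# Usage as in app.py:
#   tokenizer = load_tokener("baichuan_7b")
#   token_ids = tokenizer.encode("华为Sound X")
```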
vocab/baichuan_7b/__init__.py
CHANGED
@@ -6,3 +6,6 @@ tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-7B", trust_remo
 
 # byte-bpe  sentencepiece
 tokenizer.type = TokenizerType.ByteBPE
+
+tokenizer.comments = "使用 SentencePiece 中的 Byte-Pair Encoding (BPE) 作为分词算法"
+
vocab/{chatglm → chatglm_6b}/README.md
RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/__init__.py
RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/chatglm.vocab
RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/test_chatglm.py
RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/tokenizer/config.json
RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/tokenizer/ice_text.model
RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/tokenizer/tokenization_chatglm.py
RENAMED
File without changes
vocab/{chatglm → chatglm_6b}/tokenizer/tokenizer_config.json
RENAMED
File without changes
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -16,7 +16,11 @@ def decode(self, tokens, errors="replace"):
     decode_str = "null"
     return decode_str
 
+def convert_ids_to_tokens(self, tokens):
+    return tokenizer.decode_tokens_bytes(tokens)
+
 
 Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 
 
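This hunk monkey-patches tiktoken's `Encoding` class so the OpenAI tokenizer answers the same `convert_ids_to_tokens()` call that app.py already issues for Hugging Face tokenizers, returning one bytes object per token id via `decode_tokens_bytes`. A minimal self-contained sketch of the same pattern (assuming `tiktoken` is installed; the commit closes over its module-level `tokenizer` object, while the sketch uses `self`, which behaves the same here):

```python
# Self-contained sketch of the patching pattern above (pip install tiktoken).
import tiktoken
from tiktoken import Encoding

tokenizer = tiktoken.get_encoding("cl100k_base")  # the encoding used by gpt-3.5-turbo


def convert_ids_to_tokens(self, tokens):
    # One bytes object per token id, mirroring the shape that
    # transformers' convert_ids_to_tokens gives app.py.
    return self.decode_tokens_bytes(tokens)


# Attach the helper to the class, as the hunk above does.
Encoding.convert_ids_to_tokens = convert_ids_to_tokens

token_ids = tokenizer.encode("华为Sound X")
print(tokenizer.convert_ids_to_tokens(token_ids))  # list of bytes, one entry per token id
```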
vocab/gpt_35_turbo/test2.py
CHANGED
@@ -22,6 +22,10 @@ print(decoding_bytes)
 # print(token, token_str, json.dumps(token_str))
 
 
+tokenizer.decode_tokens_bytes([10])
+tokenizer.decode_single_token_bytes(10)
+tokenizer.decode_bytes([10])
+
 f_out = open("vocab.jsonl", "w")
 # 100255
 for i in range(tokenizer.n_vocab):
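The three tiktoken calls added above differ only in return shape. A short annotated sketch for reference (assuming `tokenizer` is a tiktoken Encoding such as cl100k_base, as this package uses):

```python
# Sketch mirroring the calls added in test2.py (pip install tiktoken).
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

print(tokenizer.decode_tokens_bytes([10]))      # list[bytes]: the raw bytes of each token id
print(tokenizer.decode_single_token_bytes(10))  # bytes: the raw bytes of one token id
print(tokenizer.decode_bytes([10]))             # bytes: all ids decoded and concatenated
```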
vocab/{bert_kplug → kplug}/README.md
RENAMED
File without changes
vocab/kplug/__init__.py
ADDED
File without changes
vocab/{bert_kplug → kplug}/bpe_oov.py
RENAMED
File without changes
vocab/{bert_kplug → kplug}/bpe_oov2.py
RENAMED
File without changes
vocab/{bert_kplug → kplug}/jd_vocab.py
RENAMED
File without changes
vocab/{bert_kplug → kplug}/langconv.py
RENAMED
File without changes
vocab/{bert_kplug → kplug}/test_langconv.py
RENAMED
File without changes
vocab/{bert_kplug → kplug}/vocab.jd.txt
RENAMED
File without changes
vocab/{bert_kplug → kplug}/vocab.jd.txt.v2
RENAMED
File without changes
vocab/{bert_kplug → kplug}/zh_wiki.py
RENAMED
File without changes