app.py
CHANGED
@@ -11,7 +11,7 @@
 - theme toggle: light/dark
 - token_id/tokens/bytes toggle
 - add hover_text via javascript
--
+- i18
 
 
 
@@ -36,9 +36,6 @@ import gradio as gr
 from vocab import all_tokenizers
 from util import *
 
-example_text = """Replace this text in the input field to see how tokenization works
-华为智能音箱发布:华为Sound X"""
-
 # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
 examples = [
     # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n
@@ -53,6 +50,20 @@ def example_fn(example_idx):
     return examples[example_idx]
 
 
+"""Replace this text in the input field to see how tokenization works
+华为智能音箱发布:华为发布mate60 pro手机"""
+
+default_user_input = """Replace this text in the input field to see how tokenization works
+华为发布mate60 pro手机"""
+default_tokenizer_type_1 = "llama"
+default_tokenizer_type_2 = "internlm_chat_7b"
+default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
+default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
+default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
+default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
+default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
+
+
 
 with gr.Blocks(css="style.css") as demo:
     gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
@@ -76,7 +87,7 @@ with gr.Blocks(css="style.css") as demo:
     )
 
     user_input = gr.Textbox(
-        value=
+        value=default_user_input,
         label="Input Text",
         lines=5,
         show_label=False,
@@ -94,7 +105,7 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Group():
         tokenizer_type_1 = gr.Dropdown(
             all_tokenizers,
-            value=
+            value=default_tokenizer_type_1,
             label="Tokenizer 1",
         )
     with gr.Group():
@@ -103,17 +114,19 @@ with gr.Blocks(css="style.css") as demo:
         """
         with gr.Row():
             stats_vocab_size_1 = gr.TextArea(
+                value=default_stats_vocab_size_1,
                 label="VocabSize",
                 lines=1,
                 elem_classes="statistics"
             )
             stats_zh_token_size_1 = gr.TextArea(
-
+                value=default_stats_zh_token_size_1,
                 label="ZH char/word",
                 lines=1,
                 elem_classes="statistics"
             )
             stats_overlap_token_size_1 = gr.TextArea(
+                value=default_stats_overlap_token_size,
                 label="Overlap Tokens",
                 lines=1,
                 elem_classes="statistics"
@@ -137,12 +150,13 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Group():
         with gr.Row():
             stats_vocab_size_2 = gr.TextArea(
+                value=default_stats_vocab_size_2,
                 label="VocabSize",
                 lines=1,
                 elem_classes="statistics"
             )
             stats_zh_token_size_2 = gr.TextArea(  # number of Chinese characters
-
+                value=default_stats_zh_token_size_2,
                 label="ZH char/word",
                 lines=1,
                 elem_classes="statistics"
@@ -153,6 +167,7 @@ with gr.Blocks(css="style.css") as demo:
                 # elem_classes="statistics"
             # )
             stats_overlap_token_size_2 = gr.TextArea(
+                value=default_stats_overlap_token_size,
                 label="Overlap Tokens",
                 lines=1,
                 elem_classes="statistics"
@@ -162,12 +177,14 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Row():
         with gr.Column():
             output_text_1 = gr.Highlightedtext(
+                value=default_output_text_1,
                 label="Tokens 1",
                 show_legend=True,
                 elem_classes="space-show"
             )
         with gr.Column():
             output_text_2 = gr.Highlightedtext(
+                value=default_output_text_2,
                 label="Tokens 2",
                 show_legend=True,
                 elem_classes="space-show"
@@ -175,11 +192,13 @@ with gr.Blocks(css="style.css") as demo:
 
     with gr.Row():
         output_table_1 = gr.Dataframe(
+            value=default_output_table_1,
             headers=["TokenID", "Byte", "Text"],
             datatype=["str", "str", "str"],
             # elem_classes="space-show",  # applying this css to the whole Dataframe doesn't work, so cell-wrap is modified directly
         )
         output_table_2 = gr.Dataframe(
+            value=default_output_table_2,
             headers=["TokenID", "Token", "Text"],
             datatype=["str", "str", "str"],
         )
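The bulk of this commit pre-computes the demo's initial state at import time (default_user_input, the default_stats_* numbers, the default_output_* views) and feeds each one to its component via value=, so the page renders fully populated on first load instead of waiting for a callback. A minimal runnable sketch of that pattern; fake_tokenize, default_text and the component names are illustrative stand-ins, not this Space's actual code:

import gradio as gr

def fake_tokenize(text):
    # stand-in for util.tokenize(): returns highlight spans and table rows
    words = text.split()
    spans = [(w, str(i % 5)) for i, w in enumerate(words)]
    rows = [[i, w] for i, w in enumerate(words)]
    return spans, rows

default_text = "Replace this text in the input field to see how tokenization works"
# computed once at build time; constructors need plain values here
default_spans, default_rows = fake_tokenize(default_text)

with gr.Blocks() as demo:
    user_input = gr.Textbox(value=default_text, lines=3, show_label=False)
    tokens = gr.HighlightedText(value=default_spans, label="Tokens")
    table = gr.Dataframe(value=default_rows, headers=["TokenID", "Text"])
    # the same function also serves the live event
    user_input.change(fake_tokenize, [user_input], [tokens, table])

demo.launch()

This split is also why tokenize() in util.py below gains an update flag: the build-time calls need plain values, while the event path keeps returning gr.update(...).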
util.py
CHANGED
@@ -9,7 +9,7 @@ from utils.zh_util import iter_vocab
 
 
 
-def tokenize(text, tokenizer_type, color_num=5):
+def tokenize(text, tokenizer_type, color_num=5, update=True):
     """
     TODO: cache tokenizer
     """
@@ -57,11 +57,14 @@ def tokenize(text, tokenizer_type, color_num=5):
     print(f"Tokenization[{tokenizer_type}]: {table}")
     # print(table_df)
 
-
+    if update:
+        return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
+    else:
+        return pos_tokens, table_df
 
 
 def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
-    pos_tokens_1, table_df_1
+    pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
     pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
     return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
 
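The update flag works around an asymmetry in Gradio: gr.update(...) is only meaningful as an event handler's return value, where it can patch several properties of an existing component at once (here the highlighted tokens plus a "Tokens: N" label), whereas a component constructor accepts only a plain value. A short sketch of the same two-path idiom; count_words and the component names are illustrative, not this repo's code:

import gradio as gr

def count_words(text, update=True):
    words = text.split()
    joined = ", ".join(words)
    if update:
        # event path: refresh the content and the label together
        return gr.update(value=joined, label=f"Words: {len(words)}")
    # build path: constructors take literal values only
    return joined

initial_text = "type something here"
with gr.Blocks() as demo:
    box = gr.Textbox(value=initial_text)
    out = gr.Textbox(value=count_words(initial_text, update=False), label="Words")
    box.change(count_words, box, out)

demo.launch()

app.py uses the same split: the default_output_* values are produced with update=False at import time, while the live tokenize calls keep the gr.update path so the token count shown in the label stays current.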