app.py CHANGED

@@ -5,13 +5,16 @@
 """
 ## TODO:
 - fetch parameters via HTTP GET
-- auto-start
 - warmup for iter_vocab
 - add_special_token toggle
 - theme toggle: light/dark
 - token_id/tokens/bytes toggle
 - add hover_text via javascript
 - i18
+- add caching to methods, to avoid repeated calls
+- utf-8 encoding for English text
+- make the vocabulary downloadable
+- Chinese character/word statistics: whether to include characters such as _ G



@@ -30,7 +33,6 @@ table
 [ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
 """

-
 import gradio as gr

 from vocab import all_tokenizers

@@ -63,8 +65,6 @@ default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type
 default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
 default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)

-
-
 with gr.Blocks(css="style.css") as demo:
     gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
     # links: https://www.coderstool.com/utf8-encoding-decoding

@@ -97,7 +97,6 @@ with gr.Blocks(css="style.css") as demo:
     # None,
     # )

-
     gr.Markdown("## Tokenization")

     with gr.Row():

@@ -139,12 +138,12 @@ with gr.Blocks(css="style.css") as demo:
             # https://www.onlinewebfonts.com/icon/418591
             gr.Image("images/VS.svg", scale=1, show_label=False,
                      show_download_button=False, container=False,
-                     show_share_button=False)
+                     show_share_button=False)
         with gr.Column(scale=6):
             with gr.Group():
                 tokenizer_type_2 = gr.Dropdown(
                     all_tokenizers,
-                    value=
+                    value=default_tokenizer_type_2,
                     label="Tokenizer 2",
                 )
             with gr.Group():

@@ -229,7 +228,6 @@ with gr.Blocks(css="style.css") as demo:
     # start up initialization
     # user_input.update(user_input.value + "___")

-
 if __name__ == "__main__":
     demo.queue(max_size=20).launch()
     # demo.launch()
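
The new TODO item about adding caching to methods could be addressed by memoizing the tokenize helper, so that repeated UI events with identical inputs do not re-run the tokenizer. A minimal sketch, assuming tokenize() is a pure function of hashable string arguments (its actual signature in app.py may differ):

    from functools import lru_cache

    @lru_cache(maxsize=1024)
    def cached_tokenize(text: str, tokenizer_type: str):
        # Delegate to the existing tokenize() from app.py; identical
        # (text, tokenizer_type) pairs are then served from the cache.
        return tokenize(text, tokenizer_type, update=False)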
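
For the "fetch parameters via HTTP GET" TODO, one option is Gradio's request object on page load. This is only a sketch: the parameter names "text" and "tokenizer1" are illustrative assumptions, not names taken from app.py.

    import gradio as gr

    def apply_query_params(request: gr.Request):
        # request.query_params holds the URL query string, e.g. ?text=...&tokenizer1=...
        params = dict(request.query_params)
        # Fall back to the existing defaults from app.py when a parameter is absent.
        return (params.get("text", default_user_input),
                params.get("tokenizer1", default_tokenizer_type_1))

    # Inside the existing `with gr.Blocks(...) as demo:` block:
    #     demo.load(apply_query_params, inputs=None, outputs=[user_input, tokenizer_type_1])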