update
Browse files
app.py
CHANGED
@@ -17,6 +17,7 @@
|
|
17 |
- 中文字词统计,是否要包括 _ G 等字符
|
18 |
- baichuan的单字数量怎么两万多个?
|
19 |
- OOV
|
|
|
20 |
|
21 |
|
22 |
plots
|
@@ -40,11 +41,12 @@ from util import *
|
|
40 |
|
41 |
# llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
|
42 |
examples = [
|
43 |
-
|
44 |
["标点测试:,。!?;", "baichuan_7b", "llama"],
|
45 |
["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
|
46 |
-
["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
|
47 |
["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
|
|
|
|
|
48 |
]
|
49 |
|
50 |
# jieba.enable_parallel() # flask中没办法parallel
|
@@ -66,8 +68,8 @@ default_tokenizer_type_2 = "internlm_chat_7b"
|
|
66 |
default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
|
67 |
default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
|
68 |
default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
|
69 |
-
default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
|
70 |
-
default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
|
71 |
|
72 |
with gr.Blocks(css="style.css") as demo:
|
73 |
gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
|
@@ -81,7 +83,7 @@ with gr.Blocks(css="style.css") as demo:
|
|
81 |
with gr.Row():
|
82 |
gr.Markdown("## Input Text")
|
83 |
dropdown_examples = gr.Dropdown(
|
84 |
-
["
|
85 |
value="Examples",
|
86 |
type="index",
|
87 |
show_label=False,
|
@@ -181,14 +183,14 @@ with gr.Blocks(css="style.css") as demo:
|
|
181 |
with gr.Column():
|
182 |
output_text_1 = gr.Highlightedtext(
|
183 |
value=default_output_text_1,
|
184 |
-
label="Tokens
|
185 |
show_legend=True,
|
186 |
elem_classes="space-show"
|
187 |
)
|
188 |
with gr.Column():
|
189 |
output_text_2 = gr.Highlightedtext(
|
190 |
value=default_output_text_2,
|
191 |
-
label="Tokens
|
192 |
show_legend=True,
|
193 |
elem_classes="space-show"
|
194 |
)
|
|
|
17 |
- 中文字词统计,是否要包括 _ G 等字符
|
18 |
- baichuan的单字数量怎么两万多个?
|
19 |
- OOV
|
20 |
+
- feedback位置
|
21 |
|
22 |
|
23 |
plots
|
|
|
41 |
|
42 |
# llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
|
43 |
examples = [
|
44 |
+
["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm has blank_n tokens (for runs of spaces)
|
45 |
["标点测试:,。!?;", "baichuan_7b", "llama"],
|
46 |
["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
|
|
|
47 |
["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
|
48 |
+
["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
|
49 |
+
|
50 |
]
|
51 |
|
52 |
# jieba.enable_parallel() # flask中没办法parallel
|
|
|
68 |
default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
|
69 |
default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
|
70 |
default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
|
71 |
+
default_output_text_1, default_output_table_1, default_output_len_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
|
72 |
+
default_output_text_2, default_output_table_2, default_output_len_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
|
73 |
|
74 |
with gr.Blocks(css="style.css") as demo:
|
75 |
gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
|
|
|
83 |
with gr.Row():
|
84 |
gr.Markdown("## Input Text")
|
85 |
dropdown_examples = gr.Dropdown(
|
86 |
+
["空格测试", "标点测试", "符号测试", "数字测试"],
|
87 |
value="Examples",
|
88 |
type="index",
|
89 |
show_label=False,
|
|
|
183 |
with gr.Column():
|
184 |
output_text_1 = gr.Highlightedtext(
|
185 |
value=default_output_text_1,
|
186 |
+
label=f"Tokens: {default_output_len_1}",
|
187 |
show_legend=True,
|
188 |
elem_classes="space-show"
|
189 |
)
|
190 |
with gr.Column():
|
191 |
output_text_2 = gr.Highlightedtext(
|
192 |
value=default_output_text_2,
|
193 |
+
label=f"Tokens: {default_output_len_2}",
|
194 |
show_legend=True,
|
195 |
elem_classes="space-show"
|
196 |
)
|
util.py
CHANGED
@@ -59,7 +59,7 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
|
|
59 |
if update:
|
60 |
return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
|
61 |
else:
|
62 |
-
return pos_tokens, table_df
|
63 |
|
64 |
|
65 |
def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
|
|
|
59 |
if update:
|
60 |
return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
|
61 |
else:
|
62 |
+
return pos_tokens, table_df, len(encoding)
|
63 |
|
64 |
|
65 |
def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
|