xu-song committed on
Commit
b15345c
·
1 Parent(s): e4187ae
Files changed (2) hide show
  1. app.py +27 -8
  2. util.py +6 -3
app.py CHANGED
@@ -11,7 +11,7 @@
11
  - theme 开关 light/dark
12
  - token_id/tokens/bytes 开关
13
  - 通过 javascript 添加 hover_text
14
- -
15
 
16
 
17
 
@@ -36,9 +36,6 @@ import gradio as gr
36
  from vocab import all_tokenizers
37
  from util import *
38
 
39
- example_text = """Replace this text in the input field to see how tokenization works
40
- 华为智能音箱发布:华为Sound X"""
41
-
42
  # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
43
  examples = [
44
  # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
@@ -53,6 +50,20 @@ def example_fn(example_idx):
53
  return examples[example_idx]
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  with gr.Blocks(css="style.css") as demo:
58
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
@@ -76,7 +87,7 @@ with gr.Blocks(css="style.css") as demo:
76
  )
77
 
78
  user_input = gr.Textbox(
79
- value=example_text,
80
  label="Input Text",
81
  lines=5,
82
  show_label=False,
@@ -94,7 +105,7 @@ with gr.Blocks(css="style.css") as demo:
94
  with gr.Group():
95
  tokenizer_type_1 = gr.Dropdown(
96
  all_tokenizers,
97
- value="llama",
98
  label="Tokenizer 1",
99
  )
100
  with gr.Group():
@@ -103,17 +114,19 @@ with gr.Blocks(css="style.css") as demo:
103
  """
104
  with gr.Row():
105
  stats_vocab_size_1 = gr.TextArea(
 
106
  label="VocabSize",
107
  lines=1,
108
  elem_classes="statistics"
109
  )
110
  stats_zh_token_size_1 = gr.TextArea(
111
- # value="1252/1455",
112
  label="ZH char/word",
113
  lines=1,
114
  elem_classes="statistics"
115
  )
116
  stats_overlap_token_size_1 = gr.TextArea(
 
117
  label="Overlap Tokens",
118
  lines=1,
119
  elem_classes="statistics"
@@ -137,12 +150,13 @@ with gr.Blocks(css="style.css") as demo:
137
  with gr.Group():
138
  with gr.Row():
139
  stats_vocab_size_2 = gr.TextArea(
 
140
  label="VocabSize",
141
  lines=1,
142
  elem_classes="statistics"
143
  )
144
  stats_zh_token_size_2 = gr.TextArea( # 中文单子数,
145
- # value="12/45",
146
  label="ZH char/word",
147
  lines=1,
148
  elem_classes="statistics"
@@ -153,6 +167,7 @@ with gr.Blocks(css="style.css") as demo:
153
  # elem_classes="statistics"
154
  # )
155
  stats_overlap_token_size_2 = gr.TextArea(
 
156
  label="Overlap Tokens",
157
  lines=1,
158
  elem_classes="statistics"
@@ -162,12 +177,14 @@ with gr.Blocks(css="style.css") as demo:
162
  with gr.Row():
163
  with gr.Column():
164
  output_text_1 = gr.Highlightedtext(
 
165
  label="Tokens 1",
166
  show_legend=True,
167
  elem_classes="space-show"
168
  )
169
  with gr.Column():
170
  output_text_2 = gr.Highlightedtext(
 
171
  label="Tokens 2",
172
  show_legend=True,
173
  elem_classes="space-show"
@@ -175,11 +192,13 @@ with gr.Blocks(css="style.css") as demo:
175
 
176
  with gr.Row():
177
  output_table_1 = gr.Dataframe(
 
178
  headers=["TokenID", "Byte", "Text"],
179
  datatype=["str", "str", "str"],
180
  # elem_classes="space-show", # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
181
  )
182
  output_table_2 = gr.Dataframe(
 
183
  headers=["TokenID", "Token", "Text"],
184
  datatype=["str", "str", "str"],
185
  )
 
11
  - theme 开关 light/dark
12
  - token_id/tokens/bytes 开关
13
  - 通过 javascript 添加 hover_text
14
+ - i18n
15
 
16
 
17
 
 
36
  from vocab import all_tokenizers
37
  from util import *
38
 
 
 
 
39
  # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
40
  examples = [
41
  # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
 
50
  return examples[example_idx]
51
 
52
 
53
+ """Replace this text in the input field to see how tokenization works
54
+ 华为智能音箱发布:华为发布mate60 pro手机"""
55
+
56
+ default_user_input = """Replace this text in the input field to see how tokenization works
57
+ 华为发布mate60 pro手机"""
58
+ default_tokenizer_type_1 = "llama"
59
+ default_tokenizer_type_2 = "internlm_chat_7b"
60
+ default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
61
+ default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
62
+ default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
63
+ default_output_text_1, default_output_table_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
64
+ default_output_text_2, default_output_table_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
65
+
66
+
67
 
68
  with gr.Blocks(css="style.css") as demo:
69
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
 
87
  )
88
 
89
  user_input = gr.Textbox(
90
+ value=default_user_input,
91
  label="Input Text",
92
  lines=5,
93
  show_label=False,
 
105
  with gr.Group():
106
  tokenizer_type_1 = gr.Dropdown(
107
  all_tokenizers,
108
+ value=default_tokenizer_type_1,
109
  label="Tokenizer 1",
110
  )
111
  with gr.Group():
 
114
  """
115
  with gr.Row():
116
  stats_vocab_size_1 = gr.TextArea(
117
+ value=default_stats_vocab_size_1,
118
  label="VocabSize",
119
  lines=1,
120
  elem_classes="statistics"
121
  )
122
  stats_zh_token_size_1 = gr.TextArea(
123
+ value=default_stats_zh_token_size_1,
124
  label="ZH char/word",
125
  lines=1,
126
  elem_classes="statistics"
127
  )
128
  stats_overlap_token_size_1 = gr.TextArea(
129
+ value=default_stats_overlap_token_size,
130
  label="Overlap Tokens",
131
  lines=1,
132
  elem_classes="statistics"
 
150
  with gr.Group():
151
  with gr.Row():
152
  stats_vocab_size_2 = gr.TextArea(
153
+ value=default_stats_vocab_size_2,
154
  label="VocabSize",
155
  lines=1,
156
  elem_classes="statistics"
157
  )
158
  stats_zh_token_size_2 = gr.TextArea( # 中文单子数,
159
+ value=default_stats_zh_token_size_2,
160
  label="ZH char/word",
161
  lines=1,
162
  elem_classes="statistics"
 
167
  # elem_classes="statistics"
168
  # )
169
  stats_overlap_token_size_2 = gr.TextArea(
170
+ value=default_stats_overlap_token_size,
171
  label="Overlap Tokens",
172
  lines=1,
173
  elem_classes="statistics"
 
177
  with gr.Row():
178
  with gr.Column():
179
  output_text_1 = gr.Highlightedtext(
180
+ value=default_output_text_1,
181
  label="Tokens 1",
182
  show_legend=True,
183
  elem_classes="space-show"
184
  )
185
  with gr.Column():
186
  output_text_2 = gr.Highlightedtext(
187
+ value=default_output_text_2,
188
  label="Tokens 2",
189
  show_legend=True,
190
  elem_classes="space-show"
 
192
 
193
  with gr.Row():
194
  output_table_1 = gr.Dataframe(
195
+ value=default_output_table_1,
196
  headers=["TokenID", "Byte", "Text"],
197
  datatype=["str", "str", "str"],
198
  # elem_classes="space-show", # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
199
  )
200
  output_table_2 = gr.Dataframe(
201
+ value=default_output_table_2,
202
  headers=["TokenID", "Token", "Text"],
203
  datatype=["str", "str", "str"],
204
  )
util.py CHANGED
@@ -9,7 +9,7 @@ from utils.zh_util import iter_vocab
9
 
10
 
11
 
12
- def tokenize(text, tokenizer_type, color_num=5):
13
  """
14
  TODO: cache tokenizer
15
  """
@@ -57,11 +57,14 @@ def tokenize(text, tokenizer_type, color_num=5):
57
  print(f"Tokenization[{tokenizer_type}]: {table}")
58
  # print(table_df)
59
 
60
- return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
 
 
 
61
 
62
 
63
  def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
64
- pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
65
  pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
66
  return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
67
 
 
9
 
10
 
11
 
12
+ def tokenize(text, tokenizer_type, color_num=5, update=True):
13
  """
14
  TODO: cache tokenizer
15
  """
 
57
  print(f"Tokenization[{tokenizer_type}]: {table}")
58
  # print(table_df)
59
 
60
+ if update:
61
+ return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
62
+ else:
63
+ return pos_tokens, table_df
64
 
65
 
66
  def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
67
+ pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
68
  pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
69
  return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
70