xu-song committed on
Commit
9495a4f
·
1 Parent(s): 0ce6477
app.py CHANGED
@@ -4,21 +4,19 @@
4
 
5
  """
6
  ## TODO:
7
- - http get方式获取参数,(高优先级)
8
  - i18 国际化 https://blog.csdn.net/qq_26212731/article/details/78457198 request.header中也有language
9
  - iter_vocab 的 warmup
10
- - add_special_token 开关
11
- - theme 开关 light/dark
12
- - token_id/tokens/bytes 开关
 
 
 
 
13
  - 通过 javascript 添加 hover_text
14
- - 给方法 + 缓存,避免重复调用
15
  - 英文 utf-8编码
16
- - 词典支持下载
17
- - 中文字词统计,是否要包括 _ G 等字符
18
  - baichuan的单字数量怎么两万多个?
19
- - OOV
20
- - feedback位置
21
- - gpt4, gpt3.5 的overlap tokens 有问题。
22
  - qwen: ValueError: Unclosed image token
23
 
24
  plots
@@ -39,57 +37,16 @@ table
39
  import gradio as gr
40
  from vocab import all_tokenizers
41
  from util import *
 
42
 
43
- # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
44
- examples_zh = [
45
- ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
46
- ["标点测试:,。!?;", "baichuan_7b", "llama"],
47
- ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
48
- ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
49
- ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
50
- ]
51
 
52
- examples = [
53
- ["spaces: 2spaces 8spaces", "llama", "chatglm_6b"], # chatglm 有blank_n,
54
- ["punctuations: ,./?\",。!?;", "baichuan_7b", "llama"],
55
- ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
56
- ["digits: (10086 + 98) = 100184", "baichuan_7b", "llama"],
57
- ]
58
 
59
-
60
- # jieba.enable_parallel() # flask中没办法parallel
61
-
62
-
63
-
64
-
65
- def example_fn(example_idx):
66
- return examples[example_idx]
67
-
68
-
69
- """Replace this text in the input field to see how tokenization works
70
-
71
-
72
- """
73
-
74
- default_user_input = """Replace this text in the input field to see how tokenization works
75
- 华为发布Mate60手机
76
- ラグビーワールドカップ2023フランス"""
77
- default_tokenizer_type_1 = "llama"
78
- default_tokenizer_type_2 = "internlm_chat_7b"
79
- default_stats_vocab_size_1, default_stats_zh_token_size_1 = basic_count(default_tokenizer_type_1)
80
- default_stats_vocab_size_2, default_stats_zh_token_size_2 = basic_count(default_tokenizer_type_2)
81
- default_stats_overlap_token_size = get_overlap_token_size(default_tokenizer_type_1, default_tokenizer_type_2)[0]
82
- default_output_text_1, default_output_table_1, default_output_len_1 = tokenize(default_user_input, default_tokenizer_type_1, update=False)
83
- default_output_text_2, default_output_table_2, default_output_len_2 = tokenize(default_user_input, default_tokenizer_type_2, update=False)
84
-
85
- with gr.Blocks(css="style.css") as demo:
86
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
87
  # links: https://www.coderstool.com/utf8-encoding-decoding
88
  # 功能:输入文本,进行分词
89
  # 分词器:常见的分词器有几种,
90
  # 背景:方便分词、看词粒度、对比
91
- #
92
- # Byte: 表示分词
93
 
94
  with gr.Row():
95
  gr.Markdown("## Input Text")
@@ -103,26 +60,18 @@ with gr.Blocks(css="style.css") as demo:
103
  scale=0,
104
  elem_classes="example-style"
105
  )
106
-
107
  user_input = gr.Textbox(
108
- value=default_user_input,
109
  label="Input Text",
110
  lines=5,
111
  show_label=False,
112
- ) # placeholder="Enter sentence here..."
113
- # gr.Examples(
114
- # examples,
115
- # None,
116
- # )
117
-
118
  gr.Markdown("## Tokenization")
119
-
120
  with gr.Row():
121
  with gr.Column(scale=6):
122
  with gr.Group():
123
  tokenizer_type_1 = gr.Dropdown(
124
  all_tokenizers,
125
- value=default_tokenizer_type_1,
126
  label="Tokenizer 1",
127
  )
128
  with gr.Group():
@@ -131,19 +80,17 @@ with gr.Blocks(css="style.css") as demo:
131
  """
132
  with gr.Row():
133
  stats_vocab_size_1 = gr.TextArea(
134
- value=default_stats_vocab_size_1,
135
  label="VocabSize",
136
  lines=1,
137
  elem_classes="statistics"
138
  )
139
  stats_zh_token_size_1 = gr.TextArea(
140
- value=default_stats_zh_token_size_1,
141
  label="ZH char/word",
142
  lines=1,
143
  elem_classes="statistics"
144
  )
145
  stats_overlap_token_size_1 = gr.TextArea(
146
- value=default_stats_overlap_token_size,
147
  label="Overlap Tokens",
148
  lines=1,
149
  elem_classes="statistics"
@@ -161,19 +108,16 @@ with gr.Blocks(css="style.css") as demo:
161
  with gr.Group():
162
  tokenizer_type_2 = gr.Dropdown(
163
  all_tokenizers,
164
- value=default_tokenizer_type_2,
165
  label="Tokenizer 2",
166
  )
167
  with gr.Group():
168
  with gr.Row():
169
  stats_vocab_size_2 = gr.TextArea(
170
- value=default_stats_vocab_size_2,
171
  label="VocabSize",
172
  lines=1,
173
  elem_classes="statistics"
174
  )
175
  stats_zh_token_size_2 = gr.TextArea(
176
- value=default_stats_zh_token_size_2,
177
  label="ZH char/word", # 中文字/词
178
  lines=1,
179
  elem_classes="statistics"
@@ -184,7 +128,6 @@ with gr.Blocks(css="style.css") as demo:
184
  # elem_classes="statistics"
185
  # )
186
  stats_overlap_token_size_2 = gr.TextArea(
187
- value=default_stats_overlap_token_size,
188
  label="Overlap Tokens",
189
  lines=1,
190
  elem_classes="statistics"
@@ -194,42 +137,28 @@ with gr.Blocks(css="style.css") as demo:
194
  with gr.Row():
195
  with gr.Column():
196
  output_text_1 = gr.Highlightedtext(
197
- value=default_output_text_1,
198
- label=f"Tokens: {default_output_len_1}",
199
  show_legend=True,
200
  elem_classes="space-show"
201
  )
202
  with gr.Column():
203
  output_text_2 = gr.Highlightedtext(
204
- value=default_output_text_2,
205
- label=f"Tokens: {default_output_len_2}",
206
  show_legend=True,
207
  elem_classes="space-show"
208
  )
209
 
210
  with gr.Row():
211
- output_table_1 = gr.Dataframe(
212
- value=default_output_table_1,
213
- headers=["TokenID", "Byte", "Text"],
214
- datatype=["str", "str", "str"],
215
- # elem_classes="space-show", # 给整个Dataframe加这个css不起作用,因此直接修改cell-wrap
216
- )
217
- output_table_2 = gr.Dataframe(
218
- value=default_output_table_2,
219
- headers=["TokenID", "Token", "Text"],
220
- datatype=["str", "str", "str"],
221
- )
222
 
223
  tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
224
  [output_text_1, output_table_1])
225
- # 下面两个好像可以合并
226
  tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
227
  tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
228
  [stats_overlap_token_size_1, stats_overlap_token_size_2])
229
 
230
  user_input.change(tokenize_pair,
231
  [user_input, tokenizer_type_1, tokenizer_type_2],
232
- [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
233
 
234
  tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
235
  [output_text_2, output_table_2])
@@ -243,9 +172,21 @@ with gr.Blocks(css="style.css") as demo:
243
  [user_input, tokenizer_type_1, tokenizer_type_2]
244
  )
245
 
246
- # start up 初始化
247
- # user_input.update(user_input.value + "___")
 
 
 
 
 
248
 
249
  if __name__ == "__main__":
250
- demo.queue(max_size=20).launch()
251
- # demo.launch()
 
 
 
 
 
 
 
 
4
 
5
  """
6
  ## TODO:
 
7
  - i18 国际化 https://blog.csdn.net/qq_26212731/article/details/78457198 request.header中也有language
8
  - iter_vocab 的 warmup
9
+ - 开关
10
+ - add_special_token 开关
11
+ - theme 开关 light/dark
12
+ - token_id/tokens/bytes 开关
13
+ - 中文字词统计,是否要包括 _ G 等字符
14
+ - 评测
15
+ - OOV评测
16
  - 通过 javascript 添加 hover_text
 
17
  - 英文 utf-8编码
18
+ - 词典支持下载,借用image下载的标签,
 
19
  - baichuan的单字数量怎么两万多个?
 
 
 
20
  - qwen: ValueError: Unclosed image token
21
 
22
  plots
 
37
  import gradio as gr
38
  from vocab import all_tokenizers
39
  from util import *
40
+ from examples import example_fn
41
 
 
 
 
 
 
 
 
 
42
 
 
 
 
 
 
 
43
 
44
+ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""")
46
  # links: https://www.coderstool.com/utf8-encoding-decoding
47
  # 功能:输入文本,进行分词
48
  # 分词器:常见的分词器有几种,
49
  # 背景:方便分词、看词粒度、对比
 
 
50
 
51
  with gr.Row():
52
  gr.Markdown("## Input Text")
 
60
  scale=0,
61
  elem_classes="example-style"
62
  )
 
63
  user_input = gr.Textbox(
64
+ # value=default_user_input,
65
  label="Input Text",
66
  lines=5,
67
  show_label=False,
68
+ )
 
 
 
 
 
69
  gr.Markdown("## Tokenization")
 
70
  with gr.Row():
71
  with gr.Column(scale=6):
72
  with gr.Group():
73
  tokenizer_type_1 = gr.Dropdown(
74
  all_tokenizers,
 
75
  label="Tokenizer 1",
76
  )
77
  with gr.Group():
 
80
  """
81
  with gr.Row():
82
  stats_vocab_size_1 = gr.TextArea(
 
83
  label="VocabSize",
84
  lines=1,
85
  elem_classes="statistics"
86
  )
87
  stats_zh_token_size_1 = gr.TextArea(
 
88
  label="ZH char/word",
89
  lines=1,
90
  elem_classes="statistics"
91
  )
92
  stats_overlap_token_size_1 = gr.TextArea(
93
+ # value=default_stats_overlap_token_size,
94
  label="Overlap Tokens",
95
  lines=1,
96
  elem_classes="statistics"
 
108
  with gr.Group():
109
  tokenizer_type_2 = gr.Dropdown(
110
  all_tokenizers,
 
111
  label="Tokenizer 2",
112
  )
113
  with gr.Group():
114
  with gr.Row():
115
  stats_vocab_size_2 = gr.TextArea(
 
116
  label="VocabSize",
117
  lines=1,
118
  elem_classes="statistics"
119
  )
120
  stats_zh_token_size_2 = gr.TextArea(
 
121
  label="ZH char/word", # 中文字/词
122
  lines=1,
123
  elem_classes="statistics"
 
128
  # elem_classes="statistics"
129
  # )
130
  stats_overlap_token_size_2 = gr.TextArea(
 
131
  label="Overlap Tokens",
132
  lines=1,
133
  elem_classes="statistics"
 
137
  with gr.Row():
138
  with gr.Column():
139
  output_text_1 = gr.Highlightedtext(
 
 
140
  show_legend=True,
141
  elem_classes="space-show"
142
  )
143
  with gr.Column():
144
  output_text_2 = gr.Highlightedtext(
 
 
145
  show_legend=True,
146
  elem_classes="space-show"
147
  )
148
 
149
  with gr.Row():
150
+ output_table_1 = gr.Dataframe()
151
+ output_table_2 = gr.Dataframe()
 
 
 
 
 
 
 
 
 
152
 
153
  tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
154
  [output_text_1, output_table_1])
 
155
  tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
156
  tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
157
  [stats_overlap_token_size_1, stats_overlap_token_size_2])
158
 
159
  user_input.change(tokenize_pair,
160
  [user_input, tokenizer_type_1, tokenizer_type_2],
161
+ [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
162
 
163
  tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2],
164
  [output_text_2, output_table_2])
 
172
  [user_input, tokenizer_type_1, tokenizer_type_2]
173
  )
174
 
175
+ demo.load(_js=open("js/onload.js", "r", encoding="utf-8").read())
176
+ demo.load(
177
+ fn=on_load,
178
+ inputs=None,
179
+ outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
180
+ )
181
+
182
 
183
  if __name__ == "__main__":
184
+ print("http://127.0.0.1:7860/?tokenizer1=llama&tokenizer2=chinese_llama2&text=fdsjlk") # llama chinese_llama2
185
+ print(
186
+ "http://127.0.0.1:7860/?tokenizer1=chinese_llama&tokenizer2=chinese_llama2&text=fdsjlk") # llama chinese_llama2
187
+ print("http://127.0.0.1:7860/?tokenizer1=baichuan&tokenizer2=baichuan2&text=sss") # baichuan 1 VS 2
188
+ print("http://127.0.0.1:7860/?tokenizer1=bert&tokenizer2=clue&text=sss") # bert VS clue
189
+ print("http://127.0.0.1:7860/?tokenizer1=clue&tokenizer2=kplug&text=sss") # clue VS kplug
190
+ print("http://127.0.0.1:7860/?tokenizer1=baichuan&tokenizer2=baichuan2&text=sss") #
191
+ # demo.queue(max_size=20).launch()
192
+ demo.launch()
style.css → css/style.css RENAMED
File without changes
evaluation.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+
3
+ ## coverage
4
+
5
+ rare characters falling back to utf-8 bytes
examples.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ examples = {
2
+ "en": [
3
+ ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm_6b"], # chatglm 有blank_n,
4
+ # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
5
+ ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
6
+ ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
7
+ ["digits: (10086 + 98) = 100184", "baichuan", "llama"]
8
+ ]
9
+ ,
10
+ "zh": [
11
+ ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
12
+ ["标点测试:,。!?;", "baichuan_7b", "llama"],
13
+ ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
14
+ ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
15
+ ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
16
+ ]
17
+
18
+ }
19
+
20
+
21
+ def example_fn(example_idx):
22
+ return examples["en"][example_idx]
images/README.md CHANGED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ ## info
3
+
4
+ https://huggingface.co/bert-base-uncased
5
+
images/download_button.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div class="icon-buttons svelte-1btp92j"><a href="" download="image" target="_blank"><button aria-label="Download" title="Download" class="svelte-1030q2h"> <div class="svelte-1030q2h"><svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 32 32"><path fill="currentColor" d="M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z"></path></svg></div></button></a> </div>
js/onload.js ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ function() {
2
+ // feedback
3
+ //$("footer a")["href"] = "https://github.com/xu-song/tokenizer-arena/issues"
4
+ //$("footer a").childNodes[0].textContent ="Send Feedback"
5
+
6
+ document.querySelectorAll("footer a")[0].childNodes[0].textContent ="Send Feedback";
7
+ document.querySelectorAll("footer a")[0].href = "https://github.com/xu-song/tokenizer-arena/issues";
8
+
9
+ // download button
10
+
11
+ // API
12
+ }
util.py CHANGED
@@ -5,13 +5,15 @@ import pandas as pd
5
  from vocab import load_tokener
6
  from utils.zh_util import iter_vocab
7
  from utils.log_util import logger
 
 
8
 
9
 
10
- def tokenize(text, tokenizer_type, color_num=5, update=True):
 
11
  """
12
- TODO: cache tokenizer
13
  """
14
- logger.info("[param]:" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
15
  pos_tokens = []
16
  tokenizer = load_tokener(tokenizer_type)
17
  encoding = tokenizer.encode(text)
@@ -29,16 +31,16 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
29
  token_str = token.decode("utf-8")
30
  except:
31
  token_str = token.decode("utf-8", errors="ignore")
32
- logger.info("[decode_error]: " + json.dumps(
33
  {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
34
  ensure_ascii=False))
35
 
36
  token_bytes = token
37
- json_dumps = json.dumps(token_str)
38
  elif isinstance(token, str):
39
  token_str = token
40
  token_bytes = bytes(token_str, "utf-8")
41
- json_dumps = json.dumps(token_str)
42
  else:
43
  return
44
 
@@ -48,31 +50,23 @@ def tokenize(text, tokenizer_type, color_num=5, update=True):
48
  "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
49
  "Text": decode_text, #
50
  # "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
51
- "Bytes": str(token_bytes),
52
  # "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
53
  }
54
  )
55
 
56
  table_df = pd.DataFrame(table)
57
- logger.info(f"[Tokens {tokenizer_type}]: {table[:2]}")
58
  # print(table_df)
59
 
60
- if update:
61
- return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
62
- else:
63
- return pos_tokens, table_df, len(encoding)
64
 
65
 
66
- def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2, request: gr.Request):
67
- if request:
68
- client_ip = request.client.host
69
- # local_ip = socket.gethostbyname(socket.gethostbyname(""))
70
- headers = request.kwargs['headers']
71
- if headers and 'x-forwarded-for' in headers:
72
- x_forwarded_for = headers['x-forwarded-for']
73
- client_ip = x_forwarded_for.split(' ')[0] if x_forwarded_for else ""
74
- logger.info(f"[client_ip]: {client_ip}, {tokenizer_type_1}, {tokenizer_type_2}")
75
-
76
  pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
77
  pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
78
  return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
@@ -84,21 +78,67 @@ def basic_count(tokenizer_type):
84
  return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
85
 
86
 
 
87
  def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
88
  tokenizer1 = load_tokener(tokenizer_type_1)
89
  tokenizer2 = load_tokener(tokenizer_type_2)
90
- vocab1 = tokenizer1.get_vocab()
91
- vocab2 = tokenizer2.get_vocab()
92
- overlap_tokens = vocab1.keys() & vocab2.keys()
 
 
 
 
 
 
 
 
 
 
93
  overlap_token_size = len(overlap_tokens)
94
- logger.info(f"[OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}]: {list(overlap_tokens)[:10]}")
 
95
  return overlap_token_size, overlap_token_size
96
 
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  def test_coding():
99
  bytes1 = b'\xe4\xb8\xad'
100
  print(bytes1) # b'\xe4\xb8\xad'
101
 
102
 
103
  if __name__ == "__main__":
104
- print(basic_count("internlm_chat_7b"))
 
 
5
  from vocab import load_tokener
6
  from utils.zh_util import iter_vocab
7
  from utils.log_util import logger
8
+ from functools import lru_cache
9
+ from urllib.parse import urlparse, parse_qs
10
 
11
 
12
+ @lru_cache
13
+ def tokenize(text, tokenizer_type, color_num=5):
14
  """
 
15
  """
16
+ logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
17
  pos_tokens = []
18
  tokenizer = load_tokener(tokenizer_type)
19
  encoding = tokenizer.encode(text)
 
31
  token_str = token.decode("utf-8")
32
  except:
33
  token_str = token.decode("utf-8", errors="ignore")
34
+ logger.error("decode_error: " + json.dumps(
35
  {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
36
  ensure_ascii=False))
37
 
38
  token_bytes = token
39
+ # json_dumps = json.dumps(token_str)
40
  elif isinstance(token, str):
41
  token_str = token
42
  token_bytes = bytes(token_str, "utf-8")
43
+ # json_dumps = json.dumps(token_str)
44
  else:
45
  return
46
 
 
50
  "Token": token_str, # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
51
  "Text": decode_text, #
52
  # "Bytes": token_bytes, # bytes类型在gradio前端页面被解码成字符串,比如 b'\xe4\xb8\xad' 仍然显示成 "中"。因此 str(token_bytes)
53
+ "UTF8 Bytes": str(token_bytes),
54
  # "Unicode": json_dumps # unicode, 如果是ascii码,就直接显示。如果不是ascii码,就显示unicode
55
  }
56
  )
57
 
58
  table_df = pd.DataFrame(table)
59
+ logger.info(f"Tokens={table[:2]}")
60
  # print(table_df)
61
 
62
+ return gr.update(value=pos_tokens, label=f"Tokens: {len(encoding)}"), table_df
 
 
 
63
 
64
 
65
+ @lru_cache
66
+ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
67
+ """
68
+ input_text.change
69
+ """
 
 
 
 
 
70
  pos_tokens_1, table_df_1 = tokenize(text, tokenizer_type_1)
71
  pos_tokens_2, table_df_2 = tokenize(text, tokenizer_type_2)
72
  return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2
 
78
  return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
79
 
80
 
81
+ @lru_cache
82
  def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
83
  tokenizer1 = load_tokener(tokenizer_type_1)
84
  tokenizer2 = load_tokener(tokenizer_type_2)
85
+
86
+ vocab_set_1 = tokenizer1.get_vocab().keys()
87
+ vocab_set_2 = tokenizer2.get_vocab().keys()
88
+
89
+ token1 = next(iter(vocab_set_1))
90
+ token2 = next(iter(vocab_set_2))
91
+ if type(token1) != type(token2): # bytes str
92
+ if isinstance(token1, str):
93
+ vocab_set_1 = set([token.encode("utf-8") for token in vocab_set_1])
94
+ if isinstance(token2, str):
95
+ vocab_set_2 = set([token.encode("utf-8") for token in vocab_set_2])
96
+
97
+ overlap_tokens = vocab_set_1 & vocab_set_2
98
  overlap_token_size = len(overlap_tokens)
99
+ logger.info(
100
+ f"{overlap_token_size} OverlapTokens of {tokenizer_type_1} {tokenizer_type_2}: {list(overlap_tokens)[:10]}")
101
  return overlap_token_size, overlap_token_size
102
 
103
 
104
+ default_user_input = """Replace this text in the input field to see how tokenization works
105
+ 华为发布Mate60手机
106
+ ラグビーワールドカップ2023フランス"""
107
+ default_tokenizer_type_1 = "llama"
108
+ # default_tokenizer_type_2 = "internlm_chat_7b"
109
+ default_tokenizer_type_2 = "gpt_35_turbo"
110
+
111
+
112
+ def on_load(request: gr.Request):
113
+ """
114
+ onLoad
115
+ """
116
+ text = None
117
+ tokenizer_type_1 = None
118
+ tokenizer_type_2 = None
119
+ query_params = {}
120
+ if request:
121
+ client_ip = request.client.host
122
+ # local_ip = socket.gethostbyname(socket.gethostbyname(""))
123
+ # headers = request.kwargs['headers']
124
+ # if headers and 'x-forwarded-for' in headers:
125
+ # x_forwarded_for = headers['x-forwarded-for']
126
+ # client_ip = x_forwarded_for.split(' ')[0] if x_forwarded_for else ""
127
+ if "referer" in request.headers:
128
+ query_params = parse_qs(urlparse(request.headers["referer"]).query)
129
+ query_params = {k: v[0] for k, v in query_params.items() if len(v) > 0}
130
+ tokenizer_type_1 = query_params.get("tokenizer1", default_tokenizer_type_1)
131
+ tokenizer_type_2 = query_params.get("tokenizer2", default_tokenizer_type_2)
132
+ text = query_params.get("text", default_user_input)
133
+ logger.info(f"client_ip: {client_ip}; params: {query_params}")
134
+ return text, tokenizer_type_1, tokenizer_type_2
135
+
136
+
137
  def test_coding():
138
  bytes1 = b'\xe4\xb8\xad'
139
  print(bytes1) # b'\xe4\xb8\xad'
140
 
141
 
142
  if __name__ == "__main__":
143
+ print(get_overlap_token_size("gpt_35_turbo", "gpt_4"))
144
+ # print(basic_count("internlm_chat_7b"))
utils/_vocab.zh.jsonl ADDED
@@ -0,0 +1,1189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"id": 529, "token": "’", "type": "中文标点"}
2
+ {"id": 753, "token": "’s", "type": "中文标点"}
3
+ {"id": 863, "token": "”", "type": "中文标点"}
4
+ {"id": 1054, "token": " “", "type": "中文标点"}
5
+ {"id": 1389, "token": " –", "type": "中文标点"}
6
+ {"id": 1431, "token": "’t", "type": "中文标点"}
7
+ {"id": 1811, "token": "。", "type": "中文标点"}
8
+ {"id": 1981, "token": "…", "type": "中文标点"}
9
+ {"id": 2001, "token": " —", "type": "中文标点"}
10
+ {"id": 2029, "token": ".”", "type": "中文标点"}
11
+ {"id": 2118, "token": "“", "type": "中文标点"}
12
+ {"id": 2345, "token": "—", "type": "中文标点"}
13
+ {"id": 2476, "token": ",”", "type": "中文标点"}
14
+ {"id": 2950, "token": ".”\n\n", "type": "中文标点"}
15
+ {"id": 3207, "token": "’re", "type": "中文标点"}
16
+ {"id": 3451, "token": " ‘", "type": "中文标点"}
17
+ {"id": 3490, "token": "。\n\n", "type": "中文标点"}
18
+ {"id": 3922, "token": ",", "type": "中文标点"}
19
+ {"id": 4070, "token": "’ve", "type": "中文标点"}
20
+ {"id": 4235, "token": "–", "type": "中文标点"}
21
+ {"id": 4344, "token": "’m", "type": "中文标点"}
22
+ {"id": 4696, "token": " …", "type": "中文标点"}
23
+ {"id": 4805, "token": "’ll", "type": "中文标点"}
24
+ {"id": 5232, "token": ":", "type": "中文标点"}
25
+ {"id": 5486, "token": "、", "type": "中文标点"}
26
+ {"id": 5551, "token": "…\n\n", "type": "中文标点"}
27
+ {"id": 6447, "token": "!", "type": "中文标点"}
28
+ {"id": 7070, "token": "’d", "type": "中文标点"}
29
+ {"id": 7663, "token": "”\n\n", "type": "中文标点"}
30
+ {"id": 7705, "token": ")", "type": "中文标点"}
31
+ {"id": 8107, "token": "年", "type": "中文单字"}
32
+ {"id": 8713, "token": "——", "type": "中文标点"}
33
+ {"id": 9039, "token": "数", "type": "中文单字"}
34
+ {"id": 9080, "token": "日", "type": "中文单字"}
35
+ {"id": 9174, "token": "。\n", "type": "中文标点"}
36
+ {"id": 9520, "token": "”,", "type": "中文标点"}
37
+ {"id": 9554, "token": "的", "type": "中文单字"}
38
+ {"id": 9787, "token": " ·", "type": "中文标点"}
39
+ {"id": 9953, "token": "月", "type": "中文单字"}
40
+ {"id": 10110, "token": "(", "type": "中文标点"}
41
+ {"id": 10378, "token": "“I", "type": "中文标点"}
42
+ {"id": 10416, "token": " […", "type": "中文标点"}
43
+ {"id": 10646, "token": "」", "type": "中文标点"}
44
+ {"id": 11144, "token": "【", "type": "中文标点"}
45
+ {"id": 11199, "token": "】", "type": "中文标点"}
46
+ {"id": 11453, "token": "”.", "type": "中文标点"}
47
+ {"id": 11571, "token": "?", "type": "中文标点"}
48
+ {"id": 11883, "token": "用", "type": "中文单字"}
49
+ {"id": 12291, "token": " …\n\n", "type": "中文标点"}
50
+ {"id": 12671, "token": "?”", "type": "中文标点"}
51
+ {"id": 12996, "token": " […]\n\n", "type": "中文标点"}
52
+ {"id": 13153, "token": "成", "type": "中文单字"}
53
+ {"id": 13177, "token": "「", "type": "中文标点"}
54
+ {"id": 13372, "token": "名", "type": "中文单字"}
55
+ {"id": 13646, "token": "时", "type": "中文单字"}
56
+ {"id": 14260, "token": "·", "type": "中文标点"}
57
+ {"id": 14305, "token": "“The", "type": "中文标点"}
58
+ {"id": 14336, "token": "‘", "type": "中文标点"}
59
+ {"id": 14382, "token": "……", "type": "中文标点"}
60
+ {"id": 14558, "token": "件", "type": "中文单字"}
61
+ {"id": 14639, "token": ".’", "type": "中文标点"}
62
+ {"id": 15085, "token": "“We", "type": "中文标点"}
63
+ {"id": 15120, "token": "一", "type": "中文单字"}
64
+ {"id": 15179, "token": " „", "type": "中文标点"}
65
+ {"id": 15225, "token": "请", "type": "中文单字"}
66
+ {"id": 15397, "token": "”.\n\n", "type": "中文标点"}
67
+ {"id": 16325, "token": "中", "type": "中文单字"}
68
+ {"id": 16423, "token": "据", "type": "中文单字"}
69
+ {"id": 16616, "token": "?”\n\n", "type": "中文标点"}
70
+ {"id": 16620, "token": "————", "type": "中文标点"}
71
+ {"id": 16882, "token": "码", "type": "中文单字"}
72
+ {"id": 16937, "token": "不", "type": "中文单字"}
73
+ {"id": 17039, "token": "新", "type": "中文单字"}
74
+ {"id": 17161, "token": "文", "type": "中文单字"}
75
+ {"id": 17223, "token": "—and", "type": "中文标点"}
76
+ {"id": 17297, "token": "下", "type": "中文单字"}
77
+ {"id": 17620, "token": "分", "type": "中文单字"}
78
+ {"id": 17701, "token": "入", "type": "中文单字"}
79
+ {"id": 17792, "token": "人", "type": "中文单字"}
80
+ {"id": 17818, "token": "“It", "type": "中文标点"}
81
+ {"id": 17860, "token": "功", "type": "中文单字"}
82
+ {"id": 17905, "token": "上", "type": "中文单字"}
83
+ {"id": 17982, "token": "户", "type": "中文单字"}
84
+ {"id": 18171, "token": "!\n\n", "type": "中文标点"}
85
+ {"id": 18184, "token": "为", "type": "中文单字"}
86
+ {"id": 18217, "token": " ’", "type": "中文标点"}
87
+ {"id": 18319, "token": "!”", "type": "中文标点"}
88
+ {"id": 18363, "token": "间", "type": "中文单字"}
89
+ {"id": 18476, "token": "号", "type": "中文单字"}
90
+ {"id": 18655, "token": "取", "type": "中文单字"}
91
+ {"id": 18904, "token": "回", "type": "中文单字"}
92
+ {"id": 19000, "token": "在", "type": "中文单字"}
93
+ {"id": 19047, "token": "页", "type": "中文单字"}
94
+ {"id": 19066, "token": "。\n\n\n\n", "type": "中文标点"}
95
+ {"id": 19113, "token": "字", "type": "中文单字"}
96
+ {"id": 19361, "token": "有", "type": "中文单字"}
97
+ {"id": 19483, "token": "个", "type": "中文单字"}
98
+ {"id": 19524, "token": " ”", "type": "中文标点"}
99
+ {"id": 19653, "token": "成功", "type": "中文多字"}
100
+ {"id": 19967, "token": "作", "type": "中文单字"}
101
+ {"id": 20145, "token": "】【", "type": "中文标点"}
102
+ {"id": 20182, "token": "’,", "type": "中文标点"}
103
+ {"id": 20379, "token": "示", "type": "中文单字"}
104
+ {"id": 20600, "token": "用户", "type": "中文多字"}
105
+ {"id": 20675, "token": "数据", "type": "中文多字"}
106
+ {"id": 20834, "token": "出", "type": "中文单字"}
107
+ {"id": 21043, "token": "是", "type": "中文单字"}
108
+ {"id": 21060, "token": "….", "type": "中文标点"}
109
+ {"id": 21082, "token": "时间", "type": "中文多字"}
110
+ {"id": 21388, "token": "失", "type": "中文单字"}
111
+ {"id": 21405, "token": "表", "type": "中文单字"}
112
+ {"id": 21418, "token": "除", "type": "中文单字"}
113
+ {"id": 21601, "token": "加", "type": "中文单字"}
114
+ {"id": 21809, "token": "败", "type": "中文单字"}
115
+ {"id": 21909, "token": "~", "type": "中文标点"}
116
+ {"id": 21990, "token": "生", "type": "中文单字"}
117
+ {"id": 22023, "token": "信", "type": "中文单字"}
118
+ {"id": 22117, "token": "’est", "type": "中文标点"}
119
+ {"id": 22238, "token": "类", "type": "中文单字"}
120
+ {"id": 22324, "token": "置", "type": "中文单字"}
121
+ {"id": 22416, "token": "—the", "type": "中文标点"}
122
+ {"id": 22649, "token": "理", "type": "中文单字"}
123
+ {"id": 22656, "token": "本", "type": "中文单字"}
124
+ {"id": 22820, "token": "失败", "type": "中文多字"}
125
+ {"id": 23018, "token": "息", "type": "中文单字"}
126
+ {"id": 23039, "token": "行", "type": "中文单字"}
127
+ {"id": 23187, "token": "定", "type": "中文单字"}
128
+ {"id": 23189, "token": ",’", "type": "中文标点"}
129
+ {"id": 23226, "token": "改", "type": "中文单字"}
130
+ {"id": 23249, "token": " ", "type": "中文标点"}
131
+ {"id": 23530, "token": "市", "type": "中文单字"}
132
+ {"id": 23538, "token": "期", "type": "中文单字"}
133
+ {"id": 23897, "token": "以", "type": "中文单字"}
134
+ {"id": 23951, "token": "修", "type": "中文单字"}
135
+ {"id": 23954, "token": ")\n", "type": "中文标点"}
136
+ {"id": 24186, "token": "元", "type": "中文单字"}
137
+ {"id": 24273, "token": "方", "type": "中文单字"}
138
+ {"id": 24535, "token": "’.", "type": "中文标点"}
139
+ {"id": 24580, "token": "录", "type": "中文单字"}
140
+ {"id": 24775, "token": "区", "type": "中文单字"}
141
+ {"id": 24946, "token": "单", "type": "中文单字"}
142
+ {"id": 25010, "token": "�除", "type": "中文多字"}
143
+ {"id": 25129, "token": "位", "type": "中文单字"}
144
+ {"id": 25287, "token": "型", "type": "中文单字"}
145
+ {"id": 25333, "token": "法", "type": "中文单字"}
146
+ {"id": 25336, "token": "县", "type": "中文单字"}
147
+ {"id": 25359, "token": "存", "type": "中文单字"}
148
+ {"id": 25446, "token": "品", "type": "中文单字"}
149
+ {"id": 25580, "token": "前", "type": "中文单字"}
150
+ {"id": 25666, "token": "称", "type": "中文单字"}
151
+ {"id": 25758, "token": "!”\n\n", "type": "中文标点"}
152
+ {"id": 26016, "token": ";", "type": "中文标点"}
153
+ {"id": 26062, "token": "�回", "type": "中文多字"}
154
+ {"id": 26123, "token": "》", "type": "中文标点"}
155
+ {"id": 26130, "token": "注", "type": "中文单字"}
156
+ {"id": 26239, "token": "修改", "type": "中文多字"}
157
+ {"id": 26592, "token": "值", "type": "中文单字"}
158
+ {"id": 26794, "token": "输", "type": "中文单字"}
159
+ {"id": 26892, "token": "建", "type": "中文单字"}
160
+ {"id": 27179, "token": " (“", "type": "中文标点"}
161
+ {"id": 27327, "token": "能", "type": "中文单字"}
162
+ {"id": 27384, "token": "大", "type": "中文单字"}
163
+ {"id": 27452, "token": "例", "type": "中文单字"}
164
+ {"id": 27479, "token": "度", "type": "中文单字"}
165
+ {"id": 27704, "token": "始", "type": "中文单字"}
166
+ {"id": 27948, "token": "?\n\n", "type": "中文标点"}
167
+ {"id": 27996, "token": "文件", "type": "中文多字"}
168
+ {"id": 28037, "token": "到", "type": "中文单字"}
169
+ {"id": 28038, "token": "《", "type": "中文标点"}
170
+ {"id": 28190, "token": "面", "type": "中文单字"}
171
+ {"id": 28359, "token": "�数", "type": "中文多字"}
172
+ {"id": 28466, "token": "载", "type": "中文单字"}
173
+ {"id": 28469, "token": "信息", "type": "中文多字"}
174
+ {"id": 28542, "token": "点", "type": "中文单字"}
175
+ {"id": 28587, "token": "��取", "type": "中文多字"}
176
+ {"id": 28624, "token": " […]", "type": "中文标点"}
177
+ {"id": 28741, "token": "密", "type": "中文单字"}
178
+ {"id": 28833, "token": "动", "type": "中文单字"}
179
+ {"id": 28873, "token": "果", "type": "中文单字"}
180
+ {"id": 28918, "token": "、\n\n", "type": "中文标点"}
181
+ {"id": 28966, "token": ")\n\n", "type": "中文标点"}
182
+ {"id": 29096, "token": "—a", "type": "中文标点"}
183
+ {"id": 29129, "token": "图", "type": "中文单字"}
184
+ {"id": 29172, "token": "提", "type": "中文单字"}
185
+ {"id": 29391, "token": "发", "type": "中文单字"}
186
+ {"id": 29411, "token": ":\n", "type": "中文标点"}
187
+ {"id": 29430, "token": "式", "type": "中文单字"}
188
+ {"id": 29472, "token": "—\n\n", "type": "中文标点"}
189
+ {"id": 29504, "token": "国", "type": "中文单字"}
190
+ {"id": 29681, "token": "」\n\n", "type": "中文标点"}
191
+ {"id": 29706, "token": "删除", "type": "中文多字"}
192
+ {"id": 29719, "token": "’un", "type": "中文标点"}
193
+ {"id": 29741, "token": "登", "type": "中文单字"}
194
+ {"id": 29826, "token": "错", "type": "中文单字"}
195
+ {"id": 30019, "token": "。。", "type": "中文标点"}
196
+ {"id": 30046, "token": "者", "type": "中文单字"}
197
+ {"id": 30051, "token": "认", "type": "中文单字"}
198
+ {"id": 30156, "token": "误", "type": "中文单字"}
199
+ {"id": 30177, "token": "接", "type": "中文单字"}
200
+ {"id": 30184, "token": "’\n\n", "type": "中文标点"}
201
+ {"id": 30356, "token": "关", "type": "中文单字"}
202
+ {"id": 30358, "token": "重", "type": "中文单字"}
203
+ {"id": 30537, "token": "第", "type": "中文单字"}
204
+ {"id": 30590, "token": "地", "type": "中文单字"}
205
+ {"id": 30624, "token": "如", "type": "中文单字"}
206
+ {"id": 30697, "token": "————————", "type": "中文标点"}
207
+ {"id": 30735, "token": "设", "type": "中文单字"}
208
+ {"id": 30832, "token": "目", "type": "中文单字"}
209
+ {"id": 30867, "token": "开", "type": "中文单字"}
210
+ {"id": 30926, "token": "事", "type": "中文单字"}
211
+ {"id": 31041, "token": "�数", "type": "中文多字"}
212
+ {"id": 31091, "token": "名称", "type": "中文多字"}
213
+ {"id": 31378, "token": "“This", "type": "中文标点"}
214
+ {"id": 31472, "token": " :", "type": "中文标点"}
215
+ {"id": 31540, "token": "可", "type": "中文单字"}
216
+ {"id": 31634, "token": "要", "type": "中文单字"}
217
+ {"id": 31640, "token": "代", "type": "中文单字"}
218
+ {"id": 31809, "token": "小", "type": "中文单字"}
219
+ {"id": 31867, "token": "选", "type": "中文单字"}
220
+ {"id": 31944, "token": "标", "type": "中文单字"}
221
+ {"id": 31958, "token": "明", "type": "中文单字"}
222
+ {"id": 31968, "token": "编", "type": "中文单字"}
223
+ {"id": 32018, "token": "求", "type": "中文单字"}
224
+ {"id": 32218, "token": "列", "type": "中文单字"}
225
+ {"id": 32239, "token": "网", "type": "中文单字"}
226
+ {"id": 32296, "token": "输入", "type": "中文多字"}
227
+ {"id": 32307, "token": "万", "type": "中文单字"}
228
+ {"id": 32335, "token": "最", "type": "中文单字"}
229
+ {"id": 32351, "token": "!!", "type": "中文标点"}
230
+ {"id": 32438, "token": "�建", "type": "中文多字"}
231
+ {"id": 32626, "token": "返回", "type": "中文多字"}
232
+ {"id": 32648, "token": "器", "type": "中文单字"}
233
+ {"id": 32938, "token": "所", "type": "中文单字"}
234
+ {"id": 32943, "token": "内", "type": "中文单字"}
235
+ {"id": 33005, "token": "类型", "type": "中文多字"}
236
+ {"id": 33014, "token": "体", "type": "中文单字"}
237
+ {"id": 33035, "token": "通", "type": "中文单字"}
238
+ {"id": 33052, "token": "务", "type": "中文单字"}
239
+ {"id": 33091, "token": "此", "type": "中文单字"}
240
+ {"id": 33122, "token": "商", "type": "中文单字"}
241
+ {"id": 33144, "token": "序", "type": "中文单字"}
242
+ {"id": 33200, "token": "错误", "type": "中文多字"}
243
+ {"id": 33208, "token": "化", "type": "中文单字"}
244
+ {"id": 33420, "token": "消", "type": "中文单字"}
245
+ {"id": 33476, "token": "否", "type": "中文单字"}
246
+ {"id": 33563, "token": "保", "type": "中文单字"}
247
+ {"id": 33611, "token": "”)", "type": "中文标点"}
248
+ {"id": 33655, "token": "使", "type": "中文单字"}
249
+ {"id": 33671, "token": "次", "type": "中文单字"}
250
+ {"id": 33672, "token": "“You", "type": "中文标点"}
251
+ {"id": 33748, "token": "机", "type": "中文单字"}
252
+ {"id": 33764, "token": "对", "type": "中文单字"}
253
+ {"id": 33765, "token": "参数", "type": "中文多字"}
254
+ {"id": 33777, "token": "’é", "type": "中文标点"}
255
+ {"id": 33857, "token": "量", "type": "中文单字"}
256
+ {"id": 33904, "token": "函数", "type": "中文多字"}
257
+ {"id": 33967, "token": "密码", "type": "中文多字"}
258
+ {"id": 33976, "token": "查", "type": "中文单字"}
259
+ {"id": 34045, "token": "。”", "type": "中文标点"}
260
+ {"id": 34048, "token": "部", "type": "中文单字"}
261
+ {"id": 34171, "token": "性", "type": "中文单字"}
262
+ {"id": 34208, "token": "和", "type": "中文单字"}
263
+ {"id": 34226, "token": "更", "type": "中文单字"}
264
+ {"id": 34547, "token": "后", "type": "中文单字"}
265
+ {"id": 34577, "token": "证", "type": "中文单字"}
266
+ {"id": 34676, "token": " 【", "type": "中文标点"}
267
+ {"id": 34690, "token": "”,", "type": "中文标点"}
268
+ {"id": 34972, "token": "题", "type": "中文单字"}
269
+ {"id": 35056, "token": "确", "type": "中文单字"}
270
+ {"id": 35083, "token": "格", "type": "中文单字"}
271
+ {"id": 35147, "token": ".“", "type": "中文标点"}
272
+ {"id": 35192, "token": ".—", "type": "中文标点"}
273
+ {"id": 35284, "token": ".”\n\n\n\n", "type": "中文标点"}
274
+ {"id": 35287, "token": "了", "type": "中文单字"}
275
+ {"id": 35304, "token": "���", "type": "中文单字"}
276
+ {"id": 35330, "token": "金", "type": "中文单字"}
277
+ {"id": 35417, "token": "公", "type": "中文单字"}
278
+ {"id": 35424, "token": "午", "type": "中文单字"}
279
+ {"id": 35757, "token": "円", "type": "中文单字"}
280
+ {"id": 35816, "token": "“There", "type": "中文标点"}
281
+ {"id": 35818, "token": "片", "type": "中文单字"}
282
+ {"id": 35894, "token": "空", "type": "中文单字"}
283
+ {"id": 35959, "token": "请求", "type": "中文多字"}
284
+ {"id": 36225, "token": "��加", "type": "中文多字"}
285
+ {"id": 36319, "token": ".’\n\n", "type": "中文标点"}
286
+ {"id": 36343, "token": "态", "type": "中文单字"}
287
+ {"id": 36515, "token": "登录", "type": "中文多字"}
288
+ {"id": 36577, "token": "’une", "type": "中文标点"}
289
+ {"id": 36651, "token": "管", "type": "中文单字"}
290
+ {"id": 36668, "token": "主", "type": "中文单字"}
291
+ {"id": 36761, "token": "』", "type": "中文标点"}
292
+ {"id": 36827, "token": "天", "type": "中文单字"}
293
+ {"id": 36896, "token": "、「", "type": "中文标点"}
294
+ {"id": 37026, "token": "自", "type": "中文单字"}
295
+ {"id": 37046, "token": "我", "type": "中文单字"}
296
+ {"id": 37087, "token": "全", "type": "中文单字"}
297
+ {"id": 37271, "token": "今", "type": "中文单字"}
298
+ {"id": 37395, "token": "页面", "type": "中文多字"}
299
+ {"id": 37507, "token": "来", "type": "中文单字"}
300
+ {"id": 37648, "token": "��作", "type": "中文多字"}
301
+ {"id": 37656, "token": "正", "type": "中文单字"}
302
+ {"id": 37687, "token": "说", "type": "中文单字"}
303
+ {"id": 37689, "token": "意", "type": "中文单字"}
304
+ {"id": 37705, "token": "送", "type": "中文单字"}
305
+ {"id": 37729, "token": "容", "type": "中文单字"}
306
+ {"id": 37767, "token": "已", "type": "中文单字"}
307
+ {"id": 37985, "token": "结", "type": "中文单字"}
308
+ {"id": 38087, "token": ":“", "type": "中文标点"}
309
+ {"id": 38093, "token": "会", "type": "中文单字"}
310
+ {"id": 38129, "token": "使用", "type": "中文多字"}
311
+ {"id": 38232, "token": "。</", "type": "中文标点"}
312
+ {"id": 38365, "token": "。\r\n", "type": "中文标点"}
313
+ {"id": 38542, "token": "—but", "type": "中文标点"}
314
+ {"id": 38574, "token": "段", "type": "中文单字"}
315
+ {"id": 38609, "token": "�认", "type": "中文多字"}
316
+ {"id": 38684, "token": "“If", "type": "中文标点"}
317
+ {"id": 38741, "token": "。,", "type": "中文标点"}
318
+ {"id": 38743, "token": "计", "type": "中文单字"}
319
+ {"id": 39045, "token": ",请", "type": "中文多字"}
320
+ {"id": 39084, "token": "源", "type": "中文单字"}
321
+ {"id": 39135, "token": "色", "type": "中文单字"}
322
+ {"id": 39177, "token": "時", "type": "中文单字"}
323
+ {"id": 39209, "token": "交", "type": "中文单字"}
324
+ {"id": 39276, "token": "系", "type": "中文单字"}
325
+ {"id": 39282, "token": "过", "type": "中文单字"}
326
+ {"id": 39312, "token": "电", "type": "中文单字"}
327
+ {"id": 39365, "token": "询", "type": "中文单字"}
328
+ {"id": 39404, "token": "符", "type": "中文单字"}
329
+ {"id": 39425, "token": "…………", "type": "中文标点"}
330
+ {"id": 39442, "token": "未", "type": "中文单字"}
331
+ {"id": 39607, "token": "程", "type": "中文单字"}
332
+ {"id": 40053, "token": "常", "type": "中文单字"}
333
+ {"id": 40089, "token": "条", "type": "中文单字"}
334
+ {"id": 40195, "token": "下", "type": "中文单字"}
335
+ {"id": 40265, "token": "当", "type": "中文单字"}
336
+ {"id": 40452, "token": "管理", "type": "中文多字"}
337
+ {"id": 40466, "token": "��态", "type": "中文多字"}
338
+ {"id": 40474, "token": "情", "type": "中文单字"}
339
+ {"id": 40526, "token": "口", "type": "中文单字"}
340
+ {"id": 40565, "token": "“He", "type": "中文标点"}
341
+ {"id": 40702, "token": "’S", "type": "中文标点"}
342
+ {"id": 40753, "token": "’a", "type": "中文标点"}
343
+ {"id": 40862, "token": "合", "type": "中文单字"}
344
+ {"id": 41007, "token": "方法", "type": "中文多字"}
345
+ {"id": 41053, "token": "车", "type": "中文单字"}
346
+ {"id": 41073, "token": "实", "type": "中文单字"}
347
+ {"id": 41127, "token": "组", "type": "中文单字"}
348
+ {"id": 41128, "token": "—that", "type": "中文标点"}
349
+ {"id": 41190, "token": "操作", "type": "中文多字"}
350
+ {"id": 41354, "token": "’.\n\n", "type": "中文标点"}
351
+ {"id": 41401, "token": "版", "type": "中文单字"}
352
+ {"id": 41642, "token": "周", "type": "中文单字"}
353
+ {"id": 41723, "token": "址", "type": "中文单字"}
354
+ {"id": 41771, "token": "获取", "type": "中文多字"}
355
+ {"id": 41827, "token": ":\"", "type": "中文标点"}
356
+ {"id": 41914, "token": "记", "type": "中文单字"}
357
+ {"id": 41920, "token": "二", "type": "中文单字"}
358
+ {"id": 42016, "token": "同", "type": "中文单字"}
359
+ {"id": 42052, "token": "业", "type": "中文单字"}
360
+ {"id": 42081, "token": "权", "type": "中文单字"}
361
+ {"id": 42246, "token": "其", "type": "中文单字"}
362
+ {"id": 42275, "token": " ,", "type": "中文标点"}
363
+ {"id": 42399, "token": "进", "type": "中文单字"}
364
+ {"id": 42421, "token": "试", "type": "中文单字"}
365
+ {"id": 42462, "token": "验", "type": "中文单字"}
366
+ {"id": 42506, "token": "料", "type": "中文单字"}
367
+ {"id": 42553, "token": ",\n", "type": "中文标点"}
368
+ {"id": 42605, "token": ",“", "type": "中文标点"}
369
+ {"id": 42783, "token": "传", "type": "中文单字"}
370
+ {"id": 43032, "token": "述", "type": "中文单字"}
371
+ {"id": 43167, "token": "集", "type": "中文单字"}
372
+ {"id": 43240, "token": "多", "type": "中文单字"}
373
+ {"id": 43292, "token": "无", "type": "中文单字"}
374
+ {"id": 43323, "token": "员", "type": "中文单字"}
375
+ {"id": 43378, "token": "报", "type": "中文单字"}
376
+ {"id": 43444, "token": " (", "type": "中文标点"}
377
+ {"id": 43511, "token": "他", "type": "中文单字"}
378
+ {"id": 43568, "token": "無", "type": "中文单字"}
379
+ {"id": 43741, "token": "‘s", "type": "中文标点"}
380
+ {"id": 43955, "token": "添加", "type": "中文多字"}
381
+ {"id": 44130, "token": "“What", "type": "中文标点"}
382
+ {"id": 44309, "token": "服", "type": "中文单字"}
383
+ {"id": 44368, "token": "线", "type": "中文单字"}
384
+ {"id": 44388, "token": "这", "type": "中文单字"}
385
+ {"id": 44416, "token": "制", "type": "中文单字"}
386
+ {"id": 44529, "token": "  ", "type": "中文标点"}
387
+ {"id": 44603, "token": "—it", "type": "中文标点"}
388
+ {"id": 44620, "token": "『", "type": "中文标点"}
389
+ {"id": 44689, "token": "的", "type": "中文单字"}
390
+ {"id": 44816, "token": "�始", "type": "中文多字"}
391
+ {"id": 44820, "token": "�单", "type": "中文多字"}
392
+ {"id": 44915, "token": "内容", "type": "中文多字"}
393
+ {"id": 44996, "token": "’il", "type": "中文标点"}
394
+ {"id": 45018, "token": "设置", "type": "中文多字"}
395
+ {"id": 45059, "token": "生成", "type": "中文多字"}
396
+ {"id": 45163, "token": "将", "type": "中文单字"}
397
+ {"id": 45191, "token": "状态", "type": "中文多字"}
398
+ {"id": 45221, "token": "=”", "type": "中文标点"}
399
+ {"id": 45258, "token": "?’", "type": "中文标点"}
400
+ {"id": 45277, "token": "列表", "type": "中文多字"}
401
+ {"id": 45390, "token": "处", "type": "中文单字"}
402
+ {"id": 45460, "token": "】\n\n", "type": "中文标点"}
403
+ {"id": 45472, "token": "输", "type": "中文单字"}
404
+ {"id": 45516, "token": "!\");\n", "type": "中文标点"}
405
+ {"id": 45631, "token": " 「", "type": "中文标点"}
406
+ {"id": 45736, "token": "高", "type": "中文单字"}
407
+ {"id": 45829, "token": "子", "type": "中文单字"}
408
+ {"id": 45893, "token": "道", "type": "中文单字"}
409
+ {"id": 45934, "token": "�述", "type": "中文多字"}
410
+ {"id": 46028, "token": "章", "type": "中文单字"}
411
+ {"id": 46031, "token": "字段", "type": "中文多字"}
412
+ {"id": 46034, "token": "手", "type": "中文单字"}
413
+ {"id": 46056, "token": "库", "type": "中文单字"}
414
+ {"id": 46091, "token": "三", "type": "中文单字"}
415
+ {"id": 46093, "token": "….\n\n", "type": "中文标点"}
416
+ {"id": 46233, "token": "“In", "type": "中文标点"}
417
+ {"id": 46239, "token": "提示", "type": "中文多字"}
418
+ {"id": 46281, "token": "从", "type": "中文单字"}
419
+ {"id": 46456, "token": "支", "type": "中文单字"}
420
+ {"id": 46690, "token": "“They", "type": "中文标点"}
421
+ {"id": 46729, "token": "家", "type": "中文单字"}
422
+ {"id": 46885, "token": "日期", "type": "中文多字"}
423
+ {"id": 46961, "token": "长", "type": "中文单字"}
424
+ {"id": 47000, "token": "付", "type": "中文单字"}
425
+ {"id": 47012, "token": "获取", "type": "中文多字"}
426
+ {"id": 47018, "token": "秒", "type": "中文单字"}
427
+ {"id": 47030, "token": "图片", "type": "中文多字"}
428
+ {"id": 47043, "token": "商品", "type": "中文多字"}
429
+ {"id": 47095, "token": "路", "type": "中文单字"}
430
+ {"id": 47200, "token": "代码", "type": "中文多字"}
431
+ {"id": 47406, "token": "完", "type": "中文单字"}
432
+ {"id": 47436, "token": ":</", "type": "中文标点"}
433
+ {"id": 47523, "token": "象", "type": "中文单字"}
434
+ {"id": 47548, "token": "则", "type": "中文单字"}
435
+ {"id": 47551, "token": "现", "type": "中文单字"}
436
+ {"id": 47566, "token": "设", "type": "中文单字"}
437
+ {"id": 47577, "token": "地址", "type": "中文多字"}
438
+ {"id": 47585, "token": "保存", "type": "中文多字"}
439
+ {"id": 47653, "token": "京", "type": "中文单字"}
440
+ {"id": 47770, "token": "转", "type": "中文单字"}
441
+ {"id": 47896, "token": " –\n\n", "type": "中文标点"}
442
+ {"id": 47971, "token": "�示", "type": "中文多字"}
443
+ {"id": 48039, "token": "辑", "type": "中文单字"}
444
+ {"id": 48044, "token": "一个", "type": "中文多字"}
445
+ {"id": 48249, "token": "限", "type": "中文单字"}
446
+ {"id": 48349, "token": "“A", "type": "中文标点"}
447
+ {"id": 48463, "token": "默认", "type": "中文多字"}
448
+ {"id": 48634, "token": "力", "type": "中文单字"}
449
+ {"id": 48706, "token": "存在", "type": "中文多字"}
450
+ {"id": 48785, "token": "数", "type": "中文单字"}
451
+ {"id": 48858, "token": "创建", "type": "中文多字"}
452
+ {"id": 48864, "token": "学", "type": "中文单字"}
453
+ {"id": 48915, "token": "外", "type": "中文单字"}
454
+ {"id": 48972, "token": "调", "type": "中文单字"}
455
+ {"id": 48974, "token": "服务", "type": "中文多字"}
456
+ {"id": 48982, "token": "项", "type": "中文单字"}
457
+ {"id": 49055, "token": "请输入", "type": "中文多字"}
458
+ {"id": 49216, "token": ".”\n", "type": "中文标点"}
459
+ {"id": 49372, "token": "),", "type": "中文标点"}
460
+ {"id": 49409, "token": "北", "type": "中文单字"}
461
+ {"id": 49491, "token": "字符", "type": "中文多字"}
462
+ {"id": 49525, "token": "—in", "type": "中文标点"}
463
+ {"id": 49543, "token": ":\n\n", "type": "中文标点"}
464
+ {"id": 49792, "token": "工", "type": "中文单字"}
465
+ {"id": 49838, "token": "笑", "type": "中文单字"}
466
+ {"id": 49928, "token": "监", "type": "中文单字"}
467
+ {"id": 49977, "token": "“That", "type": "中文标点"}
468
+ {"id": 49988, "token": "任", "type": "中文单字"}
469
+ {"id": 50004, "token": "—which", "type": "中文标点"}
470
+ {"id": 50021, "token": "相", "type": "中文单字"}
471
+ {"id": 50027, "token": "验证", "type": "中文多字"}
472
+ {"id": 50034, "token": "微", "type": "中文单字"}
473
+ {"id": 50126, "token": "册", "type": "中文单字"}
474
+ {"id": 50182, "token": "联", "type": "中文单字"}
475
+ {"id": 50211, "token": "平", "type": "中文单字"}
476
+ {"id": 50285, "token": "增", "type": "中文单字"}
477
+ {"id": 50287, "token": "听", "type": "中文单字"}
478
+ {"id": 50338, "token": "解", "type": "中文单字"}
479
+ {"id": 50617, "token": "—to", "type": "中文标点"}
480
+ {"id": 50667, "token": "等", "type": "中文单字"}
481
+ {"id": 50808, "token": "’ai", "type": "中文标点"}
482
+ {"id": 50928, "token": "得", "type": "中文单字"}
483
+ {"id": 51107, "token": "更新", "type": "中文多字"}
484
+ {"id": 51109, "token": "收", "type": "中文单字"}
485
+ {"id": 51142, "token": "用户", "type": "中文多字"}
486
+ {"id": 51202, "token": "选�", "type": "中文多字"}
487
+ {"id": 51279, "token": "…”", "type": "中文标点"}
488
+ {"id": 51385, "token": "安", "type": "中文单字"}
489
+ {"id": 51392, "token": "价", "type": "中文单字"}
490
+ {"id": 51431, "token": "第", "type": "中文单字"}
491
+ {"id": 51450, "token": "取消", "type": "中文多字"}
492
+ {"id": 51466, "token": "藏", "type": "中文单字"}
493
+ {"id": 51477, "token": "创建", "type": "中文多字"}
494
+ {"id": 51504, "token": "选择", "type": "中文多字"}
495
+ {"id": 51510, "token": "订单", "type": "中文多字"}
496
+ {"id": 51609, "token": "命", "type": "中文单字"}
497
+ {"id": 51611, "token": "应", "type": "中文单字"}
498
+ {"id": 51747, "token": "为空", "type": "中文多字"}
499
+ {"id": 51749, "token": "—or", "type": "中文标点"}
500
+ {"id": 51757, "token": "—I", "type": "中文标点"}
501
+ {"id": 51786, "token": "“,", "type": "中文标点"}
502
+ {"id": 51928, "token": "“When", "type": "中文标点"}
503
+ {"id": 52030, "token": "看", "type": "中文单字"}
504
+ {"id": 52084, "token": "索", "type": "中文单字"}
505
+ {"id": 52188, "token": "�始化", "type": "中文多字"}
506
+ {"id": 52225, "token": "资", "type": "中文单字"}
507
+ {"id": 52254, "token": "查询", "type": "中文多字"}
508
+ {"id": 52289, "token": "’en", "type": "中文标点"}
509
+ {"id": 52332, "token": "产", "type": "中文单字"}
510
+ {"id": 52563, "token": "表示", "type": "中文多字"}
511
+ {"id": 52675, "token": "串", "type": "中文单字"}
512
+ {"id": 52927, "token": "布", "type": "中文单字"}
513
+ {"id": 53229, "token": "原", "type": "中文单字"}
514
+ {"id": 53263, "token": "…..", "type": "中文标点"}
515
+ {"id": 53283, "token": "知", "type": "中文单字"}
516
+ {"id": 53434, "token": "级", "type": "中文单字"}
517
+ {"id": 53513, "token": "––", "type": "中文标点"}
518
+ {"id": 53610, "token": "水", "type": "中文单字"}
519
+ {"id": 53626, "token": "上传", "type": "中文多字"}
520
+ {"id": 53676, "token": "…and", "type": "中文标点"}
521
+ {"id": 53802, "token": "监听", "type": "中文多字"}
522
+ {"id": 53826, "token": "击", "type": "中文单字"}
523
+ {"id": 53901, "token": "好", "type": "中文单字"}
524
+ {"id": 53953, "token": "物", "type": "中文单字"}
525
+ {"id": 54140, "token": "文", "type": "中文单字"}
526
+ {"id": 54154, "token": "设置", "type": "中文多字"}
527
+ {"id": 54253, "token": "不能", "type": "中文多字"}
528
+ {"id": 54322, "token": "放", "type": "中文单字"}
529
+ {"id": 54456, "token": "亿", "type": "中文单字"}
530
+ {"id": 54493, "token": "经", "type": "中文单字"}
531
+ {"id": 54581, "token": "描述", "type": "中文多字"}
532
+ {"id": 54689, "token": "。。\n\n", "type": "中文标点"}
533
+ {"id": 54747, "token": "。“", "type": "中文标点"}
534
+ {"id": 54872, "token": "模", "type": "中文单字"}
535
+ {"id": 55030, "token": "之", "type": "中文单字"}
536
+ {"id": 55038, "token": "台", "type": "中文单字"}
537
+ {"id": 55080, "token": "…I", "type": "中文标点"}
538
+ {"id": 55121, "token": "显示", "type": "中文多字"}
539
+ {"id": 55139, "token": "州", "type": "中文单字"}
540
+ {"id": 55434, "token": "—is", "type": "中文标点"}
541
+ {"id": 55487, "token": "配", "type": "中文单字"}
542
+ {"id": 55642, "token": "处理", "type": "中文多字"}
543
+ {"id": 55723, "token": "画", "type": "中文单字"}
544
+ {"id": 55758, "token": "统", "type": "中文单字"}
545
+ {"id": 55951, "token": "是", "type": "中文单字"}
546
+ {"id": 55999, "token": "共", "type": "中文单字"}
547
+ {"id": 56026, "token": "连", "type": "中文单字"}
548
+ {"id": 56040, "token": "〜", "type": "中文标点"}
549
+ {"id": 56163, "token": "„", "type": "中文标点"}
550
+ {"id": 56209, "token": "…\"", "type": "中文标点"}
551
+ {"id": 56235, "token": "海", "type": "中文单字"}
552
+ {"id": 56386, "token": "开始", "type": "中文多字"}
553
+ {"id": 56438, "token": "所有", "type": "中文多字"}
554
+ {"id": 56602, "token": "节", "type": "中文单字"}
555
+ {"id": 56716, "token": "返回", "type": "中文多字"}
556
+ {"id": 56906, "token": "退", "type": "中文单字"}
557
+ {"id": 56907, "token": "”。", "type": "中文标点"}
558
+ {"id": 56955, "token": "”),", "type": "中文标点"}
559
+ {"id": 56965, "token": "間", "type": "中文单字"}
560
+ {"id": 57106, "token": "比", "type": "中文单字"}
561
+ {"id": 57107, "token": "问", "type": "中文单字"}
562
+ {"id": 57237, "token": "至", "type": "中文单字"}
563
+ {"id": 57287, "token": "’aut", "type": "中文标点"}
564
+ {"id": 57378, "token": "备", "type": "中文单字"}
565
+ {"id": 57633, "token": "”:", "type": "中文标点"}
566
+ {"id": 57668, "token": "你", "type": "中文单字"}
567
+ {"id": 57752, "token": "黑", "type": "中文单字"}
568
+ {"id": 57861, "token": "…”\n\n", "type": "中文标点"}
569
+ {"id": 57892, "token": "’av", "type": "中文标点"}
570
+ {"id": 58004, "token": "下午", "type": "中文多字"}
571
+ {"id": 58119, "token": "编辑", "type": "中文多字"}
572
+ {"id": 58291, "token": "或", "type": "中文单字"}
573
+ {"id": 58318, "token": "与", "type": "中文单字"}
574
+ {"id": 58322, "token": "影", "type": "中文单字"}
575
+ {"id": 58386, "token": "’h", "type": "中文标点"}
576
+ {"id": 58521, "token": "作者", "type": "中文多字"}
577
+ {"id": 58543, "token": "话", "type": "中文单字"}
578
+ {"id": 58552, "token": "视", "type": "中文单字"}
579
+ {"id": 58653, "token": "读", "type": "中文单字"}
580
+ {"id": 58655, "token": "告", "type": "中文单字"}
581
+ {"id": 58666, "token": "美", "type": "中文单字"}
582
+ {"id": 58721, "token": "事件", "type": "中文多字"}
583
+ {"id": 58850, "token": "女", "type": "中文单字"}
584
+ {"id": 58911, "token": "山", "type": "中文单字"}
585
+ {"id": 59243, "token": "和", "type": "中文单字"}
586
+ {"id": 59363, "token": "生", "type": "中文单字"}
587
+ {"id": 59459, "token": "。(", "type": "中文标点"}
588
+ {"id": 59462, "token": "需", "type": "中文单字"}
589
+ {"id": 59464, "token": "复", "type": "中文单字"}
590
+ {"id": 59505, "token": "手机", "type": "中文多字"}
591
+ {"id": 59563, "token": "南", "type": "中文单字"}
592
+ {"id": 59614, "token": "必", "type": "中文单字"}
593
+ {"id": 59622, "token": "�行", "type": "中文多字"}
594
+ {"id": 59712, "token": "」「", "type": "中文标点"}
595
+ {"id": 59757, "token": "分", "type": "中文单字"}
596
+ {"id": 59795, "token": "中国", "type": "中文多字"}
597
+ {"id": 59892, "token": "闭", "type": "中文单字"}
598
+ {"id": 59914, "token": "加载", "type": "中文多字"}
599
+ {"id": 60174, "token": "城", "type": "中文单字"}
600
+ {"id": 60205, "token": "用户名", "type": "中文多字"}
601
+ {"id": 60233, "token": " 。", "type": "中文标点"}
602
+ {"id": 60239, "token": "�性", "type": "中文多字"}
603
+ {"id": 60251, "token": "结果", "type": "中文多字"}
604
+ {"id": 60317, "token": ";\n", "type": "中文标点"}
605
+ {"id": 60358, "token": "近", "type": "中文单字"}
606
+ {"id": 60455, "token": "效", "type": "中文单字"}
607
+ {"id": 60632, "token": "利", "type": "中文单字"}
608
+ {"id": 60634, "token": "移", "type": "中文单字"}
609
+ {"id": 60654, "token": "—as", "type": "中文标点"}
610
+ {"id": 60656, "token": "’int", "type": "中文标点"}
611
+ {"id": 60710, "token": "–\n\n", "type": "中文标点"}
612
+ {"id": 60843, "token": "总", "type": "中文单字"}
613
+ {"id": 60979, "token": "按", "type": "中文单字"}
614
+ {"id": 61056, "token": "排", "type": "中文单字"}
615
+ {"id": 61075, "token": "首", "type": "中文单字"}
616
+ {"id": 61131, "token": "’n", "type": "中文标点"}
617
+ {"id": 61176, "token": "··", "type": "中文标点"}
618
+ {"id": 61304, "token": "記", "type": "中文单字"}
619
+ {"id": 61311, "token": "————————————————", "type": "中文标点"}
620
+ {"id": 61337, "token": "社", "type": "中文单字"}
621
+ {"id": 61496, "token": "标题", "type": "中文多字"}
622
+ {"id": 61553, "token": "“As", "type": "中文标点"}
623
+ {"id": 61559, "token": "“No", "type": "中文标点"}
624
+ {"id": 61603, "token": "“But", "type": "中文标点"}
625
+ {"id": 61633, "token": "注意", "type": "中文多字"}
626
+ {"id": 61648, "token": "完成", "type": "中文多字"}
627
+ {"id": 61710, "token": "确定", "type": "中文多字"}
628
+ {"id": 61786, "token": "西", "type": "中文单字"}
629
+ {"id": 61826, "token": "先", "type": "中文单字"}
630
+ {"id": 61903, "token": "…\"\n\n", "type": "中文标点"}
631
+ {"id": 61994, "token": "然", "type": "中文单字"}
632
+ {"id": 62049, "token": "键", "type": "中文单字"}
633
+ {"id": 62205, "token": "名", "type": "中文单字"}
634
+ {"id": 62249, "token": "周期", "type": "中文多字"}
635
+ {"id": 62291, "token": "额", "type": "中文单字"}
636
+ {"id": 62543, "token": "写", "type": "中文单字"}
637
+ {"id": 62597, "token": "“My", "type": "中文标点"}
638
+ {"id": 62717, "token": "�名", "type": "中文多字"}
639
+ {"id": 62789, "token": "注册", "type": "中文多字"}
640
+ {"id": 62855, "token": "签", "type": "中文单字"}
641
+ {"id": 63091, "token": "自", "type": "中文单字"}
642
+ {"id": 63093, "token": "。',\n", "type": "中文标点"}
643
+ {"id": 63212, "token": "因", "type": "中文单字"}
644
+ {"id": 63289, "token": "下载", "type": "中文多字"}
645
+ {"id": 63344, "token": "如果", "type": "中文多字"}
646
+ {"id": 63362, "token": "数据", "type": "中文多字"}
647
+ {"id": 63397, "token": "命周期", "type": "中文多字"}
648
+ {"id": 63679, "token": "注", "type": "中文单字"}
649
+ {"id": 63750, "token": "”—", "type": "中文标点"}
650
+ {"id": 63938, "token": "—not", "type": "中文标点"}
651
+ {"id": 63977, "token": " —\n\n", "type": "中文标点"}
652
+ {"id": 64022, "token": "别", "type": "中文单字"}
653
+ {"id": 64026, "token": "并", "type": "中文单字"}
654
+ {"id": 64045, "token": "异", "type": "中文单字"}
655
+ {"id": 64063, "token": "束", "type": "中文单字"}
656
+ {"id": 64171, "token": "修改", "type": "中文多字"}
657
+ {"id": 64173, "token": "删除", "type": "中文多字"}
658
+ {"id": 64179, "token": "生命周期", "type": "中文多字"}
659
+ {"id": 64209, "token": "心", "type": "中文单字"}
660
+ {"id": 64376, "token": "。\",\n", "type": "中文标点"}
661
+ {"id": 64414, "token": "链", "type": "中文单字"}
662
+ {"id": 64467, "token": "指", "type": "中文单字"}
663
+ {"id": 64479, "token": "评", "type": "中文单字"}
664
+ {"id": 64531, "token": "整", "type": "中文单字"}
665
+ {"id": 64623, "token": "’in", "type": "中文标点"}
666
+ {"id": 64803, "token": "四", "type": "中文单字"}
667
+ {"id": 64889, "token": "断", "type": "中文单字"}
668
+ {"id": 64936, "token": "角", "type": "中文单字"}
669
+ {"id": 64960, "token": "生命周期函数", "type": "中文多字"}
670
+ {"id": 65053, "token": "监听页面", "type": "中文多字"}
671
+ {"id": 65164, "token": "连接", "type": "中文多字"}
672
+ {"id": 65218, "token": "上", "type": "中文单字"}
673
+ {"id": 65305, "token": "消息", "type": "中文多字"}
674
+ {"id": 65312, "token": "”).", "type": "中文标点"}
675
+ {"id": 65372, "token": "软", "type": "中文单字"}
676
+ {"id": 65455, "token": "头", "type": "中文单字"}
677
+ {"id": 65459, "token": ")、", "type": "中文标点"}
678
+ {"id": 65529, "token": "对象", "type": "中文多字"}
679
+ {"id": 65571, "token": "是否", "type": "中文多字"}
680
+ {"id": 65573, "token": "邮", "type": "中文单字"}
681
+ {"id": 65659, "token": "义", "type": "中文单字"}
682
+ {"id": 65743, "token": "司", "type": "中文单字"}
683
+ {"id": 65782, "token": "步", "type": "中文单字"}
684
+ {"id": 65789, "token": "门", "type": "中文单字"}
685
+ {"id": 65820, "token": "导", "type": "中文单字"}
686
+ {"id": 65854, "token": "客", "type": "中文单字"}
687
+ {"id": 65884, "token": "不能为空", "type": "中文多字"}
688
+ {"id": 65917, "token": "右", "type": "中文单字"}
689
+ {"id": 66052, "token": "频", "type": "中文单字"}
690
+ {"id": 66101, "token": "\"—", "type": "中文标点"}
691
+ {"id": 66201, "token": "像", "type": "中文单字"}
692
+ {"id": 66327, "token": "。「", "type": "中文标点"}
693
+ {"id": 66378, "token": "特", "type": "中文单字"}
694
+ {"id": 66383, "token": "」と", "type": "中文标点"}
695
+ {"id": 66545, "token": "”;", "type": "中文标点"}
696
+ {"id": 66621, "token": " ….", "type": "中文标点"}
697
+ {"id": 66625, "token": "“Our", "type": "中文标点"}
698
+ {"id": 66677, "token": "记录", "type": "中文多字"}
699
+ {"id": 66679, "token": "…\n\n\n", "type": "中文标点"}
700
+ {"id": 66776, "token": "非", "type": "中文单字"}
701
+ {"id": 66850, "token": " “[", "type": "中文标点"}
702
+ {"id": 66870, "token": "省", "type": "中文单字"}
703
+ {"id": 67117, "token": "输出", "type": "中文多字"}
704
+ {"id": 67178, "token": "造", "type": "中文单字"}
705
+ {"id": 67282, "token": "’ét", "type": "中文标点"}
706
+ {"id": 67287, "token": "姓名", "type": "中文多字"}
707
+ {"id": 67494, "token": "说明", "type": "中文多字"}
708
+ {"id": 67658, "token": "字符串", "type": "中文多字"}
709
+ {"id": 67669, "token": "径", "type": "中文单字"}
710
+ {"id": 67735, "token": "�试", "type": "中文多字"}
711
+ {"id": 67870, "token": "’e", "type": "中文标点"}
712
+ {"id": 67886, "token": " ”\n\n", "type": "中文标点"}
713
+ {"id": 67933, "token": "详", "type": "中文单字"}
714
+ {"id": 67986, "token": "验证码", "type": "中文多字"}
715
+ {"id": 67998, "token": "。\\", "type": "中文标点"}
716
+ {"id": 68171, "token": "由", "type": "中文单字"}
717
+ {"id": 68230, "token": "^", "type": "中文标点"}
718
+ {"id": 68306, "token": "’on", "type": "中文标点"}
719
+ {"id": 68379, "token": "包", "type": "中文单字"}
720
+ {"id": 68438, "token": "通过", "type": "中文多字"}
721
+ {"id": 68464, "token": "东", "type": "中文单字"}
722
+ {"id": 68850, "token": ")—", "type": "中文标点"}
723
+ {"id": 68931, "token": "论", "type": "中文单字"}
724
+ {"id": 68932, "token": "“And", "type": "中文标点"}
725
+ {"id": 69049, "token": "当前", "type": "中文多字"}
726
+ {"id": 69165, "token": "络", "type": "中文单字"}
727
+ {"id": 69253, "token": "款", "type": "中文单字"}
728
+ {"id": 69272, "token": "�藏", "type": "中文多字"}
729
+ {"id": 69362, "token": "支付", "type": "中文多字"}
730
+ {"id": 69496, "token": "启", "type": "中文单字"}
731
+ {"id": 69636, "token": "而", "type": "中文单字"}
732
+ {"id": 69856, "token": "填", "type": "中文单字"}
733
+ {"id": 69905, "token": "格式", "type": "中文多字"}
734
+ {"id": 69962, "token": "释", "type": "中文单字"}
735
+ {"id": 69978, "token": "持", "type": "中文单字"}
736
+ {"id": 70041, "token": "��索", "type": "中文多字"}
737
+ {"id": 70090, "token": "北京", "type": "中文多字"}
738
+ {"id": 70141, "token": "向", "type": "中文单字"}
739
+ {"id": 70158, "token": "输入", "type": "中文多字"}
740
+ {"id": 70203, "token": "算", "type": "中文单字"}
741
+ {"id": 70214, "token": "“So", "type": "中文标点"}
742
+ {"id": 70262, "token": "对", "type": "中文单字"}
743
+ {"id": 70277, "token": "江", "type": "中文单字"}
744
+ {"id": 70284, "token": "不存在", "type": "中文多字"}
745
+ {"id": 70349, "token": "里", "type": "中文单字"}
746
+ {"id": 70453, "token": "查", "type": "中文单字"}
747
+ {"id": 70472, "token": "如", "type": "中文单字"}
748
+ {"id": 70525, "token": "发", "type": "中文单字"}
749
+ {"id": 70542, "token": "份", "type": "中文单字"}
750
+ {"id": 70615, "token": "),", "type": "中文标点"}
751
+ {"id": 70616, "token": "责", "type": "中文单字"}
752
+ {"id": 70626, "token": "科", "type": "中文单字"}
753
+ {"id": 70694, "token": "文件", "type": "中文多字"}
754
+ {"id": 70774, "token": "类", "type": "中文单字"}
755
+ {"id": 70821, "token": "民", "type": "中文单字"}
756
+ {"id": 70924, "token": "数组", "type": "中文多字"}
757
+ {"id": 71005, "token": "治", "type": "中文单字"}
758
+ {"id": 71082, "token": "%,", "type": "中文标点"}
759
+ {"id": 71174, "token": "声", "type": "中文单字"}
760
+ {"id": 71201, "token": "—they", "type": "中文标点"}
761
+ {"id": 71208, "token": "男", "type": "中文单字"}
762
+ {"id": 71270, "token": "“(", "type": "中文标点"}
763
+ {"id": 71298, "token": "[…", "type": "中文标点"}
764
+ {"id": 71461, "token": "重新", "type": "中文多字"}
765
+ {"id": 71480, "token": "—you", "type": "中文标点"}
766
+ {"id": 71600, "token": "设计", "type": "中文多字"}
767
+ {"id": 71638, "token": "分类", "type": "中文多字"}
768
+ {"id": 71668, "token": "输出", "type": "中文多字"}
769
+ {"id": 71689, "token": "以上", "type": "中文多字"}
770
+ {"id": 71733, "token": "异常", "type": "中文多字"}
771
+ {"id": 71869, "token": "族", "type": "中文单字"}
772
+ {"id": 71890, "token": "站", "type": "中文单字"}
773
+ {"id": 72027, "token": "没", "type": "中文单字"}
774
+ {"id": 72069, "token": "参数", "type": "中文多字"}
775
+ {"id": 72099, "token": "県", "type": "中文单字"}
776
+ {"id": 72125, "token": "雅", "type": "中文单字"}
777
+ {"id": 72209, "token": "版本", "type": "中文多字"}
778
+ {"id": 72234, "token": "换", "type": "中文单字"}
779
+ {"id": 72237, "token": "核", "type": "中文单字"}
780
+ {"id": 72238, "token": "素", "type": "中文单字"}
781
+ {"id": 72318, "token": "—for", "type": "中文标点"}
782
+ {"id": 72368, "token": "都", "type": "中文单字"}
783
+ {"id": 72404, "token": "超", "type": "中文单字"}
784
+ {"id": 72434, "token": "!’", "type": "中文标点"}
785
+ {"id": 72456, "token": "网络", "type": "中文多字"}
786
+ {"id": 72516, "token": "店", "type": "中文单字"}
787
+ {"id": 72718, "token": "起", "type": "中文单字"}
788
+ {"id": 72794, "token": "隐藏", "type": "中文多字"}
789
+ {"id": 72843, "token": "享", "type": "中文单字"}
790
+ {"id": 72873, "token": "方", "type": "中文单字"}
791
+ {"id": 72917, "token": "进行", "type": "中文多字"}
792
+ {"id": 73051, "token": "是否", "type": "中文多字"}
793
+ {"id": 73071, "token": "提交", "type": "中文多字"}
794
+ {"id": 73117, "token": "发送", "type": "中文多字"}
795
+ {"id": 73164, "token": "联系", "type": "中文多字"}
796
+ {"id": 73325, "token": "拉", "type": "中文单字"}
797
+ {"id": 73329, "token": "…\n\n\n\n", "type": "中文标点"}
798
+ {"id": 73361, "token": "米", "type": "中文单字"}
799
+ {"id": 73548, "token": "系统", "type": "中文多字"}
800
+ {"id": 73686, "token": "引", "type": "中文单字"}
801
+ {"id": 73740, "token": "编号", "type": "中文多字"}
802
+ {"id": 73751, "token": "点击", "type": "中文多字"}
803
+ {"id": 73769, "token": "更", "type": "中文单字"}
804
+ {"id": 73939, "token": "…)", "type": "中文标点"}
805
+ {"id": 73958, "token": "中", "type": "中文单字"}
806
+ {"id": 73981, "token": "语", "type": "中文单字"}
807
+ {"id": 74022, "token": "”?", "type": "中文标点"}
808
+ {"id": 74090, "token": "土", "type": "中文单字"}
809
+ {"id": 74138, "token": "宋", "type": "中文单字"}
810
+ {"id": 74245, "token": "直", "type": "中文单字"}
811
+ {"id": 74257, "token": "每", "type": "中文单字"}
812
+ {"id": 74318, "token": "公司", "type": "中文多字"}
813
+ {"id": 74396, "token": "箱", "type": "中文单字"}
814
+ {"id": 74412, "token": "字", "type": "中文单字"}
815
+ {"id": 74445, "token": "项目", "type": "中文多字"}
816
+ {"id": 74482, "token": "後", "type": "中文单字"}
817
+ {"id": 74662, "token": "在", "type": "中文单字"}
818
+ {"id": 74770, "token": "可以", "type": "中文多字"}
819
+ {"id": 74843, "token": "参", "type": "中文单字"}
820
+ {"id": 75140, "token": "变", "type": "中文单字"}
821
+ {"id": 75146, "token": "基", "type": "中文单字"}
822
+ {"id": 75259, "token": "页面", "type": "中文多字"}
823
+ {"id": 75267, "token": "場", "type": "中文单字"}
824
+ {"id": 75293, "token": "待", "type": "中文单字"}
825
+ {"id": 75320, "token": "程序", "type": "中文多字"}
826
+ {"id": 75376, "token": ")。", "type": "中文标点"}
827
+ {"id": 75486, "token": "规", "type": "中文单字"}
828
+ {"id": 75493, "token": "数据库", "type": "中文多字"}
829
+ {"id": 75513, "token": "政", "type": "中文单字"}
830
+ {"id": 75550, "token": "“For", "type": "中文标点"}
831
+ {"id": 75630, "token": "雅黑", "type": "中文多字"}
832
+ {"id": 75631, "token": "软雅黑", "type": "中文多字"}
833
+ {"id": 75761, "token": "排序", "type": "中文多字"}
834
+ {"id": 75787, "token": "。\n\n\n\n\n\n", "type": "中文标点"}
835
+ {"id": 75863, "token": "也", "type": "中文单字"}
836
+ {"id": 75910, "token": "介", "type": "中文单字"}
837
+ {"id": 75976, "token": "首页", "type": "中文多字"}
838
+ {"id": 76070, "token": "—including", "type": "中文标点"}
839
+ {"id": 76099, "token": "关闭", "type": "中文多字"}
840
+ {"id": 76148, "token": ",\n\n", "type": "中文标点"}
841
+ {"id": 76161, "token": "钟", "type": "中文单字"}
842
+ {"id": 76208, "token": "五", "type": "中文单字"}
843
+ {"id": 76217, "token": "执行", "type": "中文多字"}
844
+ {"id": 76323, "token": "审", "type": "中文单字"}
845
+ {"id": 76417, "token": "单位", "type": "中文多字"}
846
+ {"id": 76455, "token": "手机号", "type": "中文多字"}
847
+ {"id": 76502, "token": "日", "type": "中文单字"}
848
+ {"id": 76505, "token": "木", "type": "中文单字"}
849
+ {"id": 76537, "token": "打", "type": "中文单字"}
850
+ {"id": 76706, "token": "活", "type": "中文单字"}
851
+ {"id": 76718, "token": "微软雅黑", "type": "中文多字"}
852
+ {"id": 76750, "token": "播", "type": "中文单字"}
853
+ {"id": 76843, "token": "!!\n\n", "type": "中文标点"}
854
+ {"id": 76858, "token": "!”", "type": "中文标点"}
855
+ {"id": 76864, "token": "!」", "type": "中文标点"}
856
+ {"id": 76868, "token": "方式", "type": "中文多字"}
857
+ {"id": 76929, "token": "—he", "type": "中文标点"}
858
+ {"id": 76982, "token": "该", "type": "中文单字"}
859
+ {"id": 77138, "token": "’am", "type": "中文标点"}
860
+ {"id": 77158, "token": "…)\n\n", "type": "中文标点"}
861
+ {"id": 77190, "token": "初始化", "type": "中文多字"}
862
+ {"id": 77195, "token": "条件", "type": "中文多字"}
863
+ {"id": 77219, "token": "記事", "type": "中文多字"}
864
+ {"id": 77284, "token": "“.", "type": "中文标点"}
865
+ {"id": 77413, "token": "展", "type": "中文单字"}
866
+ {"id": 77479, "token": ",…\n\n", "type": "中文标点"}
867
+ {"id": 77748, "token": "钮", "type": "中文单字"}
868
+ {"id": 77913, "token": "具", "type": "中文单字"}
869
+ {"id": 77937, "token": "路径", "type": "中文多字"}
870
+ {"id": 78021, "token": "退出", "type": "中文多字"}
871
+ {"id": 78111, "token": "宋体", "type": "中文多字"}
872
+ {"id": 78228, "token": "志", "type": "中文单字"}
873
+ {"id": 78244, "token": "言", "type": "中文单字"}
874
+ {"id": 78272, "token": "购", "type": "中文单字"}
875
+ {"id": 78366, "token": "……………………", "type": "中文标点"}
876
+ {"id": 78388, "token": "但", "type": "中文单字"}
877
+ {"id": 78519, "token": "星", "type": "中文单字"}
878
+ {"id": 78640, "token": "两", "type": "中文单字"}
879
+ {"id": 78657, "token": "例如", "type": "中文多字"}
880
+ {"id": 78659, "token": "左", "type": "中文单字"}
881
+ {"id": 78698, "token": "考", "type": "中文单字"}
882
+ {"id": 78935, "token": "构", "type": "中文单字"}
883
+ {"id": 78943, "token": "報", "type": "中文单字"}
884
+ {"id": 79059, "token": "球", "type": "中文单字"}
885
+ {"id": 79108, "token": "设计器", "type": "中文多字"}
886
+ {"id": 79203, "token": "更新", "type": "中文多字"}
887
+ {"id": 79656, "token": "相关", "type": "中文多字"}
888
+ {"id": 79785, "token": "音", "type": "中文单字"}
889
+ {"id": 79908, "token": "动生成", "type": "中文多字"}
890
+ {"id": 79982, "token": "端", "type": "中文单字"}
891
+ {"id": 80000, "token": "。”\n\n", "type": "中文标点"}
892
+ {"id": 80003, "token": ",默认", "type": "中文多字"}
893
+ {"id": 80019, "token": "新", "type": "中文单字"}
894
+ {"id": 80073, "token": "搜索", "type": "中文多字"}
895
+ {"id": 80078, "token": "—even", "type": "中文标点"}
896
+ {"id": 80172, "token": "投", "type": "中文单字"}
897
+ {"id": 80195, "token": "立", "type": "中文单字"}
898
+ {"id": 80356, "token": "属性", "type": "中文多字"}
899
+ {"id": 80426, "token": "�断", "type": "中文多字"}
900
+ {"id": 80578, "token": "们", "type": "中文单字"}
901
+ {"id": 80615, "token": ".…\n\n", "type": "中文标点"}
902
+ {"id": 80699, "token": "火", "type": "中文单字"}
903
+ {"id": 80804, "token": "示", "type": "中文单字"}
904
+ {"id": 80866, "token": "清", "type": "中文单字"}
905
+ {"id": 81194, "token": "金额", "type": "中文多字"}
906
+ {"id": 81201, "token": "账", "type": "中文单字"}
907
+ {"id": 81258, "token": "就", "type": "中文单字"}
908
+ {"id": 81368, "token": "费", "type": "中文单字"}
909
+ {"id": 81506, "token": "请选择", "type": "中文多字"}
910
+ {"id": 81526, "token": "示例", "type": "中文多字"}
911
+ {"id": 81543, "token": "没有", "type": "中文多字"}
912
+ {"id": 81546, "token": ":\"+", "type": "中文标点"}
913
+ {"id": 81628, "token": "查询", "type": "中文多字"}
914
+ {"id": 81646, "token": "默认", "type": "中文多字"}
915
+ {"id": 81665, "token": "结束", "type": "中文多字"}
916
+ {"id": 81742, "token": "案", "type": "中文单字"}
917
+ {"id": 81902, "token": "—with", "type": "中文标点"}
918
+ {"id": 81951, "token": "控", "type": "中文单字"}
919
+ {"id": 81976, "token": "请求", "type": "中文多字"}
920
+ {"id": 82042, "token": "广", "type": "中文单字"}
921
+ {"id": 82175, "token": "’app", "type": "中文标点"}
922
+ {"id": 82267, "token": "确认", "type": "中文多字"}
923
+ {"id": 82302, "token": "历", "type": "中文单字"}
924
+ {"id": 82317, "token": "及", "type": "中文单字"}
925
+ {"id": 82363, "token": "如果", "type": "中文多字"}
926
+ {"id": 82364, "token": "?”", "type": "中文标点"}
927
+ {"id": 82420, "token": "計", "type": "中文单字"}
928
+ {"id": 82530, "token": "、、", "type": "中文标点"}
929
+ {"id": 82533, "token": "止", "type": "中文单字"}
930
+ {"id": 82554, "token": "方法", "type": "中文多字"}
931
+ {"id": 82696, "token": "么", "type": "中文单字"}
932
+ {"id": 82768, "token": "货", "type": "中文单字"}
933
+ {"id": 82805, "token": "测试", "type": "中文多字"}
934
+ {"id": 82900, "token": "数量", "type": "中文多字"}
935
+ {"id": 82912, "token": "位置", "type": "中文多字"}
936
+ {"id": 82973, "token": "時間", "type": "中文多字"}
937
+ {"id": 83042, "token": "�权", "type": "中文多字"}
938
+ {"id": 83047, "token": "开", "type": "中文单字"}
939
+ {"id": 83125, "token": "文章", "type": "中文多字"}
940
+ {"id": 83175, "token": "阳", "type": "中文单字"}
941
+ {"id": 83266, "token": "队", "type": "中文单字"}
942
+ {"id": 83301, "token": "技", "type": "中文单字"}
943
+ {"id": 83324, "token": "场", "type": "中文单字"}
944
+ {"id": 83337, "token": "链接", "type": "中文多字"}
945
+ {"id": 83354, "token": ">", "type": "中文标点"}
946
+ {"id": 83439, "token": "添加", "type": "中文多字"}
947
+ {"id": 83639, "token": "最", "type": "中文单字"}
948
+ {"id": 83687, "token": "数字", "type": "中文多字"}
949
+ {"id": 83741, "token": "声明", "type": "中文多字"}
950
+ {"id": 83747, "token": "少", "type": "中文单字"}
951
+ {"id": 83766, "token": "…but", "type": "中文标点"}
952
+ {"id": 83799, "token": "形", "type": "中文单字"}
953
+ {"id": 83800, "token": "产品", "type": "中文多字"}
954
+ {"id": 83872, "token": "—are", "type": "中文标点"}
955
+ {"id": 83932, "token": "稿", "type": "中文单字"}
956
+ {"id": 83947, "token": "英", "type": "中文单字"}
957
+ {"id": 83994, "token": "游", "type": "中文单字"}
958
+ {"id": 84095, "token": "亿元", "type": "中文多字"}
959
+ {"id": 84131, "token": "分钟", "type": "中文多字"}
960
+ {"id": 84341, "token": ".…", "type": "中文标点"}
961
+ {"id": 84410, "token": "商", "type": "中文单字"}
962
+ {"id": 84498, "token": "“She", "type": "中文标点"}
963
+ {"id": 84765, "token": "!\",", "type": "中文标点"}
964
+ {"id": 84844, "token": "供", "type": "中文单字"}
965
+ {"id": 84851, "token": "推", "type": "中文单字"}
966
+ {"id": 84875, "token": "!\n\n\n\n", "type": "中文标点"}
967
+ {"id": 84941, "token": "—who", "type": "中文标点"}
968
+ {"id": 85155, "token": "初始化", "type": "中文多字"}
969
+ {"id": 85188, "token": "税", "type": "中文单字"}
970
+ {"id": 85284, "token": "按钮", "type": "中文多字"}
971
+ {"id": 85366, "token": "—an", "type": "中文标点"}
972
+ {"id": 85663, "token": "無し�", "type": "中文多字"}
973
+ {"id": 85707, "token": "初", "type": "中文单字"}
974
+ {"id": 85997, "token": "当", "type": "中文单字"}
975
+ {"id": 85998, "token": "!');\n", "type": "中文标点"}
976
+ {"id": 86127, "token": "私", "type": "中文单字"}
977
+ {"id": 86206, "token": "需要", "type": "中文多字"}
978
+ {"id": 86222, "token": "解", "type": "中文单字"}
979
+ {"id": 86319, "token": "—we", "type": "中文标点"}
980
+ {"id": 86348, "token": "全部", "type": "中文多字"}
981
+ {"id": 86354, "token": "景", "type": "中文单字"}
982
+ {"id": 86429, "token": "资源", "type": "中文多字"}
983
+ {"id": 86436, "token": "去", "type": "中文单字"}
984
+ {"id": 86461, "token": "华", "type": "中文单字"}
985
+ {"id": 86508, "token": "“Yes", "type": "中文标点"}
986
+ {"id": 86601, "token": "’T", "type": "中文标点"}
987
+ {"id": 86741, "token": "评论", "type": "中文多字"}
988
+ {"id": 86758, "token": "使用", "type": "中文多字"}
989
+ {"id": 86846, "token": "’B", "type": "中文标点"}
990
+ {"id": 86867, "token": "配置", "type": "中文多字"}
991
+ {"id": 87023, "token": "–and", "type": "中文标点"}
992
+ {"id": 87109, "token": "不", "type": "中文单字"}
993
+ {"id": 87177, "token": "話", "type": "中文单字"}
994
+ {"id": 87217, "token": "番", "type": "中文单字"}
995
+ {"id": 87219, "token": "问题", "type": "中文多字"}
996
+ {"id": 87247, "token": "—all", "type": "中文标点"}
997
+ {"id": 87327, "token": "报道", "type": "中文多字"}
998
+ {"id": 87412, "token": "环", "type": "中文单字"}
999
+ {"id": 87441, "token": "张", "type": "中文单字"}
1000
+ {"id": 87447, "token": "開", "type": "中文单字"}
1001
+ {"id": 87474, "token": "無しさん", "type": "中文多字"}
1002
+ {"id": 87502, "token": "种", "type": "中文单字"}
1003
+ {"id": 87646, "token": "成", "type": "中文单字"}
1004
+ {"id": 87671, "token": "—one", "type": "中文标点"}
1005
+ {"id": 87844, "token": "易", "type": "中文单字"}
1006
+ {"id": 87990, "token": "“Oh", "type": "中文标点"}
1007
+ {"id": 88108, "token": "……\n\n", "type": "中文标点"}
1008
+ {"id": 88126, "token": "您", "type": "中文单字"}
1009
+ {"id": 88161, "token": "’an", "type": "中文标点"}
1010
+ {"id": 88240, "token": "视频", "type": "中文多字"}
1011
+ {"id": 88343, "token": "》,", "type": "中文标点"}
1012
+ {"id": 88348, "token": ".’”\n\n", "type": "中文标点"}
1013
+ {"id": 88356, "token": "再", "type": "中文单字"}
1014
+ {"id": 88367, "token": "可能", "type": "中文多字"}
1015
+ {"id": 88435, "token": "文字", "type": "中文多字"}
1016
+ {"id": 88631, "token": "板", "type": "中文单字"}
1017
+ {"id": 88851, "token": "’acc", "type": "中文标点"}
1018
+ {"id": 88852, "token": "以下", "type": "中文多字"}
1019
+ {"id": 88905, "token": "电话", "type": "中文多字"}
1020
+ {"id": 88925, "token": "“Well", "type": "中文标点"}
1021
+ {"id": 88958, "token": "—from", "type": "中文标点"}
1022
+ {"id": 89046, "token": "連", "type": "中文单字"}
1023
+ {"id": 89151, "token": "真", "type": "中文单字"}
1024
+ {"id": 89186, "token": "有效", "type": "中文多字"}
1025
+ {"id": 89213, "token": "’:", "type": "中文标点"}
1026
+ {"id": 89408, "token": "今年", "type": "中文多字"}
1027
+ {"id": 89575, "token": "€“", "type": "中文标点"}
1028
+ {"id": 89753, "token": "流", "type": "中文单字"}
1029
+ {"id": 89783, "token": "余", "type": "中文单字"}
1030
+ {"id": 89874, "token": "”\n", "type": "中文标点"}
1031
+ {"id": 89902, "token": "任务", "type": "中文多字"}
1032
+ {"id": 90070, "token": "见", "type": "中文单字"}
1033
+ {"id": 90091, "token": "正确", "type": "中文多字"}
1034
+ {"id": 90112, "token": "给", "type": "中文单字"}
1035
+ {"id": 90147, "token": "服务器", "type": "中文多字"}
1036
+ {"id": 90223, "token": "’es", "type": "中文标点"}
1037
+ {"id": 90261, "token": "来源", "type": "中文多字"}
1038
+ {"id": 90354, "token": "结", "type": "中文单字"}
1039
+ {"id": 90493, "token": "。<", "type": "中文标点"}
1040
+ {"id": 90578, "token": "…\n", "type": "中文标点"}
1041
+ {"id": 90581, "token": "-", "type": "中文标点"}
1042
+ {"id": 90756, "token": "详情", "type": "中文多字"}
1043
+ {"id": 90863, "token": "—if", "type": "中文标点"}
1044
+ {"id": 91006, "token": "?」", "type": "中文标点"}
1045
+ {"id": 91077, "token": "局", "type": "中文单字"}
1046
+ {"id": 91082, "token": "主", "type": "中文单字"}
1047
+ {"id": 91240, "token": "’à", "type": "中文标点"}
1048
+ {"id": 91272, "token": "优", "type": "中文单字"}
1049
+ {"id": 91386, "token": "书", "type": "中文单字"}
1050
+ {"id": 91417, "token": "’y", "type": "中文标点"}
1051
+ {"id": 91418, "token": "’util", "type": "中文标点"}
1052
+ {"id": 91443, "token": "’hui", "type": "中文标点"}
1053
+ {"id": 91466, "token": "一页", "type": "中文多字"}
1054
+ {"id": 91495, "token": ",并", "type": "中文多字"}
1055
+ {"id": 91547, "token": "发布", "type": "中文多字"}
1056
+ {"id": 91763, "token": "思", "type": "中文单字"}
1057
+ {"id": 91774, "token": "見", "type": "中文单字"}
1058
+ {"id": 91837, "token": ":<", "type": "中文标点"}
1059
+ {"id": 91875, "token": "動", "type": "中文单字"}
1060
+ {"id": 91940, "token": "运", "type": "中文单字"}
1061
+ {"id": 91951, "token": "审核", "type": "中文多字"}
1062
+ {"id": 91967, "token": "图", "type": "中文单字"}
1063
+ {"id": 91985, "token": "样", "type": "中文单字"}
1064
+ {"id": 92019, "token": "其中", "type": "中文多字"}
1065
+ {"id": 92056, "token": "权限", "type": "中文多字"}
1066
+ {"id": 92099, "token": "删除成功", "type": "中文多字"}
1067
+ {"id": 92113, "token": " “…", "type": "中文标点"}
1068
+ {"id": 92150, "token": "�新", "type": "中文多字"}
1069
+ {"id": 92193, "token": "(笑", "type": "中文多字"}
1070
+ {"id": 92211, "token": ",《", "type": "中文标点"}
1071
+ {"id": 92264, "token": ",’”", "type": "中文标点"}
1072
+ {"id": 92318, "token": "时间", "type": "中文多字"}
1073
+ {"id": 92366, "token": "】,", "type": "中文标点"}
1074
+ {"id": 92378, "token": ")\r\n", "type": "中文标点"}
1075
+ {"id": 92382, "token": "定义", "type": "中文多字"}
1076
+ {"id": 92517, "token": "关", "type": "中文单字"}
1077
+ {"id": 92527, "token": "登", "type": "中文单字"}
1078
+ {"id": 92553, "token": "销", "type": "中文单字"}
1079
+ {"id": 92555, "token": "万元", "type": "中文多字"}
1080
+ {"id": 92672, "token": "同时", "type": "中文多字"}
1081
+ {"id": 92693, "token": "無料", "type": "中文多字"}
1082
+ {"id": 92748, "token": "’all", "type": "中文标点"}
1083
+ {"id": 92776, "token": "即", "type": "中文单字"}
1084
+ {"id": 92780, "token": "只", "type": "中文单字"}
1085
+ {"id": 92877, "token": "老", "type": "中文单字"}
1086
+ {"id": 93056, "token": "、“", "type": "中文标点"}
1087
+ {"id": 93115, "token": "岁", "type": "中文单字"}
1088
+ {"id": 93126, "token": "’Brien", "type": "中文标点"}
1089
+ {"id": 93132, "token": "大小", "type": "中文多字"}
1090
+ {"id": 93233, "token": "找", "type": "中文单字"}
1091
+ {"id": 93269, "token": "“These", "type": "中文标点"}
1092
+ {"id": 93393, "token": "实", "type": "中文单字"}
1093
+ {"id": 93413, "token": "或", "type": "中文单字"}
1094
+ {"id": 93446, "token": "“\n\n", "type": "中文标点"}
1095
+ {"id": 93474, "token": "节点", "type": "中文多字"}
1096
+ {"id": 93598, "token": "若", "type": "中文单字"}
1097
+ {"id": 93636, "token": "小时", "type": "中文多字"}
1098
+ {"id": 93673, "token": "“To", "type": "中文标点"}
1099
+ {"id": 93830, "token": "—\"", "type": "中文标点"}
1100
+ {"id": 93922, "token": "’autres", "type": "中文标点"}
1101
+ {"id": 93994, "token": "其他", "type": "中文多字"}
1102
+ {"id": 94134, "token": "自治", "type": "中文多字"}
1103
+ {"id": 94249, "token": "分享", "type": "中文多字"}
1104
+ {"id": 94345, "token": "’ex", "type": "中文标点"}
1105
+ {"id": 94366, "token": "稍", "type": "中文单字"}
1106
+ {"id": 94518, "token": "…the", "type": "中文标点"}
1107
+ {"id": 94537, "token": "�件", "type": "中文多字"}
1108
+ {"id": 94588, "token": "达", "type": "中文单字"}
1109
+ {"id": 94668, "token": "邮箱", "type": "中文多字"}
1110
+ {"id": 94720, "token": "新增", "type": "中文多字"}
1111
+ {"id": 94785, "token": "提", "type": "中文单字"}
1112
+ {"id": 94895, "token": ":%", "type": "中文标点"}
1113
+ {"id": 94923, "token": "院", "type": "中文单字"}
1114
+ {"id": 94983, "token": "加", "type": "中文单字"}
1115
+ {"id": 95001, "token": "価", "type": "中文单字"}
1116
+ {"id": 95221, "token": "気", "type": "中文单字"}
1117
+ {"id": 95337, "token": "约", "type": "中文单字"}
1118
+ {"id": 95399, "token": "速", "type": "中文单字"}
1119
+ {"id": 95475, "token": "停", "type": "中文单字"}
1120
+ {"id": 95532, "token": "?\n", "type": "中文标点"}
1121
+ {"id": 95543, "token": "反", "type": "中文单字"}
1122
+ {"id": 95544, "token": "票", "type": "中文单字"}
1123
+ {"id": 95598, "token": "十", "type": "中文单字"}
1124
+ {"id": 96153, "token": ",则", "type": "中文多字"}
1125
+ {"id": 96197, "token": ",—", "type": "中文标点"}
1126
+ {"id": 96203, "token": "“At", "type": "中文标点"}
1127
+ {"id": 96206, "token": "’)", "type": "中文标点"}
1128
+ {"id": 96332, "token": "[…]", "type": "中文标点"}
1129
+ {"id": 96356, "token": "身", "type": "中文单字"}
1130
+ {"id": 96407, "token": "商品", "type": "中文多字"}
1131
+ {"id": 96412, "token": "含", "type": "中文单字"}
1132
+ {"id": 96455, "token": "率", "type": "中文单字"}
1133
+ {"id": 96500, "token": "汽", "type": "中文单字"}
1134
+ {"id": 96511, "token": "专", "type": "中文单字"}
1135
+ {"id": 96555, "token": "/", "type": "中文标点"}
1136
+ {"id": 96557, "token": "管理员", "type": "中文多字"}
1137
+ {"id": 97049, "token": "歳", "type": "中文单字"}
1138
+ {"id": 97150, "token": ",在", "type": "中文多字"}
1139
+ {"id": 97360, "token": ".–", "type": "中文标点"}
1140
+ {"id": 97432, "token": "”。\n\n", "type": "中文标点"}
1141
+ {"id": 97518, "token": "関", "type": "中文单字"}
1142
+ {"id": 97522, "token": "议", "type": "中文单字"}
1143
+ {"id": 97565, "token": "雷", "type": "中文单字"}
1144
+ {"id": 97655, "token": "正在", "type": "中文多字"}
1145
+ {"id": 97908, "token": "�能", "type": "中文多字"}
1146
+ {"id": 97999, "token": "。(", "type": "中文标点"}
1147
+ {"id": 98128, "token": "自动生成", "type": "中文多字"}
1148
+ {"id": 98134, "token": "’elle", "type": "中文标点"}
1149
+ {"id": 98184, "token": "些", "type": "中文单字"}
1150
+ {"id": 98220, "token": "界", "type": "中文单字"}
1151
+ {"id": 98245, "token": "陆", "type": "中文单字"}
1152
+ {"id": 98261, "token": "注意", "type": "中文多字"}
1153
+ {"id": 98390, "token": "备注", "type": "中文多字"}
1154
+ {"id": 98406, "token": "倍", "type": "中文单字"}
1155
+ {"id": 98458, "token": ",’’", "type": "中文标点"}
1156
+ {"id": 98476, "token": "“How", "type": "中文标点"}
1157
+ {"id": 98499, "token": "読", "type": "中文单字"}
1158
+ {"id": 98580, "token": "价格", "type": "中文多字"}
1159
+ {"id": 98657, "token": "检", "type": "中文单字"}
1160
+ {"id": 98711, "token": "我的", "type": "中文多字"}
1161
+ {"id": 98739, "token": "我们", "type": "中文多字"}
1162
+ {"id": 98806, "token": "还", "type": "中文单字"}
1163
+ {"id": 98871, "token": "析", "type": "中文单字"}
1164
+ {"id": 98897, "token": "企", "type": "中文单字"}
1165
+ {"id": 98915, "token": "友", "type": "中文单字"}
1166
+ {"id": 99007, "token": "”的", "type": "中文多字"}
1167
+ {"id": 99072, "token": "。www", "type": "中文标点"}
1168
+ {"id": 99083, "token": "“All", "type": "中文标点"}
1169
+ {"id": 99313, "token": ",…", "type": "中文标点"}
1170
+ {"id": 99337, "token": "简", "type": "中文单字"}
1171
+ {"id": 99379, "token": "移到", "type": "中文多字"}
1172
+ {"id": 99382, "token": ")”", "type": "中文标点"}
1173
+ {"id": 99397, "token": "問", "type": "中文单字"}
1174
+ {"id": 99480, "token": "功能", "type": "中文多字"}
1175
+ {"id": 99496, "token": "若要", "type": "中文多字"}
1176
+ {"id": 99502, "token": "长度", "type": "中文多字"}
1177
+ {"id": 99563, "token": "—at", "type": "中文标点"}
1178
+ {"id": 99643, "token": "】,【", "type": "中文标点"}
1179
+ {"id": 99741, "token": "装", "type": "中文单字"}
1180
+ {"id": 99750, "token": "感", "type": "中文单字"}
1181
+ {"id": 99771, "token": "哈", "type": "中文单字"}
1182
+ {"id": 99799, "token": "“One", "type": "中文标点"}
1183
+ {"id": 99849, "token": "何", "type": "中文单字"}
1184
+ {"id": 99941, "token": "预", "type": "中文单字"}
1185
+ {"id": 100065, "token": "~\n\n", "type": "中文标点"}
1186
+ {"id": 100066, "token": "送料", "type": "中文多字"}
1187
+ {"id": 100067, "token": "…it", "type": "中文标点"}
1188
+ {"id": 100179, "token": "尔", "type": "中文单字"}
1189
+ {"id": 100207, "token": "在线", "type": "中文多字"}
utils/log_util.py CHANGED
@@ -2,7 +2,7 @@
2
  import logging
3
 
4
  logging.basicConfig(
5
- format='%(asctime)s - %(filename)s - %(levelname)s - %(process)d - %(thread)d - %(message)s',
6
  level=logging.INFO,
7
  datefmt="%Y-%m-%d %H:%M:%S",
8
 
 
2
  import logging
3
 
4
  logging.basicConfig(
5
+ format='[%(asctime)s] [%(levelname)s] [%(process)d:%(thread)d] [%(filename)s:%(lineno)d:%(funcName)s] %(message)s',
6
  level=logging.INFO,
7
  datefmt="%Y-%m-%d %H:%M:%S",
8
 
utils/zh_util.py CHANGED
@@ -52,7 +52,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
52
  if has_chinese(decode_str):
53
  # bert词典有 ##开头的
54
  # byteBPE词典有带空格的
55
- decode_str = decode_str.strip().replace("#", "")
56
  zh_token_count["total"] += 1
57
  if len(decode_str) > 1:
58
  zh_token_count["中文多字"] += 1
@@ -93,4 +93,6 @@ if __name__ == "__main__":
93
  # test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
94
  # test_coding_length(zh_punc)
95
  # test_coding_length(zh_iterator())
96
- iter_vocab()
 
 
 
52
  if has_chinese(decode_str):
53
  # bert词典有 ##开头的
54
  # byteBPE词典有带空格的
55
+ decode_str = decode_str.strip().replace("#", "") # TODO, 按类型
56
  zh_token_count["total"] += 1
57
  if len(decode_str) > 1:
58
  zh_token_count["中文多字"] += 1
 
93
  # test_coding_length(jd_vocab_tokens, filter=lambda k: not is_chinese(k))
94
  # test_coding_length(zh_punc)
95
  # test_coding_length(zh_iterator())
96
+
97
+ from vocab.gpt_35_turbo import tokenizer
98
+ iter_vocab(tokenizer)
vocab/README.md CHANGED
@@ -86,4 +86,6 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
86
 
87
 
88
 
89
- ##
 
 
 
86
 
87
 
88
 
89
+ ## reversible and lossless
90
+
91
+ It's reversible and lossless, so you can convert tokens back into the original text.
vocab/__init__.py CHANGED
@@ -24,8 +24,12 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
24
  - tiktoken
25
  - icetk
26
  - hf_tokenizer
27
- - 特征:.model 是 tokenizer.models.BPE 类型,词典有 Ġ "\u0120" 开头,有1个tokenizer.json(包括 merge vocab),或者分开独立文件
28
- - 示例:gpt_neox_20b, moss
 
 
 
 
29
  - tiktoken
30
  - 特征:空格就是空格,
31
  - 示例:gpt3.5 gpt4
@@ -57,8 +61,8 @@ all_tokenizers = [
57
  "moss",
58
  #
59
  # ######
60
- # "chatyuan_large_v2",
61
- # "prompt_clue",
62
  #
63
  # #### bloom 系列
64
  "bloom",
@@ -69,7 +73,7 @@ all_tokenizers = [
69
  # "gpt_neox_chinese_v1",
70
  #
71
  # ##### glm系列
72
- # "glm_chinese",
73
  "chatglm_6b",
74
  "chatglm2-6b",
75
  #
@@ -80,13 +84,14 @@ all_tokenizers = [
80
  # "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
81
  # "belle_llama_ext_7b",
82
  # "alpaca_7b",
83
- "baichuan_7b",
 
84
  "qwen",
85
  "internlm_chat_7b",
86
- "goat",
 
87
  ]
88
 
89
-
90
  class TokenizerType(Enum):
91
  """
92
  - https://huggingface.co/docs/transformers/tokenizer_summary
 
24
  - tiktoken
25
  - icetk
26
  - hf_tokenizer
27
+ - 特征:
28
+ - .model 是 tokenizer.models.BPE 类型
29
+ - 词典有 Ġ "\u0120" 开头
30
+ - 有1个tokenizer.json(包括 merge vocab),或者分开独立文件
31
+ - .model.from_file .model.save .model.token_to_id .model.tokenize
32
+ - 示例:gpt_neox_20b, moss, bloom
33
  - tiktoken
34
  - 特征:空格就是空格,
35
  - 示例:gpt3.5 gpt4
 
61
  "moss",
62
  #
63
  # ######
64
+ "chatyuan_large_v2",
65
+ "prompt_clue",
66
  #
67
  # #### bloom 系列
68
  "bloom",
 
73
  # "gpt_neox_chinese_v1",
74
  #
75
  # ##### glm系列
76
+ "glm_chinese",
77
  "chatglm_6b",
78
  "chatglm2-6b",
79
  #
 
84
  # "chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
85
  # "belle_llama_ext_7b",
86
  # "alpaca_7b",
87
+ "baichuan",
88
+ "baichuan2",
89
  "qwen",
90
  "internlm_chat_7b",
91
+ "falcon_180b",
92
+ # "goat",
93
  ]
94
 
 
95
  class TokenizerType(Enum):
96
  """
97
  - https://huggingface.co/docs/transformers/tokenizer_summary
vocab/{baichuan_7b → baichuan}/Baichuan-7B/config.json RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/configuration_baichuan.py RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/special_tokens_map.json RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenization_baichuan.py RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenizer.model RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/Baichuan-7B/tokenizer_config.json RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/__init__.py RENAMED
File without changes
vocab/{baichuan_7b → baichuan}/demo.py RENAMED
File without changes
vocab/baichuan2/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+ from vocab import TokenizerType
3
+
4
+ tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", trust_remote_code=True)
5
+
6
+
7
+ # byte-bpe sentencepiece
8
+ tokenizer.type = TokenizerType.ByteBPE
9
+
10
+ tokenizer.comments = "expand the vocqbulary size from 64000 in Baichuan1 to 125696"
vocab/bloom/test_tokenizer.py CHANGED
@@ -12,6 +12,8 @@ print("vocab size:", tokenizer.vocab_size)
12
  tokens = tokenizer.encode("中")
13
  decode_line = tokenizer.decode(tokens)
14
 
 
 
15
 
16
  def id2token(ids):
17
  return tokenizer.convert_ids_to_tokens(ids)
 
12
  tokens = tokenizer.encode("中")
13
  decode_line = tokenizer.decode(tokens)
14
 
15
+ tokenizer.save_vocabulary("tmp", "ddd")
16
+
17
 
18
  def id2token(ids):
19
  return tokenizer.convert_ids_to_tokens(ids)
vocab/chinese_llama2/__init__.py CHANGED
@@ -1,3 +1,10 @@
 
 
 
 
 
 
 
1
  from transformers import LlamaTokenizer
2
 
3
  tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-2-7b")
 
1
+ """
2
+ ## 词典扩容
3
+ 32000 <pad>
4
+ 32001 但
5
+
6
+ """
7
+
8
  from transformers import LlamaTokenizer
9
 
10
  tokenizer = LlamaTokenizer.from_pretrained("ziqingyang/chinese-llama-2-7b")
vocab/falcon_180b/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from transformers import AutoTokenizer
3
+
4
+
5
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
6
+ TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
7
+
8
+
9
+
10
+ # tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-180b") # token
11
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
vocab/falcon_180b/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ ">>TITLE<<",
4
+ ">>ABSTRACT<<",
5
+ ">>INTRODUCTION<<",
6
+ ">>SUMMARY<<",
7
+ ">>COMMENT<<",
8
+ ">>ANSWER<<",
9
+ ">>QUESTION<<",
10
+ ">>DOMAIN<<",
11
+ ">>PREFIX<<",
12
+ ">>SUFFIX<<",
13
+ ">>MIDDLE<<"
14
+ ],
15
+ "eos_token": "<|endoftext|>"
16
+ }
vocab/falcon_180b/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab/falcon_180b/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "eos_token": "<|endoftext|>",
4
+ "model_max_length": 2048,
5
+ "name_or_path": "tiiuae/falcon-40b",
6
+ "special_tokens_map_file": null,
7
+ "tokenizer_class": "PreTrainedTokenizerFast"
8
+ }
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -7,6 +7,8 @@ from utils.log_util import logger
7
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
8
  tokenizer.vocab_size = tokenizer.n_vocab
9
 
 
 
10
 
11
 
12
  def decode(self, tokens, errors="replace"):
@@ -20,8 +22,11 @@ def decode(self, tokens, errors="replace"):
20
  def convert_ids_to_tokens(self, tokens):
21
  return tokenizer.decode_tokens_bytes(tokens)
22
 
23
- def get_vocab(self):
24
- """Returns vocab as a dict"""
 
 
 
25
  vocab = {}
26
  key_error_list = []
27
  unicode_decode_error_list = []
@@ -29,11 +34,13 @@ def get_vocab(self):
29
  try:
30
  token_byte = self.convert_ids_to_tokens([i])[0]
31
  token_str = token_byte.decode("utf-8")
32
- vocab[token_str] = i
33
- except KeyError: # 100256 100261-100275
34
  key_error_list.append(i)
35
- except UnicodeDecodeError: # 特别多
 
36
  unicode_decode_error_list.append((i, str(token_byte)))
 
37
 
38
  # vocab.update(self.added_tokens_encoder)
39
  logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
@@ -41,6 +48,8 @@ def get_vocab(self):
41
  return vocab
42
 
43
 
 
 
44
  Encoding.decode = decode
45
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
46
  Encoding.get_vocab = get_vocab
 
7
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
8
  tokenizer.vocab_size = tokenizer.n_vocab
9
 
10
+ tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
11
+ tokenizer.reversible = True # It's reversible and lossless, so you can convert tokens back into the original text
12
 
13
 
14
  def decode(self, tokens, errors="replace"):
 
22
  def convert_ids_to_tokens(self, tokens):
23
  return tokenizer.decode_tokens_bytes(tokens)
24
 
25
+ def get_vocab(self, token_type="str"):
26
+ """Returns vocab as a dict
27
+ :param token_type: ["str", "byte"]
28
+ :return:
29
+ """
30
  vocab = {}
31
  key_error_list = []
32
  unicode_decode_error_list = []
 
34
  try:
35
  token_byte = self.convert_ids_to_tokens([i])[0]
36
  token_str = token_byte.decode("utf-8")
37
+ vocab[token_byte] = i
38
+ except KeyError: # 16 KeyError, 100256 100261-100275
39
  key_error_list.append(i)
40
+ # vocab[f"[KeyError]-{i}"] = i
41
+ except UnicodeDecodeError: # 773 UnicodeDecodeError
42
  unicode_decode_error_list.append((i, str(token_byte)))
43
+ vocab[token_byte] = i
44
 
45
  # vocab.update(self.added_tokens_encoder)
46
  logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
 
48
  return vocab
49
 
50
 
51
+
52
+ # tiktoken patch
53
  Encoding.decode = decode
54
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
55
  Encoding.get_vocab = get_vocab
vocab/gpt_35_turbo/aaa.py CHANGED
@@ -17,6 +17,11 @@ import tiktoken
17
 
18
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
19
 
 
 
 
 
 
20
 
21
  for token_id in [100263, 99834]: # special_tokens: 100257-100260 100276
22
  try:
 
17
 
18
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
19
 
20
+ tokens = [100263, 99834]
21
+
22
+ tokenizer.decode(tokens)
23
+
24
+ tokenizer._core_bpe.decode_bytes(tokens).decode("utf-8", errors="replace")
25
 
26
  for token_id in [100263, 99834]: # special_tokens: 100257-100260 100276
27
  try:
vocab/gpt_4/__init__.py CHANGED
@@ -1,48 +1,3 @@
1
 
2
-
3
- import tiktoken
4
- from tiktoken import Encoding
5
- from utils.log_util import logger
6
-
7
- tokenizer = tiktoken.encoding_for_model('gpt-4')
8
- tokenizer.vocab_size = tokenizer.n_vocab
9
-
10
-
11
-
12
- def decode(self, tokens, errors="replace"):
13
- # def decode(self, tokens: list[int], errors: str = "replace") -> str:
14
- try:
15
- decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
16
- except:
17
- decode_str = "null"
18
- return decode_str
19
-
20
- def convert_ids_to_tokens(self, tokens):
21
- return tokenizer.decode_tokens_bytes(tokens)
22
-
23
- def get_vocab(self):
24
- """Returns vocab as a dict"""
25
- vocab = {}
26
- key_error_list = []
27
- unicode_decode_error_list = []
28
- for i in range(self.vocab_size):
29
- try:
30
- token_byte = self.convert_ids_to_tokens([i])[0]
31
- token_str = token_byte.decode("utf-8")
32
- vocab[token_str] = i
33
- except KeyError: # 100256 100261-100275
34
- key_error_list.append(i)
35
- except UnicodeDecodeError: # 特别多
36
- unicode_decode_error_list.append((i, str(token_byte)))
37
-
38
- # vocab.update(self.added_tokens_encoder)
39
- logger.info(f"gpt-4 {len(key_error_list)} KeyError: {key_error_list}")
40
- logger.info(f"gpt-4 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
41
- return vocab
42
-
43
-
44
- Encoding.decode = decode
45
- Encoding.convert_ids_to_tokens = convert_ids_to_tokens
46
- Encoding.get_vocab = get_vocab
47
-
48
 
 
1
 
2
+ from vocab.gpt_35_turbo import tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json CHANGED
@@ -255,6 +255,8 @@
255
  "end_of_word_suffix": null,
256
  "fuse_unk": false,
257
  "vocab": {
 
 
258
  "531": 531,
259
  "541": 541,
260
  "566": 566,
 
255
  "end_of_word_suffix": null,
256
  "fuse_unk": false,
257
  "vocab": {
258
+ "<|endoftext|>": 0,
259
+ "<|padding|>": 1,
260
  "531": 531,
261
  "541": 541,
262
  "566": 566,
vocab/gpt_neox_chinese_v1/mock.py CHANGED
@@ -1,17 +1,32 @@
1
  import copy
2
  import json
 
3
 
4
- input_path = "20B_tokenizer_chinese.json"
 
5
 
6
- tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
7
 
8
- vocab = tokenizer["model"]["vocab"]
 
9
 
 
 
 
 
10
 
11
- for k, v in copy.deepcopy(vocab).items():
12
- vocab[str(v)] = v
13
- vocab.pop(k)
14
 
15
- out_path = input_path.replace(".json", ".mock.json")
16
- with open(out_path, "w", encoding="utf-8") as f_out:
17
- f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
 
 
 
 
 
 
 
 
 
1
  import copy
2
  import json
3
+ from tokenizers import Tokenizer
4
 
5
+ def export_mock_tokenizer():
6
+ input_path = "20B_tokenizer_chinese.json"
7
 
8
+ tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
9
 
10
+ vocab = tokenizer["model"]["vocab"]
11
+ added_tokens = [token["id"] for token in tokenizer["added_tokens"]]
12
 
13
+ for k, v in copy.deepcopy(vocab).items():
14
+ if v not in added_tokens:
15
+ vocab[str(v)] = v
16
+ vocab.pop(k)
17
 
18
+ out_path = input_path.replace(".json", ".mock.json")
19
+ with open(out_path, "w", encoding="utf-8") as f_out:
20
+ f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
21
 
22
+
23
+ def mock2():
24
+ pass
25
+
26
+
27
+ def load_mock_tokenizer():
28
+ tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
29
+ print('')
30
+
31
+ export_mock_tokenizer()
32
+ load_mock_tokenizer()
vocab/gpt_neox_chinese_v1/trouble-shooting.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ## Exception: data did not match any variant of untagged enum ModelWrapper at line 108219 column 3
4
+
5
+
6
+
7
+
8
+ ## The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
9
+
10
+
11
+ ```
12
+ The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
13
+ The OrderedVocab you are attempting to save contains a hole for index 50255, your vocabulary could be corrupted !
14
+ The OrderedVocab you are attempting to save contains a hole for index 50256, your vocabulary could be corrupted !
15
+ ```
16
+
17
+
18
+ 原因:50254、50255、50256 等 token 并未在 `vocab` 中定义,只在 `added_tokens` 里定义了。
19
+
20
+ ## ss
21
+
22
+
vocab/llama/__init__.py CHANGED
@@ -1,7 +1,20 @@
1
 
2
  """
3
 
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  import os
 
1
 
2
  """
3
 
4
+ ## 指令 special token
5
 
6
+ {"token_id": 29961, "decode_str": "[", "token": "["}
7
+ {"token_id": 25580, "decode_str": "INST", "token": "INST"}
8
+ {"token_id": 29962, "decode_str": "]", "token": "]"}
9
+
10
+ {"token_id": 3532, "decode_str": "<<", "token": "▁<<"}
11
+ {"token_id": 14816, "decode_str": "SY", "token": "SY"}
12
+ {"token_id": 29903, "decode_str": "S", "token": "S"}
13
+ {"token_id": 6778, "decode_str": ">>", "token": ">>"}
14
+
15
+ {"token_id": 13, "decode_str": "\n", "token": "<0x0A>"}
16
+
17
+ 疑问:为什么不将 <<SYS>> <</SYS>> [INST] [/INST] 做成1个id?
18
  """
19
 
20
  import os
vocab/llama/demo.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ import os
4
+ from transformers import LlamaTokenizer
5
+
6
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
7
+ TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
8
+
9
+
10
+
11
+ tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)
12
+
13
+
14
+ tokens = [ 1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,
15
+ 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889,
16
+ 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641,
17
+ 9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311,
18
+ 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916,
19
+ 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793,
20
+ 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443,
21
+ 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644,
22
+ 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338,
23
+ 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012,
24
+ 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915,
25
+ 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016,
26
+ 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816,
27
+ 29903, 6778, 13, 13, 15970, 526, 366, 518, 29914, 25580,
28
+ 29962]
29
+
30
+ text = tokenizer.decode(tokens)
31
+ print(text)
32
+ for token_id in tokens:
33
+ print(json.dumps({"token_id": token_id, "decode_str": tokenizer.decode([token_id]), "token": tokenizer.convert_ids_to_tokens([token_id][0])}, ensure_ascii=False))