xu-song committed on
Commit
814ee6b
1 Parent(s): a6aee1d

add compress rate

Browse files
.gitattributes CHANGED
@@ -37,5 +37,5 @@ vocab/belle_7b_2m/belle-7b-2m/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  vocab/bloom/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  vocab/gemma_7b/gemma-7b/tokenizer.model filter=lfs diff=lfs merge=lfs -text
39
  vocab/gemma_7b/gemma-7b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
- vocab/
41
-
 
37
  vocab/bloom/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  vocab/gemma_7b/gemma-7b/tokenizer.model filter=lfs diff=lfs merge=lfs -text
39
  vocab/gemma_7b/gemma-7b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ vocab/grok_1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
41
+ vocab/llama3/Meta-Llama-3-70B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -32,4 +32,119 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
32
  -
33
 
34
 
35
- https://huggingface.co/spaces/yenniejun/tokenizers-languages
 
 
32
  -
33
 
34
 
35
+ https://huggingface.co/spaces/yenniejun/tokenizers-languages
36
+
37
+
38
+ ## gradio app
39
+
40
+ - https://arena.lmsys.org/
41
+
42
+
43
+ ## lang
44
+
45
+
46
+
47
+ ## number
48
+
49
+
50
+
51
+ ## diff
52
+
53
+
54
+
55
+
56
+
57
+
58
+ ## Compress Rate
59
+
60
+
61
+ **Overview**
62
+ We tokenize samples of the cc-100 corpus and report how strongly each tokenizer compresses the raw text.
63
+
64
+ | tokenizer | vocab_size | g_bytes/b_tokens | t_bytes/t_tokens | b_tokens/g_bytes |
65
+ |:----------------------------|-------------:|-------------------:|-------------------:|-------------------:|
66
+ | amber | 32000 | 1.84 | 1.8 | 0.54 |
67
+ | aya_101 | 250100 | 3.89 | 3.79 | 0.26 |
68
+ | baichuan | 64000 | 3.92 | 3.82 | 0.26 |
69
+ | baichuan2 | 125696 | 4.53 | 4.42 | 0.22 |
70
+ | bert_base_cased | 28996 | 2.73 | 2.66 | 0.37 |
71
+ | bert_base_chinese | 21128 | 2.74 | 2.67 | 0.37 |
72
+ | bert_base_uncased | 30522 | 2.73 | 2.67 | 0.37 |
73
+ | bloom | 250680 | 4.28 | 4.18 | 0.23 |
74
+ | byt5_small | 256 | 0.93 | 0.91 | 1.08 |
75
+ | character_glm_6b | 64794 | 4.2 | 4.1 | 0.24 |
76
+ | chatglm2_6b | 64794 | 4.2 | 4.1 | 0.24 |
77
+ | chatglm3_6b | 64798 | 4.2 | 4.1 | 0.24 |
78
+ | chatglm_6b | 150344 | 4.65 | 4.54 | 0.22 |
79
+ | chatyuan_large_v2 | 32128 | 4.34 | 4.24 | 0.23 |
80
+ | chinese_llama | 49953 | 3.93 | 3.84 | 0.25 |
81
+ | chinese_llama2 | 55296 | 3.92 | 3.83 | 0.26 |
82
+ | code_davinci_002 | 50281 | 1.31 | 1.28 | 0.77 |
83
+ | crystal_coder | 32000 | 1.86 | 1.81 | 0.54 |
84
+ | deepseek_coder_33b_instruct | 32000 | 3.4 | 3.32 | 0.29 |
85
+ | deepseek_llm_7b_base | 100000 | 4.05 | 3.96 | 0.25 |
86
+ | falcon_180b | 65024 | 2.18 | 2.13 | 0.46 |
87
+ | falcon_7b | 65024 | 2.18 | 2.13 | 0.46 |
88
+ | fastchat_t5_3b | 32000 | 13.7 | 13.38 | 0.07 |
89
+ | flan_t5_base | 32100 | 14.13 | 13.8 | 0.07 |
90
+ | gemma_7b | 256000 | 3.82 | 3.73 | 0.26 |
91
+ | gpt2 | 50257 | 1.31 | 1.28 | 0.77 |
92
+ | gpt2_chinese | 21128 | 2.73 | 2.66 | 0.37 |
93
+ | gpt_35_turbo | 100277 | 2.26 | 2.21 | 0.44 |
94
+ | gpt_4 | 100277 | 2.26 | 2.21 | 0.44 |
95
+ | gpt_nexo_20b | 50254 | 2.01 | 1.96 | 0.5 |
96
+ | internlm2_chat_7b | 92544 | 4.23 | 4.13 | 0.24 |
97
+ | internlm2_math_7b | 92544 | 4.23 | 4.13 | 0.24 |
98
+ | internlm_chat_7b | 103168 | 4.23 | 4.14 | 0.24 |
99
+ | internlm_xcomposer_7b | 103168 | 4.23 | 4.14 | 0.24 |
100
+ | kplug | 10261 | 2.72 | 2.65 | 0.37 |
101
+ | llama | 32000 | 1.84 | 1.8 | 0.54 |
102
+ | llama2 | 32000 | 1.84 | 1.8 | 0.54 |
103
+ | mistral_7b | 32000 | 2.36 | 2.3 | 0.42 |
104
+ | mixtral_8_7b | 32000 | 2.36 | 2.3 | 0.42 |
105
+ | mobilebert_uncased | 30522 | 2.73 | 2.67 | 0.37 |
106
+ | moss | 106029 | 4.4 | 4.3 | 0.23 |
107
+ | mt5_large | 250100 | 3.89 | 3.79 | 0.26 |
108
+ | olmo_7b | 50280 | 2.01 | 1.96 | 0.5 |
109
+ | orion_14b_chat | 84608 | 4.63 | 4.52 | 0.22 |
110
+ | phi_1 | 50257 | 1.31 | 1.28 | 0.77 |
111
+ | phi_2 | 50257 | 1.31 | 1.28 | 0.77 |
112
+ | pko_t5_large | 50258 | 0.97 | 0.95 | 1.03 |
113
+ | prompt_clue | 32128 | 4.34 | 4.24 | 0.23 |
114
+ | qwen1_5_14b_chat | 151643 | 4.16 | 4.06 | 0.24 |
115
+ | qwen_1_8b_chat | 151851 | 4.16 | 4.06 | 0.24 |
116
+ | qwen_72b_chat | 151851 | 4.16 | 4.06 | 0.24 |
117
+ | qwen_7b_chat | 151851 | 4.16 | 4.06 | 0.24 |
118
+ | roberta_chinese_clue | 8021 | 2.7 | 2.64 | 0.37 |
119
+ | skywork_13b_base | 65519 | 3.69 | 3.61 | 0.27 |
120
+ | skywork_13b_math | 65519 | 3.69 | 3.61 | 0.27 |
121
+ | solar_10_7b | 32000 | 2.36 | 2.3 | 0.42 |
122
+ | starchat_alpha | 49152 | 2.78 | 2.72 | 0.36 |
123
+ | switch_c_2048 | 32100 | 14.13 | 13.8 | 0.07 |
124
+ | t5_base | 32100 | 14.13 | 13.8 | 0.07 |
125
+ | t5_large | 32100 | 14.13 | 13.8 | 0.07 |
126
+ | t5_small | 32100 | 14.13 | 13.8 | 0.07 |
127
+ | text_davinci_003 | 50281 | 1.31 | 1.28 | 0.77 |
128
+ | tigerbot_13b_chat_v2 | 60512 | 4.25 | 4.15 | 0.24 |
129
+ | tigerbot_70b_chat_v4_4k | 65107 | 4.25 | 4.15 | 0.24 |
130
+ | wizardcoder_15b_v1 | 49152 | 2.78 | 2.72 | 0.36 |
131
+ | wizardcoder_python_7b_v1 | 32000 | 1.84 | 1.8 | 0.54 |
132
+ | wizardlm_7b_v1 | 32000 | 1.84 | 1.8 | 0.54 |
133
+ | wizardmath_70b_v1 | 32000 | 1.84 | 1.8 | 0.54 |
134
+ | xlm_roberta | 250002 | 3.96 | 3.86 | 0.25 |
135
+ | yi_34b | 64000 | 4.17 | 4.07 | 0.24 |
136
+ | yi_6b | 64000 | 4.17 | 4.07 | 0.24 |
137
+ | yi_vl34b | 64000 | 4.11 | 4.02 | 0.24 |
138
+ | zephyr_7b_beta | 32000 | 2.36 | 2.3 | 0.42 |
139
+
140
+
141
+ **Conclusion**
142
+ Tokenizers with larger vocabularies generally compress the corpus better (more bytes per token), as the table above shows.
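To make the units concrete, here is a minimal editorial sketch (not part of this Space; `gpt2` is only a stand-in tokenizer, and it assumes the `transformers` library) of how the raw counts behind the table turn into these ratios:

```python
from transformers import AutoTokenizer  # assumption: transformers is installed

text = "华为发布Mate60手机。 Hello world!"
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example model only

n_tokens = len(tokenizer.encode(text))
n_bytes = len(text.encode("utf-8"))

# Up to the 10**9 vs 1024**3 scaling used in utils/compress_rate_util.py:
print(n_bytes / n_tokens)  # ~ g_bytes/b_tokens: UTF-8 bytes packed into each token
print(n_tokens / n_bytes)  # ~ b_tokens/g_bytes: tokens produced per byte of text
```

A higher g_bytes/b_tokens (or t_bytes/t_tokens) value therefore means better compression; b_tokens/g_bytes is simply the reciprocal view.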
143
+
144
+
145
+
146
+ ## Reference
147
+
148
+ - Getting the most out of your tokenizer for pre-training and domain adaptation
149
+ - Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
150
+ - https://huggingface.co/spaces/Xenova/the-tokenizer-playground
app.py CHANGED
@@ -73,6 +73,31 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
73
  show_label=False,
74
  )
75
  gr.Markdown("## Tokenization")
 
 
76
  with gr.Row():
77
  with gr.Column(scale=6):
78
  with gr.Group():
@@ -86,13 +111,19 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
86
  """
87
  with gr.Row():
88
  stats_vocab_size_1 = gr.TextArea(
89
- label="VocabSize",
90
  lines=1,
91
  elem_classes="statistics"
92
  )
93
  stats_zh_token_size_1 = gr.TextArea(
94
  label="ZH char/word",
95
  lines=1,
 
 
96
  elem_classes="statistics"
97
  )
98
  stats_overlap_token_size_1 = gr.TextArea(
@@ -126,13 +157,20 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
126
  stats_zh_token_size_2 = gr.TextArea(
127
  label="ZH char/word", # 中文字/词
128
  lines=1,
 
 
129
  elem_classes="statistics"
130
  )
131
- # stats_6 = gr.TextArea(
132
- # label="Compress Rate",
133
- # lines=1,
134
- # elem_classes="statistics"
135
- # )
 
136
  stats_overlap_token_size_2 = gr.TextArea(
137
  label="Overlap Tokens",
138
  lines=1,
@@ -141,6 +179,7 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
141
 
142
  # TODO: 图 表 压缩率
143
  with gr.Row():
 
144
  with gr.Column():
145
  output_text_1 = gr.Highlightedtext(
146
  show_legend=True,
@@ -156,12 +195,21 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
156
  output_table_1 = gr.Dataframe()
157
  output_table_2 = gr.Dataframe()
158
 
 
 
159
  tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
160
  [output_text_1, output_table_1])
161
  tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
162
  tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
163
  [stats_overlap_token_size_1, stats_overlap_token_size_2])
 
 
164
 
 
165
  user_input.change(tokenize_pair,
166
  [user_input, tokenizer_type_1, tokenizer_type_2],
167
  [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
@@ -171,6 +219,15 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
171
  tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
172
  tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
173
  [stats_overlap_token_size_1, stats_overlap_token_size_2])
 
 
174
 
175
  dropdown_examples.change(
176
  example_fn,
@@ -178,15 +235,15 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
178
  [user_input, tokenizer_type_1, tokenizer_type_2]
179
  )
180
 
181
- demo.load(_js=open("js/onload.js", "r", encoding="utf-8").read())
182
  demo.load(
183
  fn=on_load,
184
  inputs=[user_input], # 这里只需要传个空object即可。
185
  outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
186
- _js=get_window_url_params
187
  )
188
 
189
-
190
  if __name__ == "__main__":
191
  # demo.queue(max_size=20).launch()
192
  demo.launch()
 
 
73
  show_label=False,
74
  )
75
  gr.Markdown("## Tokenization")
76
+
77
+ # compress rate setting
78
+ with gr.Accordion("Compress Rate Setting", open=True):
79
+ gr.Markdown("Please select the corpus and the unit for the compression rate; more details on [github](https://github.com/xu-song/tokenizer-arena/).")
80
+ with gr.Row():
81
+ compress_rate_corpus = gr.CheckboxGroup(
82
+ ["cc100-en", "cc100-zh-Hans", "cc100-es", "code"],
83
+ value=["cc100-en", "cc100-zh-Hans"],
84
+ label="corpus",
85
+ # info=""
86
+ )
87
+ compress_rate_unit = gr.Radio(
88
+ ["b_tokens/g_bytes", "g_bytes/b_tokens", "t_tokens/t_bytes", "t_bytes/t_tokens"],
89
+ value="b_tokens/g_bytes",
90
+ label="unit",
91
+ )
92
+ # TODO: Token Setting
93
+ # with gr.Accordion("Token Filter Setting", open=False):
94
+ # gr.Markdown(
95
+ # "Get total number of tokens which contain the following character)")
96
+ # gr.Radio(
97
+ # ["zh-Hans", "", "number", "space"],
98
+ # value="zh",
99
+ # )
100
+
101
  with gr.Row():
102
  with gr.Column(scale=6):
103
  with gr.Group():
 
111
  """
112
  with gr.Row():
113
  stats_vocab_size_1 = gr.TextArea(
114
+ label="Vocab Size",
115
  lines=1,
116
  elem_classes="statistics"
117
  )
118
  stats_zh_token_size_1 = gr.TextArea(
119
  label="ZH char/word",
120
  lines=1,
121
+ elem_classes="statistics",
122
+ visible=False
123
+ )
124
+ stats_compress_rate_1 = gr.TextArea(
125
+ label="Compress Rate",
126
+ lines=1,
127
  elem_classes="statistics"
128
  )
129
  stats_overlap_token_size_1 = gr.TextArea(
 
157
  stats_zh_token_size_2 = gr.TextArea(
158
  label="ZH char/word", # 中文字/词
159
  lines=1,
160
+ elem_classes="statistics",
161
+ visible=False
162
+ )
163
+ stats_compress_rate_2 = gr.TextArea(
164
+ label="Compress Rate",
165
+ lines=1,
166
  elem_classes="statistics"
167
  )
168
+ stats_filtered_token_2 = gr.TextArea(
169
+ label="filtered tokens",
170
+ lines=1,
171
+ elem_classes="statistics",
172
+ visible=False
173
+ )
174
  stats_overlap_token_size_2 = gr.TextArea(
175
  label="Overlap Tokens",
176
  lines=1,
 
179
 
180
  # TODO: 图 表 压缩率
181
  with gr.Row():
182
+ # dynamic change label
183
  with gr.Column():
184
  output_text_1 = gr.Highlightedtext(
185
  show_legend=True,
 
195
  output_table_1 = gr.Dataframe()
196
  output_table_2 = gr.Dataframe()
197
 
198
+
199
+ # setting
200
+ # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
201
+ # [stats_compress_rate_1, stats_compress_rate_2])
202
+
203
+
204
  tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
205
  [output_text_1, output_table_1])
206
  tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
207
  tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
208
  [stats_overlap_token_size_1, stats_overlap_token_size_2])
209
+ tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
210
+ [stats_compress_rate_1])
211
 
212
+ # TODO: every=3
213
  user_input.change(tokenize_pair,
214
  [user_input, tokenizer_type_1, tokenizer_type_2],
215
  [output_text_1, output_table_1, output_text_2, output_table_2]) # , pass_request=1
 
219
  tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
220
  tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
221
  [stats_overlap_token_size_1, stats_overlap_token_size_2])
222
+ tokenizer_type_2.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
223
+ [stats_compress_rate_2])
224
+
225
+
226
+ compress_rate_unit.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
227
+ [stats_compress_rate_1])
228
+ compress_rate_unit.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
229
+ [stats_compress_rate_2])
230
+
231
 
232
  dropdown_examples.change(
233
  example_fn,
 
235
  [user_input, tokenizer_type_1, tokenizer_type_2]
236
  )
237
 
238
+ demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
239
  demo.load(
240
  fn=on_load,
241
  inputs=[user_input], # 这里只需要传个空object即可。
242
  outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
243
+ js=get_window_url_params
244
  )
245
 
 
246
  if __name__ == "__main__":
247
  # demo.queue(max_size=20).launch()
248
  demo.launch()
249
+ # demo.launch(share=True)
config.py CHANGED
@@ -1,2 +1,12 @@
1
- USE_REMOTE = False
 
 
 
 
2
  ADD_SPECIAL_TOKEN = False
 
 
 
 
 
 
 
1
+ USE_REMOTE = False # use remote tokenizer or local tokenizer
2
+
3
+ # load_vocab_with_SPECIAL_TOKEN = True # if special tokens are excluded, the vocab size is computed incorrectly and overlap_token counts become inconsistent.
4
+
5
+ # encoding config
6
  ADD_SPECIAL_TOKEN = False
7
+
8
+ #
9
+ LAZY_IMPORT = True
10
+
11
+ # DEBUG: set the environment variable RUST_BACKTRACE=full
12
+ #
examples.py CHANGED
@@ -24,6 +24,7 @@ examples = {
24
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
25
  ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"], # llama词典有点小
26
  ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
 
27
  ],
28
  "zh": [
29
  ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"], # chatglm 有blank_n,
@@ -38,6 +39,7 @@ more_examples = [
38
  # bert VS clue
39
  # bert系列
40
  ("bert_base_cased", "bert_base_uncased", ""), # # clue VS kplug, bert VS clue
 
41
 
42
  # llama系列 (基于sentencepiece)
43
  ("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
 
24
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
25
  ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"], # llama词典有点小
26
  ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
27
+ ["special: [PAD] [UNK] [CLS] [SEP] [MASK] "],
28
  ],
29
  "zh": [
30
  ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"], # chatglm 有blank_n,
 
39
  # bert VS clue
40
  # bert系列
41
  ("bert_base_cased", "bert_base_uncased", ""), # # clue VS kplug, bert VS clue
42
+ ("bert_base_cased", "clue", ""),
43
 
44
  # llama系列 (基于sentencepiece)
45
  ("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- transformers==4.38.0
2
  sentencepiece
3
  tiktoken
4
  icetk
 
1
+ transformers
2
  sentencepiece
3
  tiktoken
4
  icetk
tokenizer/chinese_sptokenizer_patch.py ADDED
@@ -0,0 +1,5 @@
 
 
1
+ """
2
+ ref: glm_chinese
3
+ """
4
+
5
+
tokenizer/sptokenizer_patch.py ADDED
@@ -0,0 +1,97 @@
 
 
1
+ """
2
+
3
+
4
+
5
+ ## usage
6
+
7
+ - grok
8
+
9
+ ## Risk assessment
10
+
11
+ - Could this interfere with normal use of sentencepiece.SentencePieceProcessor?
12
+
13
+ """
14
+ import sentencepiece
15
+
16
+
17
+
18
+ @property
19
+ def vocab_size(self):
20
+ """Returns vocab size"""
21
+ return self.get_piece_size()
22
+
23
+
24
+ def get_vocab(self):
25
+ """Returns vocab as a dict"""
26
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
27
+ # vocab.update(self.added_tokens_encoder)
28
+ return vocab
29
+
30
+
31
+ def _tokenize(self, text):
32
+ """Returns a tokenized string."""
33
+ return self.encode(text, out_type=str)
34
+
35
+ def _convert_token_to_id(self, token):
36
+ """Converts a token (str) in an id using the vocab."""
37
+ return self.piece_to_id(token)
38
+
39
+ def _convert_id_to_token(self, index):
40
+ """Converts an index (integer) in a token (str) using the vocab."""
41
+ token = self.IdToPiece(index)
42
+ return token
43
+
44
+ def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
45
+ """ copy from transformers.PreTrainedTokenizer
46
+ Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
47
+ added tokens.
48
+
49
+ Args:
50
+ ids (`int` or `List[int]`):
51
+ The token id (or token ids) to convert to tokens.
52
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
53
+ Whether or not to remove special tokens in the decoding.
54
+
55
+ Returns:
56
+ `str` or `List[str]`: The decoded token(s).
57
+ """
58
+ self._added_tokens_decoder = {} # add by xs
59
+ if isinstance(ids, int):
60
+ if ids in self._added_tokens_decoder:
61
+ return self._added_tokens_decoder[ids].content
62
+ else:
63
+ return self._convert_id_to_token(ids)
64
+ tokens = []
65
+ for index in ids:
66
+ index = int(index)
67
+ if skip_special_tokens and index in self.all_special_ids:
68
+ continue
69
+ if index in self._added_tokens_decoder:
70
+ tokens.append(self._added_tokens_decoder[index].content)
71
+ else:
72
+ tokens.append(self._convert_id_to_token(index))
73
+ return tokens
74
+
75
+
76
+ def encode(self, *args, **kwargs):
77
+ """
78
+ add_special_tokens is accepted (and dropped) only for compatibility with hf_tokenizer
79
+ """
80
+ kwargs.pop("add_special_tokens", None)
81
+ kwargs.pop("allowed_special", None)
82
+ return self.Encode(*args, **kwargs)
83
+
84
+
85
+ def decode(self, *args, **kwargs):
86
+ kwargs.pop("skip_special_tokens", None)
87
+ return self.Decode(*args, **kwargs)
88
+
89
+
90
+ sentencepiece.SentencePieceProcessor.vocab_size = vocab_size
91
+ sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
92
+ sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
93
+ sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
94
+ # sentencepiece.SentencePieceProcessor.tokenize = _tokenize
95
+ sentencepiece.SentencePieceProcessor.encode = encode
96
+ sentencepiece.SentencePieceProcessor.decode = decode
97
+
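A minimal usage sketch for this patch (editorial example; the model path is an assumption -- any SentencePiece `.model` file, e.g. the newly added `vocab/grok_1/tokenizer.model`, would do):

```python
import sentencepiece

import tokenizer.sptokenizer_patch  # noqa: F401  (importing the module applies the monkey patch above)

sp = sentencepiece.SentencePieceProcessor(model_file="vocab/grok_1/tokenizer.model")  # assumed path
print(sp.vocab_size)                      # patched property, same value as sp.get_piece_size()
ids = sp.encode("hello world", add_special_tokens=False)  # extra HF-style kwarg is silently dropped
print(sp.convert_ids_to_tokens(ids))      # HF-style helper added by the patch
```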
tokenizer/tiktoken_patch.py CHANGED
@@ -17,7 +17,6 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
17
  "namereplace"
18
  """
19
  try:
20
- print(tokens)
21
  decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
22
  except Exception as e: # 捕捉不到 PyO3PanicException
23
  logger.error(f"{e} for {tokens} -> return 'null'")
@@ -69,6 +68,12 @@ def get_vocab(self, token_type="str"):
69
  return vocab
70
 
71
 
 
 
72
  def encode(self, *args, **kwargs):
73
  """
74
  add_special_token 是为了兼容 hf_tokenizer
@@ -84,3 +89,4 @@ Encoding.encode = encode
84
  Encoding.decode = decode
85
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
86
  Encoding.get_vocab = get_vocab
 
 
17
  "namereplace"
18
  """
19
  try:
 
20
  decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
21
  except Exception as e: # 捕捉不到 PyO3PanicException
22
  logger.error(f"{e} for {tokens} -> return 'null'")
 
68
  return vocab
69
 
70
 
71
+ @property
72
+ def vocab_size(self):
73
+ """Returns vocab size"""
74
+ return self.n_vocab
75
+
76
+
77
  def encode(self, *args, **kwargs):
78
  """
79
  add_special_token 是为了兼容 hf_tokenizer
 
89
  Encoding.decode = decode
90
  Encoding.convert_ids_to_tokens = convert_ids_to_tokens
91
  Encoding.get_vocab = get_vocab
92
+ Encoding.vocab_size = vocab_size
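A quick sketch of what the new property provides (editorial example; `gpt-3.5-turbo` is the same model name already used in `vocab/gpt_35_turbo/__init__.py`):

```python
import tiktoken

import tokenizer.tiktoken_patch  # noqa: F401  (importing the module applies the patch, including vocab_size)

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(enc.vocab_size)  # patched property, equivalent to enc.n_vocab (100277 for cl100k_base)
```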
tokenizer/tokenizer_patcher.py ADDED
@@ -0,0 +1,5 @@
 
 
1
+ from types import MethodType
2
+ from transformers import PreTrainedTokenizerBase
3
+ def patch_tokenizer(tokenizer: "PreTrainedTokenizer") -> None:
4
+ if "PreTrainedTokenizerBase" not in str(tokenizer._pad.__func__):
5
+ tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer)
util.py CHANGED
@@ -1,13 +1,12 @@
1
  import gradio as gr
2
  import json
3
- import socket
4
  import pandas as pd
5
  import config
6
  from vocab import load_tokener
7
  from utils.zh_util import iter_vocab
8
  from utils.log_util import logger
 
9
  from functools import lru_cache
10
- from urllib.parse import urlparse, parse_qs
11
 
12
 
13
  @lru_cache
@@ -83,8 +82,16 @@ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
83
  @lru_cache
84
  def basic_count(tokenizer_type):
85
  tokenizer = load_tokener(tokenizer_type)
86
- stats = iter_vocab(tokenizer, tokenizer_type)
87
- return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
 
 
88
 
89
 
90
  @lru_cache
@@ -110,8 +117,9 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
110
  return overlap_token_size, overlap_token_size
111
 
112
 
113
- default_user_input = """Replace this text in the input field to see how tokenization works
114
- 华为发布Mate60手机
 
115
  ラグビーワールドカップ2023フランス"""
116
  default_tokenizer_type_1 = "llama"
117
  # default_tokenizer_type_2 = "internlm_chat_7b"
@@ -147,6 +155,9 @@ def on_load(url_params, request: gr.Request):
147
  return text, tokenizer_type_1, tokenizer_type_2
148
 
149
 
 
 
 
150
  def test_coding():
151
  bytes1 = b'\xe4\xb8\xad'
152
  print(bytes1) # b'\xe4\xb8\xad'
 
1
  import gradio as gr
2
  import json
 
3
  import pandas as pd
4
  import config
5
  from vocab import load_tokener
6
  from utils.zh_util import iter_vocab
7
  from utils.log_util import logger
8
+ from utils.compress_rate_util import tokenize_corpus, unit_convertor
9
  from functools import lru_cache
 
10
 
11
 
12
  @lru_cache
 
82
  @lru_cache
83
  def basic_count(tokenizer_type):
84
  tokenizer = load_tokener(tokenizer_type)
85
+ stats = iter_vocab(tokenizer)
86
+ return tokenizer.vocab_size, f'{stats["中文token数"]}'
87
+ # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
88
+
89
+ def get_compress_rate(tokenizer_type, all_corpus, unit):
90
+ corpus_name = all_corpus[0]
91
+ tokenizer = load_tokener(tokenizer_type)
92
+ compress_rate_stats = tokenize_corpus(tokenizer, corpus_name)
93
+ compress_rate = unit_convertor(compress_rate_stats, unit)
94
+ return compress_rate
95
 
96
 
97
  @lru_cache
 
117
  return overlap_token_size, overlap_token_size
118
 
119
 
120
+ default_user_input = """Replace this text in the input field to see how tokenization works.
121
+ Buenos días!
122
+ 华为发布Mate60手机。
123
  ラグビーワールドカップ2023フランス"""
124
  default_tokenizer_type_1 = "llama"
125
  # default_tokenizer_type_2 = "internlm_chat_7b"
 
155
  return text, tokenizer_type_1, tokenizer_type_2
156
 
157
 
158
+ def compress_rate_unit_change(unit):
159
+ return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
160
+
161
  def test_coding():
162
  bytes1 = b'\xe4\xb8\xad'
163
  print(bytes1) # b'\xe4\xb8\xad'
utils/compress_rate_util.py CHANGED
@@ -1,7 +1,181 @@
1
  """
2
 
3
-
4
  中文数据:clue superclue
5
  英文数据:glue cnn_dailymail gigaword
 
 
6
 
7
- """
 
 
1
  """
2
 
 
3
  中文数据:clue superclue
4
  英文数据:glue cnn_dailymail gigaword
5
+ Code data:
+ Numbers:
+
+ ## References
+ - https://github.com/baichuan-inc/Baichuan-7B reports compression rates for different tokenizers
+   - metric: presumably n_tokens/n_chars (Baichuan's value is lower, i.e. fewer tokens and higher compression)
+   - Baichuan 0.73; llama 1.31;
+ - https://github.com/QwenLM/Qwen/blob/main/tech_memo.md reports compression rates for different tokenizers
+   - uses XLM-RoBERTa as the baseline (Unsupervised Cross-lingual Representation Learning at Scale),
+   - Qwen-7B has a high compression rate on many languages
+   - Chinese: llama7b 2.2; baichuan7b 1.1; chatglm2-6b 0.9; qwen7b 0.95
+   - English:
+   - metric: presumably n_tokens / n_tokens_xlmR
+ - https://github.com/hpcaitech/ColossalAI/blob/4b8312c08e8d05a5f41453d63c8671aab601ed1c/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py#L134
+   - contains a compression-rate computation
+ - https://github.com/hpcaitech/ColossalAI/blob/main/applications/Colossal-LLaMA-2/README.md#tokenizer
+   - reports compression rates for different tokenizers
+   - metric:
+ - https://github.com/AUGMXNT/shisa/blob/6a823d77a71acbd18ab8f68a6b02f4b87ec9dddd/eval/tokenizer-efficiency-ja.py#L24
+   - compression rate computed as {n_chars} / {n_tokens}
+   -
+ - https://github.com/huggingface/transformers/blob/cec773345aeffce3c04e8891303a3f748de7141e/src/transformers/models/whisper/generation_whisper.py#L354
+   - probably not the same notion of compression
+ - https://github.com/bojone/bytepiece/blob/main/README_en.md
+   - "bytes/token": the average number of bytes per token
+ - Getting the most out of your tokenizer for pre-training and domain adaptation 👍
+   - definitions:
+     - NSL: ratio of the encoded lengths of two tokenizers, usually with llama as the baseline
+     - average number of bytes per token: {n_bytes} / {n_tokens}
+       - higher compression rate --
+ - *** https://github.com/microsoft/LLMLingua/blob/main/llmlingua/prompt_compressor.py
+   - definition: {Compressed Size}/{Raw Size}, from the paper "Language Modeling Is Compression"; usually <= 1.0 and written as a percentage, though values > 1 also occur
+   -
+   - what does {Compressed Size} refer to here?
+   - this notion of compression is tied to the model parameters
+ """
41
+
42
+ import json
43
+ import os
44
+ import pandas as pd
45
+ from datasets import load_dataset
46
+ from utils.log_util import logger
47
+ from vocab import load_tokener
48
+
49
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
50
+
51
+
52
+ def get_n_bytes_of_string(string_text):
53
+ n_bytes = len(string_text.encode("utf-8"))
54
+ return n_bytes
55
+
56
+
57
+ def unit_convertor(stat, unit):
58
+ n_tokens = stat["n_tokens"]
59
+ n_chars = stat["n_chars"]
60
+ n_bytes = stat["n_bytes"]
61
+
62
+ n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
63
+ n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
64
+ n_bytes_in_mb = n_bytes / (1024 * 1024)
65
+ n_bytes_in_gb = n_bytes_in_mb / 1024
66
+ n_bytes_in_tb = n_bytes_in_gb / 1024
67
+ # n_chars_in_billion = n_chars / (1000 * 1000 * 1000)
68
+
69
+ if unit == "n_tokens/n_bytes":
70
+ value = n_tokens / n_bytes
71
+ elif unit == "n_chars/n_tokens": # important: on average, how many characters does one token cover?
72
+ value = n_chars / n_tokens
73
+ elif unit == "n_tokens/n_chars": # how many tokens does one Chinese character need?
74
+ value = n_tokens / n_chars
75
+ elif unit == "g_bytes/b_tokens":
76
+ value = n_bytes_in_gb / n_tokens_in_billion
77
+ elif unit == "t_bytes/t_tokens": # important:
78
+ value = n_bytes_in_tb / n_tokens_in_trillion
79
+ elif unit == "b_tokens/g_bytes":
80
+ value = n_tokens_in_billion / n_bytes_in_gb
81
+ else:
82
+ raise ValueError(f"unsupported unit: {unit}")
83
+ return round(value, 2)
84
+
85
+
86
+ all_units = ["g_bytes/b_tokens", "t_bytes/t_tokens", "b_tokens/g_bytes"]
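A small worked example of these conversions (editorial sketch; the counts are made up):

```python
from utils.compress_rate_util import unit_convertor

# 3.9e9 bytes of UTF-8 text tokenized into 1.0e9 tokens:
stat = {"vocab_size": 32000, "n_tokens": 10**9, "n_chars": 18 * 10**8, "n_bytes": 39 * 10**8}
print(unit_convertor(stat, "g_bytes/b_tokens"))  # 3.63 -- GB of raw text per billion tokens
print(unit_convertor(stat, "b_tokens/g_bytes"))  # 0.28 -- billions of tokens per GB of raw text
```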
87
+
88
+
89
+ def pprint(stats):
90
+ table = []
91
+ for tokenizer_name, stat in stats.items():
92
+ columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
93
+ for unit in all_units:
94
+ if unit not in stat:
95
+ columns[unit] = unit_convertor(stat, unit)
96
+ else:
97
+ pass
98
+
99
+ table.append(columns)
100
+ df = pd.DataFrame(table)
101
+ # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
102
+ logger.info(df.to_markdown(index=False))
103
+ return
104
+
105
+
106
+ cache = {}
107
+
108
+
109
+ def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
110
+ """
111
+ This needs its own file cache because tokenizing a corpus is slow.
112
+ :param tokenizer:
113
+ :param lang:
114
+ :param cache_dir:
115
+ :return:
116
+ """
117
+
118
+ def _tokenize(tokenizer, dataset):
119
+ n_tokens = 0
120
+ n_chars = 0
121
+ n_bytes = 0
122
+ for item in dataset:
123
+ text = item["text"]
124
+ n_bytes += get_n_bytes_of_string(text)
125
+ n_chars += len(text)
126
+ encodings = tokenizer.encode(text)
127
+ n_tokens += len(encodings)
128
+ stat = {
129
+ "vocab_size": tokenizer.vocab_size,
130
+ "n_bytes": n_bytes,
131
+ "n_tokens": n_tokens,
132
+ "n_chars": n_chars,
133
+ }
134
+ return stat
135
+
136
+ tokenizer_name = tokenizer.alias
137
+ lang = lang.replace("cc100-", "")
138
+ cache_id = f"{tokenizer_name}.{lang}"
139
+ # L1: in-memory cache
140
+ if cache_id in cache:
141
+ logger.info(f"loading {cache_id} from in-memory cache")
142
+ return cache[cache_id]
143
+
144
+ # L2: file cache
145
+ cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
146
+ os.makedirs(cache_dir, exist_ok=True)
147
+ cache_path = os.path.join(cache_dir, f"{cache_id}.json")
148
+ if os.path.exists(cache_path):
149
+ logger.info(f"loading {cache_id} from file cache")
150
+ stat = json.load(open(cache_path, "r", encoding="utf-8"))
151
+ cache[cache_id] = stat
152
+ return stat
153
+
154
+ # tokenize corpus
155
+ dataset = load_dataset("eson/cc100-samples", lang, split="train")
156
+ stat = _tokenize(tokenizer, dataset)
157
+ logger.info(f"saving {cache_id} to {cache_path}")
158
+ json.dump(stat, open(cache_path, "w", encoding="utf-8"))
159
+ logger.info(f"saving {cache_id} to in-memory cache")
160
+ cache[cache_id] = stat
161
+ return stat
162
+
163
+
164
+ def main():
165
+ from vocab import all_tokenizers
166
+ stats = {}
167
+ for lang in ["en", "zh-Hans"]:
168
+ print("###" * 10 + lang)
169
+
170
+ for tokenizer_name in ['llama', 'llama2', 'llama3']:
171
+ # for tokenizer_name in all_tokenizers:
172
+ tokenizer = load_tokener(tokenizer_name)
173
+ stat = tokenize_corpus(tokenizer, lang)
174
+ # ["qwen1_5_14b_chat", "gpt_35_turbo",]:
175
+ stats[tokenizer_name] = stat
176
+
177
+ pprint(stats)
178
+
179
 
180
+ if __name__ == "__main__":
181
+ main()
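A minimal usage sketch for the module as a whole (editorial example; it assumes the `eson/cc100-samples` dataset above is reachable and that the alias exists under `vocab/`):

```python
from vocab import load_tokener
from utils.compress_rate_util import tokenize_corpus, unit_convertor

tokenizer = load_tokener("gpt_35_turbo")         # load_tokener also sets tokenizer.alias
stat = tokenize_corpus(tokenizer, "cc100-en")    # cached under stats/compress_rate/
print(unit_convertor(stat, "b_tokens/g_bytes"))  # same kind of value as the README table column
```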
utils/digit_util.py CHANGED
@@ -0,0 +1,6 @@
 
 
1
+ """
2
+
3
+ qwen segments numbers into single digits.
4
+
5
+
6
+ """
utils/text_util.py CHANGED
@@ -1,9 +1,7 @@
1
 
 
2
 
3
-
4
-
5
-
6
- def is_chinese(uchar):
7
  """
8
  https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
9
  re.compile("([\u4E00-\u9FD5]+)", re.U)
@@ -11,18 +9,33 @@ def is_chinese(uchar):
11
  return u'\u4e00' <= uchar <= u'\u9fa5'
12
 
13
 
14
-
15
- def has_chinese(text):
16
  """ contains Chinese characters """
17
- return any(is_chinese(ch) for ch in text)
18
 
19
 
20
  def get_zh_count(text):
21
- return sum([is_chinese(uchar) for uchar in text])
 
 
22
 
23
 
24
- def is_all_chinese(text):
25
- return all(is_chinese(char) for char in text)
 
 
26
 
27
 
28
  def get_digit_count(text):
@@ -31,3 +44,34 @@ def get_digit_count(text):
31
  if char in "0123456789":
32
  digit_count += 1
33
  return digit_count
 
 
1
 
2
+ from zhon.hanzi import punctuation as zh_punc
3
 
4
+ def is_zh_char(uchar):
 
 
 
5
  """
6
  https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
7
  re.compile("([\u4E00-\u9FD5]+)", re.U)
 
9
  return u'\u4e00' <= uchar <= u'\u9fa5'
10
 
11
 
12
+ def has_zh(text):
 
13
  """ contains Chinese characters """
14
+ return any(is_zh_char(ch) for ch in text)
15
 
16
 
17
  def get_zh_count(text):
18
+ return sum([is_zh_char(uchar) for uchar in text])
19
+
20
+
21
+ def is_all_zh(text):
22
+ return all(is_zh_char(char) for char in text)
23
+
24
+
25
+ def is_all_en(text):
26
+ return text.encode('utf-8').isalpha()
27
 
28
 
29
+ def is_digit_char(uchar):
30
+ return uchar in "0123456789"
31
+
32
+
33
+ def has_digit(text):
34
+ return any(is_digit_char(ch) for ch in text)
35
+
36
+
37
+ def is_all_digit(text):
38
+ return all(is_digit_char(char) for char in text)
39
 
40
 
41
  def get_digit_count(text):
 
44
  if char in "0123456789":
45
  digit_count += 1
46
  return digit_count
47
+
48
+
49
+
50
+ def has_zh_punc(text):
51
+ """
52
+ 是否包含中文标点
53
+ """
54
+ return any(ch in zh_punc for ch in text)
55
+
56
+
57
+
58
+ def is_space_char(uchar):
59
+ """
60
+ https://emptycharacter.com/
61
+
62
+
63
+ """
64
+
65
+
66
+ def has_space(text):
67
+ pass
68
+
69
+ def is_all_space(text):
70
+ pass
71
+
72
+ def get_space_count(text):
73
+ space_count = 0
74
+ for char in text:
75
+ if len(char.strip()) == 0:
76
+ space_count += 1
77
+ return space_count
utils/zh_util.py CHANGED
@@ -4,15 +4,18 @@ TODO: 繁体、简体、语种、
4
  import os
5
  import json
6
  from collections import Counter
7
- from utils.text_util import is_chinese, get_zh_count, get_digit_count
8
- from zhon.hanzi import punctuation as zh_punc
9
 
10
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
11
 
12
  zh_tokens = [line.strip() for line in open(os.path.join(CURRENT_DIR, "vocab.jd.txt.v2"), "r", encoding="utf-8") if
13
- is_chinese(line.strip())]
14
 
15
 
 
 
 
16
  def zh_iterator():
17
  for idx in range(ord(u'\u4e00'), ord(u'\u9fa5')):
18
  yield (chr(idx))
@@ -28,7 +31,11 @@ def get_coding_length(tokenizer, vocab, filter=None):
28
  continue
29
  if filter is not None and filter(word):
30
  continue
31
- tokens = tokenizer.encode(word)
 
 
 
 
32
  all_length.append(len(tokens))
33
  # if len(tokens.ids) > 1:
34
  # if len(tokens) > 3:
@@ -39,21 +46,6 @@ def get_coding_length(tokenizer, vocab, filter=None):
39
  return dist_length, mean_length
40
 
41
 
42
- def has_zh_punc(text):
43
- """
44
- 是否包含中文标点
45
- """
46
- return any(ch in zh_punc for ch in text)
47
-
48
-
49
-
50
- def get_space_count(text):
51
- space_count = 0
52
- for char in text:
53
- if len(char.strip()) == 0:
54
- space_count += 1
55
- return space_count
56
-
57
 
58
  def remove_special_char():
59
  """
@@ -67,13 +59,39 @@ def remove_special_char():
67
 
68
  cache = {}
69
 
 
70
 
71
- def iter_vocab(tokenizer, name="", from_cache=True):
72
  if from_cache and name in cache:
 
73
  return cache[name]
74
 
75
- f_out = open(name + "_vocab.jsonl", "w", encoding="utf-8")
76
- zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
 
 
 
77
 
78
  # zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
79
 
@@ -81,56 +99,89 @@ def iter_vocab(tokenizer, name="", from_cache=True):
81
 
82
  all_single_zh_tokens = set()
83
  zh_symbol_count = 0
 
84
  for token_id in range(tokenizer.vocab_size):
85
  decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
86
  token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
87
  # tokenizer.convert_tokens_to_string(tokens)
88
 
 
 
89
  if token is None: # 有些词典有空的id(不连续)
90
  continue
91
  if isinstance(token, bytes):
92
  token = token.decode("utf-8", errors="ignore")
93
 
94
  digit_count = get_digit_count(decode_str)
95
- zh_count = get_zh_count(decode_str)
 
 
96
  space_count = get_space_count(decode_str)
 
97
 
98
- f_out.write(json.dumps(
99
  {"id": token_id,
100
  "token": token,
101
  "token_decode": decode_str,
 
 
102
  "token_len": len(decode_str),
103
- "zh_count": zh_count,
104
- "space_count": space_count,
105
- "digit_count": digit_count,
106
  "zh_symbol_count": zh_symbol_count,
 
107
  },
108
- ensure_ascii=False) + "\n"
109
- )
110
-
111
- if zh_count >= 1:
112
- zh_token_count["total"] += 1
113
- if zh_count > 1:
114
- zh_token_count["中文多字"] += 1
115
- else:
116
- zh_token_count["中文单字"] += 1
117
- all_single_zh_tokens.add(decode_str.strip().replace("#", ""))
118
  #
 
119
 
120
- dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_chinese(k))
121
 
122
  # TODO: 繁体字,简体字
123
- zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
124
 
125
  result = {
126
  "name": name,
127
  "impl": str(tokenizer.__class__),
128
  "vocab_size": tokenizer.vocab_size,
129
- "中文汉字数": zh_token_count,
 
 
130
  "中文标点数": zh_symbol_count,
131
  "中文汉字编码长度均值": mean_length,
132
  "中文汉字编码长度分布": json.dumps(dist_length),
 
 
 
 
 
133
  }
 
 
 
 
 
 
134
  cache[name] = result
135
  return result
136
 
@@ -140,9 +191,14 @@ if __name__ == "__main__":
140
  # test_coding_length(zh_punc)
141
  # test_coding_length(zh_iterator())
142
 
143
- from vocab.chatglm2_6b import tokenizer; name = "chatglm2_6b"
144
  # from vocab.chatglm_6b import tokenizer; name="chatglm_6b"
145
  # from vocab.baichuan2 import tokenizer; name="baichuan2"
146
- # from vocab.gpt_4 import tokenizer; name="gpt4"
 
 
 
 
 
147
 
148
  print(iter_vocab(tokenizer, name=name))
 
4
  import os
5
  import json
6
  from collections import Counter
7
+ from utils.log_util import logger
8
+ from utils.text_util import is_zh_char, is_all_zh, has_zh, is_all_digit, has_digit, get_zh_count, get_digit_count, get_space_count
9
 
10
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
11
 
12
  zh_tokens = [line.strip() for line in open(os.path.join(CURRENT_DIR, "vocab.jd.txt.v2"), "r", encoding="utf-8") if
13
+ is_zh_char(line.strip())]
14
 
15
 
16
+ def to_unicode(text):
17
+ return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
18
+
19
  def zh_iterator():
20
  for idx in range(ord(u'\u4e00'), ord(u'\u9fa5')):
21
  yield (chr(idx))
 
31
  continue
32
  if filter is not None and filter(word):
33
  continue
34
+ try:
35
+ tokens = tokenizer.encode(word)
36
+ except Exception as e:
37
+ print(e)
38
+
39
  all_length.append(len(tokens))
40
  # if len(tokens.ids) > 1:
41
  # if len(tokens) > 3:
 
46
  return dist_length, mean_length
47
 
48
 
 
49
 
50
  def remove_special_char():
51
  """
 
59
 
60
  cache = {}
61
 
62
+ def iter_vocab(tokenizer, from_cache=True, cache_dir="stats/iter_vocab"):
63
+ """
64
+ This runs fast, so the file cache is not recommended.
65
+ :param tokenizer:
66
+ :param from_cache:
67
+ :return:
68
+ """
69
+ cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
70
+ os.makedirs(cache_dir, exist_ok=True)
71
+
72
+ name = tokenizer.alias
73
 
74
+ # L1 cache
75
  if from_cache and name in cache:
76
+ logger.info(f"load {name} from cache")
77
  return cache[name]
78
 
79
+ # L2 cache: not recommended
80
+
81
+ # has_zh_token_stats = {"total_tokens": 0, "mean_token_length": 0}
82
+ # all_zh_token_stats = {"total_tokens": 0, "mean_token_length": 0}
83
+ # has_number_token_stats = {"total_tokens": 0, "mean_token_length": 0}
84
+ # all_number_token_stats = {"total_tokens": 0, "mean_token_length": 0}
85
+
86
+ has_zh_tokens = []
87
+ all_zh_tokens = []
88
+ has_digit_tokens = []
89
+ all_digit_tokens = []
90
+ has_space_tokens = []
91
+ all_space_tokens = []
92
+
93
+ # zh_tags = ["all_zh", "has_zh"]
94
+ # digit_tags = ["all_digit", "has_digit"]
95
 
96
  # zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
97
 
 
99
 
100
  all_single_zh_tokens = set()
101
  zh_symbol_count = 0
102
+ buffer = []
103
  for token_id in range(tokenizer.vocab_size):
104
  decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
105
  token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
106
  # tokenizer.convert_tokens_to_string(tokens)
107
 
108
+ tags = []
109
+
110
  if token is None: # 有些词典有空的id(不连续)
111
  continue
112
  if isinstance(token, bytes):
113
  token = token.decode("utf-8", errors="ignore")
114
 
115
  digit_count = get_digit_count(decode_str)
116
+
117
+ if is_all_zh(decode_str):
118
+ tags.append("all_zh")
119
+ all_zh_tokens.append(decode_str)
120
+ elif has_zh(decode_str):
121
+ tags.append("has_zh")
122
+ has_zh_tokens.append(decode_str)
123
+
124
+ if is_all_digit(decode_str):
125
+ tags.append("all_digit")
126
+ all_digit_tokens.append(decode_str)
127
+ elif has_digit(decode_str):
128
+ tags.append("has_digit")
129
+ has_digit_tokens.append(decode_str)
130
+
131
+
132
  space_count = get_space_count(decode_str)
133
+ zh_count = get_zh_count(decode_str)
134
 
135
+ buffer.append(json.dumps(
136
  {"id": token_id,
137
  "token": token,
138
  "token_decode": decode_str,
139
+ "token_dumps": json.dumps(token),
140
+ "token_unicode": to_unicode(token),
141
  "token_len": len(decode_str),
142
+ "zh_count": zh_count, # 包含汉字的数目
143
+ "tags": tags,
 
144
  "zh_symbol_count": zh_symbol_count,
145
+ "": "",
146
  },
147
+ ensure_ascii=False) + "\n")
148
+
149
+ # if zh_count >= 1:
150
+ # zh_token_count["total"] += 1
151
+ # if zh_count > 1:
152
+ # zh_token_count["中文多字"] += 1
153
+ # else:
154
+ # zh_token_count["中文单字"] += 1
155
+ # all_single_zh_tokens.add(decode_str.strip().replace("#", ""))
 
156
  #
157
+ # zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
158
 
159
+ dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
160
 
161
  # TODO: 繁体字,简体字
 
162
 
163
  result = {
164
  "name": name,
165
  "impl": str(tokenizer.__class__),
166
  "vocab_size": tokenizer.vocab_size,
167
+ "中文token数": len(has_zh_tokens),
168
+ "中文token的平均长度": None,
169
+ "纯中文token的平均长度": None,
170
  "中文标点数": zh_symbol_count,
171
  "中文汉字编码长度均值": mean_length,
172
  "中文汉字编码长度分布": json.dumps(dist_length),
173
+ "纯数字token数": digit_count,
174
+ "纯数字token的平均长度": None,
175
+ "纯中文token数": None,
176
+ "纯space的token数": space_count,
177
+ "纯space的token的平均长度": None,
178
  }
179
+ out_path = os.path.join(cache_dir, f"{name}.vocab.jsonl")
180
+ logger.info(f"saving vocab to {out_path}")
181
+ with open(out_path, "w", encoding="utf-8") as f_out:
182
+ f_out.write(json.dumps(result, ensure_ascii=False) + "\n")
183
+ for line in buffer:
184
+ f_out.write(line)
185
  cache[name] = result
186
  return result
187
 
 
191
  # test_coding_length(zh_punc)
192
  # test_coding_length(zh_iterator())
193
 
194
+ # from vocab.chatglm2_6b import tokenizer; name = "chatglm2_6b"
195
  # from vocab.chatglm_6b import tokenizer; name="chatglm_6b"
196
  # from vocab.baichuan2 import tokenizer; name="baichuan2"
197
+ from vocab.gpt_4 import tokenizer; name="gpt4"
198
+ # from vocab.gpt2 import tokenizer; name="gpt2"
199
+ # from vocab.qwen1_5_14b_chat import tokenizer; name="qwen1_5_14b_chat"
200
+ # from vocab.gpt_nexo_20b import tokenizer; name="gpt_nexo_20b"
201
+ # from vocab.fastchat_t5_3b import tokenizer; name="fastchat_t5_3b"
202
+
203
 
204
  print(iter_vocab(tokenizer, name=name))
vocab/README.md CHANGED
@@ -36,6 +36,14 @@ chatglm
36
  bloom
37
 
38
 
 
 
39
  ## bert
40
 
41
  ```
@@ -87,10 +95,40 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
87
 
88
  - 类似的还有:moss
89
 
 
90
  ## 空格、tab、换行
91
 
92
 
93
 
 
 
94
  ## reversible and lossless
95
 
96
- It's reversible and lossless, so you can convert tokens back into the original text
 
 
 
 
 
 
36
  bloom
37
 
38
 
39
+ ## Minimal vocabulary
40
+
41
+ mobilenet
42
+
43
+
44
+ ## ss
45
+
46
+
47
  ## bert
48
 
49
  ```
 
95
 
96
  - 类似的还有:moss
97
 
98
+
99
+ ### What is Ġ?
100
+
101
+ It's a feature of byte-level BPE(an encoded space character).
102
+ Ġ represents a space; some versions use Ä instead of Ġ.
103
+
104
+
105
+ ```sh
106
+ What's up with the tokenizer?
107
+ # after BPE
108
+ ['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
109
+ # after encoding with vocab.json
110
+ [ 2061, 338, 510, 351, 262, 11241, 7509, 30]
111
+ # after encoding with dict.txt (fairseq-specific)
112
+ [ other ids ]
113
+ ```
114
+ Question: "up" gets a Ġ but "What" does not, because Ġ encodes a preceding space and "What" starts the text.
116
+
117
+ - https://github.com/pytorch/fairseq/issues/1716
118
+ - https://github.com/huggingface/transformers/issues/1083
119
+
120
+
121
  ## 空格、tab、换行
122
 
123
 
124
 
125
+
126
+
127
  ## reversible and lossless
128
 
129
+ It's reversible and lossless, so you can convert tokens back into the original text
130
+
131
+
132
+ ## diff
133
+
134
+
vocab/__init__.py CHANGED
@@ -70,7 +70,8 @@ uniq_tokenizers = [
70
  ""
71
  ]
72
 
73
- # TODO: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
 
74
  all_tokenizers = [
75
  ##### bert 系列
76
  ("bert_base_cased", "", "bert"),
@@ -101,6 +102,7 @@ all_tokenizers = [
101
 
102
  ("llama", "", "sentencepiece", "llama use single digits and thus uses 4 tokens to encode the number 1000"), # '中文单字': 700, '中文多字': 0
103
  ("llama2", "", "sentencepiece"),
 
104
  ("chinese_llama", "", "sentencepiece"), #
105
  ("chinese_llama2", "", "sentencepiece"), #
106
  # ("chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
@@ -154,7 +156,7 @@ all_tokenizers = [
154
  ("phi_2",),
155
  ("solar_10_7b",),
156
  ("mobilebert_uncased",),
157
- ("mobilenet_v2",),
158
  ("switch_c_2048",),
159
  ("byt5_small",),
160
  ("mt5_large",),
@@ -168,7 +170,12 @@ all_tokenizers = [
168
  ("gemma_7b",),
169
  ("olmo_7b",),
170
  ("aya_101",),
171
- ("zephyr_7b_beta",)
 
 
 
 
 
172
  ]
173
 
174
  all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
@@ -234,6 +241,7 @@ class TokenizerImpl(Enum):
234
 
235
  def load_tokener(model_name):
236
  tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
 
237
  return tokenizer
238
 
239
 
 
70
  ""
71
  ]
72
 
73
+ # format: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
74
+ # TODO: append link and description to the end of dropdown button.
75
  all_tokenizers = [
76
  ##### bert 系列
77
  ("bert_base_cased", "", "bert"),
 
102
 
103
  ("llama", "", "sentencepiece", "llama use single digits and thus uses 4 tokens to encode the number 1000"), # '中文单字': 700, '中文多字': 0
104
  ("llama2", "", "sentencepiece"),
105
+ ("llama3", "", "sentencepiece"),
106
  ("chinese_llama", "", "sentencepiece"), #
107
  ("chinese_llama2", "", "sentencepiece"), #
108
  # ("chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
 
156
  ("phi_2",),
157
  ("solar_10_7b",),
158
  ("mobilebert_uncased",),
159
+ # ("mobilenet_v2",), # error
160
  ("switch_c_2048",),
161
  ("byt5_small",),
162
  ("mt5_large",),
 
170
  ("gemma_7b",),
171
  ("olmo_7b",),
172
  ("aya_101",),
173
+ ("zephyr_7b_beta",),
174
+ ("jamba_v0_1", ),
175
+ ("dbrx_instruct", ),
176
+ ("grok_1",),
177
+ # ("claude",),
178
+
179
  ]
180
 
181
  all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
 
241
 
242
  def load_tokener(model_name):
243
  tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
244
+ tokenizer.alias = model_name
245
  return tokenizer
246
 
247
 
vocab/bert_base_chinese/test_zh_coding_len.py CHANGED
@@ -16,7 +16,7 @@
16
  from collections import Counter
17
  from transformers import AutoTokenizer
18
  from data_sample.oov_base import jd_vocab_tokens
19
- from utils.text_util import is_chinese, has_chinese
20
  from zhon.hanzi import punctuation as zh_punc
21
 
22
 
@@ -55,7 +55,7 @@ def iter_vocab():
55
  zh_symbol_count = 0
56
  for idx, word in enumerate(vocab):
57
 
58
- if has_chinese(decode_str):
59
  zh_token_count += 1
60
  f_out.write("%d\t%s\t中文汉字\n" % (idx, decode_str))
61
  elif has_zh_char(decode_str):
 
16
  from collections import Counter
17
  from transformers import AutoTokenizer
18
  from data_sample.oov_base import jd_vocab_tokens
19
+ from utils.text_util import is_zh_char, has_zh
20
  from zhon.hanzi import punctuation as zh_punc
21
 
22
 
 
55
  zh_symbol_count = 0
56
  for idx, word in enumerate(vocab):
57
 
58
+ if has_zh(decode_str):
59
  zh_token_count += 1
60
  f_out.write("%d\t%s\t中文汉字\n" % (idx, decode_str))
61
  elif has_zh_char(decode_str):
vocab/bloom/test_zh_coding_len.py CHANGED
@@ -16,7 +16,7 @@
16
  from collections import Counter
17
  from transformers import AutoTokenizer, BloomTokenizerFast
18
  from data_sample.oov_base import jd_vocab_tokens
19
- from utils.text_util import is_chinese
20
  from zhon.hanzi import punctuation as zh_punc
21
 
22
  # tokenizer = AutoTokenizer.from_pretrained("tokenizer")
 
16
  from collections import Counter
17
  from transformers import AutoTokenizer, BloomTokenizerFast
18
  from data_sample.oov_base import jd_vocab_tokens
19
+ from utils.text_util import is_zh_char
20
  from zhon.hanzi import punctuation as zh_punc
21
 
22
  # tokenizer = AutoTokenizer.from_pretrained("tokenizer")
vocab/bloomz_6b4_zh/__init__.py CHANGED
@@ -7,5 +7,3 @@ TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
7
 
8
  tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
9
 
10
- # vocab_size = len(tokenizer.get_vocab())
11
- # vocab_size = tokenizer.vocab_size
 
7
 
8
  tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
9
 
 
 
vocab/glm/test_tokenizer.py CHANGED
@@ -3,7 +3,7 @@
3
  默认采用:GLMGPT2Tokenizer
4
  """
5
 
6
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
  tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
8
 
9
  tokens_id = [3856, 11030]
 
3
  默认采用:GLMGPT2Tokenizer
4
  """
5
 
6
+ from transformers import AutoTokenizer
7
  tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
8
 
9
  tokens_id = [3856, 11030]
vocab/glm_chinese/__init__.py CHANGED
@@ -26,5 +26,26 @@ tokenizer.vocab_size = tokenizer.num_tokens
26
 
27
 
28
 
 
 
29
  # vocab_size = len(tokenizer.get_vocab())
30
  # vocab_size = tokenizer.vocab_size
 
26
 
27
 
28
 
29
+ def get_vocab(self, token_type="str"):
30
+ """Returns vocab as a dict
31
+ :return:
32
+ """
33
+ vocab = {}
34
+ for i in range(self.vocab_size):
35
+ try:
36
+ token_byte = self.convert_ids_to_tokens([i])[0]
37
+ if token_byte is None:
38
+ continue
39
+ # token_str = token_byte.decode("utf-8")
40
+ vocab[token_byte] = i
41
+
42
+ except Exception as e: # 773 UnicodeDecodeError
43
+ print("exception")
44
+
45
+ return vocab
46
+
47
+
48
+ ChineseSPTokenizer.get_vocab = get_vocab
49
+
50
  # vocab_size = len(tokenizer.get_vocab())
51
  # vocab_size = tokenizer.vocab_size
vocab/glm_chinese/test.py CHANGED
@@ -1,4 +1,7 @@
1
 
2
- from glm_chinese import tokenizer
3
 
4
- print(tokenizer.decode([20]))
 
 
 
 
1
 
2
+ from vocab.glm_chinese import tokenizer
3
 
4
+ print(tokenizer.decode([20]))
5
+ vocab = tokenizer.get_vocab()
6
+
7
+ print(vocab)
vocab/gpt2/README.md CHANGED
@@ -40,42 +40,21 @@ byte-level BPE
40
  - [vocab.json](https://huggingface.co/gpt2-large/resolve/main/vocab.json): 50257个kv-pair. https://huggingface.co/gpt2/resolve/main/vocab.json
41
  - [merges.txt](https://huggingface.co/gpt2-large/resolve/main/merges.txt): 50001行,https://huggingface.co/gpt2/resolve/main/merges.txt
42
  - merges.txts是否包含所有的组合?https://github.com/huggingface/transformers/issues/4777
 
 
43
 
44
- ### fairseq = 官方
45
-
46
- - vocab.bpe:50001行
47
- - encoder.json: 50257个kv-pair
48
- - dict.txt: 50260行 是纯数字的,是由fairseq-preprocess生成的 https://github.com/pytorch/fairseq/issues/1186
49
-
50
-
51
- - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
52
- - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
53
- - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
54
-
55
 
56
- # 相关疑问
57
-
58
- ### Ġ是什么
59
-
60
- It's a feature of byte-level BPE(an encoded space character).
61
- Ġ 表示空格,有的版本用Ä代替Ġ。
62
-
63
-
64
- ```
65
- What's up with the tokenizer?
66
- # BPE后
67
- ['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
68
- # 经过vocab.json编码后
69
- [ 2061, 338, 510, 351, 262, 11241, 7509, 30]
70
- # 经过dict.txt编码后(fairseq特有)
71
- [ 其他数字 ]
72
- ```
73
- 疑问:up会加Ġ,为什么what不加Ġ
74
 
 
 
 
 
 
75
 
76
- - https://github.com/pytorch/fairseq/issues/1716
77
- - https://github.com/huggingface/transformers/issues/1083
78
 
 
79
 
80
 
81
 
 
40
  - [vocab.json](https://huggingface.co/gpt2-large/resolve/main/vocab.json): 50257个kv-pair. https://huggingface.co/gpt2/resolve/main/vocab.json
41
  - [merges.txt](https://huggingface.co/gpt2-large/resolve/main/merges.txt): 50001行,https://huggingface.co/gpt2/resolve/main/merges.txt
42
  - merges.txts是否包含所有的组合?https://github.com/huggingface/transformers/issues/4777
43
+ - [tokenizer.json](https://huggingface.co/openai-community/gpt2-large/blob/main/tokenizer.json)
44
+ - 这个是给
45
 
46
+ Vocabulary loading: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/tokenization_gpt2.py
 
 
 
 
 
 
 
 
 
 
47
 
48
+ ### fairseq = 官方
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ - [vocab.bpe](https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe):50001行
51
+ - 等于 hf的 `merges.txt`
52
+ - [encoder.json](https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json): 50257个kv-pair
53
+ - 等于 hf的 `vocab.json`
54
+ - [dict.txt](https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt): 50260行 这是词频,是由fairseq-preprocess生成的 https://github.com/pytorch/fairseq/issues/1186
55
 
 
 
56
 
57
+ 词典加载 https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/tokenization_gpt2.py
58
 
59
 
60
 
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -6,7 +6,6 @@ import tiktoken
6
  import tokenizer.tiktoken_patch
7
 
8
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
9
- tokenizer.vocab_size = tokenizer.n_vocab
10
 
11
  tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
12
  tokenizer.reversible = True # It's reversible and lossless, so you can convert tokens back into the original text
 
6
  import tokenizer.tiktoken_patch
7
 
8
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 
9
 
10
  tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
11
  tokenizer.reversible = True # It's reversible and lossless, so you can convert tokens back into the original text
vocab/gpt_35_turbo/decode_test.py CHANGED
@@ -9,5 +9,12 @@ encoding = tokenizer.encode(text)
9
  print(tokenizer.decode([6744]))
10
  print(tokenizer.convert_ids_to_tokens([6744]))
11
 
12
- print(tokenizer.decode([100256]))
13
- print(tokenizer.convert_ids_to_tokens([100256]))
 
 
9
  print(tokenizer.decode([6744]))
10
  print(tokenizer.convert_ids_to_tokens([6744]))
11
 
12
+ print(tokenizer.decode([100256])) # does this token not exist in the vocab?
13
+ print(tokenizer.convert_ids_to_tokens([100256]))
14
+
15
+
16
+ print(tokenizer.decode([100262]))
17
+ print(tokenizer.convert_ids_to_tokens([100262]))
18
+
19
+ print(tokenizer.decode([100273]))
20
+ print(tokenizer.convert_ids_to_tokens([100273]))
vocab/gpt_35_turbo/test_tiktoken.py CHANGED
@@ -9,15 +9,18 @@ https://github.com/openai/tiktoken
9
 
10
  import json
11
  import tiktoken
 
12
 
13
 
14
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
15
  text = "你好,请告诉我聚乙烯是什么"
16
  # text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
17
- encoding = tokenizer.encode(text)
 
18
  decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
19
  print(encoding)
20
  print(decoding_bytes)
 
21
 
22
  # for token in tokens:
23
  # token_str = encoding.decode([token])
 
9
 
10
  import json
11
  import tiktoken
12
+ # from tokenizer import tiktoken_patch
13
 
14
 
15
  tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
16
  text = "你好,请告诉我聚乙烯是什么"
17
  # text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
18
+ text = "'<|endoftext|>"
19
+ encoding = tokenizer.encode(text, allowed_special="all")
20
  decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
21
  print(encoding)
22
  print(decoding_bytes)
23
+ # 100256
24
 
25
  # for token in tokens:
26
  # token_str = encoding.decode([token])
vocab/gpt_35_turbo/vocab.jsonl CHANGED
@@ -99964,3 +99964,314 @@
99964
  {"id": 99963, "token": "\" Geg\""}
99965
  {"id": 99964, "token": "\"\\tdto\""}
99966
  {"id": 99965, "token": "\".defaultValue\""}
 
 
 
 
99964
  {"id": 99963, "token": "\" Geg\""}
99965
  {"id": 99964, "token": "\"\\tdto\""}
99966
  {"id": 99965, "token": "\".defaultValue\""}
99967
+ {"id": 99966, "token": "\" Kami\""}
99968
+ {"id": 99967, "token": "\" ASE\""}
99969
+ {"id": 99968, "token": "\"optimized\""}
99970
+ {"id": 99969, "token": "\" \\ud3ec\""}
99971
+ {"id": 99970, "token": "\" originates\""}
99972
+ {"id": 99971, "token": "\"errMsg\""}
99973
+ {"id": 99972, "token": "\" espa\\u00e7o\""}
99974
+ {"id": 99973, "token": "\"(SYS\""}
99975
+ {"id": 99974, "token": "\" McB\""}
99976
+ {"id": 99975, "token": "\"dance\""}
99977
+ {"id": 99976, "token": "\"_detected\""}
99978
+ {"id": 99977, "token": "\" fr\\u00fc\""}
99979
+ {"id": 99978, "token": "\"\\t\\t \\t\\t\""}
99980
+ {"id": 99979, "token": "\"<Date\""}
99981
+ {"id": 99980, "token": "\"(comb\""}
99982
+ {"id": 99981, "token": "\" Decide\""}
99983
+ {"id": 99982, "token": "\"\\\\Field\""}
99984
+ {"id": 99983, "token": "\" Proposed\""}
99985
+ {"id": 99984, "token": "\"Rib\""}
99986
+ {"id": 99985, "token": "\" dislikes\""}
99987
+ {"id": 99986, "token": "\" Wien\""}
99988
+ {"id": 99987, "token": "\"\\tDocument\""}
99989
+ {"id": 99988, "token": "\" traf\""}
99990
+ {"id": 99989, "token": "\" storia\""}
99991
+ {"id": 99990, "token": "\" Tells\""}
99992
+ {"id": 99991, "token": "\"')==\""}
99993
+ {"id": 99992, "token": "\"Cri\""}
99994
+ {"id": 99993, "token": "\"(VALUE\""}
99995
+ {"id": 99994, "token": "\" Burnett\""}
99996
+ {"id": 99995, "token": "\",void\""}
99997
+ {"id": 99996, "token": "\" danh\""}
99998
+ {"id": 99997, "token": "\" ccp\""}
99999
+ {"id": 99998, "token": "\"Blockchain\""}
100000
+ {"id": 99999, "token": "\":\\\"-\\\"`\\n\""}
100001
+ {"id": 100000, "token": "\"IClient\""}
100002
+ {"id": 100001, "token": "\"ISODE\""}
100003
+ {"id": 100002, "token": "\"Issuer\""}
100004
+ {"id": 100003, "token": "\")}\\r\\n\""}
100005
+ {"id": 100004, "token": "\",but\""}
100006
+ {"id": 100005, "token": "\" Uph\""}
100007
+ {"id": 100006, "token": "\"(Sub\""}
100008
+ {"id": 100007, "token": "\" t\\u00e9l\\u00e9phone\""}
100009
+ {"id": 100008, "token": "\" onDataChange\""}
100010
+ {"id": 100009, "token": "\" marshaller\""}
100011
+ {"id": 100010, "token": "\"-analytics\""}
100012
+ {"id": 100011, "token": "\",content\""}
100013
+ {"id": 100012, "token": "\" debacle\""}
100014
+ {"id": 100013, "token": "\"_ValueChanged\""}
100015
+ {"id": 100014, "token": "\" fauna\""}
100016
+ {"id": 100015, "token": "\" #=>\""}
100017
+ {"id": 100016, "token": "\" foyer\""}
100018
+ {"id": 100017, "token": "\"'utilisation\""}
100019
+ {"id": 100018, "token": "\" M\\u00fcller\""}
100020
+ {"id": 100019, "token": "\" Fetish\""}
100021
+ {"id": 100020, "token": "\" defaultManager\""}
100022
+ {"id": 100021, "token": "\" backtrack\""}
100023
+ {"id": 100022, "token": "\"Bah\""}
100024
+ {"id": 100023, "token": "\"Explicit\""}
100025
+ {"id": 100024, "token": "\"_ASCII\""}
100026
+ {"id": 100025, "token": "\" mActivity\""}
100027
+ {"id": 100026, "token": "\"(Msg\""}
100028
+ {"id": 100027, "token": "\" \\uac8c\""}
100029
+ {"id": 100028, "token": "\" TERMS\""}
100030
+ {"id": 100029, "token": "\" Angie\""}
100031
+ {"id": 100030, "token": "\"HSV\""}
100032
+ {"id": 100031, "token": "\" Mosque\""}
100033
+ {"id": 100032, "token": "\".Names\""}
100034
+ {"id": 100033, "token": "\"\\ud2bc\""}
100035
+ {"id": 100034, "token": "\"reste\""}
100036
+ {"id": 100035, "token": "\"_parms\""}
100037
+ {"id": 100036, "token": "\" gaping\""}
100038
+ {"id": 100037, "token": "\" cropping\""}
100039
+ {"id": 100038, "token": "\"DataFrame\""}
100040
+ {"id": 100039, "token": "\" responsiveness\""}
100041
+ {"id": 100040, "token": "\"_undo\""}
100042
+ {"id": 100041, "token": "\"_tran\""}
100043
+ {"id": 100042, "token": "\".terminate\""}
100044
+ {"id": 100043, "token": "\" italiane\""}
100045
+ {"id": 100044, "token": "\" walkthrough\""}
100046
+ {"id": 100045, "token": "\" attractiveness\""}
100047
+ {"id": 100046, "token": "\"\\u0434\\u0435\""}
100048
+ {"id": 100047, "token": "\"_STS\""}
100049
+ {"id": 100048, "token": "\"_learn\""}
100050
+ {"id": 100049, "token": "\" chocolates\""}
100051
+ {"id": 100050, "token": "\"ierarchical\""}
100052
+ {"id": 100051, "token": "\"-thinking\""}
100053
+ {"id": 100052, "token": "\" )))\""}
100054
+ {"id": 100053, "token": "\"ishments\""}
100055
+ {"id": 100054, "token": "\".Logf\""}
100056
+ {"id": 100055, "token": "\" TMZ\""}
100057
+ {"id": 100056, "token": "\" Canary\""}
100058
+ {"id": 100057, "token": "\"foil\""}
100059
+ {"id": 100058, "token": "\" Vaccine\""}
100060
+ {"id": 100059, "token": "\".vx\""}
100061
+ {"id": 100060, "token": "\" Surround\""}
100062
+ {"id": 100061, "token": "\"Intermediate\""}
100063
+ {"id": 100062, "token": "\" iov\""}
100064
+ {"id": 100063, "token": "\"vais\""}
100065
+ {"id": 100064, "token": "\"';\\\";\\n\""}
100066
+ {"id": 100065, "token": "\"\\uff5e\\n\\n\""}
100067
+ {"id": 100066, "token": "\"\\u9001\\u6599\""}
100068
+ {"id": 100067, "token": "\"\\u2026it\""}
100069
+ {"id": 100068, "token": "\"Seats\""}
100070
+ {"id": 100069, "token": "\"Clar\""}
100071
+ {"id": 100070, "token": "\"Wars\""}
100072
+ {"id": 100071, "token": "\" Hutchinson\""}
100073
+ {"id": 100072, "token": "\" Hasan\""}
100074
+ {"id": 100073, "token": "\"!')\\n\\n\""}
100075
+ {"id": 100074, "token": "\" Richie\""}
100076
+ {"id": 100075, "token": "\"cheiden\""}
100077
+ {"id": 100076, "token": "\"($('\""}
100078
+ {"id": 100077, "token": "\"York\""}
100079
+ {"id": 100078, "token": "\" lids\""}
100080
+ {"id": 100079, "token": "\" alphanumeric\""}
100081
+ {"id": 100080, "token": "\" Glock\""}
100082
+ {"id": 100081, "token": "\".shapes\""}
100083
+ {"id": 100082, "token": "\" sparking\""}
100084
+ {"id": 100083, "token": "\"_epsilon\""}
100085
+ {"id": 100084, "token": "\"uplicated\""}
100086
+ {"id": 100085, "token": "\".dirty\""}
100087
+ {"id": 100086, "token": "\"])==\""}
100088
+ {"id": 100087, "token": "\" \\uc704\\uce58\""}
100089
+ {"id": 100088, "token": "\" scn\""}
100090
+ {"id": 100089, "token": "\" /****************************************************************\""}
100091
+ {"id": 100090, "token": "\"_PREVIEW\""}
100092
+ {"id": 100091, "token": "\"_HC\""}
100093
+ {"id": 100092, "token": "\"ielding\""}
100094
+ {"id": 100093, "token": "\"fgets\""}
100095
+ {"id": 100094, "token": "\" Addison\""}
100096
+ {"id": 100095, "token": "\" productService\""}
100097
+ {"id": 100096, "token": "\"-figure\""}
100098
+ {"id": 100097, "token": "\"(retval\""}
100099
+ {"id": 100098, "token": "\"zano\""}
100100
+ {"id": 100099, "token": "\" autob\""}
100101
+ {"id": 100100, "token": "\"\\tsd\""}
100102
+ {"id": 100101, "token": "\"_numer\""}
100103
+ {"id": 100102, "token": "\" SetLastError\""}
100104
+ {"id": 100103, "token": "\" Fior\""}
100105
+ {"id": 100104, "token": "\"ificance\""}
100106
+ {"id": 100105, "token": "\"Untitled\""}
100107
+ {"id": 100106, "token": "\" infield\""}
100108
+ {"id": 100107, "token": "\" {}));\\n\""}
100109
+ {"id": 100108, "token": "\" spac\""}
100110
+ {"id": 100109, "token": "\" rookies\""}
100111
+ {"id": 100110, "token": "\"(describing\""}
100112
+ {"id": 100111, "token": "\"ngen\""}
100113
+ {"id": 100112, "token": "\"\\u0bbf\\ufffd\""}
100114
+ {"id": 100113, "token": "\".rdf\""}
100115
+ {"id": 100114, "token": "\".Mutex\""}
100116
+ {"id": 100115, "token": "\" kneeling\""}
100117
+ {"id": 100116, "token": "\" QE\""}
100118
+ {"id": 100117, "token": "\"setMax\""}
100119
+ {"id": 100118, "token": "\"ReadStream\""}
100120
+ {"id": 100119, "token": "\" ventas\""}
100121
+ {"id": 100120, "token": "\"sut\""}
100122
+ {"id": 100121, "token": "\"cmpeq\""}
100123
+ {"id": 100122, "token": "\".WriteAllText\""}
100124
+ {"id": 100123, "token": "\" Experienced\""}
100125
+ {"id": 100124, "token": "\"$__\""}
100126
+ {"id": 100125, "token": "\" kaum\""}
100127
+ {"id": 100126, "token": "\" LIS\""}
100128
+ {"id": 100127, "token": "\" documentos\""}
100129
+ {"id": 100128, "token": "\"_HEALTH\""}
100130
+ {"id": 100129, "token": "\"icontains\""}
100131
+ {"id": 100130, "token": "\" artisans\""}
100132
+ {"id": 100131, "token": "\"OWNER\""}
100133
+ {"id": 100132, "token": "\" blinked\""}
100134
+ {"id": 100133, "token": "\"getDisplay\""}
100135
+ {"id": 100134, "token": "\" toen\""}
100136
+ {"id": 100135, "token": "\" rowNum\""}
100137
+ {"id": 100136, "token": "\" avril\""}
100138
+ {"id": 100137, "token": "\" invis\""}
100139
+ {"id": 100138, "token": "\" Kear\""}
100140
+ {"id": 100139, "token": "\"toBeInTheDocument\""}
100141
+ {"id": 100140, "token": "\"apur\""}
100142
+ {"id": 100141, "token": "\" racked\""}
100143
+ {"id": 100142, "token": "\" McMaster\""}
100144
+ {"id": 100143, "token": "\"_ATTRIB\""}
100145
+ {"id": 100144, "token": "\"Haz\""}
100146
+ {"id": 100145, "token": "\" factura\""}
100147
+ {"id": 100146, "token": "\"/ts\""}
100148
+ {"id": 100147, "token": "\" \\u0440\\u0430\\u0437\\u043c\\u0435\\u0440\""}
100149
+ {"id": 100148, "token": "\" zf\""}
100150
+ {"id": 100149, "token": "\" shortfall\""}
100151
+ {"id": 100150, "token": "\".fasta\""}
100152
+ {"id": 100151, "token": "\" CONSTANT\""}
100153
+ {"id": 100152, "token": "\".managed\""}
100154
+ {"id": 100153, "token": "\"gems\""}
100155
+ {"id": 100154, "token": "\"SharedPointer\""}
100156
+ {"id": 100155, "token": "\" blurry\""}
100157
+ {"id": 100156, "token": "\"brightness\""}
100158
+ {"id": 100157, "token": "\"(components\""}
100159
+ {"id": 100158, "token": "\" ...\\\"\\n\\n\""}
100160
+ {"id": 100159, "token": "\"SELL\""}
100161
+ {"id": 100160, "token": "\" Illustrator\""}
100162
+ {"id": 100161, "token": "\".getChannel\""}
100163
+ {"id": 100162, "token": "\" trouv\\u00e9\""}
100164
+ {"id": 100163, "token": "\"ysters\""}
100165
+ {"id": 100164, "token": "\" vois\""}
100166
+ {"id": 100165, "token": "\" Linden\""}
100167
+ {"id": 100166, "token": "\" emojis\""}
100168
+ {"id": 100167, "token": "\" brawl\""}
100169
+ {"id": 100168, "token": "\" MSR\""}
100170
+ {"id": 100169, "token": "\" Elo\""}
100171
+ {"id": 100170, "token": "\" Croatian\""}
100172
+ {"id": 100171, "token": "\"PopupMenu\""}
100173
+ {"id": 100172, "token": "\"Lewis\""}
100174
+ {"id": 100173, "token": "\".JWT\""}
100175
+ {"id": 100174, "token": "\" astonished\""}
100176
+ {"id": 100175, "token": "\"Bush\""}
100177
+ {"id": 100176, "token": "\"(itemId\""}
100178
+ {"id": 100177, "token": "\" detachment\""}
100179
+ {"id": 100178, "token": "\" Encore\""}
100180
+ {"id": 100179, "token": "\"\\u5c14\""}
100181
+ {"id": 100180, "token": "\" rekl\""}
100182
+ {"id": 100181, "token": "\" cram\""}
100183
+ {"id": 100182, "token": "\")$/\""}
100184
+ {"id": 100183, "token": "\".getHost\""}
100185
+ {"id": 100184, "token": "\"_recommend\""}
100186
+ {"id": 100185, "token": "\"-HT\""}
100187
+ {"id": 100186, "token": "\"_calibration\""}
100188
+ {"id": 100187, "token": "\"Authenticate\""}
100189
+ {"id": 100188, "token": "\".firebaseapp\""}
100190
+ {"id": 100189, "token": "\"UNIX\""}
100191
+ {"id": 100190, "token": "\"\\tCamera\""}
100192
+ {"id": 100191, "token": "\" HEAP\""}
100193
+ {"id": 100192, "token": "\"Ideal\""}
100194
+ {"id": 100193, "token": "\".office\""}
100195
+ {"id": 100194, "token": "\" goofy\""}
100196
+ {"id": 100195, "token": "\"(Symbol\""}
100197
+ {"id": 100196, "token": "\" jouer\""}
100198
+ {"id": 100197, "token": "\"_partitions\""}
100199
+ {"id": 100198, "token": "\" rapidement\""}
100200
+ {"id": 100199, "token": "\" GNUNET\""}
100201
+ {"id": 100200, "token": "\"idUser\""}
100202
+ {"id": 100201, "token": "\" supervise\""}
100203
+ {"id": 100202, "token": "\"(Contact\""}
100204
+ {"id": 100203, "token": "\"AWN\""}
100205
+ {"id": 100204, "token": "\"\\u3058\""}
100206
+ {"id": 100205, "token": "\" naam\""}
100207
+ {"id": 100206, "token": "\" aust\""}
100208
+ {"id": 100207, "token": "\"\\u5728\\u7ebf\""}
100209
+ {"id": 100208, "token": "\"_softmax\""}
100210
+ {"id": 100209, "token": "\"AllowAnonymous\""}
100211
+ {"id": 100210, "token": "\"ammable\""}
100212
+ {"id": 100211, "token": "\"ROUTE\""}
100213
+ {"id": 100212, "token": "\"*D\""}
100214
+ {"id": 100213, "token": "\" aden\""}
100215
+ {"id": 100214, "token": "\" Cristina\""}
100216
+ {"id": 100215, "token": "\" Cristiano\""}
100217
+ {"id": 100216, "token": "\" bloodstream\""}
100218
+ {"id": 100217, "token": "\"subclass\""}
100219
+ {"id": 100218, "token": "\"_persona\""}
100220
+ {"id": 100219, "token": "\"CHILD\""}
100221
+ {"id": 100220, "token": "\"-know\""}
100222
+ {"id": 100221, "token": "\" navigationOptions\""}
100223
+ {"id": 100222, "token": "\" Zukunft\""}
100224
+ {"id": 100223, "token": "\" Pixar\""}
100225
+ {"id": 100224, "token": "\"Tyler\""}
100226
+ {"id": 100225, "token": "\" underworld\""}
100227
+ {"id": 100226, "token": "\" sincerity\""}
100228
+ {"id": 100227, "token": "\" dispenser\""}
100229
+ {"id": 100228, "token": "\" kter\""}
100230
+ {"id": 100229, "token": "\"idders\""}
100231
+ {"id": 100230, "token": "\".addNode\""}
100232
+ {"id": 100231, "token": "\"-checked\""}
100233
+ {"id": 100232, "token": "\" keyst\""}
100234
+ {"id": 100233, "token": "\" WTO\""}
100235
+ {"id": 100234, "token": "\".signals\""}
100236
+ {"id": 100235, "token": "\" adventurer\""}
100237
+ {"id": 100236, "token": "\" Pang\""}
100238
+ {"id": 100237, "token": "\"\\\\R\""}
100239
+ {"id": 100238, "token": "\"=pos\""}
100240
+ {"id": 100239, "token": "\" dispensaries\""}
100241
+ {"id": 100240, "token": "\" Closet\""}
100242
+ {"id": 100241, "token": "\"(\\\"{\\\\\\\"\""}
100243
+ {"id": 100242, "token": "\"ideon\""}
100244
+ {"id": 100243, "token": "\" n\\u00e9cessaire\""}
100245
+ {"id": 100244, "token": "\"()\\\"\\n\""}
100246
+ {"id": 100245, "token": "\"_RECEIVED\""}
100247
+ {"id": 100246, "token": "\" r\\u00e9sultats\""}
100248
+ {"id": 100247, "token": "\" moden\""}
100249
+ {"id": 100248, "token": "\" Icelandic\""}
100250
+ {"id": 100249, "token": "\";d\""}
100251
+ {"id": 100250, "token": "\".allowed\""}
100252
+ {"id": 100251, "token": "\"(newUser\""}
100253
+ {"id": 100252, "token": "\" merciless\""}
100254
+ {"id": 100253, "token": "\".WaitFor\""}
100255
+ {"id": 100254, "token": "\" daycare\""}
100256
+ {"id": 100255, "token": "\" Conveyor\""}
100257
+ {"id": 100256, "token": "\"null\""}
100258
+ {"id": 100257, "token": "\"<|endoftext|>\""}
100259
+ {"id": 100258, "token": "\"<|fim_prefix|>\""}
100260
+ {"id": 100259, "token": "\"<|fim_middle|>\""}
100261
+ {"id": 100260, "token": "\"<|fim_suffix|>\""}
100262
+ {"id": 100261, "token": "\"null\""}
100263
+ {"id": 100262, "token": "\"null\""}
100264
+ {"id": 100263, "token": "\"null\""}
100265
+ {"id": 100264, "token": "\"null\""}
100266
+ {"id": 100265, "token": "\"null\""}
100267
+ {"id": 100266, "token": "\"null\""}
100268
+ {"id": 100267, "token": "\"null\""}
100269
+ {"id": 100268, "token": "\"null\""}
100270
+ {"id": 100269, "token": "\"null\""}
100271
+ {"id": 100270, "token": "\"null\""}
100272
+ {"id": 100271, "token": "\"null\""}
100273
+ {"id": 100272, "token": "\"null\""}
100274
+ {"id": 100273, "token": "\"null\""}
100275
+ {"id": 100274, "token": "\"null\""}
100276
+ {"id": 100275, "token": "\"null\""}
100277
+ {"id": 100276, "token": "\"<|endofprompt|>\""}
vocab/gpt_nexo_20b/README.md CHANGED
@@ -18,11 +18,13 @@ self.padded_vocab_size = 50304
18
 
19
  padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
20
 
 
 
21
  ## Vocabulary
22
 
23
  See convert_vocab_to_txt.py
24
 
25
- ```
26
  {"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"} 中
27
 
28
  # multiple symbols concatenated together
@@ -30,8 +32,16 @@ padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
30
 
31
  # ss
32
 
 
 
 
 
 
33
  ```
34
 
 
 
 
35
  ## special_tokens
36
 
37
  https://huggingface.co/EleutherAI/gpt-neox-20b/blob/main/special_tokens_map.json
@@ -83,4 +93,7 @@ gpt-neox was trained on an 800G English dataset, so why does its vocabulary support Chinese? Because
83
  "ard less",
84
 
85
 
 
 
 
86
 
 
18
 
19
  padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
20
 
21
+
22
+
23
  ## Vocabulary
24
 
25
  See convert_vocab_to_txt.py
26
 
27
+ ```sh
28
  {"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"} 中
29
 
30
  # multiple symbols concatenated together
 
32
 
33
  # ss
34
 
35
+
36
+
37
+ # basic bytes
38
+ (\u0021-\u007E) + (\u00A1-\u0143)
39
+
40
  ```
41
 
42
+
43
+
44
+
45
  ## special_tokens
46
 
47
  https://huggingface.co/EleutherAI/gpt-neox-20b/blob/main/special_tokens_map.json
 
93
  "ard less",
94
 
95
 
96
+ ## HF format
97
+
98
+ https://huggingface.co/EleutherAI/gpt-neox-20b/tree/main
99
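The "basic bytes" note added above, `(\u0021-\u007E) + (\u00A1-\u0143)`, is the alphabet of the GPT-2-style byte-to-unicode remapping used by the GPT-NeoX BPE: printable bytes map to themselves and the remaining bytes are shifted to unused code points up to U+0143. A sketch of that table (standard GPT-2 logic, not code from this repo), which also explains the `{"id": 13609, ...}` example:

```python
def bytes_to_unicode():
    # bytes that map to themselves: '!'..'~', '¡'..'¬', '®'..'ÿ'
    bs = list(range(0x21, 0x7F)) + list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:          # remaining bytes are remapped to 0x100, 0x101, ...
            bs.append(b)
            cs.append(0x100 + n)
            n += 1
    return dict(zip(bs, [chr(c) for c in cs]))

table = bytes_to_unicode()
# "中" is UTF-8 bytes e4 b8 ad, which appear in the vocab as "ä¸Ń" (\u00e4\u00b8\u0143)
print("".join(table[b] for b in "中".encode("utf-8")))
```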
 
vocab/gpt_nexo_20b/test_tokenizer.py CHANGED
@@ -12,17 +12,60 @@ print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_to
12
 
13
  vocab = tokenizer.get_vocab()
14
15
 
16
  def test_single_token():
17
  """
18
  Encoding of single characters (one character may be encoded into multiple ids)
19
  """
20
- for word in "发大厦三分赛中国解决方法黑白侗鸩,。!?;":
21
  encoding = tokenizer.encode(word)
22
  for token_id in encoding.ids:
23
  decode_str = tokenizer.decode([token_id]) # special characters all decode to �, corresponding to "\ufffd"
24
  token = tokenizer.id_to_token(token_id)
25
- print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
26
 
27
 
28
  def test_long_token():
@@ -53,6 +96,7 @@ def test_encode():
53
  print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
54
 
55
 
56
- test_single_token()
 
57
  # test_long_token()
58
  # test_encode()
 
12
 
13
  vocab = tokenizer.get_vocab()
14
 
15
+ def to_unicode(text):
16
+ return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
17
+
18
+
19
+ def is_UTF_8(str):
20
+ remain = 0 # number of remaining continuation bytes
21
+ for x in range(len(str)):
22
+ if remain == 0:
23
+ if (ord(str[x]) & 0x80) == 0x00:
24
+ remain = 0
25
+ elif (ord(str[x]) & 0xE0) == 0xC0:
26
+ remain = 1
27
+ elif (ord(str[x]) & 0xF0) == 0xE0:
28
+ remain = 2
29
+ elif (ord(str[x]) & 0xF8) == 0xF0:
30
+ remain = 3
31
+ else:
32
+ return False
33
+ else:
34
+ if not ((ord(str[x]) & 0xC0) == 0x80):
35
+ return False
36
+ remain = remain - 1
37
+ if remain == 0: # if remain is non-zero at the end, the last sequence was not completed
38
+ return True
39
+ else:
40
+ return False
41
+
42
+
43
+
44
+ def test_reverse():
45
+ f_out = open("reverse.jsonl", "w", encoding="utf-8")
46
+ for token_id in range(tokenizer.get_vocab_size(with_added_tokens=False)):
47
+ token = tokenizer.id_to_token(token_id)
48
+ print(token_id, is_UTF_8(token))
49
+ if "Ġ" in token:
50
+ continue
51
+
52
+
53
+ encoding = tokenizer.encode(token)
54
+ if len(encoding.ids) > 1 or encoding.ids[0] != token_id:
55
+ f_out.write(json.dumps({"id": token_id, "token": token, "encoding": encoding.ids, "is_utf8": is_UTF_8(token), "isalpha": token.isalpha()}) + "\n")
56
+
57
+
58
 
59
  def test_single_token():
60
  """
61
  Encoding of single characters (one character may be encoded into multiple ids)
62
  """
63
+ for word in "发大厦三分赛中国解决方法黑白侗鸩,。!?;ĠABC":
64
  encoding = tokenizer.encode(word)
65
  for token_id in encoding.ids:
66
  decode_str = tokenizer.decode([token_id]) # special characters all decode to �, corresponding to "\ufffd"
67
  token = tokenizer.id_to_token(token_id)
68
+ print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token), token.encode("utf-8"), bytes(token, "utf-8"), to_unicode(token))
69
 
70
 
71
  def test_long_token():
 
96
  print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
97
 
98
 
99
+ test_reverse()
100
+ # test_single_token()
101
  # test_long_token()
102
  # test_encode()
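The is_UTF_8 helper added above inspects ord() of each character in the vocab string, which does not always equal the underlying byte for the remapped code points (U+0100-U+0143). A hypothetical alternative, assuming a byte_decoder (the inverse of the GPT-2 byte-to-unicode table, not defined in this file) is available:

```python
def is_valid_utf8(token: str, byte_decoder: dict) -> bool:
    """Map a byte-level BPE token back to raw bytes and let Python's codec decide."""
    try:
        bytes(byte_decoder[ch] for ch in token).decode("utf-8")
        return True
    except (KeyError, UnicodeDecodeError):
        return False
```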
vocab/gpt_nexo_20b/tokenzier_hf/README.md DELETED
@@ -1,6 +0,0 @@
1
-
2
- ## HF format
3
-
4
- https://huggingface.co/EleutherAI/gpt-neox-20b/tree/main
5
-
6
-
 
 
 
 
 
 
 
vocab/jamba_v0_1/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+
4
+ Jamba-v0.1
5
+ """
6
+
7
+ from transformers import AutoTokenizer
8
+
9
+ tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")
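A quick usage sketch (not part of the committed __init__.py) for inspecting the Jamba-v0.1 tokenizer loaded above; vocab_size, tokenize and all_special_tokens are standard transformers tokenizer attributes:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")
print(tokenizer.vocab_size)                # size of the base vocabulary
print(tokenizer.tokenize("hello world"))   # subword pieces for a short sample
print(tokenizer.all_special_tokens)        # registered special tokens
```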
vocab/kplug/__init__.py CHANGED
@@ -2,4 +2,4 @@
2
  from transformers import BertTokenizer
3
 
4
  tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-encoder")
5
- print(tokenizer)
 
2
  from transformers import BertTokenizer
3
 
4
  tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-encoder")
5
+
vocab/llama/gpt_neox/get_oov_zh_tokens.py CHANGED
@@ -1,5 +1,5 @@
1
 
2
- from utils.zh_util import is_chinese
3
  from transformers import LlamaTokenizer
4
  llama_vocab = LlamaTokenizer.from_pretrained("../tokenizer").get_vocab()
5
 
@@ -14,7 +14,7 @@ for token, token_id in vocab.items():
14
  # token = token.strip("Ġ")
15
  if len(token) < 1:
16
  continue
17
- if is_chinese(token[0]):
18
  if token not in llama_vocab:
19
  f_out.write(token + "\n")
20
 
 
1
 
2
+ from utils.zh_util import is_zh_char
3
  from transformers import LlamaTokenizer
4
  llama_vocab = LlamaTokenizer.from_pretrained("../tokenizer").get_vocab()
5
 
 
14
  # token = token.strip("Ġ")
15
  if len(token) < 1:
16
  continue
17
+ if is_zh_char(token[0]):
18
  if token not in llama_vocab:
19
  f_out.write(token + "\n")
20
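The renamed helper is_zh_char is imported from utils.zh_util, which is not shown in this diff. A plausible stand-in (an assumption, not the repo's actual implementation) checks the basic CJK Unified Ideographs block:

```python
# Hypothetical stand-in for utils.zh_util.is_zh_char; the real implementation is not in this diff.
def is_zh_char(ch: str) -> bool:
    return "\u4e00" <= ch <= "\u9fff"
```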
 
vocab/llama3/Meta-Llama-3-70B/special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|begin_of_text|>",
3
+ "eos_token": "<|end_of_text|>"
4
+ }
vocab/llama3/Meta-Llama-3-70B/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ac333c83e2d107910928928b5912d8ade91594d08c7c73c4606d05c032d7632
3
+ size 9084463
vocab/llama3/Meta-Llama-3-70B/tokenizer_config.json ADDED
@@ -0,0 +1,2062 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "128000": {
4
+ "content": "<|begin_of_text|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "128001": {
12
+ "content": "<|end_of_text|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "128002": {
20
+ "content": "<|reserved_special_token_0|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "128003": {
28
+ "content": "<|reserved_special_token_1|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "128004": {
36
+ "content": "<|reserved_special_token_2|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "128005": {
44
+ "content": "<|reserved_special_token_3|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "128006": {
52
+ "content": "<|start_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "128007": {
60
+ "content": "<|end_header_id|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "128008": {
68
+ "content": "<|reserved_special_token_4|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "128009": {
76
+ "content": "<|eot_id|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "128010": {
84
+ "content": "<|reserved_special_token_5|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "128011": {
92
+ "content": "<|reserved_special_token_6|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "128012": {
100
+ "content": "<|reserved_special_token_7|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "128013": {
108
+ "content": "<|reserved_special_token_8|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "128014": {
116
+ "content": "<|reserved_special_token_9|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "128015": {
124
+ "content": "<|reserved_special_token_10|>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "128016": {
132
+ "content": "<|reserved_special_token_11|>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "128017": {
140
+ "content": "<|reserved_special_token_12|>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "128018": {
148
+ "content": "<|reserved_special_token_13|>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "128019": {
156
+ "content": "<|reserved_special_token_14|>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "128020": {
164
+ "content": "<|reserved_special_token_15|>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "128021": {
172
+ "content": "<|reserved_special_token_16|>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "128022": {
180
+ "content": "<|reserved_special_token_17|>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "128023": {
188
+ "content": "<|reserved_special_token_18|>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "128024": {
196
+ "content": "<|reserved_special_token_19|>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "128025": {
204
+ "content": "<|reserved_special_token_20|>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "128026": {
212
+ "content": "<|reserved_special_token_21|>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "128027": {
220
+ "content": "<|reserved_special_token_22|>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "128028": {
228
+ "content": "<|reserved_special_token_23|>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "128029": {
236
+ "content": "<|reserved_special_token_24|>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "128030": {
244
+ "content": "<|reserved_special_token_25|>",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "128031": {
252
+ "content": "<|reserved_special_token_26|>",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "128032": {
260
+ "content": "<|reserved_special_token_27|>",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "128033": {
268
+ "content": "<|reserved_special_token_28|>",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "128034": {
276
+ "content": "<|reserved_special_token_29|>",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "128035": {
284
+ "content": "<|reserved_special_token_30|>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "128036": {
292
+ "content": "<|reserved_special_token_31|>",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "128037": {
300
+ "content": "<|reserved_special_token_32|>",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "128038": {
308
+ "content": "<|reserved_special_token_33|>",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "128039": {
316
+ "content": "<|reserved_special_token_34|>",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "128040": {
324
+ "content": "<|reserved_special_token_35|>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "128041": {
332
+ "content": "<|reserved_special_token_36|>",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "128042": {
340
+ "content": "<|reserved_special_token_37|>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "128043": {
348
+ "content": "<|reserved_special_token_38|>",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "128044": {
356
+ "content": "<|reserved_special_token_39|>",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "128045": {
364
+ "content": "<|reserved_special_token_40|>",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "128046": {
372
+ "content": "<|reserved_special_token_41|>",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "128047": {
380
+ "content": "<|reserved_special_token_42|>",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "128048": {
388
+ "content": "<|reserved_special_token_43|>",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "128049": {
396
+ "content": "<|reserved_special_token_44|>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "128050": {
404
+ "content": "<|reserved_special_token_45|>",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "128051": {
412
+ "content": "<|reserved_special_token_46|>",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "128052": {
420
+ "content": "<|reserved_special_token_47|>",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "128053": {
428
+ "content": "<|reserved_special_token_48|>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "128054": {
436
+ "content": "<|reserved_special_token_49|>",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "128055": {
444
+ "content": "<|reserved_special_token_50|>",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "128056": {
452
+ "content": "<|reserved_special_token_51|>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "128057": {
460
+ "content": "<|reserved_special_token_52|>",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "128058": {
468
+ "content": "<|reserved_special_token_53|>",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "128059": {
476
+ "content": "<|reserved_special_token_54|>",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "128060": {
484
+ "content": "<|reserved_special_token_55|>",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "128061": {
492
+ "content": "<|reserved_special_token_56|>",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "128062": {
500
+ "content": "<|reserved_special_token_57|>",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "128063": {
508
+ "content": "<|reserved_special_token_58|>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "128064": {
516
+ "content": "<|reserved_special_token_59|>",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "128065": {
524
+ "content": "<|reserved_special_token_60|>",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "128066": {
532
+ "content": "<|reserved_special_token_61|>",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "128067": {
540
+ "content": "<|reserved_special_token_62|>",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "128068": {
548
+ "content": "<|reserved_special_token_63|>",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "128069": {
556
+ "content": "<|reserved_special_token_64|>",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "128070": {
564
+ "content": "<|reserved_special_token_65|>",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "128071": {
572
+ "content": "<|reserved_special_token_66|>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "128072": {
580
+ "content": "<|reserved_special_token_67|>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "128073": {
588
+ "content": "<|reserved_special_token_68|>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "128074": {
596
+ "content": "<|reserved_special_token_69|>",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "128075": {
604
+ "content": "<|reserved_special_token_70|>",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "128076": {
612
+ "content": "<|reserved_special_token_71|>",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "128077": {
620
+ "content": "<|reserved_special_token_72|>",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "128078": {
628
+ "content": "<|reserved_special_token_73|>",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "128079": {
636
+ "content": "<|reserved_special_token_74|>",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "128080": {
644
+ "content": "<|reserved_special_token_75|>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "128081": {
652
+ "content": "<|reserved_special_token_76|>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "128082": {
660
+ "content": "<|reserved_special_token_77|>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "128083": {
668
+ "content": "<|reserved_special_token_78|>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "128084": {
676
+ "content": "<|reserved_special_token_79|>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "128085": {
684
+ "content": "<|reserved_special_token_80|>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "128086": {
692
+ "content": "<|reserved_special_token_81|>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "128087": {
700
+ "content": "<|reserved_special_token_82|>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "128088": {
708
+ "content": "<|reserved_special_token_83|>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "128089": {
716
+ "content": "<|reserved_special_token_84|>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "128090": {
724
+ "content": "<|reserved_special_token_85|>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "128091": {
732
+ "content": "<|reserved_special_token_86|>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "128092": {
740
+ "content": "<|reserved_special_token_87|>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "128093": {
748
+ "content": "<|reserved_special_token_88|>",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "128094": {
756
+ "content": "<|reserved_special_token_89|>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "128095": {
764
+ "content": "<|reserved_special_token_90|>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "128096": {
772
+ "content": "<|reserved_special_token_91|>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "128097": {
780
+ "content": "<|reserved_special_token_92|>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "128098": {
788
+ "content": "<|reserved_special_token_93|>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "128099": {
796
+ "content": "<|reserved_special_token_94|>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "128100": {
804
+ "content": "<|reserved_special_token_95|>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "128101": {
812
+ "content": "<|reserved_special_token_96|>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "128102": {
820
+ "content": "<|reserved_special_token_97|>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "128103": {
828
+ "content": "<|reserved_special_token_98|>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "128104": {
836
+ "content": "<|reserved_special_token_99|>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "128105": {
844
+ "content": "<|reserved_special_token_100|>",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "128106": {
852
+ "content": "<|reserved_special_token_101|>",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "128107": {
860
+ "content": "<|reserved_special_token_102|>",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "128108": {
868
+ "content": "<|reserved_special_token_103|>",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "128109": {
876
+ "content": "<|reserved_special_token_104|>",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "128110": {
884
+ "content": "<|reserved_special_token_105|>",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "128111": {
892
+ "content": "<|reserved_special_token_106|>",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "128112": {
900
+ "content": "<|reserved_special_token_107|>",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "128113": {
908
+ "content": "<|reserved_special_token_108|>",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "128114": {
916
+ "content": "<|reserved_special_token_109|>",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "128115": {
924
+ "content": "<|reserved_special_token_110|>",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "128116": {
932
+ "content": "<|reserved_special_token_111|>",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "128117": {
940
+ "content": "<|reserved_special_token_112|>",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "128118": {
948
+ "content": "<|reserved_special_token_113|>",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "128119": {
956
+ "content": "<|reserved_special_token_114|>",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "128120": {
964
+ "content": "<|reserved_special_token_115|>",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "128121": {
972
+ "content": "<|reserved_special_token_116|>",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "128122": {
980
+ "content": "<|reserved_special_token_117|>",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "128123": {
988
+ "content": "<|reserved_special_token_118|>",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "128124": {
996
+ "content": "<|reserved_special_token_119|>",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "128125": {
1004
+ "content": "<|reserved_special_token_120|>",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "128126": {
1012
+ "content": "<|reserved_special_token_121|>",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "128127": {
1020
+ "content": "<|reserved_special_token_122|>",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
+ },
1027
+ "128128": {
1028
+ "content": "<|reserved_special_token_123|>",
1029
+ "lstrip": false,
1030
+ "normalized": false,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": true
1034
+ },
1035
+ "128129": {
1036
+ "content": "<|reserved_special_token_124|>",
1037
+ "lstrip": false,
1038
+ "normalized": false,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": true
1042
+ },
1043
+ "128130": {
1044
+ "content": "<|reserved_special_token_125|>",
1045
+ "lstrip": false,
1046
+ "normalized": false,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": true
1050
+ },
1051
+ "128131": {
1052
+ "content": "<|reserved_special_token_126|>",
1053
+ "lstrip": false,
1054
+ "normalized": false,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": true
1058
+ },
1059
+ "128132": {
1060
+ "content": "<|reserved_special_token_127|>",
1061
+ "lstrip": false,
1062
+ "normalized": false,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": true
1066
+ },
1067
+ "128133": {
1068
+ "content": "<|reserved_special_token_128|>",
1069
+ "lstrip": false,
1070
+ "normalized": false,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": true
1074
+ },
1075
+ "128134": {
1076
+ "content": "<|reserved_special_token_129|>",
1077
+ "lstrip": false,
1078
+ "normalized": false,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": true
1082
+ },
1083
+ "128135": {
1084
+ "content": "<|reserved_special_token_130|>",
1085
+ "lstrip": false,
1086
+ "normalized": false,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": true
1090
+ },
1091
+ "128136": {
1092
+ "content": "<|reserved_special_token_131|>",
1093
+ "lstrip": false,
1094
+ "normalized": false,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": true
1098
+ },
1099
+ "128137": {
1100
+ "content": "<|reserved_special_token_132|>",
1101
+ "lstrip": false,
1102
+ "normalized": false,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": true
1106
+ },
1107
+ "128138": {
1108
+ "content": "<|reserved_special_token_133|>",
1109
+ "lstrip": false,
1110
+ "normalized": false,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": true
1114
+ },
1115
+ "128139": {
1116
+ "content": "<|reserved_special_token_134|>",
1117
+ "lstrip": false,
1118
+ "normalized": false,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": true
1122
+ },
1123
+ "128140": {
1124
+ "content": "<|reserved_special_token_135|>",
1125
+ "lstrip": false,
1126
+ "normalized": false,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": true
1130
+ },
1131
+ "128141": {
1132
+ "content": "<|reserved_special_token_136|>",
1133
+ "lstrip": false,
1134
+ "normalized": false,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": true
1138
+ },
1139
+ "128142": {
1140
+ "content": "<|reserved_special_token_137|>",
1141
+ "lstrip": false,
1142
+ "normalized": false,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": true
1146
+ },
1147
+ "128143": {
1148
+ "content": "<|reserved_special_token_138|>",
1149
+ "lstrip": false,
1150
+ "normalized": false,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": true
1154
+ },
1155
+ "128144": {
1156
+ "content": "<|reserved_special_token_139|>",
1157
+ "lstrip": false,
1158
+ "normalized": false,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": true
1162
+ },
1163
+ "128145": {
1164
+ "content": "<|reserved_special_token_140|>",
1165
+ "lstrip": false,
1166
+ "normalized": false,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": true
1170
+ },
1171
+ "128146": {
1172
+ "content": "<|reserved_special_token_141|>",
1173
+ "lstrip": false,
1174
+ "normalized": false,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": true
1178
+ },
1179
+ "128147": {
1180
+ "content": "<|reserved_special_token_142|>",
1181
+ "lstrip": false,
1182
+ "normalized": false,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": true
1186
+ },
1187
+ "128148": {
1188
+ "content": "<|reserved_special_token_143|>",
1189
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "128149": {
1196
+ "content": "<|reserved_special_token_144|>",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "128150": {
1204
+ "content": "<|reserved_special_token_145|>",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "128151": {
1212
+ "content": "<|reserved_special_token_146|>",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "128152": {
1220
+ "content": "<|reserved_special_token_147|>",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "128153": {
1228
+ "content": "<|reserved_special_token_148|>",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "128154": {
1236
+ "content": "<|reserved_special_token_149|>",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "128155": {
1244
+ "content": "<|reserved_special_token_150|>",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "128156": {
1252
+ "content": "<|reserved_special_token_151|>",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "128157": {
1260
+ "content": "<|reserved_special_token_152|>",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "128158": {
1268
+ "content": "<|reserved_special_token_153|>",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "128159": {
1276
+ "content": "<|reserved_special_token_154|>",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "128160": {
1284
+ "content": "<|reserved_special_token_155|>",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "128161": {
1292
+ "content": "<|reserved_special_token_156|>",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "128162": {
1300
+ "content": "<|reserved_special_token_157|>",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "128163": {
1308
+ "content": "<|reserved_special_token_158|>",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "128164": {
1316
+ "content": "<|reserved_special_token_159|>",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "128165": {
1324
+ "content": "<|reserved_special_token_160|>",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "128166": {
1332
+ "content": "<|reserved_special_token_161|>",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "128167": {
1340
+ "content": "<|reserved_special_token_162|>",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "128168": {
1348
+ "content": "<|reserved_special_token_163|>",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "128169": {
1356
+ "content": "<|reserved_special_token_164|>",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "128170": {
1364
+ "content": "<|reserved_special_token_165|>",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "128171": {
1372
+ "content": "<|reserved_special_token_166|>",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "128172": {
1380
+ "content": "<|reserved_special_token_167|>",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "128173": {
1388
+ "content": "<|reserved_special_token_168|>",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "128174": {
1396
+ "content": "<|reserved_special_token_169|>",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "128175": {
1404
+ "content": "<|reserved_special_token_170|>",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "128176": {
1412
+ "content": "<|reserved_special_token_171|>",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "128177": {
1420
+ "content": "<|reserved_special_token_172|>",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "128178": {
1428
+ "content": "<|reserved_special_token_173|>",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "128179": {
1436
+ "content": "<|reserved_special_token_174|>",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "128180": {
1444
+ "content": "<|reserved_special_token_175|>",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "128181": {
1452
+ "content": "<|reserved_special_token_176|>",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "128182": {
1460
+ "content": "<|reserved_special_token_177|>",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "128183": {
1468
+ "content": "<|reserved_special_token_178|>",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "128184": {
1476
+ "content": "<|reserved_special_token_179|>",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "128185": {
1484
+ "content": "<|reserved_special_token_180|>",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "128186": {
1492
+ "content": "<|reserved_special_token_181|>",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "128187": {
1500
+ "content": "<|reserved_special_token_182|>",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "128188": {
1508
+ "content": "<|reserved_special_token_183|>",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "128189": {
1516
+ "content": "<|reserved_special_token_184|>",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "128190": {
1524
+ "content": "<|reserved_special_token_185|>",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "128191": {
1532
+ "content": "<|reserved_special_token_186|>",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "128192": {
1540
+ "content": "<|reserved_special_token_187|>",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "128193": {
1548
+ "content": "<|reserved_special_token_188|>",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "128194": {
1556
+ "content": "<|reserved_special_token_189|>",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "128195": {
1564
+ "content": "<|reserved_special_token_190|>",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "128196": {
1572
+ "content": "<|reserved_special_token_191|>",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "128197": {
1580
+ "content": "<|reserved_special_token_192|>",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "128198": {
1588
+ "content": "<|reserved_special_token_193|>",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "128199": {
1596
+ "content": "<|reserved_special_token_194|>",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "128200": {
1604
+ "content": "<|reserved_special_token_195|>",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "128201": {
1612
+ "content": "<|reserved_special_token_196|>",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "128202": {
1620
+ "content": "<|reserved_special_token_197|>",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "128203": {
1628
+ "content": "<|reserved_special_token_198|>",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "128204": {
1636
+ "content": "<|reserved_special_token_199|>",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "128205": {
1644
+ "content": "<|reserved_special_token_200|>",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "128206": {
1652
+ "content": "<|reserved_special_token_201|>",
1653
+ "lstrip": false,
1654
+ "normalized": false,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "128207": {
1660
+ "content": "<|reserved_special_token_202|>",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ },
1667
+ "128208": {
1668
+ "content": "<|reserved_special_token_203|>",
1669
+ "lstrip": false,
1670
+ "normalized": false,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": true
1674
+ },
1675
+ "128209": {
1676
+ "content": "<|reserved_special_token_204|>",
1677
+ "lstrip": false,
1678
+ "normalized": false,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": true
1682
+ },
1683
+ "128210": {
1684
+ "content": "<|reserved_special_token_205|>",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": true
1690
+ },
1691
+ "128211": {
1692
+ "content": "<|reserved_special_token_206|>",
1693
+ "lstrip": false,
1694
+ "normalized": false,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": true
1698
+ },
1699
+ "128212": {
1700
+ "content": "<|reserved_special_token_207|>",
1701
+ "lstrip": false,
1702
+ "normalized": false,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": true
1706
+ },
1707
+ "128213": {
1708
+ "content": "<|reserved_special_token_208|>",
1709
+ "lstrip": false,
1710
+ "normalized": false,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": true
1714
+ },
1715
+ "128214": {
1716
+ "content": "<|reserved_special_token_209|>",
1717
+ "lstrip": false,
1718
+ "normalized": false,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": true
1722
+ },
1723
+ "128215": {
1724
+ "content": "<|reserved_special_token_210|>",
1725
+ "lstrip": false,
1726
+ "normalized": false,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": true
1730
+ },
1731
+ "128216": {
1732
+ "content": "<|reserved_special_token_211|>",
1733
+ "lstrip": false,
1734
+ "normalized": false,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": true
1738
+ },
1739
+ "128217": {
1740
+ "content": "<|reserved_special_token_212|>",
1741
+ "lstrip": false,
1742
+ "normalized": false,
1743
+ "rstrip": false,
1744
+ "single_word": false,
1745
+ "special": true
1746
+ },
1747
+ "128218": {
1748
+ "content": "<|reserved_special_token_213|>",
1749
+ "lstrip": false,
1750
+ "normalized": false,
1751
+ "rstrip": false,
1752
+ "single_word": false,
1753
+ "special": true
1754
+ },
1755
+ "128219": {
1756
+ "content": "<|reserved_special_token_214|>",
1757
+ "lstrip": false,
1758
+ "normalized": false,
1759
+ "rstrip": false,
1760
+ "single_word": false,
1761
+ "special": true
1762
+ },
1763
+ "128220": {
1764
+ "content": "<|reserved_special_token_215|>",
1765
+ "lstrip": false,
1766
+ "normalized": false,
1767
+ "rstrip": false,
1768
+ "single_word": false,
1769
+ "special": true
1770
+ },
1771
+ "128221": {
1772
+ "content": "<|reserved_special_token_216|>",
1773
+ "lstrip": false,
1774
+ "normalized": false,
1775
+ "rstrip": false,
1776
+ "single_word": false,
1777
+ "special": true
1778
+ },
1779
+ "128222": {
1780
+ "content": "<|reserved_special_token_217|>",
1781
+ "lstrip": false,
1782
+ "normalized": false,
1783
+ "rstrip": false,
1784
+ "single_word": false,
1785
+ "special": true
1786
+ },
1787
+ "128223": {
1788
+ "content": "<|reserved_special_token_218|>",
1789
+ "lstrip": false,
1790
+ "normalized": false,
1791
+ "rstrip": false,
1792
+ "single_word": false,
1793
+ "special": true
1794
+ },
1795
+ "128224": {
1796
+ "content": "<|reserved_special_token_219|>",
1797
+ "lstrip": false,
1798
+ "normalized": false,
1799
+ "rstrip": false,
1800
+ "single_word": false,
1801
+ "special": true
1802
+ },
1803
+ "128225": {
1804
+ "content": "<|reserved_special_token_220|>",
1805
+ "lstrip": false,
1806
+ "normalized": false,
1807
+ "rstrip": false,
1808
+ "single_word": false,
1809
+ "special": true
1810
+ },
1811
+ "128226": {
1812
+ "content": "<|reserved_special_token_221|>",
1813
+ "lstrip": false,
1814
+ "normalized": false,
1815
+ "rstrip": false,
1816
+ "single_word": false,
1817
+ "special": true
1818
+ },
1819
+ "128227": {
1820
+ "content": "<|reserved_special_token_222|>",
1821
+ "lstrip": false,
1822
+ "normalized": false,
1823
+ "rstrip": false,
1824
+ "single_word": false,
1825
+ "special": true
1826
+ },
1827
+ "128228": {
1828
+ "content": "<|reserved_special_token_223|>",
1829
+ "lstrip": false,
1830
+ "normalized": false,
1831
+ "rstrip": false,
1832
+ "single_word": false,
1833
+ "special": true
1834
+ },
1835
+ "128229": {
1836
+ "content": "<|reserved_special_token_224|>",
1837
+ "lstrip": false,
1838
+ "normalized": false,
1839
+ "rstrip": false,
1840
+ "single_word": false,
1841
+ "special": true
1842
+ },
1843
+ "128230": {
1844
+ "content": "<|reserved_special_token_225|>",
1845
+ "lstrip": false,
1846
+ "normalized": false,
1847
+ "rstrip": false,
1848
+ "single_word": false,
1849
+ "special": true
1850
+ },
1851
+ "128231": {
1852
+ "content": "<|reserved_special_token_226|>",
1853
+ "lstrip": false,
1854
+ "normalized": false,
1855
+ "rstrip": false,
1856
+ "single_word": false,
1857
+ "special": true
1858
+ },
1859
+ "128232": {
1860
+ "content": "<|reserved_special_token_227|>",
1861
+ "lstrip": false,
1862
+ "normalized": false,
1863
+ "rstrip": false,
1864
+ "single_word": false,
1865
+ "special": true
1866
+ },
1867
+ "128233": {
1868
+ "content": "<|reserved_special_token_228|>",
1869
+ "lstrip": false,
1870
+ "normalized": false,
1871
+ "rstrip": false,
1872
+ "single_word": false,
1873
+ "special": true
1874
+ },
1875
+ "128234": {
1876
+ "content": "<|reserved_special_token_229|>",
1877
+ "lstrip": false,
1878
+ "normalized": false,
1879
+ "rstrip": false,
1880
+ "single_word": false,
1881
+ "special": true
1882
+ },
1883
+ "128235": {
1884
+ "content": "<|reserved_special_token_230|>",
1885
+ "lstrip": false,
1886
+ "normalized": false,
1887
+ "rstrip": false,
1888
+ "single_word": false,
1889
+ "special": true
1890
+ },
1891
+ "128236": {
1892
+ "content": "<|reserved_special_token_231|>",
1893
+ "lstrip": false,
1894
+ "normalized": false,
1895
+ "rstrip": false,
1896
+ "single_word": false,
1897
+ "special": true
1898
+ },
1899
+ "128237": {
1900
+ "content": "<|reserved_special_token_232|>",
1901
+ "lstrip": false,
1902
+ "normalized": false,
1903
+ "rstrip": false,
1904
+ "single_word": false,
1905
+ "special": true
1906
+ },
1907
+ "128238": {
1908
+ "content": "<|reserved_special_token_233|>",
1909
+ "lstrip": false,
1910
+ "normalized": false,
1911
+ "rstrip": false,
1912
+ "single_word": false,
1913
+ "special": true
1914
+ },
1915
+ "128239": {
1916
+ "content": "<|reserved_special_token_234|>",
1917
+ "lstrip": false,
1918
+ "normalized": false,
1919
+ "rstrip": false,
1920
+ "single_word": false,
1921
+ "special": true
1922
+ },
1923
+ "128240": {
1924
+ "content": "<|reserved_special_token_235|>",
1925
+ "lstrip": false,
1926
+ "normalized": false,
1927
+ "rstrip": false,
1928
+ "single_word": false,
1929
+ "special": true
1930
+ },
1931
+ "128241": {
1932
+ "content": "<|reserved_special_token_236|>",
1933
+ "lstrip": false,
1934
+ "normalized": false,
1935
+ "rstrip": false,
1936
+ "single_word": false,
1937
+ "special": true
1938
+ },
1939
+ "128242": {
1940
+ "content": "<|reserved_special_token_237|>",
1941
+ "lstrip": false,
1942
+ "normalized": false,
1943
+ "rstrip": false,
1944
+ "single_word": false,
1945
+ "special": true
1946
+ },
1947
+ "128243": {
1948
+ "content": "<|reserved_special_token_238|>",
1949
+ "lstrip": false,
1950
+ "normalized": false,
1951
+ "rstrip": false,
1952
+ "single_word": false,
1953
+ "special": true
1954
+ },
1955
+ "128244": {
1956
+ "content": "<|reserved_special_token_239|>",
1957
+ "lstrip": false,
1958
+ "normalized": false,
1959
+ "rstrip": false,
1960
+ "single_word": false,
1961
+ "special": true
1962
+ },
1963
+ "128245": {
1964
+ "content": "<|reserved_special_token_240|>",
1965
+ "lstrip": false,
1966
+ "normalized": false,
1967
+ "rstrip": false,
1968
+ "single_word": false,
1969
+ "special": true
1970
+ },
1971
+ "128246": {
1972
+ "content": "<|reserved_special_token_241|>",
1973
+ "lstrip": false,
1974
+ "normalized": false,
1975
+ "rstrip": false,
1976
+ "single_word": false,
1977
+ "special": true
1978
+ },
1979
+ "128247": {
1980
+ "content": "<|reserved_special_token_242|>",
1981
+ "lstrip": false,
1982
+ "normalized": false,
1983
+ "rstrip": false,
1984
+ "single_word": false,
1985
+ "special": true
1986
+ },
1987
+ "128248": {
1988
+ "content": "<|reserved_special_token_243|>",
1989
+ "lstrip": false,
1990
+ "normalized": false,
1991
+ "rstrip": false,
1992
+ "single_word": false,
1993
+ "special": true
1994
+ },
1995
+ "128249": {
1996
+ "content": "<|reserved_special_token_244|>",
1997
+ "lstrip": false,
1998
+ "normalized": false,
1999
+ "rstrip": false,
2000
+ "single_word": false,
2001
+ "special": true
2002
+ },
2003
+ "128250": {
2004
+ "content": "<|reserved_special_token_245|>",
2005
+ "lstrip": false,
2006
+ "normalized": false,
2007
+ "rstrip": false,
2008
+ "single_word": false,
2009
+ "special": true
2010
+ },
2011
+ "128251": {
2012
+ "content": "<|reserved_special_token_246|>",
2013
+ "lstrip": false,
2014
+ "normalized": false,
2015
+ "rstrip": false,
2016
+ "single_word": false,
2017
+ "special": true
2018
+ },
2019
+ "128252": {
2020
+ "content": "<|reserved_special_token_247|>",
2021
+ "lstrip": false,
2022
+ "normalized": false,
2023
+ "rstrip": false,
2024
+ "single_word": false,
2025
+ "special": true
2026
+ },
2027
+ "128253": {
2028
+ "content": "<|reserved_special_token_248|>",
2029
+ "lstrip": false,
2030
+ "normalized": false,
2031
+ "rstrip": false,
2032
+ "single_word": false,
2033
+ "special": true
2034
+ },
2035
+ "128254": {
2036
+ "content": "<|reserved_special_token_249|>",
2037
+ "lstrip": false,
2038
+ "normalized": false,
2039
+ "rstrip": false,
2040
+ "single_word": false,
2041
+ "special": true
2042
+ },
2043
+ "128255": {
2044
+ "content": "<|reserved_special_token_250|>",
2045
+ "lstrip": false,
2046
+ "normalized": false,
2047
+ "rstrip": false,
2048
+ "single_word": false,
2049
+ "special": true
2050
+ }
2051
+ },
2052
+ "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
2054
+ "clean_up_tokenization_spaces": true,
2055
+ "eos_token": "<|end_of_text|>",
2056
+ "model_input_names": [
2057
+ "input_ids",
2058
+ "attention_mask"
2059
+ ],
2060
+ "model_max_length": 1000000000000000019884624838656,
2061
+ "tokenizer_class": "PreTrainedTokenizerFast"
2062
+ }
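
The added tokenizer_config.json ends with the Llama 3 special-token settings and the chat template. A minimal sketch of how that template renders a single turn is shown below; the local path follows this repo's vocab/ layout, and the expected output is inferred from the Jinja template above rather than verified here.

```python
# Sketch only: load the tokenizer files added in this commit and render one turn
# with the chat_template defined above. The path assumes this repo's vocab/ layout.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vocab/llama3/Meta-Llama-3-70B")

messages = [{"role": "user", "content": "Hello"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)
# Expected (newlines written out as \n): the template prepends bos_token to the
# first message and always appends the assistant header, so the result should be
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n
```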
vocab/llama3/__init__.py ADDED
@@ -0,0 +1,9 @@
 

 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import os
4
+ from transformers import AutoTokenizer
5
+
6
+
7
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
8
+ TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Meta-Llama-3-70B")
9
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
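
Since the new vocab/llama3/__init__.py exposes a module-level `tokenizer`, a quick way to eyeball its compression on a text sample is sketched below. The sample string and the bytes-per-token arithmetic are illustrative only, and the snippet assumes the package is importable from the repo root.

```python
# Hedged usage sketch for the new vocab/llama3 module: count UTF-8 bytes per token
# for an arbitrary sample string.
from vocab.llama3 import tokenizer

sample = "hello world 你好,世界"
token_ids = tokenizer.encode(sample, add_special_tokens=False)
n_bytes = len(sample.encode("utf-8"))

print(f"{n_bytes} bytes -> {len(token_ids)} tokens, "
      f"{n_bytes / len(token_ids):.2f} bytes/token")
```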
vocab/mobilenet_v2/__init__.py CHANGED
@@ -7,6 +7,10 @@
7
  File "/home/user/.local/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 748, in __getitem__
8
  raise KeyError(key)
9
  KeyError: <class 'transformers.models.mobilenet_v2.configuration_mobilenet_v2.MobileNetV2Config'>
 
 
 
 
10
  """
11
 
12
  from transformers import AutoTokenizer
 
7
  File "/home/user/.local/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 748, in __getitem__
8
  raise KeyError(key)
9
  KeyError: <class 'transformers.models.mobilenet_v2.configuration_mobilenet_v2.MobileNetV2Config'>
10
+
11
+ ## how to fix?
12
+
13
+
14
  """
15
 
16
  from transformers import AutoTokenizer
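
One possible answer to the "## how to fix?" note in the diff above, offered as an assumption rather than a verified fix: MobileNetV2 is a vision model, so no tokenizer class is registered for MobileNetV2Config and AutoTokenizer raises the KeyError shown in the docstring. Either skip the model in tokenizer comparisons or fall back to its image processor, as sketched here (the checkpoint name is assumed).

```python
# Assumed workaround, not part of this commit: MobileNetV2 ships no tokenizer,
# so catch the auto-class lookup failure and load the image processor instead.
from transformers import AutoImageProcessor, AutoTokenizer

checkpoint = "google/mobilenet_v2_1.0_224"  # checkpoint name is an assumption

try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
except (KeyError, ValueError):
    # No tokenizer is registered for this architecture.
    processor = AutoImageProcessor.from_pretrained(checkpoint)
```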
vocab/moss/test_zh_coding_len.py CHANGED
@@ -16,7 +16,7 @@
16
  from collections import Counter
17
  from transformers import AutoTokenizer
18
  from data_sample.oov_base import jd_vocab_tokens
19
- from utils.text_util import is_chinese, has_chinese
20
  from zhon.hanzi import punctuation as zh_punc
21
 
22
  tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
@@ -56,7 +56,7 @@ def iter_vocab():
56
  zh_symbol_count = 0
57
  for idx in range(len(vocab)):
58
  decode_str = tokenizer.decode([idx])
59
- if has_chinese(decode_str):
60
  zh_token_count["total"] += 1
61
  if len(decode_str.strip()) > 1:
62
  zh_token_count["中文多字"] += 1
 
16
  from collections import Counter
17
  from transformers import AutoTokenizer
18
  from data_sample.oov_base import jd_vocab_tokens
19
+ from utils.text_util import is_zh_char, has_zh
20
  from zhon.hanzi import punctuation as zh_punc
21
 
22
  tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
 
56
  zh_symbol_count = 0
57
  for idx in range(len(vocab)):
58
  decode_str = tokenizer.decode([idx])
59
+ if has_zh(decode_str):
60
  zh_token_count["total"] += 1
61
  if len(decode_str.strip()) > 1:
62
  zh_token_count["中文多字"] += 1
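
The renamed helpers come from utils/text_util.py, which is not part of this diff; a minimal sketch of what `is_zh_char` and `has_zh` presumably do (a plain CJK Unified Ideographs range check) follows, labeled as an assumption about that module.

```python
# Assumed shape of the renamed helpers in utils/text_util.py (not shown in this diff).
def is_zh_char(ch: str) -> bool:
    """Return True if the single character falls in the CJK Unified Ideographs block."""
    return "\u4e00" <= ch <= "\u9fff"


def has_zh(text: str) -> bool:
    """Return True if the string contains at least one Chinese character."""
    return any(is_zh_char(ch) for ch in text)
```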