xu-song committed on
Commit
97354e0
·
1 Parent(s): d3c1316
Files changed (6) hide show
  1. README.md +3 -148
  2. character_util.py +1 -1
  3. compression_app.py +2 -1
  4. utils/lang_util.py +3 -0
  5. utils/oov.md +0 -202
  6. vocab.py +1 -1
README.md CHANGED
@@ -43,79 +43,6 @@ python utils/compress_rate_util.py
43
  | aya_101 | 250100 | 3.3 | 0.3 | 3.22 | 0.31 | 3.53 |
44
  | baichuan | 64000 | 3.74 | 0.27 | 3.65 | 0.27 | 4 |
45
  | baichuan2 | 125696 | 3.89 | 0.26 | 3.8 | 0.26 | 4.17 |
46
- | bert_base_cased | 28996 | 3.64 | 0.27 | 3.55 | 0.28 | 3.89 |
47
- | bert_base_chinese | 21128 | 2.78 | 0.36 | 2.71 | 0.37 | 2.97 |
48
- | bert_base_uncased | 30522 | 3.73 | 0.27 | 3.65 | 0.27 | 4 |
49
- | bloom | 250680 | 4.07 | 0.25 | 3.97 | 0.25 | 4.36 |
50
- | byt5_small | 256 | 0.92 | 1.08 | 0.9 | 1.11 | 0.99 |
51
- | character_glm_6b | 64794 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 |
52
- | chatglm2_6b | 64794 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 |
53
- | chatglm3_6b | 64798 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 |
54
- | chatglm_6b | 150344 | 3.68 | 0.27 | 3.59 | 0.28 | 3.94 |
55
- | chatyuan_large_v2 | 32128 | 1.95 | 0.51 | 1.91 | 0.52 | 2.09 |
56
- | chinese_llama | 49953 | 3.59 | 0.28 | 3.51 | 0.28 | 3.85 |
57
- | chinese_llama2 | 55296 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
58
- | code_davinci_002 | 50281 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
59
- | crystal_coder | 32000 | 3.68 | 0.27 | 3.59 | 0.28 | 3.94 |
60
- | dbrx_instruct | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 |
61
- | deepseek_coder_33b_instruct | 32000 | 3.64 | 0.27 | 3.56 | 0.28 | 3.9 |
62
- | deepseek_llm_7b_base | 100000 | 3.85 | 0.26 | 3.76 | 0.27 | 4.12 |
63
- | falcon_180b | 65024 | 3.99 | 0.25 | 3.9 | 0.26 | 4.27 |
64
- | falcon_7b | 65024 | 3.99 | 0.25 | 3.9 | 0.26 | 4.27 |
65
- | fastchat_t5_3b | 32000 | 2.16 | 0.46 | 2.11 | 0.47 | 2.31 |
66
- | flan_t5_base | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
67
- | gemma_7b | 256000 | 3.91 | 0.26 | 3.82 | 0.26 | 4.18 |
68
- | gpt2 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
69
- | gpt2_chinese | 21128 | 2.67 | 0.37 | 2.61 | 0.38 | 2.86 |
70
- | gpt_35_turbo | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 |
71
- | gpt_4 | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 |
72
- | gpt_nexo_20b | 50254 | 4.04 | 0.25 | 3.94 | 0.25 | 4.32 |
73
- | grok_1 | 131072 | 4.06 | 0.25 | 3.96 | 0.25 | 4.35 |
74
- | internlm2_chat_7b | 92544 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 |
75
- | internlm2_math_7b | 92544 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 |
76
- | internlm_chat_7b | 103168 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 |
77
- | internlm_xcomposer_7b | 103168 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 |
78
- | jamba_v0_1 | 65536 | 3.82 | 0.26 | 3.73 | 0.27 | 4.09 |
79
- | kplug | 10261 | 2.66 | 0.38 | 2.6 | 0.38 | 2.85 |
80
- | llama | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
81
- | llama2 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
82
- | llama3 | 128000 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 |
83
- | mistral_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 |
84
- | mixtral_8_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 |
85
- | mobilebert_uncased | 30522 | 3.73 | 0.27 | 3.65 | 0.27 | 4 |
86
- | moss | 106029 | 4.08 | 0.25 | 3.98 | 0.25 | 4.36 |
87
- | mt5_large | 250100 | 3.3 | 0.3 | 3.22 | 0.31 | 3.53 |
88
- | olmo_7b | 50280 | 4.04 | 0.25 | 3.94 | 0.25 | 4.32 |
89
- | orion_14b_chat | 84608 | 3.94 | 0.25 | 3.85 | 0.26 | 4.22 |
90
- | phi_1 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
91
- | phi_2 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
92
- | pko_t5_large | 50258 | 1.59 | 0.63 | 1.55 | 0.64 | 1.7 |
93
- | prompt_clue | 32128 | 1.95 | 0.51 | 1.91 | 0.52 | 2.09 |
94
- | qwen1_5_14b_chat | 151643 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 |
95
- | qwen_1_8b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 |
96
- | qwen_72b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 |
97
- | qwen_7b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 |
98
- | roberta_chinese_clue | 8021 | 1.8 | 0.56 | 1.75 | 0.57 | 1.92 |
99
- | skywork_13b_base | 65519 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
100
- | skywork_13b_math | 65519 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
101
- | solar_10_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 |
102
- | starchat_alpha | 49152 | 3.63 | 0.28 | 3.54 | 0.28 | 3.88 |
103
- | switch_c_2048 | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
104
- | t5_base | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
105
- | t5_large | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
106
- | t5_small | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
107
- | text_davinci_003 | 50281 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
108
- | tigerbot_13b_chat_v2 | 60512 | 3.67 | 0.27 | 3.58 | 0.28 | 3.93 |
109
- | tigerbot_70b_chat_v4_4k | 65107 | 3.65 | 0.27 | 3.57 | 0.28 | 3.91 |
110
- | wizardcoder_15b_v1 | 49152 | 3.63 | 0.28 | 3.54 | 0.28 | 3.88 |
111
- | wizardcoder_python_7b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
112
- | wizardlm_7b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
113
- | wizardmath_70b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
114
- | xlm_roberta | 250002 | 3.49 | 0.29 | 3.41 | 0.29 | 3.74 |
115
- | yi_34b | 64000 | 3.87 | 0.26 | 3.78 | 0.26 | 4.15 |
116
- | yi_6b | 64000 | 3.87 | 0.26 | 3.78 | 0.26 | 4.15 |
117
- | yi_vl34b | 64000 | 3.88 | 0.26 | 3.79 | 0.26 | 4.16 |
118
- | zephyr_7b_beta | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 |
119
 
120
  </details>
121
 
@@ -128,80 +55,6 @@ python utils/compress_rate_util.py
128
  | amber | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
129
  | aya_101 | 250100 | 3.89 | 0.26 | 3.79 | 0.26 | 1.47 |
130
  | baichuan | 64000 | 3.92 | 0.26 | 3.82 | 0.26 | 1.48 |
131
- | baichuan2 | 125696 | 4.53 | 0.22 | 4.42 | 0.23 | 1.71 |
132
- | bert_base_cased | 28996 | 2.73 | 0.37 | 2.66 | 0.38 | 1.03 |
133
- | bert_base_chinese | 21128 | 2.74 | 0.37 | 2.67 | 0.37 | 1.03 |
134
- | bert_base_uncased | 30522 | 2.73 | 0.37 | 2.67 | 0.38 | 1.03 |
135
- | bloom | 250680 | 4.28 | 0.23 | 4.18 | 0.24 | 1.62 |
136
- | byt5_small | 256 | 0.93 | 1.08 | 0.91 | 1.1 | 0.35 |
137
- | character_glm_6b | 64794 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 |
138
- | chatglm2_6b | 64794 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 |
139
- | chatglm3_6b | 64798 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 |
140
- | chatglm_6b | 150344 | 4.65 | 0.22 | 4.54 | 0.22 | 1.76 |
141
- | chatyuan_large_v2 | 32128 | 4.34 | 0.23 | 4.24 | 0.24 | 1.64 |
142
- | chinese_llama | 49953 | 3.93 | 0.25 | 3.84 | 0.26 | 1.49 |
143
- | chinese_llama2 | 55296 | 3.92 | 0.26 | 3.83 | 0.26 | 1.48 |
144
- | code_davinci_002 | 50281 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
145
- | crystal_coder | 32000 | 1.86 | 0.54 | 1.81 | 0.55 | 0.7 |
146
- | dbrx_instruct | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 |
147
- | deepseek_coder_33b_instruct | 32000 | 3.4 | 0.29 | 3.32 | 0.3 | 1.29 |
148
- | deepseek_llm_7b_base | 100000 | 4.05 | 0.25 | 3.96 | 0.25 | 1.53 |
149
- | falcon_180b | 65024 | 2.18 | 0.46 | 2.13 | 0.47 | 0.82 |
150
- | falcon_7b | 65024 | 2.18 | 0.46 | 2.13 | 0.47 | 0.82 |
151
- | fastchat_t5_3b | 32000 | 13.7 | 0.07 | 13.38 | 0.07 | 5.18 |
152
- | flan_t5_base | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
153
- | gemma_7b | 256000 | 3.82 | 0.26 | 3.73 | 0.27 | 1.44 |
154
- | gpt2 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
155
- | gpt2_chinese | 21128 | 2.73 | 0.37 | 2.66 | 0.38 | 1.03 |
156
- | gpt_35_turbo | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 |
157
- | gpt_4 | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 |
158
- | gpt_nexo_20b | 50254 | 2.01 | 0.5 | 1.96 | 0.51 | 0.76 |
159
- | grok_1 | 131072 | 1.73 | 0.58 | 1.69 | 0.59 | 0.66 |
160
- | internlm2_chat_7b | 92544 | 4.23 | 0.24 | 4.13 | 0.24 | 1.6 |
161
- | internlm2_math_7b | 92544 | 4.23 | 0.24 | 4.13 | 0.24 | 1.6 |
162
- | internlm_chat_7b | 103168 | 4.23 | 0.24 | 4.14 | 0.24 | 1.6 |
163
- | internlm_xcomposer_7b | 103168 | 4.23 | 0.24 | 4.14 | 0.24 | 1.6 |
164
- | jamba_v0_1 | 65536 | 2.3 | 0.44 | 2.24 | 0.45 | 0.87 |
165
- | kplug | 10261 | 2.72 | 0.37 | 2.65 | 0.38 | 1.03 |
166
- | llama | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
167
- | llama2 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
168
- | llama3 | 128000 | 3.28 | 0.3 | 3.2 | 0.31 | 1.24 |
169
- | mistral_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
170
- | mixtral_8_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
171
- | mobilebert_uncased | 30522 | 2.73 | 0.37 | 2.67 | 0.38 | 1.03 |
172
- | moss | 106029 | 4.4 | 0.23 | 4.3 | 0.23 | 1.66 |
173
- | mt5_large | 250100 | 3.89 | 0.26 | 3.79 | 0.26 | 1.47 |
174
- | olmo_7b | 50280 | 2.01 | 0.5 | 1.96 | 0.51 | 0.76 |
175
- | orion_14b_chat | 84608 | 4.63 | 0.22 | 4.52 | 0.22 | 1.75 |
176
- | phi_1 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
177
- | phi_2 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
178
- | pko_t5_large | 50258 | 0.97 | 1.03 | 0.95 | 1.06 | 0.37 |
179
- | prompt_clue | 32128 | 4.34 | 0.23 | 4.24 | 0.24 | 1.64 |
180
- | qwen1_5_14b_chat | 151643 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 |
181
- | qwen_1_8b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 |
182
- | qwen_72b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 |
183
- | qwen_7b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 |
184
- | roberta_chinese_clue | 8021 | 2.7 | 0.37 | 2.64 | 0.38 | 1.02 |
185
- | skywork_13b_base | 65519 | 3.69 | 0.27 | 3.61 | 0.28 | 1.4 |
186
- | skywork_13b_math | 65519 | 3.69 | 0.27 | 3.61 | 0.28 | 1.4 |
187
- | solar_10_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
188
- | starchat_alpha | 49152 | 2.78 | 0.36 | 2.72 | 0.37 | 1.05 |
189
- | switch_c_2048 | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
190
- | t5_base | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
191
- | t5_large | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
192
- | t5_small | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
193
- | text_davinci_003 | 50281 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
194
- | tigerbot_13b_chat_v2 | 60512 | 4.25 | 0.24 | 4.15 | 0.24 | 1.61 |
195
- | tigerbot_70b_chat_v4_4k | 65107 | 4.25 | 0.24 | 4.15 | 0.24 | 1.61 |
196
- | wizardcoder_15b_v1 | 49152 | 2.78 | 0.36 | 2.72 | 0.37 | 1.05 |
197
- | wizardcoder_python_7b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
198
- | wizardlm_7b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
199
- | wizardmath_70b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
200
- | xlm_roberta | 250002 | 3.96 | 0.25 | 3.86 | 0.26 | 1.5 |
201
- | yi_34b | 64000 | 4.17 | 0.24 | 4.07 | 0.25 | 1.58 |
202
- | yi_6b | 64000 | 4.17 | 0.24 | 4.07 | 0.25 | 1.58 |
203
- | yi_vl34b | 64000 | 4.11 | 0.24 | 4.02 | 0.25 | 1.56 |
204
- | zephyr_7b_beta | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
205
 
206
  </details>
207
 
@@ -212,12 +65,14 @@ python utils/compress_rate_util.py
212
 
213
  - Getting the most out of your tokenizer for pre-training and domain adaptation
214
  - Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
215
- - blog
216
  - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
217
  - https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
218
  - https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
219
  - https://zhuanlan.zhihu.com/p/652520262
220
  - https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
 
 
221
  - demo
222
  - https://huggingface.co/spaces/Xenova/the-tokenizer-playground
223
  - https://github.com/dqbd/tiktokenizer
 
43
  | aya_101 | 250100 | 3.3 | 0.3 | 3.22 | 0.31 | 3.53 |
44
  | baichuan | 64000 | 3.74 | 0.27 | 3.65 | 0.27 | 4 |
45
  | baichuan2 | 125696 | 3.89 | 0.26 | 3.8 | 0.26 | 4.17 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  </details>
48
 
 
55
  | amber | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
56
  | aya_101 | 250100 | 3.89 | 0.26 | 3.79 | 0.26 | 1.47 |
57
  | baichuan | 64000 | 3.92 | 0.26 | 3.82 | 0.26 | 1.48 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  </details>
60
 
 
65
 
66
  - Getting the most out of your tokenizer for pre-training and domain adaptation
67
  - Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
68
+ - blog
69
  - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
70
  - https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
71
  - https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
72
  - https://zhuanlan.zhihu.com/p/652520262
73
  - https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
74
+ - https://tonybaloney.github.io/posts/cjk-chinese-japanese-korean-llm-ai-best-practices.html
75
+ -
76
  - demo
77
  - https://huggingface.co/spaces/Xenova/the-tokenizer-playground
78
  - https://github.com/dqbd/tiktokenizer
character_util.py CHANGED
@@ -83,7 +83,7 @@ def iter_vocab(
83
  with open(cache_path, "r", encoding="utf-8") as f_tmp:
84
  cache.update(json.load(f_tmp))
85
  if from_cache and tokenizer_name in cache:
86
- logger.info(f"load {tokenizer_config.name_or_path} from cache")
87
  return cache[tokenizer_name]
88
 
89
  tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
 
83
  with open(cache_path, "r", encoding="utf-8") as f_tmp:
84
  cache.update(json.load(f_tmp))
85
  if from_cache and tokenizer_name in cache:
86
+ # logger.info(f"load {tokenizer_config.name_or_path} from cache")
87
  return cache[tokenizer_name]
88
 
89
  tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
compression_app.py CHANGED
@@ -75,13 +75,14 @@ with gr.Blocks() as demo:
75
  )
76
 
77
  gr.Markdown(
 
78
  "- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
79
  "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus.\n"
80
  "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus.\n"
81
  # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
82
  # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
83
  "- `char/token` measures how many chars per token on the tokenized corpus.\n"
84
- "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/stats/compression_rate.json)\n\n"
85
  "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
86
  )
87
 
 
75
  )
76
 
77
  gr.Markdown(
78
+ # "Note:\n\n"
79
  "- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
80
  "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus.\n"
81
  "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus.\n"
82
  # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
83
  # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
84
  "- `char/token` measures how many chars per token on the tokenized corpus.\n"
85
+ "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate.json)\n\n"
86
  "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
87
  )
88
 
utils/lang_util.py CHANGED
@@ -10,6 +10,9 @@
10
  然而,需要强调的是,这种方法的准确性受限于所选语言特征的全面性和独特性。
11
  例如,English的检测范围仅限于基本的A-Z字母,这可能导致它与其他使用相同字母集的语言重叠。
12
  此外,有些语言(如法语和西班牙语)在某些情况下可能共享特定的重音符号,这可能导致一个字符串被错误地识别为多种语言。
 
 
 
13
  """
14
 
15
  import re
 
10
  然而,需要强调的是,这种方法的准确性受限于所选语言特征的全面性和独特性。
11
  例如,English的检测范围仅限于基本的A-Z字母,这可能导致它与其他使用相同字母集的语言重叠。
12
  此外,有些语言(如法语和西班牙语)在某些情况下可能共享特定的重音符号,这可能导致一个字符串被错误地识别为多种语言。
13
+
14
+ ## 常用语言
15
+ English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский | Português | తెలుగు | Français | Deutsch | Tiếng Việt |
16
  """
17
 
18
  import re
utils/oov.md DELETED
@@ -1,202 +0,0 @@
1
-
2
- ```sh
3
- ###################################
4
- ClueAI/ChatYuan-large-v2, <class 'tokenizers.models.Unigram'>
5
- reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
6
- text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
7
- decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
8
-
9
-
10
- ###################################
11
- ClueAI/PromptCLUE-base, <class 'tokenizers.models.Unigram'>
12
- reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
13
- text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
14
- decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
15
- ###################################
16
- CohereForAI/aya-101, <class 'tokenizers.models.Unigram'>
17
- reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
18
- text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
19
- decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
20
- ###################################
21
- FacebookAI/xlm-roberta-base, <class 'tokenizers.models.Unigram'>
22
- reversible: false; unk_token: <unk>, 3, unk_ratio: 0.0096; oov: []
23
- text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
24
- decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
25
- ###################################
26
- OrionStarAI/Orion-14B-Chat, sp_model, byte_num: 0
27
- reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0495; oov: []
28
- text[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
29
- decoding[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئ<unk> ⁇ ردوغان <unk> ⁇ قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለ<unk> ⁇ ጭ የግድግ<unk> ⁇ ; Дзейныя асобы:; « <unk> ⁇ <unk> ⁇ <unk> ⁇ ; \t\n <unk> ⁇ ❤❥웃유♋☮✊; <unk> ⁇ יקי<unk> ⁇ ערטערבוך "
30
- ###################################
31
- THUDM/chatglm-6b, byte_num: 256
32
- reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
33
- text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
34
- decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך"
35
- ###################################
36
- abeja/gpt-neox-japanese-2.7b, japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2
37
- reversible: false; unk_token: <|endoftext|>, 31999, unk_ratio: 0.0000; oov: []
38
- text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
39
- decoding[7] = "���������������� �������������������� ����������������; ������ ������ 15~17��� ��������� 3������; 確実に春が近づいてること; a k��zoktat��ssal? _ Belf��ld; pum��, i vjet��r, vje��; ���������������� ���� ���������������������� ; ��������������� ��������� ������ ��������� ������ ������������������������; ��������������� ��������������� ; �������������� ����������:; ǀ ��������������������������� ��������������� ���������������; \t\n\n🐯❤‖������🟥🟥🤚;��������������������������"
40
-
41
-
42
- ###################################
43
- baichuan-inc/Baichuan-7B, sp_model, byte_num: 256
44
- reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
45
- text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
46
- decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
47
- ###################################
48
- ckiplab/gpt2-base-chinese, <class 'tokenizers.models.WordPiece'>
49
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1185; oov: []
50
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
51
- decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
52
-
53
-
54
- ###################################
55
- cl-tohoku/bert-base-japanese, wordpiece.MecabTokenizer, 支持byte-level https://github.com/polm/fugashi
56
- reversible: false; unk_token: [UNK], 1, unk_ratio: 0.3951; oov: []
57
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
58
- decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ 17 [UNK] [UNK] 3 [UNK] ; 確実 に 春 が 近づい てる こと ; a közoktatással? _ Belföld ; [UNK], i [UNK], vjeç ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] :; [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK]"
59
-
60
-
61
- ###################################
62
- clue/roberta_chinese_clue_tiny, <class 'tokenizers.models.WordPiece'>
63
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3580; oov: []
64
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
65
- decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
66
-
67
-
68
- ###################################
69
- dbmdz/bert-base-german-uncased, <class 'tokenizers.models.WordPiece'>
70
- reversible: false; unk_token: [UNK], 101, unk_ratio: 0.4459; oov: []
71
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
72
- decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
73
- ###################################
74
- deepseek-ai/deepseek-coder-33b-instruct, <class 'tokenizers.models.BPE'>
75
- reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
76
- text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
77
- decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
78
- Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
79
- [2024-05-12 00:30:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer eson/kplug-base-encoder
80
- ###################################
81
- deepseek-ai/deepseek-llm-7b-base, <class 'tokenizers.models.BPE'>
82
- reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
83
- text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
84
- decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
85
- [2024-05-12 00:30:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer fnlp/moss-moon-003-sft
86
- ###################################
87
- eson/kplug-base-encoder, <class 'tokenizers.models.WordPiece'>
88
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3625; oov: []
89
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
90
- decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
91
- Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
92
- [2024-05-12 00:31:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-cased
93
- ###################################
94
- fnlp/moss-moon-003-sft, 应该是 sentencepiece.byte_bpe,待确认
95
- reversible: false; unk_token: <|endoftext|>, 106028, unk_ratio: 0.0000; oov: []
96
- text[74] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
97
- decoding[74] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
98
- ###################################
99
- google-bert/bert-base-cased, <class 'tokenizers.models.WordPiece'>
100
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1732; oov: []
101
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
102
- decoding[5] = " ; Замглавы управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] に [UNK] [UNK] [UNK] [UNK] ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से [UNK] सा [UNK] ; [UNK] [UNK] ; Дзейныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
103
- [2024-05-12 00:31:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-chinese
104
- [2024-05-12 00:32:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-german-cased
105
- ###################################
106
- google-bert/bert-base-chinese, <class 'tokenizers.models.WordPiece'>
107
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3704; oov: []
108
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
109
- decoding[5] = " ; [UNK] управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; 確 実 に 春 [UNK] 近 [UNK] ; a [UNK]? _ [UNK] ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
110
- ###################################
111
- google-bert/bert-base-german-cased, <class 'tokenizers.models.WordPiece'>
112
- reversible: false; unk_token: [UNK], 2, unk_ratio: 0.5938; oov: []
113
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
114
- decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a [UNK]? _ Belföld ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; [UNK] [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
115
- [2024-05-12 00:32:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-cased
116
- [2024-05-12 00:32:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-uncased
117
- ###################################
118
- google-bert/bert-base-multilingual-cased, <class 'tokenizers.models.WordPiece'>
119
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0531; oov: []
120
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
121
- decoding[5] = " ; Замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 が 近 づいてること ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से कौन सा हारडवेयर ; [UNK] [UNK] ; Дзейныя асобы : ; « અમરેલીનાં મહિલા વિકાસ ; [UNK] ; [UNK]"
122
- [2024-05-12 00:33:17] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-uncased
123
- ###################################
124
- google-bert/bert-base-multilingual-uncased, <class 'tokenizers.models.WordPiece'>
125
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0360; oov: []
126
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
127
- decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; يەردوغان ۋە قىرغىزىستان ; निमन म स कौन सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « અમરલીના મહિલા વિકાસ ; [UNK] ; [UNK]"
128
- [2024-05-12 00:33:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-t5/t5-large
129
- ###################################
130
- google-bert/bert-base-uncased, <class 'tokenizers.models.WordPiece'>
131
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
132
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
133
- decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
134
- ###################################
135
- google-t5/t5-large, <class 'tokenizers.models.Unigram'>
136
- reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
137
- text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
138
- decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
139
- [2024-05-12 00:34:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/byt5-small
140
- [2024-05-12 00:35:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/gemma-7b
141
- [2024-05-12 00:35:39] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mobilebert-uncased
142
- [2024-05-12 00:36:59] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mt5-large
143
- ###################################
144
- google/mobilebert-uncased, <class 'tokenizers.models.WordPiece'>
145
- reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
146
- text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
147
- decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
148
- C:\Users\xusong28\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\convert_slow_tokenizer.py:560: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
149
- warnings.warn(
150
- [2024-05-12 00:37:23] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/switch-c-2048
151
- ###################################
152
- google/mt5-large, <class 'tokenizers.models.Unigram'>
153
- reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
154
- text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
155
- decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
156
- [2024-05-12 00:37:43] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-alpaca-lora-7b
157
- ###################################
158
- google/switch-c-2048, <class 'tokenizers.models.Unigram'>
159
- reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
160
- text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
161
- decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
162
- You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
163
- [2024-05-12 00:38:04] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-2-7b
164
- [2024-05-12 00:38:25] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-lora-7b
165
- [2024-05-12 00:38:46] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/llama-3-chinese-8b
166
- Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
167
- [2024-05-12 00:39:07] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hpcai-tech/grok-1
168
- [2024-05-12 00:39:28] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-chat-7b
169
- [2024-05-12 00:40:09] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-xcomposer-7b
170
- [2024-05-12 00:40:31] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-chat-7b
171
- [2024-05-12 00:41:13] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-math-7b
172
- [2024-05-12 00:41:35] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer lmsys/fastchat-t5-3b-v1.0
173
- Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
174
- ###################################
175
- [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Llama-2-7b-chat
176
- lmsys/fastchat-t5-3b-v1.0, sp_model, byte_num: 0
177
- reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2105; oov: []
178
- text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
179
- decoding[7] = " <unk> ам<unk> лав<unk> у<unk> равлени<unk> ра<unk> вити<unk>; <unk> <unk> 15<unk> 17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk> е<unk> н<unk> асо<unk>:; « <unk> <unk> <unk>; \t \n <unk> ;<unk> "
180
- [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Meta-Llama-3-8B
181
- [2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/Phi-3-mini-4k-instruct
182
- Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
183
- [2024-05-12 00:42:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-1
184
- [2024-05-12 00:42:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-2
185
- Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
186
- [2024-05-12 00:42:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mistral-7B-v0.1
187
- [2024-05-12 00:43:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mixtral-8x7B-v0.1
188
- [2024-05-12 00:43:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai-community/gpt2
189
- [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/code-davinci-002
190
- [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-3.5-turbo
191
- [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-4
192
- [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/text-davinci-003
193
- [2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer paust/pko-t5-large
194
- Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
195
- [2024-05-12 00:44:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer thu-coai/CharacterGLM-6B
196
- [2024-05-12 00:44:58] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-180b
197
- [2024-05-12 00:45:19] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-7b
198
-
199
- Process finished with exit code 0
200
-
201
-
202
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vocab.py CHANGED
@@ -206,7 +206,7 @@ _all_tokenizer_config = [
206
  meta="去掉了繁体字, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md"),
207
  TokenizerConfig("eson/kplug-base-encoder", name_display="eson/kplug", impl=TokenizerImpl.BertTokenizer, org="JD"),
208
  TokenizerConfig("ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"), # 台湾中央研究院
209
- # WoBERT
210
  # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
211
 
212
 
 
206
  meta="去掉了繁体字, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md"),
207
  TokenizerConfig("eson/kplug-base-encoder", name_display="eson/kplug", impl=TokenizerImpl.BertTokenizer, org="JD"),
208
  TokenizerConfig("ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"), # 台湾中央研究院
209
+ # WoBERT https://kexue.fm/archives/7758
210
  # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
211
 
212